Cannot webscrape web page with pagination with python-CodePudding

I am trying to web scrape the links of products in a category of

https://www.acihellas.gr/gaming-pontikia#/

It has 4 pages of products, but for some reason I only get the first one with the following code:

# Browser-like User-Agent header sent with every request.
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
# URL fragment with the paging/sorting options; the page number is appended per iteration.
filterprods = '/#/pageSize=21&viewMode=grid&orderBy=10&pageNumber='


for itm in range(1,page_number):
    print("Page",itm)
    # Category URL + filter fragment + current page number.
    urlget = str(url2get + filterprods + str(itm))
    time.sleep(2)
    ses=requests.Session()
    r = ses.get(urlget, headers=headers)

    if r.status_code == 200:

        Myhtml = r.text

        soup = BeautifulSoup(Myhtml, 'lxml')

        productlist = soup.find_all('div',attrs = {'class','item-box'})

        for p_item in productlist:

            # Only the first anchor inside each product box is used.
            a = p_item.find('a')
            if a:
                producttitle = a['title']
                productlink = a['href']
                # The scraped href is prefixed with the site root to form the full link.
                url_item = 'https://acihellas.gr' + productlink
                print(url_item)
                urllist.append(url_item)
                time.sleep(2)
            else:
                pass
        ses.close()

    else:
        # Non-200 responses are only reported, not retried.
        print(r.status_code)

return urllist

The links are joined correctly, but ses.get(url) is not returning the other pages, so I thought that closing the session on each iteration might help, but it made no difference.

When inspecting the page, I cannot find a link to the next page, so I constructed the URL with the filterprods variable.

How can we fix this?

thank you

CodePudding user response：

You didn't provide the website URL in your code. You can use the website's API to gather the products. Here is some starting code; I leave the parsing to you :)

import requests

# Endpoint that returns the filtered product listing for a category.
url = 'https://www.acihellas.gr/getFilteredProducts'

# Request pages 1 through 4, one POST per page.
for pagenum in range(1, 5):

    # JSON body for the request; only "pageNumber" and "queryString"
    # change between iterations.
    payload = {
        "categoryId": "828",
        "manufacturerId": "0",
        "vendorId": "0",
        "priceRangeFilterModel7Spikes": "null",
        "specificationFiltersModel7Spikes": {
            "CategoryId": "828",
            "ManufacturerId": "0",
            "VendorId": "0",
            "SpecificationFilterGroups": [{
                "Id": 998,
                "FilterItems": [{
                    "Id": "25188",
                    "FilterItemState": "Unchecked"
                }, {
                    "Id": "18572",
                    "FilterItemState": "Unchecked"
                }, {
                    "Id": "7361",
                    "FilterItemState": "Unchecked"
                }, {
                    "Id": "7362",
                    "FilterItemState": "Unchecked"
                }, {
                    "Id": "7368",
                    "FilterItemState": "Unchecked"
                }, {
                    "Id": "18060",
                    "FilterItemState": "Unchecked"
                }, {
                    "Id": "19024",
                    "FilterItemState": "Unchecked"
                }, {
                    "Id": "24876",
                    "FilterItemState": "Unchecked"
                }, {
                    "Id": "28037",
                    "FilterItemState": "Unchecked"
                }, {
                    "Id": "23321",
                    "FilterItemState": "Unchecked"
                }]
            }, {
                "Id": 990,
                "FilterItems": [{
                    "Id": "7336",
                    "FilterItemState": "Unchecked"
                }]
            }, {
                "Id": 995,
                "FilterItems": [{
                    "Id": "7350",
                    "FilterItemState": "Unchecked"
                }, {
                    "Id": "7348",
                    "FilterItemState": "Unchecked"
                }, {
                    "Id": "7349",
                    "FilterItemState": "Unchecked"
                }]
            }]
        },
        "pageNumber": str(pagenum),
        "orderby": "10",
        "viewmode": "grid",
        "pagesize": "21",
        "queryString": "#/pageSize=21&viewMode=grid&orderBy=10&pageNumber=" + str(pagenum),
        "shouldNotStartFromFirstPage": "true",
        "keyword": "",
        "searchCategoryId": "0",
        "searchManufacturerId": "0",
        "searchVendorId": "0",
        "priceFrom": "",
        "priceTo": "",
        "includeSubcategories": "False",
        "searchInProductDescriptions": "False",
        "advancedSearch": "False",
        "isOnSearchPage": "False",
        "inStockFilterModel": {
            "CategoryId": "828",
            "ManufacturerId": "0",
            "VendorId": "0",
            "Id": "1",
            "FilterItemState": "Unchecked"
        }
    }

    # json= makes requests serialise the dict as the JSON request body.
    res = requests.post(url, json=payload)

    # Raw response text; parsing the product links out of it is left to the reader.
    print(res.text)

Note: Your links are relative, so you need to prepend them with website URL: https://www.acihellas.gr/

EDIT:

To answer the question of whether it is possible to change only the category in the payload: it looks like yes. I removed the entire section of the payload variable that was basically product filters, and it still works:

# Reduced request body without the "specificationFiltersModel7Spikes" section.
# `pagenum` is the loop variable of the surrounding page loop.
payload = {
    "categoryId": "828",
    "manufacturerId": "0",
    "vendorId": "0",
    "priceRangeFilterModel7Spikes": "null",
    "pageNumber": str(pagenum),
    "orderby": "10",
    "viewmode": "grid",
    "pagesize": "21",
    "queryString": "#/pageSize=21&viewMode=grid&orderBy=10&pageNumber=" + str(pagenum),
    "shouldNotStartFromFirstPage": "true",
    "keyword": "",
    "searchCategoryId": "0",
    "searchManufacturerId": "0",
    "searchVendorId": "0",
    "priceFrom": "",
    "priceTo": "",
    "includeSubcategories": "False",
    "searchInProductDescriptions": "False",
    "advancedSearch": "False",
    "isOnSearchPage": "False",
    "inStockFilterModel": {
        "CategoryId": "828",
        "ManufacturerId": "0",
        "VendorId": "0",
        "Id": "1",
        "FilterItemState": "Unchecked"
    }
}

CodePudding user response：

I cannot accurately test this because this domain is blocked in my country, but maybe you could try something like this:

import requests
import bs4 as bs

# URL template; {page} is filled in with the page number on each iteration.
url_base = 'http://www.acihellas.gr/gaming-pontikia#/pageSize=21&viewMode=grid&orderBy=10&pageNumber={page}'
total_pages = 4
products = {}

# range() excludes its upper bound, hence total_pages + 1 to include the last page.
for page in range(1, total_pages + 1):
    url = url_base.format(page=page)
    print(f'Scraping page {page}...')
    res = requests.get(url)

    soup = bs.BeautifulSoup(res.text, 'lxml')
    item = soup.find_all('div', {'class': 'details'})
    # Add items to dictionary
    for i in item:
        name = i.find("h2").text
        print(name)
        # Separate name so the page URL above is not overwritten inside the loop.
        link = i.find("a")['href']
        # NOTE(review): entries are keyed by the tag object itself; keying by
        # name or link may be more convenient -- confirm intent.
        products[i] = {'name': name, 'url': link}

products

The products dictionary should have the name and url of items.