Home > Software design >  Cannot webscrape web page with pagination with python
Cannot webscrape web page with pagination with python

Time:11-07

I am trying to web scrape the links of products in a category of

https://www.acihellas.gr/gaming-pontikia#/

It has 4 pages of products, but for some reason I only get the first one with the following code:

# Browser-like User-Agent header sent with every page request.
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
# Paging/sorting suffix appended to the category URL; the page number is
# appended to it inside the loop.
# NOTE(review): everything after '#' is a URL fragment, which HTTP clients
# do not send to the server, so this suffix presumably has no effect on the
# response that comes back - confirm.
filterprods = '/#/pageSize=21&viewMode=grid&orderBy=10&pageNumber='


# url2get, page_number and urllist are read here but are not defined in this
# snippet; they are expected to come from the surrounding code.
# NOTE: range() excludes its upper bound, so the last page requested is
# page_number - 1.
for itm in range(1,page_number):
    print("Page",itm)
    # Build the page URL: base URL + paging suffix + page number.
    urlget = str(url2get + filterprods + str(itm))
    time.sleep(2)
    # A new session is opened for every page and closed after parsing it.
    ses=requests.Session()
    r = ses.get(urlget, headers=headers)

    if r.status_code == 200:

        Myhtml = r.text

        soup = BeautifulSoup(Myhtml, 'lxml')

        # NOTE(review): attrs is a set literal here, not a dict
        # ({'class': 'item-box'}) - confirm this matches what is intended.
        productlist = soup.find_all('div',attrs = {'class','item-box'})

        for p_item in productlist:

            # Only the first <a> inside each product box is used.
            a = p_item.find('a')
            if a:
                producttitle = a['title']
                productlink = a['href']
                # The href is prefixed with the site root to form the full URL.
                url_item = 'https://acihellas.gr' + productlink
                print(url_item)
                urllist.append(url_item)
                time.sleep(2)
            else:
                pass
        ses.close()

    else:
        # Any non-200 response is only reported; the loop moves on.
        print(r.status_code)

return urllist

The links are joined correctly, but ses.get(url) is not working. I thought that closing the session each time might help, but again nothing changed.

When inspecting the page, I could not find a link to the next page, so I constructed the URL myself with the filterprods variable.

How can we fix this?

thank you

CodePudding user response:

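You didn't provide the URL of the website in your code. Note that everything after `#` in a URL is a fragment, which is never sent to the server, so each of your requests fetches the same first page. You can use the website's API to gather the products instead. Here is some starter code; I leave the parsing to you :)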

import requests

# Endpoint that receives the filter/paging request body as JSON.
url = 'https://www.acihellas.gr/getFilteredProducts'

# Request pages 1 through 4 (range() excludes the upper bound).
for pagenum in range(1, 5):

    # Request body for one page. Only "pageNumber" and the tail of
    # "queryString" change between iterations; everything else is constant.
    payload = {
        "categoryId": "828",
        "manufacturerId": "0",
        "vendorId": "0",
        "priceRangeFilterModel7Spikes": "null",
        "specificationFiltersModel7Spikes": {
            "CategoryId": "828",
            "ManufacturerId": "0",
            "VendorId": "0",
            "SpecificationFilterGroups": [{
                "Id": 998,
                "FilterItems": [{
                    "Id": "25188",
                    "FilterItemState": "Unchecked"
                }, {
                    "Id": "18572",
                    "FilterItemState": "Unchecked"
                }, {
                    "Id": "7361",
                    "FilterItemState": "Unchecked"
                }, {
                    "Id": "7362",
                    "FilterItemState": "Unchecked"
                }, {
                    "Id": "7368",
                    "FilterItemState": "Unchecked"
                }, {
                    "Id": "18060",
                    "FilterItemState": "Unchecked"
                }, {
                    "Id": "19024",
                    "FilterItemState": "Unchecked"
                }, {
                    "Id": "24876",
                    "FilterItemState": "Unchecked"
                }, {
                    "Id": "28037",
                    "FilterItemState": "Unchecked"
                }, {
                    "Id": "23321",
                    "FilterItemState": "Unchecked"
                }]
            }, {
                "Id": 990,
                "FilterItems": [{
                    "Id": "7336",
                    "FilterItemState": "Unchecked"
                }]
            }, {
                "Id": 995,
                "FilterItems": [{
                    "Id": "7350",
                    "FilterItemState": "Unchecked"
                }, {
                    "Id": "7348",
                    "FilterItemState": "Unchecked"
                }, {
                    "Id": "7349",
                    "FilterItemState": "Unchecked"
                }]
            }]
        },
        "pageNumber": str(pagenum),
        "orderby": "10",
        "viewmode": "grid",
        "pagesize": "21",
        # The page number is concatenated onto the query string.
        "queryString": "#/pageSize=21&viewMode=grid&orderBy=10&pageNumber=" + str(pagenum),
        "shouldNotStartFromFirstPage": "true",
        "keyword": "",
        "searchCategoryId": "0",
        "searchManufacturerId": "0",
        "searchVendorId": "0",
        "priceFrom": "",
        "priceTo": "",
        "includeSubcategories": "False",
        "searchInProductDescriptions": "False",
        "advancedSearch": "False",
        "isOnSearchPage": "False",
        "inStockFilterModel": {
            "CategoryId": "828",
            "ManufacturerId": "0",
            "VendorId": "0",
            "Id": "1",
            "FilterItemState": "Unchecked"
        }
    }

    # json= serialises the payload as the JSON request body.
    res = requests.post(url, json=payload)

    # The raw response text is printed as-is; parsing is left to the reader.
    print(res.text)

Note: Your links are relative, so you need to prepend them with the website URL: https://www.acihellas.gr/

EDIT:

To answer the question of whether it is possible to change only the category in the payload: it looks like yes. I removed the entire section of the payload variable that was basically the product filters, and it still works:

# Reduced request body without the "specificationFiltersModel7Spikes"
# section. `pagenum` is read below and must already be in scope (for example
# as the loop variable of the surrounding page loop).
payload = {
        "categoryId": "828",
        "manufacturerId": "0",
        "vendorId": "0",
        "priceRangeFilterModel7Spikes": "null",
        "pageNumber": str(pagenum),
        "orderby": "10",
        "viewmode": "grid",
        "pagesize": "21",
        # The page number is concatenated onto the query string.
        "queryString": "#/pageSize=21&viewMode=grid&orderBy=10&pageNumber=" + str(pagenum),
        "shouldNotStartFromFirstPage": "true",
        "keyword": "",
        "searchCategoryId": "0",
        "searchManufacturerId": "0",
        "searchVendorId": "0",
        "priceFrom": "",
        "priceTo": "",
        "includeSubcategories": "False",
        "searchInProductDescriptions": "False",
        "advancedSearch": "False",
        "isOnSearchPage": "False",
        "inStockFilterModel": {
            "CategoryId": "828",
            "ManufacturerId": "0",
            "VendorId": "0",
            "Id": "1",
            "FilterItemState": "Unchecked"
        }
    }

CodePudding user response:

I cannot accurately test this because this domain is blocked in my country, but maybe you could try something like this:

import requests
import bs4 as bs

# URL template; {page} is filled in with the page number for each request.
# NOTE(review): the paging parameters sit after '#', i.e. in the URL
# fragment, which HTTP clients do not send to the server - so every request
# presumably returns the same page. Confirm against the live site.
url_base = 'http://www.acihellas.gr/gaming-pontikia#/pageSize=21&viewMode=grid&orderBy=10&pageNumber={page}'
total_pages = 4
# Filled in by the loop below with one entry per matched element.
products = {}

# "+ 1" because range() excludes its upper bound; this visits pages
# 1..total_pages inclusive.
for page in range(1, total_pages + 1):
    url = url_base.format(page=page)
    print(f'Scraping page {page}...')
    res = requests.get(url)

    soup = bs.BeautifulSoup(res.text, 'lxml')
    # Every <div class="details"> is treated as one product.
    item = soup.find_all('div', {'class': 'details'})
    # Add items to dictionary
    for i in item:
        name = i.find("h2").text
        print(name)
        # NOTE: this rebinds `url` (the page URL above) to the product link;
        # harmless only because `url` is recomputed at the top of each page.
        url = i.find("a")['href']
        # NOTE(review): the dictionary key is the matched element object
        # itself, not the product name - confirm that is intended.
        products[i] = {'name': name, 'url': url}

# Bare expression: shows the result in a notebook/REPL, no effect in a script.
products

The products dictionary should have the name and url of items.

  • Related