Hi, I am trying to scrape this site's product names and units by running the website's search function over a list of words.
I tried using the scroll method, however there is a pause after every scroll down. How do I handle this? Since I am scraping a lot of pages, what is the best way to handle the scrolling? I tried using headless Chrome, but that did not work, so here is a ChromeDriverManager setup that opens a window and scrolls. The website is https://www.sayurbox.com/
import time
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

def selenium(url):
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.maximize_window()
    driver.get(url)
    # scroll once to the bottom and wait for the lazy-loaded items
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    driver.close()
    return soup
# scraping components
keywords = ['ayam', 'sabun', 'sayur', 'common']
for item in keywords:
    URL = "https://www.sayurbox.com"
    itemsEncoded = str(item).replace(" ", "%20")  # URL-encode spaces in the search term
    url = f"{URL}/products/s/{itemsEncoded}"
    print(f"{url} start scraping")
    soup = selenium(url)
    # handling for items not found
    found = soup.find_all("span", {"class": "NotFoundMessage__container__title"})
    if found and found[0].text == "Produk tidak ditemukan.":  # "Product not found."
        print('url not found')
        continue
    # if found, scrape the details
    # get product titles
    productTitle = soup.find_all('span', {"class": "ProductItem__container__name"})
    product = [p.text for p in productTitle]
    # get units
    units = soup.find_all('span', {"class": "Product__container__priceWrapper__packDesc"})
    unit = [u.text for u in units]
    # write into a dataframe
    data = {'product': product,
            'unit': unit,
            'date': datetime.date(datetime.now())}
The above code only scrolls once, but there are still more items loading beneath the first scroll.
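In other words, I think I need something like the loop below, but I am not sure how to handle the pause on every scroll efficiently across many pages (the fixed 3-second sleep is just my guess at the site's load time):

def scroll_to_bottom(driver, pause=3):
    # keep scrolling until the page height stops growing,
    # i.e. until no more lazy-loaded items appear
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)  # wait for the next batch of items to load
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height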
CodePudding user response:
Do you need to use Selenium? You can get the data through a POST request and just change the page parameter to get more results; essentially, that is what happens when you scroll. Then change the value parameter to go through your word list as well.
import requests
import pandas as pd

url = 'https://api.sayurbox.io/graphql'
headers = {
    # anonymous session token copied from the browser's network tab; it expires, so refresh it if requests start failing
    'authorization': 'eyJhbGciOiJSUzI1NiIsImtpZCI6ImY4NDY2MjEyMTQxMjQ4NzUxOWJiZjhlYWQ4ZGZiYjM3ODYwMjk5ZDciLCJ0eXAiOiJKV1QifQ.eyJhbm9ueW1vdXMiOnRydWUsImF1ZCI6InNheXVyYm94LWF1ZGllbmNlIiwiYXV0aF90aW1lIjoxNjUwNTUxMDYxLCJleHAiOjE2NTMxNDMwNjEsImlhdCI6MTY1MDU1MTA2MSwiaXNzIjoiaHR0cHM6Ly93d3cuc2F5dXJib3guY29tIiwibWV0YWRhdGEiOnsiZGV2aWNlX2luZm8iOm51bGx9LCJuYW1lIjpudWxsLCJwaWN0dXJlIjpudWxsLCJwcm92aWRlcl9pZCI6ImFub255bW91cyIsInNpZCI6IjFjNDE1ODFiLWQzMjItNDFhZi1hOWE5LWE4YTQ4OTZkODMxZiIsInN1YiI6InFSWXF2OFV2bEFucVR3NlE1NGhfbHdTNFBvTk8iLCJ1c2VyX2lkIjoicVJZcXY4VXZsQW5xVHc2UTU0aF9sd1M0UG9OTyJ9.MSmOz0mAe3UjhH9KSRp-fCk65tkTUPlxiJrRHweDEY2vqBSnUP43TO8ug3P38x8igxC4qguCOlwCTCPfUEWFhr3X8ePY7u7I7D22tV1LOF7Tm6T8PuLzHbmlBTgPK9C_GJpXwLAKnD2A535r-9DttYGt4QytIeWua8NKyW_riURfWGnhZBBMjEPeVPJBqGn1jMtZoh_iUeRb-kWccJ8IhBDQr0T1Op6IDMJuw3x6uf1Ks_SVqEVA0ZGIM1GVwuyZ87JYT4kqITNgi6yNy69jVH6gDFqBkTwJ7ZNWj8NCQsaRfh03bZROZzY9MeCtL6if_8D9newYZagyZu5mKTJNzg',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36'}

rows = []
for page in range(1, 10):
    print(page)
    payload = {
        'operationName': "getCatalogVariant",
        'query': "query getCatalogVariant($deliveryDate: String!, $deliveryArea: String!, $deliveryCode: String, $limit: Int!, $page: Int!, $type: CatalogType, $value: String) {\n catalogVariantList(deliveryDate: $deliveryDate, deliveryArea: $deliveryArea, deliveryCode: $deliveryCode, limit: $limit, page: $page, type: $type, value: $value) {\n limit\n page\n size\n hasNextPage\n category {\n displayName\n }\n list {\n key\n availability\n categories\n farmers {\n image\n name\n }\n image {\n md\n sm\n lg\n }\n isDiscount\n discount\n labelDesc\n labelName\n maxQty\n name\n displayName\n nextAvailableDates\n packDesc\n packNote\n price\n priceFormatted\n actualPrice\n actualPriceFormatted\n shortDesc\n stockAvailable\n type\n emptyMessageHtml\n promoMessageHtml\n }\n }\n}\n",
        'variables': {
            'deliveryArea': "Jabodetabek",
            'deliveryCode': "JK01",
            'deliveryDate': "Friday, 22 April 2022",
            'limit': 12,
            'page': page,
            'type': "SEARCH",
            'value': "ayam"}}
    jsonData = requests.post(url, headers=headers, json=payload).json()
    items = jsonData['data']['catalogVariantList']['list']
    rows += items  # accumulate every page, not just the last one

df = pd.DataFrame(rows)
Output:
print(df)
key ... promoMessageHtml
0 Sreeya Sayap Ayam Frozen 500 gram ... None
1 SunOne Kulit Ayam 1 kg ... None
2 Bundling Ayam & Pisau 1 pack ... Promo!! maksimal 5
3 SunOne Hati Ayam 1 kg ... Hanya tersedia 1
4 Wellfed Daging Ayam Giling 250 gram ... None
.. ... ... ...
103 Frozchick Ayam Bumbu Kecap 400 gram ... Hanya tersedia 5
104 Sasa Larasa Bumbu Ungkep Ayam Kalasan 33 gram ... Promo!! maksimal 5
105 Bundling Indomie Kuah Ayam Bawang 69 gram 5 pcs ... Promo!! maksimal 7
106 Bundling MPASI Dada Ayam 1 pack ... Promo!! maksimal 10
107 Berkah Chicken Paha Bawah Probiotik Organik 55... ... Promo!! maksimal 10
[108 rows x 24 columns]
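To cover your whole word list, one option is to wrap the request in a small helper and stop paging when hasNextPage is false (the helper name is mine; it reuses the url, headers, and payload dict defined above, and hasNextPage is the field already requested in the query):

def search_products(value, max_pages=50):
    # fetch every result page for one search term
    results = []
    for page in range(1, max_pages + 1):
        payload['variables']['value'] = value
        payload['variables']['page'] = page
        jsonData = requests.post(url, headers=headers, json=payload).json()
        catalog = jsonData['data']['catalogVariantList']
        results += catalog['list']
        if not catalog['hasNextPage']:  # no more pages for this term
            break
    return results

all_rows = []
for word in ['ayam', 'sabun', 'sayur', 'common']:
    all_rows += search_products(word)
df = pd.DataFrame(all_rows)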