Hi, I am trying to scrape this site's product names and units by running the website's search function over a list of words.
I tried using the scroll method, however there is a pause after every scroll down. How do I handle this? Since I am scraping a lot of pages, what is the best way to handle the scrolling? I tried using headless Chrome, but that did not work, so here is a ChromeDriverManager setup that opens a window and scrolls. The website is https://www.sayurbox.com/
import time
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

def selenium(url):
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.maximize_window()
    driver.get(url)
    # scroll once to the bottom and wait for the lazy-loaded items
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    driver.close()
    return soup
# scraping components
keywords = ['ayam', 'sabun', 'sayur', 'common']
for item in keywords:
    URL = "https://www.sayurbox.com"
    itemsEncoded = str(item).replace(" ", "%20")  # URL-encode spaces in the search term
    url = f"{URL}/products/s/{itemsEncoded}"
    print(f"{url} start scraping")
    soup = selenium(url)
    # handling for items not found
    found = soup.find_all("span", {"class": "NotFoundMessage__container__title"})
    if found and found[0].text == "Produk tidak ditemukan.":  # "Product not found."
        print('url not found')
        continue
    # if found, scrape the details
    # get product titles
    productTitle = soup.find_all('span', {"class": "ProductItem__container__name"})
    product = [p.text for p in productTitle]
    # get units
    units = soup.find_all('span', {"class": "Product__container__priceWrapper__packDesc"})
    unit = [u.text for u in units]
    # write into a dataframe
    data = {'product': product,
            'unit': unit,
            'date': datetime.date(datetime.now())}
The above code only scrolls once, but there are still more items loading beneath the first scroll.
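In other words, I think I need something like the loop below, but I am not sure how to handle the pause on every scroll efficiently across many pages (the fixed 3-second sleep is just my guess at the site's load time):

def scroll_to_bottom(driver, pause=3):
    # keep scrolling until the page height stops growing,
    # i.e. until no more lazy-loaded items appear
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)  # wait for the next batch of items to load
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height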
CodePudding user response:
Do you need to use Selenium? You can get the data through a POST request and just change the page parameter to get more results; essentially, that is what happens when you scroll. Then change the value parameter to go through your word list as well.
import requests
import pandas as pd

url = 'https://api.sayurbox.io/graphql'
headers = {
    # anonymous session token copied from the browser's network tab; it expires, so refresh it if requests start failing
    'authorization': 'eyJhbGciOiJSUzI1NiIsImtpZCI6ImY4NDY2MjEyMTQxMjQ4NzUxOWJiZjhlYWQ4ZGZiYjM3ODYwMjk5ZDciLCJ0eXAiOiJKV1QifQ.eyJhbm9ueW1vdXMiOnRydWUsImF1ZCI6InNheXVyYm94LWF1ZGllbmNlIiwiYXV0aF90aW1lIjoxNjUwNTUxMDYxLCJleHAiOjE2NTMxNDMwNjEsImlhdCI6MTY1MDU1MTA2MSwiaXNzIjoiaHR0cHM6Ly93d3cuc2F5dXJib3guY29tIiwibWV0YWRhdGEiOnsiZGV2aWNlX2luZm8iOm51bGx9LCJuYW1lIjpudWxsLCJwaWN0dXJlIjpudWxsLCJwcm92aWRlcl9pZCI6ImFub255bW91cyIsInNpZCI6IjFjNDE1ODFiLWQzMjItNDFhZi1hOWE5LWE4YTQ4OTZkODMxZiIsInN1YiI6InFSWXF2OFV2bEFucVR3NlE1NGhfbHdTNFBvTk8iLCJ1c2VyX2lkIjoicVJZcXY4VXZsQW5xVHc2UTU0aF9sd1M0UG9OTyJ9.MSmOz0mAe3UjhH9KSRp-fCk65tkTUPlxiJrRHweDEY2vqBSnUP43TO8ug3P38x8igxC4qguCOlwCTCPfUEWFhr3X8ePY7u7I7D22tV1LOF7Tm6T8PuLzHbmlBTgPK9C_GJpXwLAKnD2A535r-9DttYGt4QytIeWua8NKyW_riURfWGnhZBBMjEPeVPJBqGn1jMtZoh_iUeRb-kWccJ8IhBDQr0T1Op6IDMJuw3x6uf1Ks_SVqEVA0ZGIM1GVwuyZ87JYT4kqITNgi6yNy69jVH6gDFqBkTwJ7ZNWj8NCQsaRfh03bZROZzY9MeCtL6if_8D9newYZagyZu5mKTJNzg',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36'}

rows = []
for page in range(1, 10):
    print(page)
    payload = {
        'operationName': "getCatalogVariant",
        'query': "query getCatalogVariant($deliveryDate: String!, $deliveryArea: String!, $deliveryCode: String, $limit: Int!, $page: Int!, $type: CatalogType, $value: String) {\n catalogVariantList(deliveryDate: $deliveryDate, deliveryArea: $deliveryArea, deliveryCode: $deliveryCode, limit: $limit, page: $page, type: $type, value: $value) {\n limit\n page\n size\n hasNextPage\n category {\n displayName\n }\n list {\n key\n availability\n categories\n farmers {\n image\n name\n }\n image {\n md\n sm\n lg\n }\n isDiscount\n discount\n labelDesc\n labelName\n maxQty\n name\n displayName\n nextAvailableDates\n packDesc\n packNote\n price\n priceFormatted\n actualPrice\n actualPriceFormatted\n shortDesc\n stockAvailable\n type\n emptyMessageHtml\n promoMessageHtml\n }\n }\n}\n",
        'variables': {
            'deliveryArea': "Jabodetabek",
            'deliveryCode': "JK01",
            'deliveryDate': "Friday, 22 April 2022",
            'limit': 12,
            'page': page,
            'type': "SEARCH",
            'value': "ayam"}}
    jsonData = requests.post(url, headers=headers, json=payload).json()
    items = jsonData['data']['catalogVariantList']['list']
    rows += items  # accumulate every page, not just the last one

df = pd.DataFrame(rows)
Output:
print(df)
key ... promoMessageHtml
0 Sreeya Sayap Ayam Frozen 500 gram ... None
1 SunOne Kulit Ayam 1 kg ... None
2 Bundling Ayam & Pisau 1 pack ... Promo!! maksimal 5
3 SunOne Hati Ayam 1 kg ... Hanya tersedia 1
4 Wellfed Daging Ayam Giling 250 gram ... None
.. ... ... ...
103 Frozchick Ayam Bumbu Kecap 400 gram ... Hanya tersedia 5
104 Sasa Larasa Bumbu Ungkep Ayam Kalasan 33 gram ... Promo!! maksimal 5
105 Bundling Indomie Kuah Ayam Bawang 69 gram 5 pcs ... Promo!! maksimal 7
106 Bundling MPASI Dada Ayam 1 pack ... Promo!! maksimal 10
107 Berkah Chicken Paha Bawah Probiotik Organik 55... ... Promo!! maksimal 10
[108 rows x 24 columns]
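To cover your whole word list, one option is to wrap the request in a small helper and stop paging when hasNextPage is false (the helper name is mine; it reuses the url, headers, and payload dict defined above, and hasNextPage is the field already requested in the query):

def search_products(value, max_pages=50):
    # fetch every result page for one search term
    results = []
    for page in range(1, max_pages + 1):
        payload['variables']['value'] = value
        payload['variables']['page'] = page
        jsonData = requests.post(url, headers=headers, json=payload).json()
        catalog = jsonData['data']['catalogVariantList']
        results += catalog['list']
        if not catalog['hasNextPage']:  # no more pages for this term
            break
    return results

all_rows = []
for word in ['ayam', 'sabun', 'sayur', 'common']:
    all_rows += search_products(word)
df = pd.DataFrame(all_rows)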