I am trying to web-scrape multiple pages. My code works well for just page one, but when I use a loop to scrape, for example, the first 5 pages, I get the error below:
TimeoutException: Message: Stacktrace: Backtrace:
My code is below:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import pandas as pd
from bs4 import BeautifulSoup
import requests as r
import time
from selenium.webdriver.support.ui import Select

PATH = "chromedriver.exe"
driver = webdriver.Chrome(PATH)

_list = []

for page_num in range(1, 3):
    url = f"https://valuebuds.com/pages/search-results-page?tab=products&page={page_num}"
    driver.get(url)

    Select(WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, "select#year_field")))).select_by_visible_text('1999')
    driver.find_element_by_class_name("agree").click()

    title = driver.find_elements_by_class_name("snize-overhidden")
    for j in title:
        Pro = j.find_element_by_class_name("snize-title").text
        Price = j.find_element_by_class_name("snize-price-list").text
        Desc = j.find_element_by_class_name("snize-description").text
        prec_item = {
            "Product": Pro,
            "Price": Price,
            "Description": Desc,
        }
        _list.append(prec_item)

df = pd.DataFrame(_list)
df.to_csv("Value Buds HTML Pricing.csv")
print("saved to file.")
Please advise! Thanks in advance.
CodePudding user response:
The code block
Select(WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "select#year_field")))).select_by_visible_text('1999')
driver.find_element_by_class_name("agree").click()
is only relevant when you land on the page for the first time. Once you have selected the year and clicked the Agree button, you can browse all the result pages without selecting the year again. That is why the explicit wait for select#year_field times out on the second and later pages.
So, your code could be something like this:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import pandas as pd
from bs4 import BeautifulSoup
import requests as r
import time
from selenium.webdriver.support.ui import Select

PATH = "chromedriver.exe"
driver = webdriver.Chrome(PATH)

_list = []

for page_num in range(1, 3):
    url = f"https://valuebuds.com/pages/search-results-page?tab=products&page={page_num}"
    driver.get(url)

    if page_num == 1:
        # The year selection and Agree click are only needed on the first visit
        Select(WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "select#year_field")))).select_by_visible_text('1999')
        driver.find_element_by_class_name("agree").click()
    else:
        # Give subsequent pages a moment to load before scraping
        time.sleep(2)

    title = driver.find_elements_by_class_name("snize-overhidden")
    for j in title:
        Pro = j.find_element_by_class_name("snize-title").text
        Price = j.find_element_by_class_name("snize-price-list").text
        Desc = j.find_element_by_class_name("snize-description").text
        prec_item = {
            "Product": Pro,
            "Price": Price,
            "Description": Desc,
        }
        _list.append(prec_item)

df = pd.DataFrame(_list)
df.to_csv("Value Buds HTML Pricing.csv")
print("saved to file.")
I have added a delay on the non-first iterations so the page can load before you scrape its data.
It would be better to use an Expected Conditions explicit wait there instead. I was not sure which condition fits best, so I have left that decision to you.
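For example, a minimal sketch of such a wait, assuming the result tiles keep the snize-overhidden class used above, would replace the time.sleep(2) in the else branch:

# Wait up to 20 seconds until at least one product tile is in the DOM
WebDriverWait(driver, 20).until(
    EC.presence_of_all_elements_located((By.CLASS_NAME, "snize-overhidden"))
)
title = driver.find_elements_by_class_name("snize-overhidden")

presence_of_all_elements_located succeeds as soon as at least one matching element is present; visibility_of_all_elements_located is a stricter alternative. Note also that the find_element_by_* / find_elements_by_* helpers are deprecated in Selenium 4 and removed in 4.3+; the current equivalent is driver.find_elements(By.CLASS_NAME, "snize-overhidden").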