I am trying to scrape all of the perfumes which are located at https://www.fragrantica.com/search/
There are almost 73,367 perfumes on the site and I want to load all of them. The problem is the site shows 30 perfumes and then you need to click 'show more results' which will show additional 30 perfumes and so on.. so basically we must press the show more button almost 2444 times to reach the end of the page and have all perfumes loaded. This is my code so far:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import time
options = Options()
options.add_argument("--profile-directory=Default")
options.add_argument('--disable-blink-features=AutomationControlled')
options.add_argument("start-maximized")
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36")
driver = webdriver.Chrome(service = Service(executable_path='C:/Users/armon/Downloads/chromedriver_win32/chromedriver.exe'), options=options)
url = 'https://www.fragrantica.com/'
driver.get(url)
time.sleep(3)
perfumes_btn = driver.find_element(by=By.XPATH, value = '//*[@id="offCanvasLeft"]/ul/li[5]/a')
perfumes_btn.click()
search_btn = WebDriverWait(driver,20).until(EC.element_to_be_clickable((By.XPATH,'/html/body/div[2]/div[2]/div[2]/ul/li[5]/ul/li[1]/a')))
search_btn.click()
load_more_btn = '//*[@id="main-content"]/div[1]/div[1]/div/div/div/div[2]/div[1]/div/div[3]/div/div/div/div/div/button'
loadingButton = WebDriverWait(driver,70).until(EC.element_to_be_clickable((By.XPATH,load_more_btn)))
maxPerfumes = 73346;
i = 0
while loadingButton:
driver.execute_script("arguments[0].click();", loadingButton)
i = i 1
print(i)
loadingButton = WebDriverWait(driver,70).until(EC.element_to_be_clickable((By.XPATH,load_more_btn)))
loadElems = driver.find_elements(by = By.XPATH, value = '//*[@id="main-content"]/div[1]/div[1]/div/div/div/div[2]/div[1]/div/div[3]/div/div/div/span/div[1]')
if len(loadElems)>0:
loadingButton = WebDriverWait(driver,70).until(EC.element_to_be_clickable((By.XPATH,load_more_btn)))
PerfumesLoaded = len(driver.find_elements(by = By.XPATH, value = '//*[@id="main-content"]/div[1]/div[1]/div/div/div/div[2]/div[1]/div/div[3]/div/div/div/span/div[1]'))
else:
print("Loaded all the tires")
break
if PerfumesLoaded >= maxPerfumes:
print (PerfumesLoaded " are loaded successfully.")
break
The problem is each time I run it, it only makes maximum of 34 loops and then stops and throws and error:
TimeoutException Traceback (most recent call last)
c:\Users\armon\OneDrive\Desktop\OLD\Data Analytics\Portfolio Projects\Jolse Project\Jolse Scraping Notebook.ipynb Cell 1' in <cell line: 36>()
41 loadElems = driver.find_elements(by = By.XPATH, value = '//*[@id="main-content"]/div[1]/div[1]/div/div/div/div[2]/div[1]/div/div[3]/div/div/div/span/div[1]')
42 if len(loadElems)>0:
---> 43 loadingButton = WebDriverWait(driver,70).until(EC.element_to_be_clickable((By.XPATH,load_more_btn)))
44 PerfumesLoaded = len(driver.find_elements(by = By.XPATH, value = '//*[@id="main-content"]/div[1]/div[1]/div/div/div/div[2]/div[1]/div/div[3]/div/div/div/span/div[1]'))
45 else:
File c:\Users\armon\anaconda3\envs\armoniaenv\lib\site-packages\selenium\webdriver\support\wait.py:87, in WebDriverWait.until(self, method, message)
85 if time.monotonic() > end_time:
86 break
---> 87 raise TimeoutException(message, screen, stacktrace)
TimeoutException: Message:
The print(i) that I have in the loop is simply to know how many loops it made.. any suggestions to how tweak it? or if I am doing something wrong?
My goal is to click load more until I can't no more, and then I can access all the perfumes that are on the page. TIA
CodePudding user response:
I solve it using try-except when finding elements. You can also change a bit your code to get results only one time after the while loop ends
try:
loadingButton = WebDriverWait(driver,70).until(EC.element_to_be_clickable((By.XPATH,load_more_btn)))
except Exception as e:
print("cant locate element: %s" %e)
loadingButton = False
#My code to solve this
more = True
while more:
more = my_custom_function("//XX")
#my_custom_function try to get element by Xpath
#and also click it, on exception it will return False
if more:
#Need sleep cause website change when click load more
# python code can run faster than website loads, can do sleep inside my_custom_function also
sleep(1)