Home > Back-end >  selenium trying to scrape multiple pages on website but get error when going to next page
selenium trying to scrape multiple pages on website but get error when going to next page

Time:10-23

I am quite new to Selenium.

My current code looks like this:

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
        
# Code as posted in the question: search myntra.com for "jordan", then walk
# the first five result pages collecting product links whose URL contains
# "shoe". It works on the first page and fails after "Next" is clicked
# (see the traceback below).
shoe_links = []
page = 1
path = r"C:\Users\redacted\OneDrive\Desktop\chrome driver\chromedriver.exe"
driver = webdriver.Chrome(path)
driver.get("https://www.myntra.com")
search = driver.find_element_by_class_name("desktop-searchBar")
search.send_keys("jordan")
search.send_keys(Keys.RETURN)
try:
    # Wait (up to 10 s) for the page's root container to be present.
    mountRoot = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "mountRoot"))
    )
    while page != 6:
        # The XPath starts with "//", so it is resolved against the whole
        # document rather than relative to mountRoot.
        item = mountRoot.find_element(By.XPATH, "//*[@id='desktopSearchResults']/div[2]/section/ul")
        liclass = item.find_elements_by_class_name("product-base")
        for i in range(len(liclass)):
            print(i)
            # This is the line the traceback points at on the second page.
            print(liclass[i].text)
            if 'shoe' in liclass[i].find_element_by_tag_name('a').get_attribute("href").lower():
                shoe_links.append(liclass[i].find_element_by_tag_name('a').get_attribute("href"))
            print("\n")
        # Prints every link collected so far, not only this page's links.
        for i in range(len(shoe_links)):
            print(shoe_links[i])
        next_link = mountRoot.find_element_by_link_text("Next")
        next_link.click()
        # Nothing waits for the next page here before the loop re-reads
        # the product list.
        page = page + 1

finally:
    # Always close the browser, even when scraping fails.
    driver.quit()

This code works for the first page, but once it goes to the next page it gives me the following error:

C:\Users\redacted\AppData\Local\Programs\Python\Python39\lib\site-packages\selenium\webdriver\remote\webelement.py:264: UserWarning: find_element_by_* commands are deprecated. Please use find_element() instead
  warnings.warn("find_element_by_* commands are deprecated. Please use find_element() instead")
0Traceback (most recent call last):
  File "C:/Users/redacted/OneDrive/Desktop/Myntra.py", line 27, in <module>
    print(liclass[i].text)
  File "C:\Users\redacted\AppData\Local\Programs\Python\Python39\Lib\site-packages\selenium\webdriver\remote\webelement.py", line 76, in text
    return self._execute(Command.GET_ELEMENT_TEXT)['value']
  File "C:\Users\redacted\AppData\Local\Programs\Python\Python39\Lib\site-packages\selenium\webdriver\remote\webelement.py", line 693, in _execute
    return self._parent.execute(command, params)
  File "C:\Users\redacted\AppData\Local\Programs\Python\Python39\Lib\site-packages\selenium\webdriver\remote\webdriver.py", line 418, in execute
    self.error_handler.check_response(response)
  File "C:\Users\redacted\AppData\Local\Programs\Python\Python39\Lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 243, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
  (Session info: chrome=95.0.4638.54)
Stacktrace:
Backtrace:
    Ordinal0 [0x0083B9D3 2472403]
    Ordinal0 [0x007D6961 2058593]
    Ordinal0 [0x006E22B8 1057464]
    Ordinal0 [0x006E49E4 1067492]
    Ordinal0 [0x006E48AE 1067182]
    Ordinal0 [0x006E4B10 1067792]
    Ordinal0 [0x00708140 1212736]
    Ordinal0 [0x00725593 1332627]
    Ordinal0 [0x00703236 1192502]
    Ordinal0 [0x0072567A 1332858]
    Ordinal0 [0x0073495F 1395039]
    Ordinal0 [0x0072545B 1332315]
    Ordinal0 [0x00701FB4 1187764]
    Ordinal0 [0x00702E09 1191433]
    GetHandleVerifier [0x009C3396 1551078]
    GetHandleVerifier [0x00A6D53A 2247818]
    GetHandleVerifier [0x008C7FBB 521995]
    GetHandleVerifier [0x008C7079 518089]
    Ordinal0 [0x007DBD4D 2080077]
    Ordinal0 [0x007DFC18 2096152]
    Ordinal0 [0x007DFD52 2096466]
    Ordinal0 [0x007E94B1 2135217]
    BaseThreadInitThunk [0x757DFA29 25]
    RtlGetAppContainerNamedObjectPath [0x775B7A9E 286]
    RtlGetAppContainerNamedObjectPath [0x775B7A6E 238]

The inspect-element view that I have been using to guide me is: https://i.stack.imgur.com/TOv1F.png

I think the problem comes from the for loop (I also tried a while loop and got the same error).

What is the problem with my code? Any help would be greatly appreciated.

CodePudding user response:

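After you click "Next", the loop immediately re-reads the product list, most likely before the page has finished updating, so the elements it finds are detached from the document by the time their text is read (hence the StaleElementReferenceException). Wait until the loading spinner has appeared and is gone before scraping the next page: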

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
        
# Search myntra.com for "jordan", walk the first five result pages, and
# collect every product link whose URL contains "shoe". After each click on
# "Next" the script waits for the loading spinner to appear and disappear so
# that the product list is not read while the page is still being replaced.
shoe_links = []
page = 1
path = r"C:\Users\redacted\OneDrive\Desktop\chrome driver\chromedriver.exe"
# NOTE(review): passing the driver path positionally is deprecated in
# Selenium 4 in favour of a Service object -- confirm against the installed
# Selenium version.
driver = webdriver.Chrome(path)
try:
    # Navigation and the search happen inside the try block so the browser
    # is closed even if one of these steps fails.
    driver.get("https://www.myntra.com")
    search = driver.find_element(By.CLASS_NAME, "desktop-searchBar")
    search.send_keys("jordan")
    search.send_keys(Keys.RETURN)

    # Wait (up to 10 s) for the page's root container to be present.
    mountRoot = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "mountRoot"))
    )
    while page != 6:
        # Look the result list up again on every page: references obtained
        # on a previous page are no longer valid. The XPath starts with
        # "//", so it is resolved against the whole document.
        item = mountRoot.find_element(
            By.XPATH, "//*[@id='desktopSearchResults']/div[2]/section/ul"
        )
        products = item.find_elements(By.CLASS_NAME, "product-base")
        for i, product in enumerate(products):
            print(i)
            print(product.text)
            # Fetch the link once instead of querying the element twice.
            href = product.find_element(By.TAG_NAME, "a").get_attribute("href")
            if "shoe" in href.lower():
                shoe_links.append(href)
            print("\n")
        # Prints every link collected so far, not only this page's links.
        for link in shoe_links:
            print(link)
        next_link = mountRoot.find_element(By.LINK_TEXT, "Next")
        next_link.click()
        page = page + 1
        # Wait for the spinner to show up and then vanish; only after that
        # is the next page's product list safe to read.
        WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.CLASS_NAME, "spinner-spinner"))
        )
        WebDriverWait(driver, 10).until(
            EC.invisibility_of_element_located((By.CLASS_NAME, "spinner-spinner"))
        )

finally:
    # Always close the browser, even when scraping fails.
    driver.quit()
  • Related