Try to scrape title using selenium


I am trying to scrape titles: the script should go inside every link and scrape the title on each page, but it shows me an error.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from time import sleep

PATH="C:\Program Files (x86)\chromedriver.exe"
url='https://www.supplyvan.com/power-tools/cordless-powertools/cordless-drills.html'
driver =webdriver.Chrome(PATH)
wait = WebDriverWait(driver, 20)
driver.get(url)
list_button = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "h4.card-title"))).click()
title=driver.find_element_by_xpath('h1').text()
print(title)

CodePudding user response:

You need to change the selector to get the h1 tag text. In your code, the XPath 'h1' does not locate the heading, and .text is a property rather than a method, so calling .text() raises an error.

In this snippet, the scraper visits the first link and prints its title:

# click the single link
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "h4.card-title"))).click()
time.sleep(2)

# parse the h1 tag text
title = driver.find_element(By.CSS_SELECTOR, 'h1.productView-title').text
print(title)
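
If the product page is slow to load, the fixed sleep can be swapped for an explicit wait on the same heading. A minimal sketch, assuming the h1.productView-title selector above:

# wait up to 20 seconds for the product title instead of sleeping a fixed time
title = WebDriverWait(driver, 20).until(
    EC.visibility_of_element_located((By.CSS_SELECTOR, "h1.productView-title"))
).text
print(title)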

To make the driver visit every link and scrape each title:

# parse all the links
page_links = [element.get_attribute('href') for element in
              driver.find_elements(By.CSS_SELECTOR, "h4.card-title > a")]

# visit all the links
for link in page_links:
    driver.get(link)
    time.sleep(2)
    title = driver.find_element(By.CSS_SELECTOR, 'h1.productView-title').text

    # parse title for all the links
    print(title)
    time.sleep(2)
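
Collecting the href values up front matters because the card elements go stale once the driver navigates away from the listing page. If a product page happens to lack the expected heading, the loop above would raise NoSuchElementException; a minimal, optional variation that skips such pages and gathers the rest into a (hypothetical) titles list:

from selenium.common.exceptions import NoSuchElementException

titles = []
for link in page_links:
    driver.get(link)
    time.sleep(2)
    try:
        # same selector as above; skip pages that do not expose it
        titles.append(driver.find_element(By.CSS_SELECTOR, 'h1.productView-title').text)
    except NoSuchElementException:
        print(f"No product title found on {link}")

print(titles)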

Full code with both snippets (opt #1 and opt #2) included -


import time

from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

options = webdriver.ChromeOptions()

# options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")

chrome_driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)


def supplyvan_scraper():
    with chrome_driver as driver:
        driver.implicitly_wait(15)
        URL = 'https://www.supplyvan.com/power-tools/cordless-powertools/cordless-drills.html'
        driver.get(URL)
        time.sleep(3)

        # opt #1 visit first link, print the title uncomment to see
        # click the single link
        # WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "h4.card-title"))).click()
        # time.sleep(2)
        #
        # # parse the h1 tag text
        # title = driver.find_element(By.CSS_SELECTOR, 'h1.productView-title').text
        # print(title)

        # opt #2 visit all links, print titles
        # parse all the links
        page_links = [element.get_attribute('href') for element in
                      driver.find_elements(By.CSS_SELECTOR, "h4.card-title > a")]

        # visit all the links
        for link in page_links:
            driver.get(link)
            time.sleep(2)
            title = driver.find_element(By.CSS_SELECTOR, 'h1.productView-title').text

            # parse title for all the links
            print(title)
            # driver.back()
            time.sleep(2)

        time.sleep(2)
        driver.quit()


supplyvan_scraper()
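
Note: the script above assumes the selenium and webdriver-manager packages are installed. webdriver-manager downloads a matching ChromeDriver automatically, so the hard-coded chromedriver path from the question is not needed.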

Output for all the visited links -

Bosch Professional Cordless Drill, GSR-120-Li, 12V, Blue/Black
Makita LXT Cordless Drill Driver, DDF481RTJ, 18V, 13MM
Bosch Cordless Drill, GSR-1000, 10.8V
.....
