Scrape multiple pages with selenium

Time:06-22

How do I scrape multiple pages with Selenium? I am trying to scrape multiple pages by clicking the pager button, but I get an error. Is there a method you can share with me? This is the page link: https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx

import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

options = webdriver.ChromeOptions()

# options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")

chrome_driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

productlink = []
def supplyvan_scraper():
    with chrome_driver as driver:
        driver.implicitly_wait(15)
        URL = 'https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx'
        driver.get(URL)
        time.sleep(3)
        links = driver.find_elements(By.XPATH, "//div[@class='list-group']//a")
        for link in links:
            link_href = link.get_attribute("href")
            if link_href.startswith("https://www.ifep.ro/"):
                productlink.append(link_href)
              
                
        for k in range(1, 5):
            for product in productlink:
                driver.get(product)
                time.sleep(2)
                title = driver.find_element(By.CSS_SELECTOR, '#HeadingContent_lblTitle').text
                d1 = driver.find_element(By.XPATH, "//div[@class='col-md-10']//p[1]").text
                d1 = d1.strip()
                d2 = driver.find_element(By.XPATH, "//div[@class='col-md-10']//p[2]").text
                d2 = d2.strip()
                d3 = driver.find_element(By.XPATH, "//div[@class='col-md-10']//p[3]//span").text
                d3 = d3.strip()
                d4 = driver.find_element(By.XPATH, "//div[@class='col-md-10']//p[4]").text
                d4 = d4.strip()
                
                WebDriverWait(driver, 10).until(EC.visibility_of_element_located(
                    (By.ID, f"MainContent_PagerTop_NavToPage{k}"))).click()

                print(title, d1, d2, d3, d4)

                # driver.back()
        time.sleep(2)

        driver.quit()


supplyvan_scraper()

CodePudding user response:

You have some errors:

  1. You are clicking the next-page button inside the loop over the product links.
  2. After visiting all the links you need to go back before you can click the next page. If you have visited 15 links, you would have to go back 15 times, or save the listing URL and return to it directly (see the sketch just below this list).
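
For illustration of point 2 only, here is a minimal sketch of the "save the listing URL" variant, reusing driver and productlink from the question's code; it is not a full solution:

# Sketch only: remember the listing URL before visiting the product pages,
# then return to it directly instead of calling driver.back() once per product.
listing_url = driver.current_url              # the lawyerspanel.aspx listing page
for product in productlink:
    driver.get(product)                       # visit one product page
    # ... scrape the detail fields here ...
driver.get(listing_url)                       # jump straight back to the listing
# only now does the pager button exist again, so it can be clicked safely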

Better solution:

  1. Collect all the product links first, clicking through the pager pages only.
  2. Then visit every collected link, scrape the data, and print it (a Selenium sketch of this two-phase approach follows this list).
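
Since the question uses Selenium, a rough sketch of this two-phase approach in Selenium 4 syntax might look like the following. The selectors and the MainContent_PagerTop_NavToPage{n} pager IDs are taken from the question's code, and the assumption that NavToPage{n} jumps to listing page n is untested:

import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

BASE = "https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx"


def scrape_two_phase(driver):
    wait = WebDriverWait(driver, 10)
    driver.get(BASE)

    # Phase 1: walk the pager and collect every product link first.
    product_links = []
    for page_no in range(1, 6):  # first 5 listing pages; adjust as needed
        for link in driver.find_elements(By.XPATH, "//div[@class='list-group']//a"):
            href = link.get_attribute("href")
            if href and href.startswith("https://www.ifep.ro/"):
                product_links.append(href)
        if page_no < 5:
            # assumed: NavToPage{n} is the pager button that jumps to page n
            wait.until(EC.element_to_be_clickable(
                (By.ID, f"MainContent_PagerTop_NavToPage{page_no + 1}"))).click()
            time.sleep(2)  # crude wait for the listing to refresh

    # Phase 2: visit each collected link and scrape the detail page.
    for url in product_links:
        driver.get(url)
        title = driver.find_element(By.ID, "HeadingContent_lblTitle").text
        d1 = driver.find_element(By.XPATH, "//div[@class='col-md-10']//p[1]").text.strip()
        d2 = driver.find_element(By.XPATH, "//div[@class='col-md-10']//p[2]").text.strip()
        d3 = driver.find_element(By.XPATH, "//div[@class='col-md-10']//p[3]//span").text.strip()
        d4 = driver.find_element(By.XPATH, "//div[@class='col-md-10']//p[4]").text.strip()
        print(title, d1, d2, d3, d4)


driver = webdriver.Chrome()  # or reuse the ChromeDriverManager setup from the question
try:
    scrape_two_phase(driver)
finally:
    driver.quit()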

Here is the same approach implemented with Playwright:

import time
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.webkit.launch(headless=False)
    baseurl = "https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx"
    page = browser.new_page()
    page.goto(baseurl)
    productlinks = []
    for k in range(1, 10):
        links = page.query_selector_all("//div[@class='list-group']//a")
        for link in links:
            link_href = link.get_attribute("href")
            if link_href.startswith("LawyerFile.aspx"):
                productlinks.append("https://www.ifep.ro/justice/lawyers/"   link_href)
        page.wait_for_selector("#MainContent_PagerTop_NavNext").click()
        time.sleep(2)  # wait for the next page to load
    for product in productlinks:
        page.goto(product)
        title = page.wait_for_selector('#HeadingContent_lblTitle').text_content()
        d1 = page.wait_for_selector("//div[@class='col-md-10']//p[1]").text_content()
        d1 = d1.strip()
        d2 = page.wait_for_selector("//div[@class='col-md-10']//p[2]").text_content()
        d2 = d2.strip()
        d3 = page.wait_for_selector("//div[@class='col-md-10']//p[3]//span").text_content()
        d3 = d3.strip()
        d4 = page.wait_for_selector("//div[@class='col-md-10']//p[4]").text_content()
        d4 = d4.strip()
        print(title, d1, d2, d3, d4)
    browser.close()

OUTPUT:

ILIE Marius-Constantin, Baroul Ilfov Avocat Definitiv, Baroul Ilfov Dată înscriere: 14-03-2011 ACTIV Instanţe cu drept de concluzii: Toate instanţele
DIN GEORGIANA-CLAUDIA, Baroul Bucureşti Avocat Definitiv, Baroul Bucureşti Dată înscriere: 15-05-2008 ACTIV Instanţe cu drept de concluzii: Toate instanţele
MOLDANSCHI ANDREEA-IOANA, Baroul Bucureşti Avocat Stagiar, Baroul Bucureşti Dată înscriere: 30-05-2022 ACTIV Instanţe cu drept de concluzii: Judecătorii
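
One small follow-up on the Playwright code: the fixed time.sleep(2) after clicking the next-page button is a crude wait. A possible alternative, assuming the pager click triggers a navigation or postback (untested against this site), is to wait for network activity to settle instead:

# Possible replacement for the fixed sleep after clicking the pager button.
# Assumes the click triggers a navigation/postback; untested against the site.
page.wait_for_selector("#MainContent_PagerTop_NavNext").click()
page.wait_for_load_state("networkidle")  # wait until network activity settles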