Selenium scrapes only 1 page when I want to scrape multiple pages


I am trying to scrape multiple pages with Selenium, but my script only scrapes the first page. What mistake am I making, and is there a solution? This is the page link: https://zoekeenadvocaat.advocatenorde.nl/zoeken?q=&type=advocaten&limiet=10&sortering=afstand&filters[rechtsgebieden]=[]&filters[specialisatie]=0&filters[toevoegingen]=0&locatie[adres]=Holland&locatie[geo][lat]=52.132633&locatie[geo][lng]=5.291266&locatie[straal]=56&locatie[hash]=67eb2b8d0aab60ec69666532ff9527c9&weergave=lijst&pagina=1

import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")

chrome_driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)


def supplyvan_scraper():
    with chrome_driver as driver:
        driver.implicitly_wait(15)
        
        URL = 'https://zoekeenadvocaat.advocatenorde.nl/zoeken?qvrtqca=&filters[rechtsgebieden]=[]&ypb=&locatie[adres]=Holland&locatie[geo]={"lat":"52.132633","lng":"5.291266"}&locatie[straal]=56&filters[specialisatie]=0&filters[toevoegingen]=0&locatie[hash]='
        driver.get(URL)
        time.sleep(3)
        page=1
        page_links = [element.get_attribute('href') for element in
                    driver.find_elements(By.XPATH, "//span[@class='h4 no-margin-bottom']//a")]

        data=[]
        for link in page_links:
            wev={}
            driver.get(link)
            time.sleep(2)
            try:
                title = driver.find_element(By.CSS_SELECTOR, '.title h3').text
            except:
                pass
            wev['title']=title
            
            try:
                advocaten=driver.find_element(By.CSS_SELECTOR,".secondary").text
            except:
                pass
            
            wev['advocaten']=advocaten
            
            details=driver.find_elements(By.XPATH,"//section[@class='lawyer-info']")
            for detail in details:
                
                try:
                    address=detail.find_element_by_xpath("//div[@class='column medium-6']").text.strip()
                except:
                    pass
                wev['address']=address
                try:
                    email=detail.find_element(By.XPATH, "//div[@class='row'][3]//div[@class='column small-9']//a").get_attribute('href')
                except:
                    pass
                wev['email']=email
                try:
                    website=detail.find_element(By.XPATH, "//div[@class='row'][4]//div[@class='column small-9']//a").get_attribute('href')
                except:
                    pass
                
                wev['website']=website
                
                data.append(wev)
                
            if len(driver.find_elements_by_xpath("//a[@class='button next']")) > 0:
                url = "https://zoekeenadvocaat.advocatenorde.nl/zoeken?q=&type=advocaten&limiet=10&sortering=afstand&filters[rechtsgebieden]=[]&filters[specialisatie]=0&filters[toevoegingen]=0&locatie[adres]=Holland&locatie[geo][lat]=52.132633&locatie[geo][lng]=5.291266&locatie[straal]=56&locatie[hash]=67eb2b8d0aab60ec69666532ff9527c9&weergave=lijst&pagina={}".format(page)
                driver.get(url)
                page  = 1
                if int(page)>5:
                    break
                else:
                    break           
    df=pd.DataFrame(data)
    print(df)

CodePudding user response:

The pagination block in your code never advances: page stays at 1 (it is even reset to 1 after loading the next url), and both branches of the final if/else break out of the loop, so only the first page is ever scraped. Instead, you can build the page number into the start URL and iterate over it with a for loop:

import time
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")

chrome_driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options)

data=[]
def supplyvan_scraper():
    with chrome_driver as driver:
        driver.implicitly_wait(15)
        
        URL = 'https://zoekeenadvocaat.advocatenorde.nl/zoeken?q=&type=advocaten&limiet=10&sortering=afstand&filters[rechtsgebieden]=[]&filters[specialisatie]=0&filters[toevoegingen]=0&locatie[adres]=Holland&locatie[geo][lat]=52.132633&locatie[geo][lng]=5.291266&locatie[straal]=56&locatie[hash]=67eb2b8d0aab60ec69666532ff9527c9&weergave=lijst&pagina={page}'
        for page in range(1,11):

            driver.get(URL.format(page=page))
            time.sleep(3)
        
            page_links = [element.get_attribute('href') for element in driver.find_elements(By.XPATH, "//span[@class='h4 no-margin-bottom']//a")]

            
            for link in page_links:
                wev = {}
                driver.get(link)
                time.sleep(2)

                # Look up each field separately so that one missing element
                # does not abort the whole record; store None when absent.
                try:
                    wev['title'] = driver.find_element(By.CSS_SELECTOR, '.title h3').text
                except NoSuchElementException:
                    wev['title'] = None

                try:
                    wev['advocaten'] = driver.find_element(By.CSS_SELECTOR, ".secondary").text
                except NoSuchElementException:
                    wev['advocaten'] = None

                details = driver.find_elements(By.XPATH, "//section[@class='lawyer-info']")
                for detail in details:
                    # Use relative XPath (".//") so the search is scoped to
                    # this section instead of the whole page.
                    try:
                        wev['address'] = detail.find_element(By.XPATH, ".//div[@class='column medium-6']").text.strip()
                    except NoSuchElementException:
                        wev['address'] = None
                    try:
                        wev['email'] = detail.find_element(By.XPATH, ".//div[@class='row'][3]//div[@class='column small-9']//a").get_attribute('href')
                    except NoSuchElementException:
                        wev['email'] = None
                    try:
                        wev['website'] = detail.find_element(By.XPATH, ".//div[@class='row'][4]//div[@class='column small-9']//a").get_attribute('href')
                    except NoSuchElementException:
                        wev['website'] = None

                    data.append(wev)
                

supplyvan_scraper()

df = pd.DataFrame(data)
print(df)
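Since expected_conditions and WebDriverWait are already imported, you could also replace the fixed time.sleep() pauses with explicit waits, which continue as soon as the results are present. A minimal sketch, reusing the listing selector from the code above:

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

wait = WebDriverWait(driver, 15)  # give each page up to 15 seconds

# Block until at least one result link is present, then collect the hrefs.
elements = wait.until(EC.presence_of_all_elements_located(
    (By.XPATH, "//span[@class='h4 no-margin-bottom']//a")))
page_links = [element.get_attribute('href') for element in elements]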

You can also try formatting the URL into a separate variable first:

URL = 'https://zoekeenadvocaat.advocatenorde.nl/zoeken?q=&type=advocaten&limiet=10&sortering=afstand&filters[rechtsgebieden]=[]&filters[specialisatie]=0&filters[toevoegingen]=0&locatie[adres]=Holland&locatie[geo][lat]=52.132633&locatie[geo][lng]=5.291266&locatie[straal]=56&locatie[hash]=67eb2b8d0aab60ec69666532ff9527c9&weergave=lijst&pagina={page}'

for page in range(1, 11):
    url = URL.format(page=page)
    driver.get(url)
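If you would rather not hard-code the number of pages, another option is to keep following the "next" link that your original code was checking for. A rough sketch, assuming the pager exposes an a element with class "button next" until the last page:

while True:
    # ... scrape the current results page here ...

    next_buttons = driver.find_elements(By.XPATH, "//a[@class='button next']")
    if not next_buttons:
        break  # the last page has no "next" link
    driver.get(next_buttons[0].get_attribute('href'))
    time.sleep(3)

This stops automatically on the last page instead of relying on a fixed range(1, 11).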