I am trying to scrape multiple pages with Selenium, but my script only ever scrapes the first page. What am I doing wrong, and is there a solution? This is the page link: https://zoekeenadvocaat.advocatenorde.nl/zoeken?q=&type=advocaten&limiet=10&sortering=afstand&filters[rechtsgebieden]=[]&filters[specialisatie]=0&filters[toevoegingen]=0&locatie[adres]=Holland&locatie[geo][lat]=52.132633&locatie[geo][lng]=5.291266&locatie[straal]=56&locatie[hash]=67eb2b8d0aab60ec69666532ff9527c9&weergave=lijst&pagina=1
import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")

chrome_driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

def supplyvan_scraper():
    with chrome_driver as driver:
        driver.implicitly_wait(15)
        URL = 'https://zoekeenadvocaat.advocatenorde.nl/zoeken?qvrtqca=&filters[rechtsgebieden]=[]&ypb=&locatie[adres]=Holland&locatie[geo]={"lat":"52.132633","lng":"5.291266"}&locatie[straal]=56&filters[specialisatie]=0&filters[toevoegingen]=0&locatie[hash]='
        driver.get(URL)
        time.sleep(3)
        page = 1
        page_links = [element.get_attribute('href') for element in
                      driver.find_elements(By.XPATH, "//span[@class='h4 no-margin-bottom']//a")]
        data = []
        for link in page_links:
            wev = {}
            driver.get(link)
            time.sleep(2)
            try:
                title = driver.find_element(By.CSS_SELECTOR, '.title h3').text
            except:
                pass
            wev['title'] = title
            try:
                advocaten = driver.find_element(By.CSS_SELECTOR, ".secondary").text
            except:
                pass
            wev['advocaten'] = advocaten
            details = driver.find_elements(By.XPATH, "//section[@class='lawyer-info']")
            for detail in details:
                try:
                    address = detail.find_element_by_xpath("//div[@class='column medium-6']").text.strip()
                except:
                    pass
                wev['address'] = address
                try:
                    email = detail.find_element(By.XPATH, "//div[@class='row'][3]//div[@class='column small-9']//a").get_attribute('href')
                except:
                    pass
                wev['email'] = email
                try:
                    website = detail.find_element(By.XPATH, "//div[@class='row'][4]//div[@class='column small-9']//a").get_attribute('href')
                except:
                    pass
                wev['website'] = website
            data.append(wev)

        if len(driver.find_elements_by_xpath("//a[@class='button next']")) > 0:
            url = "https://zoekeenadvocaat.advocatenorde.nl/zoeken?q=&type=advocaten&limiet=10&sortering=afstand&filters[rechtsgebieden]=[]&filters[specialisatie]=0&filters[toevoegingen]=0&locatie[adres]=Holland&locatie[geo][lat]=52.132633&locatie[geo][lng]=5.291266&locatie[straal]=56&locatie[hash]=67eb2b8d0aab60ec69666532ff9527c9&weergave=lijst&pagina={}".format(page)
            driver.get(url)
            page = 1
            if int(page) > 5:
                break
        else:
            break

        df = pd.DataFrame(data)
        print(df)
CodePudding user response:
You can handle the pagination by building the page number into the starting URL and iterating over it with a for loop, as follows:
import time
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")

chrome_driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

data = []

def supplyvan_scraper():
    with chrome_driver as driver:
        driver.implicitly_wait(15)
        # The page number is an ordinary URL parameter (pagina=...), so each
        # results page can be requested directly.
        URL = 'https://zoekeenadvocaat.advocatenorde.nl/zoeken?q=&type=advocaten&limiet=10&sortering=afstand&filters[rechtsgebieden]=[]&filters[specialisatie]=0&filters[toevoegingen]=0&locatie[adres]=Holland&locatie[geo][lat]=52.132633&locatie[geo][lng]=5.291266&locatie[straal]=56&locatie[hash]=67eb2b8d0aab60ec69666532ff9527c9&weergave=lijst&pagina={page}'
        for page in range(1, 11):
            driver.get(URL.format(page=page))
            time.sleep(3)
            # Collect the detail-page links from the current results page.
            page_links = [element.get_attribute('href') for element in
                          driver.find_elements(By.XPATH, "//span[@class='h4 no-margin-bottom']//a")]
            for link in page_links:
                wev = {}
                driver.get(link)
                time.sleep(2)
                # Assign a default on failure so the later dict assignment
                # cannot raise a NameError when an element is missing.
                try:
                    title = driver.find_element(By.CSS_SELECTOR, '.title h3').text
                except NoSuchElementException:
                    title = None
                wev['title'] = title
                try:
                    advocaten = driver.find_element(By.CSS_SELECTOR, ".secondary").text
                except NoSuchElementException:
                    advocaten = None
                wev['advocaten'] = advocaten
                details = driver.find_elements(By.XPATH, "//section[@class='lawyer-info']")
                for detail in details:
                    # Note the leading "." in these XPaths: without it, the
                    # search runs over the whole document instead of within
                    # the lawyer-info section.
                    try:
                        address = detail.find_element(By.XPATH, ".//div[@class='column medium-6']").text.strip()
                    except NoSuchElementException:
                        address = None
                    wev['address'] = address
                    try:
                        email = detail.find_element(By.XPATH, ".//div[@class='row'][3]//div[@class='column small-9']//a").get_attribute('href')
                    except NoSuchElementException:
                        email = None
                    wev['email'] = email
                    try:
                        website = detail.find_element(By.XPATH, ".//div[@class='row'][4]//div[@class='column small-9']//a").get_attribute('href')
                    except NoSuchElementException:
                        website = None
                    wev['website'] = website
                data.append(wev)

        df = pd.DataFrame(data)
        print(df)

supplyvan_scraper()
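If you want to avoid the fixed time.sleep() pauses, the WebDriverWait and expected_conditions imports from your original snippet can be put to use. A minimal sketch, using the same result-link XPath as above (the get_page_links helper is mine, not part of the original code):

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

def get_page_links(driver, timeout=15):
    # Hypothetical helper: block until at least one result link is present,
    # then collect all hrefs, instead of sleeping a fixed 3 seconds.
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located(
            (By.XPATH, "//span[@class='h4 no-margin-bottom']//a")))
    return [el.get_attribute('href') for el in
            driver.find_elements(By.XPATH, "//span[@class='h4 no-margin-bottom']//a")]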
You can also build each page URL explicitly inside the loop:

URL = 'https://zoekeenadvocaat.advocatenorde.nl/zoeken?q=&type=advocaten&limiet=10&sortering=afstand&filters[rechtsgebieden]=[]&filters[specialisatie]=0&filters[toevoegingen]=0&locatie[adres]=Holland&locatie[geo][lat]=52.132633&locatie[geo][lng]=5.291266&locatie[straal]=56&locatie[hash]=67eb2b8d0aab60ec69666532ff9527c9&weergave=lijst&pagina={page}'

for page in range(1, 11):
    url = URL.format(page=page)
    driver.get(url)
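If the total number of pages is not known up front, the //a[@class='button next'] check from your own code can drive the loop instead of a hard-coded range(1, 11). A sketch, reusing driver, URL, and the scraping body from the answer above:

page = 1
while True:
    driver.get(URL.format(page=page))
    time.sleep(3)
    # ... scrape the current results page here, as in the loop body above ...
    # find_elements returns an empty list rather than raising, so this is a
    # safe existence check for the "next" button.
    if not driver.find_elements(By.XPATH, "//a[@class='button next']"):
        break  # no "next" button: this was the last results page
    page += 1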