I've been playing around learning how to create web scrapers using Selenium. One thing I'm struggling with is scraping pages with pagination. I've written a script that i thought would scrape every page
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import getpass
import datetime
import pandas as pd
custom_options = webdriver.ChromeOptions()
custom_options.add_experimental_option('prefs', {'intl.accept_languages': 'en,en_US'})
driver = webdriver.Chrome(ChromeDriverManager().install(), options=custom_options)
driver.get("https://lr.caa.cz/letecky-rejstrik?lang=en")
data =[]
while(True):
try:
table_body = WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.TAG_NAME, "tbody")))
table_body_rows = table_body.find_elements_by_tag_name("tr")
button = WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.XPATH, '/html/body/app-root/div/main/div/div/app-avreg-list/nav/div/app-pagination/div/a[3]/i')))
for i in table_body_rows:
row_data = []
table_data = i.find_elements_by_tag_name("td")
for j in table_data:
row_data.append(j.text.strip())
data.append(row_data)
button.click()
except:
break
df = pd.DataFrame(data)
print(df)
driver.quit()
It scrapes the first page but then it doesn't seem to go beyond that. This the result i get:
0 1 2 3
0 Glider MDM-1 FOX OK-1213
1 Glider MDM-1 FOX OK-7801
2 Glider A 15 OK-7906
3 Powered glider SZD-45A OK-6902
4 Powered glider SZD-45A OK-8903
5 Hot-air balloon AB OK-9004
6 Hot-air balloon AB OK-4012
7 Hot-air balloon AB OK-4014
8 Hot-air balloon AB OK-7006
9 Hot-air balloon AB OK-7004
10 None None None
I've looked at the xpath of the pagination button on the site and it seems correct in the script.
Any thoughts on what might be wrong?
CodePudding user response:
Instead of presence_of_element_located()
use element_to_be_clickable()
and following css selector or xpath to identify the element.
button = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'app-pagination a:nth-of-type(3)')))
OR
button = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, "(//app-pagination//a)[3]")))