I am using Selenium to scrape data from here. The website is using some animation to show the sections after your scroll down. I am trying to scroll down to the footer and wait for the animation to get the data from the page.
Although I am not sure if that's the only approach that get me the data, cause I can see that the animation is only adding class aos-animate
to the main class, and if that class is not in the HTML element, it wont get the text!
In the get_service_data
function, I am trying to scroll down to the end of the page. I tried to scroll down before I start the loop.
I tried:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
html = driver.find_element(By.CLASS_NAME, 'html')
html.send_keys(Keys.END)
html.send_keys(Keys. PAGE_DOWN)
copyright = driver.find_element(By.CLASS_NAME, 'copyright')
driver.execute_script("arguments[0].scrollIntoView();", copyright)
Here is my full script:
import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.keys import Keys
language = "en" # to take this from the user
main_link = f"https://www.atlp.ae/{language}"
driver_path = os.path.join(os.getcwd(), "chromedriver")
# options = webdriver.ChromeOptions()
# options.headless = True
driver = webdriver.Chrome(driver_path) # options=options
driver.maximize_window()
def get_services_links():
links = []
driver.get(main_link)
services_header_xpath = '//*[@id="fixed-header"]/div/div[2]/div/nav/ul/li[5]/button'
driver.find_element(By.XPATH, services_header_xpath).click()
services_menu_xpath = '//*[@id="serviceInfotitle"]/nav/ul'
services_menu = driver.find_element(By.XPATH, services_menu_xpath)
options = services_menu.find_elements(By.TAG_NAME ,"li")
for option in options:
a_tag = option.find_element(By.TAG_NAME ,"a")
links.append(a_tag.get_attribute("href"))
return links[:-1] if len(links) > 0 else []
def get_service_data(link):
driver.get(link)
wait = WebDriverWait(driver, 10)
service_name_xpath = '//*[@id="main-scrollbar"]/div[1]/main/sc-placeholder/app-intro-section/section/div/div[1]/div[1]/div/p'
wait.until(EC.visibility_of_element_located((By.XPATH,service_name_xpath)))
service_name = driver.find_element(By.XPATH, service_name_xpath).text
print("Service Name: ", service_name)
# row serviceSubsetRow ng-star-inserted
wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'ServiceSubsetWrapper')))
services_wrapper = driver.find_element(By.CLASS_NAME, 'ServiceSubsetWrapper')
container = services_wrapper.find_element(By.CLASS_NAME, 'container')
service_sections = container.find_elements(By.CLASS_NAME, 'serviceSubsetRow')
for service in service_sections:
textual_div = service.find_element(By.CLASS_NAME, 'textCol')
something = textual_div.find_element(By.CLASS_NAME, 'serviceSubsetTitle')
print("Text: ", something.text)
if __name__ == '__main__':
# try:
links = get_services_links()
for link in links:
get_service_data(link)
break
driver.quit()
CodePudding user response:
What you need is this:
something.get_attribute('innerText')
because, perhaps, due to the added animation, the regular text
is not working.
Also, I have removed a few lines as I thought they were not needed (at least for this exercise). I have directly added a loop to make it work with serviceSubsetTitle
def get_service_data(link):
driver.get(link)
wait = WebDriverWait(driver, 10)
service_name_xpath = '//*[@id="main-scrollbar"]/div[1]/main/sc-placeholder/app-intro-section/section/div/div[1]/div[1]/div/p'
wait.until(EC.visibility_of_element_located((By.XPATH, service_name_xpath)))
service_name = driver.find_element(By.XPATH, service_name_xpath).text
print("Service Name: ", service_name)
# ---- removed these lines --------
# row serviceSubsetRow ng-star-inserted
# wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'ServiceSubsetWrapper')))
# services_wrapper = driver.find_element(By.CLASS_NAME, 'ServiceSubsetWrapper')
#
# container = services_wrapper.find_element(By.CLASS_NAME, 'container')
# service_sections = container.find_elements(By.CLASS_NAME, 'serviceSubsetRow')
# ----- End of lines removal ----------
# Clicking out the cookie acceptance button
try:
driver.find_element(By.XPATH, "//*[@class='cc-btn cc-allow']").click()
except:
print("nothing there")
# --- removed these lines
# for service in service_sections:
# textual_div = service.find_element(By.CLASS_NAME, 'textCol')
# time.sleep(3)
# --- end of lines removal ---------
# These are my lines here from below:
somethings = driver.find_elements(By.XPATH, "//*[contains(@class, 'serviceSubsetTitle')]")
print(len(somethings))
for something in somethings:
# time.sleep(2)
title_txt = something.get_attribute('innerText')
print(title_txt)
here is the output:
Service Name: Sea Services
5
Vessel Management and Marine Services
Passenger Handling and Cargo Operations
Issuance of Certificates and Approvals in Ports
Ports Licensing
Property Leasing Services - Ports
Process finished with exit code 0
CodePudding user response:
This is one way of scrolling that page down:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1280,720")
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
url = 'https://www.atlp.ae/en'
browser.get(url)
browser.execute_script('window.scrollBy(0, 100);')
cookie_b = WebDriverWait(browser, 20).until(EC.element_to_be_clickable((By.XPATH, "//a[@aria-label='deny cookies']")))
cookie_b.click()
body = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.ID, "main-scrollbar")))
body.click()
body.send_keys(Keys.END)
print('scrolled down')
Setup is chrome/chromedriver on linux, however it can be adapted to your system, just observe the imports, and the code after defining the browser/driver. Selenium docs: https://www.selenium.dev/documentation/