Selenium fails to scroll down-CodePudding

I am using Selenium to scrape data from here. The website uses animations to reveal each section as you scroll down. I am trying to scroll down to the footer and wait for the animations to finish so that I can get the data from the page.

I am not sure this is the only approach that would get me the data, though: as far as I can see, the animation only adds the class aos-animate to the section's main element, and if that class is not on the HTML element, Selenium won't return its text!

In the get_service_data function, I am trying to scroll down to the end of the page. I tried to scroll down before I start the loop.

I tried:

# Attempt 1: jump to the bottom of the document with JavaScript.
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# Attempt 2: send END / PAGE_DOWN key presses to an element.
# NOTE(review): By.CLASS_NAME looks for class="html", not the <html> tag;
# By.TAG_NAME was probably intended - confirm.
html = driver.find_element(By.CLASS_NAME, 'html')
html.send_keys(Keys.END)
html.send_keys(Keys. PAGE_DOWN)

# Attempt 3: scroll the element with class "copyright" into view.
copyright = driver.find_element(By.CLASS_NAME, 'copyright')
driver.execute_script("arguments[0].scrollIntoView();", copyright)

Here is my full script:

import os 
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.keys import Keys


# Language segment used to build the start URL.
language = "en" # TODO: take this from the user
main_link = f"https://www.atlp.ae/{language}"

# Expects a "chromedriver" binary in the current working directory.
driver_path = os.path.join(os.getcwd(), "chromedriver")
# Optional headless setup, currently disabled:
# options = webdriver.ChromeOptions()
# options.headless = True

# Module-level browser session used by get_services_links / get_service_data.
# NOTE(review): passing the driver path positionally looks like the legacy
# Selenium call style; newer releases expect a Service object - confirm
# against the installed Selenium version.
driver = webdriver.Chrome(driver_path) #  options=options
driver.maximize_window()

def get_services_links():
    """Collect the service page URLs listed in the site's services menu.

    Loads ``main_link``, clicks the services button in the fixed header and
    reads the ``href`` of the ``<a>`` found inside every ``<li>`` of the menu.

    Returns:
        The collected hrefs with the final menu entry left out; an empty
        list when the menu has no entries.
    """
    driver.get(main_link)

    # Click the services button in the header before reading the menu.
    header_button = driver.find_element(
        By.XPATH,
        '//*[@id="fixed-header"]/div/div[2]/div/nav/ul/li[5]/button',
    )
    header_button.click()

    menu = driver.find_element(By.XPATH, '//*[@id="serviceInfotitle"]/nav/ul')
    hrefs = [
        item.find_element(By.TAG_NAME, "a").get_attribute("href")
        for item in menu.find_elements(By.TAG_NAME, "li")
    ]
    # Slicing an empty list already yields [], so no emptiness check is needed.
    return hrefs[:-1]


def get_service_data(link):
    """Print the service name and each sub-service title found on ``link``.

    Args:
        link: URL of a service page to load in the shared ``driver``.
    """
    driver.get(link)
    waiter = WebDriverWait(driver, 10)  # wait up to 10 seconds per condition

    # Service name: wait until the intro paragraph is visible, then read it.
    name_locator = (
        By.XPATH,
        '//*[@id="main-scrollbar"]/div[1]/main/sc-placeholder/app-intro-section/section/div/div[1]/div[1]/div/p',
    )
    waiter.until(EC.visibility_of_element_located(name_locator))
    print("Service Name: ", driver.find_element(*name_locator).text)

    # Sub-service rows: wrapper -> container -> every serviceSubsetRow.
    wrapper_locator = (By.CLASS_NAME, 'ServiceSubsetWrapper')
    waiter.until(EC.visibility_of_element_located(wrapper_locator))
    rows = (
        driver.find_element(*wrapper_locator)
        .find_element(By.CLASS_NAME, 'container')
        .find_elements(By.CLASS_NAME, 'serviceSubsetRow')
    )

    for row in rows:
        title = (
            row.find_element(By.CLASS_NAME, 'textCol')
            .find_element(By.CLASS_NAME, 'serviceSubsetTitle')
        )
        print("Text: ", title.text)
  

if __name__ == '__main__':
    # Always close the browser, even when scraping raises part-way through;
    # otherwise the Chrome/chromedriver processes are left running.
    try:
        links = get_services_links()
        for link in links:
            get_service_data(link)
            break  # only the first service page is processed for now
    finally:
        driver.quit()

CodePudding user response：

What you need is something.get_attribute('innerText'): perhaps because of the added animation, the regular .text property does not return the text.

Also, I have removed a few lines as I thought they were not needed (at least for this exercise). I have directly added a loop to make it work with serviceSubsetTitle

def get_service_data(link):
    """Print the service name and every sub-service title on ``link``.

    Titles are read through the ``innerText`` attribute rather than
    ``.text``.

    Args:
        link: URL of the service page to load in the shared ``driver``.
    """
    # Imported locally so the snippet needs no extra file-level import.
    from selenium.common.exceptions import WebDriverException

    driver.get(link)
    wait = WebDriverWait(driver, 10)

    service_name_xpath = '//*[@id="main-scrollbar"]/div[1]/main/sc-placeholder/app-intro-section/section/div/div[1]/div[1]/div/p'
    wait.until(EC.visibility_of_element_located((By.XPATH, service_name_xpath)))
    service_name = driver.find_element(By.XPATH, service_name_xpath).text
    print("Service Name: ", service_name)

    # The ServiceSubsetWrapper -> container -> serviceSubsetRow traversal of
    # the question's version was removed; titles are located directly below.

    # Click the cookie acceptance button if it is there (best effort).
    # Only Selenium errors are swallowed, so Ctrl+C and real bugs still surface.
    try:
        driver.find_element(By.XPATH, "//*[@class='cc-btn cc-allow']").click()
    except WebDriverException:
        print("nothing there")

    # Match every element whose class attribute contains 'serviceSubsetTitle'.
    somethings = driver.find_elements(By.XPATH, "//*[contains(@class, 'serviceSubsetTitle')]")
    print(len(somethings))
    for something in somethings:
        title_txt = something.get_attribute('innerText')
        print(title_txt)

here is the output:

Service Name:  Sea Services
5
Vessel Management and Marine Services
Passenger Handling and Cargo Operations
Issuance of Certificates and Approvals in Ports
Ports Licensing
Property Leasing Services - Ports

Process finished with exit code 0

CodePudding user response：

This is one way of scrolling that page down:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

# Chrome start-up flags: sandbox off, notifications disabled, 1280x720 window.
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1280,720")

webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)

url = 'https://www.atlp.ae/en'
browser.get(url) 
# Scroll the window down by 100 pixels.
browser.execute_script('window.scrollBy(0, 100);')
# Wait up to 20 seconds for the "deny cookies" link to be clickable, then click it.
cookie_b = WebDriverWait(browser, 20).until(EC.element_to_be_clickable((By.XPATH, "//a[@aria-label='deny cookies']")))
cookie_b.click()
# Wait up to 20 seconds for the element with id "main-scrollbar" to be present.
body = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.ID, "main-scrollbar")))
# Click it, then send the END key to it.
# NOTE(review): the click presumably gives the element focus so the key press
# reaches it - confirm.
body.click()
body.send_keys(Keys.END)
print('scrolled down')

Setup is chrome/chromedriver on linux, however it can be adapted to your system, just observe the imports, and the code after defining the browser/driver. Selenium docs: https://www.selenium.dev/documentation/