Scraping iteratively selected html data items inside each option in dropdown list


I am trying to scrape a few items from an HTML page. The options have to be selected from a dropdown list one by one, iterating over them, but I am always getting the items from the first option in the dropdown. I guess it is because my click function is not working properly. How can I iterate through all the options and select the items to build the data?

import pandas as pd
from selenium import webdriver
import re
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
service = Service("/home/ubuntu/selenium_drivers/chromedriver")

base_url = 'https://www.crave.ca/en/tv-shows/16-and-pregnant'
page_one = True
options = webdriver.ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome(service=service, options=options)
driver.get(base_url)
driver.find_element(By.XPATH,'//*[@id="dropdown-basic"]').click()
time.sleep(5)
total_seasons = driver.find_elements(By.CSS_SELECTOR,'button.dropdown-item')
driver.find_element(By.XPATH,'//*[@id="dropdown-basic"]').click()
print(len(total_seasons))
d=[]
for i in range(0,len(total_seasons)):
    alleps = driver.find_elements(By.XPATH,'//*[@id="episodes"]/div/ul/li')
    for j in range(1, len(alleps) + 1):

        d.append({
            
            'Duration ': driver.find_element(By.XPATH,f'//*[@id="episodes"]/div/ul/li[{j}]/div[1]/div[2]/span/span[1]').text,
            'Episode_Number ': j,
            'Episode_Synopsis ': driver.find_element(By.XPATH,f'//*[@id="episodes"]/div/ul/li[{j}]/div[1]/div[2]/p').text,
            'Episode_Title ': re.sub(r'[^a-zA-Z ]+','',driver.find_element(By.XPATH,f'//*[@id="episodes"]/div/ul/li[{j}]/div[1]/div[2]/h3').text).strip(),
            
        })
data = pd.DataFrame.from_dict(d)

CodePudding user response:

You are clicking this element twice:

driver.find_element(By.XPATH,'//*[@id="dropdown-basic"]').click()

So you are opening the drop-down menu and then closing it again; you never select the other seasons.
To fix this, first scrape the Season 1 data that is already displayed, then iterate over the other seasons, selecting each one in turn and scraping its data.
Your code can be something like this:

import pandas as pd
from selenium import webdriver
import re
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
service = Service("/home/ubuntu/selenium_drivers/chromedriver")

base_url = 'https://www.crave.ca/en/tv-shows/16-and-pregnant'
page_one = True
options = webdriver.ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome(service=service, options=options)
driver.get(base_url)
driver.find_element(By.XPATH,'//*[@id="dropdown-basic"]').click()
time.sleep(1)
total_seasons = driver.find_elements(By.CSS_SELECTOR,'button.dropdown-item')
driver.find_element(By.XPATH,'//*[@id="dropdown-basic"]').click()
print(len(total_seasons))
d=[]
for i in range(len(total_seasons)):
    alleps = driver.find_elements(By.XPATH,'//*[@id="episodes"]/div/ul/li')
    for j in range(1, len(alleps) + 1):

        d.append({
            
            'Duration ': driver.find_element(By.XPATH,f'//*[@id="episodes"]/div/ul/li[{j}]/div[1]/div[2]/span/span[1]').text,
            'Episode_Number ': j,
            'Episode_Synopsis ': driver.find_element(By.XPATH,f'//*[@id="episodes"]/div/ul/li[{j}]/div[1]/div[2]/p').text,
            'Episode_Title ': re.sub(r'[^a-zA-Z ]+','',driver.find_element(By.XPATH,f'//*[@id="episodes"]/div/ul/li[{j}]/div[1]/div[2]/h3').text).strip(),
            
        })
    # After scraping the currently displayed season, open the dropdown and
    # switch to the next one (assuming the items are ordered Season 1, 2, ...).
    if i + 1 < len(total_seasons):
        driver.find_element(By.XPATH,'//*[@id="dropdown-basic"]').click()
        seasons = driver.find_elements(By.CSS_SELECTOR,'button.dropdown-item')
        seasons[i + 1].click()
        time.sleep(1)

data = pd.DataFrame.from_dict(d)
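
As a further refinement (not required for the fix above), the fixed time.sleep() pauses can be replaced with Selenium's explicit waits, so the script only blocks until the dropdown and its options are actually ready. A minimal sketch, reusing the dropdown-basic id and button.dropdown-item selector from the code above:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 10)  # wait at most 10 seconds

# open the season dropdown once it is clickable
wait.until(EC.element_to_be_clickable((By.ID, "dropdown-basic"))).click()

# wait until the dropdown options have been rendered before collecting them
total_seasons = wait.until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, "button.dropdown-item"))
)

Once the loop has finished, data.to_csv('episodes.csv', index=False) would save the scraped rows, and driver.quit() closes the browser.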