Home > Software design >  Already complete scraping scrapes everything on the page. I would like to limit the scraping to only
Already complete scraping scrapes everything on the page. I would like to limit the scraping to only

Time:12-03

Below is the code of a complete, properly working scraper that I wrote. It successfully scrapes all elements on the page.

However, I would like to scrape only a small, limited section of the page, extracting the same elements as the full scrape. This limited section is already scraped correctly along with all the other elements of the page, but I would like to scrape only that section and not the whole page. The section is shown in the linked image (enter image description here).

import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Start a Firefox session, load the page, and let element lookups poll for
# up to 12 seconds before failing.
driver = webdriver.Firefox()
driver.get("url")
driver.implicitly_wait(12)
#driver.minimize_window()

wait = WebDriverWait(driver, 10)


# Round headers and match rows, matched by class-name prefix.
all_rows = driver.find_elements(By.CSS_SELECTOR, "div[class^='event__round'],div[class^='event__match']")

current_round = '?'

# One record per match:
# [round, date, time, home team, away team, home score, away score].
# The records are collected in a list of their own; the previous version
# rebound the loop variable to the record and appended the record to itself,
# which produced a self-referencing list and kept no results.
bundesliga = []

for row in all_rows:
    classes = row.get_attribute('class')

    if 'event__round' in classes:
        # Keep only the last space-separated token, e.g. `20` from `Round 20`.
        current_round = row.text.split(" ")[-1]
        continue

    kickoff = row.find_element(By.CSS_SELECTOR, "[class^='event__time']")

    # Split the date from the time; the text must contain exactly one space.
    match_date, match_time = kickoff.text.split(" ")
    match_date = match_date.rstrip('.')  # drop the trailing `.` of the date

    team_home = row.find_element(By.CSS_SELECTOR, "[class^='event__participant event__participant--home']")
    team_away = row.find_element(By.CSS_SELECTOR, "[class^='event__participant event__participant--away']")
    score_home = row.find_element(By.CSS_SELECTOR, "[class^='event__score event__score--home']")
    score_away = row.find_element(By.CSS_SELECTOR, "[class^='event__score event__score--away']")

    record = [
        current_round,
        match_date,
        match_time,
        team_home.text,
        team_away.text,
        score_home.text,
        score_away.text,
    ]
    bundesliga.append(record)
    print(record)

CodePudding user response:

I think all you need to do is limit the all_rows variable. One way to do this is to find the tab you are looking for by its text and then walk up to its parent elements.

import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# Start a Firefox session and load the target page.
driver = webdriver.Firefox()
driver.get("https://www.someurl/some/other/page")
# Implicit wait: element lookups on this driver poll for up to 12 seconds before failing.
driver.implicitly_wait(12)
#driver.minimize_window()

# Explicit-wait helper with a 10-second timeout.
# NOTE(review): not referenced in the code shown here; the lookup further down builds its own WebDriverWait.
wait = WebDriverWait(driver, 10)


# all_rows = driver.find_elements(By.CSS_SELECTOR, "div[class^='event__round'],div[class^='event__match']")
############### UPDATE ####################
def parent_element(element):
    """Return the direct parent of *element*, found via the relative XPath './..'."""
    parent_xpath = './..'
    return element.find_element(By.XPATH, parent_xpath)
# Wait (up to 10 seconds) until the div whose text is exactly 'Programma' is visible.
programma_locator = (By.XPATH, "//div[text()='Programma']")
programma_element = WebDriverWait(driver, 10).until(
    EC.visibility_of_element_located(programma_locator)
)

# Climb three levels up from that div; the resulting ancestor is the
# container searched for round/match rows below.
programma_element_p3 = programma_element
for _ in range(3):
    programma_element_p3 = parent_element(programma_element_p3)

all_rows = programma_element_p3.find_elements(
    By.CSS_SELECTOR,
    "div[class^='event__round'],div[class^='event__match']",
)

# Keep rows up to and including the first one flagged 'event__match--last'.
filter_rows = []
for row in all_rows:
    filter_rows.append(row)
    if "event__match--last" in row.get_attribute('class'):
        break

############### UPDATE ####################
current_round = '?'

# One record per match:
# [round, date, time, home team, away team, home score, away score].
# The records are collected in a list of their own; the previous version
# rebound the loop variable to the record and appended the record to itself,
# which produced a self-referencing list and kept no results.
bundesliga = []


def _text_or_dash(row, selector):
    """Return the text of the first element in *row* matching *selector*.

    Returns "-" when the lookup raises TimeoutException or
    NoSuchElementException, i.e. when the row has no such element.
    """
    try:
        return row.find_element(By.CSS_SELECTOR, selector).text
    except (TimeoutException, NoSuchElementException):
        return "-"


for row in filter_rows:
    classes = row.get_attribute('class')

    if 'event__round' in classes:
        # Keep only the last space-separated token, e.g. `20` from `Round 20`.
        current_round = row.text.split(" ")[-1]
        continue

    kickoff = row.find_element(By.CSS_SELECTOR, "[class^='event__time']")

    # Split the date from the time; the text must contain exactly one space.
    match_date, match_time = kickoff.text.split(" ")
    match_date = match_date.rstrip('.')  # drop the trailing `.` of the date

    team_home = row.find_element(By.CSS_SELECTOR, "[class^='event__participant event__participant--home']")
    team_away = row.find_element(By.CSS_SELECTOR, "[class^='event__participant event__participant--away']")

    # Score elements may be absent from a row; fall back to "-" in that case.
    # NOTE(review): the driver is configured with an implicit wait, so each
    # missing score lookup presumably blocks for that long before raising.
    # Confirm, and consider lowering the implicit wait while scraping.
    score_home = _text_or_dash(row, "[class^='event__score event__score--home']")
    score_away = _text_or_dash(row, "[class^='event__score event__score--away']")

    record = [
        current_round,
        match_date,
        match_time,
        team_home.text,
        team_away.text,
        score_home,
        score_away,
    ]
    bundesliga.append(record)
    print(record)
  • Related