Home > Back-end >  Selenium webdriver errors and crashing
Selenium webdriver errors and crashing

Time:03-27

I am building a webscraper to aquire a bunch of baseball data, I am 99% sure that the code that I wrote works, I have tested it all seperatley and it should get the data taht I want. However, I have not been able to run it all the way through yet without giving me a webdriver error like this:

WebDriverException                        Traceback (most recent call last)
c:\Users\jense\VSCODE\BR-selenium-scrape.py in find_plyr_links_pit(self)
     104             try:
---> 105                 WebDriverWait(self.driver, 5).until(
     106                     EC.presence_of_element_located((By.TAG_NAME, "a")))

It does not always stop at the same point, sometimes it gets all the way to grabbing player data and then stops, sometimes it doesnt get past etting league links. Here is my code, maybe there is something wrong or extremely innefficient about it, I am brand new to selenium.

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
    
class scrape_br():

    def __init__(self):
        ser = Service("/path/to/my/Chromedriver.exe")
        op = webdriver.ChromeOptions()
        self.driver = webdriver.Chrome(service=ser, options=op)

    def get_league_hist(self, lg_href):
        self.lg_home = self.driver.get(lg_href)
        table = self.driver.find_element(By.ID, "div_lg_history")
        tbody = table.find_element(By.TAG_NAME, "tbody")
        tr_list = tbody.find_elements(By.TAG_NAME, 'tr')
        tags = []
        for row in tr_list:
            tags.append(row.find_element(By.TAG_NAME, "a"))
        yr_list = []
        for link in tags:
            yr_list.append(link.get_attribute("href"))
        yr_list = [str(i) for i in yr_list]
        tm_list = []
        for yr in yr_list:
            self.driver.get(yr)
            tm_list.append(self.find_tm_links())
        for i,yr in enumerate(tm_list):
            for j, team in enumerate(yr):
                tm_list[i][j] = team.split(',')
        plyr_list = []
        for tm in tm_list:
            for player in tm:
                for player_link in player:
                    self.driver.get(player_link)
                    plyr_list.append(self.find_plyr_links_bat())
                    plyr_list.append(self.find_plyr_links_pit())
        plyr_data = []
        for team in plyr_list:
            for player in team:
                self.driver.get(player)
                try:
                    plyr_data.append(self.find_bat_tables())
                except:
                    plyr_data.append(self.find_pitch_tables())
        return plyr_data

    def find_tm_links(self):
        for i in range(1):
            try:
                table = self.driver.find_element(By.ID, "div_standings_pitching")
            except:
                table = self.driver.find_element(By.ID, "regular_season")
        tbody = table.find_element(By.TAG_NAME, "tbody")
        tr_list = tbody.find_elements(By.TAG_NAME, 'tr')
        tags = []
        for row in tr_list:
            try:
                tags.append(row.find_elements(By.TAG_NAME, "a"))
            except:
                print("find_tm_links error, could not get a tags from the tr's")
        tm_list = []
        for link in tags:
            for i in link:
                try:
                    tm_list.append(i.get_attribute("href"))
                except:
                    print("error getting a-ref attribute from find_tm_links")
        return tm_list

    def find_plyr_links_bat(self):
        table = self.driver.find_element(By.ID, "team_batting")
        tbody = table.find_element(By.TAG_NAME, "tbody")
        tr_list = tbody.find_elements(By.TAG_NAME, 'tr')
        a_tags = []
        for row in tr_list:
            try:
             a_tags.append(row.find_elements(By.TAG_NAME, "a"))
            except:
                print("Could not get player links in find_plyr_links_bat")
        player_hrefs = []
        for nested_tag in a_tags:
            for tag in nested_tag:
                try:
                    player_hrefs.append(tag.get_attribute("href"))
                except:
                    print("could not successfully implement find_plyr_links_bat")
        return player_hrefs

    def find_plyr_links_pit(self):
        table = self.driver.find_element(By.ID, "team_pitching")
        tbody = table.find_element(By.TAG_NAME, "tbody")
        tr_list = tbody.find_elements(By.TAG_NAME, 'td')
        tags = []
        for row in tr_list:
            try: 
                WebDriverWait(self.driver, 5).until(
                    EC.presence_of_element_located((By.TAG_NAME, "a")))
            except:
                tags.append(row.find_element(By.TAG_NAME, "a"))
        tm_list = []
        for link in tags:
                tm_list.append(link.get_attribute("href"))
        return tm_list

    def find_bat_tables(self):
        x = self.bat_txt = self.driver.find_element(By.ID, "div_standard_batting")
        y = x.get_attribute("innerHTML")
        z = pd.read_html(y)
        return z[0]

    def find_pitch_tables(self):
        x = self.bat_txt = self.driver.find_element(By.ID, "div_standard_pitching")
        y = x.get_attribute("innerHTML")
        z = pd.read_html(y)
        return z[0]

#%% test 
lg = scrape_br()
nwds_hist = lg.get_league_hist("https://www.baseball-reference.com/register/league.cgi?code=NWDS&class=Smr")

CodePudding user response:

import requests
from bs4 import BeautifulSoup, Comment
import pandas as pd
import re

url = 'https://www.baseball-reference.com/register/league.cgi?code=NWDS&class=Smr'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'}

# Get links
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
yearLinks = soup.find_all('th', {'data-stat':'year_ID'})

links = {}
for year in yearLinks:
    if year.find('a', href=True):
        links[year.text] = 'https://www.baseball-reference.com'   year.find('a', href=True)['href']

final_df = {'batting':[], 'pitching':[]}
for year, link in links.items():
    print(year)
    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    if soup.find_all('th', {'data-stat':'team_ID'}):
        team_links = soup.find_all('th', {'data-stat':'team_ID'})
        
    else:
        team_links = []
        comments = soup.find_all(string=lambda text: isinstance(text, Comment))
        for each in comments:
            if 'th' in str(each):
                try:
                    soupAlpha = BeautifulSoup(str(each), 'html.parser').find_all('th', {'data-stat':'team_ID'})
                    if soupAlpha != []:
                        team_links  = soupAlpha
                except:
                    continue
                    
    teamLinks = {}
    for team_link in team_links:
        if team_link.find('a', href=True):
            teamLinks[team_link.text] = 'https://www.baseball-reference.com'   team_link.find('a', href=True)['href']
            
    for team, teamLink in teamLinks.items():
        print(f'\t{team}')
        response = requests.get(teamLink, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        batting_table = pd.read_html(response.text, attrs = {'id': 'team_batting'})[0]
        batting_table['Year'] = year
        batting_table['Team'] = team
        
        print(f'\t\t{team} - batting stats')
        
        comments = soup.find_all(string=lambda text: isinstance(text, Comment))
        for each in comments:
            if 'table' in str(each):
                try:
                    pitching_table = pd.read_html(str(each), attrs = {'id': 'team_pitching'})[0]
                    batting_table['Year'] = year
                    batting_table['Team'] = team
                    
                    print(f'\t\t{team} - pitching stats')
                    break
                except:
                    continue
                
        final_df['batting'].append(batting_table)
        final_df['pitching'].append(pitching_table)
            
batting = pd.concat(final_df['batting'], axis=0)     
pitching = pd.concat(final_df['pitching'], axis=0)

Output:

print(batting)
      Rk               Name   Age   G  ...  IBB  Notes  Year                 Team
0    1.0       Josh Buckley  20.0  35  ...  NaN    NaN  2021   Kokomo Jackrabbits
1    2.0       Justus Burke  22.0  15  ...  NaN    NaN  2021   Kokomo Jackrabbits
2    3.0      Adam Crampton  20.0  33  ...  NaN    NaN  2021   Kokomo Jackrabbits
3    4.0   Dylan Delvecchio  20.0   8  ...  NaN    NaN  2021   Kokomo Jackrabbits
4    5.0       Dylan Dennis  21.0  60  ...  NaN    NaN  2021   Kokomo Jackrabbits
..   ...                ...   ...  ..  ...  ...    ...   ...                  ...
19  20.0    Johnathon Tripp    19  24  ...  0.0    NaN  2013  Green Bay Bullfrogs
20  21.0         Logan West    --  20  ...  0.0    NaN  2013  Green Bay Bullfrogs
21  22.0       Boomer White    19  45  ...  0.0    NaN  2013  Green Bay Bullfrogs
22  23.0  Robert Youngdahl*    20  65  ...  0.0    NaN  2013  Green Bay Bullfrogs
23   NaN         23 Players  19.9  70  ...  1.0    NaN  2013  Green Bay Bullfrogs

[4674 rows x 29 columns]

         
print(pitching)
      Rk               Name   Age   W   L  ...  HR9   BB9   SO9  SO/W  Notes
0    1.0        Parker Bard  19.0   1   1  ...  0.0  10.7   7.9  0.74    NaN
1    2.0    Andrew Beauvais  21.0   1   0  ...  0.0   4.9   5.3  1.08    NaN
2    3.0          Ryan Beck  22.0   2   0  ...  0.0   2.9   6.5  2.20    NaN
3    4.0      Brock Begesha  19.0   2   2  ...  1.3  13.2   7.5  0.57    NaN
4    5.0       Garrett Bell  20.0   0   0  ...  0.0   6.0  15.0  2.50    NaN
..   ...                ...   ...  ..  ..  ...  ...   ...   ...   ...    ...
20  21.0         Logan West    --   3   2  ...  1.1   3.4   4.2  1.23    NaN
21  22.0      Jordan Wright    22   0   1  ...  1.4  10.8   6.8  0.63    NaN
22  23.0  Robert Youngdahl*    20   2   1  ...  0.3   3.8   7.9  2.08    NaN
23  24.0      Marshall Zahn    --   1   0  ...  0.8   5.3   7.5  1.43    NaN
24   NaN         24 Players  19.6  28  42  ...  0.7   3.8   6.8  1.77    NaN

[5225 rows x 32 columns]
  • Related