I am building a webscraper to aquire a bunch of baseball data, I am 99% sure that the code that I wrote works, I have tested it all seperatley and it should get the data taht I want. However, I have not been able to run it all the way through yet without giving me a webdriver error like this:
WebDriverException Traceback (most recent call last)
c:\Users\jense\VSCODE\BR-selenium-scrape.py in find_plyr_links_pit(self)
104 try:
---> 105 WebDriverWait(self.driver, 5).until(
106 EC.presence_of_element_located((By.TAG_NAME, "a")))
It does not always stop at the same point, sometimes it gets all the way to grabbing player data and then stops, sometimes it doesnt get past etting league links. Here is my code, maybe there is something wrong or extremely innefficient about it, I am brand new to selenium.
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
class scrape_br():
def __init__(self):
ser = Service("/path/to/my/Chromedriver.exe")
op = webdriver.ChromeOptions()
self.driver = webdriver.Chrome(service=ser, options=op)
def get_league_hist(self, lg_href):
self.lg_home = self.driver.get(lg_href)
table = self.driver.find_element(By.ID, "div_lg_history")
tbody = table.find_element(By.TAG_NAME, "tbody")
tr_list = tbody.find_elements(By.TAG_NAME, 'tr')
tags = []
for row in tr_list:
tags.append(row.find_element(By.TAG_NAME, "a"))
yr_list = []
for link in tags:
yr_list.append(link.get_attribute("href"))
yr_list = [str(i) for i in yr_list]
tm_list = []
for yr in yr_list:
self.driver.get(yr)
tm_list.append(self.find_tm_links())
for i,yr in enumerate(tm_list):
for j, team in enumerate(yr):
tm_list[i][j] = team.split(',')
plyr_list = []
for tm in tm_list:
for player in tm:
for player_link in player:
self.driver.get(player_link)
plyr_list.append(self.find_plyr_links_bat())
plyr_list.append(self.find_plyr_links_pit())
plyr_data = []
for team in plyr_list:
for player in team:
self.driver.get(player)
try:
plyr_data.append(self.find_bat_tables())
except:
plyr_data.append(self.find_pitch_tables())
return plyr_data
def find_tm_links(self):
for i in range(1):
try:
table = self.driver.find_element(By.ID, "div_standings_pitching")
except:
table = self.driver.find_element(By.ID, "regular_season")
tbody = table.find_element(By.TAG_NAME, "tbody")
tr_list = tbody.find_elements(By.TAG_NAME, 'tr')
tags = []
for row in tr_list:
try:
tags.append(row.find_elements(By.TAG_NAME, "a"))
except:
print("find_tm_links error, could not get a tags from the tr's")
tm_list = []
for link in tags:
for i in link:
try:
tm_list.append(i.get_attribute("href"))
except:
print("error getting a-ref attribute from find_tm_links")
return tm_list
def find_plyr_links_bat(self):
table = self.driver.find_element(By.ID, "team_batting")
tbody = table.find_element(By.TAG_NAME, "tbody")
tr_list = tbody.find_elements(By.TAG_NAME, 'tr')
a_tags = []
for row in tr_list:
try:
a_tags.append(row.find_elements(By.TAG_NAME, "a"))
except:
print("Could not get player links in find_plyr_links_bat")
player_hrefs = []
for nested_tag in a_tags:
for tag in nested_tag:
try:
player_hrefs.append(tag.get_attribute("href"))
except:
print("could not successfully implement find_plyr_links_bat")
return player_hrefs
def find_plyr_links_pit(self):
table = self.driver.find_element(By.ID, "team_pitching")
tbody = table.find_element(By.TAG_NAME, "tbody")
tr_list = tbody.find_elements(By.TAG_NAME, 'td')
tags = []
for row in tr_list:
try:
WebDriverWait(self.driver, 5).until(
EC.presence_of_element_located((By.TAG_NAME, "a")))
except:
tags.append(row.find_element(By.TAG_NAME, "a"))
tm_list = []
for link in tags:
tm_list.append(link.get_attribute("href"))
return tm_list
def find_bat_tables(self):
x = self.bat_txt = self.driver.find_element(By.ID, "div_standard_batting")
y = x.get_attribute("innerHTML")
z = pd.read_html(y)
return z[0]
def find_pitch_tables(self):
x = self.bat_txt = self.driver.find_element(By.ID, "div_standard_pitching")
y = x.get_attribute("innerHTML")
z = pd.read_html(y)
return z[0]
#%% test
lg = scrape_br()
nwds_hist = lg.get_league_hist("https://www.baseball-reference.com/register/league.cgi?code=NWDS&class=Smr")
CodePudding user response:
import requests
from bs4 import BeautifulSoup, Comment
import pandas as pd
import re
url = 'https://www.baseball-reference.com/register/league.cgi?code=NWDS&class=Smr'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'}
# Get links
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
yearLinks = soup.find_all('th', {'data-stat':'year_ID'})
links = {}
for year in yearLinks:
if year.find('a', href=True):
links[year.text] = 'https://www.baseball-reference.com' year.find('a', href=True)['href']
final_df = {'batting':[], 'pitching':[]}
for year, link in links.items():
print(year)
response = requests.get(link, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
if soup.find_all('th', {'data-stat':'team_ID'}):
team_links = soup.find_all('th', {'data-stat':'team_ID'})
else:
team_links = []
comments = soup.find_all(string=lambda text: isinstance(text, Comment))
for each in comments:
if 'th' in str(each):
try:
soupAlpha = BeautifulSoup(str(each), 'html.parser').find_all('th', {'data-stat':'team_ID'})
if soupAlpha != []:
team_links = soupAlpha
except:
continue
teamLinks = {}
for team_link in team_links:
if team_link.find('a', href=True):
teamLinks[team_link.text] = 'https://www.baseball-reference.com' team_link.find('a', href=True)['href']
for team, teamLink in teamLinks.items():
print(f'\t{team}')
response = requests.get(teamLink, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
batting_table = pd.read_html(response.text, attrs = {'id': 'team_batting'})[0]
batting_table['Year'] = year
batting_table['Team'] = team
print(f'\t\t{team} - batting stats')
comments = soup.find_all(string=lambda text: isinstance(text, Comment))
for each in comments:
if 'table' in str(each):
try:
pitching_table = pd.read_html(str(each), attrs = {'id': 'team_pitching'})[0]
batting_table['Year'] = year
batting_table['Team'] = team
print(f'\t\t{team} - pitching stats')
break
except:
continue
final_df['batting'].append(batting_table)
final_df['pitching'].append(pitching_table)
batting = pd.concat(final_df['batting'], axis=0)
pitching = pd.concat(final_df['pitching'], axis=0)
Output:
print(batting)
Rk Name Age G ... IBB Notes Year Team
0 1.0 Josh Buckley 20.0 35 ... NaN NaN 2021 Kokomo Jackrabbits
1 2.0 Justus Burke 22.0 15 ... NaN NaN 2021 Kokomo Jackrabbits
2 3.0 Adam Crampton 20.0 33 ... NaN NaN 2021 Kokomo Jackrabbits
3 4.0 Dylan Delvecchio 20.0 8 ... NaN NaN 2021 Kokomo Jackrabbits
4 5.0 Dylan Dennis 21.0 60 ... NaN NaN 2021 Kokomo Jackrabbits
.. ... ... ... .. ... ... ... ... ...
19 20.0 Johnathon Tripp 19 24 ... 0.0 NaN 2013 Green Bay Bullfrogs
20 21.0 Logan West -- 20 ... 0.0 NaN 2013 Green Bay Bullfrogs
21 22.0 Boomer White 19 45 ... 0.0 NaN 2013 Green Bay Bullfrogs
22 23.0 Robert Youngdahl* 20 65 ... 0.0 NaN 2013 Green Bay Bullfrogs
23 NaN 23 Players 19.9 70 ... 1.0 NaN 2013 Green Bay Bullfrogs
[4674 rows x 29 columns]
print(pitching)
Rk Name Age W L ... HR9 BB9 SO9 SO/W Notes
0 1.0 Parker Bard 19.0 1 1 ... 0.0 10.7 7.9 0.74 NaN
1 2.0 Andrew Beauvais 21.0 1 0 ... 0.0 4.9 5.3 1.08 NaN
2 3.0 Ryan Beck 22.0 2 0 ... 0.0 2.9 6.5 2.20 NaN
3 4.0 Brock Begesha 19.0 2 2 ... 1.3 13.2 7.5 0.57 NaN
4 5.0 Garrett Bell 20.0 0 0 ... 0.0 6.0 15.0 2.50 NaN
.. ... ... ... .. .. ... ... ... ... ... ...
20 21.0 Logan West -- 3 2 ... 1.1 3.4 4.2 1.23 NaN
21 22.0 Jordan Wright 22 0 1 ... 1.4 10.8 6.8 0.63 NaN
22 23.0 Robert Youngdahl* 20 2 1 ... 0.3 3.8 7.9 2.08 NaN
23 24.0 Marshall Zahn -- 1 0 ... 0.8 5.3 7.5 1.43 NaN
24 NaN 24 Players 19.6 28 42 ... 0.7 3.8 6.8 1.77 NaN
[5225 rows x 32 columns]