I have successfully scraped a leaderboard table from FanGraphs (the leaderboard URL is the one assigned to `PITCHERS` in the code below),
using the following code:
import pandas as pd
import requests
from datetime import date, timedelta
from bs4 import BeautifulSoup
import lxml
import numpy as np
def parse_array_from_fangraphs_html(start_date, end_date, URL_1):
    """
    Take a HTML stats page from fangraphs and parse it out to a dataframe.

    Args:
        start_date: Not used by this function; kept so existing callers keep
            working. Any date range must already be embedded in ``URL_1``.
        end_date: Not used by this function (see ``start_date``).
        URL_1: Full URL of the page to download.

    Returns:
        A ``pandas.DataFrame`` with one row per ``<tr>`` of the table body,
        holding the text of each ``<td>``, and with columns named after the
        text of the ``<th>`` cells in the table head.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the downloaded HTML has no ``rgMasterTable`` table,
            or that table lacks a ``<thead>`` or ``<tbody>``.
    """
    # A timeout keeps the call from hanging forever on a stalled connection.
    response = requests.get(URL_1, timeout=30)
    # Fail here on 4xx/5xx rather than trying to parse an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "lxml")
    table = soup.find("table", {"class": "rgMasterTable"})
    if table is None:
        # soup.find() returns None when nothing matches; without this check
        # the failure shows up later as an opaque AttributeError on None.
        raise ValueError(
            "No <table class='rgMasterTable'> found at {!r}; the table may "
            "be rendered by JavaScript and absent from the raw HTML.".format(URL_1)
        )

    thead = table.find("thead")
    tbody = table.find("tbody")
    if thead is None or tbody is None:
        raise ValueError(
            "rgMasterTable at {!r} is missing its <thead> or <tbody>.".format(URL_1)
        )

    # get headers
    headers = [th.text for th in thead.find_all("th")]
    # get rows
    rows = [[td.text for td in tr.find_all("td")] for tr in tbody.find_all("tr")]
    return pd.DataFrame(rows, columns=headers)
# Date window for the leaderboard query, as YYYY-MM-DD strings:
# 1 January 2022 through today.
sdate = '2022-01-01'
enddate = date.today().strftime("%Y-%m-%d")
# To stop at yesterday instead, format date.today() - timedelta(1).

# Leaderboard URL (the query string asks for stats=bat) with the date window
# substituted into the startdate/enddate parameters.
PITCHERS = (
    "https://www.fangraphs.com/leaders.aspx?pos=all&stats=bat&lg=all&qual=0&type=8&season=2022&month=1000&season1=2022&ind=0&team=0,ts&rost=0&age=0&filter=&players=0&startdate={}&enddate={}&sort=17,d"
).format(sdate, enddate)

wRC1 = parse_array_from_fangraphs_html(
    start_date=sdate, end_date=enddate, URL_1=PITCHERS
)
where the table is successfully assigned to the dataframe wRC1.
I'm trying to do something similar with a player game-log page (the link assigned to `PITCHERS` in the code below),
using the following code:
import pandas as pd
import requests
from datetime import date, timedelta
from bs4 import BeautifulSoup
import lxml
import numpy as np
def parse_array_from_fangraphs_html(start_date, end_date, URL_1):
    """
    Download a fangraphs HTML page and return its ``rgMasterTable`` as a dataframe.

    The page requested is ``URL_1``. Column names are the text of the
    ``<th>`` cells in the table head; each ``<tr>`` of the table body becomes
    one row made of the text of its ``<td>`` cells.
    """
    # NOTE(review): the two formatted URLs below are overwritten before any
    # request is made; only URL_1 is actually fetched.
    page_url = "https://www.fangraphs.com/leaders.aspx?pos=all&stats=pit&lg=all&qual=0&type=c,13,7,8,120,121,331,105,111,24,19,14,329,324,45,122,6,42,43,328,330,322,323,326,332&season=2021&month=1000&season1=2015&ind=0&team=&rost=&age=&filter=&players=&startdate={}&enddate={}&page=1_2000".format(start_date, end_date)
    page_url = "https://www.fangraphs.com/leaders.aspx?pos=all&stats=bat&lg=all&qual=0&type=8&season=2022&month=1000&season1=2022&ind=0&team=0,ts&rost=0&age=0&filter=&players=0&startdate={}&enddate={}&sort=17,d".format(start_date, end_date)
    page_url = URL_1

    # Fetch the page and locate the stats table.
    markup = requests.get(page_url).text
    master = BeautifulSoup(markup, "lxml").find("table", {"class": "rgMasterTable"})

    # Header cells give the column names.
    column_names = [th.text for th in master.find("thead").find_all("th")]

    # One list of cell texts per body row.
    records = [
        [td.text for td in tr.find_all("td")]
        for tr in master.find("tbody").find_all("tr")
    ]
    return pd.DataFrame(records, columns=column_names)
# Date strings handed to the parser: 1 January 2022 and today (YYYY-MM-DD).
sdate = '2022-01-01'
enddate = date.today().strftime("%Y-%m-%d")
# To stop at yesterday instead, format date.today() - timedelta(1).

# Player game-log page; its own date range is fixed by the gds/gde
# parameters in the query string.
PITCHERS = (
    "https://www.fangraphs.com/players/trevor-rogers/22286/game-log"
    "?type=0&gds=2022-04-10&gde=2022-09-12&season=&position=P"
)

df = parse_array_from_fangraphs_html(
    start_date=sdate, end_date=enddate, URL_1=PITCHERS
)
But the program ends up producing the following error:
AttributeError: 'NoneType' object has no attribute 'find'
What could be producing this problem? Thanks in advance!
CodePudding user response:
That data is pulled dynamically by the page's JavaScript from an API endpoint, so it is not present in the HTML that `requests` downloads. You can see that endpoint by inspecting the Network tab of your browser's developer tools. Here is one way to do it:
import pandas as pd
import requests
# Query the game-log JSON endpoint directly, using the same player id,
# position and gds/gde date range as the player page URL.
r = requests.get(
    'https://cdn.fangraphs.com/api/players/game-log?playerid=22286&position=P&type=0&season=&gds=2022-04-10&gde=2022-09-12&z=1663107181',
    timeout=30,  # do not hang forever on a stalled connection
)
# Stop on a 4xx/5xx response instead of failing later inside r.json().
r.raise_for_status()
# Flatten the records stored under the 'mlb' key of the response into a table.
df = pd.json_normalize(r.json()['mlb'])
print(df)
Result printed in terminal:
Date Opp teamid season Team HomeAway Age W L ERA G GS CG ShO SV HLD BS IP TBF H R ER HR BB IBB HBP WP BK SO K/9 BB/9 H/9 K/BB IFH% BUH% GB FB LD IFFB IFH BU BUH K% BB% K-BB% SIERA HR/9 AVG WHIP BABIP LOB% FIP E-F xFIP ERA- FIP- xFIP- GB/FB LD% GB% FB% IFFB% HR/FB RS RS/9 Balls Strikes Pitches WPA -WPA WPA RE24 REW pLI inLI gmLI exLI Pulls Games WPA/LI Clutch SD MD FB%1 FBv SL% SLv CH% CHv wFB wSL wCH wFB/C wSL/C wCH/C O-Swing% Z-Swing% Swing% O-Contact% Z-Contact% Contact% Zone% F-Strike% SwStr% Pull Cent Oppo Soft Med Hard bipCount Pull% Cent% Oppo% Soft% Med% Hard% PlayerName playerid tERA GSv2 pfxFA% pfxSI% pfxSL% pfxCH% pfxvFA pfxvSI pfxvSL pfxvCH pfxFA-X pfxSI-X pfxSL-X pfxCH-X pfxFA-Z pfxSI-Z pfxSL-Z pfxCH-Z pfxwFA pfxwSI pfxwSL pfxwCH pfxwFA/C pfxwSI/C pfxwSL/C pfxwCH/C pfxO-Swing% pfxZ-Swing% pfxSwing% pfxO-Contact% pfxZ-Contact% pfxContact% pfxZone% pfxPace piCH% piFA% piSI% piSL% piXX% pivCH pivFA pivSI pivSL pivXX piCH-X piFA-X piSI-X piSL-X piXX-X piCH-Z piFA-Z piSI-Z piSL-Z piXX-Z piwCH piwFA piwSI piwSL piwXX piwCH/C piwFA/C piwSI/C piwSL/C piwXX/C piO-Swing% piZ-Swing% piSwing% piO-Contact% piZ-Contact% piContact% piZone% Events EV LA Barrels Barrel% maxEV HardHit HardHit% gamedate dh
0 <a href="/wins.aspx?date=2050-01-01&team=total&dh=0">2050-01-01</a> - - - 20 2022 - - - A 24 4.0 11.0 5.349057 22.0 22.0 0.0 0.0 0.0 0.0 0.0 106.0 470.0 113.0 67.0 63.0 14.0 44.0 0.0 5.0 5.0 0.0 105.0 8.915095 3.735849 9.594340 2.386364 0.067669 0.5 133.0 112.0 69.0 5.0 9.0 2.0 1.0 0.223404 0.093617 0.129787 4.186245 1.188679 0.268409 1.481132 0.327815 0.667135 4.243406 1.105651 4.094434 134.238531 107.477754 101.526711 1.187500 0.219745 0.423567 0.356688 0.044643 0.125000 50.0 4.245283 685.0 1223.0 1908.0 -1.919216 -9.791034 7.871818 -15.8036 -1.638929e 00 0.998163 0.882337 0.867627 1.019309 22.0 22.0 -1.502681 -0.420067 0.0 0.0 0.527254 94.645129 0.179769 80.728863 0.292977 85.706619 -8.878565 -1.319738 -2.106202 -0.882561 -0.384763 -0.376780 0.322813 0.715539 0.487153 0.695531 0.826620 0.776103 0.418458 0.612766 0.109015 133.0 109.0 74.0 62.0 162.0 92.0 316.0 0.420886 0.344937 0.234177 0.196203 0.512658 0.291139 Trevor Rogers 22286 4.566455 45.0 0.525157 0.002096 0.179769 0.292977 94.608583 94.849997 80.678718 85.654563 8.065439 11.7475 -3.389592 9.090555 8.385319 3.2725 3.808251 1.335760 -7.785403 -0.614196 -1.246096 -1.899808 -0.776986 -15.354905 -0.363293 -0.339858 0.298838 0.674298 0.487945 0.660777 0.824074 0.774436 0.503669 23.385816 0.292453 0.524633 0.002096 0.179769 0.001048 85.611075 94.589424 94.797028 80.623514 91.021751 8.324599 7.148858 10.731025 -4.301155 5.42764 0.261684 7.435025 2.438555 2.763215 6.392905 -1.972312 -7.847004 -0.614196 -1.246096 0.134105 -0.353461 -0.783916 -15.354905 -0.363293 6.705273 0.298319 0.675732 0.487421 0.665493 0.823529 0.775269 0.501048 316.0 88.299055 11.857416 23.0 0.072785 113.824 116.0 0.367089 2050-01-01 0
1 <a href="/wins.aspx?date=2022-09-12&team=Marlins&dh=1">2022-09-12</a> TEX 20 2022 MIA H 24 0.0 0.0 2.842105 1.0 1.0 0.0 0.0 0.0 0.0 0.0 6.1 24.0 4.0 2.0 2.0 0.0 2.0 0.0 0.0 0.0 0.0 9.0 12.789474 2.842105 5.684211 4.500000 0.000000 0.0 7.0 3.0 3.0 0.0 0.0 0.0 0.0 0.375000 0.083333 0.291667 2.394518 0.000000 0.181818 0.947368 0.307692 0.666667 1.226027 1.616078 1.928979 71.324734 32.517111 48.896813 2.333333 0.230769 0.538462 0.230769 0.000000 0.000000 2.0 2.842105 29.0 60.0 89.0 0.034852 -0.393320 0.428172 0.9269 9.934618e-02 0.979617 1.082800 0.870000 3.073200 1.0 1.0 0.203277 -0.167700 0.0 0.0 0.573034 94.980392 0.067416 83.500000 0.359551 86.156250 0.518355 0.342558 0.944681 1.016382 5.709294 2.952127 0.301887 0.750000 0.483146 0.500000 0.740741 0.651163 0.404494 0.583333 0.168539 4.0 5.0 4.0 1.0 8.0 4.0 13.0 0.307692 0.384615 0.307692 0.076923 0.615385 0.307692 Trevor Rogers 22286 1.677209 68.0 0.573034 NaN 0.067416 0.359551 94.907839 NaN 83.483337 86.134377 9.537647 NaN -1.148333 10.087188 8.466666 NaN 4.296667 1.082500 0.518355 NaN 0.342558 0.944681 1.016382 NaN 5.709294 2.952127 0.325581 0.652174 0.494382 0.500000 0.700000 0.636364 0.516854 20.453125 0.359551 0.573034 NaN 0.067416 NaN 86.203064 94.993490 NaN 83.560450 NaN 9.029422 8.369133 NaN -2.768143 NaN -0.289749 7.267294 NaN 2.612752 NaN 0.944681 0.518355 NaN 0.342558 NaN 2.952127 1.016382 NaN 5.709294 NaN 0.325581 0.652174 0.494382 0.500000 0.700000 0.636364 0.516854 13.0 92.482628 2.117006 1.0 0.076923 105.379 6.0 0.461538 2022-09-12 1
2 <a href="/wins.aspx?date=2022-09-07&team=Marlins&dh=0">2022-09-07</a> @PHI 20 2022 MIA A 24 0.0 1.0 4.500000 1.0 1.0 0.0 0.0 0.0 0.0 0.0 6.0 23.0 5.0 3.0 3.0 2.0 0.0 0.0 0.0 1.0 0.0 8.0 12.000000 0.000000 7.500000 8.000000 0.166667 0.0 6.0 6.0 3.0 0.0 1.0 0.0 0.0 0.347826 0.000000 0.347826 2.000855 3.000000 0.217391 0.833333 0.230769 0.909091 4.787431 -0.287431 1.938107 112.930823 120.992954 49.118658 1.000000 0.200000 0.400000 0.400000 0.000000 0.333333 2.0 3.000000 30.0 57.0 87.0 -0.059300 -0.440200 0.380900 0.0000 -3.725290e-09 0.804167 0.915000 0.920000 0.750000 1.0 1.0 -0.103264 0.029523 0.0 0.0 0.620690 94.185185 0.160920 82.428571 0.218391 85.947368 2.034562 -0.678451 -1.764662 3.767707 -4.846082 -9.287697 0.318182 0.674419 0.494253 0.642857 0.862069 0.790698 0.494253 0.608696 0.103448 9.0 4.0 2.0 2.0 8.0 5.0 15.0 0.600000 0.266667 0.133333 0.133333 0.533333 0.333333 Trevor Rogers 22286 4.851247 52.0 0.620690 NaN 0.160920 0.218391 94.181478 NaN 82.321429 85.836843 9.481296 NaN -0.274286 10.177368 8.612408 NaN 4.684286 1.065789 2.034562 NaN -0.678451 -1.764662 3.767707 NaN -4.846082 -9.287697 0.238095 0.733333 0.494253 0.600000 0.848485 0.790698 0.517241 22.000000 0.218391 0.620690 NaN 0.160920 NaN 86.182418 94.539867 NaN 82.674308 NaN 8.901917 8.122533 NaN -1.927560 NaN -0.384359 7.333927 NaN 2.989588 NaN -1.764662 2.034562 NaN -0.678451 NaN -9.287697 3.767707 NaN -4.846082 NaN 0.250000 0.702128 0.494253 0.600000 0.848485 0.790698 0.540230 15.0 89.261637 23.314498 2.0 0.133333 109.308 7.0 0.466667 2022-09-07 0
[...]
Data returned is quite extensive: you can slice & dice it further, to get what you want from there.
For relevant pandas documentation, see https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.json_normalize.html