I need help automating the scraping of this web page: I want to collect the data for all the players across its different pages.
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Scrape the stats table found in the HTML returned for `url` into the
# DataFrame `datos_mlb`.
# NOTE: only the table present in that single response is read; this script
# does not page through further results.
url = 'https://www.mlb.com/es/stats/spring-training'
pagina = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of parsing an error page.
pagina.raise_for_status()
soup = BeautifulSoup(pagina.text, 'lxml')
table = soup.find('table', {'class': "bui-table is-desktop-sKqjv9Sb"})
if table is None:
    # find() returns None when nothing matches; stop with a clear message
    # rather than an AttributeError on the next line.
    raise RuntimeError(f'Stats table not found in the response for {url}')

celdas_th = table.find_all('th')

# Column titles: the text of the <button> inside each of the first 18 <th>.
encabezados = []
for i in celdas_th[:18]:
    for td in i.find_all('button'):
        encabezados.append(td.text.strip())

# Player names: the text of the <a> inside each remaining <th>.
# Presumably one such <th> per table row -- TODO confirm against the markup.
nombres = []
for i in celdas_th[18:]:
    for td in i.find_all('a'):
        nombres.append(td.text.strip())

# One list of <td> texts per row, skipping the first <tr> and any row that
# has no <td> cells.
filas = []
for fila in table.find_all('tr')[1:]:
    data_fila = [td.text.strip() for td in fila.find_all('td')]
    if data_fila:
        filas.append(data_fila)

# The <td> values fill every column after the first; the first column holds
# the player name. Build the frame from all rows at once so each row keeps
# its own values. A length mismatch between names, rows and titles raises
# ValueError instead of silently misaligning the data.
datos_mlb = pd.DataFrame(filas, columns=encabezados[1:])
datos_mlb.insert(0, 'JUGADOR', nombres)
I have managed to extract most of the information; however, I cannot fill in the data correctly or iterate over all the pages.
CodePudding user response:
Try to use the structured data from the JSON response of the XHR request to create your dataframe.
Inspect the network tab in your browser's devtools to get an idea of which parameters you should send and what you will get back:
import pandas as pd
import requests

# Collect the 'stats' entries of every page of the JSON endpoint into one list.
data = []
# Each request asks for limit=25 rows; the offset steps 0, 25, ..., 150.
for i in range(0, 175, 25):
    response = requests.get(
        f'https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?stitch_env=prod&season=2022&sportId=1&stats=season&group=hitting&gameType=S&limit=25&offset={i}&sortStat=onBasePlusSlugging&order=desc',
        headers={'user-agent': 'Mozilla/5.0'},
        timeout=30,
    )
    # Surface HTTP errors here rather than as a JSON/KeyError below.
    response.raise_for_status()
    data.extend(response.json()['stats'])

pd.DataFrame(data)
Output
| | playerId | playerName | ... | type | atBatsPerHomeRun |
|---|---|---|---|---|---|
| 0 | 502671 | Paul Goldschmidt | ... | player | 5.5 |
| 1 | 621439 | Byron Buxton | ... | player | 6.4 |
| 2 | 547180 | Bryce Harper | ... | player | 4.38 |
| 3 | 658668 | Edward Olivares | ... | player | 11.33 |
| 4 | 670351 | Jose Rojas | ... | player | 9 |
| ... | ... | ... | ... | ... | ... |
| 156 | 593871 | Jorge Polanco | ... | player | 32.00 |
| 157 | 676475 | Alec Burleson | ... | player | -.-- |
| 158 | 608385 | Jesse Winker | ... | player | -.-- |
| 159 | 641355 | Cody Bellinger | ... | player | -.-- |
| 160 | 660162 | Yoan Moncada | ... | player | -.-- |
[161 rows x 72 columns]
CodePudding user response:
You are not getting all the required data because the data is loaded dynamically via an API, so you have to pull the data from that API.
Example:
import pandas as pd
import requests

# Fetch the JSON endpoint once (limit=161, offset=0) and keep only each
# entry's 'playerName' in the DataFrame `df`.
api_url = 'https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?stitch_env=prod&season=2022&sportId=1&stats=season&group=hitting&gameType=S&limit=161&offset=0&sortStat=onBasePlusSlugging&order=desc'
response = requests.get(api_url, timeout=30)
# Surface HTTP errors here rather than as a JSON/KeyError below.
response.raise_for_status()
req = response.json()

# One single-key record per entry of the 'stats' list.
data = [{'playerName': item['playerName']} for item in req['stats']]
df = pd.DataFrame(data)
print(df)
Output:
playerName
0 Paul Goldschmidt
1 Byron Buxton
2 Bryce Harper
3 Edward Olivares
4 Jose Rojas
.. ...
156 Jorge Polanco
157 Alec Burleson
158 Jesse Winker
159 Cody Bellinger
160 Yoan Moncada
[161 rows x 1 columns]