I am trying to get all tables with a class of "stats_table". However, it is only pulling 2 tables. Yet when I print the actual soup and search the document manually, I can find 9 tables.
from bs4 import BeautifulSoup
import requests
# function to get hitting stats
def get_hitting_stats(team, soup):
    """Print every ``<table>`` element in *soup* whose class is ``stats_table``.

    Args:
        team: Label of the side being processed (the call sites pass
            ``"home"`` or ``"away"``); the lookup does not use it.
        soup: Parsed document exposing a BeautifulSoup-style ``find_all``.

    Returns:
        None. The matching tables are only printed.
    """
    # get tables
    tables = soup.find_all("table", class_="stats_table")
    print(tables)
# function to process game
def process_game(gamelink, headers):
    """Download a box-score page and run the hitting-stats lookup for both sides.

    Args:
        gamelink: URL of the box-score page to fetch.
        headers: Mapping of HTTP request headers to send with the request.
    """
    # get boxscore page
    # Pass the headers by keyword: the second positional parameter of
    # requests.get() is ``params`` (query string), not ``headers``.
    req = requests.get(gamelink, headers=headers)
    soup = BeautifulSoup(req.content, 'html.parser')
    home_hitting = get_hitting_stats("home", soup)
    away_hitting = get_hitting_stats("away", soup)
# HTTP headers handed to process_game for the box-score download.
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
}

# Box score to process.
boxscore_url = "https://www.baseball-reference.com/boxes/CLE/CLE202208151.shtml"
process_game(boxscore_url, headers)
Originally I thought the other tables might be loaded by a separate request, but that doesn't make sense: when I look at the returned soup, I can find more than the two tables my code retrieves. Any help is appreciated.
CodePudding user response:
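The other tables are inside HTML comments (`<!-- ... -->`), so the parser does not see them as elements. You need to pull them out of the comments before parsing.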
Also, you never return anything in the functions. Is that what you want to do?
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
# function to get hitting stats
def get_hitting_stats(home_away, soup):
    """Return one side's batting table from a parsed box score as a DataFrame.

    The table is also printed before it is returned.

    Args:
        home_away: ``"home"`` or ``"away"``; selects which matched table to read.
        soup: Parsed document exposing a BeautifulSoup-style ``find_all``.

    Returns:
        The first DataFrame pandas parses out of the selected table's markup.

    Raises:
        KeyError: If *home_away* is neither ``"home"`` nor ``"away"``.
        IndexError: If fewer tables matched than the selected position needs.
    """
    from io import StringIO  # local import: the file does not import io

    # get tables
    # Position of each side among the matched tables: away first, home second.
    idx = {'home': 1, 'away': 0}
    hitting = soup.find_all('table', {'id': re.compile(r'.*batting.*')})
    html = str(hitting[idx[home_away]])
    # Wrap the markup in a file-like object; passing literal HTML to
    # read_html is deprecated as of pandas 2.1.
    df = pd.read_html(StringIO(html))[0]
    print(df)
    return df
# function to process game
def process_game(gamelink, headers):
    """Download a box-score page and extract both sides' batting tables.

    Args:
        gamelink: URL of the box-score page to fetch.
        headers: Mapping of HTTP request headers to send with the request.

    Returns:
        Tuple ``(home_hitting, away_hitting)`` holding the values returned by
        ``get_hitting_stats`` for ``"home"`` and ``"away"`` respectively.
    """
    # get boxscore page
    # Pass the headers by keyword: the second positional parameter of
    # requests.get() is ``params`` (query string), not ``headers``.
    html = requests.get(gamelink, headers=headers).text
    # Part of the page content sits inside HTML comments; dropping the
    # comment delimiters turns it into regular markup the parser can see.
    html = html.replace('<!--', '').replace('-->', '')
    soup = BeautifulSoup(html, 'html.parser')
    home_hitting = get_hitting_stats("home", soup)
    away_hitting = get_hitting_stats("away", soup)
    return home_hitting, away_hitting
# HTTP headers handed to process_game for the box-score download.
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
}

# Box score to process.
boxscore_url = "https://www.baseball-reference.com/boxes/CLE/CLE202208151.shtml"
process_game(boxscore_url, headers)
Output:
Batting AB R H RBI ... acLI RE24 PO A Details
0 Steven Kwan LF 4.0 0.0 1.0 0.0 ... 1.76 0.1 0.0 0.0 NaN
1 Amed Rosario SS 4.0 1.0 0.0 0.0 ... 1.49 -1.5 0.0 1.0 NaN
2 Jose Ramirez 3B 4.0 0.0 2.0 1.0 ... 1.72 0.5 0.0 0.0 NaN
3 Andres Gimenez 2B 4.0 1.0 3.0 3.0 ... 1.84 3.6 0.0 4.0 HR,2B
4 Oscar Gonzalez RF 4.0 0.0 2.0 0.0 ... 1.38 0.3 3.0 0.0 2B
5 Owen Miller 1B 3.0 0.0 0.0 0.0 ... 1.71 -1.1 5.0 3.0 GDP
6 Nolan Jones DH 4.0 0.0 0.0 0.0 ... 1.77 -1.6 NaN NaN NaN
7 Austin Hedges C 3.0 0.0 0.0 0.0 ... 1.77 -1.2 13.0 0.0 NaN
8 Myles Straw CF 3.0 2.0 1.0 0.0 ... 1.09 1.0 3.0 0.0 2·SB
9 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN
10 Aaron Civale P NaN NaN NaN NaN ... NaN NaN 1.0 0.0 NaN
11 James Karinchak P NaN NaN NaN NaN ... NaN NaN 0.0 0.0 NaN
12 Trevor Stephan P NaN NaN NaN NaN ... NaN NaN 2.0 0.0 NaN
13 Emmanuel Clase P NaN NaN NaN NaN ... NaN NaN 0.0 0.0 NaN
14 Team Totals 33.0 4.0 9.0 4.0 ... 1.59 0.3 27.0 8.0 NaN
[15 rows x 24 columns]
Batting AB R H RBI ... acLI RE24 PO A Details
0 Riley Greene CF 4.0 1.0 1.0 0.0 ... 0.0 -0.1 3.0 0.0 NaN
1 Victor Reyes RF 3.0 0.0 1.0 0.0 ... 0.0 0.0 1.0 0.0 SB
2 Javier Baez SS 4.0 0.0 1.0 0.0 ... 0.0 0.2 2.0 3.0 2B
3 Harold Castro 1B 4.0 0.0 0.0 1.0 ... 0.0 -0.7 7.0 0.0 NaN
4 Miguel Cabrera DH 4.0 0.0 0.0 0.0 ... 0.0 -0.8 NaN NaN NaN
5 Jeimer Candelario 3B 3.0 0.0 0.0 0.0 ... 0.0 -0.5 1.0 1.0 NaN
6 Eric Haase C 2.0 0.0 0.0 0.0 ... 0.0 -0.3 6.0 1.0 NaN
7 Jonathan Schoop 2B 3.0 0.0 0.0 0.0 ... 0.0 -0.5 1.0 2.0 NaN
8 Akil Baddoo LF 3.0 0.0 0.0 0.0 ... 0.0 -0.5 3.0 0.0 NaN
9 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN
10 Drew Hutchison P NaN NaN NaN NaN ... NaN NaN 0.0 0.0 NaN
11 Will Vest P NaN NaN NaN NaN ... NaN NaN 0.0 1.0 NaN
12 Andrew Chafin P NaN NaN NaN NaN ... NaN NaN 0.0 0.0 NaN
13 Wily Peralta P NaN NaN NaN NaN ... NaN NaN 0.0 0.0 NaN
14 Team Totals 30.0 1.0 3.0 1.0 ... 0.0 -3.2 24.0 8.0 NaN
[15 rows x 24 columns]