Good day, everyone.
I'm trying to scrape the table from each page whose link I append to `player_page`. I want the per-game stats for each player in that season, and the table I want is on each player's individual game-log page. Every link I build is correct, but my loops don't capture the right data.
Any idea what I'm doing wrong here?
Any help is appreciated.
from bs4 import BeautifulSoup
import requests
import pandas as pd
from numpy import sin
# Scrape every player's per-game log for one season from
# pro-football-reference.com and save all tables to a single Excel file.
url = 'https://www.pro-football-reference.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
}
year = 2018

# Fetch the season's fantasy ranking page; it links to every player page.
# The browser-like headers defined above are sent with the request.
r = requests.get(f'{url}/years/{year}/fantasy.htm', headers=headers)
r.raise_for_status()  # fail loudly instead of parsing an error page
soup = BeautifulSoup(r.content, 'lxml')
player_list = soup.find_all('td', attrs={'class': 'left', 'data-stat': 'player'})

# Build one game-log URL per player link found in the ranking table.
player_page = []
for player in player_list:
    for link in player.find_all('a', href=True):
        href = link['href']
        # str.strip('.htm') would remove any of the characters ".htm" from
        # BOTH ends of the string; only the trailing extension must go.
        if href.endswith('.htm'):
            href = href[:-len('.htm')]
        player_page.append(f'{url}{href}/gamelog/{year}')

# Collect the tables of ALL pages. The list is created once, before the
# loop, so each player's tables are added to it instead of replacing it.
# NOTE(review): the site may throttle rapid requests — consider pausing
# between pages if requests start failing.
yearly_stats = []
for page in player_page:
    try:
        dfs = pd.read_html(page)
    except (ValueError, OSError) as err:
        # ValueError: no table on the page; OSError: HTTP/network failure.
        print(f'Skipping {page}: {err}')
        continue
    yearly_stats.extend(dfs)

# pd.concat raises on an empty list, so only write when something was read.
if yearly_stats:
    final_stats = pd.concat(yearly_stats)
    final_stats.to_excel('Fantasy2018.xlsx')
else:
    print('No tables were collected; nothing written.')
CodePudding user response:
This works. The table columns change according to the player's position, I believe. Not everyone has tackle information, for example.
import pandas as pd
from bs4 import BeautifulSoup
import requests
import pandas as pd
url = 'https://www.pro-football-reference.com'
year = 2018


def tidy_gamelog(table, name, moment):
    """Return a cleaned, labelled copy of one game-log table.

    Args:
        table: DataFrame as returned by ``pd.read_html`` for a game-log page.
        name: Player name stored in the new ``Name`` column.
        moment: Label stored in the new ``Moment`` column
            (e.g. ``'Regular Season'`` or ``'Playoffs'``).

    Returns:
        A new DataFrame whose multi-level header is flattened to single
        strings (levels joined by ``'|'``, labels containing ``'Unnamed'``
        blanked), whose ``Date`` column is parsed to datetimes, whose rows
        without a parseable date are dropped, and which starts with the
        ``Name`` and ``Moment`` columns. ``None`` when the table has no
        ``Date`` column after flattening.
    """
    table = table.copy()  # never mutate the caller's frame
    if isinstance(table.columns, pd.MultiIndex):
        # Blank the placeholder labels pandas invents for empty header
        # cells, then join the levels and trim the leftover separators.
        table.columns = [
            '|'.join(
                '' if 'Unnamed' in str(part) else str(part) for part in parts
            ).strip('|')
            for parts in table.columns
        ]
    if 'Date' not in table.columns:
        return None
    # Rows whose date cannot be parsed become NaT and are dropped below.
    table['Date'] = pd.to_datetime(table['Date'], errors='coerce')
    table = table.dropna(subset=['Date'])
    table.insert(0, 'Name', name)
    table.insert(1, 'Moment', moment)
    return table


def main():
    """Scrape each listed player's game log for ``year`` into one Excel file."""
    r = requests.get(f'{url}/years/{year}/fantasy.htm')
    r.raise_for_status()  # fail loudly instead of parsing an error page
    soup = BeautifulSoup(r.content, 'lxml')
    player_list = soup.find_all('td', attrs={'class': 'left', 'data-stat': 'player'})

    dfs = []
    for player in player_list:
        for link in player.find_all('a', href=True):
            name = link.get_text()
            href = link['href']
            # str.strip('.htm') removes a character set from both ends;
            # only the trailing extension must be removed.
            if href.endswith('.htm'):
                href = href[:-len('.htm')]
            try:
                # Download the page once; it holds both tables of interest.
                tables = pd.read_html(f'{url}{href}/gamelog/{year}')
            except (ValueError, OSError) as err:
                # ValueError: no table on the page; OSError: HTTP/network.
                print(f'Skipping {name}: {err}')
                continue
            # First table -> regular season, second (if present) -> playoffs.
            for table, moment in zip(tables, ('Regular Season', 'Playoffs')):
                tidy = tidy_gamelog(table, name, moment)
                if tidy is not None:
                    dfs.append(tidy)

    # pd.concat raises on an empty list, so only write when data exists.
    if not dfs:
        print('No game logs were collected; nothing written.')
        return
    dfall = pd.concat(dfs)
    dfall.to_excel(f'Fantasy{year}.xlsx')


if __name__ == '__main__':
    main()