Trying to Capture the Table from Multiple Pages With For Loops-CodePudding

Good day, everyone.

I'm trying to get the table on each page from the links appended to `player_page`. I want the per-game stats for each player in that season, and the table I want is on each player's individual page. Each appended link is correct, but I'm having trouble capturing the correct data when running my loops.

Any idea what I'm doing wrong here?

Any help is appreciated.

from bs4 import BeautifulSoup
import requests
import pandas as pd

from numpy import sin


# Script from the question, with the string-concatenation operators restored so
# that it parses. The logic is intentionally left as posted, because it is the
# subject of the question; known problems are marked with NOTE(review).

# Site root; every path below is appended to it.
url = 'https://www.pro-football-reference.com'
# NOTE(review): this header dict is defined but never passed to requests.get().
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
}
year = 2018

# Download and parse the season-wide fantasy page.
r = requests.get(url + '/years/' + str(year) + '/fantasy.htm')
soup = BeautifulSoup(r.content, 'lxml')


# Every <td class="left" data-stat="player"> cell; the links inside them are
# turned into one game-log URL per player.
player_list = soup.find_all('td', attrs={'class': 'left', 'data-stat': 'player'})
player_page = []
for player in player_list:
    for link in player.find_all('a', href=True):
        #names = str(link['href'])strip('')
        # NOTE(review): str.strip('.htm') removes any of the characters
        # '.', 'h', 't', 'm' from both ends, not the literal '.htm' suffix.
        link = str(link['href'].strip('.htm'))
        player_page.append(url + link + '/gamelog' + '/' + str(year))



# NOTE(review): `dfs` is rebound on every iteration, so after this loop it only
# holds the tables of the LAST page; nothing is accumulated across players.
for page in player_page:
    dfs = pd.read_html(page)

# Collect the tables left in `dfs` and write them to a single Excel sheet.
yearly_stats = []
for df in dfs:
    yearly_stats.append(df)
final_stats = pd.concat(yearly_stats)
final_stats.to_excel('Fantasy2018.xlsx')

CodePudding user response:

This works. The table columns change according to the player's position, I believe. Not everyone has tackle information, for example.

import pandas as pd
from bs4 import BeautifulSoup
import requests
import pandas as pd


# Site root; every path below is appended to it.
url = 'https://www.pro-football-reference.com'
year = 2018


def gamelog_url(href, season):
    """Return the game-log URL for a player link taken from the fantasy table.

    ``href`` is the site-relative link found in the table, for example
    ``/players/G/GurlTo01.htm``. Only a trailing ``.htm`` suffix is removed;
    ``str.strip('.htm')`` would instead strip any of the characters
    ``.``, ``h``, ``t`` and ``m`` from both ends of the string.
    """
    suffix = '.htm'
    if href.endswith(suffix):
        href = href[:-len(suffix)]
    return url + href + '/gamelog' + '/' + str(season)


def flatten_columns(columns):
    """Collapse a column index into a list of single-level labels.

    For a ``MultiIndex`` the levels of each column are joined with ``|``.
    Levels whose label contains ``Unnamed`` (the placeholder pandas generates
    for empty header cells) are blanked first, and leading/trailing ``|`` are
    stripped. A single-level index is returned unchanged, as a list.
    """
    if not isinstance(columns, pd.MultiIndex):
        return list(columns)
    return [
        '|'.join('' if 'Unnamed' in str(part) else str(part) for part in col).strip('|')
        for col in columns
    ]


def tidy_gamelog(table, name, moment):
    """Return a cleaned copy of one game-log table.

    The header is flattened, ``Date`` is parsed to datetimes, rows whose
    ``Date`` cannot be parsed are dropped, and two leading columns are added:
    ``Name`` (the player) and ``Moment`` (e.g. ``'Regular Season'``).

    Raises ``KeyError`` when the table has no ``Date`` column.
    The input frame is not modified.
    """
    df = table.copy()
    df.columns = flatten_columns(df.columns)
    # Unparseable values become NaT so the dropna below removes those rows.
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    df = df.dropna(subset=['Date'])
    df.insert(0, 'Name', name)
    df.insert(1, 'Moment', moment)
    return df


def main():
    """Download every player's game log for ``year`` and write one Excel file."""
    r = requests.get(url + '/years/' + str(year) + '/fantasy.htm', timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'lxml')

    player_list = soup.find_all('td', attrs={'class': 'left', 'data-stat': 'player'})

    dfs = []
    for player in player_list:
        for link in player.find_all('a', href=True):
            name = link.getText()
            page = gamelog_url(str(link['href']), year)
            # Fetch the page once; the first table is treated as the regular
            # season and the second (when present) as the playoffs.
            try:
                tables = pd.read_html(page)
            except (ValueError, OSError) as err:
                # ValueError: no tables on the page; OSError: download failed.
                print(f'Skipping {name}: {err}')
                continue
            for table, moment in zip(tables, ('Regular Season', 'Playoffs')):
                try:
                    dfs.append(tidy_gamelog(table, name, moment))
                except KeyError:
                    print(f'Skipping {moment} table for {name}: no Date column')

    if not dfs:
        raise SystemExit('No game logs could be read; nothing to write.')

    # Columns differ between players, so concat aligns them by label.
    dfall = pd.concat(dfs)
    dfall.to_excel('Fantasy2018.xlsx')


if __name__ == '__main__':
    main()