Home > Blockchain >  I have a problem doing web scraping with python in fbref
I have a problem doing web scraping with python in fbref

Time:08-12

This is my first post. I will try to do my best.

I am trying to do web scraping on fbref, but I can't get past two errors: an IndexError ("list index out of range") and a TypeError ("'NoneType' object is not iterable").

Here is my code, in case someone can help me.

# Create the lists of feature names. Each list is passed as ``features`` to
# frame_for_category/get_frame, which look the names up as ``data-stat``
# attribute values in the page's player table.

# Standard stats

stats = ["player","nationality","position","squad","age","birth_year","games","games_starts","minutes",
         "goals","assists","pens_made","pens_att","cards_yellow","cards_red","goals_per90","assists_per90",
         "goals_assists_per90","goals_pens_per90","goals_assists_pens_per90","xg","npxg","xa","xg_per90","xa_per90",
         "xg_xa_per90","npxg_per90","npxg_xa_per90"]

# Shooting
shooting2 = ["minutes_90s","goals","pens_made","pens_att","shots_total","shots_on_target","shots_free_kicks",
             "shots_on_target_pct","shots_total_per90","shots_on_target_per90","goals_per_shot",
             "goals_per_shot_on_target","xg","npxg","npxg_per_shot","xg_net","npxg_net"]

# Passing
passing2 = ["passes_completed","passes","passes_pct","passes_total_distance","passes_progressive_distance",
            "passes_completed_short","passes_short","passes_pct_short","passes_completed_medium","passes_medium",
            "passes_pct_medium","passes_completed_long","passes_long","passes_pct_long","assists","xa","xa_net",
            "assisted_shots","passes_into_final_third","passes_into_penalty_area","crosses_into_penalty_area",
            "progressive_passes"]

# Pass types
passing_types2 = ["passes","passes_live","passes_dead","passes_free_kicks","through_balls","passes_pressure",
                  "passes_switches","crosses","corner_kicks","corner_kicks_in","corner_kicks_out","corner_kicks_straight",
                  "passes_ground","passes_low","passes_high","passes_left_foot","passes_right_foot","passes_head",
                  "throw_ins","passes_other_body","passes_completed","passes_offsides","passes_oob","passes_intercepted",
                  "passes_blocked"]


# Goal and shot creation (gca)
gca2 = ["sca","sca_per90","sca_passes_live","sca_passes_dead","sca_dribbles","sca_shots","sca_fouled", "sca_defense", 
        "gca","gca_per90","gca_passes_live","gca_passes_dead","gca_dribbles","gca_shots","gca_fouled", "gca_defense"]

# Defensive actions
defense2 = ["tackles","tackles_won","tackles_def_3rd","tackles_mid_3rd","tackles_att_3rd","dribble_tackles",
            "dribbles_vs","dribble_tackles_pct","dribbled_past","pressures","pressure_regains","pressure_regain_pct",
            "pressures_def_3rd","pressures_mid_3rd","pressures_att_3rd","blocks","blocked_shots","blocked_shots_saves",
            "blocked_passes","interceptions","clearances","errors"]

# Possession
possession2 = ["touches","touches_def_pen_area","touches_def_3rd","touches_mid_3rd","touches_att_3rd",
               "touches_att_pen_area","touches_live_ball","dribbles_completed","dribbles","dribbles_completed_pct",
               "players_dribbled_past","nutmegs","carries","carry_distance","carry_progressive_distance",
               "progressive_carries","carries_into_final_third","carries_into_penalty_area","pass_targets",
               "passes_received","passes_received_pct","miscontrols","dispossessed"]

# Playing time
# NOTE(review): not referenced by the functions visible in this script.
playingtime2 = ["games","minutes","minutes_per_game","minutes_pct","games_starts","minutes_per_start","games_subs",
                "minutes_per_sub","unused_subs","points_per_match","on_goals_for","on_goals_against","plus_minus",
                "plus_minus_per90","plus_minus_wowy","on_xg_for","on_xg_against","xg_plus_minus","xg_plus_minus_per90",
                "xg_plus_minus_wowy"]

# Miscellaneous match events (misc)
misc2 = ["cards_yellow","cards_red","cards_yellow_red","fouls","fouled","offsides","crosses","interceptions",
         "tackles_won","pens_won","pens_conceded","own_goals","ball_recoveries","aerials_won","aerials_lost",
         "aerials_won_pct"]

# Goalkeepers
keepers = ["player","nationality","position","squad","age","birth_year","games_gk","games_starts_gk",
           "minutes_gk","goals_against_gk","goals_against_per90_gk","shots_on_target_against","saves",
           "save_pct","wins_gk","draws_gk","losses_gk","clean_sheets","clean_sheets_pct","pens_att_gk",
           "pens_allowed","pens_saved","pens_missed_gk"]

# Advanced goalkeeping
keepersadv2 = ["minutes_90s","goals_against_gk","pens_allowed","free_kick_goals_against_gk","corner_kick_goals_against_gk",
               "own_goals_against_gk","psxg_gk","psnpxg_per_shot_on_target_against","psxg_net_gk","psxg_net_per90_gk",
               "passes_completed_launched_gk","passes_launched_gk","passes_pct_launched_gk","passes_gk","passes_throws_gk",
               "pct_passes_launched_gk","passes_length_avg_gk","goal_kicks","pct_goal_kicks_launched",
               "goal_kick_length_avg","crosses_gk","crosses_stopped_gk","crosses_stopped_pct_gk",
               "def_actions_outside_pen_area_gk","def_actions_outside_pen_area_per90_gk","avg_distance_def_actions_gk"]

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import sys, getopt
import csv
import seaborn as sns
import matplotlib.pyplot as plt

def countdown(time_sec):
    """Print a live ``MM:SS`` countdown, sleeping one second per tick.

    Args:
        time_sec: Whole number of seconds to wait. Zero or a negative value
            skips the waiting and only prints the completion message.
    """
    # ``time`` is not among the script's top-level imports, so the original
    # ``time.sleep`` call raised NameError; import it locally so the function
    # works on its own.
    import time

    # ``> 0`` (rather than plain truthiness) keeps a negative argument from
    # counting down forever.
    while time_sec > 0:
        mins, secs = divmod(time_sec, 60)
        # '\r' moves the cursor back to the start of the line so each tick
        # overwrites the previous one instead of scrolling.
        print('\r{:02d}:{:02d}'.format(mins, secs), end='')
        time.sleep(1)
        time_sec -= 1
    print('\r{:02d}:{:02d} - Wait time elapsed. Will begin again...\n'.format(0, 0), end='')


#Functions to get the data in a dataframe using BeautifulSoup

def get_tables(url,text):
    """Download a page and return its player table plus one squad table.

    Args:
        url: Address of the page to fetch.
        text: ``'for'`` to get the squad table, ``'against'`` to get the
            opponent ("vs") squad table.

    Returns:
        ``(player_table, team_table)`` when ``text == 'for'`` and
        ``(player_table, team_vs_table)`` when ``text == 'against'``; each
        item is a parsed ``<table>`` element.

    Raises:
        ValueError: If ``text`` is neither ``'for'`` nor ``'against'``, or if
            ``'against'`` is requested but the page has no opponent table.
        IndexError: If the page contains fewer than two tables.
    """
    # Fail fast instead of silently returning None, which the caller would
    # then fail to unpack.
    if text not in ('for', 'against'):
        raise ValueError(f"text must be 'for' or 'against', got {text!r}")

    print(url)
    waitTime = 60
    while True:
        res = requests.get(url)
        if res.status_code == 200:
            break
        print(f'Error - status code: {res.status_code}. Will wait {waitTime} seconds and retry')
        countdown(waitTime)
        # Back off a little longer after every failed attempt.
        waitTime += 15

    # Strip the HTML comment markers so markup wrapped in comments is parsed
    # too (works around comments breaking the parsing).
    comm = re.compile("<!--|-->")
    soup = BeautifulSoup(comm.sub("",res.text),'lxml')
    all_tables = soup.find_all("table")

    if len(all_tables) < 2:
        raise IndexError(
            f'Expected at least 2 tables at {url}, found {len(all_tables)}')

    team_table = all_tables[0]
    if len(all_tables) >= 3:
        # NOTE(review): assumes three-table pages are ordered squad,
        # opponent, player -- confirm against the page being scraped.
        team_vs_table = all_tables[1]
        player_table = all_tables[2]
    else:
        # Two-table pages: squad table followed by the player table.
        team_vs_table = None
        player_table = all_tables[1]

    if text == 'for':
        return player_table, team_table
    if team_vs_table is None:
        raise ValueError(f'No opponent table found at {url}')
    return player_table, team_vs_table

def get_frame(features, player_table):
    """Build a DataFrame with one row per data row of ``player_table``.

    Args:
        features: ``data-stat`` attribute values to extract; each becomes a
            column, in the order given.
        player_table: Parsed table element exposing ``find_all('tr')`` whose
            rows expose ``find(name, attrs)``.

    Returns:
        pandas.DataFrame with one column per feature. The descriptive columns
        (player, nationality, position, squad, age, birth_year) hold strings;
        every other column holds floats. Blank cells become ``'0'`` / ``0.0``.
        A cell that is absent from a row becomes ``None`` / ``NaN`` instead
        of crashing on ``None.text``.
    """
    # Columns that stay as text; everything else is converted to float.
    text_features = {'player', 'nationality', 'position', 'squad', 'age', 'birth_year'}
    pre_df_player = dict()
    for row in player_table.find_all('tr'):
        # Only rows that contain a <th scope="row"> are processed.
        if row.find('th', {"scope": "row"}) is None:
            continue
        for f in features:
            is_text = f in text_features
            cell = row.find("td", {"data-stat": f})
            if cell is None:
                # Keep every column the same length so the frame can still
                # be built when a stat is missing from this table.
                value = None if is_text else float('nan')
            else:
                text = cell.text.strip() or '0'
                # Numbers may carry thousands separators, e.g. "1,234".
                value = text if is_text else float(text.replace(',', ''))
            pre_df_player.setdefault(f, []).append(value)
    return pd.DataFrame.from_dict(pre_df_player)

def frame_for_category(category,top,end,features):
    """Fetch one category page and return its player table as a DataFrame.

    Args:
        category: URL path segment naming the category (e.g. ``'shooting'``).
        top: URL prefix placed before the category.
        end: URL suffix placed after the category.
        features: ``data-stat`` names handed on to ``get_frame``.

    Returns:
        The DataFrame produced by ``get_frame`` for the page's player table.
    """
    # The page address is the three pieces joined in order (the ``+``
    # operators were missing, which made this line a syntax error).
    url = top + category + end
    player_table, _team_table = get_tables(url,'for')
    return get_frame(features, player_table)

def get_outfield_data(top, end):
    """Collect the outfield category pages into one wide DataFrame.

    Each category page is fetched in turn via ``frame_for_category`` and the
    resulting frames are placed side by side; when a column name occurs more
    than once, only its first occurrence is kept.

    Args:
        top: URL prefix placed before the category name.
        end: URL suffix placed after the category name.
    """
    # (category path segment, feature list) pairs, in fetch order.
    pages = (
        ('stats', stats),
        ('shooting', shooting2),
        ('passing', passing2),
        ('passing_types', passing_types2),
        ('gca', gca2),
        ('defense', defense2),
        ('possession', possession2),
        ('misc', misc2),
    )
    frames = [frame_for_category(name, top, end, cols) for name, cols in pages]
    combined = pd.concat(frames, axis=1)
    first_occurrence = ~combined.columns.duplicated()
    return combined.loc[:, first_occurrence]
def get_keeper_data(top,end):
    """Collect the goalkeeper category pages into one wide DataFrame.

    The frames are placed side by side; when a column name occurs more than
    once, only its first occurrence is kept.

    Args:
        top: URL prefix placed before the category name.
        end: URL suffix placed after the category name.
    """
    # (category path segment, feature list) pairs, in fetch order.
    pages = (
        ('keepers', keepers),
        ('keepersadv', keepersadv2),
        ('passing_types', passing_types2),
    )
    frames = [frame_for_category(name, top, end, cols) for name, cols in pages]
    combined = pd.concat(frames, axis=1)
    first_occurrence = ~combined.columns.duplicated()
    return combined.loc[:, first_occurrence]

# Scrape each season, then append the season label to every player name
# before stacking the seasons. (The ``+`` operators had been lost, which made
# the assignments below syntax errors.)
df_2018 = get_outfield_data('https://fbref.com/en/comps/Big5/2017-2018/','/players/2017-2018-Big-5-European-Leagues-Stats')
df_2018["player"] = df_2018["player"] + ', 2017-18'
df_2019 = get_outfield_data('https://fbref.com/en/comps/Big5/2018-2019/','/players/2018-2019-Big-5-European-Leagues-Stats')
df_2019["player"] = df_2019["player"] + ', 2018-19'
df_2020 = get_outfield_data('https://fbref.com/en/comps/Big5/2019-2020/','/players/2019-2020-Big-5-European-Leagues-Stats')
df_2020["player"] = df_2020["player"] + ', 2019-20'
df_2021 = get_outfield_data('https://fbref.com/en/comps/Big5/2020-2021/','/players/2020-2021-Big-5-European-Leagues-Stats')
df_2021["player"] = df_2021["player"] + ', 2020-21'
df = pd.concat([df_2018, df_2019, df_2020, df_2021])

# Bare expression: shows the first rows when run in a notebook or REPL.
df.head()


I am using this for my TFM (master's thesis) and I would like to know where the problem is; I have looked at several pages and none of the solutions has worked for me.

I hope you can help me

Thanks! :)

CodePudding user response:

@chitown88, I have changed the code, but now I get a new error at player_table = all_tables[2]:

IndexError: list index out of range

What happens here?

# Create the lists of feature names. Each list is passed as ``features`` to
# frame_for_category/get_frame, which look the names up as ``data-stat``
# attribute values in the page's player table.

# Standard stats

stats = ["player","nationality","position","squad","age","birth_year","games","games_starts","minutes",
         "goals","assists","pens_made","pens_att","cards_yellow","cards_red","goals_per90","assists_per90",
         "goals_assists_per90","goals_pens_per90","goals_assists_pens_per90","xg","npxg","xa","xg_per90","xa_per90",
         "xg_xa_per90","npxg_per90","npxg_xa_per90"]

# Shooting
shooting2 = ["minutes_90s","goals","pens_made","pens_att","shots_total","shots_on_target","shots_free_kicks",
             "shots_on_target_pct","shots_total_per90","shots_on_target_per90","goals_per_shot",
             "goals_per_shot_on_target","xg","npxg","npxg_per_shot","xg_net","npxg_net"]

# Passing
passing2 = ["passes_completed","passes","passes_pct","passes_total_distance","passes_progressive_distance",
            "passes_completed_short","passes_short","passes_pct_short","passes_completed_medium","passes_medium",
            "passes_pct_medium","passes_completed_long","passes_long","passes_pct_long","assists","xa","xa_net",
            "assisted_shots","passes_into_final_third","passes_into_penalty_area","crosses_into_penalty_area",
            "progressive_passes"]

# Pass types
passing_types2 = ["passes","passes_live","passes_dead","passes_free_kicks","through_balls","passes_pressure",
                  "passes_switches","crosses","corner_kicks","corner_kicks_in","corner_kicks_out","corner_kicks_straight",
                  "passes_ground","passes_low","passes_high","passes_left_foot","passes_right_foot","passes_head",
                  "throw_ins","passes_other_body","passes_completed","passes_offsides","passes_oob","passes_intercepted",
                  "passes_blocked"]


# Goal and shot creation (gca)
gca2 = ["sca","sca_per90","sca_passes_live","sca_passes_dead","sca_dribbles","sca_shots","sca_fouled", "sca_defense", 
        "gca","gca_per90","gca_passes_live","gca_passes_dead","gca_dribbles","gca_shots","gca_fouled", "gca_defense"]

# Defensive actions
defense2 = ["tackles","tackles_won","tackles_def_3rd","tackles_mid_3rd","tackles_att_3rd","dribble_tackles",
            "dribbles_vs","dribble_tackles_pct","dribbled_past","pressures","pressure_regains","pressure_regain_pct",
            "pressures_def_3rd","pressures_mid_3rd","pressures_att_3rd","blocks","blocked_shots","blocked_shots_saves",
            "blocked_passes","interceptions","clearances","errors"]

# Possession
possession2 = ["touches","touches_def_pen_area","touches_def_3rd","touches_mid_3rd","touches_att_3rd",
               "touches_att_pen_area","touches_live_ball","dribbles_completed","dribbles","dribbles_completed_pct",
               "players_dribbled_past","nutmegs","carries","carry_distance","carry_progressive_distance",
               "progressive_carries","carries_into_final_third","carries_into_penalty_area","pass_targets",
               "passes_received","passes_received_pct","miscontrols","dispossessed"]

# Playing time
# NOTE(review): not referenced by the functions visible in this script.
playingtime2 = ["games","minutes","minutes_per_game","minutes_pct","games_starts","minutes_per_start","games_subs",
                "minutes_per_sub","unused_subs","points_per_match","on_goals_for","on_goals_against","plus_minus",
                "plus_minus_per90","plus_minus_wowy","on_xg_for","on_xg_against","xg_plus_minus","xg_plus_minus_per90",
                "xg_plus_minus_wowy"]

# Miscellaneous match events (misc)
misc2 = ["cards_yellow","cards_red","cards_yellow_red","fouls","fouled","offsides","crosses","interceptions",
         "tackles_won","pens_won","pens_conceded","own_goals","ball_recoveries","aerials_won","aerials_lost",
         "aerials_won_pct"]

# Goalkeepers
keepers = ["player","nationality","position","squad","age","birth_year","games_gk","games_starts_gk",
           "minutes_gk","goals_against_gk","goals_against_per90_gk","shots_on_target_against","saves",
           "save_pct","wins_gk","draws_gk","losses_gk","clean_sheets","clean_sheets_pct","pens_att_gk",
           "pens_allowed","pens_saved","pens_missed_gk"]

# Advanced goalkeeping
keepersadv2 = ["minutes_90s","goals_against_gk","pens_allowed","free_kick_goals_against_gk","corner_kick_goals_against_gk",
               "own_goals_against_gk","psxg_gk","psnpxg_per_shot_on_target_against","psxg_net_gk","psxg_net_per90_gk",
               "passes_completed_launched_gk","passes_launched_gk","passes_pct_launched_gk","passes_gk","passes_throws_gk",
               "pct_passes_launched_gk","passes_length_avg_gk","goal_kicks","pct_goal_kicks_launched",
               "goal_kick_length_avg","crosses_gk","crosses_stopped_gk","crosses_stopped_pct_gk",
               "def_actions_outside_pen_area_gk","def_actions_outside_pen_area_per90_gk","avg_distance_def_actions_gk"]

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import sys, getopt
import csv
import seaborn as sns
import matplotlib.pyplot as plt

def countdown(time_sec):
    """Print a live ``MM:SS`` countdown, sleeping one second per tick.

    Args:
        time_sec: Whole number of seconds to wait. Zero or a negative value
            skips the waiting and only prints the completion message.
    """
    # ``time`` is not among the script's top-level imports, so the original
    # ``time.sleep`` call raised NameError; import it locally so the function
    # works on its own.
    import time

    # ``> 0`` (rather than plain truthiness) keeps a negative argument from
    # counting down forever.
    while time_sec > 0:
        mins, secs = divmod(time_sec, 60)
        # '\r' moves the cursor back to the start of the line so each tick
        # overwrites the previous one instead of scrolling.
        print('\r{:02d}:{:02d}'.format(mins, secs), end='')
        time.sleep(1)
        time_sec -= 1
    print('\r{:02d}:{:02d} - Wait time elapsed. Will begin again...\n'.format(0, 0), end='')


#Functions to get the data in a dataframe using BeautifulSoup

def get_tables(url,text):
    """Download a page and return its player table plus one squad table.

    Args:
        url: Address of the page to fetch.
        text: ``'for'`` to get the squad table, ``'against'`` to get the
            opponent ("vs") squad table.

    Returns:
        ``(player_table, team_table)`` when ``text == 'for'`` and
        ``(player_table, team_vs_table)`` when ``text == 'against'``; each
        item is a parsed ``<table>`` element.

    Raises:
        ValueError: If ``text`` is neither ``'for'`` nor ``'against'``, or if
            ``'against'`` is requested but the page has no opponent table.
        IndexError: If the page contains fewer than two tables.
    """
    # Fail fast instead of silently returning None, which the caller would
    # then fail to unpack.
    if text not in ('for', 'against'):
        raise ValueError(f"text must be 'for' or 'against', got {text!r}")

    print(url)
    waitTime = 60
    while True:
        res = requests.get(url)
        if res.status_code == 200:
            break
        print(f'Error - status code: {res.status_code}. Will wait {waitTime} seconds and retry')
        countdown(waitTime)
        # Back off a little longer after every failed attempt.
        waitTime += 15

    # Strip the HTML comment markers so markup wrapped in comments is parsed
    # too (works around comments breaking the parsing).
    comm = re.compile("<!--|-->")
    soup = BeautifulSoup(comm.sub("",res.text),'lxml')
    all_tables = soup.find_all("table")

    if len(all_tables) < 2:
        raise IndexError(
            f'Expected at least 2 tables at {url}, found {len(all_tables)}')

    team_table = all_tables[0]
    if len(all_tables) >= 3:
        # Three-table layout: squad, opponent, player.
        team_vs_table = all_tables[1]
        player_table = all_tables[2]
    else:
        # Pages with only two tables have no opponent table; indexing
        # all_tables[2] unconditionally is what raised
        # "IndexError: list index out of range".
        # NOTE(review): assumes the second table is then the player table --
        # confirm against the page being scraped.
        team_vs_table = None
        player_table = all_tables[1]

    if text == 'for':
        return player_table, team_table
    if team_vs_table is None:
        raise ValueError(f'No opponent table found at {url}')
    return player_table, team_vs_table

def get_frame(features, player_table):
    """Build a DataFrame with one row per data row of ``player_table``.

    Args:
        features: ``data-stat`` attribute values to extract; each becomes a
            column, in the order given.
        player_table: Parsed table element exposing ``find_all('tr')`` whose
            rows expose ``find(name, attrs)``.

    Returns:
        pandas.DataFrame with one column per feature. The descriptive columns
        (player, nationality, position, squad, age, birth_year) hold strings;
        every other column holds floats. Blank cells become ``'0'`` / ``0.0``.
        A cell that is absent from a row becomes ``None`` / ``NaN`` instead
        of crashing on ``None.text``.
    """
    # Columns that stay as text; everything else is converted to float.
    text_features = {'player', 'nationality', 'position', 'squad', 'age', 'birth_year'}
    pre_df_player = dict()
    for row in player_table.find_all('tr'):
        # Only rows that contain a <th scope="row"> are processed.
        if row.find('th', {"scope": "row"}) is None:
            continue
        for f in features:
            is_text = f in text_features
            cell = row.find("td", {"data-stat": f})
            if cell is None:
                # Keep every column the same length so the frame can still
                # be built when a stat is missing from this table.
                value = None if is_text else float('nan')
            else:
                text = cell.text.strip() or '0'
                # Numbers may carry thousands separators, e.g. "1,234".
                value = text if is_text else float(text.replace(',', ''))
            pre_df_player.setdefault(f, []).append(value)
    return pd.DataFrame.from_dict(pre_df_player)

def frame_for_category(category,top,end,features):
    """Fetch one category page and return its player table as a DataFrame.

    Args:
        category: URL path segment naming the category (e.g. ``'shooting'``).
        top: URL prefix placed before the category.
        end: URL suffix placed after the category.
        features: ``data-stat`` names handed on to ``get_frame``.

    Returns:
        The DataFrame produced by ``get_frame`` for the page's player table.
    """
    # The page address is the three pieces joined in order (the ``+``
    # operators were missing, which made this line a syntax error).
    url = top + category + end
    player_table, _team_table = get_tables(url,'for')
    return get_frame(features, player_table)

def get_outfield_data(top, end):
    """Collect the outfield category pages into one wide DataFrame.

    Each category page is fetched in turn via ``frame_for_category`` and the
    resulting frames are placed side by side; when a column name occurs more
    than once, only its first occurrence is kept.

    Args:
        top: URL prefix placed before the category name.
        end: URL suffix placed after the category name.
    """
    # (category path segment, feature list) pairs, in fetch order.
    pages = (
        ('stats', stats),
        ('shooting', shooting2),
        ('passing', passing2),
        ('passing_types', passing_types2),
        ('gca', gca2),
        ('defense', defense2),
        ('possession', possession2),
        ('misc', misc2),
    )
    frames = [frame_for_category(name, top, end, cols) for name, cols in pages]
    combined = pd.concat(frames, axis=1)
    first_occurrence = ~combined.columns.duplicated()
    return combined.loc[:, first_occurrence]
def get_keeper_data(top,end):
    """Collect the goalkeeper category pages into one wide DataFrame.

    The frames are placed side by side; when a column name occurs more than
    once, only its first occurrence is kept.

    Args:
        top: URL prefix placed before the category name.
        end: URL suffix placed after the category name.
    """
    # (category path segment, feature list) pairs, in fetch order.
    pages = (
        ('keepers', keepers),
        ('keepersadv', keepersadv2),
        ('passing_types', passing_types2),
    )
    frames = [frame_for_category(name, top, end, cols) for name, cols in pages]
    combined = pd.concat(frames, axis=1)
    first_occurrence = ~combined.columns.duplicated()
    return combined.loc[:, first_occurrence]

# Scrape each season, then append the season label to every player name
# before stacking the seasons. (The ``+`` operators had been lost, which made
# the assignments below syntax errors.)
df_2018 = get_outfield_data('https://fbref.com/en/comps/Big5/2017-2018/','/players/2017-2018-Big-5-European-Leagues-Stats')
df_2018["player"] = df_2018["player"] + ', 2017-18'
df_2019 = get_outfield_data('https://fbref.com/en/comps/Big5/2018-2019/','/players/2018-2019-Big-5-European-Leagues-Stats')
df_2019["player"] = df_2019["player"] + ', 2018-19'
df_2020 = get_outfield_data('https://fbref.com/en/comps/Big5/2019-2020/','/players/2019-2020-Big-5-European-Leagues-Stats')
df_2020["player"] = df_2020["player"] + ', 2019-20'
df_2021 = get_outfield_data('https://fbref.com/en/comps/Big5/2020-2021/','/players/2020-2021-Big-5-European-Leagues-Stats')
df_2021["player"] = df_2021["player"] + ', 2020-21'
df = pd.concat([df_2018, df_2019, df_2020, df_2021])

# Bare expression: shows the first rows when run in a notebook or REPL.
df.head()

CodePudding user response:

You're doing an awful lot of work here, and it's also very difficult to follow your code. Let pandas do all this. All you need to iterate through are the different season urls with each category.

import re
from io import StringIO

import pandas as pd
import requests

# Collect one merged player table per season, keyed by the season string.
season_dfs = {}
for season in ['2017-2018', '2018-2019', '2019-2020', '2020-2021']:
    url = f'https://fbref.com/en/comps/Big5/{season}/stats/players/{season}-Big-5-European-Leagues-Stats'
    res = requests.get(url).text
    # Strip the HTML comment markers so tables wrapped in comments are
    # visible to the parser.
    htmlStr = res.replace('<!--', '').replace('-->', '')

    # Wrap the markup in StringIO: passing a literal HTML string to
    # read_html is deprecated in recent pandas releases.
    dfs = pd.read_html(StringIO(htmlStr), header=1)

    team_table = dfs[0]
    player_table = dfs[1]
    # Drop rows whose 'Rk' value is the literal header text 'Rk'. The
    # .copy() makes the 'Season' assignment below write to an independent
    # frame rather than to a slice of dfs[1].
    player_table = player_table[player_table['Rk'].ne('Rk')].copy()
    player_table['Season'] = season

    for cat in ['shooting', 'passing', 'gca', 'defense', 'possession', 'misc', 'keepers', 'keepersadv', 'passing_types']:
        print(cat)
        cat_url = f'https://fbref.com/en/comps/Big5/{season}/{cat}/players/{season}-Big-5-European-Leagues-Stats'
        resp = requests.get(cat_url).text
        # Parse the category page that was just fetched. The original code
        # re-used ``res`` here, so every category re-parsed the standard
        # stats page and contributed no new columns.
        htmlStr = resp.replace('<!--', '').replace('-->', '')
        temp_df = pd.read_html(StringIO(htmlStr), header=1)[1]
        temp_df = temp_df[temp_df['Rk'].ne('Rk')]

        # Keep the join key plus only the columns not collected so far.
        newCols = ['Player'] + [x for x in temp_df.columns if x not in player_table.columns]
        temp_df = temp_df[newCols]

        # NOTE(review): 'Player' alone is not unique within a season (the
        # sample output lists Rolando Aarons twice), so this outer merge can
        # multiply rows -- consider a stricter join key.
        player_table = pd.merge(player_table, temp_df, how='outer', on='Player')

    season_dfs[season] = player_table
    print('Collected: ', season)

# Stack the seasons, drop exact duplicate rows and renumber the index.
results = pd.concat(list(season_dfs.values()))
results = results.drop_duplicates()
results = results.reset_index(drop=True)

Output:

print(results)
         Rk               Player   Nation  ... npxG xA.1  Matches     Season
0         1  Patrick van Aanholt   nl NED  ...      0.21  Matches  2017-2018
1         2       Rolando Aarons  eng ENG  ...      0.08  Matches  2017-2018
2         3       Rolando Aarons  eng ENG  ...      0.10  Matches  2017-2018
3         4        Ignazio Abate   it ITA  ...      0.07  Matches  2017-2018
4         5      Aymen Abdennour   tn TUN  ...      0.02  Matches  2017-2018
    ...                  ...      ...  ...       ...      ...        ...
10896  2818           Kévin Zohi   ml MLI  ...      0.27  Matches  2020-2021
10897  2819           Kurt Zouma   fr FRA  ...      0.08  Matches  2020-2021
10898  2820        Igor Zubeldia   es ESP  ...      0.08  Matches  2020-2021
10899  2821         Steven Zuber   ch SUI  ...      0.41  Matches  2020-2021
10900  2822     Martín Zubimendi   es ESP  ...      0.05  Matches  2020-2021
  • Related