Creating a table through a list for Pandas-CodePudding

I am having a heck of a time turning data that i have into a dataframe through Pandas. I feel like this is far from a difficult task but i can't seem to figure it out. I have the headers i want for the dataframe and i have the data but this is data from the web. I know i need to turn it into a list and then put that into a DataFrame function but i am unable to figure out how to put this thing into a list.

from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup
import pandas as pd


PATH = "C:\Program Files (x86)\Chrome\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(PATH)

driver.get("https://www.espn.com/golf/leaderboard?tournamentId=401353232")

number_of_players = 52
round_to_select = 3

for idx,down_arrow in enumerate(driver.find_elements(By.CSS_SELECTOR, '.Table__TD:first-child')):
    if idx < number_of_players:
        down_arrow.click()
        time.sleep(.5)
    else:
        break

if round_to_select < 4:
    for idx,menu in enumerate(driver.find_elements(By.CSS_SELECTOR, '.competitors select[class=dropdown__select]')):
        if idx < number_of_players:
            Select(menu).select_by_visible_text(f'Round {round_to_select}')
            time.sleep(.5)
        else:
            break

R1_page_source = driver.page_source

R1_soup = BeautifulSoup(R1_page_source, 'html.parser')

R1_leaderboard = R1_soup.find('table' , class_ = 'Table Table--align-right Full__Table')

for R1_player in R1_leaderboard.find_all('tbody'):
    R1_rows = R1_player.find_all('tr' , class_ = 'Table__TD--PlayerDetail Table__TR Table__even')
    for R1_row in R1_rows:
        R1_Tournament = R1_soup.find('h1' , class_ = 'headline headline__h1 Leaderboard__Event__Title').text
        R1_Course = R1_soup.find('div' , class_ = 'Leaderboard__Course__Location n8 clr-gray-04').text
        R1_Players = R1_row.find('a').text
        R1_Round = R1_row.find_all("select")[1].text
        R1_H1 = R1_row.find_all('span')[1].text
        R1_H2 = R1_row.find_all('span')[2].text
        R1_H3 = R1_row.find_all('span')[3].text
        R1_H4 = R1_row.find_all('span')[4].text
        R1_H5 = R1_row.find_all('span')[5].text
        R1_H6 = R1_row.find_all('span')[6].text
        R1_H7 = R1_row.find_all('span')[7].text
        R1_H8 = R1_row.find_all('span')[8].text
        R1_H9 = R1_row.find_all('span')[9].text
        R1_H10 = R1_row.find_all('span')[11].text
        R1_H11 = R1_row.find_all('span')[12].text
        R1_H12 = R1_row.find_all('span')[13].text
        R1_H13 = R1_row.find_all('span')[14].text
        R1_H14 = R1_row.find_all('span')[15].text
        R1_H15 = R1_row.find_all('span')[16].text
        R1_H16 = R1_row.find_all('span')[17].text
        R1_H17 = R1_row.find_all('span')[18].text
        R1_H18 = R1_row.find_all('span')[19].text
        print(R1_Players, R1_Tournament, R1_Course, R1_Round, R1_H1, R1_H2, R1_H3, R1_H4, R1_H5, R1_H6, R1_H7, R1_H8, R1_H9, R1_H10, R1_H11, R1_H12, R1_H13, R1_H14, R1_H15, R1_H16, R1_H17, R1_H18)

CodePudding user response：

here's a modified version of your code. i used pandas.read_html to turn the html table into a dataframe.

from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup
import pandas as pd

# i'm using colab to do this so i'm setting up the driver differently
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

driver = webdriver.Chrome(options=options)

driver.get("https://www.espn.com/golf/leaderboard?tournamentId=401353232")

number_of_players = 52
round_to_select = 3

# i had to add a try loop here bc i kept getting errors on down_arrow.click
for idx,down_arrow in enumerate(driver.find_elements(By.CSS_SELECTOR, '.Table__TD:first-child')):
  if idx < number_of_players:
    try:
      down_arrow.click()
      time.sleep(.5)
    except:
      pass
  else:
      break

if round_to_select < 4:
  for idx,menu in enumerate(driver.find_elements(By.CSS_SELECTOR, '.competitors select[class=dropdown__select]')):
    if idx < number_of_players:
      try:
        Select(menu).select_by_visible_text(f'Round {round_to_select}')
        time.sleep(.5)
      except:
        pass
    else:
        break

R1_page_source = driver.page_source
R1_soup = BeautifulSoup(R1_page_source, 'html.parser')
R1_leaderboard = R1_soup.find('table' , class_ = 'Table Table--align-right Full__Table')
R1_df = pd.read_html(R1_leaderboard.prettify())[0]

prettify turns the bs4 object into a normal string that pandas can process. read_html actually returns a list of dataframes - but in this case there's only one, which is why i added [0] on the end. when i run R1_df i get this:

    Unnamed: 0  POS     PLAYER  SCORE   R1  R2  R3  R4  TOT     EARNINGS    FEDEX PTS
0   NaN     1   Scottie Scheffler   -10     69  67  71  71  278     $2,700,000  600
1   NaN     2   Rory McIlroy    -7  73  73  71  64  281     $1,620,000  330
2   NaN     T3  Shane Lowry     -5  73  68  73  69  283     $870,000    180
3   NaN     T3  Cameron Smith   -5  68  74  68  73  283     $870,000    180
4   NaN     5   Collin Morikawa     -4  73  70  74  67  284     $600,000    120
...     ...     ...     ...     ...     ...     ...     ...     ...     ...     ...     ...
86  NaN     -   Stewart Hagestad (a)    CUT     79  81  --  --  160     --  0
87  NaN     -   José María Olazábal     CUT     77  84  --  --  161     --  0
88  NaN     -   Laird Shepherd (a)  CUT     81  85  --  --  166     --  0
89  NaN     -   Louis Oosthuizen    WD  76  --  --  --  76  --  0
90  NaN     -   Paul Casey  WD  --  --  --  --  --  --  0

hopefully this is what you were looking for! because pandas can process html directly, there wasn't actually a need to make any intermediary lists.

CodePudding user response：

As stated, you could have Selenium click through each, then use pandas' .read_html() to parse the tables. However, there's an espn api, and if there is an api available, it's far better (more robust and efficient) to fetch the data that way as opposed to using Selenium. There's also far more data than what's shown on the site that you can get too. Basically as long as you have the tournment number/id, you feed that in, get the player ids form the leader board, then iterate through those:

import requests
import pandas as pd


tournamentId = '401353232'

url = 'https://site.web.api.espn.com/apis/site/v2/sports/golf/leaderboard'
payload = {
    'league': 'pga',
    'region': 'us',
    'lang': 'en',
    'event': '%s' %tournamentId}
jsonData = requests.get(url, params=payload).json()
tournament = jsonData['events'][0]['name']
course = jsonData['events'][0]['courses'][0]['name']

print(tournament, ' - ', course)


url = 'https://site.web.api.espn.com/apis/site/v2/sports/golf/pga/leaderboard/players'
payload = {
    'region': 'us',
    'lang': 'en',
    'event': '%s' %tournamentId}
jsonData = requests.get(url, params=payload).json()
leaderboard = jsonData['leaderboard']


payload = {
    'region': 'us',
    'lang': 'en',
    'season': '2022'}

results = pd.DataFrame()
for player in leaderboard:
    playerName = player['fullName']
    playerId = player['id']
    
    url = f'https://site.web.api.espn.com/apis/site/v2/sports/golf/pga/leaderboard/{tournamentId}/competitorsummary/{playerId}'
    rounds = requests.get(url, params=payload).json()['rounds']
    
    for r in rounds:
        period = r['period']
        roundTotal = r['value']
        
        linescores = r['linescores']
        temp_df = pd.DataFrame(linescores)
        temp_df['player'] = playerName
        temp_df['tournament'] = tournament
        temp_df['course'] = course
        temp_df['round'] = period
        
        temp_df = temp_df.pivot(
            index=['player','tournament','course','round'],
            columns='period',
            values='value').reset_index(drop=False)
        
        results = pd.concat([results, temp_df], axis=0).reset_index(drop=True)
    print('Collected: ', playerName)

Output:

print(results.head(10).to_string())
period             player          tournament                      course  round    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18
0       Scottie Scheffler  Masters Tournament  Augusta National Golf Club      1  4.0  5.0  4.0  3.0  4.0  3.0  4.0  4.0  3.0  4.0  4.0  2.0  5.0  4.0  5.0  3.0  3.0  5.0
1       Scottie Scheffler  Masters Tournament  Augusta National Golf Club      2  5.0  4.0  5.0  3.0  4.0  3.0  3.0  4.0  4.0  4.0  4.0  2.0  4.0  4.0  4.0  2.0  4.0  4.0
2       Scottie Scheffler  Masters Tournament  Augusta National Golf Club      3  4.0  4.0  3.0  4.0  4.0  2.0  4.0  4.0  4.0  4.0  4.0  4.0  4.0  5.0  6.0  3.0  3.0  5.0
3       Scottie Scheffler  Masters Tournament  Augusta National Golf Club      4  4.0  5.0  3.0  3.0  4.0  3.0  3.0  5.0  4.0  5.0  4.0  3.0  5.0  3.0  4.0  3.0  4.0  6.0
4            Rory McIlroy  Masters Tournament  Augusta National Golf Club      1  4.0  4.0  4.0  3.0  4.0  4.0  4.0  5.0  4.0  4.0  3.0  3.0  5.0  5.0  5.0  4.0  4.0  4.0
5            Rory McIlroy  Masters Tournament  Augusta National Golf Club      2  4.0  4.0  4.0  3.0  5.0  3.0  4.0  5.0  4.0  5.0  6.0  3.0  4.0  4.0  5.0  2.0  4.0  4.0
6            Rory McIlroy  Masters Tournament  Augusta National Golf Club      3  5.0  5.0  4.0  2.0  4.0  4.0  3.0  5.0  4.0  4.0  4.0  4.0  4.0  4.0  4.0  3.0  4.0  4.0
7            Rory McIlroy  Masters Tournament  Augusta National Golf Club      4  3.0  5.0  3.0  3.0  4.0  3.0  3.0  4.0  4.0  3.0  4.0  3.0  3.0  4.0  5.0  3.0  4.0  3.0
8             Shane Lowry  Masters Tournament  Augusta National Golf Club      1  4.0  5.0  5.0  3.0  4.0  3.0  4.0  5.0  4.0  5.0  4.0  3.0  3.0  3.0  7.0  3.0  4.0  4.0
9             Shane Lowry  Masters Tournament  Augusta National Golf Club      2  5.0  4.0  4.0  3.0  4.0  3.0  3.0  5.0  4.0  3.0  4.0  3.0  4.0  4.0  4.0  3.0  4.0  4.0
....