I am having a heck of a time turning data that i have into a dataframe through Pandas. I feel like this is far from a difficult task but i can't seem to figure it out. I have the headers i want for the dataframe and i have the data but this is data from the web. I know i need to turn it into a list and then put that into a DataFrame function but i am unable to figure out how to put this thing into a list.
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup
import pandas as pd
PATH = "C:\Program Files (x86)\Chrome\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(PATH)
driver.get("https://www.espn.com/golf/leaderboard?tournamentId=401353232")
number_of_players = 52
round_to_select = 3
for idx,down_arrow in enumerate(driver.find_elements(By.CSS_SELECTOR, '.Table__TD:first-child')):
if idx < number_of_players:
down_arrow.click()
time.sleep(.5)
else:
break
if round_to_select < 4:
for idx,menu in enumerate(driver.find_elements(By.CSS_SELECTOR, '.competitors select[class=dropdown__select]')):
if idx < number_of_players:
Select(menu).select_by_visible_text(f'Round {round_to_select}')
time.sleep(.5)
else:
break
R1_page_source = driver.page_source
R1_soup = BeautifulSoup(R1_page_source, 'html.parser')
R1_leaderboard = R1_soup.find('table' , class_ = 'Table Table--align-right Full__Table')
for R1_player in R1_leaderboard.find_all('tbody'):
R1_rows = R1_player.find_all('tr' , class_ = 'Table__TD--PlayerDetail Table__TR Table__even')
for R1_row in R1_rows:
R1_Tournament = R1_soup.find('h1' , class_ = 'headline headline__h1 Leaderboard__Event__Title').text
R1_Course = R1_soup.find('div' , class_ = 'Leaderboard__Course__Location n8 clr-gray-04').text
R1_Players = R1_row.find('a').text
R1_Round = R1_row.find_all("select")[1].text
R1_H1 = R1_row.find_all('span')[1].text
R1_H2 = R1_row.find_all('span')[2].text
R1_H3 = R1_row.find_all('span')[3].text
R1_H4 = R1_row.find_all('span')[4].text
R1_H5 = R1_row.find_all('span')[5].text
R1_H6 = R1_row.find_all('span')[6].text
R1_H7 = R1_row.find_all('span')[7].text
R1_H8 = R1_row.find_all('span')[8].text
R1_H9 = R1_row.find_all('span')[9].text
R1_H10 = R1_row.find_all('span')[11].text
R1_H11 = R1_row.find_all('span')[12].text
R1_H12 = R1_row.find_all('span')[13].text
R1_H13 = R1_row.find_all('span')[14].text
R1_H14 = R1_row.find_all('span')[15].text
R1_H15 = R1_row.find_all('span')[16].text
R1_H16 = R1_row.find_all('span')[17].text
R1_H17 = R1_row.find_all('span')[18].text
R1_H18 = R1_row.find_all('span')[19].text
print(R1_Players, R1_Tournament, R1_Course, R1_Round, R1_H1, R1_H2, R1_H3, R1_H4, R1_H5, R1_H6, R1_H7, R1_H8, R1_H9, R1_H10, R1_H11, R1_H12, R1_H13, R1_H14, R1_H15, R1_H16, R1_H17, R1_H18)
CodePudding user response:
here's a modified version of your code. i used pandas.read_html
to turn the html table into a dataframe.
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup
import pandas as pd
# i'm using colab to do this so i'm setting up the driver differently
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=options)
driver.get("https://www.espn.com/golf/leaderboard?tournamentId=401353232")
number_of_players = 52
round_to_select = 3
# i had to add a try loop here bc i kept getting errors on down_arrow.click
for idx,down_arrow in enumerate(driver.find_elements(By.CSS_SELECTOR, '.Table__TD:first-child')):
if idx < number_of_players:
try:
down_arrow.click()
time.sleep(.5)
except:
pass
else:
break
if round_to_select < 4:
for idx,menu in enumerate(driver.find_elements(By.CSS_SELECTOR, '.competitors select[class=dropdown__select]')):
if idx < number_of_players:
try:
Select(menu).select_by_visible_text(f'Round {round_to_select}')
time.sleep(.5)
except:
pass
else:
break
R1_page_source = driver.page_source
R1_soup = BeautifulSoup(R1_page_source, 'html.parser')
R1_leaderboard = R1_soup.find('table' , class_ = 'Table Table--align-right Full__Table')
R1_df = pd.read_html(R1_leaderboard.prettify())[0]
prettify
turns the bs4 object into a normal string that pandas can process. read_html
actually returns a list of dataframes - but in this case there's only one, which is why i added [0] on the end. when i run R1_df i get this:
Unnamed: 0 POS PLAYER SCORE R1 R2 R3 R4 TOT EARNINGS FEDEX PTS
0 NaN 1 Scottie Scheffler -10 69 67 71 71 278 $2,700,000 600
1 NaN 2 Rory McIlroy -7 73 73 71 64 281 $1,620,000 330
2 NaN T3 Shane Lowry -5 73 68 73 69 283 $870,000 180
3 NaN T3 Cameron Smith -5 68 74 68 73 283 $870,000 180
4 NaN 5 Collin Morikawa -4 73 70 74 67 284 $600,000 120
... ... ... ... ... ... ... ... ... ... ... ...
86 NaN - Stewart Hagestad (a) CUT 79 81 -- -- 160 -- 0
87 NaN - José María Olazábal CUT 77 84 -- -- 161 -- 0
88 NaN - Laird Shepherd (a) CUT 81 85 -- -- 166 -- 0
89 NaN - Louis Oosthuizen WD 76 -- -- -- 76 -- 0
90 NaN - Paul Casey WD -- -- -- -- -- -- 0
hopefully this is what you were looking for! because pandas can process html directly, there wasn't actually a need to make any intermediary lists.
CodePudding user response:
As stated, you could have Selenium click through each, then use pandas
' .read_html()
to parse the tables. However, there's an espn api, and if there is an api available, it's far better (more robust and efficient) to fetch the data that way as opposed to using Selenium. There's also far more data than what's shown on the site that you can get too. Basically as long as you have the tournment number/id, you feed that in, get the player ids form the leader board, then iterate through those:
import requests
import pandas as pd
tournamentId = '401353232'
url = 'https://site.web.api.espn.com/apis/site/v2/sports/golf/leaderboard'
payload = {
'league': 'pga',
'region': 'us',
'lang': 'en',
'event': '%s' %tournamentId}
jsonData = requests.get(url, params=payload).json()
tournament = jsonData['events'][0]['name']
course = jsonData['events'][0]['courses'][0]['name']
print(tournament, ' - ', course)
url = 'https://site.web.api.espn.com/apis/site/v2/sports/golf/pga/leaderboard/players'
payload = {
'region': 'us',
'lang': 'en',
'event': '%s' %tournamentId}
jsonData = requests.get(url, params=payload).json()
leaderboard = jsonData['leaderboard']
payload = {
'region': 'us',
'lang': 'en',
'season': '2022'}
results = pd.DataFrame()
for player in leaderboard:
playerName = player['fullName']
playerId = player['id']
url = f'https://site.web.api.espn.com/apis/site/v2/sports/golf/pga/leaderboard/{tournamentId}/competitorsummary/{playerId}'
rounds = requests.get(url, params=payload).json()['rounds']
for r in rounds:
period = r['period']
roundTotal = r['value']
linescores = r['linescores']
temp_df = pd.DataFrame(linescores)
temp_df['player'] = playerName
temp_df['tournament'] = tournament
temp_df['course'] = course
temp_df['round'] = period
temp_df = temp_df.pivot(
index=['player','tournament','course','round'],
columns='period',
values='value').reset_index(drop=False)
results = pd.concat([results, temp_df], axis=0).reset_index(drop=True)
print('Collected: ', playerName)
Output:
print(results.head(10).to_string())
period player tournament course round 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
0 Scottie Scheffler Masters Tournament Augusta National Golf Club 1 4.0 5.0 4.0 3.0 4.0 3.0 4.0 4.0 3.0 4.0 4.0 2.0 5.0 4.0 5.0 3.0 3.0 5.0
1 Scottie Scheffler Masters Tournament Augusta National Golf Club 2 5.0 4.0 5.0 3.0 4.0 3.0 3.0 4.0 4.0 4.0 4.0 2.0 4.0 4.0 4.0 2.0 4.0 4.0
2 Scottie Scheffler Masters Tournament Augusta National Golf Club 3 4.0 4.0 3.0 4.0 4.0 2.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 5.0 6.0 3.0 3.0 5.0
3 Scottie Scheffler Masters Tournament Augusta National Golf Club 4 4.0 5.0 3.0 3.0 4.0 3.0 3.0 5.0 4.0 5.0 4.0 3.0 5.0 3.0 4.0 3.0 4.0 6.0
4 Rory McIlroy Masters Tournament Augusta National Golf Club 1 4.0 4.0 4.0 3.0 4.0 4.0 4.0 5.0 4.0 4.0 3.0 3.0 5.0 5.0 5.0 4.0 4.0 4.0
5 Rory McIlroy Masters Tournament Augusta National Golf Club 2 4.0 4.0 4.0 3.0 5.0 3.0 4.0 5.0 4.0 5.0 6.0 3.0 4.0 4.0 5.0 2.0 4.0 4.0
6 Rory McIlroy Masters Tournament Augusta National Golf Club 3 5.0 5.0 4.0 2.0 4.0 4.0 3.0 5.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 3.0 4.0 4.0
7 Rory McIlroy Masters Tournament Augusta National Golf Club 4 3.0 5.0 3.0 3.0 4.0 3.0 3.0 4.0 4.0 3.0 4.0 3.0 3.0 4.0 5.0 3.0 4.0 3.0
8 Shane Lowry Masters Tournament Augusta National Golf Club 1 4.0 5.0 5.0 3.0 4.0 3.0 4.0 5.0 4.0 5.0 4.0 3.0 3.0 3.0 7.0 3.0 4.0 4.0
9 Shane Lowry Masters Tournament Augusta National Golf Club 2 5.0 4.0 4.0 3.0 4.0 3.0 3.0 5.0 4.0 3.0 4.0 3.0 4.0 4.0 4.0 3.0 4.0 4.0
....