I'm trying to scrape play-by-play score data from ESPN for NBA games. The data is split by quarters, so I'm using Selenium to switch between the tabs/quarters, but it always scrapes the same data (the data for the first quarter). Code:
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.ui import WebDriverWait
import requests
url = "https://www.espn.com/nba/playbyplay/_/gameId/401474876"
browser = webdriver.Chrome()
browser.get(url)
html_text = requests.get(url).text
soup = BeautifulSoup(html_text, "lxml")
minutes = list()
a_score = list()
b_score = list()
a_score_raw = soup.find_all("td", class_="playByPlay__score playByPlay__score--away tr Table__TD")
b_score_raw = soup.find_all("td", class_="playByPlay__score playByPlay__score--home tr Table__TD")
minutes_raw = soup.find_all("td", class_="playByPlay__time Table__TD")
for i in range(len(a_score_raw)):
    a_score.append(a_score_raw[i].text)
    b_score.append(b_score_raw[i].text)
    minutes.append(minutes_raw[i].text)
q2_xpath = "/html/body/div[1]/div/div/div/main/div[2]/div/div[5]/div/div/section[2]/div/nav/ul/li[2]/button"
WebDriverWait(browser, 10).until(expected_conditions.visibility_of_element_located((By.XPATH, q2_xpath)))
browser.find_element(By.XPATH, value=q2_xpath).click()
a_score_raw = soup.find_all("td", class_="playByPlay__score playByPlay__score--away tr Table__TD")
b_score_raw = soup.find_all("td", class_="playByPlay__score playByPlay__score--home tr Table__TD")
minutes_raw = soup.find_all("td", class_="playByPlay__time Table__TD")
for i in range(len(a_score_raw)):
    a_score.append(a_score_raw[i].text)
    b_score.append(b_score_raw[i].text)
    minutes.append(minutes_raw[i].text)
q3_xpath = "/html/body/div[1]/div/div/div/main/div[2]/div/div[5]/div/div/section[2]/div/nav/ul/li[4]/button"
WebDriverWait(browser, 10).until(expected_conditions.visibility_of_element_located((By.XPATH, q3_xpath)))
browser.find_element(By.XPATH, value=q3_xpath).click()
a_score_raw = soup.find_all("td", class_="playByPlay__score playByPlay__score--away tr Table__TD")
b_score_raw = soup.find_all("td", class_="playByPlay__score playByPlay__score--home tr Table__TD")
minutes_raw = soup.find_all("td", class_="playByPlay__time Table__TD")
for i in range(len(a_score_raw)):
    a_score.append(a_score_raw[i].text)
    b_score.append(b_score_raw[i].text)
    minutes.append(minutes_raw[i].text)
q4_xpath = "/html/body/div[1]/div/div/div/main/div[2]/div/div[5]/div/div/section[2]/div/nav/ul/li[4]/button"
WebDriverWait(browser, 10).until(expected_conditions.visibility_of_element_located((By.XPATH, q4_xpath)))
browser.find_element(By.XPATH, value=q4_xpath).click()
a_score_raw = soup.find_all("td", class_="playByPlay__score playByPlay__score--away tr Table__TD")
b_score_raw = soup.find_all("td", class_="playByPlay__score playByPlay__score--home tr Table__TD")
minutes_raw = soup.find_all("td", class_="playByPlay__time Table__TD")
for i in range(len(a_score_raw)):
    a_score.append(a_score_raw[i].text)
    b_score.append(b_score_raw[i].text)
    minutes.append(minutes_raw[i].text)
The data for all quarters has the same tags and classes, and Selenium is switching tabs properly, so those shouldn't be the issue. What could be the problem? How can I switch tabs within a website and scrape each one? Feel free to point out the issue or suggest alternative solutions (or even free data sources for game-flow data).
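For reference, this is a minimal sketch of the tab-switching flow I'm after (untested; it reuses the imports, browser and q2_xpath/q3_xpath/q4_xpath defined above, and re-parses browser.page_source after each click instead of the one-off requests response):
import time

def scrape_current_quarter(browser, minutes, a_score, b_score):
    # Parse the HTML as the browser currently renders it.
    soup = BeautifulSoup(browser.page_source, "lxml")
    away = soup.find_all("td", class_="playByPlay__score playByPlay__score--away tr Table__TD")
    home = soup.find_all("td", class_="playByPlay__score playByPlay__score--home tr Table__TD")
    times = soup.find_all("td", class_="playByPlay__time Table__TD")
    for a, b, t in zip(away, home, times):
        a_score.append(a.text)
        b_score.append(b.text)
        minutes.append(t.text)

minutes, a_score, b_score = [], [], []
scrape_current_quarter(browser, minutes, a_score, b_score)  # 1st quarter is shown by default
for xpath in (q2_xpath, q3_xpath, q4_xpath):
    WebDriverWait(browser, 10).until(
        expected_conditions.element_to_be_clickable((By.XPATH, xpath))
    )
    browser.find_element(By.XPATH, xpath).click()
    time.sleep(1)  # crude wait for the table to re-render after the click
    scrape_current_quarter(browser, minutes, a_score, b_score)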
CodePudding user response:
The data is embedded inside a <script> tag on that page. To decode it you can use the following script:
import re
import json
import requests
url = "https://www.espn.com/nba/playbyplay/_/gameId/401474876"
data = requests.get(url).text
data = re.search(r"window\['__espnfitt__'\]=(\{.*\});", data).group(1)
data = json.loads(data)
for g in data["page"]["content"]["gamepackage"]["pbp"]["playGrps"]:
    for d in g:
        print(
            "{:<10} {:<5} {:<5} {}".format(
                d["clock"]["displayValue"],
                d["homeScore"],
                d["awayScore"],
                d["text"],
            )
        )
Prints:
...
2:05 100 111 Sandro Mamukelashvili turnover
1:51 100 111 Lindell Wigginton personal foul
1:42 100 113 Armoni Brooks makes 2-foot dunk (Chris Silva assists)
1:29 103 113 Lindell Wigginton makes 27-foot step back jumpshot
1:18 103 113 MarJon Beauchamp personal foul
1:18 103 114 Chris Silva makes free throw 1 of 2
1:18 103 115 Chris Silva makes free throw 2 of 2
1:03 103 115 Lindell Wigginton offensive foul
1:03 103 115 Lindell Wigginton turnover
50.5 103 115 Chris Silva bad pass (Lindell Wigginton steals)
45.8 106 115 AJ Green makes 28-foot three point shot (MarJon Beauchamp assists)
20.9 106 118 Vit Krejci makes 29-foot three point shot
5.9 109 118 AJ Green makes 28-foot three point jumper
0.0 109 118 End of the 4th Quarter
0.0 109 118 End of Game
EDIT: You can also parse the data with the js2py module:
import js2py
import requests
from bs4 import BeautifulSoup
url = "https://www.espn.com/nba/playbyplay/_/gameId/401474876"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
x = js2py.eval_js(soup.select("script")[2].text)
print(x.to_dict()["page"]["content"]["gamepackage"]["pbp"]["playGrps"])
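Both scripts end up with the same playGrps structure: one list of play dicts per group, with the clock, homeScore, awayScore and text fields used above. If you want it back in a pandas DataFrame like in the question, a sketch along these lines should work (it assumes each group in playGrps corresponds to one quarter/overtime period; the column names are arbitrary):
import json
import re

import pandas as pd
import requests

url = "https://www.espn.com/nba/playbyplay/_/gameId/401474876"
raw = requests.get(url).text
# Extract the JSON blob assigned to window['__espnfitt__'] in the page source.
data = json.loads(re.search(r"window\['__espnfitt__'\]=(\{.*\});", raw).group(1))

rows = []
for period, plays in enumerate(data["page"]["content"]["gamepackage"]["pbp"]["playGrps"], start=1):
    for d in plays:
        rows.append(
            {
                "quarter": period,  # assumes one group per period
                "time": d["clock"]["displayValue"],
                "away": d["awayScore"],
                "home": d["homeScore"],
                "play": d["text"],
            }
        )

df = pd.DataFrame(rows)
print(df.head())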