Code below does not error. However it is not returning the desired elements. When I loop through the data item list the items are there but I don't understand why my loop for SportsEvent to get awayTeam and homeTeam, Stadium, and startdate are coming up blank. The links here dont have second pages so you can remove selenium and get_next_page function and calls if your dont have these installed to test.
The problem lies in this line
if "SportsEvent" in item:
Here entire script
import pandas as pd
import extruct as ex
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
urls = [
'https://www.oddsshark.com/nfl/odds',
'https://www.oddsshark.com/nba/odds'
]
def get_driver():
options = Options()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)
return driver
def get_source(driver, url):
driver.get(url)
return driver.page_source
def get_json(source):
return ex.extract(source, syntaxes=['json-ld'])
def get_next_page(driver, source):
"""IN the event teams are on more than 1 page Parse the page source and
return the URL for the next page of results.
:param driver: Selenium webdriver
:param source: Page source code from Selenium
:return
URL of next paginated page
"""
elements = driver.find_elements_by_xpath('//link[@rel="next"]')
if elements:
return driver.find_element_by_xpath('//link[@rel="next"]').get_attribute('href')
else:
return ''
df = pd.DataFrame(columns = ['awayTeam', 'homeTeam','location','startDate'])
def save_teams(data, df):
"""Scrape the teams from a schema.org JSON-LD tag and save the contents in
the df Pandas dataframe.
:param data: JSON-LD source containing schema.org SportsEvent markup
:param df: Name of Pandas dataframe to which to append SportsEvent
:return
df with teams appended
"""
for item in data['json-ld']:
print(item)
if "SportsEvent" in item: #issue is here it does not see SportsEvent in item so it wont continue doing the inner loops
for SportsEvent in item['SportsEvent']:
#print(item['SportsEvent'])
row = {
'awayTeam': SportsEvent.get('awayTeam', {}).get('name'),
'homeTeam': SportsEvent.get('homeTeam', {}).get('name'),
'location': SportsEvent.get('location', {}).get('name'),
'startDate': SportsEvent.get('startDate')
}
print(row)
df = df.append(row, ignore_index=True)
return df
for url in urls:
print(url)
# Save the teams from the first page
driver = get_driver()
source = get_source(driver, url)
json = get_json(source)
df = save_teams(json, df)
# Get teams on each paginated page if other pages exists
next_page = get_next_page(driver, source)
paginated_urls = []
paginated_urls.append(next_page)
if paginated_urls:
for url in paginated_urls:
if url:
#print(next_page)
driver = get_driver()
source = get_source(driver, url)
json = get_json(source)
df = save_teams(json, df)
next_page = get_next_page(driver, source)
paginated_urls.append(next_page)
CodePudding user response:
That's because there is no key "SportsEvent"
in your item
. Its a value under the key '@type'
.
So you'd need to alter your save_teams()
function to:
def save_teams(data, df):
"""Scrape the teams from a schema.org JSON-LD tag and save the contents in
the df Pandas dataframe.
:param data: JSON-LD source containing schema.org SportsEvent markup
:param df: Name of Pandas dataframe to which to append SportsEvent
:return
df with teams appended
"""
for item in data['json-ld']:
print(item)
if "SportsEvent" in item.values(): #issue is here it does not see SportsEvent in item so it wont continue doing the inner loops
row = {
'awayTeam': item.get('awayTeam', {}).get('name'),
'homeTeam': item.get('homeTeam', {}).get('name'),
'location': item.get('location', {}).get('name'),
'startDate': item.get('startDate')
}
print(row)
df = df.append(row, ignore_index=True)
return df
But it looks like you may be complicating this with Selenium. You can get that data by simply pulling it out with BeautifulSoup, then read it into json. Then let pandas flatten it out:
import pandas as pd
import requests
import json
from bs4 import BeautifulSoup
urls = [
'https://www.oddsshark.com/nfl/odds',
'https://www.oddsshark.com/nba/odds']
for url in urls:
response = requests.get(url).text
soup = BeautifulSoup(response, 'html.parser')
jsonStr = soup.find('script', {'type':'application/ld json'}).text
jsonData = json.loads(jsonStr)
df = pd.json_normalize(jsonData)
print(df.to_string())
# or to get just those columns
#print(df[['awayTeam.name','homeTeam.name','location.name','startDate']])
Output:
@type @context inLanguage name url startDate location.@type location.name location.address.@type location.address.addressLocality awayTeam.@type awayTeam.name homeTeam.@type homeTeam.name
0 SportsEvent http://schema.org en-US Tampa Bay Buccaneers vs New York Giants https://www.oddsshark.com/nfl/new-york-tampa-bay-odds-november-22-2021-1411211 2021-11-22T20:15:00-05:00 Place Raymond James Stadium PostalAddress Raymond James Stadium SportsTeam New York Giants SportsTeam Tampa Bay Buccaneers
1 SportsEvent http://schema.org en-US Detroit Lions vs Chicago Bears https://www.oddsshark.com/nfl/chicago-detroit-odds-november-25-2021-1411216 2021-11-25T12:30:00-05:00 Place Ford Field PostalAddress Ford Field SportsTeam Chicago Bears SportsTeam Detroit Lions
2 SportsEvent http://schema.org en-US Dallas Cowboys vs Las Vegas Raiders https://www.oddsshark.com/nfl/las-vegas-dallas-odds-november-25-2021-1411221 2021-11-25T16:30:00-05:00 Place AT&T Stadium PostalAddress AT&T Stadium SportsTeam Las Vegas Raiders SportsTeam Dallas Cowboys
3 SportsEvent http://schema.org en-US New Orleans Saints vs Buffalo Bills https://www.oddsshark.com/nfl/buffalo-new-orleans-odds-november-25-2021-1411226 2021-11-25T20:20:00-05:00 Place Caesars Superdome PostalAddress Caesars Superdome SportsTeam Buffalo Bills SportsTeam New Orleans Saints
4 SportsEvent http://schema.org en-US Houston Texans vs New York Jets https://www.oddsshark.com/nfl/new-york-houston-odds-november-28-2021-1411231 2021-11-28T13:00:00-05:00 Place NRG Stadium PostalAddress NRG Stadium SportsTeam New York Jets SportsTeam Houston Texans
5 SportsEvent http://schema.org en-US Indianapolis Colts vs Tampa Bay Buccaneers https://www.oddsshark.com/nfl/tampa-bay-indianapolis-odds-november-28-2021-1411236 2021-11-28T13:00:00-05:00 Place Lucas Oil Stadium PostalAddress Lucas Oil Stadium SportsTeam Tampa Bay Buccaneers SportsTeam Indianapolis Colts
6 SportsEvent http://schema.org en-US New York Giants vs Philadelphia Eagles https://www.oddsshark.com/nfl/philadelphia-new-york-odds-november-28-2021-1411241 2021-11-28T13:00:00-05:00 Place MetLife Stadium PostalAddress MetLife Stadium SportsTeam Philadelphia Eagles SportsTeam New York Giants
7 SportsEvent http://schema.org en-US Miami Dolphins vs Carolina Panthers https://www.oddsshark.com/nfl/carolina-miami-odds-november-28-2021-1411246 2021-11-28T13:00:00-05:00 Place Hard Rock Stadium PostalAddress Hard Rock Stadium SportsTeam Carolina Panthers SportsTeam Miami Dolphins
8 SportsEvent http://schema.org en-US New England Patriots vs Tennessee Titans https://www.oddsshark.com/nfl/tennessee-new-england-odds-november-28-2021-1411251 2021-11-28T13:00:00-05:00 Place Gillette Stadium PostalAddress Gillette Stadium SportsTeam Tennessee Titans SportsTeam New England Patriots
9 SportsEvent http://schema.org en-US Cincinnati Bengals vs Pittsburgh Steelers https://www.oddsshark.com/nfl/pittsburgh-cincinnati-odds-november-28-2021-1411256 2021-11-28T13:00:00-05:00 Place Paul Brown Stadium PostalAddress Paul Brown Stadium SportsTeam Pittsburgh Steelers SportsTeam Cincinnati Bengals
10 SportsEvent http://schema.org en-US Jacksonville Jaguars vs Atlanta Falcons https://www.oddsshark.com/nfl/atlanta-jacksonville-odds-november-28-2021-1411261 2021-11-28T13:00:00-05:00 Place TIAA Bank Field PostalAddress TIAA Bank Field SportsTeam Atlanta Falcons SportsTeam Jacksonville Jaguars
11 SportsEvent http://schema.org en-US Denver Broncos vs Los Angeles Chargers https://www.oddsshark.com/nfl/los-angeles-denver-odds-november-28-2021-1411266 2021-11-28T16:05:00-05:00 Place Empower Field at Mile High PostalAddress Empower Field at Mile High SportsTeam Los Angeles Chargers SportsTeam Denver Broncos
12 SportsEvent http://schema.org en-US San Francisco 49ers vs Minnesota Vikings https://www.oddsshark.com/nfl/minnesota-san-francisco-odds-november-28-2021-1411271 2021-11-28T16:25:00-05:00 Place Levi's Stadium PostalAddress Levi's Stadium SportsTeam Minnesota Vikings SportsTeam San Francisco 49ers
13 SportsEvent http://schema.org en-US Green Bay Packers vs Los Angeles Rams https://www.oddsshark.com/nfl/los-angeles-green-bay-odds-november-28-2021-1411276 2021-11-28T16:25:00-05:00 Place Lambeau Field PostalAddress Lambeau Field SportsTeam Los Angeles Rams SportsTeam Green Bay Packers
14 SportsEvent http://schema.org en-US Baltimore Ravens vs Cleveland Browns https://www.oddsshark.com/nfl/cleveland-baltimore-odds-november-28-2021-1411281 2021-11-28T20:20:00-05:00 Place M&T Bank Stadium PostalAddress M&T Bank Stadium SportsTeam Cleveland Browns SportsTeam Baltimore Ravens
15 SportsEvent http://schema.org en-US Washington Football Team vs Seattle Seahawks https://www.oddsshark.com/nfl/seattle-washington-odds-november-29-2021-1411286 2021-11-29T20:15:00-05:00 Place FedEx Field PostalAddress FedEx Field SportsTeam Seattle Seahawks SportsTeam Washington Football Team
@type @context inLanguage name url startDate location.@type location.name location.address.@type location.address.addressLocality awayTeam.@type awayTeam.name homeTeam.@type homeTeam.name
0 SportsEvent http://schema.org en-US Washington Wizards vs Charlotte Hornets https://www.oddsshark.com/nba/charlotte-washington-odds-november-22-2021-1460581 2021-11-22T19:00:00-05:00 Place Capital One Arena PostalAddress Capital One Arena SportsTeam Charlotte Hornets SportsTeam Washington Wizards
1 SportsEvent http://schema.org en-US Cleveland Cavaliers vs Brooklyn Nets https://www.oddsshark.com/nba/brooklyn-cleveland-odds-november-22-2021-1460586 2021-11-22T19:00:00-05:00 Place Rocket Mortgage FieldHouse PostalAddress Rocket Mortgage FieldHouse SportsTeam Brooklyn Nets SportsTeam Cleveland Cavaliers
2 SportsEvent http://schema.org en-US Boston Celtics vs Houston Rockets https://www.oddsshark.com/nba/houston-boston-odds-november-22-2021-1460591 2021-11-22T19:30:00-05:00 Place TD Garden PostalAddress TD Garden SportsTeam Houston Rockets SportsTeam Boston Celtics
3 SportsEvent http://schema.org en-US Atlanta Hawks vs Oklahoma City Thunder https://www.oddsshark.com/nba/oklahoma-city-atlanta-odds-november-22-2021-1460596 2021-11-22T19:30:00-05:00 Place State Farm Arena PostalAddress State Farm Arena SportsTeam Oklahoma City Thunder SportsTeam Atlanta Hawks
4 SportsEvent http://schema.org en-US Chicago Bulls vs Indiana Pacers https://www.oddsshark.com/nba/indiana-chicago-odds-november-22-2021-1460601 2021-11-22T20:00:00-05:00 Place United Center PostalAddress United Center SportsTeam Indiana Pacers SportsTeam Chicago Bulls
5 SportsEvent http://schema.org en-US Milwaukee Bucks vs Orlando Magic https://www.oddsshark.com/nba/orlando-milwaukee-odds-november-22-2021-1460606 2021-11-22T20:00:00-05:00 Place Fiserv Forum PostalAddress Fiserv Forum SportsTeam Orlando Magic SportsTeam Milwaukee Bucks
6 SportsEvent http://schema.org en-US New Orleans Pelicans vs Minnesota Timberwolves https://www.oddsshark.com/nba/minnesota-new-orleans-odds-november-22-2021-1460611 2021-11-22T20:00:00-05:00 Place Smoothie King Center PostalAddress Smoothie King Center SportsTeam Minnesota Timberwolves SportsTeam New Orleans Pelicans
7 SportsEvent http://schema.org en-US San Antonio Spurs vs Phoenix Suns https://www.oddsshark.com/nba/phoenix-san-antonio-odds-november-22-2021-1460616 2021-11-22T20:30:00-05:00 Place AT&T Center PostalAddress AT&T Center SportsTeam Phoenix Suns SportsTeam San Antonio Spurs
8 SportsEvent http://schema.org en-US Utah Jazz vs Memphis Grizzlies https://www.oddsshark.com/nba/memphis-utah-odds-november-22-2021-1460621 2021-11-22T21:00:00-05:00 Place Vivint Arena PostalAddress Vivint Arena SportsTeam Memphis Grizzlies SportsTeam Utah Jazz
9 SportsEvent http://schema.org en-US Sacramento Kings vs Philadelphia 76ers https://www.oddsshark.com/nba/philadelphia-sacramento-odds-november-22-2021-1460626 2021-11-22T22:00:00-05:00 Place Golden 1 Center PostalAddress Golden 1 Center SportsTeam Philadelphia 76ers SportsTeam Sacramento Kings
10 SportsEvent http://schema.org en-US Detroit Pistons vs Miami Heat https://www.oddsshark.com/nba/miami-detroit-odds-november-23-2021-1460631 2021-11-23T19:00:00-05:00 Place Little Caesars Arena PostalAddress Little Caesars Arena SportsTeam Miami Heat SportsTeam Detroit Pistons
11 SportsEvent http://schema.org en-US New York Knicks vs Los Angeles Lakers https://www.oddsshark.com/nba/los-angeles-new-york-odds-november-23-2021-1460636 2021-11-23T19:30:00-05:00 Place Madison Square Garden PostalAddress Madison Square Garden SportsTeam Los Angeles Lakers SportsTeam New York Knicks
12 SportsEvent http://schema.org en-US Portland Trail Blazers vs Denver Nuggets https://www.oddsshark.com/nba/denver-portland-odds-november-23-2021-1460641 2021-11-23T22:00:00-05:00 Place Moda Center at the Rose Quarter PostalAddress Moda Center at the Rose Quarter SportsTeam Denver Nuggets SportsTeam Portland Trail Blazers
13 SportsEvent http://schema.org en-US Los Angeles Clippers vs Dallas Mavericks https://www.oddsshark.com/nba/dallas-los-angeles-odds-november-23-2021-1460646 2021-11-23T22:30:00-05:00 Place Staples Center PostalAddress Staples Center SportsTeam Dallas Mavericks SportsTeam Los Angeles Clippers
EXTRA:
I've never used extruct before. I like it! Thanks for intorducing it to me. Here's a solution with that:
import pandas as pd
import extruct as ex
import requests
urls = [
'https://www.oddsshark.com/nfl/odds',
'https://www.oddsshark.com/nba/odds']
for url in urls:
response = requests.get(url).text
jsonData = ex.extract(response, syntaxes=['json-ld'])['json-ld']
df = pd.json_normalize(jsonData)
df = df[df['@type'] == 'SportsEvent']
print(df.to_string())
# or to get just those columns
#print(df[['awayTeam.name','homeTeam.name','location.name','startDate']])