This code used to scrape Oddsportal successfully; however, it now fails with an error.
import os
import re
import threading
from datetime import datetime
from math import nan
from multiprocessing.pool import ThreadPool
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
class Driver:
    """Own one headless Chrome WebDriver and quit it on finalization."""

    def __init__(self):
        """Start a headless Chrome session and store it on ``self.driver``."""
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        # Suppress Chrome's console logging:
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=options)

    def __del__(self):
        """Quit the browser when this wrapper is garbage collected."""
        # If __init__ failed before assigning self.driver there is nothing to
        # quit; touching the attribute would only raise a second error.
        driver = getattr(self, 'driver', None)
        if driver is None:
            return
        driver.quit()  # clean up driver when we are cleaned up
        print('The driver has been "quitted".')
# Per-thread storage: each thread that calls create_driver() gets its own
# Driver wrapper stored under the attribute ``the_driver``.
threadLocal = threading.local()


def create_driver():
    """Return the calling thread's WebDriver, creating it on first use.

    The ``Driver`` wrapper is cached on ``threadLocal`` so repeated calls
    from the same thread reuse one browser.

    Returns:
        The ``driver`` attribute of this thread's ``Driver`` wrapper.
    """
    the_driver = getattr(threadLocal, 'the_driver', None)
    if the_driver is None:
        the_driver = Driver()
        threadLocal.the_driver = the_driver
    return the_driver.driver
class GameData:
    """Column-oriented container for scraped matches.

    Each attribute is a list and the lists are filled in lockstep, one entry
    per match. The attribute names and their definition order matter: the
    instance ``__dict__`` is passed to ``pd.DataFrame`` to form the columns.
    """

    def __init__(self):
        """Create an instance with every column empty."""
        self.date = []
        self.time = []
        self.game = []
        self.score = []
        self.home_odds = []
        self.draw_odds = []
        self.away_odds = []
        self.country = []
        self.league = []
def generate_matches(table):
    """Yield one list per match row found in ``table``.

    Rows whose ``class`` contains ``dark`` are treated as headers: the first
    two links inside their ``th.first2.tl`` cell give the country and league
    applied to the match rows that follow. Any other row with more than five
    ``td`` cells is a match row.

    Args:
        table: A BeautifulSoup tag containing the ``tr`` rows to scan.

    Yields:
        ``[cell0, cell1, cell2, cell3, cell4, cell5, country, league]`` where
        the cells are the text of the first six ``td`` tags. ``country`` and
        ``league`` are ``None`` when no usable header row has been seen.
    """
    # Start unset so a match row that precedes every header row yields None
    # values instead of raising UnboundLocalError.
    country = league = None
    for tr_tag in table.find_all('tr'):
        if 'dark' in (tr_tag.get('class') or ()):
            th_tag = tr_tag.find('th', {'class': 'first2 tl'})
            a_tags = th_tag.find_all('a') if th_tag is not None else []
            # A malformed header resets the labels rather than leaving the
            # previous header's values attached to the wrong matches.
            country = a_tags[0].text if a_tags else None
            league = a_tags[1].text if len(a_tags) > 1 else None
            continue
        td_tags = tr_tag.find_all('td')
        if len(td_tags) > 5:
            yield [td_tag.text for td_tag in td_tags[:6]] + [country, league]
def parse_data(url, return_urls=False):
    """Load one matches page in this thread's browser and extract its games.

    Args:
        url: Address of the page to load.
        return_urls: When true, also return the links found inside the
            ``span.next-games-date`` element, prefixed with the site root.

    Returns:
        A ``GameData`` instance, or a ``(GameData, urls)`` tuple when
        ``return_urls`` is true.

    Raises:
        RuntimeError: If the page does not contain the markup this scraper
            relies on (``div#col-content``, ``table.table-main``, an ``h1``
            ending in a date, or the ``next-games-date`` span). This is what
            happens when the site layout changes.
    """
    browser = create_driver()
    browser.get(url)
    # NOTE(review): an implicit wait applies to element lookups, not to
    # page_source, so it presumably does not delay the read below; confirm
    # whether an explicit wait for the table is needed.
    browser.implicitly_wait(25)
    soup = bs(browser.page_source, "lxml")

    # Fail with a descriptive message instead of an opaque
    # "'NoneType' object has no attribute 'find'" when the markup is missing.
    div = soup.find('div', {'id': 'col-content'})
    if div is None:
        raise RuntimeError('No <div id="col-content"> found at %s; '
                           'the page layout may have changed.' % url)
    table = div.find('table', {'class': 'table-main'})
    if table is None:
        raise RuntimeError('No <table class="table-main"> found at %s; '
                           'the page layout may have changed.' % url)
    h1_tag = soup.find('h1')
    if h1_tag is None:
        raise RuntimeError('No <h1> heading found at %s.' % url)
    h1 = h1_tag.text
    print(h1)

    # The heading ends with a date such as "08 Jan 2023": day and month are
    # each more than one character, so both need the "+" quantifier.
    m = re.search(r'\d+ \w+ \d{4}$', h1)
    if m is None:
        raise RuntimeError('Heading %r does not end with a date.' % h1)
    game_date = m[0]

    game_data = GameData()
    for row in generate_matches(table):
        game_data.date.append(game_date)
        game_data.time.append(row[0])
        game_data.game.append(row[1])
        # Score present?
        if ':' not in row[2]:
            # No, shift a few columns right:
            row[5], row[4], row[3], row[2] = row[4], row[3], row[2], nan
        game_data.score.append(row[2])
        game_data.home_odds.append(nan if row[3] == '-' else row[3])
        game_data.draw_odds.append(nan if row[4] == '-' else row[4])
        game_data.away_odds.append(nan if row[5] == '-' else row[5])
        game_data.country.append(row[6])
        game_data.league.append(row[7])

    if return_urls:
        span = soup.find('span', {'class': 'next-games-date'})
        if span is None:
            raise RuntimeError('No <span class="next-games-date"> found at %s; '
                               'the page layout may have changed.' % url)
        urls = ['https://www.oddsportal.com' + a_tag['href']
                for a_tag in span.find_all('a')]
        return game_data, urls
    return game_data
if __name__ == '__main__':
    # Scrape today's page first (it also lists the other days' URLs), fetch
    # the remaining days concurrently, and print one combined DataFrame.
    pool = ThreadPool(5)  # We will be getting, however, 7 URLs
    try:
        # Get today's data and the Urls for the other days:
        game_data_today, urls = pool.apply(
            parse_data, args=('https://www.oddsportal.com/matches/soccer', True))
        urls.pop(1)  # Remove url for today: We already have the data for that
        game_data_results = pool.imap(parse_data, urls)

        # Consume exactly one result per submitted URL rather than a
        # hard-coded count, so the iterator is never read past its end.
        frames = []
        for _ in urls:
            try:
                game_data = next(game_data_results)
                frames.append(pd.DataFrame(game_data.__dict__))
            except ValueError as exc:
                # Best effort: drop the failing day and keep the rest.
                print('Skipping one day of results:', exc)
        # Today's data keeps its original position (second) in the output.
        frames.insert(1, pd.DataFrame(game_data_today.__dict__))

        # DataFrame.append was removed in pandas 2.0; concat is the
        # supported way to stack the per-day frames.
        games = pd.concat(frames, ignore_index=True)
        print(games)
    finally:
        # Let the worker threads finish and exit before tearing down.
        pool.close()
        pool.join()
        # ensure all the drivers are "quitted":
        del threadLocal
        import gc
        gc.collect()  # a little extra insurance
Model Output:
Unnamed: 0 date time game score home_odds draw_odds away_odds country league
0 0 08 Jan 2023 00:30 Boca Juniors - Independiente 0:0 1.93 3.23 3.91 Argentina Torneos De Verano
1 1 08 Jan 2023 00:45 CSP U20 - Sao Paulo U20 0:4 11.27 5.85 1.21 Brazil Copa Sao Paulo de juniores
2 2 08 Jan 2023 01:00 U. de Deportes (Per) - Aucas (Ecu) 0:0 1.94 3.28 3.74 World Club Friendly
3 3 08 Jan 2023 01:10 Atlas - Toluca NaN postp. 2.04 3.40 Mexico Liga MX
4 4 08 Jan 2023 01:30 Inac Kobe Leonesa W - Albirex Niigata W 2:1 1.22 5.42 12.01 Japan WE League Women
5 5 08 Jan 2023 02:00 Tampico Madero - Lobos ULMX 1:0 1.41 4.69 6.25 Mexico Liga Premier Serie A
However, I am currently getting this error:
Traceback (most recent call last):
File "C:\Users\User\AppData\Roaming\JetBrains\PyCharmCE2022.2\scratches\Scraping_New.py", line 111, in <module>
game_data_today, urls = pool.apply(parse_data, args=('https://www.oddsportal.com/matches/soccer', True))
File "C:\Program Files\Python37\lib\multiprocessing\pool.py", line 261, in apply
return self.apply_async(func, args, kwds).get()
File "C:\Program Files\Python37\lib\multiprocessing\pool.py", line 657, in get
raise self._value
File "C:\Program Files\Python37\lib\multiprocessing\pool.py", line 121, in worker
result = (True, func(*args, **kwds))
File "C:\Users\User\AppData\Roaming\JetBrains\PyCharmCE2022.2\scratches\Scraping_New.py", line 78, in parse_data
table = div.find('table', {'class': 'table-main'})
AttributeError: 'NoneType' object has no attribute 'find'
How do I resolve this?
CodePudding user response:
Yesterday oddsportal.com switched to a new website, and its HTML structure has changed. You need to inspect the new HTML in your browser and rewrite the scraper accordingly.
CodePudding user response:
You can't scrape by HTML table tags anymore, as Oddsportal obfuscated the CSS class names on the latest version of the website. Does anyone have any idea how to scrape it now?