Scraping Oddsportal for Matches and odds-CodePudding

This code used to scrape Oddsportal successfully; however, it now fails with an error.

import os
import re
import threading
from datetime import datetime
from math import nan
from multiprocessing.pool import ThreadPool
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver

class Driver:
    """Own one headless Chrome WebDriver and quit it when finalized."""

    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        # Excluding the 'enable-logging' switch suppresses logging;
        # comment the next line out to get that output back.
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=options)

    def __del__(self):
        # __del__ also runs when __init__ raised before `self.driver` was
        # assigned (e.g. Chrome could not be started), so look the attribute
        # up defensively instead of failing with AttributeError.
        driver = getattr(self, 'driver', None)
        if driver is None:
            return
        driver.quit()  # clean up driver when we are cleaned up
        print('The driver has been "quitted".')


# Per-thread storage: create_driver() keeps each thread's Driver wrapper here
# under the attribute name 'the_driver'.
threadLocal = threading.local()


def create_driver():
    """Return the calling thread's WebDriver, creating it on first use.

    The Driver wrapper is cached on ``threadLocal`` as ``the_driver``, so
    repeated calls from the same thread reuse one browser instance.
    """
    wrapper = getattr(threadLocal, 'the_driver', None)
    if wrapper is not None:
        return wrapper.driver

    # First call on this thread: build the wrapper and remember it.
    wrapper = Driver()
    threadLocal.the_driver = wrapper
    return wrapper.driver


class GameData:
    """Column-wise accumulator for scraped matches.

    Every attribute starts as its own empty list. The instance ``__dict__``
    is later passed to ``pandas.DataFrame``, so the creation order of the
    attributes below is also the column order.
    """

    # Attribute names in the order they are created on each instance.
    _FIELDS = (
        'date',
        'time',
        'game',
        'score',
        'home_odds',
        'draw_odds',
        'away_odds',
        'country',
        'league',
    )

    def __init__(self):
        for field_name in self._FIELDS:
            setattr(self, field_name, [])


def generate_matches(table):
    """Yield one list per match row found in *table*.

    Rows whose ``class`` contains ``dark`` are treated as section headers:
    the first two anchors inside their ``th.first2.tl`` cell give the
    country and league that apply to the match rows that follow.

    Args:
        table: A BeautifulSoup tag whose ``tr`` descendants are scanned.

    Yields:
        ``[cell0, cell1, cell2, cell3, cell4, cell5, country, league]`` for
        every non-header row with more than five ``td`` cells, where the
        first six items are the cell texts. ``country`` / ``league`` are
        ``nan`` until a header row supplies them.
    """
    # Start with placeholders so a match row that appears before any header
    # row does not fail with UnboundLocalError.
    country = league = nan
    for tr_tag in table.find_all('tr'):
        if 'dark' in tr_tag.get('class', []):
            th_tag = tr_tag.find('th', {'class': 'first2 tl'})
            a_tags = th_tag.find_all('a') if th_tag is not None else []
            # A malformed header resets the labels rather than letting the
            # previous section's country/league leak into the next rows.
            country = a_tags[0].text if len(a_tags) > 0 else nan
            league = a_tags[1].text if len(a_tags) > 1 else nan
            continue

        td_tags = tr_tag.find_all('td')
        if len(td_tags) > 5:  # skip spacer/short rows
            yield [td_tag.text for td_tag in td_tags[:6]] + [country, league]


def parse_data(url, return_urls=False):
    """Load *url* in this thread's browser and scrape its match table.

    Args:
        url: Address of the page to load.
        return_urls: When true, also collect the links inside the
            ``next-games-date`` span and return them with the data.

    Returns:
        The populated GameData, or a ``(GameData, urls)`` tuple when
        *return_urls* is true.

    Raises:
        RuntimeError: If an element the scraper relies on (content div,
            main table, ``h1`` heading with a trailing date, or the
            next-games span) is missing from the page. This usually
            indicates the page markup is not what the scraper expects.
    """
    browser = create_driver()
    browser.get(url)
    # NOTE(review): an implicit wait only affects element lookups made through
    # the driver, not page_source; if the table is rendered late by
    # JavaScript an explicit wait may be needed here - confirm.
    browser.implicitly_wait(25)
    soup = bs(browser.page_source, "lxml")

    div = soup.find('div', {'id': 'col-content'})
    table = div.find('table', {'class': 'table-main'}) if div is not None else None
    if table is None:
        raise RuntimeError(
            "Could not find table.table-main inside div#col-content at %s" % url)

    h1_tag = soup.find('h1')
    if h1_tag is None:
        raise RuntimeError("Could not find the <h1> heading at %s" % url)
    h1 = h1_tag.text
    print(h1)
    # The heading is expected to end with a date such as "08 Jan 2023".
    m = re.search(r'\d+ \w+ \d{4}$', h1.strip())
    if m is None:
        raise RuntimeError("No trailing date in heading %r at %s" % (h1, url))
    game_date = m[0]

    game_data = GameData()
    for row in generate_matches(table):
        game_data.date.append(game_date)
        game_data.time.append(row[0])
        game_data.game.append(row[1])
        # Score present?
        if ':' not in row[2]:
            # No, shift a few columns right:
            row[5], row[4], row[3], row[2] = row[4], row[3], row[2], nan
        game_data.score.append(row[2])
        game_data.home_odds.append(nan if row[3] == '-' else row[3])
        game_data.draw_odds.append(nan if row[4] == '-' else row[4])
        game_data.away_odds.append(nan if row[5] == '-' else row[5])
        game_data.country.append(row[6])
        game_data.league.append(row[7])

    if return_urls:
        span = soup.find('span', {'class': 'next-games-date'})
        if span is None:
            raise RuntimeError(
                "Could not find span.next-games-date at %s" % url)
        # hrefs are site-relative, so prefix them with the site root.
        urls = ['https://www.oddsportal.com' + a_tag['href']
                for a_tag in span.find_all('a')]
        return game_data, urls
    return game_data


if __name__ == '__main__':
    # Script entry point: scrape today's page, then the other days' pages in
    # parallel, and print everything as one DataFrame.
    pool = ThreadPool(5)  # fewer workers than pages, so threads get reused
    # Get today's data and the Urls for the other days:
    game_data_today, urls = pool.apply(parse_data, args=('https://www.oddsportal.com/matches/soccer', True))
    # Assumes the second link is today's page, as the original script did.
    urls.pop(1)  # Remove url for today: We already have the data for that
    game_data_results = pool.imap(parse_data, urls)

    frames = []
    # One iteration per remaining URL plus one for today, which is slotted
    # back in at position 1 to keep the days in page order.
    for i in range(len(urls) + 1):
        try:
            game_data = game_data_today if i == 1 else next(game_data_results)
            frames.append(pd.DataFrame(game_data.__dict__))
        except ValueError as exc:
            # Skip only the failing day; fetching another result here would
            # drop a day and run the iterator dry before the loop ends.
            print('Skipping day', i, 'because of ValueError:', exc)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    games = pd.concat(frames, ignore_index=True) if frames else None
    print(games)

    # All results are consumed: let the worker threads finish.
    pool.close()
    pool.join()
    # ensure all the drivers are "quitted":
    del threadLocal
    import gc

    gc.collect()  # a little extra insurance

Model Output:

Unnamed: 0         date   time                                               game     score home_odds  draw_odds  away_odds                country                               league
0              0  08 Jan 2023  00:30                       Boca Juniors - Independiente       0:0      1.93       3.23       3.91              Argentina                    Torneos De Verano
1              1  08 Jan 2023  00:45                            CSP U20 - Sao Paulo U20       0:4     11.27       5.85       1.21                 Brazil           Copa Sao Paulo de juniores
2              2  08 Jan 2023  01:00               U. de Deportes (Per)  - Aucas (Ecu)        0:0      1.94       3.28       3.74                  World                        Club Friendly
3              3  08 Jan 2023  01:10                                     Atlas - Toluca       NaN    postp.       2.04       3.40                 Mexico                              Liga MX
4              4  08 Jan 2023  01:30            Inac Kobe Leonesa W - Albirex Niigata W       2:1      1.22       5.42      12.01                  Japan                      WE League Women
5              5  08 Jan 2023  02:00                        Tampico Madero - Lobos ULMX       1:0      1.41       4.69       6.25                 Mexico                 Liga Premier Serie A

However, I am currently getting this error:

Traceback (most recent call last):
  File "C:\Users\User\AppData\Roaming\JetBrains\PyCharmCE2022.2\scratches\Scraping_New.py", line 111, in <module>
    game_data_today, urls = pool.apply(parse_data, args=('https://www.oddsportal.com/matches/soccer', True))
  File "C:\Program Files\Python37\lib\multiprocessing\pool.py", line 261, in apply
    return self.apply_async(func, args, kwds).get()
  File "C:\Program Files\Python37\lib\multiprocessing\pool.py", line 657, in get
    raise self._value
  File "C:\Program Files\Python37\lib\multiprocessing\pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "C:\Users\User\AppData\Roaming\JetBrains\PyCharmCE2022.2\scratches\Scraping_New.py", line 78, in parse_data
    table = div.find('table', {'class': 'table-main'})
AttributeError: 'NoneType' object has no attribute 'find'

How do I resolve this?

CodePudding user response：

Yesterday oddsportal.com switched to a new website, and its HTML structure has changed. You need to inspect the new HTML in your browser and rewrite the scraper accordingly.

CodePudding user response：

You can't scrape by HTML table tags anymore, because Oddsportal obfuscated the CSS class names on its latest website. Does anyone have an idea how to scrape it now?