Scraping with requests_html-CodePudding

I am trying to scrape this website down below:

def grab_ranking():
    """Walk every tournament's history page and build a name per season.

    For each entry returned by ``grab_tournament_metadata()`` the history page
    for ``Group=<item[1]>`` is fetched through the module-level ``session``.
    The URL and each season year found on the page are printed. Nothing is
    returned; ``competition_name`` is computed but not used yet.
    """
    tournament_list = grab_tournament_metadata()
    for item in tournament_list:
        # item[1] is interpolated as the "Group" query value -- presumably the
        # competition group code; verify against grab_tournament_metadata().
        url_to_scrape = f'https://www.kayak-polo.info/kphistorique.php?Group={item[1]}&lang=en'
        response = session.get(url_to_scrape)
        print(url_to_scrape)
        season_data = response.html.find('body > div.container-fluid > div > article')
        for season in season_data:
            # find_extract is defined elsewhere; its result is treated as a
            # string here (``.replace`` is called on it).
            season_year_raw = find_extract(season, selector='h3 > div.col-md-6.col-sm-6')
            season_year = season_year_raw.replace('Season ', '')
            print(season_year)

            # TODO Figure out how to deal with the n1h and n2h and other french national categories being together in one place.
            category_table = season.find('div.col-md-3.col-sm-6.col-xs-12', first=True)

            umbrella_competition_name = find_extract(category_table, selector='caption')
            # The "+" operators were missing here, which made the whole
            # function a syntax error.
            competition_name = umbrella_competition_name + " " + season_year

I tried multiple things, such as getting the HTML of that element and then doing a .split on certain things. However, it seems that when I use .html I get the entire page's HTML, which doesn't help my case.

I also tried .attrs in the hopes of finding the right tag, but it returns nothing.

CodePudding user response：

Here is one possible solution:

from time import time
from typing import Generator
from requests_html import HTMLSession
from requests_html import HTMLResponse


def get_competition_types(html: HTMLResponse) -> Generator[str, None, None]:
    """Return the ``value`` of each ``<option>`` in the ``Group`` drop-down.

    Args:
        html: Response whose parsed document (``html.html``) is searched with
            the CSS selector ``select[name="Group"] option``.

    Returns:
        A lazy generator over the options' ``value`` attributes, in the order
        ``find`` returns them. Options that have no ``value`` attribute are
        skipped; otherwise they would surface as ``None`` and be formatted
        into a URL as the literal text "None".
    """
    # The lookup runs immediately; only the attribute extraction is lazy.
    options = html.html.find('select[name="Group"] option')
    values = (option.attrs.get('value') for option in options)
    return (value for value in values if value is not None)

def get_competition_urls(
    url: str,
    comp_types: Generator[str, None, None],
) -> Generator[str, None, None]:
    """Build the English-language history URL for each competition type.

    Args:
        url: Base address of the history page, without a query string.
        comp_types: Values to place in the ``Group`` query parameter. They are
            interpolated as-is, without URL-encoding.

    Returns:
        A lazy generator yielding ``"<url>?Group=<type>&lang=en"`` for each
        value of ``comp_types``, in the same order.
    """
    return (f'{url}?Group={_type}&lang=en' for _type in comp_types)

def get_data(competition_url: str, session: HTMLSession) -> None:
    """Fetch one competition's history page and print its table captions.

    The URL is printed first. Then, for every ``article.tab-pane`` element,
    each ``div caption`` is printed as ``"<caption line 1> <year>"`` followed
    by ``" <caption line 2>"`` when the caption has more than one line, and a
    trailing blank line. ``<year>`` is the second whitespace-separated word of
    the article's first ``<h3>``.

    Articles without an ``<h3>``, or whose heading has fewer than two words,
    are skipped rather than raising ``IndexError``.

    Args:
        competition_url: Address of the history page to fetch.
        session: Session used to perform the GET request.
    """
    response = session.get(competition_url)
    print(competition_url)
    for article in response.html.find('article.tab-pane'):
        # The heading is the same for every caption of an article, so it is
        # resolved once per article instead of once per caption.
        heading = article.find('h3', first=True)
        heading_words = heading.text.split() if heading is not None else []
        if len(heading_words) < 2:
            continue
        season_year = heading_words[1]

        for caption in article.find('div caption'):
            caption_lines = caption.text.split('\n')
            label = f"{caption_lines[0]} {season_year}"
            if len(caption_lines) > 1:
                label = f"{label} {caption_lines[1]}"
            print(f"{label}\n")

url = 'https://www.kayak-polo.info/kphistorique.php'

# The context manager closes the session once scraping is finished, even if a
# request raises.
with HTMLSession() as session:
    html = session.get(url)

    # Timing starts after the landing page has been fetched, so the reported
    # total covers only the per-competition requests.
    start = time()
    competition_types = get_competition_types(html)
    competition_urls = get_competition_urls(url, competition_types)

    # A dedicated loop name keeps `url` bound to the base address.
    for competition_url in competition_urls:
        get_data(competition_url, session)

    print(f"Total time: {round(time()-start, 3)}")

This solution takes about 55 seconds to process all 4960 elements.

Output:

ECA European Championships - Catania (ITA) 2021 Men
ECA European Championships - Catania (ITA) 2021 Women
ECA European Championships - Catania (ITA) 2021 U21 Men

Solution based on ThreadPoolExecutor:

from time import time
from itertools import repeat
from typing import Generator
from requests_html import HTMLSession
from requests_html import HTMLResponse
from concurrent.futures import ThreadPoolExecutor


def get_competition_types(html: HTMLResponse) -> Generator[str, None, None]:
    """Return the ``value`` of each ``<option>`` in the ``Group`` drop-down.

    Args:
        html: Response whose parsed document (``html.html``) is searched with
            the CSS selector ``select[name="Group"] option``.

    Returns:
        A lazy generator over the options' ``value`` attributes, in the order
        ``find`` returns them. Options that have no ``value`` attribute are
        skipped; otherwise they would surface as ``None`` and be formatted
        into a URL as the literal text "None".
    """
    # The lookup runs immediately; only the attribute extraction is lazy.
    options = html.html.find('select[name="Group"] option')
    values = (option.attrs.get('value') for option in options)
    return (value for value in values if value is not None)

def get_competition_urls(
    url: str,
    comp_types: Generator[str, None, None],
) -> Generator[str, None, None]:
    """Build the English-language history URL for each competition type.

    Args:
        url: Base address of the history page, without a query string.
        comp_types: Values to place in the ``Group`` query parameter. They are
            interpolated as-is, without URL-encoding.

    Returns:
        A lazy generator yielding ``"<url>?Group=<type>&lang=en"`` for each
        value of ``comp_types``, in the same order.
    """
    return (f'{url}?Group={_type}&lang=en' for _type in comp_types)

def get_data(competition_url: str, session: HTMLSession) -> None:
    """Fetch one competition's history page and print its table captions.

    The URL is printed first. Then, for every ``article.tab-pane`` element,
    each ``div caption`` is printed as ``"<caption line 1> <year>"`` followed
    by ``" <caption line 2>"`` when the caption has more than one line, and a
    trailing blank line. ``<year>`` is the second whitespace-separated word of
    the article's first ``<h3>``.

    Articles without an ``<h3>``, or whose heading has fewer than two words,
    are skipped rather than raising ``IndexError``.

    Args:
        competition_url: Address of the history page to fetch.
        session: Session used to perform the GET request.
    """
    response = session.get(competition_url)
    print(competition_url)
    for article in response.html.find('article.tab-pane'):
        # The heading is the same for every caption of an article, so it is
        # resolved once per article instead of once per caption.
        heading = article.find('h3', first=True)
        heading_words = heading.text.split() if heading is not None else []
        if len(heading_words) < 2:
            continue
        season_year = heading_words[1]

        for caption in article.find('div caption'):
            caption_lines = caption.text.split('\n')
            label = f"{caption_lines[0]} {season_year}"
            if len(caption_lines) > 1:
                label = f"{label} {caption_lines[1]}"
            print(f"{label}\n")

url = 'https://www.kayak-polo.info/kphistorique.php'

# The context manager closes the session once scraping is finished, even if a
# request raises.
with HTMLSession() as session:
    html = session.get(url)

    # Timing starts after the landing page has been fetched, so the reported
    # total covers only the per-competition requests.
    start = time()
    competition_types = get_competition_types(html)
    competition_urls = get_competition_urls(url, competition_types)

    # NOTE(review): one session is shared by all worker threads -- confirm
    # this is safe for the session implementation in use.
    with ThreadPoolExecutor() as executor:
        # Draining the result iterator re-raises any exception from a worker;
        # leaving it unconsumed would silently discard failures.
        for _ in executor.map(get_data, competition_urls, repeat(session)):
            pass
    print(f"Total time: {round(time()-start, 3)}")

This solution takes about 35 seconds to process all 4960 elements.

And of course, since this solution uses threads, the output lines of the different competitions will be interleaved in no particular order.

Output:

European Championships - Sheffield (GBR) 1993 Women
Coupe d'Europe des Nations - Strasbourg (FRA) 1990 Men
European Club Championship - Duisbourg (GER) 2021 Men