I am trying to scrape this website down below:
def grab_ranking():
    """Walk every tournament's history page and build a name per season.

    Relies on ``grab_tournament_metadata``, ``session`` and ``find_extract``
    being defined elsewhere in the module. For each tournament entry the
    history page is fetched, and for each season ``<article>`` on it the
    season year and the caption of the first category table are read.
    """
    tournament_list = grab_tournament_metadata()
    for item in tournament_list:
        # presumably item[1] is the "Group" code of the tournament — verify
        # against grab_tournament_metadata().
        url_to_scrape = f'https://www.kayak-polo.info/kphistorique.php?Group={item[1]}&lang=en'
        response = session.get(url_to_scrape)
        print(url_to_scrape)
        season_data = response.html.find('body > div.container-fluid > div > article')
        for season in season_data:
            season_year_raw = find_extract(season, selector='h3 > div.col-md-6.col-sm-6')
            # Strip the "Season " prefix so only the year part remains.
            season_year = season_year_raw.replace('Season ', '')
            print(season_year)
            # TODO Figure out how to deal with the n1h and n2h and other french national categories being together in one place.
            # first=True returns only the first matching table of the season.
            category_table = season.find('div.col-md-3.col-sm-6.col-xs-12', first=True)
            umbrella_competition_name = find_extract(category_table, selector='caption')
            # The original line had no operator between the three operands
            # (a syntax error); join name and year with a single space.
            competition_name = f'{umbrella_competition_name} {season_year}'
I tried multiple things, such as getting the HTML of that element and then doing a .split on certain markers. However, it seems that when I use .html I get the entire page's HTML, which doesn't help in my case.
I also tried .attrs in the hopes of finding the right tag, but it returns nothing.
CodePudding user response:
Here is one possible solution:
from time import time
from typing import Generator
from requests_html import HTMLSession
from requests_html import HTMLResponse
def get_competition_types(html: HTMLResponse) -> Generator[str, None, None]:
    """Yield the ``value`` attribute of every option in the "Group" select.

    Args:
        html: Response for the history page; its parsed document is read
            through ``html.html``.

    Returns:
        A lazy generator over the ``value`` attributes. An option without
        that attribute yields ``None`` because ``attrs.get`` is used.
    """
    # Generator's type parameters are (yield, send, return); this yields str.
    return (option.attrs.get('value') for option in html.html.find('select[name="Group"] option'))
def get_competition_urls(url: str, comp_types: Generator[str, None, None]) -> Generator[str, None, None]:
    """Yield one history-page URL per competition type.

    Args:
        url: Base URL that the query string is appended to.
        comp_types: Competition type codes, each used as the ``Group``
            query parameter.

    Returns:
        A lazy generator of ``{url}?Group={type}&lang=en`` strings, in the
        order the types are supplied.
    """
    # Generator's type parameters are (yield, send, return); this yields str.
    return (f'{url}?Group={_type}&lang=en' for _type in comp_types)
def get_data(competition_url: str, session: HTMLSession) -> None:
    """Fetch one competition page and print a line per category caption.

    Each printed line is ``<caption line 1> <year> [<caption line 2>]``
    followed by a blank line, where ``<year>`` is the second
    whitespace-separated token of the article's first ``<h3>``.

    Args:
        competition_url: URL of the competition history page.
        session: Session used to perform the GET request.

    Raises:
        IndexError: If an article that has captions has no ``<h3>`` or its
            heading text has fewer than two tokens.
    """
    response = session.get(competition_url)
    print(competition_url)
    for article in response.html.find('article.tab-pane'):
        captions = article.find('div caption')
        if not captions:
            # Nothing to print for this article, so the heading is not needed.
            continue
        # The heading is the same for every caption of the article: look it
        # up once instead of once per caption.
        # presumably the heading reads "Season <year>" — confirm on the page.
        year = article.find('h3')[0].text.split()[1]
        for caption in captions:
            lines = caption.text.split('\n')
            # Only the first two caption lines are used; the second one is
            # appended after the year when it exists.
            print(f"{' '.join([lines[0], year, *lines[1:2]])}\n")
# Sequential driver: fetch the landing page once, derive every competition
# URL from its "Group" select, then scrape the pages one after another.
session = HTMLSession()
url = 'https://www.kayak-polo.info/kphistorique.php'
html = session.get(url)
# The timer starts after the landing page has been fetched, so that request
# is not included in the reported total.
start = time()
competition_types = get_competition_types(html)
competition_urls = get_competition_urls(url, competition_types)
# Use a distinct loop name so the base ``url`` above is not rebound to the
# last competition URL once the loop finishes.
for competition_url in competition_urls:
    get_data(competition_url, session)
print(f"Total time: {round(time()-start, 3)}")
The running time of this solution (processing all 4960 elements) is about 55 seconds.
Output:
ECA European Championships - Catania (ITA) 2021 Men
ECA European Championships - Catania (ITA) 2021 Women
ECA European Championships - Catania (ITA) 2021 U21 Men
Solution based on ThreadPoolExecutor:
from time import time
from itertools import repeat
from typing import Generator
from requests_html import HTMLSession
from requests_html import HTMLResponse
from concurrent.futures import ThreadPoolExecutor
def get_competition_types(html: HTMLResponse) -> Generator[str, None, None]:
    """Yield the ``value`` attribute of every option in the "Group" select.

    Args:
        html: Response for the history page; its parsed document is read
            through ``html.html``.

    Returns:
        A lazy generator over the ``value`` attributes. An option without
        that attribute yields ``None`` because ``attrs.get`` is used.
    """
    # Generator's type parameters are (yield, send, return); this yields str.
    return (option.attrs.get('value') for option in html.html.find('select[name="Group"] option'))
def get_competition_urls(url: str, comp_types: Generator[str, None, None]) -> Generator[str, None, None]:
    """Yield one history-page URL per competition type.

    Args:
        url: Base URL that the query string is appended to.
        comp_types: Competition type codes, each used as the ``Group``
            query parameter.

    Returns:
        A lazy generator of ``{url}?Group={type}&lang=en`` strings, in the
        order the types are supplied.
    """
    # Generator's type parameters are (yield, send, return); this yields str.
    return (f'{url}?Group={_type}&lang=en' for _type in comp_types)
def get_data(competition_url: str, session: HTMLSession) -> None:
    """Fetch one competition page and print a line per category caption.

    Each printed line is ``<caption line 1> <year> [<caption line 2>]``
    followed by a blank line, where ``<year>`` is the second
    whitespace-separated token of the article's first ``<h3>``.

    Args:
        competition_url: URL of the competition history page.
        session: Session used to perform the GET request.

    Raises:
        IndexError: If an article that has captions has no ``<h3>`` or its
            heading text has fewer than two tokens.
    """
    response = session.get(competition_url)
    print(competition_url)
    for article in response.html.find('article.tab-pane'):
        captions = article.find('div caption')
        if not captions:
            # Nothing to print for this article, so the heading is not needed.
            continue
        # The heading is the same for every caption of the article: look it
        # up once instead of once per caption.
        # presumably the heading reads "Season <year>" — confirm on the page.
        year = article.find('h3')[0].text.split()[1]
        for caption in captions:
            lines = caption.text.split('\n')
            # Only the first two caption lines are used; the second one is
            # appended after the year when it exists.
            print(f"{' '.join([lines[0], year, *lines[1:2]])}\n")
# Threaded driver: same pipeline as a sequential scrape, but the competition
# pages are fetched concurrently by a thread pool.
session = HTMLSession()
url = 'https://www.kayak-polo.info/kphistorique.php'
html = session.get(url)
# The timer starts after the landing page has been fetched, so that request
# is not included in the reported total.
start = time()
competition_types = get_competition_types(html)
competition_urls = get_competition_urls(url, competition_types)
# NOTE(review): one session object is shared by all worker threads —
# confirm that this is safe for HTMLSession.
with ThreadPoolExecutor() as executor:
    # executor.map re-raises a worker's exception only when its result is
    # read from the returned iterator; drain it so a failed page is reported
    # instead of being silently dropped. repeat(session) supplies the same
    # session as the second argument of every get_data call.
    for _ in executor.map(get_data, competition_urls, repeat(session)):
        pass
print(f"Total time: {round(time()-start, 3)}")
The running time of this solution (processing all 4960 elements) is about 35 seconds.
And of course, since this solution works with threads, the output of the different pages will be interleaved rather than printed in order.
Output:
European Championships - Sheffield (GBR) 1993 Women
Coupe d'Europe des Nations - Strasbourg (FRA) 1990 Men
European Club Championship - Duisbourg (GER) 2021 Men