I am trying to scrape a German Billiard League website for results, to tabulate and submit to a rating system that requires a certain format. I am no python expert, but I have muddled through how to pull a complete list of links from the root League page, and to get Date, Home/Visitor teams, and now I am trying to capture individual match data.
Here is the relevant HTML:
<tr>
<td colspan="3" nowrap="" rowspan="2" width="100"><b>
Spiel 2<br/>8-Ball </b>
</td>
<td colspan="6" valign="top">Christian Fachinger</td>
<td colspan="7" valign="top">Michael Schneider</td>
</tr>
<tr>
<td colspan="6" valign="top">7</td>
<td colspan="7" valign="top">4</td>
Site: https://hbu.billardarea.de/cms_leagues/matchday/344947
I am trying to find the "td" tag that contains the text string "Spiel 2". From there I should be able to pull the game type ("8-Ball"), and then move on to capturing the data inside the relevant "class" tags. For the life of me, I cannot get a result back: every soup command I have tried returns either None or []. I suspect it has something to do with the extra whitespace, and I have tried various regex-based approaches, but I still cannot select this td tag to do further data gathering.
What am I doing wrong? I know I am not coding this in the most efficient manner, and this is the first time I have tried to write a web scraper, and in general, am a python newb.
'''
import requests
import re
import os
from bs4 import BeautifulSoup
URL = "https://hbu.billardarea.de/cms_leagues/plan/7870/10406"
def import_all_links():
    """Collect every matchday link from the league plan page and append
    the absolute URLs, one per line, to league.txt."""
    page = requests.get(URL).text
    soup = BeautifulSoup(page, "html.parser")
    links = soup.select("a[href*=matchday]")
    # Open the output file once, outside the loop (the original reopened
    # and closed it for every link), and let `with` close it reliably.
    with open("league.txt", "a") as out:
        for link in links:
            # Bug fix: the original was missing the '+' operators, which is
            # a syntax error in Python.
            out.write("https://hbu.billardarea.de" + link['href'] + '\n')
def get_date():
    """For each saved matchday link, scrape the page and print one line:
    date, league, home team and visiting team."""
    path = r'C:\Users\Russ\AppData\Roaming\Microsoft\Windows\Start Menu\Programs\Python 3.10\league.txt'
    # `with` guarantees the file is closed (the original never closed it).
    with open(path, "r") as links_file:
        for day_link in links_file:
            day_link = day_link.rstrip("\n")
            page = requests.get(day_link).text
            day_links_parse = BeautifulSoup(page, "html.parser")
            date = day_links_parse.select('label:contains(Datum)')
            league = day_links_parse.select('label:contains(Saison)')
            home = day_links_parse.find(attrs={"class": "home"}).text
            home = home.partition(":")[2]
            visitor = day_links_parse.find(attrs={"class": "visitor"}).text
            visitor = visitor.partition(":")[2]
            # Fix for the question's issue: `day_links_parse.td.find_all(...)`
            # only searches inside the FIRST <td> of the whole document, so it
            # always came back empty.  Instead, search the entire tree for the
            # text node containing "Spiel 2" (regex copes with the surrounding
            # whitespace), then climb to its enclosing <td>.
            spiel_text = day_links_parse.find(string=re.compile('Spiel 2'))
            play_table = spiel_text.find_parent('td') if spiel_text else None
            print(play_table)
            for item in date:
                date = item.next_sibling.next_sibling.text
                date = date.partition(" ")[0]
                date = date.split(".")
                # Bug fix: the original was missing the '+' operators
                # (syntax error).  Re-order dd.mm.yyyy -> mm\dd\yyyy.
                date = date[1] + "\\" + date[0] + "\\" + date[2]
            for item in league:
                league = item.next_sibling.next_sibling.text
                league = league.partition(" ")[0]
            print(date, ",", league, ",", home, " (H) vs ", visitor, "(V)", sep='')
# Entry point: harvest the matchday links, then print one summary line per
# matchday.  (The stray ''' that closed the question's code fence has been
# dropped -- it is not valid Python.)
import_all_links()
get_date()
CodePudding user response:
You could try something like this, for example:
import requests
from bs4 import BeautifulSoup
# Fetch a single matchday page and pull the "Spiel 2" block out of it.
response = requests.get("https://hbu.billardarea.de/cms_leagues/matchday/344947")
soup = BeautifulSoup(response.text, "lxml")
# NOTE(review): in a comma-separated selector list the "> tr" binds only to
# the last selector (.score_table) -- confirm that is the intended table.
rows = soup.select(".report_table, .matchday_table, .score_table > tr")
# Row 5 holds the game label plus the two player names; row 6 the scores.
# (These fixed indexes assume the page layout never changes.)
game_row = rows[5]
spiel_zwei = game_row.select_one("b").getText(strip=True, separator=" ")
heim, gast = (spieler.getText() for spieler in game_row.select("td")[1:])
spielergebnis = rows[6].getText(strip=True, separator=" vs. ")
print(f"{spiel_zwei}\n{heim} - {gast}\n{spielergebnis}")
Output:
Spiel 2 8-Ball
Rolf Berghöfer - Zühtü Uyanik
6 vs. 7
CodePudding user response:
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
import re
import pandas as pd
def get_soup(content):
    """Parse raw HTML markup into a BeautifulSoup tree (lxml backend)."""
    tree = BeautifulSoup(content, 'lxml')
    return tree
def main(url):
    """Scrape every matchday page linked from *url* and print a DataFrame
    with date, league, both teams, and the 'Spiel 2' game data."""
    with requests.Session() as req:
        r = req.get(url)
        soup = get_soup(r.text)
        # Absolute matchday URLs gathered from the league plan page.
        urls = [urljoin(url, x['href'])
                for x in soup.select('a[href*=matchday]')]
        allin = []
        for link in urls:
            r = req.get(link)
            soup = get_soup(r.text).select_one('#main_frontend')
            # Anchor on the text node "Spiel 2"; all game data is located
            # relative to this node.
            match = soup.find(text=re.compile('Spiel 2'))
            allin.append(
                {
                    # NOTE(review): the [3]/[1] indexes and split positions
                    # assume a fixed page layout -- verify on the live site.
                    'Date': soup.select('.nochange')[3].text.split()[0],
                    'League': soup.select('.nochange')[1].text.split()[1],
                    'Home': soup.select_one('.home').text.split(':')[1],
                    'Visitor': soup.select_one('.visitor').text.split(':')[1],
                    'Game': list(match.next_elements)[1].strip(),
                    'Whom': [x.text for x in match.find_all_next('td')[:2]],
                    # Bug fix: the original read the players' 'colspan'
                    # attributes (always "6"/"7"), which only coincidentally
                    # matched the sample 6:7 scores -- the question's own HTML
                    # shows a 7:4 game with the same colspans.  The real
                    # scores are the next two <td> cells after the names.
                    'Result': [x.text for x in match.find_all_next('td')[2:4]],
                }
            )
        df = pd.DataFrame(allin)
        print(df)
# Entry point: scrape the league plan page and print the per-matchday table.
main('https://hbu.billardarea.de/cms_leagues/plan/7870/10406')
Output:
Date League ... Whom Result
0 11.09.2021 (2021/2022) ... [Rolf Berghöfer, Zühtü Uyanik] [6, 7]
1 11.09.2021 (2021/2022) ... [Christian Roller, Balthasar Nebel] [6, 7]
2 11.09.2021 (2021/2022) ... [Peter Graessner, Christian Fachinger] [6, 7]
[3 rows x 7 columns]