I have a problem when iterating through many links. I scrape each column with its own CSS selector. However, it seems that not every player has a rating on every linked page. How can I get a `None` in the `homerating` list when one of the eleven starting players has no rating in his specific "player row"?
I basically need to scrape all column entries per row. Thanks a lot for your support.
# Imports the snippet relies on: re, requests and BeautifulSoup are all used below.
import re

import requests
from bs4 import BeautifulSoup

gamedays_url = range(1, 35)
url_list = []
daylinks = []

# Build one matchday overview URL per matchday number in gamedays_url.
for gameday in gamedays_url:
    url = "https://www.transfermarkt.de/premier-league/spieltag/wettbewerb/L1/plus/?saison_id=2018&spieltag=" + str(gameday)
    url_list.append(url)

# Fetch every matchday page once and collect the match-report id
# (index 4 of the '/'-split href) from each "liveLink" anchor.
gameLinks = []
for page in url_list:
    tree = requests.get(page, headers={'User-Agent': 'Custom5'})
    soup_2 = BeautifulSoup(tree.content, 'html.parser')
    links_2 = soup_2.find_all("a", {"class": "liveLink"}, href=re.compile("spielbericht"))
    for link in links_2:
        gameLinks.append(link.get('href').split('/')[4])

# Turn each collected id into the URL of that match's line-up page.
gameLinks = [
    "https://www.transfermarkt.de/spiele/aufstellung/spielbericht/" + game_id
    for game_id in gameLinks
]

home_id = []
home_name = []
homerating = []
for page in gameLinks:
    response = requests.get(page, headers={'User-Agent': 'Custom5'})
    lineup_data = response.text
    soup = BeautifulSoup(lineup_data, 'html.parser')
    test = soup.find('div', class_='responsive-table')
    if test is None:
        # find() returns None when the page has no such table; skip the page
        # instead of failing with AttributeError on test.find_all below.
        continue

    # Every player profile link supplies both values: index 4 of the
    # '/'-split href goes to home_id, index 1 goes to home_name.
    for player_link in test.find_all('a', href=re.compile('profil/spieler')):
        href_parts = player_link.get('href').split('/')
        home_id.append(href_parts[4])
        home_name.append(href_parts[1])

    # NOTE: ratings are gathered from every <span> without a class attribute
    # in the whole table, independently of the player links above. A player
    # without such a <span> therefore leaves no entry here, and homerating
    # falls out of step with home_id / home_name.
    for grade in test.find_all('span', class_=None):
        homerating.append(grade.text.split()[0])
        # NOTE(review): the original indentation was lost; the placement of
        # this None placeholder inside the loop is assumed -- confirm.
        homerating.append(None)
CodePudding user response:
Pass the right class attribute value instead of `class_=None`; you can then get the `None` by using an `if ... else None` conditional expression:
# Placeholder: replace this string with the real class attribute value of the
# rating <span> on the page.
rating_class = "class attribute value"

for grade in test.find_all('span', class_=rating_class):
    # A bs4 Tag is always truthy, so test the extracted text instead of the
    # tag itself; an empty <span> then yields None rather than an IndexError.
    rating_parts = grade.text.split()
    rating = rating_parts[0] if rating_parts else None
    homerating.append(rating)
CodePudding user response:
Check whether the selected element is available and scrape its text; otherwise set the value to `None`:
# Stripped text of the row's first <span> that has no class attribute, or None when the row contains no such <span>.
row.select_one('span:not([class])').get_text(strip=True) if row.select('span:not([class])') else None
Also try to work with structured dicts instead of separate lists.
Example
import requests
from bs4 import BeautifulSoup
# One dict per table row: id, name and rating are read from the same row, so a
# missing rating is stored as None in that row's own entry.
data = []
headers = {'User-Agent': 'Custom5'}

for gameday in range(1, 3):
    url = "https://www.transfermarkt.de/premier-league/spieltag/wettbewerb/L1/plus/?saison_id=2018&spieltag=" + str(gameday)
    response = requests.get(url, headers=headers)
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed.
    matchday_soup = BeautifulSoup(response.content, 'html.parser')

    for a in matchday_soup.select('a.liveLink[href*="spielbericht"]'):
        # The last path segment of the live link is appended to the line-up URL.
        report_url = 'https://www.transfermarkt.de/spiele/aufstellung/spielbericht/' + a.get('href').split('/')[-1]
        response = requests.get(report_url, headers=headers)
        lineup_soup = BeautifulSoup(response.text, 'html.parser')

        lineup_table = lineup_soup.select_one('table.items')
        if lineup_table is None:
            # select_one() returns None when nothing matches; skip this
            # report instead of failing with AttributeError.
            continue

        # Only rows that themselves contain a nested <table> are processed.
        for row in lineup_table.select('tr:has(table)'):
            # Look the rating <span> up once; None means the row has none.
            rating_span = row.select_one('span:not([class])')
            data.append({
                'home_id': row.select_one('a').get('href').split('/')[-1],
                'home_name': row.select_one('a img').get('title'),
                'home_rating': rating_span.get_text(strip=True) if rating_span is not None else None,
            })

# Print explicitly: a bare `data` expression only displays in a notebook/REPL.
print(data)
Output
[...{'home_id': '45672', 'home_name': 'Kevin Trapp', 'home_rating': '3,4'},{'home_id': '256866', 'home_name': 'Carlos Salcedo', 'home_rating': None},{'home_id': '58178', 'home_name': 'David Abraham', 'home_rating': '3,4'}, {'home_id': '146258', 'home_name': 'Jetro Willems', 'home_rating': '5,5'},...]