I'm trying to scrape some football data from the transfer market. I want to extract:
- league name
- clubs within the league
- each player's information
The code I have runs with no issues but it doesn't retrieve any information. I'm new to data scraping. I'm not sure why it's not working. please help
'leauge.py'
from bs4 import BeautifulSoup
import csv
from team import team
import requests
headers = {'user-agent': '>> put my user agent<< '}
url = "https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1/saison_id/2019"
numTeams = 20
teamcount = 0
result = requests.get(url, headers=headers)
src = result.content
soup = BeautifulSoup(src, 'lxml')
f = open('database.csv', 'w')
wr = csv.writer(f, quoting=csv.QUOTE_ALL)
for td_tag in soup.find_all("td"):
if td_tag.get('class') == ['zentriert']:
a_tag = td_tag.find('a')
if a_tag != None and teamcount < numTeams:
teamcount = 1
url = 'https://www.transfermarkt.com' a_tag.get('href') '/plus/1'
print(url)
t1 = team(url)
wr.writerow([a_tag.get('title')])
wr.writerow(['Name', 'Club', 'Position', 'Nationality', 'DOB (Age)', 'Height', 'Foot', 'Date Joined', 'Contract Expires'])
names = t1.getNames()
bdays = t1.getBirth()
pos = t1.getPos()
nats = t1.getNat()
for x in range(0,len(names)):
playerdata = []
playerdata.append(names[x])
playerdata.append(a_tag.get('title'))
playerdata.append(pos[x])
playerdata.append("N/A")
playerdata.append(bdays[5*x])
playerdata.append(bdays[5*x 1])
playerdata.append(bdays[5*x 2])
playerdata.append(bdays[5*x 3])
playerdata.append(bdays[5*x 4])
wr.writerow(playerdata)
f.close()
'team.py'
import requests
from bs4 import BeautifulSoup
class team:
def __init__(self, url):
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
result = requests.get(url, headers=headers)
src = result.content
self.soup = BeautifulSoup(src, 'lxml')
def getNames(self):
names = []
for span_tag in self.soup.find_all("span"):
a_tag = span_tag.find('a')
if a_tag != None:
if span_tag.get('class') == ['hide-for-small']:
names.append(a_tag.get('title'))
return names
def getID(self):
ids = []
for span_tag in self.soup.find_all("span"):
a_tag = span_tag.find('a')
if a_tag != None:
if span_tag.get('class') == ['hide-for-small']:
ids.append(a_tag.get('id'))
return ids
def getBirth(self):
bday = []
for td_tag in self.soup.find_all('td'):
if td_tag.get('class') == ['zentriert']:
if td_tag.string != None:
bday.append(td_tag.string)
return bday
def getPos(self):
pos = []
for td_tag in self.soup.find_all('td'):
if td_tag.string == 'Second Striker' or td_tag.string == 'Right Midfield' or td_tag.string == 'Left Midfield' or td_tag.string == 'Goalkeeper' or td_tag.string == 'Left-Back' or td_tag.string == 'Centre-Back' or td_tag.string == 'Right-Back' or td_tag.string == 'Defensive Midfield' or td_tag.string == 'Central Midfield' or td_tag.string == 'Attacking Midfield' or td_tag.string == 'Left Winger' or td_tag.string == 'Right Winger' or td_tag.string == 'Centre-Forward':
pos.append(td_tag.string)
return pos
def getNat(self):
nat = []
for td_tag in self.soup.find_all('img'):
if td_tag.get('class') == ['flaggenrahmen'] and td_tag.string == None:
nat.append(td_tag.get('title'))
return nat
'output'
"Manchester City"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Liverpool FC"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Tottenham Hotspur"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Chelsea FC"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Manchester United"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Arsenal FC"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Everton FC"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Leicester City"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Wolverhampton Wanderers"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"West Ham United"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"AFC Bournemouth"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Newcastle United"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Aston Villa"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Southampton FC"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Brighton & Hove Albion"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Watford FC"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Crystal Palace"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Burnley FC"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Norwich City"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Sheffield United"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
CodePudding user response:
You have a number of problems in your scraping code. You really need to print out the HTML and make sure what you're asking is what you want. Intermediate debug prints would always help.
For one thing, the td_tag.string
value returns a long string with the combined contents of the tag. It might CONTAIN the string "Second Striker", but it also has a lot of other crap. Also, span_tag.get('class') == ['hide-for-small']
only matches if the span contains ONLY that class. Most of the spans have other classes as well.
This seems to work. This is team.py
:
import requests
from bs4 import BeautifulSoup
class team:
def __init__(self, url):
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
result = requests.get(url, headers=headers)
src = result.content
self.soup = BeautifulSoup(src, 'lxml')
def getNames(self):
names = []
for span_tag in self.soup.find_all("span"):
classes = span_tag.get("class")
if not (classes and 'hide-for-small' in classes):
continue
a_tag = span_tag.find('a')
if a_tag:
names.append(a_tag.get('title'))
return names
def getID(self):
ids = []
for span_tag in self.soup.find_all("span"):
classes = span_tag.get("class")
if not (classes and 'hide-for-small' in classes):
continue
a_tag = span_tag.find('a')
if a_tag:
ids.append(a_tag.get('id'))
return ids
def getBirth(self):
bday = []
for td_tag in self.soup.find_all('td'):
classes = td_tag.get('class')
if classes and 'zentriert' in classes:
if td_tag.string != None:
bday.append(td_tag.string)
return bday
positions = ('Second Striker', 'Right Midfield', 'Left Midfield', 'Goalkeeper', 'Left-Back', 'Centre-Back', 'Right-Back', 'Defensive Midfield', 'Central Midfield', 'Attacking Midfield', 'Left Winger', 'Right Winger', 'Centre-Forward')
def getPos(self):
pos = []
for td_tag in self.soup.find_all('td'):
for p in self.positions:
if td_tag.string and p in td_tag.string:
pos.append(p)
return pos
def getNat(self):
nat = []
for td_tag in self.soup.find_all('img'):
classes = td_tag.get('class')
if classes and 'flaggenrahmen' in classes and not td_tag.string:
nat.append(td_tag.get('title'))
return nat