Home > database >  Scraping issue. data not retrieved
Scraping issue. data not retrieved

Time:05-26

I'm trying to scrape some football data from Transfermarkt. I want to extract:

  1. league name
  2. clubs within the league
  3. each player's information

The code I have runs with no issues, but it doesn't retrieve any information. I'm new to data scraping and I'm not sure why it's not working. Please help.

'leauge.py'

from bs4 import BeautifulSoup
import csv
from team import team
import requests

# Walk the league overview page, follow the link of every club found in a
# centred table cell, and dump one CSV section per club into database.csv.
headers = {'user-agent': '>> put my user agent<< '}
url = "https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1/saison_id/2019"
numTeams = 20
teamcount = 0
result = requests.get(url, headers=headers)
src = result.content
soup = BeautifulSoup(src, 'lxml')

# newline='' is what the csv module expects for files it writes to; without
# it, extra blank rows can appear between records on some platforms.
with open('database.csv', 'w', newline='') as f:
    wr = csv.writer(f, quoting=csv.QUOTE_ALL)
    for td_tag in soup.find_all("td"):
        # Only cells whose class list is exactly ['zentriert'] are considered.
        if td_tag.get('class') != ['zentriert']:
            continue
        a_tag = td_tag.find('a')
        if a_tag is None or teamcount >= numTeams:
            continue
        teamcount += 1
        url = 'https://www.transfermarkt.com' + a_tag.get('href') + '/plus/1'
        print(url)
        t1 = team(url)
        club = a_tag.get('title')
        wr.writerow([club])
        wr.writerow(['Name', 'Club', 'Position', 'Nationality', 'DOB (Age)', 'Height', 'Foot', 'Date Joined', 'Contract Expires'])
        names = t1.getNames()
        bdays = t1.getBirth()
        pos = t1.getPos()
        for x, name in enumerate(names):
            # Nationality is not wired up yet, so a placeholder is written.
            playerdata = [name, club, pos[x], "N/A"]
            # getBirth() returns one flat list; this indexing assumes five
            # consecutive entries per player, in header order (DOB through
            # contract expiry) -- TODO confirm against the page layout.
            playerdata.extend(bdays[5 * x + k] for k in range(5))
            wr.writerow(playerdata)

'team.py'

import requests
from bs4 import BeautifulSoup
class team:
    """Download one page and expose helpers that pull player data out of it.

    Every getter walks the parsed document stored in ``self.soup`` and
    returns a flat list of the values it found, in document order.
    """
    
    def __init__(self, url):
        """Fetch ``url`` with a browser-like user agent and parse the body with lxml."""
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
        result = requests.get(url, headers=headers)
        src = result.content
        self.soup = BeautifulSoup(src, 'lxml')


    def getNames(self):
        """Return the ``title`` of the first link inside each matching <span>."""
        names = []
        for span_tag in self.soup.find_all("span"):
            a_tag = span_tag.find('a')
            if a_tag != None:
                # NOTE(review): list equality -- true only when the span's class
                # list is exactly ['hide-for-small']; a span that carries any
                # additional class is skipped.
                if span_tag.get('class') == ['hide-for-small']:
                    names.append(a_tag.get('title'))
        return names

    def getID(self):
        """Return the ``id`` of the first link inside each matching <span>."""
        ids = []
        for span_tag in self.soup.find_all("span"):
            a_tag = span_tag.find('a')
            if a_tag != None:
                # Same exact-list comparison as in getNames().
                if span_tag.get('class') == ['hide-for-small']:
                    ids.append(a_tag.get('id'))
        return ids

    def getBirth(self):
        """Return the ``.string`` of every <td> whose class list is exactly ['zentriert'].

        Despite the name, every matching cell is collected, not only birth
        dates; cells whose ``.string`` is None are left out.
        """
        bday = []
        for td_tag in self.soup.find_all('td'):
            if td_tag.get('class') == ['zentriert']:
                if td_tag.string != None:
                    bday.append(td_tag.string)
        return bday

    def getPos(self):
        """Return the ``.string`` of every <td> that equals one of the known position labels."""
        pos = []
        for td_tag in self.soup.find_all('td'):
            # NOTE(review): exact equality -- any surrounding whitespace or extra
            # text in the cell makes every one of these comparisons fail.
            if td_tag.string == 'Second Striker' or td_tag.string == 'Right Midfield' or td_tag.string == 'Left Midfield' or td_tag.string == 'Goalkeeper' or td_tag.string == 'Left-Back' or td_tag.string == 'Centre-Back' or td_tag.string == 'Right-Back' or td_tag.string == 'Defensive Midfield' or td_tag.string == 'Central Midfield' or td_tag.string == 'Attacking Midfield' or td_tag.string == 'Left Winger' or td_tag.string == 'Right Winger' or td_tag.string == 'Centre-Forward':
                pos.append(td_tag.string)
        return pos

    def getNat(self):
        """Return the ``title`` of every <img> whose class list is exactly ['flaggenrahmen']."""
        nat = []
        # The loop variable is called td_tag, but the tags iterated are <img>.
        for td_tag in self.soup.find_all('img'):
            if td_tag.get('class') == ['flaggenrahmen'] and td_tag.string == None:
                nat.append(td_tag.get('title'))
        return nat

'output'

"Manchester City"

"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"

"Liverpool FC"

"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"

"Tottenham Hotspur"

"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"

"Chelsea FC"

"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"

"Manchester United"

"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"

"Arsenal FC"

"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"

"Everton FC"

"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"

"Leicester City"

"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"

"Wolverhampton Wanderers"

"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"

"West Ham United"

"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"

"AFC Bournemouth"

"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"

"Newcastle United"

"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"

"Aston Villa"

"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"

"Southampton FC"

"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"

"Brighton & Hove Albion"

"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"

"Watford FC"

"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"

"Crystal Palace"

"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"

"Burnley FC"

"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"

"Norwich City"

"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"

"Sheffield United"

"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"

CodePudding user response:

You have a number of problems in your scraping code. You really need to print out the HTML and make sure that what you're asking for is actually what the page contains. Intermediate debug prints always help.

For one thing, the td_tag.string value returns a long string with the combined contents of the tag. It might CONTAIN the string "Second Striker", but it also has a lot of other crap. Also, span_tag.get('class') == ['hide-for-small'] only matches if the span contains ONLY that class. Most of the spans have other classes as well.

This seems to work. This is team.py:

import requests
from bs4 import BeautifulSoup
class team:
    """Download one page and expose helpers that pull player data out of it.

    Every getter walks the parsed document stored in ``self.soup`` and
    returns a flat list of the values it found, in document order. Class
    checks are membership tests, so a tag that carries additional classes
    next to the wanted one still matches.
    """

    # Position labels recognised by getPos().
    positions = ('Second Striker', 'Right Midfield', 'Left Midfield', 'Goalkeeper', 'Left-Back', 'Centre-Back', 'Right-Back', 'Defensive Midfield', 'Central Midfield', 'Attacking Midfield', 'Left Winger', 'Right Winger', 'Centre-Forward')

    # Seconds to wait for the server; without a timeout requests can block
    # indefinitely on a stalled connection.
    _TIMEOUT = 30

    def __init__(self, url):
        """Fetch ``url`` with a browser-like user agent and parse the body with lxml."""
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
        result = requests.get(url, headers=headers, timeout=self._TIMEOUT)
        src = result.content
        self.soup = BeautifulSoup(src, 'lxml')

    @staticmethod
    def _has_class(tag, name):
        """Return True when ``name`` is one of the classes set on ``tag``."""
        classes = tag.get('class')
        return bool(classes) and name in classes

    def _player_links(self):
        """Yield the first <a> of every <span> that has the 'hide-for-small' class."""
        for span_tag in self.soup.find_all("span"):
            if not self._has_class(span_tag, 'hide-for-small'):
                continue
            a_tag = span_tag.find('a')
            if a_tag is not None:
                yield a_tag

    def getNames(self):
        """Return the ``title`` attribute of every player link."""
        return [a_tag.get('title') for a_tag in self._player_links()]

    def getID(self):
        """Return the ``id`` attribute of every player link."""
        return [a_tag.get('id') for a_tag in self._player_links()]

    def getBirth(self):
        """Return the ``.string`` of every <td> that has the 'zentriert' class.

        Despite the name, every matching cell is collected, not only birth
        dates; cells whose ``.string`` is None are left out.
        """
        bday = []
        for td_tag in self.soup.find_all('td'):
            if self._has_class(td_tag, 'zentriert') and td_tag.string is not None:
                bday.append(td_tag.string)
        return bday

    def getPos(self):
        """Return each known position label found inside the ``.string`` of a <td>.

        The test is a substring match, so surrounding whitespace or extra
        text in the cell does not prevent a hit.
        """
        pos = []
        for td_tag in self.soup.find_all('td'):
            text = td_tag.string
            if not text:
                continue
            pos.extend(p for p in self.positions if p in text)
        return pos

    def getNat(self):
        """Return the ``title`` of every <img> that has the 'flaggenrahmen' class."""
        nat = []
        for img_tag in self.soup.find_all('img'):
            if self._has_class(img_tag, 'flaggenrahmen') and not img_tag.string:
                nat.append(img_tag.get('title'))
        return nat
  • Related