Home > Net >  Scrapy not scraping the whole website even though I find the data with xpath
Scrapy not scraping the whole website even though I find the data with xpath

Time:12-09

I created my first Scrapy project by scraping from https://www.tapology.com. I have a list of links of all the fighters in the world, ordered by their countries. The scraper opens up the links, scrapes all the fighters on each page, and then moves to the next page for that country, or moves on to the next country when finished.

When I scrape the information of each fighter I get all the data without problem, but when I try to scrape all the rows of their amateur and/or pro matches I get nothing. I've tried multiple solutions, but the scrape returns nothing even though I can find the data with XPath.

Link example: https://www.tapology.com/search/mma-fighters-by-nationality/country-no

Fighter example from the link: https://www.tapology.com/fightcenter/fighters/126571-eric-mambo

I just found out that I can't see the data when I open the page source instead of inspecting the elements.

import scrapy
import os
import logging

# , FighterscraperDetailsItem, FighterscraperProRecordItem, FighterscraperAmateurRecordItem
class FighterSpider(scrapy.Spider):
    """Crawl tapology.com country listings and scrape every fighter profile.

    Seed country-listing URLs are read from ``linkListTest.txt`` (one URL per
    line).  Each listing page yields a request per fighter link plus a request
    for the listing's own "Next" page; each fighter page yields one item dict.

    NOTE(review): the pro/amateur match tables are rendered client-side from
    https://api.tapology.com, so they are NOT present in the HTML that Scrapy
    downloads -- that is why the record loop below finds nothing even though
    the rows are visible in the browser inspector.  To scrape them, build the
    API URL from the page's ``fid``/``salt`` meta tags and request it directly.
    """

    name = "fighter"
    allowed_domains = ["tapology.com"]
    count = 0  # NOTE(review): never read or written in this class -- confirm before removing

    # Replace the stock User-Agent and Retry middlewares with the
    # scrapy-fake-useragent pair so every request carries a randomized UA.
    custom_settings = dict(
        DOWNLOADER_MIDDLEWARES={
            "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": None,
            "scrapy.downloadermiddlewares.retry.RetryMiddleware": None,
            "scrapy_fake_useragent.middleware.RandomUserAgentMiddleware": 400,
            "scrapy_fake_useragent.middleware.RetryUserAgentMiddleware": 401,
        },
        # Providers are tried in order until one yields a usable UA string.
        FAKEUSERAGENT_PROVIDERS=[
            "scrapy_fake_useragent.providers.FakerProvider",
            "scrapy_fake_useragent.providers.FakeUserAgentProvider",
            "scrapy_fake_useragent.providers.FixedUserAgentProvider",
        ],
    )

    def start_requests(self):
        """Yield one request per non-blank seed URL in linkListTest.txt."""
        with open(os.path.abspath("linkListTest.txt"), "rt") as f:
            # Strip newlines and skip blank lines so a trailing empty line
            # in the file does not produce a bogus request.
            urls = [line.strip() for line in f if line.strip()]

        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_country)

    def parse_country(self, response):
        """Follow every fighter link on a country listing page, then paginate."""
        self.logger.debug(
            "User-Agent: %s", response.request.headers.get("User-Agent")
        )

        hrefs = response.xpath(
            "//td/a[contains(@href,'/fightcenter/fighters/')]/@href"
        ).getall()
        for href in hrefs:
            yield scrapy.Request(
                response.urljoin(href), callback=self.parse_fighter_page
            )

        # The page has two identical "Next ›" links; the second one is used.
        next_page = response.xpath(
            "(//a[@rel='next'][contains(text(),'Next ›')])[2]/@href"
        ).get()
        if next_page is not None:
            self.logger.debug("Following next page: %s", next_page)
            yield scrapy.Request(
                response.urljoin(next_page), callback=self.parse_country
            )

    def parse_fighter_page(self, response):
        """Scrape a single fighter profile page into one item dict.

        Yields a dict with ``name``, ``fighter_url``, a ``details`` sub-dict,
        and (currently empty) ``pro_record``/``amateur_record`` sub-dicts.
        """
        # Base selector for the stats panel; reused by every detail XPath.
        stats = "//div[@id='stats'][1]"

        item = {}
        item['name'] = response.xpath(
            "(//div[@id='stats'])[1]//li[position()=1]//strong[position()=1]"
            "/following-sibling::span/text()"
        ).get()
        item['fighter_url'] = response.request.url

        item['details'] = {}
        item['details']['nickname'] = response.xpath(
            f"{stats}//li/strong[contains(text(), 'Nickname')]"
            "/following-sibling::span[position()=1]/text()"
        ).get()
        # `stats` already carries the [1] predicate, so no extra [1] is needed.
        item['details']['record'] = response.xpath(
            f"{stats}//li[position()=2]//strong[position()=1]"
            "/following-sibling::span/text()"
        ).get()
        item['details']['age'] = response.xpath(
            "//span[@class='dateToAge'][1]/text()"
        ).get()
        # Date of birth is the second <span> after the "Age" label.
        item['details']['date_of_birth'] = response.xpath(
            f"{stats}//li/strong[contains(text(), 'Age')]"
            "/following-sibling::span[position()=2]/text()"
        ).get()
        item['details']['weight_class'] = response.xpath(
            f"{stats}//li/strong[contains(text(), 'Weight Class')]"
            "/following-sibling::span[position()=1]/text()"
        ).get()
        item['details']['born'] = response.xpath(
            f"{stats}//li/strong[contains(text(), 'Born')]"
            "/following-sibling::span[position()=1]/text()"
        ).get()
        item['details']['gym'] = response.xpath(
            f"{stats}//li/strong[contains(text(), 'Affiliation')]"
            "/following-sibling::span/a/text()"
        ).get()
        # Country code is taken from the flag link's href, e.g.
        # "/search/mma-fighters-by-nationality/country-no".
        item['details']['country_code'] = response.xpath(
            "(//div[@class='fighterUpcomingHeader'][1]/h2[@id='flag']"
            "/a[contains(@href,'/search/mma-fighters-by-nationality/country-')])"
            "/@href"
        ).get()

        item['pro_record'] = {}
        item['amateur_record'] = {}

        # WARNING(review): these match rows are injected client-side from the
        # tapology API, so this loop yields nothing on the raw HTML response.
        # The inner XPath is now relative ("./..."); the original started with
        # "/", which searches from the document root and can never match
        # relative to `match`.
        for match in response.xpath(
            "//div[@id='react-container'][1]/div[@id='fighterRecord']"
            "/section[2]/ul//li/div"
        ):
            opponent = match.xpath("./div[1]/div[1]/a/text()").get()
            self.logger.debug("opponent: %s", opponent)

        yield item

CodePudding user response:

You're not getting the results because they are created dynamically.

Learn how to use scrapy shell if you don't know how, it will help you a lot.

If we inspect the page with devtools in the browser under the 'xhr' tab we can see that it gets the data from https://api.tapology.com/v1/internal_fighters/1389126571. So we need to recreate the request with the same number.

scrapy shell "https://www.tapology.com/fightcenter/fighters/126571-eric-mambo"

# Let's first get the correct number so we can create the correct request to the API
In [1]: fid = response.xpath('//meta[@name="fid"]/@content').get()

In [2]: salt = response.xpath('//meta[@name="salt"]/@content').get()

In [3]: file_number = salt + fid

# Now we need to create the headers (I just copied them from devtools).
In [4]: headers = {
   ...: "Accept": "*/*",
   ...: "Accept-Encoding": "gzip, deflate, br",
   ...: "Accept-Language": "en-US,en;q=0.5",
   ...: "authorization": "Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ1c2VyIjoiaW50ZXJuYWxfYXBpIiwiZXhwIjoyNTM3NjU0N
   ...: DAwfQ.C1E9hhkQOH7XrfZ5c7aTYS4CKN3ACkJ1nvgvx2v10YY",
   ...: "Cache-Control": "no-cache",
   ...: "Connection": "keep-alive",
   ...: "content-type": "application/vnd.api json",
   ...: "DNT": "1",
   ...: "Host": "api.tapology.com",
   ...: "Origin": "https://www.tapology.com",
   ...: "Pragma": "no-cache",
   ...: "Referer": "https://www.tapology.com/",
   ...: "Sec-Fetch-Dest": "empty",
   ...: "Sec-Fetch-Mode": "cors",
   ...: "Sec-Fetch-Site": "same-site",
   ...: "Sec-GPC": "1",
   ...: "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
   ...: }

# create the request and fetch it
In [5]: req = scrapy.Request(url=f"https://api.tapology.com/v1/internal_fighters/{file_number}", headers=headers)

In [6]: fetch(req)
[scrapy.core.engine] DEBUG: Crawled (200) <GET https://api.tapology.com/v1/internal_fighters/138931242> (referer: https://www.tapology.com/)

# we got a json file, let's parse it (you can check it out by printing it or type view(response) to open it)

In [7]: json_data = response.json()
In [8]: for fighter in json_data['included']:
   ...:     print(fighter['id'])
   ...:     print(fighter['attributes']['opponent_fighter_name'])
   ...:
1170431
Greg Fischer
931909
Mohammad Alavi
879994
Kenta Takizawa
846160
Rodney Mondala
764333
Scott MacGregor
674056
Hiroyuki Oshiro
650257
Luke Catubig
600477
Josh Branham
534279
Won Jun Jang
458178
Rilley Dutro
427351
Jerome Cruz
356615
Kwan Ho Kwak
306251
Mark Abelardo
356662
Vince Masga
256096
Trevin Jones
175233
Ian Dela Cuesta
129057
Jung Hoon Ko
129499
Vince Pua
656653
John Paul Mendiola
129503
Carlos Tiongson

CodePudding user response:

You can extract fighter information with this code also please check:

import scrapy
from scrapy.utils.response import open_in_browser
import urllib.parse
class FighterinfoSpider(scrapy.Spider):
    """Alternative spider: start at the tapology search page, walk every
    per-country fighter list, and visit each fighter profile.

    NOTE(review): the XPath predicates below read ``[@]`` -- the attribute
    names/values were evidently stripped when this answer was extracted from
    the web page, so these selectors are not runnable as written; recover the
    original attribute predicates (likely ``@class=...``) before use.  The
    final method is also truncated (no body), so this snippet will not parse
    until it is completed.
    """
    name = 'fighter'
    allowed_domains = ['tapology.com']
    start_urls = ['https://www.tapology.com/search']

    def parse(self, response):
        # Collect the per-country fighter-list links from the search page.
        # (predicate garbled in extraction -- originally had an attribute test)
        fighters_by_nationality = response.xpath('//div[@]/dd/a/@href').getall()
        for link in fighters_by_nationality:
            yield response.follow(link, callback = self.parse_fighter_list_by_country)
    
    

    def parse_fighter_list_by_country(self,response):
        # Collect every fighter profile link from this country's listing table.
        all_fighter_links = response.xpath('//table[@]/tr/td/a/@href').getall()

        for individual_fighter in all_fighter_links:
            url = urllib.parse.urljoin('https://www.tapology.com',individual_fighter)
            yield scrapy.Request(url,callback=self.parse_fighter_page)
        # Follow pagination within this country's listing (predicate garbled).
        next_page = response.xpath('//span[@]/a/@href').get()
        if next_page:
            yield response.follow(next_page, callback = self.parse_fighter_list_by_country)

    def parse_fighter_page(self, response):
        # Extract fighter information (method body truncated in the source).
  • Related