Home > front end >  Web scraping three page levels from GSMArena
Web scraping three page levels from GSMArena

Time:01-20

I managed to collect all data points, but only up to the last result of the first page on LEVEL 2 (see result.csv).

How can I get all the results by following the pagination correctly on page LEVEL 2?

    next_page = response.xpath('//a[@]/@href').get()
    if next_page is not None:
        yield response.follow(next_page, callback=self.parse_allbranddevicesurl)

I am pretty sure that there is a lot of stuff wrong with the code, and that I am passing the datapoints from level to level incorrectly. I receive the following error message...

File "c:\python39\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
    for r in iterable:    
File ...\spiders\gsm.py", line 45, in parse_allbranddevicesurl
        brandname = response.meta['brandname']
    KeyError: 'brandname'

Here is the code:

import scrapy

class GsmSpider(scrapy.Spider):
    """Spider that walks three page levels starting from ``start_urls``.

    Level 1 (``parse``) follows a link per matched table, level 2
    (``parse_allbranddevicesurl``) follows every device link and the
    next-page link, and level 3 (``parse_detailpage``) yields one dict
    per detail page. Values collected on one level are handed to the
    next through the request's ``meta`` dict.
    """
    name = 'gsm'
    allowed_domains = ['gsmarena.com']
    start_urls = ['https://gsmarena.com/makers.php3']

    # LEVEL 1 | all brands

    def parse(self, response):
        """Yield one follow-up request per node matched on the start page.

        Each request carries ``brandname`` and ``devicecount`` in ``meta``
        and is handled by ``parse_allbranddevicesurl``.
        """
        # NOTE(review): the `[@]` predicate has no attribute test and looks
        # truncated -- confirm the intended selector.
        gsms = response.xpath('//div[@]/table')
        for gsm in gsms:
            # .get() returns only the first match inside each matched node.
            allbranddevicesurl = gsm.xpath('.//a/@href').get()
            brandname = gsm.xpath('.//a/text()').get()
            devicecount = gsm.xpath('.//span/text()').get()
            
            yield response.follow(allbranddevicesurl, callback=self.parse_allbranddevicesurl,
                                    meta= {'brandname': brandname,
                                           'devicecount': devicecount})

    # LEVEL 2 | all devices

    def parse_allbranddevicesurl(self, response):
        """Yield a detail-page request per ``<li>`` and follow the next page.

        Raises:
            KeyError: if ``brandname`` or ``devicecount`` is missing from
                ``response.meta``.
        """
        
        # Both keys are required; a request that reaches this callback
        # without them fails here with KeyError.
        brandname = response.meta['brandname']
        devicecount = response.meta['devicecount']
        
        phones = response.xpath('//*[@id="review-body"]//li')
        for phone in phones:
            thumbnailurl = phone.xpath('.//a/img/@src').get()
            detailpageurl = phone.xpath('.//a/@href').get()

            yield response.follow(detailpageurl,
                                    callback=self.parse_detailpage,
                                    meta= {'thumbnailurl': thumbnailurl, 
                                           'detailpageurl': detailpageurl,
                                           'brandname': brandname,
                                           'devicecount': devicecount})
    
        # NOTE(review): the `[@]` predicate looks truncated -- confirm the
        # intended next-page selector.
        next_page = response.xpath('//a[@]/@href').get()
        if next_page is not None:
            # This request is created without `meta`, yet it targets this same
            # callback, which reads response.meta['brandname'] at its top.
            yield response.follow(next_page, callback=self.parse_allbranddevicesurl)

    # LEVEL 3 | detailpage

    def parse_detailpage(self, response):
        """Extract the device fields and yield them as a single dict.

        Raises:
            KeyError: if any of the four expected ``meta`` keys is missing.
        """
     
        brandname = response.meta['brandname']
        devicecount = response.meta['devicecount']

        thumbnailurl = response.meta['thumbnailurl']
        detailpageurl = response.meta['detailpageurl']
 
        # NOTE(review): the `[@]` predicate looks truncated -- confirm the
        # intended container selector.
        details = response.xpath('//div[@]')
        for detail in details:
            phonename = detail.xpath('.//h1/text()').get()
            released = detail.xpath('.//ul/li[1]/span[1]/span/text()').get()
            body = detail.xpath('.//ul/li[1]/span[2]/span/text()').get()
            os = detail.xpath('.//ul/li[1]/span[3]/span/text()').get()
            memory = detail.xpath('.//ul/li[1]/span[4]/span/text()').get()
            displaysize = detail.xpath('.//ul/li[4]/strong/span/text()').get()
            displayres = detail.xpath('.//ul/li[4]/div/text()').get()
            camerapixels = detail.xpath('.//ul/li[5]/strong/span[1]/text()').get()
            camerapixelsunit = detail.xpath('.//ul/li[5]/strong/span[2]/text()').get()
            videopixels = detail.xpath('.//ul/li[5]/div/text()').get()
            ramsize = detail.xpath('.//ul/li[6]/strong/span[1]/text()').get()
            ramsizeunit = detail.xpath('.//ul/li[6]/strong/span[2]/text()').get()
            chipset = detail.xpath('.//ul/li[6]/div/text()').get()
            batsize = detail.xpath('.//ul/li[7]/strong/span[1]/text()').get()
            batsizeunit = detail.xpath('.//ul/li[7]/strong/span[2]/text()').get()
            battype = detail.xpath('.//ul/li[7]/div/text()').get()
            # Unlike the other fields this one uses .extract(), i.e. a list.
            popularity = detail.xpath('.//ul/li[2]/strong/text()[2]').extract()

        # The yield sits after the loop: it emits the values of the last
        # iteration only, and the field names are unbound if `details` is empty.
        yield {'brandname': brandname,
               'devicecount': devicecount,

               'thumbnailurl': thumbnailurl,
               'detailpageurl': detailpageurl,

               'phonename': phonename,
               'released': released,
               'body': body,
               'os': os,
               'memory': memory,
               'displaysize': displaysize,
               'displayres': displayres,
               'camerapixels': camerapixels,
               'camerapixelsunit': camerapixelsunit,
               'videopixels': videopixels,
               'ramsize': ramsize,
               'ramsizeunit': ramsizeunit,
               'chipset': chipset,
               'batsize': batsize,
               'batsizeunit': batsizeunit,
               'battype': battype,
               'popularity': popularity}

CodePudding user response:

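You forgot to pass the meta data along with the next-page request in level 2. That request is handled by the same callback, which reads response.meta['brandname'], so without the meta data the second page fails with the KeyError. Here is the spider with that request fixed: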

import scrapy


class GsmSpider(scrapy.Spider):
    """Spider that walks three page levels starting from ``start_urls``.

    Level 1 (``parse``) follows the link found in each matched table,
    level 2 (``parse_allbranddevicesurl``) follows every device link plus
    the next-page link, and level 3 (``parse_detailpage``) yields one item
    per matched detail container. Values collected on one level are handed
    to the next through the request's ``meta`` dict.

    NOTE(review): three XPath expressions below contain a bare ``[@]``
    predicate. The attribute test appears to have been lost and the
    expressions are not valid XPath as written; they are kept unchanged
    here because the intended attribute values cannot be recovered from
    this file -- restore them before running the spider.
    """

    name = 'gsm'
    allowed_domains = ['gsmarena.com']
    start_urls = ['https://gsmarena.com/makers.php3']

    # Per-spider overrides of the Scrapy settings that limit request
    # concurrency and add a delay between requests.
    custom_settings = {
        'CONCURRENT_REQUESTS': 4,
        'DOWNLOAD_DELAY': 0.5
    }

    # Output field -> XPath relative to the detail container. Each is read
    # with .get() (first match or None). Insertion order is the output
    # column order, so keep it stable.
    _DETAIL_FIELDS = {
        'phonename': './/h1/text()',
        'released': './/ul/li[1]/span[1]/span/text()',
        'body': './/ul/li[1]/span[2]/span/text()',
        'os': './/ul/li[1]/span[3]/span/text()',
        'memory': './/ul/li[1]/span[4]/span/text()',
        'displaysize': './/ul/li[4]/strong/span/text()',
        'displayres': './/ul/li[4]/div/text()',
        'camerapixels': './/ul/li[5]/strong/span[1]/text()',
        'camerapixelsunit': './/ul/li[5]/strong/span[2]/text()',
        'videopixels': './/ul/li[5]/div/text()',
        'ramsize': './/ul/li[6]/strong/span[1]/text()',
        'ramsizeunit': './/ul/li[6]/strong/span[2]/text()',
        'chipset': './/ul/li[6]/div/text()',
        'batsize': './/ul/li[7]/strong/span[1]/text()',
        'batsizeunit': './/ul/li[7]/strong/span[2]/text()',
        'battype': './/ul/li[7]/div/text()',
    }

    # LEVEL 1 | all brands

    def parse(self, response):
        """Yield one request per matched table that contains a link.

        Each request carries ``brandname`` and ``devicecount`` in ``meta``
        and is handled by ``parse_allbranddevicesurl``.
        """
        # NOTE(review): only the first link inside each matched table is
        # followed; if one table lists several brands, iterate its cells
        # instead -- confirm against the page.
        for brand in response.xpath('//div[@]/table'):
            brand_url = brand.xpath('.//a/@href').get()
            if not brand_url:
                # Nothing to follow; response.follow() needs a real URL.
                continue

            yield response.follow(
                brand_url,
                callback=self.parse_allbranddevicesurl,
                meta={'brandname': brand.xpath('.//a/text()').get(),
                      'devicecount': brand.xpath('.//span/text()').get()},
            )

    # LEVEL 2 | all devices

    def parse_allbranddevicesurl(self, response):
        """Yield a detail-page request per device and follow the next page.

        Raises:
            KeyError: if ``brandname`` or ``devicecount`` is missing from
                ``response.meta``.
        """
        # Built once and attached to every outgoing request, including the
        # next-page request, so this callback finds the keys again there.
        brand_meta = {'brandname': response.meta['brandname'],
                      'devicecount': response.meta['devicecount']}

        for phone in response.xpath('//*[@id="review-body"]//li'):
            detailpageurl = phone.xpath('.//a/@href').get()
            if not detailpageurl:
                # List entry without a link: there is no detail page to visit.
                continue

            yield response.follow(
                detailpageurl,
                callback=self.parse_detailpage,
                meta={'thumbnailurl': phone.xpath('.//a/img/@src').get(),
                      'detailpageurl': detailpageurl,
                      **brand_meta},
            )

        next_page = response.xpath('//a[@]/@href').get()
        if next_page is not None:
            yield response.follow(next_page,
                                  callback=self.parse_allbranddevicesurl,
                                  meta=dict(brand_meta))

    # LEVEL 3 | detailpage

    def parse_detailpage(self, response):
        """Yield one item dict per matched detail container.

        Nothing is yielded when no container matches, instead of failing
        on unassigned field variables.

        Raises:
            KeyError: if any of the four expected ``meta`` keys is missing.
        """
        inherited = {
            'brandname': response.meta['brandname'],
            'devicecount': response.meta['devicecount'],
            'thumbnailurl': response.meta['thumbnailurl'],
            'detailpageurl': response.meta['detailpageurl'],
        }

        for detail in response.xpath('//div[@]'):
            item = dict(inherited)
            for field, path in self._DETAIL_FIELDS.items():
                item[field] = detail.xpath(path).get()
            # Unlike the other fields, popularity is emitted as a list of
            # every matching text node.
            item['popularity'] = detail.xpath('.//ul/li[2]/strong/text()[2]').getall()
            yield item
  •  Tags:  
  • Related