I managed to collect all data points, but only up to the last result of the first page on LEVEL 2 (see result.csv).
How can I get all the results by following the pagination correctly on page LEVEL 2?
# Pagination attempt used in LEVEL 2 (parse_allbranddevicesurl): read the href of
# the "next page" link and, when one is found, schedule the same callback for it.
# NOTE(review): the predicate in '//a[@]' looks truncated by the paste — confirm
# the original attribute test before running.
next_page = response.xpath('//a[@]/@href').get()
if next_page is not None:
yield response.follow(next_page, callback=self.parse_allbranddevicesurl)
I am pretty sure that there is a lot of stuff wrong with the code, and that I am passing the datapoints from level to level incorrectly. I receive the following error message...
File "c:\python39\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File ...\spiders\gsm.py", line 45, in parse_allbranddevicesurl
brandname = response.meta['brandname']
KeyError: 'brandname'
Here is the code:
import scrapy
class GsmSpider(scrapy.Spider):
    """Three-level spider: brand list -> device list per brand -> device detail page.

    Values gathered on an upper level are handed to the next callback through
    ``Request.meta`` and finally merged into the item yielded on LEVEL 3.

    NOTE(review): several XPath predicates appear as ``[@]`` in this listing;
    the attribute test seems to have been lost when the code was pasted.
    Confirm the original selectors before running.
    """

    name = 'gsm'
    allowed_domains = ['gsmarena.com']
    start_urls = ['https://gsmarena.com/makers.php3']

    # (item key, XPath relative to the detail container) for every field that
    # is read as a single value with ``.get()``; order defines the item layout.
    _DETAIL_FIELDS = (
        ('phonename', './/h1/text()'),
        ('released', './/ul/li[1]/span[1]/span/text()'),
        ('body', './/ul/li[1]/span[2]/span/text()'),
        ('os', './/ul/li[1]/span[3]/span/text()'),
        ('memory', './/ul/li[1]/span[4]/span/text()'),
        ('displaysize', './/ul/li[4]/strong/span/text()'),
        ('displayres', './/ul/li[4]/div/text()'),
        ('camerapixels', './/ul/li[5]/strong/span[1]/text()'),
        ('camerapixelsunit', './/ul/li[5]/strong/span[2]/text()'),
        ('videopixels', './/ul/li[5]/div/text()'),
        ('ramsize', './/ul/li[6]/strong/span[1]/text()'),
        ('ramsizeunit', './/ul/li[6]/strong/span[2]/text()'),
        ('chipset', './/ul/li[6]/div/text()'),
        ('batsize', './/ul/li[7]/strong/span[1]/text()'),
        ('batsizeunit', './/ul/li[7]/strong/span[2]/text()'),
        ('battype', './/ul/li[7]/div/text()'),
    )

    # LEVEL 1 | all brands
    def parse(self, response):
        """Yield one request per matched brand table, carrying name and device count."""
        for table in response.xpath('//div[@]/table'):
            brand_link = table.xpath('.//a/@href').get()
            carried = {
                'brandname': table.xpath('.//a/text()').get(),
                'devicecount': table.xpath('.//span/text()').get(),
            }
            yield response.follow(
                brand_link,
                callback=self.parse_allbranddevicesurl,
                meta=carried,
            )

    # LEVEL 2 | all devices
    def parse_allbranddevicesurl(self, response):
        """Yield a detail-page request per listed device, then follow the next page."""
        brand = response.meta['brandname']
        count = response.meta['devicecount']

        for entry in response.xpath('//*[@id="review-body"]//li'):
            thumb = entry.xpath('.//a/img/@src').get()
            link = entry.xpath('.//a/@href').get()
            yield response.follow(
                link,
                callback=self.parse_detailpage,
                meta={
                    'thumbnailurl': thumb,
                    'detailpageurl': link,
                    'brandname': brand,
                    'devicecount': count,
                },
            )

        following = response.xpath('//a[@]/@href').get()
        if following is None:
            return
        # This request is sent without ``meta``; the callback's
        # ``response.meta['brandname']`` lookup on the following page is where
        # the KeyError quoted in the question is raised.
        yield response.follow(following, callback=self.parse_allbranddevicesurl)

    # LEVEL 3 | detailpage
    def parse_detailpage(self, response):
        """Yield one item per matched detail container, merged with the carried meta."""
        carried = {
            key: response.meta[key]
            for key in ('brandname', 'devicecount', 'thumbnailurl', 'detailpageurl')
        }
        for block in response.xpath('//div[@]'):
            record = dict(carried)
            for field, path in self._DETAIL_FIELDS:
                record[field] = block.xpath(path).get()
            # ``popularity`` is the only field collected as a list of matches.
            record['popularity'] = block.xpath('.//ul/li[2]/strong/text()[2]').extract()
            yield record
CodePudding user response:
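You forgot to pass the metadata along with the next-page request in LEVEL 2. Every request has its own `meta`, so when the next page is parsed by the same callback, `response.meta['brandname']` is missing and raises the `KeyError` shown above. Pass `brandname` and `devicecount` again when following the next page: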
import scrapy
class GsmSpider(scrapy.Spider):
    """Crawl the brand list, each brand's device pages, and each device's detail page.

    ``brandname`` and ``devicecount`` are read on LEVEL 1 and passed down via
    ``Request.meta`` so that every item yielded on LEVEL 3 carries them.

    NOTE(review): the XPath predicates written as ``[@]`` look truncated (the
    attribute test seems to have been stripped when the code was pasted).
    They are kept verbatim here; restore the original selectors before running.
    """

    name = 'gsm'
    allowed_domains = ['gsmarena.com']
    start_urls = ['https://gsmarena.com/makers.php3']
    custom_settings = {
        'CONCURRENT_REQUESTS': 4,
        'DOWNLOAD_DELAY': 0.5
    }

    # LEVEL 1 | all brands
    def parse(self, response):
        """Yield a LEVEL 2 request for each matched brand table.

        The request's ``meta`` holds ``brandname`` and ``devicecount``.
        """
        gsms = response.xpath('//div[@]/table')
        for gsm in gsms:
            allbranddevicesurl = gsm.xpath('.//a/@href').get()
            if allbranddevicesurl is None:
                # response.follow() rejects a missing URL; skip instead of
                # aborting the whole callback.
                continue
            yield response.follow(
                allbranddevicesurl,
                callback=self.parse_allbranddevicesurl,
                meta={
                    'brandname': gsm.xpath('.//a/text()').get(),
                    'devicecount': gsm.xpath('.//span/text()').get(),
                },
            )

    # LEVEL 2 | all devices
    def parse_allbranddevicesurl(self, response):
        """Yield a LEVEL 3 request per device and a LEVEL 2 request for the next page.

        Raises:
            KeyError: if the incoming request did not carry ``brandname`` and
                ``devicecount`` in its ``meta``.
        """
        # Built once and reused for every request issued from this page.
        brand_meta = {
            'brandname': response.meta['brandname'],
            'devicecount': response.meta['devicecount'],
        }

        phones = response.xpath('//*[@id="review-body"]//li')
        for phone in phones:
            detailpageurl = phone.xpath('.//a/@href').get()
            if detailpageurl is None:
                # An <li> without a link must not stop the remaining devices
                # and the pagination request below from being yielded.
                continue
            yield response.follow(
                detailpageurl,
                callback=self.parse_detailpage,
                meta={
                    'thumbnailurl': phone.xpath('.//a/img/@src').get(),
                    'detailpageurl': detailpageurl,
                    **brand_meta,
                },
            )

        next_page = response.xpath('//a[@]/@href').get()
        if next_page:
            # This is the fixed line: the next page is parsed by this same
            # callback, so it needs brandname/devicecount in its meta as well.
            yield response.follow(
                next_page,
                callback=self.parse_allbranddevicesurl,
                meta=brand_meta,
            )

    # LEVEL 3 | detailpage
    def parse_detailpage(self, response):
        """Yield one item dict per matched detail container.

        Each item combines the four values carried in ``response.meta`` with
        the fields extracted from the page.

        Raises:
            KeyError: if any of ``brandname``, ``devicecount``,
                ``thumbnailurl`` or ``detailpageurl`` is absent from ``meta``.
        """
        brandname = response.meta['brandname']
        devicecount = response.meta['devicecount']
        thumbnailurl = response.meta['thumbnailurl']
        detailpageurl = response.meta['detailpageurl']

        for detail in response.xpath('//div[@]'):
            yield {
                'brandname': brandname,
                'devicecount': devicecount,
                'thumbnailurl': thumbnailurl,
                'detailpageurl': detailpageurl,
                'phonename': detail.xpath('.//h1/text()').get(),
                'released': detail.xpath('.//ul/li[1]/span[1]/span/text()').get(),
                'body': detail.xpath('.//ul/li[1]/span[2]/span/text()').get(),
                'os': detail.xpath('.//ul/li[1]/span[3]/span/text()').get(),
                'memory': detail.xpath('.//ul/li[1]/span[4]/span/text()').get(),
                'displaysize': detail.xpath('.//ul/li[4]/strong/span/text()').get(),
                'displayres': detail.xpath('.//ul/li[4]/div/text()').get(),
                'camerapixels': detail.xpath('.//ul/li[5]/strong/span[1]/text()').get(),
                'camerapixelsunit': detail.xpath('.//ul/li[5]/strong/span[2]/text()').get(),
                'videopixels': detail.xpath('.//ul/li[5]/div/text()').get(),
                'ramsize': detail.xpath('.//ul/li[6]/strong/span[1]/text()').get(),
                'ramsizeunit': detail.xpath('.//ul/li[6]/strong/span[2]/text()').get(),
                'chipset': detail.xpath('.//ul/li[6]/div/text()').get(),
                'batsize': detail.xpath('.//ul/li[7]/strong/span[1]/text()').get(),
                'batsizeunit': detail.xpath('.//ul/li[7]/strong/span[2]/text()').get(),
                'battype': detail.xpath('.//ul/li[7]/div/text()').get(),
                # Kept as a list of matches (.extract()) so the exported
                # column keeps the same shape as before.
                'popularity': detail.xpath('.//ul/li[2]/strong/text()[2]').extract(),
            }