So my problem is this: I have a webpage with a list of products. Every product has a lot of things defined in the attributes of each div. However, one of the fields is ambiguous, so I decided to open the product page and get it from there, where I also found some other data which I thought might be useful in my analysis. However, when I merge the data, some columns are the same while others seem to be updated.
Here's the code :
import scrapy
from scrapy.utils.response import open_in_browser
class getSalesData(scrapy.Spider):
    """Scrape product listings and enrich each one from its detail page.

    ``parse`` reads the attributes of every ``div.box-an`` on the listing
    page and requests the product URL; ``parseNextPage`` then adds the
    fields found in the ``ul.list-tab`` lists of that page.
    """

    name = 'getsalesdata'
    start_urls = ['https://link']

    def __init__(self, *args, **kwargs):
        """Set up the attribute list, the item template and the column map.

        Positional and keyword arguments are forwarded to ``scrapy.Spider``.
        """
        super().__init__(*args, **kwargs)
        # Attributes / CSS class names read from each listing, in this order.
        self.params = ['data-id-cod', 'id', 'data-name', 'data-area', 'data-zone',
                       'data-items', 'data-ssellertype', 'data-surface', 'price',
                       'tva', 'mobile-container-url']
        # Item template; 'null' marks a field that has not been filled in.
        self.item = {
            "id_cod": 'null',
            "id": 'null',
            "tip_prop": 'null',
            "area": 'null',
            "zone": 'null',
            "no_items": 'null',
            "seller_type": 'null',
            "surface": 'null',
            "surface_orig": 'null',
            "price": 'null',
            "currency": 'null',
            "url": 'null',
        }
        # Maps each entry of self.params to the item key it is stored under.
        # NOTE(review): 'nr_items' and 'valuta' are not keys of the template
        # above (which has 'no_items' and 'currency') - confirm which names
        # are intended.
        self.columns = {
            "data-id-cod": 'id_cod',
            "id": 'id',
            "data-name": 'tip_prop',
            "data-area": 'area',
            "data-zone": 'zone',
            "data-items": 'nr_items',
            "data-ssellertype": 'seller_type',
            "data-surface": 'surface',
            "price": 'price',
            "tva": 'valuta',
            "mobile-container-url": 'url',
        }

    def parse(self, response):
        """Yield one detail-page request per processed ``div.box-an`` listing.

        The item is passed to ``parseNextPage`` through the request's meta.
        """
        # Same dict object as self.item: no copy is made, so every request
        # yielded below carries a reference to this one dict in its meta.
        item = self.item
        for listing in response.css("div.box-an"):
            # Skip listings whose class attribute contains 'box-an '
            # (note the trailing space).
            if 'box-an ' not in listing.attrib['class']:
                # Serialised markup of the listing, used for the
                # "is this parameter mentioned at all" checks below.
                html = listing.get()
                for parameter in self.params:
                    column = self.columns[parameter]
                    if parameter in ['price', 'tva']:
                        # Text of <span class="price"> / <span class="tva">,
                        # with the dots removed.
                        item[column] = (
                            listing.css('span.' + parameter + '::text').get().replace('.', '')
                            if parameter in html else item[column]
                        )
                    elif parameter == 'mobile-container-url':
                        url = (
                            listing.css('a.visible-xs.' + parameter).attrib['href']
                            if parameter in html else item[column]
                        )
                        item[column] = url
                    elif parameter == 'data-surface':
                        # Values above 1000 are divided by 100 - presumably
                        # the decimal separator is missing there; confirm.
                        item['surface'] = (
                            str(int(listing.attrib[parameter]) / 100)
                            if int(listing.attrib[parameter]) > 1000
                            else listing.attrib[parameter]
                        )
                        item['surface_orig'] = (
                            listing.attrib[parameter]
                            if parameter in html else item[column]
                        )
                    else:
                        item[column] = (
                            listing.attrib[parameter]
                            if parameter in html else item[column]
                        )
                request = scrapy.Request(url=item['url'],
                                         callback=self.parseNextPage,
                                         meta={'item': item})
                yield request

    def parseNextPage(self, response):
        """Add the detail-page fields to the item from meta and return it."""
        item = response.meta['item']
        self.logger.info('Running on : ' + item['url'])
        # Alternative selector that was tried:
        # 'li.list-group-item.specificatii-oferta__lista--item'
        for spec in response.css('ul.list-tab'):
            for lst in spec.css('li'):
                # The <li> text is the field name, its <span> text the value.
                field = lst.css('li::text').get()
                item[field] = lst.css('span::text').get()
        return item
Here's the data (see the starred items which are the same):
id_cod id tip_prop no_items surface surface_all
*A2Q00LMBUS9 *XA2Q0001E *prodType1 2 41,21 mp *46.89 mp
*A2Q00LMBUS9 *XA2Q0001E *prodType1 3 140 mp *46.89 mp
I have a feeling that I don't understand how self.item is updated, and that some data is being kept from the previous run, maybe?
UPDATE: This is strange. If I use this code:
# A new dict is built for this request, so the callback does not receive
# the item object itself.
request = scrapy.Request(
    url=item['url'],
    callback=self.parseNextPage,
    meta={
        'item': {
            'id': item['id'],
            'id_cod': item['id_cod'],
            'area': item['area'],
            'nr_items': item['nr_items'],
            'seller_type': item['seller_type'],
            'surface': item['surface'],
            'tip_prop': item['tip_prop'],
            'url': item['url'],
            'currency': item['valuta'],
            # Was a second 'area' key, which silently replaced the first one.
            'zone': item['zone'],
        }
    },
)
It works fine. But if I use this code :
# Here meta refers to the item object itself, not to a copy of it.
request = scrapy.Request(
    url=item['url'],
    callback=self.parseNextPage,
    meta={'item': item},
)
It doesn't work anymore, even though item and the dictionary I am passing above are identical.
CodePudding user response:
Because item is passed to Request's meta as a reference, and the dict it refers to is reused for all products, so it is being overwritten all the time.
To fix that, remove the declaration of self.item from the constructor and simply create a new item inside the loop:
...
def init_item(self):
    """Return a new item dict with every field set to 'null'.

    A separate dict is created on each call, so callers never share one.
    """
    return {
        "id_cod": 'null',
        "id": 'null',
        "tip_prop": 'null',
        "area": 'null',
        "zone": 'null',
        "no_items": 'null',
        "seller_type": 'null',
        "surface": 'null',
        "surface_orig": 'null',
        "price": 'null',
        "currency": 'null',
        "url": 'null',
    }
def parse(self, response):
    """Process each ``div.box-an`` listing with its own freshly created item."""
    for listing in response.css("div.box-an"):
        # A new dict per listing, so requests no longer share one item.
        item = self.init_item()
        if 'box-an ' not in listing.attrib['class']:
            ...