How can I extract data and append it to current object in scrapy?-CodePudding

So my problem is this: I have a webpage with a list of products. Every product has a lot of things defined in the attributes of each div. However, one of the fields is ambiguous, so I decided to open the product page and get it from there, where I also found some other data which I thought might be useful in my analysis. However, when I merge the data, some columns are the same for every row while others seem to be updated.

Here's the code :

import scrapy
from scrapy.utils.response import open_in_browser

class getSalesData(scrapy.Spider):
    """Spider that reads listing attributes and then visits each product page.

    ``parse`` copies values from every ``div.box-an`` listing into an item
    dict and schedules a request to the listing's URL; ``parseNextPage``
    adds the specification fields found on that page to the item.
    """
    name = 'getsalesdata'

    start_urls = ['https://link']

    def __init__(self):
        # Attribute / CSS-class names looked up on each listing, in order.
        self.params = ['data-id-cod', 'id', 'data-name','data-area' ,'data-zone' ,'data-items','data-ssellertype' ,
                       'data-surface' ,'price' ,'tva' ,'mobile-container-url']

        # Item template with placeholder values.
        # NOTE: this is a single dict object owned by the spider instance;
        # parse() does not copy it before filling it in.
        self.item = { "id_cod"              : 'null',
                      "id"                  : 'null',
                      "tip_prop"            : 'null',
                      "area"                : 'null',
                      "zone"                : 'null',
                      "no_items"            : 'null',
                      "seller_type"         : 'null',
                      "surface"             : 'null',
                      "surface_orig"        : 'null',
                      "price"               : 'null',
                      "currency"            : 'null',
                      "url"                 : 'null'
        }

        # Maps a listing parameter name to the item key it is stored under.
        # NOTE(review): 'nr_items' and 'valuta' are not keys of self.item
        # above ('no_items' / 'currency' are) - confirm which names are intended.
        self.columns = { "data-id-cod"          : 'id_cod',
                         "id"                   : 'id',
                         "data-name"            : 'tip_prop',
                         "data-area"            : 'area',
                         "data-zone"            : 'zone',
                         "data-items"           : 'nr_items',
                         "data-ssellertype"     : 'seller_type',
                         "data-surface"         : 'surface',
                         "price"                : 'price',
                         "tva"                  : 'valuta',
                         "mobile-container-url" : 'url'
        }

    def parse(self, response):
        """Fill the item from each listing and request its product page.

        Yields one ``scrapy.Request`` per ``div.box-an`` element, with the
        item attached under ``meta['item']`` and ``parseNextPage`` as the
        callback.
        """
        # NOTE: `item` is the very same dict as self.item, so every request
        # yielded below carries a reference to one shared object.
        item = self.item
        for listing in response.css("div.box-an"):
            # Only listings whose class attribute lacks 'box-an ' (with a
            # trailing space) are read into the item.
            if not 'box-an ' in listing.attrib['class']:
                for parameter in self.params:
                    if parameter in ['price', 'tva']:
                        # Text of <span class="price|tva">, with dots removed.
                        item[self.columns[parameter]] = \
                            (listing.css('span.' + parameter + '::text').get()).replace('.','') \
                                if (parameter in listing.get()) else item[self.columns[parameter]]

                    elif parameter in 'mobile-container-url':
                        url = listing.css('a.visible-xs.' + parameter).attrib['href'] \
                                if (parameter in listing.get()) else item[self.columns[parameter]]

                        #self.logger.info('----->>> At URL : ' + url)

                        item[self.columns[parameter]] = url

                    elif parameter in 'data-surface':
                        # Values above 1000 are divided by 100
                        # (presumably they lack a decimal separator - TODO confirm).
                        item['surface'] = str(int(listing.attrib[parameter])/100) \
                            if (int(listing.attrib[parameter])>1000) else listing.attrib[parameter]
                        item['surface_orig'] = listing.attrib[parameter] \
                            if (parameter in listing.get()) else item[self.columns[parameter]]

                    else:
                        item[self.columns[parameter]] = \
                            listing.attrib[parameter] if (parameter in listing.get()) else item[self.columns[parameter]]

            # Issued for every listing in the loop, including those skipped
            # by the class check above.
            request = scrapy.Request(url=item['url'],
                                     callback=self.parseNextPage,
                                     meta={'item' : item})

            yield request

    def parseNextPage(self, response):
        """Add the product page's specification fields to the item and return it.

        For every ``li`` inside a ``ul.list-tab``, the ``li`` text becomes the
        key and the text of its ``span`` becomes the value.
        """
        item = response.meta['item']

        self.logger.info('Running on : ' + item['url'])

        #for spec in response.css('li.list-group-item.specificatii-oferta__lista--item'):
        for spec in response.css('ul.list-tab'):
            for lst in spec.css('li'):
                field = lst.css('li::text').get()
                #self.logger.info('Adding ' + field + '\n')
                item[field] = lst.css('span::text').get()

        return item

Here's the data (see the starred items which are the same):

id_cod          id          tip_prop    no_items surface    surface_all
*A2Q00LMBUS9    *XA2Q0001E  *prodType1  2        41,21 mp   *46.89 mp
*A2Q00LMBUS9    *XA2Q0001E  *prodType1  3        140 mp     *46.89 mp

I have a feeling that I don't understand how self.item is updated, and that some data is being kept from the previous run, maybe?

UPDATE: This is strange. If I use this code:

    # Variant that builds a new dict literal for meta each time it is evaluated,
    # instead of passing the `item` object itself.
    # NOTE(review): the key 'area' appears twice in this literal; Python keeps
    # the later entry, so the value stored is item['zona'].
    request = scrapy.Request(url=item['url'],
                             callback=self.parseNextPage,
                             meta={'item' : {'id'     : item['id'] ,
                                             'id_cod' : item['id_cod'],
                                             'area'  : item['area'],
                                             'nr_items' : item['nr_items'],
                                             'seller_type'      : item['seller_type'],
                                             'surface' : item['suprafata'],
                                             'tip_prop' : item['tip_prop'],
                                             'url'             : item['url'],
                                             'currency'          : item['valuta'],
                                             'area'            : item['zona']
                                             }
                                   }
                             )

It works fine. But if I use this code :

# Variant that hands the `item` object itself to meta; no copy is made here.
request = scrapy.Request(url=item['url'],
                         callback=self.parseNextPage,
                         meta={'item' : item})

It doesn't work anymore. Item and the dictionary I am passing above are identical.

CodePudding user response：

This happens because item is passed to the Request's meta by reference: the same dict object is reused for every product, so its values keep being overwritten.

To fix that, stop declaring self.item in the constructor and simply create a new item inside the loop:

...    
def init_item(self):
    """Return a brand-new item dict with every field set to the 'null' placeholder."""
    field_names = (
        "id_cod",
        "id",
        "tip_prop",
        "area",
        "zone",
        "no_items",
        "seller_type",
        "surface",
        "surface_orig",
        "price",
        "currency",
        "url",
    )
    # A new dict is built on each call, so callers never share state.
    return dict.fromkeys(field_names, 'null')

def parse(self, response):
    for listing in response.css("div.box-an"):
        item = self.init_item()
        if not 'box-an ' in listing.attrib['class']:
...