Home > database > Scrapy missed a few items randomly on each run
Scrapy missed a few items randomly on each run

Time:03-22

My spider extracts the desired data successfully, except that each time I run it a few items (mostly 1 or 2) are missed at random. The responses to those requests are 200, and no JavaScript is involved, because when I tested those URLs on their own they worked perfectly fine. I tried to slow down the scraping by reducing the number of concurrent requests, increasing the download delay, and increasing the download timeout, but none of these solved the problem. An image of my output CSV file follows the code below. Items CSV

Spider code:

import scrapy
from ..items import EurofaseItem
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst

class EuroFaseSpider(scrapy.Spider):
    """Crawl eurofase.com: navigation menu -> category pages -> product pages.

    Each product page is turned into one ``EurofaseItem`` by ``parse_details``.
    """

    name = 'euro_fase'
    start_urls = ['https://www.eurofase.com']

    @staticmethod
    def _section(heading):
        """Return the XPath of the second node following the <h5> containing *heading*."""
        return "(//h5[contains(text(),'{}')]/following-sibling::node())[2]".format(heading)

    def parse(self, response):
        """Request every category link found in the first navigation sub-menu."""
        category_links = response.xpath(
            "(//li[contains(@id,'nav-menu-item')]//div[@class='inner'])[1]"
            "/ul/li[contains(@id,'nav-menu-item')]/a/@href"
        ).getall()
        for link in category_links:
            yield scrapy.Request(link, callback=self.parse_links)

    def parse_links(self, response):
        """Request the detail page of every product linked from a category page."""
        for anchor in response.xpath("//a[@class='eltd-product-link']"):
            link = anchor.xpath(".//@href").get()
            # An anchor without an href would make scrapy.Request(None) raise
            # and abort the remaining links of this page, so skip it instead.
            if link:
                yield scrapy.Request(link, self.parse_details)

    def dict_section(self, response, xpath1, xpath2, loader, field_name):
        """Pair up labels and values of one detail section and add them to *loader*.

        Args:
            response: the product page response.
            xpath1: XPath selecting the label texts of the section.
            xpath2: XPath selecting the value texts of the section.
            loader: the ItemLoader that receives the result.
            field_name: name of the item field to populate.

        The stored value is ``{0: {label: value}, 1: {label: value}, ...}``.
        When no labels are found, ``None`` is passed to the loader; when labels
        exist without values, every label is paired with an empty string.
        """
        labels = response.xpath(xpath1).getall()
        data = response.xpath(xpath2).getall()

        if not labels:
            loader.add_value(field_name, None)
            return

        if data:
            if 'Approval' in labels:
                # The "Approval" row holds an image rather than text, so the
                # value list has no entry for it; insert the image URL (or an
                # empty string) at the matching position to keep both lists
                # aligned for zip().
                img = response.xpath(
                    self._section('TECHNICAL DETAILS') + "/li//img/@src"
                ).get()
                data.insert(labels.index('Approval'), img or '')
            pairs = zip(labels, data)
        else:
            pairs = ((label, '') for label in labels)

        # enumerate() supplies a distinct key per row; the previous hand-rolled
        # counter was reset to 1 on every iteration, so all rows after the
        # first overwrote each other.
        values = {index: {label: text} for index, (label, text) in enumerate(pairs)}
        loader.add_value(field_name, values)

    def _add_paired_section(self, response, loader, heading, field_name):
        """Load a section whose <span> children alternate label, value, label, ..."""
        spans = "({}/li/span)".format(self._section(heading))
        self.dict_section(
            response,
            spans + "[position() mod 2 = 1]/text()",
            spans + "[position() mod 2 = 0]/text()",
            loader,
            field_name,
        )

    def parse_details(self, response):
        """Build and yield one EurofaseItem from a product detail page."""
        loader = ItemLoader(EurofaseItem(), response=response)
        loader.default_output_processor = TakeFirst()
        loader.add_xpath('title', "//h4/text()")
        loader.add_xpath('description', "//div[@class='summary entry-summary']//p[@class='eltd-single-product-subtitle']/text()")
        loader.add_xpath('copy', "//div[@class='woocommerce-product-details__short-description']/p[1]/text()")

        self._add_paired_section(response, loader, 'PRODUCTS DETAILS', 'product_details')
        self._add_paired_section(response, loader, 'LIGHT SOURCE DETAILS', 'light_source_details')

        technical = self._section('TECHNICAL DETAILS')
        technical_details = response.xpath(technical + "/li/span").getall()
        if len(technical_details) < 2:
            # Fewer than two spans: the section is not a label/value list, so
            # only the "Approval" label and its image are looked up.
            self.dict_section(
                response,
                technical + "/li/span[contains(text(),'Approval')]/text()",
                technical + "/li//img/@src",
                loader,
                'technical_details',
            )
        else:
            self._add_paired_section(response, loader, 'TECHNICAL DETAILS', 'technical_details')

        loader.add_xpath('images', "(//img/@data-large_image)[position() > 1]")
        # NOTE(review): this also feeds the 'images' field, although it selects
        # the text next to the collection button - confirm the intended field.
        loader.add_xpath('images', "//a[@class='eltd-btn eltd-btn-medium eltd-btn-solid CollectionBtn']/parent::node()/div/text()")
        loader.add_xpath('download_resources', "//div[@class='ResourcesWrap']//a/@href")
        loader.add_xpath('additional_finishes', "//ul[@class='ColorList']//a/@href")
        loader.add_value('product_url', response.url)

        yield loader.load_item()

items.py:

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy
from itemloaders.processors import MapCompose, Join

def decode_unicode(value):
    """Return *value* with all non-ASCII characters dropped.

    ``None`` is passed through unchanged.
    """
    if value is None:
        return value
    # Encoding with errors='ignore' discards every character outside ASCII;
    # decoding turns the resulting bytes back into a str.
    return value.encode('ascii', 'ignore').decode()

def dict_decode_unicode(value):
    """Strip non-ASCII characters from the keys and values of nested dicts.

    *value* maps an outer key to an inner ``{label: text}`` dict. Every inner
    dict is replaced by a copy whose keys and values went through
    ``decode_unicode``. The outer dict is modified in place and also returned.
    """
    # Snapshot the items so the outer dict can be reassigned while looping.
    for key, inner in list(value.items()):
        # Rebuild the inner dict in one step; replacing it once per inner
        # pair kept only the last pair of a multi-entry inner dict.
        value[key] = {
            decode_unicode(label): decode_unicode(text)
            for label, text in inner.items()
        }
    return value

class EurofaseItem(scrapy.Item):
    # Item populated for one product page.

    # Plain text fields: every incoming value is passed through decode_unicode.
    title = scrapy.Field(input_processor=MapCompose(decode_unicode))
    description = scrapy.Field(input_processor=MapCompose(decode_unicode))
    copy = scrapy.Field(input_processor=MapCompose(decode_unicode))
    # Section fields: every incoming value is passed through dict_decode_unicode,
    # which expects a dict of {outer_key: {label: text}}.
    product_details = scrapy.Field(input_processor=MapCompose(dict_decode_unicode))
    light_source_details = scrapy.Field(input_processor=MapCompose(dict_decode_unicode))
    technical_details = scrapy.Field(input_processor=MapCompose(dict_decode_unicode))
    # Multi-valued fields: collected values are joined into one ';'-separated string.
    images = scrapy.Field(output_processor=Join(separator=';'))
    download_resources = scrapy.Field(output_processor=Join(separator=';'))
    additional_finishes = scrapy.Field(output_processor=Join(separator=';'))
    # No processors declared on the field itself.
    product_url = scrapy.Field()

settings.py:

import scraper_helper as sh

# Scrapy project settings for the eurofase crawler.
BOT_NAME = 'eurofase'

SPIDER_MODULES = ['eurofase.spiders']
NEWSPIDER_MODULE = 'eurofase.spiders'

# USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'

# Browser-like headers sent with every request. The header text is handed to
# scraper_helper.get_dict (presumably parsing "name: value" lines into a dict -
# confirm against the scraper_helper documentation).
# The XHTML media type is "application/xhtml+xml"; the "+" had been lost.
DEFAULT_REQUEST_HEADERS = sh.get_dict('''
    accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
    accept-encoding: gzip, deflate, br
    accept-language: en-US,en;q=0.9
    cache-control: no-cache
    cookie: _ga=GA1.2.228374361.1647497775; _gid=GA1.2.66727921.1647497775; _gat=1
    pragma: no-cache
    sec-ch-ua: " Not A;Brand";v="99", "Chromium";v="99", "Google Chrome";v="99"
    sec-ch-ua-mobile: ?0
    sec-ch-ua-platform: "Windows"
    sec-fetch-dest: document
    sec-fetch-mode: navigate
    sec-fetch-site: none
    sec-fetch-user: ?1
    upgrade-insecure-requests: 1
    user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36
    ''')

ROBOTSTXT_OBEY = False
LOG_FILE = 'spider.log'

# Throttling knobs tried while debugging the missing items; only AutoThrottle
# is currently active.
# DOWNLOAD_DELAY = 10
# CONCURRENT_REQUESTS = 50
AUTOTHROTTLE_ENABLED = True
# DOWNLOAD_TIMEOUT = 500

ITEM_PIPELINES = {
    'eurofase.pipelines.EurofasePipeline': 300
}

Any help or suggestions would certainly save me a lot of time, thank you.

CodePudding user response:

After reading the documentation of the retry middleware, I realized that it was what I was looking for. So I overrode the retry middleware like this: if a request is for a product page and the response does not contain the specified XPath (the title of the product), the request is sent for a retry:

from scrapy.downloadermiddlewares.retry import RetryMiddleware
from scrapy.utils.response import response_status_message


class CustomRetryMiddleware(RetryMiddleware):
    """Retry middleware that also retries product pages missing expected content.

    On top of the status-code check, a 200 response is retried when the
    request carries ``meta['is_product_page']`` and the spider's
    ``retry_xpath`` attribute matches nothing in the response.
    """

    def process_response(self, request, response, spider):
        """Return *response*, or a retry request when the response is unusable.

        ``self._retry`` may return a falsy value; in that case the original
        response is returned so processing continues.
        """
        if request.meta.get('dont_retry', False):
            return response
        if response.status in self.retry_http_codes:
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response

        # Spiders that do not define retry_xpath simply opt out of the content
        # check instead of raising AttributeError on every response.
        retry_xpath = getattr(spider, 'retry_xpath', None)
        if (
            retry_xpath
            and response.status == 200
            and request.meta.get('is_product_page')
            and not response.xpath(retry_xpath)
        ):
            # The retry happens because the XPath matched nothing, so the
            # reason must say "missing" rather than "got".
            reason = 'response missing xpath "{}"'.format(retry_xpath)
            return self._retry(request, reason, spider) or response
        return response

activating the custom middleware in settings.py:

# Map the stock RetryMiddleware to None and register the custom subclass
# with order value 550.
DOWNLOADER_MIDDLEWARES = {
'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
'eurofase.middlewares.CustomMiddlewares.CustomRetryMiddleware': 550,

}

  • Related