My spider extracts the desired data successfully, except that on every run it randomly misses a few items (usually 1 or 2). The responses to those requests have status 200, and no JavaScript is involved: when I tested those URLs on their own, they worked perfectly fine. I tried to slow down the scraping by reducing the number of concurrent requests, increasing the download delay, and increasing the download timeout, but none of these solved the problem. The code is below, followed by an image of my output CSV file (Items CSV).
Spider code:
import scrapy
from ..items import EurofaseItem
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst
class EuroFaseSpider(scrapy.Spider):
    """Crawl eurofase.com and yield one ``EurofaseItem`` per product page.

    ``parse`` follows the navigation-menu links found on the start page,
    ``parse_links`` follows every product link on a listing page, and
    ``parse_details`` loads the item from the product page.
    """

    name = 'euro_fase'
    start_urls = ['https://www.eurofase.com']
    # Read by CustomRetryMiddleware (when it is enabled) to decide whether a
    # product page came back without its expected content. The product title
    # XPath is used here; NOTE(review): confirm this is the intended marker.
    retry_xpath = "//h4/text()"

    @staticmethod
    def _section_xpath(heading):
        """Return the XPath of the second node following the ``<h5>`` whose text contains ``heading``."""
        return f"(//h5[contains(text(),'{heading}')]/following-sibling::node())[2]"

    def parse(self, response):
        """Follow each navigation-menu link to its listing page."""
        menu_links = response.xpath(
            "(//li[contains(@id,'nav-menu-item')]//div[@class='inner'])[1]"
            "/ul/li[contains(@id,'nav-menu-item')]/a/@href"
        ).getall()
        for link in menu_links:
            # response.follow resolves relative hrefs against the page URL.
            yield response.follow(link, callback=self.parse_links)

    def parse_links(self, response):
        """Follow every product link found on a listing page."""
        for anchor in response.xpath("//a[@class='eltd-product-link']"):
            link = anchor.xpath(".//@href").get()
            if not link:
                # An anchor without an href cannot be turned into a request.
                continue
            yield response.follow(
                link,
                callback=self.parse_details,
                # Marks the request for CustomRetryMiddleware's content check.
                meta={'is_product_page': True},
            )

    def dict_section(self, response, xpath1, xpath2, loader, field_name):
        """Collect a label/value section and add it to ``loader``.

        Args:
            response: page to query.
            xpath1: XPath selecting the label strings.
            xpath2: XPath selecting the value strings.
            loader: item loader that receives the result.
            field_name: item field to populate.

        The value added is ``{0: {label: value}, 1: {label: value}, ...}``
        in document order. When no values are found every label maps to
        ``''``; when no labels are found ``None`` is added.
        """
        labels = response.xpath(xpath1).getall()
        values = response.xpath(xpath2).getall()

        if not labels:
            loader.add_value(field_name, None)
            return

        if values:
            # The 'Approval' row holds an image rather than text, so its value
            # is absent from ``values``; insert the image URL (or '') at the
            # label's position to keep labels and values aligned.
            if 'Approval' in labels:
                img = response.xpath(
                    self._section_xpath('TECHNICAL DETAILS') + "/li//img/@src"
                ).get()
                values.insert(labels.index('Approval'), img or '')
        else:
            values = [''] * len(labels)

        # enumerate gives every row its own key; the previous hand-rolled
        # counter stayed at 1, so later rows overwrote each other.
        section = {
            index: {label: value}
            for index, (label, value) in enumerate(zip(labels, values))
        }
        loader.add_value(field_name, section)

    def parse_details(self, response):
        """Build and yield the item for a single product page."""
        loader = ItemLoader(EurofaseItem(), response=response)
        loader.default_output_processor = TakeFirst()

        loader.add_xpath('title', "//h4/text()")
        loader.add_xpath(
            'description',
            "//div[@class='summary entry-summary']"
            "//p[@class='eltd-single-product-subtitle']/text()",
        )
        loader.add_xpath(
            'copy',
            "//div[@class='woocommerce-product-details__short-description']/p[1]/text()",
        )

        # In these sections the <span> elements alternate label, value, ...
        for heading, field_name in (
            ('PRODUCTS DETAILS', 'product_details'),
            ('LIGHT SOURCE DETAILS', 'light_source_details'),
        ):
            spans = f"({self._section_xpath(heading)}/li/span)"
            self.dict_section(
                response,
                spans + "[position() mod 2 = 1]/text()",
                spans + "[position() mod 2 = 0]/text()",
                loader,
                field_name,
            )

        technical = self._section_xpath('TECHNICAL DETAILS')
        technical_spans = response.xpath(technical + "/li/span").getall()
        if len(technical_spans) < 2:
            # Fewer than two spans: only the 'Approval' label paired with its
            # image is looked up.
            self.dict_section(
                response,
                technical + "/li/span[contains(text(),'Approval')]/text()",
                technical + "/li//img/@src",
                loader,
                'technical_details',
            )
        else:
            spans = f"({technical}/li/span)"
            self.dict_section(
                response,
                spans + "[position() mod 2 = 1]/text()",
                spans + "[position() mod 2 = 0]/text()",
                loader,
                'technical_details',
            )

        loader.add_xpath('images', "(//img/@data-large_image)[position() > 1]")
        loader.add_xpath(
            'images',
            "//a[@class='eltd-btn eltd-btn-medium eltd-btn-solid CollectionBtn']"
            "/parent::node()/div/text()",
        )
        loader.add_xpath('download_resources', "//div[@class='ResourcesWrap']//a/@href")
        loader.add_xpath('additional_finishes', "//ul[@class='ColorList']//a/@href")
        loader.add_value('product_url', response.url)
        yield loader.load_item()
items.py:
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
from itemloaders.processors import MapCompose, Join
def decode_unicode(value):
    """Return ``value`` with every non-ASCII character removed.

    ``None`` is passed through unchanged.
    """
    if value is None:
        return None
    # Characters that cannot be encoded as ASCII are dropped, not replaced.
    ascii_bytes = value.encode('ascii', 'ignore')
    return ascii_bytes.decode()
def dict_decode_unicode(value):
    """Strip non-ASCII characters from the keys and values of nested dicts.

    Args:
        value: a dict whose values are themselves dicts of strings.

    Returns:
        The same ``value`` object, with each inner dict replaced by a copy
        whose keys and values have been passed through ``decode_unicode``.
    """
    # Snapshot the items so reassigning entries cannot disturb the iteration.
    for key, inner in list(value.items()):
        # Rebuild the whole inner dict in one step; replacing it once per
        # pair would keep only the last pair of a multi-entry inner dict.
        value[key] = {
            decode_unicode(inner_key): decode_unicode(inner_value)
            for inner_key, inner_value in inner.items()
        }
    return value
class EurofaseItem(scrapy.Item):
    """Fields collected for a single product page."""

    # Text fields: each input value is passed through decode_unicode.
    title = scrapy.Field(
        input_processor=MapCompose(decode_unicode),
    )
    description = scrapy.Field(
        input_processor=MapCompose(decode_unicode),
    )
    copy = scrapy.Field(
        input_processor=MapCompose(decode_unicode),
    )

    # Section fields: each input value is passed through dict_decode_unicode.
    product_details = scrapy.Field(
        input_processor=MapCompose(dict_decode_unicode),
    )
    light_source_details = scrapy.Field(
        input_processor=MapCompose(dict_decode_unicode),
    )
    technical_details = scrapy.Field(
        input_processor=MapCompose(dict_decode_unicode),
    )

    # Multi-valued fields: collected values are joined with ';' on output.
    images = scrapy.Field(output_processor=Join(';'))
    download_resources = scrapy.Field(output_processor=Join(';'))
    additional_finishes = scrapy.Field(output_processor=Join(';'))

    # No processor is declared for this field.
    product_url = scrapy.Field()
settings.py:
import scraper_helper as sh
# Project identity and the package that holds the spiders.
BOT_NAME = 'eurofase'
SPIDER_MODULES = ['eurofase.spiders']
NEWSPIDER_MODULE = 'eurofase.spiders'
# USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'

# Browser-like headers sent with every request. The text block is handed to
# scraper_helper's get_dict, which presumably parses "name: value" lines into
# a dict (verify against the scraper_helper documentation).
# The accept value must read "application/xhtml+xml"; without the "+" it is
# not a valid media type.
# NOTE(review): accept-encoding advertises "br". If Brotli decoding is not
# available to Scrapy, a br-compressed 200 response may not be decompressed
# and the page XPaths would then match nothing. Confirm brotli support is
# installed, or drop "br" from this header.
DEFAULT_REQUEST_HEADERS = sh.get_dict('''
accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
accept-encoding: gzip, deflate, br
accept-language: en-US,en;q=0.9
cache-control: no-cache
cookie: _ga=GA1.2.228374361.1647497775; _gid=GA1.2.66727921.1647497775; _gat=1
pragma: no-cache
sec-ch-ua: " Not A;Brand";v="99", "Chromium";v="99", "Google Chrome";v="99"
sec-ch-ua-mobile: ?0
sec-ch-ua-platform: "Windows"
sec-fetch-dest: document
sec-fetch-mode: navigate
sec-fetch-site: none
sec-fetch-user: ?1
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36
''')

ROBOTSTXT_OBEY = False
LOG_FILE = 'spider.log'

# Throttling knobs tried while debugging; only AutoThrottle is active.
# DOWNLOAD_DELAY = 10
# CONCURRENT_REQUESTS = 50
AUTOTHROTTLE_ENABLED = True
# DOWNLOAD_TIMEOUT = 500

ITEM_PIPELINES = {
    'eurofase.pipelines.EurofasePipeline': 300,
}
Any help or suggestions would certainly save me a lot of time, thank you.
CodePudding user response:
After reading the documentation for the retry middleware, I realized that it was what I was looking for. So I overrode the retry middleware as shown below: if a request is for a product page and the response does not contain the specified XPath (the title of the product), the request is sent for a retry:
from scrapy.downloadermiddlewares.retry import RetryMiddleware
from scrapy.utils.response import response_status_message
class CustomRetryMiddleware(RetryMiddleware):
    """Retry middleware that also retries product pages missing expected content.

    On top of the status-code check, a 200 response is retried when the
    request carries ``meta['is_product_page']`` and the response has no match
    for ``spider.retry_xpath``.
    """

    def process_response(self, request, response, spider):
        """Return ``response``, or a retry request when a retry is warranted.

        ``self._retry`` may return a falsy value, in which case the original
        response is passed on unchanged.
        """
        if request.meta.get('dont_retry', False):
            return response

        if response.status in self.retry_http_codes:
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response

        # Content check: a product page answered with 200 but the marker
        # XPath matched nothing.
        if (
            response.status == 200
            and request.meta.get('is_product_page')
            and not response.xpath(spider.retry_xpath)
        ):
            # The reason is logged; it must say the XPath was missing.
            reason = 'response missing xpath "{}"'.format(spider.retry_xpath)
            return self._retry(request, reason, spider) or response

        return response
Activating the custom middleware in settings.py:
# Replace Scrapy's stock retry middleware with the project's subclass.
DOWNLOADER_MIDDLEWARES = {
    # A value of None switches the built-in middleware off.
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
    # The custom middleware is registered at order 550.
    'eurofase.middlewares.CustomMiddlewares.CustomRetryMiddleware': 550,
}