Home > Mobile >  Scrapy - file is not running through the list of items - Updated Code
Scrapy - file is not running through the list of items - Updated Code

Time:08-17

These are my working files, followed by the terminal log produced when the spider was run. Thanks!

Settings

# Scrapy settings for the antaira project.
#
# Only the commonly used settings are listed here. For the complete
# reference see:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'antaira'

SPIDER_MODULES = ['antaira.spiders']
NEWSPIDER_MODULE = 'antaira.spiders'

# Stop the crawl once this many pages have been downloaded
# (handled by scrapy.extensions.closespider.CloseSpider).
CLOSESPIDER_PAGECOUNT = 25

# Identify the crawler to servers via the user-agent string.
#USER_AGENT = 'antaira ( http://www.yourdomain.com)'

# Respect each site's robots.txt rules.
ROBOTSTXT_OBEY = True

# Maximum concurrent requests performed by Scrapy (default: 16).
#CONCURRENT_REQUESTS = 32

# Delay between requests to the same website (default: 0); see
# https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# and the AutoThrottle settings further down.
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of these two:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Cookies are enabled by default; uncomment to disable.
#COOKIES_ENABLED = False

# The Telnet console is enabled by default; uncomment to disable.
#TELNETCONSOLE_ENABLED = False

# Override the default request headers.
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Spider middlewares; see
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'antaira.middlewares.AntairaSpiderMiddleware': 543,
#}

# Downloader middlewares; see
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'antaira.middlewares.AntairaDownloaderMiddleware': 543,
#}

# Extensions; see https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Item pipelines; see
# https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'antaira.pipelines.AntairaPipeline': 300,
}

# AutoThrottle extension (disabled by default); see
# https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# Initial download delay.
#AUTOTHROTTLE_START_DELAY = 5
# Maximum download delay in case of high latencies.
#AUTOTHROTTLE_MAX_DELAY = 60
# Average number of requests sent in parallel to each remote server.
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Show throttling stats for every response received.
#AUTOTHROTTLE_DEBUG = False

# HTTP caching (disabled by default); see
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Per-domain request limit (needs the custom middleware below, currently off).
#MAX_REQUESTS_PER_DOMAIN = 4
DOWNLOADER_MIDDLEWARES = {
    #'<myproject>.middlewares.DomainlimitMiddleware': 543,
}

# BaseDupeFilter performs no duplicate filtering at all, so overlapping or
# repeated URLs are all requested.
DUPEFILTER_CLASS = 'scrapy.dupefilters.BaseDupeFilter'

The item pipeline should be a fairly standard pipeline. Pipeline

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import json

class AntairaPipeline:
    """Write every scraped item to ``result.json``, one JSON object per line.

    The output file is opened when the spider starts and closed when it
    finishes; ``process_item`` appends one line per item (JSON Lines format).
    """

    def open_spider(self, spider):
        # Open the output file once for the whole crawl; closed in close_spider.
        self.file = open('result.json', 'w')

    def process_item(self, item, spider):
        # Serialize the item to a single JSON line. The original line read
        # `json.dumps(dict(item))   "\n"` — the `+` had been lost, which is
        # a SyntaxError; restored here.
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()

Standard Items, nothing exceptional. Items

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

class AntairaItem(scrapy.Item):
    """Container for one scraped Antaira product record."""

    # Field names mirror the team's database column names.
    product_sku = scrapy.Field()
    summary = scrapy.Field()
    description = scrapy.Field()
    products_zoom_image = scrapy.Field()
    main_image = scrapy.Field()
    product_link = scrapy.Field()
    # Fields below exist in earlier revisions but are currently disabled:
    #rel_product_link    = scrapy.Field()
    #rel_links           = scrapy.Field()
    #datasheet           = scrapy.Field()

I changed some of the field names to better match my team's database column names. Scrapy Spider

import scrapy
from ..items import AntairaItem


class ProductJumperFix(scrapy.Spider):
    """Crawl Antaira product-category pages and yield one item per product.

    Category listing pages are fetched in ``start_requests``; ``parse``
    follows the product links found on each listing, and ``parse_new_item``
    extracts the product fields into an :class:`AntairaItem`.
    """

    name = 'productJumperFix'
    allowed_domains = ['antaira.com']

    # BaseDupeFilter disables duplicate-request filtering so the overlapping
    # category URLs below (some appear twice, with and without ?range=) are
    # all crawled.
    custom_settings = {
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
    }

    def start_requests(self):
        """Seed the crawl with the product-category listing pages."""
        urls = [
            'https://www.antaira.com/products/10-100Mbps',
            'https://www.antaira.com/products/unmanaged-gigabit',
            'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE',
            'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE?range=41,48,48',
            'https://www.antaira.com/products/Unmanaged-Gigabit-PoE',
            'https://www.antaira.com/products/Unmanaged-Gigabit-PoE?range=41,43,43',
            'https://www.antaira.com/products/Unmanaged-10-gigabit',
            'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE',
            'https://www.antaira.com/products/unmanaged-gigabit',
        ]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        """Follow every product link found on a category listing page."""
        # NOTE(review): the predicate '//div[@]' looks truncated — the class
        # attribute test was probably lost when this code was pasted. Confirm
        # the intended selector against the live page markup.
        for url in response.xpath('//div[@]//a/@href').getall():
            product_link = response.urljoin(url)  # make relative hrefs absolute
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    def parse_new_item(self, response):
        """Extract the product fields from a single product detail page."""
        for product in response.css('main.products'):
            item = AntairaItem()  # fresh item for each iteration
            item['product_link'] = response.url

            name_dirty = product.css('h1.product-name::text').get()
            product_sku = name_dirty.strip()
            # 'h3 + ul' selects the feature list immediately following the
            # heading (the '+' adjacent-sibling combinator had been dropped).
            summary = product.css('section.features h3 + ul').getall()
            description = product.css('.products .product-overview::text').getall()
            # Zoom-image filename is derived from the SKU ('+' restored; the
            # original `name_dirty.strip()   '.jpg'` was a SyntaxError).
            products_zoom_image = product_sku + '.jpg'
            main_image = response.urljoin(
                product.css('div.selectors img::attr(src)').get())

            # No trailing commas here: in the original they turned every
            # value into a 1-tuple, which is why the crawl log showed
            # 'product_sku': ('LNP-1204G-10G-SFP-24',) etc.
            item['product_sku'] = product_sku
            item['summary'] = summary
            item['description'] = description
            item['products_zoom_image'] = products_zoom_image
            item['main_image'] = main_image
            #item['rel_links'] = product.xpath(
            #    "//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
            #item['datasheet'] = datasheet
            yield item

I have trimmed most of the log, keeping only the parts showing pages that were crawled but not scraped. Terminal Log

joel@testbed:~/Desktop/antaira/antaira/spiders$ scrapy crawl productJumperFix -O products.csv
2022-08-15 16:33:34 [scrapy.utils.log] INFO: Scrapy 2.6.2 started (bot: antaira)
2022-08-15 16:33:34 [scrapy.utils.log] INFO: Versions: lxml 4.5.0.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 18.9.0, Python 3.8.10 (default, Jun 22 2022, 20:18:18) - [GCC 9.4.0], pyOpenSSL 19.0.0 (OpenSSL 1.1.1f  31 Mar 2020), cryptography 2.8, Platform Linux-5.15.0-46-generic-x86_64-with-glibc2.29
2022-08-15 16:33:34 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'antaira',
 'CLOSESPIDER_PAGECOUNT': 25,
 'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
 'NEWSPIDER_MODULE': 'antaira.spiders',
 'ROBOTSTXT_OBEY': True,
 'SPIDER_MODULES': ['antaira.spiders']}
2022-08-15 16:33:34 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2022-08-15 16:33:34 [scrapy.extensions.telnet] INFO: Telnet Password: 3f9ff0160659640b
2022-08-15 16:33:34 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.closespider.CloseSpider',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2022-08-15 16:33:34 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
 'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
 'scrapy.downloadermiddlewares.stats.DownloaderStats']
2022-08-15 16:33:34 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.spidermiddlewares.referer.RefererMiddleware',
 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
 'scrapy.spidermiddlewares.depth.DepthMiddleware']
2022-08-15 16:33:34 [scrapy.middleware] INFO: Enabled item pipelines:
['antaira.pipelines.AntairaPipeline']
2022-08-15 16:33:34 [scrapy.core.engine] INFO: Spider opened
2022-08-15 16:33:34 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2022-08-15 16:33:34 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2022-08-15 16:33:35 [filelock] DEBUG: Attempting to acquire lock 140158480454080 on /home/joel/.cache/python-tldextract/3.8.10.final__usr__7d8fdf__tldextract-3.3.1/publicsuffix.org-tlds/de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock
2022-08-15 16:33:35 [filelock] DEBUG: Lock 140158480454080 acquired on /home/joel/.cache/python-tldextract/3.8.10.final__usr__7d8fdf__tldextract-3.3.1/publicsuffix.org-tlds/de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock
2022-08-15 16:33:35 [filelock] DEBUG: Attempting to release lock 140158480454080 on /home/joel/.cache/python-tldextract/3.8.10.final__usr__7d8fdf__tldextract-3.3.1/publicsuffix.org-tlds/de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock
2022-08-15 16:33:35 [filelock] DEBUG: Lock 140158480454080 released on /home/joel/.cache/python-tldextract/3.8.10.final__usr__7d8fdf__tldextract-3.3.1/publicsuffix.org-tlds/de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock
2022-08-15 16:33:35 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/robots.txt> (referer: None)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-10-gigabit-PoE> (referer: None)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-10-gigabit> (referer: None)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-Gigabit-PoE?range=41,43,43> (referer: None)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/10-100Mbps> (referer: None)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-Gigabit-PoE> (referer: None)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit> (referer: None)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-10-100Mbps-PoE?range=41,48,48> (referer: None)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-10-100Mbps-PoE> (referer: None)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-10-gigabit-PoE/LNP-1204G-10G-SFP-24> (referer: https://www.antaira.com/products/Unmanaged-10-gigabit-PoE)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit> (referer: None)
2022-08-15 16:33:37 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.antaira.com/products/Unmanaged-10-gigabit-PoE/LNP-1204G-10G-SFP-24>
{'description': (['Antaira Technologies’ LNP-1204G-10G-SFP-24 are industrial '
                  'gigabit PoE  unmanaged Ethernet switches featuring '
                  '8*10/100/1000Tx Gigabit Ethernet ports that support '
                  'IEEE802.3at for a maximum of 30W/port. The '
                  'LNP-1204G-10G-SFP-24 has 2*1G SFP slots and 2*10G SFP  '
                  'slots which provide options for long-distance fiber '
                  'connections. The Ethernet switches are designed with high '
                  'EFT and ESD protection and support standard operating '
                  'temperature from -40° to 65°C.',
                  'The LNP-1204G-10G-SFP-24 are IP30 rated and DIN-rail '
                  'mountable. These Ethernet switches are designed to be '
                  'powered with low voltage input (12~55VDC) while still '
                  'providing the higher voltages required by the PoE '
                  'standards. Additionally, these industrial PoE Ethernet '
                  'switches provide connectivity for outdoor or harsh '
                  'industrial automation application environments, such as '
                  'security surveillance, ITS-traffic monitoring systems, '
                  'oil/gas and mining, facility management for power/utility, '
                  'water wastewater treatment plants, and lastly, automated '
                  'production lines in factory automation.'],),
 'main_image': ('https://www.antaira.com/core/media/media.nl?id=1553822&c=685553&h=KRqHvivRvzYNGs_zSsw3x5fAu9EoYxBr3AAjkX2TH7iCoXyh',),
 'product_link': 'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE/LNP-1204G-10G-SFP-24',
 'product_sku': ('LNP-1204G-10G-SFP-24',),
 'products_zoom_image': 'LNP-1204G-10G-SFP-24.jpg',
 'summary': (['<ul>\r\n'
              '<li>Supports 8*10/100/1000Tx IEEE 802.3af/at Compliant with '
              '30W/Port, 2*1G SFP Slots, and 2*10G SFP  Slots</li>\r\n'
              '<li>Store-and-Forward Switching Architecture</li>\r\n'
              '<li>60Gbps Back-Plane (Switching Fabric)</li>\r\n'
              '<li>16K MAC Address Table</li>\r\n'
              '<li>10Kbytes Jumbo Frame Support</li>\r\n'
              '<li>Redundant Power Input Design: 12~55VDC</li>\r\n'
              '<li>Bult-in 1 Relay Output for Power Failure Warning</li>\r\n'
              '<li>IP30 Rugged Metal Case Design</li>\r\n'
              '<li>DIN-Rail and Wall Mount Support Included</li>\r\n'
              '<li>Operating Temperature Range: -40°C~65°C</li>\r\n'
              '<li>5-Year Warranty</li>\r\n'
              '</ul>'],)}
2022-08-15 16:33:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-10-gigabit-PoE/LNP-1204G-10G-SFP> (referer: https://www.antaira.com/products/Unmanaged-10-gigabit-PoE)
2022-08-15 16:33:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-10-gigabit-PoE/LNP-1002G-10G-SFP-24> (referer: https://www.antaira.com/products/Unmanaged-10-gigabit-PoE)
2022-08-15 16:33:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-Gigabit-PoE/LNP-C501G-SFP-bt-T> (referer: https://www.antaira.com/products/Unmanaged-Gigabit-PoE?range=41,43,43)
2022-08-15 16:33:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-10-gigabit/LNX-1204G-10G-SFP> (referer: https://www.antaira.com/products/Unmanaged-10-gigabit)
2022-08-15 16:33:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-Gigabit-PoE/LNP-C501G-SFP-bt> (referer: https://www.antaira.com/products/Unmanaged-Gigabit-PoE?range=41,43,43)
2022-08-15 16:33:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-10-gigabit-PoE/LNP-1002G-10G-SFP> (referer: https://www.antaira.com/products/Unmanaged-10-gigabit-PoE)
2022-08-15 16:33:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-10-gigabit/LNX-1002G-10G-SFP> (referer: https://www.antaira.com/products/Unmanaged-10-gigabit)
2022-08-15 16:33:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/10-100Mbps/LNX-0501-ST-M-T> (referer: https://www.antaira.com/products/10-100Mbps)
2022-08-15 16:33:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-C501G-SFP-T> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/10-100Mbps/LNX-1600-T> (referer: https://www.antaira.com/products/10-100Mbps)
2022-08-15 16:33:38 [scrapy.core.engine] INFO: Closing spider (closespider_pagecount)
2022-08-15 16:33:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-C501G-SFP-T> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-C800G-T> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-C800G> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-1204G-10G-SFP> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-C501G-SFP> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-10-100Mbps-PoE/LNP-0800-60-24-T> (referer: https://www.antaira.com/products/unmanaged-10-100Mbps-PoE?range=41,48,48)
2022-08-15 16:33:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-1600G> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-2004G-SFP-T> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-1802G-SFP-T> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-1002G-10G-SFP> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-1600G-T> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-C500G-T> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-2004G-SFP> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-1802G-SFP> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:39 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.antaira.com/products/unmanaged-gigabit/LNX-1802G-SFP>
{'description': (['Antaira’s new LNX-1802G-SFP industrial gigabit unmanaged '
                  'Ethernet switch is IP30 rated and DIN-Rail mountable. Each '
                  'unit is designed with 16 gigabit Ethernet ports and 2 dual '
                  'rate (100/1000) SFP slots for fiber connections, making it '
                  'ideal for applications that demand high bandwidth and long '
                  'distance communication. \r\n',
                  '\r\n'
                  'This product provides high EFT and ESD protection to '
                  'prevent any unregulated voltage and is suitable for harsh '
                  'environments. The unit also supports a standard operating '
                  'temperature from -10 to 70°C. \r\n',
                  ' ',
                  '\r\n'
                  'The LNX-1802G-SFP is a perfect industrial networking '
                  'product to support any applications that require high '
                  'bandwidth or high density connections, such as '
                  'Power/Utility, Water Wastewater Treatment, Oil/Gas/Mining, '
                  'Process Control Automation, Security Access Control '
                  'Systems, and Intelligent Transportation Systems.'],),
 'main_image': ('https://www.antaira.com/core/media/media.nl?id=1236032&c=685553&h=ARdQdDsGuiZpMENJKZsmA3gN6RbhLAQSkBjKdazk1YE_PNrG',),
 'product_link': 'https://www.antaira.com/products/unmanaged-gigabit/LNX-1802G-SFP',
 'product_sku': ('LNX-1802G-SFP',),
 'products_zoom_image': 'LNX-1802G-SFP.jpg',
 'summary': (['<ul>\r\n'
              '<li>Supports 16*10/100/1000Tx   2*100/1000 SFP ports </li>\r\n'
              '<li>Supports Auto MDI/MDI-X Function</li>\r\n'
              '<li>Store-and-Forward Switching Architecture</li>\r\n'
              '<li>8K MAC Address Table</li>\r\n'
              '<li>Surge Protection: 2,000 VDC Support</li>\r\n'
              '<li>ESD Protection: 6,000 VDC Support</li>\r\n'
              '<li>Redundant Power Input Design: 12~48VDC</li>\r\n'
              '<li>Built-in 1 Relay Output for Power Failure Detection</li>\r\n'
              '<li>IP30 Rugged Metal Case Design</li>\r\n'
              '<li>DIN-Rail and Wall Mount Support</li>\r\n'
              '<li>Operating Temperature Range: -10° to 70° C</li>\r\n'
              '<li>5-Year Warranty</li>\r\n'
              '</ul>'],)}
2022-08-15 16:33:39 [scrapy.extensions.feedexport] INFO: Stored csv feed (30 items) in: products.csv
2022-08-15 16:33:39 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 24728,
 'downloader/request_count': 40,
 'downloader/request_method_count/GET': 40,
 'downloader/response_bytes': 650133,
 'downloader/response_count': 40,
 'downloader/response_status_count/200': 40,
 'elapsed_time_seconds': 4.618773,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'closespider_pagecount',
 'finish_time': datetime.datetime(2022, 8, 15, 23, 33, 39, 600766),
 'httpcompression/response_bytes': 3250008,
 'httpcompression/response_count': 39,
 'item_scraped_count': 30,
 'log_count/DEBUG': 75,
 'log_count/INFO': 11,
 'memusage/max': 58769408,
 'memusage/startup': 58769408,
 'request_depth_max': 1,
 'response_received_count': 40,
 'robotstxt/request_count': 1,
 'robotstxt/response_count': 1,
 'robotstxt/response_status_count/200': 1,
 'scheduler/dequeued': 39,
 'scheduler/dequeued/memory': 39,
 'scheduler/enqueued': 225,
 'scheduler/enqueued/memory': 225,
 'start_time': datetime.datetime(2022, 8, 15, 23, 33, 34, 981993)}
2022-08-15 16:33:39 [scrapy.core.engine] INFO: Spider closed (closespider_pagecount)

CodePudding user response:

You can try changing your user agent and turning off ROBOTSTXT_OBEY and slowing down the crawl. If it is the webserver cutting you off these things might help mitigate that.

settings.py

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 10

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
  • Related