scrapy stops scraping elements that are addressed


Here are my spider code and the log I got. The problem is that the spider seems to stop scraping items somewhere in the middle of page 10, even though there are 352 pages to be scraped. When I check the XPath expressions of the remaining elements in my browser, they look the same as the ones that were scraped.

Here is my spider:

# -*- coding: utf-8 -*-
import scrapy
import logging



class CriptolernSpider(scrapy.Spider):
    name = 'criptolern'
    allowed_domains = ['arzdigital.com']

    def start_requests(self):
            yield scrapy.Request(url='https://arzdigital.com',
                callback= self.parse,dont_filter = True)
    

    def parse(self, response):
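            # Each post on the listing page is an <a class='arz-last-post arz-row'> card; all fields below are read relative to it.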
            posts=response.xpath("//a[@class='arz-last-post arz-row']")
            
            try:

                for post in posts:
                    post_title=post.xpath(".//@title").get()
                    post_link=post.xpath(".//@href").get()
                    post_date=post.xpath(".//div[@class='arz-col-12 arz-col-md arz-last-post__link-box']/div/div[@class='arz-last-post__info']/div[@class='arz-last-post__publish-time']/time/@datetime").get()

                    if post.xpath(".//div[@class='arz-col-12 arz-col-md arz-last-post__link-box']/div/div[@class='arz-last-post__info']/div[@class='arz-post__info-likes']/span[2]/text()"):
                        likes=int(post.xpath(".//div[@class='arz-col-12 arz-col-md arz-last-post__link-box']/div/div[@class='arz-last-post__info']/div[@class='arz-post__info-likes']/span[2]/text()").get())
                    else:
                        likes=0
                    if post.xpath(".//div[@class='arz-col-12 arz-col-md arz-last-post__link-box']/div/div[@class='arz-last-post__info']/div[@class='arz-post__info-comment']/span[2]/text()"):
                        commnents=int(post.xpath(".//div[@class='arz-col-12 arz-col-md arz-last-post__link-box']/div/div[@class='arz-last-post__info']/div[@class='arz-post__info-comment']/span[2]/text()").get())
                    else:
                        commnents=0
                
                    yield{
                        'post_title':post_title,
                        'post_link':post_link,
                        'post_date':post_date,
                        'likes':likes,
                        'commnents':commnents
                    }

                next_page=response.xpath("//div[@class='arz-last-posts__get-more']/a[@class='arz-btn arz-btn-info arz-round arz-link-nofollow']/@href").get()
                if next_page:
                    yield scrapy.Request(url=next_page, callback=self.parse,dont_filter = True)

                else:

                    next_pages= response.xpath("//div[@class='arz-pagination']/ul/li[@class='arz-pagination__item arz-pagination__next']/a[@class='arz-pagination__link']/@href").get()
                    if next_pages:
                        yield scrapy.Request(url=next_pages, callback=self.parse, dont_filter = True)

            except AttributeError:
                logging.error("The element didn't exist")

Here is the log from the point where the spider stops:

2021-12-04 11:06:51 [scrapy.core.scraper] DEBUG: Scraped from <200 https://arzdigital.com/latest-posts/page/10/>
{'post_title': 'ولادیمیر پوتین: ارزهای دیجیتال در نوع خود ارزشمند هستند', 'post_link': 'https://arzdigital.com/russias-putin-says-crypto-has-value-but-maybe-not-for-trading-oil-html/', 'post_date': '2021-10-16', 'likes': 17, 'commnents': 1}
2021-12-04 11:06:51 [scrapy.core.scraper] ERROR: Spider error processing <GET https://arzdigital.com/latest-posts/page/10/> (referer: https://arzdigital.com/latest-posts/page/9/)
Traceback (most recent call last):
  File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\scrapy\utils\defer.py", line 102, in iter_errback
    yield next(it)
  File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
    for x in result:
  File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 339, in <genexpr>
    return (_set_referer(r) for r in result or ())
  File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "C:\Users\shima\projects\arzdigital\arzdigital\spiders\criptolern.py", line 32, in parse
    likes=int(post.xpath(".//div[@class='arz-col-12 arz-col-md arz-last-post__link-box']/div/div[@class='arz-last-post__info']/div[@class='arz-post__info-likes']/span[2]/text()").get())
ValueError: invalid literal for int() with base 10: '۱,۸۵۱'
2021-12-04 11:06:51 [scrapy.core.engine] INFO: Closing spider (finished)
2021-12-04 11:06:51 [scrapy.extensions.feedexport] INFO: Stored csv feed (242 items) in: dataset.csv
2021-12-04 11:06:51 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 4112,
 'downloader/request_count': 12,
 'downloader/request_method_count/GET': 12,
 'downloader/response_bytes': 292561,
 'downloader/response_count': 12,
 'downloader/response_status_count/200': 12,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2021, 12, 4, 7, 36, 51, 830291),
 'item_scraped_count': 242,
 'log_count/DEBUG': 254,
 'log_count/ERROR': 1,
 'log_count/INFO': 10,
 'request_depth_max': 10,
 'response_received_count': 12,
 'robotstxt/request_count': 1,
 'robotstxt/response_count': 1,
 'robotstxt/response_status_count/200': 1,
 'scheduler/dequeued': 11,
 'scheduler/dequeued/memory': 11,
 'scheduler/enqueued': 11,
 'scheduler/enqueued/memory': 11,
 'spider_exceptions/ValueError': 1,
 'start_time': datetime.datetime(2021, 12, 4, 7, 36, 47, 423017)}
2021-12-04 11:06:51 [scrapy.core.engine] INFO: Spider closed (finished)

I can't find the problem, or tell whether it is related to a wrong XPath expression. Thanks for any help!

EDIT: I guess it's better to show two more files here. The first is settings.py:

BOT_NAME = 'arzdigital'

SPIDER_MODULES = ['arzdigital.spiders']
NEWSPIDER_MODULE = 'arzdigital.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 10
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'arzdigital.middlewares.ArzdigitalDownloaderMiddleware': None,
    'arzdigital.middlewares.UserAgentRotatorMiddleware':400
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 60
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 120
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = False
FEED_EXPORT_ENCODING='utf-8'

And the second file is middlewares.py:

from scrapy import signals
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
import random, logging


class UserAgentRotatorMiddleware(UserAgentMiddleware):
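    # A small pool of desktop User-Agent strings; process_request() picks one at random for every outgoing request.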
    user_agent_list=[
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1; rv:7.0.1) Gecko/20100101 Firefox/7.0.1',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393'
    ]
    def __init__(self, user_agent=''):
        self.user_agent= user_agent
    def process_request(self, request, spider):
        try:
            self.user_agent= random.choice(self.user_agent_list)
            request.headers.setdefault('User-Agent', self.user_agent)
        except IndexError:
            logging.error("Couldn't fetch the user agent")

CodePudding user response:

Your code is working fine and does what you expect; the problem was in the pagination portion. I've moved the pagination into start_urls, which is always accurate and more than twice as fast as following a next-page link.

Code

import scrapy
import logging
#base url=https://arzdigital.com/latest-posts/
#start_url =https://arzdigital.com/latest-posts/page/2/

class CriptolernSpider(scrapy.Spider):
    name = 'criptolern'
    allowed_domains = ['arzdigital.com']
    start_urls = [f'https://arzdigital.com/latest-posts/page/{i}/' for i in range(1, 353)]  # one request per listing page, 1..352

    def parse(self, response):
        posts = response.xpath("//a[@class='arz-last-post arz-row']")

        try:

            for post in posts:
                post_title = post.xpath(".//@title").get()
                post_link = post.xpath(".//@href").get()
                post_date = post.xpath(
                    ".//div[@class='arz-col-12 arz-col-md arz-last-post__link-box']/div/div[@class='arz-last-post__info']/div[@class='arz-last-post__publish-time']/time/@datetime").get()

                if post.xpath(".//div[@class='arz-col-12 arz-col-md arz-last-post__link-box']/div/div[@class='arz-last-post__info']/div[@class='arz-post__info-likes']/span[2]/text()"):
                    likes = int(post.xpath(
                        ".//div[@class='arz-col-12 arz-col-md arz-last-post__link-box']/div/div[@class='arz-last-post__info']/div[@class='arz-post__info-likes']/span[2]/text()").get())
                else:
                    likes = 0
                if post.xpath(".//div[@class='arz-col-12 arz-col-md arz-last-post__link-box']/div/div[@class='arz-last-post__info']/div[@class='arz-post__info-comment']/span[2]/text()"):
                    commnents = int(post.xpath(
                        ".//div[@class='arz-col-12 arz-col-md arz-last-post__link-box']/div/div[@class='arz-last-post__info']/div[@class='arz-post__info-comment']/span[2]/text()").get())
                else:
                    commnents = 0

                yield{
                    'post_title': post_title,
                    'post_link': post_link,
                    'post_date': post_date,
                    'likes': likes,
                    'commnents': commnents
                }

            
        except AttributeError:
            logging.error("The element didn't exist")

Output:

2021-12-04 17:25:19 [scrapy.core.scraper] DEBUG: Scraped from <200 https://arzdigital.com/latest-posts/page/352/>
{'post_title': 'تأکید مقام رسمی سابق وزارت دفاع آمریکا مبنی بر تشویق سرمایه گذاری بر روی بلاکچین', 'post_link': 'https://arzdigital.com/blockchain-investment/', 'post_date': '2017-07-27', 'likes': 4, 'commnents': 0}
2021-12-04 17:25:19 [scrapy.core.scraper] DEBUG: Scraped from <200 https://arzdigital.com/latest-posts/page/352/>
{'post_title': 'ریسک سرمایه گذاری از طریق ICO', 'post_link': 'https://arzdigital.com/ico-risk/', 'post_date': '2017-07-27', 'likes': 9, 'commnents': 0}
2021-12-04 17:25:19 [scrapy.core.scraper] DEBUG: Scraped from <200 https://arzdigital.com/latest-posts/page/352/>
{'post_title': '\xa0ای.سی.او چیست؟', 'post_link': 'https://arzdigital.com/what-is-ico/', 'post_date': '2017-07-27', 'likes': 7, 'commnents': 7}
2021-12-04 17:25:19 [scrapy.core.scraper] DEBUG: Scraped from <200 https://arzdigital.com/latest-posts/page/352/>
{'post_title': 'چرا\xa0فراریت بیت کوین و واحدهای مشابه آن، نسبت به سایر واحدهای پولی بیش\u200cتر است؟', 'post_link': 'https://arzdigital.com/bitcoin-currency/', 'post_date': '2017-07-27', 'likes': 6, 'commnents': 0}
2021-12-04 17:25:19 [scrapy.core.scraper] DEBUG: Scraped from <200 https://arzdigital.com/latest-posts/page/352/>
{'post_title': 'اتریوم کلاسیک Ethereum Classic چیست ؟', 'post_link': 'https://arzdigital.com/what-is-ethereum-classic/', 'post_date': '2017-07-24', 'likes': 10, 'commnents': 2}
2021-12-04 17:25:19 [scrapy.core.engine] INFO: Closing spider (finished)
2021-12-04 17:25:19 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 111431,
 'downloader/request_count': 353,
 'downloader/request_method_count/GET': 353,
 'downloader/response_bytes': 8814416,
 'downloader/response_count': 353,
 'downloader/response_status_count/200': 352,
 'downloader/response_status_count/301': 1,
 'elapsed_time_seconds': 46.29503,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2021, 12, 4, 11, 25, 19, 124154),
 'httpcompression/response_bytes': 55545528,
 'httpcompression/response_count': 352,
 'item_scraped_count': 7920

.. so on

settings.py file

Please make sure that in your settings.py file you change only the uncommented portion, nothing else.

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 10
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'gs_spider.middlewares.GsSpiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'gs_spider.middlewares.GsSpiderDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'gs_spider.pipelines.GsSpiderPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
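
One more note on the error in your original log. The ValueError (invalid literal for int() with base 10: '۱,۸۵۱') appears when a likes or comments counter is rendered with Persian digits and a thousands separator, so the string can't be converted with a bare int() call. If it shows up again, a small helper along these lines can normalize the text before converting it; the to_en_int name is just an illustration, not something from Scrapy or the site:

def to_en_int(raw):
    # Translate Persian and Arabic-Indic digits to ASCII, drop thousands
    # separators ('٬', '،', ','), then convert, e.g. '۱,۸۵۱' -> 1851.
    digits = str.maketrans('۰۱۲۳۴۵۶۷۸۹٠١٢٣٤٥٦٧٨٩', '01234567890123456789')
    cleaned = (raw or '').strip().translate(digits)
    for sep in ('٬', '،', ','):
        cleaned = cleaned.replace(sep, '')
    return int(cleaned) if cleaned else 0

In the spider it would replace the bare int() calls, for example likes = to_en_int(post.xpath("...").get()).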