Here are my spider code and the log I got. The problem is that the spider seems to stop scraping partway through page 10, even though there are 352 pages to be scraped. When I check the XPath expressions for the remaining elements in my browser, they look the same.
Here is my spider:
# -*- coding: utf-8 -*-
import scrapy
import logging
class CriptolernSpider(scrapy.Spider):
    name = 'criptolern'
    allowed_domains = ['arzdigital.com']

    def start_requests(self):
        yield scrapy.Request(url='https://arzdigital.com',
                             callback=self.parse, dont_filter=True)

    def parse(self, response):
        posts = response.xpath("//a[@class='arz-last-post arz-row']")
        try:
            for post in posts:
                post_title = post.xpath(".//@title").get()
                post_link = post.xpath(".//@href").get()
                post_date = post.xpath(".//div[@class='arz-col-12 arz-col-md arz-last-post__link-box']/div/div[@class='arz-last-post__info']/div[@class='arz-last-post__publish-time']/time/@datetime").get()
                if post.xpath(".//div[@class='arz-col-12 arz-col-md arz-last-post__link-box']/div/div[@class='arz-last-post__info']/div[@class='arz-post__info-likes']/span[2]/text()"):
                    likes = int(post.xpath(".//div[@class='arz-col-12 arz-col-md arz-last-post__link-box']/div/div[@class='arz-last-post__info']/div[@class='arz-post__info-likes']/span[2]/text()").get())
                else:
                    likes = 0
                if post.xpath(".//div[@class='arz-col-12 arz-col-md arz-last-post__link-box']/div/div[@class='arz-last-post__info']/div[@class='arz-post__info-comment']/span[2]/text()"):
                    commnents = int(post.xpath(".//div[@class='arz-col-12 arz-col-md arz-last-post__link-box']/div/div[@class='arz-last-post__info']/div[@class='arz-post__info-comment']/span[2]/text()").get())
                else:
                    commnents = 0
                yield {
                    'post_title': post_title,
                    'post_link': post_link,
                    'post_date': post_date,
                    'likes': likes,
                    'commnents': commnents
                }
            next_page = response.xpath("//div[@class='arz-last-posts__get-more']/a[@class='arz-btn arz-btn-info arz-round arz-link-nofollow']/@href").get()
            if next_page:
                yield scrapy.Request(url=next_page, callback=self.parse, dont_filter=True)
            else:
                next_pages = response.xpath("//div[@class='arz-pagination']/ul/li[@class='arz-pagination__item arz-pagination__next']/a[@class='arz-pagination__link']/@href").get()
                if next_pages:
                    yield scrapy.Request(url=next_pages, callback=self.parse, dont_filter=True)
        except AttributeError:
            logging.error("The element didn't exist")
Here is the log from the point where the spider stops:
2021-12-04 11:06:51 [scrapy.core.scraper] DEBUG: Scraped from <200 https://arzdigital.com/latest-posts/page/10/>
{'post_title': 'ولادیمیر پوتین: ارزهای دیجیتال در نوع خود ارزشمند هستند', 'post_link': 'https://arzdigital.com/russias-putin-says-crypto-has-value-but-maybe-not-for-trading-oil-html/', 'post_date': '2021-10-16', 'likes': 17, 'commnents': 1}
2021-12-04 11:06:51 [scrapy.core.scraper] ERROR: Spider error processing <GET https://arzdigital.com/latest-posts/page/10/> (referer: https://arzdigital.com/latest-posts/page/9/)
Traceback (most recent call last):
File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\scrapy\utils\defer.py", line 102, in iter_errback
yield next(it)
File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 339, in <genexpr>
return (_set_referer(r) for r in result or ())
File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\shima\projects\arzdigital\arzdigital\spiders\criptolern.py", line 32, in parse
likes=int(post.xpath(".//div[@class='arz-col-12 arz-col-md arz-last-post__link-box']/div/div[@class='arz-last-post__info']/div[@class='arz-post__info-likes']/span[2]/text()").get())
ValueError: invalid literal for int() with base 10: '۱,۸۵۱'
2021-12-04 11:06:51 [scrapy.core.engine] INFO: Closing spider (finished)
2021-12-04 11:06:51 [scrapy.extensions.feedexport] INFO: Stored csv feed (242 items) in: dataset.csv
2021-12-04 11:06:51 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 4112,
'downloader/request_count': 12,
'downloader/request_method_count/GET': 12,
'downloader/response_bytes': 292561,
'downloader/response_count': 12,
'downloader/response_status_count/200': 12,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2021, 12, 4, 7, 36, 51, 830291),
'item_scraped_count': 242,
'log_count/DEBUG': 254,
'log_count/ERROR': 1,
'log_count/INFO': 10,
'request_depth_max': 10,
'response_received_count': 12,
'robotstxt/request_count': 1,
'robotstxt/response_count': 1,
'robotstxt/response_status_count/200': 1,
'scheduler/dequeued': 11,
'scheduler/dequeued/memory': 11,
'scheduler/enqueued': 11,
'scheduler/enqueued/memory': 11,
'spider_exceptions/ValueError': 1,
'start_time': datetime.datetime(2021, 12, 4, 7, 36, 47, 423017)}
2021-12-04 11:06:51 [scrapy.core.engine] INFO: Spider closed (finished)
I can't find the problem, or tell whether it is related to a wrong XPath expression. Thanks for any help!
EDIT: I guess it's better to show two more files here. The first is settings.py:
BOT_NAME = 'arzdigital'
SPIDER_MODULES = ['arzdigital.spiders']
NEWSPIDER_MODULE = 'arzdigital.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 10
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'arzdigital.middlewares.ArzdigitalDownloaderMiddleware': None,
    'arzdigital.middlewares.UserAgentRotatorMiddleware': 400
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 60
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 120
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = False
FEED_EXPORT_ENCODING='utf-8'
And the second file is middlewares.py:
from scrapy import signals
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
import random, logging

class UserAgentRotatorMiddleware(UserAgentMiddleware):
    user_agent_list = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1; rv:7.0.1) Gecko/20100101 Firefox/7.0.1',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393'
    ]

    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        try:
            self.user_agent = random.choice(self.user_agent_list)
            request.headers.setdefault('User-Agent', self.user_agent)
        except IndexError:
            logging.error("Couldn't fetch the user agent")
CodePudding user response:
Your code works as you expect; the problem was in the pagination portion. I've built the pagination into start_urls instead, a style of pagination that is always accurate and more than twice as fast as following a "next page" link.
Code
import scrapy
import logging
# base url: https://arzdigital.com/latest-posts/
# start url: https://arzdigital.com/latest-posts/page/2/

class CriptolernSpider(scrapy.Spider):
    name = 'criptolern'
    allowed_domains = ['arzdigital.com']
    start_urls = [f'https://arzdigital.com/latest-posts/page/{i}/' for i in range(1, 353)]

    def parse(self, response):
        posts = response.xpath("//a[@class='arz-last-post arz-row']")
        try:
            for post in posts:
                post_title = post.xpath(".//@title").get()
                post_link = post.xpath(".//@href").get()
                post_date = post.xpath(
                    ".//div[@class='arz-col-12 arz-col-md arz-last-post__link-box']/div/div[@class='arz-last-post__info']/div[@class='arz-last-post__publish-time']/time/@datetime").get()
                if post.xpath(".//div[@class='arz-col-12 arz-col-md arz-last-post__link-box']/div/div[@class='arz-last-post__info']/div[@class='arz-post__info-likes']/span[2]/text()"):
                    likes = int(post.xpath(
                        ".//div[@class='arz-col-12 arz-col-md arz-last-post__link-box']/div/div[@class='arz-last-post__info']/div[@class='arz-post__info-likes']/span[2]/text()").get())
                else:
                    likes = 0
                if post.xpath(".//div[@class='arz-col-12 arz-col-md arz-last-post__link-box']/div/div[@class='arz-last-post__info']/div[@class='arz-post__info-comment']/span[2]/text()"):
                    commnents = int(post.xpath(
                        ".//div[@class='arz-col-12 arz-col-md arz-last-post__link-box']/div/div[@class='arz-last-post__info']/div[@class='arz-post__info-comment']/span[2]/text()").get())
                else:
                    commnents = 0
                yield {
                    'post_title': post_title,
                    'post_link': post_link,
                    'post_date': post_date,
                    'likes': likes,
                    'commnents': commnents
                }
        except AttributeError:
            logging.error("The element didn't exist")
Output:
2021-12-04 17:25:19 [scrapy.core.scraper] DEBUG: Scraped from <200 https://arzdigital.com/latest-posts/page/352/>
{'post_title': 'تأکید مقام رسمی سابق وزارت دفاع آمریکا مبنی بر تشویق سرمایه گذاری بر روی بلاکچین', 'post_link': 'https://arzdigital.com/blockchain-investment/', 'post_date': '2017-07-27', 'likes': 4, 'commnents': 0}
2021-12-04 17:25:19 [scrapy.core.scraper] DEBUG: Scraped from <200 https://arzdigital.com/latest-posts/page/352/>
{'post_title': 'ریسک سرمایه گذاری از طریق ICO', 'post_link': 'https://arzdigital.com/ico-risk/', 'post_date': '2017-07-27', 'likes': 9, 'commnents': 0}
2021-12-04 17:25:19 [scrapy.core.scraper] DEBUG: Scraped from <200 https://arzdigital.com/latest-posts/page/352/>
{'post_title': '\xa0ای.سی.او چیست؟', 'post_link': 'https://arzdigital.com/what-is-ico/', 'post_date': '2017-07-27', 'likes': 7, 'commnents': 7}
2021-12-04 17:25:19 [scrapy.core.scraper] DEBUG: Scraped from <200 https://arzdigital.com/latest-posts/page/352/>
{'post_title': 'چرا\xa0فراریت بیت کوین و واحدهای مشابه آن، نسبت به سایر واحدهای پولی بیش\u200cتر است؟', 'post_link': 'https://arzdigital.com/bitcoin-currency/', 'post_date': '2017-07-27', 'likes': 6, 'commnents': 0}
2021-12-04 17:25:19 [scrapy.core.scraper] DEBUG: Scraped from <200 https://arzdigital.com/latest-posts/page/352/>
{'post_title': 'اتریوم کلاسیک Ethereum Classic چیست ؟', 'post_link': 'https://arzdigital.com/what-is-ethereum-classic/', 'post_date': '2017-07-24', 'likes': 10, 'commnents': 2}
2021-12-04 17:25:19 [scrapy.core.engine] INFO: Closing spider (finished)
2021-12-04 17:25:19 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 111431,
'downloader/request_count': 353,
'downloader/request_method_count/GET': 353,
'downloader/response_bytes': 8814416,
'downloader/response_count': 353,
'downloader/response_status_count/200': 352,
'downloader/response_status_count/301': 1,
'elapsed_time_seconds': 46.29503,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2021, 12, 4, 11, 25, 19, 124154),
'httpcompression/response_bytes': 55545528,
'httpcompression/response_count': 352,
'item_scraped_count': 7920
.. so on
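One caveat worth noting: the original traceback failed because int() was called on a likes count rendered with Persian digits and a thousands separator ('۱,۸۵۱'). The pagination change by itself does not alter the int() call, so if such a value appears again the same ValueError would recur. Below is a minimal sketch of guarding the conversion; the to_int helper and PERSIAN_DIGITS table are hypothetical names, not part of the code above:

PERSIAN_DIGITS = str.maketrans('۰۱۲۳۴۵۶۷۸۹', '0123456789')

def to_int(text, default=0):
    # Hypothetical helper: map Persian digits to ASCII and strip both the
    # ASCII comma and the Arabic thousands separator before calling int().
    if not text:
        return default
    text = text.translate(PERSIAN_DIGITS).replace(',', '').replace('٬', '').strip()
    try:
        return int(text)
    except ValueError:
        return default

Inside parse() it would replace the bare int(...) calls, e.g. likes = to_int(post.xpath(...).get()).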
settings.py file
Please make sure that in the settings.py file you change only the uncommented portion, nothing else:
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 10
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'gs_spider.middlewares.GsSpiderSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'gs_spider.middlewares.GsSpiderDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'gs_spider.pipelines.GsSpiderPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False