I am running my spider with the same structure as my other ones, but for this specific website and this specific spider, it closes right after the very first request to the start URL. What could the problem be?
Terminal Output:
...
2022-04-03 17:42:34 [scrapy.core.engine] INFO: Spider opened
2022-04-03 17:42:34 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2022-04-03 17:42:34 [spiderTopo] INFO: Spider opened: spiderTopo
2022-04-03 17:42:34 [spiderTopo] INFO: Spider opened: spiderTopo
2022-04-03 17:42:34 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2022-04-03 17:42:34 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.topocentras.lt/> (referer: None)
2022-04-03 17:42:34 [scrapy.core.engine] INFO: Closing spider (finished)
2022-04-03 17:42:34 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 299,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 43691,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'elapsed_time_seconds': 0.293075,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2022, 4, 3, 14, 42, 34, 535151),
'httpcompression/response_bytes': 267627,
'httpcompression/response_count': 1,
'log_count/DEBUG': 2,
'log_count/INFO': 12,
'memusage/max': 60579840,
'memusage/startup': 60579840,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2022, 4, 3, 14, 42, 34, 242076)}
2022-04-03 17:42:34 [scrapy.core.engine] INFO: Spider closed (finished)
Spider code:
import scrapy
from bs4 import BeautifulSoup
import re

from pbl.items import PblSpider

base_url = 'https://www.topocentras.lt'

class PblItem(scrapy.Spider):
    name = 'spiderTopo'
    allowed_domains = ['topocentras.lt']
    start_urls = ['https://www.topocentras.lt/']

    def __init__(self):
        self.declare_xpath()

    def declare_xpath(self):
        self.getAllCategoriesXpath = '/html/body/div[1]/header[1]/nav/ul/li[1]/div/ul[1]/li/a/@href'
        self.getAllSubCategoriesXpath = '//*[@id="root"]/main/div/aside/div/ul/li/a/@href'
        self.getAllItemsXpath = '/html/body/div[1]/main/div/section/div[4]/div/article/div[1]/a/@href'
        self.TitleXpath = '/html/body/div[2]/main/div[1]/div[2]/div/article/h1/text()'
        self.ImageXpath = '/html/body/div[2]/main/div[1]/div[2]/div/article/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/img/@src'
        self.PriceXpath = '/html/body/div[2]/main/div[1]/div[2]/div/article/div[2]/div[3]/div[1]/div[2]/div/span/text()'

    def parse(self, response):
        for href in response.xpath(self.getAllCategoriesXpath):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_category, dont_filter=True)

    def parse_category(self, response):
        for href in response.xpath(self.getAllSubCategoriesXpath):
            url = response.urljoin(href.extract())
            print(response.body)
            yield scrapy.Request(url, callback=self.parse_subcategory, dont_filter=True)

    def parse_subcategory(self, response):
        for href in response.xpath(self.getAllItemsXpath):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_main_item, dont_filter=True)

        # next_page = response.xpath('/html/body/main/section/div[1]/div/div[2]/div[1]/div/div[2]/div[3]/ul/li/a[@rel="next"]/@href').extract_first()
        # if next_page is not None:
        #     url = response.urljoin(next_page)
        #     yield scrapy.Request(url, callback=self.parse_category, dont_filter=True)

    def parse_main_item(self, response):
        item = PblSpider()

        Title = response.xpath(self.TitleXpath).extract()
        Title = self.cleanText(self.parseText(self.listToStr(Title)))

        Link = response.url
        Image = response.xpath(self.ImageXpath).extract_first()

        Price = response.xpath(self.PriceXpath).extract()
        Price = self.cleanText(self.parseText(self.listToStr(Price)))

        sub_price = response.xpath(self.SubPriceXpath).extract()
        sub_price = self.cleanText(self.parseText(self.listToStr(sub_price)))

        # Put each element into its item attribute.
        item['Title'] = Title
        # item['Category'] = Category
        item['Price'] = Price
        # item['Features'] = Features
        item['Image'] = Image
        item['Link'] = Link
        return item
I have tried changing the User-Agent in the settings.py file, since that was the first problem I hit when using scrapy shell: the selectors returned empty lists. I have also tried specifying the User-Agent on the command line when running the spider, and I added the dont_filter=True option to my requests.
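For reference, the settings.py override was along these lines (the exact UA string here is only illustrative):

# settings.py -- User-Agent override; the UA string itself is just an example.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0 Safari/537.36'
)

and the command-line equivalent:

scrapy crawl spiderTopo -s USER_AGENT='Mozilla/5.0 ...'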
CodePudding user response:
I think there is a problem with the XPath for your categories (getAllCategoriesXpath). I suggest simplifying it; for example, if you want to scrape all the category links, I would match them by their class rather than by absolute position:

self.getAllCategoriesXpath = '//a[@class="..."]/@href'
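A minimal sketch of that suggestion, applied to the spider's parse method. The class name "CategoriesMenu-link" is an assumption, not taken from the site; open scrapy shell https://www.topocentras.lt/ and substitute the real class of the category anchors:

def parse(self, response):
    # Relative, class-based XPath instead of a brittle absolute path.
    # "CategoriesMenu-link" is a hypothetical class name; replace it with
    # the one you see in the live markup.
    for href in response.xpath('//a[contains(@class, "CategoriesMenu-link")]/@href'):
        url = response.urljoin(href.get())
        yield scrapy.Request(url, callback=self.parse_category, dont_filter=True)

This also explains the log you posted: if the XPath matches nothing, parse yields no new requests, so after the single request to the start URL the scheduler is empty and Scrapy closes the spider with finish_reason 'finished'.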