I ´m getting this error when I run my scraper :
2022-09-19 23:17:00 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.justforsport.com.ar/top-mujer-reebok-ts-ubf-seamless-rojo/p> (referer: https://www.justforsport.com.ar/mujer?page=7)
Traceback (most recent call last):
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\utils\defer.py", line 120, in iter_errback
yield next(it)
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\utils\python.py", line 353, in __next__
return next(self.data)
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\utils\python.py", line 353, in __next__
return next(self.data)
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 342, in <genexpr>
return (_set_referer(r) for r in result or ())
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 40, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "c:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\just_for_sport\just_for_sport\spiders\jfs_mujer.py", line 41, in parse_article_detail
precio0=response.css('span.vtex-product-price-1-x-currencyContainer.vtex-product-price-1-x-currencyContainer--product')[0]
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\parsel\selector.py", line 70, in __getitem__
o = super(SelectorList, self).__getitem__(pos)
IndexError: list index out of range
I try to understand what does it mean, but I can´t find the problem. The link works fine...but data is not collected...
My script looks like this:
import scrapy
from scrapy_splash import SplashRequest
from concurrent.futures import process
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os
if os.path.exists('jfs_mujer.csv'):
os.remove('jfs_mujer.csv')
print("The file has been deleted successfully")
else:
print("The file does not exist!")
class JfsSpider_mujer(scrapy.Spider):
name = 'jfs_mujer'
start_urls = ["https://www.justforsport.com.ar/mujer?page=1"]
def parse(self,response):
# total_products=int(int(response.css('div.vtex-search-result-3-x-totalProducts--layout.pv5.ph9.bn-ns.bt-s.b--muted-5.tc-s.tl.t-action--small span::text').get())/32) 2
for count in range(1, 40):
yield SplashRequest(url=f'https://www.justforsport.com.ar/mujer?page={count}',
callback=self.parse_links, meta= {'splash': {'endpoint': 'execute', 'args': {'wait': 0.5}}})
#Extrae links de cada pagina de la seccion
def parse_links(self,response):
links=response.css('a.vtex-product-summary-2-x-clearLink.vtex-product-summary-2-x-clearLink--shelf-product.h-100.flex.flex-column::attr(href)').getall()
for link in links:
yield SplashRequest(response.urljoin('https://www.justforsport.com.ar' link), self.parse_article_detail ,meta= {'splash': {'endpoint': 'execute', 'args': {'wait': 0.5}}})
def parse_article_detail(self, response):
precio0=response.css('span.vtex-product-price-1-x-currencyContainer.vtex-product-price-1-x-currencyContainer--product')[0]
yield {
'Casa':'Just_For_Sports',
'Sku' :response.css('span.vtex-product-identifier-0-x-product-identifier__value::text').get(),
'Name':response.css('span.vtex-store-components-3-x-productBrand::text').get() ,
'precio':''.join(precio0.css('span.vtex-product-price-1-x-currencyInteger.vtex-product-price-1-x-currencyInteger--product::text').getall()),
'Link':response.url,
'Date':datetime.today().strftime('%Y-%m-%d')
}
process= CrawlerProcess(
settings = {
'FEED_URI':'jfs_mujer.csv' ,
'FEED_FORMAT': 'csv',
'FEED_EXPORT_ENCODING':'utf-8',
'CONCURRENT_REQUESTS': 3,
'AUTOTHROTTLE_ENABLED': True,
'AUTOTHROTTLE_START_DELAY': 3,
'DOWNLOAD_DELAY':24,
#'AUTOTHROTTLE_MAX_DELAY' : 12,
'USER_AGENT' : 'Googlebot/2.1 ( http://www.google.com/bot.html)'
} )
process.crawl(JfsSpider_mujer)
process.start()
What's wrong wit the script? or it's something about settings? . I think it has something to do with the way I join the prices, but from 770 products, it works fine for almost 660...I don´t understand... thanks for touyr help!
CodePudding user response:
Your error message means that your CSS selector doesn't find anything. You can try above XPath to get the price:
price = response.xpath('//meta[@property="product:price:amount"]/@content').get()