Need some help here. My code is working when I am crawling one category page via (scrapy.Spider). However when I try to scrape the next page of same category, it does not seems to goto next page and don't scrape at all.
Here is code
import scrapy
from scrapy import item
from scrapy.http import HtmlResponse,Response
import requests
from bs4 import BeautifulSoup
from scrapy.selector.unified import _response_from_text
from ..items import ScrapybooksspiderItem
class ScrapSpider(scrapy.Spider):
name = 'scrapp'
allowed_domains = ['books.toscrape.com']
start_urls = ['http://books.toscrape.com/']
def parse(self, response):
categ=response.xpath('//div[@class="side_categories"]/ul[@class="nav nav-list"]/li/ul/li')
# for category in categ:
Category_Name=categ.xpath('.//a[contains(text(),"Historical Fiction")]/text()').get().replace('\n',"").strip()
Kategorylink=categ.xpath('.//a[contains(text(),"Historical Fiction")]/@href').get().replace('\n',"").strip()
yield response.follow(Kategorylink,callback=self.info_parse,cb_kwargs={'category_name':Category_Name,'Category_link':Kategorylink})
def info_parse(self,response,category_name,Category_link):
bookurl=response.xpath('//section/div/ol/li/article[@class="product_pod"]/h3/a/@href')
for books in bookurl:
BOOK=books.get()
Boooks_info_url=response.urljoin(BOOK)
respons = Boooks_info_url
ponse = HtmlResponse(url=respons, body=requests.get(respons).text, encoding='utf-8')
bookprize=ponse.xpath('//*/p[@class="price_color"]/text()').get()
yield {
'Category_Name':category_name,
'Category_link':Category_link,
'Bookurl':Boooks_info_url,
'Bookprize':bookprize
}
next_page=response.xpath('//*[@class="next"]/a[contains(text(),"next")]/@href').get()
if next_page:
word=Category_link
listst=word.split('/')
length=len(listst)
final_length=length-1
lam=listst[-1]
dellast=listst.pop()
listst.insert(final_length,next_page)
del listst[:-1]
newst="/".join(listst)
final_url=newst
print('\n',final_url,'\n')
yield response.follow(url=final_url,callback=self.info_parse)
here is command prompt output
{'Category_Name': 'Historical Fiction', 'Category_link': 'catalogue/category/books/historical-fiction_4/index.html', 'Bookurl': 'http://books.toscrape.com/catalogue/the-guernsey-literary-and-potato-peel-pie-society_253/index.html', 'Bookprize': '£49.53'}
2021-09-29 04:30:25 [urllib3.connectionpool] DEBUG: Starting new HTTP connection (1): books.toscrape.com:80
2021-09-29 04:30:26 [urllib3.connectionpool] DEBUG: http://books.toscrape.com:80 "GET /catalogue/girl-in-the-blue-coat_160/index.html HTTP/1.1" 200 None
2021-09-29 04:30:26 [scrapy.core.scraper] DEBUG: Scraped from <200 http://books.toscrape.com/catalogue/category/books/historical-fiction_4/index.html>
{'Category_Name': 'Historical Fiction', 'Category_link': 'catalogue/category/books/historical-fiction_4/index.html', 'Bookurl': 'http://books.toscrape.com/catalogue/girl-in-the-blue-coat_160/index.html', 'Bookprize': '£46.83'}
page-2.html
2021-09-29 04:30:26 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/category/books/historical-fiction_4/page-2.html> (referer: http://books.toscrape.com/catalogue/category/books/historical-fiction_4/index.html)
2021-09-29 04:30:26 [scrapy.core.scraper] ERROR: Spider error processing <GET http://books.toscrape.com/catalogue/category/books/historical-fiction_4/page-2.html> (referer: http://books.toscrape.com/catalogue/category/books/historical-fiction_4/index.html)
Traceback (most recent call last):
File "C:\Users\Abu Bakar Siddique\AppData\Local\Programs\Python\Python39\lib\site-packages\twisted\internet\defer.py", line 858, in _runCallbacks
current.result = callback( # type: ignore[misc]
TypeError: info_parse() missing 2 required positional arguments: 'category_name' and 'Category_link'
2021-09-29 04:30:26 [scrapy.core.engine] INFO: Closing spider (finished)
Thanking in advance for awesome support.
CodePudding user response:
Look at the error you're getting. It's because your info_parse function expect arguments that you don't send.
def info_parse(self,response,category_name,Category_link):
bookurl=response.xpath('//section/div/ol/li/article[@class="product_pod"]/h3/a/@href')
for books in bookurl:
BOOK=books.get()
Boooks_info_url=response.urljoin(BOOK)
respons = Boooks_info_url
ponse = HtmlResponse(url=respons, body=requests.get(respons).text, encoding='utf-8')
bookprize=ponse.xpath('//*/p[@class="price_color"]/text()').get()
yield {
'Category_Name':category_name,
'Category_link':Category_link,
'Bookurl':Boooks_info_url,
'Bookprize':bookprize
}
next_page=response.xpath('//*[@class="next"]/a[contains(text(),"next")]/@href').get()
if next_page:
word=Category_link
listst=word.split('/')
length=len(listst)
final_length=length-1
lam=listst[-1]
dellast=listst.pop()
listst.insert(final_length,next_page)
del listst[:-1]
newst="/".join(listst)
final_url=newst
print('\n',final_url,'\n')
yield response.follow(url=final_url,callback=self.info_parse,cb_kwargs={'category_name':category_name,'Category_link':Category_link})
It should work.
EDIT: (your code with a few changes)
import scrapy
from scrapy import item
from scrapy.http import HtmlResponse,Response
import requests
from bs4 import BeautifulSoup
from scrapy.selector.unified import _response_from_text
from ..items import ScrapybooksspiderItem
class ScrapSpider(scrapy.Spider):
name = 'scrapp'
allowed_domains = ['books.toscrape.com']
start_urls = ['http://books.toscrape.com/']
def parse(self, response):
categ=response.xpath('//div[@class="side_categories"]/ul[@class="nav nav-list"]/li/ul/li')
for category in categ:
Category_Name=category.xpath('./a/text()').get().strip()
Kategorylink=category.xpath('./a/@href').get()
yield response.follow(Kategorylink,callback=self.info_parse,cb_kwargs={'category_name':Category_Name,'Category_link':Kategorylink})
def info_parse(self,response,category_name,Category_link):
bookurl=response.xpath('//section/div/ol/li/article[@class="product_pod"]/h3/a/@href')
for books in bookurl:
BOOK=books.get()
Boooks_info_url=response.urljoin(BOOK)
bookprize=response.xpath('//*/p[@class="price_color"]/text()').get()
yield {
'Category_Name':category_name,
'Category_link':Category_link,
'Bookurl':Boooks_info_url,
'Bookprize':bookprize
}
next_page=response.xpath('//*[@class="next"]/a[contains(text(),"next")]/@href').get()
if next_page:
word=Category_link
listst=word.split('/')
length=len(listst)
final_length=length-1
lam=listst[-1]
dellast=listst.pop()
listst.insert(final_length,next_page)
del listst[:-1]
newst="/".join(listst)
final_url=newst
print('\n',final_url,'\n')
yield response.follow(url=final_url,callback=self.info_parse,cb_kwargs={'category_name':category_name,'Category_link':Category_link})