This is my output: I want to remove these from my output '4.9 out of 5 stars', '1,795 ratings',
'4.9 out of 5 stars', '1,795 ratings', '#3,626 in Kitchen & Dining (', 'See Top 100 in Kitchen & Dining', ')', '#18 in', 'Measuring Spoons'
This is my code:
from scrapy import Spider
from scrapy.http import Request
class AuthorSpider(Spider):
name = 'pushpa'
start_urls = ['https://www.amazon.com/s?k=measuring tools & scales&crid=10FYGF4D5KRO0&sprefix=measuring tools and scal,aps,363&ref=nb_sb_ss_ts-doa-p_4_24']
custom_settings = {
'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
'DOWNLOAD_DELAY': 1,
'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
}
def parse(self, response):
books = response.xpath("//div//h2//@href").extract()
for book in books:
url = response.urljoin(book)
yield Request(url, callback=self.parse_book)
def parse_book(self, response):
coordinate = response.xpath("//table[@id='productDetails_detailBullets_sections1']//td//span//text()")[2:].extract()
coordinate = [i.strip() for i in coordinate]
# remove empty strings:s
coordinate = [i for i in coordinate if i]
yield{
'Best_sellerrank':coordinate
}
CodePudding user response:
Your xpath is correct and you can get your desired output just using list slicing and split method.
from scrapy import Spider
from scrapy.http import Request
class AuthorSpider(Spider):
name = 'pushpa'
start_urls = ['https://www.amazon.com/s?k=measuring tools & scales&crid=10FYGF4D5KRO0&sprefix=measuring tools and scal,aps,363&ref=nb_sb_ss_ts-doa-p_4_24']
custom_settings = {
'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
'DOWNLOAD_DELAY': 1,
'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'
}
def parse(self, response):
books = response.xpath("//div//h2//@href").extract()
for book in books:
url = response.urljoin(book)
yield Request(url, callback=self.parse_book)
def parse_book(self, response):
coordinate = response.xpath("//table[@id='productDetails_detailBullets_sections1']//td//span//text()")[2:].extract()
coordinate = [i.strip() for i in coordinate]
# remove empty strings:s
coordinate = [i for i in coordinate if i]
coordinate = coordinate[:2]
coordinate = ', '.join([i for i in coordinate])
yield{
'Best_sellerrank':coordinate
}
Output:
{'Best_sellerrank': '4.7 out of 5 stars, 43,078 ratings'}
... so on