Home > Blockchain >  Xpath correct it
Xpath correct it

Time:03-01

This is my output: I want to remove these from my output '4.9 out of 5 stars', '1,795 ratings',

'4.9 out of 5 stars', '1,795 ratings', '#3,626 in Kitchen & Dining (', 'See Top 100 in Kitchen & Dining', ')', '#18 in', 'Measuring Spoons'

This is my page link enter image description here

This is my code:

from scrapy import Spider
from scrapy.http import Request


class AuthorSpider(Spider):
    name = 'pushpa'
    start_urls = ['https://www.amazon.com/s?k=measuring tools & scales&crid=10FYGF4D5KRO0&sprefix=measuring tools and scal,aps,363&ref=nb_sb_ss_ts-doa-p_4_24']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }
    
    def parse(self, response):
        books = response.xpath("//div//h2//@href").extract()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)




    def parse_book(self, response):
        coordinate = response.xpath("//table[@id='productDetails_detailBullets_sections1']//td//span//text()")[2:].extract()
        coordinate = [i.strip() for i in coordinate]
        # remove empty strings:s
        coordinate = [i for i in coordinate if i]
        yield{
            'Best_sellerrank':coordinate
        }
    

CodePudding user response:

Your xpath is correct and you can get your desired output just using list slicing and split method.

from scrapy import Spider
from scrapy.http import Request


class AuthorSpider(Spider):
    name = 'pushpa'
    start_urls = ['https://www.amazon.com/s?k=measuring tools & scales&crid=10FYGF4D5KRO0&sprefix=measuring tools and scal,aps,363&ref=nb_sb_ss_ts-doa-p_4_24']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'
    }
    
    def parse(self, response):
        books = response.xpath("//div//h2//@href").extract()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)




    def parse_book(self, response):
        coordinate = response.xpath("//table[@id='productDetails_detailBullets_sections1']//td//span//text()")[2:].extract()
        coordinate = [i.strip() for i in coordinate]
        # remove empty strings:s
        coordinate = [i for i in coordinate if i]
        coordinate = coordinate[:2]

        coordinate = ', '.join([i for i in coordinate])
    

        yield{
            'Best_sellerrank':coordinate
        }

Output:

{'Best_sellerrank': '4.7 out of 5 stars,  43,078 ratings'}

... so on

  • Related