Home > Enterprise >  Xpath not found using scrapy
Xpath not found using scrapy

Time:02-22

I want to extract the email and phone number, but I could not find the XPath for them. I was only able to retrieve the XPath of the website. This is the link to the page from which I extracted the data: enter image description here

from scrapy import Spider
from scrapy.http import Request


class AuthorSpider(Spider):
    """Crawl the listing page in ``start_urls`` and emit one item per detail page.

    ``parse`` follows every link found inside the title cells of the listing
    table; ``parse_book`` then reads the external-website link from each
    detail page it is handed.
    """

    name = 'pushpa'
    start_urls = ['https://www.fiduciairesuisse-vd.ch/fr/adhesion/trouver-un-membre-partenaire-de-confiance?state=All&section=461&class=All&lang=All']

    # Per-spider overrides: one concurrent request per domain, a download
    # delay of 1, and a desktop-browser User-Agent string.
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        """Yield a ``Request`` for every href found under a title cell.

        Each href is resolved against the response URL before being
        requested, and the resulting page is routed to ``parse_book``.
        """
        title_hrefs = response.xpath("//td[@class='views-field views-field-title']//@href")
        for href in title_hrefs.extract():
            yield Request(response.urljoin(href), callback=self.parse_book)

    def parse_book(self, response):
        """Yield a single dict holding the detail page's external website link.

        Only the first matching href is taken via ``.get()``.
        """
        external_hrefs = response.xpath("//a[@class='field__item link link--external']//@href")
        item = {'website': external_hrefs.get()}
        yield item

CodePudding user response:

Now it's working.

from scrapy import Spider
from scrapy.http import Request


class AuthorSpider(Spider):
    """Crawl the listing page in ``start_urls`` and emit one item per detail page.

    ``parse`` follows the links found in the listing's title cells and
    ``parse_book`` yields a dict with ``website``, ``phone`` and ``email``
    keys for each detail page.
    """
    name = 'pushpa'
    start_urls = ['https://www.fiduciairesuisse-vd.ch/fr/adhesion/trouver-un-membre-partenaire-de-confiance?state=All&section=461&class=All&lang=All']
    # Per-spider setting overrides: one concurrent request per domain, a
    # download delay of 1, and a desktop-browser User-Agent string.
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }



    def parse(self, response):
        """Yield a ``Request`` to ``parse_book`` for every href under a title cell."""
        # NOTE: the "book" naming is historical; these are the hrefs found
        # inside the title cells of the listing table.
        books = response.xpath("//td[@class='views-field views-field-title']//@href").extract()
        for book in books:
            # Resolve the extracted href against the current response URL.
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        """Yield one dict with the ``website``, ``phone`` and ``email`` of a detail page."""
        # First href of the external-link anchor on the page.
        link = response.xpath("//a[@class='field__item link link--external']//@href").get()
        yield{
            'website':link,
            # NOTE(review): the predicate `[@]` below has no attribute name and
            # does not look like valid XPath; the original attribute test
            # (e.g. an @class comparison) appears to have been lost when this
            # snippet was copied. Restore it before running -- TODO confirm
            # against the page markup.
            'phone':response.xpath('normalize-space(//*[@]//text()[2])').get(),
            # NOTE(review): same truncated `[@]` predicate as the phone
            # selector; presumably it targeted the same container, with the
            # email in its first child div -- TODO confirm.
            'email':response.xpath('normalize-space(//*[@]/div[1]//text()[2])').get()
            }
  • Related