I want to get email
but they will provide wrong output these is page link https://zoekeenadvocaat.advocatenorde.nl/advocaten/soesterberg/mevrouw-mr-mm-strengers/11094237420
import scrapy
from scrapy.http import Request
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from scrapy_selenium import SeleniumRequest
import re
class TestSpider(scrapy.Spider):
name = 'test'
page_number=1
def start_requests(self):
yield SeleniumRequest(
url = "https://zoekeenadvocaat.advocatenorde.nl/zoeken?q=&type=advocaten&limiet=10&sortering=afstand&filters[rechtsgebieden]=[]&filters[specialisatie]=0&filters[toevoegingen]=0&locatie[adres]=Holland&locatie[geo][lat]=52.132633&locatie[geo][lng]=5.291266&locatie[straal]=56&locatie[hash]=67eb2b8d0aab60ec69666532ff9527c9&weergave=lijst&pagina=1",
wait_time = 3,
screenshot = True,
callback = self.parse,
dont_filter = True
)
def parse(self, response):
books = response.xpath("//span[@class='h4 no-margin-bottom']//a//@href").extract()
for book in books:
url = response.urljoin(book)
yield Request(url, callback=self.parse_book)
def parse_book(self, response):
title=response.css(".title h3::text").get()
advocaten=response.css(".secondary::text").get()
detail=response.xpath("//section[@class='lawyer-info']")
for i in range(len(detail)):
if re.search("@",detail[i].get()):
d1=detail[i].xpath("//div[@class='column small-9']//a//@href").get()
print(d1)
CodePudding user response:
Change your xpath
that it selects the second element:
(//div[@class='column small-9'])[2]/a/@href
Example: http://xpather.com/Hhjolrh1
Alternative would be to select it directly:
//a[starts-with(@href, 'mailto')]/@href
Example: http://xpather.com/EtD8noeI
CodePudding user response:
You get the phone number because it is the first element that fits 'column small-9'.
As an alternative to the answer with X-Path, here a solution without X-Path:
soup.find("span", string="E-mail").parent.find_next("div").find("a").contents[0]