I'm creating a script that lists all businesses from one website; it needs to scrape (name, address, website, email, telephone number). I've got to the point where I can more or less scrape the email, but I have a small problem: I can't just tell my script to take all of them. They are specific and need to contain [Biuro or Sekretariat or the name part of the website www.(namePart).com], and I don't know how to do it. Here is my code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy import Request, Spider
class RynekMainSpider(scrapy.Spider):
    """Crawl the rynekpierwotny.pl developer listing and collect contact data.

    The crawl runs in three hops: listing page -> developer profile page ->
    developer's own website.  Each scraped item is a plain dict with the keys
    ``address``, ``name``, ``href``, ``website`` and ``email``.
    """

    name = "RynekMain"
    start_urls = [
        'https://rynekpierwotny.pl/deweloperzy/?page=1']

    def parse(self, response):
        """Parse one listing page.

        Yields one request per developer entry (handled by ``parseMain``)
        and, when a next-page link is found, a request for the next listing
        page (handled by this method again).
        """
        roots = response.css('div#root')
        # An unexpected layout (no root container) must not raise IndexError,
        # otherwise pagination below would be skipped as well.
        entries = roots[0].css('li.rp-np9kb1') if roots else []
        for website in entries:
            page = website.css('a::attr(href)').get()
            address = website.css('address.rp-o9b83y::text').get()
            name = website.css('h2.rp-69f2r4::text').get()
            if not page:
                # urljoin(None) would return the listing URL itself and the
                # request would be dropped as a duplicate; skip explicitly.
                self.logger.warning('No profile link for %r; skipping', name)
                continue
            params = {
                'address': address,
                'name': name,
                'href': page,
            }
            url = response.urljoin(page)
            yield Request(url=url, cb_kwargs={'params': params}, callback=self.parseMain)

        next_page = response.xpath(
            '//a[contains(@class,"rp-173nt6g")]/../following-sibling::li'
        ).css('a::attr(href)').get()
        # On the last page there is no following sibling link; only paginate
        # when a real href was extracted.
        if next_page:
            yield Request(url=response.urljoin(next_page), callback=self.parse)

    def parseMain(self, response, params=None):
        """Parse a developer profile page and follow the link to its website.

        ``params`` is the partially filled item created in ``parse``.  When
        the profile has no website link the item is yielded immediately with
        ``website`` and ``email`` set to ``None`` instead of being lost.
        """
        params = {} if params is None else params
        website = response.css('div.rp-l0pkv6 a::attr(href)').get()
        params['website'] = website
        if not website:
            # Without a target URL the follow-up request would point back at
            # this already-crawled page and be filtered as a duplicate.
            params['email'] = None
            yield params
            return
        urlem = response.urljoin(website)
        yield Request(url=urlem, cb_kwargs={'params': params}, callback=self.parseEmail)

    def parseEmail(self, response, params=None):
        """Store the first ``href`` containing ``@`` as ``email`` and emit the item.

        The value is the raw ``href`` attribute; it is ``None`` when the page
        has no such link.
        """
        params = {} if params is None else params
        email = response.xpath('//a[contains(@href, "@")]/@href').get()
        params['email'] = email
        yield params
if __name__ == "__main__":
    # Script entry point: create a crawler process, schedule the spider on it
    # and start the crawl.
    crawler = CrawlerProcess()
    crawler.crawl(RynekMainSpider)
    crawler.start()
Thanks for help in advance!
CodePudding user response:
In your parseEmail
method, after extracting the email address, just check the extracted string like you would with any string.
For Example
from urllib.parse import urlsplit
def parseEmail(self, response, params=None):
    """Keep the scraped e-mail link only when it looks like a company address.

    The first ``href`` containing ``@`` is accepted when it contains
    ``biuro`` or ``sekretariat``, or any label of the response's host name
    other than ``www`` and the final label (e.g. ``acme`` for
    ``www.acme.pl``).  Matching is case-insensitive.

    Args:
        response: Response for the company website; must provide ``url`` and
            ``xpath()``.
        params: Item dict built by the earlier callbacks; a new dict is used
            when omitted.

    Yields:
        ``params`` with ``email`` set to the raw ``href`` value, or to
        ``None`` when no link was found or it did not match.
    """
    params = {} if params is None else params
    email = response.xpath('//a[contains(@href, "@")]/@href').get()
    netloc = urlsplit(response.url).netloc

    # Always set the key so every item has the same shape.
    params['email'] = None
    if email:
        # Addresses are usually lower-case ("biuro@..."), so compare
        # case-insensitively instead of requiring the capitalised form.
        candidate = email.casefold()
        host_labels = [
            label
            for label in netloc.casefold().split('.')[:-1]
            if label and label != 'www'
        ]
        keywords = ('biuro', 'sekretariat')
        if any(word in candidate for word in keywords) or any(
            label in candidate for label in host_labels
        ):
            params['email'] = email
    yield params