Getting emails with specific text in them scrapy-CodePudding

I'm creating a script that lists all businesses from one website. It needs to scrape the name, address, website, email and telephone number. I've got to the point where I can more or less scrape the email, but I have a small problem: I can't just tell my script to take all of them. They are specific and need to contain "Biuro", "Sekretariat", or the name part of the website (www.(namePart).com), and I don't know how to do that. Here is my code:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy import Request, Spider


class RynekMainSpider(scrapy.Spider):
    """Spider that walks the developer listing on rynekpierwotny.pl.

    For every listing entry it collects the name and address, follows the
    entry's detail page to read the developer's own website link, and then
    visits that website to pick up the first link whose ``href`` contains
    an ``@`` character. One dict per developer is yielded with the keys
    ``address``, ``name``, ``href``, ``website`` and ``email``.
    """

    name = "RynekMain"
    start_urls = [
        'https://rynekpierwotny.pl/deweloperzy/?page=1']

    def parse(self, response):
        """Handle one listing page.

        Yields a request to the detail page of every listing entry
        (carrying the partial record in ``cb_kwargs``) and, when a
        next-page link is present, a request for the following listing
        page.
        """
        # A single combined selector avoids the IndexError that indexing
        # ``response.css('div#root')[0]`` raises when the container is
        # absent from the page.
        for entry in response.css('div#root li.rp-np9kb1'):
            page = entry.css('a::attr(href)').get()
            if not page:
                # Without a link there is no detail page to visit;
                # urljoin(None) would only point back at this listing.
                continue
            params = {
                'address': entry.css('address.rp-o9b83y::text').get(),
                'name': entry.css('h2.rp-69f2r4::text').get(),
                'href': page,
            }
            yield Request(
                url=response.urljoin(page),
                cb_kwargs={'params': params},
                callback=self.parseMain,
            )

        # The sibling <li> after the one holding the "rp-173nt6g" link is
        # used as the next-page entry.
        next_page = response.xpath(
            '//a[contains(@class,"rp-173nt6g")]/../following-sibling::li'
        ).css('a::attr(href)').get()
        if next_page:
            # Only follow pagination when a link was actually found;
            # otherwise the request would target the current page again.
            yield Request(url=response.urljoin(next_page), callback=self.parse)

    def parseMain(self, response, params=None):
        """Handle a developer detail page.

        Stores the developer's website link in ``params['website']``.
        When a link is present, yields a request to that website so that
        ``parseEmail`` can complete the record; otherwise yields the
        record immediately with ``website`` and ``email`` set to ``None``.
        """
        params = {} if params is None else params
        website = response.css('div.rp-l0pkv6 a::attr(href)').get()
        params['website'] = website
        if not website:
            # urljoin(None) resolves to the current URL; requesting it again
            # is dropped by Scrapy's default duplicate filter and the record
            # would never be emitted. Emit it now without an email instead.
            params['email'] = None
            yield params
            return
        yield Request(
            url=response.urljoin(website),
            cb_kwargs={'params': params},
            callback=self.parseEmail,
        )

    def parseEmail(self, response, params=None):
        """Handle the developer's own website and finish the record.

        Stores the first ``href`` containing ``@`` (or ``None`` when there
        is none) in ``params['email']`` and yields the completed record.
        """
        params = {} if params is None else params
        params['email'] = response.xpath('//a[contains(@href, "@")]/@href').get()
        yield params
if __name__ == "__main__":
    # Run the spider in-process when the file is executed as a script.
    crawler_process = CrawlerProcess()
    crawler_process.crawl(RynekMainSpider)
    # Start the crawl that was scheduled above.
    crawler_process.start()

Thanks for help in advance!

CodePudding user response：

In your parseEmail method, after extracting the email address, just check the extracted string like you would with any string.

For example:

from urllib.parse import urlsplit

def parseEmail(self, response, params=None):
    """Store the first acceptable e-mail link of the page in ``params``.

    Every ``href`` containing ``@`` is examined in document order. A link
    is accepted when, compared case-insensitively, it contains "biuro",
    "sekretariat", or one of the host-name labels of ``response.url``
    (ignoring ``www`` and the last label, i.e. the top-level domain).

    Args:
        response: The page to inspect; must provide ``url`` and ``xpath``.
        params: The record built by earlier callbacks. A new dict is used
            when omitted.

    Yields:
        ``params``, with ``'email'`` set to the accepted ``href`` when one
        was found. The key is left untouched when nothing matches.
    """
    params = {} if params is None else params

    # getall() returns an empty list when nothing matches, so a page
    # without any "@" link no longer fails on ``'Biuro' in None``.
    hrefs = response.xpath('//a[contains(@href, "@")]/@href').getall()

    # Lower-case everything: addresses are usually written "biuro@...",
    # which a case-sensitive test against "Biuro" would miss.
    netloc = urlsplit(response.url).netloc.lower()
    keywords = ['biuro', 'sekretariat']
    keywords += [
        label for label in netloc.split('.')[:-1]
        if label and label != 'www'  # an empty label would match anything
    ]

    for href in hrefs:
        lowered = href.lower()
        if any(word in lowered for word in keywords):
            params['email'] = href
            break
    yield params