I'm trying to follow every link that appears for commercial contractors on this website: https://lslbc.louisiana.gov/contractor-search/search-type-contractor/ and then extract the email addresses from the pages each link leads to. But when I run this script, Scrapy requests the base URL with the entire HTML element appended to the end of it, instead of following only the link in that element.
Does anyone know how I can get the desired result or what I'm doing wrong?
Here's the code that I have so far:
from urllib import request
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    #user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'
    #start_urls= ['https://lslbc.louisiana.gov/contractor-search/search-type-contractor/']

    def start_requests(self):
        start_urls = [
            'https://lslbc.louisiana.gov/contractor-search/search-type-contractor/',
        ]
        #request = scrapy.Request(url=urls, callback=self.parse, method="GET", cookies=[{'domain': 'lslbc.louisiana.gov','path': '/wp-admin/admin-ajax.php?api_action=advanced&contractor_type=Commercial License&classification=&action=api_actions'}], )
        #yield request
        for url in start_urls:
            yield scrapy.Request(url=url, callback=self.parse, cookies=[{'name': 'test', 'value': '', 'domain': 'lslbc.louisiana.gov', 'path': '/wp-admin/admin-ajax.php?api_action=advanced&contractor_type=Commercial License&classification=&action=api_actions'}])

    def parse(self, response):
        links = response.xpath('//*[@id="search-results"]/table/tbody/tr/td/a')
        for link in links:
            yield response.follow(link.get(), callback=self.parse)

    def parse_links(self, response):
        contractors = response.css()
        for contractor in contractors:
            yield {
                'name': contractor.css('').get().strip(),
                'email': contractor.css('td.[email_address]').get().strip(),
            }
Which returns:
2022-08-13 16:53:13 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://lslbc.louisiana.gov/contractor-search/search-type-contractor/> (referer: None)
2022-08-13 16:53:13 [scrapy.core.engine] DEBUG: Crawled (404) <GET https://lslbc.louisiana.gov/contractor-search/search-type-contractor/> (referer: https://lslbc.louisiana.gov/contractor-search/search-type-contractor/)
2022-08-13 16:53:13 [scrapy.core.engine] DEBUG: Crawled (404) <GET https://lslbc.louisiana.gov/contractor-search/search-type-contractor/> (referer: https://lslbc.louisiana.gov/contractor-search/search-type-contractor/)
CodePudding user response:
The webpage has a built-in search feature. When you search for commercial contractors, the results are loaded dynamically by JavaScript from an API that returns JSON via a GET request. That's why you can't get the desired data from the plain HTML DOM.
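As a side note on the symptom in your log: link.get() returns the whole <a> element as an HTML string, and response.follow() treats any string it receives as a (relative) URL, which is why the element markup ends up glued onto the base URL. If the links were actually present in the static HTML, passing the selector itself (or its @href) would be enough, roughly like this:

    def parse(self, response):
        links = response.xpath('//*[@id="search-results"]/table/tbody/tr/td/a')
        for link in links:
            # A selector for an <a> element can be passed directly;
            # Scrapy resolves its href against the response URL.
            yield response.follow(link, callback=self.parse_links)

But since the results table here is rendered by JavaScript, that alone won't return anything useful; calling the API directly does.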
Full working code as an example:
import scrapy
import json


class TestSpider(scrapy.Spider):
    name = 'test'

    def start_requests(self):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
            'x-requested-with': 'XMLHttpRequest'
        }
        # Search API the page calls to load the commercial-contractor results
        url = 'https://lslbc.louisiana.gov/wp-admin/admin-ajax.php?api_action=advanced&contractor_type=Commercial License&classification=&action=api_actions'
        yield scrapy.Request(
            url=url,
            headers=headers,
            callback=self.parse,
            method="GET")

    def parse(self, response):
        resp = json.loads(response.body)
        for item in resp['results']:
            # Detail API for a single company, keyed by its id
            api_url = 'https://lslbc.louisiana.gov/wp-admin/admin-ajax.php?action=api_actions&api_action=company_details&company_id=' + item['id']
            yield scrapy.Request(
                url=api_url,
                callback=self.parse_email,
                method="GET"
            )

    def parse_email(self, response):
        resp2 = json.loads(response.body)
        yield {
            'Email': resp2['email_address']
        }
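To run it and export the scraped emails, something like this should work (assuming the spider is saved as test_spider.py, which is just a placeholder name):

    scrapy runspider test_spider.py -o emails.json

One optional tweak: the space in contractor_type=Commercial License is left unencoded in the search URL above. If that ever causes trouble, building the query string with urllib.parse.urlencode handles the escaping:

    from urllib.parse import urlencode

    params = {
        'api_action': 'advanced',
        'contractor_type': 'Commercial License',
        'classification': '',
        'action': 'api_actions',
    }
    # urlencode escapes the space in "Commercial License" for the query string
    url = 'https://lslbc.louisiana.gov/wp-admin/admin-ajax.php?' + urlencode(params)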