I'm working on a scraper that collects property information.
The original code works perfectly.
URL = "https://orion.lancaster.ne.gov/Property-Detail/PropertyQuickRefID/{}"
class huntsmanCSS(scrapy.Spider):
name = "huntsman"
allowed_domains = ["orion.lancaster.ne.gov"]
f = open('parcel_ids.txt')
start_urls = [URL.format(pid.strip()) for pid in f.readlines()]
def parse(self, response):
yield {
'propId': response.css('#dnn_ctr388_View_tdPropertyID::text').extract_first(),
'address': response.css('#dnn_ctr388_View_tdPropertyAddress::text').extract_first(),
'owner': response.css('#dnn_ctr388_View_divOwnersLabel::text').extract_first(),
'propertyClass': response.css('#dnn_ctr388_View_tdGIPropertyClass::text').extract_first(),
'hood': response.css('#dnn_ctr388_View_tdGINeighborhood::text').extract_first(),
'buildType': response.css('#resImprovementTable0 > tr:nth-child(2) > td:nth-child(3)::text').extract_first(),
'improveType': response.css('#resImprovementTable0 > tr:nth-child(2) > td:nth-child(4)::text').extract_first(),
'yrBuilt': response.css('#resImprovementTable0 > tr:nth-child(2) > td:nth-child(5)::text').extract_first(),
'saleDate': response.css('#dnn_ctr388_View_tblSalesHistoryData tr:nth-child(2) > td:nth-child(1)::text').extract_first(),
'TAV': response.css('#dnn_ctr388_View_tdPropertyValueHeader::text').extract_first(),
'price': response.css('#dnn_ctr388_View_tblSalesHistoryData > tr:nth-child(2) > td:nth-child(5)::text').extract_first(),
'sqFt': response.css('#resImprovementTable0 > tr:nth-child(2) > td:nth-child(6)::text').extract_first()
}
Using the list of parcel IDs, it fills in the URL template to move from one property page to the next.
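For reference, a minimal sketch of how the template expands (the parcel ID below is taken from the log output later in this post; any ID from the file works the same way):

    URL = "https://orion.lancaster.ne.gov/Property-Detail/PropertyQuickRefID/{}"

    # Hypothetical in-memory list standing in for parcel_ids.txt.
    pids = ["R402438"]
    start_urls = [URL.format(pid.strip()) for pid in pids]
    print(start_urls[0])
    # -> https://orion.lancaster.ne.gov/Property-Detail/PropertyQuickRefID/R402438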
Broken Code:
There is a link to a PDF embedded in a JavaScript button. The PDF contains more information that I want to scrape.
The spider retrieves the first link but then throws errors.
URL = "https://orion.lancaster.ne.gov/Property-Detail/PropertyQuickRefID/{}"
class resDatasheetLink(scrapy.Spider):
name = "resDatasheetLink"
allowed_domains = ["orion.lancaster.ne.gov"]
f = open('residential.txt')
start_urls = [URL.format(pid.strip()) for pid in f.readlines()]
def __init__(self):
self.driver = webdriver.Chrome()
def parse(self, response):
self.driver.get(response.url)
while True:
try:
btn = WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="btnDataSheet"]')))
btn.click()
except TimeoutException:
break
time.sleep(5)
link = self.driver.current_url
self.driver.close()
yield {
'datasheet': link
}
Error:
2021-12-30 10:40:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://orion.lancaster.ne.gov/Property-Detail/PropertyQuickRefID/R402438> (referer: None)
2021-12-30 10:40:36 [selenium.webdriver.remote.remote_connection] DEBUG: POST http://localhost:19113/session/5acb1d8f4ebdb13482ab40a67f846d1d/url {"url": "https://orion.lancaster.ne.gov/Property-Detail/PropertyQuickRefID/R402438"}
2021-12-30 10:40:36 [urllib3.connectionpool] DEBUG: http://localhost:19113 "POST /session/5acb1d8f4ebdb13482ab40a67f846d1d/url HTTP/1.1" 404 878
2021-12-30 10:40:36 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2021-12-30 10:40:36 [scrapy.core.scraper] ERROR: Spider error processing <GET https://orion.lancaster.ne.gov/Property-Detail/PropertyQuickRefID/R402438> (referer: None)
Traceback (most recent call last):
selenium.common.exceptions.InvalidSessionIdException: Message: invalid session id
CodePudding user response:
break takes you out of the while loop. You need to unindent the last few lines below the try/except so they run after the loop, and invoke self.driver.close() (preferably self.driver.quit()) at the end of parsing, as follows:
def parse(self, response):
    self.driver.get(response.url)
    while True:
        try:
            btn = WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="btnDataSheet"]')))
            btn.click()
        except TimeoutException:
            break
    time.sleep(5)
    link = self.driver.current_url
    yield {
        'datasheet': link
    }
    self.driver.close()
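One caveat worth noting (my own addition, not part of the answer above): with several start_urls, Scrapy calls parse() once per URL, so closing the driver at the end of parse() still kills the session before the next request arrives. A safer pattern is to tear the driver down in the spider's closed() hook, which Scrapy invokes once when the spider finishes. A minimal sketch of the method to add to the spider class:

    def closed(self, reason):
        # Scrapy calls this once when the spider shuts down,
        # so the Selenium session stays alive for every parse() call.
        self.driver.quit()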
CodePudding user response:
Based on the way the spider is configured, the loop itself was the issue. Dropping the loop and waiting for the URL to change fixed the spider:
class rDataLink(scrapy.Spider):
    name = "rDataLink"
    allowed_domains = ["orion.lancaster.ne.gov"]
    with open('residential.txt') as f:
        start_urls = [URL.format(pid.strip()) for pid in f.readlines()]

    def __init__(self):
        self.driver = webdriver.Chrome()

    def parse(self, response):
        self.driver.get(response.url)
        btn = WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="btnDataSheet"]')))
        btn.click()
        # Wait for the click to navigate to the datasheet instead of sleeping.
        WebDriverWait(self.driver, 7).until(EC.url_changes(response.url))
        link = self.driver.current_url
        yield {
            'datasheet': link
        }
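EC.url_changes(response.url) resolves as soon as the browser URL differs from the original page URL, so the fixed time.sleep(5) is no longer needed, and each parse() leaves the driver open for the next request. If you want the driver cleaned up when the crawl ends, the closed() hook sketched above works here too. To run the spider and dump the collected links, something like scrapy crawl rDataLink -o datasheets.json (the output filename is my own choice) does the job.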