I am trying to build a spider that gathers information about startups. To that end, I wrote a Python script with Scrapy that should access the website and store the information in a dictionary. I think the code should work from a logic point of view, but somehow I do not get any output. My code:
import scrapy

class StartupsSpider(scrapy.Spider):
    # name of the spider
    name = 'startups'
    # list of allowed domains
    allowed_domains = ['www.bmwk.de/Navigation/DE/InvestDB/INVEST-DB_Liste/investdb.html']
    # starting url
    start_urls = ['https://bmwk.de/Navigation/DE/InvestDB/INVEST-DB_Liste/investdb.html']

    def parse(self, response):
        # parse initial start URL for the specific startup URL
        startups = response.xpath('//*[contains(@class,"card-link-overlay")]/@href').getall()
        for startup in startups:
            absolute_url = response.urljoin(startup)
            # parse the actual startup information
            yield scrapy.Request(absolute_url, callback=self.parse_startup)

        # link to next page
        next_page_url = response.xpath('//*[@class ="pagination-link"]/@href').get()
        # go through all pages on start URL
        absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(absolute_next_page_url)

    def parse_startup(self, response):
        # get information regarding startup
        startup_name = response.css('h1::text').get()
        startup_hompage = response.xpath('//*[@]/a/@href').get()
        startup_description = response.css('div.document-info-item::text')[16].get()
        branche = response.css('div.document-info-item::text')[4].get()
        founded = response.xpath('//*[@]/text()')[0].getall()
        employees = response.css('div.document-info-item::text')[9].get()
        capital = response.css('div.document-info-item::text')[11].get()
        applied_for_invest = response.xpath('//*[@]/text()')[1].getall()
        contact_name = response.css('p.card-title-subtitle::text').get()
        contact_phone = response.css('p.tel > span::text').get()
        contact_mail = response.xpath('//*[@class ="person-contact"]/p/a/span/text()').get()
        contact_address_street = response.xpath('//*[@class ="adr"]/text()').get()
        contact_address_plz = response.xpath('//*[@class ="locality"]/text()').getall()
        contact_state = response.xpath('//*[@class ="country-name"]/text()').get()

        yield {'Startup': startup_name,
               'Homepage': startup_hompage,
               'Description': startup_description,
               'Branche': branche,
               'Gründungsdatum': founded,
               'Anzahl Mitarbeiter': employees,
               'Kapital Bedarf': capital,
               'Datum des Förderbescheids': applied_for_invest,
               'Contact': contact_name,
               'Telefon': contact_phone,
               'E-Mail': contact_mail,
               'Adresse': contact_address_street + contact_address_plz + contact_state}
CodePudding user response:
You need to run in the prompt: scrapy crawl startups -o filename.json (or filename.csv)
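If you'd rather start the crawl from Python instead of the shell, the same feed export can be configured with Scrapy's FEEDS setting; a minimal sketch, assuming the StartupsSpider class above is defined in the same file (FEEDS requires Scrapy 2.1 or newer):

from scrapy.crawler import CrawlerProcess

# Run the spider in-process and write all yielded items to a JSON feed.
process = CrawlerProcess(settings={
    'FEEDS': {'startups.json': {'format': 'json', 'encoding': 'utf8'}},
})
process.crawl(StartupsSpider)
process.start()  # blocks until the crawl finishes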
CodePudding user response:
1. You're not getting output because your allowed_domains is wrong.
2. In the last line (Adresse), you're trying to concatenate list and str types, so you'll get an error.
3. Your pagination link is wrong: on the first page you're getting the next page, but on the second page you're getting the previous page.
4. You're not doing any error checking: on some pages you'll get None for some of the values, and indexing into them results in an error.

I fixed 1, 2, and 3, but you'll need to fix number 4 yourself.
import scrapy

class StartupsSpider(scrapy.Spider):
    # name of the spider
    name = 'startups'
    # list of allowed domains
    allowed_domains = ['bmwk.de']
    # starting url
    start_urls = ['https://bmwk.de/Navigation/DE/InvestDB/INVEST-DB_Liste/investdb.html']

    def parse(self, response):
        # parse initial start URL for the specific startup URL
        startups = response.xpath('//*[contains(@class,"card-link-overlay")]/@href').getall()
        for startup in startups:
            absolute_url = response.urljoin(startup)
            # parse the actual startup information
            yield scrapy.Request(absolute_url, callback=self.parse_startup)

        # link to next page
        next_page_url = response.xpath('(//*[@class ="pagination-link"])[last()]/@href').get()
        if next_page_url:
            # go through all pages on start URL
            absolute_next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(absolute_next_page_url)

    def parse_startup(self, response):
        # get information regarding startup
        startup_name = response.css('h1::text').get()
        startup_hompage = response.xpath('//*[@]/a/@href').get()
        # for example, on some of the pages you'll get an error here:
        startup_description = response.css('div.document-info-item::text')[16].get()
        branche = response.css('div.document-info-item::text')[4].get()
        founded = response.xpath('//*[@]/text()')[0].getall()
        employees = response.css('div.document-info-item::text')[9].get()
        capital = response.css('div.document-info-item::text')[11].get()
        applied_for_invest = response.xpath('//*[@]/text()')[1].getall()
        contact_name = response.css('p.card-title-subtitle::text').get()
        contact_phone = response.css('p.tel > span::text').get()
        contact_mail = response.xpath('//*[@class ="person-contact"]/p/a/span/text()').get()
        Adresse = ' '.join(response.xpath('//*[@class ="address"]//text()').getall())

        yield {'Startup': startup_name,
               'Homepage': startup_hompage,
               'Description': startup_description,
               'Branche': branche,
               'Gründungsdatum': founded,
               'Anzahl Mitarbeiter': employees,
               'Kapital Bedarf': capital,
               'Datum des Förderbescheids': applied_for_invest,
               'Contact': contact_name,
               'Telefon': contact_phone,
               'E-Mail': contact_mail,
               'Adresse': Adresse}
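For number 4, a minimal sketch of the missing error checking, assuming you keep the positional indexing above (the safe_get helper is illustrative, not part of Scrapy; any selector list shorter than the index would otherwise raise IndexError):

def safe_get(selector_list, index):
    # Return the text at the given position, or None when the page
    # has fewer matching nodes, instead of raising IndexError.
    try:
        return selector_list[index].get()
    except IndexError:
        return None

Used in parse_startup, for example:

startup_description = safe_get(response.css('div.document-info-item::text'), 16)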