I am writting my first web scrapping project and I want to scrap from booking.com.
I'd like to scrap info about include breakfast in hotel.
The problem is - I want every value to be ["Brekafast included"] or empty value [""] if there is no info about it. If Im runnig my code (below) I only get few values ["Brekafast included"].
I don't know how to solve this, bc when breakfast is not included there is no class "e05969d63d" in property card in this hotel (this class is directed to info about breakfast if it is included)
So if Hotel1 and Hotel3 have "Breakfast included" and Hotel2 doesn't have breakfast included.
I would like to export sth like ["Breakfast included","","Breakfast included"]
But I get only : ["Breakfast included", "Breakfast included"]
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import scrapy
import logging
from scrapy.crawler import CrawlerProcess
from scrapy.exporters import CsvItemExporter
class CsvPipeline(object):
def __init__(self):
self.file = open ('hotel.tmp','wb')
self.exporter = CsvItemExporter(self.file,str)
self.exporter.start_exporting()
def close_spider(self,spider):
self.exporter.finish_exporting()
self.file.close()
def process_items(self,item,spider):
self.exporter.export_item(item)
return item
class hotelsNY(scrapy.Spider):
name = "hotelsNY"
start_urls =[]
#start_urls = ['https://www.booking.com/searchresults.pl.html?label=gen173nr-1BCAEoggI46AdIM1gEaLYBiAEBmAEeuAEXyAEM2AEB6AEBiAIBqAIDuALX3uicBsACAdICJGRlODkzYmJmLTIyZjQtNDYwNi04YzYwLWIxOWRlMGU0MmM0MdgCBeACAQ&sid=7ab6fb8585341629f1a790546e37a1c5&aid=304142&ss=Nowy Jork&ssne=Nowy Jork&ssne_untouched=Nowy Jork&lang=pl&sb=1&src_elem=sb&src=index&dest_id=20088325&dest_type=city&checkin=2022-12-30&checkout=2023-01-03&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset=0']
for i in range (0, 10):
start_urls.append('https://www.booking.com/searchresults.pl.html?label=gen173nr-1BCAEoggI46AdIM1gEaLYBiAEBmAEeuAEXyAEM2AEB6AEBiAIBqAIDuALX3uicBsACAdICJGRlODkzYmJmLTIyZjQtNDYwNi04YzYwLWIxOWRlMGU0MmM0MdgCBeACAQ&sid=7ab6fb8585341629f1a790546e37a1c5&aid=304142&ss=Nowy Jork&ssne=Nowy Jork&ssne_untouched=Nowy Jork&lang=pl&sb=1&src_elem=sb&src=index&dest_id=20088325&dest_type=city&checkin=2022-12-30&checkout=2023-01-03&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset=' str(i*25))
custom_settings = {
'LOG_LEVEL': logging.WARNING,
'FEED_EXPORTERS': {'csv': 'scrapy.exporters.CsvItemExporter'},
'FEED_FORMAT': 'csv',
'FEED_URI': 'hotels_tmp1.csv'
}
def parse(self, response):
nexturl = 'https://www.booking.com/searchresults.pl.html?label=gen173nr-1BCAEoggI46AdIM1gEaLYBiAEBmAEeuAEXyAEM2AEB6AEBiAIBqAIDuALX3uicBsACAdICJGRlODkzYmJmLTIyZjQtNDYwNi04YzYwLWIxOWRlMGU0MmM0MdgCBeACAQ&sid=7ab6fb8585341629f1a790546e37a1c5&aid=304142&ss=Nowy Jork&ssne=Nowy Jork&ssne_untouched=Nowy Jork&lang=pl&sb=1&src_elem=sb&src=index&dest_id=20088325&dest_type=city&checkin=2022-12-30&checkout=2023-01-03&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset=0'
#all_names = response.xpath('//*[@data-testid="title"]')
alH = response.xpath('//*[@data-testid="property-card"]').getall()
for name in allH:
hotelName = response.xpath('//*[@data-testid="title"]/text()').extract(),
address = response.xpath('//*[@data-testid="address"]/text()').extract(),
price = response.xpath('//*[@data-testid="price-and-discounted-price"]/text()').extract(),
dist = response.xpath('//span[@data-testid="distance"]/text()').extract(),
breakfast = response.xpath('//span[@]/text()').extract(),
yield {'hotelName': hotelName, 'address': address, 'price': price, 'dist': dist, 'breakfast': breakfast}
process = CrawlerProcess(
{
'USER_AGENT':'Mozilla/4.0 (comatible;MSIE 7.0;Window NT 5.1)'
})
process.crawl(hotelsNY)
process.start()
CodePudding user response:
There are a few issues with your spider.
once you use
getall()
onallH
xpath, you are extracting the text of that xpath expression and you can no longer use it as a selector for which you can chain.Use relative xpath expressions with the chained selectors so that instead of extracting lists of matching elements, you are iterating through the page row by row which I think was your intention in the first place.
To ensure that "breakfast" becomes an empty string you can just test if it is None, and explicitly set it to the empty string if needed.
here is an example:
notice that there is a './/' in the xpath expressions in the for loop. these are relative xpath expressions. and also notice how I chain the selectors by calling i.xpath
instead of response.xpath
inside the for loop.
allH = response.xpath('//*[@data-testid="property-card"]')
for i in allH:
hotelName = i.xpath('.//*[@data-testid="title"]//text()').get()
address = i.xpath('.//*[@data-testid="address"]//text()').get()
price = i.xpath('.//*[@data-testid="price-and-discounted-price"]//text()').get()
dist = i.xpath('.//span[@data-testid="distance"]//text()').get()
breakfast = i.xpath('//span[@]//text()').get()
if breakfast is None:
breakfast = ""
yield {'hotelName': hotelName, 'address': address, 'price': price,
'dist': dist, 'breakfast': breakfast}
CodePudding user response:
You are currently not using the for name in allH
loop at all, also in the line above you define it as alH
not allH
.
I would recommend you import BeautifulSoup
like this from bs4 import BeautifulSoup
and then change your for loop to the following:
for name in alH:
hotel = BeautifulSoup(name.extract(), features="lxml")
hotelName = hotel.find(attrs={"data-testid":"title"}).get_text()
print(hotelName)
address = hotel.find(attrs={"data-testid":"address"}).get_text()
price = hotel.find(attrs={"data-testid": "price-and-discounted-price"}).get_text()
dist = hotel.find(attrs={"data-testid": "distance"}).get_text()
breakfast = hotel.find(class_="e05969d63d")
if breakfast:
breakfast = breakfast.get_text()
else:
breakfast = " "
print(breakfast)
yield {'hotelName': hotelName, 'address': address, 'price': price, 'dist': dist, 'breakfast': breakfast}
Using BeautifulSoup
you can much more easily pull data from html and xml files and you can use it further up in your code too to replace any xpath
calls. This is just a quick example of how it can be used but I recommend you look into this tool further.