I am trying to scrape multiple pages from a real estate website. I can scrape the first page of my URL successfully, but I am unable to handle pagination. My current attempt finds the link whose class contains 'red' (the highlighted current page), moves to its parent's next sibling, and takes that href; I believe this fetches the next page's response, over and over. I have also read examples where people wrote their code to parse multiple pages at the same time.
Is parallel/concurrent parsing possible? I want to parse all 90 pages as fast as possible, but I don't know how to implement it. Any and all help is greatly appreciated. Thank you.
PROGRESS UPDATE 1: I figured out why my CSV is written as UTF-8 and shows Cyrillic characters correctly in my PyCharm IDE but shows ?? placeholders in Excel. I have bypassed the issue by importing the CSV file through Excel's Data > From Text/CSV.
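(A sketch of an alternative that avoids the manual import step, using Scrapy's FEED_EXPORT_ENCODING setting: 'utf-8-sig' writes a UTF-8 byte-order mark, which Excel reads as a signal to decode the file as UTF-8.)
custom_settings = {
    "FEEDS": {f'{file_name}.csv': {'format': 'csv'}},
    # 'utf-8-sig' prepends a BOM so Excel opens the CSV as UTF-8 directly
    "FEED_EXPORT_ENCODING": "utf-8-sig",
}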
PROGRESS UPDATE 2: I understand I could put a for loop over pages (1, 90), or even (1, 120), in my start_requests function, but that is not what I want; my concern is that this would make my code parse page by page rather than concurrently.
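For reference, that loop would look like the sketch below. Note that Scrapy's scheduler dispatches yielded requests concurrently (16 at a time by default), so this pattern is not strictly page by page. The ?page=N pattern is taken from the pagination HTML below; whether the site accepts it without the cities parameter is an assumption.
from scrapy import Request

# inside the spider class
def start_requests(self):
    base = 'https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/ulan-bator/'
    yield Request(base, callback=self.parse)  # page 1 has no page parameter
    for page in range(2, 91):  # pages 2..90, per the pagination snippet
        yield Request(f'{base}?page={page}', callback=self.parse)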
HTML Snippet:
<ul class="number-list">
<li>
<a href="/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/ulan-bator/?cities=1" class="page-number js-page-filter red" data-page="1">1</a>
</li>
<li>
<a href="/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/ulan-bator/?cities=1&page=2" class="page-number js-page-filter " data-page="2">2</a>
</li>
<li>
<a href="/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/ulan-bator/?cities=1&page=3" class="page-number js-page-filter " data-page="3">3</a>
</li>
<li><span class="page-number">...</span></li>
<li>
<a href="/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/ulan-bator/?cities=1&page=89" class="page-number js-page-filter " data-page="89">89</a>
</li>
<li>
<a href="/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/ulan-bator/?cities=1&page=90" class="page-number js-page-filter " data-page="90">90</a>
</li>
<div class="clear"></div>
</ul>
Pagination Snippet:
# handling pagination
next_page = response.xpath("//a[contains(@class,'red')]/parent::li/following-sibling::li/a/@href").extract_first()
if next_page:
yield response.follow(next_page, callback=self.parse)
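One caveat with this XPath: following-sibling::li matches every later <li>, and the '...' placeholder <li> (see the HTML snippet) contains no <a>, so extract_first() can return the first link after the gap and skip pages, depending on how the site renders the pagination. Below is a sketch of a variant that derives the next page from the data-page attribute instead; it assumes the ?page=N pattern shown above and drops any other query parameters.
pages = response.xpath("//a[@data-page]/@data-page").getall()
current = response.xpath("//a[contains(@class,'red')]/@data-page").get()
if current and pages and int(current) < max(int(p) for p in pages):
    # response.follow() joins the relative URL against the current page URL
    yield response.follow(f"?page={int(current) + 1}", callback=self.parse)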
Full Code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
import unicodecsv as csv
from datetime import datetime
from scrapy.crawler import CrawlerProcess
dt_today = datetime.now().strftime('%Y%m%d')
file_name = dt_today + ' HPI Data'
# Create Spider class
class UneguiApartments(scrapy.Spider):
name = "unegui_apts"
allowed_domains = ["www.unegui.mn"]
custom_settings = {"FEEDS": {f'{file_name}.csv': {'format': 'csv'}}
}
def start_requests(self):
urls = ['https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/ulan-bator/']
for url in urls:
yield Request(url, self.parse)
def parse(self, response, **kwargs):
cards = response.xpath("//li[contains(@class,'announcement-container')]")
# parse details
for card in cards:
name = card.xpath(".//a[@itemprop='name']/@content").extract_first()
price = card.xpath(".//*[@itemprop='price']/@content").extract_first()
rooms = card.xpath(".//div[contains(@class,'announcement-block__breadcrumbs')]/text()").extract_first().split('»')[0].strip()
link = card.xpath(".//a[@itemprop='url']/@href").extract_first()
date_block = card.xpath("normalize-space(.//div[contains(@class,'announcement-block__date')]/text())").extract_first().split(',')
date = date_block[0].strip()
city = date_block[1].strip()
item = {'name': name,
'date': date,
'rooms': rooms,
'price': price,
'city': city,
}
# follow absolute link to scrape deeper level
yield response.follow(link, callback=self.parse_item, meta={'item': item})
def parse_item(self, response):
# retrieve previously scraped item between callbacks
item = response.meta['item']
# parse additional details
list_span = response.xpath(".//span[contains(@class,'value-chars')]//text()").extract()
list_a = response.xpath(".//a[contains(@class, 'value-chars')]//text()").extract()
# get additional details from list of <span> tags, element by element
floor_type = list_span[0].strip()
num_balcony = list_span[1].strip()
garage = list_span[2].strip()
window_type = list_span[3].strip()
door_type = list_span[4].strip()
num_window = list_span[5].strip()
# get additional details from list of <a> tags, element by element
commission_year = list_a[0].strip()
num_floors = list_a[1].strip()
area_sqm = list_a[2].strip()
floor = list_a[3].strip()
leasing = list_a[4].strip()
district = list_a[5].strip()
address = list_a[6].strip()
# update item with newly parsed data
item.update({
'district': district,
'address': address,
'area_sqm': area_sqm,
'floor': floor,
'commission_year': commission_year,
'num_floors': num_floors,
'num_windows': num_window,
'num_balcony': num_balcony,
'floor_type': floor_type,
'window_type': window_type,
'door_type': door_type,
'garage': garage,
'leasing': leasing
})
yield item
# handling pagination
next_page = response.xpath("//a[contains(@class,'red')]/parent::li/following-sibling::li/a/@href").extract_first()
if next_page:
yield response.follow(next_page, callback=self.parse)
# main driver
if __name__ == "__main__":
process = CrawlerProcess()
process.crawl(UneguiApartments)
process.start()
CodePudding user response:
If I understand you correctly, you need to move the 'next page' handling into the parse function. I also simply take the 'next page' button's href and follow it.
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
import unicodecsv as csv
from datetime import datetime
from scrapy.crawler import CrawlerProcess
dt_today = datetime.now().strftime('%Y%m%d')
file_name = dt_today + ' HPI Data'
# Create Spider class
class UneguiApartments(scrapy.Spider):
name = "unegui_apts"
allowed_domains = ["www.unegui.mn"]
custom_settings = {"FEEDS": {f'{file_name}.csv': {'format': 'csv'}}
}
def start_requests(self):
urls = ['https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/ulan-bator/']
for url in urls:
yield Request(url, self.parse)
def parse(self, response, **kwargs):
cards = response.xpath("//li[contains(@class,'announcement-container')]")
# parse details
for card in cards:
name = card.xpath(".//a[@itemprop='name']/@content").extract_first()
price = card.xpath(".//*[@itemprop='price']/@content").extract_first()
rooms = card.xpath(".//div[contains(@class,'announcement-block__breadcrumbs')]/text()").extract_first().split('»')[0].strip()
link = card.xpath(".//a[@itemprop='url']/@href").extract_first()
date_block = card.xpath("normalize-space(.//div[contains(@class,'announcement-block__date')]/text())").extract_first().split(',')
date = date_block[0].strip()
city = date_block[1].strip()
item = {'name': name,
'date': date,
'rooms': rooms,
'price': price,
'city': city,
}
# follow absolute link to scrape deeper level
yield response.follow(link, callback=self.parse_item, meta={'item': item})
# handling pagination
next_page = response.xpath('//a[contains(@class, "number-list-next js-page-filter number-list-line")]/@href').get()
if next_page:
yield response.follow(next_page, callback=self.parse)
def parse_item(self, response):
# retrieve previously scraped item between callbacks
item = response.meta['item']
# parse additional details
list_span = response.xpath(".//span[contains(@class,'value-chars')]//text()").extract()
list_a = response.xpath(".//a[contains(@class, 'value-chars')]//text()").extract()
# get additional details from list of <span> tags, element by element
floor_type = list_span[0].strip()
num_balcony = list_span[1].strip()
garage = list_span[2].strip()
window_type = list_span[3].strip()
door_type = list_span[4].strip()
num_window = list_span[5].strip()
# get additional details from list of <a> tags, element by element
commission_year = list_a[0].strip()
num_floors = list_a[1].strip()
area_sqm = list_a[2].strip()
floor = list_a[3].strip()
leasing = list_a[4].strip()
district = list_a[5].strip()
address = list_a[6].strip()
# update item with newly parsed data
item.update({
'district': district,
'address': address,
'area_sqm': area_sqm,
'floor': floor,
'commission_year': commission_year,
'num_floors': num_floors,
'num_windows': num_window,
'num_balcony': num_balcony,
'floor_type': floor_type,
'window_type': window_type,
'door_type': door_type,
'garage': garage,
'leasing': leasing
})
yield item
# main driver
if __name__ == "__main__":
process = CrawlerProcess()
process.crawl(UneguiApartments)
process.start()
This should work.
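If the goal is to get through all 90 pages as fast as possible, note that Scrapy already fetches scheduled requests concurrently; the relevant knobs live in the settings. A sketch using Scrapy's documented defaults as a starting point (raise them with care so you stay polite to the site):
custom_settings = {
    "FEEDS": {f'{file_name}.csv': {'format': 'csv'}},
    "CONCURRENT_REQUESTS": 16,            # Scrapy default
    "CONCURRENT_REQUESTS_PER_DOMAIN": 8,  # Scrapy default
    "DOWNLOAD_DELAY": 0,                  # Scrapy default; increase to throttle
}
Combined with yielding all page URLs up front in start_requests (as sketched in the question), this fetches pages in parallel instead of one next-page link at a time.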