How to paginate and parse multiple pages concurrently on Scrapy


I am trying to web-scrape multiple pages from a real estate website. I have been successful in scraping the first page of my URL, but I am unable to handle pagination. I have attempted to find the tag whose class contains 'red' (the current page) and select its next sibling; I believe this will fetch the next page's response, and continue doing so over and over. I have also read some examples where people wrote their code to parse multiple pages at the same time.

Is it possible to do parallel/concurrent parsing? I want to be able to parse 90 pages as fast as possible, but I don't know how to implement it. Any and all help is greatly appreciated. Thank you.

PROGRESS UPDATE 1: I figured out why my CSV output is UTF-8 and displays Cyrillic characters correctly in my PyCharm IDE, but shows ?? placeholders when opened in Excel. I was able to work around this by importing the CSV file through Excel's Data > From Text/CSV.
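A fix on the Scrapy side is also possible: Scrapy's FEED_EXPORT_ENCODING setting controls the feed encoding, and 'utf-8-sig' writes a byte-order mark that Excel uses to detect UTF-8 when it opens the file directly. A minimal sketch of the spider settings, assuming the same FEEDS dict as in the full code below:

custom_settings = {
    "FEEDS": {f'{file_name}.csv': {'format': 'csv'}},
    # 'utf-8-sig' prepends a BOM so Excel detects UTF-8 on open
    "FEED_EXPORT_ENCODING": "utf-8-sig",
}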

PROGRESS UPDATE 2: I understand I could implement a for loop in my start_requests function and loop over pages (1, 90) or even (1, 120), but that is not what I want, and I believe it would make my code parse page by page rather than concurrently.
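For reference, such a loop would not actually run page by page: Scrapy schedules every yielded request and downloads them concurrently, up to its CONCURRENT_REQUESTS setting (default 16). A minimal sketch, assuming the ?cities=1&page=N query parameters visible in the HTML snippet below:

def start_requests(self):
    base = 'https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/ulan-bator/'
    # yield all page requests up front; Scrapy fetches them concurrently
    for page in range(1, 91):
        yield Request(f'{base}?cities=1&page={page}', callback=self.parse)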

HTML Snippet:

<ul class="number-list">
  <li>
    <a href="/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/ulan-bator/?cities=1" class="page-number js-page-filter red" data-page="1">1</a>
  </li>
 
  <li>
    <a href="/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/ulan-bator/?cities=1&amp;page=2" class="page-number js-page-filter " data-page="2">2</a>
  </li>
  <li>
    <a href="/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/ulan-bator/?cities=1&amp;page=3" class="page-number js-page-filter " data-page="3">3</a>
  </li>
  <li><span class="page-number">...</span></li>
  <li>
    <a href="/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/ulan-bator/?cities=1&amp;page=89" class="page-number js-page-filter " data-page="89">89</a>
  </li>
  <li>
    <a href="/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/ulan-bator/?cities=1&amp;page=90" class="page-number js-page-filter " data-page="90">90</a>
  </li>
  <div class="clear"></div>
</ul>

Pagination Snippet:

# handling pagination: locate the current page (its link carries the
# 'red' class), then follow the href of the link in the next <li>
next_page = response.xpath("//a[contains(@class,'red')]/parent::li/following-sibling::li/a/@href").extract_first()
if next_page:
    yield response.follow(next_page, callback=self.parse)

Full Code:

# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from datetime import datetime
from scrapy.crawler import CrawlerProcess

dt_today = datetime.now().strftime('%Y%m%d')
file_name = dt_today + ' HPI Data'


# Create Spider class
class UneguiApartments(scrapy.Spider):
    name = "unegui_apts"
    allowed_domains = ["www.unegui.mn"]
    custom_settings = {
        "FEEDS": {f'{file_name}.csv': {'format': 'csv'}},
    }

    def start_requests(self):
        urls = ['https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/ulan-bator/']
        for url in urls:
            yield Request(url, self.parse)

    def parse(self, response, **kwargs):
        cards = response.xpath("//li[contains(@class,'announcement-container')]")

        # parse details
        for card in cards:
            name = card.xpath(".//a[@itemprop='name']/@content").extract_first()
            price = card.xpath(".//*[@itemprop='price']/@content").extract_first()
            rooms = card.xpath(".//div[contains(@class,'announcement-block__breadcrumbs')]/text()").extract_first().split('»')[0].strip()
            link = card.xpath(".//a[@itemprop='url']/@href").extract_first()
            date_block = card.xpath("normalize-space(.//div[contains(@class,'announcement-block__date')]/text())").extract_first().split(',')
            date = date_block[0].strip()
            city = date_block[1].strip()

            item = {'name': name,
                    'date': date,
                    'rooms': rooms,
                    'price': price,
                    'city': city,
                    }
            # follow absolute link to scrape deeper level
            yield response.follow(link, callback=self.parse_item, meta={'item': item})

    def parse_item(self, response):
        # retrieve previously scraped item between callbacks
        item = response.meta['item']

        # parse additional details
        list_span = response.xpath(".//span[contains(@class,'value-chars')]//text()").extract()
        list_a = response.xpath(".//a[contains(@class, 'value-chars')]//text()").extract()

        # get additional details from list of <span> tags, element by element
        floor_type = list_span[0].strip()
        num_balcony = list_span[1].strip()
        garage = list_span[2].strip()
        window_type = list_span[3].strip()
        door_type = list_span[4].strip()
        num_window = list_span[5].strip()

        # get additional details from list of <a> tags, element by element
        commission_year = list_a[0].strip()
        num_floors = list_a[1].strip()
        area_sqm = list_a[2].strip()
        floor = list_a[3].strip()
        leasing = list_a[4].strip()
        district = list_a[5].strip()
        address = list_a[6].strip()

        # update item with newly parsed data
        item.update({
            'district': district,
            'address': address,
            'area_sqm': area_sqm,
            'floor': floor,
            'commission_year': commission_year,
            'num_floors': num_floors,
            'num_windows': num_window,
            'num_balcony': num_balcony,
            'floor_type': floor_type,
            'window_type': window_type,
            'door_type': door_type,
            'garage': garage,
            'leasing': leasing
        })
        yield item

        # handling pagination
        next_page = response.xpath("//a[contains(@class,'red')]/parent::li/following-sibling::li/a/@href").extract_first()
        if next_page:
            yield response.follow(next_page, callback=self.parse)


# main driver
if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(UneguiApartments)
    process.start()

CodePudding user response:

If I understand you correctly, you need to move the 'next page' handling into the parse function. I also simply take the 'next page' button's href value and follow it.

# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from datetime import datetime
from scrapy.crawler import CrawlerProcess

dt_today = datetime.now().strftime('%Y%m%d')
file_name = dt_today + ' HPI Data'


# Create Spider class
class UneguiApartments(scrapy.Spider):
    name = "unegui_apts"
    allowed_domains = ["www.unegui.mn"]
    custom_settings = {
        "FEEDS": {f'{file_name}.csv': {'format': 'csv'}},
    }

    def start_requests(self):
        urls = ['https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/ulan-bator/']
        for url in urls:
            yield Request(url, self.parse)

    def parse(self, response, **kwargs):
        cards = response.xpath("//li[contains(@class,'announcement-container')]")

        # parse details
        for card in cards:
            name = card.xpath(".//a[@itemprop='name']/@content").extract_first()
            price = card.xpath(".//*[@itemprop='price']/@content").extract_first()
            rooms = card.xpath(".//div[contains(@class,'announcement-block__breadcrumbs')]/text()").extract_first().split('»')[0].strip()
            link = card.xpath(".//a[@itemprop='url']/@href").extract_first()
            date_block = card.xpath("normalize-space(.//div[contains(@class,'announcement-block__date')]/text())").extract_first().split(',')
            date = date_block[0].strip()
            city = date_block[1].strip()

            item = {'name': name,
                    'date': date,
                    'rooms': rooms,
                    'price': price,
                    'city': city,
                    }
            # follow absolute link to scrape deeper level
            yield response.follow(link, callback=self.parse_item, meta={'item': item})

        # handling pagination
        next_page = response.xpath('//a[contains(@class, "number-list-next js-page-filter number-list-line")]/@href').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_item(self, response):
        # retrieve previously scraped item between callbacks
        item = response.meta['item']

        # parse additional details
        list_span = response.xpath(".//span[contains(@class,'value-chars')]//text()").extract()
        list_a = response.xpath(".//a[contains(@class, 'value-chars')]//text()").extract()

        # get additional details from list of <span> tags, element by element
        floor_type = list_span[0].strip()
        num_balcony = list_span[1].strip()
        garage = list_span[2].strip()
        window_type = list_span[3].strip()
        door_type = list_span[4].strip()
        num_window = list_span[5].strip()

        # get additional details from list of <a> tags, element by element
        commission_year = list_a[0].strip()
        num_floors = list_a[1].strip()
        area_sqm = list_a[2].strip()
        floor = list_a[3].strip()
        leasing = list_a[4].strip()
        district = list_a[5].strip()
        address = list_a[6].strip()

        # update item with newly parsed data
        item.update({
            'district': district,
            'address': address,
            'area_sqm': area_sqm,
            'floor': floor,
            'commission_year': commission_year,
            'num_floors': num_floors,
            'num_windows': num_window,
            'num_balcony': num_balcony,
            'floor_type': floor_type,
            'window_type': window_type,
            'door_type': door_type,
            'garage': garage,
            'leasing': leasing
        })
        yield item


# main driver
if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(UneguiApartments)
    process.start()

This should work.
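As for speed: with the pagination in parse, Scrapy already downloads the discovered pages concurrently; how many at once is governed by its settings. A sketch of settings you could tune (the values are illustrative assumptions, not site-specific recommendations):

custom_settings = {
    "FEEDS": {f'{file_name}.csv': {'format': 'csv'}},
    "CONCURRENT_REQUESTS": 32,             # global limit, default 16
    "CONCURRENT_REQUESTS_PER_DOMAIN": 16,  # per-domain limit, default 8
    "DOWNLOAD_DELAY": 0,                   # seconds between requests, default 0
}

Note that following 'next page' links discovers pages one at a time, so raising the limits only helps once several pages are queued; yielding all 90 page URLs up front (as sketched in the question) saturates the limits immediately.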
