Using Scrapy to scrape multiple pages and multiple URLs


I have previously done a small project scraping a real estate website with BeautifulSoup, but it took a long time to scrape around 5,000 data points. I was thinking of learning multithreading and implementing it with BS, but someone told me that web crawling with Scrapy might be faster and easier. Additionally, I have switched from Spyder to PyCharm as my IDE. It is still a jarring experience, but I am trying to get used to it.

I have gone over the documentation once and followed some Scrapy scraping examples, but I am still having difficulties. I was planning to use my previously created BS scraping script as a base and create a new Scrapy project to scrape real estate data. However, I don't know how or where to start. Any and all help is much appreciated. Thank you.

Desired result: scrape multiple pages from multiple URLs using Scrapy, and scrape multiple values by entering each apartment listing link and getting data from its detail page.

Scrapy Script (so far):

# -*- coding: utf-8 -*-

# Import libraries
import scrapy

# assuming ApartmentsItem is defined in the project's items.py (see sketch below)
from ..items import ApartmentsItem


# Create Spider class
class UneguiApartmentSpider(scrapy.Spider):
    name = 'apartments'
    allowed_domains = ['www.unegui.mn']
    start_urls = [
        'https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/'
        ]
    # a bare class attribute named headers is ignored by Scrapy;
    # set the user agent through custom_settings instead
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
    }

    def parse(self, response):
        for listing in response.xpath("//div[@class='list-announcement']"):
            item = ApartmentsItem()
            item['name'] = listing.xpath('text()').getall()
            # the href attribute needs an @ in XPath; it sits on an <a> inside the block
            item['link'] = listing.xpath('.//a/@href').getall()
            # yield inside the loop so every listing is emitted, not only the last one
            yield item
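
For reference, the ApartmentsItem class the spider imports would live in the project's items.py. A minimal sketch, with the field names assumed from the parse() method above:

# items.py (assumed location; field names taken from the parse() method)
import scrapy


class ApartmentsItem(scrapy.Item):
    name = scrapy.Field()
    link = scrapy.Field()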

BeautifulSoup Script:

This script still has some issues I am trying to address, such as scraping the city and price. For example, the 4-bedroom apartments URL (/4-r/) produces an error or empty values because there are VIP listings.

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup as BS
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from timeit import default_timer as timer
import pandas as pd
import re
import csv

dt_today = datetime.today()
date_today = dt_today.strftime('%Y-%m-%d')
date_today2 = dt_today.strftime('%Y%m%d')
date_yesterday = (dt_today - relativedelta(days=1)).strftime('%Y-%m-%d')

def main():
    page = 0
    name = []
    date = []
    address = []
    district = []
    city = []
    price = []
    area_sqm = []
    rooms = []
    floor = []
    commission_year = []
    building_floors = []
    garage = []
    balcony = []
    windows = []
    window_type = []
    floor_type = []
    door_type = []
    leasing = []
    description = []
    link = []
    
    for i in range(5, 6):
        BASE = 'https://www.unegui.mn'
        URL = f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/{i}-r/?page='
        COLUMNS=['Name','Date','Address','District','City','Price','Area_sqm','Rooms','Floor','Commission_year',
                 'Building_floors','Garage', 'Balcony','Windows','Window_type','Floor_type','door_type','Leasing','Description','Link']
        
        with requests.Session() as session:
            while True:
                (r := session.get(f'{URL}{page + 1}')).raise_for_status()
                m = re.search(r'.*page=(\d+)$', r.url)
                if m and int(m.group(1)) == page:
                    break
                page += 1
                start = timer()
                print(f'Scraping {i} bedroom apartments page {page}')
                
                soup = BS(r.text, 'lxml')
                for tag in soup.findAll('div', class_='list-announcement-block'):
                    _name = tag.find('a', attrs={'itemprop': 'name'})
                    name.append(_name.get('content', 'N/A'))
                    if (_link := _name.get('href', None)):
                        link.append(f'{BASE}{_link}')
                        (_r := session.get(link[-1])).raise_for_status()
                        _spanlist = BS(_r.text, 'lxml').find_all('span', class_='value-chars')
                        floor_type.append(_spanlist[0].get_text().strip())
                        balcony.append(_spanlist[1].get_text().strip())
                        garage.append(_spanlist[2].get_text().strip())
                        window_type.append(_spanlist[3].get_text().strip())
                        door_type.append(_spanlist[4].get_text().strip())   
                        windows.append(_spanlist[5].get_text().strip())
                        
                        _alist = BS(_r.text, 'lxml').find_all('a', class_='value-chars')
                        commission_year.append(_alist[0].get_text().strip())
                        building_floors.append(_alist[1].get_text().strip())
                        area_sqm.append(_alist[2].get_text().strip())
                        floor.append(_alist[3].get_text().strip())
                        leasing.append(_alist[4].get_text().strip())
                        district.append(_alist[5].get_text().strip())
                        address.append(_alist[6].get_text().strip())
                        
                        rooms.append(tag.find('div', class_='announcement-block__breadcrumbs').get_text().split('»')[1].strip())
                        description.append(tag.find('div', class_='announcement-block__description').get_text().strip())
                        date.append(tag.find('div', class_='announcement-block__date').get_text().split(',')[0].strip())
                        city.append(tag.find('div', class_='announcement-block__date').get_text().split(',')[1].strip())         
                        # if ( _price := tag.find('div', class_='announcement-block__price _premium')) is None:
                        #     _price = tag.find('meta', attrs={'itemprop': 'price'})['content']
        
                        # price.append(_price)
                        end = timer()
                print(timedelta(seconds=end-start))
                
            df = pd.DataFrame(zip(name, date, address, district, city,
                                  price, area_sqm, rooms, floor, commission_year,
                                  building_floors, garage, balcony, windows, window_type,
                                  floor_type, door_type, leasing, description, link), columns=COLUMNS)

        # clean up values before returning; with an early return inside the with-block
        # these replacements would never run
        df['Date'] = df['Date'].replace('Өнөөдөр', date_today)
        df['Date'] = df['Date'].replace('Өчигдөр', date_yesterday)
        df['Area_sqm'] = df['Area_sqm'].str.replace('м²', '')
        df['Balcony'] = df['Balcony'].str.replace('тагттай', '')
        return df

if __name__ == '__main__':
    df = main()
    df.to_csv(f'{date_today2}HPD.csv', index=False)
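
For the price issue mentioned above, one possible direction is the fallback hinted at by the commented-out lines in the loop: try the premium/VIP price block first, then the itemprop meta tag. A minimal sketch (the extract_price helper is hypothetical, and the class names are taken from the commented-out code rather than re-verified against the live page):

# hypothetical helper: extract the price from one listing block <div>,
# falling back to the itemprop meta tag when the premium/VIP price block is absent
def extract_price(tag):
    premium = tag.find('div', class_='announcement-block__price _premium')
    if premium is not None:
        return premium.get_text().strip()
    meta = tag.find('meta', attrs={'itemprop': 'price'})
    return meta['content'] if meta is not None else 'N/A'

Calling price.append(extract_price(tag)) inside the loop would keep the price list the same length as the others, so the zip() feeding the DataFrame does not silently truncate rows.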

CodePudding user response:

Scrapy is an asynchronous, callback-driven framework.
The parse() method is the default callback for all start_urls. Every callback can yield either:

  • an item - which is sent to the pipelines, if there are any, and to the output
  • a request - a scrapy.Request object can be yielded to continue scraping

So if you have a multi-page scraper and you want to scrape all items, your logic would look something like this:

import scrapy
from scrapy import Spider


class MySpider(Spider):
    name = "pagination_spider"
    start_urls = ["first_page_url"]

    def parse(self, response):
        total_pages = ...  # find the total number of pages on the first page
        for page in range(2, total_pages + 1):
            url = base_url + str(page)  # form the page url
            yield scrapy.Request(url, callback=self.parse_page)
        # also don't forget to parse the first page itself
        yield from self.parse_page(response)

    def parse_page(self, response):
        """Parse listing items from a single pagination page."""
        item = {}  # parse the page response for listing items
        yield item

Here the spider will request the first page, then schedule requests for all remaining pages concurrently, meaning you can take full advantage of Scrapy's speed to get all listings.
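
To also enter each apartment listing and collect values from its detail page (the second part of the desired result), the pagination callback can yield one more request per listing link, handled by a second callback that builds the item. A minimal sketch, assuming hypothetical CSS selectors for unegui.mn that would need to be checked against the actual page:

import scrapy


class UneguiApartmentsSpider(scrapy.Spider):
    name = 'unegui_apartments'
    start_urls = ['https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/']

    def parse(self, response):
        # follow every listing link on the current page (selector is an assumption)
        for href in response.css('a.announcement-block__title-anchor::attr(href)').getall():
            yield response.follow(href, callback=self.parse_listing)

        # follow the next pagination page, if any (selector is an assumption)
        next_page = response.css('a.number-list-next::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_listing(self, response):
        # extract values from the listing detail page (selectors are assumptions)
        yield {
            'name': response.css('h1::text').get(default='').strip(),
            'price': response.css('meta[itemprop=price]::attr(content)').get(),
            'link': response.url,
        }

Every selector here is a placeholder; the structure (one callback for pagination and listing links, one for the detail page) is the part that carries over.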

CodePudding user response:

This is an example of scraping multiple URLs on the same website. For example, if the website is Amazon, the first URL is for the baby category and the second is for another category.

import time
import scrapy
from scrapy_splash import SplashRequest
from scrapy.crawler import CrawlerProcess


class spiders(scrapy.Spider):
    name = "try"
    start_urls = ["https://www.amazon.sg/gp/bestsellers/baby/ref=zg_bs_nav_0",'https://www.amazon.sg/gp/browse.html?node=6537678051&ref_=nav_em__home_appliances_0_2_4_4']

    def parse(self, response):
        for url in response.css('.mr-directory-item a::attr(href)').getall():  # loop over each href
            yield scrapy.Request(f'https://muckrack.com{url}', callback=self.parse_products,
                                 dont_filter=True)

    def parse_products(self, response):
        # these selectors are for a different website (muckrack.com), kept here only as an example
        full_name = response.css('.mr-font-family-2.top-none::text').get()
        Media_outlet = response.css('.mr-person-job-item a::text').get()

        yield {'Full Name': full_name, 'Media outlet': Media_outlet, 'URL': response.url}

If you want to run different parsing logic for each URL, you should use start_requests() with a separate callback per URL:

import scrapy
from scrapy_splash import SplashRequest
from scrapy.crawler import CrawlerProcess


class spiders(scrapy.Spider):
    name = "try"

    def start_requests(self):
        yield scrapy.Request('url1',callback=self.parse1)
        yield scrapy.Request('url2',callback=self.parse2)

    def parse1(self, response):
        for url in response.css('.mr-directory-item a::attr(href)').getall():  # loop over each href
            yield scrapy.Request(f'https://muckrack.com{url}', callback=self.parse_products,
                                 dont_filter=True)

    def parse2(self, response):
        for url in response.css('.mr-directory-item a::attr(href)').getall():  # loop over each href
            yield scrapy.Request(f'https://muckrack.com{url}', callback=self.parse_products,
                                 dont_filter=True)

    def parse_products(self, response):
        # these selectors are for a different website (muckrack.com), kept here only as an example
        full_name = response.css('.mr-font-family-2.top-none::text').get()
        Media_outlet = response.css('.mr-person-job-item a::text').get()
        # yield {'header': 'data'}
        yield {'Full Name': full_name, 'Media outlet': Media_outlet, 'URL': response.url}
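
Since CrawlerProcess is imported above but never used, here is a minimal sketch of how the spider could be run as a standalone script with it (the settings shown are assumptions, not part of the original answer):

from scrapy.crawler import CrawlerProcess


if __name__ == '__main__':
    process = CrawlerProcess(settings={
        'USER_AGENT': 'Mozilla/5.0',                   # assumed user agent string
        'FEEDS': {'output.json': {'format': 'json'}},  # write scraped items to a file
    })
    process.crawl(spiders)  # the spider class defined above
    process.start()         # blocks until the crawl finishes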