Home > Net >  Python Web Scraping - Web Page Resources
Python Web Scraping - Web Page Resources

Time:12-23

I'm trying to scrape a specific website, but the data is loaded dynamically. I found that the data is in JSON files, but I can't get the list of all the elements on the website, and I need all the pages.

  • How can I get the list of all the similar JSON files whose names start with a number?
  • How can I go through all the pages with this logic?

I'm not sure what to use. I have tried Scrapy, but waiting for the page to load makes it too complicated, so I wanted to know whether BeautifulSoup or another tool would give a faster response.

Edit: Adding Scrapy Code

  • I wrote this code in Scrapy, but I don't know how to get all the JSON from the page dynamically
# https://www.fincaraiz.com.co/_next/data/build/proyecto-de-vivienda/altos-del-eden/el-eden/barranquilla/7109201.json?title=altos-del-eden&location1=el-eden&location2=barranquilla&code=7109201

import logging
import scrapy
from scrapy_playwright.page import PageMethod
import json

# scrapy crawl fincaraiz-home -O output-home.json
class PwspiderSpider(scrapy.Spider):
    """Scrape listing cards from the rendered fincaraiz.com.co search page.

    The start URL is fetched through scrapy-playwright, which waits for the
    pagination button before handing the rendered HTML to ``parse``.  Each
    child ``div`` of ``#listingContainer`` becomes one item whose values are
    the raw lists returned by the XPath queries in ``field_xpaths``.
    """

    name = "fincaraiz-home"
    base_url = "https://www.fincaraiz.com.co"
    build_url = "https://www.fincaraiz.com.co/_next/data/build"

    # Output field -> XPath, evaluated relative to one listing card.
    # Insertion order is the key order of every yielded item.
    field_xpaths = {
        "link": "article/a/@href",
        "tipo_anuncio": "article/a/ul/li[1]/div/span/text()",
        "tipo_vendedor": "article/a/ul/li[2]/div/span/text()",
        "valor": "article/a/div/section/div[1]/span[1]/b/text()",
        "area": "article/a/div/section/div[2]/span[1]/text()",
        "habitaciones": "article/a/div/section/div[2]/span[3]/text()",
        "banos": "article/a/div/section/div[2]/span[5]/text()",
        "parqueadero": "article/a/div/section/div[2]/span[7]/text()",
        "ubicacion": "article/a/div/section/div[3]/div/span/text()",
        "imagen": "article/a/figure/img/@src",
        "tipo_inmueble": "article/a/div/footer/div/span/b/text()",
        "inmobiliaria": "article/a/div/footer/div/div/div",
    }

    def start_requests(self):
        """Yield the single Playwright-rendered request for the search page."""
        yield scrapy.Request(
            "https://www.fincaraiz.com.co/finca-raiz/venta/antioquia",
            meta=dict(
                playwright=True,
                playwright_include_page=True,
                playwright_page_methods=[
                    # PageMethod("wait_for_selector", 'div[id="listingContainer"]')
                    PageMethod("wait_for_selector", 'button:has-text("1")')
                ],
            ),
            errback=self.errback,
        )

    async def parse(self, response):
        """Yield one dict per listing card found in the rendered page.

        Every value is the list returned by ``.extract()`` for the matching
        XPath, so a field that is absent from a card yields an empty list.
        """
        # ``playwright_include_page=True`` hands the live page to the callback.
        # The rendered HTML is already in ``response``, so release the page
        # right away instead of leaking one browser page per request.
        page = response.meta.get("playwright_page")
        if page is not None:
            await page.close()

        for anuncio in response.xpath("//div[@id='listingContainer']/div"):
            # Unfinished attempt: follow each listing to its Next.js JSON document.
            # if anuncio.xpath("article/a/@href").extract():
            #     yield scrapy.Request(
            #         self.build_url + anuncio.xpath("article/a/@href").extract()[0] + ".json",
            #         callback=self.parse_json,
            #         errback=self.errback,
            #     )
            yield {
                field: anuncio.xpath(xpath).extract()
                for field, xpath in self.field_xpaths.items()
            }

    # async def parse_json(self, response):
    #     yield json.loads(response.text)

    async def errback(self, failure):
        """Close the Playwright page of a failed request and log the failure."""
        # The page may be missing when the failure happened before it was created.
        page = failure.request.meta.get("playwright_page")
        if page is not None:
            await page.close()
        self.logger.error(
            "Handling failure in errback, request=%r, exception=%r", failure.request, failure.value
        )

CodePudding user response:

Using playwright is not the way to go on this site. You should instead use their public search API at https://api.fincaraiz.com.co/document/api/1.0/listing/search

Here is an example of how you can make POST requests to the api and get all of the information in json responses very quickly.

import scrapy
import json

# JSON body template for the listing search endpoint.
# "filter" restricts results to apartments for sale in Barranquilla;
# "fields.include" lists the attributes requested for each hit;
# "fields.limit" / "fields.offset" are the paging values (the spider below
# overwrites "offset" for each request).
payload = {"filter":{"offer":{"slug":["sell"]},"property_type":{"slug":["apartment"]},"locations":{"cities":{"slug":["colombia-atlántico-5700003-barranquilla"]}}},"fields":{"exclude":[],"facets":[],"include":["area","baths.id","baths.name","baths.slug","client.client_type","client.company_name","client.first_name","client.fr_client_id","client.last_name","client.logo.full_size","garages.name","is_new","locations.cities.fr_place_id","locations.cities.name","locations.cities.slug","locations.countries.fr_place_id","locations.countries.name","locations.countries.slug","locations.groups.name","locations.groups.slug","locations.groups.subgroups.name","locations.groups.subgroups.slug","locations.neighbourhoods.fr_place_id","locations.neighbourhoods.name","locations.neighbourhoods.slug","locations.states.fr_place_id","locations.states.name","locations.states.slug","locations.location_point","max_area","max_price","media.photos.list.image.full_size","media.photos.list.is_main","media.videos.list.is_main","media.videos.list.video","media.logo.full_size","min_area","min_price","offer.name","price","products.configuration.tag_id","products.configuration.tag_name","products.label","products.name","products.slug","property_id","property_type.name","fr_property_id","fr_parent_property_id","rooms.id","rooms.name","rooms.slug","stratum.name","title"],"limit":25,"offset":0,"ordering":[],"platform":40,"with_algorithm":True}}

class PwspiderSpider(scrapy.Spider):
    """Page through the fincaraiz search API and yield every returned hit.

    Each request POSTs the module-level ``payload`` template with its
    ``fields.offset`` advanced by one page; the page size is read from the
    template's ``fields.limit`` so the two can never drift apart.
    """

    name = "fincaraiz-home"
    search_url = "https://api.fincaraiz.com.co/document/api/1.0/listing/search"
    # Number of result pages to request; override on a subclass to fetch more.
    max_pages = 20

    def start_requests(self):
        """Yield one POST request per result page."""
        page_size = payload["fields"]["limit"]
        for page_index in range(self.max_pages):
            # Build a per-request copy instead of mutating the shared template.
            # "offset" already exists in the template, so its position in the
            # serialized JSON is unchanged.
            fields = {**payload["fields"], "offset": page_index * page_size}
            body = json.dumps({**payload, "fields": fields})
            yield scrapy.Request(
                self.search_url,
                method="POST",
                body=body,
                headers={"content-type": "application/json"},
            )

    def parse(self, response):
        """Yield ``{"item": hit}`` for each entry under ``hits.hits``.

        A response without a ``hits`` section yields nothing instead of
        raising ``KeyError``.
        """
        data = response.json()
        for item in data.get("hits", {}).get("hits", []):
            yield {"item": item}

This code generates 20 pages with 25 results per page in about 3 seconds, and each of the items it yields has all of the information you are trying to extract with playwright and looks like this.

2022-12-21 17:38:01 [scrapy.core.scraper] DEBUG: Scraped from <200 https://api.fincaraiz.com.co/document/api/1.0/listing/search>
{'item': {'_index': 'fr-site-listing', '_type': '_doc', '_id': 'ac5fe39b-fb51-4702-a248-c23cf358ea17', '_score': 49.7314, '_source': {'listing': {'area': '53.0', 'rooms': {'name': '3', 'id': 3, 'slug': 'ROOM_3'},
 'max_area': '0', 'is_new': True, 'media': {'logo': {'full_size': 'https://s3.amazonaws.com/imagenes.fincaraiz.com.co/FC_COL/2021/11/19/6857997/proyecto-nuevo-venta-atlantico-barranquilla-501212856_m.jpg'}, 'vide
os': [{'list': [{'is_main': False, 'video': 'https://www.youtube.com/embed/NZMDh5SQy4w'}]}], 'photos': [{'list': [{'image': {'full_size': 'https://s3.amazonaws.com/imagenesprof.fincaraiz.com.co/OVFR_COL/2022/1/18
/3427045_252_14.jpg'}, 'is_main': True}, {'image': {'full_size': 'https://s3.amazonaws.com/imagenesprof.fincaraiz.com.co/OVFR_COL/2022/1/18/3427045_192_15.jpg'}, 'is_main': False}, {'image': {'full_size': 'https:
//s3.amazonaws.com/imagenesprof.fincaraiz.com.co/OVFR_COL/2022/1/18/3427045_710_19.jpg'}, 'is_main': False}, {'image': {'full_size': 'https://s3.amazonaws.com/imagenesprof.fincaraiz.com.co/OVFR_COL/2022/1/18/3427
045_825_17.jpg'}, 'is_main': False}, {'image': {'full_size': 'https://s3.amazonaws.com/imagenesprof.fincaraiz.com.co/OVFR_COL/2022/1/18/3427045_584_16.jpg'}, 'is_main': False}, {'image': {'full_size': 'https://s3
.amazonaws.com/imagenesprof.fincaraiz.com.co/OVFR_COL/2022/1/18/3427045_311_18.jpg'}, 'is_main': False}, {'image': {'full_size': 'https://s3.amazonaws.com/imagenesprof.fincaraiz.com.co/OVFR_COL/2022/1/18/3427045_
825_21.jpg'}, 'is_main': False}, {'image': {'full_size': 'https://s3.amazonaws.com/imagenesprof.fincaraiz.com.co/OVFR_COL/2022/1/18/3427045_512_20.jpg'}, 'is_main': False}, {'image': {'full_size': 'https://s3.ama
zonaws.com/imagenesprof.fincaraiz.com.co/OVFR_COL/2022/1/18/3427045_855_22.jpg'}, 'is_main': False}, {'image': {'full_size': 'https://s3.amazonaws.com/imagenesprof.fincaraiz.com.co/OVFR_COL/2022/1/18/3427045_437_
12.jpg'}, 'is_main': False}, {'image': {'full_size': 'https://s3.amazonaws.com/imagenesprof.fincaraiz.com.co/OVFR_COL/2022/1/18/3427045_268_13.jpg'}, 'is_main': False}, {'image': {'full_size': 'https://s3.amazona
ws.com/imagenesprof.fincaraiz.com.co/OVFR_COL/2022/1/18/3427045_232_11.jpg'}, 'is_main': False}, {'image': {'full_size': 'https://s3.amazonaws.com/imagenesprof.fincaraiz.com.co/OVFR_COL/2022/1/18/3427045_516_9.jp
g'}, 'is_main': False}, {'image': {'full_size': 'https://s3.amazonaws.com/imagenesprof.fincaraiz.com.co/OVFR_COL/2022/1/18/3427045_319_10.jpg'}, 'is_main': False}]}]}, 'title': 'Barloa  Trivento', 'property_id':
'ac5fe39b-fb51-4702-a248-c23cf358ea17', 'stratum': {'name': 'Estrato 2'}, 'offer': [{'name': 'Venta'}], 'garages': {'name': 'Sin especificar'}, 'baths': {'name': '1', 'id': 1, 'slug': 'BATH_1'}, 'max_price': '0',
 'min_price': '0', 'fr_parent_property_id': 6857997, 'price': '127500000.0', 'min_area': '0', 'client': {'company_name': 'CONSTRUCTORA BOLIVAR', 'logo': {'full_size': 'https://s3.amazonaws.com/imagenesprof.fincar
aiz.com.co/OVFR_COL/2015/11/18/201511181089RHXMDSJYOFULAQGWMCRIXODTJZOFULBQHWN.jpg'}, 'last_name': '', 'client_type': 'BUILDER', 'first_name': '', 'fr_client_id': 28249}, 'property_type': [{'name': 'Apartamento'}
], 'locations': {'neighbourhoods': [{'fr_place_id': 0, 'name': 'Caribe Verde', 'slug': 'colombia-atlantico-barranquilla-0-caribe-verde'}, {'fr_place_id': 0, 'name': 'A.s.d.', 'slug': ['neighbourhood-colombia-08-0
01-000190']}], 'cities': [{'fr_place_id': 5700003, 'name': 'Barranquilla', 'slug': 'colombia-atlántico-5700003-barranquilla'}, {'fr_place_id': 5700003, 'name': 'Barranquilla', 'slug': ['city-colombia-08-001', 'co
lombia-atlántico-5700003-barranquilla']}], 'location_point': 'POINT (-74.8502426147461 10.95703411102295)', 'groups': [{'subgroups': {'name': 'Zona Sur Occidente', 'slug': 'colombia-atlantico-barranquilla-5700104
-zona-sur-occidente'}, 'name': 'Zonas', 'slug': 'zonas'}], 'countries': [{'fr_place_id': 1, 'name': 'Colombia', 'slug': 'colombia'}, {'fr_place_id': 1, 'name': 'Colombia', 'slug': ['country-48-colombia', 'colombi
a']}], 'states': [{'fr_place_id': 57, 'name': 'Atlántico', 'slug': 'colombia-atlántico'}, {'fr_place_id': 57, 'name': 'Atlantico', 'slug': ['state-colombia-08-atlantico', 'colombia-atlántico']}]}, 'fr_property_id
': 6858014}}}}
  • Related