Home > Blockchain >  Scraping scraped url's (nested)
Scraping scraped url's (nested)

Time:06-10

I am new to scraping and need your help. The first part of my scraper gets the park names with their details, including the links (URLs) to the park pages. I want to get the phone numbers from those scraped URLs and show everything together.

Thanks in advance.

from bs4 import BeautifulSoup
import requests
import re

def get_parknames():
    """Print name, state, country, link and phone number for each listed park.

    Downloads the USA trampoline-park listing page, then follows each park's
    ``https://`` link to its detail page to look up the phone number. Cards
    that lack a title or a link are skipped; any other field that cannot be
    found is printed as ``None``.

    NOTE(review): if the listing is filled in client-side by JavaScript, the
    static page fetched here may contain few or no park cards - confirm.
    """
    # Compile once instead of on every loop iteration.
    https_href = re.compile("^https://")

    def text_or_none(tag):
        # find() returns None when nothing matches; avoid AttributeError on .text.
        return tag.text if tag is not None else None

    html_text = requests.get('http://www.jump-parks.com/en/trampoline-parks/usa/').text
    soup = BeautifulSoup(html_text, 'lxml')
    parks = soup.find_all('div', class_ = 'grid__item')

    for park in parks:

        title = park.find('h3', class_ = 'card__title')
        link = park.find('a', attrs={'href': https_href})
        if title is None or link is None:
            # Not a complete park card: there is no detail page to follow.
            continue

        park_name = title.text
        # Keep the text, not the Tag, so the printout shows the value itself.
        state = text_or_none(park.find('span', class_ = "address__country_long"))
        country = text_or_none(park.find('span', {'itemprop' : 'addressCountry'}))

        # requests.get needs the URL string (the href), and BeautifulSoup needs
        # the response body rather than the Response object.
        html_text2 = requests.get(link['href']).text
        soup2 = BeautifulSoup(html_text2, 'lxml')

        # Search the detail page (soup2), not the listing page, and reset the
        # value for every park so a previous park's number is never reused.
        # Assumes the park page marks the number with itemprop="telephone" -
        # TODO confirm against the page markup.
        phone_number = text_or_none(soup2.find(attrs={'itemprop': 'telephone'}))

        print(f'''
        Park Name: {park_name}
        State: {state}
        Country: {country}
        Link: {link['href']}
        Phone: {phone_number}
        ''')

# Run the scraper only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    get_parknames()

CodePudding user response:

The data you see is loaded with JavaScript from a different URL. To get all pages, you can use the next example:

import json
import requests
from bs4 import BeautifulSoup


# FacetWP endpoint that returns the listing HTML the page normally loads via JavaScript.
api_url = "https://www.jump-parks.com/en/wp-json/facetwp/v1/refresh"

# Request body, assembled piece by piece; key order matches what is sent as JSON.
http_params = {
    "archive_args": {"taxonomy": "job_listing_category", "term": "usa"},
    "get": [],
    "uri": "en/trampoline-parks/usa",
    "url_vars": [],
}

request_data = {
    "extras": {"sort": "default"},
    "facets": {"listings_counts": [], "listings_pager": []},
    "first_load": 0,
    "frozen_facets": {},
    "http_params": http_params,
    "is_bfcache": 1,
    "paged": 1,
    "soft_refresh": 1,
    "template": "listings",
}

payload = {"action": "facetwp_refresh", "data": request_data}


for page in range(1, 5):  # <-- increase number of pages here
    # Ask for the next page of results by updating the page number in place.
    payload["data"]["paged"] = page

    response = requests.post(api_url, json=payload)
    data = response.json()
    # The "template" field holds the rendered HTML of the listing cards.
    soup = BeautifulSoup(data["template"], "html.parser")

    # To inspect the full response, print json.dumps(data, indent=4) here.

    for article in soup.select("article"):
        name = article.h3.text
        link = article.a["href"]
        state = article.select_one('[itemprop="addressRegion"]').text

        row = "{:<50} {:<15} {}".format(name, state, link)
        print(row)

Prints:

Above All Trampoline Park Liberty                  Missouri        https://www.jump-parks.com/en/trampoline-park/above-all-trampoline-park-liberty/
Adrenaline Indoor Adventure Park LLC Fishers       Indiana         https://www.jump-parks.com/en/trampoline-park/adrenaline-indoor-adventure-park-llc-fishers/
Adventure Action Park Knoxville Knoxville          Tennessee       https://www.jump-parks.com/en/trampoline-park/adventure-action-park-knoxville-knoxville/
Adventure Air Sports Kennesaw                      Georgia         https://www.jump-parks.com/en/trampoline-park/adventure-air-sports-kennesaw/
Adventure Air Sports Rock Hill                     South Carolina  https://www.jump-parks.com/en/trampoline-park/adventure-air-sports-rock-hill/
Aerosports Trampoline Parks Corona                 California      https://www.jump-parks.com/en/trampoline-park/aerosports-trampoline-parks-corona/
Aerosports Trampoline Parks Fresno                 California      https://www.jump-parks.com/en/trampoline-park/aerosports-trampoline-parks-fresno/
Aerosports Trampoline Parks Murrieta               California      https://www.jump-parks.com/en/trampoline-park/aerosports-trampoline-parks-murrieta/
Air Insanity Indoor Trampoline Park Rochester      Minnesota       https://www.jump-parks.com/en/trampoline-park/air-insanity-indoor-trampoline-park-rochester/

... and so on.

EDIT: To get phone numbers:

import json
import requests
from bs4 import BeautifulSoup


# FacetWP endpoint that returns the listing HTML the page normally loads via JavaScript.
api_url = "https://www.jump-parks.com/en/wp-json/facetwp/v1/refresh"

payload = {
    "action": "facetwp_refresh",
    "data": {
        "extras": {"sort": "default"},
        "facets": {"listings_counts": [], "listings_pager": []},
        "first_load": 0,
        "frozen_facets": {},
        "http_params": {
            "archive_args": {"taxonomy": "job_listing_category", "term": "usa"},
            "get": [],
            "uri": "en/trampoline-parks/usa",
            "url_vars": [],
        },
        "is_bfcache": 1,
        "paged": 1,
        "soft_refresh": 1,
        "template": "listings",
    },
}


# Step 1: collect (name, state, link) for every park on the requested pages.
all_data = []
for page in range(1, 5):  # <-- increase number of pages here
    payload["data"]["paged"] = page
    print(f"Page {page}")

    data = requests.post(api_url, json=payload).json()
    # The "template" field holds the rendered HTML of the listing cards.
    soup = BeautifulSoup(data["template"], "html.parser")

    # To inspect the full response, print json.dumps(data, indent=4) here.

    for article in soup.select("article"):
        name, link, state = (
            article.h3.text,
            article.a["href"],
            article.select_one('[itemprop="addressRegion"]').text,
        )
        all_data.append((name, state, link))


# Step 2: visit each park page and read its phone number.
for name, state, link in all_data:
    soup = BeautifulSoup(requests.get(link).content, "html.parser")
    # select_one returns None when the page has no telephone element; fall
    # back to a placeholder instead of crashing on `.text`.
    phone_tag = soup.select_one('[itemprop="telephone"]')
    phone = phone_tag.text if phone_tag is not None else "N/A"
    # The link was passed to format() without a matching field and was
    # silently dropped; give it its own trailing column.
    print("{:<50} {:<15} {:<15} {}".format(name, state, phone, link))
  • Related