I am new to scraping and need your help. In the first part of the scrape I am getting park names with details, including the links (URLs) to the park pages. I now want to get the phone numbers from those scraped URLs and show everything together.
Thanks in advance.
from bs4 import BeautifulSoup
import requests
import re
def get_parknames():
    """Scrape the USA trampoline-park listing and print, for each park,
    its name, state, country, detail-page link and phone number(s).

    Output goes to stdout; nothing is returned.
    """
    html_text = requests.get('http://www.jump-parks.com/en/trampoline-parks/usa/').text
    soup = BeautifulSoup(html_text, 'lxml')
    parks = soup.find_all('div', class_='grid__item')
    for park in parks:
        park_name = park.find('h3', class_='card__title').text
        # BUG FIX: the original printed the whole Tag object; extract .text
        # (guarded, since the span may be missing on some cards).
        state_tag = park.find('span', class_="address__country_long")
        state = state_tag.text if state_tag else ''
        country = park.find('span', {'itemprop': 'addressCountry'}).text
        link = park.find('a', attrs={'href': re.compile("^https://")})
        if link is None:
            continue  # card without an absolute link -- nothing to follow
        url = link['href']
        # BUG FIX: requests.get() needs the href string, not the bs4 Tag,
        # and the response's .text (not the Response object) must be parsed.
        soup2 = BeautifulSoup(requests.get(url).text, 'lxml')
        # BUG FIX: search the detail page (soup2), not the listing page (soup).
        phones = soup2.find_all('div', class_='single-meta')
        for phone in phones:
            # NOTE(review): assumes phone anchors use tel: hrefs (the original
            # re.compile("") matched every anchor) -- confirm against the site.
            phone_link = phone.find('a', attrs={'href': re.compile("^tel:")})
            if phone_link is None:
                continue
            # BUG FIX: print the anchor's text, not the Tag's repr.
            phone_number = phone_link.text.strip()
            print(f'''
Park Name: {park_name}
State: {state}
Country: {country}
Link: {url}
Phone: {phone_number}
''')
# Run the scraper only when executed as a script (not when imported).
if __name__ == '__main__':
    get_parknames()
CodePudding user response:
The data you see is loaded with JavaScript from a different URL. To get all the pages you can use the following example:
import json
import requests
from bs4 import BeautifulSoup
api_url = "https://www.jump-parks.com/en/wp-json/facetwp/v1/refresh"

# Request body mirroring the FacetWP AJAX call the page's JavaScript makes;
# only data.paged changes between requests.
payload = {
    "action": "facetwp_refresh",
    "data": {
        "extras": {"sort": "default"},
        "facets": {"listings_counts": [], "listings_pager": []},
        "first_load": 0,
        "frozen_facets": {},
        "http_params": {
            "archive_args": {"taxonomy": "job_listing_category", "term": "usa"},
            "get": [],
            "uri": "en/trampoline-parks/usa",
            "url_vars": [],
        },
        "is_bfcache": 1,
        "paged": 1,
        "soft_refresh": 1,
        "template": "listings",
    },
}

for page_number in range(1, 5):  # <-- increase number of pages here
    payload["data"]["paged"] = page_number
    data = requests.post(api_url, json=payload).json()
    soup = BeautifulSoup(data["template"], "html.parser")
    # uncomment the next line to inspect the full JSON response:
    # print(json.dumps(data, indent=4))
    for article in soup.select("article"):
        park_name = article.h3.text
        park_link = article.a["href"]
        park_state = article.select_one('[itemprop="addressRegion"]').text
        print("{:<50} {:<15} {}".format(park_name, park_state, park_link))
Prints:
Above All Trampoline Park Liberty Missouri https://www.jump-parks.com/en/trampoline-park/above-all-trampoline-park-liberty/
Adrenaline Indoor Adventure Park LLC Fishers Indiana https://www.jump-parks.com/en/trampoline-park/adrenaline-indoor-adventure-park-llc-fishers/
Adventure Action Park Knoxville Knoxville Tennessee https://www.jump-parks.com/en/trampoline-park/adventure-action-park-knoxville-knoxville/
Adventure Air Sports Kennesaw Georgia https://www.jump-parks.com/en/trampoline-park/adventure-air-sports-kennesaw/
Adventure Air Sports Rock Hill South Carolina https://www.jump-parks.com/en/trampoline-park/adventure-air-sports-rock-hill/
Aerosports Trampoline Parks Corona California https://www.jump-parks.com/en/trampoline-park/aerosports-trampoline-parks-corona/
Aerosports Trampoline Parks Fresno California https://www.jump-parks.com/en/trampoline-park/aerosports-trampoline-parks-fresno/
Aerosports Trampoline Parks Murrieta California https://www.jump-parks.com/en/trampoline-park/aerosports-trampoline-parks-murrieta/
Air Insanity Indoor Trampoline Park Rochester Minnesota https://www.jump-parks.com/en/trampoline-park/air-insanity-indoor-trampoline-park-rochester/
... and so on.
EDIT: To get phone numbers:
import json
import requests
from bs4 import BeautifulSoup
api_url = "https://www.jump-parks.com/en/wp-json/facetwp/v1/refresh"

# FacetWP refresh payload: the listing grid is rendered client-side, so we
# POST the same JSON the page's JavaScript sends and parse the returned HTML.
payload = {
    "action": "facetwp_refresh",
    "data": {
        "extras": {"sort": "default"},
        "facets": {"listings_counts": [], "listings_pager": []},
        "first_load": 0,
        "frozen_facets": {},
        "http_params": {
            "archive_args": {"taxonomy": "job_listing_category", "term": "usa"},
            "get": [],
            "uri": "en/trampoline-parks/usa",
            "url_vars": [],
        },
        "is_bfcache": 1,
        "paged": 1,
        "soft_refresh": 1,
        "template": "listings",
    },
}

# First pass: collect (name, state, link) for every park on every page.
all_data = []
for page in range(1, 5):  # <-- increase number of pages here
    print(f"Page {page}")
    payload["data"]["paged"] = page
    data = requests.post(api_url, json=payload).json()
    soup = BeautifulSoup(data["template"], "html.parser")
    # to print all returned data uncomment next line:
    # print(json.dumps(data, indent=4))
    for article in soup.select("article"):
        name = article.h3.text
        link = article.a["href"]
        state = article.select_one('[itemprop="addressRegion"]').text
        all_data.append((name, state, link))

# Second pass: visit each park's detail page and pull the phone number.
for name, state, link in all_data:
    soup = BeautifulSoup(requests.get(link).content, "html.parser")
    tel = soup.select_one('[itemprop="telephone"]')
    # Not every park page necessarily lists a phone; guard against None
    # instead of crashing with AttributeError.
    phone = tel.text if tel else "N/A"
    # BUG FIX: the original format string had only three placeholders, so
    # `link` was silently dropped by str.format; add a fourth slot for it.
    print("{:<50} {:<15} {:<15} {}".format(name, state, phone, link))