I am scraping shows from setlist.fm and am trying to add a separate key-value pair for each set of a show (set1, set2, encore), instead of having just one list of songs without any separation. What I cannot figure out is how to access the elements that state the set of the show and then append the songs that follow each one until the next set begins. Here is the HTML I am accessing: (HTML code from setlist.fm)
Currently, my JSON file looks like this:
{
"artist": "Sample Artist",
"day": 20,
"month": 1,
"songs": ["Song A","Song B","Song C"
],
"tour": "2000 U.S. Tour",
"venue": "Sample Venue, Atlanta, GA, USA",
"year": 2000
},
whereas I want it to look like this:
{
"artist": "Sample Artist",
"day": 20,
"month": 1,
"songs": ["Song A","Song B","Song C"
],
"set1": ["Song A"],
"set2": ["Song B"],
"encore":["Song C"],
"tour": "2000 U.S. Tour",
"venue": "Sample Venue, Atlanta, GA, USA",
"year": 2000
},
Here is the code I am using to generate the song list for the JSON, but I am not sure how to get the sets individually:
def getConcertData(i, url, concerts):
    """Scrape one concert page and store its record in ``concerts[i]``.

    Args:
        i: Key/index under which the finished record is stored.
        url: Address of the setlist page; handed to ``getSoup``.
        concerts: Container supporting item assignment that receives the
            record.

    Raises:
        IndexError: If one of the expected ``div`` containers is missing
            (presumably ``getSoup`` returns a BeautifulSoup document, whose
            ``find_all`` yields an empty result in that case -- confirm).
    """
    # NOTE(review): the snippet opened a ``try:`` here without showing any
    # handler; errors are left to propagate instead of guessing at one.
    soup = getSoup(url)
    dateBlock = soup.find_all("div", {"class": "dateBlock"})[0]
    infoContainer = soup.find_all("div", {"class": "infoContainer"})[0]
    headLineDiv = infoContainer.find_all("div", {"class": "setlistHeadline"})[0]
    setlistDiv = soup.find_all("div", {"class": "setlistList"})[0]

    # removed unrelated code for question
    # (presumably that code derives artist, year, month, day, venue and tour
    # from the blocks located above)

    # Every song link inside the setlist container, in page order.
    songs = [
        a.getText().strip()
        for a in setlistDiv.find_all("a", {"class": "songLabel"})
    ]

    # Progress line, e.g. "2000.01.20: Sample Venue, Atlanta, GA, USA".
    print(f"{year}.{str(month).zfill(2)}.{str(day).zfill(2)}: {venue}")

    # TODO: the per-set keys "set1", "set2" and "encore" are not populated yet.
    concerts[i] = {
        "artist": artist,
        "year": year,
        "month": month,
        "day": day,
        "venue": venue,
        "tour": tour,
        "songs": songs,
    }
CodePudding user response:
If I understand you correctly, you want to "group" songs to their sections:
import requests
from bs4 import BeautifulSoup
url = "https://www.setlist.fm/setlist/phish/2022/ruoff-home-mortgage-music-center-noblesville-in-3b4e5a7.html"

# A timeout keeps the script from hanging forever if the site never answers.
soup = BeautifulSoup(requests.get(url, timeout=30).content, "html.parser")

# Basic concert details read from the page.
out = {}
out["artist"] = soup.h1.a.get_text(strip=True)
out["month"] = soup.select_one(".month").text
out["day"] = soup.select_one(".day").text
out["year"] = soup.select_one(".year").text
out["venue"] = soup.select_one('a[href*="/venue/"]').text

# Walk the songs in page order: each one goes into the flat "songs" list and
# into the list keyed by the section heading that precedes it.
for li in soup.select(".setlistList li.song"):
    song_name = li.a.get_text(strip=True)
    out.setdefault("songs", []).append(song_name)

    # The closest earlier highlighted <li> carries the heading text
    # ("Set 1:", "Set 2:", "Encore:"); find_previous returns None when no
    # such element exists before this song.
    header = li.find_previous("li", class_="highlight")
    if header is None:
        # No heading precedes this song, so it is listed under "songs" only.
        continue

    # Drop the trailing colon so the key reads "Set 1" rather than "Set 1:".
    section = header.get_text(strip=True).strip(" :")
    out.setdefault(section, []).append(song_name)

print(out)
Prints:
{
"artist": "Phish",
"month": "Jun",
"day": "5",
"year": "2022",
"venue": "Ruoff Home Mortgage Music Center, Noblesville, IN, USA",
"songs": [
"While My Guitar Gently Weeps",
"My Soul",
"Rift",
"Horn",
"Wombat",
"Evolve",
"Guyute",
"Limb by Limb",
"Mercury",
"The Moma Dance",
"Sand",
"Sigma Oasis",
"Twenty Years Later",
"The Mango Song",
"Rise/Come Together",
"Free",
"Grind",
"Slave to the Traffic Light",
],
"Set 1": [
"While My Guitar Gently Weeps",
"My Soul",
"Rift",
"Horn",
"Wombat",
"Evolve",
"Guyute",
"Limb by Limb",
"Mercury",
"The Moma Dance",
],
"Set 2": [
"Sand",
"Sigma Oasis",
"Twenty Years Later",
"The Mango Song",
"Rise/Come Together",
"Free",
],
"Encore": ["Grind", "Slave to the Traffic Light"],
}