How to get the first line of an <ol> using Beautiful Soup and make it the JSON key for the rest of the list - CodePudding

I am trying to create a separate key-value pair for each set of a show (set1, set2, encore) scraped from setlist.fm, instead of just a single list of songs with no separation. What I cannot figure out is how to access the elements that name each set of the show and then append the songs that follow until the next set begins. Here is the HTML I am accessing: (screenshot of the HTML from setlist.fm)

Currently, my JSON file looks like this:

{

    "artist": "Sample Artist",
    "day": 20,
    "month": 1,
    "songs": ["Song A","Song B","Song C"
    ],
    "tour": "2000 U.S. Tour",
    "venue": "Sample Venue, Atlanta, GA, USA",
    "year": 2000
},

whereas I want it to look like this:

{
    "artist": "Sample Artist",
    "day": 20,
    "month": 1,
    "songs": ["Song A","Song B","Song C"
    ],
    "set1": ["Song A"],
    "set2": ["Song B"],
    "encore":["Song C"],
    "tour": "2000 U.S. Tour",
    "venue": "Sample Venue, Atlanta, GA, USA",
    "year": 2000
},

Here is the code I am using to generate the song list in the JSON, but I am not sure how to get the sets individually:

def getConcertData(i, url, concerts):
    """Scrape one setlist page and store its details in ``concerts[i]``.

    Args:
        i: Key or index under which the result dict is stored.
        url: Address of the setlist page; handed to ``getSoup``.
        concerts: Container supporting item assignment that receives the
            result dict.

    Raises:
        IndexError: If one of the expected ``div`` containers is missing,
            because each lookup takes element ``[0]`` of ``find_all``.
    """
    # NOTE(review): the excerpt opened a ``try:`` here but showed no handler,
    # which is a syntax error on its own. It is left out so that errors
    # propagate to the caller; restore the original handler if there was one.
    soup = getSoup(url)

    dateBlock = soup.find_all("div", {"class": "dateBlock"})[0]
    infoContainer = soup.find_all("div", {"class": "infoContainer"})[0]
    headLineDiv = infoContainer.find_all("div", {"class": "setlistHeadline"})[0]
    setlistDiv = soup.find_all("div", {"class": "setlistList"})[0]

    #removed unrelated code for question
    # NOTE(review): artist, year, month, day, venue and tour are presumably
    # assigned in the code omitted above -- confirm before running.

    # Every song title on the page, in page order, with no per-set grouping.
    songs = [
        a.getText().strip()
        for a in setlistDiv.find_all("a", {"class": "songLabel"})
    ]

    # Progress line in the form "YYYY.MM.DD: venue".
    print(str(year) + "." + str(month).zfill(2) + "." + str(day).zfill(2) + ": " + venue)

    data = {
        "artist": artist,
        "year": year,
        "month": month,
        "day": day,
        "venue": venue,
        "tour": tour,
        "songs": songs,
    }
    # TODO: add per-set keys ("set1", "set2", "encore") once the songs are
    # grouped by section.

    concerts[i] = data

CodePudding user response:

If I understand you correctly, you want to "group" songs to their sections:

import requests
from bs4 import BeautifulSoup


url = "https://www.setlist.fm/setlist/phish/2022/ruoff-home-mortgage-music-center-noblesville-in-3b4e5a7.html"
# A timeout keeps the script from hanging forever on an unresponsive server.
soup = BeautifulSoup(requests.get(url, timeout=30).content, "html.parser")


# Concert metadata, read from the page by CSS selector.
out = {}
out["artist"] = soup.h1.a.get_text(strip=True)
out["month"] = soup.select_one(".month").text
out["day"] = soup.select_one(".day").text
out["year"] = soup.select_one(".year").text
out["venue"] = soup.select_one('a[href*="/venue/"]').text

# Walk the songs in page order, adding each one to the flat "songs" list and
# to the list of the section it belongs to.
for li in soup.select(".setlistList li.song"):
    song_name = li.a.get_text(strip=True)
    out.setdefault("songs", []).append(song_name)

    # The nearest preceding highlighted <li> carries the section title
    # (e.g. "Set 1:", "Encore:"); the trailing colon and spaces are stripped.
    header = li.find_previous("li", class_="highlight")
    if header is None:
        # No section header precedes this song, so it stays in "songs" only
        # rather than raising AttributeError on None.
        continue

    section = header.get_text(strip=True).strip(" :")
    out.setdefault(section, []).append(song_name)

print(out)

Prints:

{
    "artist": "Phish",
    "month": "Jun",
    "day": "5",
    "year": "2022",
    "venue": "Ruoff Home Mortgage Music Center, Noblesville, IN, USA",
    "songs": [
        "While My Guitar Gently Weeps",
        "My Soul",
        "Rift",
        "Horn",
        "Wombat",
        "Evolve",
        "Guyute",
        "Limb by Limb",
        "Mercury",
        "The Moma Dance",
        "Sand",
        "Sigma Oasis",
        "Twenty Years Later",
        "The Mango Song",
        "Rise/Come Together",
        "Free",
        "Grind",
        "Slave to the Traffic Light",
    ],
    "Set 1": [
        "While My Guitar Gently Weeps",
        "My Soul",
        "Rift",
        "Horn",
        "Wombat",
        "Evolve",
        "Guyute",
        "Limb by Limb",
        "Mercury",
        "The Moma Dance",
    ],
    "Set 2": [
        "Sand",
        "Sigma Oasis",
        "Twenty Years Later",
        "The Mango Song",
        "Rise/Come Together",
        "Free",
    ],
    "Encore": ["Grind", "Slave to the Traffic Light"],
}