Python BeautifulSoup Scraper - apply function to each <li> element in <ol> on page-CodePudding

We are scraping Billboard's Hot 100 list (https://www.billboard.com/charts/hot-100/2021-10-30) and have some decent code, but we are struggling to finish it up:

from bs4 import BeautifulSoup 
import requests
import pandas as pd

def CleanBullet(bullet):
    """Turn one chart ``<li>`` element into a single-row DataFrame.

    Args:
        bullet: One chart list item. It must offer
            ``find("span", class_=...)`` returning an object that has
            ``get_text()`` (for example a BeautifulSoup ``Tag``).

    Returns:
        A ``pandas.DataFrame`` with exactly one row and the columns
        ``rank``, ``song``, ``artist``, ``last_week``, ``peak`` and
        ``weeks_on``; every value is a string.

    Raises:
        AttributeError: If ``find`` returns ``None`` for one of the spans.
    """
    def span_text(css_class):
        # Raw text of the first <span> carrying ``css_class``.
        return bullet.find("span", class_=css_class).get_text()

    # The rank text apparently looks like "\n1\n\nRising\n" (number, then a
    # trend label). str.strip('Rising') strips a *set of characters*, which
    # leaves the newlines behind and mangles other labels, so keep only the
    # first whitespace-delimited token instead.
    rank_tokens = span_text("chart-element__rank").split()

    data = {
        'rank': rank_tokens[0] if rank_tokens else '',
        'song': span_text("chart-element__information__song").strip('\n'),
        'artist': span_text("chart-element__information__artist").strip('\n'),
        # NOTE: these strip() calls also remove a character set, not a
        # suffix. That is harmless only while the leading value consists of
        # characters outside the label (digits or '-').
        'last_week': span_text("text--last").strip(' Last Week'),
        'peak': span_text("text--peak").strip(' Peak Rank'),
        'weeks_on': span_text("text--week").strip(' Weeks on Chart'),
    }

    # DataFrame.append was removed in pandas 2.0; build the row directly.
    return pd.DataFrame([data])


# Download the chart page and parse the first entry of the chart list.
base_url = "https://www.billboard.com/charts/hot-100/2021-10-30"
# Without a timeout, requests can wait forever on a stalled connection.
response = requests.get(base_url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
response.raise_for_status()
web_page = response.text
soup = BeautifulSoup(web_page, "html.parser")
# Every <li> inside the chart's <ol> is one chart entry.
full_table = soup.find("ol", class_="chart-list__elements").find_all("li")

df1 = CleanBullet(full_table[0])
# Bare expression: displays the frame in a notebook/REPL, no effect in a script.
df1

How can we:

apply this function over each of the 100 elements in full_table, resulting in a single dataframe with 100 rows?
remove the \n in the rank column, since strip('\n') is seemingly not working...

CodePudding user response：

How about this?

from bs4 import BeautifulSoup 
import requests
import pandas as pd

def CleanBullet(bullet):
    """Convert a sequence of chart ``<li>`` elements into one DataFrame.

    Args:
        bullet: Iterable of chart list items. Each item must offer
            ``find("span", class_=...)`` returning an object that has
            ``get_text()`` (for example BeautifulSoup ``Tag`` objects).

    Returns:
        A ``pandas.DataFrame`` with one row per item, in input order, and
        the columns ``rank``, ``song``, ``artist``, ``last_week``, ``peak``
        and ``weeks_on``; every value is a string. An empty input yields an
        empty frame that still has these columns.

    Raises:
        AttributeError: If ``find`` returns ``None`` for one of the spans.
    """
    columns = ['rank', 'song', 'artist', 'last_week', 'peak', 'weeks_on']

    def span_text(item, css_class):
        # Raw text of the first <span> of ``item`` carrying ``css_class``.
        return item.find("span", class_=css_class).get_text()

    rows = []
    for item in bullet:
        # The rank text apparently looks like "\n1\n\nRising\n". Chained
        # str.strip calls remove character sets, which mangles trend labels
        # other than "Rising", so keep only the first whitespace-delimited
        # token instead.
        rank_tokens = span_text(item, "chart-element__rank").split()
        rows.append({
            'rank': rank_tokens[0] if rank_tokens else '',
            'song': span_text(item, "chart-element__information__song").strip('\n'),
            'artist': span_text(item, "chart-element__information__artist").strip('\n'),
            # NOTE: these strip() calls also remove a character set, not a
            # suffix. That is harmless only while the leading value consists
            # of characters outside the label (digits or '-').
            'last_week': span_text(item, "text--last").strip(' Last Week'),
            'peak': span_text(item, "text--peak").strip(' Peak Rank'),
            'weeks_on': span_text(item, "text--week").strip(' Weeks on Chart'),
        })

    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call; collect the rows first and build the frame once.
    return pd.DataFrame(rows, columns=columns)


# Download the chart page and convert every chart entry into one DataFrame.
base_url = "https://www.billboard.com/charts/hot-100/2021-10-30"
# verify=False was dropped: it disables TLS certificate checking and exposes
# the request to man-in-the-middle tampering. The timeout keeps the script
# from hanging forever on a stalled connection.
response = requests.get(base_url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
response.raise_for_status()
web_page = response.text
soup = BeautifulSoup(web_page, "html.parser")
# Every <li> inside the chart's <ol> is one chart entry.
full_table = soup.find("ol", class_="chart-list__elements").find_all("li")

df1 = CleanBullet(full_table)

print(df1)

CodePudding user response：

I would probably just extract all the data from the JSON embedded in the page's data-charts attribute, which houses all that info. Use a loop to generate each dictionary and convert the result to a DataFrame.

import requests
import pandas as pd
import re
import json
import html

# Fetch the chart page; the timeout keeps a stalled connection from hanging
# the script, and raise_for_status() stops us from parsing an error page.
r = requests.get('https://www.billboard.com/charts/hot-100/2021-10-30',
                 timeout=30)
r.raise_for_status()

# The chart data is HTML-escaped JSON stored in a data-charts="..." attribute.
match = re.search(r'data-charts="(.*)"', r.text)
if match is None:
    # Give a clear error instead of AttributeError on None.group().
    raise ValueError('data-charts attribute not found in the page')
data = json.loads(html.unescape(match.group(1)))

# One row per chart entry.
df = pd.DataFrame([
    {'rank': i['rank'],
     'song': i['title'],
     'artist': i['artist_name'],
     # Drop any fractional part (e.g. "3.0" -> "3"); presumably the value
     # can arrive as a float -- verify against the actual payload.
     'last_week': str(i['history']['last_week']).split('.')[0],
     'peak': i['history']['peak_rank'],
     'weeks_on': i['history']['weeks_on_chart']} for i in data
])
# Optional: persist the result.
# df.to_csv('top100.csv', index = False)