Python BeautifulSoup Scraper - apply function to each <li> element in <ol> on page


We are scraping Billboard's Hot 100 chart at https://www.billboard.com/charts/hot-100/2021-10-30 and have some decent code, but are struggling to finish it up:

from bs4 import BeautifulSoup 
import requests
import pandas as pd

def CleanBullet(bullet):
    this_rank = bullet.find("span", class_="chart-element__rank").get_text().strip('\n').strip('\n').strip('Rising')
    this_song = bullet.find("span", class_="chart-element__information__song").get_text().strip('\n')
    this_artist = bullet.find("span", class_="chart-element__information__artist").get_text().strip('\n')
    this_last_week = bullet.find("span", class_="text--last").get_text().strip(' Last Week')
    this_peak = bullet.find("span", class_="text--peak").get_text().strip(' Peak Rank')
    this_weeks_on = bullet.find("span", class_="text--week").get_text().strip(' Weeks on Chart')

    this_df = pd.DataFrame()
    data={
        'rank': this_rank,
        'song': this_song,
        'artist': this_artist,
        'last_week': this_last_week,
        'peak': this_peak,
        'weeks_on': this_weeks_on
    }
    this_df = this_df.append(data, ignore_index=True)
    return this_df


base_url = "https://www.billboard.com/charts/hot-100/2021-10-30"
response = requests.get(base_url)
web_page = response.text
soup = BeautifulSoup(web_page, "html.parser")    
full_table = soup.find("ol", class_="chart-list__elements").find_all("li")

df1 = CleanBullet(full_table[0])
df1

How can we:

  • apply this function over each of the 100 elements in full_table, resulting in a single dataframe with 100 rows?
  • remove the \n in the rank column, since strip('\n') is seemingly not working...

CodePudding user response:

How about this?

from bs4 import BeautifulSoup 
import requests
import pandas as pd

def CleanBullet(bullets):
    # Build one row (a dict) per <li> element and create the DataFrame in a
    # single call at the end; DataFrame.append is deprecated and was removed
    # in pandas 2.0, so collecting rows in a list is the safer pattern.
    rows = []
    for bullet in bullets:
        rows.append({
            'rank': bullet.find("span", class_="chart-element__rank").get_text().strip('\n').strip('Rising').strip('\n'),
            'song': bullet.find("span", class_="chart-element__information__song").get_text().strip('\n'),
            'artist': bullet.find("span", class_="chart-element__information__artist").get_text().strip('\n'),
            'last_week': bullet.find("span", class_="text--last").get_text().strip(' Last Week'),
            'peak': bullet.find("span", class_="text--peak").get_text().strip(' Peak Rank'),
            'weeks_on': bullet.find("span", class_="text--week").get_text().strip(' Weeks on Chart')
        })
    return pd.DataFrame(rows)


base_url = "https://www.billboard.com/charts/hot-100/2021-10-30"
response = requests.get(base_url, verify=False)
web_page = response.text
soup = BeautifulSoup(web_page, "html.parser")
full_table = soup.find("ol", class_="chart-list__elements").find_all("li")

df1 = CleanBullet(full_table)
print(df1)
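
An alternative that stays closer to the question's original framing (a minimal sketch, assuming the same class names and label text that appear in the code above): keep the function working on a single <li>, let get_text(strip=True) deal with the stray \n characters, and build one DataFrame from a list of per-row dicts:

from bs4 import BeautifulSoup
import requests
import pandas as pd

def clean_bullet(bullet):
    # One <li> in, one row (as a dict) out.
    def text(cls):
        # get_text(strip=True) trims the surrounding whitespace/newlines
        return bullet.find("span", class_=cls).get_text(strip=True)
    return {
        'rank': text("chart-element__rank").replace('Rising', ''),
        'song': text("chart-element__information__song"),
        'artist': text("chart-element__information__artist"),
        'last_week': text("text--last").replace('Last Week', '').strip(),
        'peak': text("text--peak").replace('Peak Rank', '').strip(),
        'weeks_on': text("text--week").replace('Weeks on Chart', '').strip(),
    }

page = requests.get("https://www.billboard.com/charts/hot-100/2021-10-30").text
soup = BeautifulSoup(page, "html.parser")
full_table = soup.find("ol", class_="chart-list__elements").find_all("li")

# Apply the per-element function to all 100 <li> elements, one DataFrame at the end
df = pd.DataFrame([clean_bullet(li) for li in full_table])

Either way you end up with a single 100-row DataFrame and no deprecated append() calls.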

CodePudding user response:

I would probably just extract the data from the JavaScript object that holds all of that info (it is embedded in the page as an HTML-escaped data-charts attribute). Use a loop/comprehension to generate one dictionary per entry and convert the result to a DataFrame.

import requests
import pandas as pd
import re
import json
import html

# The chart data is embedded in the page as HTML-escaped JSON inside a
# data-charts attribute; pull it out with a regex and parse it.
r = requests.get('https://www.billboard.com/charts/hot-100/2021-10-30')
data = json.loads(html.unescape(re.search(r'data-charts="(.*)"', r.text).group(1)))

df = pd.DataFrame(
    {'rank': i['rank'],
     'song': i['title'],
     'artist': i['artist_name'],
     # last_week comes back as a float (e.g. 95.0), so drop the trailing ".0"
     'last_week': str(i['history']['last_week']).split('.')[0],
     'peak': i['history']['peak_rank'],
     'weeks_on': i['history']['weeks_on_chart']} for i in data
)
# df.to_csv('top100.csv', index = False)
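
One caveat with the regex approach: if Billboard reshuffles the markup, re.search() returns None and .group(1) raises an AttributeError, and last_week can be missing for a brand-new entry. A slightly more defensive sketch, assuming the same data-charts attribute and JSON keys as above:

import requests
import pandas as pd
import re
import json
import html

r = requests.get('https://www.billboard.com/charts/hot-100/2021-10-30')
match = re.search(r'data-charts="(.*)"', r.text)
if match is None:
    raise RuntimeError('data-charts attribute not found - the page layout may have changed')

data = json.loads(html.unescape(match.group(1)))

def last_week(item):
    # last_week may be None for a new entry, or a float like 95.0 otherwise
    value = item['history'].get('last_week')
    return '' if value is None else str(value).split('.')[0]

df = pd.DataFrame([
    {'rank': i['rank'],
     'song': i['title'],
     'artist': i['artist_name'],
     'last_week': last_week(i),
     'peak': i['history']['peak_rank'],
     'weeks_on': i['history']['weeks_on_chart']} for i in data
])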