We are scraping Billboard's Hot 100 list (https://www.billboard.com/charts/hot-100/2021-10-30) and have some decent code, but we are struggling to finish it:
from bs4 import BeautifulSoup
import requests
import pandas as pd
def CleanBullet(bullet):
    """Extract one chart entry from a single list element into a DataFrame.

    Args:
        bullet: An element exposing ``find(name, class_=...)`` whose result
            provides ``get_text()`` (in this script, one ``<li>`` tag taken
            from ``full_table``).

    Returns:
        A one-row ``pandas.DataFrame`` with the string columns ``rank``,
        ``song``, ``artist``, ``last_week``, ``peak`` and ``weeks_on``.

    Raises:
        AttributeError: If one of the expected ``<span>`` elements is missing
            (``find`` returned ``None``).
    """
    whitespace = ' \t\r\n'

    def span_text(css_class, label_chars=''):
        # str.strip() treats its argument as a *set of characters*, not as a
        # substring. Adding whitespace to that set lets the strip continue
        # past a newline sitting between the value and its label, which is
        # why a stray '\n' used to survive in the rank column.
        # NOTE(review): only the label characters listed by the caller are
        # removed; confirm whether the rank span can carry other trend text.
        raw = bullet.find("span", class_=css_class).get_text()
        return raw.strip(label_chars + whitespace)

    # Read from the `bullet` argument (the original read an undefined global).
    data = {
        'rank': span_text("chart-element__rank", 'Rising'),
        'song': span_text("chart-element__information__song"),
        'artist': span_text("chart-element__information__artist"),
        'last_week': span_text("text--last", ' Last Week'),
        'peak': span_text("text--peak", ' Peak Rank'),
        'weeks_on': span_text("text--week", ' Weeks on Chart'),
    }
    # DataFrame.append was removed in pandas 2.0; build the row directly.
    return pd.DataFrame([data])
# Download the Hot 100 page for the chart week in the URL.
base_url = "https://www.billboard.com/charts/hot-100/2021-10-30"
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(base_url, timeout=30)
# Stop on an HTTP error instead of trying to parse an error page below.
response.raise_for_status()
web_page = response.text
soup = BeautifulSoup(web_page, "html.parser")
# find() returns None when the element is absent; report that explicitly
# rather than failing with an AttributeError on None.
chart_list = soup.find("ol", class_="chart-list__elements")
if chart_list is None:
    raise RuntimeError("could not find <ol class='chart-list__elements'> in " + base_url)
full_table = chart_list.find_all("li")
# Convert only the first list item; CleanBullet here handles one element.
df1 = CleanBullet(full_table[0])
# Bare expression: shows the frame in a notebook/REPL, no effect in a script.
df1
How can we:
- apply this function to each of the 100 elements in `full_table`, resulting in a single DataFrame with 100 rows?
- remove the `\n` in the rank column, since `strip('\n')` is seemingly not working?
CodePudding user response:
How about this?
from bs4 import BeautifulSoup
import requests
import pandas as pd
def CleanBullet(bullet):
    """Build a chart DataFrame with one row per list element.

    Args:
        bullet: A sequence of elements, each exposing
            ``find(name, class_=...)`` whose result provides ``get_text()``
            (in this script, the ``<li>`` tags in ``full_table``).

    Returns:
        A ``pandas.DataFrame`` with the string columns ``rank``, ``song``,
        ``artist``, ``last_week``, ``peak`` and ``weeks_on``, one row per
        element in input order. An empty input yields an empty frame that
        still carries those six columns.

    Raises:
        AttributeError: If an element lacks one of the expected ``<span>``
            tags (``find`` returned ``None``).
    """
    # (output column, CSS class of the <span>, label characters to strip)
    fields = (
        ('rank', "chart-element__rank", 'Rising'),
        ('song', "chart-element__information__song", ''),
        ('artist', "chart-element__information__artist", ''),
        ('last_week', "text--last", ' Last Week'),
        ('peak', "text--peak", ' Peak Rank'),
        ('weeks_on', "text--week", ' Weeks on Chart'),
    )
    # str.strip() removes a *set of characters*; including whitespace in the
    # set strips newlines and label characters in a single pass, replacing
    # the chained strip('\n') calls.
    whitespace = ' \t\r\n'

    rows = [
        {
            column: item.find("span", class_=css_class).get_text().strip(label_chars + whitespace)
            for column, css_class, label_chars in fields
        }
        for item in bullet
    ]
    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    return pd.DataFrame(rows, columns=[column for column, _, _ in fields])
# Download the Hot 100 page for the chart week in the URL.
base_url = "https://www.billboard.com/charts/hot-100/2021-10-30"
# TLS certificate verification stays at the requests default (enabled):
# verify=False would accept any certificate and emit InsecureRequestWarning.
# The timeout keeps the script from hanging on an unresponsive server.
response = requests.get(base_url, timeout=30)
# Stop on an HTTP error instead of trying to parse an error page below.
response.raise_for_status()
web_page = response.text
soup = BeautifulSoup(web_page, "html.parser")
# find() returns None when the element is absent; report that explicitly
# rather than failing with an AttributeError on None.
chart_list = soup.find("ol", class_="chart-list__elements")
if chart_list is None:
    raise RuntimeError("could not find <ol class='chart-list__elements'> in " + base_url)
full_table = chart_list.find_all("li")
# Pass the whole list: this CleanBullet loops over every element itself.
df1 = CleanBullet(full_table)
print(df1)
CodePudding user response:
I would probably just extract all the data from the JavaScript object housing all that info. Use a loop to generate each dictionary and convert to df.
import requests
import pandas as pd
import re
import json
import html
# Fetch the chart page; the timeout avoids hanging on an unresponsive server
# and raise_for_status() stops on an HTTP error before any parsing.
r = requests.get('https://www.billboard.com/charts/hot-100/2021-10-30', timeout=30)
r.raise_for_status()
# The chart data is read from the HTML-escaped JSON in the data-charts
# attribute. [^"]* stops at the attribute's closing quote, whereas a greedy
# .* would run on to the last quote on the same line.
match = re.search(r'data-charts="([^"]*)"', r.text)
if match is None:
    raise RuntimeError('data-charts attribute not found in the page')
data = json.loads(html.unescape(match.group(1)))
df = pd.DataFrame(
    {'rank': i['rank'],
     'song': i['title'],
     'artist': i['artist_name'],
     # Keep only the text before the first '.' of the value's string form
     # (presumably to drop a trailing ".0" -- confirm against the payload).
     'last_week': str(i['history']['last_week']).split('.')[0],
     'peak': i['history']['peak_rank'],
     'weeks_on': i['history']['weeks_on_chart']} for i in data
)
# df.to_csv('top100.csv', index = False)