Home > Software design >  Scrape Spotify using Python
Scrape Spotify using Python

Time:05-13

I'm getting the AttributeError: 'NoneType' object has no attribute 'find' error. After doing some research before posting here, a possible cause could be that Cloudflare is blocking my access to Spotify. What would be the workaround to this problem?

part of code looks like this:

dates=[]
urls=[]
final=[]

url = 'https://spotifycharts.com/regional/us/daily'
start_date = date(2022,3,1)
end_date = date(2022,4,30)
delta = end_date - start_date

# print(delta.days + 1)

# Build one 'YYYY-MM-DD' string per day, inclusive of both endpoints
# (delta.days is the gap between the dates, hence the + 1).
for i in range(delta.days + 1):
    day = start_date + timedelta(days=i)
    dates.append(day.strftime('%Y-%m-%d'))


def add_url():
    """Append one chart URL per date in `dates` to the global `urls` list."""
    # NOTE(review): the base URL has no trailing slash, so the generated
    # URLs look like '.../daily2022-03-01' — likely missing a '/'.
    for day_string in dates:
        urls.append(url + day_string)

add_url()

def song_scrape(x):
    """Scrape one chart page's rows into the global `final` list.

    NOTE(review): `x` (the page URL) is copied into `pg` but never used
    again; the table is read from the *global* `songs`, which is set by
    the fetch loop below. When the request is blocked (e.g. a 403 from
    Cloudflare), `soup.find("table", ...)` returns None, so `songs` is
    None and `songs.find("tbody")` raises the reported AttributeError.
    """
    pg = x
    for tr in songs.find("tbody").findAll("tr"):
        # Artist cell text is rendered as 'by <name>'; strip the prefix.
        artist = tr.find("td", {"class": "chart-table-track"}).find("span").text
        artist = artist.replace("by ", "").strip()

        title = tr.find("td", {"class": "chart-table-track"}).find("strong").text
        # The track link href ends with 'track/<id>'; keep only the id.
        songid = tr.find("td", {"class": "chart-table-image"}).find("a").get("href")
        songid = songid.split("track/")[1]
        # Chart date is whatever follows 'daily/' in the page URL.
        url_date = x.split("daily/")[1]
        final.append([title, artist, songid, url_date])


# Fetch every per-day chart page and scrape it.
# NOTE(review): no User-Agent header is sent, so the site can answer with
# 403 Forbidden; in that case the table lookup below yields None and
# song_scrape() crashes with the AttributeError described above.
for u in urls:
    read_pg= requests.get(u)
    sleep(2)
    # return read_pg.status_code
    soup= BeautifulSoup(read_pg.content, "html.parser")
    # May be None if the page was blocked or the markup changed.
    songs = soup.find("table", {"class": "chart-table"})
    song_scrape(u)


# Collect all scraped rows into a DataFrame and dump them to CSV.
final_df = pd.DataFrame(final, columns= ["Title", "Artist", "Song ID", "Chart Date"])
with open('spmooddata.csv', 'w') as f:
        final_df.to_csv(f, header= True, index=False)

CodePudding user response:

As I mentioned in my comment, you need to add certain code, as shown in this answer, to solve the 403 Forbidden error.

After making more changes to your code, I was able to get the data.

This is your modified and working code:

# Library/module imports
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from time import sleep
import pandas as pd

# Variables: date strings, per-day chart URLs, and scraped rows.
dates = []
urls = []
final = []
url = 'https://spotifycharts.com/regional/us/daily/'
start_date = datetime(2022, 3, 1)
end_date = datetime(2022, 3, 5)
delta = end_date - start_date

# print(delta.days + 1)

# One 'YYYY-MM-DD' string per day, inclusive of both endpoints
# (delta.days is the gap between the dates, hence the + 1).
for i in range(delta.days + 1):
  day = start_date + timedelta(days=i)
  dates.append(day.strftime('%Y-%m-%d'))

def add_url():
  """Append one chart URL per date in `dates` to the global `urls` list."""
  for day_string in dates:
    urls.append(url + day_string)

add_url()

def song_scrape(x, songs):
  """Extract one row per track from a chart-page table.

  Args:
    x: the page URL; the chart date is the part after 'daily/'.
    songs: parsed chart <table> element (BeautifulSoup Tag).

  Appends one [title, artist, songid, url_date] list per table row to the
  module-level `final` list.
  """
  # The chart date is the same for every row on this page; compute it once.
  url_date = x.split("daily/")[1]
  for tr in songs.find("tbody").findAll("tr"):
    track_cell = tr.find("td", {"class": "chart-table-track"})
    # Artist cell text is rendered as 'by <name>'; strip the prefix.
    artist = track_cell.find("span").text.replace("by ", "").strip()
    title = track_cell.find("strong").text
    # The track link href ends with 'track/<id>'; keep only the id.
    songid = tr.find("td", {"class": "chart-table-image"}).find("a").get("href")
    songid = songid.split("track/")[1]
    final.append([title, artist, songid, url_date])


# Avoid http 403 forbidden error with this code: 
# Source: https://stackoverflow.com/a/43590290/12511801 
# A browser-like User-Agent keeps the site from rejecting the request.
header = {
  "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
  "X-Requested-With": "XMLHttpRequest"
}

# Fetch every per-day chart page and scrape its table into `final`.
for u in urls:
  read_pg= requests.get(u, headers=header)
  sleep(2)  # be polite: pause between requests
  # return read_pg.status_code
  soup= BeautifulSoup(read_pg.text, "html.parser")

  #Using BeautifulSoup, we're getting the specific data from the HTML: 
  # There is only 1 table = which is the table with the data to extract:
  songs = soup.findAll("table")[0]

  # Call "song_scrape" function to retrieve the data from the table: 
  song_scrape(u, songs)

# Collect all scraped rows into a DataFrame and write them out as CSV.
final_df = pd.DataFrame(final, columns= ["Title", "Artist", "Song ID", "Chart Date"])
# print(final_df) # Print the dataframe, if you want

with open('spmooddata.csv', 'w') as f: 
  final_df.to_csv(f, header= True, index=False)
  • Related