I'm writing a program that scrapes a website for lyrics by a bunch of artists and saves the lyrics as .txt files in directories named after their respective albums.
But after my program has finished the first artist, it keeps looping over the same artist. Why?
Code:
import os
from bs4 import BeautifulSoup
import ssl
import time
os.chdir("D:/Folder")
import urllib.request

if os.path.isfile('hist'):
    # read the history file (r for read): it holds a link for every song we have
    # already scraped, so that we don't scrape it again (each song only needs to be scraped once)
    with open('hist', 'r', encoding='utf-8') as file:
        history = file.read().split()
else:
    history = []
artists = ["lil wayne","bob dylan","beyonce"]

ssl._create_default_https_context = ssl._create_unverified_context

urlhome = "https://www.lyricsfreak.com/"
frontpage = urllib.request.urlretrieve(urlhome, "data/frontpage")
front = open("data/frontpage", encoding="utf-8").read()
soupfront = BeautifulSoup(front, features="lxml")
for artist in artists:
    if not os.path.exists("D:/Folder/"+str(artist)):
        os.mkdir("D:/Folder/"+str(artist))
    link = urlhome+str(artist[0])+"/"+artist.replace(" ","+")
    getartist = urllib.request.urlopen(link)
    artistpage = BeautifulSoup(getartist, features="lxml")
    albums = artistpage.findAll("a", attrs={"class":"lf-link lf-link--secondary"})
    for album in albums:
        if str(artist[0])+"/"+artist.replace(" ","+") in album["href"]:
            albumurl = "https://www.lyricsfreak.com"+album["href"]
            albumpage = urllib.request.urlopen(albumurl)
            albumsoup = BeautifulSoup(albumpage, features="lxml")
            albumyear = albumsoup.find("div", attrs={"class":"lf-album__meta-item"}).text.strip()[-6:]
            albumname = album.text.strip()+" "+albumyear
        if not os.path.exists("D:/Folder/"+str(artist)+"/"+albumname):
            os.mkdir("D:/Folder/"+str(artist)+"/"+albumname)
        songs = albumsoup.findAll("a", href=True, attrs={"class":"lf-link lf-link--secondary"})
        for song in songs:
            if song['href'] in history:
                print('Skipping', song['href'], '- already on drive')
                continue  # if it's already scraped, continue to the next song
            time.sleep(3)
            if "/album/" not in song["href"]:
                songurl = "https://www.lyricsfreak.com"+song["href"]
                songpage = urllib.request.urlopen(songurl)
                songsoup = BeautifulSoup(songpage, features="lxml")
                songname = songsoup.find("span", attrs={"class":"item-header-color"}).text[:-7]
                lyrics = songsoup.find("div", attrs={"id":"content"})
                fixedlyrics = lyrics.text.strip()
                lyricfile = open(artist+"/"+albumname+"/"+songname+".txt", "w")
                lyricfile.write(fixedlyrics)
                with open('hist', 'a', encoding='utf-8') as file:  # a for append
                    file.write(song['href']+'\n')
                print("parsing "+str(songname))
CodePudding user response:
The block of code after:

if str(artist[0])+"/"+artist.replace(" ","+") in album["href"]:
    albumurl = "https://www.lyricsfreak.com"+album["href"]
    albumpage = urllib.request.urlopen(albumurl)
    albumsoup = BeautifulSoup(albumpage, features="lxml")
    albumyear = albumsoup.find("div", attrs={"class":"lf-album__meta-item"}).text.strip()[-6:]
    albumname = album.text.strip()+" "+albumyear

needs to be indented so that it is included within that condition statement. Otherwise, whenever the condition is false, the loop just skips that little block and then repeats everything on the last albumurl string it stored.
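In other words, a variable assigned inside an if keeps whatever value it got the last time the condition was true, so later iterations silently reuse stale data. A minimal sketch of the same pitfall (the items and URL here are made up for illustration):

items = ["b/one", "a/two", "b/three"]
urls = []
for item in items:
    if item.startswith("b/"):
        url = "https://example.com/" + item  # only updated when the condition holds
    urls.append(url)  # runs on every iteration, even when url was not updated
print(urls)
# ['https://example.com/b/one', 'https://example.com/b/one', 'https://example.com/b/three']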
Full Code:
import os
from bs4 import BeautifulSoup
import ssl
import time
os.chdir("D:/Folder")
import urllib.request

if os.path.isfile('hist'):
    # read the history file (r for read): it holds a link for every song we have
    # already scraped, so that we don't scrape it again (each song only needs to be scraped once)
    with open('hist', 'r', encoding='utf-8') as file:
        history = file.read().split()
else:
    history = []

artists = ["lil wayne","bob dylan","beyonce"]

ssl._create_default_https_context = ssl._create_unverified_context

urlhome = "https://www.lyricsfreak.com/"
frontpage = urllib.request.urlretrieve(urlhome, "data/frontpage")
front = open("data/frontpage", encoding="utf-8").read()
soupfront = BeautifulSoup(front, features="lxml")

for artist in artists:
    if not os.path.exists("D:/Folder/"+str(artist)):
        os.mkdir("D:/Folder/"+str(artist))
    link = urlhome+str(artist[0])+"/"+artist.replace(" ","+")
    getartist = urllib.request.urlopen(link)
    artistpage = BeautifulSoup(getartist, features="lxml")
    albums = artistpage.findAll("a", attrs={"class":"lf-link lf-link--secondary"})
    for album in albums:
        if str(artist[0])+"/"+artist.replace(" ","+") in album["href"]:
            albumurl = "https://www.lyricsfreak.com"+album["href"]
            albumpage = urllib.request.urlopen(albumurl)
            albumsoup = BeautifulSoup(albumpage, features="lxml")
            albumyear = albumsoup.find("div", attrs={"class":"lf-album__meta-item"}).text.strip()[-6:]
            albumname = album.text.strip()+" "+albumyear
            if not os.path.exists("D:/Folder/"+str(artist)+"/"+albumname):  # <-- INDENT REST OF CODE
                os.mkdir("D:/Folder/"+str(artist)+"/"+albumname)
            songs = albumsoup.findAll("a", href=True, attrs={"class":"lf-link lf-link--secondary"})
            for song in songs:
                if song['href'] in history:
                    print('Skipping', song['href'], '- already on drive')
                    continue  # if it's already scraped, continue to the next song
                time.sleep(3)
                if "/album/" not in song["href"]:
                    songurl = "https://www.lyricsfreak.com"+song["href"]
                    songpage = urllib.request.urlopen(songurl)
                    songsoup = BeautifulSoup(songpage, features="lxml")
                    songname = songsoup.find("span", attrs={"class":"item-header-color"}).text[:-7]
                    lyrics = songsoup.find("div", attrs={"id":"content"})
                    fixedlyrics = lyrics.text.strip()
                    lyricfile = open(artist+"/"+albumname+"/"+songname+".txt", "w")
                    lyricfile.write(fixedlyrics)
                    with open('hist', 'a', encoding='utf-8') as file:  # a for append
                        file.write(song['href']+'\n')
                    print("parsing "+str(songname))