Home > Blockchain >  Why is my code looping itself on first item
Why is my code looping itself on first item

Time:11-19

I'm writing a program that scrapes a website for lyrics by a number of artists and saves the lyrics as .txt files in directories named after their respective albums.

But after my program has finished the first artist, it keeps looping the same artist. Why?

Code:

import os
from bs4 import BeautifulSoup
import ssl
import time
os.chdir("D:/Folder")

import urllib.request

# Load the history file ('hist') if it exists: it stores one link per song
# already scraped, so each song is downloaded only once.
if os.path.isfile('hist'):
    with open('hist', 'r', encoding='utf-8') as file:
        history = file.read().split()
else:
    history = []

artists = ["lil wayne", "bob dylan", "beyonce"]
ssl._create_default_https_context = ssl._create_unverified_context  # skip certificate checks
urlhome = "https://www.lyricsfreak.com/"
frontpage = urllib.request.urlretrieve(urlhome, "data/frontpage")
front = open("data/frontpage", encoding="utf-8").read()
soupfront = BeautifulSoup(front, features="lxml")

for artist in artists:
    if not os.path.exists("D:/Folder/" + str(artist)):
        os.mkdir("D:/Folder/" + str(artist))
    # Artist pages look like /l/lil+wayne: first letter of the name, then
    # the name with spaces replaced by '+'.
    link = urlhome + str(artist[0]) + "/" + artist.replace(" ", "+")
    getartist = urllib.request.urlopen(link)
    artistpage = BeautifulSoup(getartist, features="lxml")
    albums = artistpage.findAll("a", attrs={"class": "lf-link lf-link--secondary"})

    for album in albums:
        if str(artist[0]) + "/" + artist.replace(" ", "+") in album["href"]:
            albumurl = "https://www.lyricsfreak.com" + album["href"]
            albumpage = urllib.request.urlopen(albumurl)
            albumsoup = BeautifulSoup(albumpage, features="lxml")
            albumyear = albumsoup.find("div", attrs={"class": "lf-album__meta-item"}).text.strip()[-6:]

            albumname = album.text.strip() + " " + albumyear
        # NOTE(review): everything from here down sits OUTSIDE the `if`
        # above, so for non-matching hrefs it silently reuses the previous
        # albumname/albumsoup — this is the bug the question is about
        # (see the answer below).
        if not os.path.exists("D:/Folder/" + str(artist) + "/" + albumname):
            os.mkdir("D:/Folder/" + str(artist) + "/" + albumname)
        songs = albumsoup.findAll("a", href=True, attrs={"class": "lf-link lf-link--secondary"})

        for song in songs:
            if song['href'] in history:
                print('Skipping', song['href'], '-already on drive')
                continue  # already scraped: move on to the next song

            time.sleep(3)  # throttle requests to be polite to the server
            if "/album/" not in song["href"]:
                songurl = "https://www.lyricsfreak.com" + song["href"]
                songpage = urllib.request.urlopen(songurl)
                songsoup = BeautifulSoup(songpage, features="lxml")
                songname = songsoup.find("span", attrs={"class": "item-header-color"}).text[:-7]
                lyrics = songsoup.find("div", attrs={"id": "content"})
                fixedlyrics = lyrics.text.strip()
                lyricfile = open(artist + "/" + albumname + "/" + songname + ".txt", "w")
                lyricfile.write(fixedlyrics)
                with open('hist', 'a', encoding='utf-8') as file:  # 'a' = append
                    file.write(song['href'] + '\n')
                print("parsing " + str(songname))

CodePudding user response:

The block of code after:

if str(artist[0]) + "/" + artist.replace(" ", "+") in album["href"]:
            albumurl = "https://www.lyricsfreak.com" + album["href"]
            albumpage = urllib.request.urlopen(albumurl)
            albumsoup = BeautifulSoup(albumpage, features="lxml")
            albumyear = albumsoup.find("div", attrs={"class": "lf-album__meta-item"}).text.strip()[-6:]

            albumname = album.text.strip() + " " + albumyear

Needs to be indented so that it is included within that conditional statement. Otherwise, whenever the condition is false, Python skips that little block and then just repeats everything using the last stored `albumurl` string — which is why the same artist keeps looping.

Full Code:

import os
from bs4 import BeautifulSoup
import ssl
import time
os.chdir("D:/Folder")

import urllib.request

# Load the history file ('hist') if it exists: it stores one link per song
# already scraped, so each song is downloaded only once.
if os.path.isfile('hist'):
    with open('hist', 'r', encoding='utf-8') as file:
        history = file.read().split()
else:
    history = []

artists = ["lil wayne", "bob dylan", "beyonce"]
ssl._create_default_https_context = ssl._create_unverified_context  # skip certificate checks
urlhome = "https://www.lyricsfreak.com/"
frontpage = urllib.request.urlretrieve(urlhome, "data/frontpage")
front = open("data/frontpage", encoding="utf-8").read()
soupfront = BeautifulSoup(front, features="lxml")

for artist in artists:
    if not os.path.exists("D:/Folder/" + str(artist)):
        os.mkdir("D:/Folder/" + str(artist))
    # Artist pages look like /l/lil+wayne: first letter of the name, then
    # the name with spaces replaced by '+'.
    link = urlhome + str(artist[0]) + "/" + artist.replace(" ", "+")
    getartist = urllib.request.urlopen(link)
    artistpage = BeautifulSoup(getartist, features="lxml")
    albums = artistpage.findAll("a", attrs={"class": "lf-link lf-link--secondary"})

    for album in albums:
        if str(artist[0]) + "/" + artist.replace(" ", "+") in album["href"]:
            albumurl = "https://www.lyricsfreak.com" + album["href"]
            albumpage = urllib.request.urlopen(albumurl)
            albumsoup = BeautifulSoup(albumpage, features="lxml")
            albumyear = albumsoup.find("div", attrs={"class": "lf-album__meta-item"}).text.strip()[-6:]

            albumname = album.text.strip() + " " + albumyear

            # Everything below is indented INSIDE the `if` above, so it only
            # runs for albums that actually belong to this artist.  <-- THE FIX
            if not os.path.exists("D:/Folder/" + str(artist) + "/" + albumname):
                os.mkdir("D:/Folder/" + str(artist) + "/" + albumname)
            songs = albumsoup.findAll("a", href=True, attrs={"class": "lf-link lf-link--secondary"})

            for song in songs:
                if song['href'] in history:
                    print('Skipping', song['href'], '-already on drive')
                    continue  # already scraped: move on to the next song

                time.sleep(3)  # throttle requests to be polite to the server
                if "/album/" not in song["href"]:
                    songurl = "https://www.lyricsfreak.com" + song["href"]
                    songpage = urllib.request.urlopen(songurl)
                    songsoup = BeautifulSoup(songpage, features="lxml")
                    songname = songsoup.find("span", attrs={"class": "item-header-color"}).text[:-7]
                    lyrics = songsoup.find("div", attrs={"id": "content"})
                    fixedlyrics = lyrics.text.strip()
                    # Context manager guarantees the file is closed even on
                    # error; explicit UTF-8 avoids platform-default encoding
                    # failures on non-ASCII lyrics.
                    with open(artist + "/" + albumname + "/" + songname + ".txt",
                              "w", encoding="utf-8") as lyricfile:
                        lyricfile.write(fixedlyrics)
                    with open('hist', 'a', encoding='utf-8') as file:  # 'a' = append
                        file.write(song['href'] + '\n')
                    print("parsing " + str(songname))
  • Related