I made this code to extract lyrics from a website, given the artist and the song name.
The code works, but I have a DataFrame (named years_1920_2020) with 10,000 songs, and it took about 1.5 hours to retrieve all the lyrics.
Is there a way to do it faster?
def url_lyric(music, artist):
    """Fetch the lyrics of *music* by *artist* from letras.mus.br.

    Returns the list of <p> tags holding the lyric lines, or 0 when the
    page cannot be fetched or the lyrics div is missing (0 is kept, not
    None, for backward compatibility with callers that test `== 0`).
    """
    url = f"https://www.letras.mus.br/{artist}/{music}/"
    # A browser-like User-Agent avoids the site rejecting the request.
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        webpage = urlopen(req).read()
        bs = BeautifulSoup(webpage, 'html.parser')
        lines = bs.find('div', {'class': 'cnt-letra p402_premium'})
        # `lines` is None when the div is absent; the find_all below then
        # raises AttributeError, which we treat as "no lyrics found".
        final_lines = lines.find_all('p')
        return final_lines
    except Exception:
        # Narrowed from a bare `except:`, which would also swallow
        # KeyboardInterrupt/SystemExit and make the loop unstoppable.
        return 0
# Collect each year's Series in a list and concatenate ONCE at the end:
# calling pd.concat inside the loop copies the accumulated data every
# iteration, which is quadratic in the total number of rows.
yearly_series = []
for year in range(1920, 2021):
    yearly_series.append(lyrics_from_year(year))
    print(year)  # progress indicator
final_lyric_series = pd.concat(yearly_series)
final_lyric_series.name = "lyrics"
The function lyrics_from_year(year) uses the function url_lyric, performs some regex tasks, and returns a pd.Series with all the lyrics of the chosen year.
CodePudding user response:
We can speed this up using Python's asyncio module together with a thread pool. Please refer to this article — it is not an exact solution, but it addresses a problem similar to yours.
import asyncio
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
def url_lyric(music, artist):
    # Placeholder: the asker's original scraping implementation
    # (fetch the letras.mus.br page and parse the lyric <p> tags) goes here.
    pass
def lyrics_from_year(year):
    """Return a pd.Series with all the lyrics for *year* (stub).

    In the asker's real code this resolves the songs of the year and
    calls url_lyric for each of them.
    """
    music = None
    artist = None
    return url_lyric(music, artist)
async def get_work_done():
    """Run lyrics_from_year for every year 1920-2020 concurrently.

    The blocking HTTP work is farmed out to a thread pool so the
    network waits overlap instead of running back to back.

    Returns the list of per-year results in year order.
    """
    with ThreadPoolExecutor(max_workers=10) as executor:
        loop = asyncio.get_event_loop()
        tasks = [
            # `year` is a single int, so pass it as one positional
            # argument. The original `*(year)` tried to unpack an int
            # and would raise TypeError ("argument ... not iterable").
            loop.run_in_executor(executor, lyrics_from_year, year)
            for year in range(1920, 2021)
        ]
        return await asyncio.gather(*tasks)
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(get_work_done())
# run_until_complete returns the gathered list of per-year Series.
# The original iterated over the Future object itself, which is not
# iterable and raises TypeError at runtime.
results = loop.run_until_complete(future)
# dtype=object avoids the FutureWarning for an empty, dtype-less Series.
final_lyric_series = pd.Series(name="lyrics", dtype=object)
for result in results:
    final_lyric_series = pd.concat([final_lyric_series, result])
    print(result)
CodePudding user response:
Here is a simple example of how you could do it:
import aiohttp
import asyncio
import requests, bs4
async def _fetch_lyrics(session, url):
    """Fetch one song page and return its cleaned lyric text."""
    async with session.get(url) as r:
        html = await r.text()
    lyrics = bs4.BeautifulSoup(html, 'html.parser').find('div', {'class':'cnt-letra'}).text
    return '\n'.join(x.strip() for x in lyrics.strip().split('\n'))

async def main():
    """Print the lyrics of Adele's most-played songs, fetched concurrently."""
    # The song list is a single page, so a blocking requests.get here is fine.
    urls = [f"https://www.letras.mus.br{x['href']}" for x in bs4.BeautifulSoup(requests.get(
        url = 'https://www.letras.mus.br/adele/mais-tocadas.html'
    ).content, 'html.parser').find_all('a', {'class':'song-name'})]
    async with aiohttp.ClientSession() as session:
        # gather() issues all requests at once; awaiting each URL inside
        # the loop (as the original did) serializes them and gives no
        # speed-up over plain requests.
        results = await asyncio.gather(*(_fetch_lyrics(session, url) for url in urls))
    for text in results:
        print(text)
# asyncio.run creates, runs and closes the event loop for us; the
# get_event_loop()/run_until_complete pair is the deprecated pattern.
asyncio.run(main())