I have the following code using Selenium to scrape this page (the list of albums, and the list of songs shown when you click on an album). The script runs, but I would like to create a pandas DataFrame with the albums in one column (one album per row) and the corresponding list of songs in another column.
I need this so that I can use the data in Excel.
Thanks for all,
Léa
from time import sleep

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
# Script: open the Genius "discographie rap 2021" page, print every album
# entry, click it, and print the songs listed in the annotation that opens.

# Location of the chromedriver executable used to drive Chrome.
PATH = '/Users/prati/Desktop/WDD/Projet_Rapgenius/chromedriver'

# Choose the browser. The driver path is passed through a Service object and
# elements are located with By locators, matching the Selenium 4 API.
driver = webdriver.Chrome(service=Service(PATH))
try:
    # Open the target web page.
    driver.get('https://genius.com/Genius-france-discographie-rap-2021-annotated')
    sleep(2)

    # Click "accept" on the cookie banner.
    accept_button = driver.find_element(By.ID, 'onetrust-accept-btn-handler')
    accept_button.send_keys(Keys.ENTER)
    sleep(2)

    # Find the album title/date/artist entries. Both class names must be
    # present on the element, hence a compound CSS selector.
    links = driver.find_elements(
        By.CSS_SELECTOR,
        '.ReferentFragmentVariantdesktop__Highlight-sc-1837hky-1.jShaMP',
    )

    # For every entry found...
    for link in links:
        try:
            # ...scrape and print it...
            album = link.text
            print(album)
            # ...then click on it and give the annotation time to load...
            link.click()
            sleep(1)
            # ...and scrape and print the album's track list.
            div = driver.find_element(
                By.CSS_SELECTOR, '.RichText__Container-oz284w-0.gVsQub'
            )
            morceaux = div.find_elements(By.TAG_NAME, 'li')
            for morceau in morceaux:
                print(morceau.text)
        except Exception as exc:
            # Best effort: an entry that cannot be scraped is skipped, but the
            # failure is reported, and Ctrl-C / SystemExit are no longer
            # swallowed as they were by a bare ``except``.
            print('entry skipped ({})'.format(type(exc).__name__))
finally:
    # End the whole WebDriver session (browser and chromedriver process),
    # even when something above raised.
    driver.quit()
CodePudding user response:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
# Script: scrape every album entry of the Genius page together with the songs
# shown in its annotation, then write the result to 'au.xlsx' with one row per
# album (album text as the index) and one column per track position.

# Headless Chrome configuration.
options = webdriver.ChromeOptions()
options.add_argument("--disable-popup-blocking")
options.add_argument('--no-default-browser-check')
options.add_argument('--log-level=3')
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--start-maximized')
service = Service('driver/chromedriver.exe')
driver = webdriver.Chrome(options=options, service=service)

ListAlbunsDF = []   # album text, one entry per album
ListMusicsDF = []   # list of song titles for the album at the same position

try:
    driver.get('https://genius.com/Genius-france-discographie-rap-2021-annotated')
    WebDriverWait(driver, 30).until(EC.visibility_of_element_located((By.XPATH, "//p/b")))

    # Locate the album entries once; the same list drives both the loop and
    # the progress percentage, so the DOM is not re-queried on every iteration.
    album_links = driver.find_elements(
        By.XPATH, "//span[contains(@class, 'ReferentFragmentVariantdesktop__Highlight')]"
    )

    for k in album_links:
        # Fresh list per album, so no copy-and-clear of a shared buffer.
        ListMusicsAlbum = []
        try:
            k.click()
            WebDriverWait(driver, 30).until(EC.visibility_of_element_located((By.XPATH, "//div[contains(@class, 'Annotation__Container')]")))
            for i in driver.find_elements(By.XPATH, "//div[contains(@class, 'Annotation__Container')]//li"):
                ListMusicsAlbum.append(str(i.text))
        except Exception as exc:
            # Best effort: keep the album with whatever songs were collected,
            # but report the failure and let Ctrl-C / SystemExit propagate.
            print("songs incomplete for one album ({})".format(type(exc).__name__))
        ListAlbunsDF.append(str(k.text))
        ListMusicsDF.append(ListMusicsAlbum)
        # to track the progress:
        print("{:.0%}".format(len(ListAlbunsDF) / len(album_links)))
finally:
    # Always end the WebDriver session so no headless Chrome is left running.
    driver.quit()

df = pd.DataFrame(ListMusicsDF, index=ListAlbunsDF)
# Column labels must be a collection with one label per column; number the
# track columns starting at 1 instead of pandas' default 0.
df.columns = range(1, df.shape[1] + 1)
df.to_excel('au.xlsx')