Home > Net >  create a pandas dataframe from scraping with selenium
create a pandas dataframe from scraping with selenium

Time:12-14

I have the following code, which uses Selenium to scrape this page (the list of albums, and the list of songs shown when you click on an album). The script runs, but I would like to create a pandas DataFrame with one column containing the albums (one per row) and another column containing the list of songs.

I need it to use the data in Excel.

Thanks for all,

Léa

from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Scrape the Genius "discographie rap 2021" page: print every highlighted
# album entry, click it, and print the track list shown in its annotation.

PATH = '/Users/prati/Desktop/WDD/Projet_Rapgenius/chromedriver'
# Choose the browser: Chrome, driven by the chromedriver binary at PATH.
driver = webdriver.Chrome(PATH)

try:
    # Open the target page and give it time to load.
    driver.get('https://genius.com/Genius-france-discographie-rap-2021-annotated')
    sleep(2)

    # Accept the cookie banner.
    accept_button = driver.find_element_by_id('onetrust-accept-btn-handler')
    accept_button.send_keys(Keys.ENTER)
    sleep(2)

    # Locate the highlighted album / date / artist entries.
    links = driver.find_elements_by_class_name('ReferentFragmentVariantdesktop__Highlight-sc-1837hky-1.jShaMP')
    for link in links:
        try:
            # Print the entry text, then click it to open its annotation.
            album = link.text
            print(album)
            link.click()
            sleep(1)
            # Print each track listed in the annotation panel.
            div = driver.find_element_by_class_name('RichText__Container-oz284w-0.gVsQub')
            morceaux = div.find_elements_by_tag_name('li')
            for morceau in morceaux:
                print(morceau.text)
        except Exception as exc:
            # Best-effort scraping: skip entries that cannot be clicked or
            # have no track list, but say so instead of failing silently.
            # `Exception` (not a bare except) keeps Ctrl-C working.
            print(f"Skipped an entry ({type(exc).__name__})")
finally:
    # Close the tab even if scraping failed part-way through.
    driver.close()

CodePudding user response:

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service

# Scrape every highlighted album entry and the track list in its annotation,
# then export the result to Excel: one row per album (the index), one
# numbered column per track position.

options = webdriver.ChromeOptions()
options.add_argument("--disable-popup-blocking")
options.add_argument('--no-default-browser-check')
options.add_argument('--log-level=3')
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--start-maximized')
options.add_experimental_option("detach", True)
service = Service('driver/chromedriver.exe')
driver = webdriver.Chrome(options=options, service=service)

ListAlbunsDF = []  # album entry text, one item per row
ListMusicsDF = []  # list of track names for the album at the same position

try:
    driver.get('https://genius.com/Genius-france-discographie-rap-2021-annotated')
    WebDriverWait(driver, 30).until(EC.visibility_of_element_located((By.XPATH, "//p/b")))

    # Query the highlighted entries once; the same list drives both the loop
    # and the progress percentage.
    highlights = driver.find_elements(By.XPATH, "//span[contains(@class, 'ReferentFragmentVariantdesktop__Highlight')]")
    total = len(highlights)

    for k in highlights:
        # Fresh list per album, so rows never share state.
        ListMusicsAlbum = []
        try:
            k.click()
            WebDriverWait(driver, 30).until(EC.visibility_of_element_located((By.XPATH, "//div[contains(@class, 'Annotation__Container')]")))
            # NOTE(review): if the previous album's annotation is still
            # visible, the wait above may return before the new one has
            # rendered -- confirm against the live page.
            for i in driver.find_elements(By.XPATH, "//div[contains(@class, 'Annotation__Container')]//li"):
                ListMusicsAlbum.append(i.text)
        except Exception as exc:
            # Best-effort: keep whatever tracks were collected for this album
            # and move on, but report the failure. `Exception` (not a bare
            # except) keeps Ctrl-C working.
            print(f"Skipped (part of) an entry ({type(exc).__name__})")
        ListAlbunsDF.append(k.text)
        ListMusicsDF.append(ListMusicsAlbum)

        # to track the progress:
        print(f"{len(ListAlbunsDF) / total:.0%}")
finally:
    # End the browser session so no headless Chrome process is left behind.
    driver.quit()


df = pd.DataFrame(ListMusicsDF, index=ListAlbunsDF)
# Label the track columns 1..N. Assigning the scalar 1 to `df.columns`
# raises TypeError, because the column index must be a collection.
df.columns = range(1, df.shape[1] + 1)
df.to_excel('au.xlsx')
  • Related