PYTHON: appending a dataframe in a loop


I'm trying to retrieve stock information from two different URLs and write it to a pandas DataFrame, but I keep getting errors. Could anyone please help me out here? I'm pretty new to Python, so my code will probably look very ugly :D

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import pandas as pd



headers= {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Cache-Control': 'max-age=0'
}

# raw string so the backslashes in the Windows path are not treated as escapes
PATH = r'C:\Program Files (x86)\chromedriver.exe'

options = Options()
options.add_argument('--headless')
options.add_argument('--window-size=2550,1440')
s = Service(PATH)
driver = webdriver.Chrome(service=s, options=options)
driver.implicitly_wait(10)

# create a dataframe (note: dn is actually a list)
dn=[]

def accept_cookies():
    try:
        driver.find_element(By.ID, 'accept-choices').click()
    except Exception:
        print('cookie banner not found')

stocklist=["FB","KLIC"]
for x in stocklist:
    url = f"https://stockanalysis.com/stocks/{x}/financials/"
    driver.get(url)
    driver.implicitly_wait(10)
    accept_cookies()
    driver.implicitly_wait(10)
    driver.find_element(By.XPATH, "//span[text()='Quarterly']").click()
    xlwriter = pd.ExcelWriter(f'financial statements1.xlsx', engine='xlsxwriter')
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    df = pd.read_html(str(soup), attrs={'id': 'financial-table'})[0]
    new_df = pd.concat(df)
    dn.to_excel(xlwriter, sheet_name='key', index=False)
    xlwriter.save()

CodePudding user response:

pd.concat needs a list of objects to concatenate, whereas you have only given it df.

So I think: replace pd.concat(df) with new_df = pd.concat([new_df, df]) and initialize new_df = pd.DataFrame() before the for loop. Note that dn.to_excel then also has to become new_df.to_excel, since dn is a plain list and has no to_excel method.
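
A minimal sketch of the loop with those changes applied (it assumes the driver setup and accept_cookies from the question, and writes the file once, after the loop):

new_df = pd.DataFrame()  # empty frame to accumulate each ticker's table

for x in stocklist:
    url = f"https://stockanalysis.com/stocks/{x}/financials/"
    driver.get(url)
    accept_cookies()
    driver.find_element(By.XPATH, "//span[text()='Quarterly']").click()
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    df = pd.read_html(str(soup), attrs={'id': 'financial-table'})[0]
    new_df = pd.concat([new_df, df])  # append this ticker's table

xlwriter = pd.ExcelWriter('financial statements1.xlsx', engine='xlsxwriter')
new_df.to_excel(xlwriter, sheet_name='key', index=False)
xlwriter.close()  # save() was removed in pandas 2.0; close() writes the file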

CodePudding user response:

In case there is no issue with the .read_html() part, you should push each df to a list of data frames:

dflist =[]

for x in stocklist:
    url = f"https://stockanalysis.com/stocks/{x}/financials/"
    driver.get(url)
    driver.implicitly_wait(10)
    accept_cookies()
    driver.implicitly_wait(10)
    driver.find_element(By.XPATH, "//span[text()='Quarterly']").click()
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    dflist.append(pd.read_html(str(soup), attrs={'id': 'financial-table'})[0])

Once the iteration finishes, you can simply concat the list of data frames into a single one:

xlwriter = pd.ExcelWriter('financial statements1.xlsx', engine='xlsxwriter')
pd.concat(dflist).to_excel(xlwriter, sheet_name='key', index=False)
xlwriter.close()  # save() was removed in pandas 2.0; use close()
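
If you also want to record which rows came from which ticker, pd.concat can label each frame through its keys argument (an optional tweak, not part of the question's code):

# tag each ticker's rows, then turn the tag into a regular column
combined = pd.concat(dflist, keys=stocklist, names=['ticker', 'row'])
combined = combined.reset_index(level='ticker')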

Full example:

dflist =[]

for x in stocklist:
    url = f"https://stockanalysis.com/stocks/{x}/financials/"
    driver.get(url)
    driver.implicitly_wait(10)
    accept_cookies()
    driver.implicitly_wait(10)
    driver.find_element(By.XPATH, "//span[text()='Quarterly']").click()
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    dflist.append(pd.read_html(str(soup), attrs={'id': 'financial-table'})[0])
    
xlwriter = pd.ExcelWriter('financial statements1.xlsx', engine='xlsxwriter')
pd.concat(dflist).to_excel(xlwriter, sheet_name='key', index=False)
xlwriter.close()
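
For what it's worth, pd.ExcelWriter also works as a context manager, which closes (and thus writes) the file even if an error occurs in between, a small stylistic alternative to calling close() yourself:

# the file is written when the with-block exits
with pd.ExcelWriter('financial statements1.xlsx', engine='xlsxwriter') as xlwriter:
    pd.concat(dflist).to_excel(xlwriter, sheet_name='key', index=False)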