How to run function on multiple dataframes of variable row sizes, then generate a new dataframe with-CodePudding

I have a folder full of CSVs of equal columns but variable rows. I want to convert each to a dataframe and run a simple function on them, and create one new dataframe with just the function values and the file names as the index.

So far I have:

import os.path
import tkinter.filedialog as filedialog
import glob
import pandas as pd

file_path = filedialog.askdirectory()
pattern = os.path.join(file_path, '*.csv')
files = glob.glob(pattern)

for index, file in enumerate(files):
    df = pd.read_csv(file, sep=',', index_col=[0])
    df.loc['total'] = df.sum(numeric_only=True, axis=0) # or any function
    pd.concat(df2[df.index == 'total'])

df.to_csv('file_path')

I'm sure there are several ways in which this is messed up, but any advice is appreciated

CodePudding user response：

import os.path
import tkinter.filedialog as filedialog
import glob
import pandas as pd

file_path = filedialog.askdirectory()
pattern = os.path.join(file_path, '*.csv')
files = glob.glob(pattern)

dfs = []
for index, file in enumerate(files):
    df = pd.read_csv(file, sep=',', index_col=[0])
    # Would remove the .loc, but it does no harm
    df.loc['total'] = df.sum(numeric_only=True, axis=0) # or any function  
    dfs.append(df[['total']])

df_total = pd.concat(dfs).reset_index(drop=True)
df_total.to_csv('file_path')

CodePudding user response：

OK I figured it out:

import os.path
import tkinter.filedialog as filedialog
import glob
import pandas as pd

file_path = filedialog.askdirectory()
pattern = os.path.join(file_path, '*.csv')
files = glob.glob(pattern)

filename = pd.DataFrame(columns=['Filename'])
filename['Filename'] = pd.Series([file for file in files]).reset_index(drop=True)

dfs = []
for index, file in enumerate(files):
    df = pd.read_csv(file, sep=',', index_col=[0])
    # Would remove the .loc, but it does no harm
    df.loc['total'] = df.sum(numeric_only=True, axis=0) # or any function  
    dfs.append(df)

dfs = pd.concat(dfs)
total = dfs[dfs.index == 'total'][['dfcolumn1','dfcolumn2',etc]]#write column names exactly as they appear on csv
total_named = filename.join(total.set_index(filename.index))
total_named.to_csv('file_path')