I have a folder full of CSVs of equal columns but variable rows. I want to convert each to a dataframe and run a simple function on them, and create one new dataframe with just the function values and the file names as the index.
So far I have:
import os.path
import tkinter.filedialog as filedialog
import glob
import pandas as pd
file_path = filedialog.askdirectory()
pattern = os.path.join(file_path, '*.csv')
files = glob.glob(pattern)
for index, file in enumerate(files):
df = pd.read_csv(file, sep=',', index_col=[0])
df.loc['total'] = df.sum(numeric_only=True, axis=0) # or any function
pd.concat(df2[df.index == 'total'])
df.to_csv('file_path')
I'm sure there are several ways in which this is messed up, but any advice is appreciated
CodePudding user response:
import os.path
import tkinter.filedialog as filedialog
import glob
import pandas as pd
file_path = filedialog.askdirectory()
pattern = os.path.join(file_path, '*.csv')
files = glob.glob(pattern)
dfs = []
for index, file in enumerate(files):
df = pd.read_csv(file, sep=',', index_col=[0])
# Would remove the .loc, but it does no harm
df.loc['total'] = df.sum(numeric_only=True, axis=0) # or any function
dfs.append(df[['total']])
df_total = pd.concat(dfs).reset_index(drop=True)
df_total.to_csv('file_path')
CodePudding user response:
OK I figured it out:
import os.path
import tkinter.filedialog as filedialog
import glob
import pandas as pd
file_path = filedialog.askdirectory()
pattern = os.path.join(file_path, '*.csv')
files = glob.glob(pattern)
filename = pd.DataFrame(columns=['Filename'])
filename['Filename'] = pd.Series([file for file in files]).reset_index(drop=True)
dfs = []
for index, file in enumerate(files):
df = pd.read_csv(file, sep=',', index_col=[0])
# Would remove the .loc, but it does no harm
df.loc['total'] = df.sum(numeric_only=True, axis=0) # or any function
dfs.append(df)
dfs = pd.concat(dfs)
total = dfs[dfs.index == 'total'][['dfcolumn1','dfcolumn2',etc]]#write column names exactly as they appear on csv
total_named = filename.join(total.set_index(filename.index))
total_named.to_csv('file_path')