Concat dataframes with same names from multiple folders using pandas-CodePudding

I have three folders folder1, folder2, and folder3. They have data frames as follows:

folder1/
df1.csv
df4.csv
df5.csv

folder2/
df1.csv
df3.csv
df4.csv

folder3/
df4.csv

I am confused about how to contact the data frames using pandas.concat() with the same names in all three folders and save them in a new folder "finalfolder" such that the finalfolder contains concatinated files:

finalfolder/
df1.csv (concat from folder1 and folder2)
df3.csv (From folder 2)
df4.csv (concat from folder1, 2, and 3)
df5.csv (From folder 1)

CodePudding user response：

edit to first answer:

from os import listdir
import pandas as pd

folder_paths = ['put all the folder paths here']
df_dict = {'folder': [], 'file': []}
for folder_path in folder_paths:
    for file in listdir(folder_path):
        df_dict['folder'].append(folder_path)
        df_dict['file'].append(file)

df = pd.DataFrame(df_dict)
for file, group in df.groupby(df['file']):
    df_temp = pd.DataFrame()
    for folder in group['folder'].tolist():
        df_temp = pd.concat([df_temp, pd.read_csv(f'{folder}/{file}')])
    df_temp.to_csv(f'finalfolder/{file}')

this should do the trick, just make the list with your folders and it should do what you want.

CodePudding user response：

import os
folders_list = ['folder1','folder2','folder3']
files1 = os.listdir(folders_list[0])
files2 = os.listdir(folders_list[1])
files3 = os.listdir(folders_list[2])

max_files_size =3
for i in range(max_files_size):
    f1=False
    f2=False
    f3=False
    try:
        folder1_df = pd.read_csv(os.path.abspath("folder1") "/" files1[i])
        f1=True
    except:
        pass
    try:
        folder2_df = pd.read_csv(os.path.abspath("folder2") "/" files2[i])
        f2=True
    except:
        pass
    try:
        folder3_df = pd.read_csv(os.path.abspath("folder3") "/" files3[i])
        f3=True
    except:
        pass
    if f1 and f2 and f3:
        final_df = pd.concat([folder1_df,folder2_df,folder3_df])
        final_df.to_csv(os.path.abspath("final_folder") "/" files1[i], index=None)
        print(final_df.shape)
    elif f1 and f3:
        final_df = pd.concat([folder1_df,folder3_df])
        final_df.to_csv(os.path.abspath("final_folder") "/" files1[i], index=None)
        print(final_df.shape)
    elif f2 and f3:
        final_df = pd.concat([folder2_df,folder3_df])
        final_df.to_csv(os.path.abspath("final_folder") "/" files2[i], index=None)
        print(final_df.shape)
    elif f1 and f2:
        final_df = pd.concat([folder1_df,folder2_df])
        final_df.to_csv(os.path.abspath("final_folder") "/" files2[i], index=None)
        print(final_df.shape)
    elif f1:
        final_df = pd.concat([folder1_df])
        final_df.to_csv(os.path.abspath("final_folder") "/" files1[i], index=None)
        print(final_df.shape)
    elif f2:
        final_df = pd.concat([folder2_df])
        final_df.to_csv(os.path.abspath("final_folder") "/" files2[i], index=None)
        print(final_df.shape)
    elif f3:
        final_df = pd.concat([folder3_df])
        final_df.to_csv(os.path.abspath("final_folder") "/" files3[i], index=None)
        print(final_df.shape)
    else:
        print("No")

A bit complicated but it should work if you are working on windows. Provide all folders names in list. max_files_size is maximum number of files any folder can have. You must have final_folder in which you want to save new files. os.path.abspath get absolute path to files

CodePudding user response：

import os
import csv as cs
import pandas as pd

base = os.path.abspath('/home/hari/Documents/python/pandas/') #base 
directory 
where the program saved

print(os.path.join(base, 'dir1/df1.csv'))
q = os.path.join(base, 'dir1/df1.csv')
w = os.path.join(base, 'dir1/df4.csv')
e = os.path.join(base, 'dir1/df5.csv')
r = os.path.join(base, 'dir2/df1.csv')
t = os.path.join(base, 'dir2/df3.csv')
y = os.path.join(base, 'dir2/df4.csv')
u = os.path.join(base, 'dir3/df4.csv')

csv = [q, w, e, r, t, y, u]

fi = pd.concat(map(pd.read_csv, csv), ignore_index=True) #used map 
function because of array

print(fi) #for testing

final = open(os.path.join(base, 'final/final.csv'), 'w', encoding='UTF8')

write = cs.writer(final) #passing the final file in csv writer

write.writerow(fi) #passing the concatenated csv's in writer

#make sure to change the url in base directory