I have three folders folder1, folder2, and folder3. They have data frames as follows:
folder1/
df1.csv
df4.csv
df5.csv
folder2/
df1.csv
df3.csv
df4.csv
folder3/
df4.csv
I am confused about how to contact the data frames using pandas.concat()
with the same names in all three folders and save them in a new folder "finalfolder" such that the finalfolder contains concatinated files:
finalfolder/
df1.csv (concat from folder1 and folder2)
df3.csv (From folder 2)
df4.csv (concat from folder1, 2, and 3)
df5.csv (From folder 1)
CodePudding user response:
edit to first answer:
from os import listdir
import pandas as pd
folder_paths = ['put all the folder paths here']
df_dict = {'folder': [], 'file': []}
for folder_path in folder_paths:
for file in listdir(folder_path):
df_dict['folder'].append(folder_path)
df_dict['file'].append(file)
df = pd.DataFrame(df_dict)
for file, group in df.groupby(df['file']):
df_temp = pd.DataFrame()
for folder in group['folder'].tolist():
df_temp = pd.concat([df_temp, pd.read_csv(f'{folder}/{file}')])
df_temp.to_csv(f'finalfolder/{file}')
this should do the trick, just make the list with your folders and it should do what you want.
CodePudding user response:
import os
folders_list = ['folder1','folder2','folder3']
files1 = os.listdir(folders_list[0])
files2 = os.listdir(folders_list[1])
files3 = os.listdir(folders_list[2])
max_files_size =3
for i in range(max_files_size):
f1=False
f2=False
f3=False
try:
folder1_df = pd.read_csv(os.path.abspath("folder1") "/" files1[i])
f1=True
except:
pass
try:
folder2_df = pd.read_csv(os.path.abspath("folder2") "/" files2[i])
f2=True
except:
pass
try:
folder3_df = pd.read_csv(os.path.abspath("folder3") "/" files3[i])
f3=True
except:
pass
if f1 and f2 and f3:
final_df = pd.concat([folder1_df,folder2_df,folder3_df])
final_df.to_csv(os.path.abspath("final_folder") "/" files1[i], index=None)
print(final_df.shape)
elif f1 and f3:
final_df = pd.concat([folder1_df,folder3_df])
final_df.to_csv(os.path.abspath("final_folder") "/" files1[i], index=None)
print(final_df.shape)
elif f2 and f3:
final_df = pd.concat([folder2_df,folder3_df])
final_df.to_csv(os.path.abspath("final_folder") "/" files2[i], index=None)
print(final_df.shape)
elif f1 and f2:
final_df = pd.concat([folder1_df,folder2_df])
final_df.to_csv(os.path.abspath("final_folder") "/" files2[i], index=None)
print(final_df.shape)
elif f1:
final_df = pd.concat([folder1_df])
final_df.to_csv(os.path.abspath("final_folder") "/" files1[i], index=None)
print(final_df.shape)
elif f2:
final_df = pd.concat([folder2_df])
final_df.to_csv(os.path.abspath("final_folder") "/" files2[i], index=None)
print(final_df.shape)
elif f3:
final_df = pd.concat([folder3_df])
final_df.to_csv(os.path.abspath("final_folder") "/" files3[i], index=None)
print(final_df.shape)
else:
print("No")
A bit complicated but it should work if you are working on windows. Provide all folders names in list. max_files_size is maximum number of files any folder can have. You must have final_folder in which you want to save new files. os.path.abspath get absolute path to files
CodePudding user response:
import os
import csv as cs
import pandas as pd
base = os.path.abspath('/home/hari/Documents/python/pandas/') #base
directory
where the program saved
print(os.path.join(base, 'dir1/df1.csv'))
q = os.path.join(base, 'dir1/df1.csv')
w = os.path.join(base, 'dir1/df4.csv')
e = os.path.join(base, 'dir1/df5.csv')
r = os.path.join(base, 'dir2/df1.csv')
t = os.path.join(base, 'dir2/df3.csv')
y = os.path.join(base, 'dir2/df4.csv')
u = os.path.join(base, 'dir3/df4.csv')
csv = [q, w, e, r, t, y, u]
fi = pd.concat(map(pd.read_csv, csv), ignore_index=True) #used map
function because of array
print(fi) #for testing
final = open(os.path.join(base, 'final/final.csv'), 'w', encoding='UTF8')
write = cs.writer(final) #passing the final file in csv writer
write.writerow(fi) #passing the concatenated csv's in writer
#make sure to change the url in base directory