I am trying to write a script that grabs each newly added CSV file from a folder and appends it to one big file. Basically, I want every CSV file that is added to a particular folder to end up stored in a single resulting CSV file. The code below generates the list of files, and I use it to pick out the newly added file:
def check_dir(fh, start_path='/Users/.../Desktop/files', new_cb=None, changed_cb=None):
    """Walk ``start_path`` recursively and report new or resized files.

    Args:
        fh: Dict mapping a bare file name to the size recorded for it on a
            previous scan. It is updated in place with the sizes seen now.
        start_path: Directory to scan (sub-directories included).
        new_cb: Optional callable invoked with the full path of every file
            whose name is not yet in ``fh``.
        changed_cb: Optional callable invoked with the full path of every
            file whose size differs from the one stored in ``fh``.

    Returns:
        The combined size in bytes of all non-symlink files found.
    """
    # NOTE: the history is keyed by bare file name, so two files with the
    # same name in different sub-directories share a single entry.
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if os.path.islink(fp):
                continue
            fs = os.path.getsize(fp)
            # Accumulate; a plain assignment would only keep the last size.
            total_size += fs
            if f not in fh:
                if new_cb:
                    new_cb(fp)
            elif fh[f] != fs and changed_cb:
                changed_cb(fp)
            fh[f] = fs
    return total_size
def new_file(fp):
    """Announce on stdout that ``fp`` was seen for the first time."""
    message = "New File {0}!".format(fp)
    print(message)
def changed_file(fp):
    """Announce on stdout that the size of ``fp`` differs from the last scan."""
    message = "File {0} changed!".format(fp)
    print(message)
if __name__ == '__main__':
    # Poll the folder forever, reporting new/changed files on every pass.
    file_history = {}
    total = 0
    while True:
        nt = check_dir(file_history, '/Users/.../Desktop/files', new_file, changed_file)
        # Skip the message on the first pass (total is still 0).
        if total and nt != total:
            print("Total size changed from {0} to {1}".format(total, nt))
        total = nt
        time.sleep(200)
        print("File list:\n{0}".format(file_history))
        # Dicts keep insertion order, so the last key is the most recently
        # registered file name. Guard against an empty folder, where
        # indexing [-1] would raise IndexError.
        if file_history:
            print(list(file_history)[-1])
I don't really know how to create the empty pandas DataFrame to which the most recently added file should be appended on a regular basis (that's why I have a time.sleep there). In the end I want to have one big CSV file that contains all of the added files.
Please, help :(
P.S. I am new to Python, so please don't judge if it is super simple..
CodePudding user response:
I think that pandas.concat() is what you are looking for.
CodePudding user response:
Are you going to be using Pandas to process the data in the csv or only to concatenate the files?
If you simply want to append each CSV file to the big one, then why not use Python's plain file I/O for speed and simplicity? That assumes that all of the CSV files use the same format.
I have updated the new_file function to append to the big CSV using plain file I/O. I have also added an append_pandas function, which is not used but should help you if you must use pandas to do the job. I haven't tested the pandas function, and there are more things to consider, such as the format of the CSV files. Check out the documentation for more details.
import os
import time
def check_dir(fh, start_path='/Users/.../Desktop/files', new_cb=None, changed_cb=None, **kwargs):
    """Walk ``start_path`` recursively and report new or resized files.

    Args:
        fh: Dict mapping a bare file name to the size recorded for it on a
            previous scan. It is updated in place with the sizes seen now.
        start_path: Directory to scan (sub-directories included).
        new_cb: Optional callable invoked as ``new_cb(path, **kwargs)`` for
            every file whose name is not yet in ``fh``.
        changed_cb: Optional callable invoked as
            ``changed_cb(path, **kwargs)`` for every file whose size differs
            from the one stored in ``fh``.
        **kwargs: Extra keyword arguments forwarded untouched to both
            callbacks.

    Returns:
        The combined size in bytes of all non-symlink files found.
    """
    # NOTE: the history is keyed by bare file name, so two files with the
    # same name in different sub-directories share a single entry.
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if os.path.islink(fp):
                continue
            fs = os.path.getsize(fp)
            # Accumulate; a plain assignment would only keep the last size.
            total_size += fs
            if f not in fh:
                if new_cb:
                    new_cb(fp, **kwargs)
            elif fh[f] != fs and changed_cb:
                changed_cb(fp, **kwargs)
            fh[f] = fs
    return total_size
def is_csv(f):
    """Return True when the path ``f`` has a ``.csv`` extension.

    The comparison is case-insensitive and looks only at the extension, so
    a path that merely contains the letters "csv" somewhere (a directory
    name, a ``.csv.bak`` backup, ...) is not treated as a CSV file.
    """
    # you can add more to check here
    return os.path.splitext(f)[1].lower() == '.csv'
def append_csv(s, d):
    """Append every line of the text file ``s`` to the end of the file ``d``.

    ``d`` is opened in append mode, so it is created when it does not exist.
    Lines are copied verbatim (a header row in ``s`` is copied too). If the
    last line of ``s`` lacks a trailing newline, one is written so that the
    next appended file starts on a fresh line. An empty ``s`` leaves ``d``
    unchanged.
    """
    last_line = ""
    with open(s, 'r') as readcsv, open(d, 'a') as appendcsv:
        for line in readcsv:
            appendcsv.write(line)
            last_line = line
        # last_line stays "" for an empty source, so nothing is added then.
        if last_line and not last_line.endswith("\n"):
            appendcsv.write("\n")
def append_pandas(s, d):
    """Append the rows of CSV file ``s`` to CSV file ``d`` using pandas.

    Both files are parsed with ``pandas.read_csv``; the rows of ``s`` are
    placed after the rows already in ``d`` and the result is written back
    to ``d``. When ``d`` does not exist yet (or is an empty file) it is
    created from ``s`` alone. The row index is not written, so repeated
    calls do not accumulate extra index columns.
    """
    # Local import: the file's top-level imports do not include pandas.
    import pandas

    new_rows = pandas.read_csv(s)
    if os.path.exists(d) and os.path.getsize(d) > 0:
        existing_rows = pandas.read_csv(d)
        # DataFrame.append no longer exists in pandas 2.x; concat replaces it.
        combined = pandas.concat([existing_rows, new_rows], ignore_index=True)
    else:
        combined = new_rows
    combined.to_csv(d, index=False)
def new_file(fp, **kwargs):
    """Append a newly discovered CSV file to the big CSV file.

    Paths rejected by ``is_csv`` are ignored. The destination is taken from
    the ``append_to_csv`` keyword argument, falling back to
    '/default/path/to/big.csv' when it is not supplied.
    """
    if not is_csv(fp):
        return
    print("Appending {0}!".format(fp))
    destination = kwargs.get('append_to_csv', '/default/path/to/big.csv')
    append_csv(fp, destination)
def changed_file(fp, **kwargs):
    """Announce on stdout that the size of ``fp`` differs from the last scan.

    Extra keyword arguments are accepted but not used.
    """
    message = "File {0} changed!".format(fp)
    print(message)
if __name__ == '__main__':
    # Poll the folder forever; every new CSV file is appended to /tmp/big.csv
    # through the new_file callback.
    file_history = {}
    total = 0
    while True:
        nt = check_dir(file_history, '/tmp/test/', new_file, changed_file, append_to_csv='/tmp/big.csv')
        # Compare against nt, the value just returned by check_dir. Skip the
        # message on the first pass (total is still 0).
        if total and nt != total:
            print("Total size changed from {0} to {1}".format(total, nt))
        total = nt
        time.sleep(10)
        print("File list:\n{0}".format(file_history))