I have a long list of CSVs that I do NOT want to append into one. I want to keep the CSV file names unchanged and only change 2 columns from what is currently a 13-digit unix timestamp to a natural date time, i.e. YYYY/MM/DD HH:MM:SS.
I am happy using Pandas, as that seems to be the easier way, but I am struggling with this. I had hoped something like the code below might have worked. Any help is appreciated!
Here is an example of unix time: 1640227953000, which converts to Thursday, 23 December 2021 02:52:33.
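For reference, a quick check of that conversion in plain Python (this assumes the timestamps are UTC, since datetime.fromtimestamp() without a timezone gives local time):

from datetime import datetime, timezone

ts_ms = 1640227953000  # 13-digit epoch value, in milliseconds
print(datetime.fromtimestamp(ts_ms / 1000, tz=timezone.utc))
# 2021-12-23 02:52:33+00:00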
import pandas as pd
import datetime
from pathlib import Path  # available in Python 3.4+

dir = r'csv/'  # raw string for Windows paths
csv_files = [f for f in Path(dir).glob('*.csv')]  # find all CSVs in your folder
print(csv_files)

for csv in csv_files:  # iterate over the list
    df = pd.read_csv(csv)  # read the CSV
    print(df.columns.tolist())  # used for troubleshooting
    df['values_authorTimestamp'] = df['values_authorTimestamp'].apply(lambda d: datetime.datetime.fromtimestamp(int(d)/1000).strftime('%Y-%m-%d %H:%M:%S'))
    df['values_committerTimestamp'] = df['values_committerTimestamp'].apply(lambda d: datetime.datetime.fromtimestamp(int(d)/1000).strftime('%Y-%m-%d %H:%M:%S'))
    # df['values_authorTimestamp'] = pd.to_datetime(df['values_authorTimestamp'], format='%Y-%m-%d %H:%M:%S')
    # print(df)
    print(f'{csv.name} saved.')
    df.to_csv(f'csv/{csv.name}')
    # values_committerTimestamp
This is working and saving to CSVs; however, it only gets through a number of them and then throws an error. Any ideas?
File "Scripts/Audit/change-csv.py", line 16, in <module>
df['values_authorTimestamp']=df['values_authorTimestamp'].apply(lambda d: datetime.datetime.fromtimestamp(int(d)/1000).strftime('%Y-%m-%d %H:%M:%S'))
File "/opt/homebrew/lib/python3.9/site-packages/pandas/core/series.py", line 4433, in apply
return SeriesApply(self, func, convert_dtype, args, kwargs).apply()
File "/opt/homebrew/lib/python3.9/site-packages/pandas/core/apply.py", line 1082, in apply
return self.apply_standard()
File "/opt/homebrew/lib/python3.9/site-packages/pandas/core/apply.py", line 1137, in apply_standard
mapped = lib.map_infer(
File "pandas/_libs/lib.pyx", line 2870, in pandas._libs.lib.map_infer
File "Scripts/Audit/change-csv.py", line 16, in <lambda>
df['values_authorTimestamp']=df['values_authorTimestamp'].apply(lambda d: datetime.datetime.fromtimestamp(int(d)/1000).strftime('%Y-%m-%d %H:%M:%S'))
ValueError: invalid literal for int() with base 10: '2021-11-04 17:19:24'
CodePudding user response:
It seems there are mixed datetime formats in those columns. Try the errors='coerce' parameter, which turns values that do not match the expected format into NaT (missing values), and then replace those missing values from a second parse with Series.fillna:
df = pd.DataFrame({'values_authorTimestamp': [1640227953000, '2021-11-04 17:19:24']})

d1 = pd.to_datetime(df['values_authorTimestamp'], unit='ms', errors='coerce')  # parse as epoch milliseconds; non-numeric values become NaT
d2 = pd.to_datetime(df['values_authorTimestamp'], errors='coerce')             # parse the rows that are already datetime strings
df['values_authorTimestamp'] = d1.fillna(d2).dt.strftime('%Y/%m/%d %H:%M:%S')  # prefer the ms parse, fall back to the string parse, then format
print(df)
values_authorTimestamp
0 2021/12/23 02:52:33
1 2021/11/04 17:19:24
So your solution changes to:
for csv in csv_files:  # iterate over the list
    df = pd.read_csv(csv)  # read the CSV

    d1 = pd.to_datetime(df['values_authorTimestamp'], unit='ms', errors='coerce')
    d2 = pd.to_datetime(df['values_authorTimestamp'], errors='coerce')
    df['values_authorTimestamp'] = d1.fillna(d2).dt.strftime('%Y/%m/%d %H:%M:%S')

    d11 = pd.to_datetime(df['values_committerTimestamp'], unit='ms', errors='coerce')
    d21 = pd.to_datetime(df['values_committerTimestamp'], errors='coerce')
    df['values_committerTimestamp'] = d11.fillna(d21).dt.strftime('%Y/%m/%d %H:%M:%S')

    # print(df)
    print(f'{csv.name} saved.')
    df.to_csv(f'csv/{csv.name}')
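A side note, if it helps: by default to_csv also writes the DataFrame's numeric index as an extra column, so if you want the files saved back in their original shape you could pass index=False (just an assumption about what you want):

    df.to_csv(f'csv/{csv.name}', index=False)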