My current process loops through my source directory and adds the name of each file to my DataFrame in Python. I also want to get the date modified for each of these files.
import datetime
import os
import pandas as pd
# Make the source directory the working directory; the bare file names
# produced by os.listdir() further down are presumably resolved against it.
os.chdir('C:/Users/jj/Desktop/do/Claims/globmove')
def read_files(filenames):
    """Load every workbook in *filenames* and stack them into one DataFrame.

    Each workbook is read with ``read_sheets`` (defined elsewhere) and tagged
    with a ``Filename`` column holding the name it was loaded from, so rows
    stay traceable after concatenation.

    NOTE: no modification date is attached here; ``modification_date`` is
    never called by this function.

    Args:
        filenames: Iterable of workbook names accepted by ``read_sheets``.

    Returns:
        One DataFrame with a fresh integer index, or an empty DataFrame when
        *filenames* yields nothing.
    """
    frames = []
    for filename in filenames:
        frame = read_sheets(filename)
        frame['Filename'] = filename
        frames.append(frame)
    # pd.concat raises ValueError on an empty list, so short-circuit instead.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)
def modification_date(filename):
    """Return the last-modified time of *filename* as a naive local datetime.

    Args:
        filename: Path of an existing file; a relative path is resolved
            against the current working directory.

    Returns:
        A ``datetime.datetime`` built from the file's mtime in local time.

    Raises:
        OSError: If the file does not exist or is inaccessible.
    """
    mtime = os.path.getmtime(filename)
    return datetime.datetime.fromtimestamp(mtime)
# Gather the .xlsx names found in the source folder and load them into one frame.
folder_path = os.path.abspath('C:/Users/jj/Desktop/do/Claims/globmove')
files = list(filter(lambda name: name.endswith(".xlsx"), os.listdir(folder_path)))
dfooc = read_files(files)
I am able to run this without errors, but the date-modified timestamp is not added to the final DataFrame, dfooc. How can I get the modification date added as a column?
Edit: I am getting an indentation error after changing the order of my original code above:
def read_files(filenames):
result = []
for filename in filenames:
file = read_sheets(filename)
file['Filename'] = filename
def modification_date(filename):
t = os.path.getmtime(filename)
return datetime.datetime.fromtimestamp(t)
file['ModificationDate'] = filename
result.append(file)
return pd.concat(result, ignore_index=True)
return pd.concat(result, ignore_index=True)
^
IndentationError: unexpected indent
CodePudding user response:
Here's how I do it.
import os
from pathlib import Path
import pandas as pd
import pendulum
class FileDates:
    """Build a table of file names and their last-modified dates.

    Keyword Args:
        file_type: Suffix a file name must end with, e.g. ``".xlsx"``.
        file_path: ``pathlib.Path`` of the directory to scan recursively.
        path: Stored as given; not used by any method of this class.
    """

    def __init__(self, **kwargs):
        self.file_type = kwargs.get("file_type")
        self.file_path = kwargs.get("file_path")
        self.path = kwargs.get("path")
        # Name of the local timezone, used to localise modification times.
        self.tz = pendulum.now().timezone.name

    def main(self) -> pd.DataFrame:
        """Return a DataFrame with one ``file``/``date`` row per matching file."""
        files = self.get_files()
        dates = self.get_dates(files)
        # Path.name works with any separator; splitting on "/" returned the
        # whole path on Windows, where str(Path) uses backslashes.
        names = [Path(x).name for x in files]
        return pd.DataFrame(list(zip(names, dates)), columns=["file", "date"])

    def get_files(self) -> list:
        """Return the paths (as strings) of all files under ``file_path``
        whose name ends with ``file_type``."""
        # Match on the end of the file name only: a substring test on the
        # full path also accepted "x.xlsx.bak" and anything inside a folder
        # whose name happens to contain the suffix.
        return [
            str(entry)
            for entry in self.file_path.rglob("*")
            if entry.is_file() and entry.name.endswith(self.file_type)
        ]

    def get_dates(self, files: list) -> list:
        """Return each file's modification date as a date string in the
        local timezone, in the same order as *files*."""
        return [
            pendulum.from_timestamp(os.path.getmtime(Path(x)))
            .in_tz(self.tz)
            .to_date_string()
            for x in files
        ]
# Scan the claims folder under the user's home directory for Excel workbooks.
file_path = Path(f"{Path.home()}/Desktop/do/Claims/globmove/")
file_type = ".xlsx"
data = FileDates(
    file_path=file_path,
    file_type=file_type,
).main()
CodePudding user response:
Here are a few suggestions on how you could simplify your code and get it to do what you want. See also the comments in the code. Also, make sure your code's indentation is correct and doesn't mix tabs and spaces.
# for convenient handling of file paths:
from pathlib import Path
import pandas as pd
# There is no need to change the current working directory;
# pointing a Path at the source folder is potentially enough.
src = Path('C:/Users/jj/Desktop/do/Claims/globmove')
# pandas datetime is good if you work with pandas anyway
# and has all you need here:
def modification_date(file):
    """Return the UTC date/time at which *file* was last modified.

    Args:
        file: ``pathlib.Path`` (or path string) of an existing file.

    Returns:
        A timezone-naive ``pandas.Timestamp`` expressing the mtime in UTC.

    Raises:
        OSError: If the file cannot be stat'ed.
    """
    # stat() follows symlinks, so a linked workbook reports the workbook's
    # mtime rather than the link's own (which is what lstat() would give).
    return pd.to_datetime(Path(file).stat().st_mtime, unit="s")
def read_files(filenames):
    """Load every workbook in *filenames* and stack them into one DataFrame.

    Each workbook is read with ``read_sheets`` (defined elsewhere) and gets
    two extra columns: ``Filename`` (the path's final component) and
    ``ModificationDate`` (the value returned by ``modification_date``).

    Args:
        filenames: Iterable of ``pathlib.Path`` objects, one per workbook.

    Returns:
        One DataFrame with a fresh integer index, or an empty DataFrame when
        *filenames* yields nothing.
    """
    frames = []
    for filename in filenames:
        frame = read_sheets(filename)
        frame['Filename'] = filename.name
        # here's the call to add the mod date
        frame['ModificationDate'] = modification_date(filename)
        frames.append(frame)
    # pd.concat raises ValueError on an empty list, so short-circuit instead.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)
# Collect every .xlsx workbook directly inside src, then load them all.
files = [*src.glob("*.xlsx")]
dfooc = read_files(files)
Note that this code is not tested as I do not have your input and don't know what read_sheets
is doing exactly.