I have a folder of csv files that I'd like to loop over to create individual DataFrames named after the file itself.
So if I have file_1.csv
, file_2.csv
, file_3.csv
... I'd like DataFrames created for each file and have the df named after the file of the data it contains.
Here is what I've tried so far:
# Gather the CSV files under ./Data and derive their extension-free names.
# (Guarded so that a missing data folder does not crash at import time.)
DATA_DIR = "./Data/"
all_files = os.listdir(DATA_DIR) if os.path.isdir(DATA_DIR) else []
# get list of only csv files
csv_files = [f for f in all_files if f.endswith(".csv")]
# remove file extension to get name only (splitext is safer than slicing)
file_names = [os.path.splitext(f)[0] for f in csv_files]

# DataFrames created from each file, in the same order as csv_files
dfs = []

def make_files_dfs(data_dir=DATA_DIR, files=None):
    """Read each CSV under *data_dir* into a DataFrame and append it to dfs.

    Parameters
    ----------
    data_dir : str
        Folder containing the CSV files (defaults to ./Data/).
    files : list[str] | None
        File names to read; defaults to the module-level csv_files list.

    Note: the original called pd.read_csv(eval(f"'Data/{b}'")) — the eval()
    served no purpose (it just re-produced the same path string) and is
    unsafe on untrusted names; build the path with os.path.join instead.
    The zip/if-equality dance was also redundant: file_names is derived
    from csv_files, so the stems always match.
    """
    for fname in (csv_files if files is None else files):
        dfs.append(pd.read_csv(os.path.join(data_dir, fname)))
error log:
--------------------------------------------------------------------------- ParserError Traceback (most recent call
last) ~\AppData\Local\Temp/ipykernel_592/2054074323.py in <module>
----> 1 make_files_dfs()
~\AppData\Local\Temp/ipykernel_592/3264801573.py in make_files_dfs()
3 for a,b in zip(file_names, csv_files):
4 if a == b[:-4]:
----> 5 a = pd.read_csv(eval(f"'Data/{b}'"))
6 dfs.append(a)
~\miniconda3\envs\selenium_env\lib\site-packages\pandas\util\_decorators.py
in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
~\miniconda3\envs\selenium_env\lib\site-packages\pandas\io\parsers\readers.py
in read_csv(filepath_or_buffer, sep, delimiter, header, names,
index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine,
converters, true_values, false_values, skipinitialspace, skiprows,
skipfooter, nrows, na_values, keep_default_na, na_filter, verbose,
skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col,
date_parser, dayfirst, cache_dates, iterator, chunksize, compression,
thousands, decimal, lineterminator, quotechar, quoting, doublequote,
escapechar, comment, encoding, encoding_errors, dialect,
error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace,
low_memory, memory_map, float_precision, storage_options)
584 kwds.update(kwds_defaults)
585
--> 586 return _read(filepath_or_buffer, kwds)
587
588
~\miniconda3\envs\selenium_env\lib\site-packages\pandas\io\parsers\readers.py
in _read(filepath_or_buffer, kwds)
486
487 with parser:
--> 488 return parser.read(nrows)
489
490
~\miniconda3\envs\selenium_env\lib\site-packages\pandas\io\parsers\readers.py
in read(self, nrows) 1045 def read(self, nrows=None): 1046
nrows = validate_integer("nrows", nrows)
-> 1047 index, columns, col_dict = self._engine.read(nrows) 1048 1049 if index is None:
~\miniconda3\envs\selenium_env\lib\site-packages\pandas\io\parsers\c_parser_wrapper.py
in read(self, nrows)
221 try:
222 if self.low_memory:
--> 223 chunks = self._reader.read_low_memory(nrows)
224 # destructive to chunks
225 data = _concatenate_chunks(chunks)
~\miniconda3\envs\selenium_env\lib\site-packages\pandas\_libs\parsers.pyx
in pandas._libs.parsers.TextReader.read_low_memory()
~\miniconda3\envs\selenium_env\lib\site-packages\pandas\_libs\parsers.pyx
in pandas._libs.parsers.TextReader._read_rows()
~\miniconda3\envs\selenium_env\lib\site-packages\pandas\_libs\parsers.pyx
in pandas._libs.parsers.TextReader._tokenize_rows()
~\miniconda3\envs\selenium_env\lib\site-packages\pandas\_libs\parsers.pyx
in pandas._libs.parsers.raise_parser_error()
ParserError: Error tokenizing data. C error: Expected 70 fields in line 7728, saw 74
CodePudding user response:
Your code is a bit difficult to understand, and it contains some unnecessary steps. First of all, it is easier to change the working directory (with os.chdir(path)
. Secondly, you can get rid of your lambda function and use glob.glob
. Lastly, you cannot make a DataFrame named after a variable. Your dfs
list will hold the frames without any indication of which file each one came from, which won't give you much insight. It is much better to use a dictionary. Overall, this is what your code could look like:
import os
import glob

import pandas as pd  # needed for read_csv; missing from the original snippet

# Change the working directory to the data folder (guarded so the
# placeholder path does not crash at import time).
path = "the path to your data"
if os.path.isdir(path):
    os.chdir(path)

# get list of only csv files -- the pattern must be relative ("*.csv");
# the original "/*.csv" searches the filesystem root, not the data folder
csv_files = glob.glob("*.csv")

# key: DataFrame name (file name without ".csv"), value: the DataFrame
dataFrameDictionary = {}

def make_files_dfs(pattern=None):
    """Read every matching CSV into dataFrameDictionary, keyed by file stem.

    Parameters
    ----------
    pattern : str | None
        Optional glob pattern; defaults to the csv_files found at import.

    Bug fixed: the original wrote dataFrameDictionary[a[:-4], pd.read_csv(a)]
    -- a subscription with a tuple key, which raises KeyError -- where the
    assignment d[key] = value was intended.
    """
    for fname in (csv_files if pattern is None else glob.glob(pattern)):
        dataFrameDictionary[fname[:-4]] = pd.read_csv(fname)
CodePudding user response:
I don't understand why your code is so lengthy, but this can be done by following:
# Load each CSV into its own module-level variable df_0, df_1, ...,
# numbered by the file's position in the list.
csv_list = ['file_1.csv', 'file_2.csv', 'file_3.csv']
for idx, csv_name in enumerate(csv_list):  # enumerate, not range(len(...))
    # NOTE(review): injecting names via globals() is fragile -- a dict keyed
    # by file name is the safer pattern -- kept here to match the answer.
    globals()[f"df_{idx}"] = pd.read_csv(csv_name)
Output:
Three dataframes will be created: since the loop index starts at 0, df_0 will hold the 1st file in the list, df_1 the 2nd file, and so on.