Consolidate multiple tsv files from a folder into one using python


I have multiple tsv files in a folder, like 2018Q1.tsv, 2018Q2.tsv, 2018Q3.tsv and so on. Inside each file, the values in a row are separated by '\t' and the rows are separated by '\n'.

I want to consolidate all the tsv files inside the folder into one single file, keeping all the rows and adding the source filename as a new column, in python.

import os
import pandas as pd

# read the path
cwd = os.path.abspath(r'path/to/directory')
# list all the files in the directory
file_list = os.listdir(cwd)
file_list

columns = ['CIK_number', 'Companyname', 'FilingType', 'Filingdate', 'filingtext', 'filingurl']

for file in file_list:
    (pd.concat({f.rsplit('.')[0]: pd.read_csv(f, sep='|', header=None, names=columns)
                for f in file_list}, names=['Date'])
       .reset_index(0)
       .to_csv('output_file.tsv', index=False)
    )

Sample Input:

2018Q1.tsv
------------

860585|RBS PARTNERS L P /CT|13FCONP|1993-02-11|edgar/data/860585/9999999997-04-035713.txt|edgar/data/860585/9999999997-04-035713-index.html

2018Q2.tsv
-------------

926688|SMITH THOMAS W|13F-HR|1993-02-12|edgar/data/926688/9999999997-05-015654.txt|edgar/data/926688/9999999997-05-015654-index.html

Sample consolidated output:
---------------

Date,CIK_number,Companyname,FilingType,Filingdate,filingtext,filingurl
2018Q1,860585,RBS PARTNERS L P /CT,13FCONP,1993-02-11,edgar/data/860585/9999999997-04-035713.txt,edgar/data/860585/9999999997-04-035713-index.html
2018Q2,926688,SMITH THOMAS W,13F-HR,1993-02-12,edgar/data/926688/9999999997-05-015654.txt,edgar/data/926688/9999999997-05-015654-index.html

Traceback:

FileNotFoundError                         Traceback (most recent call last)
Input In [25], in <cell line: 3>()
      1 columns = ['CIK_number', 'Companyname', 'FilingType', 'Filingdate', 'filingtext', 'filingurl']
----> 3 pd.concat({f.rsplit('.')[0]: pd.read_csv(f, sep='|', header=None, names=columns) for f in file_list}, names=['Date']).reset_index(0).to_csv('output_file.tsv', index=False)

Input In [25], in <dictcomp>(.0)
      1 columns = ['CIK_number', 'Companyname', 'FilingType', 'Filingdate', 'filingtext', 'filingurl']
----> 3 pd.concat({f.rsplit('.')[0]: pd.read_csv(f, sep='|', header=None, names=columns) for f in file_list}, names=['Date']).reset_index(0).to_csv('output_file.tsv', index=False)

File C:\ProgramData\Anaconda3\lib\site-packages\pandas\util\_decorators.py:311, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
    305 if len(args) > num_allow_args:
    306     warnings.warn(
    307         msg.format(arguments=arguments),
    308         FutureWarning,
    309         stacklevel=stacklevel,
    310     )
--> 311 return func(*args, **kwargs)

File C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\readers.py:680, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
    665 kwds_defaults = _refine_defaults_read(
    666     dialect,
    667     delimiter,
   (...)
    676     defaults={"delimiter": ","},
    677 )
    678 kwds.update(kwds_defaults)
--> 680 return _read(filepath_or_buffer, kwds)

File C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\readers.py:575, in _read(filepath_or_buffer, kwds)
    572 _validate_names(kwds.get("names", None))
    574 # Create the parser.
--> 575 parser = TextFileReader(filepath_or_buffer, **kwds)
    577 if chunksize or iterator:
    578     return parser

File C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\readers.py:933, in TextFileReader.__init__(self, f, engine, **kwds)
    930     self.options["has_index_names"] = kwds["has_index_names"]
    932 self.handles: IOHandles | None = None
--> 933 self._engine = self._make_engine(f, self.engine)

File C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\readers.py:1217, in TextFileReader._make_engine(self, f, engine)
   1213     mode = "rb"
   1214 # error: No overload variant of "get_handle" matches argument types
   1215 # "Union[str, PathLike[str], ReadCsvBuffer[bytes], ReadCsvBuffer[str]]"
   1216 # , "str", "bool", "Any", "Any", "Any", "Any", "Any"
-> 1217 self.handles = get_handle(  # type: ignore[call-overload]
   1218     f,
   1219     mode,
   1220     encoding=self.options.get("encoding", None),
   1221     compression=self.options.get("compression", None),
   1222     memory_map=self.options.get("memory_map", False),
   1223     is_text=is_text,
   1224     errors=self.options.get("encoding_errors", "strict"),
   1225     storage_options=self.options.get("storage_options", None),
   1226 )
   1227 assert self.handles is not None
   1228 f = self.handles.handle

File C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\common.py:789, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    784 elif isinstance(handle, str):
    785     # Check whether the filename is to be opened in binary mode.
    786     # Binary mode does not support 'encoding' and 'newline'.
    787     if ioargs.encoding and "b" not in ioargs.mode:
    788         # Encoding
--> 789         handle = open(
    790             handle,
    791             ioargs.mode,
    792             encoding=ioargs.encoding,
    793             errors=errors,
    794             newline="",
    795         )
    796     else:
    797         # Binary mode
    798         handle = open(handle, ioargs.mode)

FileNotFoundError: [Errno 2] No such file or directory: '1993-QTR1.tsv'

I am not able to consolidate the files. Please help.

Thank you

CodePudding user response:

You can use a list comprehension and concat, chained with to_csv:

(pd.concat([pd.read_csv(f, sep='\t') for f in file_list],
           ignore_index=True  # optional, renumbers rows instead of keeping each file's own index
          )
   .to_csv('output_file.tsv', sep='\t',
           index=False # optional, if you don't want the index in the output
           )
)
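Note that the FileNotFoundError in the question comes from reading the bare names returned by os.listdir: read_csv then looks for them in the current working directory rather than in path/to/directory. A minimal sketch of building full paths first, keeping the question's placeholder directory:

import os
import pandas as pd

directory = r'path/to/directory'
# os.listdir returns bare filenames, so join each one back onto the directory
file_list = [os.path.join(directory, f)
             for f in os.listdir(directory) if f.endswith('.tsv')]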

Update with a real example:

file_list = ['2018Q1.tsv', '2018Q2.tsv']
columns = ['CIK_number', 'Companyname', 'FilingType', 'Filingdate', 'filingtext', 'filingurl']

(pd.concat({f.rsplit('.')[0]: pd.read_csv(f, sep='|', header=None, names=columns)
            for f in file_list}, names=['Date'])
   .reset_index(0)
   .to_csv('output_file.tsv', index=False)
)
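Passing a dict to pd.concat builds a MultiIndex whose outer level holds the dict keys (here the filenames with the extension stripped by rsplit); names=['Date'] labels that level, and reset_index(0) turns it into the regular Date column seen below.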

Output file:

Date,CIK_number,Companyname,FilingType,Filingdate,filingtext,filingurl
2018Q1,860585,RBS PARTNERS L P /CT,13FCONP,1993-02-11,edgar/data/860585/9999999997-04-035713.txt,edgar/data/860585/9999999997-04-035713-index.html
2018Q2,926688,SMITH THOMAS W,13F-HR,1993-02-12,edgar/data/926688/9999999997-05-015654.txt,edgar/data/926688/9999999997-05-015654-index.html

CodePudding user response:

import pandas as pd
from glob import glob
from pathlib import Path

data_files = sorted(glob(r'path/to/directory/*.tsv'))
data_files

columns = ['CIK_number', 'Companyname', 'FilingType', 'Filingdate', 'filingtext', 'filingurl']
mergedata = pd.concat(pd.read_csv(datafile, sep='|', header=None, names=columns)
                          .assign(Year_Quarter=Path(datafile).stem)  # filename without extension, e.g. 2018Q1
                      for datafile in data_files)
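
This stops at the merged frame; a minimal sketch of the final write, reusing the output_file.tsv name from the first answer:

# write the consolidated frame as a tab-separated file, without the row index
mergedata.to_csv('output_file.tsv', sep='\t', index=False)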