I am trying to merge multiple CSV files in one directory into a single CSV file. All files share the same header, but in the merged file it is hard to tell which source file each row came from. I have also fixed the columns I want for my purpose. Is there any way to uniquely identify the rows using the following code?
import csv
from pathlib import Path
p = Path(r'E:\Neurogen\Merging_test_data')
file_list = p.glob("*.csv")
desired_columns = ['Chr', 'Start', 'End', 'Ref', 'Alt', 'Func.refGene', 'Gene.refGene', 'GeneDetail.refGene', 'ExonicFunc.refGene', 'AAChange.refGene', 'Xref.refGene', 'cytoBand', 'cosmic70', 'avsnp147', 'ExAC_ALL', 'ExAC_AFR', 'ExAC_AMR', 'ExAC_EAS', 'ExAC_FIN', 'ExAC_NFE', 'ExAC_OTH', 'ExAC_SAS', 'CLINSIG', 'CLNDBN', 'CLNACC', 'CLNDSDB', 'CLNDSDBID', '1000g2015aug_all', 'SIFT_score', 'SIFT_pred', 'Polyphen2_HDIV_score', 'Polyphen2_HDIV_pred', 'Polyphen2_HVAR_score', 'Polyphen2_HVAR_pred', 'LRT_score', 'LRT_pred', 'MutationTaster_score', 'MutationTaster_pred', 'MutationAssessor_score', 'MutationAssessor_pred', 'FATHMM_score', 'FATHMM_pred', 'PROVEAN_score', 'PROVEAN_pred', 'VEST3_score', 'CADD_raw', 'CADD_phred', 'DANN_score', 'fathmm-MKL_coding_score', 'fathmm-MKL_coding_pred', 'MetaSVM_score', 'MetaSVM_pred', 'MetaLR_score', 'MetaLR_pred', 'integrated_fitCons_score', 'integrated_confidence_value', 'GERP _RS', 'phyloP7way_vertebrate', 'phyloP20way_mammalian', 'phastCons7way_vertebrate', 'phastCons20way_mammalian', 'SiPhy_29way_logOdds', 'Otherinfo']
desired_rows = []
for csv_file in file_list:
    with open(csv_file, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            desired_rows.append({c: row[c] for c in desired_columns})
with open('merged.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=desired_columns)
    writer.writeheader()
    writer.writerows(desired_rows)
CodePudding user response:
As you did not provide any sample data, I generated some arbitrary files to show the general concept:
a.txt:
col_1;col_2;col_3
1;2;3
4;5;6
7;8;9
b.txt:
col_1;col_2;col_3
10;20;30
40;50;60
70;80;90
Assuming you want to filter on the columns col_1 and col_3, a very basic approach based on the built-in csv module could look like this:
import csv
from pathlib import Path

DIRECTORY = Path(__file__).parent
FILE_SUFFIX = '*.txt'
DESIRED_COLUMNS = ['col_1', 'col_3']

# sort the files by name so the merge order is deterministic
files = sorted(
    DIRECTORY.glob(FILE_SUFFIX),
    key=lambda x: x.name,
)

filtered = []
for f in files:
    # a context manager ensures the file handle is closed after reading
    with f.open() as fh:
        reader = csv.DictReader(fh, delimiter=';')
        for row in reader:
            # keep only the desired columns ...
            d = {k: v for k, v in row.items() if k in DESIRED_COLUMNS}
            # ... and record which file the row came from
            d['from_file'] = f.name
            filtered.append(d)

print(filtered)
# filtered is a list of dicts and can be written to file with csv.DictWriter
The above snippet prints:
[{'col_1': '1', 'col_3': '3', 'from_file': 'a.txt'}, {'col_1': '4', 'col_3': '6', 'from_file': 'a.txt'}, {'col_1': '7', 'col_3': '9', 'from_file': 'a.txt'}, {'col_1': '10', 'col_3': '30', 'from_file': 'b.txt'}, {'col_1': '40', 'col_3': '60', 'from_file': 'b.txt'}, {'col_1': '70', 'col_3': '90', 'from_file': 'b.txt'}]
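For completeness, a minimal sketch of the writing step the last comment mentions; the output file name merged.txt is my choice, not part of the original answer:
import csv

# the extra 'from_file' column must be part of the fieldnames,
# otherwise DictWriter raises a ValueError for the unknown key
fieldnames = DESIRED_COLUMNS + ['from_file']

with open('merged.txt', 'w', newline='') as out:  # output name is an assumption
    writer = csv.DictWriter(out, fieldnames=fieldnames, delimiter=';')
    writer.writeheader()
    writer.writerows(filtered)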
A more elegant solution uses pandas:
import pandas as pd
from pathlib import Path

DIRECTORY = Path(__file__).parent
FILE_SUFFIX = '*.txt'
DESIRED_COLUMNS = ['col_1', 'col_3']

files = sorted(
    DIRECTORY.glob(FILE_SUFFIX),
    key=lambda x: x.name,
)

filtered = []
for f in files:
    # read only the desired columns from each file
    df = pd.read_csv(
        f,
        delimiter=';',
        usecols=DESIRED_COLUMNS,
    )
    # add a column holding the source file name
    df['from_file'] = f.name
    filtered.append(df)
# print(filtered)

concated = pd.concat(filtered, ignore_index=True)
print(concated)
# concated is a pandas.DataFrame. Use `concated.to_csv()` to write it to file
The pandas approach results in:
   col_1  col_3 from_file
0      1      3      a.txt
1      4      6      a.txt
2      7      9      a.txt
3     10     30      b.txt
4     40     60      b.txt
5     70     90      b.txt
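To write the merged frame back out, to_csv works as the last comment suggests; index=False drops pandas' row index, and the output name merged.txt is again my assumption:
concated.to_csv('merged.txt', sep=';', index=False)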