I am trying to merge multiple CSV files in one directory into a single CSV file. All files share the same header, but in the merged file it is hard to tell which source file each row came from. I have also fixed the columns I want for my purpose. Is there any way to uniquely identify the rows using the following code?
import csv
from pathlib import Path
p = Path(r'E:\Neurogen\Merging_test_data')
file_list = p.glob("*.csv")
desired_columns = ['Chr', 'Start', 'End', 'Ref', 'Alt', 'Func.refGene', 'Gene.refGene', 'GeneDetail.refGene', 'ExonicFunc.refGene', 'AAChange.refGene', 'Xref.refGene', 'cytoBand', 'cosmic70', 'avsnp147', 'ExAC_ALL', 'ExAC_AFR', 'ExAC_AMR', 'ExAC_EAS', 'ExAC_FIN', 'ExAC_NFE', 'ExAC_OTH', 'ExAC_SAS', 'CLINSIG', 'CLNDBN', 'CLNACC', 'CLNDSDB', 'CLNDSDBID', '1000g2015aug_all', 'SIFT_score', 'SIFT_pred', 'Polyphen2_HDIV_score', 'Polyphen2_HDIV_pred', 'Polyphen2_HVAR_score', 'Polyphen2_HVAR_pred', 'LRT_score', 'LRT_pred', 'MutationTaster_score', 'MutationTaster_pred', 'MutationAssessor_score', 'MutationAssessor_pred', 'FATHMM_score', 'FATHMM_pred', 'PROVEAN_score', 'PROVEAN_pred', 'VEST3_score', 'CADD_raw', 'CADD_phred', 'DANN_score', 'fathmm-MKL_coding_score', 'fathmm-MKL_coding_pred', 'MetaSVM_score', 'MetaSVM_pred', 'MetaLR_score', 'MetaLR_pred', 'integrated_fitCons_score', 'integrated_confidence_value', 'GERP _RS', 'phyloP7way_vertebrate', 'phyloP20way_mammalian', 'phastCons7way_vertebrate', 'phastCons20way_mammalian', 'SiPhy_29way_logOdds', 'Otherinfo']
desired_rows = []
for csv_file in file_list:
    with open(csv_file, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            desired_rows.append({c: row[c] for c in desired_columns})
with open('merged.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=desired_columns)
    writer.writeheader()
    writer.writerows(desired_rows)
CodePudding user response:
As you did not provide any sample data, I generated some arbitrary files to show the general concept:
a.txt:
col_1;col_2;col_3
1;2;3
4;5;6
7;8;9
b.txt:
col_1;col_2;col_3
10;20;30
40;50;60
70;80;90
Assuming you want to filter on the columns col_1 and col_3, a very basic approach based on the built-in csv module could look like this:
import csv
from pathlib import Path

DIRECTORY = Path(__file__).parent
FILE_SUFFIX = '*.txt'
DESIRED_COLUMNS = ['col_1', 'col_3']

# sort the files by name so the merge order is deterministic
files = sorted(
    DIRECTORY.glob(FILE_SUFFIX),
    key=lambda x: x.name,
)

filtered = []
for f in files:
    # a context manager ensures the file handle is closed after reading
    with f.open() as fh:
        reader = csv.DictReader(fh, delimiter=';')
        for row in reader:
            # keep only the desired columns ...
            d = {k: v for k, v in row.items() if k in DESIRED_COLUMNS}
            # ... and record which file the row came from
            d['from_file'] = f.name
            filtered.append(d)

print(filtered)
# filtered is a list of dicts and can be written to file with csv.DictWriter
The above snippet prints:
[{'col_1': '1', 'col_3': '3', 'from_file': 'a.txt'}, {'col_1': '4', 'col_3': '6', 'from_file': 'a.txt'}, {'col_1': '7', 'col_3': '9', 'from_file': 'a.txt'}, {'col_1': '10', 'col_3': '30', 'from_file': 'b.txt'}, {'col_1': '40', 'col_3': '60', 'from_file': 'b.txt'}, {'col_1': '70', 'col_3': '90', 'from_file': 'b.txt'}]
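For completeness, a minimal sketch of the writing step the last comment mentions; the output file name merged.txt is my choice, not part of the original answer:
import csv

# the extra 'from_file' column must be part of the fieldnames,
# otherwise DictWriter raises a ValueError for the unknown key
fieldnames = DESIRED_COLUMNS + ['from_file']

with open('merged.txt', 'w', newline='') as out:  # output name is an assumption
    writer = csv.DictWriter(out, fieldnames=fieldnames, delimiter=';')
    writer.writeheader()
    writer.writerows(filtered)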
A more elegant solution uses pandas:
import pandas as pd
from pathlib import Path

DIRECTORY = Path(__file__).parent
FILE_SUFFIX = '*.txt'
DESIRED_COLUMNS = ['col_1', 'col_3']

files = sorted(
    DIRECTORY.glob(FILE_SUFFIX),
    key=lambda x: x.name,
)

filtered = []
for f in files:
    # read only the desired columns from each file
    df = pd.read_csv(
        f,
        delimiter=';',
        usecols=DESIRED_COLUMNS,
    )
    # add a column holding the source file name
    df['from_file'] = f.name
    filtered.append(df)
# print(filtered)

concated = pd.concat(filtered, ignore_index=True)
print(concated)
# concated is a pandas.DataFrame. Use `concated.to_csv()` to write it to file
The pandas approach results in:
   col_1  col_3 from_file
0      1      3      a.txt
1      4      6      a.txt
2      7      9      a.txt
3     10     30      b.txt
4     40     60      b.txt
5     70     90      b.txt
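To write the merged frame back out, to_csv works as the last comment suggests; index=False drops pandas' row index, and the output name merged.txt is again my assumption:
concated.to_csv('merged.txt', sep=';', index=False)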