Given two lists that contain the full paths to multiple files:
list1=['fow/fol/TWO_BB_P3.xml', 'fow/fol/N0_AG_ES.xml','fow/fol/TWO_AG_GH.xml', 'fow/fol/TWO_AG_EY.xml']
and
list2 =['gt/so/TWO_AG_EY.txt','gt/so/TWO_AG_GH.txt','gt/so/TWO_BB_P3.txt']
I would like to pair (group) the paths from the two lists whenever they share the same
file name, ignoring the directory and the extension.
For example, I would like to group
'xml_path' = {str} 'fow/fol/TWO_AG_EY.xml'
'txt_path' = {str} 'gt/so/TWO_AG_EY.txt'
or
'xml_path' = {str} 'fow/fol/TWO_AG_GH.xml'
'txt_path' = {str} 'gt/so/TWO_AG_GH.txt'
To achieve this, I rely on the following code, which splits each path to get the file name
and then uses pandas' merge capability.
However, I wonder whether there is a more compact and efficient way of doing this.
Here is what I tried so far
import os
import re
import pandas as pd
def sort_path(path_all):
    """Sort a list of path strings in place in natural order and return it.

    Runs of digits are compared by numeric value, so 'f2' sorts before
    'f10'; everything else is compared as plain text.

    Args:
        path_all: list of str to sort. The list is modified in place.

    Returns:
        The same list object, now sorted.
    """
    def natural_key(path):
        # The capturing group makes re.split keep the digit runs, so the
        # pieces strictly alternate: text, digits, text, digits, ...
        # Odd positions are therefore always digit runs matched by \d+,
        # which keeps int/str positions aligned between any two keys.
        pieces = re.split(r'(\d+)', path)
        return [int(piece) if pos % 2 else piece
                for pos, piece in enumerate(pieces)]

    path_all.sort(key=natural_key)
    return path_all
def split_sbj_id(s, idx):
    """Build a one-path record keyed by the file's base name.

    Args:
        s: path string to a file.
        idx: 1 stores the path under 'xml_path'; any other value stores it
            under 'txt_path'.

    Returns:
        A dict with 'fname' (the final path component cut at its first
        '.') and the original path under the selected key.
    """
    file_name = os.path.basename(s)
    # Everything before the first dot is used as the identifier.
    stem = file_name.split('.')[0]
    path_key = 'txt_path' if idx != 1 else 'xml_path'
    return {'fname': stem, path_key: s}
def merge_path(ls_eb, ls_mff):
    """Pair XML paths with TXT paths that share the same base file name.

    Every path in ``ls_eb`` is kept (left join); a path with no partner in
    ``ls_mff`` gets NaN as its 'txt_path'.

    Args:
        ls_eb: list of paths stored under 'xml_path'.
        ls_mff: list of paths stored under 'txt_path'.

    Returns:
        A list of dicts with the keys 'fname', 'xml_path' and 'txt_path',
        one per merged row.
    """
    # Explicit columns keep the 'fname' join key present even when one of
    # the input lists is empty; otherwise the merge raises KeyError.
    xml_df = pd.DataFrame([split_sbj_id(path, 1) for path in ls_eb],
                          columns=['fname', 'xml_path'])
    txt_df = pd.DataFrame([split_sbj_id(path, 2) for path in ls_mff],
                          columns=['fname', 'txt_path'])
    merged = pd.merge(xml_df, txt_df, on='fname', how='left')
    # One dict per row, without a Python-level iterrows loop.
    return merged.to_dict('records')
# Sample XML paths; 'N0_AG_ES' has no TXT counterpart below.
list1 = [
    'fow/fol/TWO_BB_P3.xml',
    'fow/fol/N0_AG_ES.xml',
    'fow/fol/TWO_AG_GH.xml',
    'fow/fol/TWO_AG_EY.xml',
]
# Sample TXT paths, listed in a different order than the XML paths.
list2 = [
    'gt/so/TWO_AG_EY.txt',
    'gt/so/TWO_AG_GH.txt',
    'gt/so/TWO_BB_P3.txt',
]
# Pair the two lists by base file name.
path_sbj = merge_path(list1, list2)
Which produces
[{'fname': 'TWO_BB_P3', 'xml_path': 'fow/fol/TWO_BB_P3.xml', 'txt_path': 'gt/so/TWO_BB_P3.txt'}, {'fname': 'N0_AG_ES', 'xml_path': 'fow/fol/N0_AG_ES.xml', 'txt_path': nan}, {'fname': 'TWO_AG_GH', 'xml_path': 'fow/fol/TWO_AG_GH.xml', 'txt_path': 'gt/so/TWO_AG_GH.txt'}, {'fname': 'TWO_AG_EY', 'xml_path': 'fow/fol/TWO_AG_EY.xml', 'txt_path': 'gt/so/TWO_AG_EY.txt'}]
CodePudding user response:
Pandas approach
# Load each list into its own single-column frame.
s1 = pd.DataFrame({'xml_path': list1})
s2 = pd.DataFrame({'txt_path': list2})
# Capture the file name between the last '/' and the .xml/.txt extension.
regex = r'([^/]+)\.(?:xml|txt)$'
# expand=False returns a Series for the single capture group.
s1['fname'] = s1['xml_path'].str.extract(regex, expand=False)
s2['fname'] = s2['txt_path'].str.extract(regex, expand=False)
# Outer merge on the shared 'fname' column keeps unmatched paths from both
# lists. The orient is spelled out because the short alias 'r' is no longer
# accepted by current pandas.
s1.merge(s2, on='fname', how='outer').to_dict('records')
[{'fname': 'TWO_BB_P3',
'txt_path': 'gt/so/TWO_BB_P3.txt',
'xml_path': 'fow/fol/TWO_BB_P3.xml'},
{'fname': 'N0_AG_ES', 'txt_path': nan, 'xml_path': 'fow/fol/N0_AG_ES.xml'},
{'fname': 'TWO_AG_GH',
'txt_path': 'gt/so/TWO_AG_GH.txt',
'xml_path': 'fow/fol/TWO_AG_GH.xml'},
{'fname': 'TWO_AG_EY',
'txt_path': 'gt/so/TWO_AG_EY.txt',
'xml_path': 'fow/fol/TWO_AG_EY.xml'}]