Group path by filename from two list in Python-CodePudding

Given two lists which contain the full paths to multiple files

list1=['fow/fol/TWO_BB_P3.xml', 'fow/fol/N0_AG_ES.xml','fow/fol/TWO_AG_GH.xml', 'fow/fol/TWO_AG_EY.xml']

and

list2 =['gt/so/TWO_AG_EY.txt','gt/so/TWO_AG_GH.txt','gt/so/TWO_BB_P3.txt']

I would like to sync/group the paths from both lists that share the same filename (ignoring the directory and the extension).

For example, I would like to group

'xml_path' = {str} 'fow/fol/TWO_AG_EY.xml'
'txt_path' = {str} 'gt/so/TWO_AG_EY.txt'

'xml_path' = {str} 'fow/fol/TWO_AG_GH.xml'
'txt_path' = {str} 'gt/so/TWO_AG_GH.txt'

To achieve this, I rely on the following code, which involves splitting the paths and using pandas' merge capability.

However, I wonder whether there is a more compact and efficient way of doing this.

Here is what I tried so far

import os
import re

import pandas as pd


def sort_path(path_all):
    """Sort a list of path strings in place in natural order and return it.

    Runs of decimal digits are compared by their integer value, so
    ``'f2'`` sorts before ``'f10'``; everything else is compared as text.

    Args:
        path_all: Mutable list of strings. It is sorted in place.

    Returns:
        The same list object that was passed in, now sorted.
    """
    def natural_key(path):
        # Splitting on a capturing group keeps the digit runs in the result:
        # text chunks land on even indices and digit runs on odd indices.
        # Converting by position (rather than by ``str.isdigit``) guarantees
        # that ints are only ever compared with ints and strs with strs.
        chunks = re.split(r'(\d+)', path)
        return [int(chunk) if pos % 2 else chunk
                for pos, chunk in enumerate(chunks)]

    path_all.sort(key=natural_key)
    return path_all

def split_sbj_id(s, idx):
    """Build a record that pairs a path with the name it should be grouped by.

    Args:
        s: Path string to a file.
        idx: ``1`` stores the path under ``'xml_path'``; any other value
            stores it under ``'txt_path'``.

    Returns:
        A dict with ``'fname'`` (the final path component cut at its first
        dot) and one path entry holding ``s`` unchanged.
    """
    # Everything before the first '.' of the last path component.
    stem = os.path.basename(s).partition('.')[0]
    path_key = 'txt_path' if idx != 1 else 'xml_path'
    return {'fname': stem, path_key: s}


def merge_path(ls_eb, ls_mff):
    """Pair each path in ``ls_eb`` with the path in ``ls_mff`` of the same name.

    Both lists are turned into records by ``split_sbj_id`` and left-joined on
    ``'fname'``, so every entry of ``ls_eb`` is kept; entries without a
    counterpart in ``ls_mff`` get ``NaN`` as their ``'txt_path'``.

    Args:
        ls_eb: Paths stored under ``'xml_path'`` (left side of the join).
        ls_mff: Paths stored under ``'txt_path'`` (right side of the join).

    Returns:
        A list of dicts with the keys ``'fname'``, ``'xml_path'`` and
        ``'txt_path'``, one per row of the joined table.
    """
    # Naming the columns up front keeps 'fname' present even when one of the
    # lists is empty; without it the merge fails with a KeyError.
    xml_frame = pd.DataFrame([split_sbj_id(path, 1) for path in ls_eb],
                             columns=['fname', 'xml_path'])
    txt_frame = pd.DataFrame([split_sbj_id(path, 2) for path in ls_mff],
                             columns=['fname', 'txt_path'])

    merged = pd.merge(xml_frame, txt_frame, on='fname', how='left')

    # One dict per row, without a Python-level iterrows() loop.
    return merged.to_dict('records')



# Sample input: the XML paths that drive the grouping.
list1 = [
    'fow/fol/TWO_BB_P3.xml',
    'fow/fol/N0_AG_ES.xml',
    'fow/fol/TWO_AG_GH.xml',
    'fow/fol/TWO_AG_EY.xml',
]

# Sample input: the TXT paths to attach to the XML paths by filename.
list2 = [
    'gt/so/TWO_AG_EY.txt',
    'gt/so/TWO_AG_GH.txt',
    'gt/so/TWO_BB_P3.txt',
]

# One record per XML path, carrying the matching TXT path when there is one.
path_sbj = merge_path(list1, list2)

Which produces

[{'fname': 'TWO_BB_P3', 'xml_path': 'fow/fol/TWO_BB_P3.xml', 'txt_path': 'gt/so/TWO_BB_P3.txt'}, {'fname': 'N0_AG_ES', 'xml_path': 'fow/fol/N0_AG_ES.xml', 'txt_path': nan}, {'fname': 'TWO_AG_GH', 'xml_path': 'fow/fol/TWO_AG_GH.xml', 'txt_path': 'gt/so/TWO_AG_GH.txt'}, {'fname': 'TWO_AG_EY', 'xml_path': 'fow/fol/TWO_AG_EY.xml', 'txt_path': 'gt/so/TWO_AG_EY.txt'}]

CodePudding user response：

Pandas approach

# One single-column frame per input list.
s1 = pd.DataFrame({'xml_path': list1})
s2 = pd.DataFrame({'txt_path': list2})

# Capture the last path component without its .xml/.txt extension:
# one or more non-slash characters immediately before the final suffix.
regex = r'([^/]+)\.(?:xml|txt)$'
s1['fname'] = s1['xml_path'].str.extract(regex)
s2['fname'] = s2['txt_path'].str.extract(regex)

# The frames share only the 'fname' column, so merge() joins on it.
# 'records' is spelled out because the short form 'r' was removed in pandas 2.0.
s1.merge(s2, how='outer').to_dict('records')

[{'fname': 'TWO_BB_P3',
  'txt_path': 'gt/so/TWO_BB_P3.txt',
  'xml_path': 'fow/fol/TWO_BB_P3.xml'},
 {'fname': 'N0_AG_ES', 'txt_path': nan, 'xml_path': 'fow/fol/N0_AG_ES.xml'},
 {'fname': 'TWO_AG_GH',
  'txt_path': 'gt/so/TWO_AG_GH.txt',
  'xml_path': 'fow/fol/TWO_AG_GH.xml'},
 {'fname': 'TWO_AG_EY',
  'txt_path': 'gt/so/TWO_AG_EY.txt',
  'xml_path': 'fow/fol/TWO_AG_EY.xml'}]