Home > Software design >  How to get unique substrings from list of strings with python
How to get unique substrings from list of strings with python

Time:12-03

Given data similar to the following:

['blah_12_1_bbc_services_cbbc',
 'blah_12_1_high-profile_and_a',
 'blah_12_1_iplayer,_known',
 'blah_12_1_sport,_as_co-branded',
 'er_ds_such_it',
 'er_ds_websites_bbc_video',
 'er_ds_bbc',
 'er_ds_service._sport,',
 'th_ss_13_a',
 "th_ss_13_iplayer,_large_bbc's",
 'th_ss_13_the_a_co-branded',
 "th_ss_13_the_bbc's_bbc's"]

I'd like to create a list as:

['blah_12_1_',
 'blah_12_1_',
 'blah_12_1_',
 'blah_12_1_',
 'er_ds_',
 'er_ds_',
 'er_ds_',
 'er_ds_',
 'th_ss_13_',
 'th_ss_13_',
 'th_ss_13_',
 'th_ss_13_']

Given that the substrings to extract have differing lengths and structures I'm not sure how to go about this.

CodePudding user response:

This should meet your requirements.

# Imports
import os
import numpy as np

list_str = ['blah_12_1_bbc_services_cbbc',
 'blah_12_1_high-profile_and_a',
 'blah_12_1_iplayer,_known',
 'blah_12_1_sport,_as_co-branded',
 'er_ds_such_it',
 'er_ds_websites_bbc_video',
 'er_ds_bbc',
 'er_ds_service._sport,',
 'th_ss_13_a',
 "th_ss_13_iplayer,_large_bbc's",
 'th_ss_13_the_a_co-branded',
 "th_ss_13_the_bbc's_bbc's"]

def get_common_substring(list_str):
    # List sorted
    list_sorted = list(np.sort(list_str))
    
    # Necessary to check if the new common string already exists in a more contracted form 
    # (i.e for ["th_ss_13_iplayer,_large_bbc's", 'th_ss_13_the_a_co-branded', "th_ss_13_the_bbc's_bbc's"]
    # we want "th_ss_13_" not "th_ss_13_the" )
    last_common_substring = None
    
    # Association between items and common substrings
    common_substrings = {}
    
    # result as a list as expected 
    results = []
    
    for i in range(len(list_sorted) - 1):
        # Iteration two by two items
        pair_items = list_sorted[i:i 2]
    
        # Search for common substring
        common_substring = os.path.commonprefix(pair_items)
        
        if len(common_substring) > 0:
            if last_common_substring is not None and last_common_substring in common_substring:
                # Necessary to check if the new common string already exists in a more contracted form 
                common_substring = last_common_substring
            
            # We add the association for the pair of items  
            common_substrings[pair_items[0]] = common_substring
            common_substrings[pair_items[1]] = common_substring

            # We keep trace of the last common substring
            last_common_substring = common_substring
        else:
            if pair_items[0] not in common_substrings:
                # This is the case when there are no common substrings
                common_substrings[pair_items[0]] = None

    # To have the result as a list as expected            
    for item in common_substrings:
        results.append(common_substrings[item])
        
    return results

get_common_substring(list_str)

CodePudding user response:

You can use recursion:

from collections import defaultdict
def paths(p, s = '', c = None):
   d = defaultdict(list)
   for a, *b in p:
      d[a].extend(b if not b else [b])
   if c is None or len(d) == 1:
      yield from [j for a, b in d.items() for j in paths(b, s=s a '_', c=c if c is not None else len(b))]
   else:
      yield from [s]*c

data = ['blah_12_1_bbc_services_cbbc', 'blah_12_1_high-profile_and_a', 'blah_12_1_iplayer,_known', 'blah_12_1_sport,_as_co-branded', 'er_ds_such_it', 'er_ds_websites_bbc_video', 'er_ds_bbc', 'er_ds_service._sport,', 'th_ss_13_a', "th_ss_13_iplayer,_large_bbc's", 'th_ss_13_the_a_co-branded', "th_ss_13_the_bbc's_bbc's"]
r = list(paths([i.split('_') for i in data]))

Output:

['blah_12_1_', 
 'blah_12_1_', 
 'blah_12_1_', 
 'blah_12_1_', 
 'er_ds_', 
 'er_ds_', 
 'er_ds_',  
 'er_ds_', 
 'th_ss_13_', 
 'th_ss_13_', 
 'th_ss_13_', 
 'th_ss_13_']
  • Related