Given data similar to the following:
['blah_12_1_bbc_services_cbbc',
'blah_12_1_high-profile_and_a',
'blah_12_1_iplayer,_known',
'blah_12_1_sport,_as_co-branded',
'er_ds_such_it',
'er_ds_websites_bbc_video',
'er_ds_bbc',
'er_ds_service._sport,',
'th_ss_13_a',
"th_ss_13_iplayer,_large_bbc's",
'th_ss_13_the_a_co-branded',
"th_ss_13_the_bbc's_bbc's"]
I'd like to create a list as:
['blah_12_1_',
'blah_12_1_',
'blah_12_1_',
'blah_12_1_',
'er_ds_',
'er_ds_',
'er_ds_',
'er_ds_',
'th_ss_13_',
'th_ss_13_',
'th_ss_13_',
'th_ss_13_']
Given that the substrings to extract have differing lengths and structures I'm not sure how to go about this.
CodePudding user response:
This should meet your requirements.
# Imports
import os
import numpy as np
list_str = ['blah_12_1_bbc_services_cbbc',
'blah_12_1_high-profile_and_a',
'blah_12_1_iplayer,_known',
'blah_12_1_sport,_as_co-branded',
'er_ds_such_it',
'er_ds_websites_bbc_video',
'er_ds_bbc',
'er_ds_service._sport,',
'th_ss_13_a',
"th_ss_13_iplayer,_large_bbc's",
'th_ss_13_the_a_co-branded',
"th_ss_13_the_bbc's_bbc's"]
def get_common_substring(list_str):
# List sorted
list_sorted = list(np.sort(list_str))
# Necessary to check if the new common string already exists in a more contracted form
# (i.e for ["th_ss_13_iplayer,_large_bbc's", 'th_ss_13_the_a_co-branded', "th_ss_13_the_bbc's_bbc's"]
# we want "th_ss_13_" not "th_ss_13_the" )
last_common_substring = None
# Association between items and common substrings
common_substrings = {}
# result as a list as expected
results = []
for i in range(len(list_sorted) - 1):
# Iteration two by two items
pair_items = list_sorted[i:i 2]
# Search for common substring
common_substring = os.path.commonprefix(pair_items)
if len(common_substring) > 0:
if last_common_substring is not None and last_common_substring in common_substring:
# Necessary to check if the new common string already exists in a more contracted form
common_substring = last_common_substring
# We add the association for the pair of items
common_substrings[pair_items[0]] = common_substring
common_substrings[pair_items[1]] = common_substring
# We keep trace of the last common substring
last_common_substring = common_substring
else:
if pair_items[0] not in common_substrings:
# This is the case when there are no common substrings
common_substrings[pair_items[0]] = None
# To have the result as a list as expected
for item in common_substrings:
results.append(common_substrings[item])
return results
get_common_substring(list_str)
CodePudding user response:
You can use recursion:
from collections import defaultdict
def paths(p, s = '', c = None):
d = defaultdict(list)
for a, *b in p:
d[a].extend(b if not b else [b])
if c is None or len(d) == 1:
yield from [j for a, b in d.items() for j in paths(b, s=s a '_', c=c if c is not None else len(b))]
else:
yield from [s]*c
data = ['blah_12_1_bbc_services_cbbc', 'blah_12_1_high-profile_and_a', 'blah_12_1_iplayer,_known', 'blah_12_1_sport,_as_co-branded', 'er_ds_such_it', 'er_ds_websites_bbc_video', 'er_ds_bbc', 'er_ds_service._sport,', 'th_ss_13_a', "th_ss_13_iplayer,_large_bbc's", 'th_ss_13_the_a_co-branded', "th_ss_13_the_bbc's_bbc's"]
r = list(paths([i.split('_') for i in data]))
Output:
['blah_12_1_',
'blah_12_1_',
'blah_12_1_',
'blah_12_1_',
'er_ds_',
'er_ds_',
'er_ds_',
'er_ds_',
'th_ss_13_',
'th_ss_13_',
'th_ss_13_',
'th_ss_13_']