I have two lists of strings:
data_1 = ['The art is performed by james john.', 'art is quite silent']
data_2 = ['The art is performed by hans.', 'art is very quite silent']
I want to remove, from each pair of corresponding strings, the words the two strings share, and return two separate lists:
result_1 = ['james john', '']
result_2 = ['hans', 'very']
I tried it this way:
print([' '.join(set(i.split()).difference(set(data_1))) for i in data_2])
How can I obtain results like result_1 and result_2?
CodePudding user response:
You could try using numpy's setdiff1d function (note that it returns a sorted array of the unique words, so the original word order is not kept). Like:
import numpy as np

difference_1 = [" ".join(list(np.setdiff1d(np.array(x.split()), np.array(y.split())))) for x, y in zip(data_1, data_2)]
Using set.difference() should also work, though a plain set is unordered, so the joined words can come out in any order:
difference_1 = [" ".join(set(x.split()).difference(set(z.split()))) for x, z in zip(data_1, data_2)]
CodePudding user response:
First, tokenize the sentences using nltk (word_tokenize needs the punkt tokenizer data, available via a one-time nltk.download('punkt')):
from nltk import word_tokenize
def list_tokenize(data):
    return [word_tokenize(sentence) for sentence in data]
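For reference, word_tokenize splits punctuation into separate tokens, which is why the final period later gets removed as a common token instead of staying attached to a name:
from nltk import word_tokenize

print(word_tokenize('The art is performed by james john.'))
# ['The', 'art', 'is', 'performed', 'by', 'james', 'john', '.']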
Then get the common words for each pair of sentences:
def get_common_words(data_1_tokenized, data_2_tokenized):
    return [
        list(set.intersection(set(sentence_1), set(sentence_2)))
        for sentence_1, sentence_2 in zip(data_1_tokenized, data_2_tokenized)
    ]
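With the sample data this yields one list of shared tokens per sentence pair (the order inside each inner list may vary, since it comes from a set):
common_words = get_common_words(list_tokenize(data_1), list_tokenize(data_2))
# e.g. [['The', 'art', 'is', 'performed', 'by', '.'], ['art', 'is', 'quite', 'silent']]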
Then remove the common words from each tokenized sentence:
def remove_common_words(data, common_words):
    result = []
    for i in range(len(data)):
        result.append(
            " ".join([word for word in data[i] if word not in common_words[i]]))
    return result
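Since common_words[i] is a plain list, each "word not in common_words[i]" check is a linear scan. For long sentences, a variant of the same function (an optional tweak, not in the original answer) that converts to a set first is faster:
def remove_common_words(data, common_words):
    result = []
    for sentence, common in zip(data, common_words):
        common_set = set(common)  # O(1) membership tests instead of list scans
        result.append(" ".join(word for word in sentence if word not in common_set))
    return result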
Finally, a combined function to get the unique words:
def get_unique(data_1, data_2):
    data_1_tokenized = list_tokenize(data_1)
    data_2_tokenized = list_tokenize(data_2)
    common_words = get_common_words(data_1_tokenized, data_2_tokenized)
    result1 = remove_common_words(data_1_tokenized, common_words)
    result2 = remove_common_words(data_2_tokenized, common_words)
    return result1, result2
Final usage:
data_1 = ['The art is performed by james john.', 'art is quite silent']
data_2 = ['The art is performed by hans.', 'art is very quite silent']
result1,result2 = get_unique(data_1,data_2)
Results:
result1 = ['james john', '']
result2 = ['hans', 'very']
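As a quick sanity check (assuming the punkt data is installed), the whole pipeline reproduces the expected output:
assert get_unique(data_1, data_2) == (['james john', ''], ['hans', 'very'])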