Get the location of word in text and the distance of the same word to the next similar word-CodePudding

I have been trying to pass a list of words to my script and then search a text for each of them, reporting: the word, how many times it is repeated, its location in the text, and the distance from its first occurrence to the other occurrences of the same word.
So far I have coded the first two things, but I can't find the words' locations or their distances to the other occurrences of the same word.
my code :

import re

# Dummy text
long_string = "one Groups are marked by the ()meta-characters. two They group together the expressions contained one inside them, and you can one repeat the contents of a group with a repeating qualifier, such as there"
search_list = ['one', 'two', 'there']

# Escape every term before joining so that a search word containing regex
# metacharacters (e.g. "c++" or "a.b") is matched literally instead of being
# interpreted as pattern syntax.
search_pattern = re.compile('|'.join(map(re.escape, search_list)), re.IGNORECASE)

# findall returns the matched substrings in the order they occur in the text.
find_words = search_pattern.findall(long_string)
if find_words:
    print('len_of_words : {}'.format(len(find_words)), find_words)

output :

len_of_words : 5 ['one', 'two', 'one', 'one', 'there']

But I can't get the locations of the found words in the text.

CodePudding user response：

  from collections import Counter

finditer can find the start and the end of the match.

long_string = "one Groups are marked by the ()meta-characters. two They group together the expressions contained one inside them, and you can one repeat the contents of a group with a repeating qualifier, such as there"

# Compile once; finditer yields match objects that expose start()/group().
p = re.compile(r'one|two|there')

# Scan the text a single time so that positions, words and counts are all
# derived from exactly the same set of hits.
matches = list(p.finditer(long_string))
indexes = [m.start() for m in matches]
words = [m.group() for m in matches]

# Count the regex hits themselves rather than whitespace-split tokens: a token
# such as "one," or "someone" would otherwise make the count disagree with
# what the pattern actually matched.
counter = Counter(words)
counts = [counter[word] for word in words]


print(list(zip(indexes, words, counts)))

[(0, 'one', 3), (48, 'two', 1), (98, 'one', 3), (127, 'one', 3), (198, 'there', 1)]

another variation that also has a list of index differences for the same word.

from collections import Counter, defaultdict 
from itertools import combinations
import pprint


# Maps each matched word to the list of character offsets where it starts.
d = defaultdict(list)

# Materialise the matches once so counting and indexing see the same hits.
matches = list(p.finditer(long_string))

indexes = [m.start() for m in matches]
words = [m.group() for m in matches]

# Count the regex hits themselves rather than whitespace-split tokens, so a
# hit adjacent to punctuation (or inside a longer token) is still counted
# consistently with what the pattern matched.
counter = Counter(words)
counts = [counter[word] for word in words]

# Group the start offsets by word, preserving the order of appearance.
for word_index, word in zip(indexes, words):
    d[word].append(word_index)


def index_distance(d):
    """Attach pairwise offset distances to every word's list of positions.

    For each key, the value (a list of integer offsets) is replaced in place
    by a two-element list ``[offsets, distances]``, where ``distances`` holds
    the absolute difference of every pair of offsets, in the order produced
    by ``itertools.combinations(offsets, 2)``. A word with a single offset
    gets an empty ``distances`` list.

    The mapping is mutated and the same object is returned. Call this only
    once per mapping: the transformed values are no longer flat lists of
    offsets, so a second call on the same mapping is not supported.
    """
    # Only existing keys are reassigned, so the dict's size never changes
    # while it is being iterated.
    for word, offsets in d.items():
        # Builtin abs() on ints already yields an int, so neither the `math`
        # module (which was never imported) nor an int() cast is needed.
        distances = [abs(y - x) for x, y in combinations(offsets, 2)]
        d[word] = [offsets, distances]
    return d


# Pretty-print the mapping returned by index_distance(d).
pprint.pprint(index_distance(d))

defaultdict(<class 'list'>,
            {'one': [[0, 98, 127], [98, 127, 29]],
             'there': [[198], []],
             'two': [[48], []]})