Home > Net >  Get the location of word in text and the distance of the same word to the next similar word
Get the location of word in text and the distance of the same word to the next similar word

Time:08-18

I've been trying to pass a list of words to my script and then search a text for them, reporting for each one: the word, how many times it is repeated, its location in the text, and the distance from its first occurrence to the other occurrences of the same word.
So far I have coded the first two parts, but I can't work out how to get each word's location and its distance to the other occurrences of the same word.
my code :

import re

# Dummy text
long_string = "one Groups are marked by the ()meta-characters. two They group together the expressions contained one inside them, and you can one repeat the contents of a group with a repeating qualifier, such as there"
search_list = ['one', 'two', 'there']

# Build one case-insensitive alternation from the search words.  Each word is
# escaped so that regex metacharacters inside a search term (e.g. "()" or ".")
# are matched literally instead of changing the pattern.
pattern = re.compile('|'.join(map(re.escape, search_list)), re.IGNORECASE)

# findall returns the matched text of every occurrence, in order of appearance.
find_words = pattern.findall(long_string)
if find_words:
    print('len_of_words : {}'.format(len(find_words)), find_words)

output :

len_of_words : 5 ['one', 'two', 'one', 'one', 'there']

But I can't get the location of the found words in the text.

CodePudding user response:

  from collections import Counter

`finditer` yields a match object for every occurrence, and each match object gives the start and the end of the match.

long_string = "one Groups are marked by the ()meta-characters. two They group together the expressions contained one inside them, and you can one repeat the contents of a group with a repeating qualifier, such as there"

# Alternation of the words being searched for.
p = re.compile(r'one|two|there')

# Collect the matches once so the counts and the positions come from the same
# source.  Counting whitespace-split tokens instead would miss any match that
# touches punctuation (e.g. "one,") or sits inside a longer token.
matches = list(p.finditer(long_string))
counter = Counter(m.group() for m in matches)

# Parallel lists: start offset, matched text, and total occurrences of that text.
indexes = [m.start() for m in matches]
words = [m.group() for m in matches]
counts = [counter[word] for word in words]

print(list(zip(indexes, words, counts)))

[(0, 'one', 3), (48, 'two', 1), (98, 'one', 3), (127, 'one', 3), (198, 'there', 1)]

Another variation, which also builds a list of the index differences between occurrences of the same word:

from collections import Counter, defaultdict 
from itertools import combinations
import pprint


# Maps each matched word to the list of its start offsets, in order of appearance.
d = defaultdict(list)

# Count the regex matches themselves rather than whitespace-split tokens, so a
# match adjacent to punctuation or inside a longer token is still counted.
matches = list(p.finditer(long_string))
counter = Counter(m.group() for m in matches)

indexes = []
words = []
counts = []

for m in matches:
    word_index = m.start()
    word = m.group()
    indexes.append(word_index)
    words.append(word)
    counts.append(counter[word])
    d[word].append(word_index)


def index_distance(d):
    """Attach pairwise index distances to every word's position list.

    Args:
        d: Mapping of word -> list of integer start offsets.

    Returns:
        The same mapping object, modified in place so that each value becomes
        ``[positions, distances]``, where ``distances`` holds the absolute
        difference for every 2-combination of ``positions`` (in the order
        produced by ``itertools.combinations``).  A word with fewer than two
        positions gets an empty distance list.

    Note:
        Values are rewritten in place, so calling this twice on the same
        mapping would operate on the already-transformed values.
    """
    for k, v in d.items():
        # Builtin abs keeps the result an int and needs no extra import.
        d[k] = [v, [abs(y - x) for x, y in combinations(v, 2)]]
    return d


# Pretty-print, for every matched word, its start offsets together with the
# distances computed by index_distance (which also rewrites d in place).
pprint.pprint(index_distance(d))

defaultdict(<class 'list'>,
            {'one': [[0, 98, 127], [98, 127, 29]],
             'there': [[198], []],
             'two': [[48], []]})
  • Related