How can I merge sentences if they are spoken by the same speaker consecutively-CodePudding

I have the list below. I need to join consecutive sentences that are spoken by the same speaker.

[
'00:00:00.000 --> 00:00:00.740', '<v Davis, Tres>Hi, Tuim.</v>', 
'00:00:10.000 --> 00:01:00.581', '<v Crook, Tim. J.>Yeah. Hi, Tres.</v>', 
'00:00:10.000 --> 00:01:00.581', '<v Davis, Tres>On the deck. We will go back. I'm watching so not to what I'm thinking. </v>', 
'00:00:12.056 --> 00:00:00.721', '<v Davis, Tres>Ofcourse.</v>',
'00:00:25.643 --> 00:00:00.775', '<v Davis, Tres>Thanks, Bye.</v>'
]

Expected output:

[
'00:00:00.000 --> 00:00:00.740', '<v Davis, Tres>Hi, Tuim.</v>', 
'00:00:10.000 --> 00:01:00.581', '<v Crook, Tim. J.>Yeah. Hi, Tres.</v>', 
'00:00:10.000 --> 00:01:00.581', '<v Davis, Tres>On the deck. We will go back. I'm watching so not to what I'm thinking. Ofcourse. Thanks, Bye.</v>'
]

What I have tried so far-

def speak(i):
    """Return the speaker name found in the voice tag of ``lines[i]``.

    Reads the module-level ``lines`` list; the name is the text between
    ``'<v '`` and the next ``'>'``.
    """
    after_open_tag = lines[i].split('<v ')[1]
    return after_open_tag.split('>')[0]
    
def getspeech(b):
    """Return the speech text of ``lines[b]`` split into sentences.

    Reads the module-level ``lines`` list, strips the ``<v ...>`` / ``</v``
    markup and passes the remaining text to ``nltk.tokenize.sent_tokenize``.
    """
    after_open_tag = lines[b].split('<v ')[1]
    spoken_text = after_open_tag.split('>')[1].split('</v')[0]
    return nltk.tokenize.sent_tokenize(spoken_text)

# Attempt: compare each speech entry (odd indices of `lines`) with the next
# speech entry and collect the combined sentences when the speaker matches.
t=[]
for i in range(1,len(lines)-2,2):
    if(speak(i)==speak(i+2)):
        t.append(getspeech(i) + getspeech(i+2))
        # NOTE(review): blanking the next entry means a later speak() call on
        # that entry no longer finds a '<v ' tag.
        lines[i+2]=lines[i+2].replace(lines[i+2], '')
        #t.append(x for x in getspeech(i+2) if x not in getspeech(i))

CodePudding user response：

You've already figured out how to find the speaker given the line. Consider itertools.groupby (docs), it is supposed to do exactly what you want to do. You just need to modify your speak function to take a single line, and return the speaker of that line. Now, since your lines list contains timestamps and then the actual speech, we'll iterate over it in groups of two:

import itertools

# Sample input: a flat list that alternates a timestamp-range string with the
# voice-tagged speech line ('<v Speaker>text</v>') that belongs to it.
lines = [
'00:00:00.000 --> 00:00:00.740', '<v Davis, Tres>Hi, Tuim.</v>', 
'00:00:10.000 --> 00:01:00.581', '<v Crook, Tim. J.>Yeah. Hi, Tres.</v>', 
'00:00:10.000 --> 00:01:00.581', '<v Davis, Tres>On the deck. We will go back. I\'m watching so not to what I\'m thinking. </v>', 
'00:00:12.056 --> 00:00:00.721', '<v Davis, Tres>Ofcourse.</v>',
'00:00:25.643 --> 00:00:00.775', '<v Davis, Tres>Thanks, Bye.</v>'
]

def iterate_over_lines(lines):
    """Yield ``(timestamp, speech)`` pairs from a flat, alternating list.

    ``lines`` is any iterable whose items alternate between a timestamp and
    the speech line that belongs to it. A trailing timestamp without a
    matching speech line is dropped.
    """
    lines_iter = iter(lines)  # create a single iterator for the list

    # Both zip arguments are the SAME iterator, so every step consumes two
    # consecutive items: first the timestamp, then the speech line.
    yield from zip(lines_iter, lines_iter)

def get_speaker(speech_group):
    """Return the speaker name of a ``(timestamp, speech_line)`` group.

    The name is the text between ``'<v '`` and the next ``'>'`` in the
    speech line, which is the second element of the group.
    """
    after_open_tag = speech_group[1].split('<v ')[1]
    return after_open_tag.split('>')[0]


# Print every run of consecutive entries that share a speaker, one entry per
# line, with an empty line after each run.
line_groups = iterate_over_lines(lines)
for speaker, lines_spoken in itertools.groupby(line_groups, key=get_speaker):
    print(speaker)
    print("\n".join(str(entry) for entry in lines_spoken))
    print()

Which gives:

Davis, Tres
('00:00:00.000 --> 00:00:00.740', '<v Davis, Tres>Hi, Tuim.</v>')

Crook, Tim. J.
('00:00:10.000 --> 00:01:00.581', '<v Crook, Tim. J.>Yeah. Hi, Tres.</v>')

Davis, Tres
('00:00:10.000 --> 00:01:00.581', "<v Davis, Tres>On the deck. We will go back. I'm watching so not to what I'm thinking. </v>")
('00:00:12.056 --> 00:00:00.721', '<v Davis, Tres>Ofcourse.</v>')
('00:00:25.643 --> 00:00:00.775', '<v Davis, Tres>Thanks, Bye.</v>')

The lines_spoken in each group will contain all the lines spoken, so you can iterate over that and merge those lines quite easily.

# This function does basically the opposite of get_speaker
def get_speech(line):
    """Return the spoken text of a voice-tagged line.

    The text is whatever sits between the ``'>'`` that closes the ``<v ...``
    tag and the following ``'</v'``.
    """
    after_open_tag = line.split('<v ')[1]
    after_speaker = after_open_tag.split('>')[1]
    return after_speaker.split('</v')[0]

def merge_group(group):
    """Merge one speaker's consecutive entries into a single entry.

    ``group`` is an iterable of ``(timestamp, speech_line)`` pairs, such as
    one group produced by ``itertools.groupby``.

    Returns a ``(timestamp, speech)`` tuple: the timestamp of the first pair
    (``None`` when the group is empty) and the spoken text of every pair
    joined with single spaces.
    """
    timestamp = None
    speech = []
    for ts, sp in group:
        if timestamp is None:
            # Keep only the first timestamp of the run.
            timestamp = ts
        speech.append(get_speech(sp))
    return timestamp, " ".join(speech)

# Rebuild the flat list: one timestamp plus one merged, re-tagged speech line
# for every run of consecutive entries by the same speaker.
result = []
line_groups = iterate_over_lines(lines)
for speaker, lines_spoken in itertools.groupby(line_groups, key=get_speaker):
    timestamp, speech = merge_group(lines_spoken)
    result.extend((timestamp, f"<v {speaker}>{speech}</v>"))

print(result)

Which gives the desired list:

[
 '00:00:00.000 --> 00:00:00.740', '<v Davis, Tres>Hi, Tuim.</v>',
 '00:00:10.000 --> 00:01:00.581', '<v Crook, Tim. J.>Yeah. Hi, Tres.</v>',
 '00:00:10.000 --> 00:01:00.581', "<v Davis, Tres>On the deck. We will go back. I'm watching so not to what I'm thinking.  Ofcourse. Thanks, Bye.</v>"
]

Note that the timestamp you show in your expected output will no longer be accurate for the merged lines.

Instead of using str.split to extract the speaker or speech, you could use regular expressions.

import re

# `[^>]+` matches one or more characters that are not `>`, i.e. the speaker
# name inside the opening voice tag.
re_speaker = re.compile(r"<v ([^>]+)>")            # https://regex101.com/r/C0n47c/1
# Captures the text after the opening tag, lazily, up to (not including) `</v>`.
re_speech = re.compile(r"<v [^>]+>(.*?)(?=</v>)")  # https://regex101.com/r/qVaiAy/1


def get_speaker(speech_group):
    """Return the speaker name of a ``(timestamp, speech_line)`` group.

    Uses the first match of the module-level ``re_speaker`` pattern in the
    speech line (the second element of the group).
    """
    return re_speaker.findall(speech_group[1])[0]

def get_speech(line):
    """Return the spoken text of a voice-tagged line.

    Uses the first match of the module-level ``re_speech`` pattern.
    """
    return re_speech.findall(line)[0]

You can play with the regex at the regex101 links provided. Here's a short explanation:

<v ([^>]+)>

<v        >   : Literally those characters
   (     )    : Capturing group
    [^>]+     : One or more characters that are not >

<v [^>]+>(.*?)(?=</v>)
<v      >                : Same as before
   [^>]+                 : Same as before (no capturing group because this is not what we want to capture)
         (   )           : Capturing group
          .*?            : Zero or more of any character, lazy match
              (?=     )  : Positive lookahead, after the match enforce that the enclosed pattern exists
                 </v>    : The pattern to find in the lookahead

CodePudding user response：

There's a helper for table-like data in convtools library (docs | github).

Of course, the solution implies some knowledge of convtools, but you may enjoy the absence of imperative style:

import re

from convtools import conversion as c
from convtools.contrib.tables import Table

# Turn the flat `data` list into rows with author, sentence, time_start and
# time_end fields, one row per (time range, sentence) pair.
rows_iter = (
    Table.from_rows(
        # making an iterable of 2-element tuples
        zip(
            data[::2],
            data[1::2],
        ),
        header=["time_range", "sentence"],
    )
    .update(
        author=(
            c.col("sentence")
            # passing a sentence to regex findall method;
            # `.+?` lazily captures the name inside the opening `<v ...>` tag
            .pipe(re.compile(r"<v (.+?)>").findall)
            # if there's a match, we expect only one, so taking by 0 index
            .and_then(c.item(0))
        ),
        sentence=(
            c.col("sentence")
            # `.+` greedily captures the text between the opening tag's `>`
            # and the last `<` of the line
            .pipe(re.compile(r">(.+)<").findall)
            .and_then(c.item(0))
        ),
        tmp1=c.col("time_range").call_method("split", " --> "),
        time_start=c.col("tmp1").item(0),
        time_end=c.col("tmp1").item(1),
    )
    .drop("tmp1", "time_range")
    .into_iter_rows(dict)
)

# Collapse the rows of `rows_iter` into one dict per run of rows with the
# same author.
process_rows_iter = (
    # presumably chunk_by starts a new chunk whenever the author value
    # changes between consecutive rows -- confirm against the convtools docs
    c.chunk_by(c.item("author"))
    .aggregate(
        {
            "author": c.ReduceFuncs.First(c.item("author")),
            # start time of the first row and end time of the last row
            "time_start": c.ReduceFuncs.First(c.item("time_start")),
            "time_end": c.ReduceFuncs.Last(c.item("time_end")),
            # every sentence of the chunk, joined with single spaces
            "sentence": c.ReduceFuncs.Array(c.item("sentence")).pipe(" ".join),
        }
    )
    .execute(rows_iter)
)

In [20]: list(process_rows_iter)
Out[20]:
[{'author': 'Davis, Tres',
  'time_start': '00:00:00.000',
  'time_end': '00:00:00.740',
  'sentence': 'Hi, Tuim.'},
 {'author': 'Crook, Tim. J.',
  'time_start': '00:00:10.000',
  'time_end': '00:01:00.581',
  'sentence': 'Yeah. Hi, Tres.'},
 {'author': 'Davis, Tres',
  'time_start': '00:00:10.000',
  'time_end': '00:00:00.775',
  'sentence': "On the deck. We will go back. I'm watching so not to what I'm thinking.  Ofcourse. Thanks, Bye."}]

The output is intentionally different because it allows you to format the output the way you need.