I have the list below. I need to join the sentences spoken consecutively by the same speaker.
[
'00:00:00.000 --> 00:00:00.740', '<v Davis, Tres>Hi, Tuim.</v>',
'00:00:10.000 --> 00:01:00.581', '<v Crook, Tim. J.>Yeah. Hi, Tres.</v>',
'00:00:10.000 --> 00:01:00.581', '<v Davis, Tres>On the deck. We will go back. I'm watching so not to what I'm thinking. </v>',
'00:00:12.056 --> 00:00:00.721', '<v Davis, Tres>Ofcourse.</v>',
'00:00:25.643 --> 00:00:00.775', '<v Davis, Tres>Thanks, Bye.</v>'
]
Expected output:
[
'00:00:00.000 --> 00:00:00.740', '<v Davis, Tres>Hi, Tuim.</v>',
'00:00:10.000 --> 00:01:00.581', '<v Crook, Tim. J.>Yeah. Hi, Tres.</v>',
'00:00:10.000 --> 00:01:00.581', '<v Davis, Tres>On the deck. We will go back. I'm watching so not to what I'm thinking. Ofcourse. Thanks, Bye.</v>'
]
What I have tried so far:
def speak(i):
    """Return the speaker name found in the voice tag of ``lines[i]``.

    Reads the module-level ``lines`` list. The entry at index ``i`` must
    contain a ``<v NAME>`` tag; an entry without one (for example a
    timestamp line or an empty string) makes the ``[1]`` lookup raise
    ``IndexError``.
    """
    # Text between '<v ' and the first '>' is the speaker name.
    speaker_out = lines[i].split('<v ')[1].split('>')[0]
    return speaker_out
def getspeech(b):
    """Return the speech text of ``lines[b]`` split into sentences.

    Reads the module-level ``lines`` list and relies on ``nltk`` being
    imported by the surrounding script. The text between the first ``>``
    and ``</v`` is handed to ``nltk.tokenize.sent_tokenize``.
    """
    text = lines[b].split('<v ')[1].split('>')[1].split('</v')[0]
    return nltk.tokenize.sent_tokenize(text)
# Original attempt from the question, with the lost '+' operators and the
# indentation restored. Speech lines sit at the odd indices of `lines`, so the
# loop compares each speech line with the next one (two positions further on).
t = []
for i in range(1, len(lines) - 2, 2):
    if speak(i) == speak(i + 2):
        t.append(getspeech(i) + getspeech(i + 2))
        # NOTE: blanking the merged entry means the next iteration calls
        # speak() on an empty string, which raises IndexError, and runs of
        # three or more lines by one speaker are never merged into one entry.
        lines[i + 2] = lines[i + 2].replace(lines[i + 2], '')
        # t.append(x for x in getspeech(i + 2) if x not in getspeech(i))
CodePudding user response:
You've already figured out how to find the speaker given the line. Consider itertools.groupby
(docs), it is supposed to do exactly what you want to do. You just need to modify your speak
function to take a single line, and return the speaker of that line. Now, since your lines
list contains timestamps and then the actual speech, we'll iterate over it in groups of two:
import itertools
# Flat transcript list: entries alternate between a timestamp line and the
# voice-tagged speech line (<v SPEAKER>text</v>) that follows it.
lines = [
'00:00:00.000 --> 00:00:00.740', '<v Davis, Tres>Hi, Tuim.</v>',
'00:00:10.000 --> 00:01:00.581', '<v Crook, Tim. J.>Yeah. Hi, Tres.</v>',
'00:00:10.000 --> 00:01:00.581', '<v Davis, Tres>On the deck. We will go back. I\'m watching so not to what I\'m thinking. </v>',
'00:00:12.056 --> 00:00:00.721', '<v Davis, Tres>Ofcourse.</v>',
'00:00:25.643 --> 00:00:00.775', '<v Davis, Tres>Thanks, Bye.</v>'
]
def iterate_over_lines(lines):
    """Yield ``(timestamp, speech)`` pairs from a flat, alternating sequence.

    Args:
        lines: Iterable whose items alternate between a timestamp entry and
            the speech entry that belongs to it.

    Yields:
        Two-element tuples ``(timestamp, speech)`` in input order. A trailing
        timestamp without a matching speech entry is dropped.
    """
    # Advance ONE shared iterator twice per pair. Calling next() on the list
    # itself raises TypeError, because a list is iterable but not an iterator.
    lines_iter = iter(lines)
    yield from zip(lines_iter, lines_iter)
def get_speaker(speech_group):
    """Return the speaker name of a ``(timestamp, speech)`` group.

    Args:
        speech_group: Sequence whose second element is the voice-tagged
            speech line, e.g. ``'<v Davis, Tres>Hi.</v>'``.

    Returns:
        The text between ``'<v '`` and the first ``'>'`` that follows it.

    Raises:
        IndexError: If the speech line contains no ``'<v '`` tag.
    """
    line = speech_group[1]  # What was said is the second element of the group
    speaker_out = line.split('<v ')[1].split('>')[0]  # Extract speaker from line
    return speaker_out
# Demo: print every run of consecutive lines spoken by the same speaker.
# groupby only merges ADJACENT items with equal keys, which is exactly the
# "spoken consecutively" requirement, so the input must not be sorted first.
line_groups = iterate_over_lines(lines)
for speaker, lines_spoken in itertools.groupby(line_groups, key=get_speaker):
    print(speaker)
    print(*lines_spoken, sep="\n")
    print("")
Which gives:
Davis, Tres
('00:00:00.000 --> 00:00:00.740', '<v Davis, Tres>Hi, Tuim.</v>')
Crook, Tim. J.
('00:00:10.000 --> 00:01:00.581', '<v Crook, Tim. J.>Yeah. Hi, Tres.</v>')
Davis, Tres
('00:00:10.000 --> 00:01:00.581', "<v Davis, Tres>On the deck. We will go back. I'm watching so not to what I'm thinking. </v>")
('00:00:12.056 --> 00:00:00.721', '<v Davis, Tres>Ofcourse.</v>')
('00:00:25.643 --> 00:00:00.775', '<v Davis, Tres>Thanks, Bye.</v>')
The lines_spoken
in each group will contain all the lines spoken, so you can iterate over that and merge those lines quite easily.
# This function does basically the opposite of get_speaker
def get_speech(line):
    """Return the spoken text enclosed by a ``<v SPEAKER>...</v>`` line.

    Args:
        line: A voice-tagged speech line.

    Returns:
        The text after the ``'>'`` that closes the opening tag, up to the
        first ``'</v'``. Whitespace inside the tag is preserved.

    Raises:
        IndexError: If the line has no ``'<v '`` tag or the tag is unclosed.
    """
    after_tag_start = line.split('<v ')[1]
    # Split only once on '>' so that a '>' inside the speech is not cut off.
    speech_out = after_tag_start.split('>', 1)[1].split('</v')[0]
    return speech_out
def merge_group(group):
    """Merge one speaker's consecutive lines into a single entry.

    Args:
        group: Iterable of ``(timestamp, speech_line)`` pairs that all belong
            to the same speaker.

    Returns:
        A tuple ``(timestamp, speech)``: the timestamp of the first pair
        (``None`` for an empty group) and the speech texts of all pairs,
        stripped and joined with single spaces.
    """
    timestamp = None
    speech = []
    for ts, sp in group:
        if timestamp is None:
            timestamp = ts  # keep only the first timestamp of the run
        # Strip so that a trailing space inside a tag does not produce a
        # double space once the parts are joined.
        speech.append(get_speech(sp).strip())
    return timestamp, " ".join(speech)
# Build the merged flat list: one timestamp entry plus one voice-tagged speech
# entry per run of consecutive lines by the same speaker.
# The pair generator is single-use, so a fresh one is created here.
line_groups = iterate_over_lines(lines)
result = []
for speaker, lines_spoken in itertools.groupby(line_groups, key=get_speaker):
    timestamp, speech = merge_group(lines_spoken)
    result.append(timestamp)
    result.append(f"<v {speaker}>{speech}</v>")
print(result)
Which gives the desired list:
[
'00:00:00.000 --> 00:00:00.740', '<v Davis, Tres>Hi, Tuim.</v>',
'00:00:10.000 --> 00:01:00.581', '<v Crook, Tim. J.>Yeah. Hi, Tres.</v>',
'00:00:10.000 --> 00:01:00.581', "<v Davis, Tres>On the deck. We will go back. I'm watching so not to what I'm thinking. Ofcourse. Thanks, Bye.</v>"
]
Note that the timestamp you show in your expected output will no longer be accurate for the merged lines.
Instead of using str.split
to extract the speaker or speech, you could use regular expressions.
import re
# Speaker name: everything between '<v ' and the first '>'.
# The '+' quantifier is required; without it the class matches one character only.
re_speaker = re.compile(r"<v ([^>]+)>")  # https://regex101.com/r/C0n47c/1
# Speech text: lazily match everything after the opening tag up to '</v>'.
re_speech = re.compile(r"<v [^>]+>(.*?)(?=</v>)")  # https://regex101.com/r/qVaiAy/1
def get_speaker(speech_group):
    """Return the speaker name of a ``(timestamp, speech)`` group via regex.

    Args:
        speech_group: Sequence whose second element is the voice-tagged
            speech line.

    Returns:
        The first capture of ``re_speaker`` in the speech line.

    Raises:
        IndexError: If the speech line contains no voice tag.
    """
    line = speech_group[1]
    speakers = re_speaker.findall(line)
    return speakers[0]
def get_speech(line):
    """Return the spoken text of a voice-tagged line via regex.

    Args:
        line: A voice-tagged speech line.

    Returns:
        The first capture of ``re_speech`` in the line.

    Raises:
        IndexError: If the line does not match ``re_speech``.
    """
    speech = re_speech.findall(line)
    return speech[0]
You can play with the regex at the regex101 links provided. Here's a short explanation:
<v ([^>]+)>
<v > : Literally those characters
( ) : Capturing group
[^>]+ : One or more characters that are not >
<v [^>]+>(.*?)(?=</v>)
<v > : Same as before
[^>]+ : Same as before (no capturing group because this is not what we want to capture)
( ) : Capturing group
.*? : Zero or more of any character, lazy match
(?= ) : Positive lookahead, after the match enforce that the enclosed pattern exists
</v> : The pattern to find in the lookahead
CodePudding user response:
There's a helper for table-like data in convtools library (docs | github).
Of course, the solution implies some knowledge of convtools
, but you may enjoy the absence of imperative style:
import re
from convtools import conversion as c
from convtools.contrib.tables import Table
# Lazily turn the flat list into one dict per (time range, sentence) pair with
# the keys "sentence", "author", "time_start" and "time_end".
# NOTE(review): `data` is not defined in this snippet; presumably it is the
# flat transcript list from the question. Confirm before running.
rows_iter = (
    Table.from_rows(
        # making an iterable of 2-element tuples
        zip(
            data[::2],
            data[1::2],
        ),
        header=["time_range", "sentence"],
    )
    .update(
        author=(
            c.col("sentence")
            # passing a sentence to regex findall method
            # ('+' restored: lazily capture the name inside the opening tag)
            .pipe(re.compile(r"<v (.+?)>").findall)
            # if there's a match, we expect only one, so taking by 0 index
            .and_then(c.item(0))
        ),
        sentence=(
            c.col("sentence")
            # ('+' restored: greedy capture between the first '>' and last '<')
            .pipe(re.compile(r">(.+)<").findall)
            .and_then(c.item(0))
        ),
        # temporary column holding the two halves of "start --> end"
        tmp1=c.col("time_range").call_method("split", " --> "),
        time_start=c.col("tmp1").item(0),
        time_end=c.col("tmp1").item(1),
    )
    .drop("tmp1", "time_range")
    .into_iter_rows(dict)
)
# Reducers applied to every chunk of consecutive rows sharing one author:
# keep the first author and start time, the last end time, and join all
# sentences of the chunk with single spaces.
_merged_row_spec = {
    "author": c.ReduceFuncs.First(c.item("author")),
    "time_start": c.ReduceFuncs.First(c.item("time_start")),
    "time_end": c.ReduceFuncs.Last(c.item("time_end")),
    "sentence": c.ReduceFuncs.Array(c.item("sentence")).pipe(" ".join),
}

# Chunk the rows by author and collapse each chunk into one merged dict.
_rows_by_author = c.chunk_by(c.item("author"))
process_rows_iter = _rows_by_author.aggregate(_merged_row_spec).execute(rows_iter)
In [20]: list(process_rows_iter)
Out[20]:
[{'author': 'Davis, Tres',
'time_start': '00:00:00.000',
'time_end': '00:00:00.740',
'sentence': 'Hi, Tuim.'},
{'author': 'Crook, Tim. J.',
'time_start': '00:00:10.000',
'time_end': '00:01:00.581',
'sentence': 'Yeah. Hi, Tres.'},
{'author': 'Davis, Tres',
'time_start': '00:00:10.000',
'time_end': '00:00:00.775',
'sentence': "On the deck. We will go back. I'm watching so not to what I'm thinking. Ofcourse. Thanks, Bye."}]
The output is intentionally different because it allows you to format the output the way you need.