I am trying to merge the sentences that fall between two timestamps and return the result as a list. The input data is:
[
'00:00:00.000 --> 00:00:00.740', '<v Davis, Tres>Hi, Tuim.</v>',
'00:00:10.000 --> 00:01:00.581', '<v Crook, Tim. J.>Yeah. Hi, Tres.</v>',
'00:00:10.000 --> 00:01:00.581', "<v Davis, Tres>On the deck. We will go back. I'm watching so not to what I'm thinking. </v>",
'00:00:12.056 --> 00:00:00.721', '<v Davis, Tres>Ofcourse.</v>',
'00:00:25.643 --> 00:00:00.775', '<v Davis, Tres>Thanks, Bye.</v>'
]
Expected output is:
[
'00:00:00.000 --> 00:00:00.740', '<v Davis, Tres>Hi, Tuim.</v>',
'00:00:10.000 --> 00:01:00.581', '<v Crook, Tim. J.>Yeah. Hi, Tres.</v>',
'00:00:10.000 --> 00:01:00.581', "<v Davis, Tres>On the deck. We will go back. I'm watching so not to what I'm thinking. Ofcourse. Thanks, Bye.</v>"
]
Below is the code that I am trying:
import itertools
# Sample transcript as a flat list: each timestamp range is followed by the
# voice-tagged line that goes with it.
lines = [
    '00:00:00.000 --> 00:00:00.740',
    '<v Davis, Tres>Hi, Tuim.</v>',
    '00:00:10.000 --> 00:01:00.581',
    '<v Crook, Tim. J.>Yeah. Hi, Tres.</v>',
    '00:00:10.000 --> 00:01:00.581',
    '<v Davis, Tres>On the deck. We will go back. Iam watching so not to what Iam thinking. </v>',
    '00:00:12.056 --> 00:00:00.721',
    '<v Davis, Tres>Ofcourse.</v>',
    '00:00:25.643 --> 00:00:00.775',
    '<v Davis, Tres>Thanks, Bye.</v>',
]
def iterate_over_lines(lines):
    """Yield ``(timestamp, speech)`` pairs from a flat, alternating list.

    Args:
        lines: Iterable whose items alternate between a timestamp string
            and the speech line that belongs to it.

    Yields:
        Two-item tuples ``(timestamp, speech)`` in input order. A trailing
        timestamp with no speech line after it is dropped.
    """
    lines_iter = iter(lines)
    # Zipping a single iterator with itself takes two consecutive items per
    # step, so every timestamp is paired with the line right after it.
    yield from zip(lines_iter, lines_iter)
def get_speaker(speech_group):
    """Return the speaker name from a ``(timestamp, speech)`` pair.

    Args:
        speech_group: Sequence whose second item is a line shaped like
            ``<v Speaker Name>text</v>``.

    Returns:
        The text between the first ``'<v '`` and the ``'>'`` that follows it.

    Raises:
        IndexError: If the line contains no ``'<v '`` marker.
    """
    line = speech_group[1]  # the spoken line is the second element of the pair
    # Only the first marker and the first '>' matter, so bound both splits.
    return line.split('<v ', 1)[1].split('>', 1)[0]
# Debug view: print each run of consecutive pairs that share a speaker.
line_groups = iterate_over_lines(lines)
for speaker, lines_spoken in itertools.groupby(line_groups, key=get_speaker):
    print(speaker)
    print(*lines_spoken, sep="\n")
    print("")
def get_speech(line):
    """Return the spoken text from a ``<v Speaker>text</v>`` line.

    Args:
        line: A voice-tagged line such as ``'<v Davis, Tres>Hi.</v>'``.

    Returns:
        Everything after the ``'>'`` that closes the opening tag, up to the
        first ``'</v'``.

    Raises:
        IndexError: If the line has no ``'<v '`` marker or no ``'>'`` after it.
    """
    after_marker = line.split('<v ', 1)[1]
    # Split on the first '>' only; an unbounded split would cut the speech
    # short whenever the spoken text itself contains a '>' character.
    text_and_close = after_marker.split('>', 1)[1]
    return text_and_close.split('</v')[0]
def merge_group(group):
    """Collect the first timestamp and every speech text of one group.

    Args:
        group: Iterable of ``(timestamp, speech)`` pairs.

    NOTE: this version has no ``return`` statement, so it returns ``None``
    and the collected values are discarded. It is kept that way here because
    it is the code that produces the error reported below.
    """
    timestamp = None
    speech = []
    for ts, sp in group:
        if timestamp is None:
            timestamp = ts  # keep only the timestamp of the first pair
        speech.append(get_speech(sp))
# Build a flat output list: one timestamp and one merged voice line for
# each run of consecutive pairs that share a speaker.
line_groups = iterate_over_lines(lines)
result = []
for speaker, lines_spoken in itertools.groupby(line_groups, key=get_speaker):
    timestamp, speech = merge_group(lines_spoken)
    result.append(timestamp)
    result.append(f"<v {speaker}>{speech}</v>")
print(result)
The error that I am getting is:
Traceback (most recent call last):
File "<stdin>", line 2, in <module>
TypeError: cannot unpack non-iterable NoneType object
CodePudding user response:
You forgot to return the tuple from the function `merge_group`:
import itertools
# Flat sample transcript: every timestamp range is immediately followed by
# the voice-tagged line that belongs to it.
lines = [
    '00:00:00.000 --> 00:00:00.740',
    '<v Davis, Tres>Hi, Tuim.</v>',
    '00:00:10.000 --> 00:01:00.581',
    '<v Crook, Tim. J.>Yeah. Hi, Tres.</v>',
    '00:00:10.000 --> 00:01:00.581',
    '<v Davis, Tres>On the deck. We will go back. Iam watching so not to what Iam thinking. </v>',
    '00:00:12.056 --> 00:00:00.721',
    '<v Davis, Tres>Ofcourse.</v>',
    '00:00:25.643 --> 00:00:00.775',
    '<v Davis, Tres>Thanks, Bye.</v>',
]
def iterate_over_lines(lines):
    """Pair up a flat list of alternating timestamps and speech lines.

    Args:
        lines: Iterable in which each timestamp string is followed by the
            speech line it belongs to.

    Yields:
        ``(timestamp, speech)`` tuples in input order. If the input has an
        odd number of items, the final unpaired item is dropped.
    """
    source = iter(lines)
    # The same iterator is passed twice, so each zip step pulls two
    # consecutive items: a timestamp and then its speech line.
    yield from zip(source, source)
def get_speaker(speech_group):
    """Extract the speaker name from a ``(timestamp, speech)`` pair.

    Args:
        speech_group: Sequence whose item at index 1 is a line of the form
            ``<v Speaker Name>text</v>``.

    Returns:
        The text between the first ``'<v '`` and the next ``'>'``.

    Raises:
        IndexError: If the line has no ``'<v '`` marker.
    """
    spoken_line = speech_group[1]
    # Bound both splits: only the first marker and first '>' are relevant.
    return spoken_line.split('<v ', 1)[1].split('>', 1)[0]
# Show the grouping: consecutive pairs from the same speaker are printed
# together under that speaker's name.
line_groups = iterate_over_lines(lines)
for speaker, lines_spoken in itertools.groupby(line_groups, key=get_speaker):
    print(speaker)
    print(*lines_spoken, sep="\n")
    print("")
def get_speech(line):
    """Extract the spoken text from a ``<v Speaker>text</v>`` line.

    Args:
        line: A voice-tagged line such as ``'<v Davis, Tres>Hi.</v>'``.

    Returns:
        The text after the ``'>'`` that ends the opening tag, up to the
        first ``'</v'``.

    Raises:
        IndexError: If ``'<v '`` is missing, or no ``'>'`` follows it.
    """
    tag_remainder = line.split('<v ', 1)[1]
    # maxsplit=1 keeps any later '>' inside the speech; splitting on every
    # '>' would return only the part of the speech before the first one.
    body = tag_remainder.split('>', 1)[1]
    return body.split('</v')[0]
def merge_group(group):
    """Merge a run of ``(timestamp, speech)`` pairs into a single entry.

    Args:
        group: Iterable of ``(timestamp, speech)`` pairs, where each speech
            is a ``<v Speaker>text</v>`` line.

    Returns:
        A tuple ``(timestamp, text)``. ``timestamp`` is taken from the first
        pair (``None`` if the group is empty). ``text`` is the spoken text of
        every pair, joined with single spaces.
    """
    timestamp = None
    speech = []
    for ts, sp in group:
        if timestamp is None:
            timestamp = ts  # the merged entry keeps the first timestamp
        # Strip each fragment so that trailing spaces in the input do not
        # turn into double spaces once the fragments are joined.
        speech.append(get_speech(sp).strip())
    # Join with a space, not ', ': the sentences already end in their own
    # punctuation, and the expected output separates them with one space.
    return timestamp, ' '.join(part for part in speech if part)
# Recreate the pair generator: the one created earlier was used up by the
# printing loop.
line_groups = iterate_over_lines(lines)
result = []
for speaker, lines_spoken in itertools.groupby(line_groups, key=get_speaker):
    timestamp, speech = merge_group(lines_spoken)
    # Each run of same-speaker pairs adds two items: the timestamp and the
    # re-wrapped voice line.
    result.extend((timestamp, f"<v {speaker}>{speech}</v>"))
# The bare string below is not executed as code; it shows the output the
# question asks for.
'''
[
'00:00:00.000 --> 00:00:00.740', '<v Davis, Tres>Hi, Tuim.</v>',
'00:00:10.000 --> 00:01:00.581', '<v Crook, Tim. J.>Yeah. Hi, Tres.</v>',
'00:00:10.000 --> 00:01:00.581', '<v Davis, Tres>On the deck. We will go back. I'm watching so not to what I'm thinking. Ofcourse. Thanks, Bye.</v>'
]
'''
print(result)