-
Notifications
You must be signed in to change notification settings - Fork 8.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Improve timestamp heuristics. (#1461)
* Improve timestamp heuristics.
* Track pauses with last_speech_timestamp
- Loading branch information
Showing
2 changed files
with
60 additions
and
28 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -225,28 +225,6 @@ def find_alignment( | |
for i, j in zip(word_boundaries[:-1], word_boundaries[1:]) | ||
] | ||
|
||
# hack: truncate long words at the start of a window and the start of a sentence. | ||
# a better segmentation algorithm based on VAD should be able to replace this. | ||
word_durations = end_times - start_times | ||
word_durations = word_durations[word_durations.nonzero()] | ||
if len(word_durations) > 0: | ||
median_duration = np.median(word_durations) | ||
max_duration = median_duration * 2 | ||
sentence_end_marks = ".。!!??" | ||
# ensure words at sentence boundaries are not longer than twice the median word duration. | ||
for i in range(1, len(start_times)): | ||
if end_times[i] - start_times[i] > max_duration: | ||
if words[i] in sentence_end_marks: | ||
end_times[i] = start_times[i] + max_duration | ||
elif words[i - 1] in sentence_end_marks: | ||
start_times[i] = end_times[i] - max_duration | ||
# ensure the first and second word is not longer than twice the median word duration. | ||
if len(start_times) > 0 and end_times[0] - start_times[0] > max_duration: | ||
if len(start_times) > 1 and end_times[1] - start_times[1] > max_duration: | ||
boundary = max(end_times[1] / 2, end_times[1] - max_duration) | ||
end_times[0] = start_times[1] = boundary | ||
start_times[0] = max(0, end_times[0] - max_duration) | ||
|
||
return [ | ||
WordTiming(word, tokens, start, end, probability) | ||
for word, tokens, start, end, probability in zip( | ||
|
@@ -298,6 +276,7 @@ def add_word_timestamps( | |
num_frames: int, | ||
prepend_punctuations: str = "\"'“¿([{-", | ||
append_punctuations: str = "\"'.。,,!!??::”)]}、", | ||
last_speech_timestamp: float, | ||
**kwargs, | ||
): | ||
if len(segments) == 0: | ||
|
@@ -310,6 +289,25 @@ def add_word_timestamps( | |
|
||
text_tokens = list(itertools.chain.from_iterable(text_tokens_per_segment)) | ||
alignment = find_alignment(model, tokenizer, text_tokens, mel, num_frames, **kwargs) | ||
word_durations = np.array([t.end - t.start for t in alignment]) | ||
word_durations = word_durations[word_durations.nonzero()] | ||
median_duration = np.median(word_durations) if len(word_durations) > 0 else 0.0 | ||
max_duration = median_duration * 2 | ||
|
||
# hack: truncate long words at sentence boundaries. | ||
# a better segmentation algorithm based on VAD should be able to replace this. | ||
if len(word_durations) > 0: | ||
median_duration = np.median(word_durations) | ||
max_duration = median_duration * 2 | ||
This comment has been minimized.
Sorry, something went wrong.
This comment has been minimized.
Sorry, something went wrong.
ryanheise
Author
Contributor
|
||
sentence_end_marks = ".。!!??" | ||
# ensure words at sentence boundaries are not longer than twice the median word duration. | ||
for i in range(1, len(alignment)): | ||
if alignment[i].end - alignment[i].start > max_duration: | ||
if alignment[i].word in sentence_end_marks: | ||
alignment[i].end = alignment[i].start + max_duration | ||
elif alignment[i - 1].word in sentence_end_marks: | ||
alignment[i].start = alignment[i].end - max_duration | ||
|
||
merge_punctuations(alignment, prepend_punctuations, append_punctuations) | ||
|
||
time_offset = segments[0]["seek"] * HOP_LENGTH / SAMPLE_RATE | ||
|
@@ -335,18 +333,48 @@ def add_word_timestamps( | |
saved_tokens += len(timing.tokens) | ||
word_index += 1 | ||
|
||
# hack: truncate long words at segment boundaries. | ||
# a better segmentation algorithm based on VAD should be able to replace this. | ||
if len(words) > 0: | ||
segment["start"] = words[0]["start"] | ||
# hack: prefer the segment-level end timestamp if the last word is too long. | ||
# a better segmentation algorithm based on VAD should be able to replace this. | ||
# ensure the first and second word after a pause is not longer than | ||
# twice the median word duration. | ||
if words[0]["end"] - last_speech_timestamp > median_duration * 4 and ( | ||
words[0]["end"] - words[0]["start"] > max_duration | ||
or ( | ||
len(words) > 1 | ||
and words[1]["end"] - words[0]["start"] > max_duration * 2 | ||
) | ||
): | ||
if ( | ||
len(words) > 1 | ||
and words[1]["end"] - words[1]["start"] > max_duration | ||
): | ||
boundary = max(words[1]["end"] / 2, words[1]["end"] - max_duration) | ||
words[0]["end"] = words[1]["start"] = boundary | ||
words[0]["start"] = max(0, words[0]["end"] - max_duration) | ||
|
||
# prefer the segment-level start timestamp if the first word is too long. | ||
if ( | ||
segment["start"] < words[0]["end"] | ||
and segment["start"] - 0.5 > words[0]["start"] | ||
): | ||
words[0]["start"] = max( | ||
0, min(words[0]["end"] - median_duration, segment["start"]) | ||
) | ||
else: | ||
segment["start"] = words[0]["start"] | ||
|
||
# prefer the segment-level end timestamp if the last word is too long. | ||
if ( | ||
segment["end"] > words[-1]["start"] | ||
and segment["end"] + 0.5 < words[-1]["end"] | ||
): | ||
# adjust the word-level timestamps based on the segment-level timestamps | ||
words[-1]["end"] = segment["end"] | ||
words[-1]["end"] = max( | ||
words[-1]["start"] + median_duration, segment["end"] | ||
) | ||
else: | ||
# adjust the segment-level timestamps based on the word-level timestamps | ||
segment["end"] = words[-1]["end"] | ||
|
||
last_speech_timestamp = segment["end"] | ||
|
||
segment["words"] = words |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Both `median_duration` and `max_duration` are calculated before (lines 294-295).