openai · jongwook · Aug 7, 2023 · Jul 29, 2023 · Jul 30, 2023 · Jul 30, 2023
diff --git a/whisper/timing.py b/whisper/timing.py
@@ -214,6 +214,13 @@ def find_alignment(
  text_indices, time_indices = dtw(-matrix)
 
  words, word_tokens = tokenizer.split_to_word_tokens(text_tokens + [tokenizer.eot])
+ if len(word_tokens) <= 1:
+ # return early when only the EOT token is present (there are no words to align)
+ # >>> np.pad([], (1, 0))
+ # array([0.])
+ # This results in crashes when we look up jump_times with a float index, like
+ # IndexError: arrays used as indices must be of integer (or boolean) type
+ return []
  word_boundaries = np.pad(np.cumsum([len(t) for t in word_tokens[:-1]]), (1, 0))
 
  jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(bool)
@@ -297,8 +304,6 @@ def add_word_timestamps(
  # hack: truncate long words at sentence boundaries.
  # a better segmentation algorithm based on VAD should be able to replace this.
  if len(word_durations) > 0:
- median_duration = np.median(word_durations)
- max_duration = median_duration * 2
  sentence_end_marks = ".。!！?？"
  # ensure words at sentence boundaries are not longer than twice the median word duration.
  for i in range(1, len(alignment)):