feat!: when preprocessing, everyvoice forces equal length time and f… #421

Merged · 2 commits · May 13, 2024
55 changes: 47 additions & 8 deletions everyvoice/preprocessor/preprocessor.py
@@ -75,6 +75,9 @@
self.sampling_rate_change = (
self.output_sampling_rate // self.input_sampling_rate
)
self.output_hop_size = (
self.audio_config.fft_hop_size * self.sampling_rate_change
)
# Define Spectral Transform
# Gah, so many ways to do this: https://github.com/CookiePPP/VocoderComparisons/issues/3
self.input_spectral_transform = get_spectral_transform(
@@ -91,7 +94,7 @@
self.audio_config.spec_type,
self.audio_config.n_fft * self.sampling_rate_change,
self.audio_config.fft_window_size * self.sampling_rate_change,
self.audio_config.fft_hop_size * self.sampling_rate_change,
self.output_hop_size,
sample_rate=self.input_sampling_rate,
n_mels=self.audio_config.n_mels,
f_min=self.audio_config.f_min,
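The new `self.output_hop_size` scales the configured hop size by the resampling factor, so one frame spans the same duration at either sampling rate. A minimal sketch of the arithmetic, with illustrative values (assumptions for the example, not EveryVoice defaults):

```python
# Illustrative values; the real ones come from the audio config.
input_sr = 22050        # input_sampling_rate
output_sr = 44100       # output_sampling_rate
fft_hop_size = 256      # hop size at the input rate, in samples

sampling_rate_change = output_sr // input_sr             # 2
output_hop_size = fft_hop_size * sampling_rate_change    # 512

# One frame covers the same duration at either rate (~11.6 ms here):
assert fft_hop_size / input_sr == output_hop_size / output_sr
```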
@@ -126,6 +129,7 @@
use_effects=True,
resample_rate=None,
sox_effects=None,
hop_size=None,
update_counters=True, # unset this when processing the same file a second time
) -> tuple[torch.Tensor, int] | tuple[None, None]:
"""Process audio
Expand Down Expand Up @@ -182,8 +186,17 @@
self.counters.increment("duration", seconds)

audio = audio.squeeze() # get rid of channels dimension

return audio, sr
# limit the number of samples to a number that is evenly divisible by the hop size
# most reasonable hop sizes for training should be in the vicinity of ~10ms
# so we're talking about losing slightly less than that amount of audio from the
# signal at a maximum, and it makes downstream things like vocoder matching more straightforward.
if hop_size is None:
raise ValueError(
"We must know the hop size for processing audio because EveryVoice enforces that the number of samples is evenly divisible by the hop size"
)
max_frames = audio.size(0) // hop_size
max_samples = max_frames * hop_size
return audio[:max_samples], sr

def extract_spectral_features(
self, audio_tensor: torch.Tensor, transform, normalize=True
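The truncation added above is simple bookkeeping: round the sample count down to a whole number of frames, dropping at most `hop_size - 1` samples (under ~10 ms for typical hops). A standalone sketch of the same logic; `trim_to_hop_multiple` is a hypothetical name, not the EveryVoice API:

```python
import torch

def trim_to_hop_multiple(audio: torch.Tensor, hop_size: int) -> torch.Tensor:
    """Drop trailing samples so the length is an exact multiple of hop_size."""
    max_frames = audio.size(0) // hop_size   # complete frames only (round down)
    return audio[: max_frames * hop_size]

audio = torch.randn(22050)                   # 1 s at 22.05 kHz
trimmed = trim_to_hop_multiple(audio, 256)   # 22050 // 256 = 86 frames
assert trimmed.size(0) == 86 * 256           # 22016 samples; 34 (~1.5 ms) dropped
```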
Expand Down Expand Up @@ -407,11 +420,12 @@
sys.exit(1)

def create_path(self, item: dict, folder: str, fn: str) -> Path:
return (
path = (
self.save_dir
/ folder
/ self.sep.join([item["basename"], item["speaker"], item["language"], fn])
)
return path

def process_one_audio(
self, item: dict, data_dir, sox_effects: list[list]
@@ -450,6 +464,7 @@
audio_path,
resample_rate=self.input_sampling_rate,
sox_effects=sox_effects,
hop_size=self.audio_config.fft_hop_size,
)
if input_audio is None:
return None
@@ -472,6 +487,7 @@
resample_rate=self.output_sampling_rate,
sox_effects=sox_effects,
update_counters=False,
hop_size=self.output_hop_size,
)
if output_audio is not None:
output_audio = output_audio.unsqueeze(0)
@@ -627,12 +643,14 @@
binomial_interpolator(input_spec.size(1), len(phone_tokens))
)
assert input_spec.size(1) == phone_attn_prior.size(0)
assert len(phone_tokens) == phone_attn_prior.size(1)

save_tensor(phone_attn_prior, phone_attn_prior_path)
if process_characters:
character_attn_prior = torch.from_numpy(
binomial_interpolator(input_spec.size(1), len(character_tokens))
)
assert input_spec.size(1) == character_attn_prior.size(0)
assert len(character_tokens) == character_attn_prior.size(1)
save_tensor(character_attn_prior, character_attn_prior_path)

def process_text(
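The two new asserts pin down the prior's shape: `(spec_frames, n_tokens)`, one row per spectrogram frame and one column per token. For reference, a FastPitch-style beta-binomial prior has exactly this shape; the sketch below is a common formulation and only an assumption about how EveryVoice's `binomial_interpolator` is implemented:

```python
import numpy as np
from scipy.stats import betabinom

def beta_binomial_prior(spec_frames: int, n_tokens: int, scaling: float = 1.0) -> np.ndarray:
    """Soft diagonal alignment prior of shape (spec_frames, n_tokens)."""
    prior = np.zeros((spec_frames, n_tokens))
    for t in range(1, spec_frames + 1):
        rv = betabinom(n_tokens - 1, scaling * t, scaling * (spec_frames + 1 - t))
        prior[t - 1] = rv.pmf(np.arange(n_tokens))
    return prior

prior = beta_binomial_prior(spec_frames=10, n_tokens=4)
assert prior.shape == (10, 4)   # input_spec.size(1), len(phone_tokens)
```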
@@ -740,14 +758,24 @@
else:
return (character_tokens, phone_tokens, pfs)

def process_spec(self, item):
def process_spec(
self, item
) -> tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
"""Processes spectral features based on the defined transform (linear, Mel, complex etc)
Processes an 'input' spectrogram based on the sampling rate of the audio input to the vocoder.
Processes an 'output' spectrogram based on the output sampling rate of the vocoder.

We also limit the length of the audio to a number that is evenly divisible by the hop size
"""
input_spec = None
output_spec = None
input_audio_path = self.create_path(
item, "audio", f"audio-{self.input_sampling_rate}.wav"
)
if not input_audio_path.exists():
self.counters.increment("skipped_processes")
logger.info(f"Audio at {input_audio_path} is missing. Skipping...")
return
return input_spec, output_spec

output_audio_path = self.create_path(
item, "audio", f"audio-{self.output_sampling_rate}.wav"
)
@@ -766,18 +794,29 @@
if not output_spec_path.exists() or self.overwrite:
output_audio, _, _ = self.load_audio(output_audio_path)
output_audio = output_audio.squeeze()
# limit the number of frames to be equal to the number of
# available full frames from the audio (equal to fft_hop_size * sampling rate change, if we are upsampling)
# i.e. we round down
max_output_frames = output_audio.size(0) // self.output_hop_size

output_spec = self.extract_spectral_features(
output_audio, self.output_spectral_transform
)
)[:, :max_output_frames]
assert max_output_frames == output_spec.size(1)

save_tensor(output_spec, output_spec_path)

if not input_spec_path.exists() or self.overwrite:
input_audio, _, _ = self.load_audio(input_audio_path)
input_audio = input_audio.squeeze()
# limit the number of frames to be equal to the number of
# available full frames from the audio
# i.e. we round down
max_input_frames = input_audio.size(0) // self.audio_config.fft_hop_size
input_spec = self.extract_spectral_features(
input_audio, self.input_spectral_transform
)
)[:, :max_input_frames]
assert max_input_frames == input_spec.size(1)
save_tensor(input_spec, input_spec_path)
return input_spec, output_spec

def get_process_fn(self, process):
if process == "text":
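Net effect of the `process_spec` changes: the spectrogram's frame axis is clamped to the number of complete frames in the already-trimmed audio. The clamp matters because a centred STFT (torch's default) emits `n_samples // hop_length + 1` frames. A sketch under that assumption, using plain `torch.stft` rather than EveryVoice's transform:

```python
import torch

hop_size = 256
audio = torch.randn(86 * hop_size)        # already trimmed to a hop multiple
max_frames = audio.size(0) // hop_size    # 86 complete frames

# center=True (the default) yields n_samples // hop_length + 1 frames:
spec = torch.stft(
    audio, n_fft=1024, hop_length=hop_size,
    window=torch.hann_window(1024), return_complex=True,
)
assert spec.size(-1) == max_frames + 1    # 87: one frame too many

spec = spec[..., :max_frames]             # round down, as the diff does
assert spec.size(-1) == max_frames
```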
54 changes: 39 additions & 15 deletions everyvoice/tests/test_preprocessing.py
@@ -61,6 +61,7 @@ def test_process_audio_for_alignment(self):
self.wavs_dir / (entry["basename"] + ".wav"),
use_effects=True,
sox_effects=config.preprocessing.source_data[0].sox_effects,
hop_size=config.preprocessing.audio.fft_hop_size,
)
self.assertEqual(sr, 22050)
self.assertEqual(audio.dtype, float32)
@@ -86,6 +87,7 @@ def test_remove_silence(self):
audio_path_with_silence,
use_effects=True,
sox_effects=sox_effects,
hop_size=config.preprocessing.audio.fft_hop_size,
)

self.assertEqual(
@@ -96,10 +98,9 @@
3.5,
"Should be exactly 3.5 seconds of audio at 44100 Hz sampling rate",
)
self.assertAlmostEqual(
processed_audio.size()[0] / processed_sr,
self.assertEqual(
round(processed_audio.size()[0] / processed_sr, 2),
2.5,
4,
msg="Should be about half a second of silence removed from the beginning and end",
)
# should work with resampling too
@@ -108,11 +109,11 @@
use_effects=True,
resample_rate=22050,
sox_effects=sox_effects,
hop_size=config.preprocessing.audio.fft_hop_size,
)
self.assertAlmostEqual(
rs_processed_audio.size()[0] / rs_processed_sr,
self.assertEqual(
round(rs_processed_audio.size()[0] / rs_processed_sr, 2),
2.5,
4,
msg="Should be about half a second of silence removed from the beginning and end when resampled too",
)

@@ -123,12 +124,33 @@ def test_process_empty_audio(self):
self.assertEqual(sr, None)

def test_process_audio(self):
import torchaudio

for entry in self.filelist[1:]:
audio, sr = self.preprocessor.process_audio(
self.wavs_dir / (entry["basename"] + ".wav")
self.wavs_dir / (entry["basename"] + ".wav"), hop_size=256
)
self.assertEqual(sr, 22050)
self.assertEqual(audio.dtype, float32)
# test that truncating according to hop size actually happened
raw_audio, raw_sr = torchaudio.load(
str(self.wavs_dir / (entry["basename"] + ".wav"))
)
# remove channel info
raw_audio = raw_audio.squeeze()
self.assertNotEqual(raw_audio.size(0), audio.size(0))
self.assertLess(raw_audio.size(0) - audio.size(0), 256)
# changing the hop size changes how much is removed
diff_hop_audio, _ = self.preprocessor.process_audio(
self.wavs_dir / (entry["basename"] + ".wav"), hop_size=35
)
self.assertNotEqual(audio.size(0), diff_hop_audio.size(0))
# we should never truncate more than a portion of a single frame
self.assertLess(raw_audio.size(0) - diff_hop_audio.size(0), 35)
with self.assertRaises(ValueError): # missing hop_size
self.preprocessor.process_audio(
self.wavs_dir / (entry["basename"] + ".wav")
)

def test_spectral_feats(self):
linear_vocoder_config = VocoderConfig(
@@ -148,7 +170,8 @@

for entry in self.filelist[1:]:
audio, _ = self.preprocessor.process_audio(
self.wavs_dir / (entry["basename"] + ".wav")
self.wavs_dir / (entry["basename"] + ".wav"),
hop_size=linear_vocoder_config.preprocessing.audio.fft_hop_size,
)

# ming024_feats = np.load(
@@ -203,7 +226,8 @@ def test_pitch(self):

for entry in self.filelist[1:]:
audio, _ = self.preprocessor.process_audio(
self.wavs_dir / (entry["basename"] + ".wav")
self.wavs_dir / (entry["basename"] + ".wav"),
hop_size=pyworld_config.preprocessing.audio.fft_hop_size,
)
dur_path = (
self.lj_preprocessed
@@ -241,7 +265,7 @@ def test_pitch(self):
def test_duration(self):
for entry in self.filelist[1:]:
audio, _ = self.preprocessor.process_audio(
self.wavs_dir / (entry["basename"] + ".wav")
self.wavs_dir / (entry["basename"] + ".wav"), hop_size=256
)
dur_path = (
self.lj_preprocessed
@@ -277,7 +301,8 @@ def test_energy(self):
preprocessor = Preprocessor(frame_energy_config)
for entry in self.filelist[1:]:
audio, _ = self.preprocessor.process_audio(
self.wavs_dir / (entry["basename"] + ".wav")
self.wavs_dir / (entry["basename"] + ".wav"),
hop_size=frame_energy_config.preprocessing.audio.fft_hop_size,
)
dur_path = (
self.lj_preprocessed
@@ -364,9 +389,9 @@ def test_text_processing(self):
preprocessed_dir.mkdir(parents=True, exist_ok=True)
output_filelist = preprocessed_dir / "preprocessed_filelist.psv"
shutil.copyfile(filelist_test_info["path"], output_filelist)
fp_config.preprocessing.source_data[0].filelist = (
filelist_test_info["path"]
)
fp_config.preprocessing.source_data[
0
].filelist = filelist_test_info["path"]
fp_config.preprocessing.save_dir = preprocessed_dir
preprocessor = Preprocessor(fp_config)
with capture_stdout() as output, mute_logger(
@@ -582,7 +607,6 @@ def test_hierarchy(self):
# to_process=("audio", "text", "pfs", "spec", "attn", "energy", "pitch"),
to_process=("audio", "text", "spec", "attn", "energy", "pitch"),
)

for t in ("audio", "spec", "attn", "energy", "pitch"):
# There are two speakers
sources = [d.name for d in tmpdir.glob(f"**/{t}/*")]