feat!: when preprocessing, everyvoice forces equal length time and f… #421

Merged · 2 commits · May 13, 2024
55 changes: 47 additions & 8 deletions everyvoice/preprocessor/preprocessor.py
@@ -75,6 +75,9 @@
self.sampling_rate_change = (
self.output_sampling_rate // self.input_sampling_rate
)
self.output_hop_size = (
self.audio_config.fft_hop_size * self.sampling_rate_change
)
# Define Spectral Transform
# Gah, so many ways to do this: https://github.com/CookiePPP/VocoderComparisons/issues/3
self.input_spectral_transform = get_spectral_transform(
@@ -91,7 +94,7 @@
self.audio_config.spec_type,
self.audio_config.n_fft * self.sampling_rate_change,
self.audio_config.fft_window_size * self.sampling_rate_change,
self.audio_config.fft_hop_size * self.sampling_rate_change,
self.output_hop_size,
sample_rate=self.input_sampling_rate,
n_mels=self.audio_config.n_mels,
f_min=self.audio_config.f_min,
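The new `self.output_hop_size` scales the configured hop size by the resampling factor, so one frame spans the same duration at either sampling rate. A minimal sketch of the arithmetic, with illustrative values (assumptions for the example, not EveryVoice defaults):

```python
# Illustrative values; the real ones come from the audio config.
input_sr = 22050        # input_sampling_rate
output_sr = 44100       # output_sampling_rate
fft_hop_size = 256      # hop size at the input rate, in samples

sampling_rate_change = output_sr // input_sr             # 2
output_hop_size = fft_hop_size * sampling_rate_change    # 512

# One frame covers the same duration at either rate (~11.6 ms here):
assert fft_hop_size / input_sr == output_hop_size / output_sr
```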
@@ -126,6 +129,7 @@
use_effects=True,
resample_rate=None,
sox_effects=None,
hop_size=None,
update_counters=True, # unset this when processing the same file a second time
) -> tuple[torch.Tensor, int] | tuple[None, None]:
"""Process audio
Expand Down Expand Up @@ -182,8 +186,17 @@
self.counters.increment("duration", seconds)

audio = audio.squeeze() # get rid of channels dimension

return audio, sr
# limit the number of samples to a number that is evenly divisible by the hop size
# most reasonable hop sizes for training should be in the vicinity of ~10ms
# so we're talking about losing slightly less than that amount of audio from the
# signal at a maximum, and it makes downstream things like vocoder matching more straightforward.
if hop_size is None:
raise ValueError(
"We must know the hop size for processing audio because EveryVoice enforces that the number of samples is evenly divisible by the hop size"
)
max_frames = audio.size(0) // hop_size
max_samples = max_frames * hop_size
return audio[:max_samples], sr

def extract_spectral_features(
self, audio_tensor: torch.Tensor, transform, normalize=True
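The truncation added above is simple bookkeeping: round the sample count down to a whole number of frames, dropping at most `hop_size - 1` samples (under ~10 ms for typical hops). A standalone sketch of the same logic; `trim_to_hop_multiple` is a hypothetical name, not the EveryVoice API:

```python
import torch

def trim_to_hop_multiple(audio: torch.Tensor, hop_size: int) -> torch.Tensor:
    """Drop trailing samples so the length is an exact multiple of hop_size."""
    max_frames = audio.size(0) // hop_size   # complete frames only (round down)
    return audio[: max_frames * hop_size]

audio = torch.randn(22050)                   # 1 s at 22.05 kHz
trimmed = trim_to_hop_multiple(audio, 256)   # 22050 // 256 = 86 frames
assert trimmed.size(0) == 86 * 256           # 22016 samples; 34 (~1.5 ms) dropped
```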
Expand Down Expand Up @@ -407,11 +420,12 @@
sys.exit(1)

def create_path(self, item: dict, folder: str, fn: str) -> Path:
return (
path = (
self.save_dir
/ folder
/ self.sep.join([item["basename"], item["speaker"], item["language"], fn])
)
return path

def process_one_audio(
self, item: dict, data_dir, sox_effects: list[list]
@@ -450,6 +464,7 @@
audio_path,
resample_rate=self.input_sampling_rate,
sox_effects=sox_effects,
hop_size=self.audio_config.fft_hop_size,
)
if input_audio is None:
return None
@@ -472,6 +487,7 @@
resample_rate=self.output_sampling_rate,
sox_effects=sox_effects,
update_counters=False,
hop_size=self.output_hop_size,
)
if output_audio is not None:
output_audio = output_audio.unsqueeze(0)
@@ -627,12 +643,14 @@
binomial_interpolator(input_spec.size(1), len(phone_tokens))
)
assert input_spec.size(1) == phone_attn_prior.size(0)
assert len(phone_tokens) == phone_attn_prior.size(1)

save_tensor(phone_attn_prior, phone_attn_prior_path)
if process_characters:
character_attn_prior = torch.from_numpy(
binomial_interpolator(input_spec.size(1), len(character_tokens))
)
assert input_spec.size(1) == character_attn_prior.size(0)
assert len(character_tokens) == character_attn_prior.size(1)
save_tensor(character_attn_prior, character_attn_prior_path)

def process_text(
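The two new asserts pin down the prior's shape: `(spec_frames, n_tokens)`, one row per spectrogram frame and one column per token. For reference, a FastPitch-style beta-binomial prior has exactly this shape; the sketch below is a common formulation and only an assumption about how EveryVoice's `binomial_interpolator` is implemented:

```python
import numpy as np
from scipy.stats import betabinom

def beta_binomial_prior(spec_frames: int, n_tokens: int, scaling: float = 1.0) -> np.ndarray:
    """Soft diagonal alignment prior of shape (spec_frames, n_tokens)."""
    prior = np.zeros((spec_frames, n_tokens))
    for t in range(1, spec_frames + 1):
        rv = betabinom(n_tokens - 1, scaling * t, scaling * (spec_frames + 1 - t))
        prior[t - 1] = rv.pmf(np.arange(n_tokens))
    return prior

prior = beta_binomial_prior(spec_frames=10, n_tokens=4)
assert prior.shape == (10, 4)   # input_spec.size(1), len(phone_tokens)
```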
@@ -740,14 +758,24 @@
else:
return (character_tokens, phone_tokens, pfs)

def process_spec(self, item):
def process_spec(
self, item
) -> tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
"""Processes spectral features based on the defined transform (linear, Mel, complex etc)
Processes an 'input' spectrogram based on the sampling rate of the audio input to the vocoder.
Processes an 'output' spectrogram based on the output sampling rate of the vocoder.

We also limit the length of the audio to a number that is evenly divisible by the hop size
"""
input_spec = None
output_spec = None
input_audio_path = self.create_path(
item, "audio", f"audio-{self.input_sampling_rate}.wav"
)
if not input_audio_path.exists():
self.counters.increment("skipped_processes")
logger.info(f"Audio at {input_audio_path} is missing. Skipping...")
return
return input_spec, output_spec

output_audio_path = self.create_path(
item, "audio", f"audio-{self.output_sampling_rate}.wav"
)
@@ -766,18 +794,29 @@
if not output_spec_path.exists() or self.overwrite:
output_audio, _, _ = self.load_audio(output_audio_path)
output_audio = output_audio.squeeze()
# limit the number of frames to be equal to the number of
# available full frames from the audio (equal to fft_hop_size * sampling rate change, if we are upsampling)
# i.e. we round down
max_output_frames = output_audio.size(0) // self.output_hop_size

output_spec = self.extract_spectral_features(
output_audio, self.output_spectral_transform
)
)[:, :max_output_frames]
assert max_output_frames == output_spec.size(1)

save_tensor(output_spec, output_spec_path)

if not input_spec_path.exists() or self.overwrite:
input_audio, _, _ = self.load_audio(input_audio_path)
input_audio = input_audio.squeeze()
# limit the number of frames to be equal to the number of
# available full frames from the audio
# i.e. we round down
max_input_frames = input_audio.size(0) // self.audio_config.fft_hop_size
input_spec = self.extract_spectral_features(
input_audio, self.input_spectral_transform
)
)[:, :max_input_frames]
assert max_input_frames == input_spec.size(1)
save_tensor(input_spec, input_spec_path)
return input_spec, output_spec

def get_process_fn(self, process):
if process == "text":
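Net effect of the `process_spec` changes: the spectrogram's frame axis is clamped to the number of complete frames in the already-trimmed audio. The clamp matters because a centred STFT (torch's default) emits `n_samples // hop_length + 1` frames. A sketch under that assumption, using plain `torch.stft` rather than EveryVoice's transform:

```python
import torch

hop_size = 256
audio = torch.randn(86 * hop_size)        # already trimmed to a hop multiple
max_frames = audio.size(0) // hop_size    # 86 complete frames

# center=True (the default) yields n_samples // hop_length + 1 frames:
spec = torch.stft(
    audio, n_fft=1024, hop_length=hop_size,
    window=torch.hann_window(1024), return_complex=True,
)
assert spec.size(-1) == max_frames + 1    # 87: one frame too many

spec = spec[..., :max_frames]             # round down, as the diff does
assert spec.size(-1) == max_frames
```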
54 changes: 39 additions & 15 deletions everyvoice/tests/test_preprocessing.py
@@ -61,6 +61,7 @@ def test_process_audio_for_alignment(self):
self.wavs_dir / (entry["basename"] + ".wav"),
use_effects=True,
sox_effects=config.preprocessing.source_data[0].sox_effects,
hop_size=config.preprocessing.audio.fft_hop_size,
)
self.assertEqual(sr, 22050)
self.assertEqual(audio.dtype, float32)
@@ -86,6 +87,7 @@ def test_remove_silence(self):
audio_path_with_silence,
use_effects=True,
sox_effects=sox_effects,
hop_size=config.preprocessing.audio.fft_hop_size,
)

self.assertEqual(
@@ -96,10 +98,9 @@
3.5,
"Should be exactly 3.5 seconds of audio at 44100 Hz sampling rate",
)
self.assertAlmostEqual(
processed_audio.size()[0] / processed_sr,
self.assertEqual(
round(processed_audio.size()[0] / processed_sr, 2),
2.5,
4,
msg="Should be about half a second of silence removed from the beginning and end",
)
# should work with resampling too
@@ -108,11 +109,11 @@
use_effects=True,
resample_rate=22050,
sox_effects=sox_effects,
hop_size=config.preprocessing.audio.fft_hop_size,
)
self.assertAlmostEqual(
rs_processed_audio.size()[0] / rs_processed_sr,
self.assertEqual(
round(rs_processed_audio.size()[0] / rs_processed_sr, 2),
2.5,
4,
msg="Should be about half a second of silence removed from the beginning and end when resampled too",
)

@@ -123,12 +124,33 @@ def test_process_empty_audio(self):
self.assertEqual(sr, None)

def test_process_audio(self):
import torchaudio

for entry in self.filelist[1:]:
audio, sr = self.preprocessor.process_audio(
self.wavs_dir / (entry["basename"] + ".wav")
self.wavs_dir / (entry["basename"] + ".wav"), hop_size=256
)
self.assertEqual(sr, 22050)
self.assertEqual(audio.dtype, float32)
# test that truncating according to hop size actually happened
raw_audio, raw_sr = torchaudio.load(
str(self.wavs_dir / (entry["basename"] + ".wav"))
)
# remove channel info
raw_audio = raw_audio.squeeze()
self.assertNotEqual(raw_audio.size(0), audio.size(0))
self.assertLess(raw_audio.size(0) - audio.size(0), 256)
# changing the hop size changes how much is removed
diff_hop_audio, _ = self.preprocessor.process_audio(
self.wavs_dir / (entry["basename"] + ".wav"), hop_size=35
)
self.assertNotEqual(audio.size(0), diff_hop_audio.size(0))
# we should never truncate more than a portion of a single frame
self.assertLess(raw_audio.size(0) - diff_hop_audio.size(0), 35)
with self.assertRaises(ValueError): # missing hop_size
self.preprocessor.process_audio(
self.wavs_dir / (entry["basename"] + ".wav")
)

def test_spectral_feats(self):
linear_vocoder_config = VocoderConfig(
@@ -148,7 +170,8 @@

for entry in self.filelist[1:]:
audio, _ = self.preprocessor.process_audio(
self.wavs_dir / (entry["basename"] + ".wav")
self.wavs_dir / (entry["basename"] + ".wav"),
hop_size=linear_vocoder_config.preprocessing.audio.fft_hop_size,
)

# ming024_feats = np.load(
@@ -203,7 +226,8 @@ def test_pitch(self):

for entry in self.filelist[1:]:
audio, _ = self.preprocessor.process_audio(
self.wavs_dir / (entry["basename"] + ".wav")
self.wavs_dir / (entry["basename"] + ".wav"),
hop_size=pyworld_config.preprocessing.audio.fft_hop_size,
)
dur_path = (
self.lj_preprocessed
@@ -241,7 +265,7 @@ def test_pitch(self):
def test_duration(self):
for entry in self.filelist[1:]:
audio, _ = self.preprocessor.process_audio(
self.wavs_dir / (entry["basename"] + ".wav")
self.wavs_dir / (entry["basename"] + ".wav"), hop_size=256
)
dur_path = (
self.lj_preprocessed
@@ -277,7 +301,8 @@ def test_energy(self):
preprocessor = Preprocessor(frame_energy_config)
for entry in self.filelist[1:]:
audio, _ = self.preprocessor.process_audio(
self.wavs_dir / (entry["basename"] + ".wav")
self.wavs_dir / (entry["basename"] + ".wav"),
hop_size=frame_energy_config.preprocessing.audio.fft_hop_size,
)
dur_path = (
self.lj_preprocessed
@@ -364,9 +389,9 @@ def test_text_processing(self):
preprocessed_dir.mkdir(parents=True, exist_ok=True)
output_filelist = preprocessed_dir / "preprocessed_filelist.psv"
shutil.copyfile(filelist_test_info["path"], output_filelist)
fp_config.preprocessing.source_data[0].filelist = (
filelist_test_info["path"]
)
fp_config.preprocessing.source_data[
0
].filelist = filelist_test_info["path"]
fp_config.preprocessing.save_dir = preprocessed_dir
preprocessor = Preprocessor(fp_config)
with capture_stdout() as output, mute_logger(
@@ -582,7 +607,6 @@ def test_hierarchy(self):
# to_process=("audio", "text", "pfs", "spec", "attn", "energy", "pitch"),
to_process=("audio", "text", "spec", "attn", "energy", "pitch"),
)

for t in ("audio", "spec", "attn", "energy", "pitch"):
# There are two speakers
sources = [d.name for d in tmpdir.glob(f"**/{t}/*")]