From 5a1929aa7f58eb4c5cad879301cf252c7c2dfa5f Mon Sep 17 00:00:00 2001
From: monica-sekoyan
Date: Wed, 29 May 2024 14:46:52 +0000
Subject: [PATCH] Apply isort and black reformatting

Signed-off-by: monica-sekoyan
---
 nemo/collections/asr/models/label_models.py  | 27 ++++++++++++-------
 .../asr/parts/preprocessing/segment.py       | 23 +++++++++++-----
 2 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/nemo/collections/asr/models/label_models.py b/nemo/collections/asr/models/label_models.py
index 8cad69e80b939..100aa2a5b52fe 100644
--- a/nemo/collections/asr/models/label_models.py
+++ b/nemo/collections/asr/models/label_models.py
@@ -341,7 +341,8 @@ def forward_for_export(self, processed_signal, processed_signal_len):
     @typecheck()
     def forward(self, input_signal, input_signal_length):
         processed_signal, processed_signal_len = self.preprocessor(
-            input_signal=input_signal, length=input_signal_length,
+            input_signal=input_signal,
+            length=input_signal_length,
         )
 
         if self.spec_augmentation is not None and self.training:
@@ -591,7 +592,9 @@ def verify_speakers(self, path2audio_file1, path2audio_file2, threshold=0.7):
         return False
 
     @torch.no_grad()
-    def verify_speakers_batch(self, manifest_filepath1, manifest_filepath2, threshold=0.7, batch_size=32, sample_rate=16000, device='cuda'):
+    def verify_speakers_batch(
+        self, manifest_filepath1, manifest_filepath2, threshold=0.7, batch_size=32, sample_rate=16000, device='cuda'
+    ):
         """
         Verify if audio files from the first and second manifests are from the same speaker or not.
 
@@ -606,8 +609,12 @@ def verify_speakers_batch(self, manifest_filepath1, manifest_filepath2, threshol
         Returns:
             True if both audio pair is from same speaker, False otherwise
         """
-        embs1, _, _, _ = self.batch_inference(manifest_filepath1, batch_size=batch_size, sample_rate=sample_rate, device=device)
-        embs2, _, _, _ = self.batch_inference(manifest_filepath2, batch_size=batch_size, sample_rate=sample_rate, device=device)
+        embs1, _, _, _ = self.batch_inference(
+            manifest_filepath1, batch_size=batch_size, sample_rate=sample_rate, device=device
+        )
+        embs2, _, _, _ = self.batch_inference(
+            manifest_filepath2, batch_size=batch_size, sample_rate=sample_rate, device=device
+        )
 
         if embs1.shape != embs2.shape:
             raise ValueError(
@@ -624,10 +631,8 @@ def verify_speakers_batch(self, manifest_filepath1, manifest_filepath2, threshol
         Y = embs2.unsqueeze(dim=2)
         # Score
         similarity_scores = torch.matmul(X, Y).squeeze() / (
-            (
-                torch.matmul(X, X.permute(0, 2, 1)).squeeze() * torch.matmul(Y.permute(0, 2, 1), Y).squeeze()
-            ) ** 0.5
-        )
+            (torch.matmul(X, X.permute(0, 2, 1)).squeeze() * torch.matmul(Y.permute(0, 2, 1), Y).squeeze()) ** 0.5
+        )
         similarity_scores = (similarity_scores + 1) / 2
 
         # Decision
@@ -672,7 +677,9 @@ def batch_inference(self, manifest_filepath, batch_size=32, sample_rate=16000, d
         dataset = AudioToSpeechLabelDataset(manifest_filepath=manifest_filepath, labels=None, featurizer=featurizer)
 
         dataloader = torch.utils.data.DataLoader(
-            dataset=dataset, batch_size=batch_size, collate_fn=dataset.fixed_seq_collate_fn,
+            dataset=dataset,
+            batch_size=batch_size,
+            collate_fn=dataset.fixed_seq_collate_fn,
         )
 
         logits = []
@@ -694,7 +701,7 @@ def batch_inference(self, manifest_filepath, batch_size=32, sample_rate=16000, d
         self.train(mode=mode)
         if mode is True:
             self.unfreeze()
-        
+
         logits, embs, gt_labels = np.asarray(logits), np.asarray(embs), np.asarray(gt_labels)
 
         return embs, logits, gt_labels, trained_labels
diff --git a/nemo/collections/asr/parts/preprocessing/segment.py b/nemo/collections/asr/parts/preprocessing/segment.py
index 7d4410dc6a310..63b0eaba45c4d 100644
--- a/nemo/collections/asr/parts/preprocessing/segment.py
+++ b/nemo/collections/asr/parts/preprocessing/segment.py
@@ -51,7 +51,7 @@
     from pydub import AudioSegment as Audio
     from pydub.exceptions import CouldntDecodeError
 
-    #FFMPEG for some formats needs explicitly defined coding-decoding strategy
+    # FFMPEG for some formats needs explicitly defined coding-decoding strategy
     ffmpeg_codecs = {'opus': 'opus'}
 
 except ModuleNotFoundError:
@@ -374,7 +374,13 @@ def from_file_list(
             sample_rate = target_sr
 
         return cls(
-            samples, sample_rate, target_sr=target_sr, trim=trim, channel_selector=channel_selector, *args, **kwargs,
+            samples,
+            sample_rate,
+            target_sr=target_sr,
+            trim=trim,
+            channel_selector=channel_selector,
+            *args,
+            **kwargs,
         )
 
     @classmethod
@@ -472,9 +478,8 @@ def duration(self):
 
     @property
     def rms_db(self):
-        """Return per-channel RMS value.
-        """
-        mean_square = np.mean(self._samples ** 2, axis=0)
+        """Return per-channel RMS value."""
+        mean_square = np.mean(self._samples**2, axis=0)
         return 10 * np.log10(mean_square)
 
     @property
@@ -485,7 +490,7 @@ def gain_db(self, gain):
         self._samples *= 10.0 ** (gain / 20.0)
 
     def normalize_db(self, target_db=-20, ref_channel=None):
-        """Normalize the signal to a target RMS value in decibels. 
+        """Normalize the signal to a target RMS value in decibels.
         For multi-channel audio, the RMS value is determined by the reference channel (if not None),
         otherwise it will be the maximum RMS across all channels.
         """
@@ -513,7 +518,11 @@ def pad(self, pad_size, symmetric=False):
                 f"Padding not implemented for signals with more that 2 dimensions. Current samples dimension: {samples_ndim}."
             )
         # apply padding
-        self._samples = np.pad(self._samples, pad_width, mode='constant',)
+        self._samples = np.pad(
+            self._samples,
+            pad_width,
+            mode='constant',
+        )
 
     def subsegment(self, start_time=None, end_time=None):
         """Cut the AudioSegment between given boundaries.
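
For context, below is a minimal usage sketch of the verify_speakers_batch method whose signature is rewrapped in this patch. It is illustrative only: the manifest filenames and the "titanet_large" checkpoint name are assumptions and not part of the change; the method's behavior (pairwise scoring of embeddings from the two manifests, thresholded into same/different-speaker decisions) is taken from the diff above.

# Hypothetical usage sketch; manifest paths and checkpoint name are assumptions.
import torch

from nemo.collections.asr.models import EncDecSpeakerLabelModel

if __name__ == "__main__":
    # Load a pretrained speaker-embedding model (assumed checkpoint name).
    model = EncDecSpeakerLabelModel.from_pretrained(model_name="titanet_large")

    # Each manifest is a JSON-lines file; entry i of the first manifest is
    # compared against entry i of the second, per the docstring in the diff.
    decisions = model.verify_speakers_batch(
        "enroll_manifest.json",
        "test_manifest.json",
        threshold=0.7,
        batch_size=32,
        sample_rate=16000,
        device="cuda" if torch.cuda.is_available() else "cpu",
    )
    print(decisions)  # per-pair same-speaker decisions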