
Commit

Apply isort and black reformatting
Signed-off-by: monica-sekoyan <monica-sekoyan@users.noreply.github.com>
monica-sekoyan committed May 29, 2024
1 parent 418e1f0 commit 5a1929a
Showing 2 changed files with 33 additions and 17 deletions.
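All of the hunks below are mechanical formatter output rather than behavioral changes. Assuming the repository's default tool configuration, an equivalent diff would typically be produced by running `isort` and then `black` over the two files.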
27 changes: 17 additions & 10 deletions nemo/collections/asr/models/label_models.py
@@ -341,7 +341,8 @@ def forward_for_export(self, processed_signal, processed_signal_len):
     @typecheck()
     def forward(self, input_signal, input_signal_length):
         processed_signal, processed_signal_len = self.preprocessor(
-            input_signal=input_signal, length=input_signal_length,
+            input_signal=input_signal,
+            length=input_signal_length,
         )
 
         if self.spec_augmentation is not None and self.training:
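The split seen here, and in most hunks below, is black's "magic trailing comma" rule: a call that ends with a trailing comma is exploded to one argument per line. A minimal, runnable illustration with a hypothetical function:

```python
def process(input_signal, length):
    return input_signal[:length]

# With a magic trailing comma in the call, black keeps one argument per line:
out = process(
    input_signal=[0.1, 0.2, 0.3],
    length=2,
)

# Without a trailing comma (and under the line-length limit), black collapses it:
out = process(input_signal=[0.1, 0.2, 0.3], length=2)
```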
@@ -591,7 +592,9 @@ def verify_speakers(self, path2audio_file1, path2audio_file2, threshold=0.7):
         return False
 
     @torch.no_grad()
-    def verify_speakers_batch(self, manifest_filepath1, manifest_filepath2, threshold=0.7, batch_size=32, sample_rate=16000, device='cuda'):
+    def verify_speakers_batch(
+        self, manifest_filepath1, manifest_filepath2, threshold=0.7, batch_size=32, sample_rate=16000, device='cuda'
+    ):
         """
         Verify if audio files from the first and second manifests are from the same speaker or not.
@@ -606,8 +609,12 @@ def verify_speakers_batch(self, manifest_filepath1, manifest_filepath2, threshold=0.7, batch_size=32, sample_rate=16000, device='cuda'):
         Returns:
             True if both audio pair is from same speaker, False otherwise
         """
-        embs1, _, _, _ = self.batch_inference(manifest_filepath1, batch_size=batch_size, sample_rate=sample_rate, device=device)
-        embs2, _, _, _ = self.batch_inference(manifest_filepath2, batch_size=batch_size, sample_rate=sample_rate, device=device)
+        embs1, _, _, _ = self.batch_inference(
+            manifest_filepath1, batch_size=batch_size, sample_rate=sample_rate, device=device
+        )
+        embs2, _, _, _ = self.batch_inference(
+            manifest_filepath2, batch_size=batch_size, sample_rate=sample_rate, device=device
+        )
 
         if embs1.shape != embs2.shape:
             raise ValueError(
@@ -624,10 +631,8 @@ def verify_speakers_batch(self, manifest_filepath1, manifest_filepath2, threshold=0.7, batch_size=32, sample_rate=16000, device='cuda'):
         Y = embs2.unsqueeze(dim=2)
         # Score
         similarity_scores = torch.matmul(X, Y).squeeze() / (
-            (
-                torch.matmul(X, X.permute(0, 2, 1)).squeeze() * torch.matmul(Y.permute(0, 2, 1), Y).squeeze()
-            ) ** 0.5
-        )
+            (torch.matmul(X, X.permute(0, 2, 1)).squeeze() * torch.matmul(Y.permute(0, 2, 1), Y).squeeze()) ** 0.5
+        )
         similarity_scores = (similarity_scores + 1) / 2
 
         # Decision
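For reference, the expression in this hunk is a batched cosine similarity mapped from [-1, 1] to [0, 1]. Assuming nonzero embeddings, the same scores come out of torch.nn.functional.cosine_similarity; a minimal sketch, with hypothetical batch and embedding sizes:

```python
import torch
import torch.nn.functional as F

embs1 = torch.randn(4, 192)  # hypothetical batch of speaker embeddings
embs2 = torch.randn(4, 192)

# Equivalent to dot(X, Y) / sqrt(dot(X, X) * dot(Y, Y)) per pair
scores = F.cosine_similarity(embs1, embs2, dim=-1)
scores = (scores + 1) / 2    # map from [-1, 1] to [0, 1], as in the diff
decisions = scores >= 0.7    # threshold matching verify_speakers_batch's default
```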
@@ -672,7 +677,9 @@ def batch_inference(self, manifest_filepath, batch_size=32, sample_rate=16000, device='cuda'):
         dataset = AudioToSpeechLabelDataset(manifest_filepath=manifest_filepath, labels=None, featurizer=featurizer)
 
         dataloader = torch.utils.data.DataLoader(
-            dataset=dataset, batch_size=batch_size, collate_fn=dataset.fixed_seq_collate_fn,
+            dataset=dataset,
+            batch_size=batch_size,
+            collate_fn=dataset.fixed_seq_collate_fn,
         )
 
         logits = []
@@ -694,7 +701,7 @@ def batch_inference(self, manifest_filepath, batch_size=32, sample_rate=16000, device='cuda'):
         self.train(mode=mode)
         if mode is True:
             self.unfreeze()
-
+
         logits, embs, gt_labels = np.asarray(logits), np.asarray(embs), np.asarray(gt_labels)
 
         return embs, logits, gt_labels, trained_labels
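A hedged usage sketch of the batched API touched here; the checkpoint name and manifest paths are placeholders, and loading via EncDecSpeakerLabelModel.from_pretrained assumes a standard NeMo install:

```python
from nemo.collections.asr.models import EncDecSpeakerLabelModel

model = EncDecSpeakerLabelModel.from_pretrained("titanet_large")  # placeholder checkpoint name
decisions = model.verify_speakers_batch(
    "enroll_manifest.json",  # placeholder manifest paths
    "trial_manifest.json",
    threshold=0.7,
    batch_size=32,
    sample_rate=16000,
    device="cuda",
)  # per-pair True/False decisions, per the docstring above
```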
23 changes: 16 additions & 7 deletions nemo/collections/asr/parts/preprocessing/segment.py
@@ -51,7 +51,7 @@
     from pydub import AudioSegment as Audio
     from pydub.exceptions import CouldntDecodeError
 
-    #FFMPEG for some formats needs explicitly defined coding-decoding strategy
+    # FFMPEG for some formats needs explicitly defined coding-decoding strategy
     ffmpeg_codecs = {'opus': 'opus'}
 
 except ModuleNotFoundError:
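The ffmpeg_codecs mapping is presumably consumed when decoding through pydub, whose AudioSegment.from_file accepts a codec argument; the actual call site is not part of this diff, so the following is an assumption-laden sketch with a placeholder file name:

```python
from pydub import AudioSegment as Audio

ffmpeg_codecs = {'opus': 'opus'}

fmt = 'opus'  # hypothetical format inferred from the file extension
audio = Audio.from_file('sample.opus', codec=ffmpeg_codecs.get(fmt))  # placeholder path
```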
@@ -374,7 +374,13 @@ def from_file_list(
         sample_rate = target_sr
 
         return cls(
-            samples, sample_rate, target_sr=target_sr, trim=trim, channel_selector=channel_selector, *args, **kwargs,
+            samples,
+            sample_rate,
+            target_sr=target_sr,
+            trim=trim,
+            channel_selector=channel_selector,
+            *args,
+            **kwargs,
         )
 
     @classmethod
@@ -472,9 +478,8 @@ def duration(self):
 
     @property
     def rms_db(self):
-        """Return per-channel RMS value.
-        """
-        mean_square = np.mean(self._samples ** 2, axis=0)
+        """Return per-channel RMS value."""
+        mean_square = np.mean(self._samples**2, axis=0)
         return 10 * np.log10(mean_square)
 
     @property
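The change here is only spacing (`self._samples ** 2` becomes `self._samples**2`, black's style for simple power operands); the math is unchanged: per-channel RMS power in dB is 10·log10(mean(x²)). A small worked example:

```python
import numpy as np

samples = np.array([[0.5, -0.25], [-0.5, 0.25]])  # shape (time, channels)
mean_square = np.mean(samples**2, axis=0)         # [0.25, 0.0625]
rms_db = 10 * np.log10(mean_square)               # approx [-6.02, -12.04] dB
```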
@@ -485,7 +490,7 @@ def gain_db(self, gain):
         self._samples *= 10.0 ** (gain / 20.0)
 
     def normalize_db(self, target_db=-20, ref_channel=None):
-        """Normalize the signal to a target RMS value in decibels.
+        """Normalize the signal to a target RMS value in decibels.
         For multi-channel audio, the RMS value is determined by the reference channel (if not None),
         otherwise it will be the maximum RMS across all channels.
         """
@@ -513,7 +518,11 @@ def pad(self, pad_size, symmetric=False):
                 f"Padding not implemented for signals with more that 2 dimensions. Current samples dimension: {samples_ndim}."
             )
         # apply padding
-        self._samples = np.pad(self._samples, pad_width, mode='constant',)
+        self._samples = np.pad(
+            self._samples,
+            pad_width,
+            mode='constant',
+        )
 
     def subsegment(self, start_time=None, end_time=None):
         """Cut the AudioSegment between given boundaries.