From 9f172fa850b1e7d22aa9e079c127d8a5afb71751 Mon Sep 17 00:00:00 2001 From: Nithin Rao Date: Tue, 24 Sep 2024 14:29:12 -0400 Subject: [PATCH 1/2] add support for train_time_interval to consider hydra object (#10559) Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri --- nemo/utils/exp_manager.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index 201ae0a37bd7..a5fd417741e1 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -116,7 +116,7 @@ class CallbackParams: auto_insert_metric_name: bool = True every_n_epochs: Optional[int] = 1 every_n_train_steps: Optional[int] = None - train_time_interval: Optional[str] = None + train_time_interval: Optional[Any] = None prefix: Optional[str] = None # If None, exp_manager will attempt to handle the filepath postfix: str = ".nemo" save_best_model: bool = False @@ -373,6 +373,8 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo - max_time (str): The maximum wall clock time *per run*. This is intended to be used on clusters where you want a checkpoint to be saved after this specified time and be able to resume from that checkpoint. Defaults to None. - seconds_to_sleep (float): seconds to sleep non rank 0 processes for. Used to give enough time for rank 0 to initialize + - train_time_interval (timedelta): pass an object of timedelta to save the model every timedelta. Defaults to None. + (use _target_ with hydra to achieve this) returns: log_dir (Path): The final logging directory where logging files are saved. 
Usually the concatenation of From e3b95cbb01a6cc0da41f6b646bf29c8afda43a04 Mon Sep 17 00:00:00 2001 From: Nithin Rao Date: Fri, 20 Sep 2024 12:02:47 -0400 Subject: [PATCH 2/2] replace unbiased with correction (#10555) * replace unbiased with correction Signed-off-by: Nithin Rao Koluguri * Apply isort and black reformatting Signed-off-by: nithinraok --------- Signed-off-by: Nithin Rao Koluguri Signed-off-by: nithinraok Co-authored-by: Nithin Rao Koluguri Co-authored-by: nithinraok --- .../asr/parts/submodules/tdnn_attention.py | 45 ++++++++++++------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/nemo/collections/asr/parts/submodules/tdnn_attention.py b/nemo/collections/asr/parts/submodules/tdnn_attention.py index 14f27ef41af7..0b504efcef7a 100644 --- a/nemo/collections/asr/parts/submodules/tdnn_attention.py +++ b/nemo/collections/asr/parts/submodules/tdnn_attention.py @@ -32,7 +32,7 @@ class StatsPoolLayer(nn.Module): pool_mode: Type of pool mode. Supported modes are 'xvector' (mean and standard deviation) and 'tap' (time average pooling, i.e., mean) eps: Epsilon, minimum value before taking the square root, when using 'xvector' mode. - biased: Whether to use the biased estimator for the standard deviation when using 'xvector' mode. The default + unbiased: Whether to use the unbiased estimator for the standard deviation when using 'xvector' mode. The default for torch.Tensor.std() is True. Returns: @@ -42,7 +42,7 @@ class StatsPoolLayer(nn.Module): ValueError if an unsupported pooling mode is specified. 
""" - def __init__(self, feat_in: int, pool_mode: str = 'xvector', eps: float = 1e-10, biased: bool = True): + def __init__(self, feat_in: int, pool_mode: str = 'xvector', eps: float = 1e-10, unbiased: bool = True): super().__init__() supported_modes = {"xvector", "tap"} if pool_mode not in supported_modes: @@ -50,7 +50,7 @@ def __init__(self, feat_in: int, pool_mode: str = 'xvector', eps: float = 1e-10, self.pool_mode = pool_mode self.feat_in = feat_in self.eps = eps - self.biased = biased + self.unbiased = unbiased if self.pool_mode == 'xvector': # Mean + std self.feat_in *= 2 @@ -59,7 +59,8 @@ def forward(self, encoder_output, length=None): if length is None: mean = encoder_output.mean(dim=-1) # Time Axis if self.pool_mode == 'xvector': - std = encoder_output.std(dim=-1) + correction = 1 if self.unbiased else 0 + std = encoder_output.std(dim=-1, correction=correction).clamp(min=self.eps) pooled = torch.cat([mean, std], dim=-1) else: pooled = mean @@ -71,12 +72,13 @@ def forward(self, encoder_output, length=None): # Re-scale to get padded means means = means * (encoder_output.shape[-1] / length).unsqueeze(-1) if self.pool_mode == "xvector": + correction = 1 if self.unbiased else 0 stds = ( encoder_output.sub(means.unsqueeze(-1)) .masked_fill(mask, 0.0) .pow(2.0) .sum(-1) # [B, D, T] -> [B, D] - .div(length.view(-1, 1).sub(1 if self.biased else 0)) + .div(length.view(-1, 1).sub(correction)) .clamp(min=self.eps) .sqrt() ) @@ -104,7 +106,7 @@ def make_seq_mask_like( def lens_to_mask(lens: List[int], max_len: int, device: str = None): """ - outputs masking labels for list of lengths of audio features, with max length of any + outputs masking labels for list of lengths of audio features, with max length of any mask as max_len input: lens: list of lens @@ -124,8 +126,8 @@ def get_statistics_with_mask(x: torch.Tensor, m: torch.Tensor, dim: int = 2, eps """ compute mean and standard deviation of input(x) provided with its masking labels (m) input: - x: feature input - m: 
averaged mask labels + x: feature input + m: averaged mask labels output: mean: mean of input features std: stadard deviation of input features @@ -146,7 +148,7 @@ class TDNNModule(nn.Module): stride: stride for conv layer padding: padding for conv layer (default None: chooses padding value such that input and output feature shape matches) output: - tdnn layer output + tdnn layer output """ def __init__( @@ -183,7 +185,7 @@ class MaskedSEModule(nn.Module): """ Squeeze and Excite module implementation with conv1d layers input: - inp_filters: input filter channel size + inp_filters: input filter channel size se_filters: intermediate squeeze and excite channel output and input size out_filters: output filter channel size kernel_size: kernel_size for both conv1d layers @@ -196,10 +198,20 @@ class MaskedSEModule(nn.Module): def __init__(self, inp_filters: int, se_filters: int, out_filters: int, kernel_size: int = 1, dilation: int = 1): super().__init__() self.se_layer = nn.Sequential( - nn.Conv1d(inp_filters, se_filters, kernel_size=kernel_size, dilation=dilation,), + nn.Conv1d( + inp_filters, + se_filters, + kernel_size=kernel_size, + dilation=dilation, + ), nn.ReLU(), nn.BatchNorm1d(se_filters), - nn.Conv1d(se_filters, out_filters, kernel_size=kernel_size, dilation=dilation,), + nn.Conv1d( + se_filters, + out_filters, + kernel_size=kernel_size, + dilation=dilation, + ), nn.Sigmoid(), ) @@ -220,7 +232,7 @@ class TDNNSEModule(nn.Module): Modified building SE_TDNN group module block from ECAPA implementation for faster training and inference Reference: ECAPA-TDNN Embeddings for Speaker Diarization (https://arxiv.org/pdf/2104.01466.pdf) inputs: - inp_filters: input filter channel size + inp_filters: input filter channel size out_filters: output filter channel size group_scale: scale value to group wider conv channels (deafult:8) se_channels: squeeze and excite output channel size (deafult: 1024/8= 128) @@ -276,7 +288,7 @@ class AttentivePoolLayer(nn.Module): inp_filters: 
input feature channel length from encoder attention_channels: intermediate attention channel size kernel_size: kernel_size for TDNN and attention conv1d layers (default: 1) - dilation: dilation size for TDNN and attention conv1d layers (default: 1) + dilation: dilation size for TDNN and attention conv1d layers (default: 1) """ def __init__( @@ -295,7 +307,10 @@ def __init__( TDNNModule(inp_filters * 3, attention_channels, kernel_size=kernel_size, dilation=dilation), nn.Tanh(), nn.Conv1d( - in_channels=attention_channels, out_channels=inp_filters, kernel_size=kernel_size, dilation=dilation, + in_channels=attention_channels, + out_channels=inp_filters, + kernel_size=kernel_size, + dilation=dilation, ), ) self.eps = eps