From 9f172fa850b1e7d22aa9e079c127d8a5afb71751 Mon Sep 17 00:00:00 2001 From: Nithin Rao Date: Tue, 24 Sep 2024 14:29:12 -0400 Subject: [PATCH 1/2] add support for train_time_interval to consider hydra object (#10559) Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri --- nemo/utils/exp_manager.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index 201ae0a37bd7..a5fd417741e1 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -116,7 +116,7 @@ class CallbackParams: auto_insert_metric_name: bool = True every_n_epochs: Optional[int] = 1 every_n_train_steps: Optional[int] = None - train_time_interval: Optional[str] = None + train_time_interval: Optional[Any] = None prefix: Optional[str] = None # If None, exp_manager will attempt to handle the filepath postfix: str = ".nemo" save_best_model: bool = False @@ -373,6 +373,8 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo - max_time (str): The maximum wall clock time *per run*. This is intended to be used on clusters where you want a checkpoint to be saved after this specified time and be able to resume from that checkpoint. Defaults to None. - seconds_to_sleep (float): seconds to sleep non rank 0 processes for. Used to give enough time for rank 0 to initialize + - train_time_interval (timedelta): pass an object of timedelta to save the model every timedelta. Defaults to None. + (use _target_ with hydra to achieve this) returns: log_dir (Path): The final logging directory where logging files are saved. 
Usually the concatenation of From e3b95cbb01a6cc0da41f6b646bf29c8afda43a04 Mon Sep 17 00:00:00 2001 From: Nithin Rao Date: Fri, 20 Sep 2024 12:02:47 -0400 Subject: [PATCH 2/2] replace unbiased with correction (#10555) * replace unbiased with correction Signed-off-by: Nithin Rao Koluguri * Apply isort and black reformatting Signed-off-by: nithinraok --------- Signed-off-by: Nithin Rao Koluguri Signed-off-by: nithinraok Co-authored-by: Nithin Rao Koluguri Co-authored-by: nithinraok --- .../asr/parts/submodules/tdnn_attention.py | 45 ++++++++++++------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/nemo/collections/asr/parts/submodules/tdnn_attention.py b/nemo/collections/asr/parts/submodules/tdnn_attention.py index 14f27ef41af7..0b504efcef7a 100644 --- a/nemo/collections/asr/parts/submodules/tdnn_attention.py +++ b/nemo/collections/asr/parts/submodules/tdnn_attention.py @@ -32,7 +32,7 @@ class StatsPoolLayer(nn.Module): pool_mode: Type of pool mode. Supported modes are 'xvector' (mean and standard deviation) and 'tap' (time average pooling, i.e., mean) eps: Epsilon, minimum value before taking the square root, when using 'xvector' mode. - biased: Whether to use the biased estimator for the standard deviation when using 'xvector' mode. The default + unbiased: Whether to use the unbiased estimator for the standard deviation when using 'xvector' mode. The default for torch.Tensor.std() is True. Returns: @@ -42,7 +42,7 @@ class StatsPoolLayer(nn.Module): ValueError if an unsupported pooling mode is specified. 
""" - def __init__(self, feat_in: int, pool_mode: str = 'xvector', eps: float = 1e-10, biased: bool = True): + def __init__(self, feat_in: int, pool_mode: str = 'xvector', eps: float = 1e-10, unbiased: bool = True): super().__init__() supported_modes = {"xvector", "tap"} if pool_mode not in supported_modes: @@ -50,7 +50,7 @@ def __init__(self, feat_in: int, pool_mode: str = 'xvector', eps: float = 1e-10, self.pool_mode = pool_mode self.feat_in = feat_in self.eps = eps - self.biased = biased + self.unbiased = unbiased if self.pool_mode == 'xvector': # Mean + std self.feat_in *= 2 @@ -59,7 +59,8 @@ def forward(self, encoder_output, length=None): if length is None: mean = encoder_output.mean(dim=-1) # Time Axis if self.pool_mode == 'xvector': - std = encoder_output.std(dim=-1) + correction = 1 if self.unbiased else 0 + std = encoder_output.std(dim=-1, correction=correction).clamp(min=self.eps) pooled = torch.cat([mean, std], dim=-1) else: pooled = mean @@ -71,12 +72,13 @@ def forward(self, encoder_output, length=None): # Re-scale to get padded means means = means * (encoder_output.shape[-1] / length).unsqueeze(-1) if self.pool_mode == "xvector": + correction = 1 if self.unbiased else 0 stds = ( encoder_output.sub(means.unsqueeze(-1)) .masked_fill(mask, 0.0) .pow(2.0) .sum(-1) # [B, D, T] -> [B, D] - .div(length.view(-1, 1).sub(1 if self.biased else 0)) + .div(length.view(-1, 1).sub(correction)) .clamp(min=self.eps) .sqrt() ) @@ -104,7 +106,7 @@ def make_seq_mask_like( def lens_to_mask(lens: List[int], max_len: int, device: str = None): """ - outputs masking labels for list of lengths of audio features, with max length of any + outputs masking labels for list of lengths of audio features, with max length of any mask as max_len input: lens: list of lens @@ -124,8 +126,8 @@ def get_statistics_with_mask(x: torch.Tensor, m: torch.Tensor, dim: int = 2, eps """ compute mean and standard deviation of input(x) provided with its masking labels (m) input: - x: feature input - m: 
averaged mask labels + x: feature input + m: averaged mask labels output: mean: mean of input features std: stadard deviation of input features @@ -146,7 +148,7 @@ class TDNNModule(nn.Module): stride: stride for conv layer padding: padding for conv layer (default None: chooses padding value such that input and output feature shape matches) output: - tdnn layer output + tdnn layer output """ def __init__( @@ -183,7 +185,7 @@ class MaskedSEModule(nn.Module): """ Squeeze and Excite module implementation with conv1d layers input: - inp_filters: input filter channel size + inp_filters: input filter channel size se_filters: intermediate squeeze and excite channel output and input size out_filters: output filter channel size kernel_size: kernel_size for both conv1d layers @@ -196,10 +198,20 @@ class MaskedSEModule(nn.Module): def __init__(self, inp_filters: int, se_filters: int, out_filters: int, kernel_size: int = 1, dilation: int = 1): super().__init__() self.se_layer = nn.Sequential( - nn.Conv1d(inp_filters, se_filters, kernel_size=kernel_size, dilation=dilation,), + nn.Conv1d( + inp_filters, + se_filters, + kernel_size=kernel_size, + dilation=dilation, + ), nn.ReLU(), nn.BatchNorm1d(se_filters), - nn.Conv1d(se_filters, out_filters, kernel_size=kernel_size, dilation=dilation,), + nn.Conv1d( + se_filters, + out_filters, + kernel_size=kernel_size, + dilation=dilation, + ), nn.Sigmoid(), ) @@ -220,7 +232,7 @@ class TDNNSEModule(nn.Module): Modified building SE_TDNN group module block from ECAPA implementation for faster training and inference Reference: ECAPA-TDNN Embeddings for Speaker Diarization (https://arxiv.org/pdf/2104.01466.pdf) inputs: - inp_filters: input filter channel size + inp_filters: input filter channel size out_filters: output filter channel size group_scale: scale value to group wider conv channels (deafult:8) se_channels: squeeze and excite output channel size (deafult: 1024/8= 128) @@ -276,7 +288,7 @@ class AttentivePoolLayer(nn.Module): inp_filters: 
input feature channel length from encoder attention_channels: intermediate attention channel size kernel_size: kernel_size for TDNN and attention conv1d layers (default: 1) - dilation: dilation size for TDNN and attention conv1d layers (default: 1) + dilation: dilation size for TDNN and attention conv1d layers (default: 1) """ def __init__( @@ -295,7 +307,10 @@ def __init__( TDNNModule(inp_filters * 3, attention_channels, kernel_size=kernel_size, dilation=dilation), nn.Tanh(), nn.Conv1d( - in_channels=attention_channels, out_channels=inp_filters, kernel_size=kernel_size, dilation=dilation, + in_channels=attention_channels, + out_channels=inp_filters, + kernel_size=kernel_size, + dilation=dilation, ), ) self.eps = eps