From 9165de5f57a2d14b51d430d8e2b32156ff4899c0 Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Sat, 16 Jul 2022 22:47:05 +0800
Subject: [PATCH 01/40] add ScaledLSTM

---
 .../pruned_transducer_stateless2/scaling.py   | 168 ++++++++++++++++--
 1 file changed, 157 insertions(+), 11 deletions(-)

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py b/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
index c190be626f..54f4a53c50 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
@@ -1,4 +1,4 @@
-# Copyright    2022  Xiaomi Corp.        (authors: Daniel Povey)
+# Copyright    2022  Xiaomi Corp.        (authors: Daniel Povey, Zengwei Yao)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -21,7 +21,8 @@
 
 import torch
 import torch.nn as nn
-from torch import Tensor
+from torch import Tensor, _VF
+from torch.nn.utils.rnn import PackedSequence
 
 
 def _ntuple(n):
@@ -154,7 +155,7 @@ def __init__(
     def forward(self, x: Tensor) -> Tensor:
         assert x.shape[self.channel_dim] == self.num_channels
         scales = (
-            torch.mean(x ** 2, dim=self.channel_dim, keepdim=True)
+            torch.mean(x**2, dim=self.channel_dim, keepdim=True)
             + self.eps.exp()
         ) ** -0.5
         return x * scales
@@ -207,12 +208,12 @@ def __init__(
 
     def _reset_parameters(self, initial_speed: float):
         std = 0.1 / initial_speed
-        a = (3 ** 0.5) * std
+        a = (3**0.5) * std
         nn.init.uniform_(self.weight, -a, a)
         if self.bias is not None:
             nn.init.constant_(self.bias, 0.0)
         fan_in = self.weight.shape[1] * self.weight[0][0].numel()
-        scale = fan_in ** -0.5  # 1/sqrt(fan_in)
+        scale = fan_in**-0.5  # 1/sqrt(fan_in)
         with torch.no_grad():
             self.weight_scale += torch.tensor(scale / std).log()
 
@@ -256,12 +257,12 @@ def __init__(
 
     def _reset_parameters(self, initial_speed: float):
         std = 0.1 / initial_speed
-        a = (3 ** 0.5) * std
+        a = (3**0.5) * std
         nn.init.uniform_(self.weight, -a, a)
         if self.bias is not None:
             nn.init.constant_(self.bias, 0.0)
         fan_in = self.weight.shape[1] * self.weight[0][0].numel()
-        scale = fan_in ** -0.5  # 1/sqrt(fan_in)
+        scale = fan_in**-0.5  # 1/sqrt(fan_in)
         with torch.no_grad():
             self.weight_scale += torch.tensor(scale / std).log()
 
@@ -325,12 +326,12 @@ def __init__(
 
     def _reset_parameters(self, initial_speed: float):
         std = 0.1 / initial_speed
-        a = (3 ** 0.5) * std
+        a = (3**0.5) * std
         nn.init.uniform_(self.weight, -a, a)
         if self.bias is not None:
             nn.init.constant_(self.bias, 0.0)
         fan_in = self.weight.shape[1] * self.weight[0][0].numel()
-        scale = fan_in ** -0.5  # 1/sqrt(fan_in)
+        scale = fan_in**-0.5  # 1/sqrt(fan_in)
         with torch.no_grad():
             self.weight_scale += torch.tensor(scale / std).log()
 
@@ -376,6 +377,137 @@ def forward(self, input: Tensor) -> Tensor:
         return self._conv_forward(input, self.get_weight())
 
 
+class ScaledLSTM(nn.LSTM):
+    # See docs for ScaledLinear.
+    # This class implements single-layer LSTM with scaling mechanism, using `torch._VF.lstm`
+    # Please refer to https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/rnn.py
+    def __init__(
+        self,
+        *args,
+        initial_scale: float = 1.0,
+        initial_speed: float = 1.0,
+        **kwargs
+    ):
+        # Hardcode num_layers=1 and proj_size=0 here
+        super(ScaledLSTM, self).__init__(
+            *args, num_layers=1, proj_size=0, **kwargs
+        )
+        initial_scale = torch.tensor(initial_scale).log()
+        self._scales_names = []
+        self._scales = []
+        for name in self._flat_weights_names:
+            scale_name = name + "_scale"
+            self._scales_names.append(scale_name)
+            param = nn.Parameter(initial_scale.clone().detach())
+            setattr(self, scale_name, param)
+            self._scales.append(param)
+
+        self._reset_parameters(
+            initial_speed
+        )  # Overrides the reset_parameters in base class
+
+    def _reset_parameters(self, initial_speed: float):
+        std = 0.1 / initial_speed
+        a = (3**0.5) * std  # U(-a, a) has standard deviation `std`
+        fan_in = self.input_size  # NOTE(review): reused for weight_hh?
+        scale = fan_in**-0.5
+        v = scale / std
+        for idx, name in enumerate(self._flat_weights_names):
+            if "weight" in name:
+                nn.init.uniform_(self._flat_weights[idx], -a, a)
+                with torch.no_grad():
+                    self._scales[idx] += torch.tensor(v).log()
+            elif "bias" in name:
+                nn.init.constant_(self._flat_weights[idx], 0.0)
+
+    def get_flat_weights(self):
+        flat_weights = []
+        for idx in range(len(self._flat_weights_names)):
+            flat_weights.append(
+                self._flat_weights[idx] * self._scales[idx].exp()
+            )
+        return flat_weights
+
+    def forward(self, input, hx=None):
+        # This function is copied from https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/rnn.py  # noqa
+        # The only change is for calling `_VF.lstm()`:
+        # self._flat_weights -> self.get_flat_weights()
+        orig_input = input
+        # xxx: isinstance check needs to be in conditional for TorchScript to compile
+        if isinstance(orig_input, PackedSequence):
+            input, batch_sizes, sorted_indices, unsorted_indices = input
+            max_batch_size = batch_sizes[0]
+            max_batch_size = int(max_batch_size)
+        else:
+            batch_sizes = None
+            max_batch_size = (
+                input.size(0) if self.batch_first else input.size(1)
+            )
+            sorted_indices = None
+            unsorted_indices = None
+
+        if hx is None:
+            num_directions = 2 if self.bidirectional else 1
+            real_hidden_size = (
+                self.proj_size if self.proj_size > 0 else self.hidden_size
+            )
+            h_zeros = torch.zeros(
+                self.num_layers * num_directions,
+                max_batch_size,
+                real_hidden_size,
+                dtype=input.dtype,
+                device=input.device,
+            )
+            c_zeros = torch.zeros(
+                self.num_layers * num_directions,
+                max_batch_size,
+                self.hidden_size,
+                dtype=input.dtype,
+                device=input.device,
+            )
+            hx = (h_zeros, c_zeros)
+        else:
+            # Each batch of the hidden state should match the input sequence that
+            # the user believes he/she is passing in.
+            hx = self.permute_hidden(hx, sorted_indices)
+
+        self.check_forward_args(input, hx, batch_sizes)
+        if batch_sizes is None:
+            result = _VF.lstm(
+                input,
+                hx,
+                self.get_flat_weights(),
+                self.bias,
+                self.num_layers,
+                self.dropout,
+                self.training,
+                self.bidirectional,
+                self.batch_first,
+            )
+        else:
+            result = _VF.lstm(
+                input,
+                batch_sizes,
+                hx,
+                self.get_flat_weights(),
+                self.bias,
+                self.num_layers,
+                self.dropout,
+                self.training,
+                self.bidirectional,
+            )
+        output = result[0]
+        hidden = result[1:]
+        # xxx: isinstance check needs to be in conditional for TorchScript to compile
+        if isinstance(orig_input, PackedSequence):
+            output_packed = PackedSequence(
+                output, batch_sizes, sorted_indices, unsorted_indices
+            )
+            return output_packed, self.permute_hidden(hidden, unsorted_indices)
+        else:
+            return output, self.permute_hidden(hidden, unsorted_indices)
+
+
 class ActivationBalancer(torch.nn.Module):
     """
     Modifies the backpropped derivatives of a function to try to encourage, for
@@ -711,8 +843,8 @@ def _test_basic_norm():
     y = m(x)
 
     assert y.shape == x.shape
-    x_rms = (x ** 2).mean().sqrt()
-    y_rms = (y ** 2).mean().sqrt()
+    x_rms = (x**2).mean().sqrt()
+    y_rms = (y**2).mean().sqrt()
     print("x rms = ", x_rms)
     print("y rms = ", y_rms)
     assert y_rms < x_rms
@@ -726,8 +858,22 @@ def _test_double_swish_deriv():
     torch.autograd.gradcheck(m, x)
 
 
+def _test_scaled_lstm():
+    N, L = 2, 30
+    dim_in, dim_hidden = 10, 20
+    m = ScaledLSTM(input_size=dim_in, hidden_size=dim_hidden, bias=True)
+    x = torch.randn(L, N, dim_in)
+    h0 = torch.randn(1, N, dim_hidden)
+    c0 = torch.randn(1, N, dim_hidden)
+    y, (h, c) = m(x, (h0, c0))
+    assert y.shape == (L, N, dim_hidden)
+    assert h.shape == (1, N, dim_hidden)
+    assert c.shape == (1, N, dim_hidden)
+
+
 if __name__ == "__main__":
     _test_activation_balancer_sign()
     _test_activation_balancer_magnitude()
     _test_basic_norm()
     _test_double_swish_deriv()
+    _test_scaled_lstm()

From 7c9fcfa5c9318386a72fe12cdaf243b97fc719fc Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Sat, 16 Jul 2022 22:50:42 +0800
Subject: [PATCH 02/40] add RNNEncoderLayer and RNNEncoder classes in lstm.py

---
 .../ASR/lstm_transducer_stateless/lstm.py     | 222 ++++++++++++++++++
 1 file changed, 222 insertions(+)
 create mode 100644 egs/librispeech/ASR/lstm_transducer_stateless/lstm.py

diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py b/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
new file mode 100644
index 0000000000..a0f4637976
--- /dev/null
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
@@ -0,0 +1,222 @@
+# Copyright    2022  Xiaomi Corp.        (authors: Zengwei Yao)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import math
+import warnings
+from typing import List, Optional, Tuple
+
+import torch
+from encoder_interface import EncoderInterface
+from scaling import (
+    ActivationBalancer,
+    BasicNorm,
+    DoubleSwish,
+    ScaledConv1d,
+    ScaledConv2d,
+    ScaledLinear,
+    ScaledLSTM,
+)
+from torch import Tensor, nn
+
+
+class RNNEncoderLayer(nn.Module):
+    """
+    RNNEncoderLayer is made up of lstm and feedforward networks.
+
+    Args:
+      d_model:
+        The number of expected features in the input (required).
+      dim_feedforward:
+        The dimension of feedforward network model (default=2048).
+      dropout:
+        The dropout value (default=0.1).
+      layer_dropout:
+        The dropout value for model-level warmup (default=0.075).
+    """
+
+    def __init__(
+        self,
+        d_model: int,
+        dim_feedforward: int,
+        dropout: float = 0.1,
+        layer_dropout: float = 0.075,
+    ) -> None:
+        super(RNNEncoderLayer, self).__init__()
+        self.layer_dropout = layer_dropout
+        self.d_model = d_model
+
+        self.lstm = ScaledLSTM(
+            input_size=d_model, hidden_size=d_model, dropout=0.0
+        )
+        self.feed_forward = nn.Sequential(
+            ScaledLinear(d_model, dim_feedforward),
+            ActivationBalancer(channel_dim=-1),
+            DoubleSwish(),
+            nn.Dropout(),  # NOTE(review): default p, not `dropout`; confirm
+            ScaledLinear(dim_feedforward, d_model, initial_scale=0.25),
+        )
+        self.norm_final = BasicNorm(d_model)
+
+        # Try to ensure the output is close to zero-mean (or at least, zero-median).  # noqa
+        self.balancer = ActivationBalancer(
+            channel_dim=-1, min_positive=0.45, max_positive=0.55, max_abs=6.0
+        )
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, src: Tensor, warmup: float = 1.0) -> Tensor:
+        """
+        Pass the input through the encoder layer.
+
+        Args:
+          src:
+            The sequence to the encoder layer (required).
+            Its shape is (S, N, E), where S is the sequence length,
+            N is the batch size, and E is the feature number.
+          warmup:
+            It controls selective bypass of layers; if < 1.0, we will
+            bypass layers more frequently.
+        """
+        src_orig = src
+
+        warmup_scale = min(0.1 + warmup, 1.0)
+        # alpha = 1.0 means fully use this encoder layer, 0.0 would mean
+        # completely bypass it. It is always 1.0 when not training.
+        if self.training:
+            alpha = (
+                warmup_scale
+                if torch.rand(()).item() <= (1.0 - self.layer_dropout)
+                else 0.1
+            )
+        else:
+            alpha = 1.0
+
+        # lstm module; [0] keeps the output sequence, dropping the states
+        src_lstm = self.lstm(src)[0]
+        src = src + self.dropout(src_lstm)
+
+        # feed forward module
+        src = src + self.dropout(self.feed_forward(src))
+
+        src = self.norm_final(self.balancer(src))
+
+        if alpha != 1.0:
+            src = alpha * src + (1 - alpha) * src_orig
+
+        return src
+
+    @torch.jit.export
+    def infer(
+        self, src: Tensor, states: Tuple[Tensor]
+    ) -> Tuple[Tensor, Tuple[Tensor]]:
+        """
+        Pass the input through the encoder layer.
+
+        Args:
+          src:
+            The sequence to the encoder layer (required).
+            Its shape is (S, N, E), where S is the sequence length,
+            N is the batch size, and E is the feature number.
+          states:
+            Its shape is (2, 1, N, E).
+            states[0] and states[1] are cached hidden state and cell state,
+            respectively.
+        """
+        assert not self.training
+        assert states.shape == (2, 1, src.size(1), src.size(2))
+
+        # lstm module
+        # The required shapes of h_0 and c_0 are both (1, N, E)
+        src_lstm, new_states = self.lstm(src, states.unbind(dim=0))
+        new_states = torch.stack(states, dim=0)
+        src = src + self.dropout(src_lstm)
+
+        # feed forward module
+        src = src + self.dropout(self.feed_forward(src))
+
+        src = self.norm_final(self.balancer(src))
+
+        return src, new_states
+
+
+class RNNEncoder(nn.Module):
+    """
+    RNNEncoder is a stack of N encoder layers.
+
+    Args:
+      encoder_layer:
+        An instance of the RNNEncoderLayer() class (required).
+      num_layers:
+        The number of sub-encoder-layers in the encoder (required).
+    """
+
+    def __init__(self, encoder_layer: nn.Module, num_layers: int) -> None:
+        super(RNNEncoder, self).__init__()
+        self.layers = nn.ModuleList(
+            [copy.deepcopy(encoder_layer) for i in range(num_layers)]
+        )
+        self.num_layers = num_layers
+
+    def forward(self, src: Tensor, warmup: float = 1.0) -> Tensor:
+        """
+        Pass the input through the encoder layer in turn.
+
+        Args:
+          src:
+            The sequence to the encoder layer (required).
+            Its shape is (S, N, E), where S is the sequence length,
+            N is the batch size, and E is the feature number.
+          warmup:
+            It controls selective bypass of layers; if < 1.0, we will
+            bypass layers more frequently.
+        """
+        output = src
+
+        for layer_index, mod in enumerate(self.layers):
+            output = mod(output, warmup=warmup)
+
+        return output
+
+    @torch.jit.export
+    def infer(
+        self, src: Tensor, states: Tuple[Tensor]
+    ) -> Tuple[Tensor, Tuple[Tensor]]:
+        """
+        Pass the input through the encoder layer.
+
+        Args:
+          src:
+            The sequence to the encoder layer (required).
+            Its shape is (S, N, E), where S is the sequence length,
+            N is the batch size, and E is the feature number.
+          states:
+            Its shape is (2, num_layers, N, E).
+            states[0] and states[1] are cached hidden states and cell states for
+            all layers, respectively.
+        """
+        assert not self.training
+        assert states.shape == (2, self.num_layers, src.size(1), src.size(2))
+
+        new_states_list = []
+        output = src
+        for layer_index, mod in enumerate(self.layers):
+            # new_states: (2, 1, N, E)
+            output, new_states = mod.infer(
+                output, states[:, layer_index : layer_index + 1, :, :]
+            )
+            new_states_list.append(new_states)
+
+        return output, torch.cat(new_states_list, dim=1)

From 2d53f2ef8b87f278bdd8cfeaa670fcbcccd681b9 Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Sun, 17 Jul 2022 12:59:27 +0800
Subject: [PATCH 03/40] add RNN and Conv2dSubsampling classes in lstm.py

---
 .../ASR/lstm_transducer_stateless/lstm.py     | 263 +++++++++++++++++-
 1 file changed, 250 insertions(+), 13 deletions(-)

diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py b/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
index a0f4637976..e180d9ec64 100644
--- a/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
@@ -15,9 +15,7 @@
 # limitations under the License.
 
 import copy
-import math
-import warnings
-from typing import List, Optional, Tuple
+from typing import Tuple
 
 import torch
 from encoder_interface import EncoderInterface
@@ -25,12 +23,157 @@
     ActivationBalancer,
     BasicNorm,
     DoubleSwish,
-    ScaledConv1d,
     ScaledConv2d,
     ScaledLinear,
     ScaledLSTM,
 )
-from torch import Tensor, nn
+from torch import nn
+
+
+class RNN(EncoderInterface):
+    """
+    Args:
+      num_features (int):
+        Number of input features.
+      subsampling_factor (int):
+        Subsampling factor of encoder (convolution layers before lstm layers).
+      d_model (int):
+        Hidden dimension for lstm layers, also output dimension (default=512).
+      dim_feedforward (int):
+        Feedforward dimension (default=2048).
+      num_encoder_layers (int):
+        Number of encoder layers (default=12).
+      dropout (float):
+        Dropout rate (default=0.1).
+      layer_dropout (float):
+        Dropout value for model-level warmup (default=0.075).
+    """
+
+    def __init__(
+        self,
+        num_features: int,
+        subsampling_factor: int,
+        d_model: int = 512,
+        dim_feedforward: int = 2048,
+        num_encoder_layers: int = 12,
+        dropout: float = 0.1,
+        layer_dropout: float = 0.075,
+    ) -> None:
+        super(RNN, self).__init__()
+
+        self.num_features = num_features
+        self.subsampling_factor = subsampling_factor
+        if subsampling_factor != 4:
+            raise NotImplementedError("Support only 'subsampling_factor=4'.")
+
+        # self.encoder_embed converts the input of shape (N, T, num_features)
+        # to the shape (N, T//subsampling_factor, d_model).
+        # That is, it does two things simultaneously:
+        #   (1) subsampling: T -> T//subsampling_factor
+        #   (2) embedding: num_features -> d_model
+        self.encoder_embed = Conv2dSubsampling(num_features, d_model)
+        # get_init_state()/infer() read num_encoder_layers; keep the old alias.
+        self.num_encoder_layers = self.encoder_layers = num_encoder_layers
+        self.d_model = d_model
+
+        encoder_layer = RNNEncoderLayer(
+            d_model, dim_feedforward, dropout, layer_dropout
+        )
+        self.encoder = RNNEncoder(encoder_layer, num_encoder_layers)
+
+    def forward(
+        self, x: torch.Tensor, x_lens: torch.Tensor, warmup: float = 1.0
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Args:
+          x:
+            The input tensor. Its shape is (N, T, C), where N is the batch size,
+            T is the sequence length, C is the feature dimension.
+          x_lens:
+            A tensor of shape (N,), containing the number of frames in `x`
+            before padding.
+          warmup:
+            A floating point value that gradually increases from 0 throughout
+            training; when it is >= 1.0 we are "fully warmed up".  It is used
+            to turn modules on sequentially.
+
+        Returns:
+          A tuple of 2 tensors:
+            - embeddings: its shape is (N, T', d_model), where T' is the output
+              sequence length.
+            - lengths: a tensor of shape (batch_size,) containing the number of
+              frames in `embeddings` before padding.
+        """
+        x = self.encoder_embed(x)
+        x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
+
+        # lengths = ((x_lens - 1) // 2 - 1) // 2  # issues a warning
+        #
+        # Note: rounding_mode in torch.div() is available only in torch >= 1.8.0
+        lengths = (((x_lens - 1) >> 1) - 1) >> 1
+        assert x.size(0) == lengths.max().item()
+
+        x = self.encoder(x, warmup)
+
+        x = x.permute(1, 0, 2)  # (T, N, C) -> (N, T, C)
+        return x, lengths
+
+    @torch.jit.export
+    def get_init_state(self, device: torch.device) -> torch.Tensor:
+        """Get zero initial states, shape (2, num_encoder_layers, d_model)."""
+        init_states = torch.zeros(
+            (2, self.num_encoder_layers, self.d_model), device=device
+        )
+        return init_states  # NOTE(review): infer() asserts a batch axis
+
+    @torch.jit.export
+    def infer(
+        self, x: torch.Tensor, x_lens: torch.Tensor, states: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Args:
+          x:
+            The input tensor. Its shape is (N, T, C), where N is the batch size,
+            T is the sequence length, C is the feature dimension.
+          x_lens:
+            A tensor of shape (N,), containing the number of frames in `x`
+            before padding.
+          states:
+            Its shape is (2, num_encoder_layers, N, E).
+            states[0] and states[1] are cached hidden states and cell states for
+            all layers, respectively.
+
+        Returns:
+          A tuple of 3 tensors:
+            - embeddings: its shape is (N, T', d_model), where T' is the output
+              sequence length.
+            - lengths: a tensor of shape (batch_size,) containing the number of
+              frames in `embeddings` before padding.
+            - updated states, with shape of (2, num_encoder_layers, N, E).
+        """
+        assert not self.training
+        assert states.shape == (
+            2,
+            self.num_encoder_layers,
+            x.size(0),
+            self.d_model,
+        ), states.shape
+
+        # lengths = ((x_lens - 1) // 2 - 1) // 2  # issues a warning
+        #
+        # Note: rounding_mode in torch.div() is available only in torch >= 1.8.0
+        lengths = (((x_lens - 1) >> 1) - 1) >> 1
+        # We cut off 1 frame on each side of the encoder_embed output.
+        lengths -= 2
+
+        embed = self.encoder_embed(x)
+        embed = embed[:, 1:-1, :]
+        embed = embed.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
+
+        x, states = self.encoder.infer(embed, states)
+
+        x = x.permute(1, 0, 2)  # (T, N, C) -> (N, T, C)
+        return x, lengths, states
 
 
 class RNNEncoderLayer(nn.Module):
@@ -77,7 +220,7 @@ def __init__(
         )
         self.dropout = nn.Dropout(dropout)
 
-    def forward(self, src: Tensor, warmup: float = 1.0) -> Tensor:
+    def forward(self, src: torch.Tensor, warmup: float = 1.0) -> torch.Tensor:
         """
         Pass the input through the encoder layer.
 
@@ -120,8 +263,8 @@ def forward(self, src: Tensor, warmup: float = 1.0) -> Tensor:
 
     @torch.jit.export
     def infer(
-        self, src: Tensor, states: Tuple[Tensor]
-    ) -> Tuple[Tensor, Tuple[Tensor]]:
+        self, src: torch.Tensor, states: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         Pass the input through the encoder layer.
 
@@ -139,9 +282,9 @@ def infer(
         assert states.shape == (2, 1, src.size(1), src.size(2))
 
         # lstm module
-        # The required shapes of h_0 and c_0 are both (1, N, E)
+        # The required shapes of h_0 and c_0 are both (1, N, E).
         src_lstm, new_states = self.lstm(src, states.unbind(dim=0))
-        new_states = torch.stack(states, dim=0)
+        new_states = torch.stack(new_states, dim=0)
         src = src + self.dropout(src_lstm)
 
         # feed forward module
@@ -170,7 +313,7 @@ def __init__(self, encoder_layer: nn.Module, num_layers: int) -> None:
         )
         self.num_layers = num_layers
 
-    def forward(self, src: Tensor, warmup: float = 1.0) -> Tensor:
+    def forward(self, src: torch.Tensor, warmup: float = 1.0) -> torch.Tensor:
         """
         Pass the input through the encoder layer in turn.
 
@@ -192,8 +335,8 @@ def forward(self, src: Tensor, warmup: float = 1.0) -> Tensor:
 
     @torch.jit.export
     def infer(
-        self, src: Tensor, states: Tuple[Tensor]
-    ) -> Tuple[Tensor, Tuple[Tensor]]:
+        self, src: torch.Tensor, states: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         Pass the input through the encoder layer.
 
@@ -220,3 +363,97 @@ def infer(
             new_states_list.append(new_states)
 
         return output, torch.cat(new_states_list, dim=1)
+
+
+class Conv2dSubsampling(nn.Module):
+    """Convolutional 2D subsampling (to 1/4 length).
+
+    Convert an input of shape (N, T, idim) to an output
+    with shape (N, T', odim), where
+    T' = ((T-1)//2 - 1)//2, which approximates T' == T//4
+
+    It is based on
+    https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/subsampling.py  # noqa
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        layer1_channels: int = 8,
+        layer2_channels: int = 32,
+        layer3_channels: int = 128,
+    ) -> None:
+        """
+        Args:
+          in_channels:
+            Input feature dim; the input shape is (N, T, in_channels).
+            Caution: It requires: T >= 7, in_channels >= 7
+          out_channels:
+            Output dim. The output shape is (N, ((T-1)//2 - 1)//2, out_channels)
+          layer1_channels:
+            Number of channels in layer1
+          layer2_channels, layer3_channels:
+            Number of channels in layer2 and layer3, respectively
+        """
+        assert in_channels >= 7
+        super().__init__()
+
+        self.conv = nn.Sequential(
+            ScaledConv2d(
+                in_channels=1,
+                out_channels=layer1_channels,
+                kernel_size=3,
+                padding=1,
+            ),
+            ActivationBalancer(channel_dim=1),
+            DoubleSwish(),
+            ScaledConv2d(
+                in_channels=layer1_channels,
+                out_channels=layer2_channels,
+                kernel_size=3,
+                stride=2,
+            ),
+            ActivationBalancer(channel_dim=1),
+            DoubleSwish(),
+            ScaledConv2d(
+                in_channels=layer2_channels,
+                out_channels=layer3_channels,
+                kernel_size=3,
+                stride=2,
+            ),
+            ActivationBalancer(channel_dim=1),
+            DoubleSwish(),
+        )
+        self.out = ScaledLinear(
+            layer3_channels * (((in_channels - 1) // 2 - 1) // 2), out_channels
+        )
+        # set learn_eps=False because out_norm is preceded by `out`, and `out`
+        # itself has learned scale, so the extra degree of freedom is not
+        # needed.
+        self.out_norm = BasicNorm(out_channels, learn_eps=False)
+        # constrain median of output to be close to zero.
+        self.out_balancer = ActivationBalancer(
+            channel_dim=-1, min_positive=0.45, max_positive=0.55
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Subsample x.
+
+        Args:
+          x:
+            Its shape is (N, T, idim).
+
+        Returns:
+          Return a tensor of shape (N, ((T-1)//2 - 1)//2, odim)
+        """
+        # On entry, x is (N, T, idim)
+        x = x.unsqueeze(1)  # (N, T, idim) -> (N, 1, T, idim) i.e., (N, C, H, W)
+        x = self.conv(x)
+        # x: (N, layer3_channels, ((T-1)//2 - 1)//2, ((idim-1)//2 - 1)//2)
+        b, c, t, f = x.size()
+        x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
+        # Now x is of shape (N, ((T-1)//2 - 1)//2, odim)
+        x = self.out_norm(x)
+        x = self.out_balancer(x)
+        return x

From 074bd7da717b0aa3a2ffd217d315401e23d9a8d1 Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Sun, 17 Jul 2022 15:31:25 +0800
Subject: [PATCH 04/40] hardcode bidirectional=False

---
 .../ASR/lstm_transducer_stateless/lstm.py       | 17 +++++++++++++++--
 .../ASR/pruned_transducer_stateless2/scaling.py |  4 ++--
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py b/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
index e180d9ec64..52424c0bb0 100644
--- a/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
@@ -36,7 +36,7 @@ class RNN(EncoderInterface):
       num_features (int):
         Number of input features.
       subsampling_factor (int):
-        Subsampling factor of encoder (convolution layers before lstm layers).
+        Subsampling factor of encoder (convolution layers before lstm layers) (default=4).  # noqa
       d_model (int):
         Hidden dimension for lstm layers, also output dimension (default=512).
       dim_feedforward (int):
@@ -52,7 +52,7 @@ class RNN(EncoderInterface):
     def __init__(
         self,
         num_features: int,
-        subsampling_factor: int,
+        subsampling_factor: int = 4,
         d_model: int = 512,
         dim_feedforward: int = 2048,
         num_encoder_layers: int = 12,
@@ -457,3 +457,16 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = self.out_norm(x)
         x = self.out_balancer(x)
         return x
+
+
+if __name__ == "__main__":
+    feature_dim = 50
+    m = RNN(num_features=feature_dim, d_model=128)
+    batch_size = 5
+    seq_len = 20
+    # Just make sure the forward pass runs.
+    f = m(
+        torch.randn(batch_size, seq_len, feature_dim),
+        torch.full((batch_size,), seq_len, dtype=torch.int64),
+        warmup=0.5,
+    )
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py b/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
index 54f4a53c50..d0c16cd1e4 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
@@ -388,9 +388,9 @@ def __init__(
         initial_speed: float = 1.0,
         **kwargs
     ):
-        # Hardcode num_layers=1 and proj_size=0 here
+        # Hardcode num_layers=1, bidirectional=False, proj_size=0 here
         super(ScaledLSTM, self).__init__(
-            *args, num_layers=1, proj_size=0, **kwargs
+            *args, num_layers=1, bidirectional=False, proj_size=0, **kwargs
         )
         initial_scale = torch.tensor(initial_scale).log()
         self._scales_names = []

From d16b9ec15f60071f5cef7f081692cabfeca9c50c Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Sun, 17 Jul 2022 15:32:54 +0800
Subject: [PATCH 05/40] link from pruned_transducer_stateless2

---
 egs/librispeech/ASR/lstm_transducer_stateless/asr_datamodule.py  | 1 +
 egs/librispeech/ASR/lstm_transducer_stateless/beam_search.py     | 1 +
 egs/librispeech/ASR/lstm_transducer_stateless/decoder.py         | 1 +
 .../ASR/lstm_transducer_stateless/encoder_interface.py           | 1 +
 egs/librispeech/ASR/lstm_transducer_stateless/joiner.py          | 1 +
 egs/librispeech/ASR/lstm_transducer_stateless/model.py           | 1 +
 egs/librispeech/ASR/lstm_transducer_stateless/optim.py           | 1 +
 7 files changed, 7 insertions(+)
 create mode 120000 egs/librispeech/ASR/lstm_transducer_stateless/asr_datamodule.py
 create mode 120000 egs/librispeech/ASR/lstm_transducer_stateless/beam_search.py
 create mode 120000 egs/librispeech/ASR/lstm_transducer_stateless/decoder.py
 create mode 120000 egs/librispeech/ASR/lstm_transducer_stateless/encoder_interface.py
 create mode 120000 egs/librispeech/ASR/lstm_transducer_stateless/joiner.py
 create mode 120000 egs/librispeech/ASR/lstm_transducer_stateless/model.py
 create mode 120000 egs/librispeech/ASR/lstm_transducer_stateless/optim.py

diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/asr_datamodule.py b/egs/librispeech/ASR/lstm_transducer_stateless/asr_datamodule.py
new file mode 120000
index 0000000000..a074d60850
--- /dev/null
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/asr_datamodule.py
@@ -0,0 +1 @@
+../pruned_transducer_stateless2/asr_datamodule.py
\ No newline at end of file
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/beam_search.py b/egs/librispeech/ASR/lstm_transducer_stateless/beam_search.py
new file mode 120000
index 0000000000..8554e44ccf
--- /dev/null
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/beam_search.py
@@ -0,0 +1 @@
+../pruned_transducer_stateless2/beam_search.py
\ No newline at end of file
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/decoder.py b/egs/librispeech/ASR/lstm_transducer_stateless/decoder.py
new file mode 120000
index 0000000000..0793c5709c
--- /dev/null
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/decoder.py
@@ -0,0 +1 @@
+../pruned_transducer_stateless2/decoder.py
\ No newline at end of file
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/encoder_interface.py b/egs/librispeech/ASR/lstm_transducer_stateless/encoder_interface.py
new file mode 120000
index 0000000000..aa5d0217a8
--- /dev/null
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/encoder_interface.py
@@ -0,0 +1 @@
+../transducer_stateless/encoder_interface.py
\ No newline at end of file
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/joiner.py b/egs/librispeech/ASR/lstm_transducer_stateless/joiner.py
new file mode 120000
index 0000000000..815fd4bb6f
--- /dev/null
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/joiner.py
@@ -0,0 +1 @@
+../pruned_transducer_stateless2/joiner.py
\ No newline at end of file
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/model.py b/egs/librispeech/ASR/lstm_transducer_stateless/model.py
new file mode 120000
index 0000000000..ebb6d774d9
--- /dev/null
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/model.py
@@ -0,0 +1 @@
+../pruned_transducer_stateless2/model.py
\ No newline at end of file
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/optim.py b/egs/librispeech/ASR/lstm_transducer_stateless/optim.py
new file mode 120000
index 0000000000..e2deb44925
--- /dev/null
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/optim.py
@@ -0,0 +1 @@
+../pruned_transducer_stateless2/optim.py
\ No newline at end of file

From 89bfb6b9c7310d855c397aa30cbb8e97f5c82114 Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Sun, 17 Jul 2022 15:35:59 +0800
Subject: [PATCH 06/40] link scaling.py pruned_transducer_stateless2

---
 egs/librispeech/ASR/lstm_transducer_stateless/scaling.py | 1 +
 1 file changed, 1 insertion(+)
 create mode 120000 egs/librispeech/ASR/lstm_transducer_stateless/scaling.py

diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/scaling.py b/egs/librispeech/ASR/lstm_transducer_stateless/scaling.py
new file mode 120000
index 0000000000..09d802cc44
--- /dev/null
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/scaling.py
@@ -0,0 +1 @@
+../pruned_transducer_stateless2/scaling.py
\ No newline at end of file

From b1be6ea4754bdfe17fe1a95d41b2d85301bedb1a Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Sun, 17 Jul 2022 15:37:27 +0800
Subject: [PATCH 07/40] copy from pruned_transducer_stateless2

---
 .../ASR/lstm_transducer_stateless/decode.py   |  811 ++++++++++++
 .../lstm_transducer_stateless/pretrained.py   |  390 ++++++
 .../lstm_transducer_stateless/test_model.py   |   76 ++
 .../ASR/lstm_transducer_stateless/train.py    | 1090 +++++++++++++++++
 4 files changed, 2367 insertions(+)
 create mode 100755 egs/librispeech/ASR/lstm_transducer_stateless/decode.py
 create mode 100755 egs/librispeech/ASR/lstm_transducer_stateless/pretrained.py
 create mode 100755 egs/librispeech/ASR/lstm_transducer_stateless/test_model.py
 create mode 100755 egs/librispeech/ASR/lstm_transducer_stateless/train.py

diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/decode.py b/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
new file mode 100755
index 0000000000..60a948a992
--- /dev/null
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
@@ -0,0 +1,811 @@
+#!/usr/bin/env python3
+#
+# Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Usage:
+(1) greedy search
+./pruned_transducer_stateless2/decode.py \
+    --epoch 28 \
+    --avg 15 \
+    --exp-dir ./pruned_transducer_stateless2/exp \
+    --max-duration 600 \
+    --decoding-method greedy_search
+
+(2) beam search (not recommended)
+./pruned_transducer_stateless2/decode.py \
+    --epoch 28 \
+    --avg 15 \
+    --exp-dir ./pruned_transducer_stateless2/exp \
+    --max-duration 600 \
+    --decoding-method beam_search \
+    --beam-size 4
+
+(3) modified beam search
+./pruned_transducer_stateless2/decode.py \
+    --epoch 28 \
+    --avg 15 \
+    --exp-dir ./pruned_transducer_stateless2/exp \
+    --max-duration 600 \
+    --decoding-method modified_beam_search \
+    --beam-size 4
+
+(4) fast beam search (one best)
+./pruned_transducer_stateless2/decode.py \
+    --epoch 28 \
+    --avg 15 \
+    --exp-dir ./pruned_transducer_stateless2/exp \
+    --max-duration 600 \
+    --decoding-method fast_beam_search \
+    --beam 20.0 \
+    --max-contexts 8 \
+    --max-states 64
+
+(5) fast beam search (nbest)
+./pruned_transducer_stateless2/decode.py \
+    --epoch 28 \
+    --avg 15 \
+    --exp-dir ./pruned_transducer_stateless2/exp \
+    --max-duration 600 \
+    --decoding-method fast_beam_search_nbest \
+    --beam 20.0 \
+    --max-contexts 8 \
+    --max-states 64 \
+    --num-paths 200 \
+    --nbest-scale 0.5
+
+(6) fast beam search (nbest oracle WER)
+./pruned_transducer_stateless2/decode.py \
+    --epoch 28 \
+    --avg 15 \
+    --exp-dir ./pruned_transducer_stateless2/exp \
+    --max-duration 600 \
+    --decoding-method fast_beam_search_nbest_oracle \
+    --beam 20.0 \
+    --max-contexts 8 \
+    --max-states 64 \
+    --num-paths 200 \
+    --nbest-scale 0.5
+
+(7) fast beam search (with LG)
+./pruned_transducer_stateless2/decode.py \
+    --epoch 28 \
+    --avg 15 \
+    --exp-dir ./pruned_transducer_stateless2/exp \
+    --max-duration 600 \
+    --decoding-method fast_beam_search_nbest_LG \
+    --beam 20.0 \
+    --max-contexts 8 \
+    --max-states 64
+
+(8) decode in streaming mode (take greedy search as an example)
+./pruned_transducer_stateless2/decode.py \
+    --epoch 28 \
+    --avg 15 \
+    --simulate-streaming 1 \
+    --causal-convolution 1 \
+    --decode-chunk-size 16 \
+    --left-context 64 \
+    --exp-dir ./pruned_transducer_stateless2/exp \
+    --max-duration 600 \
+    --decoding-method greedy_search \
+    --beam 20.0 \
+    --max-contexts 8 \
+    --max-states 64
+"""
+
+
+import argparse
+import logging
+import math
+from collections import defaultdict
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+import k2
+import sentencepiece as spm
+import torch
+import torch.nn as nn
+from asr_datamodule import LibriSpeechAsrDataModule
+from beam_search import (
+    beam_search,
+    fast_beam_search_nbest,
+    fast_beam_search_nbest_LG,
+    fast_beam_search_nbest_oracle,
+    fast_beam_search_one_best,
+    greedy_search,
+    greedy_search_batch,
+    modified_beam_search,
+)
+from train import add_model_arguments, get_params, get_transducer_model
+
+from icefall.checkpoint import (
+    average_checkpoints,
+    find_checkpoints,
+    load_checkpoint,
+)
+from icefall.lexicon import Lexicon
+from icefall.utils import (
+    AttributeDict,
+    setup_logger,
+    store_transcripts,
+    str2bool,
+    write_error_stats,
+)
+
+LOG_EPS = math.log(1e-10)
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--epoch",
+        type=int,
+        default=28,
+        help="""It specifies the checkpoint to use for decoding.
+        Note: Epoch counts from 0.
+        You can specify --avg to use more checkpoints for model averaging.""",
+    )
+
+    parser.add_argument(
+        "--iter",
+        type=int,
+        default=0,
+        help="""If positive, --epoch is ignored and it
+        will use the checkpoint exp_dir/checkpoint-iter.pt.
+        You can specify --avg to use more checkpoints for model averaging.
+        """,
+    )
+
+    parser.add_argument(
+        "--avg",
+        type=int,
+        default=15,
+        help="Number of checkpoints to average. Automatically select "
+        "consecutive checkpoints before the checkpoint specified by "
+        "'--epoch' and '--iter'",
+    )
+
+    parser.add_argument(
+        "--exp-dir",
+        type=str,
+        default="pruned_transducer_stateless2/exp",
+        help="The experiment dir",
+    )
+
+    parser.add_argument(
+        "--bpe-model",
+        type=str,
+        default="data/lang_bpe_500/bpe.model",
+        help="Path to the BPE model",
+    )
+
+    parser.add_argument(
+        "--lang-dir",
+        type=Path,
+        default="data/lang_bpe_500",
+        help="The lang dir containing word table and LG graph",
+    )
+
+    parser.add_argument(
+        "--decoding-method",
+        type=str,
+        default="greedy_search",
+        help="""Possible values are:
+          - greedy_search
+          - beam_search
+          - modified_beam_search
+          - fast_beam_search
+          - fast_beam_search_nbest
+          - fast_beam_search_nbest_oracle
+          - fast_beam_search_nbest_LG
+        If you use fast_beam_search_nbest_LG, you have to specify
+        `--lang-dir`, which should contain `LG.pt`.
+        """,
+    )
+
+    parser.add_argument(
+        "--beam-size",
+        type=int,
+        default=4,
+        help="""An integer indicating how many candidates we will keep for each
+        frame. Used only when --decoding-method is beam_search or
+        modified_beam_search.""",
+    )
+
+    parser.add_argument(
+        "--beam",
+        type=float,
+        default=20.0,
+        help="""A floating point value to calculate the cutoff score during beam
+        search (i.e., `cutoff = max-score - beam`), which is the same as the
+        `beam` in Kaldi.
+        Used only when --decoding-method is fast_beam_search,
+        fast_beam_search_nbest, fast_beam_search_nbest_LG,
+        and fast_beam_search_nbest_oracle
+        """,
+    )
+
+    parser.add_argument(
+        "--ngram-lm-scale",
+        type=float,
+        default=0.01,
+        help="""
+        Used only when --decoding_method is fast_beam_search_nbest_LG.
+        It specifies the scale for n-gram LM scores.
+        """,
+    )
+
+    parser.add_argument(
+        "--max-contexts",
+        type=int,
+        default=8,
+        help="""Used only when --decoding-method is
+        fast_beam_search, fast_beam_search_nbest, fast_beam_search_nbest_LG,
+        and fast_beam_search_nbest_oracle""",
+    )
+
+    parser.add_argument(
+        "--max-states",
+        type=int,
+        default=64,
+        help="""Used only when --decoding-method is
+        fast_beam_search, fast_beam_search_nbest, fast_beam_search_nbest_LG,
+        and fast_beam_search_nbest_oracle""",
+    )
+
+    parser.add_argument(
+        "--context-size",
+        type=int,
+        default=2,
+        help="The context size in the decoder. 1 means bigram; "
+        "2 means tri-gram",
+    )
+
+    parser.add_argument(
+        "--max-sym-per-frame",
+        type=int,
+        default=1,
+        help="""Maximum number of symbols per frame.
+        Used only when --decoding_method is greedy_search""",
+    )
+
+    parser.add_argument(
+        "--simulate-streaming",
+        type=str2bool,
+        default=False,
+        help="""Whether to simulate streaming in decoding, this is a good way to
+        test a streaming model.
+        """,
+    )
+
+    parser.add_argument(
+        "--decode-chunk-size",
+        type=int,
+        default=16,
+        help="The chunk size for decoding (in frames after subsampling)",
+    )
+
+    parser.add_argument(
+        "--left-context",
+        type=int,
+        default=64,
+        help="left context can be seen during decoding (in frames after subsampling)",
+    )
+
+    parser.add_argument(
+        "--num-paths",
+        type=int,
+        default=200,
+        help="""Number of paths for nbest decoding.
+        Used only when the decoding method is fast_beam_search_nbest,
+        fast_beam_search_nbest_LG, and fast_beam_search_nbest_oracle""",
+    )
+
+    parser.add_argument(
+        "--nbest-scale",
+        type=float,
+        default=0.5,
+        help="""Scale applied to lattice scores when computing nbest paths.
+        Used only when the decoding method is fast_beam_search_nbest,
+        fast_beam_search_nbest_LG, and fast_beam_search_nbest_oracle""",
+    )
+
+    add_model_arguments(parser)
+    return parser
+
+
+def decode_one_batch(
+    params: AttributeDict,
+    model: nn.Module,
+    sp: spm.SentencePieceProcessor,
+    batch: dict,
+    word_table: Optional[k2.SymbolTable] = None,
+    decoding_graph: Optional[k2.Fsa] = None,
+) -> Dict[str, List[List[str]]]:
+    """Decode one batch and return the result in a dict. The dict has the
+    following format:
+
+        - key: It indicates the setting used for decoding. For example,
+               if greedy_search is used, it would be "greedy_search"
+               If beam search with a beam size of 7 is used, it would be
+               "beam_size_7"
+        - value: It contains the decoding result. `len(value)` equals the
+                 batch size. `value[i]` is the decoding result for the i-th
+                 utterance in the given batch.
+    Args:
+      params:
+        It's the return value of :func:`get_params`.
+      model:
+        The neural model.
+      sp:
+        The BPE model.
+      batch:
+        It is the return value from iterating
+        `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
+        for the format of the `batch`.
+      word_table:
+        The word symbol table.
+      decoding_graph:
+        The decoding graph. Can be either a `k2.trivial_graph` or LG. Used
+        only when --decoding_method is fast_beam_search, fast_beam_search_nbest,
+        fast_beam_search_nbest_oracle, and fast_beam_search_nbest_LG.
+    Returns:
+      Return the decoding result. See above description for the format of
+      the returned dict.
+    """
+    device = model.device
+    feature = batch["inputs"]
+    assert feature.ndim == 3
+
+    feature = feature.to(device)
+    # at entry, feature is (N, T, C)
+
+    supervisions = batch["supervisions"]
+    feature_lens = supervisions["num_frames"].to(device)
+
+    feature_lens += params.left_context
+    feature = torch.nn.functional.pad(
+        feature,
+        pad=(0, 0, 0, params.left_context),
+        value=LOG_EPS,
+    )
+
+    if params.simulate_streaming:
+        encoder_out, encoder_out_lens, _ = model.encoder.streaming_forward(
+            x=feature,
+            x_lens=feature_lens,
+            chunk_size=params.decode_chunk_size,
+            left_context=params.left_context,
+            simulate_streaming=True,
+        )
+    else:
+        encoder_out, encoder_out_lens = model.encoder(
+            x=feature, x_lens=feature_lens
+        )
+
+    hyps = []
+
+    if params.decoding_method == "fast_beam_search":
+        hyp_tokens = fast_beam_search_one_best(
+            model=model,
+            decoding_graph=decoding_graph,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam,
+            max_contexts=params.max_contexts,
+            max_states=params.max_states,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(hyp.split())
+    elif params.decoding_method == "fast_beam_search_nbest_LG":
+        hyp_tokens = fast_beam_search_nbest_LG(
+            model=model,
+            decoding_graph=decoding_graph,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam,
+            max_contexts=params.max_contexts,
+            max_states=params.max_states,
+            num_paths=params.num_paths,
+            nbest_scale=params.nbest_scale,
+        )
+        for hyp in hyp_tokens:
+            hyps.append([word_table[i] for i in hyp])
+    elif params.decoding_method == "fast_beam_search_nbest":
+        hyp_tokens = fast_beam_search_nbest(
+            model=model,
+            decoding_graph=decoding_graph,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam,
+            max_contexts=params.max_contexts,
+            max_states=params.max_states,
+            num_paths=params.num_paths,
+            nbest_scale=params.nbest_scale,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(hyp.split())
+    elif params.decoding_method == "fast_beam_search_nbest_oracle":
+        hyp_tokens = fast_beam_search_nbest_oracle(
+            model=model,
+            decoding_graph=decoding_graph,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam,
+            max_contexts=params.max_contexts,
+            max_states=params.max_states,
+            num_paths=params.num_paths,
+            ref_texts=sp.encode(supervisions["text"]),
+            nbest_scale=params.nbest_scale,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(hyp.split())
+    elif (
+        params.decoding_method == "greedy_search"
+        and params.max_sym_per_frame == 1
+    ):
+        hyp_tokens = greedy_search_batch(
+            model=model,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(hyp.split())
+    elif params.decoding_method == "modified_beam_search":
+        hyp_tokens = modified_beam_search(
+            model=model,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam_size,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(hyp.split())
+    else:
+        batch_size = encoder_out.size(0)
+
+        for i in range(batch_size):
+            # fmt: off
+            encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
+            # fmt: on
+            if params.decoding_method == "greedy_search":
+                hyp = greedy_search(
+                    model=model,
+                    encoder_out=encoder_out_i,
+                    max_sym_per_frame=params.max_sym_per_frame,
+                )
+            elif params.decoding_method == "beam_search":
+                hyp = beam_search(
+                    model=model,
+                    encoder_out=encoder_out_i,
+                    beam=params.beam_size,
+                )
+            else:
+                raise ValueError(
+                    f"Unsupported decoding method: {params.decoding_method}"
+                )
+            hyps.append(sp.decode(hyp).split())
+
+    if params.decoding_method == "greedy_search":
+        return {"greedy_search": hyps}
+    elif params.decoding_method == "fast_beam_search":
+        return {
+            (
+                f"beam_{params.beam}_"
+                f"max_contexts_{params.max_contexts}_"
+                f"max_states_{params.max_states}"
+            ): hyps
+        }
+    elif "fast_beam_search" in params.decoding_method:
+        key = f"beam_{params.beam}_"
+        key += f"max_contexts_{params.max_contexts}_"
+        key += f"max_states_{params.max_states}"
+        if "nbest" in params.decoding_method:
+            key += f"_num_paths_{params.num_paths}_"
+            key += f"nbest_scale_{params.nbest_scale}"
+            if "LG" in params.decoding_method:
+                key += f"_ngram_lm_scale_{params.ngram_lm_scale}"
+
+        return {key: hyps}
+    else:
+        return {f"beam_size_{params.beam_size}": hyps}
+
+
+def decode_dataset(
+    dl: torch.utils.data.DataLoader,
+    params: AttributeDict,
+    model: nn.Module,
+    sp: spm.SentencePieceProcessor,
+    word_table: Optional[k2.SymbolTable] = None,
+    decoding_graph: Optional[k2.Fsa] = None,
+) -> Dict[str, List[Tuple[List[str], List[str]]]]:
+    """Decode dataset.
+
+    Args:
+      dl:
+        PyTorch's dataloader containing the dataset to decode.
+      params:
+        It is returned by :func:`get_params`.
+      model:
+        The neural model.
+      sp:
+        The BPE model.
+      word_table:
+        The word symbol table.
+      decoding_graph:
+        The decoding graph. Can be either a `k2.trivial_graph` or LG. Used
+        only when --decoding_method is fast_beam_search, fast_beam_search_nbest,
+        fast_beam_search_nbest_oracle, and fast_beam_search_nbest_LG.
+    Returns:
+      Return a dict, whose key may be "greedy_search" if greedy search
+      is used, or it may be "beam_size_7" if beam size of 7 is used.
+      Its value is a list of tuples. Each tuple contains two elements:
+      The first is the reference transcript, and the second is the
+      predicted result.
+    """
+    num_cuts = 0
+
+    try:
+        num_batches = len(dl)
+    except TypeError:
+        num_batches = "?"
+
+    if params.decoding_method == "greedy_search":
+        log_interval = 50
+    else:
+        log_interval = 20
+
+    results = defaultdict(list)
+    for batch_idx, batch in enumerate(dl):
+        texts = batch["supervisions"]["text"]
+
+        hyps_dict = decode_one_batch(
+            params=params,
+            model=model,
+            sp=sp,
+            word_table=word_table,
+            decoding_graph=decoding_graph,
+            batch=batch,
+        )
+
+        for name, hyps in hyps_dict.items():
+            this_batch = []
+            assert len(hyps) == len(texts)
+            for hyp_words, ref_text in zip(hyps, texts):
+                ref_words = ref_text.split()
+                this_batch.append((ref_words, hyp_words))
+
+            results[name].extend(this_batch)
+
+        num_cuts += len(texts)
+
+        if batch_idx % log_interval == 0:
+            batch_str = f"{batch_idx}/{num_batches}"
+
+            logging.info(
+                f"batch {batch_str}, cuts processed until now is {num_cuts}"
+            )
+    return results
+
+
+def save_results(
+    params: AttributeDict,
+    test_set_name: str,
+    results_dict: Dict[str, List[Tuple[List[int], List[int]]]],
+):
+    test_set_wers = dict()
+    for key, results in results_dict.items():
+        recog_path = (
+            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
+        )
+        store_transcripts(filename=recog_path, texts=results)
+        logging.info(f"The transcripts are stored in {recog_path}")
+
+        # The following prints out WERs, per-word error statistics and aligned
+        # ref/hyp pairs.
+        errs_filename = (
+            params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
+        )
+        with open(errs_filename, "w") as f:
+            wer = write_error_stats(
+                f, f"{test_set_name}-{key}", results, enable_log=True
+            )
+            test_set_wers[key] = wer
+
+        logging.info("Wrote detailed error stats to {}".format(errs_filename))
+
+    test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
+    errs_info = (
+        params.res_dir
+        / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
+    )
+    with open(errs_info, "w") as f:
+        print("settings\tWER", file=f)
+        for key, val in test_set_wers:
+            print("{}\t{}".format(key, val), file=f)
+
+    s = "\nFor {}, WER of different settings are:\n".format(test_set_name)
+    note = "\tbest for {}".format(test_set_name)
+    for key, val in test_set_wers:
+        s += "{}\t{}{}\n".format(key, val, note)
+        note = ""
+    logging.info(s)
+
+
+@torch.no_grad()
+def main():
+    parser = get_parser()
+    LibriSpeechAsrDataModule.add_arguments(parser)
+    args = parser.parse_args()
+    args.exp_dir = Path(args.exp_dir)
+
+    params = get_params()
+    params.update(vars(args))
+
+    assert params.decoding_method in (
+        "greedy_search",
+        "beam_search",
+        "fast_beam_search",
+        "fast_beam_search_nbest",
+        "fast_beam_search_nbest_LG",
+        "fast_beam_search_nbest_oracle",
+        "modified_beam_search",
+    )
+    params.res_dir = params.exp_dir / params.decoding_method
+
+    if params.iter > 0:
+        params.suffix = f"iter-{params.iter}-avg-{params.avg}"
+    else:
+        params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
+
+    if params.simulate_streaming:
+        params.suffix += f"-streaming-chunk-size-{params.decode_chunk_size}"
+        params.suffix += f"-left-context-{params.left_context}"
+
+    if "fast_beam_search" in params.decoding_method:
+        params.suffix += f"-beam-{params.beam}"
+        params.suffix += f"-max-contexts-{params.max_contexts}"
+        params.suffix += f"-max-states-{params.max_states}"
+        if "nbest" in params.decoding_method:
+            params.suffix += f"-nbest-scale-{params.nbest_scale}"
+            params.suffix += f"-num-paths-{params.num_paths}"
+            if "LG" in params.decoding_method:
+                params.suffix += f"-ngram-lm-scale-{params.ngram_lm_scale}"
+    elif "beam_search" in params.decoding_method:
+        params.suffix += (
+            f"-{params.decoding_method}-beam-size-{params.beam_size}"
+        )
+    else:
+        params.suffix += f"-context-{params.context_size}"
+        params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"
+
+    setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
+    logging.info("Decoding started")
+
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda", 0)
+
+    logging.info(f"Device: {device}")
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(params.bpe_model)
+
+    # <blk> and <unk> are defined in local/train_bpe_model.py
+    params.blank_id = sp.piece_to_id("<blk>")
+    params.unk_id = sp.piece_to_id("<unk>")
+    params.vocab_size = sp.get_piece_size()
+
+    if params.simulate_streaming:
+        assert (
+            params.causal_convolution
+        ), "Decoding in streaming requires causal convolution"
+
+    logging.info(params)
+
+    logging.info("About to create model")
+    model = get_transducer_model(params)
+
+    if params.iter > 0:
+        filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+            : params.avg
+        ]
+        if len(filenames) == 0:
+            raise ValueError(
+                f"No checkpoints found for"
+                f" --iter {params.iter}, --avg {params.avg}"
+            )
+        elif len(filenames) < params.avg:
+            raise ValueError(
+                f"Not enough checkpoints ({len(filenames)}) found for"
+                f" --iter {params.iter}, --avg {params.avg}"
+            )
+        logging.info(f"averaging {filenames}")
+        model.to(device)
+        model.load_state_dict(average_checkpoints(filenames, device=device))
+    elif params.avg == 1:
+        load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
+    else:
+        start = params.epoch - params.avg + 1
+        filenames = []
+        for i in range(start, params.epoch + 1):
+            if i >= 0:  # epochs count from 0; skip negative indices
+                filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
+        logging.info(f"averaging {filenames}")
+        model.to(device)
+        model.load_state_dict(average_checkpoints(filenames, device=device))
+
+    model.to(device)
+    model.eval()
+    model.device = device
+
+    if "fast_beam_search" in params.decoding_method:
+        if params.decoding_method == "fast_beam_search_nbest_LG":
+            lexicon = Lexicon(params.lang_dir)
+            word_table = lexicon.word_table
+            lg_filename = params.lang_dir / "LG.pt"
+            logging.info(f"Loading {lg_filename}")
+            decoding_graph = k2.Fsa.from_dict(
+                torch.load(lg_filename, map_location=device)
+            )
+            decoding_graph.scores *= params.ngram_lm_scale
+        else:
+            word_table = None
+            decoding_graph = k2.trivial_graph(
+                params.vocab_size - 1, device=device
+            )
+    else:
+        decoding_graph = None
+        word_table = None
+
+    num_param = sum([p.numel() for p in model.parameters()])
+    logging.info(f"Number of model parameters: {num_param}")
+
+    librispeech = LibriSpeechAsrDataModule(args)
+
+    test_clean_cuts = librispeech.test_clean_cuts()
+    test_other_cuts = librispeech.test_other_cuts()
+
+    test_clean_dl = librispeech.test_dataloaders(test_clean_cuts)
+    test_other_dl = librispeech.test_dataloaders(test_other_cuts)
+
+    test_sets = ["test-clean", "test-other"]
+    test_dl = [test_clean_dl, test_other_dl]
+
+    for test_set, test_dl in zip(test_sets, test_dl):
+        results_dict = decode_dataset(
+            dl=test_dl,
+            params=params,
+            model=model,
+            sp=sp,
+            word_table=word_table,
+            decoding_graph=decoding_graph,
+        )
+
+        save_results(
+            params=params,
+            test_set_name=test_set,
+            results_dict=results_dict,
+        )
+
+    logging.info("Done!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/pretrained.py b/egs/librispeech/ASR/lstm_transducer_stateless/pretrained.py
new file mode 100755
index 0000000000..f52cb22ab5
--- /dev/null
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/pretrained.py
@@ -0,0 +1,390 @@
+#!/usr/bin/env python3
+# Copyright      2021  Xiaomi Corp.        (authors: Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Usage:
+
+(1) greedy search
+./lstm_transducer_stateless/pretrained.py \
+    --checkpoint ./lstm_transducer_stateless/exp/pretrained.pt \
+    --bpe-model ./data/lang_bpe_500/bpe.model \
+    --method greedy_search \
+    /path/to/foo.wav \
+    /path/to/bar.wav
+
+(2) beam search
+./lstm_transducer_stateless/pretrained.py \
+    --checkpoint ./lstm_transducer_stateless/exp/pretrained.pt \
+    --bpe-model ./data/lang_bpe_500/bpe.model \
+    --method beam_search \
+    --beam-size 4 \
+    /path/to/foo.wav \
+    /path/to/bar.wav
+
+(3) modified beam search
+./lstm_transducer_stateless/pretrained.py \
+    --checkpoint ./lstm_transducer_stateless/exp/pretrained.pt \
+    --bpe-model ./data/lang_bpe_500/bpe.model \
+    --method modified_beam_search \
+    --beam-size 4 \
+    /path/to/foo.wav \
+    /path/to/bar.wav
+
+(4) fast beam search
+./lstm_transducer_stateless/pretrained.py \
+    --checkpoint ./lstm_transducer_stateless/exp/pretrained.pt \
+    --bpe-model ./data/lang_bpe_500/bpe.model \
+    --method fast_beam_search \
+    --beam 4 \
+    /path/to/foo.wav \
+    /path/to/bar.wav
+
+You can also use `./pruned_transducer_stateless2/exp/epoch-xx.pt`.
+
+Note: ./pruned_transducer_stateless2/exp/pretrained.pt is generated by
+./pruned_transducer_stateless2/export.py
+"""
+
+
+import argparse
+import logging
+import math
+from typing import List
+
+import k2
+import kaldifeat
+import sentencepiece as spm
+import torch
+import torchaudio
+from beam_search import (
+    beam_search,
+    fast_beam_search_one_best,
+    greedy_search,
+    greedy_search_batch,
+    modified_beam_search,
+)
+from torch.nn.utils.rnn import pad_sequence
+from train import add_model_arguments, get_params, get_transducer_model
+
+from icefall.utils import str2bool
+
+
+def get_parser():
+    """Return the command-line parser used to transcribe sound files."""
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument(
+        "--checkpoint",
+        type=str,
+        required=True,
+        help="Path to the checkpoint. "
+        "The checkpoint is assumed to be saved by "
+        "icefall.checkpoint.save_checkpoint().",
+    )
+
+    parser.add_argument(
+        "--bpe-model",
+        type=str,
+        help="""Path to bpe.model.""",
+    )
+
+    parser.add_argument(
+        "--method",
+        type=str,
+        default="greedy_search",
+        help="""Possible values are:
+          - greedy_search
+          - beam_search
+          - modified_beam_search
+          - fast_beam_search
+        """,
+    )
+
+    parser.add_argument(
+        "sound_files",
+        type=str,
+        nargs="+",
+        help="The input sound file(s) to transcribe. "
+        "Supported formats are those supported by torchaudio.load(). "
+        "For example, wav and flac are supported. "
+        "The sample rate has to be 16kHz.",
+    )
+
+    parser.add_argument(
+        "--sample-rate",
+        type=int,
+        default=16000,
+        help="The sample rate of the input sound file",
+    )
+
+    parser.add_argument(
+        "--beam-size",
+        type=int,
+        default=4,
+        help="""An integer indicating how many candidates we will keep for each
+        frame. Used only when --method is beam_search or
+        modified_beam_search.""",
+    )
+
+    parser.add_argument(
+        "--beam",
+        type=float,
+        default=4,
+        help="""A floating point value to calculate the cutoff score during beam
+        search (i.e., `cutoff = max-score - beam`), which is the same as the
+        `beam` in Kaldi.
+        Used only when --method is fast_beam_search""",
+    )
+
+    parser.add_argument(
+        "--max-contexts",
+        type=int,
+        default=4,
+        help="""Used only when --method is fast_beam_search""",
+    )
+
+    parser.add_argument(
+        "--max-states",
+        type=int,
+        default=8,
+        help="""Used only when --method is fast_beam_search""",
+    )
+
+    parser.add_argument(
+        "--context-size",
+        type=int,
+        default=2,
+        help="The context size in the decoder. 1 means bigram; "
+        "2 means tri-gram",
+    )
+    parser.add_argument(
+        "--max-sym-per-frame",
+        type=int,
+        default=1,
+        help="""Maximum number of symbols per frame. Used only when
+        --method is greedy_search.
+        """,
+    )
+
+    parser.add_argument(
+        "--simulate-streaming",
+        type=str2bool,
+        default=False,
+        help="""Whether to simulate streaming in decoding, this is a good way to
+        test a streaming model.
+        """,
+    )
+
+    parser.add_argument(
+        "--decode-chunk-size",
+        type=int,
+        default=16,
+        help="The chunk size for decoding (in frames after subsampling)",
+    )
+    parser.add_argument(
+        "--left-context",
+        type=int,
+        default=64,
+        help="left context can be seen during decoding (in frames after subsampling)",
+    )
+
+    add_model_arguments(parser)
+
+    return parser
+
+
+def read_sound_files(
+    filenames: List[str], expected_sample_rate: float
+) -> List[torch.Tensor]:
+    """Load every sound file and keep only its first channel.
+    Args:
+      filenames:
+        A list of sound filenames.
+      expected_sample_rate:
+        The sample rate that every file is required to have.
+    Returns:
+      Return a list with the first channel of each file, in input order.
+    """
+    first_channels = []
+    for path in filenames:
+        samples, rate = torchaudio.load(path)
+        assert rate == expected_sample_rate, (
+            f"expected sample rate: {expected_sample_rate}. "
+            f"Given: {rate}"
+        )
+        # Any additional channels are dropped.
+        first_channels.append(samples[0])
+    return first_channels
+
+
+@torch.no_grad()
+def main():
+    """Transcribe the given sound files and log one hypothesis per file."""
+    parser = get_parser()
+    args = parser.parse_args()
+    # Parsed command-line options are merged on top of get_params().
+    params = get_params()
+    params.update(vars(args))
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(params.bpe_model)
+
+    # <blk> is defined in local/train_bpe_model.py
+    params.blank_id = sp.piece_to_id("<blk>")
+    params.unk_id = sp.piece_to_id("<unk>")
+    params.vocab_size = sp.get_piece_size()
+
+    if params.simulate_streaming:
+        assert (
+            params.causal_convolution
+        ), "Decoding in streaming requires causal convolution"
+
+    logging.info(f"{params}")
+
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda", 0)
+
+    logging.info(f"device: {device}")
+
+    logging.info("Creating model")
+    model = get_transducer_model(params)
+
+    num_param = sum([p.numel() for p in model.parameters()])
+    logging.info(f"Number of model parameters: {num_param}")
+
+    checkpoint = torch.load(args.checkpoint, map_location="cpu")
+    model.load_state_dict(checkpoint["model"], strict=False)
+    model.to(device)
+    model.eval()
+    model.device = device
+
+    logging.info("Constructing Fbank computer")
+    opts = kaldifeat.FbankOptions()
+    opts.device = device
+    opts.frame_opts.dither = 0
+    opts.frame_opts.snip_edges = False
+    opts.frame_opts.samp_freq = params.sample_rate
+    opts.mel_opts.num_bins = params.feature_dim
+
+    fbank = kaldifeat.Fbank(opts)
+
+    logging.info(f"Reading sound files: {params.sound_files}")
+    waves = read_sound_files(
+        filenames=params.sound_files, expected_sample_rate=params.sample_rate
+    )
+    waves = [w.to(device) for w in waves]
+
+    logging.info("Decoding started")
+    features = fbank(waves)
+    feature_lengths = [f.size(0) for f in features]
+
+    features = pad_sequence(
+        features, batch_first=True, padding_value=math.log(1e-10)
+    )
+
+    feature_lengths = torch.tensor(feature_lengths, device=device)
+
+    if params.simulate_streaming:
+        encoder_out, encoder_out_lens, _ = model.encoder.streaming_forward(
+            x=features,
+            x_lens=feature_lengths,
+            chunk_size=params.decode_chunk_size,
+            left_context=params.left_context,
+            simulate_streaming=True,
+        )
+    else:
+        encoder_out, encoder_out_lens = model.encoder(
+            x=features, x_lens=feature_lengths
+        )
+
+    num_waves = encoder_out.size(0)
+    hyps = []
+    msg = f"Using {params.method}"
+    if params.method in ("beam_search", "modified_beam_search"):
+        msg += f" with beam size {params.beam_size}"
+    logging.info(msg)
+    # The first three branches decode the whole batch in a single call.
+    if params.method == "fast_beam_search":
+        decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
+        hyp_tokens = fast_beam_search_one_best(
+            model=model,
+            decoding_graph=decoding_graph,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam,
+            max_contexts=params.max_contexts,
+            max_states=params.max_states,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(hyp.split())
+    elif params.method == "modified_beam_search":
+        hyp_tokens = modified_beam_search(
+            model=model,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam_size,
+        )
+
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(hyp.split())
+    elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
+        hyp_tokens = greedy_search_batch(
+            model=model,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(hyp.split())
+    else:
+        for i in range(num_waves):
+            # fmt: off
+            encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
+            # fmt: on
+            if params.method == "greedy_search":
+                hyp = greedy_search(
+                    model=model,
+                    encoder_out=encoder_out_i,
+                    max_sym_per_frame=params.max_sym_per_frame,
+                )
+            elif params.method == "beam_search":
+                hyp = beam_search(
+                    model=model,
+                    encoder_out=encoder_out_i,
+                    beam=params.beam_size,
+                )
+            else:
+                raise ValueError(f"Unsupported method: {params.method}")
+
+            hyps.append(sp.decode(hyp).split())
+
+    s = "\n"
+    for filename, hyp in zip(params.sound_files, hyps):
+        words = " ".join(hyp)
+        s += f"{filename}:\n{words}\n\n"
+    logging.info(s)
+
+    logging.info("Decoding Done")
+
+
+if __name__ == "__main__":
+    # Log format: timestamp, level, source location, then the message.
+    log_format = (
+        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    )
+    logging.basicConfig(format=log_format, level=logging.INFO)
+    main()
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/test_model.py b/egs/librispeech/ASR/lstm_transducer_stateless/test_model.py
new file mode 100755
index 0000000000..1858d6bf02
--- /dev/null
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/test_model.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+# Copyright    2022  Xiaomi Corp.        (authors: Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+To run this file, do:
+
+    cd icefall/egs/librispeech/ASR
+    python ./lstm_transducer_stateless/test_model.py
+"""
+
+import torch
+from train import get_params, get_transducer_model
+
+
+def test_model():
+    """Build the non-streaming model and script it, skipping forward()."""
+    params = get_params()
+    params.vocab_size = 500
+    params.blank_id = 0
+    params.context_size = 2
+    params.unk_id = 2
+
+    params.dynamic_chunk_training = False
+    params.short_chunk_size = 25
+    params.num_left_chunks = 4
+    params.causal_convolution = False
+
+    model = get_transducer_model(params)
+    num_param = sum([p.numel() for p in model.parameters()])
+    print(f"Number of model parameters: {num_param}")
+    model.__class__.forward = torch.jit.ignore(model.__class__.forward)
+    torch.jit.script(model)
+
+
+def test_model_streaming():
+    """Build the streaming model and script it, skipping forward()."""
+    params = get_params()
+    params.vocab_size = 500
+    params.blank_id = 0
+    params.context_size = 2
+    params.unk_id = 2
+
+    params.dynamic_chunk_training = True
+    params.short_chunk_size = 25
+    params.num_left_chunks = 4
+    params.causal_convolution = True
+
+    model = get_transducer_model(params)
+    num_param = sum([p.numel() for p in model.parameters()])
+    print(f"Number of model parameters: {num_param}")
+    model.__class__.forward = torch.jit.ignore(model.__class__.forward)
+    torch.jit.script(model)
+
+
+def main():
+    for check in (test_model, test_model_streaming):
+        check()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/train.py b/egs/librispeech/ASR/lstm_transducer_stateless/train.py
new file mode 100755
index 0000000000..13175c4c23
--- /dev/null
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/train.py
@@ -0,0 +1,1090 @@
+#!/usr/bin/env python3
+# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang,
+#                                                  Wei Kang
+#                                                  Mingshuang Luo)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Usage:
+
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+
+./lstm_transducer_stateless/train.py \
+  --world-size 4 \
+  --num-epochs 30 \
+  --start-epoch 0 \
+  --exp-dir lstm_transducer_stateless/exp \
+  --full-libri 1 \
+  --max-duration 300
+
+# For mix precision training:
+
+./lstm_transducer_stateless/train.py \
+  --world-size 4 \
+  --num-epochs 30 \
+  --start-epoch 0 \
+  --use-fp16 1 \
+  --exp-dir lstm_transducer_stateless/exp \
+  --full-libri 1 \
+  --max-duration 550
+
+# train a streaming model
+./lstm_transducer_stateless/train.py \
+  --world-size 4 \
+  --num-epochs 30 \
+  --start-epoch 0 \
+  --exp-dir lstm_transducer_stateless/exp \
+  --full-libri 1 \
+  --dynamic-chunk-training 1 \
+  --causal-convolution 1 \
+  --short-chunk-size 25 \
+  --num-left-chunks 4 \
+  --max-duration 300
+"""
+
+
+import argparse
+import logging
+import warnings
+from pathlib import Path
+from shutil import copyfile
+from typing import Any, Dict, Optional, Tuple, Union
+
+import k2
+import optim
+import sentencepiece as spm
+import torch
+import torch.multiprocessing as mp
+import torch.nn as nn
+from asr_datamodule import LibriSpeechAsrDataModule
+from conformer import Conformer
+from decoder import Decoder
+from joiner import Joiner
+from lhotse.cut import Cut
+from lhotse.dataset.sampling.base import CutSampler
+from lhotse.utils import fix_random_seed
+from model import Transducer
+from optim import Eden, Eve
+from torch import Tensor
+from torch.cuda.amp import GradScaler
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.utils.tensorboard import SummaryWriter
+
+from icefall import diagnostics
+from icefall.checkpoint import load_checkpoint, remove_checkpoints
+from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
+from icefall.checkpoint import save_checkpoint_with_global_batch_idx
+from icefall.dist import cleanup_dist, setup_dist
+from icefall.env import get_env_info
+from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
+
+LRSchedulerType = Union[
+    torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler
+]
+
+
+def add_model_arguments(parser: argparse.ArgumentParser):
+    """Add the chunk/causal-convolution model options to `parser`."""
+    parser.add_argument(
+        "--dynamic-chunk-training",
+        type=str2bool,
+        default=False,
+        help="""Whether to use dynamic_chunk_training, if you want a streaming
+        model, this requires to be True.
+        """,
+    )
+    parser.add_argument(
+        "--causal-convolution",
+        type=str2bool,
+        default=False,
+        help="""Whether to use causal convolution, this requires to be True when
+        using dynamic_chunk_training.
+        """,
+    )
+
+    parser.add_argument(
+        "--short-chunk-size",
+        type=int,
+        default=25,
+        help="""Chunk length of dynamic training, the chunk size would be either
+        max sequence length of current batch or uniformly sampled from (1, short_chunk_size).
+        """,
+    )
+
+    parser.add_argument(
+        "--num-left-chunks",
+        type=int,
+        default=4,
+        help="How many left context can be seen in chunks when calculating attention.",
+    )
+
+
+def get_parser():
+    """Return the argument parser holding the training options."""
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument(
+        "--world-size",
+        type=int,
+        default=1,
+        help="Number of GPUs for DDP training.",
+    )
+
+    parser.add_argument(
+        "--master-port",
+        type=int,
+        default=12354,
+        help="Master port to use for DDP training.",
+    )
+
+    parser.add_argument(
+        "--tensorboard",
+        type=str2bool,
+        default=True,
+        help="Should various information be logged in tensorboard.",
+    )
+
+    parser.add_argument(
+        "--num-epochs",
+        type=int,
+        default=30,
+        help="Number of epochs to train.",
+    )
+
+    parser.add_argument(
+        "--start-epoch",
+        type=int,
+        default=0,
+        help="""Resume training from this epoch.
+        If it is positive, it will load checkpoint from
+        exp-dir/epoch-{start_epoch-1}.pt
+        """,
+    )
+
+    parser.add_argument(
+        "--start-batch",
+        type=int,
+        default=0,
+        help="""If positive, --start-epoch is ignored and
+        it loads the checkpoint from exp-dir/checkpoint-{start_batch}.pt
+        """,
+    )
+
+    parser.add_argument(
+        "--exp-dir",
+        type=str,
+        default="pruned_transducer_stateless2/exp",
+        help="""The experiment dir.
+        It specifies the directory where all training related
+        files, e.g., checkpoints, log, etc, are saved
+        """,
+    )
+
+    parser.add_argument(
+        "--bpe-model",
+        type=str,
+        default="data/lang_bpe_500/bpe.model",
+        help="Path to the BPE model",
+    )
+
+    parser.add_argument(
+        "--initial-lr",
+        type=float,
+        default=0.003,
+        help="The initial learning rate.  This value should not need to "
+        "be changed.",
+    )
+
+    parser.add_argument(
+        "--lr-batches",
+        type=float,
+        default=5000,
+        help="""Number of steps that affects how rapidly the learning rate
+        decreases. We suggest not to change this.""",
+    )
+
+    parser.add_argument(
+        "--lr-epochs",
+        type=float,
+        default=6,
+        help="""Number of epochs that affects how rapidly the learning rate decreases.
+        """,
+    )
+
+    parser.add_argument(
+        "--context-size",
+        type=int,
+        default=2,
+        help="The context size in the decoder. 1 means bigram; "
+        "2 means tri-gram",
+    )
+
+    parser.add_argument(
+        "--prune-range",
+        type=int,
+        default=5,
+        help="The prune range for rnnt loss, it means how many symbols(context)"
+        " we are using to compute the loss",
+    )
+
+    parser.add_argument(
+        "--lm-scale",
+        type=float,
+        default=0.25,
+        help="The scale to smooth the loss with lm "
+        "(output of prediction network) part.",
+    )
+
+    parser.add_argument(
+        "--am-scale",
+        type=float,
+        default=0.0,
+        help="The scale to smooth the loss with am (output of encoder network)"
+        " part.",
+    )
+
+    parser.add_argument(
+        "--simple-loss-scale",
+        type=float,
+        default=0.5,
+        help="To get pruning ranges, we will calculate a simple version"
+        " loss(joiner is just addition), this simple loss also uses for"
+        " training (as a regularization item). We will scale the simple loss"
+        " with this parameter before adding to the final loss.",
+    )
+
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=42,
+        help="The seed for random generators intended for reproducibility",
+    )
+
+    parser.add_argument(
+        "--print-diagnostics",
+        type=str2bool,
+        default=False,
+        help="Accumulate stats on activations, print them and exit.",
+    )
+
+    parser.add_argument(
+        "--save-every-n",
+        type=int,
+        default=8000,
+        help="""Save checkpoint after processing this number of batches
+        periodically. We save checkpoint to exp-dir/ whenever
+        params.batch_idx_train %% save_every_n == 0. The checkpoint filename
+        has the form: f'exp-dir/checkpoint-{params.batch_idx_train}.pt'
+        Note: It also saves checkpoint to `exp-dir/epoch-xxx.pt` at the
+        end of each epoch where `xxx` is the epoch number counting from 0.
+        """,
+    )
+
+    parser.add_argument(
+        "--keep-last-k",
+        type=int,
+        default=20,
+        help="""Only keep this number of checkpoints on disk.
+        For instance, if it is 3, there are only 3 checkpoints
+        in the exp-dir with filenames `checkpoint-xxx.pt`.
+        It does not affect checkpoints with name `epoch-xxx.pt`.
+        """,
+    )
+
+    parser.add_argument(
+        "--use-fp16",
+        type=str2bool,
+        default=False,
+        help="Whether to use half precision training.",
+    )
+
+    add_model_arguments(parser)
+
+    return parser
+
+
+def get_params() -> AttributeDict:
+    """Return a dict containing training parameters.
+
+    All training related parameters that are not passed from the commandline
+    are saved in the variable `params`.
+
+    Commandline options are merged into `params` after they are parsed, so
+    you can also access them via `params`.
+
+    Explanation of options saved in `params`:
+
+        - best_train_loss: Best training loss so far. It is used to select
+                           the model that has the lowest training loss. It is
+                           updated during the training.
+
+        - best_valid_loss: Best validation loss so far. It is used to select
+                           the model that has the lowest validation loss. It is
+                           updated during the training.
+
+        - best_train_epoch: It is the epoch that has the best training loss.
+
+        - best_valid_epoch: It is the epoch that has the best validation loss.
+
+        - batch_idx_train: Used to writing statistics to tensorboard. It
+                           contains number of batches trained so far across
+                           epochs.
+
+        - log_interval:  Print training loss if batch_idx % log_interval is 0
+
+        - reset_interval: Reset statistics if batch_idx % reset_interval is 0
+
+        - valid_interval:  Run validation if batch_idx % valid_interval is 0
+
+        - feature_dim: The model input dim. It has to match the one used
+                       in computing features.
+
+        - subsampling_factor:  The subsampling factor for the model.
+
+        - encoder_dim: Hidden dim for multi-head attention model.
+
+        - model_warm_step: Given to the model; not used for the learning rate.
+
+        - env_info: The return value of get_env_info().
+    """
+    params = AttributeDict(
+        {
+            "best_train_loss": float("inf"),
+            "best_valid_loss": float("inf"),
+            "best_train_epoch": -1,
+            "best_valid_epoch": -1,
+            "batch_idx_train": 0,
+            "log_interval": 50,
+            "reset_interval": 200,
+            "valid_interval": 3000,  # For the 100h subset, use 800
+            # parameters for conformer
+            "feature_dim": 80,
+            "subsampling_factor": 4,
+            "encoder_dim": 512,
+            "nhead": 8,
+            "dim_feedforward": 2048,
+            "num_encoder_layers": 12,
+            # parameters for decoder
+            "decoder_dim": 512,
+            # parameters for joiner
+            "joiner_dim": 512,
+            # parameters for model warm-up
+            "model_warm_step": 3000,  # arg given to model, not for lrate
+            "env_info": get_env_info(),
+        }
+    )
+
+    return params
+
+
+def get_encoder_model(params: AttributeDict) -> nn.Module:
+    """Create the Conformer encoder described by `params`."""
+    # TODO: We can add an option to switch between Conformer and Transformer
+    return Conformer(
+        num_features=params.feature_dim,
+        subsampling_factor=params.subsampling_factor,
+        d_model=params.encoder_dim,
+        nhead=params.nhead,
+        dim_feedforward=params.dim_feedforward,
+        num_encoder_layers=params.num_encoder_layers,
+        dynamic_chunk_training=params.dynamic_chunk_training,
+        short_chunk_size=params.short_chunk_size,
+        num_left_chunks=params.num_left_chunks,
+        causal=params.causal_convolution,
+    )
+
+
+def get_decoder_model(params: AttributeDict) -> nn.Module:
+    """Create the Decoder described by `params`."""
+    return Decoder(
+        vocab_size=params.vocab_size,
+        decoder_dim=params.decoder_dim,
+        blank_id=params.blank_id,
+        context_size=params.context_size,
+    )
+
+
+def get_joiner_model(params: AttributeDict) -> nn.Module:
+    """Create the Joiner described by `params`."""
+    return Joiner(
+        encoder_dim=params.encoder_dim,
+        decoder_dim=params.decoder_dim,
+        joiner_dim=params.joiner_dim,
+        vocab_size=params.vocab_size,
+    )
+
+
+def get_transducer_model(params: AttributeDict) -> nn.Module:
+    """Assemble encoder, decoder and joiner into one Transducer.
+
+    The three sub-modules are all created from the same `params`.
+    """
+    # Sub-modules are constructed in order: encoder, decoder, joiner.
+    return Transducer(
+        encoder=get_encoder_model(params),
+        decoder=get_decoder_model(params),
+        joiner=get_joiner_model(params),
+        encoder_dim=params.encoder_dim,
+        decoder_dim=params.decoder_dim,
+        joiner_dim=params.joiner_dim,
+        vocab_size=params.vocab_size,
+    )
+
+
+def load_checkpoint_if_available(
+    params: AttributeDict,
+    model: nn.Module,
+    optimizer: Optional[torch.optim.Optimizer] = None,
+    scheduler: Optional[LRSchedulerType] = None,
+) -> Optional[Dict[str, Any]]:
+    """Restore training state from a checkpoint, if one was requested.
+
+    If params.start_batch is positive, the checkpoint is
+    `params.exp_dir/checkpoint-{params.start_batch}.pt`. Otherwise, if
+    params.start_epoch is positive, it is
+    `params.exp_dir/epoch-{params.start_epoch-1}.pt`. Otherwise nothing
+    is loaded and None is returned.
+
+    The saved `best_train_epoch`, `best_valid_epoch`, `batch_idx_train`,
+    `best_train_loss` and `best_valid_loss` are copied into `params`.
+
+    Args:
+      params:
+        The return value of :func:`get_params`.
+      model:
+        The training model.
+      optimizer:
+        The optimizer that we are using.
+      scheduler:
+        The scheduler that we are using.
+    Returns:
+      The saved training info, or None when no checkpoint was requested.
+    """
+    resume_from_batch = params.start_batch > 0
+    if resume_from_batch:
+        ckpt_name = f"checkpoint-{params.start_batch}.pt"
+    elif params.start_epoch > 0:
+        ckpt_name = f"epoch-{params.start_epoch-1}.pt"
+    else:
+        return None
+    ckpt_path = params.exp_dir / ckpt_name
+    assert ckpt_path.is_file(), f"{ckpt_path} does not exist!"
+
+    saved_params = load_checkpoint(
+        ckpt_path, model=model, optimizer=optimizer, scheduler=scheduler
+    )
+
+    # Training statistics that are always restored.
+    for key in (
+        "best_train_epoch",
+        "best_valid_epoch",
+        "batch_idx_train",
+        "best_train_loss",
+        "best_valid_loss",
+    ):
+        params[key] = saved_params[key]
+
+    if resume_from_batch:
+        # Optional entries: restored only when the checkpoint has them.
+        for saved_key, param_key in (
+            ("cur_epoch", "start_epoch"),
+            ("cur_batch_idx", "cur_batch_idx"),
+        ):
+            if saved_key in saved_params:
+                params[param_key] = saved_params[saved_key]
+
+    return saved_params
+
+
+def save_checkpoint(
+    params: AttributeDict,
+    model: nn.Module,
+    optimizer: Optional[torch.optim.Optimizer] = None,
+    scheduler: Optional[LRSchedulerType] = None,
+    sampler: Optional[CutSampler] = None,
+    scaler: Optional[GradScaler] = None,
+    rank: int = 0,
+) -> None:
+    """Save model, optimizer, scheduler and training stats (rank 0 only).
+
+    Args:
+      params:
+        It is returned by :func:`get_params`.
+      model:
+        The training model.
+      optimizer:
+        The optimizer used in the training.
+      sampler:
+        The sampler for the training dataset.
+      scaler:
+        The scaler used for mix precision training.
+    """
+    if rank != 0:
+        return
+    epoch_ckpt = params.exp_dir / f"epoch-{params.cur_epoch}.pt"
+    save_checkpoint_impl(
+        filename=epoch_ckpt,
+        model=model,
+        params=params,
+        optimizer=optimizer,
+        scheduler=scheduler,
+        sampler=sampler,
+        scaler=scaler,
+        rank=rank,
+    )
+
+    # Copy the file to a fixed name when this epoch is the best so far.
+    for best_epoch, best_name in (
+        (params.best_train_epoch, "best-train-loss.pt"),
+        (params.best_valid_epoch, "best-valid-loss.pt"),
+    ):
+        if best_epoch == params.cur_epoch:
+            copyfile(src=epoch_ckpt, dst=params.exp_dir / best_name)
+
+
+def compute_loss(
+    params: AttributeDict,
+    model: nn.Module,
+    sp: spm.SentencePieceProcessor,
+    batch: dict,
+    is_training: bool,
+    warmup: float = 1.0,
+) -> Tuple[Tensor, MetricsTracker]:
+    """
+    Compute RNN-T loss for a batch; return (loss, MetricsTracker of stats).
+
+    Args:
+      params:
+        Parameters for training. See :func:`get_params`.
+      model:
+        The model for training. It is an instance of Conformer in our case.
+      sp:
+        The BPE model, used to encode the transcripts into token IDs.
+      batch:
+        See `lhotse.dataset.K2SpeechRecognitionDataset()` for its content.
+      is_training:
+        True for training. False for validation. Autograd is enabled
+        during the computation only when it is True.
+      warmup: a floating point value which increases throughout training;
+        values >= 1.0 are fully warmed up and have all modules present.
+    """
+    device = model.device
+    feature = batch["inputs"]
+    # at entry, feature is (N, T, C)
+    assert feature.ndim == 3
+    feature = feature.to(device)
+
+    supervisions = batch["supervisions"]
+    feature_lens = supervisions["num_frames"].to(device)
+
+    texts = batch["supervisions"]["text"]
+    y = sp.encode(texts, out_type=int)
+    y = k2.RaggedTensor(y).to(device)
+
+    with torch.set_grad_enabled(is_training):
+        simple_loss, pruned_loss = model(
+            x=feature,
+            x_lens=feature_lens,
+            y=y,
+            prune_range=params.prune_range,
+            am_scale=params.am_scale,
+            lm_scale=params.lm_scale,
+            warmup=warmup,
+        )
+        # Scale: 0.0 for warmup < 1.0; 0.1 for 1.0 < warmup < 2.0, i.e. for
+        # another model_warm_step batches, so that pruned_loss does not
+        # overwhelm simple_loss and make it diverge before the alignment is
+        # learned; 1.0 otherwise, including warmup == 1.0 (the default).
+        pruned_loss_scale = (
+            0.0
+            if warmup < 1.0
+            else (0.1 if warmup > 1.0 and warmup < 2.0 else 1.0)
+        )
+        loss = (
+            params.simple_loss_scale * simple_loss
+            + pruned_loss_scale * pruned_loss
+        )
+
+    assert loss.requires_grad == is_training
+
+    info = MetricsTracker()
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        info["frames"] = (
+            (feature_lens // params.subsampling_factor).sum().item()
+        )
+
+    # Note: We use reduction=sum while computing the loss.
+    info["loss"] = loss.detach().cpu().item()
+    info["simple_loss"] = simple_loss.detach().cpu().item()
+    info["pruned_loss"] = pruned_loss.detach().cpu().item()
+
+    return loss, info
+
+
+def compute_validation_loss(
+    params: AttributeDict,
+    model: nn.Module,
+    sp: spm.SentencePieceProcessor,
+    valid_dl: torch.utils.data.DataLoader,
+    world_size: int = 1,
+) -> MetricsTracker:
+    """Run validation over valid_dl; track the best validation loss."""
+    model.eval()
+
+    tot_loss = MetricsTracker()
+
+    for batch_idx, batch in enumerate(valid_dl):
+        loss, loss_info = compute_loss(
+            params=params,
+            model=model,
+            sp=sp,
+            batch=batch,
+            is_training=False,
+        )
+        assert loss.requires_grad is False
+        tot_loss = tot_loss + loss_info
+
+    if world_size > 1:
+        tot_loss.reduce(loss.device)  # `loss` unbound if valid_dl is empty
+
+    loss_value = tot_loss["loss"] / tot_loss["frames"]  # per-frame loss
+    if loss_value < params.best_valid_loss:
+        params.best_valid_epoch = params.cur_epoch
+        params.best_valid_loss = loss_value
+
+    return tot_loss
+
+
+def train_one_epoch(
+    params: AttributeDict,
+    model: nn.Module,
+    optimizer: torch.optim.Optimizer,
+    scheduler: LRSchedulerType,
+    sp: spm.SentencePieceProcessor,
+    train_dl: torch.utils.data.DataLoader,
+    valid_dl: torch.utils.data.DataLoader,
+    scaler: GradScaler,
+    tb_writer: Optional[SummaryWriter] = None,
+    world_size: int = 1,
+    rank: int = 0,
+) -> None:
+    """Train the model for one epoch.
+
+    The training loss from the mean of all frames is saved in
+    `params.train_loss`. It runs the validation process every
+    `params.valid_interval` batches.
+
+    Args:
+      params:
+        It is returned by :func:`get_params`.
+      model:
+        The model for training.
+      optimizer:
+        The optimizer we are using.
+      scheduler:
+        The learning rate scheduler, we call step_batch() every batch.
+      sp:
+        The BPE model used to encode the transcripts.
+      train_dl, valid_dl:
+        Dataloaders for the training and the validation dataset.
+      scaler:
+        The scaler used for mixed precision training.
+      tb_writer:
+        Writer to write log messages to tensorboard.
+      world_size:
+        Number of nodes in DDP training. If it is 1, DDP is disabled.
+      rank:
+        The rank of the node in DDP training. If no DDP is used, it should
+        be set to 0.
+    """
+    model.train()
+
+    tot_loss = MetricsTracker()
+
+    cur_batch_idx = params.get("cur_batch_idx", 0)
+
+    for batch_idx, batch in enumerate(train_dl):
+        if batch_idx < cur_batch_idx:
+            continue  # presumably skips batches done before a resume
+        cur_batch_idx = batch_idx
+
+        params.batch_idx_train += 1
+        batch_size = len(batch["supervisions"]["text"])
+
+        try:
+            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+                loss, loss_info = compute_loss(
+                    params=params,
+                    model=model,
+                    sp=sp,
+                    batch=batch,
+                    is_training=True,
+                    warmup=(params.batch_idx_train / params.model_warm_step),
+                )
+            # summary stats: running sum decayed by (1 - 1/reset_interval)
+            tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info
+
+            # NOTE: We use reduction==sum and loss is computed over utterances
+            # in the batch and there is no normalization to it so far.
+            scaler.scale(loss).backward()
+            scheduler.step_batch(params.batch_idx_train)
+            scaler.step(optimizer)
+            scaler.update()
+            optimizer.zero_grad()
+        except:  # noqa
+            display_and_save_batch(batch, params=params, sp=sp)
+            raise  # re-raised once the offending batch has been saved
+
+        if params.print_diagnostics and batch_idx == 30:
+            return  # diagnostics runs stop after batch 30
+
+        if (
+            params.batch_idx_train > 0
+            and params.batch_idx_train % params.save_every_n == 0
+        ):
+            params.cur_batch_idx = batch_idx
+            save_checkpoint_with_global_batch_idx(
+                out_dir=params.exp_dir,
+                global_batch_idx=params.batch_idx_train,
+                model=model,
+                params=params,
+                optimizer=optimizer,
+                scheduler=scheduler,
+                sampler=train_dl.sampler,
+                scaler=scaler,
+                rank=rank,
+            )
+            del params.cur_batch_idx  # presumably only for the checkpoint
+            remove_checkpoints(
+                out_dir=params.exp_dir,
+                topk=params.keep_last_k,
+                rank=rank,
+            )
+
+        if batch_idx % params.log_interval == 0:
+            cur_lr = scheduler.get_last_lr()[0]
+            logging.info(
+                f"Epoch {params.cur_epoch}, "
+                f"batch {batch_idx}, loss[{loss_info}], "
+                f"tot_loss[{tot_loss}], batch size: {batch_size}, "
+                f"lr: {cur_lr:.2e}"
+            )
+
+            if tb_writer is not None:
+                tb_writer.add_scalar(
+                    "train/learning_rate", cur_lr, params.batch_idx_train
+                )
+
+                loss_info.write_summary(
+                    tb_writer, "train/current_", params.batch_idx_train
+                )
+                tot_loss.write_summary(
+                    tb_writer, "train/tot_", params.batch_idx_train
+                )
+
+        if batch_idx > 0 and batch_idx % params.valid_interval == 0:
+            logging.info("Computing validation loss")
+            valid_info = compute_validation_loss(
+                params=params,
+                model=model,
+                sp=sp,
+                valid_dl=valid_dl,
+                world_size=world_size,
+            )
+            model.train()  # compute_validation_loss() switched to eval mode
+            logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}")
+            if tb_writer is not None:
+                valid_info.write_summary(
+                    tb_writer, "train/valid_", params.batch_idx_train
+                )
+
+    loss_value = tot_loss["loss"] / tot_loss["frames"]  # per-frame loss
+    params.train_loss = loss_value
+    if params.train_loss < params.best_train_loss:
+        params.best_train_epoch = params.cur_epoch
+        params.best_train_loss = params.train_loss
+
+
+def run(rank, world_size, args):
+    """Set up and run the whole training loop on one process.
+
+    Args:
+      rank:
+        In [0, world_size-1]; passed automatically by `mp.spawn()` in
+        :func:`main`. The node with rank 0 saves the checkpoints.
+      world_size:
+        Number of GPUs for DDP training.
+      args:
+        The return value of get_parser().parse_args()
+    """
+    params = get_params()
+    params.update(vars(args))
+    if params.full_libri is False:
+        params.valid_interval = 1600
+
+    fix_random_seed(params.seed)
+    if world_size > 1:
+        setup_dist(rank, world_size, params.master_port)
+
+    setup_logger(f"{params.exp_dir}/log/log-train")
+    logging.info("Training started")
+
+    if args.tensorboard and rank == 0:
+        tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard")
+    else:
+        tb_writer = None
+
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda", rank)
+    logging.info(f"Device: {device}")
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(params.bpe_model)
+
+    # <blk> is defined in local/train_bpe_model.py
+    params.blank_id = sp.piece_to_id("<blk>")
+    params.vocab_size = sp.get_piece_size()
+
+    if params.dynamic_chunk_training:
+        assert (
+            params.causal_convolution
+        ), "dynamic_chunk_training requires causal convolution"
+
+    logging.info(params)
+
+    logging.info("About to create model")
+    model = get_transducer_model(params)
+
+    num_param = sum([p.numel() for p in model.parameters()])
+    logging.info(f"Number of model parameters: {num_param}")
+
+    checkpoints = load_checkpoint_if_available(params=params, model=model)
+
+    model.to(device)
+    if world_size > 1:
+        logging.info("Using DDP")
+        model = DDP(model, device_ids=[rank])
+    model.device = device  # compute_loss() reads model.device
+
+    optimizer = Eve(model.parameters(), lr=params.initial_lr)
+
+    scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs)
+
+    if checkpoints and "optimizer" in checkpoints:
+        logging.info("Loading optimizer state dict")
+        optimizer.load_state_dict(checkpoints["optimizer"])
+
+    if (
+        checkpoints
+        and "scheduler" in checkpoints
+        and checkpoints["scheduler"] is not None
+    ):
+        logging.info("Loading scheduler state dict")
+        scheduler.load_state_dict(checkpoints["scheduler"])
+
+    if params.print_diagnostics:
+        diagnostic = diagnostics.attach_diagnostics(model)
+
+    librispeech = LibriSpeechAsrDataModule(args)
+
+    train_cuts = librispeech.train_clean_100_cuts()
+    if params.full_libri:
+        train_cuts += librispeech.train_clean_360_cuts()
+        train_cuts += librispeech.train_other_500_cuts()
+
+    def remove_short_and_long_utt(c: Cut):
+        # Keep only utterances with duration between 1 second and 20 seconds
+        #
+        # Caution: There is a reason to select 20.0 here. Please see
+        # ../local/display_manifest_statistics.py
+        #
+        # You should use ../local/display_manifest_statistics.py to get
+        # an utterance duration distribution for your dataset to select
+        # the threshold
+        return 1.0 <= c.duration <= 20.0
+
+    train_cuts = train_cuts.filter(remove_short_and_long_utt)
+
+    if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
+        # We only load the sampler's state dict when it loads a checkpoint
+        # saved in the middle of an epoch
+        sampler_state_dict = checkpoints["sampler"]
+    else:
+        sampler_state_dict = None
+
+    train_dl = librispeech.train_dataloaders(
+        train_cuts, sampler_state_dict=sampler_state_dict
+    )
+
+    valid_cuts = librispeech.dev_clean_cuts()
+    valid_cuts += librispeech.dev_other_cuts()
+    valid_dl = librispeech.valid_dataloaders(valid_cuts)
+
+    if not params.print_diagnostics:
+        scan_pessimistic_batches_for_oom(
+            model=model,
+            train_dl=train_dl,
+            optimizer=optimizer,
+            sp=sp,
+            params=params,
+            warmup=0.0 if params.start_epoch == 0 else 1.0,
+        )
+
+    scaler = GradScaler(enabled=params.use_fp16)
+    if checkpoints and "grad_scaler" in checkpoints:
+        logging.info("Loading grad scaler state dict")
+        scaler.load_state_dict(checkpoints["grad_scaler"])
+
+    for epoch in range(params.start_epoch, params.num_epochs):
+        scheduler.step_epoch(epoch)
+        fix_random_seed(params.seed + epoch)
+        train_dl.sampler.set_epoch(epoch)
+
+        if tb_writer is not None:
+            tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train)
+
+        params.cur_epoch = epoch
+
+        train_one_epoch(
+            params=params,
+            model=model,
+            optimizer=optimizer,
+            scheduler=scheduler,
+            sp=sp,
+            train_dl=train_dl,
+            valid_dl=valid_dl,
+            scaler=scaler,
+            tb_writer=tb_writer,
+            world_size=world_size,
+            rank=rank,
+        )
+
+        if params.print_diagnostics:
+            diagnostic.print_diagnostics()
+            break  # diagnostics runs save no epoch checkpoint
+
+        save_checkpoint(
+            params=params,
+            model=model,
+            optimizer=optimizer,
+            scheduler=scheduler,
+            sampler=train_dl.sampler,
+            scaler=scaler,
+            rank=rank,
+        )
+
+    logging.info("Done!")
+
+    if world_size > 1:
+        torch.distributed.barrier()
+        cleanup_dist()
+
+
+def display_and_save_batch(
+    batch: dict,
+    params: AttributeDict,
+    sp: spm.SentencePieceProcessor,
+) -> None:
+    """Display the batch statistics and save the batch to disk.
+
+    Args:
+      batch:
+        A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
+        for the content in it.
+      params:
+        Parameters for training. See :func:`get_params`.
+      sp:
+        The BPE model.
+    """
+    from lhotse.utils import uuid4
+
+    filename = f"{params.exp_dir}/batch-{uuid4()}.pt"
+    logging.info(f"Saving batch to {filename}")
+    torch.save(batch, filename)
+
+    supervisions = batch["supervisions"]
+    features = batch["inputs"]
+
+    logging.info(f"features shape: {features.shape}")
+    # Total number of BPE tokens over all transcripts in the batch.
+    y = sp.encode(supervisions["text"], out_type=int)
+    num_tokens = sum(len(i) for i in y)
+    logging.info(f"num tokens: {num_tokens}")
+
+
+def scan_pessimistic_batches_for_oom(
+    model: nn.Module,
+    train_dl: torch.utils.data.DataLoader,
+    optimizer: torch.optim.Optimizer,
+    sp: spm.SentencePieceProcessor,
+    params: AttributeDict,
+    warmup: float,
+):  # one fwd/bwd/step per "pessimistic" batch to surface OOM early
+    from lhotse.dataset import find_pessimistic_batches
+
+    logging.info(
+        "Sanity check -- see if any of the batches in epoch 0 would cause OOM."
+    )
+    batches, crit_values = find_pessimistic_batches(train_dl.sampler)
+    for criterion, cuts in batches.items():
+        batch = train_dl.dataset[cuts]
+        try:
+            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+                loss, _ = compute_loss(
+                    params=params,
+                    model=model,
+                    sp=sp,
+                    batch=batch,
+                    is_training=True,
+                    warmup=warmup,
+                )
+            loss.backward()  # NOTE(review): no GradScaler used here
+            optimizer.step()  # NOTE(review): this updates the weights
+            optimizer.zero_grad()
+        except Exception as e:
+            if "CUDA out of memory" in str(e):
+                logging.error(
+                    "Your GPU ran out of memory with the current "
+                    "max_duration setting. We recommend decreasing "
+                    "max_duration and trying again.\n"
+                    f"Failing criterion: {criterion} "
+                    f"(={crit_values[criterion]}) ..."
+                )
+            display_and_save_batch(batch, params=params, sp=sp)
+            raise  # any failure, OOM or not, is re-raised to the caller
+
+
+def main():  # CLI entry point: parse the arguments, then launch run()
+    parser = get_parser()
+    LibriSpeechAsrDataModule.add_arguments(parser)
+    args = parser.parse_args()
+    args.exp_dir = Path(args.exp_dir)  # later code joins paths with "/"
+
+    world_size = args.world_size
+    assert world_size >= 1
+    if world_size > 1:  # one process per rank; rank is passed by mp.spawn
+        mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True)
+    else:
+        run(rank=0, world_size=1, args=args)
+
+
+torch.set_num_threads(1)  # intra-op CPU parallelism, set at import time
+torch.set_num_interop_threads(1)  # inter-op CPU parallelism
+
+if __name__ == "__main__":
+    main()

From 4a0dea2aa23f1deea9f97a58c1fe8c756d4ffa8f Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Sun, 17 Jul 2022 15:38:53 +0800
Subject: [PATCH 08/40] modify decode.py pretrained.py test_model.py train.py

---
 .../ASR/lstm_transducer_stateless/decode.py   | 94 ++++---------------
 .../lstm_transducer_stateless/pretrained.py   | 48 +---------
 .../lstm_transducer_stateless/test_model.py   | 26 -----
 .../ASR/lstm_transducer_stateless/train.py    | 69 ++------------
 4 files changed, 28 insertions(+), 209 deletions(-)

diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/decode.py b/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
index 60a948a992..34e8e8fb99 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
@@ -18,36 +18,36 @@
 """
 Usage:
 (1) greedy search
-./pruned_transducer_stateless2/decode.py \
+./lstm_transducer_stateless/decode.py \
     --epoch 28 \
     --avg 15 \
-    --exp-dir ./pruned_transducer_stateless2/exp \
+    --exp-dir ./lstm_transducer_stateless/exp \
     --max-duration 600 \
     --decoding-method greedy_search
 
 (2) beam search (not recommended)
-./pruned_transducer_stateless2/decode.py \
+./lstm_transducer_stateless/decode.py \
     --epoch 28 \
     --avg 15 \
-    --exp-dir ./pruned_transducer_stateless2/exp \
+    --exp-dir ./lstm_transducer_stateless/exp \
     --max-duration 600 \
     --decoding-method beam_search \
     --beam-size 4
 
 (3) modified beam search
-./pruned_transducer_stateless2/decode.py \
+./lstm_transducer_stateless/decode.py \
     --epoch 28 \
     --avg 15 \
-    --exp-dir ./pruned_transducer_stateless2/exp \
+    --exp-dir ./lstm_transducer_stateless/exp \
     --max-duration 600 \
     --decoding-method modified_beam_search \
     --beam-size 4
 
 (4) fast beam search (one best)
-./pruned_transducer_stateless2/decode.py \
+./lstm_transducer_stateless/decode.py \
     --epoch 28 \
     --avg 15 \
-    --exp-dir ./pruned_transducer_stateless2/exp \
+    --exp-dir ./lstm_transducer_stateless/exp \
     --max-duration 600 \
     --decoding-method fast_beam_search \
     --beam 20.0 \
@@ -55,10 +55,10 @@
     --max-states 64
 
 (5) fast beam search (nbest)
-./pruned_transducer_stateless2/decode.py \
+./lstm_transducer_stateless/decode.py \
     --epoch 28 \
     --avg 15 \
-    --exp-dir ./pruned_transducer_stateless2/exp \
+    --exp-dir ./lstm_transducer_stateless/exp \
     --max-duration 600 \
     --decoding-method fast_beam_search_nbest \
     --beam 20.0 \
@@ -68,10 +68,10 @@
     --nbest-scale 0.5
 
 (6) fast beam search (nbest oracle WER)
-./pruned_transducer_stateless2/decode.py \
+./lstm_transducer_stateless/decode.py \
     --epoch 28 \
     --avg 15 \
-    --exp-dir ./pruned_transducer_stateless2/exp \
+    --exp-dir ./lstm_transducer_stateless/exp \
     --max-duration 600 \
     --decoding-method fast_beam_search_nbest_oracle \
     --beam 20.0 \
@@ -81,30 +81,15 @@
     --nbest-scale 0.5
 
 (7) fast beam search (with LG)
-./pruned_transducer_stateless2/decode.py \
+./lstm_transducer_stateless/decode.py \
     --epoch 28 \
     --avg 15 \
-    --exp-dir ./pruned_transducer_stateless2/exp \
+    --exp-dir ./lstm_transducer_stateless/exp \
     --max-duration 600 \
     --decoding-method fast_beam_search_nbest_LG \
     --beam 20.0 \
     --max-contexts 8 \
     --max-states 64
-
-(8) decode in streaming mode (take greedy search as an example)
-./pruned_transducer_stateless2/decode.py \
-    --epoch 28 \
-    --avg 15 \
-    --simulate-streaming 1 \
-    --causal-convolution 1 \
-    --decode-chunk-size 16 \
-    --left-context 64 \
-    --exp-dir ./pruned_transducer_stateless2/exp \
-    --max-duration 600 \
-    --decoding-method greedy_search
-    --beam 20.0 \
-    --max-contexts 8 \
-    --max-states 64
 """
 
 
@@ -130,7 +115,7 @@
     greedy_search_batch,
     modified_beam_search,
 )
-from train import add_model_arguments, get_params, get_transducer_model
+from train import get_params, get_transducer_model
 
 from icefall.checkpoint import (
     average_checkpoints,
@@ -142,7 +127,6 @@
     AttributeDict,
     setup_logger,
     store_transcripts,
-    str2bool,
     write_error_stats,
 )
 
@@ -286,29 +270,6 @@ def get_parser():
         Used only when --decoding_method is greedy_search""",
     )
 
-    parser.add_argument(
-        "--simulate-streaming",
-        type=str2bool,
-        default=False,
-        help="""Whether to simulate streaming in decoding, this is a good way to
-        test a streaming model.
-        """,
-    )
-
-    parser.add_argument(
-        "--decode-chunk-size",
-        type=int,
-        default=16,
-        help="The chunk size for decoding (in frames after subsampling)",
-    )
-
-    parser.add_argument(
-        "--left-context",
-        type=int,
-        default=64,
-        help="left context can be seen during decoding (in frames after subsampling)",
-    )
-
     parser.add_argument(
         "--num-paths",
         type=int,
@@ -327,7 +288,6 @@ def get_parser():
         fast_beam_search_nbest_LG, and fast_beam_search_nbest_oracle""",
     )
 
-    add_model_arguments(parser)
     return parser
 
 
@@ -387,18 +347,9 @@ def decode_one_batch(
         value=LOG_EPS,
     )
 
-    if params.simulate_streaming:
-        encoder_out, encoder_out_lens, _ = model.encoder.streaming_forward(
-            x=feature,
-            x_lens=feature_lens,
-            chunk_size=params.decode_chunk_size,
-            left_context=params.left_context,
-            simulate_streaming=True,
-        )
-    else:
-        encoder_out, encoder_out_lens = model.encoder(
-            x=feature, x_lens=feature_lens
-        )
+    encoder_out, encoder_out_lens = model.encoder(
+        x=feature, x_lens=feature_lens
+    )
 
     hyps = []
 
@@ -674,10 +625,6 @@ def main():
     else:
         params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
 
-    if params.simulate_streaming:
-        params.suffix += f"-streaming-chunk-size-{params.decode_chunk_size}"
-        params.suffix += f"-left-context-{params.left_context}"
-
     if "fast_beam_search" in params.decoding_method:
         params.suffix += f"-beam-{params.beam}"
         params.suffix += f"-max-contexts-{params.max_contexts}"
@@ -712,11 +659,6 @@ def main():
     params.unk_id = sp.piece_to_id("<unk>")
     params.vocab_size = sp.get_piece_size()
 
-    if params.simulate_streaming:
-        assert (
-            params.causal_convolution
-        ), "Decoding in streaming requires causal convolution"
-
     logging.info(params)
 
     logging.info("About to create model")
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/pretrained.py b/egs/librispeech/ASR/lstm_transducer_stateless/pretrained.py
index f52cb22ab5..21bcf7cfd3 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/pretrained.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/pretrained.py
@@ -77,9 +77,7 @@
     modified_beam_search,
 )
 from torch.nn.utils.rnn import pad_sequence
-from train import add_model_arguments, get_params, get_transducer_model
-
-from icefall.utils import str2bool
+from train import get_params, get_transducer_model
 
 
 def get_parser():
@@ -180,30 +178,6 @@ def get_parser():
         """,
     )
 
-    parser.add_argument(
-        "--simulate-streaming",
-        type=str2bool,
-        default=False,
-        help="""Whether to simulate streaming in decoding, this is a good way to
-        test a streaming model.
-        """,
-    )
-
-    parser.add_argument(
-        "--decode-chunk-size",
-        type=int,
-        default=16,
-        help="The chunk size for decoding (in frames after subsampling)",
-    )
-    parser.add_argument(
-        "--left-context",
-        type=int,
-        default=64,
-        help="left context can be seen during decoding (in frames after subsampling)",
-    )
-
-    add_model_arguments(parser)
-
     return parser
 
 
@@ -248,11 +222,6 @@ def main():
     params.unk_id = sp.piece_to_id("<unk>")
     params.vocab_size = sp.get_piece_size()
 
-    if params.simulate_streaming:
-        assert (
-            params.causal_convolution
-        ), "Decoding in streaming requires causal convolution"
-
     logging.info(f"{params}")
 
     device = torch.device("cpu")
@@ -299,18 +268,9 @@ def main():
 
     feature_lengths = torch.tensor(feature_lengths, device=device)
 
-    if params.simulate_streaming:
-        encoder_out, encoder_out_lens, _ = model.encoder.streaming_forward(
-            x=features,
-            x_lens=feature_lengths,
-            chunk_size=params.decode_chunk_size,
-            left_context=params.left_context,
-            simulate_streaming=True,
-        )
-    else:
-        encoder_out, encoder_out_lens = model.encoder(
-            x=features, x_lens=feature_lengths
-        )
+    encoder_out, encoder_out_lens = model.encoder(
+        x=features, x_lens=feature_lengths
+    )
 
     num_waves = encoder_out.size(0)
     hyps = []
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/test_model.py b/egs/librispeech/ASR/lstm_transducer_stateless/test_model.py
index 1858d6bf02..5c49025bdc 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/test_model.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/test_model.py
@@ -34,31 +34,6 @@ def test_model():
     params.context_size = 2
     params.unk_id = 2
 
-    params.dynamic_chunk_training = False
-    params.short_chunk_size = 25
-    params.num_left_chunks = 4
-    params.causal_convolution = False
-
-    model = get_transducer_model(params)
-
-    num_param = sum([p.numel() for p in model.parameters()])
-    print(f"Number of model parameters: {num_param}")
-    model.__class__.forward = torch.jit.ignore(model.__class__.forward)
-    torch.jit.script(model)
-
-
-def test_model_streaming():
-    params = get_params()
-    params.vocab_size = 500
-    params.blank_id = 0
-    params.context_size = 2
-    params.unk_id = 2
-
-    params.dynamic_chunk_training = True
-    params.short_chunk_size = 25
-    params.num_left_chunks = 4
-    params.causal_convolution = True
-
     model = get_transducer_model(params)
 
     num_param = sum([p.numel() for p in model.parameters()])
@@ -69,7 +44,6 @@ def test_model_streaming():
 
 def main():
     test_model()
-    test_model_streaming()
 
 
 if __name__ == "__main__":
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/train.py b/egs/librispeech/ASR/lstm_transducer_stateless/train.py
index 13175c4c23..8ce5bdc54a 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/train.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/train.py
@@ -21,37 +21,24 @@
 
 export CUDA_VISIBLE_DEVICES="0,1,2,3"
 
-./pruned_transducer_stateless2/train.py \
+./lstm_transducer_stateless/train.py \
   --world-size 4 \
   --num-epochs 30 \
   --start-epoch 0 \
-  --exp-dir pruned_transducer_stateless2/exp \
+  --exp-dir lstm_transducer_stateless/exp \
   --full-libri 1 \
   --max-duration 300
 
 # For mix precision training:
 
-./pruned_transducer_stateless2/train.py \
+./lstm_transducer_stateless/train.py \
   --world-size 4 \
   --num-epochs 30 \
   --start-epoch 0 \
   --use-fp16 1 \
-  --exp-dir pruned_transducer_stateless2/exp \
+  --exp-dir lstm_transducer_stateless/exp \
   --full-libri 1 \
   --max-duration 550
-
-# train a streaming model
-./pruned_transducer_stateless2/train.py \
-  --world-size 4 \
-  --num-epochs 30 \
-  --start-epoch 0 \
-  --exp-dir pruned_transducer_stateless/exp \
-  --full-libri 1 \
-  --dynamic-chunk-training 1 \
-  --causal-convolution 1 \
-  --short-chunk-size 25 \
-  --num-left-chunks 4 \
-  --max-duration 300
 """
 
 
@@ -69,7 +56,7 @@
 import torch.multiprocessing as mp
 import torch.nn as nn
 from asr_datamodule import LibriSpeechAsrDataModule
-from conformer import Conformer
+from lstm import RNN
 from decoder import Decoder
 from joiner import Joiner
 from lhotse.cut import Cut
@@ -95,42 +82,6 @@
 ]
 
 
-def add_model_arguments(parser: argparse.ArgumentParser):
-    parser.add_argument(
-        "--dynamic-chunk-training",
-        type=str2bool,
-        default=False,
-        help="""Whether to use dynamic_chunk_training, if you want a streaming
-        model, this requires to be True.
-        """,
-    )
-
-    parser.add_argument(
-        "--causal-convolution",
-        type=str2bool,
-        default=False,
-        help="""Whether to use causal convolution, this requires to be True when
-        using dynamic_chunk_training.
-        """,
-    )
-
-    parser.add_argument(
-        "--short-chunk-size",
-        type=int,
-        default=25,
-        help="""Chunk length of dynamic training, the chunk size would be either
-        max sequence length of current batch or uniformly sampled from (1, short_chunk_size).
-        """,
-    )
-
-    parser.add_argument(
-        "--num-left-chunks",
-        type=int,
-        default=4,
-        help="How many left context can be seen in chunks when calculating attention.",
-    )
-
-
 def get_parser():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
@@ -311,8 +262,6 @@ def get_parser():
         help="Whether to use half precision training.",
     )
 
-    add_model_arguments(parser)
-
     return parser
 
 
@@ -374,7 +323,6 @@ def get_params() -> AttributeDict:
             "feature_dim": 80,
             "subsampling_factor": 4,
             "encoder_dim": 512,
-            "nhead": 8,
             "dim_feedforward": 2048,
             "num_encoder_layers": 12,
             # parameters for decoder
@@ -392,17 +340,12 @@ def get_params() -> AttributeDict:
 
 def get_encoder_model(params: AttributeDict) -> nn.Module:
     # TODO: We can add an option to switch between Conformer and Transformer
-    encoder = Conformer(
+    encoder = RNN(
         num_features=params.feature_dim,
         subsampling_factor=params.subsampling_factor,
         d_model=params.encoder_dim,
-        nhead=params.nhead,
         dim_feedforward=params.dim_feedforward,
         num_encoder_layers=params.num_encoder_layers,
-        dynamic_chunk_training=params.dynamic_chunk_training,
-        short_chunk_size=params.short_chunk_size,
-        num_left_chunks=params.num_left_chunks,
-        causal=params.causal_convolution,
     )
     return encoder
 

From 822cc78a9c459cf35403ab750b88e78fd0e655c2 Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Sun, 17 Jul 2022 15:47:43 +0800
Subject: [PATCH 09/40] copy streaming decoding files from
 pruned_transducer_stateless2

---
 .../decode_stream.py                          | 126 ++++
 .../streaming_decode.py                       | 687 ++++++++++++++++++
 2 files changed, 813 insertions(+)
 create mode 100644 egs/librispeech/ASR/lstm_transducer_stateless/decode_stream.py
 create mode 100755 egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py

diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/decode_stream.py b/egs/librispeech/ASR/lstm_transducer_stateless/decode_stream.py
new file mode 100644
index 0000000000..ba5e805553
--- /dev/null
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/decode_stream.py
@@ -0,0 +1,126 @@
+# Copyright    2022  Xiaomi Corp.        (authors: Wei Kang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import List, Optional, Tuple
+
+import k2
+import torch
+
+from icefall.utils import AttributeDict
+
+
+class DecodeStream(object):
+    def __init__(
+        self,
+        params: AttributeDict,
+        initial_states: List[torch.Tensor],
+        decoding_graph: Optional[k2.Fsa] = None,
+        device: torch.device = torch.device("cpu"),
+    ) -> None:
+        """
+        Args:
+          initial_states:
+            Initial decode states of the model, e.g. the return value of
+            `get_init_state` in conformer.py
+          decoding_graph:
+            Decoding graph used for decoding, may be a TrivialGraph or a HLG.
+            Used only when decoding_method is fast_beam_search.
+          device:
+            The device to run this stream.
+        """
+        if decoding_graph is not None:
+            assert device == decoding_graph.device
+
+        self.params = params
+        self.LOG_EPS = math.log(1e-10)
+
+        self.states = initial_states
+
+        # It contains a 2-D tensor representing the feature frames.
+        self.features: torch.Tensor = None
+
+        self.num_frames: int = 0
+        # how many frames have been processed. (before subsampling).
+        # we only modify this value in `func:get_feature_frames`.
+        self.num_processed_frames: int = 0
+
+        self._done: bool = False
+
+        # The transcript of current utterance.
+        self.ground_truth: str = ""
+
+        # The decoding result (partial or final) of current utterance.
+        self.hyp: List = []
+
+        # how many frames have been processed, after subsampling (i.e. a
+        # cumulative sum of the second return value of
+        # encoder.streaming_forward
+        self.done_frames: int = 0
+
+        self.pad_length = (
+            params.right_context + 2
+        ) * params.subsampling_factor + 3
+
+        if params.decoding_method == "greedy_search":
+            self.hyp = [params.blank_id] * params.context_size
+        elif params.decoding_method == "fast_beam_search":
+            # The rnnt_decoding_stream for fast_beam_search.
+            self.rnnt_decoding_stream: k2.RnntDecodingStream = (
+                k2.RnntDecodingStream(decoding_graph)
+            )
+        else:
+            assert (
+                False
+            ), f"Decoding method :{params.decoding_method} do not support."
+
+    @property
+    def done(self) -> bool:
+        """Return True if all the features are processed."""
+        return self._done
+
+    def set_features(
+        self,
+        features: torch.Tensor,
+    ) -> None:
+        """Set features tensor of current utterance."""
+        assert features.dim() == 2, features.dim()
+        self.features = torch.nn.functional.pad(
+            features,
+            (0, 0, 0, self.pad_length),
+            mode="constant",
+            value=self.LOG_EPS,
+        )
+        self.num_frames = self.features.size(0)
+
+    def get_feature_frames(self, chunk_size: int) -> Tuple[torch.Tensor, int]:
+        """Consume chunk_size frames of features"""
+        chunk_length = chunk_size + self.pad_length
+
+        ret_length = min(
+            self.num_frames - self.num_processed_frames, chunk_length
+        )
+
+        ret_features = self.features[
+            self.num_processed_frames : self.num_processed_frames  # noqa
+            + ret_length
+        ]
+
+        self.num_processed_frames += chunk_size
+        if self.num_processed_frames >= self.num_frames:
+            self._done = True
+
+        return ret_features, ret_length
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py b/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
new file mode 100755
index 0000000000..b3e1f04c34
--- /dev/null
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
@@ -0,0 +1,687 @@
+#!/usr/bin/env python3
+# Copyright 2022 Xiaomi Corporation (Authors: Wei Kang, Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Usage:
+./pruned_transducer_stateless2/streaming_decode.py \
+        --epoch 28 \
+        --avg 15 \
+        --left-context 32 \
+        --decode-chunk-size 8 \
+        --right-context 0 \
+        --exp-dir ./pruned_transducer_stateless2/exp \
+        --decoding_method greedy_search \
+        --num-decode-streams 1000
+"""
+
+import argparse
+import logging
+import math
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+import k2
+import numpy as np
+import sentencepiece as spm
+import torch
+import torch.nn as nn
+from asr_datamodule import LibriSpeechAsrDataModule
+from decode_stream import DecodeStream
+from kaldifeat import Fbank, FbankOptions
+from lhotse import CutSet
+from torch.nn.utils.rnn import pad_sequence
+from train import add_model_arguments, get_params, get_transducer_model
+
+from icefall.checkpoint import (
+    average_checkpoints,
+    find_checkpoints,
+    load_checkpoint,
+)
+from icefall.decode import one_best_decoding
+from icefall.utils import (
+    AttributeDict,
+    get_texts,
+    setup_logger,
+    store_transcripts,
+    write_error_stats,
+)
+
+LOG_EPS = math.log(1e-10)
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--epoch",
+        type=int,
+        default=28,
+        help="""It specifies the checkpoint to use for decoding.
+        Note: Epoch counts from 0.
+        You can specify --avg to use more checkpoints for model averaging.""",
+    )
+
+    parser.add_argument(
+        "--iter",
+        type=int,
+        default=0,
+        help="""If positive, --epoch is ignored and it
+        will use the checkpoint exp_dir/checkpoint-iter.pt.
+        You can specify --avg to use more checkpoints for model averaging.
+        """,
+    )
+
+    parser.add_argument(
+        "--avg",
+        type=int,
+        default=15,
+        help="Number of checkpoints to average. Automatically select "
+        "consecutive checkpoints before the checkpoint specified by "
+        "'--epoch' and '--iter'",
+    )
+
+    parser.add_argument(
+        "--exp-dir",
+        type=str,
+        default="pruned_transducer_stateless2/exp",
+        help="The experiment dir",
+    )
+
+    parser.add_argument(
+        "--bpe-model",
+        type=str,
+        default="data/lang_bpe_500/bpe.model",
+        help="Path to the BPE model",
+    )
+
+    parser.add_argument(
+        "--decoding-method",
+        type=str,
+        default="greedy_search",
+        help="""Support only greedy_search and fast_beam_search now.
+        """,
+    )
+
+    parser.add_argument(
+        "--beam",
+        type=float,
+        default=4,
+        help="""A floating point value to calculate the cutoff score during beam
+        search (i.e., `cutoff = max-score - beam`), which is the same as the
+        `beam` in Kaldi.
+        Used only when --decoding-method is fast_beam_search""",
+    )
+
+    parser.add_argument(
+        "--max-contexts",
+        type=int,
+        default=4,
+        help="""Used only when --decoding-method is
+        fast_beam_search""",
+    )
+
+    parser.add_argument(
+        "--max-states",
+        type=int,
+        default=32,
+        help="""Used only when --decoding-method is
+        fast_beam_search""",
+    )
+
+    parser.add_argument(
+        "--context-size",
+        type=int,
+        default=2,
+        help="The context size in the decoder. 1 means bigram; "
+        "2 means tri-gram",
+    )
+
+    parser.add_argument(
+        "--decode-chunk-size",
+        type=int,
+        default=16,
+        help="The chunk size for decoding (in frames after subsampling)",
+    )
+
+    parser.add_argument(
+        "--left-context",
+        type=int,
+        default=64,
+        help="left context can be seen during decoding (in frames after subsampling)",
+    )
+
+    parser.add_argument(
+        "--right-context",
+        type=int,
+        default=0,
+        help="right context can be seen during decoding (in frames after subsampling)",
+    )
+
+    parser.add_argument(
+        "--num-decode-streams",
+        type=int,
+        default=2000,
+        help="The number of streams that can be decoded parallel.",
+    )
+
+    add_model_arguments(parser)
+
+    return parser
+
+
+def greedy_search(
+    model: nn.Module,
+    encoder_out: torch.Tensor,
+    streams: List[DecodeStream],
+) -> List[List[int]]:
+
+    assert len(streams) == encoder_out.size(0)
+    assert encoder_out.ndim == 3
+
+    blank_id = model.decoder.blank_id
+    context_size = model.decoder.context_size
+    device = model.device
+    T = encoder_out.size(1)
+
+    decoder_input = torch.tensor(
+        [stream.hyp[-context_size:] for stream in streams],
+        device=device,
+        dtype=torch.int64,
+    )
+    # decoder_out is of shape (N, decoder_out_dim)
+    decoder_out = model.decoder(decoder_input, need_pad=False)
+    decoder_out = model.joiner.decoder_proj(decoder_out)
+    # logging.info(f"decoder_out shape : {decoder_out.shape}")
+
+    for t in range(T):
+        # current_encoder_out's shape: (batch_size, 1, encoder_out_dim)
+        current_encoder_out = encoder_out[:, t : t + 1, :]  # noqa
+
+        logits = model.joiner(
+            current_encoder_out.unsqueeze(2),
+            decoder_out.unsqueeze(1),
+            project_input=False,
+        )
+        # logits'shape (batch_size,  vocab_size)
+        logits = logits.squeeze(1).squeeze(1)
+
+        assert logits.ndim == 2, logits.shape
+        y = logits.argmax(dim=1).tolist()
+        emitted = False
+        for i, v in enumerate(y):
+            if v != blank_id:
+                streams[i].hyp.append(v)
+                emitted = True
+        if emitted:
+            # update decoder output
+            decoder_input = torch.tensor(
+                [stream.hyp[-context_size:] for stream in streams],
+                device=device,
+                dtype=torch.int64,
+            )
+            decoder_out = model.decoder(
+                decoder_input,
+                need_pad=False,
+            )
+            decoder_out = model.joiner.decoder_proj(decoder_out)
+
+    hyp_tokens = []
+    for stream in streams:
+        hyp_tokens.append(stream.hyp)
+    return hyp_tokens
+
+
+def fast_beam_search(
+    model: nn.Module,
+    encoder_out: torch.Tensor,
+    processed_lens: torch.Tensor,
+    decoding_streams: k2.RnntDecodingStreams,
+) -> List[List[int]]:
+
+    B, T, C = encoder_out.shape
+    for t in range(T):
+        # shape is a RaggedShape of shape (B, context)
+        # contexts is a Tensor of shape (shape.NumElements(), context_size)
+        shape, contexts = decoding_streams.get_contexts()
+        # `nn.Embedding()` in torch below v1.7.1 supports only torch.int64
+        contexts = contexts.to(torch.int64)
+        # decoder_out is of shape (shape.NumElements(), 1, decoder_out_dim)
+        decoder_out = model.decoder(contexts, need_pad=False)
+        decoder_out = model.joiner.decoder_proj(decoder_out)
+        # current_encoder_out is of shape
+        # (shape.NumElements(), 1, joiner_dim)
+        # fmt: off
+        current_encoder_out = torch.index_select(
+            encoder_out[:, t:t + 1, :], 0, shape.row_ids(1).to(torch.int64)
+        )
+        # fmt: on
+        logits = model.joiner(
+            current_encoder_out.unsqueeze(2),
+            decoder_out.unsqueeze(1),
+            project_input=False,
+        )
+        logits = logits.squeeze(1).squeeze(1)
+        log_probs = logits.log_softmax(dim=-1)
+        decoding_streams.advance(log_probs)
+
+    decoding_streams.terminate_and_flush_to_streams()
+
+    lattice = decoding_streams.format_output(processed_lens.tolist())
+    best_path = one_best_decoding(lattice)
+    hyp_tokens = get_texts(best_path)
+    return hyp_tokens
+
+
+def decode_one_chunk(
+    params: AttributeDict,
+    model: nn.Module,
+    decode_streams: List[DecodeStream],
+) -> List[int]:
+    """Decode one chunk frames of features for each decode_streams and
+    return the indexes of finished streams in a List.
+
+    Args:
+      params:
+        It's the return value of :func:`get_params`.
+      model:
+        The neural model.
+      decode_streams:
+        A List of DecodeStream, each belonging to an utterance.
+    Returns:
+      Return a List containing which DecodeStreams are finished.
+    """
+    device = model.device
+
+    features = []
+    feature_lens = []
+    states = []
+
+    rnnt_stream_list = []
+    processed_lens = []
+
+    for stream in decode_streams:
+        feat, feat_len = stream.get_feature_frames(
+            params.decode_chunk_size * params.subsampling_factor
+        )
+        features.append(feat)
+        feature_lens.append(feat_len)
+        states.append(stream.states)
+        processed_lens.append(stream.done_frames)
+        if params.decoding_method == "fast_beam_search":
+            rnnt_stream_list.append(stream.rnnt_decoding_stream)
+
+    feature_lens = torch.tensor(feature_lens, device=device)
+    features = pad_sequence(features, batch_first=True, padding_value=LOG_EPS)
+
+    # if T is less than 7 there will be an error in time reduction layer,
+    # because we subsample features with ((x_len - 1) // 2 - 1) // 2
+    # we plus 2 here because we will cut off one frame on each size of
+    # encoder_embed output as they see invalid paddings. so we need extra 2
+    # frames.
+    tail_length = 7 + (2 + params.right_context) * params.subsampling_factor
+    if features.size(1) < tail_length:
+        feature_lens += tail_length - features.size(1)
+        features = torch.cat(
+            [
+                features,
+                torch.tensor(
+                    LOG_EPS, dtype=features.dtype, device=device
+                ).expand(
+                    features.size(0),
+                    tail_length - features.size(1),
+                    features.size(2),
+                ),
+            ],
+            dim=1,
+        )
+
+    states = [
+        torch.stack([x[0] for x in states], dim=2),
+        torch.stack([x[1] for x in states], dim=2),
+    ]
+    processed_lens = torch.tensor(processed_lens, device=device)
+
+    encoder_out, encoder_out_lens, states = model.encoder.streaming_forward(
+        x=features,
+        x_lens=feature_lens,
+        states=states,
+        left_context=params.left_context,
+        right_context=params.right_context,
+        processed_lens=processed_lens,
+    )
+
+    encoder_out = model.joiner.encoder_proj(encoder_out)
+
+    if params.decoding_method == "greedy_search":
+        hyp_tokens = greedy_search(model, encoder_out, decode_streams)
+    elif params.decoding_method == "fast_beam_search":
+        config = k2.RnntDecodingConfig(
+            vocab_size=params.vocab_size,
+            decoder_history_len=params.context_size,
+            beam=params.beam,
+            max_contexts=params.max_contexts,
+            max_states=params.max_states,
+        )
+        decoding_streams = k2.RnntDecodingStreams(rnnt_stream_list, config)
+        processed_lens = processed_lens + encoder_out_lens
+        hyp_tokens = fast_beam_search(
+            model, encoder_out, processed_lens, decoding_streams
+        )
+    else:
+        assert False
+
+    states = [torch.unbind(states[0], dim=2), torch.unbind(states[1], dim=2)]
+
+    finished_streams = []
+    for i in range(len(decode_streams)):
+        decode_streams[i].states = [states[0][i], states[1][i]]
+        decode_streams[i].done_frames += encoder_out_lens[i]
+        if params.decoding_method == "fast_beam_search":
+            decode_streams[i].hyp = hyp_tokens[i]
+        if decode_streams[i].done:
+            finished_streams.append(i)
+
+    return finished_streams
+
+
+def decode_dataset(
+    cuts: CutSet,
+    params: AttributeDict,
+    model: nn.Module,
+    sp: spm.SentencePieceProcessor,
+    decoding_graph: Optional[k2.Fsa] = None,
+) -> Dict[str, List[Tuple[List[str], List[str]]]]:
+    """Decode dataset.
+
+    Args:
+      cuts:
+        Lhotse Cutset containing the dataset to decode.
+      params:
+        It is returned by :func:`get_params`.
+      model:
+        The neural model.
+      sp:
+        The BPE model.
+      decoding_graph:
+        The decoding graph. Can be either a `k2.trivial_graph` or HLG. Used
+        only when --decoding-method is fast_beam_search.
+    Returns:
+      Return a dict, whose key may be "greedy_search" if greedy search
+      is used, or it may be "beam_7" if beam size of 7 is used.
+      Its value is a list of tuples. Each tuple contains two elements:
+      The first is the reference transcript, and the second is the
+      predicted result.
+    """
+    device = model.device
+
+    opts = FbankOptions()
+    opts.device = device
+    opts.frame_opts.dither = 0
+    opts.frame_opts.snip_edges = False
+    opts.frame_opts.samp_freq = 16000
+    opts.mel_opts.num_bins = 80
+
+    log_interval = 50
+
+    decode_results = []
+    # Contain decode streams currently running.
+    decode_streams = []
+    initial_states = model.encoder.get_init_state(
+        params.left_context, device=device
+    )
+    for num, cut in enumerate(cuts):
+        # each utterance has a DecodeStream.
+        decode_stream = DecodeStream(
+            params=params,
+            initial_states=initial_states,
+            decoding_graph=decoding_graph,
+            device=device,
+        )
+
+        audio: np.ndarray = cut.load_audio()
+        # audio.shape: (1, num_samples)
+        assert len(audio.shape) == 2
+        assert audio.shape[0] == 1, "Should be single channel"
+        assert audio.dtype == np.float32, audio.dtype
+
+        # The trained model is using normalized samples
+        assert audio.max() <= 1, "Should be normalized to [-1, 1])"
+
+        samples = torch.from_numpy(audio).squeeze(0)
+
+        fbank = Fbank(opts)
+        feature = fbank(samples.to(device))
+        decode_stream.set_features(feature)
+        decode_stream.ground_truth = cut.supervisions[0].text
+
+        decode_streams.append(decode_stream)
+
+        while len(decode_streams) >= params.num_decode_streams:
+            finished_streams = decode_one_chunk(
+                params=params, model=model, decode_streams=decode_streams
+            )
+            for i in sorted(finished_streams, reverse=True):
+                hyp = decode_streams[i].hyp
+                if params.decoding_method == "greedy_search":
+                    hyp = hyp[params.context_size :]  # noqa
+                decode_results.append(
+                    (
+                        decode_streams[i].ground_truth.split(),
+                        sp.decode(hyp).split(),
+                    )
+                )
+                del decode_streams[i]
+
+        if num % log_interval == 0:
+            logging.info(f"Cuts processed until now is {num}.")
+
+    # decode final chunks of last sequences
+    while len(decode_streams):
+        finished_streams = decode_one_chunk(
+            params=params, model=model, decode_streams=decode_streams
+        )
+        for i in sorted(finished_streams, reverse=True):
+            hyp = decode_streams[i].hyp
+            if params.decoding_method == "greedy_search":
+                hyp = hyp[params.context_size :]  # noqa
+            decode_results.append(
+                (
+                    decode_streams[i].ground_truth.split(),
+                    sp.decode(hyp).split(),
+                )
+            )
+            del decode_streams[i]
+
+    key = "greedy_search"
+    if params.decoding_method == "fast_beam_search":
+        key = (
+            f"beam_{params.beam}_"
+            f"max_contexts_{params.max_contexts}_"
+            f"max_states_{params.max_states}"
+        )
+    return {key: decode_results}
+
+
+def save_results(
+    params: AttributeDict,
+    test_set_name: str,
+    results_dict: Dict[str, List[Tuple[List[str], List[str]]]],
+):
+    test_set_wers = dict()
+    for key, results in results_dict.items():
+        recog_path = (
+            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
+        )
+        # sort results so we can easily compare the difference between two
+        # recognition results
+        results = sorted(results)
+        store_transcripts(filename=recog_path, texts=results)
+        logging.info(f"The transcripts are stored in {recog_path}")
+
+        # The following prints out WERs, per-word error statistics and aligned
+        # ref/hyp pairs.
+        errs_filename = (
+            params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
+        )
+        with open(errs_filename, "w") as f:
+            wer = write_error_stats(
+                f, f"{test_set_name}-{key}", results, enable_log=True
+            )
+            test_set_wers[key] = wer
+
+        logging.info("Wrote detailed error stats to {}".format(errs_filename))
+
+    test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
+    errs_info = (
+        params.res_dir
+        / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
+    )
+    with open(errs_info, "w") as f:
+        print("settings\tWER", file=f)
+        for key, val in test_set_wers:
+            print("{}\t{}".format(key, val), file=f)
+
+    s = "\nFor {}, WER of different settings are:\n".format(test_set_name)
+    note = "\tbest for {}".format(test_set_name)
+    for key, val in test_set_wers:
+        s += "{}\t{}{}\n".format(key, val, note)
+        note = ""
+    logging.info(s)
+
+
+@torch.no_grad()
+def main():
+    parser = get_parser()
+    LibriSpeechAsrDataModule.add_arguments(parser)
+    args = parser.parse_args()
+    args.exp_dir = Path(args.exp_dir)
+
+    params = get_params()
+    params.update(vars(args))
+
+    params.res_dir = params.exp_dir / "streaming" / params.decoding_method
+
+    if params.iter > 0:
+        params.suffix = f"iter-{params.iter}-avg-{params.avg}"
+    else:
+        params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
+
+    # for streaming
+    params.suffix += f"-streaming-chunk-size-{params.decode_chunk_size}"
+    params.suffix += f"-left-context-{params.left_context}"
+    params.suffix += f"-right-context-{params.right_context}"
+
+    # for fast_beam_search
+    if params.decoding_method == "fast_beam_search":
+        params.suffix += f"-beam-{params.beam}"
+        params.suffix += f"-max-contexts-{params.max_contexts}"
+        params.suffix += f"-max-states-{params.max_states}"
+
+    setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
+    logging.info("Decoding started")
+
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda", 0)
+
+    logging.info(f"Device: {device}")
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(params.bpe_model)
+
+    # <blk> and <unk> are defined in local/train_bpe_model.py
+    params.blank_id = sp.piece_to_id("<blk>")
+    params.unk_id = sp.piece_to_id("<unk>")
+    params.vocab_size = sp.get_piece_size()
+    # Decoding in streaming requires causal convolution
+    params.causal_convolution = True
+
+    logging.info(params)
+
+    logging.info("About to create model")
+    model = get_transducer_model(params)
+
+    if params.iter > 0:
+        filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+            : params.avg
+        ]
+        if len(filenames) == 0:
+            raise ValueError(
+                f"No checkpoints found for"
+                f" --iter {params.iter}, --avg {params.avg}"
+            )
+        elif len(filenames) < params.avg:
+            raise ValueError(
+                f"Not enough checkpoints ({len(filenames)}) found for"
+                f" --iter {params.iter}, --avg {params.avg}"
+            )
+        logging.info(f"averaging {filenames}")
+        model.to(device)
+        model.load_state_dict(average_checkpoints(filenames, device=device))
+    elif params.avg == 1:
+        load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
+    else:
+        start = params.epoch - params.avg + 1
+        filenames = []
+        for i in range(start, params.epoch + 1):
+            if start >= 0:
+                filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
+        logging.info(f"averaging {filenames}")
+        model.to(device)
+        model.load_state_dict(average_checkpoints(filenames, device=device))
+
+    model.to(device)
+    model.eval()
+    model.device = device
+
+    decoding_graph = None
+    if params.decoding_method == "fast_beam_search":
+        decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
+
+    num_param = sum([p.numel() for p in model.parameters()])
+    logging.info(f"Number of model parameters: {num_param}")
+
+    librispeech = LibriSpeechAsrDataModule(args)
+
+    test_clean_cuts = librispeech.test_clean_cuts()
+    test_other_cuts = librispeech.test_other_cuts()
+
+    test_sets = ["test-clean", "test-other"]
+    test_cuts = [test_clean_cuts, test_other_cuts]
+
+    for test_set, test_cut in zip(test_sets, test_cuts):
+        results_dict = decode_dataset(
+            cuts=test_cut,
+            params=params,
+            model=model,
+            sp=sp,
+            decoding_graph=decoding_graph,
+        )
+
+        save_results(
+            params=params,
+            test_set_name=test_set,
+            results_dict=results_dict,
+        )
+
+    logging.info("Done!")
+
+
+if __name__ == "__main__":
+    main()

From 5c669b77160e32cdd2cf1823a0a95dffbf3e6695 Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Sun, 17 Jul 2022 16:09:24 +0800
Subject: [PATCH 10/40] modify streaming decoding files

---
 .../decode_stream.py                          |  6 +--
 .../streaming_decode.py                       | 51 +++++--------------
 2 files changed, 16 insertions(+), 41 deletions(-)

diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/decode_stream.py b/egs/librispeech/ASR/lstm_transducer_stateless/decode_stream.py
index ba5e805553..7df96a9afb 100644
--- a/egs/librispeech/ASR/lstm_transducer_stateless/decode_stream.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/decode_stream.py
@@ -71,9 +71,9 @@ def __init__(
         # encoder.streaming_forward
         self.done_frames: int = 0
 
-        self.pad_length = (
-            params.right_context + 2
-        ) * params.subsampling_factor + 3
+        # add 2 here since we will drop the first and last frames after
+        # the convolutional subsampling module
+        self.pad_length = 2 * params.subsampling_factor + 3
 
         if params.decoding_method == "greedy_search":
             self.hyp = [params.blank_id] * params.context_size
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py b/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
index b3e1f04c34..6ba72ca2aa 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
@@ -17,15 +17,13 @@
 
 """
 Usage:
-./pruned_transducer_stateless2/streaming_decode.py \
-        --epoch 28 \
-        --avg 15 \
-        --left-context 32 \
-        --decode-chunk-size 8 \
-        --right-context 0 \
-        --exp-dir ./pruned_transducer_stateless2/exp \
-        --decoding_method greedy_search \
-        --num-decode-streams 1000
+./lstm_transducer_stateless/streaming_decode.py \
+    --epoch 28 \
+    --avg 15 \
+    --exp-dir ./lstm_transducer_stateless/exp \
+    --decoding-method greedy_search \
+    --decode-chunk-size 1 \
+    --num-decode-streams 1000
 """
 
 import argparse
@@ -44,7 +42,7 @@
 from kaldifeat import Fbank, FbankOptions
 from lhotse import CutSet
 from torch.nn.utils.rnn import pad_sequence
-from train import add_model_arguments, get_params, get_transducer_model
+from train import get_params, get_transducer_model
 
 from icefall.checkpoint import (
     average_checkpoints,
@@ -155,24 +153,10 @@ def get_parser():
     parser.add_argument(
         "--decode-chunk-size",
         type=int,
-        default=16,
+        default=1,
         help="The chunk size for decoding (in frames after subsampling)",
     )
 
-    parser.add_argument(
-        "--left-context",
-        type=int,
-        default=64,
-        help="left context can be seen during decoding (in frames after subsampling)",
-    )
-
-    parser.add_argument(
-        "--right-context",
-        type=int,
-        default=0,
-        help="right context can be seen during decoding (in frames after subsampling)",
-    )
-
     parser.add_argument(
         "--num-decode-streams",
         type=int,
@@ -180,8 +164,6 @@ def get_parser():
         help="The number of streams that can be decoded parallel.",
     )
 
-    add_model_arguments(parser)
-
     return parser
 
 
@@ -334,7 +316,7 @@ def decode_one_chunk(
     # we plus 2 here because we will cut off one frame on each size of
     # encoder_embed output as they see invalid paddings. so we need extra 2
     # frames.
-    tail_length = 7 + (2 + params.right_context) * params.subsampling_factor
+    tail_length = 7 + 2 * params.subsampling_factor
     if features.size(1) < tail_length:
         feature_lens += tail_length - features.size(1)
         features = torch.cat(
@@ -357,13 +339,10 @@ def decode_one_chunk(
     ]
     processed_lens = torch.tensor(processed_lens, device=device)
 
-    encoder_out, encoder_out_lens, states = model.encoder.streaming_forward(
+    encoder_out, encoder_out_lens, states = model.encoder.infer(
         x=features,
         x_lens=feature_lens,
         states=states,
-        left_context=params.left_context,
-        right_context=params.right_context,
-        processed_lens=processed_lens,
     )
 
     encoder_out = model.joiner.encoder_proj(encoder_out)
@@ -442,8 +421,8 @@ def decode_dataset(
     decode_results = []
     # Contain decode streams currently running.
     decode_streams = []
-    initial_states = model.encoder.get_init_state(
-        params.left_context, device=device
+    initial_states = model.encoder.get_init_states(
+        device=device
     )
     for num, cut in enumerate(cuts):
         # each utterance has a DecodeStream.
@@ -584,8 +563,6 @@ def main():
 
     # for streaming
     params.suffix += f"-streaming-chunk-size-{params.decode_chunk_size}"
-    params.suffix += f"-left-context-{params.left_context}"
-    params.suffix += f"-right-context-{params.right_context}"
 
     # for fast_beam_search
     if params.decoding_method == "fast_beam_search":
@@ -609,8 +586,6 @@ def main():
     params.blank_id = sp.piece_to_id("<blk>")
     params.unk_id = sp.piece_to_id("<unk>")
     params.vocab_size = sp.get_piece_size()
-    # Decoding in streaming requires causal convolution
-    params.causal_convolution = True
 
     logging.info(params)
 

From 539a9d75d4b681f2ae51f2e999ef5b5835a3aa98 Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Sun, 17 Jul 2022 17:07:14 +0800
Subject: [PATCH 11/40] simplified code in ScaledLSTM

---
 .../ASR/lstm_transducer_stateless/lstm.py     | 10 +--
 .../pruned_transducer_stateless2/scaling.py   | 88 +++++--------------
 2 files changed, 29 insertions(+), 69 deletions(-)

diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py b/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
index 52424c0bb0..781176ea97 100644
--- a/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
@@ -15,7 +15,7 @@
 # limitations under the License.
 
 import copy
-from typing import Tuple
+from typing import Optional, Tuple
 
 import torch
 from encoder_interface import EncoderInterface
@@ -73,7 +73,7 @@ def __init__(
         #   (2) embedding: num_features -> d_model
         self.encoder_embed = Conv2dSubsampling(num_features, d_model)
 
-        self.encoder_layers = num_encoder_layers
+        self.num_encoder_layers = num_encoder_layers
         self.d_model = d_model
 
         encoder_layer = RNNEncoderLayer(
@@ -119,8 +119,8 @@ def forward(
         return x, lengths
 
     @torch.jit.export
-    def get_init_state(self, device: torch.device) -> torch.Tensor:
-        """Get model initial state."""
+    def get_init_states(self, device: torch.device) -> torch.Tensor:
+        """Get model initial states."""
         init_states = torch.zeros(
             (2, self.num_encoder_layers, self.d_model), device=device
         )
@@ -283,7 +283,7 @@ def infer(
 
         # lstm module
         # The required shapes of h_0 and c_0 are both (1, N, E).
-        src_lstm, new_states = self.lstm(src, states.unbind(dim=0))
+        src_lstm, new_states = self.lstm(src, (states[0], states[1]))
         new_states = torch.stack(new_states, dim=0)
         src = src + self.dropout(src_lstm)
 
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py b/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
index d0c16cd1e4..6446b6704f 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
@@ -22,7 +22,6 @@
 import torch
 import torch.nn as nn
 from torch import Tensor, _VF
-from torch.nn.utils.rnn import PackedSequence
 
 
 def _ntuple(n):
@@ -428,84 +427,45 @@ def get_flat_weights(self):
             )
         return flat_weights
 
-    def forward(self, input, hx=None):
-        # This function is copied from https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/rnn.py  # noqa
-        # The only change is for calling `_VF.lstm()`:
+    def forward(
+        self, input: Tensor, hx: Optional[Tuple[Tensor, Tensor]] = None
+    ):
+        # This function is modified from https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/rnn.py  # noqa
+        # The change for calling `_VF.lstm()` is:
         # self._flat_weights -> self.get_flat_weights()
-        orig_input = input
-        # xxx: isinstance check needs to be in conditional for TorchScript to compile
-        if isinstance(orig_input, PackedSequence):
-            input, batch_sizes, sorted_indices, unsorted_indices = input
-            max_batch_size = batch_sizes[0]
-            max_batch_size = int(max_batch_size)
-        else:
-            batch_sizes = None
-            max_batch_size = (
-                input.size(0) if self.batch_first else input.size(1)
-            )
-            sorted_indices = None
-            unsorted_indices = None
-
         if hx is None:
-            num_directions = 2 if self.bidirectional else 1
-            real_hidden_size = (
-                self.proj_size if self.proj_size > 0 else self.hidden_size
-            )
             h_zeros = torch.zeros(
-                self.num_layers * num_directions,
-                max_batch_size,
-                real_hidden_size,
+                1,
+                input.size(1),
+                self.hidden_size,
                 dtype=input.dtype,
                 device=input.device,
             )
             c_zeros = torch.zeros(
-                self.num_layers * num_directions,
-                max_batch_size,
+                1,
+                input.size(1),
                 self.hidden_size,
                 dtype=input.dtype,
                 device=input.device,
             )
             hx = (h_zeros, c_zeros)
-        else:
-            # Each batch of the hidden state should match the input sequence that
-            # the user believes he/she is passing in.
-            hx = self.permute_hidden(hx, sorted_indices)
 
-        self.check_forward_args(input, hx, batch_sizes)
-        if batch_sizes is None:
-            result = _VF.lstm(
-                input,
-                hx,
-                self.get_flat_weights(),
-                self.bias,
-                self.num_layers,
-                self.dropout,
-                self.training,
-                self.bidirectional,
-                self.batch_first,
-            )
-        else:
-            result = _VF.lstm(
-                input,
-                batch_sizes,
-                hx,
-                self.get_flat_weights(),
-                self.bias,
-                self.num_layers,
-                self.dropout,
-                self.training,
-                self.bidirectional,
-            )
+        self.check_forward_args(input, hx, None)
+        result = _VF.lstm(
+            input,
+            hx,
+            self.get_flat_weights(),
+            self.bias,
+            self.num_layers,
+            self.dropout,
+            self.training,
+            self.bidirectional,
+            self.batch_first,
+        )
+
         output = result[0]
         hidden = result[1:]
-        # xxx: isinstance check needs to be in conditional for TorchScript to compile
-        if isinstance(orig_input, PackedSequence):
-            output_packed = PackedSequence(
-                output, batch_sizes, sorted_indices, unsorted_indices
-            )
-            return output_packed, self.permute_hidden(hidden, unsorted_indices)
-        else:
-            return output, self.permute_hidden(hidden, unsorted_indices)
+        return output, hidden
 
 
 class ActivationBalancer(torch.nn.Module):

From 125eac8deee867a7beaebfb5d0409d6391c31841 Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Sun, 17 Jul 2022 20:35:29 +0800
Subject: [PATCH 12/40] flat weights after scaling

---
 .../pruned_transducer_stateless2/scaling.py   | 70 +++++++++++++++++--
 1 file changed, 66 insertions(+), 4 deletions(-)

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py b/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
index 6446b6704f..fe265eb81d 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
@@ -17,12 +17,14 @@
 
 import collections
 from itertools import repeat
-from typing import Optional, Tuple
+from typing import List, Optional, Tuple
 
 import torch
 import torch.nn as nn
 from torch import Tensor, _VF
 
+import torch.backends.cudnn.rnn as rnn
+
 
 def _ntuple(n):
     def parse(x):
@@ -419,12 +421,72 @@ def _reset_parameters(self, initial_speed: float):
             elif "bias" in name:
                 nn.init.constant_(self._flat_weights[idx], 0.0)
 
-    def get_flat_weights(self):
+    def _flatten_parameters(self, flat_weights) -> None:
+        """Resets parameter data pointer so that they can use faster code paths.
+
+        Right now, this works only if the module is on the GPU and cuDNN is enabled.
+        Otherwise, it's a no-op.
+
+        This function is modified from https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/rnn.py  # noqa
+        """
+        # Short-circuits if _flat_weights is only partially instantiated
+        if len(flat_weights) != len(self._flat_weights_names):
+            return
+
+        for w in flat_weights:
+            if not isinstance(w, Tensor):
+                return
+        # Short-circuits if any tensor in flat_weights is not acceptable to cuDNN
+        # or the tensors in flat_weights are of different dtypes
+
+        first_fw = flat_weights[0]
+        dtype = first_fw.dtype
+        for fw in flat_weights:
+            if (
+                not isinstance(fw.data, Tensor)
+                or not (fw.data.dtype == dtype)
+                or not fw.data.is_cuda
+                or not torch.backends.cudnn.is_acceptable(fw.data)
+            ):
+                return
+
+        # If any parameters alias, we fall back to the slower, copying code path. This is
+        # a sufficient check, because overlapping parameter buffers that don't completely
+        # alias would break the assumptions of the uniqueness check in
+        # Module.named_parameters().
+        unique_data_ptrs = set(p.data_ptr() for p in flat_weights)
+        if len(unique_data_ptrs) != len(flat_weights):
+            return
+
+        with torch.cuda.device_of(first_fw):
+
+            # Note: no_grad() is necessary since _cudnn_rnn_flatten_weight is
+            # an inplace operation on the given flat_weights
+            with torch.no_grad():
+                if torch._use_cudnn_rnn_flatten_weight():
+                    num_weights = 4 if self.bias else 2
+                    if self.proj_size > 0:
+                        num_weights += 1
+                    torch._cudnn_rnn_flatten_weight(
+                        flat_weights,
+                        num_weights,
+                        self.input_size,
+                        rnn.get_cudnn_mode(self.mode),
+                        self.hidden_size,
+                        self.proj_size,
+                        self.num_layers,
+                        self.batch_first,
+                        bool(self.bidirectional),
+                    )
+
+    def _get_flat_weights(self):
+        """Get scaled weights, and resets their data pointer."""
         flat_weights = []
         for idx in range(len(self._flat_weights_names)):
             flat_weights.append(
                 self._flat_weights[idx] * self._scales[idx].exp()
             )
+        self._flatten_parameters(flat_weights)
         return flat_weights
 
     def forward(
@@ -432,7 +494,7 @@ def forward(
     ):
         # This function is modified from https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/rnn.py  # noqa
         # The change for calling `_VF.lstm()` is:
-        # self._flat_weights -> self.get_flat_weights()
+        # self._flat_weights -> self._get_flat_weights()
         if hx is None:
             h_zeros = torch.zeros(
                 1,
@@ -454,7 +516,7 @@ def forward(
         result = _VF.lstm(
             input,
             hx,
-            self.get_flat_weights(),
+            self._get_flat_weights(),
             self.bias,
             self.num_layers,
             self.dropout,

From ce2d8171147a00196ac7a3955f9292a9f7e6aa2c Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Sun, 17 Jul 2022 20:36:20 +0800
Subject: [PATCH 13/40] pruned2 -> pruned4

---
 .../ASR/lstm_transducer_stateless/decode.py   | 159 +++++++++-----
 .../streaming_decode.py                       | 139 ++++++++----
 .../ASR/lstm_transducer_stateless/train.py    | 198 ++++++++++--------
 3 files changed, 318 insertions(+), 178 deletions(-)

diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/decode.py b/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
index 34e8e8fb99..f7e3677da5 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 #
-# Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang)
+# Copyright 2021-2022 Xiaomi Corporation (Author: Fangjun Kuang,
+#                                                 Zengwei Yao)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -19,7 +20,7 @@
 Usage:
 (1) greedy search
 ./lstm_transducer_stateless/decode.py \
-    --epoch 28 \
+    --epoch 30 \
     --avg 15 \
     --exp-dir ./lstm_transducer_stateless/exp \
     --max-duration 600 \
@@ -27,7 +28,7 @@
 
 (2) beam search (not recommended)
 ./lstm_transducer_stateless/decode.py \
-    --epoch 28 \
+    --epoch 30 \
     --avg 15 \
     --exp-dir ./lstm_transducer_stateless/exp \
     --max-duration 600 \
@@ -36,7 +37,7 @@
 
 (3) modified beam search
 ./lstm_transducer_stateless/decode.py \
-    --epoch 28 \
+    --epoch 30 \
     --avg 15 \
     --exp-dir ./lstm_transducer_stateless/exp \
     --max-duration 600 \
@@ -45,7 +46,7 @@
 
 (4) fast beam search (one best)
 ./lstm_transducer_stateless/decode.py \
-    --epoch 28 \
+    --epoch 30 \
     --avg 15 \
     --exp-dir ./lstm_transducer_stateless/exp \
     --max-duration 600 \
@@ -56,9 +57,9 @@
 
 (5) fast beam search (nbest)
 ./lstm_transducer_stateless/decode.py \
-    --epoch 28 \
+    --epoch 30 \
     --avg 15 \
-    --exp-dir ./lstm_transducer_stateless/exp \
+    --exp-dir ./lstm_transducer_stateless/exp \
     --max-duration 600 \
     --decoding-method fast_beam_search_nbest \
     --beam 20.0 \
@@ -69,7 +70,7 @@
 
 (6) fast beam search (nbest oracle WER)
 ./lstm_transducer_stateless/decode.py \
-    --epoch 28 \
+    --epoch 30 \
     --avg 15 \
     --exp-dir ./lstm_transducer_stateless/exp \
     --max-duration 600 \
@@ -119,6 +120,7 @@
 
 from icefall.checkpoint import (
     average_checkpoints,
+    average_checkpoints_with_averaged_model,
     find_checkpoints,
     load_checkpoint,
 )
@@ -127,6 +129,7 @@
     AttributeDict,
     setup_logger,
     store_transcripts,
+    str2bool,
     write_error_stats,
 )
 
@@ -141,9 +144,9 @@ def get_parser():
     parser.add_argument(
         "--epoch",
         type=int,
-        default=28,
+        default=30,
         help="""It specifies the checkpoint to use for decoding.
-        Note: Epoch counts from 0.
+        Note: Epoch counts from 1.
         You can specify --avg to use more checkpoints for model averaging.""",
     )
 
@@ -166,10 +169,21 @@ def get_parser():
         "'--epoch' and '--iter'",
     )
 
+    parser.add_argument(
+        "--use-averaged-model",
+        type=str2bool,
+        default=True,
+        help="Whether to load averaged model. Currently it only supports "
+        "using --epoch. If True, it would decode with the averaged model "
+        "over the epoch range from `epoch-avg` (excluded) to `epoch`."
+        "Actually only the models with epoch number of `epoch-avg` and "
+        "`epoch` are loaded for averaging. ",
+    )
+
     parser.add_argument(
         "--exp-dir",
         type=str,
-        default="pruned_transducer_stateless2/exp",
+        default="lstm_transducer_stateless/exp",
         help="The experiment dir",
     )
 
@@ -330,7 +344,7 @@ def decode_one_batch(
       Return the decoding result. See above description for the format of
       the returned dict.
     """
-    device = model.device
+    device = next(model.parameters()).device
     feature = batch["inputs"]
     assert feature.ndim == 3
 
@@ -433,7 +447,7 @@ def decode_one_batch(
 
         for i in range(batch_size):
             # fmt: off
-            encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
+            encoder_out_i = encoder_out[i:i + 1, :encoder_out_lens[i]]
             # fmt: on
             if params.decoding_method == "greedy_search":
                 hyp = greedy_search(
@@ -455,14 +469,6 @@ def decode_one_batch(
 
     if params.decoding_method == "greedy_search":
         return {"greedy_search": hyps}
-    elif params.decoding_method == "fast_beam_search":
-        return {
-            (
-                f"beam_{params.beam}_"
-                f"max_contexts_{params.max_contexts}_"
-                f"max_states_{params.max_states}"
-            ): hyps
-        }
     elif "fast_beam_search" in params.decoding_method:
         key = f"beam_{params.beam}_"
         key += f"max_contexts_{params.max_contexts}_"
@@ -530,8 +536,8 @@ def decode_dataset(
             params=params,
             model=model,
             sp=sp,
-            word_table=word_table,
             decoding_graph=decoding_graph,
+            word_table=word_table,
             batch=batch,
         )
 
@@ -642,6 +648,9 @@ def main():
         params.suffix += f"-context-{params.context_size}"
         params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"
 
+    if params.use_averaged_model:
+        params.suffix += "-use-averaged-model"
+
     setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
     logging.info("Decoding started")
 
@@ -659,43 +668,95 @@ def main():
     params.unk_id = sp.piece_to_id("<unk>")
     params.vocab_size = sp.get_piece_size()
 
+    if params.simulate_streaming:
+        assert (
+            params.causal_convolution
+        ), "Decoding in streaming requires causal convolution"
+
     logging.info(params)
 
     logging.info("About to create model")
     model = get_transducer_model(params)
 
-    if params.iter > 0:
-        filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
-            : params.avg
-        ]
-        if len(filenames) == 0:
-            raise ValueError(
-                f"No checkpoints found for"
-                f" --iter {params.iter}, --avg {params.avg}"
+    if not params.use_averaged_model:
+        if params.iter > 0:
+            filenames = find_checkpoints(
+                params.exp_dir, iteration=-params.iter
+            )[: params.avg]
+            if len(filenames) == 0:
+                raise ValueError(
+                    f"No checkpoints found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            elif len(filenames) < params.avg:
+                raise ValueError(
+                    f"Not enough checkpoints ({len(filenames)}) found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            logging.info(f"averaging {filenames}")
+            model.to(device)
+            model.load_state_dict(average_checkpoints(filenames, device=device))
+        elif params.avg == 1:
+            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
+        else:
+            start = params.epoch - params.avg + 1
+            filenames = []
+            for i in range(start, params.epoch + 1):
+                if i >= 1:
+                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
+            logging.info(f"averaging {filenames}")
+            model.to(device)
+            model.load_state_dict(average_checkpoints(filenames, device=device))
+    else:
+        if params.iter > 0:
+            filenames = find_checkpoints(
+                params.exp_dir, iteration=-params.iter
+            )[: params.avg + 1]
+            if len(filenames) == 0:
+                raise ValueError(
+                    f"No checkpoints found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            elif len(filenames) < params.avg + 1:
+                raise ValueError(
+                    f"Not enough checkpoints ({len(filenames)}) found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            filename_start = filenames[-1]
+            filename_end = filenames[0]
+            logging.info(
+                "Calculating the averaged model over iteration checkpoints"
+                f" from {filename_start} (excluded) to {filename_end}"
             )
-        elif len(filenames) < params.avg:
-            raise ValueError(
-                f"Not enough checkpoints ({len(filenames)}) found for"
-                f" --iter {params.iter}, --avg {params.avg}"
+            model.to(device)
+            model.load_state_dict(
+                average_checkpoints_with_averaged_model(
+                    filename_start=filename_start,
+                    filename_end=filename_end,
+                    device=device,
+                )
+            )
+        else:
+            assert params.avg > 0, params.avg
+            start = params.epoch - params.avg
+            assert start >= 1, start
+            filename_start = f"{params.exp_dir}/epoch-{start}.pt"
+            filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
+            logging.info(
+                f"Calculating the averaged model over epoch range from "
+                f"{start} (excluded) to {params.epoch}"
+            )
+            model.to(device)
+            model.load_state_dict(
+                average_checkpoints_with_averaged_model(
+                    filename_start=filename_start,
+                    filename_end=filename_end,
+                    device=device,
+                )
             )
-        logging.info(f"averaging {filenames}")
-        model.to(device)
-        model.load_state_dict(average_checkpoints(filenames, device=device))
-    elif params.avg == 1:
-        load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
-    else:
-        start = params.epoch - params.avg + 1
-        filenames = []
-        for i in range(start, params.epoch + 1):
-            if start >= 0:
-                filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
-        logging.info(f"averaging {filenames}")
-        model.to(device)
-        model.load_state_dict(average_checkpoints(filenames, device=device))
 
     model.to(device)
     model.eval()
-    model.device = device
 
     if "fast_beam_search" in params.decoding_method:
         if params.decoding_method == "fast_beam_search_nbest_LG":
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py b/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
index 6ba72ca2aa..412718e33b 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
@@ -18,12 +18,12 @@
 """
 Usage:
 ./lstm_transducer_stateless/streaming_decode.py \
-    --epoch 28 \
-    --avg 15 \
-    --exp-dir ./lstm_transducer_stateless/exp \
-    --decoding_method greedy_search \
-    --decode-chunk-size 1 \
-    --num-decode-streams 1000
+        --epoch 28 \
+        --avg 15 \
+        --decode-chunk-size 1 \
+        --exp-dir ./lstm_transducer_stateless/exp \
+        --decoding_method greedy_search \
+        --num-decode-streams 200
 """
 
 import argparse
@@ -46,6 +46,7 @@
 
 from icefall.checkpoint import (
     average_checkpoints,
+    average_checkpoints_with_averaged_model,
     find_checkpoints,
     load_checkpoint,
 )
@@ -55,6 +56,7 @@
     get_texts,
     setup_logger,
     store_transcripts,
+    str2bool,
     write_error_stats,
 )
 
@@ -94,6 +96,17 @@ def get_parser():
         "'--epoch' and '--iter'",
     )
 
+    parser.add_argument(
+        "--use-averaged-model",
+        type=str2bool,
+        default=True,
+        help="Whether to load averaged model. Currently it only supports "
+        "using --epoch. If True, it would decode with the averaged model "
+        "over the epoch range from `epoch-avg` (excluded) to `epoch`."
+        "Actually only the models with epoch number of `epoch-avg` and "
+        "`epoch` are loaded for averaging. ",
+    )
+
     parser.add_argument(
         "--exp-dir",
         type=str,
@@ -164,6 +177,8 @@ def get_parser():
         help="The number of streams that can be decoded parallel.",
     )
 
+    add_model_arguments(parser)
+
     return parser
 
 
@@ -421,8 +436,8 @@ def decode_dataset(
     decode_results = []
     # Contain decode streams currently running.
     decode_streams = []
-    initial_states = model.encoder.get_init_states(
-        device=device
+    initial_states = model.encoder.get_init_state(
+        params.left_context, device=device
     )
     for num, cut in enumerate(cuts):
         # each utterance has a DecodeStream.
@@ -507,8 +522,6 @@ def save_results(
         recog_path = (
             params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
         )
-        # sort results so we can easily compare the difference between two
-        # recognition results
         results = sorted(results)
         store_transcripts(filename=recog_path, texts=results)
         logging.info(f"The transcripts are stored in {recog_path}")
@@ -570,6 +583,9 @@ def main():
         params.suffix += f"-max-contexts-{params.max_contexts}"
         params.suffix += f"-max-states-{params.max_states}"
 
+    if params.use_averaged_model:
+        params.suffix += "-use-averaged-model"
+
     setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
     logging.info("Decoding started")
 
@@ -587,39 +603,90 @@ def main():
     params.unk_id = sp.piece_to_id("<unk>")
     params.vocab_size = sp.get_piece_size()
 
+    # Decoding in streaming requires causal convolution
+    params.causal_convolution = True
+
     logging.info(params)
 
     logging.info("About to create model")
     model = get_transducer_model(params)
 
-    if params.iter > 0:
-        filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
-            : params.avg
-        ]
-        if len(filenames) == 0:
-            raise ValueError(
-                f"No checkpoints found for"
-                f" --iter {params.iter}, --avg {params.avg}"
+    if not params.use_averaged_model:
+        if params.iter > 0:
+            filenames = find_checkpoints(
+                params.exp_dir, iteration=-params.iter
+            )[: params.avg]
+            if len(filenames) == 0:
+                raise ValueError(
+                    f"No checkpoints found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            elif len(filenames) < params.avg:
+                raise ValueError(
+                    f"Not enough checkpoints ({len(filenames)}) found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            logging.info(f"averaging {filenames}")
+            model.to(device)
+            model.load_state_dict(average_checkpoints(filenames, device=device))
+        elif params.avg == 1:
+            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
+        else:
+            start = params.epoch - params.avg + 1
+            filenames = []
+            for i in range(start, params.epoch + 1):
+                if start >= 0:
+                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
+            logging.info(f"averaging {filenames}")
+            model.to(device)
+            model.load_state_dict(average_checkpoints(filenames, device=device))
+    else:
+        if params.iter > 0:
+            filenames = find_checkpoints(
+                params.exp_dir, iteration=-params.iter
+            )[: params.avg + 1]
+            if len(filenames) == 0:
+                raise ValueError(
+                    f"No checkpoints found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            elif len(filenames) < params.avg + 1:
+                raise ValueError(
+                    f"Not enough checkpoints ({len(filenames)}) found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            filename_start = filenames[-1]
+            filename_end = filenames[0]
+            logging.info(
+                "Calculating the averaged model over iteration checkpoints"
+                f" from {filename_start} (excluded) to {filename_end}"
+            )
+            model.to(device)
+            model.load_state_dict(
+                average_checkpoints_with_averaged_model(
+                    filename_start=filename_start,
+                    filename_end=filename_end,
+                    device=device,
+                )
             )
-        elif len(filenames) < params.avg:
-            raise ValueError(
-                f"Not enough checkpoints ({len(filenames)}) found for"
-                f" --iter {params.iter}, --avg {params.avg}"
+        else:
+            assert params.avg > 0, params.avg
+            start = params.epoch - params.avg
+            assert start >= 1, start
+            filename_start = f"{params.exp_dir}/epoch-{start}.pt"
+            filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
+            logging.info(
+                f"Calculating the averaged model over epoch range from "
+                f"{start} (excluded) to {params.epoch}"
+            )
+            model.to(device)
+            model.load_state_dict(
+                average_checkpoints_with_averaged_model(
+                    filename_start=filename_start,
+                    filename_end=filename_end,
+                    device=device,
+                )
             )
-        logging.info(f"averaging {filenames}")
-        model.to(device)
-        model.load_state_dict(average_checkpoints(filenames, device=device))
-    elif params.avg == 1:
-        load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
-    else:
-        start = params.epoch - params.avg + 1
-        filenames = []
-        for i in range(start, params.epoch + 1):
-            if start >= 0:
-                filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
-        logging.info(f"averaging {filenames}")
-        model.to(device)
-        model.load_state_dict(average_checkpoints(filenames, device=device))
 
     model.to(device)
     model.eval()
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/train.py b/egs/librispeech/ASR/lstm_transducer_stateless/train.py
index 8ce5bdc54a..738a880eb0 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/train.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/train.py
@@ -1,7 +1,8 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang,
-#                                                  Wei Kang
-#                                                  Mingshuang Luo)
+#                                                  Wei Kang,
+#                                                  Mingshuang Luo,
+#                                                  Zengwei Yao)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -24,7 +25,7 @@
 ./lstm_transducer_stateless/train.py \
   --world-size 4 \
   --num-epochs 30 \
-  --start-epoch 0 \
+  --start-epoch 1 \
   --exp-dir lstm_transducer_stateless/exp \
   --full-libri 1 \
   --max-duration 300
@@ -34,15 +35,15 @@
 ./lstm_transducer_stateless/train.py \
   --world-size 4 \
   --num-epochs 30 \
-  --start-epoch 0 \
+  --start-epoch 1 \
   --use-fp16 1 \
   --exp-dir lstm_transducer_stateless/exp \
   --full-libri 1 \
   --max-duration 550
 """
 
-
 import argparse
+import copy
 import logging
 import warnings
 from pathlib import Path
@@ -72,7 +73,10 @@
 from icefall import diagnostics
 from icefall.checkpoint import load_checkpoint, remove_checkpoints
 from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
-from icefall.checkpoint import save_checkpoint_with_global_batch_idx
+from icefall.checkpoint import (
+    save_checkpoint_with_global_batch_idx,
+    update_averaged_model,
+)
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
 from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
@@ -118,10 +122,10 @@ def get_parser():
     parser.add_argument(
         "--start-epoch",
         type=int,
-        default=0,
-        help="""Resume training from from this epoch.
-        If it is positive, it will load checkpoint from
-        transducer_stateless2/exp/epoch-{start_epoch-1}.pt
+        default=1,
+        help="""Resume training from this epoch. It should be positive.
+        If larger than 1, it will load checkpoint from
+        exp-dir/epoch-{start_epoch-1}.pt
         """,
     )
 
@@ -137,7 +141,7 @@ def get_parser():
     parser.add_argument(
         "--exp-dir",
         type=str,
-        default="pruned_transducer_stateless2/exp",
+        default="lstm_transducer_stateless/exp",
         help="""The experiment dir.
         It specifies the directory where all training related
         files, e.g., checkpoints, log, etc, are saved
@@ -155,16 +159,16 @@ def get_parser():
         "--initial-lr",
         type=float,
         default=0.003,
-        help="The initial learning rate.  This value should not need to "
-        "be changed.",
+        help="""The initial learning rate. This value should not need to be
+        changed.""",
     )
 
     parser.add_argument(
         "--lr-batches",
         type=float,
         default=5000,
-        help="""Number of steps that affects how rapidly the learning rate
-        decreases. We suggest not to change this.""",
+        help="""Number of steps that affects how rapidly the learning rate decreases.
+        We suggest not to change this.""",
     )
 
     parser.add_argument(
@@ -255,6 +259,19 @@ def get_parser():
         """,
     )
 
+    parser.add_argument(
+        "--average-period",
+        type=int,
+        default=100,
+        help="""Update the averaged model, namely `model_avg`, after processing
+        this number of batches. `model_avg` is a separate version of model,
+        in which each floating-point parameter is the average of all the
+        parameters from the start of training. Each time we take the average,
+        we do: `model_avg = model * (average_period / batch_idx_train) +
+            model_avg * ((batch_idx_train - average_period) / batch_idx_train)`.
+        """,
+    )
+
     parser.add_argument(
         "--use-fp16",
         type=str2bool,
@@ -390,6 +407,7 @@ def get_transducer_model(params: AttributeDict) -> nn.Module:
 def load_checkpoint_if_available(
     params: AttributeDict,
     model: nn.Module,
+    model_avg: nn.Module = None,
     optimizer: Optional[torch.optim.Optimizer] = None,
     scheduler: Optional[LRSchedulerType] = None,
 ) -> Optional[Dict[str, Any]]:
@@ -397,7 +415,7 @@ def load_checkpoint_if_available(
 
     If params.start_batch is positive, it will load the checkpoint from
     `params.exp_dir/checkpoint-{params.start_batch}.pt`. Otherwise, if
-    params.start_epoch is positive, it will load the checkpoint from
+    params.start_epoch is larger than 1, it will load the checkpoint from
     `params.start_epoch - 1`.
 
     Apart from loading state dict for `model` and `optimizer` it also updates
@@ -409,6 +427,8 @@ def load_checkpoint_if_available(
         The return value of :func:`get_params`.
       model:
         The training model.
+      model_avg:
+        The stored model averaged from the start of training.
       optimizer:
         The optimizer that we are using.
       scheduler:
@@ -418,7 +438,7 @@ def load_checkpoint_if_available(
     """
     if params.start_batch > 0:
         filename = params.exp_dir / f"checkpoint-{params.start_batch}.pt"
-    elif params.start_epoch > 0:
+    elif params.start_epoch > 1:
         filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
     else:
         return None
@@ -428,6 +448,7 @@ def load_checkpoint_if_available(
     saved_params = load_checkpoint(
         filename,
         model=model,
+        model_avg=model_avg,
         optimizer=optimizer,
         scheduler=scheduler,
     )
@@ -454,7 +475,8 @@ def load_checkpoint_if_available(
 
 def save_checkpoint(
     params: AttributeDict,
-    model: nn.Module,
+    model: Union[nn.Module, DDP],
+    model_avg: Optional[nn.Module] = None,
     optimizer: Optional[torch.optim.Optimizer] = None,
     scheduler: Optional[LRSchedulerType] = None,
     sampler: Optional[CutSampler] = None,
@@ -468,6 +490,8 @@ def save_checkpoint(
         It is returned by :func:`get_params`.
       model:
         The training model.
+      model_avg:
+        The stored model averaged from the start of training.
       optimizer:
         The optimizer used in the training.
       sampler:
@@ -481,6 +505,7 @@ def save_checkpoint(
     save_checkpoint_impl(
         filename=filename,
         model=model,
+        model_avg=model_avg,
         params=params,
         optimizer=optimizer,
         scheduler=scheduler,
@@ -500,7 +525,7 @@ def save_checkpoint(
 
 def compute_loss(
     params: AttributeDict,
-    model: nn.Module,
+    model: Union[nn.Module, DDP],
     sp: spm.SentencePieceProcessor,
     batch: dict,
     is_training: bool,
@@ -524,7 +549,11 @@ def compute_loss(
      warmup: a floating point value which increases throughout training;
         values >= 1.0 are fully warmed up and have all modules present.
     """
-    device = model.device
+    device = (
+        model.device
+        if isinstance(model, DDP)
+        else next(model.parameters()).device
+    )
     feature = batch["inputs"]
     # at entry, feature is (N, T, C)
     assert feature.ndim == 3
@@ -580,7 +609,7 @@ def compute_loss(
 
 def compute_validation_loss(
     params: AttributeDict,
-    model: nn.Module,
+    model: Union[nn.Module, DDP],
     sp: spm.SentencePieceProcessor,
     valid_dl: torch.utils.data.DataLoader,
     world_size: int = 1,
@@ -614,13 +643,14 @@ def compute_validation_loss(
 
 def train_one_epoch(
     params: AttributeDict,
-    model: nn.Module,
+    model: Union[nn.Module, DDP],
     optimizer: torch.optim.Optimizer,
     scheduler: LRSchedulerType,
     sp: spm.SentencePieceProcessor,
     train_dl: torch.utils.data.DataLoader,
     valid_dl: torch.utils.data.DataLoader,
     scaler: GradScaler,
+    model_avg: Optional[nn.Module] = None,
     tb_writer: Optional[SummaryWriter] = None,
     world_size: int = 1,
     rank: int = 0,
@@ -646,6 +676,8 @@ def train_one_epoch(
         Dataloader for the validation dataset.
       scaler:
         The scaler used for mix precision training.
+      model_avg:
+        The stored model averaged from the start of training.
       tb_writer:
         Writer to write log messages to tensorboard.
       world_size:
@@ -668,33 +700,40 @@ def train_one_epoch(
         params.batch_idx_train += 1
         batch_size = len(batch["supervisions"]["text"])
 
-        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
-                loss, loss_info = compute_loss(
-                    params=params,
-                    model=model,
-                    sp=sp,
-                    batch=batch,
-                    is_training=True,
-                    warmup=(params.batch_idx_train / params.model_warm_step),
-                )
-            # summary stats
-            tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info
-
-            # NOTE: We use reduction==sum and loss is computed over utterances
-            # in the batch and there is no normalization to it so far.
-            scaler.scale(loss).backward()
-            scheduler.step_batch(params.batch_idx_train)
-            scaler.step(optimizer)
-            scaler.update()
-            optimizer.zero_grad()
-        except:  # noqa
-            display_and_save_batch(batch, params=params, sp=sp)
-            raise
+        with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            loss, loss_info = compute_loss(
+                params=params,
+                model=model,
+                sp=sp,
+                batch=batch,
+                is_training=True,
+                warmup=(params.batch_idx_train / params.model_warm_step),
+            )
+        # summary stats
+        tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info
+
+        # NOTE: We use reduction==sum and loss is computed over utterances
+        # in the batch and there is no normalization to it so far.
+        scaler.scale(loss).backward()
+        scheduler.step_batch(params.batch_idx_train)
+        scaler.step(optimizer)
+        scaler.update()
+        optimizer.zero_grad()
 
         if params.print_diagnostics and batch_idx == 30:
             return
 
+        if (
+            rank == 0
+            and params.batch_idx_train > 0
+            and params.batch_idx_train % params.average_period == 0
+        ):
+            update_averaged_model(
+                params=params,
+                model_cur=model,
+                model_avg=model_avg,
+            )
+
         if (
             params.batch_idx_train > 0
             and params.batch_idx_train % params.save_every_n == 0
@@ -704,6 +743,7 @@ def train_one_epoch(
                 out_dir=params.exp_dir,
                 global_batch_idx=params.batch_idx_train,
                 model=model,
+                model_avg=model_avg,
                 params=params,
                 optimizer=optimizer,
                 scheduler=scheduler,
@@ -803,11 +843,6 @@ def run(rank, world_size, args):
     params.blank_id = sp.piece_to_id("<blk>")
     params.vocab_size = sp.get_piece_size()
 
-    if params.dynamic_chunk_training:
-        assert (
-            params.causal_convolution
-        ), "dynamic_chunk_training requires causal convolution"
-
     logging.info(params)
 
     logging.info("About to create model")
@@ -816,13 +851,21 @@ def run(rank, world_size, args):
     num_param = sum([p.numel() for p in model.parameters()])
     logging.info(f"Number of model parameters: {num_param}")
 
-    checkpoints = load_checkpoint_if_available(params=params, model=model)
+    assert params.save_every_n >= params.average_period
+    model_avg: Optional[nn.Module] = None
+    if rank == 0:
+        # model_avg is only used with rank 0
+        model_avg = copy.deepcopy(model)
+
+    assert params.start_epoch > 0, params.start_epoch
+    checkpoints = load_checkpoint_if_available(
+        params=params, model=model, model_avg=model_avg
+    )
 
     model.to(device)
     if world_size > 1:
         logging.info("Using DDP")
         model = DDP(model, device_ids=[rank])
-    model.device = device
 
     optimizer = Eve(model.parameters(), lr=params.initial_lr)
 
@@ -885,7 +928,7 @@ def remove_short_and_long_utt(c: Cut):
             optimizer=optimizer,
             sp=sp,
             params=params,
-            warmup=0.0 if params.start_epoch == 0 else 1.0,
+            warmup=0.0 if params.start_epoch == 1 else 1.0,
         )
 
     scaler = GradScaler(enabled=params.use_fp16)
@@ -893,10 +936,10 @@ def remove_short_and_long_utt(c: Cut):
         logging.info("Loading grad scaler state dict")
         scaler.load_state_dict(checkpoints["grad_scaler"])
 
-    for epoch in range(params.start_epoch, params.num_epochs):
-        scheduler.step_epoch(epoch)
-        fix_random_seed(params.seed + epoch)
-        train_dl.sampler.set_epoch(epoch)
+    for epoch in range(params.start_epoch, params.num_epochs + 1):
+        scheduler.step_epoch(epoch - 1)
+        fix_random_seed(params.seed + epoch - 1)
+        train_dl.sampler.set_epoch(epoch - 1)
 
         if tb_writer is not None:
             tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train)
@@ -906,6 +949,7 @@ def remove_short_and_long_utt(c: Cut):
         train_one_epoch(
             params=params,
             model=model,
+            model_avg=model_avg,
             optimizer=optimizer,
             scheduler=scheduler,
             sp=sp,
@@ -924,6 +968,7 @@ def remove_short_and_long_utt(c: Cut):
         save_checkpoint(
             params=params,
             model=model,
+            model_avg=model_avg,
             optimizer=optimizer,
             scheduler=scheduler,
             sampler=train_dl.sampler,
@@ -938,40 +983,8 @@ def remove_short_and_long_utt(c: Cut):
         cleanup_dist()
 
 
-def display_and_save_batch(
-    batch: dict,
-    params: AttributeDict,
-    sp: spm.SentencePieceProcessor,
-) -> None:
-    """Display the batch statistics and save the batch into disk.
-
-    Args:
-      batch:
-        A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
-        for the content in it.
-      params:
-        Parameters for training. See :func:`get_params`.
-      sp:
-        The BPE model.
-    """
-    from lhotse.utils import uuid4
-
-    filename = f"{params.exp_dir}/batch-{uuid4()}.pt"
-    logging.info(f"Saving batch to {filename}")
-    torch.save(batch, filename)
-
-    supervisions = batch["supervisions"]
-    features = batch["inputs"]
-
-    logging.info(f"features shape: {features.shape}")
-
-    y = sp.encode(supervisions["text"], out_type=int)
-    num_tokens = sum(len(i) for i in y)
-    logging.info(f"num tokens: {num_tokens}")
-
-
 def scan_pessimistic_batches_for_oom(
-    model: nn.Module,
+    model: Union[nn.Module, DDP],
     train_dl: torch.utils.data.DataLoader,
     optimizer: torch.optim.Optimizer,
     sp: spm.SentencePieceProcessor,
@@ -981,7 +994,7 @@ def scan_pessimistic_batches_for_oom(
     from lhotse.dataset import find_pessimistic_batches
 
     logging.info(
-        "Sanity check -- see if any of the batches in epoch 0 would cause OOM."
+        "Sanity check -- see if any of the batches in epoch 1 would cause OOM."
     )
     batches, crit_values = find_pessimistic_batches(train_dl.sampler)
     for criterion, cuts in batches.items():
@@ -999,7 +1012,7 @@ def scan_pessimistic_batches_for_oom(
             loss.backward()
             optimizer.step()
             optimizer.zero_grad()
-        except Exception as e:
+        except RuntimeError as e:
             if "CUDA out of memory" in str(e):
                 logging.error(
                     "Your GPU ran out of memory with the current "
@@ -1008,7 +1021,6 @@ def scan_pessimistic_batches_for_oom(
                     f"Failing criterion: {criterion} "
                     f"(={crit_values[criterion]}) ..."
                 )
-            display_and_save_batch(batch, params=params, sp=sp)
             raise
 
 

From 872d2390d2469a37313109cf558b927796b4ea50 Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Sun, 17 Jul 2022 20:39:46 +0800
Subject: [PATCH 14/40] link __init__.py

---
 egs/librispeech/ASR/lstm_transducer_stateless/__init__.py | 1 +
 1 file changed, 1 insertion(+)
 create mode 120000 egs/librispeech/ASR/lstm_transducer_stateless/__init__.py

diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/__init__.py b/egs/librispeech/ASR/lstm_transducer_stateless/__init__.py
new file mode 120000
index 0000000000..b24e5e3572
--- /dev/null
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/__init__.py
@@ -0,0 +1 @@
+../pruned_transducer_stateless2/__init__.py
\ No newline at end of file

From 7c00f92abba10cc60f94fb2ddea07360905e4e8b Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Sun, 17 Jul 2022 21:17:45 +0800
Subject: [PATCH 15/40] fix style

---
 .../ASR/pruned_transducer_stateless2/scaling.py            | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py b/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
index fe265eb81d..018e2827bb 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
@@ -17,13 +17,12 @@
 
 import collections
 from itertools import repeat
-from typing import List, Optional, Tuple
+from typing import Optional, Tuple
 
-import torch
-import torch.nn as nn
 from torch import Tensor, _VF
-
+import torch
 import torch.backends.cudnn.rnn as rnn
+import torch.nn as nn
 
 
 def _ntuple(n):

From c71788ee572cf82eb3579cb264fd8f3b643132f7 Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Sun, 17 Jul 2022 21:20:39 +0800
Subject: [PATCH 16/40] remove add_model_arguments

---
 .../ASR/lstm_transducer_stateless/streaming_decode.py           | 2 --
 1 file changed, 2 deletions(-)

diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py b/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
index 412718e33b..bf48b231ba 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
@@ -177,8 +177,6 @@ def get_parser():
         help="The number of streams that can be decoded parallel.",
     )
 
-    add_model_arguments(parser)
-
     return parser
 
 

From 1b0d2f3592f2c621b86376c7cf30d6d0d4437cae Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Sun, 17 Jul 2022 21:22:00 +0800
Subject: [PATCH 17/40] modify .flake8

---
 .flake8 | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.flake8 b/.flake8
index c7c4f18553..7479828a56 100644
--- a/.flake8
+++ b/.flake8
@@ -10,6 +10,7 @@ per-file-ignores =
     egs/*/ASR/*/optim.py: E501,
     egs/*/ASR/*/scaling.py: E501,
     egs/librispeech/ASR/conv_emformer_transducer_stateless/*.py: E501, E203
+    egs/librispeech/ASR/lstm_transducer_stateless/*.py: E501, E203
 
     # invalid escape sequence (cause by tex formular), W605
     icefall/utils.py: E501, W605

From 3cedbe367804dac5d30c49b14af00b2d0d49f0f1 Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Sun, 17 Jul 2022 21:40:29 +0800
Subject: [PATCH 18/40] fix style

---
 .../ASR/lstm_transducer_stateless/lstm.py     |  2 +-
 .../pruned_transducer_stateless2/scaling.py   | 24 +++++++++----------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py b/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
index 781176ea97..47b2c7b2b8 100644
--- a/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
@@ -15,7 +15,7 @@
 # limitations under the License.
 
 import copy
-from typing import Optional, Tuple
+from typing import Tuple
 
 import torch
 from encoder_interface import EncoderInterface
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py b/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
index 018e2827bb..65c71ab2e5 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
@@ -19,10 +19,10 @@
 from itertools import repeat
 from typing import Optional, Tuple
 
-from torch import Tensor, _VF
 import torch
 import torch.backends.cudnn.rnn as rnn
 import torch.nn as nn
+from torch import _VF, Tensor
 
 
 def _ntuple(n):
@@ -155,7 +155,7 @@ def __init__(
     def forward(self, x: Tensor) -> Tensor:
         assert x.shape[self.channel_dim] == self.num_channels
         scales = (
-            torch.mean(x**2, dim=self.channel_dim, keepdim=True)
+            torch.mean(x ** 2, dim=self.channel_dim, keepdim=True)
             + self.eps.exp()
         ) ** -0.5
         return x * scales
@@ -208,12 +208,12 @@ def __init__(
 
     def _reset_parameters(self, initial_speed: float):
         std = 0.1 / initial_speed
-        a = (3**0.5) * std
+        a = (3 ** 0.5) * std
         nn.init.uniform_(self.weight, -a, a)
         if self.bias is not None:
             nn.init.constant_(self.bias, 0.0)
         fan_in = self.weight.shape[1] * self.weight[0][0].numel()
-        scale = fan_in**-0.5  # 1/sqrt(fan_in)
+        scale = fan_in ** -0.5  # 1/sqrt(fan_in)
         with torch.no_grad():
             self.weight_scale += torch.tensor(scale / std).log()
 
@@ -257,12 +257,12 @@ def __init__(
 
     def _reset_parameters(self, initial_speed: float):
         std = 0.1 / initial_speed
-        a = (3**0.5) * std
+        a = (3 ** 0.5) * std
         nn.init.uniform_(self.weight, -a, a)
         if self.bias is not None:
             nn.init.constant_(self.bias, 0.0)
         fan_in = self.weight.shape[1] * self.weight[0][0].numel()
-        scale = fan_in**-0.5  # 1/sqrt(fan_in)
+        scale = fan_in ** -0.5  # 1/sqrt(fan_in)
         with torch.no_grad():
             self.weight_scale += torch.tensor(scale / std).log()
 
@@ -326,12 +326,12 @@ def __init__(
 
     def _reset_parameters(self, initial_speed: float):
         std = 0.1 / initial_speed
-        a = (3**0.5) * std
+        a = (3 ** 0.5) * std
         nn.init.uniform_(self.weight, -a, a)
         if self.bias is not None:
             nn.init.constant_(self.bias, 0.0)
         fan_in = self.weight.shape[1] * self.weight[0][0].numel()
-        scale = fan_in**-0.5  # 1/sqrt(fan_in)
+        scale = fan_in ** -0.5  # 1/sqrt(fan_in)
         with torch.no_grad():
             self.weight_scale += torch.tensor(scale / std).log()
 
@@ -408,9 +408,9 @@ def __init__(
 
     def _reset_parameters(self, initial_speed: float):
         std = 0.1 / initial_speed
-        a = (3**0.5) * std
+        a = (3 ** 0.5) * std
         fan_in = self.input_size
-        scale = fan_in**-0.5
+        scale = fan_in ** -0.5
         v = scale / std
         for idx, name in enumerate(self._flat_weights_names):
             if "weight" in name:
@@ -864,8 +864,8 @@ def _test_basic_norm():
     y = m(x)
 
     assert y.shape == x.shape
-    x_rms = (x**2).mean().sqrt()
-    y_rms = (y**2).mean().sqrt()
+    x_rms = (x ** 2).mean().sqrt()
+    y_rms = (y ** 2).mean().sqrt()
     print("x rms = ", x_rms)
     print("y rms = ", y_rms)
     assert y_rms < x_rms

From 8bd700cff2159e067682a191114ca9eb42bdaba4 Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Mon, 18 Jul 2022 10:33:35 +0800
Subject: [PATCH 19/40] fix scale value in scaling.py

---
 .../ASR/lstm_transducer_stateless/decode.py     | 17 ++++++-----------
 .../ASR/pruned_transducer_stateless2/scaling.py |  3 +--
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/decode.py b/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
index f7e3677da5..00e906691a 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
@@ -354,12 +354,12 @@ def decode_one_batch(
     supervisions = batch["supervisions"]
     feature_lens = supervisions["num_frames"].to(device)
 
-    feature_lens += params.left_context
-    feature = torch.nn.functional.pad(
-        feature,
-        pad=(0, 0, 0, params.left_context),
-        value=LOG_EPS,
-    )
+    # feature_lens += params.left_context
+    # feature = torch.nn.functional.pad(
+    #     feature,
+    #     pad=(0, 0, 0, params.left_context),
+    #     value=LOG_EPS,
+    # )
 
     encoder_out, encoder_out_lens = model.encoder(
         x=feature, x_lens=feature_lens
@@ -668,11 +668,6 @@ def main():
     params.unk_id = sp.piece_to_id("<unk>")
     params.vocab_size = sp.get_piece_size()
 
-    if params.simulate_streaming:
-        assert (
-            params.causal_convolution
-        ), "Decoding in streaming requires causal convolution"
-
     logging.info(params)
 
     logging.info("About to create model")
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py b/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
index 65c71ab2e5..5f0785d913 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
@@ -409,8 +409,7 @@ def __init__(
     def _reset_parameters(self, initial_speed: float):
         std = 0.1 / initial_speed
         a = (3 ** 0.5) * std
-        fan_in = self.input_size
-        scale = fan_in ** -0.5
+        scale = self.hidden_size ** -0.5
         v = scale / std
         for idx, name in enumerate(self._flat_weights_names):
             if "weight" in name:

From 9bb0c7988f4ea828e62b5f7a7638968dded0342b Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Mon, 18 Jul 2022 11:50:35 +0800
Subject: [PATCH 20/40] add random combiner for training deeper model

---
 .../ASR/lstm_transducer_stateless/decode.py   |  11 +-
 .../ASR/lstm_transducer_stateless/lstm.py     | 290 +++++++++++++++++-
 .../streaming_decode.py                       |   8 +-
 .../ASR/lstm_transducer_stateless/train.py    |  24 +-
 4 files changed, 315 insertions(+), 18 deletions(-)

diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/decode.py b/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
index 00e906691a..a909b2a74a 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
@@ -116,7 +116,7 @@
     greedy_search_batch,
     modified_beam_search,
 )
-from train import get_params, get_transducer_model
+from train import add_model_arguments, get_params, get_transducer_model
 
 from icefall.checkpoint import (
     average_checkpoints,
@@ -302,6 +302,8 @@ def get_parser():
         fast_beam_search_nbest_LG, and fast_beam_search_nbest_oracle""",
     )
 
+    add_model_arguments(parser)
+
     return parser
 
 
@@ -354,13 +356,6 @@ def decode_one_batch(
     supervisions = batch["supervisions"]
     feature_lens = supervisions["num_frames"].to(device)
 
-    # feature_lens += params.left_context
-    # feature = torch.nn.functional.pad(
-    #     feature,
-    #     pad=(0, 0, 0, params.left_context),
-    #     value=LOG_EPS,
-    # )
-
     encoder_out, encoder_out_lens = model.encoder(
         x=feature, x_lens=feature_lens
     )
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py b/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
index 47b2c7b2b8..45409ccea1 100644
--- a/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
@@ -15,7 +15,7 @@
 # limitations under the License.
 
 import copy
-from typing import Tuple
+from typing import List, Optional, Tuple
 
 import torch
 from encoder_interface import EncoderInterface
@@ -47,6 +47,9 @@ class RNN(EncoderInterface):
         Dropout rate (default=0.1).
       layer_dropout (float):
         Dropout value for model-level warmup (default=0.075).
+      aux_layer_period (int):
+        Period of the auxiliary layers whose outputs are randomly combined
+        during training. If not larger than 0, the random combiner is not used.
     """
 
     def __init__(
@@ -58,6 +61,7 @@ def __init__(
         num_encoder_layers: int = 12,
         dropout: float = 0.1,
         layer_dropout: float = 0.075,
+        aux_layer_period: int = 3,
     ) -> None:
         super(RNN, self).__init__()
 
@@ -79,7 +83,19 @@ def __init__(
         encoder_layer = RNNEncoderLayer(
             d_model, dim_feedforward, dropout, layer_dropout
         )
-        self.encoder = RNNEncoder(encoder_layer, num_encoder_layers)
+        self.encoder = RNNEncoder(
+            encoder_layer,
+            num_encoder_layers,
+            aux_layers=list(
+                range(
+                    num_encoder_layers // 3,
+                    num_encoder_layers - 1,
+                    aux_layer_period,
+                )
+            )
+            if aux_layer_period > 0
+            else None,
+        )
 
     def forward(
         self, x: torch.Tensor, x_lens: torch.Tensor, warmup: float = 1.0
@@ -306,13 +322,31 @@ class RNNEncoder(nn.Module):
         The number of sub-encoder-layers in the encoder (required).
     """
 
-    def __init__(self, encoder_layer: nn.Module, num_layers: int) -> None:
+    def __init__(
+        self,
+        encoder_layer: nn.Module,
+        num_layers: int,
+        aux_layers: Optional[List[int]] = None,
+    ) -> None:
         super(RNNEncoder, self).__init__()
         self.layers = nn.ModuleList(
             [copy.deepcopy(encoder_layer) for i in range(num_layers)]
         )
         self.num_layers = num_layers
 
+        self.use_random_combiner = False
+        if aux_layers is not None:
+            assert len(set(aux_layers)) == len(aux_layers)
+            assert num_layers - 1 not in aux_layers
+            self.use_random_combiner = True
+            self.aux_layers = aux_layers + [num_layers - 1]
+            self.combiner = RandomCombine(
+                num_inputs=len(self.aux_layers),
+                final_weight=0.5,
+                pure_prob=0.333,
+                stddev=2.0,
+            )
+
     def forward(self, src: torch.Tensor, warmup: float = 1.0) -> torch.Tensor:
         """
         Pass the input through the encoder layer in turn.
@@ -328,8 +362,16 @@ def forward(self, src: torch.Tensor, warmup: float = 1.0) -> torch.Tensor:
         """
         output = src
 
-        for layer_index, mod in enumerate(self.layers):
+        outputs = []
+
+        for i, mod in enumerate(self.layers):
             output = mod(output, warmup=warmup)
+            if self.use_random_combiner:
+                if i in self.aux_layers:
+                    outputs.append(output)
+
+        if self.use_random_combiner:
+            output = self.combiner(outputs)
 
         return output
 
@@ -459,6 +501,244 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return x
 
 
+class RandomCombine(nn.Module):
+    """
+    This module combines a list of Tensors, all with the same shape, to
+    produce a single output of that same shape which, in training time,
+    is a random combination of all the inputs; but which in test time
+    will be just the last input.
+
+    The idea is that the list of Tensors will be a list of outputs of multiple
+    conformer layers.  This has a similar effect as iterated loss. (See:
+    DEJA-VU: DOUBLE FEATURE PRESENTATION AND ITERATED LOSS IN DEEP TRANSFORMER
+    NETWORKS).
+    """
+
+    def __init__(
+        self,
+        num_inputs: int,
+        final_weight: float = 0.5,
+        pure_prob: float = 0.5,
+        stddev: float = 2.0,
+    ) -> None:
+        """
+        Args:
+          num_inputs:
+            The number of tensor inputs, which equals the number of layers'
+            outputs that are fed into this module.  E.g. in an 18-layer neural
+            net if we output layers 16, 12, 18, num_inputs would be 3.
+          final_weight:
+            The amount of weight or probability we assign to the
+            final layer when randomly choosing layers or when choosing
+            continuous layer weights.
+          pure_prob:
+            The probability, on each frame, with which we choose
+            only a single layer to output (rather than an interpolation)
+          stddev:
+            A standard deviation that we add to log-probs for computing
+            randomized weights.
+
+        The method of choosing which layers, or combinations of layers, to use,
+        is conceptually as follows::
+
+            With probability `pure_prob`::
+               With probability `final_weight`: choose final layer,
+               Else: choose random non-final layer.
+            Else::
+               Choose initial log-weights that correspond to assigning
+               weight `final_weight` to the final layer and equal
+               weights to other layers; then add Gaussian noise
+               with std. dev. `stddev` to these log-weights, and normalize
+               to weights (note: the average weight assigned to the
+               final layer here will not be `final_weight` if stddev>0).
+        """
+        super().__init__()
+        assert 0 <= pure_prob <= 1, pure_prob
+        assert 0 < final_weight < 1, final_weight
+        assert num_inputs >= 1
+
+        self.num_inputs = num_inputs
+        self.final_weight = final_weight
+        self.pure_prob = pure_prob
+        self.stddev = stddev
+
+        self.final_log_weight = (
+            torch.tensor(
+                (final_weight / (1 - final_weight)) * (self.num_inputs - 1)
+            )
+            .log()
+            .item()
+        )
+
+    def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor:
+        """Forward function.
+        Args:
+          inputs:
+            A list of Tensor, e.g. from various layers of a transformer.
+            All must be the same shape, of (*, num_channels)
+        Returns:
+          A Tensor of shape (*, num_channels).  In test mode
+          this is just the final input.
+        """
+        num_inputs = self.num_inputs
+        assert len(inputs) == num_inputs
+        if not self.training or torch.jit.is_scripting():
+            return inputs[-1]
+
+        # Shape of weights: (*, num_inputs)
+        num_channels = inputs[0].shape[-1]
+        num_frames = inputs[0].numel() // num_channels
+
+        ndim = inputs[0].ndim
+        # stacked_inputs: (num_frames, num_channels, num_inputs)
+        stacked_inputs = torch.stack(inputs, dim=ndim).reshape(
+            (num_frames, num_channels, num_inputs)
+        )
+
+        # weights: (num_frames, num_inputs)
+        weights = self._get_random_weights(
+            inputs[0].dtype, inputs[0].device, num_frames
+        )
+
+        weights = weights.reshape(num_frames, num_inputs, 1)
+        # ans: (num_frames, num_channels, 1)
+        ans = torch.matmul(stacked_inputs, weights)
+        # ans: (*, num_channels)
+
+        ans = ans.reshape(inputs[0].shape[:-1] + (num_channels,))
+
+        # The following if causes errors for torch script in torch 1.6.0
+        #  if __name__ == "__main__":
+        #      # for testing only...
+        #      print("Weights = ", weights.reshape(num_frames, num_inputs))
+        return ans
+
+    def _get_random_weights(
+        self, dtype: torch.dtype, device: torch.device, num_frames: int
+    ) -> torch.Tensor:
+        """Return a tensor of random weights, of shape
+        `(num_frames, self.num_inputs)`,
+        Args:
+          dtype:
+            The data-type desired for the answer, e.g. float, double.
+          device:
+            The device needed for the answer.
+          num_frames:
+            The number of sets of weights desired
+        Returns:
+          A tensor of shape (num_frames, self.num_inputs), such that
+          `ans.sum(dim=1)` is all ones.
+        """
+        pure_prob = self.pure_prob
+        if pure_prob == 0.0:
+            return self._get_random_mixed_weights(dtype, device, num_frames)
+        elif pure_prob == 1.0:
+            return self._get_random_pure_weights(dtype, device, num_frames)
+        else:
+            p = self._get_random_pure_weights(dtype, device, num_frames)
+            m = self._get_random_mixed_weights(dtype, device, num_frames)
+            return torch.where(
+                torch.rand(num_frames, 1, device=device) < self.pure_prob, p, m
+            )
+
+    def _get_random_pure_weights(
+        self, dtype: torch.dtype, device: torch.device, num_frames: int
+    ):
+        """Return a tensor of random one-hot weights, of shape
+        `(num_frames, self.num_inputs)`,
+        Args:
+          dtype:
+            The data-type desired for the answer, e.g. float, double.
+          device:
+            The device needed for the answer.
+          num_frames:
+            The number of sets of weights desired.
+        Returns:
+          A one-hot tensor of shape `(num_frames, self.num_inputs)`, with
+          exactly one weight equal to 1.0 on each frame.
+        """
+        final_prob = self.final_weight
+
+        # final contains self.num_inputs - 1 in all elements
+        final = torch.full((num_frames,), self.num_inputs - 1, device=device)
+        # nonfinal contains random integers in [0..num_inputs - 2], these are for non-final weights.  # noqa
+        nonfinal = torch.randint(
+            self.num_inputs - 1, (num_frames,), device=device
+        )
+
+        indexes = torch.where(
+            torch.rand(num_frames, device=device) < final_prob, final, nonfinal
+        )
+        ans = torch.nn.functional.one_hot(
+            indexes, num_classes=self.num_inputs
+        ).to(dtype=dtype)
+        return ans
+
+    def _get_random_mixed_weights(
+        self, dtype: torch.dtype, device: torch.device, num_frames: int
+    ):
+        """Return a tensor of random mixed (softmax) weights, of shape
+        `(num_frames, self.num_inputs)`,
+        Args:
+          dtype:
+            The data-type desired for the answer, e.g. float, double.
+          device:
+            The device needed for the answer.
+          num_frames:
+            The number of sets of weights desired.
+        Returns:
+          A tensor of shape (num_frames, self.num_inputs), with elements
+          in [0..1] that sum to one over the second axis, i.e.
+          `ans.sum(dim=1)` is all ones.
+        """
+        logprobs = (
+            torch.randn(num_frames, self.num_inputs, dtype=dtype, device=device)
+            * self.stddev
+        )
+        logprobs[:, -1] += self.final_log_weight
+        return logprobs.softmax(dim=1)
+
+
+def _test_random_combine(final_weight: float, pure_prob: float, stddev: float):
+    print(
+        f"_test_random_combine: final_weight={final_weight}, pure_prob={pure_prob}, stddev={stddev}"  # noqa
+    )
+    num_inputs = 3
+    num_channels = 50
+    m = RandomCombine(
+        num_inputs=num_inputs,
+        final_weight=final_weight,
+        pure_prob=pure_prob,
+        stddev=stddev,
+    )
+
+    x = [torch.ones(3, 4, num_channels) for _ in range(num_inputs)]
+
+    y = m(x)
+    assert y.shape == x[0].shape
+    assert torch.allclose(y, x[0])  # .. since actually all ones.
+
+
+def _test_random_combine_main():
+    _test_random_combine(0.999, 0, 0.0)
+    _test_random_combine(0.5, 0, 0.0)
+    _test_random_combine(0.999, 0, 0.0)
+    _test_random_combine(0.5, 0, 0.3)
+    _test_random_combine(0.5, 1, 0.3)
+    _test_random_combine(0.5, 0.5, 0.3)
+
+    feature_dim = 50
+    c = RNN(num_features=feature_dim, d_model=128)
+    batch_size = 5
+    seq_len = 20
+    # Just make sure the forward pass runs.
+    f = c(
+        torch.randn(batch_size, seq_len, feature_dim),
+        torch.full((batch_size,), seq_len, dtype=torch.int64),
+    )
+    f  # to remove flake8 warnings
+
+
 if __name__ == "__main__":
     feature_dim = 50
     m = RNN(num_features=feature_dim, d_model=128)
@@ -470,3 +750,5 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         torch.full((batch_size,), seq_len, dtype=torch.int64),
         warmup=0.5,
     )
+
+    _test_random_combine_main()
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py b/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
index bf48b231ba..79b0a45a2a 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
@@ -42,7 +42,7 @@
 from kaldifeat import Fbank, FbankOptions
 from lhotse import CutSet
 from torch.nn.utils.rnn import pad_sequence
-from train import get_params, get_transducer_model
+from train import add_model_arguments, get_params, get_transducer_model
 
 from icefall.checkpoint import (
     average_checkpoints,
@@ -177,6 +177,8 @@ def get_parser():
         help="The number of streams that can be decoded parallel.",
     )
 
+    add_model_arguments(parser)
+
     return parser
 
 
@@ -434,9 +436,7 @@ def decode_dataset(
     decode_results = []
     # Contain decode streams currently running.
     decode_streams = []
-    initial_states = model.encoder.get_init_state(
-        params.left_context, device=device
-    )
+    initial_states = model.encoder.get_init_states(device=device)
     for num, cut in enumerate(cuts):
         # each utterance has a DecodeStream.
         decode_stream = DecodeStream(
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/train.py b/egs/librispeech/ASR/lstm_transducer_stateless/train.py
index 738a880eb0..0826c72e92 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/train.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/train.py
@@ -57,12 +57,12 @@
 import torch.multiprocessing as mp
 import torch.nn as nn
 from asr_datamodule import LibriSpeechAsrDataModule
-from lstm import RNN
 from decoder import Decoder
 from joiner import Joiner
 from lhotse.cut import Cut
 from lhotse.dataset.sampling.base import CutSampler
 from lhotse.utils import fix_random_seed
+from lstm import RNN
 from model import Transducer
 from optim import Eden, Eve
 from torch import Tensor
@@ -86,6 +86,24 @@
 ]
 
 
+def add_model_arguments(parser: argparse.ArgumentParser):
+    parser.add_argument(
+        "--num-encoder-layers",
+        type=int,
+        default=20,
+        help="Number of RNN encoder layers..",
+    )
+
+    parser.add_argument(
+        "--aux-layer-period",
+        type=int,
+        default=3,
+        help="""Peroid of auxiliary layers used for randomly combined during training.
+        If not larger than 0, will not use the random combiner.
+        """,
+    )
+
+
 def get_parser():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
@@ -279,6 +297,8 @@ def get_parser():
         help="Whether to use half precision training.",
     )
 
+    add_model_arguments(parser)
+
     return parser
 
 
@@ -341,7 +361,6 @@ def get_params() -> AttributeDict:
             "subsampling_factor": 4,
             "encoder_dim": 512,
             "dim_feedforward": 2048,
-            "num_encoder_layers": 12,
             # parameters for decoder
             "decoder_dim": 512,
             # parameters for joiner
@@ -363,6 +382,7 @@ def get_encoder_model(params: AttributeDict) -> nn.Module:
         d_model=params.encoder_dim,
         dim_feedforward=params.dim_feedforward,
         num_encoder_layers=params.num_encoder_layers,
+        aux_layer_period=params.aux_layer_period,
     )
     return encoder
 

From 6871c96ffad239cd69f2cd748311cf030eb546e4 Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Mon, 25 Jul 2022 19:04:08 +0800
Subject: [PATCH 21/40] add using proj_size

---
 .../ASR/lstm_transducer_stateless/lstm.py     | 154 +++++++++++++-----
 .../ASR/lstm_transducer_stateless/train.py    |  28 +++-
 .../pruned_transducer_stateless2/scaling.py   |  14 +-
 3 files changed, 139 insertions(+), 57 deletions(-)

diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py b/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
index 45409ccea1..1d6fda0b49 100644
--- a/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
@@ -38,9 +38,11 @@ class RNN(EncoderInterface):
       subsampling_factor (int):
         Subsampling factor of encoder (convolution layers before lstm layers) (default=4).  # noqa
       d_model (int):
-        Hidden dimension for lstm layers, also output dimension (default=512).
+        Output dimension (default=512).
       dim_feedforward (int):
         Feedforward dimension (default=2048).
+      rnn_hidden_size (int):
+        Hidden dimension for lstm layers (default=1024).
       num_encoder_layers (int):
         Number of encoder layers (default=12).
       dropout (float):
@@ -58,6 +60,7 @@ def __init__(
         subsampling_factor: int = 4,
         d_model: int = 512,
         dim_feedforward: int = 2048,
+        rnn_hidden_size: int = 1024,
         num_encoder_layers: int = 12,
         dropout: float = 0.1,
         layer_dropout: float = 0.075,
@@ -79,9 +82,14 @@ def __init__(
 
         self.num_encoder_layers = num_encoder_layers
         self.d_model = d_model
+        self.rnn_hidden_size = rnn_hidden_size
 
         encoder_layer = RNNEncoderLayer(
-            d_model, dim_feedforward, dropout, layer_dropout
+            d_model=d_model,
+            dim_feedforward=dim_feedforward,
+            rnn_hidden_size=rnn_hidden_size,
+            dropout=dropout,
+            layer_dropout=layer_dropout,
         )
         self.encoder = RNNEncoder(
             encoder_layer,
@@ -135,17 +143,26 @@ def forward(
         return x, lengths
 
     @torch.jit.export
-    def get_init_states(self, device: torch.device) -> torch.Tensor:
+    def get_init_states(
+        self, device: torch.device
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         """Get model initial states."""
-        init_states = torch.zeros(
-            (2, self.num_encoder_layers, self.d_model), device=device
+        # for rnn hidden states
+        hidden_states = torch.zeros(
+            (self.num_encoder_layers, self.d_model), device=device
+        )
+        cell_states = torch.zeros(
+            (self.num_encoder_layers, self.rnn_hidden_size), device=device
         )
-        return init_states
+        return (hidden_states, cell_states)
 
     @torch.jit.export
     def infer(
-        self, x: torch.Tensor, x_lens: torch.Tensor, states: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        self,
+        x: torch.Tensor,
+        x_lens: torch.Tensor,
+        states: Tuple[torch.Tensor, torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
         """
         Args:
           x:
@@ -155,9 +172,11 @@ def infer(
             A tensor of shape (N,), containing the number of frames in `x`
             before padding.
           states:
-            Its shape is (2, num_encoder_layers, N, E).
-            states[0] and states[1] are cached hidden states and cell states for
-            all layers, respectively.
+            It is a tuple of 2 tensors.
+            states[0] is the hidden states of all layers,
+              with shape of (num_layers, N, d_model);
+            states[1] is the cell states of all layers,
+              with shape of (num_layers, N, rnn_hidden_size).
 
         Returns:
           A tuple of 3 tensors:
@@ -165,15 +184,22 @@ def infer(
               sequence lengths.
             - lengths: a tensor of shape (batch_size,) containing the number of
               frames in `embeddings` before padding.
-            - updated states, with shape of (2, num_encoder_layers, N, E).
+            - updated states, whose shape is same as the input states.
         """
         assert not self.training
-        assert states.shape == (
-            2,
+        assert len(states) == 2
+        # for hidden state
+        assert states[0].shape == (
             self.num_encoder_layers,
             x.size(0),
             self.d_model,
-        ), states.shape
+        )
+        # for cell state
+        assert states[1].shape == (
+            self.num_encoder_layers,
+            x.size(0),
+            self.rnn_hidden_size,
+        )
 
         # lengths = ((x_lens - 1) // 2 - 1) // 2 # issue an warning
         #
@@ -201,6 +227,8 @@ class RNNEncoderLayer(nn.Module):
         The number of expected features in the input (required).
       dim_feedforward:
         The dimension of feedforward network model (default=2048).
+      rnn_hidden_size:
+        The hidden dimension of rnn layer.
       dropout:
         The dropout value (default=0.1).
       layer_dropout:
@@ -211,15 +239,22 @@ def __init__(
         self,
         d_model: int,
         dim_feedforward: int,
+        rnn_hidden_size: int,
         dropout: float = 0.1,
         layer_dropout: float = 0.075,
     ) -> None:
         super(RNNEncoderLayer, self).__init__()
         self.layer_dropout = layer_dropout
         self.d_model = d_model
+        self.rnn_hidden_size = rnn_hidden_size
 
+        assert rnn_hidden_size >= d_model
         self.lstm = ScaledLSTM(
-            input_size=d_model, hidden_size=d_model, dropout=0.0
+            input_size=d_model,
+            hidden_size=rnn_hidden_size,
+            proj_size=d_model if rnn_hidden_size > d_model else 0,
+            num_layers=1,
+            dropout=0.0,
         )
         self.feed_forward = nn.Sequential(
             ScaledLinear(d_model, dim_feedforward),
@@ -279,28 +314,30 @@ def forward(self, src: torch.Tensor, warmup: float = 1.0) -> torch.Tensor:
 
     @torch.jit.export
     def infer(
-        self, src: torch.Tensor, states: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        self, src: torch.Tensor, states: Tuple[torch.Tensor, torch.Tensor]
+    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
         """
         Pass the input through the encoder layer.
 
         Args:
           src:
             The sequence to the encoder layer (required).
-            Its shape is (S, N, E), where S is the sequence length,
-            N is the batch size, and E is the feature number.
+            Its shape is (S, N, d_model), where S is the sequence length,
+            N is the batch size.
           states:
-            Its shape is (2, 1, N, E).
-            states[0] and states[1] are cached hidden state and cell state,
-            respectively.
+            It is a tuple of 2 tensors.
+            states[0] is the hidden state, with shape of (1, N, d_model);
+            states[1] is the cell state, with shape of (1, N, rnn_hidden_size).
         """
         assert not self.training
-        assert states.shape == (2, 1, src.size(1), src.size(2))
+        assert len(states) == 2
+        # for hidden state
+        assert states[0].shape == (1, src.size(1), self.d_model)
+        # for cell state
+        assert states[1].shape == (1, src.size(1), self.rnn_hidden_size)
 
         # lstm module
-        # The required shapes of h_0 and c_0 are both (1, N, E).
-        src_lstm, new_states = self.lstm(src, (states[0], states[1]))
-        new_states = torch.stack(new_states, dim=0)
+        src_lstm, new_states = self.lstm(src, states)
         src = src + self.dropout(src_lstm)
 
         # feed forward module
@@ -333,6 +370,8 @@ def __init__(
             [copy.deepcopy(encoder_layer) for i in range(num_layers)]
         )
         self.num_layers = num_layers
+        self.d_model = encoder_layer.d_model
+        self.rnn_hidden_size = encoder_layer.rnn_hidden_size
 
         self.use_random_combiner = False
         if aux_layers is not None:
@@ -377,34 +416,55 @@ def forward(self, src: torch.Tensor, warmup: float = 1.0) -> torch.Tensor:
 
     @torch.jit.export
     def infer(
-        self, src: torch.Tensor, states: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        self, src: torch.Tensor, states: Tuple[torch.Tensor, torch.Tensor]
+    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
         """
         Pass the input through the encoder layer.
 
         Args:
           src:
             The sequence to the encoder layer (required).
-            Its shape is (S, N, E), where S is the sequence length,
-            N is the batch size, and E is the feature number.
+            Its shape is (S, N, d_model), where S is the sequence length,
+            N is the batch size.
           states:
-            Its shape is (2, num_layers, N, E).
-            states[0] and states[1] are cached hidden states and cell states for
-            all layers, respectively.
+            It is a tuple of 2 tensors.
+            states[0] is the hidden states of all layers,
+              with shape of (num_layers, N, d_model);
+            states[1] is the cell states of all layers,
+              with shape of (num_layers, N, rnn_hidden_size).
         """
         assert not self.training
-        assert states.shape == (2, self.num_layers, src.size(1), src.size(2))
+        assert len(states) == 2
+        # for hidden state
+        assert states[0].shape == (self.num_layers, src.size(1), self.d_model)
+        # for cell state
+        assert states[1].shape == (
+            self.num_layers,
+            src.size(1),
+            self.rnn_hidden_size,
+        )
 
-        new_states_list = []
         output = src
+        new_hidden_states = []
+        new_cell_states = []
         for layer_index, mod in enumerate(self.layers):
-            # new_states: (2, 1, N, E)
-            output, new_states = mod.infer(
-                output, states[:, layer_index : layer_index + 1, :, :]
+            layer_states = (
+                states[0][
+                    layer_index : layer_index + 1, :, :
+                ],  # h: (1, N, d_model)
+                states[1][
+                    layer_index : layer_index + 1, :, :
+                ],  # c: (1, N, rnn_hidden_size)
             )
-            new_states_list.append(new_states)
+            output, (h, c) = mod.infer(output, layer_states)
+            new_hidden_states.append(h)
+            new_cell_states.append(c)
 
-        return output, torch.cat(new_states_list, dim=1)
+        new_states = (
+            torch.cat(new_hidden_states, dim=0),
+            torch.cat(new_cell_states, dim=0),
+        )
+        return output, new_states
 
 
 class Conv2dSubsampling(nn.Module):
@@ -740,8 +800,14 @@ def _test_random_combine_main():
 
 
 if __name__ == "__main__":
-    feature_dim = 50
-    m = RNN(num_features=feature_dim, d_model=128)
+    feature_dim = 80
+    m = RNN(
+        num_features=feature_dim,
+        d_model=512,
+        rnn_hidden_size=1024,
+        dim_feedforward=2048,
+        num_encoder_layers=12,
+    )
     batch_size = 5
     seq_len = 20
     # Just make sure the forward pass runs.
@@ -750,5 +816,7 @@ def _test_random_combine_main():
         torch.full((batch_size,), seq_len, dtype=torch.int64),
         warmup=0.5,
     )
+    num_param = sum([p.numel() for p in m.parameters()])
+    print(f"Number of model parameters: {num_param}")
 
     _test_random_combine_main()
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/train.py b/egs/librispeech/ASR/lstm_transducer_stateless/train.py
index 0826c72e92..89bd406b19 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/train.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/train.py
@@ -90,16 +90,30 @@ def add_model_arguments(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--num-encoder-layers",
         type=int,
-        default=20,
+        default=12,
         help="Number of RNN encoder layers..",
     )
 
+    parser.add_argument(
+        "--encoder-dim",
+        type=int,
+        default=512,
+        help="Encoder output dimesion.",
+    )
+
+    parser.add_argument(
+        "--rnn-hidden-size",
+        type=int,
+        default=1024,
+        help="Hidden dim for LSTM layers.",
+    )
+
     parser.add_argument(
         "--aux-layer-period",
         type=int,
         default=3,
         help="""Peroid of auxiliary layers used for randomly combined during training.
-        If not larger than 0, will not use the random combiner.
+        If not larger than 0 (e.g., -1), will not use the random combiner.
         """,
     )
 
@@ -340,8 +354,6 @@ def get_params() -> AttributeDict:
 
         - subsampling_factor:  The subsampling factor for the model.
 
-        - encoder_dim: Hidden dim for multi-head attention model.
-
         - num_decoder_layers: Number of decoder layer of transformer decoder.
 
         - warm_step: The warm_step for Noam optimizer.
@@ -359,7 +371,6 @@ def get_params() -> AttributeDict:
             # parameters for conformer
             "feature_dim": 80,
             "subsampling_factor": 4,
-            "encoder_dim": 512,
             "dim_feedforward": 2048,
             # parameters for decoder
             "decoder_dim": 512,
@@ -380,6 +391,7 @@ def get_encoder_model(params: AttributeDict) -> nn.Module:
         num_features=params.feature_dim,
         subsampling_factor=params.subsampling_factor,
         d_model=params.encoder_dim,
+        rnn_hidden_size=params.rnn_hidden_size,
         dim_feedforward=params.dim_feedforward,
         num_encoder_layers=params.num_encoder_layers,
         aux_layer_period=params.aux_layer_period,
@@ -837,7 +849,7 @@ def run(rank, world_size, args):
     params = get_params()
     params.update(vars(args))
     if params.full_libri is False:
-        params.valid_interval = 1600
+        params.valid_interval = 800
 
     fix_random_seed(params.seed)
     if world_size > 1:
@@ -903,6 +915,10 @@ def run(rank, world_size, args):
         logging.info("Loading scheduler state dict")
         scheduler.load_state_dict(checkpoints["scheduler"])
 
+    # # overwrite it
+    # scheduler.base_lrs = [params.initial_lr for _ in scheduler.base_lrs]
+    # print(scheduler.base_lrs)
+
     if params.print_diagnostics:
         diagnostic = diagnostics.attach_diagnostics(model)
 
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py b/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
index 5f0785d913..560867c3b0 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
@@ -379,7 +379,7 @@ def forward(self, input: Tensor) -> Tensor:
 
 class ScaledLSTM(nn.LSTM):
     # See docs for ScaledLinear.
-    # This class implements single-layer LSTM with scaling mechanism, using `torch._VF.lstm`
+    # This class implements LSTM with scaling mechanism, using `torch._VF.lstm`
     # Please refer to https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/rnn.py
     def __init__(
         self,
@@ -388,10 +388,8 @@ def __init__(
         initial_speed: float = 1.0,
         **kwargs
     ):
-        # Hardcode num_layers=1, bidirectional=False, proj_size=0 here
-        super(ScaledLSTM, self).__init__(
-            *args, num_layers=1, bidirectional=False, proj_size=0, **kwargs
-        )
+        # Hardcode bidirectional=False
+        super(ScaledLSTM, self).__init__(*args, bidirectional=False, **kwargs)
         initial_scale = torch.tensor(initial_scale).log()
         self._scales_names = []
         self._scales = []
@@ -495,14 +493,14 @@ def forward(
         # self._flat_weights -> self._get_flat_weights()
         if hx is None:
             h_zeros = torch.zeros(
-                1,
+                self.num_layers,
                 input.size(1),
-                self.hidden_size,
+                self.proj_size if self.proj_size > 0 else self.hidden_size,
                 dtype=input.dtype,
                 device=input.device,
             )
             c_zeros = torch.zeros(
-                1,
+                self.num_layers,
                 input.size(1),
                 self.hidden_size,
                 dtype=input.dtype,

From 03b056ca3790c90c6212d2d2f84b27447322ded2 Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Fri, 5 Aug 2022 19:38:30 +0800
Subject: [PATCH 22/40] add scaling converter for ScaledLSTM

---
 .../scaling_converter.py                      |   1 +
 .../test_scaling_converter.py                 | 257 ++++++++++++++++++
 .../scaling_converter.py                      |  42 ++-
 3 files changed, 299 insertions(+), 1 deletion(-)
 create mode 120000 egs/librispeech/ASR/lstm_transducer_stateless/scaling_converter.py
 create mode 100644 egs/librispeech/ASR/lstm_transducer_stateless/test_scaling_converter.py

diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/scaling_converter.py b/egs/librispeech/ASR/lstm_transducer_stateless/scaling_converter.py
new file mode 120000
index 0000000000..3b667058d3
--- /dev/null
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/scaling_converter.py
@@ -0,0 +1 @@
+../pruned_transducer_stateless3/scaling_converter.py
\ No newline at end of file
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/test_scaling_converter.py b/egs/librispeech/ASR/lstm_transducer_stateless/test_scaling_converter.py
new file mode 100644
index 0000000000..b4bc8d91fa
--- /dev/null
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/test_scaling_converter.py
@@ -0,0 +1,257 @@
+#!/usr/bin/env python3
+# Copyright    2022  Xiaomi Corp.        (authors: Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+To run this file, do:
+
+    cd icefall/egs/librispeech/ASR
+    python ./lstm_transducer_stateless/test_scaling_converter.py
+"""
+
+import copy
+
+import torch
+from scaling import (
+    ScaledConv1d,
+    ScaledConv2d,
+    ScaledEmbedding,
+    ScaledLinear,
+    ScaledLSTM,
+)
+from scaling_converter import (
+    convert_scaled_to_non_scaled,
+    scaled_conv1d_to_conv1d,
+    scaled_conv2d_to_conv2d,
+    scaled_embedding_to_embedding,
+    scaled_linear_to_linear,
+    scaled_lstm_to_lstm,
+)
+from train import get_params, get_transducer_model
+
+
+def get_model():
+    params = get_params()
+    params.vocab_size = 500
+    params.blank_id = 0
+    params.context_size = 2
+    params.unk_id = 2
+    params.encoder_dim = 512
+    params.rnn_hidden_size = 1024
+    params.num_encoder_layers = 12
+    params.aux_layer_period = -1
+
+    model = get_transducer_model(params)
+    return model
+
+
+def test_scaled_linear_to_linear():
+    N = 5
+    in_features = 10
+    out_features = 20
+    for bias in [True, False]:
+        scaled_linear = ScaledLinear(
+            in_features=in_features,
+            out_features=out_features,
+            bias=bias,
+        )
+        linear = scaled_linear_to_linear(scaled_linear)
+        x = torch.rand(N, in_features)
+
+        y1 = scaled_linear(x)
+        y2 = linear(x)
+        assert torch.allclose(y1, y2)
+
+        jit_scaled_linear = torch.jit.script(scaled_linear)
+        jit_linear = torch.jit.script(linear)
+
+        y3 = jit_scaled_linear(x)
+        y4 = jit_linear(x)
+
+        assert torch.allclose(y3, y4)
+        assert torch.allclose(y1, y4)
+
+
+def test_scaled_conv1d_to_conv1d():
+    in_channels = 3
+    for bias in [True, False]:
+        scaled_conv1d = ScaledConv1d(
+            in_channels,
+            6,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=bias,
+        )
+
+        conv1d = scaled_conv1d_to_conv1d(scaled_conv1d)
+
+        x = torch.rand(20, in_channels, 10)
+        y1 = scaled_conv1d(x)
+        y2 = conv1d(x)
+        assert torch.allclose(y1, y2)
+
+        jit_scaled_conv1d = torch.jit.script(scaled_conv1d)
+        jit_conv1d = torch.jit.script(conv1d)
+
+        y3 = jit_scaled_conv1d(x)
+        y4 = jit_conv1d(x)
+
+        assert torch.allclose(y3, y4)
+        assert torch.allclose(y1, y4)
+
+
+def test_scaled_conv2d_to_conv2d():
+    in_channels = 1
+    for bias in [True, False]:
+        scaled_conv2d = ScaledConv2d(
+            in_channels=in_channels,
+            out_channels=3,
+            kernel_size=3,
+            padding=1,
+            bias=bias,
+        )
+
+        conv2d = scaled_conv2d_to_conv2d(scaled_conv2d)
+
+        x = torch.rand(20, in_channels, 10, 20)
+        y1 = scaled_conv2d(x)
+        y2 = conv2d(x)
+        assert torch.allclose(y1, y2)
+
+        jit_scaled_conv2d = torch.jit.script(scaled_conv2d)
+        jit_conv2d = torch.jit.script(conv2d)
+
+        y3 = jit_scaled_conv2d(x)
+        y4 = jit_conv2d(x)
+
+        assert torch.allclose(y3, y4)
+        assert torch.allclose(y1, y4)
+
+
+def test_scaled_embedding_to_embedding():
+    scaled_embedding = ScaledEmbedding(
+        num_embeddings=500,
+        embedding_dim=10,
+        padding_idx=0,
+    )
+    embedding = scaled_embedding_to_embedding(scaled_embedding)
+
+    for s in [10, 100, 300, 500, 800, 1000]:
+        x = torch.randint(low=0, high=500, size=(s,))
+        scaled_y = scaled_embedding(x)
+        y = embedding(x)
+        assert torch.equal(scaled_y, y)
+
+
+def test_scaled_lstm_to_lstm():
+    input_size = 512
+    batch_size = 20
+    for bias in [True, False]:
+        for hidden_size in [512, 1024]:
+            scaled_lstm = ScaledLSTM(
+                input_size=input_size,
+                hidden_size=hidden_size,
+                num_layers=1,
+                bias=bias,
+                proj_size=0 if hidden_size == input_size else input_size,
+            )
+
+            lstm = scaled_lstm_to_lstm(scaled_lstm)
+
+            x = torch.rand(200, batch_size, input_size)
+            h0 = torch.randn(1, batch_size, input_size)
+            c0 = torch.randn(1, batch_size, hidden_size)
+
+            y1, (h1, c1) = scaled_lstm(x, (h0, c0))
+            y2, (h2, c2) = lstm(x, (h0, c0))
+            assert torch.allclose(y1, y2)
+            assert torch.allclose(h1, h2)
+            assert torch.allclose(c1, c2)
+
+            jit_scaled_lstm = torch.jit.trace(lstm, (x, (h0, c0)))
+            y3, (h3, c3) = jit_scaled_lstm(x, (h0, c0))
+            assert torch.allclose(y1, y3)
+            assert torch.allclose(h1, h3)
+            assert torch.allclose(c1, c3)
+
+
+def test_convert_scaled_to_non_scaled():
+    """Compare each sub-module of the original and converted model."""
+    for inplace in [False, True]:
+        model = get_model()
+        model.eval()
+
+        orig_model = copy.deepcopy(model)
+        converted_model = convert_scaled_to_non_scaled(model, inplace=inplace)
+        # `model` may have been modified when inplace=True; use the copy
+        model = orig_model
+
+        # test encoder
+        N = 2
+        T = 100
+        vocab_size = model.decoder.vocab_size
+
+        x = torch.randn(N, T, 80, dtype=torch.float32)
+        x_lens = torch.full((N,), x.size(1))
+
+        e1, e1_lens = model.encoder(x, x_lens)
+        e2, e2_lens = converted_model.encoder(x, x_lens)
+
+        assert torch.all(torch.eq(e1_lens, e2_lens))
+        assert torch.allclose(e1, e2), (e1 - e2).abs().max()
+
+        # test decoder (original vs. converted, not original vs. itself)
+        U = 50
+        y = torch.randint(low=1, high=vocab_size - 1, size=(N, U))
+
+        d1 = model.decoder(y)
+        d2 = converted_model.decoder(y)
+
+        assert torch.allclose(d1, d2)
+
+        # test simple projection
+        lm1 = model.simple_lm_proj(d1)
+        am1 = model.simple_am_proj(e1)
+
+        lm2 = converted_model.simple_lm_proj(d2)
+        am2 = converted_model.simple_am_proj(e2)
+
+        assert torch.allclose(lm1, lm2)
+        assert torch.allclose(am1, am2)
+
+        # test joiner
+        e = torch.rand(2, 3, 4, 512)
+        d = torch.rand(2, 3, 4, 512)
+
+        j1 = model.joiner(e, d)
+        j2 = converted_model.joiner(e, d)
+        assert torch.allclose(j1, j2)
+
+
+@torch.no_grad()
+def main():
+    test_scaled_linear_to_linear()
+    test_scaled_conv1d_to_conv1d()
+    test_scaled_conv2d_to_conv2d()
+    test_scaled_embedding_to_embedding()
+    test_scaled_lstm_to_lstm()
+    test_convert_scaled_to_non_scaled()
+
+
+if __name__ == "__main__":
+    torch.manual_seed(20220730)
+    main()
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless3/scaling_converter.py b/egs/librispeech/ASR/pruned_transducer_stateless3/scaling_converter.py
index 79b178421a..e82c52e6f0 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless3/scaling_converter.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless3/scaling_converter.py
@@ -28,7 +28,13 @@
 
 import torch
 import torch.nn as nn
-from scaling import ScaledConv1d, ScaledConv2d, ScaledEmbedding, ScaledLinear
+from scaling import (
+    ScaledConv1d,
+    ScaledConv2d,
+    ScaledEmbedding,
+    ScaledLinear,
+    ScaledLSTM,
+)
 
 
 def scaled_linear_to_linear(scaled_linear: ScaledLinear) -> nn.Linear:
@@ -164,6 +170,38 @@ def scaled_embedding_to_embedding(
     return embedding
 
 
+def scaled_lstm_to_lstm(scaled_lstm: ScaledLSTM):
+    """Convert an instance of ScaledLSTM to nn.LSTM.
+
+    Args:
+      scaled_lstm:
+        The layer to be converted.
+    Returns:
+      Return an instance of nn.LSTM that has the same `forward()` behavior
+      of the given `scaled_lstm`.
+    """
+    assert isinstance(scaled_lstm, ScaledLSTM), type(scaled_lstm)
+    lstm = nn.LSTM(
+        input_size=scaled_lstm.input_size,
+        hidden_size=scaled_lstm.hidden_size,
+        num_layers=scaled_lstm.num_layers,
+        bias=scaled_lstm.bias,
+        batch_first=scaled_lstm.batch_first,
+        dropout=scaled_lstm.dropout,
+        bidirectional=scaled_lstm.bidirectional,
+        proj_size=scaled_lstm.proj_size,
+    )
+
+    assert lstm._flat_weights_names == scaled_lstm._flat_weights_names
+    for idx in range(len(scaled_lstm._flat_weights_names)):
+        scaled_weight = (
+            scaled_lstm._flat_weights[idx] * scaled_lstm._scales[idx].exp()
+        )
+        lstm._flat_weights[idx].data.copy_(scaled_weight)
+
+    return lstm
+
+
 def convert_scaled_to_non_scaled(model: nn.Module, inplace: bool = False):
     """Convert `ScaledLinear`, `ScaledConv1d`, and `ScaledConv2d`
     in the given modle to their unscaled version `nn.Linear`, `nn.Conv1d`,
@@ -196,6 +234,8 @@ def convert_scaled_to_non_scaled(model: nn.Module, inplace: bool = False):
             d[name] = scaled_conv2d_to_conv2d(m)
         elif isinstance(m, ScaledEmbedding):
             d[name] = scaled_embedding_to_embedding(m)
+        elif isinstance(m, ScaledLSTM):
+            d[name] = scaled_lstm_to_lstm(m)
 
     for k, v in d.items():
         if "." in k:

From 45c7894111c1881b976e5f70e32536b3e86181ab Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Tue, 9 Aug 2022 19:29:21 +0800
Subject: [PATCH 23/40] support jit trace

---
 .../ASR/lstm_transducer_stateless/export.py   | 581 ++++++++++++++++++
 .../jit_pretrained.py                         | 319 ++++++++++
 .../ASR/lstm_transducer_stateless/lstm.py     | 295 ++++-----
 .../test_scaling_converter.py                 |   2 +-
 .../pruned_transducer_stateless2/scaling.py   |   5 +-
 .../scaling_converter.py                      |   2 +-
 6 files changed, 1032 insertions(+), 172 deletions(-)
 create mode 100755 egs/librispeech/ASR/lstm_transducer_stateless/export.py
 create mode 100755 egs/librispeech/ASR/lstm_transducer_stateless/jit_pretrained.py

diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/export.py b/egs/librispeech/ASR/lstm_transducer_stateless/export.py
new file mode 100755
index 0000000000..6a9b52caa5
--- /dev/null
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/export.py
@@ -0,0 +1,581 @@
+#!/usr/bin/env python3
+#
+# Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script converts several saved checkpoints
+# to a single one using model averaging.
+"""
+
+Usage:
+
+(1) Export to torchscript model using torch.jit.trace()
+
+./lstm_transducer_stateless/export.py \
+  --exp-dir ./lstm_transducer_stateless/exp \
+  --bpe-model data/lang_bpe_500/bpe.model \
+  --epoch 20 \
+  --avg 10 \
+  --jit-trace 1
+
+It will generate 3 files: `encoder_jit_trace.pt`,
+`decoder_jit_trace.pt`, and `joiner_jit_trace.pt`.
+
+
+(2) Export to ONNX format
+
+./lstm_transducer_stateless/export.py \
+  --exp-dir ./lstm_transducer_stateless/exp \
+  --bpe-model data/lang_bpe_500/bpe.model \
+  --epoch 20 \
+  --avg 10 \
+  --onnx 1
+
+It will generate the following three files in the given `exp_dir`.
+Check `onnx_check.py` for how to use them.
+
+    - encoder.onnx
+    - decoder.onnx
+    - joiner.onnx
+
+
+(3) Export `model.state_dict()`
+
+./lstm_transducer_stateless/export.py \
+  --exp-dir ./lstm_transducer_stateless/exp \
+  --bpe-model data/lang_bpe_500/bpe.model \
+  --epoch 20 \
+  --avg 10
+
+It will generate a file `pretrained.pt` in the given `exp_dir`. You can later
+load it by `icefall.checkpoint.load_checkpoint()`.
+
+To use the generated file with `lstm_transducer_stateless/decode.py`,
+you can do:
+
+    cd /path/to/exp_dir
+    ln -s pretrained.pt epoch-9999.pt
+
+    cd /path/to/egs/librispeech/ASR
+    ./lstm_transducer_stateless/decode.py \
+        --exp-dir ./lstm_transducer_stateless/exp \
+        --epoch 9999 \
+        --avg 1 \
+        --max-duration 600 \
+        --decoding-method greedy_search \
+        --bpe-model data/lang_bpe_500/bpe.model
+
+Check ./pretrained.py for its usage.
+
+Note: If you don't want to train a model from scratch, we have
+provided one for you. You can get it at
+
+https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13
+
+with the following commands:
+
+    sudo apt-get install git-lfs
+    git lfs install
+    git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13
+    # You will find the pre-trained model in icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp
+"""
+
+import argparse
+import logging
+from pathlib import Path
+
+import onnx
+import sentencepiece as spm
+import torch
+import torch.nn as nn
+from scaling_converter import convert_scaled_to_non_scaled
+from train import add_model_arguments, get_params, get_transducer_model
+
+from icefall.checkpoint import (
+    average_checkpoints,
+    find_checkpoints,
+    load_checkpoint,
+)
+from icefall.utils import str2bool
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--epoch",
+        type=int,
+        default=28,
+        help="""It specifies the checkpoint to use for averaging.
+        Note: Epoch counts from 0.
+        You can specify --avg to use more checkpoints for model averaging.""",
+    )
+
+    parser.add_argument(
+        "--iter",
+        type=int,
+        default=0,
+        help="""If positive, --epoch is ignored and it
+        will use the checkpoint exp_dir/checkpoint-iter.pt.
+        You can specify --avg to use more checkpoints for model averaging.
+        """,
+    )
+
+    parser.add_argument(
+        "--avg",
+        type=int,
+        default=15,
+        help="Number of checkpoints to average. Automatically select "
+        "consecutive checkpoints before the checkpoint specified by "
+        "'--epoch' and '--iter'",
+    )
+
+    parser.add_argument(
+        "--exp-dir",
+        type=str,
+        default="pruned_transducer_stateless3/exp",
+        help="""It specifies the directory where all training related
+        files, e.g., checkpoints, log, etc, are saved
+        """,
+    )
+
+    parser.add_argument(
+        "--bpe-model",
+        type=str,
+        default="data/lang_bpe_500/bpe.model",
+        help="Path to the BPE model",
+    )
+
+    parser.add_argument(
+        "--jit-trace",
+        type=str2bool,
+        default=False,
+        help="""True to save a model after applying torch.jit.trace.
+        It will generate 3 files:
+         - encoder_jit_trace.pt
+         - decoder_jit_trace.pt
+         - joiner_jit_trace.pt
+
+        Check ./jit_pretrained.py for how to use them.
+        """,
+    )
+
+    parser.add_argument(
+        "--onnx",
+        type=str2bool,
+        default=False,
+        help="""If True, --jit is ignored and it exports the model
+        to onnx format. Three files will be generated:
+
+            - encoder.onnx
+            - decoder.onnx
+            - joiner.onnx
+
+        Check ./onnx_check.py and ./onnx_pretrained.py for how to use them.
+        """,
+    )
+
+    parser.add_argument(
+        "--context-size",
+        type=int,
+        default=2,
+        help="The context size in the decoder. 1 means bigram; "
+        "2 means tri-gram",
+    )
+
+    add_model_arguments(parser)
+
+    return parser
+
+
+def export_encoder_model_jit_trace(
+    encoder_model: nn.Module,
+    encoder_filename: str,
+) -> None:
+    """Export the given encoder model with torch.jit.trace()
+
+    Note: The warmup argument is fixed to 1.
+
+    Args:
+      encoder_model:
+        The input encoder model
+      encoder_filename:
+        The filename to save the exported model.
+    """
+    x = torch.zeros(1, 100, 80, dtype=torch.float32)
+    x_lens = torch.tensor([100], dtype=torch.int64)
+    states = encoder_model.get_init_states()
+    states = (states[0].unsqueeze(1), states[1].unsqueeze(1))
+
+    traced_model = torch.jit.trace(encoder_model, (x, x_lens, states))
+    traced_model.save(encoder_filename)
+    logging.info(f"Saved to {encoder_filename}")
+
+
+def export_decoder_model_jit_trace(
+    decoder_model: nn.Module,
+    decoder_filename: str,
+) -> None:
+    """Export the given decoder model with torch.jit.trace()
+
+    Note: The argument need_pad is fixed to False.
+
+    Args:
+      decoder_model:
+        The input decoder model
+      decoder_filename:
+        The filename to save the exported model.
+    """
+    y = torch.zeros(10, decoder_model.context_size, dtype=torch.int64)
+    need_pad = torch.tensor([False])
+
+    traced_model = torch.jit.trace(decoder_model, (y, need_pad))
+    traced_model.save(decoder_filename)
+    logging.info(f"Saved to {decoder_filename}")
+
+
+def export_joiner_model_jit_trace(
+    joiner_model: nn.Module,
+    joiner_filename: str,
+) -> None:
+    """Export the given joiner model with torch.jit.trace()
+
+    Note: The argument project_input is fixed to True. A user should not
+    project the encoder_out/decoder_out by himself/herself. The exported joiner
+    will do that for the user.
+
+    Args:
+      joiner_model:
+        The input joiner model
+      joiner_filename:
+        The filename to save the exported model.
+
+    """
+    encoder_out_dim = joiner_model.encoder_proj.weight.shape[1]
+    decoder_out_dim = joiner_model.decoder_proj.weight.shape[1]
+    encoder_out = torch.rand(1, encoder_out_dim, dtype=torch.float32)
+    decoder_out = torch.rand(1, decoder_out_dim, dtype=torch.float32)
+
+    traced_model = torch.jit.trace(joiner_model, (encoder_out, decoder_out))
+    traced_model.save(joiner_filename)
+    logging.info(f"Saved to {joiner_filename}")
+
+
+def export_encoder_model_onnx(
+    encoder_model: nn.Module,
+    encoder_filename: str,
+    opset_version: int = 11,
+) -> None:
+    """Export the given encoder model to ONNX format.
+    The exported model is given the following inputs:
+
+        - x, a tensor of shape (N, T, C); dtype is torch.float32
+        - x_lens, a tensor of shape (N,); dtype is torch.int64
+        - hidden_states, cell_states: the encoder states; axis 1 is N
+
+    and it has the following outputs:
+        - encoder_out (N, T, C) and encoder_out_lens (N,)
+        - new_hidden_states, new_cell_states; axis 1 is N
+
+    Note: The warmup argument is fixed to 1.
+
+    Args:
+      encoder_model:
+        The input encoder model
+      encoder_filename:
+        The filename to save the exported ONNX model.
+      opset_version:
+        The opset version to use.
+    """
+    x = torch.zeros(1, 100, 80, dtype=torch.float32)
+    x_lens = torch.tensor([100], dtype=torch.int64)
+    states = encoder_model.get_init_states()
+    hidden_states = states[0].unsqueeze(1)
+    cell_states = states[1].unsqueeze(1)
+    #  encoder_model = torch.jit.script(encoder_model)
+    # It throws the following error for the above statement
+    #
+    # RuntimeError: Exporting the operator __is_ to ONNX opset version
+    # 11 is not supported. Please feel free to request support or
+    # submit a pull request on PyTorch GitHub.
+    #
+    # I cannot find which statement causes the above error.
+    # torch.onnx.export() will use torch.jit.trace() internally, which
+    # works well for the current reworked model
+    warmup = 1.0
+    torch.onnx.export(
+        encoder_model,
+        (x, x_lens, (hidden_states, cell_states), warmup),
+        encoder_filename,
+        verbose=False,
+        opset_version=opset_version,
+        input_names=["x", "x_lens", "hidden_states", "cell_states", "warmup"],
+        output_names=[
+            "encoder_out",
+            "encoder_out_lens",
+            "new_hidden_states",
+            "new_cell_states",
+        ],
+        dynamic_axes={
+            "x": {0: "N", 1: "T"},
+            "x_lens": {0: "N"},
+            "hidden_states": {1: "N"},
+            "cell_states": {1: "N"},
+            "encoder_out": {0: "N", 1: "T"},
+            "encoder_out_lens": {0: "N"},
+            "new_hidden_states": {1: "N"},
+            "new_cell_states": {1: "N"},
+        },
+    )
+    logging.info(f"Saved to {encoder_filename}")
+
+
+def export_decoder_model_onnx(
+    decoder_model: nn.Module,
+    decoder_filename: str,
+    opset_version: int = 11,
+) -> None:
+    """Export the decoder model to ONNX format.
+
+    The exported model has one input:
+
+        - y: a torch.int64 tensor of shape (N, decoder_model.context_size)
+
+    and has one output:
+
+        - decoder_out: a torch.float32 tensor of shape (N, 1, C)
+
+    Note: The argument need_pad is fixed to False.
+
+    Args:
+      decoder_model:
+        The decoder model to be exported.
+      decoder_filename:
+        Filename to save the exported ONNX model.
+      opset_version:
+        The opset version to use.
+    """
+    y = torch.zeros(10, decoder_model.context_size, dtype=torch.int64)
+    need_pad = False  # Always False, so we can use torch.jit.trace() here
+    # Note(fangjun): torch.jit.trace() is more efficient than torch.jit.script()
+    # in this case
+    torch.onnx.export(
+        decoder_model,
+        (y, need_pad),
+        decoder_filename,
+        verbose=False,
+        opset_version=opset_version,
+        input_names=["y", "need_pad"],
+        output_names=["decoder_out"],
+        dynamic_axes={
+            "y": {0: "N"},
+            "decoder_out": {0: "N"},
+        },
+    )
+    logging.info(f"Saved to {decoder_filename}")
+
+
+def export_joiner_model_onnx(
+    joiner_model: nn.Module,
+    joiner_filename: str,
+    opset_version: int = 11,
+) -> None:
+    """Export the joiner model to ONNX format.
+    The exported model has two inputs:
+
+        - encoder_out: a tensor of shape (N, encoder_out_dim)
+        - decoder_out: a tensor of shape (N, decoder_out_dim)
+
+    and has one output:
+
+        - joiner_out: a tensor of shape (N, vocab_size)
+
+    Note: The argument project_input is fixed to True. A user should not
+    project the encoder_out/decoder_out by himself/herself. The exported joiner
+    will do that for the user.
+    """
+    encoder_out_dim = joiner_model.encoder_proj.weight.shape[1]
+    decoder_out_dim = joiner_model.decoder_proj.weight.shape[1]
+    encoder_out = torch.rand(1, encoder_out_dim, dtype=torch.float32)
+    decoder_out = torch.rand(1, decoder_out_dim, dtype=torch.float32)
+
+    project_input = True
+    # Note: It uses torch.jit.trace() internally
+    torch.onnx.export(
+        joiner_model,
+        (encoder_out, decoder_out, project_input),
+        joiner_filename,
+        verbose=False,
+        opset_version=opset_version,
+        input_names=["encoder_out", "decoder_out", "project_input"],
+        output_names=["logit"],
+        dynamic_axes={
+            "encoder_out": {0: "N"},
+            "decoder_out": {0: "N"},
+            "logit": {0: "N"},
+        },
+    )
+    logging.info(f"Saved to {joiner_filename}")
+
+
+def export_all_in_one_onnx(
+    encoder_filename: str,
+    decoder_filename: str,
+    joiner_filename: str,
+    all_in_one_filename: str,
+):
+    encoder_onnx = onnx.load(encoder_filename)
+    decoder_onnx = onnx.load(decoder_filename)
+    joiner_onnx = onnx.load(joiner_filename)
+
+    encoder_onnx = onnx.compose.add_prefix(encoder_onnx, prefix="encoder/")
+    decoder_onnx = onnx.compose.add_prefix(decoder_onnx, prefix="decoder/")
+    joiner_onnx = onnx.compose.add_prefix(joiner_onnx, prefix="joiner/")
+
+    combined_model = onnx.compose.merge_models(
+        encoder_onnx, decoder_onnx, io_map={}
+    )
+    combined_model = onnx.compose.merge_models(
+        combined_model, joiner_onnx, io_map={}
+    )
+    onnx.save(combined_model, all_in_one_filename)
+    logging.info(f"Saved to {all_in_one_filename}")
+
+
+@torch.no_grad()
+def main():
+    """Load (averaged) checkpoints and export the model."""
+    args = get_parser().parse_args()
+    args.exp_dir = Path(args.exp_dir)
+
+    params = get_params()
+    params.update(vars(args))
+
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda", 0)
+
+    logging.info(f"device: {device}")
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(params.bpe_model)
+
+    # <blk> is defined in local/train_bpe_model.py
+    params.blank_id = sp.piece_to_id("<blk>")
+    params.vocab_size = sp.get_piece_size()
+
+    logging.info(params)
+
+    logging.info("About to create model")
+    model = get_transducer_model(params)
+
+    model.to(device)
+
+    if params.iter > 0:
+        filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+            : params.avg
+        ]
+        if len(filenames) == 0:
+            raise ValueError(
+                f"No checkpoints found for"
+                f" --iter {params.iter}, --avg {params.avg}"
+            )
+        elif len(filenames) < params.avg:
+            raise ValueError(
+                f"Not enough checkpoints ({len(filenames)}) found for"
+                f" --iter {params.iter}, --avg {params.avg}"
+            )
+        logging.info(f"averaging {filenames}")
+        model.load_state_dict(
+            average_checkpoints(filenames, device=device), strict=False
+        )
+    elif params.avg == 1:
+        load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
+    else:
+        start = params.epoch - params.avg + 1
+        filenames = []
+        # start is negative if --avg > --epoch + 1; skip epochs below 0
+        for i in range(start, params.epoch + 1):
+            if i >= 0:
+                filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
+        logging.info(f"averaging {filenames}")
+        model.load_state_dict(
+            average_checkpoints(filenames, device=device), strict=False
+        )
+
+    model.to("cpu")
+    model.eval()
+    convert_scaled_to_non_scaled(model, inplace=True)
+
+    if params.onnx is True:
+        opset_version = 11
+        logging.info("Exporting to onnx format")
+        encoder_filename = params.exp_dir / "encoder.onnx"
+        export_encoder_model_onnx(
+            model.encoder,
+            encoder_filename,
+            opset_version=opset_version,
+        )
+
+        decoder_filename = params.exp_dir / "decoder.onnx"
+        export_decoder_model_onnx(
+            model.decoder,
+            decoder_filename,
+            opset_version=opset_version,
+        )
+
+        joiner_filename = params.exp_dir / "joiner.onnx"
+        export_joiner_model_onnx(
+            model.joiner,
+            joiner_filename,
+            opset_version=opset_version,
+        )
+
+        all_in_one_filename = params.exp_dir / "all_in_one.onnx"
+        export_all_in_one_onnx(
+            encoder_filename,
+            decoder_filename,
+            joiner_filename,
+            all_in_one_filename,
+        )
+    elif params.jit_trace is True:
+        logging.info("Using torch.jit.trace()")
+        encoder_filename = params.exp_dir / "encoder_jit_trace.pt"
+        export_encoder_model_jit_trace(model.encoder, encoder_filename)
+
+        decoder_filename = params.exp_dir / "decoder_jit_trace.pt"
+        export_decoder_model_jit_trace(model.decoder, decoder_filename)
+
+        joiner_filename = params.exp_dir / "joiner_jit_trace.pt"
+        export_joiner_model_jit_trace(model.joiner, joiner_filename)
+    else:
+        logging.info("Not using torchscript")
+        # Save it using a format so that it can be loaded
+        # by :func:`load_checkpoint`
+        filename = params.exp_dir / "pretrained.pt"
+        torch.save({"model": model.state_dict()}, str(filename))
+        logging.info(f"Saved to {filename}")
+
+
+if __name__ == "__main__":
+    formatter = (
+        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    )
+
+    logging.basicConfig(format=formatter, level=logging.INFO)
+    main()
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/jit_pretrained.py b/egs/librispeech/ASR/lstm_transducer_stateless/jit_pretrained.py
new file mode 100755
index 0000000000..eeb7dadade
--- /dev/null
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/jit_pretrained.py
@@ -0,0 +1,319 @@
+#!/usr/bin/env python3
+# Copyright      2022  Xiaomi Corp.        (authors: Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This script loads torchscript models, either exported by `torch.jit.trace()`
+or by `torch.jit.script()`, and uses them to decode waves.
+You can use the following command to get the exported models:
+
+./lstm_transducer_stateless/export.py \
+  --exp-dir ./lstm_transducer_stateless/exp \
+  --bpe-model data/lang_bpe_500/bpe.model \
+  --epoch 20 \
+  --avg 10 \
+  --jit-trace 1
+
+Usage of this script:
+
+./lstm_transducer_stateless/jit_pretrained.py \
+  --encoder-model-filename ./lstm_transducer_stateless/exp/encoder_jit_trace.pt \
+  --decoder-model-filename ./lstm_transducer_stateless/exp/decoder_jit_trace.pt \
+  --joiner-model-filename ./lstm_transducer_stateless/exp/joiner_jit_trace.pt \
+  --bpe-model ./data/lang_bpe_500/bpe.model \
+  /path/to/foo.wav \
+  /path/to/bar.wav
+"""
+
+import argparse
+import logging
+import math
+from typing import List
+
+import kaldifeat
+import sentencepiece as spm
+import torch
+import torchaudio
+from torch.nn.utils.rnn import pad_sequence
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--encoder-model-filename",
+        type=str,
+        required=True,
+        help="Path to the encoder torchscript model. ",
+    )
+
+    parser.add_argument(
+        "--decoder-model-filename",
+        type=str,
+        required=True,
+        help="Path to the decoder torchscript model. ",
+    )
+
+    parser.add_argument(
+        "--joiner-model-filename",
+        type=str,
+        required=True,
+        help="Path to the joiner torchscript model. ",
+    )
+
+    parser.add_argument(
+        "--bpe-model",
+        type=str,
+        help="""Path to bpe.model.""",
+    )
+
+    parser.add_argument(
+        "sound_files",
+        type=str,
+        nargs="+",
+        help="The input sound file(s) to transcribe. "
+        "Supported formats are those supported by torchaudio.load(). "
+        "For example, wav and flac are supported. "
+        "The sample rate has to be 16kHz.",
+    )
+
+    parser.add_argument(
+        "--sample-rate",
+        type=int,
+        default=16000,
+        help="The sample rate of the input sound file",
+    )
+
+    parser.add_argument(
+        "--context-size",
+        type=int,
+        default=2,
+        help="Context size of the decoder model",
+    )
+
+    return parser
+
+
+def read_sound_files(
+    filenames: List[str], expected_sample_rate: float
+) -> List[torch.Tensor]:
+    """Read a list of sound files into a list of 1-D float32 torch tensors.
+    Args:
+      filenames:
+        A list of sound filenames.
+      expected_sample_rate:
+        The expected sample rate of the sound files.
+    Returns:
+      Return a list of 1-D float32 torch tensors.
+    """
+    ans = []
+    for f in filenames:
+        wave, sample_rate = torchaudio.load(f)
+        assert sample_rate == expected_sample_rate, (
+            f"expected sample rate: {expected_sample_rate}. "
+            f"Given: {sample_rate}"
+        )
+        # We use only the first channel
+        ans.append(wave[0])
+    return ans
+
+
+def greedy_search(
+    decoder: torch.jit.ScriptModule,
+    joiner: torch.jit.ScriptModule,
+    encoder_out: torch.Tensor,
+    encoder_out_lens: torch.Tensor,
+    context_size: int,
+) -> List[List[int]]:
+    """Greedy search in batch mode. It hardcodes --max-sym-per-frame=1.
+    Args:
+      decoder:
+        The decoder model.
+      joiner:
+        The joiner model.
+      encoder_out:
+        A 3-D tensor of shape (N, T, C)
+      encoder_out_lens:
+        A 1-D tensor of shape (N,).
+      context_size:
+        The context size of the decoder model.
+    Returns:
+      Return the decoded results for each utterance.
+    """
+    assert encoder_out.ndim == 3
+    assert encoder_out.size(0) >= 1, encoder_out.size(0)
+
+    packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence(
+        input=encoder_out,
+        lengths=encoder_out_lens.cpu(),
+        batch_first=True,
+        enforce_sorted=False,
+    )
+
+    device = encoder_out.device
+    blank_id = 0  # hard-code to 0
+
+    batch_size_list = packed_encoder_out.batch_sizes.tolist()
+    N = encoder_out.size(0)
+
+    assert torch.all(encoder_out_lens > 0), encoder_out_lens
+    assert N == batch_size_list[0], (N, batch_size_list)
+
+    hyps = [[blank_id] * context_size for _ in range(N)]
+
+    decoder_input = torch.tensor(
+        hyps,
+        device=device,
+        dtype=torch.int64,
+    )  # (N, context_size)
+
+    decoder_out = decoder(
+        decoder_input,
+        need_pad=torch.tensor([False]),
+    ).squeeze(1)
+
+    offset = 0
+    for batch_size in batch_size_list:
+        start = offset
+        end = offset + batch_size
+        current_encoder_out = packed_encoder_out.data[start:end]
+        current_encoder_out = current_encoder_out
+        # current_encoder_out's shape: (batch_size, encoder_out_dim)
+        offset = end
+
+        decoder_out = decoder_out[:batch_size]
+
+        logits = joiner(
+            current_encoder_out,
+            decoder_out,
+        )
+        # logits' shape: (batch_size, vocab_size)
+
+        assert logits.ndim == 2, logits.shape
+        y = logits.argmax(dim=1).tolist()
+        emitted = False
+        for i, v in enumerate(y):
+            if v != blank_id:
+                hyps[i].append(v)
+                emitted = True
+        if emitted:
+            # update decoder output
+            decoder_input = [h[-context_size:] for h in hyps[:batch_size]]
+            decoder_input = torch.tensor(
+                decoder_input,
+                device=device,
+                dtype=torch.int64,
+            )
+            decoder_out = decoder(
+                decoder_input,
+                need_pad=torch.tensor([False]),
+            )
+            decoder_out = decoder_out.squeeze(1)
+
+    sorted_ans = [h[context_size:] for h in hyps]
+    ans = []
+    unsorted_indices = packed_encoder_out.unsorted_indices.tolist()
+    for i in range(N):
+        ans.append(sorted_ans[unsorted_indices[i]])
+
+    return ans
+
+
+@torch.no_grad()
+def main():
+    parser = get_parser()
+    args = parser.parse_args()
+    logging.info(vars(args))
+
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda", 0)
+
+    logging.info(f"device: {device}")
+
+    encoder = torch.jit.load(args.encoder_model_filename)
+    decoder = torch.jit.load(args.decoder_model_filename)
+    joiner = torch.jit.load(args.joiner_model_filename)
+
+    encoder.eval()
+    decoder.eval()
+    joiner.eval()
+
+    encoder.to(device)
+    decoder.to(device)
+    joiner.to(device)
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(args.bpe_model)
+
+    logging.info("Constructing Fbank computer")
+    opts = kaldifeat.FbankOptions()
+    opts.device = device
+    opts.frame_opts.dither = 0
+    opts.frame_opts.snip_edges = False
+    opts.frame_opts.samp_freq = args.sample_rate
+    opts.mel_opts.num_bins = 80
+
+    fbank = kaldifeat.Fbank(opts)
+
+    logging.info(f"Reading sound files: {args.sound_files}")
+    waves = read_sound_files(
+        filenames=args.sound_files,
+        expected_sample_rate=args.sample_rate,
+    )
+    waves = [w.to(device) for w in waves]
+
+    logging.info("Decoding started")
+    features = fbank(waves)
+    feature_lengths = [f.size(0) for f in features]
+
+    features = pad_sequence(
+        features,
+        batch_first=True,
+        padding_value=math.log(1e-10),
+    )
+
+    feature_lengths = torch.tensor(feature_lengths, device=device)
+
+    encoder_out, encoder_out_lens, _ = encoder(
+        x=features,
+        x_lens=feature_lengths,
+    )
+
+    hyps = greedy_search(
+        decoder=decoder,
+        joiner=joiner,
+        encoder_out=encoder_out,
+        encoder_out_lens=encoder_out_lens,
+        context_size=args.context_size,
+    )
+    s = "\n"
+    for filename, hyp in zip(args.sound_files, hyps):
+        words = sp.decode(hyp)
+        s += f"{filename}:\n{words}\n\n"
+    logging.info(s)
+
+    logging.info("Decoding Done")
+
+
+if __name__ == "__main__":
+    formatter = (
+        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    )
+
+    logging.basicConfig(format=formatter, level=logging.INFO)
+    main()
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py b/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
index 1d6fda0b49..7eef94feae 100644
--- a/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
@@ -51,7 +51,8 @@ class RNN(EncoderInterface):
         Dropout value for model-level warmup (default=0.075).
       aux_layer_period (int):
         Peroid of auxiliary layers used for randomly combined during training.
-        If not larger than 0, will not use the random combiner.
+        If set to 0, will not use the random combiner (Default).
+        You can set a positive integer to use the random combiner, e.g., 3.
     """
 
     def __init__(
@@ -64,7 +65,7 @@ def __init__(
         num_encoder_layers: int = 12,
         dropout: float = 0.1,
         layer_dropout: float = 0.075,
-        aux_layer_period: int = 3,
+        aux_layer_period: int = 0,
     ) -> None:
         super(RNN, self).__init__()
 
@@ -106,8 +107,12 @@ def __init__(
         )
 
     def forward(
-        self, x: torch.Tensor, x_lens: torch.Tensor, warmup: float = 1.0
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        self,
+        x: torch.Tensor,
+        x_lens: torch.Tensor,
+        states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        warmup: float = 1.0,
+    ) -> Tuple[torch.Tensor, torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
         """
         Args:
           x:
@@ -116,17 +121,24 @@ def forward(
           x_lens:
             A tensor of shape (N,), containing the number of frames in `x`
             before padding.
+          states:
+            A tuple of 2 tensors (optional). It is for streaming inference.
+            states[0] is the hidden states of all layers,
+              with shape of (num_layers, N, d_model);
+            states[1] is the cell states of all layers,
+              with shape of (num_layers, N, rnn_hidden_size).
           warmup:
             A floating point value that gradually increases from 0 throughout
             training; when it is >= 1.0 we are "fully warmed up".  It is used
             to turn modules on sequentially.
 
         Returns:
-          A tuple of 2 tensors:
+          A tuple of 3 elements:
             - embeddings: its shape is (N, T', d_model), where T' is the output
               sequence lengths.
             - lengths: a tensor of shape (batch_size,) containing the number of
               frames in `embeddings` before padding.
+            - updated states, with the same shapes as the input states.
         """
         x = self.encoder_embed(x)
         x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
@@ -135,16 +147,40 @@ def forward(
         #
         # Note: rounding_mode in torch.div() is available only in torch >= 1.8.0
         lengths = (((x_lens - 1) >> 1) - 1) >> 1
-        assert x.size(0) == lengths.max().item()
+        if not torch.jit.is_tracing():
+            assert x.size(0) == lengths.max().item()
 
-        x = self.encoder(x, warmup)
+        if states is None:
+            x = self.encoder(x, warmup=warmup)[0]
+            # torch.jit.trace requires returned types be the same as annotated
+            new_states = (torch.empty(0), torch.empty(0))
+        else:
+            # we cut off 1 frame on each side of encoder_embed output
+            lengths -= 2
+            x = x[1:-1, :, :]
+
+            assert not self.training
+            assert len(states) == 2
+            if not torch.jit.is_tracing():
+                # for hidden state
+                assert states[0].shape == (
+                    self.num_encoder_layers,
+                    x.size(1),
+                    self.d_model,
+                )
+                # for cell state
+                assert states[1].shape == (
+                    self.num_encoder_layers,
+                    x.size(1),
+                    self.rnn_hidden_size,
+                )
+            x, new_states = self.encoder(x, states)
 
         x = x.permute(1, 0, 2)  # (T, N, C) -> (N, T, C)
-        return x, lengths
+        return x, lengths, new_states
 
-    @torch.jit.export
     def get_init_states(
-        self, device: torch.device
+        self, device: torch.device = torch.device("cpu")
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """Get model initial states."""
         # for rnn hidden states
@@ -156,67 +192,6 @@ def get_init_states(
         )
         return (hidden_states, cell_states)
 
-    @torch.jit.export
-    def infer(
-        self,
-        x: torch.Tensor,
-        x_lens: torch.Tensor,
-        states: Tuple[torch.Tensor, torch.Tensor],
-    ) -> Tuple[torch.Tensor, torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
-        """
-        Args:
-          x:
-            The input tensor. Its shape is (N, T, C), where N is the batch size,
-            T is the sequence length, C is the feature dimension.
-          x_lens:
-            A tensor of shape (N,), containing the number of frames in `x`
-            before padding.
-          states:
-            It is a list of 2 tensors.
-            states[0] is the hidden states of all layers,
-              with shape of (num_layers, N, d_model);
-            states[1] is the cell states of all layers,
-              with shape of (num_layers, N, rnn_hidden_size).
-
-        Returns:
-          A tuple of 3 tensors:
-            - embeddings: its shape is (N, T', d_model), where T' is the output
-              sequence lengths.
-            - lengths: a tensor of shape (batch_size,) containing the number of
-              frames in `embeddings` before padding.
-            - updated states, whose shape is same as the input states.
-        """
-        assert not self.training
-        assert len(states) == 2
-        # for hidden state
-        assert states[0].shape == (
-            self.num_encoder_layers,
-            x.size(0),
-            self.d_model,
-        )
-        # for cell state
-        assert states[1].shape == (
-            self.num_encoder_layers,
-            x.size(0),
-            self.rnn_hidden_size,
-        )
-
-        # lengths = ((x_lens - 1) // 2 - 1) // 2 # issue an warning
-        #
-        # Note: rounding_mode in torch.div() is available only in torch >= 1.8.0
-        lengths = (((x_lens - 1) >> 1) - 1) >> 1
-        # we will cut off 1 frame on each side of encoder_embed output
-        lengths -= 2
-
-        embed = self.encoder_embed(x)
-        embed = embed[:, 1:-1, :]
-        embed = embed.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
-
-        x, states = self.encoder.infer(embed, states)
-
-        x = x.permute(1, 0, 2)  # (T, N, C) -> (N, T, C)
-        return x, lengths, states
-
 
 class RNNEncoderLayer(nn.Module):
     """
@@ -271,7 +246,12 @@ def __init__(
         )
         self.dropout = nn.Dropout(dropout)
 
-    def forward(self, src: torch.Tensor, warmup: float = 1.0) -> torch.Tensor:
+    def forward(
+        self,
+        src: torch.Tensor,
+        states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        warmup: float = 1.0,
+    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
         """
         Pass the input through the encoder layer.
 
@@ -280,6 +260,12 @@ def forward(self, src: torch.Tensor, warmup: float = 1.0) -> torch.Tensor:
             The sequence to the encoder layer (required).
             Its shape is (S, N, E), where S is the sequence length,
             N is the batch size, and E is the feature number.
+          states:
+            A tuple of 2 tensors (optional). It is for streaming inference.
+            states[0] is the hidden state of this layer,
+              with shape of (1, N, d_model);
+            states[1] is the cell state of this layer,
+              with shape of (1, N, rnn_hidden_size).
           warmup:
             It controls selective bypass of of layers; if < 1.0, we will
             bypass layers more frequently.
@@ -299,7 +285,19 @@ def forward(self, src: torch.Tensor, warmup: float = 1.0) -> torch.Tensor:
             alpha = 1.0
 
         # lstm module
-        src_lstm = self.lstm(src)[0]
+        if states is None:
+            src_lstm = self.lstm(src)[0]
+            # torch.jit.trace requires returned types be the same as annotated
+            new_states = (torch.empty(0), torch.empty(0))
+        else:
+            assert not self.training
+            assert len(states) == 2
+            if not torch.jit.is_tracing():
+                # for hidden state
+                assert states[0].shape == (1, src.size(1), self.d_model)
+                # for cell state
+                assert states[1].shape == (1, src.size(1), self.rnn_hidden_size)
+            src_lstm, new_states = self.lstm(src, states)
         src = src + self.dropout(src_lstm)
 
         # feed forward module
@@ -310,41 +308,6 @@ def forward(self, src: torch.Tensor, warmup: float = 1.0) -> torch.Tensor:
         if alpha != 1.0:
             src = alpha * src + (1 - alpha) * src_orig
 
-        return src
-
-    @torch.jit.export
-    def infer(
-        self, src: torch.Tensor, states: Tuple[torch.Tensor, torch.Tensor]
-    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
-        """
-        Pass the input through the encoder layer.
-
-        Args:
-          src:
-            The sequence to the encoder layer (required).
-            Its shape is (S, N, d_model), where S is the sequence length,
-            N is the batch size.
-          states:
-            It is a tuple of 2 tensors.
-            states[0] is the hidden state, with shape of (1, N, d_model);
-            states[1] is the cell state, with shape of (1, N, rnn_hidden_size).
-        """
-        assert not self.training
-        assert len(states) == 2
-        # for hidden state
-        assert states[0].shape == (1, src.size(1), self.d_model)
-        # for cell state
-        assert states[1].shape == (1, src.size(1), self.rnn_hidden_size)
-
-        # lstm module
-        src_lstm, new_states = self.lstm(src, states)
-        src = src + self.dropout(src_lstm)
-
-        # feed forward module
-        src = src + self.dropout(self.feed_forward(src))
-
-        src = self.norm_final(self.balancer(src))
-
         return src, new_states
 
 
@@ -373,11 +336,11 @@ def __init__(
         self.d_model = encoder_layer.d_model
         self.rnn_hidden_size = encoder_layer.rnn_hidden_size
 
-        self.use_random_combiner = False
+        self.aux_layers: List[int] = []
+        self.combiner: Optional[nn.Module] = None
         if aux_layers is not None:
             assert len(set(aux_layers)) == len(aux_layers)
             assert num_layers - 1 not in aux_layers
-            self.use_random_combiner = True
             self.aux_layers = aux_layers + [num_layers - 1]
             self.combiner = RandomCombine(
                 num_inputs=len(self.aux_layers),
@@ -386,7 +349,12 @@ def __init__(
                 stddev=2.0,
             )
 
-    def forward(self, src: torch.Tensor, warmup: float = 1.0) -> torch.Tensor:
+    def forward(
+        self,
+        src: torch.Tensor,
+        states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        warmup: float = 1.0,
+    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
         """
         Pass the input through the encoder layer in turn.
 
@@ -395,75 +363,66 @@ def forward(self, src: torch.Tensor, warmup: float = 1.0) -> torch.Tensor:
             The sequence to the encoder layer (required).
             Its shape is (S, N, E), where S is the sequence length,
             N is the batch size, and E is the feature number.
+          states:
+            A tuple of 2 tensors (optional). It is for streaming inference.
+            states[0] is the hidden states of all layers,
+              with shape of (num_layers, N, d_model);
+            states[1] is the cell states of all layers,
+              with shape of (num_layers, N, rnn_hidden_size).
           warmup:
             It controls selective bypass of of layers; if < 1.0, we will
             bypass layers more frequently.
         """
+        if states is not None:
+            assert not self.training
+            assert len(states) == 2
+            if not torch.jit.is_tracing():
+                # for hidden state
+                assert states[0].shape == (
+                    self.num_layers,
+                    src.size(1),
+                    self.d_model,
+                )
+                # for cell state
+                assert states[1].shape == (
+                    self.num_layers,
+                    src.size(1),
+                    self.rnn_hidden_size,
+                )
+
         output = src
 
         outputs = []
 
-        for i, mod in enumerate(self.layers):
-            output = mod(output, warmup=warmup)
-            if self.use_random_combiner:
-                if i in self.aux_layers:
-                    outputs.append(output)
-
-        if self.use_random_combiner:
-            output = self.combiner(outputs)
+        new_hidden_states = []
+        new_cell_states = []
 
-        return output
+        for i, mod in enumerate(self.layers):
+            if states is None:
+                output = mod(output, warmup=warmup)[0]
+            else:
+                layer_state = (
+                    states[0][i : i + 1, :, :],  # h: (1, N, d_model)
+                    states[1][i : i + 1, :, :],  # c: (1, N, rnn_hidden_size)
+                )
+                output, (h, c) = mod(output, layer_state)
+                new_hidden_states.append(h)
+                new_cell_states.append(c)
 
-    @torch.jit.export
-    def infer(
-        self, src: torch.Tensor, states: Tuple[torch.Tensor, torch.Tensor]
-    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
-        """
-        Pass the input through the encoder layer.
+            if self.combiner is not None and i in self.aux_layers:
+                outputs.append(output)
 
-        Args:
-          src:
-            The sequence to the encoder layer (required).
-            Its shape is (S, N, d_model), where S is the sequence length,
-            N is the batch size.
-          states:
-            It is a list of 2 tensors.
-            states[0] is the hidden states of all layers,
-              with shape of (num_layers, N, d_model);
-            states[1] is the cell states of all layers,
-              with shape of (num_layers, N, rnn_hidden_size).
-        """
-        assert not self.training
-        assert len(states) == 2
-        # for hidden state
-        assert states[0].shape == (self.num_layers, src.size(1), self.d_model)
-        # for cell state
-        assert states[1].shape == (
-            self.num_layers,
-            src.size(1),
-            self.rnn_hidden_size,
-        )
+        if self.combiner is not None:
+            output = self.combiner(outputs)
 
-        output = src
-        new_hidden_states = []
-        new_cell_states = []
-        for layer_index, mod in enumerate(self.layers):
-            layer_states = (
-                states[0][
-                    layer_index : layer_index + 1, :, :
-                ],  # h: (1, N, d_model)
-                states[1][
-                    layer_index : layer_index + 1, :, :
-                ],  # c: (1, N, rnn_hidden_size)
+        if states is None:
+            new_states = (torch.empty(0), torch.empty(0))
+        else:
+            new_states = (
+                torch.cat(new_hidden_states, dim=0),
+                torch.cat(new_cell_states, dim=0),
             )
-            output, (h, c) = mod.infer(output, layer_states)
-            new_hidden_states.append(h)
-            new_cell_states.append(c)
 
-        new_states = (
-            torch.cat(new_hidden_states, dim=0),
-            torch.cat(new_cell_states, dim=0),
-        )
         return output, new_states
 
 
@@ -804,9 +763,9 @@ def _test_random_combine_main():
     m = RNN(
         num_features=feature_dim,
         d_model=512,
-        rnn_hidden_size=1024,
+        rnn_hidden_size=1536,
         dim_feedforward=2048,
-        num_encoder_layers=12,
+        num_encoder_layers=10,
     )
     batch_size = 5
     seq_len = 20
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/test_scaling_converter.py b/egs/librispeech/ASR/lstm_transducer_stateless/test_scaling_converter.py
index b4bc8d91fa..0a7e63b60f 100644
--- a/egs/librispeech/ASR/lstm_transducer_stateless/test_scaling_converter.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/test_scaling_converter.py
@@ -19,7 +19,7 @@
 To run this file, do:
 
     cd icefall/egs/librispeech/ASR
-    python ./pruned_transducer_stateless3/test_scaling_converter.py
+    python ./lstm_transducer_stateless/test_scaling_converter.py
 """
 
 import copy
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py b/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
index 354815f926..b6351b1d3a 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
@@ -389,8 +389,9 @@ def __init__(
         initial_speed: float = 1.0,
         **kwargs
     ):
-        # Hardcode bidirectional=False
-        super(ScaledLSTM, self).__init__(*args, bidirectional=False, **kwargs)
+        if "bidirectional" in kwargs:
+            assert kwargs["bidirectional"] is False
+        super(ScaledLSTM, self).__init__(*args, **kwargs)
         initial_scale = torch.tensor(initial_scale).log()
         self._scales_names = []
         self._scales = []
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless3/scaling_converter.py b/egs/librispeech/ASR/pruned_transducer_stateless3/scaling_converter.py
index e82c52e6f0..cc7889d412 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless3/scaling_converter.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless3/scaling_converter.py
@@ -170,7 +170,7 @@ def scaled_embedding_to_embedding(
     return embedding
 
 
-def scaled_lstm_to_lstm(scaled_lstm: ScaledLSTM):
+def scaled_lstm_to_lstm(scaled_lstm: ScaledLSTM) -> nn.LSTM:
     """Convert an instance of ScaledLSTM to nn.LSTM.
 
     Args:

From 522a45ce75462bbf38298b743a85d08c7df95310 Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Wed, 10 Aug 2022 15:43:50 +0800
Subject: [PATCH 24/40] add using averaged model in export.py

---
 .../ASR/lstm_transducer_stateless/decode.py   |   2 +-
 .../ASR/lstm_transducer_stateless/export.py   | 369 +++++-------------
 .../jit_pretrained.py                         |   3 +
 .../ASR/lstm_transducer_stateless/lstm.py     |  14 +-
 .../ASR/pruned_transducer_stateless2/model.py |   2 +-
 5 files changed, 101 insertions(+), 289 deletions(-)

diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/decode.py b/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
index a909b2a74a..afc614c971 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
@@ -356,7 +356,7 @@ def decode_one_batch(
     supervisions = batch["supervisions"]
     feature_lens = supervisions["num_frames"].to(device)
 
-    encoder_out, encoder_out_lens = model.encoder(
+    encoder_out, encoder_out_lens, _ = model.encoder(
         x=feature, x_lens=feature_lens
     )
 
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/export.py b/egs/librispeech/ASR/lstm_transducer_stateless/export.py
index 6a9b52caa5..49ba93d55a 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/export.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/export.py
@@ -34,25 +34,7 @@
 It will generates 3 files: `encoder_jit_trace.pt`,
 `decoder_jit_trace.pt`, and `joiner_jit_trace.pt`.
 
-
-(3) Export to ONNX format
-
-./lstm_transducer_stateless/export.py \
-  --exp-dir ./lstm_transducer_stateless/exp \
-  --bpe-model data/lang_bpe_500/bpe.model \
-  --epoch 20 \
-  --avg 10 \
-  --onnx 1
-
-It will generate the following three files in the given `exp_dir`.
-Check `onnx_check.py` for how to use them.
-
-    - encoder.onnx
-    - decoder.onnx
-    - joiner.onnx
-
-
-(4) Export `model.state_dict()`
+(2) Export `model.state_dict()`
 
 ./lstm_transducer_stateless/export.py \
   --exp-dir ./lstm_transducer_stateless/exp \
@@ -97,7 +79,6 @@
 import logging
 from pathlib import Path
 
-import onnx
 import sentencepiece as spm
 import torch
 import torch.nn as nn
@@ -106,6 +87,7 @@
 
 from icefall.checkpoint import (
     average_checkpoints,
+    average_checkpoints_with_averaged_model,
     find_checkpoints,
     load_checkpoint,
 )
@@ -145,6 +127,17 @@ def get_parser():
         "'--epoch' and '--iter'",
     )
 
+    parser.add_argument(
+        "--use-averaged-model",
+        type=str2bool,
+        default=True,
+        help="Whether to load averaged model. Currently it only supports "
+        "using --epoch. If True, it would decode with the averaged model "
+        "over the epoch range from `epoch-avg` (excluded) to `epoch`."
+        "Actually only the models with epoch number of `epoch-avg` and "
+        "`epoch` are loaded for averaging. ",
+    )
+
     parser.add_argument(
         "--exp-dir",
         type=str,
@@ -175,21 +168,6 @@ def get_parser():
         """,
     )
 
-    parser.add_argument(
-        "--onnx",
-        type=str2bool,
-        default=False,
-        help="""If True, --jit is ignored and it exports the model
-        to onnx format. Three files will be generated:
-
-            - encoder.onnx
-            - decoder.onnx
-            - joiner.onnx
-
-        Check ./onnx_check.py and ./onnx_pretrained.py for how to use them.
-        """,
-    )
-
     parser.add_argument(
         "--context-size",
         type=int,
@@ -220,7 +198,6 @@ def export_encoder_model_jit_trace(
     x = torch.zeros(1, 100, 80, dtype=torch.float32)
     x_lens = torch.tensor([100], dtype=torch.int64)
     states = encoder_model.get_init_states()
-    states = (states[0].unsqueeze(1), states[1].unsqueeze(1))
 
     traced_model = torch.jit.trace(encoder_model, (x, x_lens, states))
     traced_model.save(encoder_filename)
@@ -276,187 +253,6 @@ def export_joiner_model_jit_trace(
     logging.info(f"Saved to {joiner_filename}")
 
 
-def export_encoder_model_onnx(
-    encoder_model: nn.Module,
-    encoder_filename: str,
-    opset_version: int = 11,
-) -> None:
-    """Export the given encoder model to ONNX format.
-    The exported model has two inputs:
-
-        - x, a tensor of shape (N, T, C); dtype is torch.float32
-        - x_lens, a tensor of shape (N,); dtype is torch.int64
-
-    and it has two outputs:
-
-        - encoder_out, a tensor of shape (N, T, C)
-        - encoder_out_lens, a tensor of shape (N,)
-
-    Note: The warmup argument is fixed to 1.
-
-    Args:
-      encoder_model:
-        The input encoder model
-      encoder_filename:
-        The filename to save the exported ONNX model.
-      opset_version:
-        The opset version to use.
-    """
-    x = torch.zeros(1, 100, 80, dtype=torch.float32)
-    x_lens = torch.tensor([100], dtype=torch.int64)
-    states = encoder_model.get_init_states()
-    hidden_states = states[0].unsqueeze(1)
-    cell_states = states[1].unsqueeze(1)
-    #  encoder_model = torch.jit.script(encoder_model)
-    # It throws the following error for the above statement
-    #
-    # RuntimeError: Exporting the operator __is_ to ONNX opset version
-    # 11 is not supported. Please feel free to request support or
-    # submit a pull request on PyTorch GitHub.
-    #
-    # I cannot find which statement causes the above error.
-    # torch.onnx.export() will use torch.jit.trace() internally, which
-    # works well for the current reworked model
-    warmup = 1.0
-    torch.onnx.export(
-        encoder_model,
-        (x, x_lens, (hidden_states, cell_states), warmup),
-        encoder_filename,
-        verbose=False,
-        opset_version=opset_version,
-        input_names=["x", "x_lens", "hidden_states", "cell_states", "warmup"],
-        output_names=[
-            "encoder_out",
-            "encoder_out_lens",
-            "new_hidden_states",
-            "new_cell_states",
-        ],
-        dynamic_axes={
-            "x": {0: "N", 1: "T"},
-            "x_lens": {0: "N"},
-            "hidden_states": {1: "N"},
-            "cell_states": {1: "N"},
-            "encoder_out": {0: "N", 1: "T"},
-            "encoder_out_lens": {0: "N"},
-            "new_hidden_states": {1: "N"},
-            "new_cell_states": {1: "N"},
-        },
-    )
-    logging.info(f"Saved to {encoder_filename}")
-
-
-def export_decoder_model_onnx(
-    decoder_model: nn.Module,
-    decoder_filename: str,
-    opset_version: int = 11,
-) -> None:
-    """Export the decoder model to ONNX format.
-
-    The exported model has one input:
-
-        - y: a torch.int64 tensor of shape (N, decoder_model.context_size)
-
-    and has one output:
-
-        - decoder_out: a torch.float32 tensor of shape (N, 1, C)
-
-    Note: The argument need_pad is fixed to False.
-
-    Args:
-      decoder_model:
-        The decoder model to be exported.
-      decoder_filename:
-        Filename to save the exported ONNX model.
-      opset_version:
-        The opset version to use.
-    """
-    y = torch.zeros(10, decoder_model.context_size, dtype=torch.int64)
-    need_pad = False  # Always False, so we can use torch.jit.trace() here
-    # Note(fangjun): torch.jit.trace() is more efficient than torch.jit.script()
-    # in this case
-    torch.onnx.export(
-        decoder_model,
-        (y, need_pad),
-        decoder_filename,
-        verbose=False,
-        opset_version=opset_version,
-        input_names=["y", "need_pad"],
-        output_names=["decoder_out"],
-        dynamic_axes={
-            "y": {0: "N"},
-            "decoder_out": {0: "N"},
-        },
-    )
-    logging.info(f"Saved to {decoder_filename}")
-
-
-def export_joiner_model_onnx(
-    joiner_model: nn.Module,
-    joiner_filename: str,
-    opset_version: int = 11,
-) -> None:
-    """Export the joiner model to ONNX format.
-    The exported model has two inputs:
-
-        - encoder_out: a tensor of shape (N, encoder_out_dim)
-        - decoder_out: a tensor of shape (N, decoder_out_dim)
-
-    and has one output:
-
-        - joiner_out: a tensor of shape (N, vocab_size)
-
-    Note: The argument project_input is fixed to True. A user should not
-    project the encoder_out/decoder_out by himself/herself. The exported joiner
-    will do that for the user.
-    """
-    encoder_out_dim = joiner_model.encoder_proj.weight.shape[1]
-    decoder_out_dim = joiner_model.decoder_proj.weight.shape[1]
-    encoder_out = torch.rand(1, encoder_out_dim, dtype=torch.float32)
-    decoder_out = torch.rand(1, decoder_out_dim, dtype=torch.float32)
-
-    project_input = True
-    # Note: It uses torch.jit.trace() internally
-    torch.onnx.export(
-        joiner_model,
-        (encoder_out, decoder_out, project_input),
-        joiner_filename,
-        verbose=False,
-        opset_version=opset_version,
-        input_names=["encoder_out", "decoder_out", "project_input"],
-        output_names=["logit"],
-        dynamic_axes={
-            "encoder_out": {0: "N"},
-            "decoder_out": {0: "N"},
-            "logit": {0: "N"},
-        },
-    )
-    logging.info(f"Saved to {joiner_filename}")
-
-
-def export_all_in_one_onnx(
-    encoder_filename: str,
-    decoder_filename: str,
-    joiner_filename: str,
-    all_in_one_filename: str,
-):
-    encoder_onnx = onnx.load(encoder_filename)
-    decoder_onnx = onnx.load(decoder_filename)
-    joiner_onnx = onnx.load(joiner_filename)
-
-    encoder_onnx = onnx.compose.add_prefix(encoder_onnx, prefix="encoder/")
-    decoder_onnx = onnx.compose.add_prefix(decoder_onnx, prefix="decoder/")
-    joiner_onnx = onnx.compose.add_prefix(joiner_onnx, prefix="joiner/")
-
-    combined_model = onnx.compose.merge_models(
-        encoder_onnx, decoder_onnx, io_map={}
-    )
-    combined_model = onnx.compose.merge_models(
-        combined_model, joiner_onnx, io_map={}
-    )
-    onnx.save(combined_model, all_in_one_filename)
-    logging.info(f"Saved to {all_in_one_filename}")
-
-
 @torch.no_grad()
 def main():
     args = get_parser().parse_args()
@@ -483,77 +279,88 @@ def main():
     logging.info("About to create model")
     model = get_transducer_model(params)
 
-    model.to(device)
-
-    if params.iter > 0:
-        filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
-            : params.avg
-        ]
-        if len(filenames) == 0:
-            raise ValueError(
-                f"No checkpoints found for"
-                f" --iter {params.iter}, --avg {params.avg}"
+    if not params.use_averaged_model:
+        if params.iter > 0:
+            filenames = find_checkpoints(
+                params.exp_dir, iteration=-params.iter
+            )[: params.avg]
+            if len(filenames) == 0:
+                raise ValueError(
+                    f"No checkpoints found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            elif len(filenames) < params.avg:
+                raise ValueError(
+                    f"Not enough checkpoints ({len(filenames)}) found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            logging.info(f"averaging {filenames}")
+            model.to(device)
+            model.load_state_dict(average_checkpoints(filenames, device=device))
+        elif params.avg == 1:
+            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
+        else:
+            start = params.epoch - params.avg + 1
+            filenames = []
+            for i in range(start, params.epoch + 1):
+                if i >= 1:
+                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
+            logging.info(f"averaging {filenames}")
+            model.to(device)
+            model.load_state_dict(average_checkpoints(filenames, device=device))
+    else:
+        if params.iter > 0:
+            filenames = find_checkpoints(
+                params.exp_dir, iteration=-params.iter
+            )[: params.avg + 1]
+            if len(filenames) == 0:
+                raise ValueError(
+                    f"No checkpoints found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            elif len(filenames) < params.avg + 1:
+                raise ValueError(
+                    f"Not enough checkpoints ({len(filenames)}) found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            filename_start = filenames[-1]
+            filename_end = filenames[0]
+            logging.info(
+                "Calculating the averaged model over iteration checkpoints"
+                f" from {filename_start} (excluded) to {filename_end}"
             )
-        elif len(filenames) < params.avg:
-            raise ValueError(
-                f"Not enough checkpoints ({len(filenames)}) found for"
-                f" --iter {params.iter}, --avg {params.avg}"
+            model.to(device)
+            model.load_state_dict(
+                average_checkpoints_with_averaged_model(
+                    filename_start=filename_start,
+                    filename_end=filename_end,
+                    device=device,
+                )
+            )
+        else:
+            assert params.avg > 0, params.avg
+            start = params.epoch - params.avg
+            assert start >= 1, start
+            filename_start = f"{params.exp_dir}/epoch-{start}.pt"
+            filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
+            logging.info(
+                f"Calculating the averaged model over epoch range from "
+                f"{start} (excluded) to {params.epoch}"
+            )
+            model.to(device)
+            model.load_state_dict(
+                average_checkpoints_with_averaged_model(
+                    filename_start=filename_start,
+                    filename_end=filename_end,
+                    device=device,
+                )
             )
-        logging.info(f"averaging {filenames}")
-        model.to(device)
-        model.load_state_dict(
-            average_checkpoints(filenames, device=device), strict=False
-        )
-    elif params.avg == 1:
-        load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
-    else:
-        start = params.epoch - params.avg + 1
-        filenames = []
-        for i in range(start, params.epoch + 1):
-            if start >= 0:
-                filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
-        logging.info(f"averaging {filenames}")
-        model.to(device)
-        model.load_state_dict(
-            average_checkpoints(filenames, device=device), strict=False
-        )
 
     model.to("cpu")
     model.eval()
     convert_scaled_to_non_scaled(model, inplace=True)
 
-    if params.onnx is True:
-        opset_version = 11
-        logging.info("Exporting to onnx format")
-        encoder_filename = params.exp_dir / "encoder.onnx"
-        export_encoder_model_onnx(
-            model.encoder,
-            encoder_filename,
-            opset_version=opset_version,
-        )
-
-        decoder_filename = params.exp_dir / "decoder.onnx"
-        export_decoder_model_onnx(
-            model.decoder,
-            decoder_filename,
-            opset_version=opset_version,
-        )
-
-        joiner_filename = params.exp_dir / "joiner.onnx"
-        export_joiner_model_onnx(
-            model.joiner,
-            joiner_filename,
-            opset_version=opset_version,
-        )
-
-        all_in_one_filename = params.exp_dir / "all_in_one.onnx"
-        export_all_in_one_onnx(
-            encoder_filename,
-            decoder_filename,
-            joiner_filename,
-            all_in_one_filename,
-        )
-    elif params.jit_trace is True:
+    if params.jit_trace is True:
         logging.info("Using torch.jit.trace()")
         encoder_filename = params.exp_dir / "encoder_jit_trace.pt"
         export_encoder_model_jit_trace(model.encoder, encoder_filename)
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/jit_pretrained.py b/egs/librispeech/ASR/lstm_transducer_stateless/jit_pretrained.py
index eeb7dadade..594c33e4f5 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/jit_pretrained.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/jit_pretrained.py
@@ -289,9 +289,12 @@ def main():
 
     feature_lengths = torch.tensor(feature_lengths, device=device)
 
+    states = encoder.get_init_states(batch_size=features.size(0), device=device)
+
     encoder_out, encoder_out_lens, _ = encoder(
         x=features,
         x_lens=feature_lengths,
+        states=states,
     )
 
     hyps = greedy_search(
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py b/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
index 7eef94feae..d1202dae51 100644
--- a/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
@@ -179,16 +179,18 @@ def forward(
         x = x.permute(1, 0, 2)  # (T, N, C) -> (N, T, C)
         return x, lengths, new_states
 
+    @torch.jit.export
     def get_init_states(
-        self, device: torch.device = torch.device("cpu")
+        self, batch_size: int = 1, device: torch.device = torch.device("cpu")
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """Get model initial states."""
         # for rnn hidden states
         hidden_states = torch.zeros(
-            (self.num_encoder_layers, self.d_model), device=device
+            (self.num_encoder_layers, batch_size, self.d_model), device=device
         )
         cell_states = torch.zeros(
-            (self.num_encoder_layers, self.rnn_hidden_size), device=device
+            (self.num_encoder_layers, batch_size, self.rnn_hidden_size),
+            device=device,
         )
         return (hidden_states, cell_states)
 
@@ -235,7 +237,7 @@ def __init__(
             ScaledLinear(d_model, dim_feedforward),
             ActivationBalancer(channel_dim=-1),
             DoubleSwish(),
-            nn.Dropout(),
+            nn.Dropout(dropout),
             ScaledLinear(dim_feedforward, d_model, initial_scale=0.25),
         )
         self.norm_final = BasicNorm(d_model)
@@ -763,9 +765,9 @@ def _test_random_combine_main():
     m = RNN(
         num_features=feature_dim,
         d_model=512,
-        rnn_hidden_size=1536,
+        rnn_hidden_size=1024,
         dim_feedforward=2048,
-        num_encoder_layers=10,
+        num_encoder_layers=12,
     )
     batch_size = 5
     seq_len = 20
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/model.py b/egs/librispeech/ASR/pruned_transducer_stateless2/model.py
index 2434fd41d6..37592ce083 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/model.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/model.py
@@ -116,7 +116,7 @@ def forward(
 
         assert x.size(0) == x_lens.size(0) == y.dim0
 
-        encoder_out, x_lens = self.encoder(x, x_lens, warmup=warmup)
+        encoder_out, x_lens, _ = self.encoder(x, x_lens, warmup=warmup)
         assert torch.all(x_lens > 0)
 
         # Now for the decoder, i.e., the prediction network

From 8f3645e5cb3cb33823c170ac8e50714e0f8c4842 Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Wed, 10 Aug 2022 16:20:25 +0800
Subject: [PATCH 25/40] modify test_model.py, test if the model can be
 successfully exported by jit.trace

---
 .../lstm_transducer_stateless/test_model.py   | 36 ++++++++++++++++---
 1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/test_model.py b/egs/librispeech/ASR/lstm_transducer_stateless/test_model.py
index 5c49025bdc..78bf9b2d85 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/test_model.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/test_model.py
@@ -20,10 +20,18 @@
 To run this file, do:
 
     cd icefall/egs/librispeech/ASR
-    python ./pruned_transducer_stateless/test_model.py
+    python ./lstm_transducer_stateless/test_model.py
 """
 
-import torch
+import os
+from pathlib import Path
+
+from export import (
+    export_decoder_model_jit_trace,
+    export_encoder_model_jit_trace,
+    export_joiner_model_jit_trace,
+)
+from scaling_converter import convert_scaled_to_non_scaled
 from train import get_params, get_transducer_model
 
 
@@ -33,13 +41,33 @@ def test_model():
     params.blank_id = 0
     params.context_size = 2
     params.unk_id = 2
+    params.encoder_dim = 512
+    params.rnn_hidden_size = 1024
+    params.num_encoder_layers = 12
+    params.aux_layer_period = 0
+    params.exp_dir = Path("exp_test_model")
 
     model = get_transducer_model(params)
+    model.eval()
 
     num_param = sum([p.numel() for p in model.parameters()])
     print(f"Number of model parameters: {num_param}")
-    model.__class__.forward = torch.jit.ignore(model.__class__.forward)
-    torch.jit.script(model)
+
+    convert_scaled_to_non_scaled(model, inplace=True)
+
+    if not os.path.exists(params.exp_dir):
+        os.path.mkdir(params.exp_dir)
+
+    encoder_filename = params.exp_dir / "encoder_jit_trace.pt"
+    export_encoder_model_jit_trace(model.encoder, encoder_filename)
+
+    decoder_filename = params.exp_dir / "decoder_jit_trace.pt"
+    export_decoder_model_jit_trace(model.decoder, decoder_filename)
+
+    joiner_filename = params.exp_dir / "joiner_jit_trace.pt"
+    export_joiner_model_jit_trace(model.joiner, joiner_filename)
+
+    print("The model has been successfully exported using jit.trace.")
 
 
 def main():

From 1138b27f167067b53f75be9fa1c038fe35c9c01b Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Wed, 10 Aug 2022 17:16:09 +0800
Subject: [PATCH 26/40] modify pretrained.py

---
 .../ASR/lstm_transducer_stateless/export.py   |  2 +-
 .../lstm_transducer_stateless/pretrained.py   | 28 ++++++++++---------
 .../ASR/lstm_transducer_stateless/train.py    |  9 +++---
 3 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/export.py b/egs/librispeech/ASR/lstm_transducer_stateless/export.py
index 49ba93d55a..9fa841bcc8 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/export.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/export.py
@@ -358,9 +358,9 @@ def main():
 
     model.to("cpu")
     model.eval()
-    convert_scaled_to_non_scaled(model, inplace=True)
 
     if params.jit_trace is True:
+        convert_scaled_to_non_scaled(model, inplace=True)
         logging.info("Using torch.jit.trace()")
         encoder_filename = params.exp_dir / "encoder_jit_trace.pt"
         export_encoder_model_jit_trace(model.encoder, encoder_filename)
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/pretrained.py b/egs/librispeech/ASR/lstm_transducer_stateless/pretrained.py
index 21bcf7cfd3..2a6e2adc67 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/pretrained.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/pretrained.py
@@ -18,16 +18,16 @@
 Usage:
 
 (1) greedy search
-./pruned_transducer_stateless2/pretrained.py \
-    --checkpoint ./pruned_transducer_stateless2/exp/pretrained.pt \
+./lstm_transducer_stateless/pretrained.py \
+    --checkpoint ./lstm_transducer_stateless/exp/pretrained.pt \
     --bpe-model ./data/lang_bpe_500/bpe.model \
     --method greedy_search \
     /path/to/foo.wav \
     /path/to/bar.wav
 
 (2) beam search
-./pruned_transducer_stateless2/pretrained.py \
-    --checkpoint ./pruned_transducer_stateless2/exp/pretrained.pt \
+./lstm_transducer_stateless/pretrained.py \
+    --checkpoint ./lstm_transducer_stateless/exp/pretrained.pt \
     --bpe-model ./data/lang_bpe_500/bpe.model \
     --method beam_search \
     --beam-size 4 \
@@ -35,8 +35,8 @@
     /path/to/bar.wav
 
 (3) modified beam search
-./pruned_transducer_stateless2/pretrained.py \
-    --checkpoint ./pruned_transducer_stateless2/exp/pretrained.pt \
+./lstm_transducer_stateless/pretrained.py \
+    --checkpoint ./lstm_transducer_stateless/exp/pretrained.pt \
     --bpe-model ./data/lang_bpe_500/bpe.model \
     --method modified_beam_search \
     --beam-size 4 \
@@ -44,18 +44,18 @@
     /path/to/bar.wav
 
 (4) fast beam search
-./pruned_transducer_stateless2/pretrained.py \
-    --checkpoint ./pruned_transducer_stateless2/exp/pretrained.pt \
+./lstm_transducer_stateless/pretrained.py \
+    --checkpoint ./lstm_transducer_stateless/exp/pretrained.pt \
     --bpe-model ./data/lang_bpe_500/bpe.model \
     --method fast_beam_search \
     --beam-size 4 \
     /path/to/foo.wav \
     /path/to/bar.wav
 
-You can also use `./pruned_transducer_stateless2/exp/epoch-xx.pt`.
+You can also use `./lstm_transducer_stateless/exp/epoch-xx.pt`.
 
-Note: ./pruned_transducer_stateless2/exp/pretrained.pt is generated by
-./pruned_transducer_stateless2/export.py
+Note: ./lstm_transducer_stateless/exp/pretrained.pt is generated by
+./lstm_transducer_stateless/export.py
 """
 
 
@@ -77,7 +77,7 @@
     modified_beam_search,
 )
 from torch.nn.utils.rnn import pad_sequence
-from train import get_params, get_transducer_model
+from train import add_model_arguments, get_params, get_transducer_model
 
 
 def get_parser():
@@ -178,6 +178,8 @@ def get_parser():
         """,
     )
 
+    add_model_arguments(parser)
+
     return parser
 
 
@@ -268,7 +270,7 @@ def main():
 
     feature_lengths = torch.tensor(feature_lengths, device=device)
 
-    encoder_out, encoder_out_lens = model.encoder(
+    encoder_out, encoder_out_lens, _ = model.encoder(
         x=features, x_lens=feature_lengths
     )
 
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/train.py b/egs/librispeech/ASR/lstm_transducer_stateless/train.py
index 89bd406b19..8d07aae5e9 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/train.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/train.py
@@ -111,9 +111,10 @@ def add_model_arguments(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--aux-layer-period",
         type=int,
-        default=3,
+        default=0,
         help="""Peroid of auxiliary layers used for randomly combined during training.
-        If not larger than 0 (e.g., -1), will not use the random combiner.
+        If set to 0, will not use the random combiner (Default).
+        You can set a positive integer to use the random combiner, e.g., 3.
         """,
     )
 
@@ -206,7 +207,7 @@ def get_parser():
     parser.add_argument(
         "--lr-epochs",
         type=float,
-        default=6,
+        default=10,
         help="""Number of epochs that affects how rapidly the learning rate decreases.
         """,
     )
@@ -270,7 +271,7 @@ def get_parser():
     parser.add_argument(
         "--save-every-n",
         type=int,
-        default=8000,
+        default=4000,
         help="""Save checkpoint after processing this number of batches"
         periodically. We save checkpoint to exp-dir/ whenever
         params.batch_idx_train % save_every_n == 0. The checkpoint filename

From dc73ff0cbe8bb41c29981aa3d801d6d8bdb932b3 Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Wed, 10 Aug 2022 21:36:32 +0800
Subject: [PATCH 27/40] support streaming decoding

---
 .../ASR/lstm_transducer_stateless/lstm.py     |  63 ++
 .../ASR/lstm_transducer_stateless/stream.py   | 147 +++++
 .../streaming_decode.py                       | 589 ++++++++++++------
 .../lstm_transducer_stateless/test_model.py   |  14 +
 .../pruned_transducer_stateless3/export.py    |   4 +-
 5 files changed, 640 insertions(+), 177 deletions(-)
 create mode 100644 egs/librispeech/ASR/lstm_transducer_stateless/stream.py

diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py b/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
index d1202dae51..c24bac6df5 100644
--- a/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
@@ -15,6 +15,7 @@
 # limitations under the License.
 
 import copy
+import math
 from typing import List, Optional, Tuple
 
 import torch
@@ -29,6 +30,68 @@
 )
 from torch import nn
 
+LOG_EPSILON = math.log(1e-10)
+
+
+def unstack_states(
+    states: Tuple[torch.Tensor, torch.Tensor]
+) -> List[Tuple[torch.Tensor, torch.Tensor]]:
+    """
+    Unstack the lstm states corresponding to a batch of utterances into a list
+    of states, where the i-th entry is the state from the i-th utterance.
+
+    Args:
+      states:
+        A tuple of 2 elements.
+        ``states[0]`` is the lstm hidden states, of a batch of utterances.
+        ``states[1]`` is the lstm cell states, of a batch of utterances.
+
+    Returns:
+      A list of states.
+        ``states[i]`` is a tuple of 2 elements of i-th utterance.
+        ``states[i][0]`` is the lstm hidden states of i-th utterance.
+        ``states[i][1]`` is the lstm cell states of i-th utterance.
+    """
+    hidden_states, cell_states = states
+
+    list_hidden_states = hidden_states.unbind(dim=1)
+    list_cell_states = cell_states.unbind(dim=1)
+
+    ans = [
+        (h.unsqueeze(1), c.unsqueeze(1))
+        for (h, c) in zip(list_hidden_states, list_cell_states)
+    ]
+    return ans
+
+
+def stack_states(
+    states_list: List[Tuple[torch.Tensor, torch.Tensor]]
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Stack list of lstm states corresponding to separate utterances into a single
+    lstm state so that it can be used as an input for lstm when those utterances
+    are formed into a batch.
+
+    Args:
+      states_list:
+        Each element in states_list corresponds to the lstm state for a single
+        utterance.
+        ``states_list[i]`` is a tuple of 2 elements of i-th utterance.
+        ``states_list[i][0]`` is the lstm hidden states of i-th utterance.
+        ``states_list[i][1]`` is the lstm cell states of i-th utterance.
+
+
+    Returns:
+      A new state corresponding to a batch of utterances.
+      It is a tuple of 2 elements.
+        ``states[0]`` is the lstm hidden states, of a batch of utterances.
+        ``states[1]`` is the lstm cell states, of a batch of utterances.
+    """
+    hidden_states = torch.cat([s[0] for s in states_list], dim=1)
+    cell_states = torch.cat([s[1] for s in states_list], dim=1)
+    ans = (hidden_states, cell_states)
+    return ans
+
 
 class RNN(EncoderInterface):
     """
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/stream.py b/egs/librispeech/ASR/lstm_transducer_stateless/stream.py
new file mode 100644
index 0000000000..55b331abc6
--- /dev/null
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/stream.py
@@ -0,0 +1,147 @@
+# Copyright    2022  Xiaomi Corp.        (authors: Fangjun Kuang,
+#                                                  Zengwei Yao)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import List, Optional, Tuple
+
+import k2
+import torch
+from beam_search import Hypothesis, HypothesisList
+
+from icefall.utils import AttributeDict
+
+
+class Stream(object):
+    def __init__(
+        self,
+        params: AttributeDict,
+        decoding_graph: Optional[k2.Fsa] = None,
+        device: torch.device = torch.device("cpu"),
+        LOG_EPS: float = math.log(1e-10),
+    ) -> None:
+        """
+        Args:
+          params:
+            It's the return value of :func:`get_params`.
+          decoding_graph:
+            The decoding graph. Can be either a `k2.trivial_graph` or HLG. Used
+            only when --decoding-method is fast_beam_search.
+          device:
+            The device to run this stream.
+        """
+        self.LOG_EPS = LOG_EPS
+
+        # Encoder states: 2 tensors (presumably LSTM hidden and cell states)
+        self.states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
+
+        # It uses different attributes for different decoding methods.
+        self.context_size = params.context_size
+        self.decoding_method = params.decoding_method
+        if params.decoding_method == "greedy_search":
+            self.hyp = [params.blank_id] * params.context_size
+        elif params.decoding_method == "modified_beam_search":
+            self.hyps = HypothesisList()
+            self.hyps.add(
+                Hypothesis(
+                    ys=[params.blank_id] * params.context_size,
+                    log_prob=torch.zeros(1, dtype=torch.float32, device=device),
+                )
+            )
+        elif params.decoding_method == "fast_beam_search":
+            # feature_len is needed to get partial results.
+            # The rnnt_decoding_stream for fast_beam_search.
+            self.rnnt_decoding_stream: k2.RnntDecodingStream = (
+                k2.RnntDecodingStream(decoding_graph)
+            )
+            self.hyp: Optional[List[int]] = None
+        else:
+            raise ValueError(
+                f"Unsupported decoding method: {params.decoding_method}"
+            )
+
+        self.ground_truth: str = ""
+
+        self.feature: Optional[torch.Tensor] = None
+        # Make sure all feature frames can be used.
+        # We aim to obtain 1 frame after subsampling.
+        self.chunk_length = params.subsampling_factor
+        # Add 2 here since we will drop the first and last after subsampling.
+        self.pad_length = 2 * params.subsampling_factor + 3
+        self.num_frames = 0
+        self.num_processed_frames = 0
+
+        # After all feature frames are processed, we set this flag to True
+        self._done = False
+
+    def set_feature(self, feature: torch.Tensor) -> None:
+        assert feature.dim() == 2, feature.dim()
+        self.num_frames = feature.size(0)
+        # tail padding
+        self.feature = torch.nn.functional.pad(
+            feature,
+            (0, 0, 0, self.pad_length),
+            mode="constant",
+            value=self.LOG_EPS,
+        )
+
+    def set_ground_truth(self, ground_truth: str) -> None:
+        self.ground_truth = ground_truth
+
+    def set_states(
+        self, states: Tuple[List[List[torch.Tensor]], List[torch.Tensor]]
+    ) -> None:
+        """Set states."""
+        self.states = states
+
+    def get_feature_chunk(self) -> torch.Tensor:
+        """Get a chunk of feature frames.
+
+        Returns:
+          A tensor of shape (ret_length, feature_dim).
+        """
+        update_length = min(
+            self.num_frames - self.num_processed_frames, self.chunk_length
+        )
+        ret_length = update_length + self.pad_length
+
+        ret_feature = self.feature[
+            self.num_processed_frames : self.num_processed_frames + ret_length
+        ]
+        # Cut off used frames.
+        # self.feature = self.feature[update_length:]
+
+        self.num_processed_frames += update_length
+        if self.num_processed_frames >= self.num_frames:
+            self._done = True
+
+        return ret_feature
+
+    @property
+    def done(self) -> bool:
+        """Return True if all feature frames are processed."""
+        return self._done
+
+    def decoding_result(self) -> List[int]:
+        """Obtain current decoding result."""
+        if self.decoding_method == "greedy_search":
+            return self.hyp[self.context_size :]
+        elif self.decoding_method == "modified_beam_search":
+            best_hyp = self.hyps.get_most_probable(length_norm=True)
+            return best_hyp.ys[self.context_size :]
+        else:
+            assert self.decoding_method == "fast_beam_search"
+            return self.hyp
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py b/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
index 79b0a45a2a..0e82e05a1e 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
@@ -1,5 +1,7 @@
 #!/usr/bin/env python3
-# Copyright 2022 Xiaomi Corporation (Authors: Wei Kang, Fangjun Kuang)
+#
+# Copyright 2021-2022 Xiaomi Corporation (Author: Fangjun Kuang,
+#                                                 Zengwei Yao)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -14,21 +16,48 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """
 Usage:
+(1) greedy search
+./lstm_transducer_stateless/streaming_decode.py \
+      --epoch 30 \
+      --avg 10 \
+      --exp-dir lstm_transducer_stateless/exp \
+      --num-decode-streams 2000 \
+      --num-encoder-layers 12 \
+      --rnn-hidden-size 1024 \
+      --decoding-method greedy_search \
+      --use-averaged-model True
+
+(2) modified beam search
 ./lstm_transducer_stateless/streaming_decode.py \
-        --epoch 28 \
-        --avg 15 \
-        --decode-chunk-size 1 \
-        --exp-dir ./lstm_transducer_stateless/exp \
-        --decoding_method greedy_search \
-        --num-decode-streams 200
+      --epoch 30 \
+      --avg 10 \
+      --exp-dir lstm_transducer_stateless/exp \
+      --num-decode-streams 2000 \
+      --num-encoder-layers 12 \
+      --rnn-hidden-size 1024 \
+      --decoding-method modified_beam_search \
+      --use-averaged-model True \
+      --beam-size 4
+
+(3) fast beam search
+./lstm_transducer_stateless/streaming_decode.py \
+      --epoch 30 \
+      --avg 10 \
+      --exp-dir lstm_transducer_stateless/exp \
+      --num-decode-streams 2000 \
+      --num-encoder-layers 12 \
+      --rnn-hidden-size 1024 \
+      --decoding-method fast_beam_search \
+      --use-averaged-model True \
+      --beam 4 \
+      --max-contexts 4 \
+      --max-states 8
 """
-
 import argparse
 import logging
-import math
+import warnings
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple
 
@@ -38,9 +67,11 @@
 import torch
 import torch.nn as nn
 from asr_datamodule import LibriSpeechAsrDataModule
-from decode_stream import DecodeStream
+from beam_search import Hypothesis, HypothesisList, get_hyps_shape
 from kaldifeat import Fbank, FbankOptions
 from lhotse import CutSet
+from lstm import LOG_EPSILON, stack_states, unstack_states
+from stream import Stream
 from torch.nn.utils.rnn import pad_sequence
 from train import add_model_arguments, get_params, get_transducer_model
 
@@ -60,8 +91,6 @@
     write_error_stats,
 )
 
-LOG_EPS = math.log(1e-10)
-
 
 def get_parser():
     parser = argparse.ArgumentParser(
@@ -72,9 +101,8 @@ def get_parser():
         "--epoch",
         type=int,
         default=28,
-        help="""It specifies the checkpoint to use for decoding.
-        Note: Epoch counts from 0.
-        You can specify --avg to use more checkpoints for model averaging.""",
+        help="It specifies the checkpoint to use for decoding. "
+        "Note: Epoch counts from 0.",
     )
 
     parser.add_argument(
@@ -93,13 +121,13 @@ def get_parser():
         default=15,
         help="Number of checkpoints to average. Automatically select "
         "consecutive checkpoints before the checkpoint specified by "
-        "'--epoch' and '--iter'",
+        "'--epoch'. ",
     )
 
     parser.add_argument(
         "--use-averaged-model",
         type=str2bool,
-        default=True,
+        default=False,
         help="Whether to load averaged model. Currently it only supports "
         "using --epoch. If True, it would decode with the averaged model "
         "over the epoch range from `epoch-avg` (excluded) to `epoch`."
@@ -110,7 +138,7 @@ def get_parser():
     parser.add_argument(
         "--exp-dir",
         type=str,
-        default="pruned_transducer_stateless2/exp",
+        default="transducer_emformer/exp",
         help="The experiment dir",
     )
 
@@ -125,10 +153,22 @@ def get_parser():
         "--decoding-method",
         type=str,
         default="greedy_search",
-        help="""Support only greedy_search and fast_beam_search now.
+        help="""Possible values are:
+          - greedy_search
+          - modified_beam_search
+          - fast_beam_search
         """,
     )
 
+    parser.add_argument(
+        "--beam-size",
+        type=int,
+        default=4,
+        help="""An integer indicating how many candidates we will keep for each
+        frame. Used only when --decoding-method is beam_search or
+        modified_beam_search.""",
+    )
+
     parser.add_argument(
         "--beam",
         type=float,
@@ -150,7 +190,7 @@ def get_parser():
     parser.add_argument(
         "--max-states",
         type=int,
-        default=32,
+        default=8,
         help="""Used only when --decoding-method is
         fast_beam_search""",
     )
@@ -162,19 +202,26 @@ def get_parser():
         help="The context size in the decoder. 1 means bigram; "
         "2 means tri-gram",
     )
-
     parser.add_argument(
-        "--decode-chunk-size",
+        "--max-sym-per-frame",
         type=int,
         default=1,
-        help="The chunk size for decoding (in frames after subsampling)",
+        help="""Maximum number of symbols per frame.
+        Used only when --decoding_method is greedy_search""",
+    )
+
+    parser.add_argument(
+        "--sampling-rate",
+        type=float,
+        default=16000,
+        help="Sample rate of the audio",
     )
 
     parser.add_argument(
         "--num-decode-streams",
         type=int,
         default=2000,
-        help="The number of streams that can be decoded parallel.",
+        help="The number of streams that can be decoded parallel",
     )
 
     add_model_arguments(parser)
@@ -185,26 +232,36 @@ def get_parser():
 def greedy_search(
     model: nn.Module,
     encoder_out: torch.Tensor,
-    streams: List[DecodeStream],
-) -> List[List[int]]:
+    streams: List[Stream],
+) -> None:
+    """Greedy search in batch mode. It hardcodes --max-sym-per-frame=1.
 
+    Args:
+      model:
+        The transducer model.
+      encoder_out:
+        Output from the encoder. Its shape is (N, T, C), where N >= 1.
+      streams:
+        A list of Stream objects.
+    """
     assert len(streams) == encoder_out.size(0)
     assert encoder_out.ndim == 3
 
     blank_id = model.decoder.blank_id
     context_size = model.decoder.context_size
-    device = model.device
+    device = next(model.parameters()).device
     T = encoder_out.size(1)
 
+    encoder_out = model.joiner.encoder_proj(encoder_out)
+
     decoder_input = torch.tensor(
         [stream.hyp[-context_size:] for stream in streams],
         device=device,
         dtype=torch.int64,
     )
-    # decoder_out is of shape (N, decoder_out_dim)
+    # decoder_out is of shape (batch_size, 1, decoder_out_dim)
     decoder_out = model.decoder(decoder_input, need_pad=False)
     decoder_out = model.joiner.decoder_proj(decoder_out)
-    # logging.info(f"decoder_out shape : {decoder_out.shape}")
 
     for t in range(T):
         # current_encoder_out's shape: (batch_size, 1, encoder_out_dim)
@@ -238,20 +295,171 @@ def greedy_search(
             )
             decoder_out = model.joiner.decoder_proj(decoder_out)
 
-    hyp_tokens = []
-    for stream in streams:
-        hyp_tokens.append(stream.hyp)
-    return hyp_tokens
 
+def modified_beam_search(
+    model: nn.Module,
+    encoder_out: torch.Tensor,
+    streams: List[Stream],
+    beam: int = 4,
+):
+    """Beam search in batch mode with --max-sym-per-frame=1 being hardcoded.
+
+    Args:
+      model:
+        The RNN-T model.
+      encoder_out:
+        A 3-D tensor of shape (N, T, encoder_out_dim) containing the output of
+        the encoder model.
+      streams:
+        A list of stream objects.
+      beam:
+        Number of active paths during the beam search.
+    """
+    assert encoder_out.ndim == 3, encoder_out.shape
+    assert len(streams) == encoder_out.size(0)
+
+    blank_id = model.decoder.blank_id
+    context_size = model.decoder.context_size
+    device = next(model.parameters()).device
+    batch_size = len(streams)
+    T = encoder_out.size(1)
+
+    B = [stream.hyps for stream in streams]
+
+    encoder_out = model.joiner.encoder_proj(encoder_out)
+
+    for t in range(T):
+        current_encoder_out = encoder_out[:, t].unsqueeze(1).unsqueeze(1)
+        # current_encoder_out's shape: (batch_size, 1, 1, encoder_out_dim)
+
+        hyps_shape = get_hyps_shape(B).to(device)
+
+        A = [list(b) for b in B]
+        B = [HypothesisList() for _ in range(batch_size)]
+
+        ys_log_probs = torch.stack(
+            [hyp.log_prob.reshape(1) for hyps in A for hyp in hyps], dim=0
+        )  # (num_hyps, 1)
+
+        decoder_input = torch.tensor(
+            [hyp.ys[-context_size:] for hyps in A for hyp in hyps],
+            device=device,
+            dtype=torch.int64,
+        )  # (num_hyps, context_size)
+
+        decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1)
+        decoder_out = model.joiner.decoder_proj(decoder_out)
+        # decoder_out is of shape (num_hyps, 1, 1, decoder_output_dim)
+
+        # Note: For torch 1.7.1 and below, it requires a torch.int64 tensor
+        # as index, so we use `to(torch.int64)` below.
+        current_encoder_out = torch.index_select(
+            current_encoder_out,
+            dim=0,
+            index=hyps_shape.row_ids(1).to(torch.int64),
+        )  # (num_hyps, 1, 1, encoder_out_dim)
+
+        logits = model.joiner(
+            current_encoder_out, decoder_out, project_input=False
+        )
+        # logits is of shape (num_hyps, 1, 1, vocab_size)
+
+        logits = logits.squeeze(1).squeeze(1)
+
+        log_probs = logits.log_softmax(dim=-1)  # (num_hyps, vocab_size)
+
+        log_probs.add_(ys_log_probs)
+
+        vocab_size = log_probs.size(-1)
 
-def fast_beam_search(
+        log_probs = log_probs.reshape(-1)
+
+        row_splits = hyps_shape.row_splits(1) * vocab_size
+        log_probs_shape = k2.ragged.create_ragged_shape2(
+            row_splits=row_splits, cached_tot_size=log_probs.numel()
+        )
+        ragged_log_probs = k2.RaggedTensor(
+            shape=log_probs_shape, value=log_probs
+        )
+
+        for i in range(batch_size):
+            topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam)
+
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore")
+                topk_hyp_indexes = (topk_indexes // vocab_size).tolist()
+                topk_token_indexes = (topk_indexes % vocab_size).tolist()
+
+            for k in range(len(topk_hyp_indexes)):
+                hyp_idx = topk_hyp_indexes[k]
+                hyp = A[i][hyp_idx]
+
+                new_ys = hyp.ys[:]
+                new_token = topk_token_indexes[k]
+                if new_token != blank_id:
+                    new_ys.append(new_token)
+
+                new_log_prob = topk_log_probs[k]
+                new_hyp = Hypothesis(ys=new_ys, log_prob=new_log_prob)
+                B[i].add(new_hyp)
+
+    for i in range(batch_size):
+        streams[i].hyps = B[i]
+
+
+def fast_beam_search_one_best(
     model: nn.Module,
+    streams: List[Stream],
     encoder_out: torch.Tensor,
     processed_lens: torch.Tensor,
-    decoding_streams: k2.RnntDecodingStreams,
-) -> List[List[int]]:
+    beam: float,
+    max_states: int,
+    max_contexts: int,
+) -> None:
+    """It limits the maximum number of symbols per frame to 1.
+
+    A lattice is first obtained using fast beam search, and then
+    the shortest path within the lattice is used as the final output.
+
+    Args:
+      model:
+        An instance of `Transducer`.
+      streams:
+        A list of stream objects.
+      encoder_out:
+        A tensor of shape (N, T, C) from the encoder.
+      processed_lens:
+        A tensor of shape (N,) containing the number of processed frames
+        in `encoder_out` before padding.
+      beam:
+        Beam value, similar to the beam used in Kaldi.
+      max_states:
+        Max states per stream per frame.
+      max_contexts:
+        Max contexts per stream per frame.
+    """
+    assert encoder_out.ndim == 3
+
+    context_size = model.decoder.context_size
+    vocab_size = model.decoder.vocab_size
 
     B, T, C = encoder_out.shape
+    assert B == len(streams)
+
+    config = k2.RnntDecodingConfig(
+        vocab_size=vocab_size,
+        decoder_history_len=context_size,
+        beam=beam,
+        max_contexts=max_contexts,
+        max_states=max_states,
+    )
+    individual_streams = []
+    for i in range(B):
+        individual_streams.append(streams[i].rnnt_decoding_stream)
+    decoding_streams = k2.RnntDecodingStreams(individual_streams, config)
+
+    encoder_out = model.joiner.encoder_proj(encoder_out)
+
     for t in range(T):
         # shape is a RaggedShape of shape (B, context)
         # contexts is a Tensor of shape (shape.NumElements(), context_size)
@@ -280,127 +488,152 @@ def fast_beam_search(
     decoding_streams.terminate_and_flush_to_streams()
 
     lattice = decoding_streams.format_output(processed_lens.tolist())
+
     best_path = one_best_decoding(lattice)
-    hyp_tokens = get_texts(best_path)
-    return hyp_tokens
+    hyps = get_texts(best_path)
+
+    for i in range(B):
+        streams[i].hyp = hyps[i]
 
 
 def decode_one_chunk(
-    params: AttributeDict,
     model: nn.Module,
-    decode_streams: List[DecodeStream],
+    streams: List[Stream],
+    params: AttributeDict,
+    decoding_graph: Optional[k2.Fsa] = None,
 ) -> List[int]:
-    """Decode one chunk frames of features for each decode_streams and
-    return the indexes of finished streams in a List.
-
+    """
     Args:
-      params:
-        It's the return value of :func:`get_params`.
       model:
-        The neural model.
-      decode_streams:
-        A List of DecodeStream, each belonging to a utterance.
+        The Transducer model.
+      streams:
+        A list of Stream objects.
+      params:
+        It is returned by :func:`get_params`.
+      decoding_graph:
+        The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
+        only when --decoding_method is fast_beam_search.
+
     Returns:
-      Return a List containing which DecodeStreams are finished.
+       A list of indexes indicating the finished streams.
     """
-    device = model.device
+    device = next(model.parameters()).device
 
-    features = []
-    feature_lens = []
-    states = []
+    feature_list = []
+    feature_len_list = []
+    state_list = []
+    num_processed_frames_list = []
 
-    rnnt_stream_list = []
-    processed_lens = []
+    for stream in streams:
+        # We should first get `stream.num_processed_frames`
+        # before calling `stream.get_feature_chunk()`
+        # since `stream.num_processed_frames` would be updated
+        num_processed_frames_list.append(stream.num_processed_frames)
+        feature = stream.get_feature_chunk()
+        feature_len = feature.size(0)
+        feature_list.append(feature)
+        feature_len_list.append(feature_len)
+        state_list.append(stream.states)
+
+    features = pad_sequence(
+        feature_list, batch_first=True, padding_value=LOG_EPSILON
+    ).to(device)
+    feature_lens = torch.tensor(feature_len_list, device=device)
+    num_processed_frames = torch.tensor(
+        num_processed_frames_list, device=device
+    )
 
-    for stream in decode_streams:
-        feat, feat_len = stream.get_feature_frames(
-            params.decode_chunk_size * params.subsampling_factor
-        )
-        features.append(feat)
-        feature_lens.append(feat_len)
-        states.append(stream.states)
-        processed_lens.append(stream.done_frames)
-        if params.decoding_method == "fast_beam_search":
-            rnnt_stream_list.append(stream.rnnt_decoding_stream)
-
-    feature_lens = torch.tensor(feature_lens, device=device)
-    features = pad_sequence(features, batch_first=True, padding_value=LOG_EPS)
-
-    # if T is less than 7 there will be an error in time reduction layer,
-    # because we subsample features with ((x_len - 1) // 2 - 1) // 2
-    # we plus 2 here because we will cut off one frame on each size of
-    # encoder_embed output as they see invalid paddings. so we need extra 2
-    # frames.
-    tail_length = 7 + 2 * params.subsampling_factor
+    # Make sure it has at least 1 frame after subsampling, first-and-last-frame cutting, and right context cutting  # noqa
+    tail_length = 3 * params.subsampling_factor + 3
     if features.size(1) < tail_length:
-        feature_lens += tail_length - features.size(1)
-        features = torch.cat(
-            [
-                features,
-                torch.tensor(
-                    LOG_EPS, dtype=features.dtype, device=device
-                ).expand(
-                    features.size(0),
-                    tail_length - features.size(1),
-                    features.size(2),
-                ),
-            ],
-            dim=1,
+        pad_length = tail_length - features.size(1)
+        feature_lens += pad_length
+        features = torch.nn.functional.pad(
+            features,
+            (0, 0, 0, pad_length),
+            mode="constant",
+            value=LOG_EPSILON,
         )
 
-    states = [
-        torch.stack([x[0] for x in states], dim=2),
-        torch.stack([x[1] for x in states], dim=2),
-    ]
-    processed_lens = torch.tensor(processed_lens, device=device)
+    # Stack states of all streams
+    states = stack_states(state_list)
 
-    encoder_out, encoder_out_lens, states = model.encoder.infer(
+    encoder_out, encoder_out_lens, states = model.encoder(
         x=features,
         x_lens=feature_lens,
         states=states,
     )
 
-    encoder_out = model.joiner.encoder_proj(encoder_out)
-
     if params.decoding_method == "greedy_search":
-        hyp_tokens = greedy_search(model, encoder_out, decode_streams)
+        greedy_search(
+            model=model,
+            streams=streams,
+            encoder_out=encoder_out,
+        )
+    elif params.decoding_method == "modified_beam_search":
+        modified_beam_search(
+            model=model,
+            streams=streams,
+            encoder_out=encoder_out,
+            beam=params.beam_size,
+        )
     elif params.decoding_method == "fast_beam_search":
-        config = k2.RnntDecodingConfig(
-            vocab_size=params.vocab_size,
-            decoder_history_len=params.context_size,
+        # processed_lens: subsampled frames consumed so far plus this chunk's
+        # output lengths. Tensor `//` may warn on some torch versions.
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            processed_lens = (
+                num_processed_frames // params.subsampling_factor
+            ) + encoder_out_lens
+        fast_beam_search_one_best(
+            model=model,
+            streams=streams,
+            encoder_out=encoder_out,
+            processed_lens=processed_lens,
             beam=params.beam,
             max_contexts=params.max_contexts,
             max_states=params.max_states,
         )
-        decoding_streams = k2.RnntDecodingStreams(rnnt_stream_list, config)
-        processed_lens = processed_lens + encoder_out_lens
-        hyp_tokens = fast_beam_search(
-            model, encoder_out, processed_lens, decoding_streams
-        )
     else:
-        assert False
-
-    states = [torch.unbind(states[0], dim=2), torch.unbind(states[1], dim=2)]
+        raise ValueError(
+            f"Unsupported decoding method: {params.decoding_method}"
+        )
 
-    finished_streams = []
-    for i in range(len(decode_streams)):
-        decode_streams[i].states = [states[0][i], states[1][i]]
-        decode_streams[i].done_frames += encoder_out_lens[i]
-        if params.decoding_method == "fast_beam_search":
-            decode_streams[i].hyp = hyp_tokens[i]
-        if decode_streams[i].done:
-            finished_streams.append(i)
+    # Update cached states of each stream
+    state_list = unstack_states(states)
+    for i, s in enumerate(state_list):
+        streams[i].states = s
 
+    finished_streams = [i for i, stream in enumerate(streams) if stream.done]
     return finished_streams
 
 
+def create_streaming_feature_extractor() -> Fbank:
+    """Create a CPU streaming feature extractor.
+
+    At present, we assume it returns a fbank feature extractor with
+    fixed options. In the future, we will support passing in the options
+    from outside.
+
+    Returns:
+      Return a CPU streaming feature extractor.
+    """
+    opts = FbankOptions()
+    opts.device = "cpu"
+    opts.frame_opts.dither = 0
+    opts.frame_opts.snip_edges = False
+    opts.frame_opts.samp_freq = 16000
+    opts.mel_opts.num_bins = 80
+    return Fbank(opts)
+
+
 def decode_dataset(
     cuts: CutSet,
-    params: AttributeDict,
     model: nn.Module,
+    params: AttributeDict,
     sp: spm.SentencePieceProcessor,
     decoding_graph: Optional[k2.Fsa] = None,
-) -> Dict[str, List[Tuple[List[str], List[str]]]]:
+):
     """Decode dataset.
 
     Args:
@@ -409,12 +642,13 @@ def decode_dataset(
       params:
         It is returned by :func:`get_params`.
       model:
-        The neural model.
+        The Transducer model.
       sp:
         The BPE model.
       decoding_graph:
         The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
         only when --decoding_method is fast_beam_search.
+
     Returns:
       Return a dict, whose key may be "greedy_search" if greedy search
       is used, or it may be "beam_7" if beam size of 7 is used.
@@ -422,91 +656,88 @@ def decode_dataset(
       The first is the reference transcript, and the second is the
       predicted result.
     """
-    device = model.device
+    device = next(model.parameters()).device
 
-    opts = FbankOptions()
-    opts.device = device
-    opts.frame_opts.dither = 0
-    opts.frame_opts.snip_edges = False
-    opts.frame_opts.samp_freq = 16000
-    opts.mel_opts.num_bins = 80
+    log_interval = 300
 
-    log_interval = 50
+    fbank = create_streaming_feature_extractor()
 
     decode_results = []
-    # Contain decode streams currently running.
-    decode_streams = []
-    initial_states = model.encoder.get_init_states(device=device)
+    streams = []
     for num, cut in enumerate(cuts):
-        # each utterance has a DecodeStream.
-        decode_stream = DecodeStream(
+        # Each utterance has a Stream.
+        stream = Stream(
             params=params,
-            initial_states=initial_states,
             decoding_graph=decoding_graph,
             device=device,
+            LOG_EPS=LOG_EPSILON,
         )
 
+        stream.set_states(model.encoder.get_init_states(device=device))
+
         audio: np.ndarray = cut.load_audio()
         # audio.shape: (1, num_samples)
         assert len(audio.shape) == 2
         assert audio.shape[0] == 1, "Should be single channel"
         assert audio.dtype == np.float32, audio.dtype
-
         # The trained model is using normalized samples
         assert audio.max() <= 1, "Should be normalized to [-1, 1])"
 
         samples = torch.from_numpy(audio).squeeze(0)
+        feature = fbank(samples)
+        stream.set_feature(feature)
+        stream.set_ground_truth(cut.supervisions[0].text)
 
-        fbank = Fbank(opts)
-        feature = fbank(samples.to(device))
-        decode_stream.set_features(feature)
-        decode_stream.ground_truth = cut.supervisions[0].text
-
-        decode_streams.append(decode_stream)
+        streams.append(stream)
 
-        while len(decode_streams) >= params.num_decode_streams:
+        while len(streams) >= params.num_decode_streams:
             finished_streams = decode_one_chunk(
-                params=params, model=model, decode_streams=decode_streams
+                model=model,
+                streams=streams,
+                params=params,
+                decoding_graph=decoding_graph,
             )
+
             for i in sorted(finished_streams, reverse=True):
-                hyp = decode_streams[i].hyp
-                if params.decoding_method == "greedy_search":
-                    hyp = hyp[params.context_size :]  # noqa
                 decode_results.append(
                     (
-                        decode_streams[i].ground_truth.split(),
-                        sp.decode(hyp).split(),
+                        streams[i].ground_truth.split(),
+                        sp.decode(streams[i].decoding_result()).split(),
                     )
                 )
-                del decode_streams[i]
+                del streams[i]
 
         if num % log_interval == 0:
             logging.info(f"Cuts processed until now is {num}.")
 
-    # decode final chunks of last sequences
-    while len(decode_streams):
+    while len(streams) > 0:
         finished_streams = decode_one_chunk(
-            params=params, model=model, decode_streams=decode_streams
+            model=model,
+            streams=streams,
+            params=params,
+            decoding_graph=decoding_graph,
         )
+
         for i in sorted(finished_streams, reverse=True):
-            hyp = decode_streams[i].hyp
-            if params.decoding_method == "greedy_search":
-                hyp = hyp[params.context_size :]  # noqa
             decode_results.append(
                 (
-                    decode_streams[i].ground_truth.split(),
-                    sp.decode(hyp).split(),
+                    streams[i].ground_truth.split(),
+                    sp.decode(streams[i].decoding_result()).split(),
                 )
             )
-            del decode_streams[i]
+            del streams[i]
 
-    key = "greedy_search"
-    if params.decoding_method == "fast_beam_search":
+    if params.decoding_method == "greedy_search":
+        key = "greedy_search"
+    elif params.decoding_method == "fast_beam_search":
         key = (
             f"beam_{params.beam}_"
             f"max_contexts_{params.max_contexts}_"
             f"max_states_{params.max_states}"
         )
+    else:
+        key = f"beam_size_{params.beam_size}"
+
     return {key: decode_results}
 
 
@@ -520,8 +751,7 @@ def save_results(
         recog_path = (
             params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
         )
-        results = sorted(results)
-        store_transcripts(filename=recog_path, texts=results)
+        store_transcripts(filename=recog_path, texts=sorted(results))
         logging.info(f"The transcripts are stored in {recog_path}")
 
         # The following prints out WERs, per-word error statistics and aligned
@@ -565,6 +795,11 @@ def main():
     params = get_params()
     params.update(vars(args))
 
+    assert params.decoding_method in (
+        "greedy_search",
+        "fast_beam_search",
+        "modified_beam_search",
+    )
     params.res_dir = params.exp_dir / "streaming" / params.decoding_method
 
     if params.iter > 0:
@@ -572,19 +807,22 @@ def main():
     else:
         params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
 
-    # for streaming
-    params.suffix += f"-streaming-chunk-size-{params.decode_chunk_size}"
-
-    # for fast_beam_search
-    if params.decoding_method == "fast_beam_search":
+    if "fast_beam_search" in params.decoding_method:
         params.suffix += f"-beam-{params.beam}"
         params.suffix += f"-max-contexts-{params.max_contexts}"
         params.suffix += f"-max-states-{params.max_states}"
+    elif "beam_search" in params.decoding_method:
+        params.suffix += (
+            f"-{params.decoding_method}-beam-size-{params.beam_size}"
+        )
+    else:
+        params.suffix += f"-context-{params.context_size}"
+        params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"
 
     if params.use_averaged_model:
         params.suffix += "-use-averaged-model"
 
-    setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
+    setup_logger(f"{params.res_dir}/log-streaming-decode")
     logging.info("Decoding started")
 
     device = torch.device("cpu")
@@ -596,13 +834,12 @@ def main():
     sp = spm.SentencePieceProcessor()
     sp.load(params.bpe_model)
 
-    # <blk> and <unk> is defined in local/train_bpe_model.py
+    # <blk> and <unk> are defined in local/train_bpe_model.py
     params.blank_id = sp.piece_to_id("<blk>")
     params.unk_id = sp.piece_to_id("<unk>")
     params.vocab_size = sp.get_piece_size()
 
-    # Decoding in streaming requires causal convolution
-    params.causal_convolution = True
+    params.device = device
 
     logging.info(params)
 
@@ -633,7 +870,7 @@ def main():
             start = params.epoch - params.avg + 1
             filenames = []
             for i in range(start, params.epoch + 1):
-                if start >= 0:
+                if i >= 1:
                     filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
             logging.info(f"averaging {filenames}")
             model.to(device)
@@ -686,13 +923,12 @@ def main():
                 )
             )
 
-    model.to(device)
     model.eval()
-    model.device = device
 
-    decoding_graph = None
     if params.decoding_method == "fast_beam_search":
         decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
+    else:
+        decoding_graph = None
 
     num_param = sum([p.numel() for p in model.parameters()])
     logging.info(f"Number of model parameters: {num_param}")
@@ -708,8 +944,8 @@ def main():
     for test_set, test_cut in zip(test_sets, test_cuts):
         results_dict = decode_dataset(
             cuts=test_cut,
-            params=params,
             model=model,
+            params=params,
             sp=sp,
             decoding_graph=decoding_graph,
         )
@@ -724,4 +960,5 @@ def main():
 
 
 if __name__ == "__main__":
+    torch.manual_seed(20220810)
     main()
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/test_model.py b/egs/librispeech/ASR/lstm_transducer_stateless/test_model.py
index 78bf9b2d85..03dfe1997f 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/test_model.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/test_model.py
@@ -26,11 +26,13 @@
 import os
 from pathlib import Path
 
+import torch
 from export import (
     export_decoder_model_jit_trace,
     export_encoder_model_jit_trace,
     export_joiner_model_jit_trace,
 )
+from lstm import stack_states, unstack_states
 from scaling_converter import convert_scaled_to_non_scaled
 from train import get_params, get_transducer_model
 
@@ -70,8 +72,20 @@ def test_model():
     print("The model has been successfully exported using jit.trace.")
 
 
+def test_states_stack_and_unstack():
+    layer, batch, hidden, cell = 12, 100, 512, 1024
+    states = (
+        torch.randn(layer, batch, hidden),
+        torch.randn(layer, batch, cell),
+    )
+    states2 = stack_states(unstack_states(states))
+    assert torch.allclose(states[0], states2[0])
+    assert torch.allclose(states[1], states2[1])
+
+
 def main():
     test_model()
+    test_states_stack_and_unstack()
 
 
 if __name__ == "__main__":
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless3/export.py b/egs/librispeech/ASR/pruned_transducer_stateless3/export.py
index 2bb518bcd6..fb3db282a4 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless3/export.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless3/export.py
@@ -603,9 +603,9 @@ def main():
 
     model.to("cpu")
     model.eval()
-    convert_scaled_to_non_scaled(model, inplace=True)
 
     if params.onnx is True:
+        convert_scaled_to_non_scaled(model, inplace=True)
         opset_version = 11
         logging.info("Exporting to onnx format")
         encoder_filename = params.exp_dir / "encoder.onnx"
@@ -637,6 +637,7 @@ def main():
             all_in_one_filename,
         )
     elif params.jit is True:
+        convert_scaled_to_non_scaled(model, inplace=True)
         logging.info("Using torch.jit.script()")
         # We won't use the forward() method of the model in C++, so just ignore
         # it here.
@@ -660,6 +661,7 @@ def main():
         export_joiner_model_jit_trace(model.joiner, joiner_filename)
 
     elif params.jit_trace is True:
+        convert_scaled_to_non_scaled(model, inplace=True)
         logging.info("Using torch.jit.trace()")
         encoder_filename = params.exp_dir / "encoder_jit_trace.pt"
         export_encoder_model_jit_trace(model.encoder, encoder_filename)

From f63f855ab533ea642ef05d05be9964e27c341f11 Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Thu, 11 Aug 2022 10:29:02 +0800
Subject: [PATCH 28/40] fix model.py

---
 .../ASR/lstm_transducer_stateless/model.py    | 195 +++++++++++++++++-
 .../ASR/pruned_transducer_stateless2/model.py |   2 +-
 2 files changed, 195 insertions(+), 2 deletions(-)
 mode change 120000 => 100644 egs/librispeech/ASR/lstm_transducer_stateless/model.py

diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/model.py b/egs/librispeech/ASR/lstm_transducer_stateless/model.py
deleted file mode 120000
index ebb6d774d9..0000000000
--- a/egs/librispeech/ASR/lstm_transducer_stateless/model.py
+++ /dev/null
@@ -1 +0,0 @@
-../pruned_transducer_stateless2/model.py
\ No newline at end of file
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/model.py b/egs/librispeech/ASR/lstm_transducer_stateless/model.py
new file mode 100644
index 0000000000..37592ce083
--- /dev/null
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/model.py
@@ -0,0 +1,194 @@
+# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang, Wei Kang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import k2
+import torch
+import torch.nn as nn
+from encoder_interface import EncoderInterface
+from scaling import ScaledLinear
+
+from icefall.utils import add_sos
+
+
+class Transducer(nn.Module):
+    """It implements https://arxiv.org/pdf/1211.3711.pdf
+    "Sequence Transduction with Recurrent Neural Networks"
+    """
+
+    def __init__(
+        self,
+        encoder: EncoderInterface,
+        decoder: nn.Module,
+        joiner: nn.Module,
+        encoder_dim: int,
+        decoder_dim: int,
+        joiner_dim: int,
+        vocab_size: int,
+    ):
+        """
+        Args:
+          encoder:
+            It is the transcription network in the paper. It accepts
+            two inputs: `x` of (N, T, encoder_dim) and `x_lens` of shape (N,).
+            It returns two tensors: `logits` of shape (N, T, encoder_dim) and
+            `logit_lens` of shape (N,).
+          decoder:
+            It is the prediction network in the paper. Its input shape
+            is (N, U) and its output shape is (N, U, decoder_dim).
+            It should contain one attribute: `blank_id`.
+          joiner:
+            It has two inputs with shapes: (N, T, encoder_dim) and
+            (N, U, decoder_dim).
+            Its output shape is (N, T, U, vocab_size). Note that its output
+            contains unnormalized probs, i.e., not processed by log-softmax.
+        """
+        super().__init__()
+        assert isinstance(encoder, EncoderInterface), type(encoder)
+        assert hasattr(decoder, "blank_id")
+
+        self.encoder = encoder
+        self.decoder = decoder
+        self.joiner = joiner
+
+        self.simple_am_proj = ScaledLinear(
+            encoder_dim, vocab_size, initial_speed=0.5
+        )
+        self.simple_lm_proj = ScaledLinear(decoder_dim, vocab_size)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_lens: torch.Tensor,
+        y: k2.RaggedTensor,
+        prune_range: int = 5,
+        am_scale: float = 0.0,
+        lm_scale: float = 0.0,
+        warmup: float = 1.0,
+    ) -> torch.Tensor:
+        """
+        Args:
+          x:
+            A 3-D tensor of shape (N, T, C).
+          x_lens:
+            A 1-D tensor of shape (N,). It contains the number of frames in `x`
+            before padding.
+          y:
+            A ragged tensor with 2 axes [utt][label]. It contains labels of each
+            utterance.
+          prune_range:
+            The prune range for rnnt loss, it means how many symbols(context)
+            we are considering for each frame to compute the loss.
+          am_scale:
+            The scale to smooth the loss with am (output of encoder network)
+            part
+          lm_scale:
+            The scale to smooth the loss with lm (output of predictor network)
+            part
+          warmup:
+            A value warmup >= 0 that determines which modules are active, values
+            warmup > 1 "are fully warmed up" and all modules will be active.
+        Returns:
+          Return the transducer loss.
+
+        Note:
+           Regarding am_scale & lm_scale, it will make the loss-function one of
+           the form:
+              lm_scale * lm_probs + am_scale * am_probs +
+              (1-lm_scale-am_scale) * combined_probs
+        """
+        assert x.ndim == 3, x.shape
+        assert x_lens.ndim == 1, x_lens.shape
+        assert y.num_axes == 2, y.num_axes
+
+        assert x.size(0) == x_lens.size(0) == y.dim0
+
+        encoder_out, x_lens, _ = self.encoder(x, x_lens, warmup=warmup)
+        assert torch.all(x_lens > 0)
+
+        # Now for the decoder, i.e., the prediction network
+        row_splits = y.shape.row_splits(1)
+        y_lens = row_splits[1:] - row_splits[:-1]
+
+        blank_id = self.decoder.blank_id
+        sos_y = add_sos(y, sos_id=blank_id)
+
+        # sos_y_padded: [B, S + 1], start with SOS.
+        sos_y_padded = sos_y.pad(mode="constant", padding_value=blank_id)
+
+        # decoder_out: [B, S + 1, decoder_dim]
+        decoder_out = self.decoder(sos_y_padded)
+
+        # Note: y does not start with SOS
+        # y_padded : [B, S]
+        y_padded = y.pad(mode="constant", padding_value=0)
+
+        y_padded = y_padded.to(torch.int64)
+        boundary = torch.zeros(
+            (x.size(0), 4), dtype=torch.int64, device=x.device
+        )
+        boundary[:, 2] = y_lens
+        boundary[:, 3] = x_lens
+
+        lm = self.simple_lm_proj(decoder_out)
+        am = self.simple_am_proj(encoder_out)
+
+        with torch.cuda.amp.autocast(enabled=False):
+            simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed(
+                lm=lm.float(),
+                am=am.float(),
+                symbols=y_padded,
+                termination_symbol=blank_id,
+                lm_only_scale=lm_scale,
+                am_only_scale=am_scale,
+                boundary=boundary,
+                reduction="sum",
+                return_grad=True,
+            )
+
+        # ranges : [B, T, prune_range]
+        ranges = k2.get_rnnt_prune_ranges(
+            px_grad=px_grad,
+            py_grad=py_grad,
+            boundary=boundary,
+            s_range=prune_range,
+        )
+
+        # am_pruned : [B, T, prune_range, encoder_dim]
+        # lm_pruned : [B, T, prune_range, decoder_dim]
+        am_pruned, lm_pruned = k2.do_rnnt_pruning(
+            am=self.joiner.encoder_proj(encoder_out),
+            lm=self.joiner.decoder_proj(decoder_out),
+            ranges=ranges,
+        )
+
+        # logits : [B, T, prune_range, vocab_size]
+
+        # project_input=False since we applied the decoder's input projections
+        # prior to do_rnnt_pruning (this is an optimization for speed).
+        logits = self.joiner(am_pruned, lm_pruned, project_input=False)
+
+        with torch.cuda.amp.autocast(enabled=False):
+            pruned_loss = k2.rnnt_loss_pruned(
+                logits=logits.float(),
+                symbols=y_padded,
+                ranges=ranges,
+                termination_symbol=blank_id,
+                boundary=boundary,
+                reduction="sum",
+            )
+
+        return (simple_loss, pruned_loss)
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/model.py b/egs/librispeech/ASR/pruned_transducer_stateless2/model.py
index 37592ce083..2434fd41d6 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/model.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/model.py
@@ -116,7 +116,7 @@ def forward(
 
         assert x.size(0) == x_lens.size(0) == y.dim0
 
-        encoder_out, x_lens, _ = self.encoder(x, x_lens, warmup=warmup)
+        encoder_out, x_lens = self.encoder(x, x_lens, warmup=warmup)
         assert torch.all(x_lens > 0)
 
         # Now for the decoder, i.e., the prediction network

From dc212ba4e1570e2d97d50cafde78a038c601cadd Mon Sep 17 00:00:00 2001
From: pkufool <wkang.pku@gmail.com>
Date: Sun, 7 Aug 2022 17:19:19 +0800
Subject: [PATCH 29/40] Add cut_id to recognition results

---
 .../ASR/pruned_transducer_stateless/decode.py |  7 ++++--
 icefall/utils.py                              | 25 ++++++++++---------
 2 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless/decode.py b/egs/librispeech/ASR/pruned_transducer_stateless/decode.py
index b7558089c4..282917625e 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless/decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless/decode.py
@@ -551,6 +551,7 @@ def decode_dataset(
     results = defaultdict(list)
     for batch_idx, batch in enumerate(dl):
         texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]
 
         hyps_dict = decode_one_batch(
             params=params,
@@ -564,9 +565,9 @@ def decode_dataset(
         for name, hyps in hyps_dict.items():
             this_batch = []
             assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                 ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))
 
             results[name].extend(this_batch)
 
@@ -631,6 +632,8 @@ def main():
     LibriSpeechAsrDataModule.add_arguments(parser)
     args = parser.parse_args()
     args.exp_dir = Path(args.exp_dir)
+    # we need cut ids to display recognition results.
+    args.return_cuts = True
 
     params = get_params()
     params.update(vars(args))
diff --git a/icefall/utils.py b/icefall/utils.py
index f40f769f86..2e04767333 100644
--- a/icefall/utils.py
+++ b/icefall/utils.py
@@ -321,7 +321,7 @@ def load_alignments(filename: str) -> Tuple[int, Dict[str, List[int]]]:
 
 
 def store_transcripts(
-    filename: Pathlike, texts: Iterable[Tuple[str, str]]
+    filename: Pathlike, texts: Iterable[Tuple[str, str, str]]
 ) -> None:
     """Save predicted results and reference transcripts to a file.
 
@@ -329,15 +329,15 @@ def store_transcripts(
       filename:
         File to save the results to.
       texts:
-        An iterable of tuples. The first element is the reference transcript
-        while the second element is the predicted result.
+        An iterable of tuples. The first element is the cut_id, the second is
+        the reference transcript and the third element is the predicted result.
     Returns:
       Return None.
     """
     with open(filename, "w") as f:
-        for ref, hyp in texts:
-            print(f"ref={ref}", file=f)
-            print(f"hyp={hyp}", file=f)
+        for cut_id, ref, hyp in texts:
+            print(f"{cut_id}:\tref={ref}", file=f)
+            print(f"{cut_id}:\thyp={hyp}", file=f)
 
 
 def write_error_stats(
@@ -372,8 +372,8 @@ def write_error_stats(
           The reference word `SIR` is missing in the predicted
           results (a deletion error).
       results:
-        An iterable of tuples. The first element is the reference transcript
-        while the second element is the predicted result.
+        An iterable of tuples. The first element is the cut_id, the second is
+        the reference transcript and the third element is the predicted result.
       enable_log:
         If True, also print detailed WER to the console.
         Otherwise, it is written only to the given file.
@@ -389,7 +389,7 @@ def write_error_stats(
     words: Dict[str, List[int]] = defaultdict(lambda: [0, 0, 0, 0, 0])
     num_corr = 0
     ERR = "*"
-    for ref, hyp in results:
+    for cut_id, ref, hyp in results:
         ali = kaldialign.align(ref, hyp, ERR)
         for ref_word, hyp_word in ali:
             if ref_word == ERR:
@@ -405,7 +405,7 @@ def write_error_stats(
             else:
                 words[ref_word][0] += 1
                 num_corr += 1
-    ref_len = sum([len(r) for r, _ in results])
+    ref_len = sum([len(r) for _, r, _ in results])
     sub_errs = sum(subs.values())
     ins_errs = sum(ins.values())
     del_errs = sum(dels.values())
@@ -434,7 +434,7 @@ def write_error_stats(
 
     print("", file=f)
     print("PER-UTT DETAILS: corr or (ref->hyp)  ", file=f)
-    for ref, hyp in results:
+    for cut_id, ref, hyp in results:
         ali = kaldialign.align(ref, hyp, ERR)
         combine_successive_errors = True
         if combine_successive_errors:
@@ -461,7 +461,8 @@ def write_error_stats(
             ]
 
         print(
-            " ".join(
+            f"{cut_id}:\t"
+            + " ".join(
                 (
                     ref_word
                     if ref_word == hyp_word

From 8cceedf62910792b05ad09310b68ff43a45b3836 Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Thu, 11 Aug 2022 15:08:22 +0800
Subject: [PATCH 30/40] Add cut_id to recognition results

---
 egs/librispeech/ASR/lstm_transducer_stateless/decode.py   | 8 ++++++--
 egs/librispeech/ASR/lstm_transducer_stateless/stream.py   | 6 ++++++
 .../ASR/lstm_transducer_stateless/streaming_decode.py     | 3 +++
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/decode.py b/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
index afc614c971..08f4e42505 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
@@ -526,6 +526,7 @@ def decode_dataset(
     results = defaultdict(list)
     for batch_idx, batch in enumerate(dl):
         texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]
 
         hyps_dict = decode_one_batch(
             params=params,
@@ -539,9 +540,9 @@ def decode_dataset(
         for name, hyps in hyps_dict.items():
             this_batch = []
             assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                 ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
+                this_batch.append((cut_id, ref_words, hyp_words))
 
             results[name].extend(this_batch)
 
@@ -566,6 +567,7 @@ def save_results(
         recog_path = (
             params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
         )
+        results = sorted(results)
         store_transcripts(filename=recog_path, texts=results)
         logging.info(f"The transcripts are stored in {recog_path}")
 
@@ -770,6 +772,8 @@ def main():
     num_param = sum([p.numel() for p in model.parameters()])
     logging.info(f"Number of model parameters: {num_param}")
 
+    # we need cut ids to display recognition results.
+    args.return_cuts = True
     librispeech = LibriSpeechAsrDataModule(args)
 
     test_clean_cuts = librispeech.test_clean_cuts()
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/stream.py b/egs/librispeech/ASR/lstm_transducer_stateless/stream.py
index 55b331abc6..3a4d3b34b0 100644
--- a/egs/librispeech/ASR/lstm_transducer_stateless/stream.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/stream.py
@@ -29,6 +29,7 @@ class Stream(object):
     def __init__(
         self,
         params: AttributeDict,
+        cut_id: str,
         decoding_graph: Optional[k2.Fsa] = None,
         device: torch.device = torch.device("cpu"),
         LOG_EPS: float = math.log(1e-10),
@@ -44,6 +45,7 @@ def __init__(
             The device to run this stream.
         """
         self.LOG_EPS = LOG_EPS
+        self.cut_id = cut_id
 
         # Containing attention caches and convolution caches
         self.states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
@@ -130,6 +132,10 @@ def get_feature_chunk(self) -> torch.Tensor:
 
         return ret_feature
 
+    @property
+    def id(self) -> str:
+        return self.cut_id
+
     @property
     def done(self) -> bool:
         """Return True if all feature frames are processed."""
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py b/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
index 0e82e05a1e..b47cfd5a85 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
@@ -668,6 +668,7 @@ def decode_dataset(
         # Each utterance has a Stream.
         stream = Stream(
             params=params,
+            cut_id=cut.id,
             decoding_graph=decoding_graph,
             device=device,
             LOG_EPS=LOG_EPSILON,
@@ -701,6 +702,7 @@ def decode_dataset(
             for i in sorted(finished_streams, reverse=True):
                 decode_results.append(
                     (
+                        streams[i].id,
                         streams[i].ground_truth.split(),
                         sp.decode(streams[i].decoding_result()).split(),
                     )
@@ -721,6 +723,7 @@ def decode_dataset(
         for i in sorted(finished_streams, reverse=True):
             decode_results.append(
                 (
+                    streams[i].id,
                     streams[i].ground_truth.split(),
                     sp.decode(streams[i].decoding_result()).split(),
                 )

From 7ee3701a4afca797e2e8528fd6d20f2ac531f84e Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Sun, 14 Aug 2022 17:10:39 +0800
Subject: [PATCH 31/40] do not pad in Conv subsampling module; add tail padding
 during decoding.

---
 .../ASR/lstm_transducer_stateless/decode.py   |  9 +++++++
 .../ASR/lstm_transducer_stateless/lstm.py     | 26 ++++++++-----------
 .../ASR/lstm_transducer_stateless/stream.py   |  7 +++--
 .../streaming_decode.py                       |  4 +--
 4 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/decode.py b/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
index 08f4e42505..b30e76d4c8 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
@@ -356,6 +356,15 @@ def decode_one_batch(
     supervisions = batch["supervisions"]
     feature_lens = supervisions["num_frames"].to(device)
 
+    # tail padding
+    feature = torch.nn.functional.pad(
+        feature,
+        (0, 0, 0, 20),
+        mode="constant",
+        value=LOG_EPS,
+    )
+    feature_lens += 20
+
     encoder_out, encoder_out_lens, _ = model.encoder(
         x=feature, x_lens=feature_lens
     )
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py b/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
index c24bac6df5..53bd33ba07 100644
--- a/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
@@ -206,10 +206,10 @@ def forward(
         x = self.encoder_embed(x)
         x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
 
-        # lengths = ((x_lens - 1) // 2 - 1) // 2 # issue an warning
+        # lengths = ((x_lens - 3) // 2 - 1) // 2 # issues a warning
         #
         # Note: rounding_mode in torch.div() is available only in torch >= 1.8.0
-        lengths = (((x_lens - 1) >> 1) - 1) >> 1
+        lengths = (((x_lens - 3) >> 1) - 1) >> 1
         if not torch.jit.is_tracing():
             assert x.size(0) == lengths.max().item()
 
@@ -218,10 +218,6 @@ def forward(
             # torch.jit.trace requires returned types be the same as annotated
             new_states = (torch.empty(0), torch.empty(0))
         else:
-            # we cut off 1 frame on each side of encoder_embed output
-            lengths -= 2
-            x = x[1:-1, :, :]
-
             assert not self.training
             assert len(states) == 2
             if not torch.jit.is_tracing():
@@ -496,7 +492,7 @@ class Conv2dSubsampling(nn.Module):
 
     Convert an input of shape (N, T, idim) to an output
     with shape (N, T', odim), where
-    T' = ((T-1)//2 - 1)//2, which approximates T' == T//4
+    T' = ((T-3)//2-1)//2, which approximates T' == T//4
 
     It is based on
     https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/subsampling.py  # noqa
@@ -514,15 +510,15 @@ def __init__(
         Args:
           in_channels:
             Number of channels in. The input shape is (N, T, in_channels).
-            Caution: It requires: T >=7, in_channels >=7
+            Caution: It requires: T >= 9, in_channels >= 9.
           out_channels
-            Output dim. The output shape is (N, ((T-1)//2 - 1)//2, out_channels)
+            Output dim. The output shape is (N, ((T-3)//2-1)//2, out_channels)
           layer1_channels:
             Number of channels in layer1
           layer1_channels:
             Number of channels in layer2
         """
-        assert in_channels >= 7
+        assert in_channels >= 9
         super().__init__()
 
         self.conv = nn.Sequential(
@@ -530,7 +526,7 @@ def __init__(
                 in_channels=1,
                 out_channels=layer1_channels,
                 kernel_size=3,
-                padding=1,
+                padding=0,
             ),
             ActivationBalancer(channel_dim=1),
             DoubleSwish(),
@@ -552,7 +548,7 @@ def __init__(
             DoubleSwish(),
         )
         self.out = ScaledLinear(
-            layer3_channels * (((in_channels - 1) // 2 - 1) // 2), out_channels
+            layer3_channels * (((in_channels - 3) // 2 - 1) // 2), out_channels
         )
         # set learn_eps=False because out_norm is preceded by `out`, and `out`
         # itself has learned scale, so the extra degree of freedom is not
@@ -571,15 +567,15 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             Its shape is (N, T, idim).
 
         Returns:
-          Return a tensor of shape (N, ((T-1)//2 - 1)//2, odim)
+          Return a tensor of shape (N, ((T-3)//2-1)//2, odim)
         """
         # On entry, x is (N, T, idim)
         x = x.unsqueeze(1)  # (N, T, idim) -> (N, 1, T, idim) i.e., (N, C, H, W)
         x = self.conv(x)
-        # Now x is of shape (N, odim, ((T-1)//2 - 1)//2, ((idim-1)//2 - 1)//2)
+        # Now x is of shape (N, odim, ((T-3)//2-1)//2, ((idim-3)//2-1)//2)
         b, c, t, f = x.size()
         x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
-        # Now x is of shape (N, ((T-1)//2 - 1))//2, odim)
+        # Now x is of shape (N, ((T-3)//2-1)//2, odim)
         x = self.out_norm(x)
         x = self.out_balancer(x)
         return x
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/stream.py b/egs/librispeech/ASR/lstm_transducer_stateless/stream.py
index 3a4d3b34b0..8c6c0609bf 100644
--- a/egs/librispeech/ASR/lstm_transducer_stateless/stream.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/stream.py
@@ -81,8 +81,7 @@ def __init__(
         # Make sure all feature frames can be used.
         # We aim to obtain 1 frame after subsampling.
         self.chunk_length = params.subsampling_factor
-        # Add 2 here since we will drop the first and last after subsampling.
-        self.pad_length = 2 * params.subsampling_factor + 3
+        self.pad_length = 5
         self.num_frames = 0
         self.num_processed_frames = 0
 
@@ -91,11 +90,11 @@ def __init__(
 
     def set_feature(self, feature: torch.Tensor) -> None:
         assert feature.dim() == 2, feature.dim()
-        self.num_frames = feature.size(0)
         # tail padding
+        self.num_frames = feature.size(0) + 20
         self.feature = torch.nn.functional.pad(
             feature,
-            (0, 0, 0, self.pad_length),
+            (0, 0, 0, self.pad_length + 20),
             mode="constant",
             value=self.LOG_EPS,
         )
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py b/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
index b47cfd5a85..0b6a51ee10 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
@@ -543,8 +543,8 @@ def decode_one_chunk(
         num_processed_frames_list, device=device
     )
 
-    # Make sure it has at least 1 frame after subsampling, first-and-last-frame cutting, and right context cutting  # noqa
-    tail_length = 3 * params.subsampling_factor + 3
+    # Make sure it has at least 1 frame after subsampling
+    tail_length = params.subsampling_factor + 5
     if features.size(1) < tail_length:
         pad_length = tail_length - features.size(1)
         feature_lens += pad_length

From be1861087d84c58d0793dc3e00b650477a7f723d Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Thu, 18 Aug 2022 16:37:49 +0800
Subject: [PATCH 32/40] update RESULTS.md

---
 egs/librispeech/ASR/RESULTS.md                | 132 ++++++++++++++++++
 .../ASR/lstm_transducer_stateless/decode.py   |  16 +--
 .../ASR/lstm_transducer_stateless/stream.py   |   4 +-
 .../streaming_decode.py                       |   9 +-
 .../ASR/lstm_transducer_stateless/train.py    |   2 +-
 5 files changed, 148 insertions(+), 15 deletions(-)

diff --git a/egs/librispeech/ASR/RESULTS.md b/egs/librispeech/ASR/RESULTS.md
index 496445fbff..e0459cb4d4 100644
--- a/egs/librispeech/ASR/RESULTS.md
+++ b/egs/librispeech/ASR/RESULTS.md
@@ -1,5 +1,137 @@
 ## Results
 
+#### LibriSpeech BPE training results (Pruned Stateless LSTM RNN-T 2)
+
+[lstm_transducer_stateless](./lstm_transducer_stateless)
+
+It implements an LSTM model for streaming ASR, using mechanisms from the reworked model.
+
+See <https://github.com/k2-fsa/icefall/pull/479> for more details.
+
+#### training on full librispeech
+
+This model contains 12 encoder layers (LSTM module + Feedforward module). The number of model parameters is 84689496.
+
+The WERs are:
+
+|                                     | test-clean | test-other | comment              | decoding mode        |
+|-------------------------------------|------------|------------|----------------------|----------------------|
+| greedy search (max sym per frame 1) | 3.81       | 9.73       | --epoch 35 --avg 15  | simulated streaming  |
+| greedy search (max sym per frame 1) | 3.78       | 9.79       | --epoch 35 --avg 15  | streaming            |
+| fast beam search                    | 3.74       | 9.59       | --epoch 35 --avg 15  | simulated streaming  |
+| fast beam search                    | 3.73       | 9.61       | --epoch 35 --avg 15  | streaming            |
+| modified beam search                | 3.64       | 9.55       | --epoch 35 --avg 15  | simulated streaming  |
+| modified beam search                | 3.65       | 9.51       | --epoch 35 --avg 15  | streaming            |
+
+The training command is:
+
+```bash
+./lstm_transducer_stateless/train.py \
+  --world-size 4 \
+  --num-epochs 35 \
+  --start-epoch 1 \
+  --exp-dir lstm_transducer_stateless/exp \
+  --full-libri 1 \
+  --max-duration 500 \
+  --master-port 12321 \
+  --num-encoder-layers 12 \
+  --rnn-hidden-size 1024
+```
+
+The tensorboard log can be found at
+<https://tensorboard.dev/experiment/FWrM20mjTeWo6dTpFYOsYQ/>
+
+The simulated streaming decoding command using greedy search is:
+```bash
+./lstm_transducer_stateless/decode.py \
+  --epoch 35 \
+  --avg 15 \
+  --exp-dir lstm_transducer_stateless/exp \
+  --max-duration 600 \
+  --num-encoder-layers 12 \
+  --rnn-hidden-size 1024 \
+  --decoding-method greedy_search \
+  --use-averaged-model True
+```
+
+The simulated streaming decoding command using fast beam search is:
+```bash
+./lstm_transducer_stateless/decode.py \
+  --epoch 35 \
+  --avg 15 \
+  --exp-dir lstm_transducer_stateless/exp \
+  --max-duration 600 \
+  --num-encoder-layers 12 \
+  --rnn-hidden-size 1024 \
+  --decoding-method fast_beam_search \
+  --use-averaged-model True \
+  --beam 4 \
+  --max-contexts 4 \
+  --max-states 8
+```
+
+The simulated streaming decoding command using modified beam search is:
+```bash
+./lstm_transducer_stateless/decode.py \
+  --epoch 35 \
+  --avg 15 \
+  --exp-dir lstm_transducer_stateless/exp \
+  --max-duration 600 \
+  --num-encoder-layers 12 \
+  --rnn-hidden-size 1024 \
+  --decoding-method modified_beam_search \
+  --use-averaged-model True \
+  --beam-size 4
+```
+
+The streaming decoding command using greedy search is:
+```bash
+./lstm_transducer_stateless/streaming_decode.py \
+  --epoch 35 \
+  --avg 15 \
+  --exp-dir lstm_transducer_stateless/exp \
+  --max-duration 600 \
+  --num-encoder-layers 12 \
+  --rnn-hidden-size 1024 \
+  --decoding-method greedy_search \
+  --use-averaged-model True
+```
+
+The streaming decoding command using fast beam search is:
+```bash
+./lstm_transducer_stateless/streaming_decode.py \
+  --epoch 35 \
+  --avg 15 \
+  --exp-dir lstm_transducer_stateless/exp \
+  --max-duration 600 \
+  --num-encoder-layers 12 \
+  --rnn-hidden-size 1024 \
+  --decoding-method fast_beam_search \
+  --use-averaged-model True \
+  --beam 4 \
+  --max-contexts 4 \
+  --max-states 8
+```
+
+The streaming decoding command using modified beam search is:
+```bash
+./lstm_transducer_stateless/streaming_decode.py \
+  --epoch 35 \
+  --avg 15 \
+  --exp-dir lstm_transducer_stateless/exp \
+  --max-duration 600 \
+  --num-encoder-layers 12 \
+  --rnn-hidden-size 1024 \
+  --decoding-method modified_beam_search \
+  --use-averaged-model True \
+  --beam-size 4
+```
+
+Pretrained models, training logs, decoding logs, and decoding results
+are available at
+<https://huggingface.co/Zengwei/icefall-asr-librispeech-lstm-transducer-stateless-2022-08-18>
+
+
 #### LibriSpeech BPE training results (Pruned Stateless Conv-Emformer RNN-T 2)
 
 [conv_emformer_transducer_stateless2](./conv_emformer_transducer_stateless2)
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/decode.py b/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
index b30e76d4c8..08b614f043 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
@@ -20,7 +20,7 @@
 Usage:
 (1) greedy search
 ./lstm_transducer_stateless/decode.py \
-    --epoch 30 \
+    --epoch 35 \
     --avg 15 \
     --exp-dir ./lstm_transducer_stateless/exp \
     --max-duration 600 \
@@ -28,7 +28,7 @@
 
 (2) beam search (not recommended)
 ./lstm_transducer_stateless/decode.py \
-    --epoch 30 \
+    --epoch 35 \
     --avg 15 \
     --exp-dir ./lstm_transducer_stateless/exp \
     --max-duration 600 \
@@ -37,7 +37,7 @@
 
 (3) modified beam search
 ./lstm_transducer_stateless/decode.py \
-    --epoch 30 \
+    --epoch 35 \
     --avg 15 \
     --exp-dir ./lstm_transducer_stateless/exp \
     --max-duration 600 \
@@ -46,7 +46,7 @@
 
 (4) fast beam search (one best)
 ./lstm_transducer_stateless/decode.py \
-    --epoch 30 \
+    --epoch 35 \
     --avg 15 \
     --exp-dir ./lstm_transducer_stateless/exp \
     --max-duration 600 \
@@ -70,7 +70,7 @@
 
 (6) fast beam search (nbest oracle WER)
 ./lstm_transducer_stateless/decode.py \
-    --epoch 30 \
+    --epoch 35 \
     --avg 15 \
     --exp-dir ./lstm_transducer_stateless/exp \
     --max-duration 600 \
@@ -83,7 +83,7 @@
 
 (7) fast beam search (with LG)
 ./lstm_transducer_stateless/decode.py \
-    --epoch 28 \
+    --epoch 35 \
     --avg 15 \
     --exp-dir ./lstm_transducer_stateless/exp \
     --max-duration 600 \
@@ -359,11 +359,11 @@ def decode_one_batch(
     # tail padding
     feature = torch.nn.functional.pad(
         feature,
-        (0, 0, 0, 20),
+        (0, 0, 0, 35),
         mode="constant",
         value=LOG_EPS,
     )
-    feature_lens += 20
+    feature_lens += 35
 
     encoder_out, encoder_out_lens, _ = model.encoder(
         x=feature, x_lens=feature_lens
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/stream.py b/egs/librispeech/ASR/lstm_transducer_stateless/stream.py
index 8c6c0609bf..552d4490a7 100644
--- a/egs/librispeech/ASR/lstm_transducer_stateless/stream.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/stream.py
@@ -91,10 +91,10 @@ def __init__(
     def set_feature(self, feature: torch.Tensor) -> None:
         assert feature.dim() == 2, feature.dim()
         # tail padding
-        self.num_frames = feature.size(0) + 20
+        self.num_frames = feature.size(0) + 35
         self.feature = torch.nn.functional.pad(
             feature,
-            (0, 0, 0, self.pad_length + 20),
+            (0, 0, 0, self.pad_length + 35),
             mode="constant",
             value=self.LOG_EPS,
         )
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py b/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
index 0b6a51ee10..17819d8ffb 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
@@ -20,7 +20,7 @@
 Usage:
 (1) greedy search
 ./lstm_transducer_stateless/streaming_decode.py \
-      --epoch 30 \
+      --epoch 35 \
       --avg 10 \
       --exp-dir lstm_transducer_stateless/exp \
       --num-decode-streams 2000 \
@@ -31,7 +31,7 @@
 
 (2) modified beam search
 ./lstm_transducer_stateless/streaming_decode.py \
-      --epoch 30 \
+      --epoch 35 \
       --avg 10 \
       --exp-dir lstm_transducer_stateless/exp \
       --num-decode-streams 2000 \
@@ -43,7 +43,7 @@
 
 (3) fast beam search
 ./lstm_transducer_stateless/streaming_decode.py \
-      --epoch 30 \
+      --epoch 35 \
       --avg 10 \
       --exp-dir lstm_transducer_stateless/exp \
       --num-decode-streams 2000 \
@@ -580,7 +580,8 @@ def decode_one_chunk(
     elif params.decoding_method == "fast_beam_search":
         # feature_len is needed to get partial results.
         # The rnnt_decoding_stream for fast_beam_search.
-        with warnings.simplefilter("ignore"):
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
             processed_lens = (
                 num_processed_frames // params.subsampling_factor
                 + encoder_out_lens
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/train.py b/egs/librispeech/ASR/lstm_transducer_stateless/train.py
index 8d07aae5e9..dabd54197e 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/train.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/train.py
@@ -148,7 +148,7 @@ def get_parser():
     parser.add_argument(
         "--num-epochs",
         type=int,
-        default=30,
+        default=35,
         help="Number of epochs to train.",
     )
 

From ab6f5e319cdd4c6eddc3c625afcdf7aa331b366c Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Thu, 18 Aug 2022 16:49:07 +0800
Subject: [PATCH 33/40] minor fix

---
 egs/librispeech/ASR/RESULTS.md                                | 2 +-
 .../ASR/lstm_transducer_stateless/test_scaling_converter.py   | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/egs/librispeech/ASR/RESULTS.md b/egs/librispeech/ASR/RESULTS.md
index e0459cb4d4..dd41ee2912 100644
--- a/egs/librispeech/ASR/RESULTS.md
+++ b/egs/librispeech/ASR/RESULTS.md
@@ -1,6 +1,6 @@
 ## Results
 
-#### LibriSpeech BPE training results (Pruned Stateless LSTM RNN-T 2)
+#### LibriSpeech BPE training results (Pruned Stateless LSTM RNN-T)
 
 [lstm_transducer_stateless](./lstm_transducer_stateless)
 
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/test_scaling_converter.py b/egs/librispeech/ASR/lstm_transducer_stateless/test_scaling_converter.py
index 0a7e63b60f..7567dd58c2 100644
--- a/egs/librispeech/ASR/lstm_transducer_stateless/test_scaling_converter.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/test_scaling_converter.py
@@ -208,8 +208,8 @@ def test_convert_scaled_to_non_scaled():
         x = torch.randn(N, T, 80, dtype=torch.float32)
         x_lens = torch.full((N,), x.size(1))
 
-        e1, e1_lens = model.encoder(x, x_lens)
-        e2, e2_lens = converted_model.encoder(x, x_lens)
+        e1, e1_lens, _ = model.encoder(x, x_lens)
+        e2, e2_lens, _ = converted_model.encoder(x, x_lens)
 
         assert torch.all(torch.eq(e1_lens, e2_lens))
         assert torch.allclose(e1, e2), (e1 - e2).abs().max()

From db3e570d6aed5ba570601daa62a4aed525710524 Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Thu, 18 Aug 2022 17:23:01 +0800
Subject: [PATCH 34/40] fix doc

---
 .../decode_stream.py                          | 126 ------------------
 .../ASR/lstm_transducer_stateless/export.py   |  10 +-
 2 files changed, 5 insertions(+), 131 deletions(-)
 delete mode 100644 egs/librispeech/ASR/lstm_transducer_stateless/decode_stream.py

diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/decode_stream.py b/egs/librispeech/ASR/lstm_transducer_stateless/decode_stream.py
deleted file mode 100644
index 7df96a9afb..0000000000
--- a/egs/librispeech/ASR/lstm_transducer_stateless/decode_stream.py
+++ /dev/null
@@ -1,126 +0,0 @@
-# Copyright    2022  Xiaomi Corp.        (authors: Wei Kang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-from typing import List, Optional, Tuple
-
-import k2
-import torch
-
-from icefall.utils import AttributeDict
-
-
-class DecodeStream(object):
-    def __init__(
-        self,
-        params: AttributeDict,
-        initial_states: List[torch.Tensor],
-        decoding_graph: Optional[k2.Fsa] = None,
-        device: torch.device = torch.device("cpu"),
-    ) -> None:
-        """
-        Args:
-          initial_states:
-            Initial decode states of the model, e.g. the return value of
-            `get_init_state` in conformer.py
-          decoding_graph:
-            Decoding graph used for decoding, may be a TrivialGraph or a HLG.
-            Used only when decoding_method is fast_beam_search.
-          device:
-            The device to run this stream.
-        """
-        if decoding_graph is not None:
-            assert device == decoding_graph.device
-
-        self.params = params
-        self.LOG_EPS = math.log(1e-10)
-
-        self.states = initial_states
-
-        # It contains a 2-D tensors representing the feature frames.
-        self.features: torch.Tensor = None
-
-        self.num_frames: int = 0
-        # how many frames have been processed. (before subsampling).
-        # we only modify this value in `func:get_feature_frames`.
-        self.num_processed_frames: int = 0
-
-        self._done: bool = False
-
-        # The transcript of current utterance.
-        self.ground_truth: str = ""
-
-        # The decoding result (partial or final) of current utterance.
-        self.hyp: List = []
-
-        # how many frames have been processed, after subsampling (i.e. a
-        # cumulative sum of the second return value of
-        # encoder.streaming_forward
-        self.done_frames: int = 0
-
-        # add 2 here since we will drop the first and last frames after
-        # the convolutional subsampling module
-        self.pad_length = 2 * params.subsampling_factor + 3
-
-        if params.decoding_method == "greedy_search":
-            self.hyp = [params.blank_id] * params.context_size
-        elif params.decoding_method == "fast_beam_search":
-            # The rnnt_decoding_stream for fast_beam_search.
-            self.rnnt_decoding_stream: k2.RnntDecodingStream = (
-                k2.RnntDecodingStream(decoding_graph)
-            )
-        else:
-            assert (
-                False
-            ), f"Decoding method :{params.decoding_method} do not support."
-
-    @property
-    def done(self) -> bool:
-        """Return True if all the features are processed."""
-        return self._done
-
-    def set_features(
-        self,
-        features: torch.Tensor,
-    ) -> None:
-        """Set features tensor of current utterance."""
-        assert features.dim() == 2, features.dim()
-        self.features = torch.nn.functional.pad(
-            features,
-            (0, 0, 0, self.pad_length),
-            mode="constant",
-            value=self.LOG_EPS,
-        )
-        self.num_frames = self.features.size(0)
-
-    def get_feature_frames(self, chunk_size: int) -> Tuple[torch.Tensor, int]:
-        """Consume chunk_size frames of features"""
-        chunk_length = chunk_size + self.pad_length
-
-        ret_length = min(
-            self.num_frames - self.num_processed_frames, chunk_length
-        )
-
-        ret_features = self.features[
-            self.num_processed_frames : self.num_processed_frames  # noqa
-            + ret_length
-        ]
-
-        self.num_processed_frames += chunk_size
-        if self.num_processed_frames >= self.num_frames:
-            self._done = True
-
-        return ret_features, ret_length
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/export.py b/egs/librispeech/ASR/lstm_transducer_stateless/export.py
index 9fa841bcc8..b4f0bba32d 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/export.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/export.py
@@ -27,7 +27,7 @@
 ./lstm_transducer_stateless/export.py \
   --exp-dir ./lstm_transducer_stateless/exp \
   --bpe-model data/lang_bpe_500/bpe.model \
-  --epoch 20 \
+  --epoch 35 \
   --avg 10 \
   --jit-trace 1
 
@@ -39,7 +39,7 @@
 ./lstm_transducer_stateless/export.py \
   --exp-dir ./lstm_transducer_stateless/exp \
   --bpe-model data/lang_bpe_500/bpe.model \
-  --epoch 20 \
+  --epoch 35 \
   --avg 10
 
 It will generate a file `pretrained.pt` in the given `exp_dir`. You can later
@@ -65,14 +65,14 @@
 Note: If you don't want to train a model from scratch, we have
 provided one for you. You can get it at
 
-https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13
+https://huggingface.co/Zengwei/icefall-asr-librispeech-lstm-transducer-stateless-2022-08-18
 
 with the following commands:
 
     sudo apt-get install git-lfs
     git lfs install
-    git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13
-    # You will find the pre-trained model in icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp
+    git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-lstm-transducer-stateless-2022-08-18
+    # You will find the pre-trained model in icefall-asr-librispeech-lstm-transducer-stateless-2022-08-18/exp
 """
 
 import argparse

From 2ee5122f119515cc8f57529925745c5f47e3766f Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Thu, 18 Aug 2022 17:43:11 +0800
Subject: [PATCH 35/40] update README.md

---
 egs/librispeech/ASR/README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/egs/librispeech/ASR/README.md b/egs/librispeech/ASR/README.md
index cbdee53e6d..f590bc8374 100644
--- a/egs/librispeech/ASR/README.md
+++ b/egs/librispeech/ASR/README.md
@@ -25,6 +25,7 @@ The following table lists the differences among them.
 | `pruned_stateless_emformer_rnnt2`     | Emformer(from torchaudio) | Embedding + Conv1d | Using Emformer from torchaudio for streaming ASR|
 | `conv_emformer_transducer_stateless`  | ConvEmformer | Embedding + Conv1d | Using ConvEmformer for streaming ASR + mechanisms in reworked model |
 | `conv_emformer_transducer_stateless2` | ConvEmformer | Embedding + Conv1d | Using ConvEmformer with simplified memory for streaming ASR + mechanisms in reworked model |
+| `lstm_transducer_stateless`           | LSTM | Embedding + Conv1d | Using LSTM with mechanisms in reworked model |
 
 The decoder in `transducer_stateless` is modified from the paper
 [Rnn-Transducer with Stateless Prediction Network](https://ieeexplore.ieee.org/document/9054419/).

From 6191c3e4a6242fd0ab2bdaaebab6f715dcc84c12 Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Fri, 19 Aug 2022 10:45:14 +0800
Subject: [PATCH 36/40] minor change, filter infinite loss

---
 .../ASR/lstm_transducer_stateless/decode.py   | 11 ++-
 .../ASR/lstm_transducer_stateless/export.py   |  4 +-
 .../ASR/lstm_transducer_stateless/lstm.py     | 10 +-
 .../ASR/lstm_transducer_stateless/model.py    | 14 ++-
 .../ASR/lstm_transducer_stateless/stream.py   |  7 +-
 .../streaming_decode.py                       | 12 +--
 .../ASR/lstm_transducer_stateless/train.py    | 92 ++++++++++++++-----
 7 files changed, 105 insertions(+), 45 deletions(-)

diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/decode.py b/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
index 08b614f043..bfc158e0a6 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
@@ -339,7 +339,7 @@ def decode_one_batch(
       word_table:
         The word symbol table.
       decoding_graph:
-        The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
+        The decoding graph. Can be either a `k2.trivial_graph` or LG, Used
         only when --decoding_method is fast_beam_search, fast_beam_search_nbest,
         fast_beam_search_nbest_oracle, and fast_beam_search_nbest_LG.
     Returns:
@@ -356,14 +356,15 @@ def decode_one_batch(
     supervisions = batch["supervisions"]
     feature_lens = supervisions["num_frames"].to(device)
 
-    # tail padding
+    # tail padding here to alleviate the tail deletion problem
+    num_tail_padded_frames = 35
     feature = torch.nn.functional.pad(
         feature,
-        (0, 0, 0, 35),
+        (0, 0, 0, num_tail_padded_frames),
         mode="constant",
         value=LOG_EPS,
     )
-    feature_lens += 35
+    feature_lens += num_tail_padded_frames
 
     encoder_out, encoder_out_lens, _ = model.encoder(
         x=feature, x_lens=feature_lens
@@ -510,7 +511,7 @@ def decode_dataset(
       word_table:
         The word symbol table.
       decoding_graph:
-        The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
+        The decoding graph. Can be either a `k2.trivial_graph` or LG, Used
         only when --decoding_method is fast_beam_search, fast_beam_search_nbest,
         fast_beam_search_nbest_oracle, and fast_beam_search_nbest_LG.
     Returns:
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/export.py b/egs/librispeech/ASR/lstm_transducer_stateless/export.py
index b4f0bba32d..13dac60092 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/export.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/export.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 #
-# Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang)
+# Copyright 2021-2022 Xiaomi Corporation (Author: Fangjun Kuang, Zengwei Yao)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -31,7 +31,7 @@
   --avg 10 \
   --jit-trace 1
 
-It will generates 3 files: `encoder_jit_trace.pt`,
+It will generate 3 files: `encoder_jit_trace.pt`,
 `decoder_jit_trace.pt`, and `joiner_jit_trace.pt`.
 
 (2) Export `model.state_dict()`
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py b/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
index 53bd33ba07..6ce966b137 100644
--- a/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py
@@ -69,7 +69,7 @@ def stack_states(
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """
     Stack list of lstm states corresponding to separate utterances into a single
-    lstm state so that it can be used as an input for lsit when those utterances
+    lstm state so that it can be used as an input for lstm when those utterances
     are formed into a batch.
 
     Args:
@@ -113,7 +113,7 @@ class RNN(EncoderInterface):
       layer_dropout (float):
         Dropout value for model-level warmup (default=0.075).
       aux_layer_period (int):
-        Peroid of auxiliary layers used for randomly combined during training.
+        Period of auxiliary layers used for random combiner during training.
         If set to 0, will not use the random combiner (Default).
         You can set a positive integer to use the random combiner, e.g., 3.
     """
@@ -201,7 +201,7 @@ def forward(
               sequence lengths.
             - lengths: a tensor of shape (batch_size,) containing the number of
               frames in `embeddings` before padding.
-            - updated states, whose shape is same as the input states.
+            - updated states, whose shape is the same as the input states.
         """
         x = self.encoder_embed(x)
         x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
@@ -215,7 +215,7 @@ def forward(
 
         if states is None:
             x = self.encoder(x, warmup=warmup)[0]
-            # torch.jit.trace requires returned types be the same as annotated
+            # torch.jit.trace requires returned types to be the same as annotated  # noqa
             new_states = (torch.empty(0), torch.empty(0))
         else:
             assert not self.training
@@ -284,7 +284,7 @@ def __init__(
         self.d_model = d_model
         self.rnn_hidden_size = rnn_hidden_size
 
-        assert rnn_hidden_size >= d_model
+        assert rnn_hidden_size >= d_model, (rnn_hidden_size, d_model)
         self.lstm = ScaledLSTM(
             input_size=d_model,
             hidden_size=rnn_hidden_size,
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/model.py b/egs/librispeech/ASR/lstm_transducer_stateless/model.py
index 37592ce083..efbc88a55d 100644
--- a/egs/librispeech/ASR/lstm_transducer_stateless/model.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/model.py
@@ -15,6 +15,8 @@
 # limitations under the License.
 
 
+from typing import Tuple
+
 import k2
 import torch
 import torch.nn as nn
@@ -78,7 +80,8 @@ def forward(
         am_scale: float = 0.0,
         lm_scale: float = 0.0,
         warmup: float = 1.0,
-    ) -> torch.Tensor:
+        reduction: str = "sum",
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         Args:
           x:
@@ -101,6 +104,10 @@ def forward(
           warmup:
             A value warmup >= 0 that determines which modules are active, values
             warmup > 1 "are fully warmed up" and all modules will be active.
+          reduction:
+            "sum" to sum the losses over all utterances in the batch.
+            "none" to return the loss in a 1-D tensor for each utterance
+            in the batch.
         Returns:
           Return the transducer loss.
 
@@ -110,6 +117,7 @@ def forward(
               lm_scale * lm_probs + am_scale * am_probs +
               (1-lm_scale-am_scale) * combined_probs
         """
+        assert reduction in ("sum", "none"), reduction
         assert x.ndim == 3, x.shape
         assert x_lens.ndim == 1, x_lens.shape
         assert y.num_axes == 2, y.num_axes
@@ -155,7 +163,7 @@ def forward(
                 lm_only_scale=lm_scale,
                 am_only_scale=am_scale,
                 boundary=boundary,
-                reduction="sum",
+                reduction=reduction,
                 return_grad=True,
             )
 
@@ -188,7 +196,7 @@ def forward(
                 ranges=ranges,
                 termination_symbol=blank_id,
                 boundary=boundary,
-                reduction="sum",
+                reduction=reduction,
             )
 
         return (simple_loss, pruned_loss)
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/stream.py b/egs/librispeech/ASR/lstm_transducer_stateless/stream.py
index 552d4490a7..d50b618468 100644
--- a/egs/librispeech/ASR/lstm_transducer_stateless/stream.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/stream.py
@@ -90,11 +90,12 @@ def __init__(
 
     def set_feature(self, feature: torch.Tensor) -> None:
         assert feature.dim() == 2, feature.dim()
-        # tail padding
-        self.num_frames = feature.size(0) + 35
+        # tail padding here to alleviate the tail deletion problem
+        num_tail_padded_frames = 35
+        self.num_frames = feature.size(0) + num_tail_padded_frames
         self.feature = torch.nn.functional.pad(
             feature,
-            (0, 0, 0, self.pad_length + 35),
+            (0, 0, 0, self.pad_length + num_tail_padded_frames),
             mode="constant",
             value=self.LOG_EPS,
         )
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py b/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
index 17819d8ffb..f4d4a3051d 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
@@ -172,7 +172,7 @@ def get_parser():
     parser.add_argument(
         "--beam",
         type=float,
-        default=4,
+        default=20.0,
         help="""A floating point value to calculate the cutoff score during beam
         search (i.e., `cutoff = max-score - beam`), which is the same as the
         `beam` in Kaldi.
@@ -182,7 +182,7 @@ def get_parser():
     parser.add_argument(
         "--max-contexts",
         type=int,
-        default=4,
+        default=8,
         help="""Used only when --decoding-method is
         fast_beam_search""",
     )
@@ -190,7 +190,7 @@ def get_parser():
     parser.add_argument(
         "--max-states",
         type=int,
-        default=8,
+        default=64,
         help="""Used only when --decoding-method is
         fast_beam_search""",
     )
@@ -221,7 +221,7 @@ def get_parser():
         "--num-decode-streams",
         type=int,
         default=2000,
-        help="The number of streams that can be decoded parallel",
+        help="The number of streams that can be decoded in parallel",
     )
 
     add_model_arguments(parser)
@@ -511,7 +511,7 @@ def decode_one_chunk(
       params:
         It is returned by :func:`get_params`.
       decoding_graph:
-        The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
+        The decoding graph. Can be either a `k2.trivial_graph` or LG, Used
         only when --decoding_method is fast_beam_search.
 
     Returns:
@@ -647,7 +647,7 @@ def decode_dataset(
       sp:
         The BPE model.
       decoding_graph:
-        The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
+        The decoding graph. Can be either a `k2.trivial_graph` or LG, Used
         only when --decoding_method is fast_beam_search.
 
     Returns:
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/train.py b/egs/librispeech/ASR/lstm_transducer_stateless/train.py
index dabd54197e..3bc8d78431 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/train.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/train.py
@@ -79,7 +79,13 @@
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
-from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
+from icefall.utils import (
+    AttributeDict,
+    MetricsTracker,
+    display_and_save_batch,
+    setup_logger,
+    str2bool,
+)
 
 LRSchedulerType = Union[
     torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler
@@ -387,7 +393,6 @@ def get_params() -> AttributeDict:
 
 
 def get_encoder_model(params: AttributeDict) -> nn.Module:
-    # TODO: We can add an option to switch between Conformer and Transformer
     encoder = RNN(
         num_features=params.feature_dim,
         subsampling_factor=params.subsampling_factor,
@@ -608,7 +613,35 @@ def compute_loss(
             am_scale=params.am_scale,
             lm_scale=params.lm_scale,
             warmup=warmup,
+            reduction="none",
         )
+        simple_loss[0] = float("inf")
+        pruned_loss[1] = float("nan")
+        simple_loss_is_finite = torch.isfinite(simple_loss)
+        pruned_loss_is_finite = torch.isfinite(pruned_loss)
+        is_finite = simple_loss_is_finite & pruned_loss_is_finite
+        if not torch.all(is_finite):
+            logging.info(
+                "Not all losses are finite!\n"
+                f"simple_loss: {simple_loss}\n"
+                f"pruned_loss: {pruned_loss}"
+            )
+            display_and_save_batch(batch, params=params, sp=sp)
+            simple_loss = simple_loss[simple_loss_is_finite]
+            pruned_loss = pruned_loss[pruned_loss_is_finite]
+
+            # If either all simple_loss or pruned_loss is inf or nan,
+            # we stop the training process by raising an exception
+            if torch.all(~simple_loss_is_finite) or torch.all(
+                ~pruned_loss_is_finite
+            ):
+                raise ValueError(
+                    "There are too many utterances in this batch "
+                    "leading to inf or nan losses."
+                )
+
+        simple_loss = simple_loss.sum()
+        pruned_loss = pruned_loss.sum()
         # after the main warmup step, we keep pruned_loss_scale small
         # for the same amount of time (model_warm_step), to avoid
         # overwhelming the simple_loss and causing it to diverge,
@@ -628,10 +661,23 @@ def compute_loss(
     info = MetricsTracker()
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
+        # info["frames"] is an approximate number for two reasons:
+        # (1) The actual subsampling factor is ((lens - 1) // 2 - 1) // 2
+        # (2) If some utterances in the batch lead to inf/nan loss, they
+        #     are filtered out.
         info["frames"] = (
             (feature_lens // params.subsampling_factor).sum().item()
         )
 
+    # `utt_duration` and `utt_pad_proportion` would be normalized by `utterances`  # noqa
+    info["utterances"] = feature.size(0)
+    # averaged input duration in frames over utterances
+    info["utt_duration"] = feature_lens.sum().item()
+    # averaged padding proportion over utterances
+    info["utt_pad_proportion"] = (
+        ((feature.size(1) - feature_lens) / feature.size(1)).sum().item()
+    )
+
     # Note: We use reduction=sum while computing the loss.
     info["loss"] = loss.detach().cpu().item()
     info["simple_loss"] = simple_loss.detach().cpu().item()
@@ -733,25 +779,29 @@ def train_one_epoch(
         params.batch_idx_train += 1
         batch_size = len(batch["supervisions"]["text"])
 
-        with torch.cuda.amp.autocast(enabled=params.use_fp16):
-            loss, loss_info = compute_loss(
-                params=params,
-                model=model,
-                sp=sp,
-                batch=batch,
-                is_training=True,
-                warmup=(params.batch_idx_train / params.model_warm_step),
-            )
-        # summary stats
-        tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info
-
-        # NOTE: We use reduction==sum and loss is computed over utterances
-        # in the batch and there is no normalization to it so far.
-        scaler.scale(loss).backward()
-        scheduler.step_batch(params.batch_idx_train)
-        scaler.step(optimizer)
-        scaler.update()
-        optimizer.zero_grad()
+        try:
+            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+                loss, loss_info = compute_loss(
+                    params=params,
+                    model=model,
+                    sp=sp,
+                    batch=batch,
+                    is_training=True,
+                    warmup=(params.batch_idx_train / params.model_warm_step),
+                )
+            # summary stats
+            tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info
+
+            # NOTE: We use reduction==sum and loss is computed over utterances
+            # in the batch and there is no normalization to it so far.
+            scaler.scale(loss).backward()
+            scheduler.step_batch(params.batch_idx_train)
+            scaler.step(optimizer)
+            scaler.update()
+            optimizer.zero_grad()
+        except:  # noqa
+            display_and_save_batch(batch, params=params, sp=sp)
+            raise
 
         if params.print_diagnostics and batch_idx == 30:
             return

From 3b6310c5dac4ae9048a038d93b363a90d3e1e7ac Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Fri, 19 Aug 2022 10:50:56 +0800
Subject: [PATCH 37/40] remove the condition of raise error

---
 .../ASR/pruned_transducer_stateless/train.py   | 18 ++++++++----------
 .../ASR/pruned_transducer_stateless2/train.py  | 18 ++++++++----------
 .../ASR/pruned_transducer_stateless3/train.py  | 18 ++++++++----------
 .../ASR/pruned_transducer_stateless4/train.py  | 18 ++++++++----------
 .../ASR/pruned_transducer_stateless5/train.py  | 15 +++++++--------
 .../ASR/pruned_transducer_stateless6/train.py  | 15 +++++++--------
 6 files changed, 46 insertions(+), 56 deletions(-)

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless/train.py b/egs/librispeech/ASR/pruned_transducer_stateless/train.py
index c2e0f1f985..193c5050c0 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless/train.py
@@ -560,17 +560,15 @@ def compute_loss(
             simple_loss = simple_loss[simple_loss_is_finite]
             pruned_loss = pruned_loss[pruned_loss_is_finite]
 
-            # If the batch contains more than 10 utterances AND
-            # if either all simple_loss or pruned_loss is inf or nan,
+            # If either all simple_loss or pruned_loss is inf or nan,
             # we stop the training process by raising an exception
-            if feature.size(0) >= 10:
-                if torch.all(~simple_loss_is_finite) or torch.all(
-                    ~pruned_loss_is_finite
-                ):
-                    raise ValueError(
-                        "There are too many utterances in this batch "
-                        "leading to inf or nan losses."
-                    )
+            if torch.all(~simple_loss_is_finite) or torch.all(
+                ~pruned_loss_is_finite
+            ):
+                raise ValueError(
+                    "There are too many utterances in this batch "
+                    "leading to inf or nan losses."
+                )
 
         simple_loss = simple_loss.sum()
         pruned_loss = pruned_loss.sum()
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/train.py b/egs/librispeech/ASR/pruned_transducer_stateless2/train.py
index c801bd2bd6..5c2f675345 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/train.py
@@ -621,17 +621,15 @@ def compute_loss(
             simple_loss = simple_loss[simple_loss_is_finite]
             pruned_loss = pruned_loss[pruned_loss_is_finite]
 
-            # If the batch contains more than 10 utterances AND
-            # if either all simple_loss or pruned_loss is inf or nan,
+            # If either all simple_loss or pruned_loss is inf or nan,
             # we stop the training process by raising an exception
-            if feature.size(0) >= 10:
-                if torch.all(~simple_loss_is_finite) or torch.all(
-                    ~pruned_loss_is_finite
-                ):
-                    raise ValueError(
-                        "There are too many utterances in this batch "
-                        "leading to inf or nan losses."
-                    )
+            if torch.all(~simple_loss_is_finite) or torch.all(
+                ~pruned_loss_is_finite
+            ):
+                raise ValueError(
+                    "There are too many utterances in this batch "
+                    "leading to inf or nan losses."
+                )
 
         simple_loss = simple_loss.sum()
         pruned_loss = pruned_loss.sum()
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless3/train.py b/egs/librispeech/ASR/pruned_transducer_stateless3/train.py
index be12e69ceb..c2ef28c7b7 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless3/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless3/train.py
@@ -658,17 +658,15 @@ def compute_loss(
             simple_loss = simple_loss[simple_loss_is_finite]
             pruned_loss = pruned_loss[pruned_loss_is_finite]
 
-            # If the batch contains more than 10 utterances AND
-            # if either all simple_loss or pruned_loss is inf or nan,
+            # If either all simple_loss or pruned_loss is inf or nan,
             # we stop the training process by raising an exception
-            if feature.size(0) >= 10:
-                if torch.all(~simple_loss_is_finite) or torch.all(
-                    ~pruned_loss_is_finite
-                ):
-                    raise ValueError(
-                        "There are too many utterances in this batch "
-                        "leading to inf or nan losses."
-                    )
+            if torch.all(~simple_loss_is_finite) or torch.all(
+                ~pruned_loss_is_finite
+            ):
+                raise ValueError(
+                    "There are too many utterances in this batch "
+                    "leading to inf or nan losses."
+                )
 
         simple_loss = simple_loss.sum()
         pruned_loss = pruned_loss.sum()
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless4/train.py b/egs/librispeech/ASR/pruned_transducer_stateless4/train.py
index 2ba28acd4f..13a5b1a515 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless4/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless4/train.py
@@ -651,17 +651,15 @@ def compute_loss(
             simple_loss = simple_loss[simple_loss_is_finite]
             pruned_loss = pruned_loss[pruned_loss_is_finite]
 
-            # If the batch contains more than 10 utterances AND
-            # if either all simple_loss or pruned_loss is inf or nan,
+            # If either all simple_loss or pruned_loss is inf or nan,
             # we stop the training process by raising an exception
-            if feature.size(0) >= 10:
-                if torch.all(~simple_loss_is_finite) or torch.all(
-                    ~pruned_loss_is_finite
-                ):
-                    raise ValueError(
-                        "There are too many utterances in this batch "
-                        "leading to inf or nan losses."
-                    )
+            if torch.all(~simple_loss_is_finite) or torch.all(
+                ~pruned_loss_is_finite
+            ):
+                raise ValueError(
+                    "There are too many utterances in this batch "
+                    "leading to inf or nan losses."
+                )
 
         simple_loss = simple_loss.sum()
         pruned_loss = pruned_loss.sum()
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless5/train.py b/egs/librispeech/ASR/pruned_transducer_stateless5/train.py
index cee7d2bffe..1fa6682935 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless5/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless5/train.py
@@ -679,14 +679,13 @@ def compute_loss(
             # If the batch contains more than 10 utterances AND
             # if either all simple_loss or pruned_loss is inf or nan,
             # we stop the training process by raising an exception
-            if feature.size(0) >= 10:
-                if torch.all(~simple_loss_is_finite) or torch.all(
-                    ~pruned_loss_is_finite
-                ):
-                    raise ValueError(
-                        "There are too many utterances in this batch "
-                        "leading to inf or nan losses."
-                    )
+            if torch.all(~simple_loss_is_finite) or torch.all(
+                ~pruned_loss_is_finite
+            ):
+                raise ValueError(
+                    "There are too many utterances in this batch "
+                    "leading to inf or nan losses."
+                )
 
         simple_loss = simple_loss.sum()
         pruned_loss = pruned_loss.sum()
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless6/train.py b/egs/librispeech/ASR/pruned_transducer_stateless6/train.py
index 294fd4c527..25d1c4ca66 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless6/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless6/train.py
@@ -655,14 +655,13 @@ def compute_loss(
             # If the batch contains more than 10 utterances AND
             # if either all simple_loss or pruned_loss is inf or nan,
             # we stop the training process by raising an exception
-            if feature.size(0) >= 10:
-                if torch.all(~simple_loss_is_finite) or torch.all(
-                    ~pruned_loss_is_finite
-                ):
-                    raise ValueError(
-                        "There are too many utterances in this batch "
-                        "leading to inf or nan losses."
-                    )
+            if torch.all(~simple_loss_is_finite) or torch.all(
+                ~pruned_loss_is_finite
+            ):
+                raise ValueError(
+                    "There are too many utterances in this batch "
+                    "leading to inf or nan losses."
+                )
 
         simple_loss = simple_loss.sum()
         pruned_loss = pruned_loss.sum()

From 5b6212524916abbddc3ef094e8bc43b529fbfa11 Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Fri, 19 Aug 2022 11:03:19 +0800
Subject: [PATCH 38/40] modify type hint for the return value in model.py

---
 egs/librispeech/ASR/pruned_transducer_stateless/model.py  | 4 +++-
 egs/librispeech/ASR/pruned_transducer_stateless2/model.py | 4 +++-
 egs/librispeech/ASR/pruned_transducer_stateless3/model.py | 4 ++--
 egs/librispeech/ASR/pruned_transducer_stateless6/model.py | 7 ++++---
 4 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless/model.py b/egs/librispeech/ASR/pruned_transducer_stateless/model.py
index e2c9eb789b..73b651b3fa 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless/model.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless/model.py
@@ -15,6 +15,8 @@
 # limitations under the License.
 
 
+from typing import Tuple
+
 import k2
 import torch
 import torch.nn as nn
@@ -67,7 +69,7 @@ def forward(
         am_scale: float = 0.0,
         lm_scale: float = 0.0,
         reduction: str = "sum",
-    ) -> torch.Tensor:
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         Args:
           x:
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/model.py b/egs/librispeech/ASR/pruned_transducer_stateless2/model.py
index 452102d21f..ba7616c610 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/model.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/model.py
@@ -15,6 +15,8 @@
 # limitations under the License.
 
 
+from typing import Tuple
+
 import k2
 import torch
 import torch.nn as nn
@@ -79,7 +81,7 @@ def forward(
         lm_scale: float = 0.0,
         warmup: float = 1.0,
         reduction: str = "sum",
-    ) -> torch.Tensor:
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         Args:
           x:
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless3/model.py b/egs/librispeech/ASR/pruned_transducer_stateless3/model.py
index ece340534f..0d5f7cc6d9 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless3/model.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless3/model.py
@@ -15,7 +15,7 @@
 # limitations under the License.
 
 
-from typing import Optional
+from typing import Optional, Tuple
 
 import k2
 import torch
@@ -106,7 +106,7 @@ def forward(
         lm_scale: float = 0.0,
         warmup: float = 1.0,
         reduction: str = "sum",
-    ) -> torch.Tensor:
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         Args:
           x:
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless6/model.py b/egs/librispeech/ASR/pruned_transducer_stateless6/model.py
index 9de0769d9a..06c4b5204c 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless6/model.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless6/model.py
@@ -15,16 +15,17 @@
 # limitations under the License.
 
 
+from typing import Tuple
+
 import k2
 import torch
 import torch.nn as nn
 from encoder_interface import EncoderInterface
+from multi_quantization.prediction import JointCodebookLoss
 from scaling import ScaledLinear
 
 from icefall.utils import add_sos
 
-from multi_quantization.prediction import JointCodebookLoss
-
 
 class Transducer(nn.Module):
     """It implements https://arxiv.org/pdf/1211.3711.pdf
@@ -91,7 +92,7 @@ def forward(
         warmup: float = 1.0,
         reduction: str = "sum",
         codebook_indexes: torch.Tensor = None,
-    ) -> torch.Tensor:
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         """
         Args:
           x:

From e00aa2955f1ff9467c8663e99e26d60ca544d7bb Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Fri, 19 Aug 2022 11:32:10 +0800
Subject: [PATCH 39/40] minor change

---
 .../ASR/lstm_transducer_stateless/stream.py         | 13 ++++---------
 .../lstm_transducer_stateless/streaming_decode.py   |  4 ++--
 2 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/stream.py b/egs/librispeech/ASR/lstm_transducer_stateless/stream.py
index d50b618468..97d890c820 100644
--- a/egs/librispeech/ASR/lstm_transducer_stateless/stream.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/stream.py
@@ -38,11 +38,15 @@ def __init__(
         Args:
           params:
             It's the return value of :func:`get_params`.
+          cut_id:
+            The cut id of the current stream.
           decoding_graph:
             The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
             only when --decoding_method is fast_beam_search.
           device:
             The device to run this stream.
+          LOG_EPS:
+            A float value used for padding.
         """
         self.LOG_EPS = LOG_EPS
         self.cut_id = cut_id
@@ -100,15 +104,6 @@ def set_feature(self, feature: torch.Tensor) -> None:
             value=self.LOG_EPS,
         )
 
-    def set_ground_truth(self, ground_truth: str) -> None:
-        self.ground_truth = ground_truth
-
-    def set_states(
-        self, states: Tuple[List[List[torch.Tensor]], List[torch.Tensor]]
-    ) -> None:
-        """Set states."""
-        self.states = states
-
     def get_feature_chunk(self) -> torch.Tensor:
         """Get a chunk of feature frames.
 
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py b/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
index f4d4a3051d..d6376bdc06 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/streaming_decode.py
@@ -675,7 +675,7 @@ def decode_dataset(
             LOG_EPS=LOG_EPSILON,
         )
 
-        stream.set_states(model.encoder.get_init_states(device=device))
+        stream.states = model.encoder.get_init_states(device=device)
 
         audio: np.ndarray = cut.load_audio()
         # audio.shape: (1, num_samples)
@@ -688,7 +688,7 @@ def decode_dataset(
         samples = torch.from_numpy(audio).squeeze(0)
         feature = fbank(samples)
         stream.set_feature(feature)
-        stream.set_ground_truth(cut.supervisions[0].text)
+        stream.ground_truth = cut.supervisions[0].text
 
         streams.append(stream)
 

From 9b96a14566528d03791f2ac29a734c4de6aaa225 Mon Sep 17 00:00:00 2001
From: yaozengwei <yaozengwei@outlook.com>
Date: Fri, 19 Aug 2022 14:15:20 +0800
Subject: [PATCH 40/40] modify RESULTS.md

---
 egs/librispeech/ASR/RESULTS.md | 114 ++++++++++-----------------------
 1 file changed, 34 insertions(+), 80 deletions(-)

diff --git a/egs/librispeech/ASR/RESULTS.md b/egs/librispeech/ASR/RESULTS.md
index dd41ee2912..1c6a350f05 100644
--- a/egs/librispeech/ASR/RESULTS.md
+++ b/egs/librispeech/ASR/RESULTS.md
@@ -23,6 +23,8 @@ The WERs are:
 | modified beam search                | 3.64       | 9.55       | --epoch 35 --avg 15  | simulated streaming  |
 | modified beam search                | 3.65       | 9.51       | --epoch 35 --avg 15  | streaming            |
 
+Note: `simulated streaming` indicates feeding the full utterance during decoding, while `streaming` indicates feeding a certain number of frames at a time.
+
 The training command is:
 
 ```bash
@@ -41,90 +43,42 @@ The training command is:
 The tensorboard log can be found at
 <https://tensorboard.dev/experiment/FWrM20mjTeWo6dTpFYOsYQ/>
 
-The simulated streaming decoding command using greedy search is:
-```bash
-./lstm_transducer_stateless/decode.py \
-  --epoch 35 \
-  --avg 15 \
-  --exp-dir lstm_transducer_stateless/exp \
-  --max-duration 600 \
-  --num-encoder-layers 12 \
-  --rnn-hidden-size 1024
-  --decoding-method greedy_search \
-  --use-averaged-model True
-```
-
-The simulated streaming decoding command using fast beam search is:
-```bash
-./lstm_transducer_stateless/decode.py \
-  --epoch 35 \
-  --avg 15 \
-  --exp-dir lstm_transducer_stateless/exp \
-  --max-duration 600 \
-  --num-encoder-layers 12 \
-  --rnn-hidden-size 1024
-  --decoding-method fast_beam_search \
-  --use-averaged-model True \
-  --beam 4 \
-  --max-contexts 4 \
-  --max-states 8
-```
-
-The simulated streaming decoding command using modified beam search is:
+The simulated streaming decoding command using greedy search, fast beam search, and modified beam search is:
 ```bash
-./lstm_transducer_stateless/decode.py \
-  --epoch 35 \
-  --avg 15 \
-  --exp-dir lstm_transducer_stateless/exp \
-  --max-duration 600 \
-  --num-encoder-layers 12 \
-  --rnn-hidden-size 1024
-  --decoding-method modified_beam_search \
-  --use-averaged-model True \
-  --beam-size 4
-```
-
-The streaming decoding command using greedy search is:
-```bash
-./lstm_transducer_stateless/streaming_decode.py \
-  --epoch 35 \
-  --avg 15 \
-  --exp-dir lstm_transducer_stateless/exp \
-  --max-duration 600 \
-  --num-encoder-layers 12 \
-  --rnn-hidden-size 1024
-  --decoding-method greedy_search \
-  --use-averaged-model True
-```
-
-The streaming decoding command using fast beam search is:
-```bash
-./lstm_transducer_stateless/streaming_decode.py \
-  --epoch 35 \
-  --avg 15 \
-  --exp-dir lstm_transducer_stateless/exp \
-  --max-duration 600 \
-  --num-encoder-layers 12 \
-  --rnn-hidden-size 1024
-  --decoding-method fast_beam_search \
-  --use-averaged-model True \
-  --beam 4 \
-  --max-contexts 4 \
-  --max-states 8
+for decoding_method in greedy_search fast_beam_search modified_beam_search; do
+  ./lstm_transducer_stateless/decode.py \
+    --epoch 35 \
+    --avg 15 \
+    --exp-dir lstm_transducer_stateless/exp \
+    --max-duration 600 \
+    --num-encoder-layers 12 \
+    --rnn-hidden-size 1024 \
+    --decoding-method $decoding_method \
+    --use-averaged-model True \
+    --beam 4 \
+    --max-contexts 4 \
+    --max-states 8 \
+    --beam-size 4
+done
 ```
 
-The streaming decoding command using modified beam search is:
+The streaming decoding command using greedy search, fast beam search, and modified beam search is:
 ```bash
-./lstm_transducer_stateless/streaming_decode.py \
-  --epoch 35 \
-  --avg 15 \
-  --exp-dir lstm_transducer_stateless/exp \
-  --max-duration 600 \
-  --num-encoder-layers 12 \
-  --rnn-hidden-size 1024
-  --decoding-method modified_beam_search \
-  --use-averaged-model True \
-  --beam-size 4
+for decoding_method in greedy_search fast_beam_search modified_beam_search; do
+  ./lstm_transducer_stateless/streaming_decode.py \
+    --epoch 35 \
+    --avg 15 \
+    --exp-dir lstm_transducer_stateless/exp \
+    --max-duration 600 \
+    --num-encoder-layers 12 \
+    --rnn-hidden-size 1024 \
+    --decoding-method $decoding_method \
+    --use-averaged-model True \
+    --beam 4 \
+    --max-contexts 4 \
+    --max-states 8 \
+    --beam-size 4
+done
 ```
 
 Pretrained models, training logs, decoding logs, and decoding results