From 2ba3457bf6d26346d97e415f90a50fad56fa8fbf Mon Sep 17 00:00:00 2001 From: LiChenda Date: Thu, 10 Feb 2022 16:48:46 +0800 Subject: [PATCH 01/34] add skim model --- espnet2/enh/layers/dprnn.py | 1 + espnet2/enh/layers/skim.py | 132 ++++++++++++++++++++++++++++++++++++ espnet2/enh/layers/tcn.py | 44 +++++++++--- 3 files changed, 169 insertions(+), 8 deletions(-) create mode 100644 espnet2/enh/layers/skim.py diff --git a/espnet2/enh/layers/dprnn.py b/espnet2/enh/layers/dprnn.py index 827c754ac86..aae6040f74f 100644 --- a/espnet2/enh/layers/dprnn.py +++ b/espnet2/enh/layers/dprnn.py @@ -4,6 +4,7 @@ # # The code is based on: # https://github.com/yluo42/TAC/blob/master/utility/models.py +# Licensed under CC BY-NC-SA 3.0 US. # diff --git a/espnet2/enh/layers/skim.py b/espnet2/enh/layers/skim.py new file mode 100644 index 00000000000..0e231a80c7e --- /dev/null +++ b/espnet2/enh/layers/skim.py @@ -0,0 +1,132 @@ +# An implementation of SkiM model described in +# "SkiM: Skipping Memory LSTM for Low-Latency Real-Time Continuous Speech Separation" +# (https://arxiv.org/abs/2201.10800) +# + + +from turtle import forward, shape +import torch +import torch.nn as nn + +from espnet2.enh.layers.dprnn import SingleRNN, split_feature, merge_feature +from espnet2.enh.layers.tcn import chose_norm + + +class MemLSTM(nn.Module): + """ the Mem-LSTM of SkiM + + args: + hidden_size: int, dimension of the hidden state. + dropout: float, dropout ratio. Default is 0. + bidirectional: bool, whether the LSTM layers are bidirectional. Default is False. + mem_type: 'hc', 'h', 'c' or 'id'. + It controls whether the hidden (or cell) state of SegLSTM will be processed by MemLSTM. + In 'id' mode, both the hidden and cell states will be identically returned. + norm_type: gLN, cLN. cLN is for causal implementation. 
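+
+    Shape note: forward(hc, S) takes the SegLSTM states h and c, each of
+    shape (d, B*S, H), reshapes them to (B, S, d*H) to model the dependency
+    across the S segments, and returns a state tuple with the original
+    (d, B*S, H) shapes.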
+ """ + + def __init__(self,hidden_size, dropout=0.0, bidirectional=False, mem_type='hc', norm_type='gLN'): + super().__init__() + self.hidden_size = hidden_size + self.bidirectional = bidirectional + self.input_size = (int(bidirectional) + 1) * hidden_size + self.mem_type = mem_type + + assert mem_type in ["hc", "h", 'c', 'id'], f"only support 'hc', 'h', 'c' and 'id', current type: {mem_type}" + + if mem_type in ["hc", 'h']: + self.h_net = SingleRNN('LSTM', input_size=self.input_size, hidden_size=self.hidden_size, dropout=dropout, bidirectional=bidirectional) + self.h_norm = chose_norm(norm_type=norm_type, channel_size=self.input_size, shape='BTD') + if mem_type in ["hc", 'c']: + self.c_net = SingleRNN('LSTM', input_size=self.input_size, hidden_size=self.hidden_size, dropout=dropout, bidirectional=bidirectional) + self.c_norm = chose_norm(norm_type=norm_type, channel_size=self.input_size, shape='BTD') + + def extra_repr(self) -> str: + return f"Mem_type: {self.mem_type}, bidirectional: {self.bidirectional}" + + def forward(self, hc, S): + # hc = (h, c), tuple of hidden and cell states from SegLSTM + # shape of h and c: (d, B*S, H) + # S: number of segments in SegLSTM + + if self.mem_type == 'id': + ret_val = hc + else: + h, c = hc + d, BS, H = h.shape + B = BS // S + h = h.transpose(1, 0).contiguous().view(B, S, d * H) # B, S, dH + c = c.transpose(1, 0).contiguous().view(B, S, d * H) # B, S, dH + if self.mem_type == 'hc': + h = h + self.h_norm(self.h_net(h)) + c = c + self.c_norm(self.c_net(c)) + elif self.mem_type == 'h': + h = h + self.h_norm(self.h_net(h)) + c = torch.zeros_like(c) + elif self.mem_type == 'c': + h = torch.zeros_like(h) + c = c + self.c_norm(self.c_net(c)) + + h = h.view(B * S, d, H).transpose(1, 0).contiguous() + c = c.view(B * S, d, H).transpose(1, 0).contiguous() + ret_val = (h, c) + + if not self.bidirectional: + # for causal setup + causal_ret_val = [] + for x in ret_val: + x_ = torch.zeros_like(x) + x_[:, 1:, :] = x[:, :-1,:] + causal_ret_val.append(x_) + ret_val = tuple(causal_ret_val) + + return ret_val + + + + +class SegLSTM(nn.Module): + + """ the Seg-LSTM of SkiM + + args: + input_size: int, dimension of the input feature. The input should have shape + (batch, seq_len, input_size). + hidden_size: int, dimension of the hidden state. + dropout: float, dropout ratio. Default is 0. + bidirectional: bool, whether the LSTM layers are bidirectional. Default is False. 
+ """ + + def __init__(self, input_size, hidden_size, dropout=0.0, bidirectional=False): + super().__init__() + + self.input_size = input_size + self.hidden_size = hidden_size + self.num_direction = int(bidirectional) + 1 + + self.lstm = nn.LSTM(input_size, hidden_size, 1,batch_first=True,bidirectional=bidirectional,) + self.dropout = nn.Dropout(p=dropout) + self.proj = nn.Linear(hidden_size * self.num_direction, input_size) + + def forward(self, input, hc): + # input shape: B, T, H + + B, T, H = input.shape + + if hc == None: + # In fist input SkiM block, h and c are not available + d = self.num_direction + h = torch.zeros(d, B, self.hidden_size).to(input.device) + c = torch.zeros(d, B, self.hidden_size).to(input.device) + else: + h, c = hc + + output, (h, c) = self.lstm(input, (h, c)) + output = self.dropout(output) + output = self.proj( + output.contiguous().view(-1, output.shape[2]) + ).view(output.shape) + + return output, (h, c) + + diff --git a/espnet2/enh/layers/tcn.py b/espnet2/enh/layers/tcn.py index 8fe8cd17036..b41b108731c 100644 --- a/espnet2/enh/layers/tcn.py +++ b/espnet2/enh/layers/tcn.py @@ -4,6 +4,7 @@ # # The code is based on: # https://github.com/kaituoxu/Conv-TasNet/blob/master/src/conv_tasnet.py +# Licensed under MIT. # @@ -46,7 +47,7 @@ def __init__( for r in range(R): blocks = [] for x in range(X): - dilation = 2**x + dilation = 2 ** x padding = (P - 1) * dilation if causal else (P - 1) * dilation // 2 blocks += [ TemporalBlock( @@ -86,9 +87,9 @@ def forward(self, mixture_w): elif self.mask_nonlinear == "relu": est_mask = F.relu(score) elif self.mask_nonlinear == "sigmoid": - est_mask = torch.sigmoid(score) + est_mask = F.sigmoid(score) elif self.mask_nonlinear == "tanh": - est_mask = torch.tanh(score) + est_mask = F.tanh(score) else: raise ValueError("Unsupported mask non-linear function") return est_mask @@ -214,19 +215,21 @@ def check_nonlinear(nolinear_type): raise ValueError("Unsupported nonlinear type") -def chose_norm(norm_type, channel_size): +def chose_norm(norm_type, channel_size, shape='BDT'): """The input of normalization will be (M, C, K), where M is batch size. C is channel size and K is sequence length. """ if norm_type == "gLN": - return GlobalLayerNorm(channel_size) + return GlobalLayerNorm(channel_size, shape=shape) elif norm_type == "cLN": - return ChannelwiseLayerNorm(channel_size) + return ChannelwiseLayerNorm(channel_size, shape=shape) elif norm_type == "BN": # Given input (M, C, K), nn.BatchNorm1d(C) will accumulate statics # along M and K, so this BN usage is right. 
return nn.BatchNorm1d(channel_size) + elif norm_type == "GN": + return nn.GroupNorm(1, channel_size, eps=1e-8) else: raise ValueError("Unsupported normalization type") @@ -234,11 +237,13 @@ def chose_norm(norm_type, channel_size): class ChannelwiseLayerNorm(nn.Module): """Channel-wise Layer Normalization (cLN).""" - def __init__(self, channel_size): + def __init__(self, channel_size, shape='BDT'): super().__init__() self.gamma = nn.Parameter(torch.Tensor(1, channel_size, 1)) # [1, N, 1] self.beta = nn.Parameter(torch.Tensor(1, channel_size, 1)) # [1, N, 1] self.reset_parameters() + assert shape in ['BDT', 'BTD'] + self.shape = shape def reset_parameters(self): self.gamma.data.fill_(1) @@ -253,20 +258,37 @@ def forward(self, y): Returns: cLN_y: [M, N, K] """ + dim = 3 + if y.dim() == 4: + dim = 4 + M, N, K, L = y.shape + y = y.view(M, N, K * L) + + if self.shape == 'BTD': + y = y.transpose(1, 2).contiguous() + mean = torch.mean(y, dim=1, keepdim=True) # [M, 1, K] var = torch.var(y, dim=1, keepdim=True, unbiased=False) # [M, 1, K] cLN_y = self.gamma * (y - mean) / torch.pow(var + EPS, 0.5) + self.beta + + if self.shape == 'BTD': + cLN_y = cLN_y.transpose(1, 2).contiguous() + + if dim == 4: + cLN_y = cLN_y.view(M, N, K, L) return cLN_y class GlobalLayerNorm(nn.Module): """Global Layer Normalization (gLN).""" - def __init__(self, channel_size): + def __init__(self, channel_size, shape='BDT'): super().__init__() self.gamma = nn.Parameter(torch.Tensor(1, channel_size, 1)) # [1, N, 1] self.beta = nn.Parameter(torch.Tensor(1, channel_size, 1)) # [1, N, 1] self.reset_parameters() + assert shape in ['BDT', 'BTD'] + self.shape = shape def reset_parameters(self): self.gamma.data.fill_(1) @@ -281,9 +303,15 @@ def forward(self, y): Returns: gLN_y: [M, N, K] """ + if self.shape == 'BTD': + y = y.transpose(1, 2).contiguous() + mean = y.mean(dim=1, keepdim=True).mean(dim=2, keepdim=True) # [M, 1, 1] var = ( (torch.pow(y - mean, 2)).mean(dim=1, keepdim=True).mean(dim=2, keepdim=True) ) gLN_y = self.gamma * (y - mean) / torch.pow(var + EPS, 0.5) + self.beta + + if self.shape == 'BTD': + gLN_y = gLN_y.transpose(1, 2).contiguous() return gLN_y From 7d7686fec615bef79724eef1f11fd2c6dce56087 Mon Sep 17 00:00:00 2001 From: LiChenda Date: Thu, 10 Feb 2022 19:10:07 +0800 Subject: [PATCH 02/34] update skim.py --- espnet2/enh/layers/skim.py | 210 ++++++++++++++++++++++++++++++------- 1 file changed, 174 insertions(+), 36 deletions(-) diff --git a/espnet2/enh/layers/skim.py b/espnet2/enh/layers/skim.py index 0e231a80c7e..6f247524283 100644 --- a/espnet2/enh/layers/skim.py +++ b/espnet2/enh/layers/skim.py @@ -1,10 +1,11 @@ -# An implementation of SkiM model described in +# An implementation of SkiM model described in # "SkiM: Skipping Memory LSTM for Low-Latency Real-Time Continuous Speech Separation" # (https://arxiv.org/abs/2201.10800) -# +# from turtle import forward, shape +from black import main import torch import torch.nn as nn @@ -13,34 +14,62 @@ class MemLSTM(nn.Module): - """ the Mem-LSTM of SkiM + """the Mem-LSTM of SkiM args: hidden_size: int, dimension of the hidden state. dropout: float, dropout ratio. Default is 0. bidirectional: bool, whether the LSTM layers are bidirectional. Default is False. - mem_type: 'hc', 'h', 'c' or 'id'. + mem_type: 'hc', 'h', 'c' or 'id'. It controls whether the hidden (or cell) state of SegLSTM will be processed by MemLSTM. - In 'id' mode, both the hidden and cell states will be identically returned. + In 'id' mode, both the hidden and cell states will be identically returned. 
norm_type: gLN, cLN. cLN is for causal implementation. """ - def __init__(self,hidden_size, dropout=0.0, bidirectional=False, mem_type='hc', norm_type='gLN'): + def __init__( + self, + hidden_size, + dropout=0.0, + bidirectional=False, + mem_type="hc", + norm_type="cLN", + ): super().__init__() self.hidden_size = hidden_size self.bidirectional = bidirectional self.input_size = (int(bidirectional) + 1) * hidden_size self.mem_type = mem_type - assert mem_type in ["hc", "h", 'c', 'id'], f"only support 'hc', 'h', 'c' and 'id', current type: {mem_type}" + assert mem_type in [ + "hc", + "h", + "c", + "id", + ], f"only support 'hc', 'h', 'c' and 'id', current type: {mem_type}" + + if mem_type in ["hc", "h"]: + self.h_net = SingleRNN( + "LSTM", + input_size=self.input_size, + hidden_size=self.hidden_size, + dropout=dropout, + bidirectional=bidirectional, + ) + self.h_norm = chose_norm( + norm_type=norm_type, channel_size=self.input_size, shape="BTD" + ) + if mem_type in ["hc", "c"]: + self.c_net = SingleRNN( + "LSTM", + input_size=self.input_size, + hidden_size=self.hidden_size, + dropout=dropout, + bidirectional=bidirectional, + ) + self.c_norm = chose_norm( + norm_type=norm_type, channel_size=self.input_size, shape="BTD" + ) - if mem_type in ["hc", 'h']: - self.h_net = SingleRNN('LSTM', input_size=self.input_size, hidden_size=self.hidden_size, dropout=dropout, bidirectional=bidirectional) - self.h_norm = chose_norm(norm_type=norm_type, channel_size=self.input_size, shape='BTD') - if mem_type in ["hc", 'c']: - self.c_net = SingleRNN('LSTM', input_size=self.input_size, hidden_size=self.hidden_size, dropout=dropout, bidirectional=bidirectional) - self.c_norm = chose_norm(norm_type=norm_type, channel_size=self.input_size, shape='BTD') - def extra_repr(self) -> str: return f"Mem_type: {self.mem_type}, bidirectional: {self.bidirectional}" @@ -48,25 +77,25 @@ def forward(self, hc, S): # hc = (h, c), tuple of hidden and cell states from SegLSTM # shape of h and c: (d, B*S, H) # S: number of segments in SegLSTM - - if self.mem_type == 'id': + + if self.mem_type == "id": ret_val = hc else: h, c = hc d, BS, H = h.shape B = BS // S - h = h.transpose(1, 0).contiguous().view(B, S, d * H) # B, S, dH - c = c.transpose(1, 0).contiguous().view(B, S, d * H) # B, S, dH - if self.mem_type == 'hc': + h = h.transpose(1, 0).contiguous().view(B, S, d * H) # B, S, dH + c = c.transpose(1, 0).contiguous().view(B, S, d * H) # B, S, dH + if self.mem_type == "hc": h = h + self.h_norm(self.h_net(h)) c = c + self.c_norm(self.c_net(c)) - elif self.mem_type == 'h': + elif self.mem_type == "h": h = h + self.h_norm(self.h_net(h)) c = torch.zeros_like(c) - elif self.mem_type == 'c': + elif self.mem_type == "c": h = torch.zeros_like(h) c = c + self.c_norm(self.c_net(c)) - + h = h.view(B * S, d, H).transpose(1, 0).contiguous() c = c.view(B * S, d, H).transpose(1, 0).contiguous() ret_val = (h, c) @@ -76,38 +105,48 @@ def forward(self, hc, S): causal_ret_val = [] for x in ret_val: x_ = torch.zeros_like(x) - x_[:, 1:, :] = x[:, :-1,:] + x_[:, 1:, :] = x[:, :-1, :] causal_ret_val.append(x_) ret_val = tuple(causal_ret_val) - + return ret_val - - class SegLSTM(nn.Module): - """ the Seg-LSTM of SkiM + """the Seg-LSTM of SkiM args: input_size: int, dimension of the input feature. The input should have shape - (batch, seq_len, input_size). + (batch, seq_len, input_size). hidden_size: int, dimension of the hidden state. dropout: float, dropout ratio. Default is 0. bidirectional: bool, whether the LSTM layers are bidirectional. Default is False. 
+ norm_type: gLN, cLN. cLN is for causal implementation. """ - def __init__(self, input_size, hidden_size, dropout=0.0, bidirectional=False): + def __init__( + self, input_size, hidden_size, dropout=0.0, bidirectional=False, norm_type="cLN" + ): super().__init__() self.input_size = input_size self.hidden_size = hidden_size self.num_direction = int(bidirectional) + 1 - self.lstm = nn.LSTM(input_size, hidden_size, 1,batch_first=True,bidirectional=bidirectional,) + self.lstm = nn.LSTM( + input_size, + hidden_size, + 1, + batch_first=True, + bidirectional=bidirectional, + ) self.dropout = nn.Dropout(p=dropout) - self.proj = nn.Linear(hidden_size * self.num_direction, input_size) - + self.proj = nn.Linear(hidden_size * self.num_direction, input_size) + self.norm = chose_norm( + norm_type=norm_type, channel_size=hidden_size, shape="BTD" + ) + def forward(self, input, hc): # input shape: B, T, H @@ -121,12 +160,111 @@ def forward(self, input, hc): else: h, c = hc - output, (h, c) = self.lstm(input, (h, c)) + output, (h, c) = self.lstm(input, (h, c)) output = self.dropout(output) - output = self.proj( - output.contiguous().view(-1, output.shape[2]) - ).view(output.shape) + output = self.proj(output.contiguous().view(-1, output.shape[2])).view( + output.shape + ) + output = input + self.norm(output) return output, (h, c) +class SkiM(nn.Module): + def __init__( + self, + input_size, + hidden_size, + output_size, + dropout=0.0, + num_blocks=2, + segment_size=20, + bidirectional=True, + mem_type="hc", + norm_type="gLN", + seg_overlap=False, + ): + super().__init__() + self.input_size = input_size + self.output_size = output_size + self.hidden_size = hidden_size + self.segment_size = segment_size + self.dropout = dropout + self.num_blocks = num_blocks + self.mem_type = mem_type + self.norm_type = norm_type + self.seg_overlap = seg_overlap + + self.seg_lstms = nn.ModuleList([]) + for i in range(num_blocks): + self.seg_lstms.append( + SegLSTM( + input_size=input_size, + hidden_size=hidden_size, + dropout=dropout, + bidirectional=bidirectional, + norm_type=norm_type, + ) + ) + if self.mem_type is not None: + self.mem_lstms = nn.ModuleList([]) + for i in range(num_blocks - 1): + self.mem_lstms.append( + MemLSTM( + hidden_size, + dropout=dropout, + bidirectional=bidirectional, + mem_type=mem_type, + norm_type=norm_type, + ) + ) + self.output_fc = nn.Sequential(nn.PReLU(), nn.Conv1d(input_size, output_size, 1)) + + def forward(self, input): + # input shape: B, T (S*K), D + B, T, D = input.shape + + if self.seg_overlap: + input, rest = split_feature(input.transpose(1, 2), + segment_size=self.segment_size) # B, D, K, S + input = input.permute(0, 3, 2, 1).contiguous() # B, S, K, D + else: + input, rest = self._padfeature(input=input) + input = input.view(B, -1, self.segment_size, D) # B, S, K, D + B, S, K, D = input.shape + + assert K == self.segment_size + + output = input.view(B * S, K, D).contiguous() # BS, K, D + hc = None + for i in range(self.num_blocks): + output, hc = self.seg_lstms[i](output, hc) # BS, K, D + if self.mem_type and i < self.num_blocks - 1: + hc = self.mem_lstms[i](hc, S) + + if self.seg_overlap: + output = output.view(B, S, K, D).permute(0, 3, 2, 1) # B, D, K, S + output = merge_feature(output, rest) # B, D, T + output = self.output(output).transpose(1, 2) + + else: + output = output.view(B, S * K, D)[:, :T, :] # B, T, D + output = self.output(output.transpose(1, 2)).transpose(1, 2) + + return output + + def _padfeature(self, input): + B, T, D = input.shape + rest = self.segment_size - T 
% self.segment_size + + if rest > 0: + pad = torch.zeros(B, rest, D, device=input.device) + input = torch.cat([input, pad], dim=1) + return input, rest + + +if __name__ == "__main__": + + model = SkiM(256, 123, 345, dropout=0.1, num_blocks=3, segment_size=20,bidirectional=True, mem_type='hc', norm_type='gLN', seg_overlap=False) + input = torch.randn(2, 1002, 256) + print(model(input).shape) From d103494a29b76c73520f40a92b43e17a9fea16ed Mon Sep 17 00:00:00 2001 From: LiChenda Date: Fri, 11 Feb 2022 13:50:57 +0800 Subject: [PATCH 03/34] add skim separator --- .../conf/tuning/train_enh_skim_tasnet.yaml | 72 +++++++++++ espnet2/enh/layers/skim.py | 69 ++++++++--- espnet2/enh/separator/skim_separator.py | 117 ++++++++++++++++++ espnet2/tasks/enh.py | 2 + 4 files changed, 246 insertions(+), 14 deletions(-) create mode 100644 egs2/wsj0_2mix/enh1/conf/tuning/train_enh_skim_tasnet.yaml create mode 100644 espnet2/enh/separator/skim_separator.py diff --git a/egs2/wsj0_2mix/enh1/conf/tuning/train_enh_skim_tasnet.yaml b/egs2/wsj0_2mix/enh1/conf/tuning/train_enh_skim_tasnet.yaml new file mode 100644 index 00000000000..0937dd6024c --- /dev/null +++ b/egs2/wsj0_2mix/enh1/conf/tuning/train_enh_skim_tasnet.yaml @@ -0,0 +1,72 @@ +optim: adam +init: xavier_uniform +max_epoch: 150 +batch_type: folded +batch_size: 1 # batch_size 16 can be trained on 4 RTX 2080ti +iterator_type: chunk +chunk_length: 32000 +num_workers: 4 +optim_conf: + lr: 1.0e-03 + eps: 1.0e-08 + weight_decay: 0 +patience: 4 +val_scheduler_criterion: +- valid +- loss +best_model_criterion: +- - valid + - si_snr + - max +- - valid + - loss + - min +keep_nbest_models: 1 +scheduler: reducelronplateau +scheduler_conf: + mode: min + factor: 0.7 + patience: 1 + +encoder: conv +encoder_conf: + channel: 64 + kernel_size: 2 + stride: 1 +decoder: conv +decoder_conf: + channel: 64 + kernel_size: 2 + stride: 1 +separator: skim +separator_conf: + casual: False + num_spk: 2 + layer: 6 + nonlinear: relu + unit: 128 + segment_size: 250 + dropout: 0.1 + mem_type: hc + seg_overlap: False + +# A list for criterions +# The overlall loss in the multi-task learning will be: +# loss = weight_1 * loss_1 + ... + weight_N * loss_N +# The default `weight` for each sub-loss is 1.0 +criterions: + # The first criterion + - name: si_snr + conf: + eps: 1.0e-7 + wrapper: pit + wrapper_conf: + weight: 1.0 + independent_perm: True + + + + + + + diff --git a/espnet2/enh/layers/skim.py b/espnet2/enh/layers/skim.py index 6f247524283..2e3d506cfbd 100644 --- a/espnet2/enh/layers/skim.py +++ b/espnet2/enh/layers/skim.py @@ -144,7 +144,7 @@ def __init__( self.dropout = nn.Dropout(p=dropout) self.proj = nn.Linear(hidden_size * self.num_direction, input_size) self.norm = chose_norm( - norm_type=norm_type, channel_size=hidden_size, shape="BTD" + norm_type=norm_type, channel_size=input_size, shape="BTD" ) def forward(self, input, hc): @@ -163,7 +163,7 @@ def forward(self, input, hc): output, (h, c) = self.lstm(input, (h, c)) output = self.dropout(output) output = self.proj(output.contiguous().view(-1, output.shape[2])).view( - output.shape + input.shape ) output = input + self.norm(output) @@ -171,6 +171,26 @@ def forward(self, input, hc): class SkiM(nn.Module): + """Skipping Memory Net + + args: + input_size: int, dimension of the input feature. + Input shape shoud be (batch, length, input_size) + hidden_size: int, dimension of the hidden state. + output_size: int, dimension of the output size. + dropout: float, dropout ratio. Default is 0. 
+ num_blocks: number of basic SkiM blocks + segment_size: segmentation size for splitting long features + bidirectional: bool, whether the RNN layers are bidirectional. + mem_type: 'hc', 'h', 'c', 'id' or None. + It controls whether the hidden (or cell) state of SegLSTM will be processed by MemLSTM. + In 'id' mode, both the hidden and cell states will be identically returned. + When mem_type is None, the MemLSTM will be removed. + norm_type: gLN, cLN. cLN is for causal implementation. + seg_overlap: Bool, whether the segmentation will reserve 50% overlap for adjacent segments. + Default is False. + """ + def __init__( self, input_size, @@ -195,6 +215,13 @@ def __init__( self.norm_type = norm_type self.seg_overlap = seg_overlap + assert mem_type in [ + "hc", + "h", + "c", + None, + ], f"only support 'hc', 'h', 'c', 'id', and None, current type: {mem_type}" + self.seg_lstms = nn.ModuleList([]) for i in range(num_blocks): self.seg_lstms.append( @@ -218,16 +245,19 @@ def __init__( norm_type=norm_type, ) ) - self.output_fc = nn.Sequential(nn.PReLU(), nn.Conv1d(input_size, output_size, 1)) + self.output_fc = nn.Sequential( + nn.PReLU(), nn.Conv1d(input_size, output_size, 1) + ) def forward(self, input): # input shape: B, T (S*K), D B, T, D = input.shape if self.seg_overlap: - input, rest = split_feature(input.transpose(1, 2), - segment_size=self.segment_size) # B, D, K, S - input = input.permute(0, 3, 2, 1).contiguous() # B, S, K, D + input, rest = split_feature( + input.transpose(1, 2), segment_size=self.segment_size + ) # B, D, K, S + input = input.permute(0, 3, 2, 1).contiguous() # B, S, K, D else: input, rest = self._padfeature(input=input) input = input.view(B, -1, self.segment_size, D) # B, S, K, D @@ -241,16 +271,16 @@ def forward(self, input): output, hc = self.seg_lstms[i](output, hc) # BS, K, D if self.mem_type and i < self.num_blocks - 1: hc = self.mem_lstms[i](hc, S) - + if self.seg_overlap: - output = output.view(B, S, K, D).permute(0, 3, 2, 1) # B, D, K, S - output = merge_feature(output, rest) # B, D, T - output = self.output(output).transpose(1, 2) + output = output.view(B, S, K, D).permute(0, 3, 2, 1) # B, D, K, S + output = merge_feature(output, rest) # B, D, T + output = self.output_fc(output).transpose(1, 2) else: output = output.view(B, S * K, D)[:, :T, :] # B, T, D - output = self.output(output.transpose(1, 2)).transpose(1, 2) - + output = self.output_fc(output.transpose(1, 2)).transpose(1, 2) + return output def _padfeature(self, input): @@ -265,6 +295,17 @@ def _padfeature(self, input): if __name__ == "__main__": - model = SkiM(256, 123, 345, dropout=0.1, num_blocks=3, segment_size=20,bidirectional=True, mem_type='hc', norm_type='gLN', seg_overlap=False) - input = torch.randn(2, 1002, 256) + model = SkiM( + 333, + 111, + 222, + dropout=0.1, + num_blocks=3, + segment_size=20, + bidirectional=False, + mem_type="hc", + norm_type="cLN", + seg_overlap=True, + ) + input = torch.randn(2, 1002, 333) print(model(input).shape) diff --git a/espnet2/enh/separator/skim_separator.py b/espnet2/enh/separator/skim_separator.py new file mode 100644 index 00000000000..bf291b2daec --- /dev/null +++ b/espnet2/enh/separator/skim_separator.py @@ -0,0 +1,117 @@ +from collections import OrderedDict +from typing import List +from typing import Tuple +from typing import Union + +import torch +from torch_complex.tensor import ComplexTensor + +from espnet2.enh.layers.skim import SkiM +from espnet2.enh.separator.abs_separator import AbsSeparator + + +class SkiMSeparator(AbsSeparator): + def __init__( 
+ self, + input_dim: int, + casual: bool = True, + num_spk: int = 2, + nonlinear: str = "relu", + layer: int = 3, + unit: int = 512, + segment_size: int = 20, + dropout: float = 0.0, + mem_type: str = 'hc', + seg_overlap: bool = False, + ): + """Skipping Memory (SkiM) Separator + + Args: + input_dim: input feature dimension + casual: bool, whether the system is casual. + num_spk: number of target speakers. + nonlinear: the nonlinear function for mask estimation, + select from 'relu', 'tanh', 'sigmoid' + layer: int, number of SkiM blocks. Default is 3. + unit: int, dimension of the hidden state. + segment_size: segmentation size for splitting long features + dropout: float, dropout ratio. Default is 0. + mem_type: 'hc', 'h', 'c', 'id' or None. + It controls whether the hidden (or cell) state of SegLSTM will be processed by MemLSTM. + In 'id' mode, both the hidden and cell states will be identically returned. + When mem_type is None, the MemLSTM will be removed. + seg_overlap: Bool, whether the segmentation will reserve 50% overlap for adjacent segments. + Default is False. + """ + super().__init__() + + self._num_spk = num_spk + + self.segment_size = segment_size + + self.skim = SkiM( + input_size=input_dim, + hidden_size=unit, + output_size=input_dim * num_spk, + dropout=dropout, + num_blocks=layer, + bidirectional=(not casual), + norm_type='cLN' if casual else 'gLN', + segment_size=segment_size, + seg_overlap=seg_overlap, + mem_type=mem_type, + ) + + if nonlinear not in ("sigmoid", "relu", "tanh"): + raise ValueError("Not supporting nonlinear={}".format(nonlinear)) + + self.nonlinear = { + "sigmoid": torch.nn.Sigmoid(), + "relu": torch.nn.ReLU(), + "tanh": torch.nn.Tanh(), + }[nonlinear] + + def forward( + self, input: Union[torch.Tensor, ComplexTensor], ilens: torch.Tensor + ) -> Tuple[List[Union[torch.Tensor, ComplexTensor]], torch.Tensor, OrderedDict]: + """Forward. + + Args: + input (torch.Tensor or ComplexTensor): Encoded feature [B, T, N] + ilens (torch.Tensor): input lengths [Batch] + + Returns: + masked (List[Union(torch.Tensor, ComplexTensor)]): [(B, T, N), ...] + ilens (torch.Tensor): (B,) + others predicted data, e.g. masks: OrderedDict[ + 'mask_spk1': torch.Tensor(Batch, Frames, Freq), + 'mask_spk2': torch.Tensor(Batch, Frames, Freq), + ... 
+ 'mask_spkn': torch.Tensor(Batch, Frames, Freq), + ] + """ + + # if complex spectrum, + if isinstance(input, ComplexTensor): + feature = abs(input) + else: + feature = input + + B, T, N = feature.shape + + processed = self.skim(feature) # B,T, N + + processed = processed.view(B, T, N, self.num_spk) + masks = self.nonlinear(processed).unbind(dim=3) + + masked = [input * m for m in masks] + + others = OrderedDict( + zip(["mask_spk{}".format(i + 1) for i in range(len(masks))], masks) + ) + + return masked, ilens, others + + @property + def num_spk(self): + return self._num_spk diff --git a/espnet2/tasks/enh.py b/espnet2/tasks/enh.py index 2a722cba554..7b1aab863d7 100644 --- a/espnet2/tasks/enh.py +++ b/espnet2/tasks/enh.py @@ -35,6 +35,7 @@ from espnet2.enh.separator.dprnn_separator import DPRNNSeparator from espnet2.enh.separator.neural_beamformer import NeuralBeamformer from espnet2.enh.separator.rnn_separator import RNNSeparator +from espnet2.enh.separator.skim_separator import SkiMSeparator from espnet2.enh.separator.tcn_separator import TCNSeparator from espnet2.enh.separator.transformer_separator import TransformerSeparator from espnet2.tasks.abs_task import AbsTask @@ -58,6 +59,7 @@ name="separator", classes=dict( rnn=RNNSeparator, + skim=SkiMSeparator, tcn=TCNSeparator, dprnn=DPRNNSeparator, transformer=TransformerSeparator, From cf535c2661c20f91810a7c86f528386a3a0bdaf4 Mon Sep 17 00:00:00 2001 From: LiChenda Date: Fri, 11 Feb 2022 14:24:01 +0800 Subject: [PATCH 04/34] add causal config for skim --- .../conf/tuning/train_enh_skim_tasnet.yaml | 4 +- .../tuning/train_enh_skim_tasnet_causal.yaml | 72 +++++++++++++++++++ espnet2/enh/separator/skim_separator.py | 42 +++++------ 3 files changed, 96 insertions(+), 22 deletions(-) create mode 100644 egs2/wsj0_2mix/enh1/conf/tuning/train_enh_skim_tasnet_causal.yaml diff --git a/egs2/wsj0_2mix/enh1/conf/tuning/train_enh_skim_tasnet.yaml b/egs2/wsj0_2mix/enh1/conf/tuning/train_enh_skim_tasnet.yaml index 0937dd6024c..5fdd978f95c 100644 --- a/egs2/wsj0_2mix/enh1/conf/tuning/train_enh_skim_tasnet.yaml +++ b/egs2/wsj0_2mix/enh1/conf/tuning/train_enh_skim_tasnet.yaml @@ -2,9 +2,9 @@ optim: adam init: xavier_uniform max_epoch: 150 batch_type: folded -batch_size: 1 # batch_size 16 can be trained on 4 RTX 2080ti +batch_size: 8 iterator_type: chunk -chunk_length: 32000 +chunk_length: 16000 num_workers: 4 optim_conf: lr: 1.0e-03 diff --git a/egs2/wsj0_2mix/enh1/conf/tuning/train_enh_skim_tasnet_causal.yaml b/egs2/wsj0_2mix/enh1/conf/tuning/train_enh_skim_tasnet_causal.yaml new file mode 100644 index 00000000000..5681dab1996 --- /dev/null +++ b/egs2/wsj0_2mix/enh1/conf/tuning/train_enh_skim_tasnet_causal.yaml @@ -0,0 +1,72 @@ +optim: adam +init: xavier_uniform +max_epoch: 150 +batch_type: folded +batch_size: 8 +iterator_type: chunk +chunk_length: 16000 +num_workers: 4 +optim_conf: + lr: 1.0e-03 + eps: 1.0e-08 + weight_decay: 0 +patience: 4 +val_scheduler_criterion: +- valid +- loss +best_model_criterion: +- - valid + - si_snr + - max +- - valid + - loss + - min +keep_nbest_models: 1 +scheduler: reducelronplateau +scheduler_conf: + mode: min + factor: 0.7 + patience: 1 + +encoder: conv +encoder_conf: + channel: 64 + kernel_size: 2 + stride: 1 +decoder: conv +decoder_conf: + channel: 64 + kernel_size: 2 + stride: 1 +separator: skim +separator_conf: + casual: True + num_spk: 2 + layer: 6 + nonlinear: relu + unit: 128 + segment_size: 250 + dropout: 0.1 + mem_type: hc + seg_overlap: False + +# A list for criterions +# The overlall loss in the multi-task 
learning will be: +# loss = weight_1 * loss_1 + ... + weight_N * loss_N +# The default `weight` for each sub-loss is 1.0 +criterions: + # The first criterion + - name: si_snr + conf: + eps: 1.0e-7 + wrapper: pit + wrapper_conf: + weight: 1.0 + independent_perm: True + + + + + + + diff --git a/espnet2/enh/separator/skim_separator.py b/espnet2/enh/separator/skim_separator.py index bf291b2daec..0f76a305d8b 100644 --- a/espnet2/enh/separator/skim_separator.py +++ b/espnet2/enh/separator/skim_separator.py @@ -11,6 +11,26 @@ class SkiMSeparator(AbsSeparator): + """Skipping Memory (SkiM) Separator + + Args: + input_dim: input feature dimension + casual: bool, whether the system is casual. + num_spk: number of target speakers. + nonlinear: the nonlinear function for mask estimation, + select from 'relu', 'tanh', 'sigmoid' + layer: int, number of SkiM blocks. Default is 3. + unit: int, dimension of the hidden state. + segment_size: segmentation size for splitting long features + dropout: float, dropout ratio. Default is 0. + mem_type: 'hc', 'h', 'c', 'id' or None. + It controls whether the hidden (or cell) state of SegLSTM will be processed by MemLSTM. + In 'id' mode, both the hidden and cell states will be identically returned. + When mem_type is None, the MemLSTM will be removed. + seg_overlap: Bool, whether the segmentation will reserve 50% overlap for adjacent segments. + Default is False. + """ + def __init__( self, input_dim: int, @@ -21,28 +41,10 @@ def __init__( unit: int = 512, segment_size: int = 20, dropout: float = 0.0, - mem_type: str = 'hc', + mem_type: str = "hc", seg_overlap: bool = False, ): - """Skipping Memory (SkiM) Separator - Args: - input_dim: input feature dimension - casual: bool, whether the system is casual. - num_spk: number of target speakers. - nonlinear: the nonlinear function for mask estimation, - select from 'relu', 'tanh', 'sigmoid' - layer: int, number of SkiM blocks. Default is 3. - unit: int, dimension of the hidden state. - segment_size: segmentation size for splitting long features - dropout: float, dropout ratio. Default is 0. - mem_type: 'hc', 'h', 'c', 'id' or None. - It controls whether the hidden (or cell) state of SegLSTM will be processed by MemLSTM. - In 'id' mode, both the hidden and cell states will be identically returned. - When mem_type is None, the MemLSTM will be removed. - seg_overlap: Bool, whether the segmentation will reserve 50% overlap for adjacent segments. - Default is False. - """ super().__init__() self._num_spk = num_spk @@ -56,7 +58,7 @@ def __init__( dropout=dropout, num_blocks=layer, bidirectional=(not casual), - norm_type='cLN' if casual else 'gLN', + norm_type="cLN" if casual else "gLN", segment_size=segment_size, seg_overlap=seg_overlap, mem_type=mem_type, From 016b12e3474397a43756a8f7a6b50256d820034e Mon Sep 17 00:00:00 2001 From: LiChenda Date: Fri, 11 Feb 2022 14:40:48 +0800 Subject: [PATCH 05/34] add unit test --- espnet2/enh/separator/skim_separator.py | 11 +- .../enh/separator/test_skim_separator.py | 142 ++++++++++++++++++ 2 files changed, 149 insertions(+), 4 deletions(-) create mode 100644 test/espnet2/enh/separator/test_skim_separator.py diff --git a/espnet2/enh/separator/skim_separator.py b/espnet2/enh/separator/skim_separator.py index 0f76a305d8b..aa279a919bd 100644 --- a/espnet2/enh/separator/skim_separator.py +++ b/espnet2/enh/separator/skim_separator.py @@ -15,7 +15,7 @@ class SkiMSeparator(AbsSeparator): Args: input_dim: input feature dimension - casual: bool, whether the system is casual. 
+ causal: bool, whether the system is causal. num_spk: number of target speakers. nonlinear: the nonlinear function for mask estimation, select from 'relu', 'tanh', 'sigmoid' @@ -34,7 +34,7 @@ class SkiMSeparator(AbsSeparator): def __init__( self, input_dim: int, - casual: bool = True, + causal: bool = True, num_spk: int = 2, nonlinear: str = "relu", layer: int = 3, @@ -51,14 +51,17 @@ def __init__( self.segment_size = segment_size + if mem_type not in ("hc", "h", "c", "id", None): + raise ValueError("Not supporting mem_type={}".format(mem_type)) + self.skim = SkiM( input_size=input_dim, hidden_size=unit, output_size=input_dim * num_spk, dropout=dropout, num_blocks=layer, - bidirectional=(not casual), - norm_type="cLN" if casual else "gLN", + bidirectional=(not causal), + norm_type="cLN" if causal else "gLN", segment_size=segment_size, seg_overlap=seg_overlap, mem_type=mem_type, diff --git a/test/espnet2/enh/separator/test_skim_separator.py b/test/espnet2/enh/separator/test_skim_separator.py new file mode 100644 index 00000000000..c1417e3b28a --- /dev/null +++ b/test/espnet2/enh/separator/test_skim_separator.py @@ -0,0 +1,142 @@ +import pytest + +import torch +from torch import Tensor +from torch_complex import ComplexTensor + +from espnet2.enh.separator.skim_separator import SkiMSeparator + + +@pytest.mark.parametrize("input_dim", [5]) +@pytest.mark.parametrize("layer", [1, 3]) +@pytest.mark.parametrize("causal", [True, False]) +@pytest.mark.parametrize("unit", [8]) +@pytest.mark.parametrize("dropout", [0.0, 0.2]) +@pytest.mark.parametrize("num_spk", [1, 2]) +@pytest.mark.parametrize("nonlinear", ["relu", "sigmoid", "tanh"]) +@pytest.mark.parametrize("mem_type", ["hc", "c", "h", None]) +@pytest.mark.parametrize("segment_size", [2, 4]) +@pytest.mark.parametrize("seg_overlap", [False, True]) +def test_skim_separator_forward_backward_complex( + input_dim, + layer, + causal, + unit, + dropout, + num_spk, + nonlinear, + mem_type, + segment_size, + seg_overlap, +): + model = SkiMSeparator( + input_dim=input_dim, + causal=causal, + num_spk=num_spk, + nonlinear=nonlinear, + layer=layer, + unit=unit, + segment_size=segment_size, + dropout=dropout, + mem_type=mem_type, + seg_overlap=seg_overlap + ) + model.train() + + real = torch.rand(2, 10, input_dim) + imag = torch.rand(2, 10, input_dim) + x = ComplexTensor(real, imag) + x_lens = torch.tensor([10, 8], dtype=torch.long) + + masked, flens, others = model(x, ilens=x_lens) + + assert isinstance(masked[0], ComplexTensor) + assert len(masked) == num_spk + + masked[0].abs().mean().backward() + + +@pytest.mark.parametrize("input_dim", [5]) +@pytest.mark.parametrize("layer", [1, 3]) +@pytest.mark.parametrize("causal", [True, False]) +@pytest.mark.parametrize("unit", [8]) +@pytest.mark.parametrize("dropout", [0.0, 0.2]) +@pytest.mark.parametrize("num_spk", [1, 2]) +@pytest.mark.parametrize("nonlinear", ["relu", "sigmoid", "tanh"]) +@pytest.mark.parametrize("mem_type", ["hc", "c", "h", None]) +@pytest.mark.parametrize("segment_size", [2, 4]) +@pytest.mark.parametrize("seg_overlap", [False, True]) +def test_skim_separator_forward_backward_real( + input_dim, + layer, + causal, + unit, + dropout, + num_spk, + nonlinear, + mem_type, + segment_size, + seg_overlap, +): + model = SkiMSeparator( + input_dim=input_dim, + causal=causal, + num_spk=num_spk, + nonlinear=nonlinear, + layer=layer, + unit=unit, + segment_size=segment_size, + dropout=dropout, + mem_type=mem_type, + seg_overlap=seg_overlap + ) + model.train() + + x = torch.rand(2, 10, input_dim) + x_lens = 
torch.tensor([10, 8], dtype=torch.long) + + masked, flens, others = model(x, ilens=x_lens) + + assert isinstance(masked[0], Tensor) + assert len(masked) == num_spk + + masked[0].abs().mean().backward() + + +def test_skim_separator_invalid_type(): + with pytest.raises(ValueError): + SkiMSeparator( + input_dim=10, + layer=2, + unit=10, + dropout=0.1, + num_spk=2, + nonlinear="fff", + mem_type='aaa', + segment_size=2, + ) + + +def test_skim_separator_output(): + + x = torch.rand(2, 10, 10) + x_lens = torch.tensor([10, 8], dtype=torch.long) + + for num_spk in range(1, 3): + model = SkiMSeparator( + input_dim=10, + layer=2, + unit=10, + dropout=0.1, + num_spk=2, + nonlinear="relu", + segment_size=2, + ) + model.eval() + specs, _, others = model(x, x_lens) + assert isinstance(specs, list) + assert isinstance(others, dict) + assert x.shape == specs[0].shape + for n in range(num_spk): + assert "mask_spk{}".format(n + 1) in others + assert specs[n].shape == others["mask_spk{}".format(n + 1)].shape From 8310b735c503132e3dbe02d30883158a696a0935 Mon Sep 17 00:00:00 2001 From: LiChenda Date: Fri, 11 Feb 2022 14:41:03 +0800 Subject: [PATCH 06/34] update config --- .../enh1/conf/tuning/train_enh_skim_tasnet_causal.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs2/wsj0_2mix/enh1/conf/tuning/train_enh_skim_tasnet_causal.yaml b/egs2/wsj0_2mix/enh1/conf/tuning/train_enh_skim_tasnet_causal.yaml index 5681dab1996..cb59580d9d8 100644 --- a/egs2/wsj0_2mix/enh1/conf/tuning/train_enh_skim_tasnet_causal.yaml +++ b/egs2/wsj0_2mix/enh1/conf/tuning/train_enh_skim_tasnet_causal.yaml @@ -40,7 +40,7 @@ decoder_conf: stride: 1 separator: skim separator_conf: - casual: True + causal: True num_spk: 2 layer: 6 nonlinear: relu From f1ac9259da4159be127e7ea351c209fa6dcc5965 Mon Sep 17 00:00:00 2001 From: LiChenda Date: Fri, 11 Feb 2022 14:50:23 +0800 Subject: [PATCH 07/34] fix for testing --- espnet2/enh/layers/skim.py | 35 +++++++++++-------- espnet2/enh/layers/tcn.py | 28 +++++++-------- espnet2/enh/separator/skim_separator.py | 14 ++++---- .../enh/separator/test_skim_separator.py | 6 ++-- 4 files changed, 45 insertions(+), 38 deletions(-) diff --git a/espnet2/enh/layers/skim.py b/espnet2/enh/layers/skim.py index 2e3d506cfbd..46f14c13227 100644 --- a/espnet2/enh/layers/skim.py +++ b/espnet2/enh/layers/skim.py @@ -3,13 +3,12 @@ # (https://arxiv.org/abs/2201.10800) # - -from turtle import forward, shape -from black import main import torch import torch.nn as nn -from espnet2.enh.layers.dprnn import SingleRNN, split_feature, merge_feature +from espnet2.enh.layers.dprnn import merge_feature +from espnet2.enh.layers.dprnn import SingleRNN +from espnet2.enh.layers.dprnn import split_feature from espnet2.enh.layers.tcn import chose_norm @@ -19,10 +18,13 @@ class MemLSTM(nn.Module): args: hidden_size: int, dimension of the hidden state. dropout: float, dropout ratio. Default is 0. - bidirectional: bool, whether the LSTM layers are bidirectional. Default is False. + bidirectional: bool, whether the LSTM layers are bidirectional. + Default is False. mem_type: 'hc', 'h', 'c' or 'id'. - It controls whether the hidden (or cell) state of SegLSTM will be processed by MemLSTM. - In 'id' mode, both the hidden and cell states will be identically returned. + It controls whether the hidden (or cell) state of + SegLSTM will be processed by MemLSTM. + In 'id' mode, both the hidden and cell states will + be identically returned. norm_type: gLN, cLN. cLN is for causal implementation. 
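+
+        Note: in the causal setup (bidirectional=False), the returned
+        states are shifted one step along the segment axis, so that each
+        segment only receives memory computed from the segments before it.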
""" @@ -117,11 +119,12 @@ class SegLSTM(nn.Module): """the Seg-LSTM of SkiM args: - input_size: int, dimension of the input feature. The input should have shape - (batch, seq_len, input_size). + input_size: int, dimension of the input feature. + The input should have shape (batch, seq_len, input_size). hidden_size: int, dimension of the hidden state. dropout: float, dropout ratio. Default is 0. - bidirectional: bool, whether the LSTM layers are bidirectional. Default is False. + bidirectional: bool, whether the LSTM layers are bidirectional. + Default is False. norm_type: gLN, cLN. cLN is for causal implementation. """ @@ -152,7 +155,7 @@ def forward(self, input, hc): B, T, H = input.shape - if hc == None: + if hc is None: # In fist input SkiM block, h and c are not available d = self.num_direction h = torch.zeros(d, B, self.hidden_size).to(input.device) @@ -183,12 +186,14 @@ class SkiM(nn.Module): segment_size: segmentation size for splitting long features bidirectional: bool, whether the RNN layers are bidirectional. mem_type: 'hc', 'h', 'c', 'id' or None. - It controls whether the hidden (or cell) state of SegLSTM will be processed by MemLSTM. - In 'id' mode, both the hidden and cell states will be identically returned. + It controls whether the hidden (or cell) state of SegLSTM + will be processed by MemLSTM. + In 'id' mode, both the hidden and cell states will + be identically returned. When mem_type is None, the MemLSTM will be removed. norm_type: gLN, cLN. cLN is for causal implementation. - seg_overlap: Bool, whether the segmentation will reserve 50% overlap for adjacent segments. - Default is False. + seg_overlap: Bool, whether the segmentation will reserve 50% + overlap for adjacent segments.Default is False. """ def __init__( diff --git a/espnet2/enh/layers/tcn.py b/espnet2/enh/layers/tcn.py index b41b108731c..acc2ba6e309 100644 --- a/espnet2/enh/layers/tcn.py +++ b/espnet2/enh/layers/tcn.py @@ -47,7 +47,7 @@ def __init__( for r in range(R): blocks = [] for x in range(X): - dilation = 2 ** x + dilation = 2**x padding = (P - 1) * dilation if causal else (P - 1) * dilation // 2 blocks += [ TemporalBlock( @@ -215,7 +215,7 @@ def check_nonlinear(nolinear_type): raise ValueError("Unsupported nonlinear type") -def chose_norm(norm_type, channel_size, shape='BDT'): +def chose_norm(norm_type, channel_size, shape="BDT"): """The input of normalization will be (M, C, K), where M is batch size. C is channel size and K is sequence length. 
@@ -237,12 +237,12 @@ def chose_norm(norm_type, channel_size, shape='BDT'): class ChannelwiseLayerNorm(nn.Module): """Channel-wise Layer Normalization (cLN).""" - def __init__(self, channel_size, shape='BDT'): + def __init__(self, channel_size, shape="BDT"): super().__init__() self.gamma = nn.Parameter(torch.Tensor(1, channel_size, 1)) # [1, N, 1] self.beta = nn.Parameter(torch.Tensor(1, channel_size, 1)) # [1, N, 1] self.reset_parameters() - assert shape in ['BDT', 'BTD'] + assert shape in ["BDT", "BTD"] self.shape = shape def reset_parameters(self): @@ -264,17 +264,17 @@ def forward(self, y): M, N, K, L = y.shape y = y.view(M, N, K * L) - if self.shape == 'BTD': + if self.shape == "BTD": y = y.transpose(1, 2).contiguous() mean = torch.mean(y, dim=1, keepdim=True) # [M, 1, K] var = torch.var(y, dim=1, keepdim=True, unbiased=False) # [M, 1, K] cLN_y = self.gamma * (y - mean) / torch.pow(var + EPS, 0.5) + self.beta - if self.shape == 'BTD': + if self.shape == "BTD": cLN_y = cLN_y.transpose(1, 2).contiguous() - if dim == 4: + if dim == 4: cLN_y = cLN_y.view(M, N, K, L) return cLN_y @@ -282,12 +282,12 @@ def forward(self, y): class GlobalLayerNorm(nn.Module): """Global Layer Normalization (gLN).""" - def __init__(self, channel_size, shape='BDT'): + def __init__(self, channel_size, shape="BDT"): super().__init__() self.gamma = nn.Parameter(torch.Tensor(1, channel_size, 1)) # [1, N, 1] self.beta = nn.Parameter(torch.Tensor(1, channel_size, 1)) # [1, N, 1] self.reset_parameters() - assert shape in ['BDT', 'BTD'] + assert shape in ["BDT", "BTD"] self.shape = shape def reset_parameters(self): @@ -303,15 +303,15 @@ def forward(self, y): Returns: gLN_y: [M, N, K] """ - if self.shape == 'BTD': - y = y.transpose(1, 2).contiguous() + if self.shape == "BTD": + y = y.transpose(1, 2).contiguous() mean = y.mean(dim=1, keepdim=True).mean(dim=2, keepdim=True) # [M, 1, 1] var = ( (torch.pow(y - mean, 2)).mean(dim=1, keepdim=True).mean(dim=2, keepdim=True) ) gLN_y = self.gamma * (y - mean) / torch.pow(var + EPS, 0.5) + self.beta - - if self.shape == 'BTD': - gLN_y = gLN_y.transpose(1, 2).contiguous() + + if self.shape == "BTD": + gLN_y = gLN_y.transpose(1, 2).contiguous() return gLN_y diff --git a/espnet2/enh/separator/skim_separator.py b/espnet2/enh/separator/skim_separator.py index aa279a919bd..9df8e517a96 100644 --- a/espnet2/enh/separator/skim_separator.py +++ b/espnet2/enh/separator/skim_separator.py @@ -18,17 +18,19 @@ class SkiMSeparator(AbsSeparator): causal: bool, whether the system is causal. num_spk: number of target speakers. nonlinear: the nonlinear function for mask estimation, - select from 'relu', 'tanh', 'sigmoid' + select from 'relu', 'tanh', 'sigmoid' layer: int, number of SkiM blocks. Default is 3. unit: int, dimension of the hidden state. segment_size: segmentation size for splitting long features dropout: float, dropout ratio. Default is 0. mem_type: 'hc', 'h', 'c', 'id' or None. - It controls whether the hidden (or cell) state of SegLSTM will be processed by MemLSTM. - In 'id' mode, both the hidden and cell states will be identically returned. - When mem_type is None, the MemLSTM will be removed. - seg_overlap: Bool, whether the segmentation will reserve 50% overlap for adjacent segments. - Default is False. + It controls whether the hidden (or cell) state of + SegLSTM will be processed by MemLSTM. + In 'id' mode, both the hidden and cell states + will be identically returned. + When mem_type is None, the MemLSTM will be removed. 
+ seg_overlap: Bool, whether the segmentation will reserve 50% + overlap for adjacent segments. Default is False. """ def __init__( diff --git a/test/espnet2/enh/separator/test_skim_separator.py b/test/espnet2/enh/separator/test_skim_separator.py index c1417e3b28a..7cb8f4bc253 100644 --- a/test/espnet2/enh/separator/test_skim_separator.py +++ b/test/espnet2/enh/separator/test_skim_separator.py @@ -39,7 +39,7 @@ def test_skim_separator_forward_backward_complex( segment_size=segment_size, dropout=dropout, mem_type=mem_type, - seg_overlap=seg_overlap + seg_overlap=seg_overlap, ) model.train() @@ -88,7 +88,7 @@ def test_skim_separator_forward_backward_real( segment_size=segment_size, dropout=dropout, mem_type=mem_type, - seg_overlap=seg_overlap + seg_overlap=seg_overlap, ) model.train() @@ -112,7 +112,7 @@ def test_skim_separator_invalid_type(): dropout=0.1, num_spk=2, nonlinear="fff", - mem_type='aaa', + mem_type="aaa", segment_size=2, ) From ac3c10cfe4faf82c0bb30f8b32d9e8692363e0a9 Mon Sep 17 00:00:00 2001 From: LiChenda Date: Fri, 11 Feb 2022 16:22:52 +0800 Subject: [PATCH 08/34] fixing an assertion missing --- egs2/wsj0_2mix/enh1/conf/tuning/train_enh_skim_tasnet.yaml | 2 +- espnet2/enh/layers/skim.py | 1 + test/espnet2/enh/separator/test_skim_separator.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/egs2/wsj0_2mix/enh1/conf/tuning/train_enh_skim_tasnet.yaml b/egs2/wsj0_2mix/enh1/conf/tuning/train_enh_skim_tasnet.yaml index 5fdd978f95c..2eced345061 100644 --- a/egs2/wsj0_2mix/enh1/conf/tuning/train_enh_skim_tasnet.yaml +++ b/egs2/wsj0_2mix/enh1/conf/tuning/train_enh_skim_tasnet.yaml @@ -40,7 +40,7 @@ decoder_conf: stride: 1 separator: skim separator_conf: - casual: False + causal: False num_spk: 2 layer: 6 nonlinear: relu diff --git a/espnet2/enh/layers/skim.py b/espnet2/enh/layers/skim.py index 46f14c13227..3560d13b56e 100644 --- a/espnet2/enh/layers/skim.py +++ b/espnet2/enh/layers/skim.py @@ -224,6 +224,7 @@ def __init__( "hc", "h", "c", + "id", None, ], f"only support 'hc', 'h', 'c', 'id', and None, current type: {mem_type}" diff --git a/test/espnet2/enh/separator/test_skim_separator.py b/test/espnet2/enh/separator/test_skim_separator.py index 7cb8f4bc253..e1594cd5620 100644 --- a/test/espnet2/enh/separator/test_skim_separator.py +++ b/test/espnet2/enh/separator/test_skim_separator.py @@ -63,7 +63,7 @@ def test_skim_separator_forward_backward_complex( @pytest.mark.parametrize("dropout", [0.0, 0.2]) @pytest.mark.parametrize("num_spk", [1, 2]) @pytest.mark.parametrize("nonlinear", ["relu", "sigmoid", "tanh"]) -@pytest.mark.parametrize("mem_type", ["hc", "c", "h", None]) +@pytest.mark.parametrize("mem_type", ["hc", "c", "h", "id", None]) @pytest.mark.parametrize("segment_size", [2, 4]) @pytest.mark.parametrize("seg_overlap", [False, True]) def test_skim_separator_forward_backward_real( From 2554f265c818d608c378a8febac42fc1ffa10ca3 Mon Sep 17 00:00:00 2001 From: Siddharth Dalmia Date: Mon, 14 Feb 2022 18:04:07 -0500 Subject: [PATCH 09/34] initial MT files from ST for tracking --- egs2/TEMPLATE/mt1/cmd.sh | 110 ++ egs2/TEMPLATE/mt1/conf/fbank.conf | 2 + egs2/TEMPLATE/mt1/conf/pbs.conf | 11 + egs2/TEMPLATE/mt1/conf/pitch.conf | 1 + egs2/TEMPLATE/mt1/conf/queue.conf | 12 + egs2/TEMPLATE/mt1/conf/slurm.conf | 14 + egs2/TEMPLATE/mt1/db.sh | 1 + egs2/TEMPLATE/mt1/local/path.sh | 0 egs2/TEMPLATE/mt1/mt.sh | 1703 +++++++++++++++++++++++++++++ egs2/TEMPLATE/mt1/path.sh | 22 + egs2/TEMPLATE/mt1/pyscripts | 1 + egs2/TEMPLATE/mt1/scripts | 1 + egs2/TEMPLATE/mt1/setup.sh | 58 + 
egs2/TEMPLATE/mt1/steps | 1 + egs2/TEMPLATE/mt1/utils | 1 + egs2/iwslt14/mt1/cmd.sh | 110 ++ egs2/iwslt14/mt1/conf/fbank.conf | 2 + egs2/iwslt14/mt1/conf/pbs.conf | 11 + egs2/iwslt14/mt1/conf/pitch.conf | 1 + egs2/iwslt14/mt1/conf/queue.conf | 12 + egs2/iwslt14/mt1/conf/slurm.conf | 14 + egs2/iwslt14/mt1/db.sh | 1 + egs2/iwslt14/mt1/local/data.sh | 34 + egs2/iwslt14/mt1/local/path.sh | 0 egs2/iwslt14/mt1/mt.sh | 1 + egs2/iwslt14/mt1/path.sh | 1 + egs2/iwslt14/mt1/pyscripts | 1 + egs2/iwslt14/mt1/run.sh | 55 + egs2/iwslt14/mt1/scripts | 1 + egs2/iwslt14/mt1/steps | 1 + egs2/iwslt14/mt1/utils | 1 + espnet2/mt/__init__.py | 0 espnet2/mt/espnet_model.py | 452 ++++++++ 33 files changed, 2636 insertions(+) create mode 100644 egs2/TEMPLATE/mt1/cmd.sh create mode 100644 egs2/TEMPLATE/mt1/conf/fbank.conf create mode 100644 egs2/TEMPLATE/mt1/conf/pbs.conf create mode 100644 egs2/TEMPLATE/mt1/conf/pitch.conf create mode 100644 egs2/TEMPLATE/mt1/conf/queue.conf create mode 100644 egs2/TEMPLATE/mt1/conf/slurm.conf create mode 120000 egs2/TEMPLATE/mt1/db.sh create mode 100644 egs2/TEMPLATE/mt1/local/path.sh create mode 100755 egs2/TEMPLATE/mt1/mt.sh create mode 100755 egs2/TEMPLATE/mt1/path.sh create mode 120000 egs2/TEMPLATE/mt1/pyscripts create mode 120000 egs2/TEMPLATE/mt1/scripts create mode 100755 egs2/TEMPLATE/mt1/setup.sh create mode 120000 egs2/TEMPLATE/mt1/steps create mode 120000 egs2/TEMPLATE/mt1/utils create mode 100644 egs2/iwslt14/mt1/cmd.sh create mode 100644 egs2/iwslt14/mt1/conf/fbank.conf create mode 100644 egs2/iwslt14/mt1/conf/pbs.conf create mode 100644 egs2/iwslt14/mt1/conf/pitch.conf create mode 100644 egs2/iwslt14/mt1/conf/queue.conf create mode 100644 egs2/iwslt14/mt1/conf/slurm.conf create mode 120000 egs2/iwslt14/mt1/db.sh create mode 100644 egs2/iwslt14/mt1/local/data.sh create mode 100644 egs2/iwslt14/mt1/local/path.sh create mode 120000 egs2/iwslt14/mt1/mt.sh create mode 120000 egs2/iwslt14/mt1/path.sh create mode 120000 egs2/iwslt14/mt1/pyscripts create mode 100755 egs2/iwslt14/mt1/run.sh create mode 120000 egs2/iwslt14/mt1/scripts create mode 120000 egs2/iwslt14/mt1/steps create mode 120000 egs2/iwslt14/mt1/utils create mode 100644 espnet2/mt/__init__.py create mode 100644 espnet2/mt/espnet_model.py diff --git a/egs2/TEMPLATE/mt1/cmd.sh b/egs2/TEMPLATE/mt1/cmd.sh new file mode 100644 index 00000000000..2aae6919fef --- /dev/null +++ b/egs2/TEMPLATE/mt1/cmd.sh @@ -0,0 +1,110 @@ +# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ====== +# Usage: .pl [options] JOB=1: +# e.g. +# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB +# +# Options: +# --time