[Hackathon 7th] Fix s2t example errors #3950

Merged: 2 commits, Dec 18, 2024
2 changes: 1 addition & 1 deletion examples/ted_en_zh/st0/README.md

````diff
@@ -127,7 +127,7 @@ source path.sh
 bash ./local/data.sh
 CUDA_VISIBLE_DEVICES= ./local/train.sh conf/transformer_mtl_noam.yaml transformer_mtl_noam
 avg.sh latest exp/transformer_mtl_noam/checkpoints 5
-CUDA_VISIBLE_DEVICES= ./local/test.sh conf/transformer_mtl_noam.yaml exp/transformer_mtl_noam/checkpoints/avg_5
+CUDA_VISIBLE_DEVICES= ./local/test.sh conf/transformer_mtl_noam.yaml conf/tuning/decode.yaml exp/transformer_mtl_noam/checkpoints/avg_5
 ```
 The performance of the released models are shown below:
 ### Transformer
````
3 changes: 0 additions & 3 deletions paddlespeech/s2t/exps/u2_st/bin/test.py

```diff
@@ -34,9 +34,6 @@ def main(config, args):

 if __name__ == "__main__":
     parser = default_argument_parser()
-    # save asr result to
-    parser.add_argument(
-        "--result_file", type=str, help="path of save the asr result")
     args = parser.parse_args()
     print_arguments(args, globals())
```
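The deleted lines removed a duplicate flag: `--result_file` is presumably already registered by `default_argument_parser()` (an assumption; the diff alone does not show it), and argparse refuses to register the same option string twice. A minimal standalone sketch of that failure mode:

```python
import argparse

# Hypothetical stand-in for paddlespeech's default_argument_parser(),
# assuming it already registers --result_file.
def default_argument_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("--result_file", type=str, help="path to save the result")
    return parser

parser = default_argument_parser()
try:
    # Registering the same option string again is rejected by argparse.
    parser.add_argument("--result_file", type=str, help="duplicate flag")
except argparse.ArgumentError as e:
    print(e)  # argument --result_file: conflicting option string: --result_file
```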
4 changes: 4 additions & 0 deletions paddlespeech/s2t/frontend/featurizer/text_featurizer.py

```diff
@@ -115,6 +115,10 @@ def defeaturize(self, idxs):
         """
         assert self.vocab_path_or_list, "toidx need vocab path or vocab list"
         tokens = []
+        # unwrap `idxs` like `[[1, 2, 3]]`
+        if idxs and isinstance(idxs[0], (list, tuple)) and len(idxs) == 1:
+            idxs = idxs[0]
+
         for idx in idxs:
             if idx == self.eos_id:
                 break
```
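The new guard flattens a singly nested id list before decoding. A simplified sketch of just that guard, pulled out of the featurizer to show which shapes it touches (`unwrap` is a hypothetical helper name, not part of TextFeaturizer):

```python
def unwrap(idxs):
    # [[1, 2, 3]] -> [1, 2, 3]; flat lists and multi-utterance batches
    # such as [[1, 2], [3, 4]] pass through unchanged.
    if idxs and isinstance(idxs[0], (list, tuple)) and len(idxs) == 1:
        return idxs[0]
    return idxs

assert unwrap([[1, 2, 3]]) == [1, 2, 3]
assert unwrap([1, 2, 3]) == [1, 2, 3]
assert unwrap([[1, 2], [3, 4]]) == [[1, 2], [3, 4]]
```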
6 changes: 6 additions & 0 deletions paddlespeech/s2t/io/dataloader.py

```diff
@@ -404,6 +404,12 @@ def get_dataloader(mode: str, config, args):
         config['subsampling_factor'] = 1
         config['num_encs'] = 1
         config['shortest_first'] = False
+        config['minibatches'] = 0
+        config['batch_count'] = 'auto'
+        config['batch_bins'] = 0
+        config['batch_frames_in'] = 0
+        config['batch_frames_out'] = 0
+        config['batch_frames_inout'] = 0
     elif mode == 'valid':
         config['manifest'] = config.dev_manifest
         config['train_mode'] = False
```

Review thread on `config['minibatches'] = 0`:

Collaborator: load the params from config?

Contributor (author): Can you elaborate? This `config` is a clone; these keys were never in it to begin with, so where would they be loaded from? Having default values seems best.

Collaborator: OK.
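As the author notes, the cloned config simply lacks these batchfy keys in test mode, so hard-coded defaults are the pragmatic fix. A hedged sketch of the alternative the reviewer's question gestures at, honoring a value when the config does carry one, written against a plain dict rather than PaddleSpeech's actual config object:

```python
# Hypothetical helper, not the code merged in this PR.
BATCH_DEFAULTS = {
    'minibatches': 0,
    'batch_count': 'auto',
    'batch_bins': 0,
    'batch_frames_in': 0,
    'batch_frames_out': 0,
    'batch_frames_inout': 0,
}

def apply_batch_defaults(config: dict) -> None:
    for key, default in BATCH_DEFAULTS.items():
        # setdefault keeps a value that is already present and only
        # fills in the missing keys.
        config.setdefault(key, default)
```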
8 changes: 4 additions & 4 deletions paddlespeech/s2t/models/u2_st/u2_st.py

```diff
@@ -170,8 +170,8 @@ def _calc_st_loss(
         ys_in_lens = ys_pad_lens + 1

         # 1. Forward decoder
-        decoder_out, _ = self.st_decoder(encoder_out, encoder_mask, ys_in_pad,
-                                         ys_in_lens)
+        decoder_out, *_ = self.st_decoder(encoder_out, encoder_mask, ys_in_pad,
+                                          ys_in_lens)

         # 2. Compute attention loss
         loss_att = self.criterion_att(decoder_out, ys_out_pad)
@@ -203,8 +203,8 @@ def _calc_att_loss(
         ys_in_lens = ys_pad_lens + 1

         # 1. Forward decoder
-        decoder_out, _ = self.decoder(encoder_out, encoder_mask, ys_in_pad,
-                                      ys_in_lens)
+        decoder_out, *_ = self.decoder(encoder_out, encoder_mask, ys_in_pad,
+                                       ys_in_lens)

         # 2. Compute attention loss
         loss_att = self.criterion_att(decoder_out, ys_out_pad)
```
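Both call sites switch from `decoder_out, _ = ...` to `decoder_out, *_ = ...`: starred unpacking accepts any number of trailing return values, so the loss code keeps working whether the decoder returns two or three tensors. A standalone illustration:

```python
# Toy decoders standing in for the real ones; names are illustrative only.
def two_value_decoder():
    return "decoder_out", "olens"

def three_value_decoder():
    return "decoder_out", "r_decoder_out", "olens"

# `out, _ = three_value_decoder()` would raise
# ValueError: too many values to unpack (expected 2)
out, *rest = two_value_decoder()    # rest == ["olens"]
out, *rest = three_value_decoder()  # rest == ["r_decoder_out", "olens"]
```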
16 changes: 8 additions & 8 deletions paddlespeech/s2t/modules/decoder.py

```diff
@@ -110,14 +110,14 @@ def __init__(self,
                 concat_after=concat_after, ) for _ in range(num_blocks)
         ])

-    def forward(
-            self,
-            memory: paddle.Tensor,
-            memory_mask: paddle.Tensor,
-            ys_in_pad: paddle.Tensor,
-            ys_in_lens: paddle.Tensor,
-            r_ys_in_pad: paddle.Tensor=paddle.empty([0]),
-            reverse_weight: float=0.0) -> Tuple[paddle.Tensor, paddle.Tensor]:
+    def forward(self,
+                memory: paddle.Tensor,
+                memory_mask: paddle.Tensor,
+                ys_in_pad: paddle.Tensor,
+                ys_in_lens: paddle.Tensor,
+                r_ys_in_pad: paddle.Tensor=paddle.empty([0]),
+                reverse_weight: float=0.0
+                ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
         """Forward decoder.
         Args:
             memory: encoded memory, float32 (batch, maxlen_in, feat)
```

Review thread on `def forward(self,`:

Collaborator: only code style changed?

Contributor (author): The return type hint changed from `-> Tuple[paddle.Tensor, paddle.Tensor]:` to `-> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:`.

Collaborator: OK.
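Per the thread above, the substantive change is the widened return annotation. Assuming the three tensors are the left-to-right logits, the right-to-left logits from the reverse pass, and the output lengths (a reading based on WeNet-style bidirectional decoders, not stated in this diff), a minimal stub of the new contract looks like:

```python
from typing import Tuple

import paddle

# Stub only: the tensor meanings are an assumption, not taken from the diff.
def forward_stub() -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
    l_x = paddle.zeros([1, 4, 10])  # left-to-right decoder logits
    r_x = paddle.zeros([1, 4, 10])  # right-to-left logits (reverse pass)
    olens = paddle.to_tensor([4])   # valid output lengths per batch item
    return l_x, r_x, olens
```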