PaddlePaddle · SigureMo · Jul 19, 2024 · Jul 15, 2024 · Jul 16, 2024
diff --git a/python/paddle/text/datasets/wmt14.py b/python/paddle/text/datasets/wmt14.py
@@ -11,14 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import annotations
 
 import tarfile
+from typing import TYPE_CHECKING, Literal
 
 import numpy as np
 
 from paddle.dataset.common import _check_exists_and_download
 from paddle.io import Dataset
 
+if TYPE_CHECKING:
+    import numpy.typing as npt
+
+    _Wmt14DataSetMode = Literal["train", "test", "gen"]
 __all__ = []
 
 URL_DEV_TEST = (
@@ -45,12 +51,12 @@ class WMT14(Dataset):
     http://paddlemodels.bj.bcebos.com/wmt/wmt14.tgz .
 
     Args:
-        data_file(str): path to data tar file, can be set None if
-            :attr:`download` is True. Default None
-        mode(str): 'train', 'test' or 'gen'. Default 'train'
+        data_file(str|None): path to data tar file, can be set to None if
+            :attr:`download` is True. Default None.
+        mode(str): 'train', 'test' or 'gen'. Default 'train'.
         dict_size(int): word dictionary size. Default -1.
         download(bool): whether to download dataset automatically if
-            :attr:`data_file` is not set. Default True
+            :attr:`data_file` is not set. Default True.
 
     Returns:
         Dataset: Instance of WMT14 dataset
@@ -95,9 +101,21 @@ class WMT14(Dataset):
 
     """
 
+    mode: _Wmt14DataSetMode
+    data_file: str | None
+    dict_size: int
+    out_dict: dict[str, int]
+    src_ids: list[npt.NDArray[np.int_]]
+    trg_ids: list[npt.NDArray[np.int_]]
+    trg_ids_next: list[npt.NDArray[np.int_]]
+
     def __init__(
-        self, data_file=None, mode='train', dict_size=-1, download=True
-    ):
+        self,
+        data_file: str | None = None,
+        mode: _Wmt14DataSetMode = 'train',
+        dict_size: int = -1,
+        download: bool = True,
+    ) -> None:
         assert mode.lower() in [
             'train',
             'test',
@@ -119,8 +137,8 @@ def __init__(
         self.dict_size = dict_size
         self._load_data()
 
-    def _load_data(self):
-        def __to_dict(fd, size):
+    def _load_data(self) -> None:
+        def __to_dict(fd, size: int) -> dict:
             out_dict = {}
             for line_count, line in enumerate(fd):
                 if line_count < size:
@@ -181,17 +199,25 @@ def __to_dict(fd, size):
                     self.trg_ids.append(trg_ids)
                     self.trg_ids_next.append(trg_ids_next)
 
-    def __getitem__(self, idx):
+    def __getitem__(
+        self, idx: int
+    ) -> tuple[
+        npt.NDArray[np.int_],
+        npt.NDArray[np.int_],
+        npt.NDArray[np.int_],
+    ]:
         return (
             np.array(self.src_ids[idx]),
             np.array(self.trg_ids[idx]),
             np.array(self.trg_ids_next[idx]),
         )
 
-    def __len__(self):
+    def __len__(self) -> int:
         return len(self.src_ids)
 
-    def get_dict(self, reverse=False):
+    def get_dict(
+        self, reverse: bool = False
+    ) -> tuple[dict[str, int], dict[int, str]]:
         """
         Get the source and target dictionary.
 

diff --git a/python/paddle/text/datasets/wmt16.py b/python/paddle/text/datasets/wmt16.py
@@ -12,17 +12,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
 
 import os
 import tarfile
 from collections import defaultdict
+from typing import TYPE_CHECKING, Literal
 
 import numpy as np
 
 import paddle
 from paddle.dataset.common import _check_exists_and_download
 from paddle.io import Dataset
 
+if TYPE_CHECKING:
+    import numpy.typing as npt
+
+    _Wmt16DataSetMode = Literal["train", "test", "val"]
+    _Wmt16Language = Literal["en", "de"]
 __all__ = []
 
 DATA_URL = "http://paddlemodels.bj.bcebos.com/wmt/wmt16.tar.gz"
@@ -57,7 +64,7 @@ class WMT16(Dataset):
         }
 
     Args:
-        data_file(str): path to data tar file, can be set None if
+        data_file(str|None): path to data tar file, can be set to None if
             :attr:`download` is True. Default None.
         mode(str): 'train', 'test' or 'val'. Default 'train'.
         src_dict_size(int): word dictionary size for source language word. Default -1.
@@ -109,15 +116,26 @@ class WMT16(Dataset):
             55 24 25
     """
 
+    mode: _Wmt16DataSetMode
+    data_file: str | None
+    lang: _Wmt16Language
+    src_dict_size: int
+    trg_dict_size: int
+    src_dict: dict[str, int]
+    trg_dict: dict[str, int]
+    src_ids: list[npt.NDArray[np.int_]]
+    trg_ids: list[npt.NDArray[np.int_]]
+    trg_ids_next: list[npt.NDArray[np.int_]]
+
     def __init__(
         self,
-        data_file=None,
-        mode='train',
-        src_dict_size=-1,
-        trg_dict_size=-1,
-        lang='en',
-        download=True,
-    ):
+        data_file: str | None = None,
+        mode: _Wmt16DataSetMode = 'train',
+        src_dict_size: int = -1,
+        trg_dict_size: int = -1,
+        lang: _Wmt16Language = 'en',
+        download: bool = True,
+    ) -> None:
         assert mode.lower() in [
             'train',
             'test',
@@ -153,7 +171,9 @@ def __init__(
         # load data
         self.data = self._load_data()
 
-    def _load_dict(self, lang, dict_size, reverse=False):
+    def _load_dict(
+        self, lang: _Wmt16Language, dict_size: int, reverse: bool = False
+    ) -> dict[str, int] | dict[int, str]:
         dict_path = os.path.join(
             paddle.dataset.common.DATA_HOME,
             "wmt16/%s_%d.dict" % (lang, dict_size),
@@ -174,7 +194,9 @@ def _load_dict(self, lang, dict_size, reverse=False):
                     word_dict[line.strip().decode()] = idx
         return word_dict
 
-    def _build_dict(self, dict_path, dict_size, lang):
+    def _build_dict(
+        self, dict_path: str, dict_size: int, lang: _Wmt16Language
+    ) -> None:
         word_dict = defaultdict(int)
         with tarfile.open(self.data_file, mode="r") as f:
             for line in f.extractfile("wmt16/train"):
@@ -196,7 +218,7 @@ def _build_dict(self, dict_path, dict_size, lang):
                 fout.write(word[0].encode())
                 fout.write(b'\n')
 
-    def _load_data(self):
+    def _load_data(self) -> None:
         # the index for start mark, end mark, and unk are the same in source
         # language and target language. Here uses the source language
         # dictionary to determine their indices.
@@ -233,17 +255,25 @@ def _load_data(self):
                 self.trg_ids.append(trg_ids)
                 self.trg_ids_next.append(trg_ids_next)
 
-    def __getitem__(self, idx):
+    def __getitem__(
+        self, idx: int
+    ) -> tuple[
+        npt.NDArray[np.int_],
+        npt.NDArray[np.int_],
+        npt.NDArray[np.int_],
+    ]:
         return (
             np.array(self.src_ids[idx]),
             np.array(self.trg_ids[idx]),
             np.array(self.trg_ids_next[idx]),
         )
 
-    def __len__(self):
+    def __len__(self) -> int:
         return len(self.src_ids)
 
-    def get_dict(self, lang, reverse=False):
+    def get_dict(
+        self, lang: _Wmt16Language, reverse: bool = False
+    ) -> dict[str, int] | dict[int, str]:
         """
         return the word dictionary for the specified language.