1515from __future__ import annotations
1616
1717import glob
18- from datetime import datetime , timedelta
18+ import os .path as osp
19+ from datetime import datetime
20+ from datetime import timedelta
1921from typing import Dict
22+ from typing import List
2023from typing import Optional
2124from typing import Tuple
2225
@@ -31,16 +34,15 @@ class MRMSDataset(io.Dataset):
3134 """Class for MRMS dataset. MRMS day's data is stored in a .h5 file. Each file includes keys "date"/"time_interval"/"dataset".
3235
3336 Args:
34- file_path (str): Data set path.
37+ file_path (str): Dataset path.
3538 input_keys (Tuple[str, ...]): Input keys, usually there is only one, such as ("input",).
3639 label_keys (Tuple[str, ...]): Output keys, usually there is only one, such as ("output",).
3740 weight_dict (Optional[Dict[str, float]]): Weight dictionary. Defaults to None.
3841 date_period (Tuple[str,...], optional): Dates of data. Scale is [start_date, end_date] with format "%Y%m%d". Defaults to ("20230101","20230101").
3942 num_input_timestamps (int, optional): Number of input timestamps. Defaults to 1.
4043 num_label_timestamps (int, optional): Number of label timestamps. Defaults to 1.
4144 stride (int, optional): Stride of sampling data. Defaults to 1.
42- transforms (Optional[vision.Compose]): Compose object contains sample wise
43- transform(s). Defaults to None.
45+ transforms (Optional[vision.Compose]): Composed transform functor(s). Defaults to None.
4446
4547 Examples:
4648 >>> import ppsci
@@ -65,12 +67,12 @@ def __init__(
6567 input_keys : Tuple [str , ...],
6668 label_keys : Tuple [str , ...],
6769 weight_dict : Optional [Dict [str , float ]] = None ,
68- date_period : Tuple [str ,...] = ("20230101" ,"20230101" ),
70+ date_period : Tuple [str , ...] = ("20230101" , "20230101" ),
6971 num_input_timestamps : int = 1 ,
7072 num_label_timestamps : int = 1 ,
7173 stride : int = 1 ,
7274 transforms : Optional [vision .Compose ] = None ,
73- ):
75+ ):
7476 super ().__init__ ()
7577 self .file_path = file_path
7678 self .input_keys = input_keys
@@ -81,17 +83,22 @@ def __init__(
8183 self .weight_dict = {key : 1.0 for key in self .label_keys }
8284 self .weight_dict .update (weight_dict )
8385
84- self .date_list = self .get_date_strs (date_period )
86+ self .date_list = self ._get_date_strs (date_period )
8587 self .num_input_timestamps = num_input_timestamps
8688 self .num_label_timestamps = num_label_timestamps
8789 self .stride = stride
8890 self .transforms = transforms
8991
90- self .files = self .read_data (file_path )
92+ self .files = self ._read_data (file_path )
9193 self .num_samples_per_day = self .files [0 ].shape [0 ]
9294 self .num_samples = self .num_samples_per_day * len (self .date_list )
93-
94- def get_date_strs (self , date_period ):
95+
96+ def _get_date_strs (self , date_period : Tuple [str , ...]) -> List :
97+ """Get a string list of all dates within given period.
98+
99+ Args:
100+ date_period (Tuple[str,...]): Dates of data. Scale is [start_date, end_date] with format "%Y%m%d".
101+ """
95102 start_time = datetime .strptime (date_period [0 ], "%Y%m%d" )
96103 end_time = datetime .strptime (date_period [1 ], "%Y%m%d" )
97104 results = []
@@ -102,31 +109,48 @@ def get_date_strs(self, date_period):
102109 current_time += timedelta (days = 1 )
103110 return results
104111
105- def read_data (self , path : str , var = "dataset" ):
106- paths = [path ] if path .endswith (".h5" ) else [_path for _path in glob .glob (path + "/*.h5" ) if _path .split (".h5" )[0 ].split ("_" )[- 1 ] in self .date_list ]
107- assert len (paths ) == len (self .date_list ), f"Data of { len (self .date_list )} mouths wanted but only { len (paths )} mouths be found"
112+ def _read_data (self , path : str ):
113+ if path .endswith (".h5" ):
114+ paths = [path ]
115+ else :
116+ paths = [
117+ _path
118+ for _path in glob .glob (osp .join (path , "*.h5" ))
119+ if _path .split (".h5" )[0 ].split ("_" )[- 1 ] in self .date_list
120+ ]
121+ assert len (paths ) == len (
122+ self .date_list
123+ ), f"Data of { len (self .date_list )} days wanted but only { len (paths )} days be found"
108124 paths .sort ()
109-
110- files = []
111- for _path in paths :
112- _file = h5py .File (_path , "r" )
113- files .append (_file [var ])
125+
126+ files = [h5py .File (_path , "r" )["dataset" ] for _path in paths ]
114127 return files
115128
116129 def __len__ (self ):
117- return self .num_samples // self .stride - self .num_input_timestamps - self .num_label_timestamps + 1
130+ return (
131+ self .num_samples // self .stride
132+ - self .num_input_timestamps
133+ - self .num_label_timestamps
134+ + 1
135+ )
118136
119137 def __getitem__ (self , global_idx ):
120138 global_idx *= self .stride
121- _samples = np .empty ((self .num_input_timestamps + self .num_label_timestamps , * self .files [0 ].shape [1 :]), dtype = paddle .get_default_dtype ())
122- for idx in range (self .num_input_timestamps + self .num_label_timestamps ):
123- sample_idx = global_idx + idx * self .stride
139+ _samples = np .empty (
140+ (
141+ self .num_input_timestamps + self .num_label_timestamps ,
142+ * self .files [0 ].shape [1 :],
143+ ),
144+ dtype = paddle .get_default_dtype (),
145+ )
146+ for idx in range (self .num_input_timestamps + self .num_label_timestamps ):
147+ sample_idx = global_idx + idx * self .stride
124148 day_idx = sample_idx // self .num_samples_per_day
125149 local_idx = sample_idx % self .num_samples_per_day
126- _samples [idx ]= self .files [day_idx ][local_idx ]
150+ _samples [idx ] = self .files [day_idx ][local_idx ]
127151
128- input_item = {self .input_keys [0 ]: _samples [:self .num_input_timestamps ]}
129- label_item = {self .label_keys [0 ]: _samples [self .num_input_timestamps :]}
152+ input_item = {self .input_keys [0 ]: _samples [: self .num_input_timestamps ]}
153+ label_item = {self .label_keys [0 ]: _samples [self .num_input_timestamps :]}
130154
131155 weight_shape = [1 ] * len (next (iter (label_item .values ())).shape )
132156 weight_item = {
@@ -143,17 +167,16 @@ def __getitem__(self, global_idx):
143167
144168
145169class MRMSSampledDataset (io .Dataset ):
146- """Class for MRMS sampled dataset.MRMS one sample's data is stored in a .h5 file. Each file includes keys "date"/"time_interval"/"dataset".
170+ """Class for MRMS sampled dataset. MRMS one sample's data is stored in a .h5 file. Each file includes keys "date"/"time_interval"/"dataset".
147171 The class just return data by input_item and values of label_item are empty for all label_keys.
148172
149173 Args:
150- file_path (str): Data set path.
174+ file_path (str): Dataset path.
151175 input_keys (Tuple[str, ...]): Input keys, such as ("input",).
152176 label_keys (Tuple[str, ...]): Output keys, such as ("output",).
153177 weight_dict (Optional[Dict[str, float]]): Weight dictionary. Defaults to None.
154178 num_total_timestamps (int, optional): Number of timestamps of input+label. Defaults to 1.
155- transforms (Optional[vision.Compose]): Compose object contains sample wise
156- transform(s). Defaults to None.
179+ transforms (Optional[vision.Compose]): Composed transform functor(s). Defaults to None.
157180
158181 Examples:
159182 >>> import ppsci
@@ -192,16 +215,13 @@ def __init__(
192215 self .num_total_timestamps = num_total_timestamps
193216 self .transforms = transforms
194217
195- self .files = self .read_data (file_path )
218+ self .files = self ._read_data (file_path )
196219 self .num_samples = len (self .files )
197220
198- def read_data (self , path : str ):
199- paths = glob .glob (path + "/ *.h5" )
221+ def _read_data (self , path : str ):
222+ paths = glob .glob (osp . join ( path , " *.h5") )
200223 paths .sort ()
201- files = []
202- for _path in paths :
203- _file = h5py .File (_path , "r" )
204- files .append (_file )
224+ files = [h5py .File (_path , "r" )["dataset" ] for _path in paths ]
205225 return files
206226
207227 def __len__ (self ):
@@ -210,20 +230,15 @@ def __len__(self):
210230 def __getitem__ (self , global_idx ):
211231 _samples = []
212232 for idx in range (global_idx , global_idx + self .num_total_timestamps ):
213- _samples .append (np .expand_dims (self .files [idx ][ "dataset" ], axis = 0 ))
233+ _samples .append (np .expand_dims (self .files [idx ], axis = 0 ))
214234
215- input_item = {self .input_keys [0 ]: np .concatenate (_samples , axis = 0 ).astype (paddle .get_default_dtype ())}
235+ input_item = {
236+ self .input_keys [0 ]: np .concatenate (_samples , axis = 0 ).astype (
237+ paddle .get_default_dtype ()
238+ )
239+ }
216240 label_item = {}
217- for key in self .label_keys :
218- label_item [key ] = np .asarray ([], paddle .get_default_dtype ())
219-
220241 weight_item = {}
221- if len (label_item ) > 0 :
222- weight_shape = [1 ] * len (next (iter (label_item .values ())).shape )
223- weight_item = {
224- key : np .full (weight_shape , value , paddle .get_default_dtype ())
225- for key , value in self .weight_dict .items ()
226- }
227242
228243 if self .transforms is not None :
229244 input_item , label_item , weight_item = self .transforms (
0 commit comments