Merge pull request #176 from AugustJW/main

add models GP-VAE/USGAN

WenjieDu authored Sep 21, 2023
2 parents a329f79 + 0a6b37a commit 9bfffa1

Showing 18 changed files with 1,608 additions and 14 deletions.
2 changes: 1 addition & 1 deletion docs/about_us.rst

@@ -33,5 +33,5 @@ PyPOTS exists thanks to all the nice people (sorted by contribution time) who co

 .. raw:: html

-    <object data="https://pypots.com/figs/PyPOTS_contributors.svg">
+    <object data="https://pypots.com/figs/pypots_logos/PyPOTS_contributors.svg">
     </object>
5 changes: 3 additions & 2 deletions pypots/base.py

@@ -96,7 +96,9 @@ def _setup_device(self, device: Union[None, str, torch.device, list]):
             self.device = device
         elif isinstance(device, list):
             if len(device) == 0:
-                raise ValueError("The list of devices should have at least 1 device, but got 0.")
+                raise ValueError(
+                    "The list of devices should have at least 1 device, but got 0."
+                )
             elif len(device) == 1:
                 return self._setup_device(device[0])
             # training in parallel on multiple CUDA devices
@@ -176,7 +178,6 @@ def _send_data_to_given_device(self, data):
         if isinstance(self.device, torch.device):  # single device
             data = map(lambda x: x.to(self.device), data)
         else:  # training in parallel on multiple devices
-
             # randomly choose one device to balance the workload
             # device = np.random.choice(self.device)
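For context, the first hunk above reformats the validation of a user-supplied device specification. Below is a minimal standalone sketch of the same pattern, written as a simplified free function; the real logic is the `_setup_device` method on the PyPOTS base class, and this sketch is an assumption-laden illustration, not the actual implementation:

from typing import Union

import torch


def setup_device(device: Union[None, str, torch.device, list]):
    # simplified sketch of the list-handling branch, not the PyPOTS code itself
    if isinstance(device, list):
        if len(device) == 0:
            raise ValueError(
                "The list of devices should have at least 1 device, but got 0."
            )
        elif len(device) == 1:
            # a one-item list degenerates to the single-device case
            return setup_device(device[0])
        # multiple entries: keep the list for data-parallel training
        return [d if isinstance(d, torch.device) else torch.device(d) for d in device]
    if device is None:
        # default to CUDA when available, otherwise CPU
        return torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return device if isinstance(device, torch.device) else torch.device(device)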
1 change: 0 additions & 1 deletion pypots/classification/base.py

@@ -256,7 +256,6 @@ def _train_model(
         training_loader: DataLoader,
         val_loader: DataLoader = None,
     ) -> None:
-
         # each training starts from the very beginning, so reset the loss and model dict here
         self.best_loss = float("inf")
         self.best_model_dict = None
1 change: 0 additions & 1 deletion pypots/classification/raindrop/modules.py

@@ -174,7 +174,6 @@ def forward(
         edge_attr: OptTensor = None,
         return_attention_weights=None,
     ) -> Tuple[torch.Tensor, Any]:
-
         r"""
         Args:
             return_attention_weights (bool, optional): If set to :obj:`True`,
1 change: 0 additions & 1 deletion pypots/clustering/base.py

@@ -244,7 +244,6 @@ def _train_model(
         training_loader: DataLoader,
         val_loader: DataLoader = None,
     ) -> None:
-
         """
         Parameters
1 change: 0 additions & 1 deletion pypots/clustering/crli/model.py

@@ -226,7 +226,6 @@ def __init__(
         saving_path: Optional[str] = None,
         model_saving_strategy: Optional[str] = "best",
     ):
-
         super().__init__(
             n_clusters,
             batch_size,
2 changes: 0 additions & 2 deletions pypots/clustering/vader/model.py

@@ -184,7 +184,6 @@ def forward(
         ) = self.get_results(X, missing_mask)

         if not training and not pretrain:
-
             results = {
                 "mu_tilde": mu_tilde,
                 "mu": mu_c,
@@ -403,7 +402,6 @@ def _train_model(
         training_loader: DataLoader,
         val_loader: DataLoader = None,
     ) -> None:
-
         # each training starts from the very beginning, so reset the loss and model dict here
         self.best_loss = float("inf")
         self.best_model_dict = None
1 change: 0 additions & 1 deletion pypots/forecasting/base.py

@@ -242,7 +242,6 @@ def _train_model(
         training_loader: DataLoader,
         val_loader: DataLoader = None,
     ) -> None:
-
         # each training starts from the very beginning, so reset the loss and model dict here
         self.best_loss = float("inf")
         self.best_model_dict = None
6 changes: 5 additions & 1 deletion pypots/imputation/__init__.py

@@ -6,15 +6,19 @@
 # License: GPL-v3

 from .brits import BRITS
+from .gpvae import GPVAE
 from .locf import LOCF
+from .mrnn import MRNN
 from .saits import SAITS
 from .transformer import Transformer
-from .mrnn import MRNN
+from .usgan import USGAN

 __all__ = [
     "SAITS",
     "Transformer",
     "BRITS",
     "MRNN",
     "LOCF",
+    "GPVAE",
+    "USGAN",
 ]
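With these exports in place, both new models become importable straight from `pypots.imputation`. A minimal usage sketch follows; the hyperparameter names (`n_steps`, `n_features`, `latent_size`, `rnn_hidden_size`, `epochs`) are assumptions for illustration and may not match the exact constructor signatures added in this PR:

import numpy as np

from pypots.imputation import GPVAE, USGAN

# toy data: 100 samples, 48 time steps, 35 features, with artificial missingness
X = np.random.randn(100, 48, 35)
X[X < -1.5] = np.nan
train_set = {"X": X}

# hyperparameter names below are illustrative, not the verified API
gpvae = GPVAE(n_steps=48, n_features=35, latent_size=16, epochs=5)
gpvae.fit(train_set)
X_imputed = gpvae.impute(train_set)

usgan = USGAN(n_steps=48, n_features=35, rnn_hidden_size=128, epochs=5)
usgan.fit(train_set)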
12 changes: 12 additions & 0 deletions pypots/imputation/gpvae/__init__.py

@@ -0,0 +1,12 @@
+"""
+The package of the partially-observed time-series imputation method GP-VAE.
+"""
+
+# Created by Jun Wang <jwangfx@connect.ust.hk>
+# License: GPL-v3
+
+from .model import GPVAE
+
+__all__ = [
+    "GPVAE",
+]
133 changes: 133 additions & 0 deletions pypots/imputation/gpvae/data.py

@@ -0,0 +1,133 @@
+"""
+Dataset class for model GP-VAE.
+"""
+
+# Created by Jun Wang <jwangfx@connect.ust.hk> and Wenjie Du <wenjay.du@gmail.com>
+# License: GPL-v3
+
+from typing import Union, Iterable
+
+import torch
+
+from ...data.base import BaseDataset
+from ...data.utils import torch_parse_delta
+
+
+class DatasetForGPVAE(BaseDataset):
+    """Dataset class for GP-VAE.
+
+    Parameters
+    ----------
+    data : dict or str,
+        The dataset for model input, should be a dictionary including the keys 'X' and 'y',
+        or a path string locating a data file.
+        If it is a dict, X should be array-like with shape [n_samples, sequence length (time steps), n_features],
+        i.e. the time-series input data, which can contain missing values, and y should be array-like with
+        shape [n_samples], i.e. the classification labels of X.
+        If it is a path string, the path should point to a data file, e.g. an h5 file, which contains
+        key-value pairs like a dict and must include the keys 'X' and 'y'.
+
+    return_labels : bool, default = True,
+        Whether to return labels in __getitem__() if they exist in the given data. If `True`, for example
+        during the training of classification models, the Dataset class will return labels in __getitem__()
+        for model input. Otherwise, labels won't be included in the data returned by __getitem__().
+        This parameter exists because the same Dataset class serves all training/validating/testing stages:
+        big datasets stored in h5 files already have both X and y saved, but labels must not be read from
+        the file during validating and testing with _fetch_data_from_file(), which works for all three
+        stages. Hence this parameter makes the distinction.
+
+    file_type : str, default = "h5py"
+        The type of the given file if `data` is a path string.
+    """
+
+    def __init__(
+        self,
+        data: Union[dict, str],
+        return_labels: bool = True,
+        file_type: str = "h5py",
+    ):
+        super().__init__(data, return_labels, file_type)
+
+        if not isinstance(self.data, str):
+            # precompute the missing masks and replace NaNs in X here
+            missing_mask = (~torch.isnan(self.X)).type(torch.float32)
+            X = torch.nan_to_num(self.X)
+
+            self.processed_data = {
+                "X": X,
+                "missing_mask": missing_mask,
+            }
+
+    def _fetch_data_from_array(self, idx: int) -> Iterable:
+        """Fetch data from self.X if it is given.
+
+        Parameters
+        ----------
+        idx : int,
+            The index of the sample to be returned.
+
+        Returns
+        -------
+        sample : list,
+            A list containing
+
+            index : int tensor,
+                The index of the sample.
+
+            X : tensor,
+                The feature vector for model input.
+
+            missing_mask : tensor,
+                The mask indicating all missing values in X.
+
+            label (optional) : tensor,
+                The target label of the time-series sample.
+        """
+        sample = [
+            torch.tensor(idx),
+            # for forward
+            self.processed_data["X"][idx].to(torch.float32),
+            self.processed_data["missing_mask"][idx].to(torch.float32),
+        ]
+
+        if self.y is not None and self.return_labels:
+            sample.append(self.y[idx].to(torch.long))
+
+        return sample
+
+    def _fetch_data_from_file(self, idx: int) -> Iterable:
+        """Fetch data with the lazy-loading strategy, i.e. only loading data from the file when samples
+        are requested. The opened file handle doesn't load the entire dataset into RAM but only loads
+        the currently accessed slice.
+
+        Parameters
+        ----------
+        idx : int,
+            The index of the sample to be returned.
+
+        Returns
+        -------
+        sample : list,
+            The collated data sample, a list including all necessary sample info.
+        """
+        if self.file_handle is None:
+            self.file_handle = self._open_file_handle()
+
+        X = torch.from_numpy(self.file_handle["X"][idx])
+        missing_mask = (~torch.isnan(X)).to(torch.float32)
+        X = torch.nan_to_num(X)
+
+        sample = [
+            torch.tensor(idx),
+            X,
+            missing_mask,
+        ]
+
+        # if the dataset has labels and is for training, then fetch them from the file
+        if "y" in self.file_handle.keys() and self.return_labels:
+            sample.append(torch.tensor(self.file_handle["y"][idx], dtype=torch.long))
+
+        return sample
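Taken together, `DatasetForGPVAE` accepts either an in-memory dict or a path to an h5 file. A minimal usage sketch under assumed shapes follows; the file path and data dimensions are illustrative only:

import numpy as np
from torch.utils.data import DataLoader

from pypots.imputation.gpvae.data import DatasetForGPVAE

# in-memory mode: missing masks are precomputed in __init__
X = np.random.randn(100, 48, 35).astype("float32")
X[X < -1.5] = np.nan  # inject artificial missingness
dataset = DatasetForGPVAE({"X": X}, return_labels=False)

loader = DataLoader(dataset, batch_size=32, shuffle=True)
idx, X_batch, missing_mask = next(iter(loader))  # 3-item samples, no labels
print(X_batch.shape, missing_mask.shape)  # both torch.Size([32, 48, 35])

# file mode: pass a path string instead; _fetch_data_from_file() then loads
# each requested slice lazily from the (hypothetical) h5 file
# dataset = DatasetForGPVAE("path/to/dataset.h5", return_labels=False)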