diff --git a/.github/workflows/testing_ci.yml b/.github/workflows/testing_ci.yml index 10572236..d9380be9 100644 --- a/.github/workflows/testing_ci.yml +++ b/.github/workflows/testing_ci.yml @@ -78,8 +78,8 @@ jobs: - name: Test with pytest run: | - python tests/global_test_config.py rm -rf testing_results && rm -rf tests/__pycache__ && rm -rf tests/*/__pycache__ + python tests/global_test_config.py python -m pytest -rA tests/*/* -s -n auto --cov=pypots --dist=loadgroup --cov-config=.coveragerc - name: Generate the LCOV report diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 0be3b71a..2d79de72 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -31,6 +31,9 @@ build: - pip install ./TSDB_repo && pip install ./PyGrinder_repo && pip install . post_install: + # To fix the exception: This documentation is not using `furo.css` as the stylesheet. + # If you have set `html_style` in your conf.py file, remove it. + - pip install sphinx==7.2.6 + # this docutils version fixes issue#102, put it in post_install to avoid being + # overwritten by other versions (like 0.19) while installing other packages - pip install docutils==0.20 - # this version fixes issue#102, put it in post_install to avoid being - # overwritten by other versions (like 0.19) while installing other packages diff --git a/README.md b/README.md index 3152f3c4..b04fb585 100644 --- a/README.md +++ b/README.md @@ -228,6 +228,7 @@ the same as we did in [SAITS paper](https://arxiv.org/pdf/2202.08516).** | Neural Net | VaDER | Variational Deep Embedding with Recurrence [^7] | 2019 | | ***`Forecasting`*** | 🚥 | 🚥 | 🚥 | | **Type** | **Abbr.** | **Full name of the algorithm/model/paper** | **Year** | +| Neural Net | CSDI | Conditional Score-based Diffusion Models for Probabilistic Time Series Imputation [^12] | 2021 | | Probabilistic | BTTF | Bayesian Temporal Tensor Factorization [^8] | 2021 | diff --git a/docs/index.rst b/docs/index.rst index 61a6e9b6..9bfe69bf 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -193,7 +193,7 @@ Imputation Neural Net FEDformer (Frequency Enhanced De Imputation Neural Net Informer (Beyond Efficient Transformer for Long Sequence Time-Series Forecasting) 2021 :cite:`zhou2021informer` Imputation Neural Net Autoformer (Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting) 2021 :cite:`wu2021autoformer` Imputation Neural Net US-GAN (Unsupervised GAN for Multivariate Time Series Imputation) 2021 :cite:`miao2021SSGAN` -Imputation Neural Net CSDI (Conditional Score-based Diffusion Models for Probabilistic Time Series Imputation) 2021 :cite:`tashiro2021csdi` +Imputation, Forecasting Neural Net CSDI (Conditional Score-based Diffusion Models for Probabilistic Time Series Imputation) 2021 :cite:`tashiro2021csdi` Imputation Neural Net GP-VAE (Gaussian Process Variational Autoencoder) 2020 :cite:`fortuin2020gpvae` Imputation, Classification Neural Net BRITS (Bidirectional Recurrent Imputation for Time Series) 2018 :cite:`cao2018BRITS` Imputation Neural Net M-RNN (Multi-directional Recurrent Neural Network) 2019 :cite:`yoon2019MRNN` diff --git a/docs/pypots.data.rst b/docs/pypots.data.rst index 624f7b1d..79fde3fb 100644 --- a/docs/pypots.data.rst +++ b/docs/pypots.data.rst @@ -1,10 +1,10 @@ pypots.data package =================== -pypots.data.base +pypots.data.dataset ----------------------- -.. automodule:: pypots.data.base +.. 
automodule:: pypots.data.dataset :members: :undoc-members: :show-inheritance: diff --git a/pypots/base.py b/pypots/base.py index ac3287c5..699fc098 100644 --- a/pypots/base.py +++ b/pypots/base.py @@ -337,13 +337,13 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: """Train the classifier on the given data. Parameters ---------- - train_set : dict or str + train_set : The dataset for model training, should be a dictionary including keys as 'X', or a path string locating a data file supported by PyPOTS (e.g. h5 file). If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], @@ -352,7 +352,7 @@ def fit( If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include keys as 'X' and 'y'. - val_set : dict or str + val_set : The dataset for model validating, should be a dictionary including keys as 'X', or a path string locating a data file supported by PyPOTS (e.g. h5 file). If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], @@ -361,7 +361,7 @@ def fit( If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include keys as 'X' and 'y'. - file_type : str + file_type : The type of the given file if train_set and val_set are path strings. """ @@ -371,13 +371,13 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: """Make predictions for the input data with the trained model. Parameters ---------- - test_set : dict or str + test_set : The dataset for model validating, should be a dictionary including keys as 'X', or a path string locating a data file supported by PyPOTS (e.g. h5 file). If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], @@ -386,12 +386,12 @@ def predict( If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include keys as 'X' and 'y'. - file_type : str + file_type : The type of the given file if test_set is a path string. Returns ------- - result_dict: dict + result_dict : Prediction results in a Python Dictionary for the given samples. It should be a dictionary including keys as 'imputation', 'classification', 'clustering', and 'forecasting'. For sure, only the keys that relevant tasks are supported by the model will be returned. @@ -512,7 +512,7 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: raise NotImplementedError @@ -520,6 +520,6 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: raise NotImplementedError diff --git a/pypots/classification/base.py b/pypots/classification/base.py index 50bd5afd..817302ec 100644 --- a/pypots/classification/base.py +++ b/pypots/classification/base.py @@ -72,7 +72,7 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: """Train the classifier on the given data. 
@@ -106,7 +106,7 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: raise NotImplementedError @@ -114,7 +114,7 @@ def predict( def classify( self, X: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Classify the input data with the trained model. @@ -214,12 +214,12 @@ def __init__( self.n_classes = n_classes @abstractmethod - def _assemble_input_for_training(self, data) -> dict: + def _assemble_input_for_training(self, data: list) -> dict: """Assemble the given data into a dictionary for training input. Parameters ---------- - data : list, + data : Input data from dataloader, should be list. Returns @@ -230,12 +230,12 @@ def _assemble_input_for_training(self, data) -> dict: raise NotImplementedError @abstractmethod - def _assemble_input_for_validating(self, data) -> dict: + def _assemble_input_for_validating(self, data: list) -> dict: """Assemble the given data into a dictionary for validating input. Parameters ---------- - data : list, + data : Data output from dataloader, should be list. Returns @@ -246,7 +246,7 @@ def _assemble_input_for_validating(self, data) -> dict: raise NotImplementedError @abstractmethod - def _assemble_input_for_testing(self, data) -> dict: + def _assemble_input_for_testing(self, data: list) -> dict: """Assemble the given data into a dictionary for testing input. Notes @@ -259,7 +259,7 @@ def _assemble_input_for_testing(self, data) -> dict: Parameters ---------- - data : list, + data : Data output from dataloader, should be list. Returns @@ -386,7 +386,7 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: """Train the classifier on the given data. @@ -420,7 +420,7 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: raise NotImplementedError @@ -428,7 +428,7 @@ def predict( def classify( self, X: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Classify the input data with the trained model. diff --git a/pypots/classification/brits/data.py b/pypots/classification/brits/data.py index 2c5c2581..663ba81d 100644 --- a/pypots/classification/brits/data.py +++ b/pypots/classification/brits/data.py @@ -17,7 +17,7 @@ class DatasetForBRITS(DatasetForBRITS_Imputation): Parameters ---------- - data : dict or str, + data : The dataset for model input, should be a dictionary including keys as 'X' and 'y', or a path string locating a data file. If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], @@ -26,7 +26,7 @@ class DatasetForBRITS(DatasetForBRITS_Imputation): If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include keys as 'X' and 'y'. - return_labels : bool, default = True, + return_y : Whether to return labels in function __getitem__() if they exist in the given data. If `True`, for example, during training of classification models, the Dataset class will return labels in __getitem__() for model input. Otherwise, labels won't be included in the data returned by __getitem__(). This parameter exists because we @@ -35,14 +35,19 @@ class DatasetForBRITS(DatasetForBRITS_Imputation): with function _fetch_data_from_file(), which works for all three stages. Therefore, we need this parameter for distinction. 
- file_type : str, default = "h5py" + file_type : The type of the given file if train_set and val_set are path strings. """ def __init__( self, data: Union[dict, str], - return_labels: bool = True, - file_type: str = "h5py", + return_y: bool = True, + file_type: str = "hdf5", ): - super().__init__(data, False, return_labels, file_type) + super().__init__( + data=data, + return_X_ori=False, + return_y=return_y, + file_type=file_type, + ) diff --git a/pypots/classification/brits/model.py b/pypots/classification/brits/model.py index f95177a5..fa0ad349 100644 --- a/pypots/classification/brits/model.py +++ b/pypots/classification/brits/model.py @@ -208,7 +208,7 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: # Step 1: wrap the input data with classes Dataset and DataLoader training_set = DatasetForBRITS(train_set, file_type=file_type) @@ -239,10 +239,10 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: self.model.eval() # set the model as eval status to freeze it. - test_set = DatasetForBRITS(test_set, return_labels=False, file_type=file_type) + test_set = DatasetForBRITS(test_set, return_y=False, file_type=file_type) test_loader = DataLoader( test_set, batch_size=self.batch_size, @@ -267,7 +267,7 @@ def predict( def classify( self, X: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Classify the input data with the trained model. diff --git a/pypots/classification/grud/data.py b/pypots/classification/grud/data.py index 34865428..99401310 100644 --- a/pypots/classification/grud/data.py +++ b/pypots/classification/grud/data.py @@ -10,7 +10,7 @@ import torch -from ...data.base import BaseDataset +from ...data.dataset import BaseDataset from ...data.utils import _parse_delta_torch from ...imputation.locf import locf_torch @@ -20,7 +20,7 @@ class DatasetForGRUD(BaseDataset): Parameters ---------- - data : dict or str, + data : The dataset for model input, should be a dictionary including keys as 'X' and 'y', or a path string locating a data file. If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], @@ -29,7 +29,7 @@ class DatasetForGRUD(BaseDataset): If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include keys as 'X' and 'y'. - return_labels : bool, default = True, + return_y : Whether to return labels in function __getitem__() if they exist in the given data. If `True`, for example, during training of classification models, the Dataset class will return labels in __getitem__() for model input. Otherwise, labels won't be included in the data returned by __getitem__(). This parameter exists because we @@ -38,17 +38,23 @@ class DatasetForGRUD(BaseDataset): with function _fetch_data_from_file(), which works for all three stages. Therefore, we need this parameter for distinction. - file_type : str, default = "h5py" + file_type : The type of the given file if train_set and val_set are path strings. 
""" def __init__( self, data: Union[dict, str], - return_labels: bool = True, - file_type: str = "h5py", + return_y: bool = True, + file_type: str = "hdf5", ): - super().__init__(data, False, return_labels, file_type) + super().__init__( + data=data, + return_X_ori=False, + return_X_pred=False, + return_y=return_y, + file_type=file_type, + ) if not isinstance(self.data, str): # data from array self.missing_mask = (~torch.isnan(self.X)).to(torch.float32) self.X_filledLOCF = locf_torch(self.X) @@ -63,12 +69,12 @@ def _fetch_data_from_array(self, idx: int) -> Iterable: Parameters ---------- - idx : int, + idx : The index to fetch the specified sample. Returns ------- - sample : list, + sample : A list contains index : int tensor, @@ -98,7 +104,7 @@ def _fetch_data_from_array(self, idx: int) -> Iterable: self.empirical_mean.to(torch.float32), ] - if self.y is not None and self.return_labels: + if self.return_y: sample.append(self.y[idx].to(torch.long)) return sample @@ -109,12 +115,12 @@ def _fetch_data_from_file(self, idx: int) -> Iterable: Parameters ---------- - idx : int, + idx : The index of the sample to be return. Returns ------- - sample : list, + sample : The collated data sample, a list including all necessary sample info. """ @@ -140,7 +146,7 @@ def _fetch_data_from_file(self, idx: int) -> Iterable: ] # if the dataset has labels and is for training, then fetch it from the file - if "y" in self.file_handle.keys() and self.return_labels: + if self.return_y: sample.append(torch.tensor(self.file_handle["y"][idx], dtype=torch.long)) return sample diff --git a/pypots/classification/grud/model.py b/pypots/classification/grud/model.py index deec8bd5..a18e78c8 100644 --- a/pypots/classification/grud/model.py +++ b/pypots/classification/grud/model.py @@ -180,7 +180,7 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: # Step 1: wrap the input data with classes Dataset and DataLoader training_set = DatasetForGRUD(train_set, file_type=file_type) @@ -211,10 +211,10 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: self.model.eval() # set the model as eval status to freeze it. - test_set = DatasetForGRUD(test_set, return_labels=False, file_type=file_type) + test_set = DatasetForGRUD(test_set, return_y=False, file_type=file_type) test_loader = DataLoader( test_set, batch_size=self.batch_size, @@ -239,7 +239,7 @@ def predict( def classify( self, X: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Classify the input data with the trained model. diff --git a/pypots/classification/raindrop/data.py b/pypots/classification/raindrop/data.py index be1c32b3..9449976f 100644 --- a/pypots/classification/raindrop/data.py +++ b/pypots/classification/raindrop/data.py @@ -16,7 +16,7 @@ class DatasetForRaindrop(DatasetForGRUD): Parameters ---------- - data : dict or str, + data : The dataset for model input, should be a dictionary including keys as 'X' and 'y', or a path string locating a data file. If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], @@ -25,7 +25,7 @@ class DatasetForRaindrop(DatasetForGRUD): If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include keys as 'X' and 'y'. 
- return_labels : bool, default = True, + return_y : Whether to return labels in function __getitem__() if they exist in the given data. If `True`, for example, during training of classification models, the Dataset class will return labels in __getitem__() for model input. Otherwise, labels won't be included in the data returned by __getitem__(). This parameter exists because we @@ -34,14 +34,14 @@ class DatasetForRaindrop(DatasetForGRUD): with function _fetch_data_from_file(), which works for all three stages. Therefore, we need this parameter for distinction. - file_type : str, default = "h5py" + file_type : The type of the given file if train_set and val_set are path strings. """ def __init__( self, data: Union[dict, str], - return_labels: bool = True, - file_type: str = "h5py", + return_y: bool = True, + file_type: str = "hdf5", ): - super().__init__(data, return_labels, file_type) + super().__init__(data, return_y, file_type) diff --git a/pypots/classification/raindrop/model.py b/pypots/classification/raindrop/model.py index 179531e7..8def46d2 100644 --- a/pypots/classification/raindrop/model.py +++ b/pypots/classification/raindrop/model.py @@ -19,8 +19,8 @@ from torch.utils.data import DataLoader from .core import _Raindrop +from .data import DatasetForRaindrop from ...classification.base import BaseNNClassifier -from ...classification.grud.data import DatasetForGRUD from ...optim.adam import Adam from ...optim.base import Optimizer from ...utils.logging import logger @@ -224,10 +224,10 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type="h5py", + file_type: str = "hdf5", ) -> None: # Step 1: wrap the input data with classes Dataset and DataLoader - training_set = DatasetForGRUD(train_set, file_type=file_type) + training_set = DatasetForRaindrop(train_set, file_type=file_type) training_loader = DataLoader( training_set, batch_size=self.batch_size, @@ -236,7 +236,7 @@ def fit( ) val_loader = None if val_set is not None: - val_set = DatasetForGRUD(val_set, file_type=file_type) + val_set = DatasetForRaindrop(val_set, file_type=file_type) val_loader = DataLoader( val_set, batch_size=self.batch_size, @@ -255,10 +255,10 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: self.model.eval() # set the model as eval status to freeze it. - test_set = DatasetForGRUD(test_set, return_labels=False, file_type=file_type) + test_set = DatasetForRaindrop(test_set, return_y=False, file_type=file_type) test_loader = DataLoader( test_set, batch_size=self.batch_size, @@ -284,7 +284,7 @@ def predict( def classify( self, X: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Classify the input data with the trained model. diff --git a/pypots/classification/template/core.py b/pypots/classification/template/core.py new file mode 100644 index 00000000..157cc4d9 --- /dev/null +++ b/pypots/classification/template/core.py @@ -0,0 +1,42 @@ +""" +The implementation of YourNewModel for the partially-observed time-series classification task. + +Refer to the paper "Your paper citation". + +""" + +# Created by Your Name TODO: modify the author information. +# License: BSD-3-Clause + +import torch.nn as nn + +# from ...nn.modules import some_modules + + +# TODO: define your new model here. +# It could be a neural network model or a non-neural network algorithm (e.g. written in numpy). 
+# Your model should be implemented with PyTorch and subclass torch.nn.Module if it is a neural network. +# Note that your main algorithm is defined in this class, and this class usually won't be exposed to users. +class _YourNewModel(nn.Module): + def __init__(self): + super().__init__() + + # TODO: define your model's components here. If modules in pypots.nn.modules can be reused in your model, + # you can import them and use them here. AND if you think the modules you implemented can be reused by + # other models, you can also consider to contribute them to pypots.nn.modules + self.embedding = nn.Module + self.submodule = nn.Module + self.backbone = nn.Module + + def forward(self, inputs: dict) -> dict: + # TODO: define your model's forward propagation process here. + # The input is a dict, and the output `results` should also be a dict. + output = self.backbone() # replace this with your model's process + + # TODO: `results` must contains the key `loss` which is will be used for + # backward propagation to update the model. + loss = None + results = { + "loss": loss, + } + return results diff --git a/pypots/classification/template/data.py b/pypots/classification/template/data.py index c391740e..3c4ca97e 100644 --- a/pypots/classification/template/data.py +++ b/pypots/classification/template/data.py @@ -1,7 +1,7 @@ """ Dataset class for YourNewModel. -TODO: modify the above description with your model's information. +TODO: modify the above description for your model's dataset class. """ @@ -10,17 +10,26 @@ from typing import Union, Iterable -from ...data.base import BaseDataset +from ...data.dataset import BaseDataset +# TODO: define your new dataset class here. Remove or add arguments as needed. class DatasetForYourNewModel(BaseDataset): def __init__( self, data: Union[dict, str], - return_labels: bool = True, - file_type: str = "h5py", + return_X_ori: bool, + return_X_pred: bool, + return_y: bool, + file_type: str = "hdf5", ): - super().__init__(data, return_labels, file_type) + super().__init__( + data=data, + return_X_ori=return_X_ori, + return_X_pred=return_X_pred, + return_y=return_y, + file_type=file_type, + ) def _fetch_data_from_array(self, idx: int) -> Iterable: raise NotImplementedError diff --git a/pypots/classification/template/model.py b/pypots/classification/template/model.py index 584fe1d7..40f6b252 100644 --- a/pypots/classification/template/model.py +++ b/pypots/classification/template/model.py @@ -3,6 +3,8 @@ Refer to the paper "Your paper citation". +TODO: modify the above description with your model's information. + """ # Created by Your Name TODO: modify the author information. @@ -10,40 +12,19 @@ from typing import Union, Optional -import numpy as np import torch -import torch.nn as nn + +from .core import _YourNewModel # TODO: import the base class from the classification package in PyPOTS. # Here I suppose this is a neural-network classification model. # You should make your model inherent BaseClassifier if it is not a NN. # from ..base import BaseClassifier from ..base import BaseNNClassifier - from ...optim.adam import Adam from ...optim.base import Optimizer -# TODO: define your new model here. -# It could be a neural network model or a non-neural network algorithm (e.g. written in numpy). -# Your model should be implemented with PyTorch and subclass torch.nn.Module if it is a neural network. -# Note that your main algorithm is defined in this class, and this class usually won't be exposed to users. 
-class _YourNewModel(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, inputs: dict) -> dict: - # TODO: define your model's forward propagation process here. - # The input is a dict, and the output `results` should also be a dict. - # `results` must contains the key `loss` which is will be used for backward propagation to update the model. - - loss = None - results = { - "loss": loss, - } - return results - - # TODO: define your new model's wrapper here. # It should be a subclass of a base class defined in PyPOTS task packages (e.g. # BaseNNClassifier of PyPOTS classification task package). It has to implement all abstract methods of the base class. @@ -53,13 +34,13 @@ def __init__( self, # TODO: add your model's hyper-parameters here n_classes: int, - batch_size: int, - epochs: int, - patience: int, - num_workers: int = 0, + batch_size: int = 32, + epochs: int = 100, + patience: Optional[int] = None, optimizer: Optional[Optimizer] = Adam(), + num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, - saving_path: str = None, + saving_path: Optional[str] = None, model_saving_strategy: Optional[str] = "best", ): super().__init__( @@ -76,9 +57,11 @@ def __init__( # TODO: set up your model's hyper-parameters here # set up the model - self.model = _YourNewModel() - self.model = self.model.to(self.device) + self.model = _YourNewModel( + # pass the arguments to your model + ) self._print_model_size() + self._send_model_to_given_device() # set up the optimizer self.optimizer = optimizer @@ -97,13 +80,13 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: raise NotImplementedError def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: raise NotImplementedError diff --git a/pypots/classification/template/module.py b/pypots/classification/template/module.py deleted file mode 100644 index fa20e4cd..00000000 --- a/pypots/classification/template/module.py +++ /dev/null @@ -1,13 +0,0 @@ -""" -The implementation of the modules for YourNewModel. - -Refer to the paper "Your paper citation". - -""" - -# Created by Your Name TODO: modify the author information. -# License: BSD-3-Clause - - -# TODO: this file is not necessary. If your new model has customized layers or modules, please put them here. -# Otherwise, please delete this modules.py file, don't commit it to the repository. diff --git a/pypots/clustering/base.py b/pypots/clustering/base.py index 47f70a18..2ecc46e3 100644 --- a/pypots/clustering/base.py +++ b/pypots/clustering/base.py @@ -72,7 +72,7 @@ def fit( self, train_set: Union[dict, str], val_set: Union[dict, str] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: """Train the cluster. @@ -105,7 +105,7 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: raise NotImplementedError @@ -113,7 +113,7 @@ def predict( def cluster( self, X: Union[dict, str], - file_type="h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Cluster the input with the trained model. @@ -379,7 +379,7 @@ def fit( self, train_set: Union[dict, str], val_set: Union[dict, str] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: """Train the cluster. 
@@ -412,7 +412,7 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: raise NotImplementedError @@ -420,7 +420,7 @@ def predict( def cluster( self, X: Union[dict, str], - file_type="h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Cluster the input with the trained model. diff --git a/pypots/clustering/crli/data.py b/pypots/clustering/crli/data.py index 6025752a..cf8976a2 100644 --- a/pypots/clustering/crli/data.py +++ b/pypots/clustering/crli/data.py @@ -8,7 +8,7 @@ from typing import Union, Iterable -from ...data.base import BaseDataset +from ...data.dataset import BaseDataset class DatasetForCRLI(BaseDataset): @@ -16,7 +16,7 @@ class DatasetForCRLI(BaseDataset): Parameters ---------- - data : dict or str, + data : The dataset for model input, should be a dictionary including keys as 'X' and 'y', or a path string locating a data file. If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], @@ -25,7 +25,7 @@ class DatasetForCRLI(BaseDataset): If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include keys as 'X' and 'y'. - return_labels : bool, default = True, + return_y : Whether to return labels in function __getitem__() if they exist in the given data. If `True`, for example, during training of classification models, the Dataset class will return labels in __getitem__() for model input. Otherwise, labels won't be included in the data returned by __getitem__(). This parameter exists because we @@ -34,17 +34,23 @@ class DatasetForCRLI(BaseDataset): with function _fetch_data_from_file(), which works for all three stages. Therefore, we need this parameter for distinction. - file_type : str, default = "h5py" + file_type : The type of the given file if train_set and val_set are path strings. 
""" def __init__( self, data: Union[dict, str], - return_labels: bool = True, - file_type: str = "h5py", + return_y: bool = True, + file_type: str = "hdf5", ): - super().__init__(data, False, return_labels, file_type) + super().__init__( + data=data, + return_X_ori=False, + return_X_pred=False, + return_y=return_y, + file_type=file_type, + ) def _fetch_data_from_array(self, idx: int) -> Iterable: return super()._fetch_data_from_array(idx) diff --git a/pypots/clustering/crli/model.py b/pypots/clustering/crli/model.py index 8c7805a1..90651ca0 100644 --- a/pypots/clustering/crli/model.py +++ b/pypots/clustering/crli/model.py @@ -343,12 +343,10 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: # Step 1: wrap the input data with classes Dataset and DataLoader - training_set = DatasetForCRLI( - train_set, return_labels=False, file_type=file_type - ) + training_set = DatasetForCRLI(train_set, return_y=False, file_type=file_type) training_loader = DataLoader( training_set, batch_size=self.batch_size, @@ -358,7 +356,7 @@ def fit( val_loader = None if val_set is not None: - val_set = DatasetForCRLI(val_set, return_labels=False, file_type=file_type) + val_set = DatasetForCRLI(val_set, return_y=False, file_type=file_type) val_loader = DataLoader( val_set, batch_size=self.batch_size, @@ -377,7 +375,7 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", return_latent_vars: bool = False, ) -> dict: """Make predictions for the input data with the trained model. @@ -393,7 +391,7 @@ def predict( If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include keys as 'X' and 'y'. - file_type : str + file_type : The type of the given file if test_set is a path string. return_latent_vars : bool @@ -402,13 +400,13 @@ def predict( Returns ------- - result_dict : dict, + file_type : The dictionary containing the clustering results and latent variables if necessary. """ self.model.eval() # set the model as eval status to freeze it. - test_set = DatasetForCRLI(test_set, return_labels=False, file_type=file_type) + test_set = DatasetForCRLI(test_set, return_y=False, file_type=file_type) test_loader = DataLoader( test_set, batch_size=self.batch_size, @@ -448,7 +446,7 @@ def predict( def cluster( self, X: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Cluster the input with the trained model. diff --git a/pypots/clustering/template/core.py b/pypots/clustering/template/core.py new file mode 100644 index 00000000..524a7a3c --- /dev/null +++ b/pypots/clustering/template/core.py @@ -0,0 +1,42 @@ +""" +The implementation of YourNewModel for the partially-observed time-series clustering task. + +Refer to the paper "Your paper citation". + +""" + +# Created by Your Name TODO: modify the author information. +# License: BSD-3-Clause + +import torch.nn as nn + +# from ...nn.modules import some_modules + + +# TODO: define your new model here. +# It could be a neural network model or a non-neural network algorithm (e.g. written in numpy). +# Your model should be implemented with PyTorch and subclass torch.nn.Module if it is a neural network. +# Note that your main algorithm is defined in this class, and this class usually won't be exposed to users. 
+class _YourNewModel(nn.Module): + def __init__(self): + super().__init__() + + # TODO: define your model's components here. If modules in pypots.nn.modules can be reused in your model, + # you can import them and use them here. AND if you think the modules you implemented can be reused by + # other models, you can also consider to contribute them to pypots.nn.modules + self.embedding = nn.Module + self.submodule = nn.Module + self.backbone = nn.Module + + def forward(self, inputs: dict) -> dict: + # TODO: define your model's forward propagation process here. + # The input is a dict, and the output `results` should also be a dict. + output = self.backbone() # replace this with your model's process + + # TODO: `results` must contains the key `loss` which is will be used for + # backward propagation to update the model. + loss = None + results = { + "loss": loss, + } + return results diff --git a/pypots/clustering/template/data.py b/pypots/clustering/template/data.py index c391740e..3c4ca97e 100644 --- a/pypots/clustering/template/data.py +++ b/pypots/clustering/template/data.py @@ -1,7 +1,7 @@ """ Dataset class for YourNewModel. -TODO: modify the above description with your model's information. +TODO: modify the above description for your model's dataset class. """ @@ -10,17 +10,26 @@ from typing import Union, Iterable -from ...data.base import BaseDataset +from ...data.dataset import BaseDataset +# TODO: define your new dataset class here. Remove or add arguments as needed. class DatasetForYourNewModel(BaseDataset): def __init__( self, data: Union[dict, str], - return_labels: bool = True, - file_type: str = "h5py", + return_X_ori: bool, + return_X_pred: bool, + return_y: bool, + file_type: str = "hdf5", ): - super().__init__(data, return_labels, file_type) + super().__init__( + data=data, + return_X_ori=return_X_ori, + return_X_pred=return_X_pred, + return_y=return_y, + file_type=file_type, + ) def _fetch_data_from_array(self, idx: int) -> Iterable: raise NotImplementedError diff --git a/pypots/clustering/template/model.py b/pypots/clustering/template/model.py index 06a4a38b..65bc8aa9 100644 --- a/pypots/clustering/template/model.py +++ b/pypots/clustering/template/model.py @@ -3,6 +3,8 @@ Refer to the paper "Your paper citation". +TODO: modify the above description with your model's information. + """ # Created by Your Name TODO: modify the author information. @@ -10,40 +12,19 @@ from typing import Union, Optional -import numpy as np import torch -import torch.nn as nn + +from .core import _YourNewModel # TODO: import the base class from the clustering package in PyPOTS. # Here I suppose this is a neural-network clustering model. # You should make your model inherent BaseClusterer if it is not a NN. # from ..base import BaseClusterer from ..base import BaseNNClusterer - from ...optim.adam import Adam from ...optim.base import Optimizer -# TODO: define your new model here. -# It could be a neural network model or a non-neural network algorithm (e.g. written in numpy). -# Your model should be implemented with PyTorch and subclass torch.nn.Module if it is a neural network. -# Note that your main algorithm is defined in this class, and this class usually won't be exposed to users. -class _YourNewModel(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, inputs: dict) -> dict: - # TODO: define your model's forward propagation process here. - # The input is a dict, and the output `results` should also be a dict. 
- # `results` must contains the key `loss` which is will be used for backward propagation to update the model. - - loss = None - results = { - "loss": loss, - } - return results - - # TODO: define your new model's wrapper here. # It should be a subclass of a base class defined in PyPOTS task packages (e.g. # BaseNNClusterer of PyPOTS clustering task package), and it has to implement all abstract methods of the base class. @@ -53,13 +34,13 @@ def __init__( self, # TODO: add your model's hyper-parameters here n_clusters: int, - batch_size: int, - epochs: int, - patience: int, - num_workers: int = 0, + batch_size: int = 32, + epochs: int = 100, + patience: Optional[int] = None, optimizer: Optional[Optimizer] = Adam(), + num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, - saving_path: str = None, + saving_path: Optional[str] = None, model_saving_strategy: Optional[str] = "best", ): super().__init__( @@ -76,9 +57,11 @@ def __init__( # TODO: set up your model's hyper-parameters here # set up the model - self.model = _YourNewModel() - self.model = self.model.to(self.device) + self.model = _YourNewModel( + # pass the arguments to your model + ) self._print_model_size() + self._send_model_to_given_device() # set up the optimizer self.optimizer = optimizer @@ -97,13 +80,13 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: raise NotImplementedError def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: raise NotImplementedError diff --git a/pypots/clustering/template/module.py b/pypots/clustering/template/module.py deleted file mode 100644 index fa20e4cd..00000000 --- a/pypots/clustering/template/module.py +++ /dev/null @@ -1,13 +0,0 @@ -""" -The implementation of the modules for YourNewModel. - -Refer to the paper "Your paper citation". - -""" - -# Created by Your Name TODO: modify the author information. -# License: BSD-3-Clause - - -# TODO: this file is not necessary. If your new model has customized layers or modules, please put them here. -# Otherwise, please delete this modules.py file, don't commit it to the repository. diff --git a/pypots/clustering/vader/core.py b/pypots/clustering/vader/core.py index c2e0da99..3d6df970 100644 --- a/pypots/clustering/vader/core.py +++ b/pypots/clustering/vader/core.py @@ -15,8 +15,8 @@ import torch import torch.nn as nn -from pypots.utils.metrics import calc_mse from ...nn.modules.vader import BackboneVaDER +from ...utils.metrics import calc_mse def inverse_softplus(x: np.ndarray) -> np.ndarray: diff --git a/pypots/clustering/vader/data.py b/pypots/clustering/vader/data.py index 6a098774..ea718397 100644 --- a/pypots/clustering/vader/data.py +++ b/pypots/clustering/vader/data.py @@ -8,7 +8,7 @@ from typing import Union, Iterable -from ...data.base import BaseDataset +from ...data.dataset import BaseDataset class DatasetForVaDER(BaseDataset): @@ -16,7 +16,7 @@ class DatasetForVaDER(BaseDataset): Parameters ---------- - data : dict or str, + data : The dataset for model input, should be a dictionary including keys as 'X' and 'y', or a path string locating a data file. If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], @@ -25,7 +25,7 @@ class DatasetForVaDER(BaseDataset): If it is a path string, the path should point to a data file, e.g. 
a h5 file, which contains key-value pairs like a dict, and it has to include keys as 'X' and 'y'. - return_labels : bool, default = True, + return_y : Whether to return labels in function __getitem__() if they exist in the given data. If `True`, for example, during training of classification models, the Dataset class will return labels in __getitem__() for model input. Otherwise, labels won't be included in the data returned by __getitem__(). This parameter exists because we @@ -34,17 +34,23 @@ class DatasetForVaDER(BaseDataset): with function _fetch_data_from_file(), which works for all three stages. Therefore, we need this parameter for distinction. - file_type : str, default = "h5py" + file_type : The type of the given file if train_set and val_set are path strings. """ def __init__( self, data: Union[dict, str], - return_labels: bool = True, - file_type: str = "h5py", + return_y: bool = True, + file_type: str = "hdf5", ): - super().__init__(data, False, return_labels, file_type) + super().__init__( + data=data, + return_X_ori=False, + return_X_pred=False, + return_y=return_y, + file_type=file_type, + ) def _fetch_data_from_array(self, idx: int) -> Iterable: return super()._fetch_data_from_array(idx) diff --git a/pypots/clustering/vader/model.py b/pypots/clustering/vader/model.py index 26ae9687..a9b151dc 100644 --- a/pypots/clustering/vader/model.py +++ b/pypots/clustering/vader/model.py @@ -357,12 +357,10 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: # Step 1: wrap the input data with classes Dataset and DataLoader - training_set = DatasetForVaDER( - train_set, return_labels=False, file_type=file_type - ) + training_set = DatasetForVaDER(train_set, return_y=False, file_type=file_type) training_loader = DataLoader( training_set, batch_size=self.batch_size, @@ -372,7 +370,7 @@ def fit( val_loader = None if val_set is not None: - val_set = DatasetForVaDER(val_set, return_labels=False, file_type=file_type) + val_set = DatasetForVaDER(val_set, return_y=False, file_type=file_type) val_loader = DataLoader( val_set, batch_size=self.batch_size, @@ -391,7 +389,7 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", return_latent_vars: bool = False, ) -> dict: """Make predictions for the input data with the trained model. @@ -407,7 +405,7 @@ def predict( If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include keys as 'X' and 'y'. - file_type : str + file_type : The type of the given file if test_set is a path string. return_latent_vars : bool @@ -415,12 +413,12 @@ def predict( Returns ------- - result_dict : dict, + file_type : The dictionary containing the clustering results and latent variables if necessary. """ self.model.eval() # set the model as eval status to freeze it. - test_set = DatasetForVaDER(test_set, return_labels=False, file_type=file_type) + test_set = DatasetForVaDER(test_set, return_y=False, file_type=file_type) test_loader = DataLoader( test_set, batch_size=self.batch_size, @@ -501,7 +499,7 @@ def func_to_apply( def cluster( self, X: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> Union[np.ndarray]: """Cluster the input with the trained model. 
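The dataset-related hunks above consistently rename the constructor argument `return_labels` to `return_y` and switch the default `file_type` from "h5py" to "hdf5" across the classification and clustering packages. A minimal usage sketch of the refactored interface follows; it is not part of this patch, and the toy array is made up purely for illustration:

    import numpy as np
    from pypots.clustering.vader.data import DatasetForVaDER

    # hypothetical toy data: 16 samples, 24 time steps, 5 features, NaN marks missing values
    X = np.random.randn(16, 24, 5)
    X[X < -1.5] = np.nan

    dataset = DatasetForVaDER(
        data={"X": X},
        return_y=False,    # renamed from `return_labels` in this patch
        file_type="hdf5",  # default changed from "h5py" in this patch
    )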
diff --git a/pypots/data/__init__.py b/pypots/data/__init__.py index d243b50e..73b274dd 100644 --- a/pypots/data/__init__.py +++ b/pypots/data/__init__.py @@ -5,7 +5,7 @@ # Created by Wenjie Du # License: BSD-3-Clause -from .base import BaseDataset +from .dataset import BaseDataset, SUPPORTED_DATASET_FILE_FORMATS from .generating import ( gene_complete_random_walk, gene_complete_random_walk_for_anomaly_detection, @@ -21,9 +21,10 @@ from .utils import parse_delta, sliding_window __all__ = [ - # datasets + # base dataset classes "BaseDataset", - # data generation + "SUPPORTED_DATASET_FILE_FORMATS", + # dataset generation functions "gene_complete_random_walk", "gene_complete_random_walk_for_anomaly_detection", "gene_complete_random_walk_for_classification", diff --git a/pypots/data/base.py b/pypots/data/base.py deleted file mode 100644 index 096bcd08..00000000 --- a/pypots/data/base.py +++ /dev/null @@ -1,337 +0,0 @@ -""" -The base class for PyPOTS datasets. -""" - -# Created by Wenjie Du -# License: BSD-3-Clause - -from abc import abstractmethod -from typing import Union, Optional, Tuple, Iterable - -import h5py -import numpy as np -import torch -from pygrinder import fill_and_get_mask_torch -from torch.utils.data import Dataset - -from .utils import turn_data_into_specified_dtype - -# Currently we only support h5 files -SUPPORTED_DATASET_FILE_TYPE = ["h5py"] - - -class BaseDataset(Dataset): - """Base dataset class in PyPOTS. - - Parameters - ---------- - data : - The dataset for model input, should be a dictionary including keys as 'X' and 'y', - or a path string locating a data file. - If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], - which is time-series data for input, can contain missing values, and y should be array-like of shape - [n_samples], which is classification labels of X. - If it is a path string, the path should point to a data file, e.g. a h5 file, which contains - key-value pairs like a dict, and it has to include keys as 'X' and 'y'. - - return_labels : - Whether to return labels in function __getitem__() if they exist in the given data. If `True`, for example, - during training of classification models, the Dataset class will return labels in __getitem__() for model input. - Otherwise, labels won't be included in the data returned by __getitem__(). This parameter exists because we - need the defined Dataset class for all training/validating/testing stages. For those big datasets stored in h5 - files, they already have both X and y saved. But we don't read labels from the file for validating and testing - with function _fetch_data_from_file(), which works for all three stages. Therefore, we need this parameter for - distinction. - - file_type : - The type of the given file if train_set and val_set are path strings. - - """ - - def __init__( - self, - data: Union[dict, str], - return_X_ori: bool, - return_labels: bool, - file_type: str = "h5py", - ): - super().__init__() - # types and shapes had been checked after X and y input into the model - # So they are safe to use here. No need to check again. 
- - self.data = data - self.return_X_ori = return_X_ori - self.return_labels = return_labels - - if isinstance(self.data, str): # data from file - # check if the given file type is supported - assert ( - file_type in SUPPORTED_DATASET_FILE_TYPE - ), f"file_type should be one of {SUPPORTED_DATASET_FILE_TYPE}, but got {file_type}" - self.file_type = file_type - - # open the file handle - self.file_handle = self._open_file_handle() - # check if X exists in the file - assert ( - "X" in self.file_handle.keys() - ), "The given dataset file doesn't contains X. Please double check." - - else: # data from array - X = data["X"] - X_ori = None if "X_ori" not in data.keys() else data["X_ori"] - y = None if "y" not in data.keys() else data["y"] - self.X, self.X_ori, self.y = self._check_array_input(X, X_ori, y) - - if self.X_ori is not None and self.return_X_ori: - # Only when X_ori is given and fixed, we fill the missing values in X here in advance. - # Otherwise, we may need original X with missing values to generate X_ori, e.g. in DatasetForSAITS. - self.X, self.missing_mask = fill_and_get_mask_torch(self.X) - - self.X_ori, X_ori_missing_mask = fill_and_get_mask_torch(self.X_ori) - indicating_mask = X_ori_missing_mask - self.missing_mask - self.indicating_mask = indicating_mask.to(torch.float32) - else: - self.missing_mask = None - self.indicating_mask = None - # if return_X_ori is false, set X_ori to None as well - self.X_ori = None - - self.n_samples, self.n_steps, self.n_features = self._get_data_sizes() - - # set up function fetch_data() - if isinstance(self.data, str): - self.fetch_data = self._fetch_data_from_file - else: - self.fetch_data = self._fetch_data_from_array - - def _get_data_sizes(self) -> Tuple[int, int, int]: - """Determine the number of samples in the dataset and return the number. - - Returns - ------- - n_samples : - The number of the samples in the given dataset. - """ - - if isinstance(self.data, str): - if self.file_handle is None: - self.file_handle = self._open_file_handle() - n_samples = len(self.file_handle["X"]) - first_sample = self.file_handle["X"][0] - n_steps = len(first_sample) - n_features = first_sample.shape[-1] - else: - n_samples = len(self.X) - n_steps = len(self.X[0]) - n_features = self.X[0].shape[-1] - - return n_samples, n_steps, n_features - - def __len__(self) -> int: - return self.n_samples - - @staticmethod - def _check_array_input( - X: Union[np.ndarray, torch.Tensor, list], - X_ori: Union[np.ndarray, torch.Tensor, list], - y: Optional[Union[np.ndarray, torch.Tensor, list]] = None, - out_dtype: str = "tensor", - ) -> Tuple[ - Union[np.ndarray, torch.Tensor], - Union[np.ndarray, torch.Tensor], - Optional[Union[np.ndarray, torch.Tensor, list]], - ]: - """Check value type and shape of input X and y - - Parameters - ---------- - X : - Time-series data that must have a shape like [n_samples, expected_n_steps, expected_n_features]. - - X_ori : - If X is with artificial missingness, X_ori is the original X without artificial missing values. - It must have the same shape as X. If X_ori is with original missing values, should be left as NaN. - - y : - Labels of time-series samples (X) that must have a shape like [n_samples] or [n_samples, n_classes]. 
- - out_dtype : - Data type of the output, should be np.ndarray or torch.Tensor - - Returns - ------- - X : - - X_ori : - - y : - - """ - assert out_dtype in [ - "tensor", - "ndarray", - ], f'out_dtype should be "tensor" or "ndarray", but got {out_dtype}' - - # change the data type of X - X = turn_data_into_specified_dtype(X, out_dtype) - X = X.to(torch.float32) - - # check the shape of X here - X_shape = X.shape - assert len(X_shape) == 3, ( - f"input should have 3 dimensions [n_samples, seq_len, n_features]," - f"but got X: {X_shape}" - ) - if X_ori is not None: - X_ori = turn_data_into_specified_dtype(X_ori, out_dtype) - X_ori = X_ori.to(torch.float32) - assert ( - X_shape == X_ori.shape - ), f"X and X_ori must have matched shape, but got X: f{X.shape} and X_ori: {X_ori.shape}" - if y is not None: - assert len(X) == len(y), ( - f"lengths of X and y must match, " f"but got f{len(X)} and {len(y)}" - ) - y = turn_data_into_specified_dtype(y, out_dtype) - - return X, X_ori, y - - @abstractmethod - def _fetch_data_from_array(self, idx: int) -> Iterable: - """Fetch data from self.X if it is given. - - Parameters - ---------- - idx : - The index of the sample to be return. - - Returns - ------- - sample : - The collated data sample, a list including all necessary sample info. - """ - - if self.X_ori is None: - X = self.X[idx] - X, missing_mask = fill_and_get_mask_torch(X) - else: - X = self.X[idx] - missing_mask = self.missing_mask[idx] - - sample = [ - torch.tensor(idx), - X, - missing_mask, - ] - - if self.X_ori is not None and self.return_X_ori: - X_ori = self.X_ori[idx] - indicating_mask = self.indicating_mask[idx] - sample.extend([X_ori, indicating_mask]) - - if self.y is not None and self.return_labels: - sample.append(self.y[idx].to(torch.long)) - - return sample - - def _open_file_handle(self) -> h5py.File: - """Open the file handle for reading data from the file. - - Notes - ----- - This function can also help confirm if the given file and file type match. - - Returns - ------- - file_handle : - - """ - data_file_path = self.data - try: - file_handler = h5py.File( - data_file_path, - "r", - ) # set swmr=True if the h5 file need to be written into new content during reading - except ImportError: - raise ImportError( - "h5py is missing and cannot be imported. Please install it first." - ) - except FileNotFoundError as e: - raise FileNotFoundError(f"{e}") - except OSError as e: - raise TypeError( - f"{e}\n" - f"Check out the above error log. This probably is caused by file type error. " - f"Please confirm that the given file {data_file_path} is an h5 file." - ) - except Exception as e: - raise RuntimeError(e) - return file_handler - - @abstractmethod - def _fetch_data_from_file(self, idx: int) -> Iterable: - """Fetch data with the lazy-loading strategy, i.e. only loading data from the file while requesting for samples. - Here the opened file handle doesn't load the entire dataset into RAM but only load the currently accessed slice. - - Notes - ----- - Multi workers reading from h5 file is tricky, and I was confronted with a problem similar to - https://discuss.pytorch.org/t/dataloader-when-num-worker-0-there-is-bug/25643/7 in 2020, please - refer to it for more details about the problem. 
- The implementation here is referred to - https://discuss.pytorch.org/t/dataloader-when-num-worker-0-there-is-bug/25643/10 - And according to https://discuss.pytorch.org/t/dataloader-when-num-worker-0-there-is-bug/25643/37, - pytorch v1.7.1 and h5py v3.2.0 work well, so probably updating to the latest version can avoid the - issue I met. After all, this implementation may need to be updated in the near future. - - Parameters - ---------- - idx : - The index of the sample to be return. - - Returns - ------- - sample : - The collated data sample, a list including all necessary sample info. - """ - - if self.file_handle is None: - self.file_handle = self._open_file_handle() - - X = torch.from_numpy(self.file_handle["X"][idx]).to(torch.float32) - X, missing_mask = fill_and_get_mask_torch(X) - sample = [ - torch.tensor(idx), - X, - missing_mask, - ] - - if "X_ori" in self.file_handle.keys() and self.return_X_ori: - X_ori = torch.from_numpy(self.file_handle["X_ori"][idx]).to(torch.float32) - X_ori, X_ori_missing_mask = fill_and_get_mask_torch(X_ori) - indicating_mask = (X_ori_missing_mask - missing_mask).to(torch.float32) - sample.extend([X_ori, indicating_mask]) - - # if the dataset has labels and is for training, then fetch it from the file - if "y" in self.file_handle.keys() and self.return_labels: - sample.append(self.file_handle["y"][idx].to(torch.long)) - - return sample - - def __getitem__(self, idx: int) -> Iterable: - """Fetch data according to index. - - Parameters - ---------- - idx : - The index to fetch the specified sample. - - Returns - ------- - sample : - The collated data sample, a list including all necessary sample info. - """ - - sample = self.fetch_data(idx) - return sample diff --git a/pypots/data/checking.py b/pypots/data/checking.py index af22958f..4f0e7767 100644 --- a/pypots/data/checking.py +++ b/pypots/data/checking.py @@ -11,11 +11,30 @@ import h5py -def check_X_ori_in_val_set(val_set: Union[str, dict]) -> bool: - if isinstance(val_set, str): - with h5py.File(val_set, "r") as f: - return "X_ori" in f.keys() - elif isinstance(val_set, dict): - return "X_ori" in val_set.keys() +def key_in_data_set(key: str, dataset: Union[str, dict]) -> bool: + """Check if the key is in the given dataset. + The dataset could be a path to an HDF5 file or a Python dictionary. + + Parameters + ---------- + key : + The key to check. + + dataset : + The dataset to be checked. + + Returns + ------- + bool + Whether the key is in the dataset. + """ + + if isinstance(dataset, str): + with h5py.File(dataset, "r") as f: + return key in f.keys() + elif isinstance(dataset, dict): + return key in dataset.keys() else: - raise TypeError("val_set must be a str or a Python dictionary.") + raise TypeError( + f"dataset must be a str or a Python dictionary, but got {type(dataset)}" + ) diff --git a/pypots/data/dataset/__init__.py b/pypots/data/dataset/__init__.py new file mode 100644 index 00000000..a29c06f5 --- /dev/null +++ b/pypots/data/dataset/__init__.py @@ -0,0 +1,14 @@ +""" +The package including dataset classes for PyPOTS. +""" + +# Created by Wenjie Du +# License: BSD-3-Clause + +from .base import BaseDataset +from .config import SUPPORTED_DATASET_FILE_FORMATS + +__all__ = [ + "BaseDataset", + "SUPPORTED_DATASET_FILE_FORMATS", +] diff --git a/pypots/data/dataset/base.py b/pypots/data/dataset/base.py new file mode 100644 index 00000000..9388c351 --- /dev/null +++ b/pypots/data/dataset/base.py @@ -0,0 +1,465 @@ +""" +The base dataset class. 
+""" + +# Created by Wenjie Du +# License: BSD-3-Clause + +from typing import Union, Optional, Tuple, Iterable + +import h5py +import numpy as np +import torch +from numpy import ndarray +from pygrinder import fill_and_get_mask_torch +from torch import Tensor +from torch.utils.data import Dataset + +from .config import SUPPORTED_DATASET_FILE_FORMATS +from ..utils import turn_data_into_specified_dtype + + +class BaseDataset(Dataset): + """Base dataset class for models in PyPOTS. + + Parameters + ---------- + data : + The dataset for model input, should be a dictionary or + a path string locating a data file that is in supported formats. + If it is a dict, 'X' is mandatory and 'X_ori', 'X_pred', and 'y' are optional. + ``X`` is time-series data for input and could contain missing values. + It should be array-like of shape [n_samples, n_steps (sequence length), n_features]. + ``X_ori`` is optional. If ``X`` is constructed from ``X_ori`` with specially designed artificial missingness, + your model may need ``X_ori`` for evaluation or loss calculation during training (e.g. SAITS). + It should have the same shape as ``X``. + ``X_pred`` is optional, and it is the forecasting results for the model to predict in forecasting tasks. + ``X_pred`` should be array-like of shape [n_samples, n_steps (sequence length), n_features], and its shape + could different from ``X``. But remember that ``X_pred`` contains time series forecasting results of ``X``, + hence it has the same number of samples as ``X``, i.e. n_samples of them are the same, but their n_steps + and n_features could be different. ``X_pred`` could have missing values as well as ``X``. + ``y`` should be array-like of shape [n_samples], which is classification labels of X. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include keys as 'X', etc. + + return_X_ori : + Whether to return X_ori and indicating_mask in function __getitem__() if it is given. If `True`, for example, + during training of models that need the original X, the Dataset class will return X_ori in __getitem__() for + model input. Otherwise, X_ori and indicating mask won't be included in the data list returned by __getitem__(). + + return_X_pred : + Whether to return X_pred and X_pred_missing_mask in function __getitem__() if it is given. + If `True`, for example, during training of forecasting models, the Dataset class will return forecasting X + in __getitem__() for model input. Otherwise, X_pred and its missing mask X_pred_missing_mask won't be included + in the data list returned by __getitem__(). + + return_y : + Whether to return y (i.e. labels) in function __getitem__() if they exist in the given data. If `True`, + for example, during training of classification models, the Dataset class will return labels in __getitem__() + for model input. Otherwise, labels won't be included in the data returned by __getitem__(). + This parameter exists because we need the defined Dataset class for all training/validating/testing stages. + For those big datasets stored in h5 files, they already have both X and y saved. + But we don't read labels from the file for validating and testing with function _fetch_data_from_file(), + which works for all three stages. Therefore, we need this parameter for distinction. + + file_type : + The type of the given file if train_set and val_set are path strings. 
+ + """ + + def __init__( + self, + data: Union[dict, str], + return_X_ori: bool, + return_X_pred: bool, + return_y: bool, + file_type: str = "hdf5", + ): + super().__init__() + # types and shapes had been checked after X and y input into the model + # So they are safe to use here. No need to check again. + + self.data = data + self.return_X_ori = return_X_ori + self.return_X_pred = return_X_pred + self.return_y = return_y + self.file_type = file_type + + # initialize the following attributes + self.X = None + self.X_ori = None + self.missing_mask = None + self.indicating_mask = None + self.X_pred = None + self.X_pred_missing_mask = None + self.y = None + self.file_handle = None + self.fetch_data = None + self.n_samples: int = 0 # num of the samples in the dataset + self.n_steps: int = 0 # num of the time steps in each sample + self.n_features: int = 0 # num of the features in each sample + self.n_pred_steps: int = 0 # num of the time steps in each forecasting sample + self.n_pred_features: int = 0 # num of the features in each forecasting sample + + # check the data type and set up the fetch_data function + if isinstance(self.data, str): # data from file + # check if the given file type is supported + assert ( + file_type in SUPPORTED_DATASET_FILE_FORMATS + ), f"file_type should be one of {SUPPORTED_DATASET_FILE_FORMATS}, but got {file_type}" + self.file_type = file_type + + # open the file handle + self.file_handle = self._open_file_handle() + # check if X exists in the file + assert ( + "X" in self.file_handle.keys() + ), "The given dataset file doesn't contains X. Please double check." + # check whether X_ori, X_pred, and y exist in the file if they are required + if self.return_X_ori: + assert ( + "X_ori" in self.file_handle.keys() + ), "The given dataset file doesn't contains X_ori. Please double check." + if self.return_X_pred: + assert ( + "X_pred" in self.file_handle.keys() + ), "The given dataset file doesn't contains X_pred. Please double check." + if self.return_y: + assert ( + "y" in self.file_handle.keys() + ), "The given dataset file doesn't contains y. Please double check." + + # set up the function fetch_data() to fetch data from file + self.fetch_data = self._fetch_data_from_file + + else: # data from array + # check if X exists in the dictionary + assert ( + "X" in self.data.keys() + ), "The given dataset dictionary doesn't contains X. Please double check." + # check whether X_ori, X_pred, and y exist in the file if they are required + if self.return_X_ori: + assert ( + "X_ori" in self.data.keys() + ), "The given dataset dictionary doesn't contains X_ori. Please double check." + if self.return_X_pred: + assert ( + "X_pred" in self.data.keys() + ), "The given dataset dictionary doesn't contains X_pred. Please double check." + if self.return_y: + assert ( + "y" in self.data.keys() + ), "The given dataset dictionary doesn't contains y. Please double check." + + X = data["X"] + X_ori = None if "X_ori" not in data.keys() else data["X_ori"] + X_pred = None if "X_pred" not in data.keys() else data["X_pred"] + y = None if "y" not in data.keys() else data["y"] + self.X, self.X_ori, self.X_pred, self.y = self._check_array_input( + X, X_ori, X_pred, y, "tensor" + ) + + if self.return_X_ori: + # Only when X_ori is given and fixed, we fill the missing values in X here in advance. + # Otherwise, we may need original X with missing values to generate X_ori, e.g. in DatasetForSAITS. 
+ self.X, self.missing_mask = fill_and_get_mask_torch(self.X) + + self.X_ori, X_ori_missing_mask = fill_and_get_mask_torch(self.X_ori) + indicating_mask = X_ori_missing_mask - self.missing_mask + self.indicating_mask = indicating_mask.to(torch.float32) + + if self.return_X_pred: + self.X_pred, self.X_pred_missing_mask = fill_and_get_mask_torch( + self.X_pred + ) + + # set up the function fetch_data() to fetch data from array + self.fetch_data = self._fetch_data_from_array + + # get the sizes of the dataset + ( + self.n_samples, + self.n_steps, + self.n_features, + self.n_pred_steps, + self.n_pred_features, + ) = self._get_data_sizes() + + def _get_data_sizes(self) -> Tuple[int, ...]: + """Detect the data sample sizes in the dataset and return the numbers. + + Returns + ------- + n_samples : + The number of the samples in the given dataset. + + n_steps : + The number of each sample's time steps in the given dataset. + + n_features : + The number of each sample's features in the given dataset. + + n_pred_steps : + The number of each sample's forecasting time steps in the given dataset. + Return as 0 if the dataset does not contain X_pred which includes data samples for forecasting tasks. + + n_pred_features : + The number of each sample's forecasting features in the given dataset. + Return as 0 if the dataset does not contain X_pred which includes data samples for forecasting tasks. + """ + + # initialize the sizes + n_samples, n_steps, n_features, n_pred_steps, n_pred_features = 0, 0, 0, 0, 0 + + if isinstance(self.data, str): + if self.file_handle is None: + self.file_handle = self._open_file_handle() + n_samples = len(self.file_handle["X"]) + first_sample = self.file_handle["X"][0] + n_steps = len(first_sample) + n_features = first_sample.shape[-1] + + if self.return_X_pred: + first_pred_sample = self.file_handle["X_pred"][0] + n_pred_steps = len(first_pred_sample) + n_pred_features = first_pred_sample.shape[-1] + else: + n_samples = len(self.X) + n_steps = len(self.X[0]) + n_features = self.X[0].shape[-1] + + if self.return_X_pred: + n_pred_steps = len(self.X_pred[0]) + n_pred_features = self.X_pred[0].shape[-1] + + return n_samples, n_steps, n_features, n_pred_steps, n_pred_features + + def __len__(self) -> int: + return self.n_samples + + @staticmethod + def _check_array_input( + X: Union[np.ndarray, torch.Tensor], + X_ori: Optional[Union[np.ndarray, torch.Tensor]] = None, + X_pred: Optional[Union[np.ndarray, torch.Tensor]] = None, + y: Optional[Union[np.ndarray, torch.Tensor]] = None, + out_dtype: str = "tensor", + ) -> Tuple[ + Union[Tensor, ndarray], + Optional[Union[Tensor, ndarray]], + Optional[Union[Tensor, ndarray]], + Optional[Union[Tensor, ndarray]], + ]: + """Check value type and shape of input X and y + + Parameters + ---------- + X : + Time-series data that must have a shape like [n_samples, expected_n_steps, expected_n_features]. + + X_ori : + If X is with artificial missingness, X_ori is the original X without artificial missing values. + It must have the same shape as X. If X_ori is with original missing values, should be left as NaN. + + y : + Labels of time-series samples (X) that must have a shape like [n_samples] or [n_samples, n_classes]. 
+ + out_dtype : + Data type of the output, should be np.ndarray or torch.Tensor + + Returns + ------- + X : + + X_ori : + + X_pred : + + y : + + """ + assert out_dtype in [ + "tensor", + "ndarray", + ], f'out_dtype should be "tensor" or "ndarray", but got {out_dtype}' + + # change the data type of X + X = turn_data_into_specified_dtype(X, out_dtype) + X = X.to(torch.float32) if out_dtype == "tensor" else X + + # check the shape of X here + X_shape = X.shape + assert len(X_shape) == 3, ( + f"input should have 3 dimensions [n_samples, seq_len, n_features]," + f"but got X: {X_shape}" + ) + if X_ori is not None: + X_ori = turn_data_into_specified_dtype(X_ori, out_dtype) + X_ori = X_ori.to(torch.float32) if out_dtype == "tensor" else X_ori + assert ( + X_shape == X_ori.shape + ), f"X and X_ori must have matched shape, but got X: f{X.shape} and X_ori: {X_ori.shape}" + + if X_pred is not None: + X_pred = turn_data_into_specified_dtype(X_pred, out_dtype) + X_pred = X_pred.to(torch.float32) if out_dtype == "tensor" else X_pred + assert len(X) == len( + X_pred + ), f"X and X_pred must have the same number of samples, but got X: f{X.shape} and X_pred: {X_pred.shape}" + + if y is not None: + assert len(X) == len(y), ( + f"lengths of X and y must match, " f"but got f{len(X)} and {len(y)}" + ) + y = turn_data_into_specified_dtype(y, out_dtype) + y = y.to(torch.long) if out_dtype == "tensor" else y + + return X, X_ori, X_pred, y + + def _fetch_data_from_array(self, idx: int) -> Iterable: + """Fetch data from self.X if it is given. + + Parameters + ---------- + idx : + The index of the sample to be return. + + Returns + ------- + sample : + The collated data sample, a list including all necessary sample info. + """ + + X = self.X[idx] + + if self.return_X_ori: + # if X_ori is given, fetch missing mask from self.missing_mask that has been created in __init__() + missing_mask = self.missing_mask[idx] + else: + X, missing_mask = fill_and_get_mask_torch(X) + + sample = [ + torch.tensor(idx), + X, + missing_mask, + ] + + if self.return_X_ori: + X_ori = self.X_ori[idx] + indicating_mask = self.indicating_mask[idx] + sample.extend([X_ori, indicating_mask]) + + if self.return_X_pred: + X_pred = self.X_pred[idx] + pred_missing_mask = self.X_pred[idx] + sample.extend([X_pred, pred_missing_mask]) + + if self.return_y: + sample.append(self.y[idx].to(torch.long)) + + return sample + + def _open_file_handle(self) -> h5py.File: + """Open the file handle for reading data from the file. + + Notes + ----- + This function can also help confirm if the given file and file type match. + + Returns + ------- + file_handle : + + """ + data_file_path = self.data + try: + file_handler = h5py.File( + data_file_path, + "r", + ) # set swmr=True if the h5 file need to be written into new content during reading + except ImportError: + raise ImportError( + "h5py is missing and cannot be imported. Please install it first." + ) + except FileNotFoundError as e: + raise FileNotFoundError(f"{e}") + except OSError as e: + raise TypeError( + f"{e}\n" + f"Check out the above error log. This probably is caused by file type error. " + f"Please confirm that the given file {data_file_path} is an h5 file." + ) + except Exception as e: + raise RuntimeError(e) + return file_handler + + def _fetch_data_from_file(self, idx: int) -> Iterable: + """Fetch data with the lazy-loading strategy, i.e. only loading data from the file while requesting for samples. 
+ Here the opened file handle doesn't load the entire dataset into RAM but only load the currently accessed slice. + + Notes + ----- + Multi workers reading from h5 file is tricky, and I was confronted with a problem similar to + https://discuss.pytorch.org/t/dataloader-when-num-worker-0-there-is-bug/25643/7 in 2020, please + refer to it for more details about the problem. + The implementation here is referred to + https://discuss.pytorch.org/t/dataloader-when-num-worker-0-there-is-bug/25643/10 + And according to https://discuss.pytorch.org/t/dataloader-when-num-worker-0-there-is-bug/25643/37, + pytorch v1.7.1 and h5py v3.2.0 work well, so probably updating to the latest version can avoid the + issue I met. After all, this implementation may need to be updated in the near future. + + Parameters + ---------- + idx : + The index of the sample to be return. + + Returns + ------- + sample : + The collated data sample, a list including all necessary sample info. + """ + + if self.file_handle is None: + self.file_handle = self._open_file_handle() + + X = torch.from_numpy(self.file_handle["X"][idx]).to(torch.float32) + X, missing_mask = fill_and_get_mask_torch(X) + sample = [ + torch.tensor(idx), + X, + missing_mask, + ] + + if self.return_X_ori: + X_ori = torch.from_numpy(self.file_handle["X_ori"][idx]).to(torch.float32) + X_ori, X_ori_missing_mask = fill_and_get_mask_torch(X_ori) + indicating_mask = (X_ori_missing_mask - missing_mask).to(torch.float32) + sample.extend([X_ori, indicating_mask]) + + if self.return_X_pred: + X_pred = torch.from_numpy(self.file_handle["X_pred"][idx]).to(torch.float32) + X_pred, X_pred_missing_mask = fill_and_get_mask_torch(X_pred) + sample.extend([X_pred, X_pred_missing_mask]) + + # if the dataset has labels and is for training, then fetch it from the file + if self.return_y: + sample.append(self.file_handle["y"][idx].to(torch.long)) + + return sample + + def __getitem__(self, idx: int) -> Iterable: + """Fetch data according to index. + + Parameters + ---------- + idx : + The index to fetch the specified sample. + + Returns + ------- + sample : + The collated data sample, a list including all necessary sample info. + """ + + sample = self.fetch_data(idx) + return sample diff --git a/pypots/data/dataset/config.py b/pypots/data/dataset/config.py new file mode 100644 index 00000000..c8ec59cb --- /dev/null +++ b/pypots/data/dataset/config.py @@ -0,0 +1,11 @@ +""" +This module contains the configuration for the dataset module. 
+""" + +# Created by Wenjie Du +# License: BSD-3-Clause + +# Currently we only support h5 files +SUPPORTED_DATASET_FILE_FORMATS = [ + "hdf5", +] diff --git a/pypots/data/generating.py b/pypots/data/generating.py index 4094489c..1330128d 100644 --- a/pypots/data/generating.py +++ b/pypots/data/generating.py @@ -272,7 +272,9 @@ def gene_random_walk( if missing_rate > 0: # create random missing values + train_X_ori = train_X train_X = mcar(train_X, missing_rate) + val_X_ori = val_X val_X = mcar(val_X, missing_rate) # test set is left to mask after normalization @@ -302,21 +304,18 @@ def gene_random_walk( } if missing_rate > 0: - # mask values in the validation set as ground truth - val_X_ori = val_X - val_X = mcar(val_X, missing_rate) - # mask values in the test set as ground truth test_X_ori = test_X - test_X = mcar(test_X, 0.3) + test_X = mcar(test_X, missing_rate) + data["train_X"] = train_X + data["train_X_ori"] = train_X_ori data["val_X"] = val_X data["val_X_ori"] = val_X_ori # test_X is for model input data["test_X"] = test_X - # test_X_ori is for error calc, not for model input, hence mustn't have NaNs - data["test_X_ori"] = np.nan_to_num(test_X_ori) + data["test_X_ori"] = test_X_ori data["test_X_indicating_mask"] = ~np.isnan(test_X_ori) ^ ~np.isnan(test_X) return data diff --git a/pypots/data/utils.py b/pypots/data/utils.py index e01f744c..a33e2d01 100644 --- a/pypots/data/utils.py +++ b/pypots/data/utils.py @@ -136,8 +136,9 @@ def parse_delta( Parameters ---------- - missing_mask : shape of [n_steps, n_features] or [n_samples, n_steps, n_features] + missing_mask : Binary masks indicate missing data (0 means missing values, 1 means observed values). + Shape of [n_steps, n_features] or [n_samples, n_steps, n_features]. Returns ------- diff --git a/pypots/forecasting/__init__.py b/pypots/forecasting/__init__.py index d54032af..66efcf67 100644 --- a/pypots/forecasting/__init__.py +++ b/pypots/forecasting/__init__.py @@ -6,7 +6,9 @@ # License: BSD-3-Clause from .bttf import BTTF +from .csdi import CSDI __all__ = [ "BTTF", + "CSDI", ] diff --git a/pypots/forecasting/base.py b/pypots/forecasting/base.py index 2cdf641d..f1ef2e8f 100644 --- a/pypots/forecasting/base.py +++ b/pypots/forecasting/base.py @@ -67,7 +67,7 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: """Train the classifier on the given data. @@ -99,7 +99,7 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: raise NotImplementedError @@ -107,7 +107,7 @@ def predict( def forecast( self, X: dict or str, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Forecast the future the input with the trained model. @@ -200,7 +200,7 @@ def __init__( ) @abstractmethod - def _assemble_input_for_training(self, data) -> dict: + def _assemble_input_for_training(self, data: list) -> dict: """Assemble the given data into a dictionary for training input. Parameters @@ -216,7 +216,7 @@ def _assemble_input_for_training(self, data) -> dict: raise NotImplementedError @abstractmethod - def _assemble_input_for_validating(self, data) -> dict: + def _assemble_input_for_validating(self, data: list) -> dict: """Assemble the given data into a dictionary for validating input. 
Parameters @@ -232,7 +232,7 @@ def _assemble_input_for_validating(self, data) -> dict: raise NotImplementedError @abstractmethod - def _assemble_input_for_testing(self, data) -> dict: + def _assemble_input_for_testing(self, data: list) -> dict: """Assemble the given data into a dictionary for testing input. Notes @@ -380,7 +380,7 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: """Train the classifier on the given data. @@ -412,7 +412,7 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: raise NotImplementedError @@ -420,7 +420,7 @@ def predict( def forecast( self, X: dict or str, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Forecast the future the input with the trained model. diff --git a/pypots/forecasting/bttf/model.py b/pypots/forecasting/bttf/model.py index 6c81b995..ab530058 100644 --- a/pypots/forecasting/bttf/model.py +++ b/pypots/forecasting/bttf/model.py @@ -97,7 +97,7 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type="h5py", + file_type: str = "hdf5", ) -> None: """Train the forecaster on the given data. @@ -112,7 +112,7 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: assert not isinstance( test_set, str @@ -140,7 +140,7 @@ def predict( def forecast( self, X: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Forecast the future the input with the trained model. diff --git a/pypots/forecasting/csdi/__init__.py b/pypots/forecasting/csdi/__init__.py new file mode 100644 index 00000000..fbaae9a5 --- /dev/null +++ b/pypots/forecasting/csdi/__init__.py @@ -0,0 +1,19 @@ +""" +The implementation of CSDI for the partially-observed time-series forecasting task. + +Refer to the paper +`Yusuke Tashiro, Jiaming Song, Yang Song, and Stefano Ermon. +CSDI: Conditional Score-based Diffusion Models for Probabilistic Time Series Imputation. +In NeurIPS, 2021. 
+`_ + +""" + +# Created by Wenjie Du +# License: BSD-3-Clause + +from .model import CSDI + +__all__ = [ + "CSDI", +] diff --git a/pypots/forecasting/csdi/core.py b/pypots/forecasting/csdi/core.py new file mode 100644 index 00000000..c66c99fd --- /dev/null +++ b/pypots/forecasting/csdi/core.py @@ -0,0 +1,141 @@ +# Created by Wenjie Du +# License: BSD-3-Clause + +import torch +import torch.nn as nn + +from ...nn.modules.csdi import BackboneCSDI + + +class _CSDI(nn.Module): + def __init__( + self, + n_features, + n_pred_features, + n_layers, + n_heads, + n_channels, + d_time_embedding, + d_feature_embedding, + d_diffusion_embedding, + is_unconditional, + n_diffusion_steps, + schedule, + beta_start, + beta_end, + ): + super().__init__() + + self.n_features = n_features + self.n_pred_features = n_pred_features + self.d_time_embedding = d_time_embedding + self.is_unconditional = is_unconditional + + self.embed_layer = nn.Embedding( + num_embeddings=n_features, + embedding_dim=d_feature_embedding, + ) + self.backbone = BackboneCSDI( + n_layers, + n_heads, + n_channels, + n_pred_features, + d_time_embedding, + d_feature_embedding, + d_diffusion_embedding, + is_unconditional, + n_diffusion_steps, + schedule, + beta_start, + beta_end, + ) + + @staticmethod + def time_embedding(pos, d_model=128): + pe = torch.zeros(pos.shape[0], pos.shape[1], d_model).to(pos.device) + position = pos.unsqueeze(2) + div_term = 1 / torch.pow( + 10000.0, torch.arange(0, d_model, 2, device=pos.device) / d_model + ) + pe[:, :, 0::2] = torch.sin(position * div_term) + pe[:, :, 1::2] = torch.cos(position * div_term) + return pe + + def get_side_info(self, observed_tp, cond_mask, feature_id): + B, K, L = cond_mask.shape + device = observed_tp.device + time_embed = self.time_embedding( + observed_tp, self.d_time_embedding + ) # (B,L,emb) + time_embed = time_embed.to(device) + time_embed = time_embed.unsqueeze(2).expand(-1, -1, self.n_pred_features, -1) + + if self.n_pred_features == self.n_features: + feature_embed = self.embed_layer( + torch.arange(self.n_pred_features).to(device) + ) # (K,emb) + feature_embed = feature_embed.unsqueeze(0).unsqueeze(0).expand(B, L, -1, -1) + else: + feature_embed = ( + self.embed_layer(feature_id).unsqueeze(1).expand(-1, L, -1, -1) + ) + + side_info = torch.cat( + [time_embed, feature_embed], dim=-1 + ) # (B,L,K,emb+d_feature_embedding) + side_info = side_info.permute(0, 3, 2, 1) # (B,*,K,L) + + if not self.is_unconditional: + side_mask = cond_mask.unsqueeze(1) # (B,1,K,L) + side_info = torch.cat([side_info, side_mask], dim=1) + + return side_info + + def forward(self, inputs, training=True, n_sampling_times=1): + results = {} + if training: # for training + (observed_data, indicating_mask, cond_mask, observed_tp, feature_id) = ( + inputs["X_ori"], + inputs["indicating_mask"], + inputs["cond_mask"], + inputs["observed_tp"], + inputs["feature_id"], + ) + side_info = self.get_side_info(observed_tp, cond_mask, feature_id) + training_loss = self.backbone.calc_loss( + observed_data, cond_mask, indicating_mask, side_info, training + ) + results["loss"] = training_loss + elif not training and n_sampling_times == 0: # for validating + (observed_data, indicating_mask, cond_mask, observed_tp, feature_id) = ( + inputs["X_ori"], + inputs["indicating_mask"], + inputs["cond_mask"], + inputs["observed_tp"], + inputs["feature_id"], + ) + side_info = self.get_side_info(observed_tp, cond_mask, feature_id) + validating_loss = self.backbone.calc_loss_valid( + observed_data, cond_mask, indicating_mask, side_info, 
training + ) + results["loss"] = validating_loss + elif not training and n_sampling_times > 0: # for testing + observed_data, cond_mask, observed_tp, feature_id = ( + inputs["X"], + inputs["cond_mask"], + inputs["observed_tp"], + inputs["feature_id"], + ) + side_info = self.get_side_info(observed_tp, cond_mask, feature_id) + samples = self.backbone( + observed_data, cond_mask, side_info, n_sampling_times + ) # (n_samples, n_sampling_times, n_features, n_steps) + repeated_obs = observed_data.unsqueeze(1).repeat(1, n_sampling_times, 1, 1) + repeated_mask = cond_mask.unsqueeze(1).repeat(1, n_sampling_times, 1, 1) + forecasting = repeated_obs + samples * (1 - repeated_mask) + + results["forecasting_data"] = forecasting.permute( + 0, 1, 3, 2 + ) # (n_samples, n_sampling_times, n_steps, n_features) + + return results diff --git a/pypots/forecasting/csdi/data.py b/pypots/forecasting/csdi/data.py new file mode 100644 index 00000000..d39bfb92 --- /dev/null +++ b/pypots/forecasting/csdi/data.py @@ -0,0 +1,365 @@ +""" + +""" + +# Created by Wenjie Du +# License: BSD-3-Clause + +from typing import Union, Iterable + +import numpy as np +import torch +from pygrinder import fill_and_get_mask_torch + +from ...data.dataset import BaseDataset + + +class DatasetForForecastingCSDI(BaseDataset): + """Dataset for CSDI forecasting model.""" + + def __init__( + self, + data: Union[dict, str], + file_type: str = "hdf5", + ): + super().__init__( + data=data, + return_X_ori=False, + return_X_pred=True, + return_y=False, + file_type=file_type, + ) + + def sample_features(self, observed_data, observed_mask, feature_id, gt_mask): + ind = np.arange(self.n_pred_features) + np.random.shuffle(ind) + + extracted_data = observed_data[:, ind[: self.n_features]] + extracted_mask = observed_mask[:, ind[: self.n_features]] + extracted_feature_id = feature_id[ind[: self.n_features]] + extracted_gt_mask = gt_mask[:, ind[: self.n_features]] + + return extracted_data, extracted_mask, extracted_feature_id, extracted_gt_mask + + def _fetch_data_from_array(self, idx: int) -> Iterable: + """Fetch data according to index. + + Parameters + ---------- + idx : + The index to fetch the specified sample. + + Returns + ------- + sample : + A list contains + + index : int tensor, + The index of the sample. + + observed_data : tensor, + Time-series data with all observed values for model input. + + indicating_mask : tensor, + The mask records all artificially missing values to the model. + + cond_mask : tensor, + The mask records all originally and artificially missing values to the model. + + observed_tp : tensor, + The time points (timestamp) of the observed data. 
+ + """ + + feature_id = torch.arange(self.n_pred_features) + observed_data = self.X[idx] + observed_data, observed_mask = fill_and_get_mask_torch(observed_data) + + # apply specifically given mask or the hist masking strategy, rather than the random masking strategy + if "for_pattern_mask" in self.data.keys(): + for_pattern_mask = torch.from_numpy(self.data["for_pattern_mask"][idx]).to( + torch.float32 + ) + else: + previous_sample = self.X[idx - 1] + for_pattern_mask = (~torch.isnan(previous_sample)).to(torch.float32) + cond_mask = observed_mask * for_pattern_mask + + indicating_mask = observed_mask - cond_mask + + if self.n_pred_features > self.n_features: + ( + observed_data, + observed_mask, + feature_id, + cond_mask, + ) = self.sample_features( + observed_data, observed_mask, feature_id, cond_mask + ) + + X_pred = self.X_pred[idx] + X_pred_missing_mask = self.X_pred_missing_mask[idx] + + observed_data = torch.concat([observed_data, X_pred], dim=0) + indicating_mask = torch.concat([indicating_mask, X_pred_missing_mask], dim=0) + cond_mask = torch.concat([cond_mask, torch.zeros(X_pred.shape)], dim=0) + observed_tp = torch.arange( + 0, self.n_steps + self.n_pred_steps, dtype=torch.float32 + ) + + sample = [ + torch.tensor(idx), + observed_data, + indicating_mask, + cond_mask, + observed_tp, + feature_id, + ] + + if self.return_y: + sample.append(self.y[idx].to(torch.long)) + + return sample + + def _fetch_data_from_file(self, idx: int) -> Iterable: + """Fetch data with the lazy-loading strategy, i.e. only loading data from the file while requesting for samples. + Here the opened file handle doesn't load the entire dataset into RAM but only load the currently accessed slice. + + Parameters + ---------- + idx : + The index of the sample to be return. + + Returns + ------- + sample : + A list contains + + index : int tensor, + The index of the sample. + + observed_data : tensor, + Time-series data with all observed values for model input. + + indicating_mask : tensor, + The mask records all artificially missing values to the model. + + cond_mask : tensor, + The mask records all originally and artificially missing values to the model. + + observed_tp : tensor, + The time points (timestamp) of the observed data. 
+ + """ + + if self.file_handle is None: + self.file_handle = self._open_file_handle() + + feature_id = torch.arange(self.n_pred_features) + observed_data = torch.from_numpy(self.file_handle["X"][idx]).to(torch.float32) + observed_data, observed_mask = fill_and_get_mask_torch(observed_data) + + # apply specifically given mask or the hist masking strategy, rather than the random masking strategy + if "for_pattern_mask" in self.file_handle.keys(): + for_pattern_mask = torch.from_numpy( + self.file_handle["for_pattern_mask"][idx] + ).to(torch.float32) + else: + previous_sample = torch.from_numpy(self.file_handle["X"][idx - 1]).to( + torch.float32 + ) + for_pattern_mask = (~torch.isnan(previous_sample)).to(torch.float32) + cond_mask = observed_mask * for_pattern_mask + + indicating_mask = observed_mask - cond_mask + + if self.n_pred_features > self.n_features: + ( + observed_data, + observed_mask, + feature_id, + cond_mask, + ) = self.sample_features( + observed_data, observed_mask, feature_id, cond_mask + ) + + X_pred = torch.from_numpy(self.file_handle["X_pred"][idx]).to(torch.float32) + X_pred, X_pred_missing_mask = fill_and_get_mask_torch(X_pred) + + observed_data = torch.concat([observed_data, X_pred], dim=0) + indicating_mask = torch.concat([indicating_mask, X_pred_missing_mask], dim=0) + cond_mask = torch.concat([cond_mask, torch.zeros(X_pred.shape)], dim=0) + observed_tp = torch.arange( + 0, self.n_steps + self.n_pred_steps, dtype=torch.float32 + ) + + sample = [ + torch.tensor(idx), + observed_data, + indicating_mask, + cond_mask, + observed_tp, + feature_id, + ] + + if self.return_y: + sample.append(torch.tensor(self.file_handle["y"][idx], dtype=torch.long)) + + return sample + + +class TestDatasetForForecastingCSDI(DatasetForForecastingCSDI): + """Test dataset for CSDI forecasting model.""" + + def __init__( + self, + data: Union[dict, str], + n_pred_steps: int, + n_pred_features: int, + file_type: str = "hdf5", + ): + super().__init__( + data=data, + file_type=file_type, + ) + self.n_pred_steps = n_pred_steps + self.n_pred_features = n_pred_features + + def _fetch_data_from_array(self, idx: int) -> Iterable: + """Fetch data according to index. + + Parameters + ---------- + idx : + The index to fetch the specified sample. + + Returns + ------- + sample : + A list contains + + index : int tensor, + The index of the sample. + + observed_data : tensor, + Time-series data with all observed values for model input. + + cond_mask : tensor, + The mask records missing values to the model. + + observed_tp : tensor, + The time points (timestamp) of the observed data. 
+ """ + + feature_id = torch.arange(self.n_pred_features) + observed_data = self.X[idx] + observed_data, observed_mask = fill_and_get_mask_torch(observed_data) + cond_mask = observed_mask + + if self.n_pred_features > self.n_features: + ( + observed_data, + observed_mask, + feature_id, + cond_mask, + ) = self.sample_features( + observed_data, observed_mask, feature_id, cond_mask + ) + + observed_data = torch.concat( + [observed_data, torch.zeros([self.n_pred_steps, self.n_pred_features])], + dim=0, + ) + + cond_mask = torch.concat( + [cond_mask, torch.zeros([self.n_pred_steps, self.n_pred_features])], dim=0 + ) + observed_tp = torch.arange( + 0, self.n_steps + self.n_pred_steps, dtype=torch.float32 + ) + + sample = [ + torch.tensor(idx), + observed_data, + cond_mask, + observed_tp, + feature_id, + ] + + if self.return_y: + sample.append(self.y[idx].to(torch.long)) + + return sample + + def _fetch_data_from_file(self, idx: int) -> Iterable: + """Fetch data with the lazy-loading strategy, i.e. only loading data from the file while requesting for samples. + Here the opened file handle doesn't load the entire dataset into RAM but only load the currently accessed slice. + + Parameters + ---------- + idx : + The index of the sample to be return. + + Returns + ------- + sample : + A list contains + + index : int tensor, + The index of the sample. + + observed_data : tensor, + Time-series data with all observed values for model input. + + cond_mask : tensor, + The mask records missing values to the model. + + observed_tp : tensor, + The time points (timestamp) of the observed data. + + """ + + if self.file_handle is None: + self.file_handle = self._open_file_handle() + + feature_id = torch.arange(self.n_pred_features) + observed_data = torch.from_numpy(self.file_handle["X"][idx]).to(torch.float32) + observed_data, observed_mask = fill_and_get_mask_torch(observed_data) + cond_mask = observed_mask + + if self.n_pred_features > self.n_features: + ( + observed_data, + observed_mask, + feature_id, + cond_mask, + ) = self.sample_features( + observed_data, observed_mask, feature_id, cond_mask + ) + + observed_data = torch.concat( + [observed_data, torch.zeros([self.n_pred_steps, self.n_pred_features])], + dim=0, + ) + + cond_mask = torch.concat( + [cond_mask, torch.zeros([self.n_pred_steps, self.n_pred_features])], dim=0 + ) + observed_tp = torch.arange( + 0, self.n_steps + self.n_pred_steps, dtype=torch.float32 + ) + + feature_id = torch.arange(self.n_pred_features) + + sample = [ + torch.tensor(idx), + observed_data, + cond_mask, + observed_tp, + feature_id, + ] + + if self.return_y: + sample.append(torch.tensor(self.file_handle["y"][idx], dtype=torch.long)) + + return sample diff --git a/pypots/forecasting/csdi/model.py b/pypots/forecasting/csdi/model.py new file mode 100644 index 00000000..68f6a412 --- /dev/null +++ b/pypots/forecasting/csdi/model.py @@ -0,0 +1,496 @@ +""" +The implementation of CSDI for the partially-observed time-series forecasting task. + +Refer to the paper "Yusuke Tashiro, Jiaming Song, Yang Song, and Stefano Ermon. +CSDI: Conditional Score-based Diffusion Models for Probabilistic Time Series Imputation. +In NeurIPS, 2021." + +Notes +----- +Partial implementation uses code from the official implementation https://github.com/ermongroup/CSDI. 
+ +""" + +# Created by Wenjie Du +# License: BSD-3-Clause + +import os +from typing import Union, Optional + +import numpy as np +import torch +from torch.utils.data import DataLoader + +try: + import nni +except ImportError: + pass + +from .core import _CSDI +from .data import DatasetForForecastingCSDI, TestDatasetForForecastingCSDI +from ..base import BaseNNForecaster +from ...data.checking import key_in_data_set +from ...optim.adam import Adam +from ...optim.base import Optimizer +from ...utils.logging import logger + + +class CSDI(BaseNNForecaster): + """The PyTorch implementation of the CSDI model :cite:`tashiro2021csdi`. + + Parameters + ---------- + n_steps : + The number of time steps in the time-series data sample. + + n_features : + The number of features in the time-series data sample. + + n_pred_steps : + The number of steps in the forecasting time series. + + n_pred_features : + The number of features in the forecasting time series. + + n_layers : + The number of layers in the CSDI model. + + n_heads : + The number of heads in the multi-head attention mechanism. + + n_channels : + The number of residual channels. + + d_time_embedding : + The dimension number of the time (temporal) embedding. + + d_feature_embedding : + The dimension number of the feature embedding. + + d_diffusion_embedding : + The dimension number of the diffusion embedding. + + is_unconditional : + Whether the model is unconditional or conditional. + + target_strategy : + The strategy for selecting the target for the diffusion process. It has to be one of ["mix", "random"]. + + n_diffusion_steps : + The number of the diffusion step T in the original paper. + + schedule: + The schedule for other noise levels. It has to be one of ["quad", "linear"]. + + beta_start: + The minimum noise level. + + beta_end: + The maximum noise level. + + batch_size : + The batch size for training and evaluating the model. + + epochs : + The number of epochs for training the model. + + patience : + The patience for the early-stopping mechanism. Given a positive integer, the training process will be + stopped when the model does not perform better after that number of epochs. + Leaving it default as None will disable the early-stopping. + + optimizer : + The optimizer for model training. + If not given, will use a default Adam optimizer. + + num_workers : + The number of subprocesses to use for data loading. + `0` means data loading will be in the main process, i.e. there won't be subprocesses. + + device : + The device for the model to run on. It can be a string, a :class:`torch.device` object, or a list of them. + If not given, will try to use CUDA devices first (will use the default CUDA device if there are multiple), + then CPUs, considering CUDA and CPU are so far the main devices for people to train ML models. + If given a list of devices, e.g. ['cuda:0', 'cuda:1'], or [torch.device('cuda:0'), torch.device('cuda:1')] , the + model will be parallely trained on the multiple devices (so far only support parallel training on CUDA devices). + Other devices like Google TPU and Apple Silicon accelerator MPS may be added in the future. + + saving_path : + The path for automatically saving model checkpoints and tensorboard files (i.e. loss values recorded during + training into a tensorboard file). Will not save if not given. + + model_saving_strategy : + The strategy to save model checkpoints. It has to be one of [None, "best", "better", "all"]. + No model will be saved when it is set as None. 
+ The "best" strategy will only automatically save the best model after the training finished. + The "better" strategy will automatically save the model during training whenever the model performs + better than in previous epochs. + The "all" strategy will save every model after each epoch training. + + """ + + def __init__( + self, + n_steps: int, + n_features: int, + n_pred_steps: int, + n_pred_features: int, + n_layers: int, + n_heads: int, + n_channels: int, + d_time_embedding: int, + d_feature_embedding: int, + d_diffusion_embedding: int, + n_diffusion_steps: int = 50, + target_strategy: str = "random", + is_unconditional: bool = False, + schedule: str = "quad", + beta_start: float = 0.0001, + beta_end: float = 0.5, + batch_size: int = 32, + epochs: int = 100, + patience: Optional[int] = None, + optimizer: Optional[Optimizer] = Adam(), + num_workers: int = 0, + device: Optional[Union[str, torch.device, list]] = None, + saving_path: Optional[str] = None, + model_saving_strategy: Optional[str] = "best", + ): + super().__init__( + batch_size, + epochs, + patience, + num_workers, + device, + saving_path, + model_saving_strategy, + ) + assert n_pred_features == n_features, ( + f"currently n_pred_features of CSDI forecasting model should be equal to n_features, " + f"but got {n_pred_features} and {n_features}." + ) + assert target_strategy in ["mix", "random"] + assert schedule in ["quad", "linear"] + self.n_steps = n_steps + self.n_features = n_features + self.n_pred_steps = n_pred_steps + self.n_pred_features = n_pred_features + self.target_strategy = target_strategy + + # set up the model + self.model = _CSDI( + n_features, + n_pred_features, + n_layers, + n_heads, + n_channels, + d_time_embedding, + d_feature_embedding, + d_diffusion_embedding, + is_unconditional, + n_diffusion_steps, + schedule, + beta_start, + beta_end, + ) + self._print_model_size() + self._send_model_to_given_device() + + # set up the optimizer + self.optimizer = optimizer + self.optimizer.init_optimizer(self.model.parameters()) + + def _assemble_input_for_training(self, data: list) -> dict: + ( + indices, + X_ori, + indicating_mask, + cond_mask, + observed_tp, + feature_id, + ) = self._send_data_to_given_device(data) + + inputs = { + "X_ori": X_ori.permute(0, 2, 1), # ori observed part for model hint + "indicating_mask": indicating_mask.permute(0, 2, 1), # for loss calc + "cond_mask": cond_mask.permute(0, 2, 1), # for masking X_ori + "observed_tp": observed_tp, + "feature_id": feature_id, + } + return inputs + + def _assemble_input_for_validating(self, data: list) -> dict: + return self._assemble_input_for_training(data) + + def _assemble_input_for_testing(self, data: list) -> dict: + ( + indices, + X, + cond_mask, + observed_tp, + feature_id, + ) = self._send_data_to_given_device(data) + + inputs = { + "X": X.permute(0, 2, 1), # for model input + "cond_mask": cond_mask.permute(0, 2, 1), # missing mask + "observed_tp": observed_tp, + "feature_id": feature_id, + } + return inputs + + def _train_model( + self, + training_loader: DataLoader, + val_loader: DataLoader = None, + ) -> None: + # each training starts from the very beginning, so reset the loss and model dict here + self.best_loss = float("inf") + self.best_model_dict = None + + try: + training_step = 0 + for epoch in range(1, self.epochs + 1): + self.model.train() + epoch_train_loss_collector = [] + for idx, data in enumerate(training_loader): + training_step += 1 + inputs = self._assemble_input_for_training(data) + self.optimizer.zero_grad() + results = 
self.model.forward(inputs) + # use sum() before backward() in case of multi-gpu training + results["loss"].sum().backward() + self.optimizer.step() + epoch_train_loss_collector.append(results["loss"].sum().item()) + + # save training loss logs into the tensorboard file for every step if in need + if self.summary_writer is not None: + self._save_log_into_tb_file(training_step, "training", results) + + # mean training loss of the current epoch + mean_train_loss = np.mean(epoch_train_loss_collector) + + if val_loader is not None: + self.model.eval() + val_loss_collector = [] + with torch.no_grad(): + for idx, data in enumerate(val_loader): + inputs = self._assemble_input_for_validating(data) + results = self.model.forward( + inputs, training=False, n_sampling_times=0 + ) + val_loss_collector.append(results["loss"].sum().item()) + + mean_val_loss = np.asarray(val_loss_collector).mean() + + # save validation loss logs into the tensorboard file for every epoch if in need + if self.summary_writer is not None: + val_loss_dict = { + "validating_loss": mean_val_loss, + } + self._save_log_into_tb_file(epoch, "validating", val_loss_dict) + + logger.info( + f"Epoch {epoch:03d} - " + f"training loss: {mean_train_loss:.4f}, " + f"validation loss: {mean_val_loss:.4f}" + ) + mean_loss = mean_val_loss + else: + logger.info( + f"Epoch {epoch:03d} - training loss: {mean_train_loss:.4f}" + ) + mean_loss = mean_train_loss + + if np.isnan(mean_loss): + logger.warning( + f"‼️ Attention: got NaN loss in Epoch {epoch}. This may lead to unexpected errors." + ) + + if mean_loss < self.best_loss: + self.best_epoch = epoch + self.best_loss = mean_loss + self.best_model_dict = self.model.state_dict() + self.patience = self.original_patience + else: + self.patience -= 1 + + # save the model if necessary + self._auto_save_model_if_necessary( + confirm_saving=mean_loss < self.best_loss, + saving_name=f"{self.__class__.__name__}_epoch{epoch}_loss{mean_loss}", + ) + + if os.getenv("enable_tuning", False): + nni.report_intermediate_result(mean_loss) + if epoch == self.epochs - 1 or self.patience == 0: + nni.report_final_result(self.best_loss) + + if self.patience == 0: + logger.info( + "Exceeded the training patience. Terminating the training procedure..." + ) + break + + except Exception as e: + logger.error(f"❌ Exception: {e}") + if self.best_model_dict is None: + raise RuntimeError( + "Training got interrupted. Model was not trained. Please investigate the error printed above." + ) + else: + RuntimeWarning( + "Training got interrupted. Please investigate the error printed above.\n" + "Model got trained and will load the best checkpoint so far for testing.\n" + "If you don't want it, please try fit() again." + ) + + if np.isnan(self.best_loss): + raise ValueError("Something is wrong. best_loss is Nan after training.") + + logger.info( + f"Finished training. The best model is from epoch#{self.best_epoch}." 
+ ) + + def fit( + self, + train_set: Union[dict, str], + val_set: Optional[Union[dict, str]] = None, + file_type: str = "hdf5", + n_sampling_times: int = 1, + ) -> None: + # Step 1: wrap the input data with classes Dataset and DataLoader + training_set = DatasetForForecastingCSDI( + train_set, + file_type=file_type, + ) + training_loader = DataLoader( + training_set, + batch_size=self.batch_size, + shuffle=True, + num_workers=self.num_workers, + ) + val_loader = None + if val_set is not None: + if not key_in_data_set("X_pred", val_set): + raise ValueError("val_set must contain 'X_pred' for model validation.") + val_set = DatasetForForecastingCSDI( + val_set, + file_type=file_type, + ) + val_loader = DataLoader( + val_set, + batch_size=self.batch_size, + shuffle=False, + num_workers=self.num_workers, + ) + + # Step 2: train the model and freeze it + self._train_model(training_loader, val_loader) + self.model.load_state_dict(self.best_model_dict) + self.model.eval() # set the model as eval status to freeze it. + + # Step 3: save the model if necessary + self._auto_save_model_if_necessary(confirm_saving=True) + + def predict( + self, + test_set: Union[dict, str], + file_type: str = "hdf5", + n_sampling_times: int = 1, + ) -> dict: + """ + + Parameters + ---------- + test_set : dict or str + The dataset for model validating, should be a dictionary including keys as 'X' and 'y', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for validating, can contain missing values, and y should be array-like of shape + [n_samples], which is classification labels of X. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include keys as 'X' and 'y'. + + file_type : + The type of the given file if test_set is a path string. + + n_sampling_times: + The number of sampling times for the model to sample from the diffusion process. + + Returns + ------- + result_dict: dict + Prediction results in a Python Dictionary for the given samples. + It should be a dictionary including a key named 'imputation'. + + """ + assert n_sampling_times > 0, "n_sampling_times should be greater than 0." + + # Step 1: wrap the input data with classes Dataset and DataLoader + self.model.eval() # set the model as eval status to freeze it. + test_set = TestDatasetForForecastingCSDI( + test_set, + self.n_pred_steps, + self.n_pred_features, + file_type=file_type, + ) + test_loader = DataLoader( + test_set, + batch_size=self.batch_size, + shuffle=False, + num_workers=self.num_workers, + ) + forecasting_collector = [] + + # Step 2: process the data with the model + with torch.no_grad(): + for idx, data in enumerate(test_loader): + inputs = self._assemble_input_for_testing(data) + results = self.model( + inputs, + training=False, + n_sampling_times=n_sampling_times, + ) + forecasting_data = results["forecasting_data"][ + :, :, -self.n_pred_steps : + ] + forecasting_collector.append(forecasting_data) + + # Step 3: output collection and return + forecasting_data = torch.cat(forecasting_collector).cpu().detach().numpy() + result_dict = { + "forecasting": forecasting_data, # [bz, n_sampling_times, n_pred_steps, n_features] + } + return result_dict + + def forecast( + self, + X: Union[dict, str], + file_type: str = "hdf5", + ) -> np.ndarray: + """Impute missing values in the given data with the trained model. 
+ + Warnings + -------- + The method impute is deprecated. Please use `predict()` instead. + + Parameters + ---------- + X : + The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps), + n_features], or a path string locating a data file, e.g. h5 file. + + file_type : + The type of the given file if X is a path string. + + Returns + ------- + array-like, shape [n_samples, sequence length (time steps), n_features], + Imputed data. + """ + logger.warning( + "🚨DeprecationWarning: The method impute is deprecated. Please use `predict` instead." + ) + results_dict = self.predict(X, file_type=file_type) + return results_dict["forecasting"] diff --git a/pypots/forecasting/template/core.py b/pypots/forecasting/template/core.py new file mode 100644 index 00000000..55cf3ada --- /dev/null +++ b/pypots/forecasting/template/core.py @@ -0,0 +1,42 @@ +""" +The implementation of YourNewModel for the partially-observed time-series forecasting task. + +Refer to the paper "Your paper citation". + +""" + +# Created by Your Name TODO: modify the author information. +# License: BSD-3-Clause + +import torch.nn as nn + +# from ...nn.modules import some_modules + + +# TODO: define your new model here. +# It could be a neural network model or a non-neural network algorithm (e.g. written in numpy). +# Your model should be implemented with PyTorch and subclass torch.nn.Module if it is a neural network. +# Note that your main algorithm is defined in this class, and this class usually won't be exposed to users. +class _YourNewModel(nn.Module): + def __init__(self): + super().__init__() + + # TODO: define your model's components here. If modules in pypots.nn.modules can be reused in your model, + # you can import them and use them here. AND if you think the modules you implemented can be reused by + # other models, you can also consider to contribute them to pypots.nn.modules + self.embedding = nn.Module + self.submodule = nn.Module + self.backbone = nn.Module + + def forward(self, inputs: dict) -> dict: + # TODO: define your model's forward propagation process here. + # The input is a dict, and the output `results` should also be a dict. + output = self.backbone() # replace this with your model's process + + # TODO: `results` must contains the key `loss` which is will be used for + # backward propagation to update the model. + loss = None + results = { + "loss": loss, + } + return results diff --git a/pypots/forecasting/template/data.py b/pypots/forecasting/template/data.py index c391740e..3c4ca97e 100644 --- a/pypots/forecasting/template/data.py +++ b/pypots/forecasting/template/data.py @@ -1,7 +1,7 @@ """ Dataset class for YourNewModel. -TODO: modify the above description with your model's information. +TODO: modify the above description for your model's dataset class. """ @@ -10,17 +10,26 @@ from typing import Union, Iterable -from ...data.base import BaseDataset +from ...data.dataset import BaseDataset +# TODO: define your new dataset class here. Remove or add arguments as needed. 
class DatasetForYourNewModel(BaseDataset): def __init__( self, data: Union[dict, str], - return_labels: bool = True, - file_type: str = "h5py", + return_X_ori: bool, + return_X_pred: bool, + return_y: bool, + file_type: str = "hdf5", ): - super().__init__(data, return_labels, file_type) + super().__init__( + data=data, + return_X_ori=return_X_ori, + return_X_pred=return_X_pred, + return_y=return_y, + file_type=file_type, + ) def _fetch_data_from_array(self, idx: int) -> Iterable: raise NotImplementedError diff --git a/pypots/forecasting/template/model.py b/pypots/forecasting/template/model.py index 099c617b..890c3fde 100644 --- a/pypots/forecasting/template/model.py +++ b/pypots/forecasting/template/model.py @@ -3,6 +3,8 @@ Refer to the paper "Your paper citation". +TODO: modify the above description with your model's information. + """ # Created by Your Name TODO: modify the author information. @@ -10,40 +12,19 @@ from typing import Union, Optional -import numpy as np import torch -import torch.nn as nn + +from .core import _YourNewModel # TODO: import the base class from the forecasting package in PyPOTS. # Here I suppose this is a neural-network forecasting model. # You should make your model inherent BaseForecaster if it is not a NN. # from ..base import BaseForecaster from ..base import BaseNNForecaster - from ...optim.adam import Adam from ...optim.base import Optimizer -# TODO: define your new model here. -# It could be a neural network model or a non-neural network algorithm (e.g. written in numpy). -# Your model should be implemented with PyTorch and subclass torch.nn.Module if it is a neural network. -# Note that your main algorithm is defined in this class, and this class usually won't be exposed to users. -class _YourNewModel(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, inputs: dict) -> dict: - # TODO: define your model's forward propagation process here. - # The input is a dict, and the output `results` should also be a dict. - # `results` must contains the key `loss` which is will be used for backward propagation to update the model. - - loss = None - results = { - "loss": loss, - } - return results - - # TODO: define your new model's wrapper here. # It should be a subclass of a base class defined in PyPOTS task packages (e.g. # BaseNNForecaster of PyPOTS forecasting task package), and it has to implement all abstract methods of the base class. 
@@ -52,13 +33,13 @@ class YourNewModel(BaseNNForecaster): def __init__( self, # TODO: add your model's hyper-parameters here - batch_size: int, - epochs: int, - patience: int, - num_workers: int = 0, + batch_size: int = 32, + epochs: int = 100, + patience: Optional[int] = None, optimizer: Optional[Optimizer] = Adam(), + num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, - saving_path: str = None, + saving_path: Optional[str] = None, model_saving_strategy: Optional[str] = "best", ): super().__init__( @@ -74,9 +55,11 @@ def __init__( # TODO: set up your model's hyper-parameters here # set up the model - self.model = _YourNewModel() - self.model = self.model.to(self.device) + self.model = _YourNewModel( + # pass the arguments to your model + ) self._print_model_size() + self._send_model_to_given_device() # set up the optimizer self.optimizer = optimizer @@ -95,13 +78,13 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: raise NotImplementedError def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: raise NotImplementedError diff --git a/pypots/forecasting/template/module.py b/pypots/forecasting/template/module.py deleted file mode 100644 index fa20e4cd..00000000 --- a/pypots/forecasting/template/module.py +++ /dev/null @@ -1,13 +0,0 @@ -""" -The implementation of the modules for YourNewModel. - -Refer to the paper "Your paper citation". - -""" - -# Created by Your Name TODO: modify the author information. -# License: BSD-3-Clause - - -# TODO: this file is not necessary. If your new model has customized layers or modules, please put them here. -# Otherwise, please delete this modules.py file, don't commit it to the repository. 
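Side note on the template refactor above: dataset classes now declare up front which tensors __getitem__() should return. The sketch below is only an illustration of how a contributor might fill in the template for a forecasting model; the class name ToyDatasetForForecasting and the chosen flag values are hypothetical, not part of PyPOTS.

from pypots.data.dataset import BaseDataset


class ToyDatasetForForecasting(BaseDataset):
    # Hypothetical example of filling in DatasetForYourNewModel for a forecaster:
    # the horizon X_pred is needed, while X_ori and labels usually are not.
    def __init__(self, data, file_type="hdf5"):
        super().__init__(
            data=data,
            return_X_ori=False,
            return_X_pred=True,
            return_y=False,
            file_type=file_type,
        )


# Inside the wrapper's fit(), such a dataset is typically wrapped in a
# torch.utils.data.DataLoader, the same way the CSDI model above does it.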
diff --git a/pypots/imputation/autoformer/data.py b/pypots/imputation/autoformer/data.py index 1b9a8d0e..15eef9b3 100644 --- a/pypots/imputation/autoformer/data.py +++ b/pypots/imputation/autoformer/data.py @@ -17,8 +17,8 @@ def __init__( self, data: Union[dict, str], return_X_ori: bool, - return_labels: bool, - file_type: str = "h5py", + return_y: bool, + file_type: str = "hdf5", rate: float = 0.2, ): - super().__init__(data, return_X_ori, return_labels, file_type, rate) + super().__init__(data, return_X_ori, return_y, file_type, rate) diff --git a/pypots/imputation/autoformer/model.py b/pypots/imputation/autoformer/model.py index bce312e0..edafd0a5 100644 --- a/pypots/imputation/autoformer/model.py +++ b/pypots/imputation/autoformer/model.py @@ -20,11 +20,11 @@ import torch from torch.utils.data import DataLoader +from .core import _Autoformer from .data import DatasetForAutoformer -from pypots.imputation.autoformer.core import _Autoformer from ..base import BaseNNImputer -from ...data.base import BaseDataset -from ...data.checking import check_X_ori_in_val_set +from ...data.checking import key_in_data_set +from ...data.dataset import BaseDataset from ...optim.adam import Adam from ...optim.base import Optimizer from ...utils.logging import logger @@ -211,11 +211,11 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: # Step 1: wrap the input data with classes Dataset and DataLoader training_set = DatasetForAutoformer( - train_set, return_X_ori=False, return_labels=False, file_type=file_type + train_set, return_X_ori=False, return_y=False, file_type=file_type ) training_loader = DataLoader( training_set, @@ -225,10 +225,10 @@ def fit( ) val_loader = None if val_set is not None: - if not check_X_ori_in_val_set(val_set): + if not key_in_data_set("X_ori", val_set): raise ValueError("val_set must contain 'X_ori' for model validation.") val_set = DatasetForAutoformer( - val_set, return_X_ori=True, return_labels=False, file_type=file_type + val_set, return_X_ori=True, return_y=False, file_type=file_type ) val_loader = DataLoader( val_set, @@ -248,7 +248,7 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: """Make predictions for the input data with the trained model. @@ -263,19 +263,23 @@ def predict( If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include keys as 'X' and 'y'. - file_type : str + file_type : The type of the given file if test_set is a path string. Returns ------- - result_dict : dict, + file_type : The dictionary containing the clustering results and latent variables if necessary. """ # Step 1: wrap the input data with classes Dataset and DataLoader self.model.eval() # set the model as eval status to freeze it. test_set = BaseDataset( - test_set, return_X_ori=False, return_labels=False, file_type=file_type + test_set, + return_X_ori=False, + return_X_pred=False, + return_y=False, + file_type=file_type, ) test_loader = DataLoader( test_set, @@ -302,7 +306,7 @@ def predict( def impute( self, X: Union[dict, str], - file_type="h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Impute missing values in the given data with the trained model. 
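The key_in_data_set() helper that replaces check_X_ori_in_val_set() in the hunk above works on both dictionary datasets and paths to HDF5 files. A small self-contained illustration (array shapes and the file name are arbitrary toy values):

import h5py
import numpy as np

from pypots.data.checking import key_in_data_set

# dict-style dataset: the helper simply inspects the dictionary keys
val_set = {
    "X": np.random.randn(8, 24, 5),
    "X_ori": np.random.randn(8, 24, 5),
}
assert key_in_data_set("X_ori", val_set)       # present, so validation can proceed
assert not key_in_data_set("X_pred", val_set)  # an absent key just returns False

# file-style dataset: the same call opens the HDF5 file and checks its keys
with h5py.File("toy_val_set.h5", "w") as f:
    f.create_dataset("X", data=val_set["X"])
    f.create_dataset("X_ori", data=val_set["X_ori"])
assert key_in_data_set("X_ori", "toy_val_set.h5")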
diff --git a/pypots/imputation/base.py b/pypots/imputation/base.py index 284d1af2..18218c3e 100644 --- a/pypots/imputation/base.py +++ b/pypots/imputation/base.py @@ -68,7 +68,7 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: """Train the imputer on the given data. @@ -90,7 +90,7 @@ def fit( If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include the key 'X'. - file_type : str, default = "h5py", + file_type : The type of the given file if train_set and val_set are path strings. """ @@ -100,7 +100,7 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: raise NotImplementedError @@ -108,7 +108,7 @@ def predict( def impute( self, X: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Impute missing values in the given data with the trained model. @@ -207,7 +207,7 @@ def _assemble_input_for_training(self, data: list) -> dict: Parameters ---------- - data : list, + data : Input data from dataloader, should be list. Returns @@ -223,7 +223,7 @@ def _assemble_input_for_validating(self, data: list) -> dict: Parameters ---------- - data : list, + data : Data output from dataloader, should be list. Returns @@ -247,7 +247,7 @@ def _assemble_input_for_testing(self, data: list) -> dict: Parameters ---------- - data : list, + data : Data output from dataloader, should be list. Returns @@ -383,7 +383,7 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: """Train the imputer on the given data. @@ -405,7 +405,7 @@ def fit( If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include the key 'X'. - file_type : str, default = "h5py", + file_type : The type of the given file if train_set and val_set are path strings. """ @@ -415,7 +415,7 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: raise NotImplementedError @@ -423,7 +423,7 @@ def predict( def impute( self, X: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Impute missing values in the given data with the trained model. diff --git a/pypots/imputation/brits/data.py b/pypots/imputation/brits/data.py index 3144d8c1..589a5d5f 100644 --- a/pypots/imputation/brits/data.py +++ b/pypots/imputation/brits/data.py @@ -10,7 +10,7 @@ import torch from pygrinder import fill_and_get_mask_torch -from ...data.base import BaseDataset +from ...data.dataset import BaseDataset from ...data.utils import _parse_delta_torch @@ -19,7 +19,7 @@ class DatasetForBRITS(BaseDataset): Parameters ---------- - data : dict or str, + data : The dataset for model input, should be a dictionary including keys as 'X' and 'y', or a path string locating a data file. If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], @@ -28,7 +28,7 @@ class DatasetForBRITS(BaseDataset): If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include keys as 'X' and 'y'. 
- return_labels : bool, default = True, + return_y : Whether to return labels in function __getitem__() if they exist in the given data. If `True`, for example, during training of classification models, the Dataset class will return labels in __getitem__() for model input. Otherwise, labels won't be included in the data returned by __getitem__(). This parameter exists because we @@ -37,7 +37,7 @@ class DatasetForBRITS(BaseDataset): with function _fetch_data_from_file(), which works for all three stages. Therefore, we need this parameter for distinction. - file_type : str, default = "h5py" + file_type : The type of the given file if train_set and val_set are path strings. """ @@ -45,18 +45,24 @@ def __init__( self, data: Union[dict, str], return_X_ori: bool, - return_labels: bool, - file_type: str = "h5py", + return_y: bool, + file_type: str = "hdf5", ): - super().__init__(data, return_X_ori, return_labels, file_type) + super().__init__( + data=data, + return_X_ori=return_X_ori, + return_X_pred=False, + return_y=return_y, + file_type=file_type, + ) if not isinstance(self.data, str): # calculate all delta here. - if self.X_ori is None: - forward_X, forward_missing_mask = fill_and_get_mask_torch(self.X) - else: + if self.return_X_ori: forward_missing_mask = self.missing_mask forward_X = self.X + else: + forward_X, forward_missing_mask = fill_and_get_mask_torch(self.X) forward_delta = _parse_delta_torch(forward_missing_mask) backward_X = torch.flip(forward_X, dims=[1]) @@ -81,12 +87,12 @@ def _fetch_data_from_array(self, idx: int) -> Iterable: Parameters ---------- - idx : int, + idx : The index of the sample to be return. Returns ------- - sample : list, + sample : A list contains index : int tensor, @@ -116,10 +122,10 @@ def _fetch_data_from_array(self, idx: int) -> Iterable: self.processed_data["backward"]["delta"][idx], ] - if self.X_ori is not None and self.return_X_ori: + if self.return_X_ori: sample.extend([self.X_ori[idx], self.indicating_mask[idx]]) - if self.y is not None and self.return_labels: + if self.return_y: sample.append(self.y[idx].to(torch.long)) return sample @@ -130,12 +136,12 @@ def _fetch_data_from_file(self, idx: int) -> Iterable: Parameters ---------- - idx : int, + idx : The index of the sample to be return. Returns ------- - sample : list, + sample : The collated data sample, a list including all necessary sample info. 
""" @@ -169,14 +175,14 @@ def _fetch_data_from_file(self, idx: int) -> Iterable: backward["deltas"], ] - if "X_ori" in self.file_handle.keys() and self.return_X_ori: + if self.return_X_ori: X_ori = torch.from_numpy(self.file_handle["X_ori"][idx]).to(torch.float32) X_ori, X_ori_missing_mask = fill_and_get_mask_torch(X_ori) indicating_mask = X_ori_missing_mask - missing_mask sample.extend([X_ori, indicating_mask]) # if the dataset has labels and is for training, then fetch it from the file - if "y" in self.file_handle.keys() and self.return_labels: + if self.return_y: sample.append(torch.tensor(self.file_handle["y"][idx], dtype=torch.long)) return sample diff --git a/pypots/imputation/brits/model.py b/pypots/imputation/brits/model.py index b7dc6edd..68d71355 100644 --- a/pypots/imputation/brits/model.py +++ b/pypots/imputation/brits/model.py @@ -24,7 +24,7 @@ from .core import _BRITS from .data import DatasetForBRITS from ..base import BaseNNImputer -from ...data.checking import check_X_ori_in_val_set +from ...data.checking import key_in_data_set from ...optim.adam import Adam from ...optim.base import Optimizer from ...utils.logging import logger @@ -194,11 +194,11 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: # Step 1: wrap the input data with classes Dataset and DataLoader training_set = DatasetForBRITS( - train_set, return_X_ori=False, return_labels=False, file_type=file_type + train_set, return_X_ori=False, return_y=False, file_type=file_type ) training_loader = DataLoader( training_set, @@ -208,10 +208,10 @@ def fit( ) val_loader = None if val_set is not None: - if not check_X_ori_in_val_set(val_set): + if not key_in_data_set("X_ori", val_set): raise ValueError("val_set must contain 'X_ori' for model validation.") val_set = DatasetForBRITS( - val_set, return_X_ori=True, return_labels=False, file_type=file_type + val_set, return_X_ori=True, return_y=False, file_type=file_type ) val_loader = DataLoader( val_set, @@ -231,11 +231,11 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: self.model.eval() # set the model as eval status to freeze it. test_set = DatasetForBRITS( - test_set, return_X_ori=False, return_labels=False, file_type=file_type + test_set, return_X_ori=False, return_y=False, file_type=file_type ) test_loader = DataLoader( test_set, @@ -261,7 +261,7 @@ def predict( def impute( self, X: Union[dict, str], - file_type="h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Impute missing values in the given data with the trained model. 
diff --git a/pypots/imputation/crossformer/data.py b/pypots/imputation/crossformer/data.py index 056486f8..6bbc771d 100644 --- a/pypots/imputation/crossformer/data.py +++ b/pypots/imputation/crossformer/data.py @@ -17,8 +17,8 @@ def __init__( self, data: Union[dict, str], return_X_ori: bool, - return_labels: bool, - file_type: str = "h5py", + return_y: bool, + file_type: str = "hdf5", rate: float = 0.2, ): - super().__init__(data, return_X_ori, return_labels, file_type, rate) + super().__init__(data, return_X_ori, return_y, file_type, rate) diff --git a/pypots/imputation/crossformer/model.py b/pypots/imputation/crossformer/model.py index e79a957d..a2076d95 100644 --- a/pypots/imputation/crossformer/model.py +++ b/pypots/imputation/crossformer/model.py @@ -21,11 +21,11 @@ import torch from torch.utils.data import DataLoader +from .core import _Crossformer from .data import DatasetForCrossformer -from pypots.imputation.crossformer.core import _Crossformer from ..base import BaseNNImputer -from ...data.base import BaseDataset -from ...data.checking import check_X_ori_in_val_set +from ...data.checking import key_in_data_set +from ...data.dataset import BaseDataset from ...optim.adam import Adam from ...optim.base import Optimizer from ...utils.logging import logger @@ -218,11 +218,11 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: # Step 1: wrap the input data with classes Dataset and DataLoader training_set = DatasetForCrossformer( - train_set, return_X_ori=False, return_labels=False, file_type=file_type + train_set, return_X_ori=False, return_y=False, file_type=file_type ) training_loader = DataLoader( training_set, @@ -232,10 +232,10 @@ ) val_loader = None if val_set is not None: - if not check_X_ori_in_val_set(val_set): + if not key_in_data_set("X_ori", val_set): raise ValueError("val_set must contain 'X_ori' for model validation.") val_set = DatasetForCrossformer( - val_set, return_X_ori=True, return_labels=False, file_type=file_type + val_set, return_X_ori=True, return_y=False, file_type=file_type ) val_loader = DataLoader( val_set, @@ -255,7 +255,7 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: """Make predictions for the input data with the trained model. @@ -270,19 +270,23 @@ If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include keys as 'X' and 'y'. - file_type : str + file_type : The type of the given file if test_set is a path string. Returns ------- - result_dict : dict, + result_dict : The dictionary containing the clustering results and latent variables if necessary. """ # Step 1: wrap the input data with classes Dataset and DataLoader self.model.eval() # set the model as eval status to freeze it. test_set = BaseDataset( - test_set, return_X_ori=False, return_labels=False, file_type=file_type + test_set, + return_X_ori=False, + return_X_pred=False, + return_y=False, + file_type=file_type, ) test_loader = DataLoader( test_set, @@ -309,7 +313,7 @@ def predict( def impute( self, X: Union[dict, str], - file_type="h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Impute missing values in the given data with the trained model.
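The CSDI changes below move side-information construction (sinusoidal time embedding, learned feature embedding, optional conditional-mask channel) out of the backbone and into the _CSDI wrapper. As a quick shape sanity check of how those pieces compose into side_info (sizes here are illustrative, not CSDI defaults):

import torch

B, K, L = 2, 5, 10                               # batch size, n_features, n_steps (hypothetical)
d_time, d_feature = 128, 16                      # embedding sizes (hypothetical)
time_embed = torch.randn(B, L, K, d_time)        # time embedding expanded over features
feature_embed = torch.randn(B, L, K, d_feature)  # nn.Embedding output expanded over batch and steps
side_info = torch.cat([time_embed, feature_embed], dim=-1).permute(0, 3, 2, 1)  # (B, d_time+d_feature, K, L)
cond_mask = torch.ones(B, K, L)
side_info = torch.cat([side_info, cond_mask.unsqueeze(1)], dim=1)  # the conditional model appends one mask channel
assert side_info.shape == (B, d_time + d_feature + 1, K, L)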
diff --git a/pypots/imputation/csdi/core.py b/pypots/imputation/csdi/core.py index cfa975dc..fd3cdd84 100644 --- a/pypots/imputation/csdi/core.py +++ b/pypots/imputation/csdi/core.py @@ -1,6 +1,7 @@ # Created by Wenjie Du # License: BSD-3-Clause +import torch import torch.nn as nn from ...nn.modules.csdi import BackboneCSDI @@ -9,10 +10,10 @@ class _CSDI(nn.Module): def __init__( self, + n_features, n_layers, n_heads, n_channels, - d_target, d_time_embedding, d_feature_embedding, d_diffusion_embedding, @@ -24,11 +25,19 @@ def __init__( ): super().__init__() + self.n_features = n_features + self.d_time_embedding = d_time_embedding + self.is_unconditional = is_unconditional + + self.embed_layer = nn.Embedding( + num_embeddings=n_features, + embedding_dim=d_feature_embedding, + ) self.backbone = BackboneCSDI( n_layers, n_heads, n_channels, - d_target, + n_features, d_time_embedding, d_feature_embedding, d_diffusion_embedding, @@ -39,6 +48,41 @@ def __init__( beta_end, ) + @staticmethod + def time_embedding(pos, d_model=128): + pe = torch.zeros(pos.shape[0], pos.shape[1], d_model).to(pos.device) + position = pos.unsqueeze(2) + div_term = 1 / torch.pow( + 10000.0, torch.arange(0, d_model, 2, device=pos.device) / d_model + ) + pe[:, :, 0::2] = torch.sin(position * div_term) + pe[:, :, 1::2] = torch.cos(position * div_term) + return pe + + def get_side_info(self, observed_tp, cond_mask): + B, K, L = cond_mask.shape + device = observed_tp.device + time_embed = self.time_embedding( + observed_tp, self.d_time_embedding + ) # (B,L,emb) + time_embed = time_embed.to(device) + time_embed = time_embed.unsqueeze(2).expand(-1, -1, K, -1) + feature_embed = self.embed_layer( + torch.arange(self.n_features).to(device) + ) # (K,emb) + feature_embed = feature_embed.unsqueeze(0).unsqueeze(0).expand(B, L, -1, -1) + + side_info = torch.cat( + [time_embed, feature_embed], dim=-1 + ) # (B,L,K,emb+d_feature_embedding) + side_info = side_info.permute(0, 3, 2, 1) # (B,*,K,L) + + if not self.is_unconditional: + side_mask = cond_mask.unsqueeze(1) # (B,1,K,L) + side_info = torch.cat([side_info, side_mask], dim=1) + + return side_info + def forward(self, inputs, training=True, n_sampling_times=1): results = {} if training: # for training @@ -48,7 +92,7 @@ def forward(self, inputs, training=True, n_sampling_times=1): inputs["cond_mask"], inputs["observed_tp"], ) - side_info = self.backbone.get_side_info(observed_tp, cond_mask) + side_info = self.get_side_info(observed_tp, cond_mask) training_loss = self.backbone.calc_loss( observed_data, cond_mask, indicating_mask, side_info, training ) @@ -60,7 +104,7 @@ def forward(self, inputs, training=True, n_sampling_times=1): inputs["cond_mask"], inputs["observed_tp"], ) - side_info = self.backbone.get_side_info(observed_tp, cond_mask) + side_info = self.get_side_info(observed_tp, cond_mask) validating_loss = self.backbone.calc_loss_valid( observed_data, cond_mask, indicating_mask, side_info, training ) @@ -71,7 +115,7 @@ def forward(self, inputs, training=True, n_sampling_times=1): inputs["cond_mask"], inputs["observed_tp"], ) - side_info = self.backbone.get_side_info(observed_tp, cond_mask) + side_info = self.get_side_info(observed_tp, cond_mask) samples = self.backbone( observed_data, cond_mask, side_info, n_sampling_times ) # (n_samples, n_sampling_times, n_features, n_steps) diff --git a/pypots/imputation/csdi/data.py b/pypots/imputation/csdi/data.py index 8617e6bc..03d07923 100644 --- a/pypots/imputation/csdi/data.py +++ b/pypots/imputation/csdi/data.py @@ -11,21 +11,36 @@ 
import torch from pygrinder import fill_and_get_mask_torch -from ...data.base import BaseDataset +from ...data.dataset import BaseDataset class DatasetForCSDI(BaseDataset): - """Dataset for CSDI model.""" + """Dataset for CSDI model. + + Notes + ----- + In CSDI official code, `observed_mask` indicates all observed values in raw data. + `gt_mask` indicates all observed values in the input data. + `observed_mask` - `gt_mask` = `indicating_mask` in our code. + `cond_mask`, for testing, it is `gt_mask`; for training, it is `observed_mask` + includes some artificially missing values. + + """ def __init__( self, data: Union[dict, str], target_strategy: str, return_X_ori: bool, - return_labels: bool, - file_type: str = "h5py", + file_type: str = "hdf5", ): - super().__init__(data, return_X_ori, return_labels, file_type) + super().__init__( + data=data, + return_X_ori=return_X_ori, + return_X_pred=False, + return_y=False, + file_type=file_type, + ) assert target_strategy in ["random", "hist", "mix"] self.target_strategy = target_strategy @@ -55,12 +70,12 @@ def _fetch_data_from_array(self, idx: int) -> Iterable: Parameters ---------- - idx : int, + idx : The index to fetch the specified sample. Returns ------- - sample : list, + sample : A list contains index : int tensor, @@ -80,7 +95,7 @@ def _fetch_data_from_array(self, idx: int) -> Iterable: """ - if self.X_ori is not None and self.return_X_ori: + if self.return_X_ori: observed_data = self.X_ori[idx] cond_mask = self.missing_mask[idx] indicating_mask = self.indicating_mask[idx] @@ -117,7 +132,7 @@ def _fetch_data_from_array(self, idx: int) -> Iterable: observed_tp, ] - if self.y is not None and self.return_labels: + if self.return_y: sample.append(self.y[idx].to(torch.long)) return sample @@ -128,12 +143,12 @@ def _fetch_data_from_file(self, idx: int) -> Iterable: Parameters ---------- - idx : int, + idx : The index of the sample to be return. Returns ------- - sample : list, + sample : A list contains index : int tensor, @@ -156,7 +171,7 @@ def _fetch_data_from_file(self, idx: int) -> Iterable: if self.file_handle is None: self.file_handle = self._open_file_handle() - if "X_ori" in self.file_handle.keys() and self.return_X_ori: + if self.return_X_ori: observed_data = torch.from_numpy(self.file_handle["X_ori"][idx]).to( torch.float32 ) @@ -203,7 +218,7 @@ def _fetch_data_from_file(self, idx: int) -> Iterable: observed_tp, ] - if "y" in self.file_handle.keys() and self.return_labels: + if self.return_y: sample.append(torch.tensor(self.file_handle["y"][idx], dtype=torch.long)) return sample @@ -216,22 +231,21 @@ def __init__( self, data: Union[dict, str], return_X_ori: bool, - return_labels: bool, - file_type: str = "h5py", + file_type: str = "hdf5", ): - super().__init__(data, "random", return_X_ori, return_labels, file_type) + super().__init__(data, "random", return_X_ori, file_type) def _fetch_data_from_array(self, idx: int) -> Iterable: """Fetch data according to index. Parameters ---------- - idx : int, + idx : The index to fetch the specified sample. Returns ------- - sample : list, + sample : A list contains index : int tensor, @@ -264,7 +278,7 @@ def _fetch_data_from_array(self, idx: int) -> Iterable: observed_tp, ] - if self.y is not None and self.return_labels: + if self.return_y: sample.append(self.y[idx].to(torch.long)) return sample @@ -275,12 +289,12 @@ def _fetch_data_from_file(self, idx: int) -> Iterable: Parameters ---------- - idx : int, + idx : The index of the sample to be return. 
Returns ------- - sample : list, + sample : A list contains index : int tensor, @@ -319,7 +333,7 @@ def _fetch_data_from_file(self, idx: int) -> Iterable: observed_tp, ] - if "y" in self.file_handle.keys() and self.return_labels: + if self.return_y: sample.append(torch.tensor(self.file_handle["y"][idx], dtype=torch.long)) return sample diff --git a/pypots/imputation/csdi/model.py b/pypots/imputation/csdi/model.py index e43f6db4..bcfdc29d 100644 --- a/pypots/imputation/csdi/model.py +++ b/pypots/imputation/csdi/model.py @@ -29,7 +29,7 @@ from .core import _CSDI from .data import DatasetForCSDI, TestDatasetForCSDI from ..base import BaseNNImputer -from ...data.checking import check_X_ori_in_val_set +from ...data.checking import key_in_data_set from ...optim.adam import Adam from ...optim.base import Optimizer from ...utils.logging import logger @@ -40,6 +40,9 @@ class CSDI(BaseNNImputer): Parameters ---------- + n_steps : + The number of time steps in the time-series data sample. + n_features : The number of features in the time-series data sample. @@ -122,6 +125,7 @@ class CSDI(BaseNNImputer): def __init__( self, + n_steps: int, n_features: int, n_layers: int, n_heads: int, @@ -155,14 +159,15 @@ def __init__( ) assert target_strategy in ["mix", "random"] assert schedule in ["quad", "linear"] + self.n_steps = n_steps self.target_strategy = target_strategy # set up the model self.model = _CSDI( + n_features, n_layers, n_heads, n_channels, - n_features, d_time_embedding, d_feature_embedding, d_diffusion_embedding, @@ -196,10 +201,10 @@ def _assemble_input_for_training(self, data: list) -> dict: } return inputs - def _assemble_input_for_validating(self, data) -> dict: + def _assemble_input_for_validating(self, data: list) -> dict: return self._assemble_input_for_training(data) - def _assemble_input_for_testing(self, data) -> dict: + def _assemble_input_for_testing(self, data: list) -> dict: ( indices, X, @@ -331,7 +336,7 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", n_sampling_times: int = 1, ) -> None: # Step 1: wrap the input data with classes Dataset and DataLoader @@ -339,7 +344,6 @@ def fit( train_set, self.target_strategy, return_X_ori=False, - return_labels=False, file_type=file_type, ) training_loader = DataLoader( @@ -350,13 +354,12 @@ def fit( ) val_loader = None if val_set is not None: - if not check_X_ori_in_val_set(val_set): + if not key_in_data_set("X_ori", val_set): raise ValueError("val_set must contain 'X_ori' for model validation.") val_set = DatasetForCSDI( val_set, self.target_strategy, return_X_ori=True, - return_labels=False, file_type=file_type, ) val_loader = DataLoader( @@ -377,7 +380,7 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", n_sampling_times: int = 1, ) -> dict: """ @@ -393,7 +396,7 @@ def predict( If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include keys as 'X' and 'y'. - file_type : str + file_type : The type of the given file if test_set is a path string. n_sampling_times: @@ -410,9 +413,7 @@ def predict( # Step 1: wrap the input data with classes Dataset and DataLoader self.model.eval() # set the model as eval status to freeze it. 
- test_set = TestDatasetForCSDI( - test_set, return_X_ori=False, return_labels=False, file_type=file_type - ) + test_set = TestDatasetForCSDI(test_set, return_X_ori=False, file_type=file_type) test_loader = DataLoader( test_set, batch_size=self.batch_size, @@ -443,7 +444,7 @@ def predict( def impute( self, X: Union[dict, str], - file_type="h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Impute missing values in the given data with the trained model. diff --git a/pypots/imputation/dlinear/data.py b/pypots/imputation/dlinear/data.py index 1884054f..b47cb439 100644 --- a/pypots/imputation/dlinear/data.py +++ b/pypots/imputation/dlinear/data.py @@ -17,8 +17,8 @@ def __init__( self, data: Union[dict, str], return_X_ori: bool, - return_labels: bool, - file_type: str = "h5py", + return_y: bool, + file_type: str = "hdf5", rate: float = 0.2, ): - super().__init__(data, return_X_ori, return_labels, file_type, rate) + super().__init__(data, return_X_ori, return_y, file_type, rate) diff --git a/pypots/imputation/dlinear/model.py b/pypots/imputation/dlinear/model.py index 3dc95445..af7ba286 100644 --- a/pypots/imputation/dlinear/model.py +++ b/pypots/imputation/dlinear/model.py @@ -21,11 +21,11 @@ import torch from torch.utils.data import DataLoader +from .core import _DLinear from .data import DatasetForDLinear -from pypots.imputation.dlinear.core import _DLinear from ..base import BaseNNImputer -from ...data.base import BaseDataset -from ...data.checking import check_X_ori_in_val_set +from ...data.checking import key_in_data_set +from ...data.dataset import BaseDataset from ...optim.adam import Adam from ...optim.base import Optimizer from ...utils.logging import logger @@ -189,11 +189,11 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: # Step 1: wrap the input data with classes Dataset and DataLoader training_set = DatasetForDLinear( - train_set, return_X_ori=False, return_labels=False, file_type=file_type + train_set, return_X_ori=False, return_y=False, file_type=file_type ) training_loader = DataLoader( training_set, @@ -203,10 +203,10 @@ ) val_loader = None if val_set is not None: - if not check_X_ori_in_val_set(val_set): + if not key_in_data_set("X_ori", val_set): raise ValueError("val_set must contain 'X_ori' for model validation.") val_set = DatasetForDLinear( - val_set, return_X_ori=True, return_labels=False, file_type=file_type + val_set, return_X_ori=True, return_y=False, file_type=file_type ) val_loader = DataLoader( val_set, @@ -226,7 +226,7 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: """Make predictions for the input data with the trained model. @@ -241,19 +241,23 @@ If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include keys as 'X' and 'y'. - file_type : str + file_type : The type of the given file if test_set is a path string. Returns ------- - result_dict : dict, + result_dict : The dictionary containing the clustering results and latent variables if necessary. """ # Step 1: wrap the input data with classes Dataset and DataLoader self.model.eval() # set the model as eval status to freeze it.
test_set = BaseDataset( - test_set, return_X_ori=False, return_labels=False, file_type=file_type + test_set, + return_X_ori=False, + return_X_pred=False, + return_y=False, + file_type=file_type, ) test_loader = DataLoader( test_set, @@ -280,7 +284,7 @@ def predict( def impute( self, X: Union[dict, str], - file_type="h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Impute missing values in the given data with the trained model. diff --git a/pypots/imputation/etsformer/data.py b/pypots/imputation/etsformer/data.py index f03a4e61..19503a4d 100644 --- a/pypots/imputation/etsformer/data.py +++ b/pypots/imputation/etsformer/data.py @@ -17,8 +17,8 @@ def __init__( self, data: Union[dict, str], return_X_ori: bool, - return_labels: bool, - file_type: str = "h5py", + return_y: bool, + file_type: str = "hdf5", rate: float = 0.2, ): - super().__init__(data, return_X_ori, return_labels, file_type, rate) + super().__init__(data, return_X_ori, return_y, file_type, rate) diff --git a/pypots/imputation/etsformer/model.py b/pypots/imputation/etsformer/model.py index 11a6e19f..94a253e1 100644 --- a/pypots/imputation/etsformer/model.py +++ b/pypots/imputation/etsformer/model.py @@ -20,11 +20,11 @@ import torch from torch.utils.data import DataLoader +from .core import _ETSformer from .data import DatasetForETSformer -from pypots.imputation.etsformer.core import _ETSformer from ..base import BaseNNImputer -from ...data.base import BaseDataset -from ...data.checking import check_X_ori_in_val_set +from ...data.checking import key_in_data_set +from ...data.dataset import BaseDataset from ...optim.adam import Adam from ...optim.base import Optimizer from ...utils.logging import logger @@ -211,11 +211,11 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: # Step 1: wrap the input data with classes Dataset and DataLoader training_set = DatasetForETSformer( - train_set, return_X_ori=False, return_labels=False, file_type=file_type + train_set, return_X_ori=False, return_y=False, file_type=file_type ) training_loader = DataLoader( training_set, @@ -225,10 +225,10 @@ ) val_loader = None if val_set is not None: - if not check_X_ori_in_val_set(val_set): + if not key_in_data_set("X_ori", val_set): raise ValueError("val_set must contain 'X_ori' for model validation.") val_set = DatasetForETSformer( - val_set, return_X_ori=True, return_labels=False, file_type=file_type + val_set, return_X_ori=True, return_y=False, file_type=file_type ) val_loader = DataLoader( val_set, @@ -248,7 +248,7 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: """Make predictions for the input data with the trained model. @@ -263,19 +263,23 @@ If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include keys as 'X' and 'y'. - file_type : str + file_type : The type of the given file if test_set is a path string. Returns ------- - result_dict : dict, + result_dict : The dictionary containing the clustering results and latent variables if necessary. """ # Step 1: wrap the input data with classes Dataset and DataLoader self.model.eval() # set the model as eval status to freeze it.
test_set = BaseDataset( - test_set, return_X_ori=False, return_labels=False, file_type=file_type + test_set, + return_X_ori=False, + return_X_pred=False, + return_y=False, + file_type=file_type, ) test_loader = DataLoader( test_set, @@ -302,7 +306,7 @@ def predict( def impute( self, X: Union[dict, str], - file_type="h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Impute missing values in the given data with the trained model. diff --git a/pypots/imputation/fedformer/data.py b/pypots/imputation/fedformer/data.py index a5982636..f8d79217 100644 --- a/pypots/imputation/fedformer/data.py +++ b/pypots/imputation/fedformer/data.py @@ -17,8 +17,8 @@ def __init__( self, data: Union[dict, str], return_X_ori: bool, - return_labels: bool, - file_type: str = "h5py", + return_y: bool, + file_type: str = "hdf5", rate: float = 0.2, ): - super().__init__(data, return_X_ori, return_labels, file_type, rate) + super().__init__(data, return_X_ori, return_y, file_type, rate) diff --git a/pypots/imputation/fedformer/model.py b/pypots/imputation/fedformer/model.py index d6f9746a..dfda3740 100644 --- a/pypots/imputation/fedformer/model.py +++ b/pypots/imputation/fedformer/model.py @@ -23,8 +23,8 @@ from .core import _FEDformer from .data import DatasetForFEDformer from ..base import BaseNNImputer -from ...data.base import BaseDataset -from ...data.checking import check_X_ori_in_val_set +from ...data.checking import key_in_data_set +from ...data.dataset import BaseDataset from ...optim.adam import Adam from ...optim.base import Optimizer from ...utils.logging import logger @@ -225,11 +225,11 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: # Step 1: wrap the input data with classes Dataset and DataLoader training_set = DatasetForFEDformer( - train_set, return_X_ori=False, return_labels=False, file_type=file_type + train_set, return_X_ori=False, return_y=False, file_type=file_type ) training_loader = DataLoader( training_set, @@ -239,10 +239,10 @@ ) val_loader = None if val_set is not None: - if not check_X_ori_in_val_set(val_set): + if not key_in_data_set("X_ori", val_set): raise ValueError("val_set must contain 'X_ori' for model validation.") val_set = DatasetForFEDformer( - val_set, return_X_ori=True, return_labels=False, file_type=file_type + val_set, return_X_ori=True, return_y=False, file_type=file_type ) val_loader = DataLoader( val_set, @@ -262,7 +262,7 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: """Make predictions for the input data with the trained model. @@ -277,19 +277,23 @@ If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include keys as 'X' and 'y'. - file_type : str + file_type : The type of the given file if test_set is a path string. Returns ------- - result_dict : dict, + result_dict : The dictionary containing the clustering results and latent variables if necessary. """ # Step 1: wrap the input data with classes Dataset and DataLoader self.model.eval() # set the model as eval status to freeze it.
test_set = BaseDataset( - test_set, return_X_ori=False, return_labels=False, file_type=file_type + test_set, + return_X_ori=False, + return_X_pred=False, + return_y=False, + file_type=file_type, ) test_loader = DataLoader( test_set, @@ -316,7 +320,7 @@ def predict( def impute( self, X: Union[dict, str], - file_type="h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Impute missing values in the given data with the trained model. diff --git a/pypots/imputation/gpvae/data.py b/pypots/imputation/gpvae/data.py index 24b5739b..27b3b456 100644 --- a/pypots/imputation/gpvae/data.py +++ b/pypots/imputation/gpvae/data.py @@ -9,7 +9,7 @@ import torch from pygrinder import fill_and_get_mask_torch -from ...data.base import BaseDataset +from ...data.dataset import BaseDataset class DatasetForGPVAE(BaseDataset): @@ -17,7 +17,7 @@ class DatasetForGPVAE(BaseDataset): Parameters ---------- - data : dict or str, + data : The dataset for model input, should be a dictionary including keys as 'X' and 'y', or a path string locating a data file. If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], @@ -26,7 +26,7 @@ class DatasetForGPVAE(BaseDataset): If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include keys as 'X' and 'y'. - return_labels : bool, default = True, + return_y : Whether to return labels in function __getitem__() if they exist in the given data. If `True`, for example, during training of classification models, the Dataset class will return labels in __getitem__() for model input. Otherwise, labels won't be included in the data returned by __getitem__(). This parameter exists because we @@ -35,7 +35,7 @@ class DatasetForGPVAE(BaseDataset): with function _fetch_data_from_file(), which works for all three stages. Therefore, we need this parameter for distinction. - file_type : str, default = "h5py" + file_type : The type of the given file if train_set and val_set are path strings. """ @@ -43,22 +43,28 @@ def __init__( self, data: Union[dict, str], return_X_ori: bool, - return_labels: bool, - file_type: str = "h5py", + return_y: bool, + file_type: str = "hdf5", ): - super().__init__(data, return_X_ori, return_labels, file_type) + super().__init__( + data=data, + return_X_ori=return_X_ori, + return_X_pred=False, + return_y=return_y, + file_type=file_type, + ) def _fetch_data_from_array(self, idx: int) -> Iterable: """Fetch data from self.X if it is given. Parameters ---------- - idx : int, + idx : The index of the sample to be return. Returns ------- - sample : list, + sample : A list contains index : int tensor, @@ -78,7 +84,7 @@ def _fetch_data_from_array(self, idx: int) -> Iterable: """ X = self.X[idx] - if self.X_ori is not None and self.return_X_ori: + if self.return_X_ori: X = self.X[idx] missing_mask = self.missing_mask[idx] X_ori = self.X_ori[idx] @@ -88,7 +94,7 @@ def _fetch_data_from_array(self, idx: int) -> Iterable: X, missing_mask = fill_and_get_mask_torch(X) sample = [torch.tensor(idx), X, missing_mask] - if self.y is not None and self.return_labels: + if self.return_y: sample.append(self.y[idx].to(torch.long)) return sample @@ -99,19 +105,19 @@ def _fetch_data_from_file(self, idx: int) -> Iterable: Parameters ---------- - idx : int, + idx : The index of the sample to be return. Returns ------- - sample : list, + sample : The collated data sample, a list including all necessary sample info. 
""" if self.file_handle is None: self.file_handle = self._open_file_handle() - if "X_ori" in self.file_handle.keys() and self.return_X_ori: + if self.return_X_ori: X = torch.from_numpy(self.file_handle["X"][idx]).to(torch.float32) X_ori = torch.from_numpy(self.file_handle["X_ori"][idx]).to(torch.float32) X_ori, X_ori_missing_mask = fill_and_get_mask_torch(X_ori) @@ -124,7 +130,7 @@ def _fetch_data_from_file(self, idx: int) -> Iterable: sample = [torch.tensor(idx), X, missing_mask] # if the dataset has labels and is for training, then fetch it from the file - if "y" in self.file_handle.keys() and self.return_labels: + if self.return_y: sample.append(torch.tensor(self.file_handle["y"][idx], dtype=torch.long)) return sample diff --git a/pypots/imputation/gpvae/model.py b/pypots/imputation/gpvae/model.py index 272010e3..1ff234c9 100644 --- a/pypots/imputation/gpvae/model.py +++ b/pypots/imputation/gpvae/model.py @@ -27,7 +27,7 @@ from .core import _GPVAE from .data import DatasetForGPVAE from ..base import BaseNNImputer -from ...data.checking import check_X_ori_in_val_set +from ...data.checking import key_in_data_set from ...optim.adam import Adam from ...optim.base import Optimizer from ...utils.logging import logger @@ -361,11 +361,11 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: # Step 1: wrap the input data with classes Dataset and DataLoader training_set = DatasetForGPVAE( - train_set, return_X_ori=False, return_labels=False, file_type=file_type + train_set, return_X_ori=False, return_y=False, file_type=file_type ) training_loader = DataLoader( training_set, @@ -375,10 +375,10 @@ def fit( ) val_loader = None if val_set is not None: - if not check_X_ori_in_val_set(val_set): + if not key_in_data_set("X_ori", val_set): raise ValueError("val_set must contain 'X_ori' for model validation.") val_set = DatasetForGPVAE( - val_set, return_X_ori=True, return_labels=False, file_type=file_type + val_set, return_X_ori=True, return_y=False, file_type=file_type ) val_loader = DataLoader( val_set, @@ -398,7 +398,7 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", n_sampling_times: int = 1, ) -> dict: """ @@ -414,7 +414,7 @@ def predict( If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include keys as 'X' and 'y'. - file_type : str + file_type : The type of the given file if test_set is a path string. n_sampling_times: @@ -431,7 +431,7 @@ def predict( self.model.eval() # set the model as eval status to freeze it. test_set = DatasetForGPVAE( - test_set, return_X_ori=False, return_labels=False, file_type=file_type + test_set, return_X_ori=False, return_y=False, file_type=file_type ) test_loader = DataLoader( test_set, @@ -459,7 +459,7 @@ def predict( def impute( self, X: Union[dict, str], - file_type="h5py", + file_type: str = "hdf5", n_sampling_times: int = 1, ) -> np.ndarray: """Impute missing values in the given data with the trained model. 
diff --git a/pypots/imputation/informer/data.py b/pypots/imputation/informer/data.py index bf6a146d..7624c41d 100644 --- a/pypots/imputation/informer/data.py +++ b/pypots/imputation/informer/data.py @@ -17,8 +17,8 @@ def __init__( self, data: Union[dict, str], return_X_ori: bool, - return_labels: bool, - file_type: str = "h5py", + return_y: bool, + file_type: str = "hdf5", rate: float = 0.2, ): - super().__init__(data, return_X_ori, return_labels, file_type, rate) + super().__init__(data, return_X_ori, return_y, file_type, rate) diff --git a/pypots/imputation/informer/model.py b/pypots/imputation/informer/model.py index 007920db..9429485e 100644 --- a/pypots/imputation/informer/model.py +++ b/pypots/imputation/informer/model.py @@ -24,8 +24,8 @@ from .core import _Informer from .data import DatasetForInformer from ..base import BaseNNImputer -from ...data.base import BaseDataset -from ...data.checking import check_X_ori_in_val_set +from ...data.checking import key_in_data_set +from ...data.dataset import BaseDataset from ...optim.adam import Adam from ...optim.base import Optimizer from ...utils.logging import logger @@ -206,11 +206,11 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: # Step 1: wrap the input data with classes Dataset and DataLoader training_set = DatasetForInformer( - train_set, return_X_ori=False, return_labels=False, file_type=file_type + train_set, return_X_ori=False, return_y=False, file_type=file_type ) training_loader = DataLoader( training_set, @@ -220,10 +220,10 @@ ) val_loader = None if val_set is not None: - if not check_X_ori_in_val_set(val_set): + if not key_in_data_set("X_ori", val_set): raise ValueError("val_set must contain 'X_ori' for model validation.") val_set = DatasetForInformer( - val_set, return_X_ori=True, return_labels=False, file_type=file_type + val_set, return_X_ori=True, return_y=False, file_type=file_type ) val_loader = DataLoader( val_set, @@ -243,7 +243,7 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: """Make predictions for the input data with the trained model. @@ -258,19 +258,23 @@ If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include keys as 'X' and 'y'. - file_type : str + file_type : The type of the given file if test_set is a path string. Returns ------- - result_dict : dict, + result_dict : The dictionary containing the clustering results and latent variables if necessary. """ # Step 1: wrap the input data with classes Dataset and DataLoader self.model.eval() # set the model as eval status to freeze it. test_set = BaseDataset( - test_set, return_X_ori=False, return_labels=False, file_type=file_type + test_set, + return_X_ori=False, + return_X_pred=False, + return_y=False, + file_type=file_type, ) test_loader = DataLoader( test_set, @@ -297,7 +301,7 @@ def predict( def impute( self, X: Union[dict, str], - file_type="h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Impute missing values in the given data with the trained model.
diff --git a/pypots/imputation/locf/model.py b/pypots/imputation/locf/model.py index e7c47366..b88e9e7a 100644 --- a/pypots/imputation/locf/model.py +++ b/pypots/imputation/locf/model.py @@ -55,7 +55,7 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: """Train the imputer on the given data. @@ -73,7 +73,7 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: """Make predictions for the input data with the trained model. @@ -88,7 +88,7 @@ def predict( If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include keys as 'X' and 'y'. - file_type : str + file_type : The type of the given file if test_set is a path string. Returns @@ -128,7 +128,7 @@ def predict( def impute( self, X: Union[dict, str], - file_type="h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Impute missing values in the given data with the trained model. diff --git a/pypots/imputation/mean/model.py b/pypots/imputation/mean/model.py index 2594df88..33582f8d 100644 --- a/pypots/imputation/mean/model.py +++ b/pypots/imputation/mean/model.py @@ -29,7 +29,7 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: """Train the imputer on the given data. @@ -47,7 +47,7 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: """Make predictions for the input data with the trained model. @@ -62,7 +62,7 @@ def predict( If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include keys as 'X' and 'y'. - file_type : str + file_type : The type of the given file if test_set is a path string. Returns @@ -114,7 +114,7 @@ def predict( def impute( self, X: Union[dict, str], - file_type="h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Impute missing values in the given data with the trained model. diff --git a/pypots/imputation/median/model.py b/pypots/imputation/median/model.py index 6d5db169..6295aa5f 100644 --- a/pypots/imputation/median/model.py +++ b/pypots/imputation/median/model.py @@ -29,7 +29,7 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: """Train the imputer on the given data. @@ -47,7 +47,7 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: """Make predictions for the input data with the trained model. @@ -62,7 +62,7 @@ def predict( If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include keys as 'X' and 'y'. - file_type : str + file_type : The type of the given file if test_set is a path string. Returns @@ -115,7 +115,7 @@ def predict( def impute( self, X: Union[dict, str], - file_type="h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Impute missing values in the given data with the trained model. 
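All of the hunks in this change replace the default file_type value "h5py" (the name of the reader library) with "hdf5" (the name of the file format). A data set passed as a path string is simply an HDF5 file whose top-level keys mirror the dict interface; a minimal sketch of preparing one (array shapes and the file name are illustrative):

import h5py
import numpy as np

X = np.random.randn(100, 48, 37)              # [n_samples, n_steps, n_features]; NaN marks missing values
X[np.random.rand(*X.shape) < 0.1] = np.nan

with h5py.File("dataset.h5", "w") as hf:
    hf.create_dataset("X", data=X)
# "dataset.h5" can then be passed to fit()/predict() as a path string with file_type="hdf5".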
diff --git a/pypots/imputation/mrnn/data.py b/pypots/imputation/mrnn/data.py index b8fcdbf4..cb228f53 100644 --- a/pypots/imputation/mrnn/data.py +++ b/pypots/imputation/mrnn/data.py @@ -10,7 +10,7 @@ import torch from pygrinder import fill_and_get_mask_torch -from ...data.base import BaseDataset +from ...data.dataset import BaseDataset from ...data.utils import _parse_delta_torch @@ -19,7 +19,7 @@ class DatasetForMRNN(BaseDataset): Parameters ---------- - data : dict or str, + data : The dataset for model input, should be a dictionary including keys as 'X' and 'y', or a path string locating a data file. If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], @@ -28,7 +28,7 @@ class DatasetForMRNN(BaseDataset): If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include keys as 'X' and 'y'. - return_labels : bool, default = True, + return_y : Whether to return labels in function __getitem__() if they exist in the given data. If `True`, for example, during training of classification models, the Dataset class will return labels in __getitem__() for model input. Otherwise, labels won't be included in the data returned by __getitem__(). This parameter exists because we @@ -37,7 +37,7 @@ class DatasetForMRNN(BaseDataset): with function _fetch_data_from_file(), which works for all three stages. Therefore, we need this parameter for distinction. - file_type : str, default = "h5py" + file_type : The type of the given file if train_set and val_set are path strings. """ @@ -45,18 +45,24 @@ def __init__( self, data: Union[dict, str], return_X_ori: bool, - return_labels: bool, - file_type: str = "h5py", + return_y: bool, + file_type: str = "hdf5", ): - super().__init__(data, return_X_ori, return_labels, file_type) + super().__init__( + data=data, + return_X_ori=return_X_ori, + return_X_pred=False, + return_y=return_y, + file_type=file_type, + ) if not isinstance(self.data, str): # calculate all delta here. - if self.X_ori is None: - forward_X, forward_missing_mask = fill_and_get_mask_torch(self.X) - else: + if self.return_X_ori: forward_missing_mask = self.missing_mask forward_X = self.X + else: + forward_X, forward_missing_mask = fill_and_get_mask_torch(self.X) forward_delta = _parse_delta_torch(forward_missing_mask) backward_X = torch.flip(forward_X, dims=[1]) @@ -81,12 +87,12 @@ def _fetch_data_from_array(self, idx: int) -> Iterable: Parameters ---------- - idx : int, + idx : The index of the sample to be return. Returns ------- - sample : list, + sample : A list contains index : int tensor, @@ -116,10 +122,10 @@ def _fetch_data_from_array(self, idx: int) -> Iterable: self.processed_data["backward"]["delta"][idx], ] - if self.X_ori is not None and self.return_X_ori: + if self.return_X_ori: sample.extend([self.X_ori[idx], self.indicating_mask[idx]]) - if self.y is not None and self.return_labels: + if self.return_y: sample.append(self.y[idx].to(torch.long)) return sample @@ -130,12 +136,12 @@ def _fetch_data_from_file(self, idx: int) -> Iterable: Parameters ---------- - idx : int, + idx : The index of the sample to be return. Returns ------- - sample : list, + sample : The collated data sample, a list including all necessary sample info. 
""" @@ -169,14 +175,14 @@ def _fetch_data_from_file(self, idx: int) -> Iterable: backward["deltas"], ] - if "X_ori" in self.file_handle.keys() and self.return_X_ori: + if self.return_X_ori: X_ori = torch.from_numpy(self.file_handle["X_ori"][idx]).to(torch.float32) X_ori, X_ori_missing_mask = fill_and_get_mask_torch(X_ori) indicating_mask = X_ori_missing_mask - missing_mask sample.extend([X_ori, indicating_mask]) # if the dataset has labels and is for training, then fetch it from the file - if "y" in self.file_handle.keys() and self.return_labels: + if self.return_y: sample.append(torch.tensor(self.file_handle["y"][idx], dtype=torch.long)) return sample diff --git a/pypots/imputation/mrnn/model.py b/pypots/imputation/mrnn/model.py index 7fb88e00..378cd5c3 100644 --- a/pypots/imputation/mrnn/model.py +++ b/pypots/imputation/mrnn/model.py @@ -23,7 +23,7 @@ from .core import _MRNN from .data import DatasetForMRNN from ..base import BaseNNImputer -from ...data.checking import check_X_ori_in_val_set +from ...data.checking import key_in_data_set from ...optim.adam import Adam from ...optim.base import Optimizer from ...utils.logging import logger @@ -194,11 +194,11 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: # Step 1: wrap the input data with classes Dataset and DataLoader training_set = DatasetForMRNN( - train_set, return_X_ori=False, return_labels=False, file_type=file_type + train_set, return_X_ori=False, return_y=False, file_type=file_type ) training_loader = DataLoader( training_set, @@ -208,10 +208,10 @@ def fit( ) val_loader = None if val_set is not None: - if not check_X_ori_in_val_set(val_set): + if not key_in_data_set("X_ori", val_set): raise ValueError("val_set must contain 'X_ori' for model validation.") val_set = DatasetForMRNN( - val_set, return_X_ori=True, return_labels=False, file_type=file_type + val_set, return_X_ori=True, return_y=False, file_type=file_type ) val_loader = DataLoader( val_set, @@ -231,11 +231,11 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type="h5py", + file_type: str = "hdf5", ) -> dict: self.model.eval() # set the model as eval status to freeze it. test_set = DatasetForMRNN( - test_set, return_X_ori=False, return_labels=False, file_type=file_type + test_set, return_X_ori=False, return_y=False, file_type=file_type ) test_loader = DataLoader( test_set, @@ -261,7 +261,7 @@ def predict( def impute( self, X: Union[dict, str], - file_type="h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Impute missing values in the given data with the trained model. 
diff --git a/pypots/imputation/patchtst/data.py b/pypots/imputation/patchtst/data.py index c8c0ea1f..4ccb1e72 100644 --- a/pypots/imputation/patchtst/data.py +++ b/pypots/imputation/patchtst/data.py @@ -17,8 +17,8 @@ def __init__( self, data: Union[dict, str], return_X_ori: bool, - return_labels: bool, - file_type: str = "h5py", + return_y: bool, + file_type: str = "hdf5", rate: float = 0.2, ): - super().__init__(data, return_X_ori, return_labels, file_type, rate) + super().__init__(data, return_X_ori, return_y, file_type, rate) diff --git a/pypots/imputation/patchtst/model.py b/pypots/imputation/patchtst/model.py index b2aceb4b..b4c72c4d 100644 --- a/pypots/imputation/patchtst/model.py +++ b/pypots/imputation/patchtst/model.py @@ -21,11 +21,11 @@ import torch from torch.utils.data import DataLoader +from .core import _PatchTST from .data import DatasetForPatchTST -from pypots.imputation.patchtst.core import _PatchTST from ..base import BaseNNImputer -from ...data.base import BaseDataset -from ...data.checking import check_X_ori_in_val_set +from ...data.checking import key_in_data_set +from ...data.dataset import BaseDataset from ...optim.adam import Adam from ...optim.base import Optimizer from ...utils.logging import logger @@ -239,11 +239,11 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: # Step 1: wrap the input data with classes Dataset and DataLoader training_set = DatasetForPatchTST( - train_set, return_X_ori=False, return_labels=False, file_type=file_type + train_set, return_X_ori=False, return_y=False, file_type=file_type ) training_loader = DataLoader( training_set, @@ -253,10 +253,10 @@ ) val_loader = None if val_set is not None: - if not check_X_ori_in_val_set(val_set): + if not key_in_data_set("X_ori", val_set): raise ValueError("val_set must contain 'X_ori' for model validation.") val_set = DatasetForPatchTST( - val_set, return_X_ori=True, return_labels=False, file_type=file_type + val_set, return_X_ori=True, return_y=False, file_type=file_type ) val_loader = DataLoader( val_set, @@ -276,7 +276,7 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: """Make predictions for the input data with the trained model. @@ -291,19 +291,23 @@ If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include keys as 'X' and 'y'. - file_type : str + file_type : The type of the given file if test_set is a path string. Returns ------- - result_dict : dict, + result_dict : The dictionary containing the clustering results and latent variables if necessary. """ # Step 1: wrap the input data with classes Dataset and DataLoader self.model.eval() # set the model as eval status to freeze it. test_set = BaseDataset( - test_set, return_X_ori=False, return_labels=False, file_type=file_type + test_set, + return_X_ori=False, + return_X_pred=False, + return_y=False, + file_type=file_type, ) test_loader = DataLoader( test_set, @@ -330,7 +334,7 @@ def predict( def impute( self, X: Union[dict, str], - file_type="h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Impute missing values in the given data with the trained model.
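The SAITS dataset in the hunks that follow (like the other *Dataset classes above that take a rate argument, default 0.2) builds its training pairs by randomly masking a fraction of the observed values MCAR-style with pygrinder.mcar. Written out with plain torch so the sampling step is explicit (the real call is pygrinder.mcar; its exact signature may differ):

import torch

def mask_mcar(X: torch.Tensor, rate: float = 0.2) -> torch.Tensor:
    # Return a copy of X in which `rate` of the currently observed values are additionally set to NaN.
    X_corrupted = X.clone()
    observed = ~torch.isnan(X_corrupted)
    drop = observed & (torch.rand_like(X_corrupted) < rate)
    X_corrupted[drop] = float("nan")
    return X_corrupted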
diff --git a/pypots/imputation/saits/core.py b/pypots/imputation/saits/core.py index 01465448..89c5e731 100644 --- a/pypots/imputation/saits/core.py +++ b/pypots/imputation/saits/core.py @@ -18,8 +18,8 @@ import torch import torch.nn as nn -from pypots.utils.metrics import calc_mae from ...nn.modules.saits import BackboneSAITS +from ...utils.metrics import calc_mae class _SAITS(nn.Module): diff --git a/pypots/imputation/saits/data.py b/pypots/imputation/saits/data.py index aeae871a..0c25b0f8 100644 --- a/pypots/imputation/saits/data.py +++ b/pypots/imputation/saits/data.py @@ -10,7 +10,7 @@ import torch from pygrinder import mcar, fill_and_get_mask_torch -from ...data.base import BaseDataset +from ...data.dataset import BaseDataset class DatasetForSAITS(BaseDataset): @@ -20,7 +20,7 @@ class DatasetForSAITS(BaseDataset): Parameters ---------- - data : dict or str, + data : The dataset for model input, should be a dictionary including keys as 'X' and 'y', or a path string locating a data file. If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], @@ -29,7 +29,7 @@ class DatasetForSAITS(BaseDataset): If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include keys as 'X' and 'y'. - return_labels : bool, default = True, + return_y : Whether to return labels in function __getitem__() if they exist in the given data. If `True`, for example, during training of classification models, the Dataset class will return labels in __getitem__() for model input. Otherwise, labels won't be included in the data returned by __getitem__(). This parameter exists because we @@ -38,7 +38,7 @@ class DatasetForSAITS(BaseDataset): with function _fetch_data_from_file(), which works for all three stages. Therefore, we need this parameter for distinction. - file_type : str, default = "h5py" + file_type : The type of the given file if train_set and val_set are path strings. rate : float, in (0,1), @@ -54,11 +54,17 @@ def __init__( self, data: Union[dict, str], return_X_ori: bool, - return_labels: bool, - file_type: str = "h5py", + return_y: bool, + file_type: str = "hdf5", rate: float = 0.2, ): - super().__init__(data, return_X_ori, return_labels, file_type) + super().__init__( + data=data, + return_X_ori=return_X_ori, + return_X_pred=False, + return_y=return_y, + file_type=file_type, + ) self.rate = rate def _fetch_data_from_array(self, idx: int) -> Iterable: @@ -66,31 +72,31 @@ def _fetch_data_from_array(self, idx: int) -> Iterable: Parameters ---------- - idx : int, + idx : The index to fetch the specified sample. Returns ------- - sample : list, + sample : A list contains - index : int tensor, + index : The index of the sample. - X_ori : tensor, + X_ori : Original time-series for calculating mask imputation loss. - X : tensor, + X : Time-series data with artificially missing values for model input. - missing_mask : tensor, + missing_mask : The mask records all missing values in X. - indicating_mask : tensor. + indicating_mask : The mask indicates artificially missing values in X. 
""" - if self.X_ori is not None and self.return_X_ori: + if self.return_X_ori: X = self.X[idx] X_ori = self.X_ori[idx] missing_mask = self.missing_mask[idx] @@ -110,7 +116,7 @@ def _fetch_data_from_array(self, idx: int) -> Iterable: indicating_mask, ] - if self.y is not None and self.return_labels: + if self.return_y: sample.append(self.y[idx].to(torch.long)) return sample @@ -121,19 +127,19 @@ def _fetch_data_from_file(self, idx: int) -> Iterable: Parameters ---------- - idx : int, + idx : The index of the sample to be return. Returns ------- - sample : list, + sample : The collated data sample, a list including all necessary sample info. """ if self.file_handle is None: self.file_handle = self._open_file_handle() - if "X_ori" in self.file_handle.keys() and self.return_X_ori: + if self.return_X_ori: X = torch.from_numpy(self.file_handle["X"][idx]).to(torch.float32) X_ori = torch.from_numpy(self.file_handle["X_ori"][idx]).to(torch.float32) X_ori, X_ori_missing_mask = fill_and_get_mask_torch(X_ori) @@ -149,7 +155,7 @@ def _fetch_data_from_file(self, idx: int) -> Iterable: sample = [torch.tensor(idx), X, missing_mask, X_ori, indicating_mask] # if the dataset has labels and is for training, then fetch it from the file - if "y" in self.file_handle.keys() and self.return_labels: + if self.return_y: sample.append(torch.tensor(self.file_handle["y"][idx], dtype=torch.long)) return sample diff --git a/pypots/imputation/saits/model.py b/pypots/imputation/saits/model.py index 4291c81a..a45927f6 100644 --- a/pypots/imputation/saits/model.py +++ b/pypots/imputation/saits/model.py @@ -20,11 +20,11 @@ import torch from torch.utils.data import DataLoader -from .data import DatasetForSAITS from .core import _SAITS +from .data import DatasetForSAITS from ..base import BaseNNImputer -from ...data.base import BaseDataset -from ...data.checking import check_X_ori_in_val_set +from ...data.checking import key_in_data_set +from ...data.dataset import BaseDataset from ...optim.adam import Adam from ...optim.base import Optimizer from ...utils.logging import logger @@ -246,11 +246,11 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: # Step 1: wrap the input data with classes Dataset and DataLoader training_set = DatasetForSAITS( - train_set, return_X_ori=False, return_labels=False, file_type=file_type + train_set, return_X_ori=False, return_y=False, file_type=file_type ) training_loader = DataLoader( training_set, @@ -260,10 +260,10 @@ def fit( ) val_loader = None if val_set is not None: - if not check_X_ori_in_val_set(val_set): + if not key_in_data_set("X_ori", val_set): raise ValueError("val_set must contain 'X_ori' for model validation.") val_set = DatasetForSAITS( - val_set, return_X_ori=True, return_labels=False, file_type=file_type + val_set, return_X_ori=True, return_y=False, file_type=file_type ) val_loader = DataLoader( val_set, @@ -283,7 +283,7 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", diagonal_attention_mask: bool = True, return_latent_vars: bool = False, ) -> dict: @@ -291,7 +291,7 @@ def predict( Parameters ---------- - test_set : dict or str + test_set : The dataset for model validating, should be a dictionary including keys as 'X', or a path string locating a data file supported by PyPOTS (e.g. h5 file). 
If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], @@ -300,26 +300,30 @@ def predict( If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include keys as 'X' and 'y'. - file_type : str + file_type : The type of the given file if test_set is a path string. - diagonal_attention_mask : bool + diagonal_attention_mask : Whether to apply a diagonal attention mask to the self-attention mechanism in the testing stage. - return_latent_vars : bool + return_latent_vars : Whether to return the latent variables in SAITS, e.g. attention weights of two DMSA blocks and the weight matrix from the combination block, etc. Returns ------- - result_dict : dict, + result_dict : The dictionary containing the clustering results and latent variables if necessary. """ # Step 1: wrap the input data with classes Dataset and DataLoader self.model.eval() # set the model as eval status to freeze it. test_set = BaseDataset( - test_set, return_X_ori=False, return_labels=False, file_type=file_type + test_set, + return_X_ori=False, + return_X_pred=False, + return_y=False, + file_type=file_type, ) test_loader = DataLoader( test_set, @@ -375,7 +379,7 @@ def predict( def impute( self, X: Union[dict, str], - file_type="h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Impute missing values in the given data with the trained model. diff --git a/pypots/imputation/template/core.py b/pypots/imputation/template/core.py new file mode 100644 index 00000000..4bfd259d --- /dev/null +++ b/pypots/imputation/template/core.py @@ -0,0 +1,42 @@ +""" +The implementation of YourNewModel for the partially-observed time-series imputation task. + +Refer to the paper "Your paper citation". + +""" + +# Created by Your Name TODO: modify the author information. +# License: BSD-3-Clause + +import torch.nn as nn + +# from ...nn.modules import some_modules + + +# TODO: define your new model here. +# It could be a neural network model or a non-neural network algorithm (e.g. written in numpy). +# Your model should be implemented with PyTorch and subclass torch.nn.Module if it is a neural network. +# Note that your main algorithm is defined in this class, and this class usually won't be exposed to users. +class _YourNewModel(nn.Module): + def __init__(self): + super().__init__() + + # TODO: define your model's components here. If modules in pypots.nn.modules can be reused in your model, + # you can import them and use them here. AND if you think the modules you implemented can be reused by + # other models, you can also consider contributing them to pypots.nn.modules + self.embedding = nn.Module + self.submodule = nn.Module + self.backbone = nn.Module + + def forward(self, inputs: dict) -> dict: + # TODO: define your model's forward propagation process here. + # The input is a dict, and the output `results` should also be a dict. + output = self.backbone() # replace this with your model's process + + # TODO: `results` must contain the key `loss`, which will be used for + # backward propagation to update the model. + loss = None + results = { + "loss": loss, + } + return results diff --git a/pypots/imputation/template/data.py b/pypots/imputation/template/data.py index c391740e..3c4ca97e 100644 --- a/pypots/imputation/template/data.py +++ b/pypots/imputation/template/data.py @@ -1,7 +1,7 @@ """ Dataset class for YourNewModel. -TODO: modify the above description with your model's information.
+TODO: modify the above description for your model's dataset class. """ @@ -10,17 +10,26 @@ from typing import Union, Iterable -from ...data.base import BaseDataset +from ...data.dataset import BaseDataset +# TODO: define your new dataset class here. Remove or add arguments as needed. class DatasetForYourNewModel(BaseDataset): def __init__( self, data: Union[dict, str], - return_labels: bool = True, - file_type: str = "h5py", + return_X_ori: bool, + return_X_pred: bool, + return_y: bool, + file_type: str = "hdf5", ): - super().__init__(data, return_labels, file_type) + super().__init__( + data=data, + return_X_ori=return_X_ori, + return_X_pred=return_X_pred, + return_y=return_y, + file_type=file_type, + ) def _fetch_data_from_array(self, idx: int) -> Iterable: raise NotImplementedError diff --git a/pypots/imputation/template/model.py b/pypots/imputation/template/model.py index 496782bf..170e140b 100644 --- a/pypots/imputation/template/model.py +++ b/pypots/imputation/template/model.py @@ -3,6 +3,8 @@ Refer to the paper "Your paper citation". +TODO: modify the above description with your model's information. + """ # Created by Your Name TODO: modify the author information. @@ -10,40 +12,19 @@ from typing import Union, Optional -import numpy as np import torch -import torch.nn as nn + +from .core import _YourNewModel # TODO: import the base class from the imputation package in PyPOTS. # Here I suppose this is a neural-network imputation model. # You should make your model inherent BaseImputer if it is not a NN. # from ..base import BaseImputer from ..base import BaseNNImputer - from ...optim.adam import Adam from ...optim.base import Optimizer -# TODO: define your new model here. -# It could be a neural network model or a non-neural network algorithm (e.g. written in numpy). -# Your model should be implemented with PyTorch and subclass torch.nn.Module if it is a neural network. -# Note that your main algorithm is defined in this class, and this class usually won't be exposed to users. -class _YourNewModel(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, inputs: dict) -> dict: - # TODO: define your model's forward propagation process here. - # The input is a dict, and the output `results` should also be a dict. - # `results` must contains the key `loss` which is will be used for backward propagation to update the model. - - loss = None - results = { - "loss": loss, - } - return results - - # TODO: define your new model's wrapper here. # It should be a subclass of a base class defined in PyPOTS task packages (e.g. # BaseNNImputer of PyPOTS imputation task package), and it has to implement all abstract methods of the base class. 
@@ -52,13 +33,13 @@ class YourNewModel(BaseNNImputer): def __init__( self, # TODO: add your model's hyper-parameters here - batch_size: int, - epochs: int, - patience: int, - num_workers: int = 0, + batch_size: int = 32, + epochs: int = 100, + patience: Optional[int] = None, optimizer: Optional[Optimizer] = Adam(), + num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, - saving_path: str = None, + saving_path: Optional[str] = None, model_saving_strategy: Optional[str] = "best", ): super().__init__( @@ -74,9 +55,11 @@ def __init__( # TODO: set up your model's hyper-parameters here # set up the model - self.model = _YourNewModel() - self.model = self.model.to(self.device) + self.model = _YourNewModel( + # pass the arguments to your model + ) self._print_model_size() + self._send_model_to_given_device() # set up the optimizer self.optimizer = optimizer @@ -95,13 +78,13 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: raise NotImplementedError def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: raise NotImplementedError diff --git a/pypots/imputation/template/module.py b/pypots/imputation/template/module.py deleted file mode 100644 index fa20e4cd..00000000 --- a/pypots/imputation/template/module.py +++ /dev/null @@ -1,13 +0,0 @@ -""" -The implementation of the modules for YourNewModel. - -Refer to the paper "Your paper citation". - -""" - -# Created by Your Name TODO: modify the author information. -# License: BSD-3-Clause - - -# TODO: this file is not necessary. If your new model has customized layers or modules, please put them here. -# Otherwise, please delete this modules.py file, don't commit it to the repository. 
diff --git a/pypots/imputation/timesnet/data.py b/pypots/imputation/timesnet/data.py index d30f8a53..f65632c9 100644 --- a/pypots/imputation/timesnet/data.py +++ b/pypots/imputation/timesnet/data.py @@ -17,8 +17,8 @@ def __init__( self, data: Union[dict, str], return_X_ori: bool, - return_labels: bool, - file_type: str = "h5py", + return_y: bool, + file_type: str = "hdf5", rate: float = 0.2, ): - super().__init__(data, return_X_ori, return_labels, file_type, rate) + super().__init__(data, return_X_ori, return_y, file_type, rate) diff --git a/pypots/imputation/timesnet/model.py b/pypots/imputation/timesnet/model.py index 699b55b4..419470b2 100644 --- a/pypots/imputation/timesnet/model.py +++ b/pypots/imputation/timesnet/model.py @@ -23,8 +23,8 @@ from .core import _TimesNet from .data import DatasetForTimesNet from ..base import BaseNNImputer -from ...data.base import BaseDataset -from ...data.checking import check_X_ori_in_val_set +from ...data.dataset import BaseDataset +from ...data.checking import key_in_data_set from ...optim.adam import Adam from ...optim.base import Optimizer from ...utils.logging import logger @@ -201,11 +201,11 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: # Step 1: wrap the input data with classes Dataset and DataLoader training_set = DatasetForTimesNet( - train_set, return_X_ori=False, return_labels=False, file_type=file_type + train_set, return_X_ori=False, return_y=False, file_type=file_type ) training_loader = DataLoader( training_set, @@ -215,10 +215,10 @@ def fit( ) val_loader = None if val_set is not None: - if not check_X_ori_in_val_set(val_set): + if not key_in_data_set("X_ori", val_set): raise ValueError("val_set must contain 'X_ori' for model validation.") val_set = DatasetForTimesNet( - val_set, return_X_ori=True, return_labels=False, file_type=file_type + val_set, return_X_ori=True, return_y=False, file_type=file_type ) val_loader = DataLoader( val_set, @@ -238,7 +238,7 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: """Make predictions for the input data with the trained model. @@ -253,19 +253,23 @@ def predict( If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include keys as 'X' and 'y'. - file_type : str + file_type : The type of the given file if test_set is a path string. Returns ------- - result_dict : dict, + result_dict : The dictionary containing the clustering results and latent variables if necessary. """ # Step 1: wrap the input data with classes Dataset and DataLoader self.model.eval() # set the model as eval status to freeze it. test_set = BaseDataset( - test_set, return_X_ori=False, return_labels=False, file_type=file_type + test_set, + return_X_ori=False, + return_X_pred=False, + return_y=False, + file_type=file_type, ) test_loader = DataLoader( test_set, @@ -292,7 +296,7 @@ def predict( def impute( self, X: Union[dict, str], - file_type="h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Impute missing values in the given data with the trained model.
diff --git a/pypots/imputation/transformer/data.py b/pypots/imputation/transformer/data.py index 6974991d..d8751050 100644 --- a/pypots/imputation/transformer/data.py +++ b/pypots/imputation/transformer/data.py @@ -15,8 +15,8 @@ def __init__( self, data: Union[dict, str], return_X_ori: bool, - return_labels: bool, - file_type: str = "h5py", + return_y: bool, + file_type: str = "hdf5", rate: float = 0.2, ): - super().__init__(data, return_X_ori, return_labels, file_type, rate) + super().__init__(data, return_X_ori, return_y, file_type, rate) diff --git a/pypots/imputation/transformer/model.py b/pypots/imputation/transformer/model.py index e465db30..46ee13ab 100644 --- a/pypots/imputation/transformer/model.py +++ b/pypots/imputation/transformer/model.py @@ -23,8 +23,8 @@ from .core import _Transformer from .data import DatasetForTransformer from ..base import BaseNNImputer -from ...data.base import BaseDataset -from ...data.checking import check_X_ori_in_val_set +from ...data.dataset import BaseDataset +from ...data.checking import key_in_data_set from ...optim.adam import Adam from ...optim.base import Optimizer from ...utils.logging import logger @@ -234,11 +234,11 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: # Step 1: wrap the input data with classes Dataset and DataLoader training_set = DatasetForTransformer( - train_set, return_X_ori=False, return_labels=False, file_type=file_type + train_set, return_X_ori=False, return_y=False, file_type=file_type ) training_loader = DataLoader( training_set, @@ -248,10 +248,10 @@ def fit( ) val_loader = None if val_set is not None: - if not check_X_ori_in_val_set(val_set): + if not key_in_data_set("X_ori", val_set): raise ValueError("val_set must contain 'X_ori' for model validation.") val_set = DatasetForTransformer( - val_set, return_X_ori=True, return_labels=False, file_type=file_type + val_set, return_X_ori=True, return_y=False, file_type=file_type ) val_loader = DataLoader( val_set, @@ -271,11 +271,15 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type: str = "h5py", + file_type: str = "hdf5", ) -> dict: self.model.eval() # set the model as eval status to freeze it. test_set = BaseDataset( - test_set, return_X_ori=False, return_labels=False, file_type=file_type + test_set, + return_X_ori=False, + return_X_pred=False, + return_y=False, + file_type=file_type, ) test_loader = DataLoader( test_set, @@ -301,7 +305,7 @@ def predict( def impute( self, X: Union[dict, str], - file_type="h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Impute missing values in the given data with the trained model. diff --git a/pypots/imputation/usgan/data.py b/pypots/imputation/usgan/data.py index 58e035c3..40e6ee77 100644 --- a/pypots/imputation/usgan/data.py +++ b/pypots/imputation/usgan/data.py @@ -15,7 +15,7 @@ class DatasetForUSGAN(DatasetForBRITS): Parameters ---------- - data : dict or str, + data : The dataset for model input, should be a dictionary including keys as 'X' and 'y', or a path string locating a data file. If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], @@ -24,7 +24,7 @@ class DatasetForUSGAN(DatasetForBRITS): If it is a path string, the path should point to a data file, e.g. a h5 file, which contains key-value pairs like a dict, and it has to include keys as 'X' and 'y'. 
- return_labels : bool, default = True, + return_y : Whether to return labels in function __getitem__() if they exist in the given data. If `True`, for example, during training of classification models, the Dataset class will return labels in __getitem__() for model input. Otherwise, labels won't be included in the data returned by __getitem__(). This parameter exists because we @@ -33,7 +33,7 @@ class DatasetForUSGAN(DatasetForBRITS): with function _fetch_data_from_file(), which works for all three stages. Therefore, we need this parameter for distinction. - file_type : str, default = "h5py" + file_type : The type of the given file if train_set and val_set are path strings. """ @@ -41,7 +41,12 @@ def __init__( self, data: Union[dict, str], return_X_ori: bool, - return_labels: bool, - file_type: str = "h5py", + return_y: bool, + file_type: str = "hdf5", ): - super().__init__(data, return_X_ori, return_labels, file_type) + super().__init__( + data=data, + return_X_ori=return_X_ori, + return_y=return_y, + file_type=file_type, + ) diff --git a/pypots/imputation/usgan/model.py b/pypots/imputation/usgan/model.py index 91aaff44..1f684e92 100644 --- a/pypots/imputation/usgan/model.py +++ b/pypots/imputation/usgan/model.py @@ -20,7 +20,7 @@ from .core import _USGAN from .data import DatasetForUSGAN from ..base import BaseNNImputer -from ...data.checking import check_X_ori_in_val_set +from ...data.checking import key_in_data_set from ...optim.adam import Adam from ...optim.base import Optimizer from ...utils.logging import logger @@ -375,11 +375,11 @@ def fit( self, train_set: Union[dict, str], val_set: Optional[Union[dict, str]] = None, - file_type: str = "h5py", + file_type: str = "hdf5", ) -> None: # Step 1: wrap the input data with classes Dataset and DataLoader training_set = DatasetForUSGAN( - train_set, return_X_ori=False, return_labels=False, file_type=file_type + train_set, return_X_ori=False, return_y=False, file_type=file_type ) training_loader = DataLoader( training_set, @@ -389,10 +389,10 @@ def fit( ) val_loader = None if val_set is not None: - if not check_X_ori_in_val_set(val_set): + if not key_in_data_set("X_ori", val_set): raise ValueError("val_set must contain 'X_ori' for model validation.") val_set = DatasetForUSGAN( - val_set, return_X_ori=True, return_labels=False, file_type=file_type + val_set, return_X_ori=True, return_y=False, file_type=file_type ) val_loader = DataLoader( val_set, @@ -412,11 +412,11 @@ def fit( def predict( self, test_set: Union[dict, str], - file_type="h5py", + file_type: str = "hdf5", ) -> dict: self.model.eval() # set the model as eval status to freeze it. test_set = DatasetForUSGAN( - test_set, return_X_ori=False, return_labels=False, file_type=file_type + test_set, return_X_ori=False, return_y=False, file_type=file_type ) test_loader = DataLoader( test_set, @@ -442,7 +442,7 @@ def predict( def impute( self, X: Union[dict, str], - file_type="h5py", + file_type: str = "hdf5", ) -> np.ndarray: """Impute missing values in the given data with the trained model. 
diff --git a/pypots/nn/modules/csdi/backbone.py b/pypots/nn/modules/csdi/backbone.py index 697e76a2..26051060 100644 --- a/pypots/nn/modules/csdi/backbone.py +++ b/pypots/nn/modules/csdi/backbone.py @@ -44,11 +44,6 @@ def __init__( d_side += 1 # for conditional mask d_input = 2 - self.embed_layer = nn.Embedding( - num_embeddings=d_target, - embedding_dim=d_feature_embedding, - ) - self.diff_model = CsdiDiffusionModel( n_diffusion_steps, d_diffusion_embedding, @@ -78,41 +73,6 @@ def __init__( "alpha_torch", torch.tensor(self.alpha).float().unsqueeze(1).unsqueeze(1) ) - @staticmethod - def time_embedding(pos, d_model=128): - pe = torch.zeros(pos.shape[0], pos.shape[1], d_model).to(pos.device) - position = pos.unsqueeze(2) - div_term = 1 / torch.pow( - 10000.0, torch.arange(0, d_model, 2, device=pos.device) / d_model - ) - pe[:, :, 0::2] = torch.sin(position * div_term) - pe[:, :, 1::2] = torch.cos(position * div_term) - return pe - - def get_side_info(self, observed_tp, cond_mask): - B, K, L = cond_mask.shape - device = observed_tp.device - time_embed = self.time_embedding( - observed_tp, self.d_time_embedding - ) # (B,L,emb) - time_embed = time_embed.to(device) - time_embed = time_embed.unsqueeze(2).expand(-1, -1, K, -1) - feature_embed = self.embed_layer( - torch.arange(self.d_target).to(device) - ) # (K,emb) - feature_embed = feature_embed.unsqueeze(0).unsqueeze(0).expand(B, L, -1, -1) - - side_info = torch.cat( - [time_embed, feature_embed], dim=-1 - ) # (B,L,K,emb+d_feature_embedding) - side_info = side_info.permute(0, 3, 2, 1) # (B,*,K,L) - - if not self.is_unconditional: - side_mask = cond_mask.unsqueeze(1) # (B,1,K,L) - side_info = torch.cat([side_info, side_mask], dim=1) - - return side_info - def set_input_to_diffmodel(self, noisy_data, observed_data, cond_mask): if self.is_unconditional: total_input = noisy_data.unsqueeze(1) # (B,1,K,L) diff --git a/pypots/utils/visual/data.py b/pypots/utils/visual/data.py index 338b1145..ca5a5f6e 100644 --- a/pypots/utils/visual/data.py +++ b/pypots/utils/visual/data.py @@ -29,26 +29,26 @@ def plot_data( Parameters ---------- - X : ndarray, + X : The observed values - X_ori : ndarray, + X_ori : The evaluated values - X_imputed : ndarray, + X_imputed : The imputed values - sample_idx : int, + sample_idx : The index of the sample to be plotted. If None, a randomly-selected sample will be plotted for visualization. - n_rows : int, + n_rows : The number of rows in the plot - n_cols : int, + n_cols : The number of columns in the plot - fig_size : list, + fig_size : The size of the figure """ @@ -95,9 +95,9 @@ def plot_data( def plot_missingness( - missing_mask, - min_step=0, - max_step=1, + missing_mask: np.ndarray, + min_step: int = 0, + max_step: int = 1, sample_idx: Optional[int] = None, ): """Plot the missingness pattern of one multivariate timeseries. For each feature, @@ -106,16 +106,16 @@ def plot_missingness( Parameters ---------- - missing_mask : ndarray, + missing_mask : The missing mask of multivariate time series. - min_step : int, + min_step : The minimum time step for visualization. - max_step : int, + max_step : The maximum time step for visualization. - sample_idx : int, + sample_idx : The index of the sample to be plotted, if None, a randomly-selected sample will be plotted for visualization.
""" mask_shape = missing_mask.shape diff --git a/tests/classification/brits.py b/tests/classification/brits.py index 0ec7b68d..7441e40e 100644 --- a/tests/classification/brits.py +++ b/tests/classification/brits.py @@ -21,9 +21,9 @@ TRAIN_SET, VAL_SET, TEST_SET, - H5_TRAIN_SET_PATH, - H5_VAL_SET_PATH, - H5_TEST_SET_PATH, + GENERAL_H5_TRAIN_SET_PATH, + GENERAL_H5_VAL_SET_PATH, + GENERAL_H5_TEST_SET_PATH, RESULT_SAVING_DIR_FOR_CLASSIFICATION, check_tb_and_model_checkpoints_existence, ) @@ -104,8 +104,8 @@ def test_3_saving_path(self): @pytest.mark.xdist_group(name="classification-brits") def test_4_lazy_loading(self): - self.brits.fit(H5_TRAIN_SET_PATH, H5_VAL_SET_PATH) - results = self.brits.predict(H5_TEST_SET_PATH) + self.brits.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) + results = self.brits.predict(GENERAL_H5_TEST_SET_PATH) metrics = calc_binary_classification_metrics( results["classification"], DATA["test_y"] ) diff --git a/tests/classification/grud.py b/tests/classification/grud.py index 5c165e07..61f7d496 100644 --- a/tests/classification/grud.py +++ b/tests/classification/grud.py @@ -21,9 +21,9 @@ TRAIN_SET, VAL_SET, TEST_SET, - H5_TRAIN_SET_PATH, - H5_VAL_SET_PATH, - H5_TEST_SET_PATH, + GENERAL_H5_TRAIN_SET_PATH, + GENERAL_H5_VAL_SET_PATH, + GENERAL_H5_TEST_SET_PATH, RESULT_SAVING_DIR_FOR_CLASSIFICATION, check_tb_and_model_checkpoints_existence, ) @@ -101,8 +101,8 @@ def test_3_saving_path(self): @pytest.mark.xdist_group(name="classification-grud") def test_4_lazy_loading(self): - self.grud.fit(H5_TRAIN_SET_PATH, H5_VAL_SET_PATH) - results = self.grud.predict(H5_TEST_SET_PATH) + self.grud.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) + results = self.grud.predict(GENERAL_H5_TEST_SET_PATH) metrics = calc_binary_classification_metrics( results["classification"], DATA["test_y"] ) diff --git a/tests/classification/raindrop.py b/tests/classification/raindrop.py index 64f6aa59..10363b78 100644 --- a/tests/classification/raindrop.py +++ b/tests/classification/raindrop.py @@ -20,9 +20,9 @@ TRAIN_SET, VAL_SET, TEST_SET, - H5_TRAIN_SET_PATH, - H5_VAL_SET_PATH, - H5_TEST_SET_PATH, + GENERAL_H5_TRAIN_SET_PATH, + GENERAL_H5_VAL_SET_PATH, + GENERAL_H5_TEST_SET_PATH, RESULT_SAVING_DIR_FOR_CLASSIFICATION, check_tb_and_model_checkpoints_existence, ) @@ -106,8 +106,8 @@ def test_3_saving_path(self): @pytest.mark.xdist_group(name="classification-raindrop") def test_4_lazy_loading(self): - self.raindrop.fit(H5_TRAIN_SET_PATH, H5_VAL_SET_PATH) - results = self.raindrop.predict(H5_TEST_SET_PATH) + self.raindrop.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) + results = self.raindrop.predict(GENERAL_H5_TEST_SET_PATH) metrics = calc_binary_classification_metrics( results["classification"], DATA["test_y"] ) diff --git a/tests/clustering/crli.py b/tests/clustering/crli.py index 7046f792..6b3266ff 100644 --- a/tests/clustering/crli.py +++ b/tests/clustering/crli.py @@ -25,9 +25,9 @@ TRAIN_SET, VAL_SET, TEST_SET, - H5_TRAIN_SET_PATH, - H5_VAL_SET_PATH, - H5_TEST_SET_PATH, + GENERAL_H5_TRAIN_SET_PATH, + GENERAL_H5_VAL_SET_PATH, + GENERAL_H5_TEST_SET_PATH, RESULT_SAVING_DIR_FOR_CLUSTERING, check_tb_and_model_checkpoints_existence, ) @@ -165,9 +165,9 @@ def test_3_saving_path(self): @pytest.mark.xdist_group(name="clustering-crli") def test_4_lazy_loading(self): - self.crli_lstm.fit(H5_TRAIN_SET_PATH, H5_VAL_SET_PATH) + self.crli_lstm.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) clustering_results = self.crli_lstm.predict( - H5_TEST_SET_PATH, return_latent_vars=True + 
GENERAL_H5_TEST_SET_PATH, return_latent_vars=True ) external_metrics = calc_external_cluster_validation_metrics( clustering_results["clustering"], DATA["test_y"] diff --git a/tests/clustering/vader.py b/tests/clustering/vader.py index bf0b0989..ba8a02de 100644 --- a/tests/clustering/vader.py +++ b/tests/clustering/vader.py @@ -26,9 +26,9 @@ TRAIN_SET, VAL_SET, TEST_SET, - H5_TRAIN_SET_PATH, - H5_VAL_SET_PATH, - H5_TEST_SET_PATH, + GENERAL_H5_TRAIN_SET_PATH, + GENERAL_H5_VAL_SET_PATH, + GENERAL_H5_TEST_SET_PATH, RESULT_SAVING_DIR_FOR_CLUSTERING, check_tb_and_model_checkpoints_existence, ) @@ -113,9 +113,9 @@ def test_3_saving_path(self): @pytest.mark.xdist_group(name="clustering-vader") def test_4_lazy_loading(self): - self.vader.fit(H5_TRAIN_SET_PATH, H5_VAL_SET_PATH) + self.vader.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) clustering_results = self.vader.predict( - H5_TEST_SET_PATH, return_latent_vars=True + GENERAL_H5_TEST_SET_PATH, return_latent_vars=True ) external_metrics = calc_external_cluster_validation_metrics( clustering_results["clustering"], DATA["test_y"] diff --git a/tests/forecasting/bttf.py b/tests/forecasting/bttf.py index 2e9d21bb..5e87cac5 100644 --- a/tests/forecasting/bttf.py +++ b/tests/forecasting/bttf.py @@ -11,12 +11,8 @@ from pypots.forecasting import BTTF from pypots.utils.logging import logger -from pypots.utils.metrics import calc_mae -from tests.forecasting.config import ( - TEST_SET, - N_PRED_STEP, -) -from tests.global_test_config import DATA +from pypots.utils.metrics import calc_mse +from tests.global_test_config import DATA, FORECASTING_TEST_SET, N_PRED_STEPS class TestBTTF(unittest.TestCase): @@ -24,9 +20,9 @@ class TestBTTF(unittest.TestCase): # initialize a BTTF model bttf = BTTF( - n_steps=DATA["n_steps"] - N_PRED_STEP, + n_steps=DATA["n_steps"] - N_PRED_STEPS, n_features=DATA["n_features"], - pred_step=N_PRED_STEP, + pred_step=N_PRED_STEPS, rank=10, time_lags=[1, 2, 3, 2, 2 + 1, 2 + 2, 3, 3 + 1, 3 + 2], burn_iter=5, @@ -36,9 +32,9 @@ class TestBTTF(unittest.TestCase): @pytest.mark.xdist_group(name="forecasting-bttf") def test_0_forecasting(self): - predictions = self.bttf.predict(TEST_SET)["forecasting"] - mae = calc_mae(predictions, TEST_SET["X_ori"][:, -N_PRED_STEP:]) - logger.info(f"prediction MAE: {mae}") + predictions = self.bttf.predict(FORECASTING_TEST_SET)["forecasting"] + mse = calc_mse(predictions, FORECASTING_TEST_SET["X_pred"]) + logger.info(f"prediction MSE: {mse}") if __name__ == "__main__": diff --git a/tests/forecasting/config.py b/tests/forecasting/config.py deleted file mode 100644 index 3f2bc225..00000000 --- a/tests/forecasting/config.py +++ /dev/null @@ -1,18 +0,0 @@ -""" -Test configs for forecasting models. -""" - -# Created by Wenjie Du -# License: BSD-3-Clause - -from tests.global_test_config import DATA - -EPOCHS = 5 -N_PRED_STEP = 1 - -TRAIN_SET = {"X": DATA["train_X"]} -VAL_SET = {"X": DATA["val_X"]} -TEST_SET = { - "X": DATA["test_X"][:, :-N_PRED_STEP], - "X_ori": DATA["test_X_ori"], -} diff --git a/tests/forecasting/csdi.py b/tests/forecasting/csdi.py new file mode 100644 index 00000000..3df64ad8 --- /dev/null +++ b/tests/forecasting/csdi.py @@ -0,0 +1,149 @@ +""" +Test cases for CSDI forecasting model. 
+""" + +# Created by Wenjie Du +# License: BSD-3-Clause + + +import os.path +import unittest + +import numpy as np +import pytest + +from pypots.forecasting import CSDI +from pypots.optim import Adam +from pypots.utils.logging import logger +from pypots.utils.metrics import calc_mse, calc_quantile_crps + +from tests.global_test_config import ( + DATA, + EPOCHS, + DEVICE, + N_PRED_STEPS, + FORECASTING_TRAIN_SET, + FORECASTING_VAL_SET, + FORECASTING_TEST_SET, + FORECASTING_H5_TRAIN_SET_PATH, + FORECASTING_H5_VAL_SET_PATH, + FORECASTING_H5_TEST_SET_PATH, + RESULT_SAVING_DIR_FOR_FORECASTING, + check_tb_and_model_checkpoints_existence, +) + + +class TestCSDI(unittest.TestCase): + logger.info("Running tests for a forecasting model CSDI...") + + # set the log and model saving path + saving_path = os.path.join(RESULT_SAVING_DIR_FOR_FORECASTING, "CSDI") + model_save_name = "saved_csdi_model.pypots" + + # initialize an Adam optimizer + optimizer = Adam(lr=0.001, weight_decay=1e-5) + + # initialize a CSDI model + csdi = CSDI( + n_steps=DATA["n_steps"] - N_PRED_STEPS, + n_features=DATA["n_features"], + n_pred_steps=N_PRED_STEPS, + n_pred_features=DATA["n_features"], + n_layers=1, + n_channels=8, + d_time_embedding=32, + d_feature_embedding=3, + d_diffusion_embedding=32, + n_diffusion_steps=5, + n_heads=1, + epochs=EPOCHS, + saving_path=saving_path, + optimizer=optimizer, + device=DEVICE, + ) + + @pytest.mark.xdist_group(name="forecasting-csdi") + def test_0_fit(self): + self.csdi.fit(FORECASTING_TRAIN_SET, FORECASTING_VAL_SET) + + @pytest.mark.xdist_group(name="forecasting-csdi") + def test_1_forecasting(self): + forecasting_X = self.csdi.predict(FORECASTING_TEST_SET, n_sampling_times=2)[ + "forecasting" + ] + test_CRPS = calc_quantile_crps( + forecasting_X, + FORECASTING_TEST_SET["X_pred"], + ~np.isnan(FORECASTING_TEST_SET["X_pred"]), + ) + forecasting_X = forecasting_X.mean(axis=1) # mean over sampling times + assert not np.isnan( + forecasting_X + ).any(), ( + "Output has missing values in the forecasting results that should not be."
+ ) + test_MSE = calc_mse( + forecasting_X, + FORECASTING_TEST_SET["X_pred"], + ~np.isnan(FORECASTING_TEST_SET["X_pred"]), + ) + logger.info(f"CSDI test_MSE: {test_MSE}, test_CRPS: {test_CRPS}") + + @pytest.mark.xdist_group(name="forecasting-csdi") + def test_2_parameters(self): + assert hasattr(self.csdi, "model") and self.csdi.model is not None + + assert hasattr(self.csdi, "optimizer") and self.csdi.optimizer is not None + + assert hasattr(self.csdi, "best_loss") + self.assertNotEqual(self.csdi.best_loss, float("inf")) + + assert ( + hasattr(self.csdi, "best_model_dict") + and self.csdi.best_model_dict is not None + ) + + @pytest.mark.xdist_group(name="forecasting-csdi") + def test_3_saving_path(self): + # whether the root saving dir exists, which should be created by save_log_into_tb_file + assert os.path.exists( + self.saving_path + ), f"file {self.saving_path} does not exist" + + # check if the tensorboard file and model checkpoints exist + check_tb_and_model_checkpoints_existence(self.csdi) + + # save the trained model into file, and check if the path exists + saved_model_path = os.path.join(self.saving_path, self.model_save_name) + self.csdi.save(saved_model_path) + + # test loading the saved model, not necessary, but need to test + self.csdi.load(saved_model_path) + + @pytest.mark.xdist_group(name="forecasting-csdi") + def test_4_lazy_loading(self): + self.csdi.fit(FORECASTING_H5_TRAIN_SET_PATH, FORECASTING_H5_VAL_SET_PATH) + forecasting_results = self.csdi.predict(FORECASTING_H5_TEST_SET_PATH) + forecasting_X = forecasting_results["forecasting"] + test_CRPS = calc_quantile_crps( + forecasting_X, + FORECASTING_TEST_SET["X_pred"], + ~np.isnan(FORECASTING_TEST_SET["X_pred"]), + ) + forecasting_X = forecasting_X.mean(axis=1) # mean over sampling times + assert not np.isnan( + forecasting_X + ).any(), ( + "Output has missing values in the forecasting results that should not be." 
+ ) + + test_MSE = calc_mse( + forecasting_X, + FORECASTING_TEST_SET["X_pred"], + ~np.isnan(FORECASTING_TEST_SET["X_pred"]), + ) + logger.info(f"Lazy-loading CSDI test_MSE: {test_MSE}, test_CRPS: {test_CRPS}") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/global_test_config.py b/tests/global_test_config.py index 2eba4c22..ad388f43 100644 --- a/tests/global_test_config.py +++ b/tests/global_test_config.py @@ -15,7 +15,39 @@ from pypots.utils.logging import logger from pypots.utils.random import set_random_seed -set_random_seed(2023) +# set the random seed for all test cases +RANDOM_SEED = 2023 +# set the number of epochs for all model training +EPOCHS = 2 +# set the number of prediction steps for forecasting models +N_PRED_STEPS = 1 +# tensorboard and model files saving directory +RESULT_SAVING_DIR = "testing_results" +MODEL_SAVING_DIR = f"{RESULT_SAVING_DIR}/models" +DATA_SAVING_DIR = f"{RESULT_SAVING_DIR}/datasets" +RESULT_SAVING_DIR_FOR_IMPUTATION = os.path.join(MODEL_SAVING_DIR, "imputation") +RESULT_SAVING_DIR_FOR_CLASSIFICATION = os.path.join(MODEL_SAVING_DIR, "classification") +RESULT_SAVING_DIR_FOR_CLUSTERING = os.path.join(MODEL_SAVING_DIR, "clustering") +RESULT_SAVING_DIR_FOR_FORECASTING = os.path.join(MODEL_SAVING_DIR, "forecasting") +# paths to save the generated dataset into files for testing the lazy-loading strategy +GENERAL_DATA_SAVING_DIR = f"{DATA_SAVING_DIR}/general_h5dataset" +GENERAL_H5_TRAIN_SET_PATH = os.path.abspath(f"{GENERAL_DATA_SAVING_DIR}/train_set.h5") +GENERAL_H5_VAL_SET_PATH = os.path.abspath(f"{GENERAL_DATA_SAVING_DIR}/val_set.h5") +GENERAL_H5_TEST_SET_PATH = os.path.abspath(f"{GENERAL_DATA_SAVING_DIR}/test_set.h5") +# paths to save the generated dataset for testing forecasting models with the lazy-loading strategy +FORECASTING_DATA_SAVING_DIR = f"{DATA_SAVING_DIR}/forecasting_h5dataset" +FORECASTING_H5_TRAIN_SET_PATH = os.path.abspath( + f"{FORECASTING_DATA_SAVING_DIR}/train_set.h5" +) +FORECASTING_H5_VAL_SET_PATH = os.path.abspath( + f"{FORECASTING_DATA_SAVING_DIR}/val_set.h5" +) +FORECASTING_H5_TEST_SET_PATH = os.path.abspath( + f"{FORECASTING_DATA_SAVING_DIR}/test_set.h5" +) + + +set_random_seed(RANDOM_SEED) # Generate the unified data for testing and cache it first, DATA here is a singleton # Otherwise, file lock will cause bug if running test parallely with pytest-xdist. @@ -43,15 +75,21 @@ "y": DATA["test_y"].astype(float), } -# tensorboard and model files saving directory -RESULT_SAVING_DIR = "testing_results" -RESULT_SAVING_DIR_FOR_IMPUTATION = os.path.join(RESULT_SAVING_DIR, "imputation") -RESULT_SAVING_DIR_FOR_CLASSIFICATION = os.path.join(RESULT_SAVING_DIR, "classification") -RESULT_SAVING_DIR_FOR_CLUSTERING = os.path.join(RESULT_SAVING_DIR, "clustering") -RESULT_SAVING_DIR_FOR_FORECASTING = os.path.join(RESULT_SAVING_DIR, "forecasting") - -# set the number of epochs for all model training -EPOCHS = 2 +assert ( + N_PRED_STEPS <= DATA["train_X"].shape[1] +), "N_PRED_STEPS should be less than the sequence length." 
+FORECASTING_TRAIN_SET = { + "X": DATA["train_X"][:, :-N_PRED_STEPS], + "X_pred": DATA["train_X_ori"][:, -N_PRED_STEPS:], +} +FORECASTING_VAL_SET = { + "X": DATA["val_X"][:, :-N_PRED_STEPS], + "X_pred": DATA["val_X_ori"][:, -N_PRED_STEPS:], +} +FORECASTING_TEST_SET = { + "X": DATA["test_X"][:, :-N_PRED_STEPS], + "X_pred": DATA["test_X_ori"][:, -N_PRED_STEPS:], +} # set DEVICES to None if no cuda device is available, to avoid initialization failed while importing test classes n_cuda_devices = torch.cuda.device_count() @@ -65,12 +103,6 @@ # if having no multiple cuda devices, leave it as None to use the default device DEVICE = None -# save the generated dataset into files for testing the lazy-loading strategy -DATA_SAVING_DIR = "h5data_for_tests" -H5_TRAIN_SET_PATH = f"{DATA_SAVING_DIR}/train_set.h5" -H5_VAL_SET_PATH = f"{DATA_SAVING_DIR}/val_set.h5" -H5_TEST_SET_PATH = f"{DATA_SAVING_DIR}/test_set.h5" - def check_tb_and_model_checkpoints_existence(model): # check the tensorboard file existence @@ -86,31 +118,23 @@ def check_tb_and_model_checkpoints_existence(model): if __name__ == "__main__": - if not os.path.exists(H5_TRAIN_SET_PATH): - save_dict_into_h5( - { - "X": DATA["train_X"], - "y": DATA["train_y"].astype(float), - }, - H5_TRAIN_SET_PATH, - ) - - if not os.path.exists(H5_VAL_SET_PATH): - save_dict_into_h5( - { - "X": DATA["val_X"], - "X_ori": DATA["val_X_ori"], - "y": DATA["val_y"].astype(float), - }, - H5_VAL_SET_PATH, - ) - - if not os.path.exists(H5_TEST_SET_PATH): - save_dict_into_h5( - { - "X": DATA["test_X"], - "X_ori": DATA["test_X_ori"], - "y": DATA["test_y"].astype(float), - }, - H5_TEST_SET_PATH, - ) + if not os.path.exists(GENERAL_H5_TRAIN_SET_PATH): + save_dict_into_h5(TRAIN_SET, GENERAL_H5_TRAIN_SET_PATH) + if not os.path.exists(GENERAL_H5_VAL_SET_PATH): + save_dict_into_h5(VAL_SET, GENERAL_H5_VAL_SET_PATH) + if not os.path.exists(GENERAL_H5_TEST_SET_PATH): + save_dict_into_h5(TEST_SET, GENERAL_H5_TEST_SET_PATH) + + if not os.path.exists(FORECASTING_H5_TRAIN_SET_PATH): + save_dict_into_h5(FORECASTING_TRAIN_SET, FORECASTING_H5_TRAIN_SET_PATH) + if not os.path.exists(FORECASTING_H5_VAL_SET_PATH): + save_dict_into_h5(FORECASTING_VAL_SET, FORECASTING_H5_VAL_SET_PATH) + if not os.path.exists(FORECASTING_H5_TEST_SET_PATH): + save_dict_into_h5(FORECASTING_TEST_SET, FORECASTING_H5_TEST_SET_PATH) + + logger.info( + f"Files under GENERAL_DATA_SAVING_DIR: {os.listdir(GENERAL_DATA_SAVING_DIR)}" + ) + logger.info( + f"Files under FORECASTING_DATA_SAVING_DIR: {os.listdir(FORECASTING_DATA_SAVING_DIR)}" + ) diff --git a/tests/imputation/autoformer.py b/tests/imputation/autoformer.py index 83610812..f68da280 100644 --- a/tests/imputation/autoformer.py +++ b/tests/imputation/autoformer.py @@ -23,9 +23,9 @@ TRAIN_SET, VAL_SET, TEST_SET, - H5_TRAIN_SET_PATH, - H5_VAL_SET_PATH, - H5_TEST_SET_PATH, + GENERAL_H5_TRAIN_SET_PATH, + GENERAL_H5_VAL_SET_PATH, + GENERAL_H5_TEST_SET_PATH, RESULT_SAVING_DIR_FOR_IMPUTATION, check_tb_and_model_checkpoints_existence, ) @@ -112,8 +112,8 @@ def test_3_saving_path(self): @pytest.mark.xdist_group(name="imputation-autoformer") def test_4_lazy_loading(self): - self.autoformer.fit(H5_TRAIN_SET_PATH, H5_VAL_SET_PATH) - imputation_results = self.autoformer.predict(H5_TEST_SET_PATH) + self.autoformer.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) + imputation_results = self.autoformer.predict(GENERAL_H5_TEST_SET_PATH) assert not np.isnan( imputation_results["imputation"] ).any(), "Output still has missing values after running impute()." 
diff --git a/tests/imputation/brits.py b/tests/imputation/brits.py index 1e63ffa4..d69e4b1d 100644 --- a/tests/imputation/brits.py +++ b/tests/imputation/brits.py @@ -23,9 +23,9 @@ TRAIN_SET, VAL_SET, TEST_SET, - H5_TRAIN_SET_PATH, - H5_VAL_SET_PATH, - H5_TEST_SET_PATH, + GENERAL_H5_TRAIN_SET_PATH, + GENERAL_H5_VAL_SET_PATH, + GENERAL_H5_TEST_SET_PATH, RESULT_SAVING_DIR_FOR_IMPUTATION, check_tb_and_model_checkpoints_existence, ) @@ -100,8 +100,8 @@ def test_3_saving_path(self): @pytest.mark.xdist_group(name="imputation-brits") def test_4_lazy_loading(self): - self.brits.fit(H5_TRAIN_SET_PATH, H5_VAL_SET_PATH) - imputation_results = self.brits.predict(H5_TEST_SET_PATH) + self.brits.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) + imputation_results = self.brits.predict(GENERAL_H5_TEST_SET_PATH) assert not np.isnan( imputation_results["imputation"] ).any(), "Output still has missing values after running impute()." diff --git a/tests/imputation/crossformer.py b/tests/imputation/crossformer.py index a6a6c55e..e33459ca 100644 --- a/tests/imputation/crossformer.py +++ b/tests/imputation/crossformer.py @@ -23,9 +23,9 @@ TRAIN_SET, VAL_SET, TEST_SET, - H5_TRAIN_SET_PATH, - H5_VAL_SET_PATH, - H5_TEST_SET_PATH, + GENERAL_H5_TRAIN_SET_PATH, + GENERAL_H5_VAL_SET_PATH, + GENERAL_H5_TEST_SET_PATH, RESULT_SAVING_DIR_FOR_IMPUTATION, check_tb_and_model_checkpoints_existence, ) @@ -113,8 +113,8 @@ def test_3_saving_path(self): @pytest.mark.xdist_group(name="imputation-crossformer") def test_4_lazy_loading(self): - self.crossformer.fit(H5_TRAIN_SET_PATH, H5_VAL_SET_PATH) - imputation_results = self.crossformer.predict(H5_TEST_SET_PATH) + self.crossformer.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) + imputation_results = self.crossformer.predict(GENERAL_H5_TEST_SET_PATH) assert not np.isnan( imputation_results["imputation"] ).any(), "Output still has missing values after running impute()." 
diff --git a/tests/imputation/csdi.py b/tests/imputation/csdi.py index a0ee0f93..3023cea2 100644 --- a/tests/imputation/csdi.py +++ b/tests/imputation/csdi.py @@ -23,9 +23,9 @@ TRAIN_SET, VAL_SET, TEST_SET, - H5_TRAIN_SET_PATH, - H5_VAL_SET_PATH, - H5_TEST_SET_PATH, + GENERAL_H5_TRAIN_SET_PATH, + GENERAL_H5_VAL_SET_PATH, + GENERAL_H5_TEST_SET_PATH, RESULT_SAVING_DIR_FOR_IMPUTATION, check_tb_and_model_checkpoints_existence, ) @@ -43,6 +43,7 @@ class TestCSDI(unittest.TestCase): # initialize a CSDI model csdi = CSDI( + n_steps=DATA["n_steps"], n_features=DATA["n_features"], n_layers=1, n_channels=8, @@ -109,8 +110,8 @@ def test_3_saving_path(self): @pytest.mark.xdist_group(name="imputation-csdi") def test_4_lazy_loading(self): - self.csdi.fit(H5_TRAIN_SET_PATH, H5_VAL_SET_PATH) - imputation_results = self.csdi.predict(H5_TEST_SET_PATH) + self.csdi.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) + imputation_results = self.csdi.predict(GENERAL_H5_TEST_SET_PATH) imputed_X = imputation_results["imputation"] test_CRPS = calc_quantile_crps( imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"] diff --git a/tests/imputation/dlinear.py b/tests/imputation/dlinear.py index c1351305..d8cdf858 100644 --- a/tests/imputation/dlinear.py +++ b/tests/imputation/dlinear.py @@ -23,9 +23,9 @@ TRAIN_SET, VAL_SET, TEST_SET, - H5_TRAIN_SET_PATH, - H5_VAL_SET_PATH, - H5_TEST_SET_PATH, + GENERAL_H5_TRAIN_SET_PATH, + GENERAL_H5_VAL_SET_PATH, + GENERAL_H5_TEST_SET_PATH, RESULT_SAVING_DIR_FOR_IMPUTATION, check_tb_and_model_checkpoints_existence, ) @@ -127,8 +127,8 @@ def test_3_saving_path(self): @pytest.mark.xdist_group(name="imputation-dlinear") def test_4_lazy_loading(self): - self.dlinear.fit(H5_TRAIN_SET_PATH, H5_VAL_SET_PATH) - imputation_results = self.dlinear.predict(H5_TEST_SET_PATH) + self.dlinear.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) + imputation_results = self.dlinear.predict(GENERAL_H5_TEST_SET_PATH) assert not np.isnan( imputation_results["imputation"] ).any(), "Output still has missing values after running impute()." diff --git a/tests/imputation/etsformer.py b/tests/imputation/etsformer.py index 87b8ce49..3ade3dfd 100644 --- a/tests/imputation/etsformer.py +++ b/tests/imputation/etsformer.py @@ -23,9 +23,9 @@ TRAIN_SET, VAL_SET, TEST_SET, - H5_TRAIN_SET_PATH, - H5_VAL_SET_PATH, - H5_TEST_SET_PATH, + GENERAL_H5_TRAIN_SET_PATH, + GENERAL_H5_VAL_SET_PATH, + GENERAL_H5_TEST_SET_PATH, RESULT_SAVING_DIR_FOR_IMPUTATION, check_tb_and_model_checkpoints_existence, ) @@ -112,8 +112,8 @@ def test_3_saving_path(self): @pytest.mark.xdist_group(name="imputation-etsformer") def test_4_lazy_loading(self): - self.etsformer.fit(H5_TRAIN_SET_PATH, H5_VAL_SET_PATH) - imputation_results = self.etsformer.predict(H5_TEST_SET_PATH) + self.etsformer.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) + imputation_results = self.etsformer.predict(GENERAL_H5_TEST_SET_PATH) assert not np.isnan( imputation_results["imputation"] ).any(), "Output still has missing values after running impute()." 
diff --git a/tests/imputation/fedformer.py b/tests/imputation/fedformer.py index 7a6b24e5..fe563582 100644 --- a/tests/imputation/fedformer.py +++ b/tests/imputation/fedformer.py @@ -23,9 +23,9 @@ TRAIN_SET, VAL_SET, TEST_SET, - H5_TRAIN_SET_PATH, - H5_VAL_SET_PATH, - H5_TEST_SET_PATH, + GENERAL_H5_TRAIN_SET_PATH, + GENERAL_H5_VAL_SET_PATH, + GENERAL_H5_TEST_SET_PATH, RESULT_SAVING_DIR_FOR_IMPUTATION, check_tb_and_model_checkpoints_existence, ) @@ -114,8 +114,8 @@ def test_3_saving_path(self): @pytest.mark.xdist_group(name="imputation-fedformer") def test_4_lazy_loading(self): - self.fedformer.fit(H5_TRAIN_SET_PATH, H5_VAL_SET_PATH) - imputation_results = self.fedformer.predict(H5_TEST_SET_PATH) + self.fedformer.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) + imputation_results = self.fedformer.predict(GENERAL_H5_TEST_SET_PATH) assert not np.isnan( imputation_results["imputation"] ).any(), "Output still has missing values after running impute()." diff --git a/tests/imputation/gpvae.py b/tests/imputation/gpvae.py index 9db47e7e..c76170e8 100644 --- a/tests/imputation/gpvae.py +++ b/tests/imputation/gpvae.py @@ -23,9 +23,9 @@ TRAIN_SET, VAL_SET, TEST_SET, - H5_TRAIN_SET_PATH, - H5_VAL_SET_PATH, - H5_TEST_SET_PATH, + GENERAL_H5_TRAIN_SET_PATH, + GENERAL_H5_VAL_SET_PATH, + GENERAL_H5_TEST_SET_PATH, RESULT_SAVING_DIR_FOR_IMPUTATION, check_tb_and_model_checkpoints_existence, ) @@ -101,8 +101,8 @@ def test_3_saving_path(self): @pytest.mark.xdist_group(name="imputation-gpvae") def test_4_lazy_loading(self): - self.gp_vae.fit(H5_TRAIN_SET_PATH, H5_VAL_SET_PATH) - imputed_X = self.gp_vae.predict(H5_TEST_SET_PATH, n_sampling_times=2)[ + self.gp_vae.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) + imputed_X = self.gp_vae.predict(GENERAL_H5_TEST_SET_PATH, n_sampling_times=2)[ "imputation" ] imputed_X = imputed_X.mean(axis=1) diff --git a/tests/imputation/informer.py b/tests/imputation/informer.py index 6f13680b..63689b03 100644 --- a/tests/imputation/informer.py +++ b/tests/imputation/informer.py @@ -23,9 +23,9 @@ TRAIN_SET, VAL_SET, TEST_SET, - H5_TRAIN_SET_PATH, - H5_VAL_SET_PATH, - H5_TEST_SET_PATH, + GENERAL_H5_TRAIN_SET_PATH, + GENERAL_H5_VAL_SET_PATH, + GENERAL_H5_TEST_SET_PATH, RESULT_SAVING_DIR_FOR_IMPUTATION, check_tb_and_model_checkpoints_existence, ) @@ -110,8 +110,8 @@ def test_3_saving_path(self): @pytest.mark.xdist_group(name="imputation-informer") def test_4_lazy_loading(self): - self.informer.fit(H5_TRAIN_SET_PATH, H5_VAL_SET_PATH) - imputation_results = self.informer.predict(H5_TEST_SET_PATH) + self.informer.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) + imputation_results = self.informer.predict(GENERAL_H5_TEST_SET_PATH) assert not np.isnan( imputation_results["imputation"] ).any(), "Output still has missing values after running impute()." 
diff --git a/tests/imputation/locf.py b/tests/imputation/locf.py index 38ed1ce9..b22f4b42 100644 --- a/tests/imputation/locf.py +++ b/tests/imputation/locf.py @@ -19,9 +19,9 @@ DATA, DEVICE, TEST_SET, - H5_TRAIN_SET_PATH, - H5_VAL_SET_PATH, - H5_TEST_SET_PATH, + GENERAL_H5_TRAIN_SET_PATH, + GENERAL_H5_VAL_SET_PATH, + GENERAL_H5_TEST_SET_PATH, ) @@ -114,8 +114,8 @@ def test_0_impute(self): @pytest.mark.xdist_group(name="imputation-locf") def test_4_lazy_loading(self): - self.locf_backward.fit(H5_TRAIN_SET_PATH, H5_VAL_SET_PATH) - imputation_results = self.locf_backward.predict(H5_TEST_SET_PATH) + self.locf_backward.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) + imputation_results = self.locf_backward.predict(GENERAL_H5_TEST_SET_PATH) assert not np.isnan( imputation_results["imputation"] ).any(), "Output still has missing values after running impute()." diff --git a/tests/imputation/mean.py b/tests/imputation/mean.py index 31747c71..04be2c9d 100644 --- a/tests/imputation/mean.py +++ b/tests/imputation/mean.py @@ -18,9 +18,9 @@ from tests.global_test_config import ( DATA, TEST_SET, - H5_TRAIN_SET_PATH, - H5_VAL_SET_PATH, - H5_TEST_SET_PATH, + GENERAL_H5_TRAIN_SET_PATH, + GENERAL_H5_VAL_SET_PATH, + GENERAL_H5_TEST_SET_PATH, ) @@ -56,8 +56,8 @@ def test_0_impute(self): @pytest.mark.xdist_group(name="imputation-mean") def test_4_lazy_loading(self): - self.mean.fit(H5_TRAIN_SET_PATH, H5_VAL_SET_PATH) - imputation_results = self.mean.predict(H5_TEST_SET_PATH) + self.mean.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) + imputation_results = self.mean.predict(GENERAL_H5_TEST_SET_PATH) assert not np.isnan( imputation_results["imputation"] ).any(), "Output still has missing values after running impute()." diff --git a/tests/imputation/median.py b/tests/imputation/median.py index c11ab3d3..d4960449 100644 --- a/tests/imputation/median.py +++ b/tests/imputation/median.py @@ -18,9 +18,9 @@ from tests.global_test_config import ( DATA, TEST_SET, - H5_TRAIN_SET_PATH, - H5_VAL_SET_PATH, - H5_TEST_SET_PATH, + GENERAL_H5_TRAIN_SET_PATH, + GENERAL_H5_VAL_SET_PATH, + GENERAL_H5_TEST_SET_PATH, ) @@ -56,8 +56,8 @@ def test_0_impute(self): @pytest.mark.xdist_group(name="imputation-median") def test_4_lazy_loading(self): - self.median.fit(H5_TRAIN_SET_PATH, H5_VAL_SET_PATH) - imputation_results = self.median.predict(H5_TEST_SET_PATH) + self.median.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) + imputation_results = self.median.predict(GENERAL_H5_TEST_SET_PATH) assert not np.isnan( imputation_results["imputation"] ).any(), "Output still has missing values after running impute()." diff --git a/tests/imputation/mrnn.py b/tests/imputation/mrnn.py index 4506e755..5e42e256 100644 --- a/tests/imputation/mrnn.py +++ b/tests/imputation/mrnn.py @@ -23,9 +23,9 @@ TRAIN_SET, VAL_SET, TEST_SET, - H5_TRAIN_SET_PATH, - H5_VAL_SET_PATH, - H5_TEST_SET_PATH, + GENERAL_H5_TRAIN_SET_PATH, + GENERAL_H5_VAL_SET_PATH, + GENERAL_H5_TEST_SET_PATH, RESULT_SAVING_DIR_FOR_IMPUTATION, check_tb_and_model_checkpoints_existence, ) @@ -100,8 +100,8 @@ def test_3_saving_path(self): @pytest.mark.xdist_group(name="imputation-mrnn") def test_4_lazy_loading(self): - self.mrnn.fit(H5_TRAIN_SET_PATH, H5_VAL_SET_PATH) - imputation_results = self.mrnn.predict(H5_TEST_SET_PATH) + self.mrnn.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) + imputation_results = self.mrnn.predict(GENERAL_H5_TEST_SET_PATH) assert not np.isnan( imputation_results["imputation"] ).any(), "Output still has missing values after running impute()." 
diff --git a/tests/imputation/patchtst.py b/tests/imputation/patchtst.py index 161d3cca..fcfdff4b 100644 --- a/tests/imputation/patchtst.py +++ b/tests/imputation/patchtst.py @@ -23,9 +23,9 @@ TRAIN_SET, VAL_SET, TEST_SET, - H5_TRAIN_SET_PATH, - H5_VAL_SET_PATH, - H5_TEST_SET_PATH, + GENERAL_H5_TRAIN_SET_PATH, + GENERAL_H5_VAL_SET_PATH, + GENERAL_H5_TEST_SET_PATH, RESULT_SAVING_DIR_FOR_IMPUTATION, check_tb_and_model_checkpoints_existence, ) @@ -114,8 +114,8 @@ def test_3_saving_path(self): @pytest.mark.xdist_group(name="imputation-patchtst") def test_4_lazy_loading(self): - self.patchtst.fit(H5_TRAIN_SET_PATH, H5_VAL_SET_PATH) - imputation_results = self.patchtst.predict(H5_TEST_SET_PATH) + self.patchtst.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) + imputation_results = self.patchtst.predict(GENERAL_H5_TEST_SET_PATH) assert not np.isnan( imputation_results["imputation"] ).any(), "Output still has missing values after running impute()." diff --git a/tests/imputation/saits.py b/tests/imputation/saits.py index 960e2bd4..325b28d2 100644 --- a/tests/imputation/saits.py +++ b/tests/imputation/saits.py @@ -23,9 +23,9 @@ TRAIN_SET, VAL_SET, TEST_SET, - H5_TRAIN_SET_PATH, - H5_VAL_SET_PATH, - H5_TEST_SET_PATH, + GENERAL_H5_TRAIN_SET_PATH, + GENERAL_H5_VAL_SET_PATH, + GENERAL_H5_TEST_SET_PATH, RESULT_SAVING_DIR_FOR_IMPUTATION, check_tb_and_model_checkpoints_existence, ) @@ -112,8 +112,8 @@ def test_3_saving_path(self): @pytest.mark.xdist_group(name="imputation-saits") def test_4_lazy_loading(self): - self.saits.fit(H5_TRAIN_SET_PATH, H5_VAL_SET_PATH) - imputation_results = self.saits.predict(H5_TEST_SET_PATH) + self.saits.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) + imputation_results = self.saits.predict(GENERAL_H5_TEST_SET_PATH) assert not np.isnan( imputation_results["imputation"] ).any(), "Output still has missing values after running impute()." diff --git a/tests/imputation/timesnet.py b/tests/imputation/timesnet.py index 606d8747..8959cc9f 100644 --- a/tests/imputation/timesnet.py +++ b/tests/imputation/timesnet.py @@ -23,9 +23,9 @@ TRAIN_SET, VAL_SET, TEST_SET, - H5_TRAIN_SET_PATH, - H5_VAL_SET_PATH, - H5_TEST_SET_PATH, + GENERAL_H5_TRAIN_SET_PATH, + GENERAL_H5_VAL_SET_PATH, + GENERAL_H5_TEST_SET_PATH, RESULT_SAVING_DIR_FOR_IMPUTATION, check_tb_and_model_checkpoints_existence, ) @@ -110,8 +110,8 @@ def test_3_saving_path(self): @pytest.mark.xdist_group(name="imputation-timesnet") def test_4_lazy_loading(self): - self.timesnet.fit(H5_TRAIN_SET_PATH, H5_VAL_SET_PATH) - imputation_results = self.timesnet.predict(H5_TEST_SET_PATH) + self.timesnet.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) + imputation_results = self.timesnet.predict(GENERAL_H5_TEST_SET_PATH) assert not np.isnan( imputation_results["imputation"] ).any(), "Output still has missing values after running impute()." 
diff --git a/tests/imputation/transformer.py b/tests/imputation/transformer.py index 2563680c..06839b95 100644 --- a/tests/imputation/transformer.py +++ b/tests/imputation/transformer.py @@ -23,9 +23,9 @@ TRAIN_SET, VAL_SET, TEST_SET, - H5_TRAIN_SET_PATH, - H5_VAL_SET_PATH, - H5_TEST_SET_PATH, + GENERAL_H5_TRAIN_SET_PATH, + GENERAL_H5_VAL_SET_PATH, + GENERAL_H5_TEST_SET_PATH, RESULT_SAVING_DIR_FOR_IMPUTATION, check_tb_and_model_checkpoints_existence, ) @@ -109,8 +109,8 @@ def test_3_saving_path(self): @pytest.mark.xdist_group(name="imputation-transformer") def test_4_lazy_loading(self): - self.transformer.fit(H5_TRAIN_SET_PATH, H5_VAL_SET_PATH) - imputation_results = self.transformer.predict(H5_TEST_SET_PATH) + self.transformer.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) + imputation_results = self.transformer.predict(GENERAL_H5_TEST_SET_PATH) assert not np.isnan( imputation_results["imputation"] ).any(), "Output still has missing values after running impute()." diff --git a/tests/imputation/usgan.py b/tests/imputation/usgan.py index 934553a3..c9da6696 100644 --- a/tests/imputation/usgan.py +++ b/tests/imputation/usgan.py @@ -23,9 +23,9 @@ TRAIN_SET, VAL_SET, TEST_SET, - H5_TRAIN_SET_PATH, - H5_VAL_SET_PATH, - H5_TEST_SET_PATH, + GENERAL_H5_TRAIN_SET_PATH, + GENERAL_H5_VAL_SET_PATH, + GENERAL_H5_TEST_SET_PATH, RESULT_SAVING_DIR_FOR_IMPUTATION, check_tb_and_model_checkpoints_existence, ) @@ -103,8 +103,8 @@ def test_3_saving_path(self): @pytest.mark.xdist_group(name="imputation-usgan") def test_4_lazy_loading(self): - self.usgan.fit(H5_TRAIN_SET_PATH, H5_VAL_SET_PATH) - imputation_results = self.usgan.predict(H5_TEST_SET_PATH) + self.usgan.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) + imputation_results = self.usgan.predict(GENERAL_H5_TEST_SET_PATH) assert not np.isnan( imputation_results["imputation"] ).any(), "Output still has missing values after running impute()."