From f16f1433dfee675413417eeb9bb48804c3dd93fe Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Sun, 30 Jun 2024 11:05:49 +0800 Subject: [PATCH 1/4] feat: enable calc_missing_rate to work with pandas.DataFrame; --- pygrinder/utils.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/pygrinder/utils.py b/pygrinder/utils.py index 8fbcd2b..f2a71a1 100644 --- a/pygrinder/utils.py +++ b/pygrinder/utils.py @@ -8,20 +8,23 @@ from typing import Union, Tuple import numpy as np +import pandas as pd import torch -def calc_missing_rate(X: Union[np.ndarray, torch.Tensor]) -> float: +def calc_missing_rate( + X: Union[np.ndarray, torch.Tensor, pd.DataFrame], +) -> float: """Calculate the originally missing rate of the raw data. Parameters ---------- X: - Data array that may contain missing values. + Data array/tensor/frame that may contain missing values. Returns ------- - originally_missing_rate, + missing_rate, The originally missing rate of the raw data. Its value should be in the range [0,1]. """ @@ -29,16 +32,18 @@ def calc_missing_rate(X: Union[np.ndarray, torch.Tensor]) -> float: X = np.asarray(X) if isinstance(X, np.ndarray): - originally_missing_rate = np.sum(np.isnan(X)) / np.prod(X.shape) + missing_rate = np.sum(np.isnan(X)) / np.prod(X.shape) elif isinstance(X, torch.Tensor): - originally_missing_rate = torch.sum(torch.isnan(X)) / np.prod(X.shape) - originally_missing_rate = originally_missing_rate.item() + missing_rate = torch.sum(torch.isnan(X)) / np.prod(X.shape) + missing_rate = missing_rate.item() + elif isinstance(X, pd.DataFrame): + missing_rate = pd.isna(X).sum().sum() / np.prod(X.shape) else: raise TypeError( - f"X must be type of list/numpy.ndarray/torch.Tensor, but got {type(X)}" + f"X must be type of list/numpy.ndarray/torch.Tensor/pandas.DataFrame, but got {type(X)}" ) - return originally_missing_rate + return missing_rate def masked_fill( From 6cf07c04aae4c265d8f8cac3b7baef2d4070db60 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Sun, 30 Jun 2024 23:22:08 +0800 Subject: [PATCH 2/4] refactor: add doc string for seq_missing and block_missing; --- pygrinder/block_missing/block_missing.py | 32 +++++++++++++++++++-- pygrinder/sequential_missing/seq_missing.py | 26 +++++++++++++++++ 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/pygrinder/block_missing/block_missing.py b/pygrinder/block_missing/block_missing.py index cd59b4d..8b06014 100644 --- a/pygrinder/block_missing/block_missing.py +++ b/pygrinder/block_missing/block_missing.py @@ -112,12 +112,40 @@ def block_missing( feature_idx: list = None, step_idx: list = None, ) -> Union[np.ndarray, torch.Tensor]: + """Create block missing data. + + Parameters + ---------- + X : + Data vector. If X has any missing values, they should be numpy.nan. + + factor : + The actual missing rate of block_missing is hard to be strictly controlled. + Hence, we use ``factor`` to help adjust the final missing rate. + + block_len : + The length of the mask block. + + block_width : + The width of the mask block. + + feature_idx : + The indices of features for missing block to star with. + + step_idx : + The indices of steps for a missing block to start with. + + Returns + ------- + corrupted_X : + Original X with artificial missing values. + Both originally-missing and artificially-missing values are left as NaN. + + """ if isinstance(X, list): X = np.asarray(X) n_samples, n_steps, n_features = X.shape - # assert 0 < p <= 1, f"p must be in range (0, 1), but got {p}" - assert isinstance( block_len, int ), f"`block_len` must be type of int, but got {type(block_len)}" diff --git a/pygrinder/sequential_missing/seq_missing.py b/pygrinder/sequential_missing/seq_missing.py index a752cde..62d5367 100644 --- a/pygrinder/sequential_missing/seq_missing.py +++ b/pygrinder/sequential_missing/seq_missing.py @@ -109,6 +109,32 @@ def seq_missing( feature_idx: list = None, step_idx: list = None, ) -> Union[np.ndarray, torch.Tensor]: + """Create subsequence missing data. + + Parameters + ---------- + X : + Data vector. If X has any missing values, they should be numpy.nan. + + p : + The probability that values may be masked as missing completely at random. + + seq_len : + The length of missing sequence. + + feature_idx : + The indices of features for missing sequences to be corrupted. + + step_idx : + The indices of steps for a missing sequence to start with. + + Returns + ------- + corrupted_X : + Original X with artificial missing values. + Both originally-missing and artificially-missing values are left as NaN. + + """ if isinstance(X, list): X = np.asarray(X) n_samples, n_steps, n_features = X.shape From d7e4e1984f9e05540004d3e9b8869f870ae925e7 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Sun, 30 Jun 2024 23:22:43 +0800 Subject: [PATCH 3/4] docs: update README; --- README.md | 42 ++++++++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 5888477..cd01c32 100644 --- a/README.md +++ b/README.md @@ -74,7 +74,17 @@ or install from source code: ```python import numpy as np -from pygrinder import mcar, mar_logistic, mnar_x, mnar_t + +from pygrinder import ( + mcar, + mar_logistic, + mnar_x, + mnar_t, + rdo, + seq_missing, + block_missing, + calc_missing_rate +) # given a time-series dataset with 128 samples, each sample with 10 time steps and 36 data features ts_dataset = np.random.randn(128, 10, 36) @@ -87,11 +97,29 @@ X_with_mar_data = mar_logistic(ts_dataset[:, 0, :], obs_rate=0.1, missing_rate=0 # grind the dataset with MNAR pattern X_with_mnar_x_data = mnar_x(ts_dataset, offset=0.1) -X_with_mnar_t_data = mnar_t(ts_dataset, cycle=20, pos = 10, scale = 3) +X_with_mnar_t_data = mnar_t(ts_dataset, cycle=20, pos=10, scale=3) + +# grind the dataset with RDO pattern +X_with_rdo_data = rdo(ts_dataset, p=0.1) + +# grind the dataset with Sequence-Missing pattern +X_with_seq_missing_data = seq_missing(ts_dataset, p=0.1, seq_len=5) + +# grind the dataset with Block-Missing pattern +X_with_block_missing_data = block_missing(ts_dataset, factor=0.1, block_width=3, block_len=3) + +# calculate the missing rate of the dataset +missing_rate = calc_missing_rate(X_with_mcar_data) ``` ## ❖ Citing PyGrinder/PyPOTS +

+ + + +

+ The paper introducing PyPOTS is available [on arXiv](https://arxiv.org/abs/2305.18811), A short version of it is accepted by the 9th SIGKDD international workshop on Mining and Learning from Time Series ([MiLeTS'23](https://kdd-milets.github.io/milets2023/))). **Additionally**, PyPOTS has been included as a [PyTorch Ecosystem](https://pytorch.org/ecosystem/) project. @@ -102,12 +130,6 @@ please cite it as below and 🌟star this repository to make others notice this There are scientific research projects using PyPOTS and referencing in their papers. Here is [an incomplete list of them](https://scholar.google.com/scholar?as_ylo=2022&q=%E2%80%9CPyPOTS%E2%80%9D&hl=en). -

- - - -

- ``` bibtex @article{du2023pypots, title={{PyPOTS: a Python toolbox for data mining on Partially-Observed Time Series}}, @@ -117,9 +139,9 @@ year={2023}, } ``` or -> Wenjie Du. (2023). +> Wenjie Du. > PyPOTS: a Python toolbox for data mining on Partially-Observed Time Series. -> arXiv, abs/2305.18811. https://arxiv.org/abs/2305.18811 +> arXiv, abs/2305.18811, 2023.
From 5c30c1049cb83247895472c71595a3595ec036da Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Mon, 1 Jul 2024 07:57:40 +0800 Subject: [PATCH 4/4] feat: release v0.6.1; --- pygrinder/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pygrinder/__init__.py b/pygrinder/__init__.py index 676acab..447b8f1 100644 --- a/pygrinder/__init__.py +++ b/pygrinder/__init__.py @@ -21,7 +21,7 @@ # # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. # 'X.Y.dev0' is the canonical version of 'X.Y.dev' -__version__ = "0.6" +__version__ = "0.6.1" from .missing_at_random import mar_logistic from .missing_completely_at_random import mcar, mcar_little_test