Skip to content

Commit

Permalink
Optimized the logic applying univariate model to DataFrame (#67)
Browse files Browse the repository at this point in the history
* optimized applying univariate model to DF

* updated version number and changelogs

* minor optimization

* Fixed a bug that model trained with Series cannot be applied to DataFrame due to name matching error

* modified docstrings

* updated version number

* updated changelog
  • Loading branch information
tailaiw authored Feb 18, 2020
1 parent 9e9b86b commit 90b9145
Show file tree
Hide file tree
Showing 11 changed files with 164 additions and 167 deletions.
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@
# The short X.Y version.
version = "0.5"
# The full version, including alpha/beta/rc tags.
release = "0.5.3"
release = "0.5.4"

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
7 changes: 7 additions & 0 deletions docs/releasehistory.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,13 @@
Release History
***************

Version 0.5.4 (Feb 18, 2020)
===================================
- Optimized the workflow of how a univariate model is applied to pandas DataFrame
- Added more informative error messages
- Fixed some bugs resulting in model-column matching error due to inconsistency between output Series names and DataFrame columns
- Clarified the workflow in the documentation

Version 0.5.3 (Feb 12, 2020)
===================================
- Quick hotfix to avoid errors caused by statsmodels v0.11 by requiring statsmodels dependency <0.11
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = adtk
version = 0.5.3
version = 0.5.4
author = Arundo Analytics, Inc.
maintainer = Tailai Wen
maintainer_email = tailai.wen@arundo.com
Expand Down
2 changes: 1 addition & 1 deletion src/adtk/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@
"""

__version__ = "0.5.3"
__version__ = "0.5.4"
79 changes: 65 additions & 14 deletions src/adtk/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ class _Model(ABC):
def __init__(self, **kwargs):
for key, value in kwargs.items():
setattr(self, key, value)
self._fitted = False
self._fitted = (
0
) # 0 for not fitted, 1 for fitted, 2 for univariate model fitted by DF

@abstractmethod
def _fit(self, ts):
Expand Down Expand Up @@ -91,37 +93,78 @@ def _fit(self, ts):
s = ts.copy()
self._fit_core(s)
self._models = None
self._fitted = 1
elif isinstance(ts, pd.DataFrame):
df = ts.copy()
if df.columns.duplicated().any():
raise ValueError(
"Input DataFrame must have unique column names."
)
if self._need_fit:
self._update_models(df.columns)
# fit model for each column
for col in df.columns:
self._models[col].fit(df[col])
self._fitted = 2
else:
pass
else:
raise TypeError("Input must be a pandas Series or DataFrame.")
self._fitted = True

def _predict(self, ts):
if self._need_fit and (not self._fitted):
if self._need_fit and (self._fitted == 0):
raise RuntimeError("The model must be trained first.")
if isinstance(ts, pd.Series):
if self._need_fit and (
self._fitted == 2
): # fitted by DF, to be applied to Series
raise RuntimeError(
"The model was trained by a pandas DataFrame object, "
"it can only be applied to a pandas DataFrame object."
)
s = ts.copy()
predicted = self._predict_core(s)
# if a Series-to-Series operation, make sure Series name keeps
if isinstance(predicted, pd.Series):
predicted.name = ts.name
elif isinstance(ts, pd.DataFrame):
df = ts.copy()
# if the model doesn't need fitting, initialize or reset a model for
# each column
if not self._need_fit:
self._update_models(df.columns)
# predict for each column
predicted = pd.concat(
[self._models[col]._predict(df[col]) for col in df.columns],
axis=1,
)
if df.columns.duplicated().any():
raise ValueError(
"Input DataFrame must have unique column names."
)
if (not self._need_fit) or (self._fitted == 1):
# apply the model to each column
predicted = []
for col in df.columns:
predicted_this_col = self._predict(df[col])
if isinstance(predicted_this_col, pd.DataFrame):
predicted_this_col = predicted_this_col.rename(
columns={
col1: "{}_{}".format(col, col1)
for col1 in predicted_this_col.columns
}
)
predicted.append(predicted_this_col)
predicted = pd.concat(predicted, axis=1)
else:
# predict for each column
if not (set(self._models.keys()) >= set(df.columns)):
raise ValueError(
"The model was trained by a pandas DataFrame with "
"columns {}, but the input DataFrame contains columns "
"{} which are unknown to the model.".format(
list(set(self._models.keys())),
list(set(df.columns) - set(self._models.keys())),
)
)
predicted = pd.concat(
[
self._models[col]._predict(df[col])
for col in df.columns
],
axis=1,
)
else:
raise TypeError("Input must be a pandas Series or DataFrame.")
# make sure index freq is the same (because pandas has a bug that some
Expand Down Expand Up @@ -153,16 +196,24 @@ def fit_predict(self, ts):
class _ModelHD(_Model):
def _fit(self, df):
if isinstance(df, pd.DataFrame):
if df.columns.duplicated().any():
raise ValueError(
"Input DataFrame must have unique column names."
)
df_copy = df.copy()
self._fit_core(df_copy)
else:
raise TypeError("Input must be a pandas DataFrame.")
self._fitted = True
self._fitted = 1

def _predict(self, df):
if self._need_fit and (not self._fitted):
if self._need_fit and (self._fitted == 0):
raise RuntimeError("The model must be trained first.")
if isinstance(df, pd.DataFrame):
if df.columns.duplicated().any():
raise ValueError(
"Input DataFrame must have unique column names."
)
df_copy = df.copy()
predicted = self._predict_core(df_copy)
else:
Expand Down
21 changes: 14 additions & 7 deletions src/adtk/_detector_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,14 @@ def detect(self, ts, return_list=False):
Parameters
----------
ts: pandas.Series or pandas.DataFrame
Time series to detect anomalies from.
If a DataFrame with k columns, k univariate detectors will be
applied to them independently.
Time series to detect anomalies from. If a DataFrame with k
columns, it is treated as k independent univariate time series.
- If the detector was trained with a Series, the detector will be
applied to each univariate series independently;
- If the detector was trained with a DataFrame, i.e. the detector
is essentially k detectors, those detectors will be applied to
each univariate series respectively.
return_list: bool, optional
Whether to return a list of anomalous time stamps, or a binary
Expand Down Expand Up @@ -66,8 +71,9 @@ def fit_detect(self, ts, return_list=False):
----------
ts: pandas.Series or pandas.DataFrame
Time series to be used for training and be detected for anomalies.
If a DataFrame with k columns, k univariate detectors will be
trained and applied to them independently.
If a DataFrame with k columns, it is treated as k independent
univariate time series, and k univariate detectors will be trained
and applied to each series independently.
return_list: bool, optional
Whether to return a list of anomalous time stamps, or a binary
Expand Down Expand Up @@ -109,8 +115,9 @@ def score(self, ts, anomaly_true, scoring="recall", **kwargs):
----------
ts: pandas Series or pandas.DataFrame
Time series to detect anomalies from.
If a DataFrame with k columns, k univariate detectors will be
applied to them independently.
If a DataFrame with k columns, it is treated as k independent
univariate time series, and k univariate detectors will be applied
to each series independently.
anomaly_true: pandas.Series, pandas.DataFrame, list, or dict
True anomalies.
Expand Down
16 changes: 11 additions & 5 deletions src/adtk/_transformer_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,14 @@ def transform(self, ts):
Parameters
----------
ts: pandas.Series or pandas.DataFrame
Time series to be transformed.
If a DataFrame with k columns, k univariate transformers will be
applied to them independently.
Time series to be transformed. If a DataFrame with k columns, it is
treated as k independent univariate time series.
- If the transformer was trained with a Series, the transformer
will be applied to each univariate series independently;
- If the transformer was trained with a DataFrame, i.e. the
transformer is essentially k transformers, those transformers
will be applied to each univariate series respectively.
Returns
-------
Expand All @@ -41,8 +46,9 @@ def fit_transform(self, ts):
----------
ts: pandas.Series or pandas.DataFrame
Time series to be used for training and be transformed.
If a DataFrame with k columns, k univariate transformers will be
applied to them independently.
If a DataFrame with k columns, it is treated as k independent
univariate time series, and k univariate transformers will be
trained and applied to each series independently.
Returns
-------
Expand Down
77 changes: 1 addition & 76 deletions src/adtk/detector/detector_1d.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,6 @@
class CustomizedDetector1D(_Detector1D):
"""Detector derived from a user-given function and parameters.
This is an univariate detector. When it is applied to a multivariate time
series (i.e. pandas DataFrame), it will be applied to every series
independently. All parameters can be defined as a dict object where key-
value pairs are series names (i.e. column names of DataFrame) and the
model parameter for that series. If not, then the same parameter will be
applied to all series.
Parameters
----------
detect_func: function
Expand Down Expand Up @@ -133,13 +126,6 @@ class ThresholdAD(_Detector1D):
This detector compares time series values with user-given thresholds, and
identifies time points as anomalous when values are beyond the thresholds.
This is an univariate detector. When it is applied to a multivariate time
series (i.e. pandas DataFrame), it will be applied to every series
independently. All parameters can be defined as a dict object where key-
value pairs are series names (i.e. column names of DataFrame) and the
model parameter for that series. If not, then the same parameter will be
applied to all series.
Parameters
----------
low: float, optional
Expand Down Expand Up @@ -178,13 +164,6 @@ class QuantileAD(_Detector1D):
of historical data, and identifies time points as anomalous when values
are beyond the thresholds.
This is an univariate detector. When it is applied to a multivariate time
series (i.e. pandas DataFrame), it will be applied to every series
independently. All parameters can be defined as a dict object where key-
value pairs are series names (i.e. column names of DataFrame) and the
model parameter for that series. If not, then the same parameter will be
applied to all series.
Parameters
----------
low: float, optional
Expand Down Expand Up @@ -239,13 +218,6 @@ class InterQuartileRangeAD(_Detector1D):
historical data, and identifies time points as anomalous when differences
are beyond the inter-quartile range times a user-given factor c.
This is an univariate detector. When it is applied to a multivariate time
series (i.e. pandas DataFrame), it will be applied to every series
independently. All parameters can be defined as a dict object where key-
value pairs are series names (i.e. column names of DataFrame) and the
model parameter for that series. If not, then the same parameter will be
applied to all series.
Parameters
----------
c: float, or 2-tuple (float, float), optional
Expand Down Expand Up @@ -317,13 +289,6 @@ class GeneralizedESDTestAD(_Detector1D):
follow an approximately normal distribution. Please only use this detector
when this assumption holds.
This is an univariate detector. When it is applied to a multivariate time
series (i.e. pandas DataFrame), it will be applied to every series
independently. All parameters can be defined as a dict object where key-
value pairs are series names (i.e. column names of DataFrame) and the
model parameter for that series. If not, then the same parameter will be
applied to all series.
[1] Rosner, Bernard (May 1983), Percentage Points for a Generalized ESD
Many-Outlier Procedure,Technometrics, 25(2), pp. 165-172.
Expand Down Expand Up @@ -412,13 +377,6 @@ class PersistAD(_Detector1D):
This detector is internally implemented as a `Pipenet` object. Advanced
users may learn more details by checking attribute `pipe_`.
This is an univariate detector. When it is applied to a multivariate time
series (i.e. pandas DataFrame), it will be applied to every series
independently. All parameters can be defined as a dict object where key-
value pairs are series names (i.e. column names of DataFrame) and the
model parameter for that series. If not, then the same parameter will be
applied to all series.
Parameters
----------
window: int, optional
Expand Down Expand Up @@ -575,13 +533,6 @@ class LevelShiftAD(_Detector1D):
This detector is internally implemented as a `Pipenet` object. Advanced
users may learn more details by checking attribute `pipe_`.
This is an univariate detector. When it is applied to a multivariate time
series (i.e. pandas DataFrame), it will be applied to every series
independently. All parameters can be defined as a dict object where key-
value pairs are series names (i.e. column names of DataFrame) and the
model parameter for that series. If not, then the same parameter will be
applied to all series.
Parameters
----------
window: int, optional
Expand Down Expand Up @@ -723,13 +674,6 @@ class VolatilityShiftAD(_Detector1D):
This detector is internally implemented as a `Pipenet` object. Advanced
users may learn more details by checking attribute `pipe_`.
This is an univariate detector. When it is applied to a multivariate time
series (i.e. pandas DataFrame), it will be applied to every series
independently. All parameters can be defined as a dict object where key-
value pairs are series names (i.e. column names of DataFrame) and the
model parameter for that series. If not, then the same parameter will be
applied to all series.
Parameters
----------
window: int, optional
Expand Down Expand Up @@ -886,13 +830,6 @@ class AutoregressionAD(_Detector1D):
This detector is internally implemented as a `Pipenet` object. Advanced
users may learn more details by checking attribute `pipe_`.
This is an univariate detector. When it is applied to a multivariate time
series (i.e. pandas DataFrame), it will be applied to every series
independently. All parameters can be defined as a dict object where key-
value pairs are series names (i.e. column names of DataFrame) and the
model parameter for that series. If not, then the same parameter will be
applied to all series.
Parameters
----------
n_steps: int, optional
Expand Down Expand Up @@ -1042,13 +979,6 @@ class SeasonalAD(_Detector1D):
This detector is internally implemented as a `Pipenet` object. Advanced
users may learn more details by checking attribute `pipe_`.
This is an univariate detector. When it is applied to a multivariate time
series (i.e. pandas DataFrame), it will be applied to every series
independently. All parameters can be defined as a dict object where key-
value pairs are series names (i.e. column names of DataFrame) and the
model parameter for that series. If not, then the same parameter will be
applied to all series.
Parameters
----------
freq: int, optional
Expand Down Expand Up @@ -1084,12 +1014,7 @@ class SeasonalAD(_Detector1D):
"""

_default_params = {
"freq": None,
"side": "both",
"c": 3.0,
"trend": False,
}
_default_params = {"freq": None, "side": "both", "c": 3.0, "trend": False}

def __init__(
self,
Expand Down
Loading

0 comments on commit 90b9145

Please sign in to comment.