From 9af86db5ebea40fb2a45ec999c7890b0d6441041 Mon Sep 17 00:00:00 2001 From: Yongjie Zhao Date: Mon, 28 Mar 2022 17:07:32 +0800 Subject: [PATCH 1/2] feat: linear imputation in Resample --- .../src/sections/advancedAnalytics.tsx | 1 + .../utils/pandas_postprocessing/resample.py | 7 +++ superset/utils/pandas_postprocessing/utils.py | 4 ++ .../pandas_postprocessing/test_resample.py | 46 +++++++++++++++++++ 4 files changed, 58 insertions(+) diff --git a/superset-frontend/packages/superset-ui-chart-controls/src/sections/advancedAnalytics.tsx b/superset-frontend/packages/superset-ui-chart-controls/src/sections/advancedAnalytics.tsx index 983e45ec8e6fa..c9b10f9a37a3f 100644 --- a/superset-frontend/packages/superset-ui-chart-controls/src/sections/advancedAnalytics.tsx +++ b/superset-frontend/packages/superset-ui-chart-controls/src/sections/advancedAnalytics.tsx @@ -170,6 +170,7 @@ export const advancedAnalyticsControls: ControlPanelSectionConfig = { choices: [ ['asfreq', 'Null imputation'], ['zerofill', 'Zero imputation'], + ['linear', 'Linear imputation'], ['ffill', 'Forward values'], ['bfill', 'Backward values'], ['median', 'Median values'], diff --git a/superset/utils/pandas_postprocessing/resample.py b/superset/utils/pandas_postprocessing/resample.py index a777672b9db1a..a82d7031e9c12 100644 --- a/superset/utils/pandas_postprocessing/resample.py +++ b/superset/utils/pandas_postprocessing/resample.py @@ -20,6 +20,7 @@ from flask_babel import gettext as _ from superset.exceptions import InvalidPostProcessingError +from superset.utils.pandas_postprocessing.utils import RESAMPLE_METHOD def resample( @@ -40,9 +41,15 @@ def resample( """ if not isinstance(df.index, pd.DatetimeIndex): raise InvalidPostProcessingError(_("Resample operation requires DatetimeIndex")) + if method not in RESAMPLE_METHOD: + raise InvalidPostProcessingError( + _("Resample method should in ") + ", ".join(RESAMPLE_METHOD) + "." + ) if method == "asfreq" and fill_value is not None: _df = df.resample(rule).asfreq(fill_value=fill_value) + elif method == "linear": + _df = df.resample(rule).interpolate() else: _df = getattr(df.resample(rule), method)() return _df diff --git a/superset/utils/pandas_postprocessing/utils.py b/superset/utils/pandas_postprocessing/utils.py index 7aebe1e0d0346..15ea0cf3662e8 100644 --- a/superset/utils/pandas_postprocessing/utils.py +++ b/superset/utils/pandas_postprocessing/utils.py @@ -92,6 +92,10 @@ "P1W/1970-01-04T00:00:00Z": "W", } +RESAMPLE_METHOD = tuple( + ["asfreq", "bfill", "ffill", "linear", "median", "mean", "sum",] +) + FLAT_COLUMN_SEPARATOR = ", " diff --git a/tests/unit_tests/pandas_postprocessing/test_resample.py b/tests/unit_tests/pandas_postprocessing/test_resample.py index bd3a36e591648..9568d4ebd126a 100644 --- a/tests/unit_tests/pandas_postprocessing/test_resample.py +++ b/tests/unit_tests/pandas_postprocessing/test_resample.py @@ -14,8 +14,10 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +import numpy as np import pandas as pd import pytest +from pandas import to_datetime from superset.exceptions import InvalidPostProcessingError from superset.utils import pandas_postprocessing as pp @@ -151,3 +153,47 @@ def test_resample_should_raise_ex(): pp.resample( df=categories_df, rule="1D", method="asfreq", ) + + with pytest.raises(InvalidPostProcessingError): + pp.resample( + df=timeseries_df, rule="1D", method="foobar", + ) + + +def test_resample_linear(): + df = pd.DataFrame( + index=to_datetime(["2019-01-01", "2019-01-05", "2019-01-08"]), + data={"label": ["a", "e", "j"], "y": [1.0, 5.0, 8.0]}, + ) + post_df = pp.resample(df=df, rule="1D", method="linear") + """ + label y + 2019-01-01 a 1.0 + 2019-01-02 NaN 2.0 + 2019-01-03 NaN 3.0 + 2019-01-04 NaN 4.0 + 2019-01-05 e 5.0 + 2019-01-06 NaN 6.0 + 2019-01-07 NaN 7.0 + 2019-01-08 j 8.0 + """ + assert post_df.equals( + pd.DataFrame( + index=pd.to_datetime( + [ + "2019-01-01", + "2019-01-02", + "2019-01-03", + "2019-01-04", + "2019-01-05", + "2019-01-06", + "2019-01-07", + "2019-01-08", + ] + ), + data={ + "label": ["a", np.NaN, np.NaN, np.NaN, "e", np.NaN, np.NaN, "j"], + "y": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + }, + ) + ) From d7a43d0c555524052e3a041747d4fd63741215fd Mon Sep 17 00:00:00 2001 From: Yongjie Zhao Date: Mon, 28 Mar 2022 20:39:57 +0800 Subject: [PATCH 2/2] updates --- .../src/sections/advancedAnalytics.tsx | 2 +- superset/utils/pandas_postprocessing/utils.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/superset-frontend/packages/superset-ui-chart-controls/src/sections/advancedAnalytics.tsx b/superset-frontend/packages/superset-ui-chart-controls/src/sections/advancedAnalytics.tsx index c9b10f9a37a3f..ebd118d88122c 100644 --- a/superset-frontend/packages/superset-ui-chart-controls/src/sections/advancedAnalytics.tsx +++ b/superset-frontend/packages/superset-ui-chart-controls/src/sections/advancedAnalytics.tsx @@ -170,7 +170,7 @@ export const advancedAnalyticsControls: ControlPanelSectionConfig = { choices: [ ['asfreq', 'Null imputation'], ['zerofill', 'Zero imputation'], - ['linear', 'Linear imputation'], + ['linear', 'Linear interpolation'], ['ffill', 'Forward values'], ['bfill', 'Backward values'], ['median', 'Median values'], diff --git a/superset/utils/pandas_postprocessing/utils.py b/superset/utils/pandas_postprocessing/utils.py index 15ea0cf3662e8..dc48cd1145c87 100644 --- a/superset/utils/pandas_postprocessing/utils.py +++ b/superset/utils/pandas_postprocessing/utils.py @@ -92,9 +92,7 @@ "P1W/1970-01-04T00:00:00Z": "W", } -RESAMPLE_METHOD = tuple( - ["asfreq", "bfill", "ffill", "linear", "median", "mean", "sum",] -) +RESAMPLE_METHOD = ("asfreq", "bfill", "ffill", "linear", "median", "mean", "sum") FLAT_COLUMN_SEPARATOR = ", "