-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Add default ability to optimize the timeseries
- Loading branch information
Showing
4 changed files
with
214 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
# Names exported by a star-import of this module; anything else defined here
# has to be imported by name.
__all__ = ["optimize_timeseries"]
from datetime import timedelta | ||
|
||
from carlos.database.data.timeseries import TimeseriesData | ||
|
||
# Default for the split_threshold parameter of optimize_timeseries: a gap
# between two consecutive samples longer than this gets a None marker.
DEFAULT_SPLIT_THRESHOLD = timedelta(minutes=15)
|
||
|
||
def optimize_timeseries(
    timeseries: TimeseriesData,
    sample_reduce_threshold: float = 0.01,
    split_threshold: timedelta = DEFAULT_SPLIT_THRESHOLD,
) -> TimeseriesData:
    """Optimize a time series for display in the frontend.

    Two transformations are applied in a single pass:

    * Whenever two consecutive input samples are further apart than
      ``split_threshold``, a ``None`` value is injected at the midpoint of the
      gap.
    * A sample is dropped if it did not change, relative to the last value that
      was kept, by more than ``sample_reduce_threshold`` (see
      ``is_value_changed``).

    The first and the last sample of the input are always kept.

    :param timeseries: The series to optimize. Series with fewer than two
        values are returned unchanged (the same object).
    :param sample_reduce_threshold: Relative change, as a fraction
        (``0.01`` means 1 %), a sample has to exceed to be kept. The sign is
        ignored. ``0`` keeps every sample.
    :param split_threshold: Maximum distance between two consecutive samples
        before a ``None`` marker is injected. The sign is ignored.
    :return: A new ``TimeseriesData`` with the same ``timeseries_id`` and the
        reduced timestamps and values.
    """

    sample_count = len(timeseries.values)
    if sample_count < 2:
        return timeseries

    sample_reduce_threshold = abs(sample_reduce_threshold)
    split_threshold = abs(split_threshold)

    # we always need the first value
    timestamps = [timeseries.timestamps[0]]
    values = [timeseries.values[0]]
    prev_ts = timeseries.timestamps[0]

    # Enumerate with start=1 so that idx is the index into the *original*
    # series; this makes the "is this the last sample" check below exact.
    last_idx = sample_count - 1
    for idx, (ts, val) in enumerate(
        zip(timeseries.timestamps[1:], timeseries.values[1:]), start=1
    ):
        delta = ts - prev_ts
        if delta > split_threshold:
            # Mark the gap with a None value placed in its middle.
            timestamps.append(prev_ts + delta / 2)
            values.append(None)

        # The comparison is made against the last *kept* value. Right after a
        # gap marker this is None, so the sample following a gap is always kept.
        # The last sample is kept unconditionally so the series does not end
        # earlier than the input.
        if idx == last_idx or is_value_changed(
            value=val, previous_value=values[-1], threshold=sample_reduce_threshold
        ):
            timestamps.append(ts)
            values.append(val)

        prev_ts = ts

    return TimeseriesData(
        timeseries_id=timeseries.timeseries_id, timestamps=timestamps, values=values
    )
|
||
|
||
def is_value_changed( | ||
value: float | None, previous_value: float | None, threshold: float | ||
) -> bool: | ||
"""Returns true if the value changed the datatype or the value changed more | ||
than the threshold (in percent). | ||
""" | ||
|
||
# A 0 threshold means that we need to consider the value always to be changed | ||
if threshold <= 0.0: | ||
return True | ||
|
||
# if the data type changed, the value changed for sure | ||
if not isinstance(value, type(previous_value)): | ||
return True | ||
|
||
# in case any of the type are none, we just compare the values | ||
if value is None or previous_value is None: | ||
return value != previous_value | ||
|
||
if value == previous_value: | ||
return False | ||
|
||
if value == 0.0 or previous_value == 0.0: | ||
return True | ||
|
||
return abs(previous_value - value) > threshold * previous_value |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
from datetime import UTC, datetime, timedelta | ||
|
||
import pytest | ||
from carlos.database.data.timeseries import TimeseriesData | ||
|
||
from carlos.api.utils.data_reduction import ( | ||
DEFAULT_SPLIT_THRESHOLD, | ||
is_value_changed, | ||
optimize_timeseries, | ||
) | ||
|
||
|
||
@pytest.mark.parametrize(
    "value, previous_value, threshold, expected",
    [
        pytest.param(None, None, 0.5, False, id="both None"),
        pytest.param(None, 1.0, 0.5, True, id="value None"),
        pytest.param(1.0, None, 0.5, True, id="previous None"),
        pytest.param(1.0, 1.0, 0.0, True, id="no threshold means always changed"),
        pytest.param(1.02, 1.0, 0.01, True, id="value >1% change"),
        pytest.param(1.0, 1.02, 0.01, True, id="previous >1% change"),
        pytest.param(0.02, 0.0, 0.01, True, id="zero - value >1% change"),
        pytest.param(0.0, 0.02, 0.01, True, id="zero - previous >1% change"),
        pytest.param(1.0, 1.0, 0.01, False, id="values equal"),
        pytest.param(0.0, 0.0, 0.01, False, id="zero - value equal"),
        pytest.param(1.009, 1.0, 0.01, False, id="value <1% change"),
        pytest.param(1.0, 1.009, 0.01, False, id="previous <1% change"),
        pytest.param(0.009, 0.0, 0.01, True, id="zero - value true"),
        pytest.param(0.0, 0.009, 0.01, True, id="zero - previous true"),
    ],
)
def test_is_value_changed(
    value: float | None, previous_value: float | None, threshold: float, expected: bool
):
    """Check the change detection against its edge cases: None on either
    side, a zero threshold, zeros, and values just above or below 1 %.
    """

    changed = is_value_changed(
        value=value, previous_value=previous_value, threshold=threshold
    )

    # Identity comparison also guards the return type: it must be a real bool.
    assert changed is expected
|
||
|
||
def ts(offset: int) -> datetime: | ||
"""Little helper to reduce the boilerplate to generate test datetimes.""" | ||
return datetime(2024, 1, 1, tzinfo=UTC) + timedelta(seconds=offset) | ||
|
||
|
||
def _series(timestamps: list, values: list) -> TimeseriesData:
    """Build a TimeseriesData with the fixed id 42 shared by all cases below."""
    return TimeseriesData(timeseries_id=42, timestamps=timestamps, values=values)


@pytest.mark.parametrize(
    "ts_in, split_threshold, ts_expected",
    [
        pytest.param(
            _series([], []),
            DEFAULT_SPLIT_THRESHOLD,
            _series([], []),
            id="empty timeseries",
        ),
        pytest.param(
            _series([ts(0), ts(60), ts(120)], [1, 1, 2]),
            DEFAULT_SPLIT_THRESHOLD,
            _series([ts(0), ts(120)], [1, 2]),
            id="duplicate data at beginning",
        ),
        pytest.param(
            _series([ts(i) for i in range(10)], [1, 2, 2, 2, 2, 2, 2, 3, 3, 4]),
            DEFAULT_SPLIT_THRESHOLD,
            _series([ts(0), ts(1), ts(7), ts(9)], [1, 2, 3, 4]),
            id="duplicate in the middle beginning",
        ),
        pytest.param(
            _series([ts(0), ts(120)], [1, 2]),
            timedelta(seconds=30),
            _series([ts(0), ts(60), ts(120)], [1, None, 2]),
            id="None inserted in middle of gap",
        ),
    ],
)
def test_optimize_timeseries(
    ts_in: TimeseriesData, ts_expected: TimeseriesData, split_threshold: timedelta
):
    """Run optimize_timeseries on each input with the given split threshold
    (sample reduction threshold left at its default) and compare the result
    with the expected series.
    """

    actual = optimize_timeseries(ts_in, split_threshold=split_threshold)

    assert actual == ts_expected