-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Feature/dei 109 duration analysis group #40
Changes from 4 commits
28048fd
8c8aa3d
5be6e05
46ec8db
c4ee96b
ecd412c
c495f89
d8c0b96
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,6 +9,7 @@ | |
from typing import List | ||
|
||
import xarray as _xr | ||
import numpy as _np | ||
from xarray.core.resample import DataArrayResample | ||
|
||
from decoimpact.business.entities.rules.i_array_based_rule import IArrayBasedRule | ||
|
@@ -126,6 +127,12 @@ def _perform_operation(self, aggregated_values: DataArrayResample) -> _xr.DataAr | |
Returns: | ||
DataArray: Values of operation type | ||
""" | ||
period_operations = [ | ||
TimeOperationType.COUNT_PERIODS, | ||
TimeOperationType.MAX_DURATION_PERIODS, | ||
TimeOperationType.AVG_DURATION_PERIODS | ||
] | ||
|
||
if self._operation_type is TimeOperationType.ADD: | ||
result = aggregated_values.sum() | ||
|
||
|
@@ -141,8 +148,8 @@ def _perform_operation(self, aggregated_values: DataArrayResample) -> _xr.DataAr | |
elif self._operation_type is TimeOperationType.MEDIAN: | ||
result = aggregated_values.median() | ||
|
||
elif self._operation_type is TimeOperationType.COUNT_PERIODS: | ||
result = aggregated_values.reduce(self.count_groups, dim="time") | ||
elif self._operation_type in period_operations: | ||
result = aggregated_values.reduce(self.analyze_groups, dim="time") | ||
|
||
else: | ||
raise NotImplementedError( | ||
|
@@ -152,37 +159,97 @@ def _perform_operation(self, aggregated_values: DataArrayResample) -> _xr.DataAr | |
|
||
return _xr.DataArray(result) | ||
|
||
def count_groups(self, elem, axis, **kwargs): | ||
"""In an array with 0 and 1, count the amount of times the | ||
groups of 1 occur. | ||
def count_groups(self, elem): | ||
""" | ||
Count the amount of times the groups of 1 occur. | ||
|
||
Args: | ||
elem (Array): the data array in N-dimensions | ||
|
||
Returns: | ||
List: list with the counted periods | ||
""" | ||
# in case of an example array with 5 values [1,1,0,1,0]: | ||
# subtract last 4 values from the first 4 values: [1,0,1,0] - [1,1,0,1]: | ||
# (the result of this example differences: [0,-1,1,0]) | ||
differences = _np.diff(elem) | ||
# First add the first element of the array to the difference array (as this | ||
# could also indicate a beginning of a group or not and the diff is calculated | ||
# from the second element) | ||
# when the difference of two neighbouring elements is 1, this indicates the | ||
# start of a group. To count the number of groups: count the occurrences of | ||
# difference == 1: (the result of this examples: 1 + 1 = 2) | ||
differences = _np.append(differences, elem[0]) | ||
return _np.count_nonzero(differences == 1) | ||
|
||
def duration_groups(self, elem): | ||
""" | ||
Create an array that cumulative sums the values of the groups in the array, | ||
but restarts when a 0 occurs. For example the array: [0, 1, 1, 0, 1, 1, 1, 0, 1] | ||
This function will return: [0, 1, 2, 0, 1, 2, 3, 0, 1] | ||
|
||
Args: | ||
elem (List): the data array in N-dimensions | ||
|
||
Returns: | ||
List: List with the duration of the periods | ||
""" | ||
# Function to create a cumsum over the groups (where the elements in elem are 1) | ||
cumsum_groups = _np.frompyfunc(lambda a, b: a + b if b == 1 else 0, 2, 1) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. leuke functie, maar ik snap het niet goed There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I added an example in the description of the duration_groups function explaining what the result is of this function. a and b are consecutive values when looping over the array in this function. |
||
return cumsum_groups.accumulate(elem) | ||
|
||
def analyze_groups(self, elem, axis, **kwargs): | ||
"""This function analyzes the input array (N-dimensional array containing 0 | ||
and 1) The function will reduce the array over the time axis, depending on a | ||
certain time operation type. Below are the operation types with what this | ||
function will do to this example input array: [0, 1, 1, 0, 1, 0]. A period | ||
is all consecutive 1 values. | ||
- COUNT_PERIODS: count the amount of periods (result: 2) | ||
- MAX_DURATION_PERIODS: gives the longest period (result: 2) | ||
- AVG_DURATION_PERIODS: gives the average of periods (result: 1.5) | ||
|
||
Args: | ||
elem (Array): the data array in N-dimensions | ||
axis (integer): the number of axis of the array | ||
axis (integer): the value describing the time axis | ||
|
||
Returns: | ||
array: array with the counted periods, with the same dimensions as elem | ||
array: array with the analyzed periods, with the same dimensions as elem | ||
""" | ||
# in case of 1 dimension: | ||
if axis == 0: | ||
# in case of an example array with 5 values [1,1,0,1,0]: | ||
# subtract last 4 values from the first 4 values: [1,0,1,0] - [1,1,0,1]: | ||
# (the result of this example differences: [0,-1,1,0]) | ||
differences = elem[1:] - elem[:-1] | ||
# when the difference of two neighbouring elements is 1, | ||
# this indicates the start of a group; to count the number of groups: | ||
# count the occurences of difference=1, and then add the first value: | ||
# (the result of this examples: 1 + 1 = 2) | ||
group_count = sum(map(lambda x: x == 1, differences)) + elem[0] | ||
no_axis = len(_np.shape(elem)) | ||
|
||
# The reduce function that calls this analyze_groups function should reduce | ||
# over the time axis. The argument axis in this function gives a number of which | ||
# axis is in fact the time axis. This axis needs to move to the last position, | ||
# because we need to reduce the N-dimensional array to a 1D array with all the | ||
# values in time for a specific cell in order to do the calculation for that | ||
# cell. Because we are looping over the N-dimensional array iteratively, we | ||
# should only move the time axis the first time this function is called (so when | ||
# the axis is not yet set to -1!) | ||
if axis != -1: | ||
elem = _np.moveaxis(elem, axis, -1) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why do we do this exactly? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll put in a comment |
||
axis = -1 | ||
|
||
# in case of 1 dimension: | ||
if no_axis == 1: | ||
if self._operation_type is TimeOperationType.COUNT_PERIODS: | ||
group_result = self.count_groups(elem) | ||
elif self._operation_type is TimeOperationType.MAX_DURATION_PERIODS: | ||
group_result = _np.max((self.duration_groups(elem))) | ||
elif self._operation_type is TimeOperationType.AVG_DURATION_PERIODS: | ||
period = _np.sum(elem) | ||
group_count = self.count_groups(elem) | ||
group_result = period / group_count | ||
|
||
# in case of multiple dimensions: | ||
else: | ||
group_count = [] | ||
group_result = [] | ||
for sub_elem in elem: | ||
# loop through this recursive function, determine output per axis: | ||
group_count_row = self.count_groups(sub_elem, axis - 1) | ||
group_result_row = self.analyze_groups(sub_elem, axis) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. right, I think I understand it a bit more |
||
# add the result to the list of results, per axis: | ||
group_count.append(group_count_row) | ||
return group_count | ||
group_result.append(group_result_row) | ||
|
||
return group_result | ||
|
||
def _get_time_dimension_name(self, variable: _xr.DataArray, logger: ILogger) -> str: | ||
"""Retrieves the dimension name | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The description says that the first value is added after counting the occurrences. The code does the opposite: first append, then count. Which is correct?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good point, the code was changed and I forgot to update the text. The first element of the array is either 0 or 1, so it is an addition to the sum; it could be done first or second, but because I made a switch to use count_nonzero I moved the adding to the line above. Will adjust the comment.