Skip to content

Commit

Permalink
Linting
Browse files: browse the repository at this point in the history
  • Loading branch information
nnansters committed Jul 3, 2024
1 parent 3ee722f commit 336a056
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 36 deletions.
21 changes: 10 additions & 11 deletions nannyml/drift/univariate/calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,8 +264,9 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs) -> UnivariateDrift

_list_missing(self.column_names, reference_data)


self.continuous_column_names, self.categorical_column_names = self._split_continuous_and_categorical(reference_data)
self.continuous_column_names, self.categorical_column_names = self._split_continuous_and_categorical(
reference_data
)

timestamps = reference_data[self.timestamp_column_name] if self.timestamp_column_name else None
for column_name in self.continuous_column_names:
Expand Down Expand Up @@ -394,7 +395,7 @@ def _calculate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
self.result.analysis_data = data.copy()

return self.result

def _split_continuous_and_categorical(self, data: pd.DataFrame) -> Tuple[List[str], List[str]]:
"""Splits the features in the data set into continuous and categorical features."""
treat_as_numerical_set, treat_as_categorical_set = set(self.treat_as_numerical), set(self.treat_as_categorical)
Expand All @@ -404,22 +405,20 @@ def _split_continuous_and_categorical(self, data: pd.DataFrame) -> Tuple[List[st
treat_as_numerical_set = treat_as_numerical_set - invalid_continuous_column_names
if invalid_continuous_column_names:
self._logger.info(
f"ignoring 'treat_as_numerical' values {list(invalid_continuous_column_names)} because they were not in "
f"listed column names"
f"ignoring 'treat_as_numerical' values {list(invalid_continuous_column_names)} because "
f"they were not in listed column names"
)

invalid_categorical_column_names = treat_as_categorical_set - column_names_set
treat_as_categorical_set = treat_as_categorical_set - invalid_categorical_column_names
if invalid_categorical_column_names:
self._logger.info(
f"ignoring 'treat_as_categorical' values {list(invalid_categorical_column_names)} because they were not in "
f"listed column names"
f"ignoring 'treat_as_categorical' values {list(invalid_categorical_column_names)} because "
f"they were not in listed column names"
)

unspecified_columns = column_names_set - treat_as_numerical_set - treat_as_categorical_set
continuous_column_names, categorical_column_names = _split_features_by_type(
data, unspecified_columns
)
continuous_column_names, categorical_column_names = _split_features_by_type(data, unspecified_columns)

continuous_column_names = continuous_column_names + list(treat_as_numerical_set)
categorical_column_names = categorical_column_names + list(treat_as_categorical_set)
Expand Down
13 changes: 7 additions & 6 deletions nannyml/drift/univariate/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from scipy.stats import chi2_contingency, ks_2samp, wasserstein_distance

from nannyml._typing import Self
from nannyml.base import _column_is_categorical, _remove_nans
from nannyml.base import _remove_nans
from nannyml.chunk import Chunker
from nannyml.exceptions import InvalidArgumentsException, NotFittedException
from nannyml.thresholds import Threshold, calculate_threshold_values
Expand Down Expand Up @@ -290,7 +290,7 @@ def _calculate(self, data: pd.Series):
data = _remove_nans(data)
if data.empty:
return np.nan

len_data = len(data)
data_proba_in_bins = np.histogram(data, bins=self._bins)[0] / len_data

Expand Down Expand Up @@ -346,7 +346,7 @@ def _calculate(self, data: pd.Series):
data = _remove_nans(data)
if data.empty:
return np.nan

Check warning on line 348 in nannyml/drift/univariate/methods.py (Codecov / codecov/patch, nannyml/drift/univariate/methods.py#L348): added line #L348 was not covered by tests. View check run for this annotation.

data_unique, data_counts = np.unique(data, return_counts=True)
data_counts_dic = dict(zip(data_unique, data_counts))
data_count_on_ref_bins = [data_counts_dic[key] if key in data_counts_dic else 0 for key in self._bins]
Expand All @@ -361,6 +361,7 @@ def _calculate(self, data: pd.Series):

return distance


@MethodFactory.register(key='kolmogorov_smirnov', feature_type=FeatureType.CONTINUOUS)
class KolmogorovSmirnovStatistic(Method):
"""Calculates the Kolmogorov-Smirnov d-stat.
Expand Down Expand Up @@ -734,7 +735,7 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None
reference_proba_in_bins = np.histogram(reference_data, bins=bins)[0] / len_reference
self._bins = bins
self._reference_proba_in_bins = reference_proba_in_bins

return self

def _calculate(self, data: pd.Series):
Expand All @@ -753,6 +754,7 @@ def _calculate(self, data: pd.Series):

return distance


@MethodFactory.register(key='hellinger', feature_type=FeatureType.CATEGORICAL)
class CategoricalHellingerDistance(Method):
"""Calculates the Hellinger Distance between two distributions."""
Expand Down Expand Up @@ -794,7 +796,7 @@ def _calculate(self, data: pd.Series):
if data.empty:
return np.nan
reference_proba_in_bins = copy(self._reference_proba_in_bins)

data_unique, data_counts = np.unique(data, return_counts=True)
data_counts_dic = dict(zip(data_unique, data_counts))
data_count_on_ref_bins = [data_counts_dic[key] if key in data_counts_dic else 0 for key in self._bins]
Expand All @@ -808,4 +810,3 @@ def _calculate(self, data: pd.Series):
distance = np.sqrt(np.sum((np.sqrt(reference_proba_in_bins) - np.sqrt(data_proba_in_bins)) ** 2)) / np.sqrt(2)

return distance

35 changes: 16 additions & 19 deletions tests/drift/test_drift.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,13 @@
from nannyml._typing import Key, Result, Self
from nannyml.base import Abstract1DResult, AbstractCalculator
from nannyml.chunk import CountBasedChunker, DefaultChunker, PeriodBasedChunker, SizeBasedChunker
from nannyml.datasets import load_synthetic_car_loan_dataset
from nannyml.drift.multivariate.data_reconstruction import DataReconstructionDriftCalculator
from nannyml.drift.multivariate.domain_classifier import DomainClassifierCalculator
from nannyml.drift.univariate.calculator import DEFAULT_THRESHOLDS, UnivariateDriftCalculator
from nannyml.exceptions import InvalidArgumentsException
from nannyml.performance_estimation.confidence_based import CBPE
from nannyml.thresholds import ConstantThreshold, StandardDeviationThreshold
from nannyml.datasets import load_synthetic_car_loan_dataset


@pytest.fixture(scope="module")
Expand Down Expand Up @@ -183,9 +183,7 @@ def test_base_drift_calculator_uses_default_chunker_when_no_chunker_specified(sa


@pytest.mark.parametrize('column_names, expected', [('f1', ['f1']), (['f1', 'f2'], ['f1', 'f2'])])
def test_univariate_drift_calculator_create_with_single_or_list_of_column_names( # noqa: D103
column_names, expected
):
def test_univariate_drift_calculator_create_with_single_or_list_of_column_names(column_names, expected): # noqa: D103
calc = UnivariateDriftCalculator(
column_names=column_names,
timestamp_column_name='timestamp',
Expand Down Expand Up @@ -264,9 +262,7 @@ def test_univariate_drift_calculator_treat_as_categorical_for_continuous_column(
assert sorted(calc.categorical_column_names) == expected_categorical


def test_univariate_drift_calculator_treat_as_categorical_for_categorical_column( # noqa: D103
sample_drift_data
):
def test_univariate_drift_calculator_treat_as_categorical_for_categorical_column(sample_drift_data): # noqa: D103
calc = UnivariateDriftCalculator(
column_names=['f1', 'f2', 'f3', 'f4'],
treat_as_categorical='f3',
Expand All @@ -281,9 +277,7 @@ def test_univariate_drift_calculator_treat_as_categorical_for_categorical_column
assert sorted(calc.categorical_column_names) == expected_categorical


def test_univariate_drift_calculator_treat_as_for_non_existing_column( # noqa: D103
sample_drift_data, caplog
):
def test_univariate_drift_calculator_treat_as_for_non_existing_column(sample_drift_data, caplog): # noqa: D103
caplog.set_level(logging.INFO)

calc = UnivariateDriftCalculator(
Expand All @@ -300,8 +294,13 @@ def test_univariate_drift_calculator_treat_as_for_non_existing_column( # noqa:
assert sorted(calc.continuous_column_names) == expected_continuous
assert sorted(calc.categorical_column_names) == expected_categorical

assert "ignoring 'treat_as_categorical' values ['foo'] because they were not in listed column names" in caplog.messages
assert "ignoring 'treat_as_numerical' values ['bar'] because they were not in listed column names" in caplog.messages
assert (
"ignoring 'treat_as_categorical' values ['foo'] because they were not in listed column names" in caplog.messages
)
assert (
"ignoring 'treat_as_numerical' values ['bar'] because they were not in listed column names" in caplog.messages
)


def test_univariate_drift_calculator_without_custom_thresholds(): # noqa: D103
sut = UnivariateDriftCalculator(
Expand Down Expand Up @@ -602,7 +601,7 @@ def test_base_drift_calculator_given_non_empty_features_list_should_only_calcula

# See https://github.com/NannyML/nannyml/issues/192
def test_univariate_drift_calculator_returns_distinct_but_consistent_results_when_reused( # noqa: D103
sample_drift_data
sample_drift_data,
):
ref_data = sample_drift_data.loc[sample_drift_data['period'] == 'reference']
sut = UnivariateDriftCalculator(
Expand Down Expand Up @@ -871,7 +870,7 @@ def test_input_dataframes_are_not_altered_by_univ_calculator(): # noqa: D103
'loan_length',
'repaid_loan_on_prev_car',
'size_of_downpayment',
'driver_tenure'
'driver_tenure',
]
calc = UnivariateDriftCalculator(
column_names=feature_column_names,
Expand All @@ -897,7 +896,7 @@ def test_input_dataframes_are_not_altered_by_dre_calculator(): # noqa: D103
'loan_length',
'repaid_loan_on_prev_car',
'size_of_downpayment',
'driver_tenure'
'driver_tenure',
]
calc = DataReconstructionDriftCalculator(
column_names=feature_column_names,
Expand All @@ -920,12 +919,10 @@ def test_input_dataframes_are_not_altered_by_dc_calculator(): # noqa: D103
'loan_length',
'repaid_loan_on_prev_car',
'size_of_downpayment',
'driver_tenure'
'driver_tenure',
]
calc = DomainClassifierCalculator(
feature_column_names=feature_column_names,
timestamp_column_name='timestamp',
chunk_number=1
feature_column_names=feature_column_names, timestamp_column_name='timestamp', chunk_number=1
)
calc.fit(reference2)
results = calc.calculate(monitored2) # noqa: F841
Expand Down

0 comments on commit 336a056

Please sign in to comment.