Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

BUILD: release FACET 1.2.1 #305

Merged
merged 13 commits into from
Sep 22, 2021
Merged
11 changes: 11 additions & 0 deletions RELEASE_NOTES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,15 @@ fit the underlying crossfit.
One example where this can be useful is to use only a recent period of a time series as
the baseline of a simulation.


1.2.1
~~~~~

- FIX: fix a bug in :class:`.UnivariateProbabilitySimulator` that was introduced in
FACET 1.2.0
- catch up with FACET 1.1.1


1.2.0
~~~~~

Expand All @@ -26,10 +35,12 @@ FACET 1.1
FACET 1.1 refines and enhances the association/synergy/redundancy calculations provided
by the :class:`.LearnerInspector`.


1.1.1
~~~~~

- DOC: add reference to FACET research paper on the project landing page
- FIX: correctly count positive class frequency in UnivariateProbabilitySimulator


1.1.0
Expand Down
2 changes: 1 addition & 1 deletion src/facet/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"""


__version__ = "1.2.0"
__version__ = "1.2.1"

__logo__ = (
r"""
Expand Down
6 changes: 2 additions & 4 deletions src/facet/simulation/_simulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -628,11 +628,9 @@ def expected_output(self) -> float:

:return: observed frequency of the positive class
"""
actual_outputs = self.sample.target.loc[self.subsample]
actual_outputs = self.sample.target

return actual_outputs.loc[actual_outputs == self._positive_class()].sum() / len(
actual_outputs
)
return (actual_outputs == self._positive_class()).sum() / len(actual_outputs)

def _positive_class(self) -> Any:
"""
Expand Down
86 changes: 85 additions & 1 deletion test/test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
from sklearn.utils import Bunch

from sklearndf import TransformerDF
from sklearndf.pipeline import RegressorPipelineDF
from sklearndf.classification import RandomForestClassifierDF
from sklearndf.pipeline import ClassifierPipelineDF, RegressorPipelineDF
from sklearndf.regression import (
SVRDF,
AdaBoostRegressorDF,
Expand Down Expand Up @@ -349,3 +350,86 @@ def check_ranking(
f"unexpected parameters for learner at rank #{rank}: "
f"got {parameters_actual} but expected {parameters_expected}"
)


@pytest.fixture
def iris_classifier_ranker_binary(
    iris_sample_binary: Sample,
    cv_stratified_bootstrap: StratifiedBootstrapCV,
    n_jobs: int,
) -> LearnerRanker[ClassifierPipelineDF[RandomForestClassifierDF]]:
    # fixture: learner ranker fitted on the binary iris sample, using
    # stratified bootstrap cross-validation
    ranker = fit_learner_ranker(
        sample=iris_sample_binary,
        cv=cv_stratified_bootstrap,
        n_jobs=n_jobs,
    )
    return ranker


@pytest.fixture
def iris_classifier_ranker_multi_class(
    iris_sample: Sample, cv_stratified_bootstrap: StratifiedBootstrapCV, n_jobs: int
) -> LearnerRanker[ClassifierPipelineDF[RandomForestClassifierDF]]:
    # fixture: learner ranker fitted on the full iris sample with
    # stratified bootstrap cross-validation
    ranker = fit_learner_ranker(
        sample=iris_sample,
        cv=cv_stratified_bootstrap,
        n_jobs=n_jobs,
    )
    return ranker


@pytest.fixture
def iris_classifier_ranker_dual_target(
    iris_sample_binary_dual_target: Sample, cv_bootstrap: BootstrapCV, n_jobs: int
) -> LearnerRanker[ClassifierPipelineDF[RandomForestClassifierDF]]:
    # fixture: learner ranker fitted on the dual-target binary iris sample,
    # using plain (non-stratified) bootstrap cross-validation
    ranker = fit_learner_ranker(
        sample=iris_sample_binary_dual_target,
        cv=cv_bootstrap,
        n_jobs=n_jobs,
    )
    return ranker


@pytest.fixture
def iris_classifier_crossfit_binary(
    iris_classifier_ranker_binary: LearnerRanker[ClassifierPipelineDF],
) -> LearnerCrossfit[ClassifierPipelineDF[RandomForestClassifierDF]]:
    # fixture: crossfit of the best model found by the binary ranker
    crossfit = iris_classifier_ranker_binary.best_model_crossfit_
    return crossfit


@pytest.fixture
def iris_classifier_crossfit_multi_class(
    iris_classifier_ranker_multi_class: LearnerRanker[ClassifierPipelineDF],
) -> LearnerCrossfit[ClassifierPipelineDF[RandomForestClassifierDF]]:
    # fixture: crossfit of the best model found by the multi-class ranker
    crossfit = iris_classifier_ranker_multi_class.best_model_crossfit_
    return crossfit


@pytest.fixture
def iris_inspector_multi_class(
    iris_classifier_crossfit_multi_class: LearnerCrossfit[
        ClassifierPipelineDF[RandomForestClassifierDF]
    ],
    n_jobs: int,
) -> LearnerInspector[ClassifierPipelineDF[RandomForestClassifierDF]]:
    # fixture: learner inspector with SHAP interaction values enabled,
    # fitted on the multi-class crossfit
    inspector = LearnerInspector(shap_interaction=True, n_jobs=n_jobs)
    return inspector.fit(crossfit=iris_classifier_crossfit_multi_class)


#
# Utility functions
#


def fit_learner_ranker(
    sample: Sample, cv: BaseCrossValidator, n_jobs: int
) -> LearnerRanker[ClassifierPipelineDF[RandomForestClassifierDF]]:
    """
    Fit a :class:`.LearnerRanker` over a small random-forest classifier grid.

    :param sample: the sample to fit the ranker on
    :param cv: the cross-validator to use for ranking
    :param n_jobs: number of parallel jobs
    :return: the fitted learner ranker
    """
    # define parameters and crossfit: one pipeline, 2x2 hyperparameter grid
    grids = [
        LearnerGrid(
            pipeline=ClassifierPipelineDF(
                classifier=RandomForestClassifierDF(random_state=42), preprocessing=None
            ),
            learner_parameters={"n_estimators": [10, 50], "min_samples_leaf": [4, 8]},
        )
    ]
    # NOTE(review): no filtering happens in this function — callers that need a
    # binary problem pass a pre-filtered two-class sample (see the
    # iris_classifier_ranker_binary fixture)
    return LearnerRanker(
        grids=grids,
        cv=cv,
        scoring="f1_macro",
        random_state=42,
        n_jobs=n_jobs,
    ).fit(sample=sample)
82 changes: 2 additions & 80 deletions test/test/facet/test_inspection.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import pytest
from pandas.testing import assert_frame_equal, assert_series_equal
from sklearn.datasets import make_classification
from sklearn.model_selection import BaseCrossValidator, KFold
from sklearn.model_selection import KFold

from pytools.viz.dendrogram import DendrogramDrawer, DendrogramReportStyle
from sklearndf import TransformerDF
Expand All @@ -29,7 +29,7 @@
TreeExplainerFactory,
)
from facet.selection import LearnerGrid, LearnerRanker
from facet.validation import BootstrapCV, StratifiedBootstrapCV
from facet.validation import BootstrapCV

# noinspection PyMissingOrEmptyDocstring

Expand All @@ -38,61 +38,6 @@
T = TypeVar("T")


@pytest.fixture
def iris_classifier_ranker_binary(
    iris_sample_binary: Sample,
    cv_stratified_bootstrap: StratifiedBootstrapCV,
    n_jobs: int,
) -> LearnerRanker[ClassifierPipelineDF[RandomForestClassifierDF]]:
    """Learner ranker fitted on the binary iris sample with stratified bootstrap CV."""
    return fit_learner_ranker(
        sample=iris_sample_binary, cv=cv_stratified_bootstrap, n_jobs=n_jobs
    )


@pytest.fixture
def iris_classifier_ranker_multi_class(
    iris_sample: Sample, cv_stratified_bootstrap: StratifiedBootstrapCV, n_jobs: int
) -> LearnerRanker[ClassifierPipelineDF[RandomForestClassifierDF]]:
    """Learner ranker fitted on the full iris sample with stratified bootstrap CV."""
    return fit_learner_ranker(
        sample=iris_sample, cv=cv_stratified_bootstrap, n_jobs=n_jobs
    )


@pytest.fixture
def iris_classifier_ranker_dual_target(
    iris_sample_binary_dual_target: Sample, cv_bootstrap: BootstrapCV, n_jobs: int
) -> LearnerRanker[ClassifierPipelineDF[RandomForestClassifierDF]]:
    """Learner ranker fitted on the dual-target binary iris sample with bootstrap CV."""
    return fit_learner_ranker(
        sample=iris_sample_binary_dual_target, cv=cv_bootstrap, n_jobs=n_jobs
    )


@pytest.fixture
def iris_classifier_crossfit_binary(
    iris_classifier_ranker_binary: LearnerRanker[ClassifierPipelineDF],
) -> LearnerCrossfit[ClassifierPipelineDF[RandomForestClassifierDF]]:
    """Crossfit of the best model found by the binary ranker."""
    return iris_classifier_ranker_binary.best_model_crossfit_


@pytest.fixture
def iris_classifier_crossfit_multi_class(
    iris_classifier_ranker_multi_class: LearnerRanker[ClassifierPipelineDF],
) -> LearnerCrossfit[ClassifierPipelineDF[RandomForestClassifierDF]]:
    """Crossfit of the best model found by the multi-class ranker."""
    return iris_classifier_ranker_multi_class.best_model_crossfit_


@pytest.fixture
def iris_inspector_multi_class(
    iris_classifier_crossfit_multi_class: LearnerCrossfit[
        ClassifierPipelineDF[RandomForestClassifierDF]
    ],
    n_jobs: int,
) -> LearnerInspector[ClassifierPipelineDF[RandomForestClassifierDF]]:
    """Learner inspector (with SHAP interactions) fitted on the multi-class crossfit."""
    return LearnerInspector(shap_interaction=True, n_jobs=n_jobs).fit(
        crossfit=iris_classifier_crossfit_multi_class
    )


def test_model_inspection(
regressor_grids: Sequence[LearnerGrid[RegressorPipelineDF]],
regressor_ranker: LearnerRanker[RegressorPipelineDF],
Expand Down Expand Up @@ -815,29 +760,6 @@ def test_shap_plot_data(
#


def fit_learner_ranker(
    sample: Sample, cv: BaseCrossValidator, n_jobs: int
) -> LearnerRanker[ClassifierPipelineDF[RandomForestClassifierDF]]:
    """
    Fit a :class:`.LearnerRanker` over a small random-forest classifier grid.

    :param sample: the sample to fit the ranker on
    :param cv: the cross-validator to use for ranking
    :param n_jobs: number of parallel jobs
    :return: the fitted learner ranker
    """
    # define parameters and crossfit: one pipeline, 2x2 hyperparameter grid
    grids = [
        LearnerGrid(
            pipeline=ClassifierPipelineDF(
                classifier=RandomForestClassifierDF(random_state=42), preprocessing=None
            ),
            learner_parameters={"n_estimators": [10, 50], "min_samples_leaf": [4, 8]},
        )
    ]
    # NOTE(review): no filtering happens in this function — callers that need a
    # binary problem pass a pre-filtered two-class sample
    return LearnerRanker(
        grids=grids,
        cv=cv,
        scoring="f1_macro",
        random_state=42,
        n_jobs=n_jobs,
    ).fit(sample=sample)


def print_expected_matrix(error: AssertionError, split: bool = False):
# used to print expected output for copy/paste into assertion statement

Expand Down
66 changes: 61 additions & 5 deletions test/test/facet/test_simulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@
from pytest import approx

from sklearndf import TransformerDF
from sklearndf.pipeline import RegressorPipelineDF
from sklearndf.classification import RandomForestClassifierDF
from sklearndf.pipeline import ClassifierPipelineDF, RegressorPipelineDF
from sklearndf.regression.extra import LGBMRegressorDF

from facet.crossfit import LearnerCrossfit
from facet.data import Sample
from facet.data.partition import ContinuousRangePartitioner
from facet.simulation import (
UnivariateProbabilitySimulator,
UnivariateSimulationResult,
UnivariateTargetSimulator,
UnivariateUpliftSimulator,
Expand Down Expand Up @@ -477,8 +479,62 @@ def test_univariate_uplift_subsample_simulation(
simulation_result.partitioner.frequencies_, [1, 4, 9, 10, 10, 6, 2, 1, 4]
)

SimulationDrawer(style="text").draw(
data=uplift_simulator.simulate_feature(
feature_name=parameterized_feature, partitioner=partitioner
)
SimulationDrawer(style="text").draw(data=simulation_result)


def test_univariate_probability_simulation(
    iris_classifier_crossfit_binary: LearnerCrossfit[
        ClassifierPipelineDF[RandomForestClassifierDF]
    ],
    n_jobs: int,
) -> None:
    """
    Simulate positive-class probabilities across partitions of the
    ``sepal length (cm)`` feature and verify the baseline, the lower/upper
    confidence bounds, and the median outputs against known-good values.
    """
    parameterized_feature = "sepal length (cm)"
    partitioner = ContinuousRangePartitioner(max_partitions=10)

    # removed leftover debug print of the crossfit's feature names

    proba_simulator = UnivariateProbabilitySimulator(
        crossfit=iris_classifier_crossfit_binary,
        confidence_level=0.95,
        n_jobs=n_jobs,
        verbose=50,
    )

    simulation_result: UnivariateSimulationResult = proba_simulator.simulate_feature(
        feature_name=parameterized_feature, partitioner=partitioner
    )

    # expected partition centers for the simulated feature
    index = pd.Index(
        data=[5, 5.5, 6, 6.5, 7, 7.5, 8], name=UnivariateUpliftSimulator.IDX_PARTITION
    )

    # the binary iris sample is balanced, so the observed positive-class
    # frequency (the simulation baseline) is 0.5
    assert simulation_result.baseline == approx(0.5)

    assert_series_equal(
        simulation_result.outputs_lower_bound(),
        pd.Series(
            [0.346255, 0.346255, 0.353697, 0.394167, 0.401895, 0.417372, 0.417372],
            name=UnivariateSimulationResult.COL_LOWER_BOUND,
            index=index,
        ),
    )

    assert_series_equal(
        simulation_result.outputs_median(),
        pd.Series(
            [0.460432, 0.450516, 0.469412, 0.488569, 0.492651, 0.507788, 0.507788],
            name=UnivariateSimulationResult.COL_MEDIAN,
            index=index,
        ),
    )

    assert_series_equal(
        simulation_result.outputs_upper_bound(),
        pd.Series(
            [0.582565, 0.562096, 0.570590, 0.580023, 0.599714, 0.602303, 0.602303],
            name=UnivariateSimulationResult.COL_UPPER_BOUND,
            index=index,
        ),
    )

    # smoke test: render the simulation as text
    SimulationDrawer(style="text").draw(data=simulation_result)