diff --git a/src/facet/simulation/_simulation.py b/src/facet/simulation/_simulation.py index 7f2c1f165..c1a3812e8 100644 --- a/src/facet/simulation/_simulation.py +++ b/src/facet/simulation/_simulation.py @@ -417,8 +417,9 @@ class UnivariateProbabilitySimulator(BaseUnivariateSimulator[ClassifierPipelineD @property def values_label(self) -> str: """[see superclass]""" - return f"{self._positive_class()} probability" + return f"probability({self._positive_class()})" + @property def baseline(self) -> float: """ Calculate the actual observed frequency of the positive class as the baseline @@ -428,7 +429,7 @@ def baseline(self) -> float: actual_target: pd.Series = self.crossfit.sample.target assert isinstance(actual_target, pd.Series), "sample has one single target" - return actual_target.loc[actual_target == self._positive_class()] / len( + return actual_target.loc[actual_target == self._positive_class()].sum() / len( actual_target ) diff --git a/src/facet/simulation/partition/_partition.py b/src/facet/simulation/partition/_partition.py index a7cbd684c..b51b4f0b7 100644 --- a/src/facet/simulation/partition/_partition.py +++ b/src/facet/simulation/partition/_partition.py @@ -45,7 +45,7 @@ class Partitioner( - FittableMixin[Sequence[T_Value]], Generic[T_Value], metaclass=ABCMeta + FittableMixin[Iterable[T_Value]], Generic[T_Value], metaclass=ABCMeta ): """ Abstract base class of all partitioners. @@ -77,7 +77,7 @@ def max_partitions(self) -> int: return self._max_partitions @abstractmethod - def fit(self: T, values: Sequence[T_Value], **fit_params) -> T: + def fit(self: T, values: Iterable[T_Value], **fit_params) -> T: """ Calculate the partitioning for the given observed values. :param values: a sequence of observed values as the empirical basis for \ @@ -177,7 +177,7 @@ def upper_bound(self) -> T_Number: # noinspection PyMissingOrEmptyDocstring def fit( self: T, - values: Sequence[T_Value], + values: Iterable[T_Value], lower_bound: Optional[T_Number] = None, upper_bound: Optional[T_Number] = None, **fit_params, @@ -186,14 +186,26 @@ def fit( self: RangePartitioner # support type hinting in PyCharm + # ensure arg values is an array + if not isinstance(values, np.ndarray): + if isinstance(values, pd.Series): + values = values.values + else: + if not isinstance(values, Sequence): + try: + values = iter(values) + except TypeError: + raise TypeError("arg values must be iterable") + values = np.array(values) + lower_bound = self._lower_bound upper_bound = self._upper_bound if lower_bound is None: - lower_bound = np.quantile(values, q=0.025) + lower_bound = np.nanquantile(values, q=0.025) if upper_bound is None: - upper_bound = np.quantile(values, q=0.975) + upper_bound = np.nanquantile(values, q=0.975) if upper_bound < lower_bound: upper_bound = lower_bound elif upper_bound < lower_bound: @@ -213,19 +225,20 @@ def fit( int(round((self._last_partition - self._first_partition) / self._step)) + 1 ) - def _frequencies() -> List[int]: - # Return the number of elements in each partitions - partition_indices = [ - int(round(value - first_partition) / step) for value in values - ] - frequencies = [0] * n_partitions - for idx in partition_indices: - if 0 <= idx < n_partitions: - frequencies[idx] += 1 + # Return the number of elements in each partitions + + # create the bins, starting with the lower bound of the first partition + partition_bins = (first_partition - step / 2) + np.arange( + n_partitions + 1 + ) * step + partition_indices = np.digitize(values, bins=partition_bins) - return frequencies + # frequency counts will include left and right outliers, hence n_partitions + 2 + # and we exclude the first and last element of the result + frequencies = np.bincount(partition_indices, minlength=n_partitions + 2)[1:-1] + + self._frequencies = frequencies - self._frequencies = _frequencies() return self def is_fitted(self) -> bool: @@ -238,9 +251,11 @@ def partitions(self) -> Sequence[T_Number]: :return: for each partition, a central value representing the partition """ - offset = self._first_partition - step = self._step - return [offset + (idx * step) for idx in range(self._n_partitions)] + return np.round( + self._first_partition + np.arange(self._n_partitions) * self._step, + # round to the nearest power of 10 of the step variable + int(-np.floor(np.log10(self._step))), + ) def frequencies(self) -> Sequence[int]: """ @@ -396,7 +411,15 @@ def fit(self: T, values: Sequence[T_Value], **fit_params) -> T: self: CategoryPartitioner # support type hinting in PyCharm - value_counts = pd.Series(data=values).value_counts(ascending=False) + if not isinstance(values, pd.Series): + if not (isinstance(values, np.ndarray) or isinstance(values, Sequence)): + try: + values = iter(values) + except TypeError: + raise TypeError("arg values must be iterable") + values = pd.Series(data=values) + + value_counts = values.value_counts(ascending=False) max_partitions = self.max_partitions self._partitions = value_counts.index.values[:max_partitions] self._frequencies = value_counts.values[:max_partitions] diff --git a/src/facet/simulation/viz/_style.py b/src/facet/simulation/viz/_style.py index 5da0d4c08..daf07b53d 100644 --- a/src/facet/simulation/viz/_style.py +++ b/src/facet/simulation/viz/_style.py @@ -96,15 +96,14 @@ def draw_histogram( pass @staticmethod - def _legend( - percentile_lower: float, percentile_upper: float - ) -> Tuple[str, str, str]: + def _legend(percentile_lower: float, percentile_upper: float) -> Tuple[str, ...]: # generate a triple with legend names for the min percentile, median, and max # percentile return ( f"{percentile_lower}th percentile", "Median", f"{percentile_upper}th percentile", + "Baseline", ) @@ -154,12 +153,14 @@ def draw_uplift( line_min, = ax.plot(x, values_min, color=self._COLOR_CONFIDENCE) line_median, = ax.plot(x, values_median, color=self._COLOR_MEDIAN_UPLIFT) line_max, = ax.plot(x, values_max, color=self._COLOR_CONFIDENCE) + # add a horizontal line at y=0 + line_base = ax.axhline(y=values_baseline, linewidth=0.5) # add a legend labels = self._legend( percentile_lower=percentile_lower, percentile_upper=percentile_upper ) - handles = [line_max, line_median, line_min] + handles = (line_max, line_median, line_min, line_base) ax.legend(handles, labels) # label the y axis @@ -176,9 +177,6 @@ def draw_uplift( ax.set_xticks(x) ax.set_xticklabels(labels=partitions) - # add a horizontal line at y=0 - ax.axhline(y=values_baseline, linewidth=0.5) - # remove the top and right spines for pos in ["top", "right"]: ax.spines[pos].set_visible(False) @@ -337,7 +335,7 @@ def draw_uplift( *self._legend( percentile_lower=percentile_lower, percentile_upper=percentile_upper, - ), + )[:3], ], formats=[ self._partition_format(is_categorical_feature),