BCG-X-Official · jason-bentley · Sep 8, 2020 · Sep 8, 2020 · Sep 8, 2020 · Sep 8, 2020
diff --git a/src/facet/simulation/_simulation.py b/src/facet/simulation/_simulation.py
@@ -417,8 +417,9 @@ class UnivariateProbabilitySimulator(BaseUnivariateSimulator[ClassifierPipelineD
     @property
     def values_label(self) -> str:
         """[see superclass]"""
-        return f"{self._positive_class()} probability"
+        return f"probability({self._positive_class()})"
 
+    @property
     def baseline(self) -> float:
         """
         Calculate the actual observed frequency of the positive class as the baseline
@@ -428,7 +429,8 @@ def baseline(self) -> float:
         actual_target: pd.Series = self.crossfit.sample.target
         assert isinstance(actual_target, pd.Series), "sample has one single target"
 
-        return actual_target.loc[actual_target == self._positive_class()] / len(
+        # count the positive-class rows rather than summing their label values
+        return (actual_target == self._positive_class()).sum() / len(
             actual_target
         )
 

diff --git a/src/facet/simulation/partition/_partition.py b/src/facet/simulation/partition/_partition.py
@@ -45,7 +45,7 @@
 
 
 class Partitioner(
-    FittableMixin[Sequence[T_Value]], Generic[T_Value], metaclass=ABCMeta
+    FittableMixin[Iterable[T_Value]], Generic[T_Value], metaclass=ABCMeta
 ):
     """
     Abstract base class of all partitioners.
@@ -77,7 +77,7 @@ def max_partitions(self) -> int:
         return self._max_partitions
 
     @abstractmethod
-    def fit(self: T, values: Sequence[T_Value], **fit_params) -> T:
+    def fit(self: T, values: Iterable[T_Value], **fit_params) -> T:
         """
         Calculate the partitioning for the given observed values.
         :param values: a sequence of observed values as the empirical basis for \
@@ -177,7 +177,7 @@ def upper_bound(self) -> T_Number:
     # noinspection PyMissingOrEmptyDocstring
     def fit(
         self: T,
-        values: Sequence[T_Value],
+        values: Iterable[T_Value],
         lower_bound: Optional[T_Number] = None,
         upper_bound: Optional[T_Number] = None,
         **fit_params,
@@ -186,14 +186,26 @@ def fit(
 
         self: RangePartitioner  # support type hinting in PyCharm
 
+        # ensure arg values is an array; a bare iterator must be materialized
+        # first because np.array(iterator) yields a 0-d object array
+        if isinstance(values, pd.Series):
+            values = values.values
+        elif not isinstance(values, np.ndarray):
+            if not isinstance(values, Sequence):
+                try:
+                    values = list(values)
+                except TypeError:
+                    raise TypeError("arg values must be iterable")
+            values = np.array(values)
+
         lower_bound = self._lower_bound
         upper_bound = self._upper_bound
 
         if lower_bound is None:
-            lower_bound = np.quantile(values, q=0.025)
+            lower_bound = np.nanquantile(values, q=0.025)
 
         if upper_bound is None:
-            upper_bound = np.quantile(values, q=0.975)
+            upper_bound = np.nanquantile(values, q=0.975)
             if upper_bound < lower_bound:
                 upper_bound = lower_bound
         elif upper_bound < lower_bound:
@@ -213,19 +225,20 @@ def fit(
             int(round((self._last_partition - self._first_partition) / self._step)) + 1
         )
 
-        def _frequencies() -> List[int]:
-            # Return the number of elements in each partitions
-            partition_indices = [
-                int(round(value - first_partition) / step) for value in values
-            ]
-            frequencies = [0] * n_partitions
-            for idx in partition_indices:
-                if 0 <= idx < n_partitions:
-                    frequencies[idx] += 1
+        # count the number of elements in each partition
+
+        # create the bins, starting with the lower bound of the first partition
+        partition_bins = (first_partition - step / 2) + np.arange(
+            n_partitions + 1
+        ) * step
+        partition_indices = np.digitize(values, bins=partition_bins)
 
-            return frequencies
+        # frequency counts will include left and right outliers, hence n_partitions + 2
+        # and we exclude the first and last element of the result
+        frequencies = np.bincount(partition_indices, minlength=n_partitions + 2)[1:-1]
+
+        self._frequencies = frequencies
 
-        self._frequencies = _frequencies()
         return self
 
     def is_fitted(self) -> bool:
@@ -238,9 +251,11 @@ def partitions(self) -> Sequence[T_Number]:
 
         :return: for each partition, a central value representing the partition
         """
-        offset = self._first_partition
-        step = self._step
-        return [offset + (idx * step) for idx in range(self._n_partitions)]
+        return np.round(
+            self._first_partition + np.arange(self._n_partitions) * self._step,
+            # round to the nearest power of 10 of the step variable
+            int(-np.floor(np.log10(self._step))),
+        )
 
     def frequencies(self) -> Sequence[int]:
         """
@@ -396,7 +411,15 @@ def fit(self: T, values: Sequence[T_Value], **fit_params) -> T:
 
         self: CategoryPartitioner  # support type hinting in PyCharm
 
-        value_counts = pd.Series(data=values).value_counts(ascending=False)
+        if not isinstance(values, pd.Series):
+            if not (isinstance(values, np.ndarray) or isinstance(values, Sequence)):
+                try:
+                    values = iter(values)
+                except TypeError:
+                    raise TypeError("arg values must be iterable")
+            values = pd.Series(data=values)
+
+        value_counts = values.value_counts(ascending=False)
         max_partitions = self.max_partitions
         self._partitions = value_counts.index.values[:max_partitions]
         self._frequencies = value_counts.values[:max_partitions]

diff --git a/src/facet/simulation/viz/_style.py b/src/facet/simulation/viz/_style.py
@@ -96,15 +96,14 @@ def draw_histogram(
         pass
 
     @staticmethod
-    def _legend(
-        percentile_lower: float, percentile_upper: float
-    ) -> Tuple[str, str, str]:
+    def _legend(percentile_lower: float, percentile_upper: float) -> Tuple[str, ...]:
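-        # generate a triple with legend names for the min percentile, median, and max
-        # percentile
+        # generate a tuple with legend names for the min percentile, median, max
+        # percentile, and baseline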
         return (
             f"{percentile_lower}th percentile",
             "Median",
             f"{percentile_upper}th percentile",
+            "Baseline",
         )
 
 
@@ -154,12 +153,14 @@ def draw_uplift(
         line_min, = ax.plot(x, values_min, color=self._COLOR_CONFIDENCE)
         line_median, = ax.plot(x, values_median, color=self._COLOR_MEDIAN_UPLIFT)
         line_max, = ax.plot(x, values_max, color=self._COLOR_CONFIDENCE)
+        # add a horizontal line at the baseline value
+        line_base = ax.axhline(y=values_baseline, linewidth=0.5)
 
-        # add a legend
+        # add a legend; handle order must match the labels from _legend()
         labels = self._legend(
             percentile_lower=percentile_lower, percentile_upper=percentile_upper
         )
-        handles = [line_max, line_median, line_min]
+        handles = (line_min, line_median, line_max, line_base)
         ax.legend(handles, labels)
 
         # label the y axis
@@ -176,9 +177,6 @@ def draw_uplift(
             ax.set_xticks(x)
             ax.set_xticklabels(labels=partitions)
 
-        # add a horizontal line at y=0
-        ax.axhline(y=values_baseline, linewidth=0.5)
-
         # remove the top and right spines
         for pos in ["top", "right"]:
             ax.spines[pos].set_visible(False)
@@ -337,7 +335,7 @@ def draw_uplift(
                     *self._legend(
                         percentile_lower=percentile_lower,
                         percentile_upper=percentile_upper,
-                    ),
+                    )[:3],
                 ],
                 formats=[
                     self._partition_format(is_categorical_feature),