Skip to content

Commit

Permalink
false positive risk (#83)
Browse files Browse the repository at this point in the history
  • Loading branch information
jancervenka authored Oct 30, 2024
1 parent d2aab3e commit c9d5c1c
Show file tree
Hide file tree
Showing 12 changed files with 235 additions and 81 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[tool]
[tool.poetry]
name = "ep-stats"
version = "2.4.0"
version = "2.5.0"
homepage = "https://github.com/avast/ep-stats"
description = "Statistical package to evaluate ab tests in experimentation platform."
authors = [
Expand Down
8 changes: 8 additions & 0 deletions src/epstats/server/req.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,13 @@ class Experiment(BaseModel):
description="""List of filtering conditions to apply on exposure and goals.""",
)

null_hypothesis_rate: Optional[float] = Field(
None,
title="Null hypothesis rate",
description="""Global null hypothesis rate of the experimentation program. It is defined as the
proportion of all tests in an experimentation program that have not improved or degraded the primary metric.""",
)

query_parameters: dict = Field(
{},
title="Custom query parameters used in the data access.",
Expand Down Expand Up @@ -334,6 +341,7 @@ def to_experiment(self):
unit_type=self.unit_type,
variants=self.variants,
filters=[f.to_filter() for f in self.filters] if self.filters else [],
null_hypothesis_rate=self.null_hypothesis_rate,
query_parameters=self.query_parameters,
)

Expand Down
4 changes: 4 additions & 0 deletions src/epstats/server/res.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@ class MetricStat(BaseModel):
title="Power",
description="Test power based on the collected `sample_size`.",
)
false_positive_risk: Optional[float] = Field(
None, title="False positive risk.", description="False positive risk of a statistically significant result."
)

@staticmethod
def from_df(df: pd.DataFrame):
Expand All @@ -88,6 +91,7 @@ def from_df(df: pd.DataFrame):
sample_size=r["sample_size"],
required_sample_size=r["required_sample_size"],
power=r["power"],
false_positive_risk=r["false_positive_risk"],
)
for i, r in df.iterrows()
]
Expand Down
21 changes: 21 additions & 0 deletions src/epstats/toolkit/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def metric_columns(cls) -> List[str]:
1. `sample_size` - current sample size
1. `required_sample_size` - size of the sample required to reach the required power
1. `power` - power based on the collected `sample_size`
1. `false_positive_risk` - false positive risk of a significant metric
"""
return [
"timestamp",
Expand All @@ -76,6 +77,7 @@ def metric_columns(cls) -> List[str]:
"sample_size",
"required_sample_size",
"power",
"false_positive_risk",
]

@classmethod
Expand Down Expand Up @@ -156,6 +158,7 @@ def __init__(
confidence_level: float = DEFAULT_CONFIDENCE_LEVEL,
variants: Optional[List[str]] = None,
filters: Optional[List[Filter]] = None,
null_hypothesis_rate: Optional[float] = None,
query_parameters: dict = {},
):
self._logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
Expand Down Expand Up @@ -189,6 +192,7 @@ def __init__(
self._update_dimension_to_value()
self.filters = filters if filters is not None else []
self.query_parameters = query_parameters
self.null_hypothesis_rate = null_hypothesis_rate

def _check_metric_ids_unique(self):
"""
Expand Down Expand Up @@ -765,6 +769,22 @@ def _get_power_from_required_sample_sizes(self, metrics: pd.DataFrame, n_variant
axis=1,
)

def _get_false_positive_risk(self, metric_row: pd.Series) -> float:
    """
    Compute the false positive risk for a single evaluated metric row.

    Returns `np.nan` when no global null hypothesis rate is configured for the
    experiment, or when the metric result is not statistically significant —
    false positive risk is only defined for significant outcomes.

    Arguments:
        metric_row: one row of the evaluated metrics frame; must contain
            `p_value`, `confidence_level` and `power` columns.

    Returns:
        false positive risk, or `np.nan` when not applicable
    """
    if self.null_hypothesis_rate is None:
        # Feature is opt-in: without the experimentation-program prior
        # there is nothing to compute.
        return np.nan

    # A result is significant at confidence level c when p_value < alpha = 1 - c.
    # NOTE(review): the original guard compared `p_value >= confidence_level`
    # (e.g. 0.95), which would compute a risk for nearly every result; the
    # comparison against `1 - confidence_level` matches the documented
    # "risk of a statistically significant result" contract.
    if metric_row["p_value"] >= 1 - metric_row["confidence_level"]:
        return np.nan

    return Statistics.false_positive_risk(
        null_hypothesis_rate=self.null_hypothesis_rate,
        power=metric_row["power"],
        p_value=metric_row["p_value"],
    )

def _get_false_positive_risks(self, metrics: pd.DataFrame) -> pd.Series:
    """Return the per-row false positive risk for every metric in `metrics`."""
    # Row-wise delegation to the single-row computation.
    return metrics.apply(lambda row: self._get_false_positive_risk(row), axis=1)

def _evaluate_metrics(self, goals: pd.DataFrame, column_fce) -> pd.DataFrame:
if not self.metrics:
return pd.DataFrame([], columns=Evaluation.metric_columns())
Expand Down Expand Up @@ -822,4 +842,5 @@ def _evaluate_metrics(self, goals: pd.DataFrame, column_fce) -> pd.DataFrame:
c["timestamp"] = round(get_utc_timestamp(datetime.now()).timestamp())
c[["minimum_effect", "sample_size", "required_sample_size"]] = self._get_required_sample_sizes(c, n_variants)
c["power"] = self._get_power_from_required_sample_sizes(c, n_variants)
c["false_positive_risk"] = self._get_false_positive_risks(c)
return c[Evaluation.metric_columns()]
33 changes: 33 additions & 0 deletions src/epstats/toolkit/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,3 +432,36 @@ def power_from_required_sample_size_per_variant(
np.sqrt(required_sample_size_ratio) * (st.norm.ppf(1 - alpha / 2) + st.norm.ppf(required_power))
- st.norm.ppf(1 - alpha / 2)
)

@staticmethod
def false_positive_risk(
null_hypothesis_rate: float,
power: float,
p_value: float,
) -> float:
"""
Computes false positive risk defined as:
$$
P(H_0|S) = \\frac{P(S|H_0)P(H_0)}{P(S)} = \\frac{\\alpha\\pi}{\\alpha\\pi + (1 - \\beta)(1 - \\pi)}
$$
where $S$ is a statisically significant outcome, $H_0$ is the null hypothesis, $1 - \\beta$
is the power of a test, and $\\pi$ is the global null hypothesis rate defined as the proportion
of all tests in an experimentation program that have not improved or degraded the primary metric.
False positive risk $P(H_0|S)$ is not the same as the false positive rate $P(S|H_0) = \\alpha$.
More information can be found in the paper: https://bit.ly/ABTestingIntuitionBusters.
Arguments:
null_hypothesis_rate: global null hypothesis rate of the experimanation program
current_power: power achieved in the test
confidence_level: confidence level of the test
Returns:
false positive risk
"""

pi = null_hypothesis_rate
return (p_value * pi) / (p_value * pi + power * (1 - pi))
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ test-multi-check c test_unit_type global exposure 5200
test-conversion-with-minimum-effect a test_unit_type global exposure 21
test-conversion-with-minimum-effect b test_unit_type global exposure 26
test-conversion-with-minimum-effect c test_unit_type global exposure 30
test-false-positive-risk a test_unit_type global exposure 1000
test-false-positive-risk b test_unit_type global exposure 1001
test-dim-operators a test_unit_type global exposure 1000
test-dim-operators b test_unit_type global exposure 1001
test-operator-precedence a test_unit_type global exposure 80
Expand Down
Loading

0 comments on commit c9d5c1c

Please sign in to comment.