Skip to content

Commit

Permalink
false positive risk (#83)
Browse files Browse the repository at this point in the history
  • Loading branch information
jancervenka authored Oct 30, 2024
1 parent d2aab3e commit c9d5c1c
Show file tree
Hide file tree
Showing 12 changed files with 235 additions and 81 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[tool]
[tool.poetry]
name = "ep-stats"
version = "2.4.0"
version = "2.5.0"
homepage = "https://github.com/avast/ep-stats"
description = "Statistical package to evaluate ab tests in experimentation platform."
authors = [
Expand Down
8 changes: 8 additions & 0 deletions src/epstats/server/req.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,13 @@ class Experiment(BaseModel):
description="""List of filtering conditions to apply on exposure and goals.""",
)

null_hypothesis_rate: Optional[float] = Field(
None,
title="Null hypothesis rate",
description="""Global null hypothesis rate of the experimentation program. It is defined as the
proportion of all tests in an experimentation program that have not improved or degraded the primary metric.""",
)

query_parameters: dict = Field(
{},
title="Custom query parameters used in the data access.",
Expand Down Expand Up @@ -334,6 +341,7 @@ def to_experiment(self):
unit_type=self.unit_type,
variants=self.variants,
filters=[f.to_filter() for f in self.filters] if self.filters else [],
null_hypothesis_rate=self.null_hypothesis_rate,
query_parameters=self.query_parameters,
)

Expand Down
4 changes: 4 additions & 0 deletions src/epstats/server/res.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@ class MetricStat(BaseModel):
title="Power",
description="Test power based on the collected `sample_size`.",
)
false_positive_risk: Optional[float] = Field(
None, title="False positive risk.", description="False positive risk of a statistically significant result."
)

@staticmethod
def from_df(df: pd.DataFrame):
Expand All @@ -88,6 +91,7 @@ def from_df(df: pd.DataFrame):
sample_size=r["sample_size"],
required_sample_size=r["required_sample_size"],
power=r["power"],
false_positive_risk=r["false_positive_risk"],
)
for i, r in df.iterrows()
]
Expand Down
21 changes: 21 additions & 0 deletions src/epstats/toolkit/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def metric_columns(cls) -> List[str]:
1. `sample_size` - current sample size
1. `required_sample_size` - size of the sample required to reach the required power
1. `power` - power based on the collected `sample_size`
1. `false_positive_risk` - false positive risk of a significant metric
"""
return [
"timestamp",
Expand All @@ -76,6 +77,7 @@ def metric_columns(cls) -> List[str]:
"sample_size",
"required_sample_size",
"power",
"false_positive_risk",
]

@classmethod
Expand Down Expand Up @@ -156,6 +158,7 @@ def __init__(
confidence_level: float = DEFAULT_CONFIDENCE_LEVEL,
variants: Optional[List[str]] = None,
filters: Optional[List[Filter]] = None,
null_hypothesis_rate: Optional[float] = None,
query_parameters: dict = {},
):
self._logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
Expand Down Expand Up @@ -189,6 +192,7 @@ def __init__(
self._update_dimension_to_value()
self.filters = filters if filters is not None else []
self.query_parameters = query_parameters
self.null_hypothesis_rate = null_hypothesis_rate

def _check_metric_ids_unique(self):
"""
Expand Down Expand Up @@ -765,6 +769,22 @@ def _get_power_from_required_sample_sizes(self, metrics: pd.DataFrame, n_variant
axis=1,
)

def _get_false_positive_risk(self, metric_row: pd.Series) -> float:
    """
    Compute the false positive risk for a single evaluated metric row.

    Returns `np.nan` when no global null hypothesis rate is configured for the
    experiment, or when the metric result is not statistically significant —
    false positive risk is only defined for significant outcomes.

    Arguments:
        metric_row: one row of the evaluated metrics frame; must contain
            `p_value`, `confidence_level` and `power` columns.

    Returns:
        false positive risk, or `np.nan` when not applicable
    """
    if self.null_hypothesis_rate is None:
        # Feature is opt-in: without the experimentation-program prior
        # there is nothing to compute.
        return np.nan

    # A result is significant at confidence level c when p_value < alpha = 1 - c.
    # NOTE(review): the original guard compared `p_value >= confidence_level`
    # (e.g. 0.95), which would compute a risk for nearly every result; the
    # comparison against `1 - confidence_level` matches the documented
    # "risk of a statistically significant result" contract.
    if metric_row["p_value"] >= 1 - metric_row["confidence_level"]:
        return np.nan

    return Statistics.false_positive_risk(
        null_hypothesis_rate=self.null_hypothesis_rate,
        power=metric_row["power"],
        p_value=metric_row["p_value"],
    )

def _get_false_positive_risks(self, metrics: pd.DataFrame) -> pd.Series:
    """Return the per-row false positive risk for every metric in `metrics`."""
    # Row-wise delegation to the single-row computation.
    return metrics.apply(lambda row: self._get_false_positive_risk(row), axis=1)

def _evaluate_metrics(self, goals: pd.DataFrame, column_fce) -> pd.DataFrame:
if not self.metrics:
return pd.DataFrame([], columns=Evaluation.metric_columns())
Expand Down Expand Up @@ -822,4 +842,5 @@ def _evaluate_metrics(self, goals: pd.DataFrame, column_fce) -> pd.DataFrame:
c["timestamp"] = round(get_utc_timestamp(datetime.now()).timestamp())
c[["minimum_effect", "sample_size", "required_sample_size"]] = self._get_required_sample_sizes(c, n_variants)
c["power"] = self._get_power_from_required_sample_sizes(c, n_variants)
c["false_positive_risk"] = self._get_false_positive_risks(c)
return c[Evaluation.metric_columns()]
33 changes: 33 additions & 0 deletions src/epstats/toolkit/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,3 +432,36 @@ def power_from_required_sample_size_per_variant(
np.sqrt(required_sample_size_ratio) * (st.norm.ppf(1 - alpha / 2) + st.norm.ppf(required_power))
- st.norm.ppf(1 - alpha / 2)
)

@staticmethod
def false_positive_risk(
null_hypothesis_rate: float,
power: float,
p_value: float,
) -> float:
"""
Computes false positive risk defined as:
$$
P(H_0|S) = \\frac{P(S|H_0)P(H_0)}{P(S)} = \\frac{\\alpha\\pi}{\\alpha\\pi + (1 - \\beta)(1 - \\pi)}
$$
where $S$ is a statisically significant outcome, $H_0$ is the null hypothesis, $1 - \\beta$
is the power of a test, and $\\pi$ is the global null hypothesis rate defined as the proportion
of all tests in an experimentation program that have not improved or degraded the primary metric.
False positive risk $P(H_0|S)$ is not the same as the false positive rate $P(S|H_0) = \\alpha$.
More information can be found in the paper: https://bit.ly/ABTestingIntuitionBusters.
Arguments:
null_hypothesis_rate: global null hypothesis rate of the experimanation program
current_power: power achieved in the test
confidence_level: confidence level of the test
Returns:
false positive risk
"""

pi = null_hypothesis_rate
return (p_value * pi) / (p_value * pi + power * (1 - pi))
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ test-multi-check c test_unit_type global exposure 5200
test-conversion-with-minimum-effect a test_unit_type global exposure 21
test-conversion-with-minimum-effect b test_unit_type global exposure 26
test-conversion-with-minimum-effect c test_unit_type global exposure 30
test-false-positive-risk a test_unit_type global exposure 1000
test-false-positive-risk b test_unit_type global exposure 1001
test-dim-operators a test_unit_type global exposure 1000
test-dim-operators b test_unit_type global exposure 1001
test-operator-precedence a test_unit_type global exposure 80
Expand Down
Loading

0 comments on commit c9d5c1c

Please sign in to comment.