Required sample size (#41)
* required sample size

* docs
jancervenka authored Sep 23, 2022
1 parent 5cf6139 commit c523f98
Showing 15 changed files with 334 additions and 115 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -19,6 +19,7 @@ in an AB test experiment.
* Sequential evaluations allow experiments to be stopped early.
* Connect it to any data source to get either pre-aggregated or per randomization unit data.
* Simple expression language to define arbitrary metrics.
* Sample size estimation.
* REST API to integrate it as a service in an experimentation portal with score cards.

## Documentation
4 changes: 4 additions & 0 deletions docs/stats/sample_size.md
@@ -50,6 +50,10 @@ As we discussed in the previous chapter, we can see that:
1. Sample size increases as MDE decreases (because $\Delta$ is in the denominator).
1. Sample size is greatest for conversion rates equal to 0.5.

Sample size calculation is available in the statistics toolkit of this package.
See the [API documentation](../api/statistics.md#epstats.toolkit.statistics.Statistics.required_sample_size_per_variant)
for more details.
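
For illustration, here is a minimal sketch of calling the toolkit directly. The numbers are made up, and the keyword arguments mirror the call added to `experiment.py` in this commit:

```python
from epstats.toolkit.statistics import Statistics

# Illustrative two-variant test on a conversion-rate metric:
# baseline conversion rate of 5% and a 10% relative minimum effect of interest.
mean = 0.05                       # control conversion rate
std = (mean * (1 - mean)) ** 0.5  # Bernoulli standard deviation

sample_size = Statistics.required_sample_size_per_variant(
    n_variants=2,
    minimum_effect=0.10,  # smallest relative change worth detecting
    mean=mean,
    std=std,
    std_2=std,            # treatment std, approximated here by the control value
    confidence_level=0.95,
    power=0.8,
)
print(sample_size)
```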

## What to Do If Sample Size is Too Big

We saw that required sample size is
2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,6 +1,6 @@
[metadata]
name = ep-stats
version = 1.4.0
version = 1.5.0
description = Statistical package to evaluate AB tests in an experimentation platform.
long_description = file: README.md
long_description_content_type = text/markdown
13 changes: 6 additions & 7 deletions src/epstats/server/api_evaluate.py
@@ -27,12 +27,11 @@ def _evaluate(experiment: EvExperiment, dao: Dao, statsd: StatsClient):
evaluation = experiment.evaluate_agg(goals)
statsd.incr("evaluations")
_logger.info(
(
f"Evaluation of experiment [{experiment.id}] finished with evaluation"
f" of {evaluation.metrics.metric_id.nunique()} "
f"metrics and {evaluation.checks.check_id.nunique()} checks."
f"Metrics: {evaluation.metrics.to_dict('records')}"
)
{
"evaluation": "response",
"exp_id": experiment.id,
"metrics": evaluation.metrics.to_dict("records"),
}
)
return Result.from_evaluation(experiment, evaluation)
except Exception as e:
@@ -56,7 +55,7 @@ async def evaluate_experiment(
"""
Evaluates single `Experiment`.
"""
_logger.info(f"Calling evaluate with {experiment.json()}")
_logger.info({"evaluation": "request", "experiment": experiment.dict()})
statsd.incr("requests.evaluate")
loop = asyncio.get_event_loop()
return await loop.run_in_executor(evaluation_pool, _evaluate, experiment.to_experiment(statsd), dao, statsd)
22 changes: 19 additions & 3 deletions src/epstats/server/req.py
@@ -39,6 +39,12 @@ class Metric(BaseModel):
description="""EP metric is defined in the form of `nominator / denominator`.
Both parts are entered as expressions. Example: `count(my_unit_type.unit.conversion)`.""",
)
minimum_effect: Optional[float] = Field(
title="Minimum effect of interest",
description=f"""The minimum effect of interest is the smallest relative difference that is meaningful to detect,
defining it allows us to estimate the size of the sample data required to reach {DEFAULT_POWER:.0%} power.""",
default=None,
)

@validator("id")
def id_must_be_not_empty(cls, value):
@@ -80,7 +86,13 @@ def check_nominator_denominator(cls, values):
raise ValueError(f"Cannot parse nominator '{nominator}' or '{denominator}' because of '{e}'")

def to_metric(self):
return EvMetric(self.id, self.name, self.nominator, self.denominator)
return EvMetric(
id=self.id,
name=self.name,
nominator=self.nominator,
denominator=self.denominator,
minimum_effect=self.minimum_effect,
)


class Check(BaseModel):
@@ -378,10 +390,14 @@ class SampleSizeCalculationData(BaseModel):
Data needed for the sample size calculation.
"""

n_variants: int = Field(title="Number of variants", description="Number of variants in the experiment.")
n_variants: int = Field(
title="Number of variants",
description="Number of variants in the experiment.",
)

minimum_effect: float = Field(
title="Minimum effect of interest", description="Relative effect, must be greater than zero."
title="Minimum effect of interest",
description="Relative effect, must be greater than zero.",
)

mean: float = Field(
14 changes: 12 additions & 2 deletions src/epstats/server/res.py
@@ -1,8 +1,8 @@
import pandas as pd
from typing import List
from typing import List, Optional
from pydantic import BaseModel, Field

from ..toolkit import Evaluation
from ..toolkit import Evaluation, DEFAULT_POWER
from .req import Experiment, Metric, Check


@@ -49,6 +49,14 @@ class MetricStat(BaseModel):
description="""Confidence level used
to compute (obtain) `confidence_interval`.""",
)
sample_size: Optional[float] = Field(
title="Sample size",
description="Current sample size.",
)
required_sample_size: Optional[float] = Field(
title="Required sample size",
description=f"Size of the sample required to reach {DEFAULT_POWER:.0%} power.",
)

@staticmethod
def from_df(df: pd.DataFrame):
@@ -62,6 +70,8 @@ def from_df(df: pd.DataFrame):
p_value=r["p_value"],
confidence_interval=r["confidence_interval"],
confidence_level=r["confidence_level"],
sample_size=r["sample_size"],
required_sample_size=r["required_sample_size"],
)
for i, r in df.iterrows()
]
108 changes: 94 additions & 14 deletions src/epstats/toolkit/experiment.py
@@ -3,16 +3,18 @@
from enum import Enum
import pandas as pd
import numpy as np
from collections import Counter
from typing import Optional
from datetime import datetime
from statsd import StatsClient
from dataclasses import dataclass

from .metric import Metric
from .metric import Metric, SimpleMetric
from .check import Check
from .utils import get_utc_timestamp, goals_wide_to_long
from .parser import EpGoal, UnitType, AggType, Goal

from .statistics import Statistics, DEFAULT_CONFIDENCE_LEVEL
from .statistics import Statistics, DEFAULT_CONFIDENCE_LEVEL, DEFAULT_POWER


class Evaluation:
@@ -64,6 +66,9 @@ def metric_columns(cls) -> List[str]:
"confidence_interval",
"standard_error",
"degrees_of_freedom",
"minimum_effect",
"sample_size",
"required_sample_size",
]

@classmethod
@@ -128,20 +133,21 @@ def __init__(
metrics: List[Metric],
checks: List[Check],
unit_type: str,
date_from: str = None,
date_to: str = None,
date_for: str = None,
date_from: Optional[str] = None,
date_to: Optional[str] = None,
date_for: Optional[str] = None,
confidence_level: float = DEFAULT_CONFIDENCE_LEVEL,
variants: List[str] = None,
variants: Optional[List[str]] = None,
statsd: StatsClient = StatsClient(),
filters: List[Filter] = None,
outlier_detection_algorithm: str = None,
filters: Optional[List[Filter]] = None,
outlier_detection_algorithm: Optional[str] = None,
):
self._logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
self.id = id
self.control_variant = control_variant
self.unit_type = unit_type
self.metrics = metrics
self._check_metric_ids_unique()
self.checks = checks
self.date_from = datetime.strptime(date_from, "%Y-%m-%d").date() if date_from is not None else None
self.date_to = datetime.strptime(date_to, "%Y-%m-%d").date() if date_to is not None else None
@@ -169,6 +175,15 @@ def __init__(
self.filters = filters if filters is not None else []
self.outlier_detection_algorithm = outlier_detection_algorithm

def _check_metric_ids_unique(self):
"""
Raises an exception if `metrics` contain duplicated ids.
"""
id_counts = Counter(metric.id for metric in self.metrics)
for id_, count in id_counts.items():
if count > 1:
raise ValueError(f"Metric ids must be unique. Id={id_} found more than once.")

def _update_dimension_to_value(self):
"""
To every `EpGoal` across all metrics, we need to add missing dimensions
@@ -652,6 +667,70 @@ def _fix_missing_by_unit(self, goals: pd.DataFrame) -> pd.DataFrame:
+ self.get_dimension_columns()
]

def _get_required_sample_size(
self,
metric_row: pd.Series,
controls: dict,
minimum_effects: dict,
metrics_with_value_denominator: set,
n_variants: int,
) -> pd.Series:

metric_id = metric_row["metric_id"]
minimum_effect = minimum_effects[metric_id]
index = ["minimum_effect", "sample_size", "required_sample_size"]

# Right now, a metric with a value() denominator would return a count that is not equal
# to the sample size. In that case, we do not evaluate the required sample size.
# TODO: add support for value() denominator metrics;
# the parser will return an additional column equal to count or count_unique.
sample_size = metric_row["count"] if metric_id not in metrics_with_value_denominator else np.nan

if metric_row["exp_variant_id"] == self.control_variant or pd.isnull(minimum_effect):
return pd.Series([np.nan, sample_size, np.nan], index)

metric_id = metric_row["metric_id"]
return pd.Series(
[
minimum_effect,
sample_size,
Statistics.required_sample_size_per_variant(
n_variants=n_variants,
minimum_effect=minimum_effect,
mean=controls[metric_id]["mean"],
std=controls[metric_id]["std"],
std_2=metric_row["std"],
confidence_level=metric_row["confidence_level"],
power=DEFAULT_POWER,
),
],
index,
)

def _get_required_sample_sizes(self, metrics: pd.DataFrame, n_variants: int) -> pd.DataFrame:

controls = {
r["metric_id"]: {"mean": r["mean"], "std": r["std"]}
for _, r in metrics.iterrows()
if r["exp_variant_id"] == self.control_variant
}

minimum_effects = {m.id: m.minimum_effect for m in self.metrics}
metrics_with_value_denominator = {
m.id for m in self.metrics if m.denominator.startswith("value(") and not isinstance(m, SimpleMetric)
}

return metrics.apply(
lambda metric_row: self._get_required_sample_size(
metric_row=metric_row,
controls=controls,
minimum_effects=minimum_effects,
metrics_with_value_denominator=metrics_with_value_denominator,
n_variants=n_variants,
),
axis=1,
)

def _evaluate_metrics(self, goals: pd.DataFrame, column_fce) -> pd.DataFrame:
if not self.metrics:
return pd.DataFrame([], columns=Evaluation.metric_columns())
@@ -662,7 +741,7 @@ def _evaluate_metrics(self, goals: pd.DataFrame, column_fce) -> pd.DataFrame:
sts.append([count, sum_value, sum_sqr_value])
stats = np.array(sts).transpose(0, 2, 1)
metrics = stats.shape[0]
variants = stats.shape[1]
n_variants = stats.shape[1]

count = stats[:, :, 0]
sum_value = stats[:, :, 1]
@@ -689,9 +768,9 @@ def _evaluate_metrics(self, goals: pd.DataFrame, column_fce) -> pd.DataFrame:
stats = np.dstack((count, mean, std, sum_value, np.ones(count.shape) * confidence_level))
stats = np.dstack(
(
np.repeat([m.id for m in self.metrics], variants).reshape(metrics, variants, -1),
np.repeat([m.name for m in self.metrics], variants).reshape(metrics, variants, -1),
np.tile(goals["exp_variant_id"].unique(), metrics).reshape(metrics, variants, -1),
np.repeat([m.id for m in self.metrics], n_variants).reshape(metrics, n_variants, -1),
np.repeat([m.name for m in self.metrics], n_variants).reshape(metrics, n_variants, -1),
np.tile(goals["exp_variant_id"].unique(), metrics).reshape(metrics, n_variants, -1),
stats,
)
)
@@ -702,9 +781,10 @@ def _evaluate_metrics(self, goals: pd.DataFrame, column_fce) -> pd.DataFrame:
c = Statistics.ttest_evaluation(stats, self.control_variant)

# multiple variants (comparisons) correction - applied when we have multiple treatment variants
if variants > 2:
c = Statistics.multiple_comparisons_correction(c, variants, metrics, confidence_level)
if n_variants > 2:
c = Statistics.multiple_comparisons_correction(c, n_variants, metrics, confidence_level)

c["exp_id"] = self.id
c["timestamp"] = round(get_utc_timestamp(datetime.now()).timestamp())
c[["minimum_effect", "sample_size", "required_sample_size"]] = self._get_required_sample_sizes(c, n_variants)
return c[Evaluation.metric_columns()]
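
For orientation, a hedged sketch of how the new columns surface when evaluating an experiment; it assumes an `Experiment` instance `experiment` and a pre-aggregated `goals` frame are already in scope:

```python
# Sketch only: `experiment` and `goals` are assumed to exist already.
evaluation = experiment.evaluate_agg(goals)

# Non-control variant rows now carry the minimum effect they were evaluated
# against, the current sample size, and the sample size required to reach
# the default power.
print(
    evaluation.metrics[
        ["metric_id", "exp_variant_id", "minimum_effect", "sample_size", "required_sample_size"]
    ]
)
```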
7 changes: 5 additions & 2 deletions src/epstats/toolkit/metric.py
@@ -1,4 +1,4 @@
from typing import Set
from typing import Set, Optional
import pandas as pd
import numpy as np

@@ -18,6 +18,7 @@ def __init__(
denominator: str,
metric_format: str = "{:.2%}",
metric_value_multiplier: int = 1,
minimum_effect: Optional[float] = None,
):
"""
Constructor of the general metric definition.
@@ -55,6 +56,7 @@ def __init__(
self._goals = self._parser.get_goals()
self.metric_format = metric_format
self.metric_value_multiplier = metric_value_multiplier
self.minimum_effect = minimum_effect

def get_goals(self) -> Set:
"""
@@ -111,6 +113,7 @@ def __init__(
unit_type: str = "test_unit_type",
metric_format: str = "{:.2%}",
metric_value_multiplier: int = 1,
minimum_effect: Optional[float] = None,
):
"""
Constructor of the simplified metric definition.
@@ -145,4 +148,4 @@ def __init__(
num = "value" + "(" + unit_type + "." + agg_type + "." + numerator + ")"
den = "value" + "(" + unit_type + "." + agg_type + "." + denominator + ")"

super().__init__(id, name, num, den, metric_format, metric_value_multiplier)
super().__init__(id, name, num, den, metric_format, metric_value_multiplier, minimum_effect)
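
To tie the pieces together, a hedged example of a toolkit metric created with the new optional parameter. The integer id and the goal expressions are illustrative only, following the `unit_type.agg_type.goal` form from the request docs above:

```python
from epstats.toolkit.metric import Metric

# A conversion metric that we want powered to detect at least a 5% relative change.
conversion = Metric(
    id=1,
    name="Conversion Rate",
    nominator="count(test_unit_type.unit.conversion)",
    denominator="count(test_unit_type.global.exposure)",
    minimum_effect=0.05,
)
```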