Required sample size (#41)
* required sample size

* docs
jancervenka authored Sep 23, 2022
1 parent 5cf6139 commit c523f98
Showing 15 changed files with 334 additions and 115 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -19,6 +19,7 @@ in an AB test experiment.
* Sequential evaluations allow experiments to be stopped early.
* Connect it to any data source to get either pre-aggregated or per randomization unit data.
* Simple expression language to define arbitrary metrics.
* Sample size estimation.
* REST API to integrate it as a service in an experimentation portal with score cards.

## Documentation
4 changes: 4 additions & 0 deletions docs/stats/sample_size.md
@@ -50,6 +50,10 @@ As we discussed in the previous chapter, we can see that:
1. Sample size increases as MDE decreases (because $\Delta$ is in the denominator).
1. Sample size is greatest for conversion rates equal to 0.5.

Sample size calculation is available in the statistics toolkit of this package.
See the [API documentation](../api/statistics.md#epstats.toolkit.statistics.Statistics.required_sample_size_per_variant)
for more details.
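
For illustration, here is a minimal sketch of calling the toolkit directly. The numbers are made up, and the keyword arguments mirror the call added to `experiment.py` in this commit:

```python
from epstats.toolkit.statistics import Statistics

# Illustrative two-variant test on a conversion-rate metric:
# baseline conversion rate of 5% and a 10% relative minimum effect of interest.
mean = 0.05                       # control conversion rate
std = (mean * (1 - mean)) ** 0.5  # Bernoulli standard deviation

sample_size = Statistics.required_sample_size_per_variant(
    n_variants=2,
    minimum_effect=0.10,  # smallest relative change worth detecting
    mean=mean,
    std=std,
    std_2=std,            # treatment std, approximated here by the control value
    confidence_level=0.95,
    power=0.8,
)
print(sample_size)
```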

## What to Do If Sample Size is Too Big

We saw that required sample size is
2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,6 +1,6 @@
[metadata]
name = ep-stats
version = 1.4.0
version = 1.5.0
description = Statistical package to evaluate AB tests in an experimentation platform.
long_description = file: README.md
long_description_content_type = text/markdown
13 changes: 6 additions & 7 deletions src/epstats/server/api_evaluate.py
@@ -27,12 +27,11 @@ def _evaluate(experiment: EvExperiment, dao: Dao, statsd: StatsClient):
evaluation = experiment.evaluate_agg(goals)
statsd.incr("evaluations")
_logger.info(
(
f"Evaluation of experiment [{experiment.id}] finished with evaluation"
f" of {evaluation.metrics.metric_id.nunique()} "
f"metrics and {evaluation.checks.check_id.nunique()} checks."
f"Metrics: {evaluation.metrics.to_dict('records')}"
)
{
"evaluation": "response",
"exp_id": experiment.id,
"metrics": evaluation.metrics.to_dict("records"),
}
)
return Result.from_evaluation(experiment, evaluation)
except Exception as e:
@@ -56,7 +55,7 @@ async def evaluate_experiment(
"""
Evaluates single `Experiment`.
"""
_logger.info(f"Calling evaluate with {experiment.json()}")
_logger.info({"evaluation": "request", "experiment": experiment.dict()})
statsd.incr("requests.evaluate")
loop = asyncio.get_event_loop()
return await loop.run_in_executor(evaluation_pool, _evaluate, experiment.to_experiment(statsd), dao, statsd)
22 changes: 19 additions & 3 deletions src/epstats/server/req.py
@@ -39,6 +39,12 @@ class Metric(BaseModel):
description="""EP metric is defined in the form of `nominator / denominator`.
Both parts are entered as expressions. Example: `count(my_unit_type.unit.conversion)`.""",
)
minimum_effect: Optional[float] = Field(
title="Minimum effect of interest",
description=f"""The minimum effect of interest is the smallest relative difference that is meaningful to detect,
defining it allows us to estimate the size of the sample data required to reach {DEFAULT_POWER:.0%} power.""",
default=None,
)

@validator("id")
def id_must_be_not_empty(cls, value):
@@ -80,7 +86,13 @@ def check_nominator_denominator(cls, values):
raise ValueError(f"Cannot parse nominator '{nominator}' or '{denominator}' because of '{e}'")

def to_metric(self):
return EvMetric(self.id, self.name, self.nominator, self.denominator)
return EvMetric(
id=self.id,
name=self.name,
nominator=self.nominator,
denominator=self.denominator,
minimum_effect=self.minimum_effect,
)


class Check(BaseModel):
@@ -378,10 +390,14 @@ class SampleSizeCalculationData(BaseModel):
Data needed for the sample size calculation.
"""

n_variants: int = Field(title="Number of variants", description="Number of variants in the experiment.")
n_variants: int = Field(
title="Number of variants",
description="Number of variants in the experiment.",
)

minimum_effect: float = Field(
title="Minimum effect of interest", description="Relative effect, must be greater than zero."
title="Minimum effect of interest",
description="Relative effect, must be greater than zero.",
)

mean: float = Field(
14 changes: 12 additions & 2 deletions src/epstats/server/res.py
@@ -1,8 +1,8 @@
import pandas as pd
from typing import List
from typing import List, Optional
from pydantic import BaseModel, Field

from ..toolkit import Evaluation
from ..toolkit import Evaluation, DEFAULT_POWER
from .req import Experiment, Metric, Check


@@ -49,6 +49,14 @@ class MetricStat(BaseModel):
description="""Confidence level used
to compute (obtain) `confidence_interval`.""",
)
sample_size: Optional[float] = Field(
title="Sample size",
description="Current sample size.",
)
required_sample_size: Optional[float] = Field(
title="Required sample size",
description=f"Size of the sample required to reach {DEFAULT_POWER:.0%} power.",
)

@staticmethod
def from_df(df: pd.DataFrame):
@@ -62,6 +70,8 @@ def from_df(df: pd.DataFrame):
p_value=r["p_value"],
confidence_interval=r["confidence_interval"],
confidence_level=r["confidence_level"],
sample_size=r["sample_size"],
required_sample_size=r["required_sample_size"],
)
for i, r in df.iterrows()
]
108 changes: 94 additions & 14 deletions src/epstats/toolkit/experiment.py
@@ -3,16 +3,18 @@
from enum import Enum
import pandas as pd
import numpy as np
from collections import Counter
from typing import Optional
from datetime import datetime
from statsd import StatsClient
from dataclasses import dataclass

from .metric import Metric
from .metric import Metric, SimpleMetric
from .check import Check
from .utils import get_utc_timestamp, goals_wide_to_long
from .parser import EpGoal, UnitType, AggType, Goal

from .statistics import Statistics, DEFAULT_CONFIDENCE_LEVEL
from .statistics import Statistics, DEFAULT_CONFIDENCE_LEVEL, DEFAULT_POWER


class Evaluation:
@@ -64,6 +66,9 @@ def metric_columns(cls) -> List[str]:
"confidence_interval",
"standard_error",
"degrees_of_freedom",
"minimum_effect",
"sample_size",
"required_sample_size",
]

@classmethod
@@ -128,20 +133,21 @@ def __init__(
metrics: List[Metric],
checks: List[Check],
unit_type: str,
date_from: str = None,
date_to: str = None,
date_for: str = None,
date_from: Optional[str] = None,
date_to: Optional[str] = None,
date_for: Optional[str] = None,
confidence_level: float = DEFAULT_CONFIDENCE_LEVEL,
variants: List[str] = None,
variants: Optional[List[str]] = None,
statsd: StatsClient = StatsClient(),
filters: List[Filter] = None,
outlier_detection_algorithm: str = None,
filters: Optional[List[Filter]] = None,
outlier_detection_algorithm: Optional[str] = None,
):
self._logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
self.id = id
self.control_variant = control_variant
self.unit_type = unit_type
self.metrics = metrics
self._check_metric_ids_unique()
self.checks = checks
self.date_from = datetime.strptime(date_from, "%Y-%m-%d").date() if date_from is not None else None
self.date_to = datetime.strptime(date_to, "%Y-%m-%d").date() if date_to is not None else None
@@ -169,6 +175,15 @@ def __init__(
self.filters = filters if filters is not None else []
self.outlier_detection_algorithm = outlier_detection_algorithm

def _check_metric_ids_unique(self):
"""
Raises an exception if `metrics` contain duplicated ids.
"""
id_counts = Counter(metric.id for metric in self.metrics)
for id_, count in id_counts.items():
if count > 1:
raise ValueError(f"Metric ids must be unique. Id={id_} found more than once.")

def _update_dimension_to_value(self):
"""
To every `EpGoal` across all metrics, we need to add missing dimensions
@@ -652,6 +667,70 @@ def _fix_missing_by_unit(self, goals: pd.DataFrame) -> pd.DataFrame:
+ self.get_dimension_columns()
]

def _get_required_sample_size(
self,
metric_row: pd.Series,
controls: dict,
minimum_effects: dict,
metrics_with_value_denominator: set,
n_variants: int,
) -> pd.Series:

metric_id = metric_row["metric_id"]
minimum_effect = minimum_effects[metric_id]
index = ["minimum_effect", "sample_size", "required_sample_size"]

# Right now, a metric with a value() denominator would return a count that is not equal
# to the sample size. In that case, we do not evaluate the required sample size.
# TODO: add support for value() denominator metrics;
# the parser will return an additional column equal to count or count_unique.
sample_size = metric_row["count"] if metric_id not in metrics_with_value_denominator else np.nan

if metric_row["exp_variant_id"] == self.control_variant or pd.isnull(minimum_effect):
return pd.Series([np.nan, sample_size, np.nan], index)

metric_id = metric_row["metric_id"]
return pd.Series(
[
minimum_effect,
sample_size,
Statistics.required_sample_size_per_variant(
n_variants=n_variants,
minimum_effect=minimum_effect,
mean=controls[metric_id]["mean"],
std=controls[metric_id]["std"],
std_2=metric_row["std"],
confidence_level=metric_row["confidence_level"],
power=DEFAULT_POWER,
),
],
index,
)

def _get_required_sample_sizes(self, metrics: pd.DataFrame, n_variants: int) -> pd.DataFrame:

controls = {
r["metric_id"]: {"mean": r["mean"], "std": r["std"]}
for _, r in metrics.iterrows()
if r["exp_variant_id"] == self.control_variant
}

minimum_effects = {m.id: m.minimum_effect for m in self.metrics}
metrics_with_value_denominator = {
m.id for m in self.metrics if m.denominator.startswith("value(") and not isinstance(m, SimpleMetric)
}

return metrics.apply(
lambda metric_row: self._get_required_sample_size(
metric_row=metric_row,
controls=controls,
minimum_effects=minimum_effects,
metrics_with_value_denominator=metrics_with_value_denominator,
n_variants=n_variants,
),
axis=1,
)

def _evaluate_metrics(self, goals: pd.DataFrame, column_fce) -> pd.DataFrame:
if not self.metrics:
return pd.DataFrame([], columns=Evaluation.metric_columns())
@@ -662,7 +741,7 @@ def _evaluate_metrics(self, goals: pd.DataFrame, column_fce) -> pd.DataFrame:
sts.append([count, sum_value, sum_sqr_value])
stats = np.array(sts).transpose(0, 2, 1)
metrics = stats.shape[0]
variants = stats.shape[1]
n_variants = stats.shape[1]

count = stats[:, :, 0]
sum_value = stats[:, :, 1]
@@ -689,9 +768,9 @@ def _evaluate_metrics(self, goals: pd.DataFrame, column_fce) -> pd.DataFrame:
stats = np.dstack((count, mean, std, sum_value, np.ones(count.shape) * confidence_level))
stats = np.dstack(
(
np.repeat([m.id for m in self.metrics], variants).reshape(metrics, variants, -1),
np.repeat([m.name for m in self.metrics], variants).reshape(metrics, variants, -1),
np.tile(goals["exp_variant_id"].unique(), metrics).reshape(metrics, variants, -1),
np.repeat([m.id for m in self.metrics], n_variants).reshape(metrics, n_variants, -1),
np.repeat([m.name for m in self.metrics], n_variants).reshape(metrics, n_variants, -1),
np.tile(goals["exp_variant_id"].unique(), metrics).reshape(metrics, n_variants, -1),
stats,
)
)
@@ -702,9 +781,10 @@ def _evaluate_metrics(self, goals: pd.DataFrame, column_fce) -> pd.DataFrame:
c = Statistics.ttest_evaluation(stats, self.control_variant)

# multiple variants (comparisons) correction - applied when we have multiple treatment variants
if variants > 2:
c = Statistics.multiple_comparisons_correction(c, variants, metrics, confidence_level)
if n_variants > 2:
c = Statistics.multiple_comparisons_correction(c, n_variants, metrics, confidence_level)

c["exp_id"] = self.id
c["timestamp"] = round(get_utc_timestamp(datetime.now()).timestamp())
c[["minimum_effect", "sample_size", "required_sample_size"]] = self._get_required_sample_sizes(c, n_variants)
return c[Evaluation.metric_columns()]
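
For orientation, a hedged sketch of how the new columns surface when evaluating an experiment; it assumes an `Experiment` instance `experiment` and a pre-aggregated `goals` frame are already in scope:

```python
# Sketch only: `experiment` and `goals` are assumed to exist already.
evaluation = experiment.evaluate_agg(goals)

# Non-control variant rows now carry the minimum effect they were evaluated
# against, the current sample size, and the sample size required to reach
# the default power.
print(
    evaluation.metrics[
        ["metric_id", "exp_variant_id", "minimum_effect", "sample_size", "required_sample_size"]
    ]
)
```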
7 changes: 5 additions & 2 deletions src/epstats/toolkit/metric.py
@@ -1,4 +1,4 @@
from typing import Set
from typing import Set, Optional
import pandas as pd
import numpy as np

@@ -18,6 +18,7 @@ def __init__(
denominator: str,
metric_format: str = "{:.2%}",
metric_value_multiplier: int = 1,
minimum_effect: Optional[float] = None,
):
"""
Constructor of the general metric definition.
@@ -55,6 +56,7 @@ def __init__(
self._goals = self._parser.get_goals()
self.metric_format = metric_format
self.metric_value_multiplier = metric_value_multiplier
self.minimum_effect = minimum_effect

def get_goals(self) -> Set:
"""
@@ -111,6 +113,7 @@ def __init__(
unit_type: str = "test_unit_type",
metric_format: str = "{:.2%}",
metric_value_multiplier: int = 1,
minimum_effect: Optional[float] = None,
):
"""
Constructor of the simplified metric definition.
@@ -145,4 +148,4 @@ def __init__(
num = "value" + "(" + unit_type + "." + agg_type + "." + numerator + ")"
den = "value" + "(" + unit_type + "." + agg_type + "." + denominator + ")"

super().__init__(id, name, num, den, metric_format, metric_value_multiplier)
super().__init__(id, name, num, den, metric_format, metric_value_multiplier, minimum_effect)
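
To tie the pieces together, a hedged example of a toolkit metric created with the new optional parameter. The integer id and the goal expressions are illustrative only, following the `unit_type.agg_type.goal` form from the request docs above:

```python
from epstats.toolkit.metric import Metric

# A conversion metric that we want powered to detect at least a 5% relative change.
conversion = Metric(
    id=1,
    name="Conversion Rate",
    nominator="count(test_unit_type.unit.conversion)",
    denominator="count(test_unit_type.global.exposure)",
    minimum_effect=0.05,
)
```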