Skip to content

Commit

Permalink
sample size calculation handles nans/infs (#42)
Browse files Browse the repository at this point in the history
* sample size calculation handles nans/infs

* nice assert
  • Loading branch information
jancervenka authored Oct 4, 2022
1 parent c523f98 commit b12c5b8
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 10 deletions.
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = ep-stats
version = 1.5.0
version = 1.5.1
description = Statistical package to evaluate ab tests in experimentation platform.
long_description = file: README.md
long_description_content_type = text/markdown
Expand Down
2 changes: 1 addition & 1 deletion src/epstats/server/res.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@ class SampleSizeCalculationResult(BaseModel):
Result of the sample size calculation.
"""

sample_size_per_variant: int = Field(
sample_size_per_variant: float = Field(
title="Sample size per variant",
)

Expand Down
13 changes: 7 additions & 6 deletions src/epstats/toolkit/statistics.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pandas as pd
import numpy as np
import scipy.stats as st
from typing import Optional
from typing import Optional, Union
from statsmodels.stats.multitest import multipletests
import warnings

Expand Down Expand Up @@ -247,7 +247,7 @@ def required_sample_size_per_variant(
std_2: Optional[float] = None,
confidence_level: float = DEFAULT_CONFIDENCE_LEVEL,
power: float = DEFAULT_POWER,
) -> int:
) -> Union[int, float]:
"""
Computes the sample size required to reach the defined `confidence_level` and `power`.
Expand Down Expand Up @@ -301,15 +301,16 @@ def required_sample_size_per_variant(
raise ValueError("There must be at least two variants.")

two_vars = 2 * (std ** 2) if std_2 is None else (std ** 2 + std_2 ** 2)
delta = mean * minimum_effect
delta = np.float64(mean * minimum_effect)

alpha = 1 - confidence_level
m = n_variants - 1
alpha = alpha / m # Bonferroni correction
# 7.84 for 80% power and 95% confidence, alpha / 2 for two-sided hypothesis
confidence_and_power = (st.norm.ppf(1 - alpha / 2) + st.norm.ppf(power)) ** 2
samples_size_per_variant = confidence_and_power * (two_vars / delta ** 2)
return round(samples_size_per_variant)
with np.errstate(divide="ignore", invalid="ignore"):
samples_size_per_variant = confidence_and_power * (two_vars / delta ** 2)
return np.round(samples_size_per_variant)

@classmethod
def required_sample_size_per_variant_bernoulli(
Expand All @@ -320,7 +321,7 @@ def required_sample_size_per_variant_bernoulli(
confidence_level: float = DEFAULT_CONFIDENCE_LEVEL,
power: float = DEFAULT_POWER,
**unused_kwargs,
) -> int:
) -> Union[int, float]:
"""
Computes the sample size required to reach the defined `confidence_level`
and `power` when the data follow Bernoulli distribution
Expand Down
14 changes: 12 additions & 2 deletions tests/epstats/server/test_api_sample_size_calculation.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pytest
from math import isnan
from fastapi.testclient import TestClient

from src.epstats.main import api
Expand All @@ -14,7 +15,13 @@

@pytest.mark.parametrize(
"n_variants, minimum_effect, mean, std, expected",
[(2, 0.10, 0.2, 1.2, 56512), (2, 0.05, 0.4, None, 9489), (3, 0.05, 0.4, None, 11492)],
[
(2, 0.10, 0.2, 1.2, 56512),
(2, 0.05, 0.4, None, 9489),
(3, 0.05, 0.4, None, 11492),
(2, 0.1, 0, 0, float("nan")),
(2, 0.1, 0, 1, float("inf")),
],
)
def test_sample_size_calculation(n_variants, minimum_effect, mean, std, expected):
json_blob = {
Expand All @@ -26,7 +33,10 @@ def test_sample_size_calculation(n_variants, minimum_effect, mean, std, expected

resp = client.post("/sample-size-calculation", json=json_blob)
assert resp.status_code == 200
assert resp.json()["sample_size_per_variant"] == expected

sample_size = resp.json()["sample_size_per_variant"]

assert sample_size == expected or (isnan(expected) and isnan(sample_size))


@pytest.mark.parametrize(
Expand Down
21 changes: 21 additions & 0 deletions tests/epstats/toolkit/test_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,3 +168,24 @@ def test_required_sample_size_per_variant_raises_exception(n_variants, minimum_e

with pytest.raises(ValueError):
f(**args)


@pytest.mark.parametrize(
    ("minimum_effect", "mean", "std", "expected"),
    [
        (0.1, 0, 0, np.isnan),
        (0.1, np.nan, np.nan, np.isnan),
        (0.1, 0, np.nan, np.isnan),
        (0.1, 0, 1, np.isinf),
        (np.nan, np.nan, np.nan, np.isnan),
    ],
)
def test_required_sample_size_per_variant_not_valid(minimum_effect, mean, std, expected):
    """
    Checks the sample size returned for degenerate inputs (zero or NaN mean / std / effect).

    `expected` is not a value but a predicate (`np.isnan` or `np.isinf`) which
    the computed sample size has to satisfy; the call itself must not raise.
    """
    sample_size = Statistics.required_sample_size_per_variant(
        n_variants=2,
        minimum_effect=minimum_effect,
        mean=mean,
        std=std,
    )

    assert expected(sample_size)

0 comments on commit b12c5b8

Please sign in to comment.