From 763812b11a5f5cb91f3730a84c7c053ebcf5986a Mon Sep 17 00:00:00 2001 From: Evgeny Ivanov Date: Wed, 8 Jan 2025 22:00:38 +0300 Subject: [PATCH 1/4] Update docs --- docs/data-backends.md | 10 ++- docs/multiple-testing.md | 8 ++ docs/user-guide.md | 8 ++ mkdocs.yml | 2 + src/tea_tasting/aggr.py | 8 +- src/tea_tasting/config.py | 36 +++++---- src/tea_tasting/datasets.py | 18 ++--- src/tea_tasting/metrics/base.py | 8 -- src/tea_tasting/metrics/mean.py | 32 ++++---- src/tea_tasting/metrics/proportion.py | 14 ++-- src/tea_tasting/metrics/resampling.py | 55 +++++++------- src/tea_tasting/multiplicity.py | 30 ++++---- src/tea_tasting/utils.py | 101 ++++++++++++++------------ 13 files changed, 174 insertions(+), 156 deletions(-) diff --git a/docs/data-backends.md b/docs/data-backends.md index efcbf31..0798f02 100644 --- a/docs/data-backends.md +++ b/docs/data-backends.md @@ -17,11 +17,13 @@ This guide: ## Demo database -This guide uses [DuckDB](https://github.com/duckdb/duckdb), an in-process analytical database, and [Polars](https://github.com/pola-rs/polars) as example data backends. To be able to reproduce the example code, install **tea-tasting**, Ibis with DuckDB extra, and Polars: +???+ note -```bash -pip install tea-tasting ibis-framework[duckdb] polars -``` + This guide uses [DuckDB](https://github.com/duckdb/duckdb), an in-process analytical database, and [Polars](https://github.com/pola-rs/polars) as example data backends. To be able to reproduce the example code, install Ibis with DuckDB extra and Polars in addition to **tea-tasting**: + + ```bash + pip install ibis-framework[duckdb] polars + ``` First, let's prepare a demo database: diff --git a/docs/multiple-testing.md b/docs/multiple-testing.md index 91bc6b6..8de5a69 100644 --- a/docs/multiple-testing.md +++ b/docs/multiple-testing.md @@ -2,6 +2,14 @@ ## Multiple hypothesis testing problem +???+ note + + This guide uses [Polars](https://github.com/pola-rs/polars) as an example data backend. 
To be able to reproduce the example code, install Polars in addition to **tea-tasting**: + + ```bash + pip install polars + ``` + The [multiple hypothesis testing problem](https://en.wikipedia.org/wiki/Multiple_comparisons_problem) arises when there is more than one success metric or more than one treatment variant in an A/B test. **tea-tasting** provides the following methods for multiple testing correction: diff --git a/docs/user-guide.md b/docs/user-guide.md index 3505d87..368ff24 100644 --- a/docs/user-guide.md +++ b/docs/user-guide.md @@ -382,6 +382,14 @@ Mean(value='orders', covariate=None, alternative='two-sided', confidence_level=0 ### More than two variants +???+ note + + This guide uses [Polars](https://github.com/pola-rs/polars) as an example data backend. To be able to reproduce the example code, install Polars in addition to **tea-tasting**: + + ```bash + pip install polars + ``` + In **tea-tasting**, it's possible to analyze experiments with more than two variants. However, the variants will be compared in pairs through two-sample statistical tests. Example usage: diff --git a/mkdocs.yml b/mkdocs.yml index f3eb05c..2e7adb8 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -79,6 +79,8 @@ plugins: markdown_extensions: - _strip_doctest_artifacts + - admonition + - pymdownx.details - pymdownx.superfences - toc: permalink: "#" diff --git a/src/tea_tasting/aggr.py b/src/tea_tasting/aggr.py index 0092352..9ab0921 100644 --- a/src/tea_tasting/aggr.py +++ b/src/tea_tasting/aggr.py @@ -56,8 +56,9 @@ def with_zero_div(self) -> Aggregates: """Return aggregates that do not raise an error on division by zero. Division by zero returns: - - `inf` if numerator is greater than `0`, - - `nan` if numerator is equal to or less than `0`. + + - `inf` if numerator is greater than `0`, + - `nan` if numerator is equal to or less than `0`. 
""" return Aggregates( count_=None if self.count_ is None else tea_tasting.utils.Int(self.count_), @@ -69,9 +70,6 @@ def with_zero_div(self) -> Aggregates: def count(self) -> int: """Sample size (number of observations). - Raises: - RuntimeError: Count is `None` (if it was not defined during initialization). - Returns: Sample size (number of observations). """ diff --git a/src/tea_tasting/config.py b/src/tea_tasting/config.py index cf33fd7..062740e 100644 --- a/src/tea_tasting/config.py +++ b/src/tea_tasting/config.py @@ -69,7 +69,16 @@ def set_config( Args: alpha: Significance level. Default is 0.05. - alternative: Alternative hypothesis. Default is `"two-sided"`. + alternative: Alternative hypothesis: + + - `"two-sided"`: the means are unequal, + - `"greater"`: the mean in the treatment variant is greater than the mean + in the control variant, + - `"less"`: the mean in the treatment variant is less than the mean + in the control variant. + + Default is `"two-sided"`. + confidence_level: Confidence level for the confidence interval. Default is `0.95`. equal_var: Defines whether equal variance is assumed. If `True`, @@ -86,13 +95,6 @@ def set_config( the Normal distribution (`False`) by default. Default is `True`. **kwargs: User-defined global parameters. - Alternative hypothesis options: - - `"two-sided"`: the means are unequal, - - `"greater"`: the mean in the treatment variant is greater than the mean - in the control variant, - - `"less"`: the mean in the treatment variant is less than the mean - in the control variant. - Examples: ```pycon >>> import tea_tasting as tt @@ -138,7 +140,16 @@ def config_context( Args: alpha: Significance level. Default is 0.05. - alternative: Alternative hypothesis. Default is `"two-sided"`. 
+ alternative: Alternative hypothesis: + + - `"two-sided"`: the means are unequal, + - `"greater"`: the mean in the treatment variant is greater than the mean + in the control variant, + - `"less"`: the mean in the treatment variant is less than the mean + in the control variant. + + Default is `"two-sided"`. + confidence_level: Confidence level for the confidence interval. Default is `0.95`. equal_var: Defines whether equal variance is assumed. If `True`, @@ -155,13 +166,6 @@ def config_context( the Normal distribution (`False`) by default. Default is `True`. **kwargs: User-defined global parameters. - Alternative hypothesis options: - - `"two-sided"`: the means are unequal, - - `"greater"`: the mean in the treatment variant is greater than the mean - in the control variant, - - `"less"`: the mean in the treatment variant is less than the mean - in the control variant. - Examples: ```pycon >>> import tea_tasting as tt diff --git a/src/tea_tasting/datasets.py b/src/tea_tasting/datasets.py index f6d8874..c6ccfc0 100644 --- a/src/tea_tasting/datasets.py +++ b/src/tea_tasting/datasets.py @@ -125,12 +125,11 @@ def make_users_data( avg_orders_per_session: Average number of orders per session. Should be less than `1`. avg_revenue_per_order: Average revenue per order. - return_type: Result type. + return_type: Return type: - Result types: - - `"arrow"`: PyArrow Table. - - `"pandas"`: Pandas DataFrame. - - `"polars"`: Polars DataFrame. + - `"arrow"`: PyArrow Table. + - `"pandas"`: Pandas DataFrame. + - `"polars"`: Polars DataFrame. Returns: Simulated data for A/B testing scenarios. @@ -345,12 +344,11 @@ def make_sessions_data( avg_orders_per_session: Average number of orders per session. Should be less than `1`. avg_revenue_per_order: Average revenue per order. - return_type: Result type. + return_type: Return type: - Result types: - - `"arrow"`: PyArrow Table. - - `"pandas"`: Pandas DataFrame. - - `"polars"`: Polars DataFrame. + - `"arrow"`: PyArrow Table. 
+ - `"pandas"`: Pandas DataFrame. + - `"polars"`: Polars DataFrame. Returns: Simulated data for A/B testing scenarios. diff --git a/src/tea_tasting/metrics/base.py b/src/tea_tasting/metrics/base.py index effd997..8b20d56 100644 --- a/src/tea_tasting/metrics/base.py +++ b/src/tea_tasting/metrics/base.py @@ -269,11 +269,6 @@ def aggregate_by_variants( aggr_cols: Columns to be aggregated. variant: Variant column name. - Raises: - ValueError: The variant parameter is required but was not provided. - TypeError: data is not an instance of DataFrame, Table, - or a dictionary of Aggregates. - Returns: Experimental data as a dictionary of Aggregates. """ @@ -382,9 +377,6 @@ def read_granular( cols: Columns to read. variant: Variant column name. - Raises: - ValueError: The variant parameter is required but was not provided. - Returns: Experimental data as a dictionary of PyArrow Tables. """ diff --git a/src/tea_tasting/metrics/mean.py b/src/tea_tasting/metrics/mean.py index 4368e3b..2659932 100644 --- a/src/tea_tasting/metrics/mean.py +++ b/src/tea_tasting/metrics/mean.py @@ -111,7 +111,14 @@ def __init__( # noqa: PLR0913 denom: Denominator column name. numer_covariate: Covariate numerator column name. denom_covariate: Covariate denominator column name. - alternative: Alternative hypothesis. + alternative: Alternative hypothesis: + + - `"two-sided"`: the means are unequal, + - `"greater"`: the mean in the treatment variant is greater than the mean + in the control variant, + - `"less"`: the mean in the treatment variant is less than the mean + in the control variant. + confidence_level: Confidence level for the confidence interval. equal_var: Defines whether equal variance is assumed. If `True`, pooled variance is used for the calculation of the standard error @@ -129,13 +136,6 @@ def __init__( # noqa: PLR0913 n_obs: Number of observations in the control and in the treatment together. Only for the analysis of power. 
- Alternative hypothesis options: - - `"two-sided"`: the means are unequal, - - `"greater"`: the mean in the treatment variant is greater than the mean - in the control variant, - - `"less"`: the mean in the treatment variant is less than the mean - in the control variant. - Parameter defaults: Defaults for parameters `alpha`, `alternative`, `confidence_level`, `equal_var`, `n_obs`, `power`, `ratio`, and `use_t` can be changed @@ -755,7 +755,14 @@ def __init__( # noqa: PLR0913 Args: value: Metric value column name. covariate: Metric covariate column name. - alternative: Alternative hypothesis. + alternative: Alternative hypothesis: + + - `"two-sided"`: the means are unequal, + - `"greater"`: the mean in the treatment variant is greater than the mean + in the control variant, + - `"less"`: the mean in the treatment variant is less than the mean + in the control variant. + confidence_level: Confidence level for the confidence interval. equal_var: Defines whether equal variance is assumed. If `True`, pooled variance is used for the calculation of the standard error @@ -773,13 +780,6 @@ def __init__( # noqa: PLR0913 n_obs: Number of observations in the control and in the treatment together. Only for the analysis of power. - Alternative hypothesis options: - - `"two-sided"`: the means are unequal, - - `"greater"`: the mean in the treatment variant is greater than the mean - in the control variant, - - `"less"`: the mean in the treatment variant is less than the mean - in the control variant. 
- Parameter defaults: Defaults for parameters `alpha`, `alternative`, `confidence_level`, `equal_var`, `n_obs`, `power`, `ratio`, and `use_t` can be changed diff --git a/src/tea_tasting/metrics/proportion.py b/src/tea_tasting/metrics/proportion.py index 5524aab..5ebf437 100644 --- a/src/tea_tasting/metrics/proportion.py +++ b/src/tea_tasting/metrics/proportion.py @@ -50,16 +50,16 @@ def __init__( Args: ratio: Expected ratio of the number of observations in the treatment relative to the control. - method: Statistical test used for calculation of p-value. + method: Statistical test used for calculation of p-value: + + - `"auto"`: Apply exact binomial test if the total number + of observations is < 1000; or normal approximation otherwise. + - `"binom"`: Apply exact binomial test. + - `"norm"`: Apply normal approximation of the binomial distribution. + correction: If `True`, add continuity correction. Only for normal approximation. - Method options: - - `"auto"`: Apply exact binomial test if the total number of observations - is < 1000; or normal approximation otherwise. - - `"binom"`: Apply exact binomial test. - - `"norm"`: Apply normal approximation of the binomial distribution. - Examples: ```pycon >>> import tea_tasting as tt diff --git a/src/tea_tasting/metrics/resampling.py b/src/tea_tasting/metrics/resampling.py index e0f0d7c..461b415 100644 --- a/src/tea_tasting/metrics/resampling.py +++ b/src/tea_tasting/metrics/resampling.py @@ -64,6 +64,10 @@ def __init__( ) -> None: """Metric for analysis of a statistic using bootstrap resampling. + If `columns` is a sequence of strings, then the sample passed + to the statistic callable contains an extra dimension in the first axis. + See examples below. + Args: columns: Names of the columns to be used in the analysis. statistic: Statistic. It must be a vectorized callable @@ -71,7 +75,14 @@ def __init__( the resulting statistic. 
It must also accept a keyword argument `axis` and be vectorized to compute the statistic along the provided axis. - alternative: Alternative hypothesis. + alternative: Alternative hypothesis: + + - `"two-sided"`: the means are unequal, + - `"greater"`: the mean in the treatment variant is greater than the mean + in the control variant, + - `"less"`: the mean in the treatment variant is less than the mean + in the control variant. + confidence_level: Confidence level for the confidence interval. n_resamples: The number of resamples performed to form the bootstrap distribution of the statistic. @@ -86,18 +97,6 @@ def __init__( random_state: Pseudorandom number generator state used to generate resamples. - Multiple columns: - If `columns` is a sequence of strings, then the sample passed - to the statistic callable contains an extra dimension in the first axis. - See examples below. - - Alternative hypothesis options: - - `"two-sided"`: the means are unequal, - - `"greater"`: the mean in the treatment variant is greater than the mean - in the control variant, - - `"less"`: the mean in the treatment variant is less than the mean - in the control variant. - Parameter defaults: Defaults for parameters `alternative`, `confidence_level`, and `n_resamples` can be changed using the @@ -284,7 +283,14 @@ def __init__( Args: column: Name of the column for the quantiles to compute. q: Probability for the quantiles to compute. - alternative: Alternative hypothesis. + alternative: Alternative hypothesis: + + - `"two-sided"`: the means are unequal, + - `"greater"`: the mean in the treatment variant is greater than the mean + in the control variant, + - `"less"`: the mean in the treatment variant is less than the mean + in the control variant. + confidence_level: Confidence level for the confidence interval. n_resamples: The number of resamples performed to form the bootstrap distribution of the statistic. 
@@ -292,6 +298,13 @@ def __init__( interval (`"percentile"`), the "basic" (AKA "reverse") bootstrap confidence interval (`"basic"`), or the bias-corrected and accelerated bootstrap confidence interval (`"bca"`). + + Default method is "basic" which is different from default + method "bca" in `Bootstrap`. The "bca" confidence intervals cannot + be calculated when the bootstrap distribution is degenerate + (e.g. all elements are identical). This is often the case for the + quantile metrics. + batch: The number of resamples to process in each vectorized call to statistic. Memory usage is O(`batch * n`), where `n` is the sample size. Default is `None`, in which case `batch = n_resamples` @@ -299,13 +312,6 @@ def __init__( random_state: Pseudorandom number generator state used to generate resamples. - Alternative hypothesis options: - - `"two-sided"`: the means are unequal, - - `"greater"`: the mean in the treatment variant is greater than the mean - in the control variant, - - `"less"`: the mean in the treatment variant is less than the mean - in the control variant. - Parameter defaults: Defaults for parameters `alternative`, `confidence_level`, and `n_resamples` can be changed using the @@ -313,13 +319,6 @@ def __init__( See the [Global configuration](https://tea-tasting.e10v.me/api/config/) reference for details. - Default method: - Default method is "basic" which is different from default - method "bca" in `Bootstrap`. The "bca" confidence intervals cannot - be calculated when the bootstrap distribution is degenerate - (e.g. all elements are identical). This is often the case for the - quantile metrics. - Examples: ```pycon >>> import tea_tasting as tt diff --git a/src/tea_tasting/multiplicity.py b/src/tea_tasting/multiplicity.py index 06322cd..1beeb4e 100644 --- a/src/tea_tasting/multiplicity.py +++ b/src/tea_tasting/multiplicity.py @@ -65,12 +65,13 @@ def adjust_fdr( hypotheses (`arbitrary_dependence=False`). 
     The function adds the following attributes to the results:
-    - `pvalue_adj`: The adjusted p-value, which should be compared with
-      the unadjusted FDR (`alpha`).
-    - `alpha_adj`: The adjusted FDR, which should be compared with the unadjusted
-      p-value (`pvalue`).
-    - `null_rejected`: A binary indicator (`0` or `1`) that shows whether
-      the null hypothesis is rejected.
+
+    - `pvalue_adj`: The adjusted p-value, which should be compared with
+      the unadjusted FDR (`alpha`).
+    - `alpha_adj`: The adjusted FDR, which should be compared with the unadjusted
+      p-value (`pvalue`).
+    - `null_rejected`: A binary indicator (`0` or `1`) that shows whether
+      the null hypothesis is rejected.
 
     Args:
         experiment_results: Experiment results.
@@ -86,7 +87,7 @@
         The experiments results with adjusted p-values and alphas.
 
     Parameter defaults:
-        Default for parameters `alpha` can be changed using the `config_context`
+        Default for parameter `alpha` can be changed using the `config_context`
         and `set_context` functions.
         See the [Global configuration](https://tea-tasting.e10v.me/api/config/)
         reference for details.
@@ -241,12 +242,13 @@ def adjust_fwer(
     hypotheses (`arbitrary_dependence=False`).
 
     The function adds the following attributes to the results:
-    - `pvalue_adj`: The adjusted p-value, which should be compared with
-      the unadjusted FDR (`alpha`).
-    - `alpha_adj`: The adjusted FWER, which should be compared with the unadjusted
-      p-value (`pvalue`).
-    - `null_rejected`: A binary indicator (`0` or `1`) that shows whether
-      the null hypothesis is rejected.
+
+    - `pvalue_adj`: The adjusted p-value, which should be compared with
+      the unadjusted FWER (`alpha`).
+    - `alpha_adj`: The adjusted FWER, which should be compared with the unadjusted
+      p-value (`pvalue`).
+    - `null_rejected`: A binary indicator (`0` or `1`) that shows whether
+      the null hypothesis is rejected.
 
     Args:
         experiment_results: Experiment results.
@@ -263,7 +265,7 @@
         The experiments results with adjusted p-values and alphas.
 
     Parameter defaults:
-        Default for parameters `alpha` can be changed using the `config_context`
+        Default for parameter `alpha` can be changed using the `config_context`
         and `set_context` functions.
         See the [Global configuration](https://tea-tasting.e10v.me/api/config/)
         reference for details.
diff --git a/src/tea_tasting/utils.py b/src/tea_tasting/utils.py
index 5e8c800..44c4d3c 100644
--- a/src/tea_tasting/utils.py
+++ b/src/tea_tasting/utils.py
@@ -185,22 +185,23 @@ def format_num(
 def get_and_format_num(data: dict[str, Any], key: str) -> str:
     """Get and format dictionary value.
 
+    Formatting rules:
+
+    - If a name starts with `"rel_"` or equals to `"power"` consider it
+      a percentage value. Round percentage values to 2 significant digits,
+      multiply by `100` and add `"%"`.
+    - Round other values to 3 significant digits.
+    - If value is less than `0.001`, format it in exponential presentation.
+    - If a name ends with `"_ci"`, consider it a confidence interval.
+      Look up for attributes `"{name}_lower"` and `"{name}_upper"`,
+      and format the interval as `"[{lower_bound}, {upper_bound}]"`.
+
     Args:
         data: Dictionary.
         key: Key.
 
     Returns:
         Formatted value.
-
-    Formatting rules:
-        - If a name starts with `"rel_"` or equals to `"power"` consider it
-          a percentage value. Round percentage values to 2 significant digits,
-          multiply by `100` and add `"%"`.
-        - Round other values to 3 significant values.
-        - If value is less than `0.001`, format it in exponential presentation.
-        - If a name ends with `"_ci"`, consider it a confidence interval.
-          Look up for attributes `"{name}_lower"` and `"{name}_upper"`,
-          and format the interval as `"[{lower_bound}, {lower_bound}]"`.
     """
     if key.endswith("_ci"):
         ci_lower = get_and_format_num(data, key + "_lower")
@@ -219,14 +220,15 @@ class DictsReprMixin(abc.ABC):
     """Representation and conversion of a sequence of dictionaries.
     Default formatting rules:
-        - If a name starts with `"rel_"` or equals to `"power"` consider it
-          a percentage value. Round percentage values to 2 significant digits,
-          multiply by `100` and add `"%"`.
-        - Round other values to 3 significant values.
-        - If value is less than `0.001`, format it in exponential presentation.
-        - If a name ends with `"_ci"`, consider it a confidence interval.
-          Look up for attributes `"{name}_lower"` and `"{name}_upper"`,
-          and format the interval as `"[{lower_bound}, {lower_bound}]"`.
+
+    - If a name starts with `"rel_"` or equals to `"power"` consider it
+      a percentage value. Round percentage values to 2 significant digits,
+      multiply by `100` and add `"%"`.
+    - Round other values to 3 significant digits.
+    - If value is less than `0.001`, format it in exponential presentation.
+    - If a name ends with `"_ci"`, consider it a confidence interval.
+      Look up for attributes `"{name}_lower"` and `"{name}_upper"`,
+      and format the interval as `"[{lower_bound}, {upper_bound}]"`.
     """
 
     default_keys: Sequence[str]
@@ -255,6 +257,17 @@ def to_pretty_dicts(
     ) -> list[dict[str, str]]:
         """Convert the object to a list of dictionaries with formatted values.
 
+        Default formatting rules:
+
+        - If a name starts with `"rel_"` or equals to `"power"` consider it
+          a percentage value. Round percentage values to 2 significant digits,
+          multiply by `100` and add `"%"`.
+        - Round other values to 3 significant digits.
+        - If value is less than `0.001`, format it in exponential presentation.
+        - If a name ends with `"_ci"`, consider it a confidence interval.
+          Look up for attributes `"{name}_lower"` and `"{name}_upper"`,
+          and format the interval as `"[{lower_bound}, {upper_bound}]"`.
+
         Args:
             keys: Keys to convert. If a key is not defined in the dictionary
                 it's assumed to be `None`.
@@ -264,16 +277,6 @@
         Returns:
             List of dictionaries with formatted values.
-
-        Default formatting rules:
-            - If a name starts with `"rel_"` or equals to `"power"` consider it
-              a percentage value. Round percentage values to 2 significant digits,
-              multiply by `100` and add `"%"`.
-            - Round other values to 3 significant values.
-            - If value is less than `0.001`, format it in exponential presentation.
-            - If a name ends with `"_ci"`, consider it a confidence interval.
-              Look up for attributes `"{name}_lower"` and `"{name}_upper"`,
-              and format the interval as `"[{lower_bound}, {lower_bound}]"`.
         """
         if keys is None:
             keys = self.default_keys
@@ -286,6 +289,17 @@ def to_string(
     ) -> str:
         """Convert the object to a string.
 
+        Default formatting rules:
+
+        - If a name starts with `"rel_"` or equals to `"power"` consider it
+          a percentage value. Round percentage values to 2 significant digits,
+          multiply by `100` and add `"%"`.
+        - Round other values to 3 significant digits.
+        - If value is less than `0.001`, format it in exponential presentation.
+        - If a name ends with `"_ci"`, consider it a confidence interval.
+          Look up for attributes `"{name}_lower"` and `"{name}_upper"`,
+          and format the interval as `"[{lower_bound}, {upper_bound}]"`.
+
         Args:
             keys: Keys to convert. If a key is not defined in the dictionary
                 it's assumed to be `None`.
@@ -295,16 +309,6 @@
         Returns:
             A table with results rendered as string.
-
-        Default formatting rules:
-            - If a name starts with `"rel_"` or equals to `"power"` consider it
-              a percentage value. Round percentage values to 2 significant digits,
-              multiply by `100` and add `"%"`.
-            - Round other values to 3 significant values.
-            - If value is less than `0.001`, format it in exponential presentation.
-            - If a name ends with `"_ci"`, consider it a confidence interval.
-              Look up for attributes `"{name}_lower"` and `"{name}_upper"`,
-              and format the interval as `"[{lower_bound}, {lower_bound}]"`.
""" if keys is None: keys = self.default_keys @@ -336,6 +340,17 @@ def to_html( ) -> str: """Convert the object to HTML. + Default formatting rules: + + - If a name starts with `"rel_"` or equals to `"power"` consider it + a percentage value. Round percentage values to 2 significant digits, + multiply by `100` and add `"%"`. + - Round other values to 3 significant values. + - If value is less than `0.001`, format it in exponential presentation. + - If a name ends with `"_ci"`, consider it a confidence interval. + Look up for attributes `"{name}_lower"` and `"{name}_upper"`, + and format the interval as `"[{lower_bound}, {lower_bound}]"`. + Args: keys: Keys to convert. If a key is not defined in the dictionary it's assumed to be `None`. @@ -347,16 +362,6 @@ def to_html( Returns: A table with results rendered as HTML. - - Default formatting rules: - - If a name starts with `"rel_"` or equals to `"power"` consider it - a percentage value. Round percentage values to 2 significant digits, - multiply by `100` and add `"%"`. - - Round other values to 3 significant values. - - If value is less than `0.001`, format it in exponential presentation. - - If a name ends with `"_ci"`, consider it a confidence interval. - Look up for attributes `"{name}_lower"` and `"{name}_upper"`, - and format the interval as `"[{lower_bound}, {lower_bound}]"`. 
""" if keys is None: keys = self.default_keys From b8fb47402dbaeefa62bedebefebfde88ad95d19d Mon Sep 17 00:00:00 2001 From: Evgeny Ivanov Date: Wed, 8 Jan 2025 22:27:11 +0300 Subject: [PATCH 2/4] Update docs --- .markdownlint.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.markdownlint.yaml b/.markdownlint.yaml index f0ceca8..3433dfe 100644 --- a/.markdownlint.yaml +++ b/.markdownlint.yaml @@ -2,3 +2,4 @@ MD007: indent: 4 MD013: false +MD046: false From aec157a475b54148d3baa9af5268587e03c9c9e9 Mon Sep 17 00:00:00 2001 From: Evgeny Ivanov Date: Wed, 8 Jan 2025 22:28:51 +0300 Subject: [PATCH 3/4] Remove unnecessary whitespace in examples --- docs/custom-metrics.md | 6 ------ docs/data-backends.md | 1 - docs/index.md | 3 --- docs/multiple-testing.md | 4 ---- docs/power-analysis.md | 3 --- docs/user-guide.md | 18 +----------------- src/tea_tasting/config.py | 9 +-------- src/tea_tasting/datasets.py | 2 -- src/tea_tasting/experiment.py | 8 -------- src/tea_tasting/metrics/mean.py | 14 ++++---------- src/tea_tasting/metrics/proportion.py | 6 ------ src/tea_tasting/metrics/resampling.py | 5 ----- src/tea_tasting/multiplicity.py | 2 -- 13 files changed, 6 insertions(+), 75 deletions(-) diff --git a/docs/custom-metrics.md b/docs/custom-metrics.md index 2328ed1..94b6b4d 100644 --- a/docs/custom-metrics.md +++ b/docs/custom-metrics.md @@ -15,7 +15,6 @@ First, let's import all the required modules and prepare the data: ```pycon >>> from typing import Literal, NamedTuple - >>> import numpy as np >>> import pyarrow as pa >>> import pyarrow.compute as pc @@ -26,7 +25,6 @@ First, let's import all the required modules and prepare the data: >>> import tea_tasting.metrics >>> import tea_tasting.utils - >>> data = tt.make_users_data(seed=42) >>> data = data.append_column( ... "has_order", @@ -92,14 +90,12 @@ Let's define the metric and discuss each method in details: ... self.correction = tea_tasting.utils.auto_check(correction, "correction") ... 
self.method = tea_tasting.utils.check_scalar( ... method, "method", typ=str, in_={"g-test", "pearson"}) -... ... @property ... def aggr_cols(self) -> tea_tasting.metrics.AggrCols: ... return tea_tasting.metrics.AggrCols( ... has_count=True, ... mean_cols=(self.column,), ... ) -... ... def analyze_aggregates( ... self, ... control: tea_tasting.aggr.Aggregates, @@ -185,11 +181,9 @@ Metric should have the following methods and properties defined: ... if alternative is not None ... else tea_tasting.config.get_config("alternative") ... ) -... ... @property ... def cols(self) -> tuple[str]: ... return (self.column,) -... ... def analyze_granular( ... self, ... control: pa.Table, diff --git a/docs/data-backends.md b/docs/data-backends.md index 0798f02..b0375b3 100644 --- a/docs/data-backends.md +++ b/docs/data-backends.md @@ -32,7 +32,6 @@ First, let's prepare a demo database: >>> import polars as pl >>> import tea_tasting as tt - >>> users_data = tt.make_users_data(seed=42) >>> con = ibis.duckdb.connect() >>> con.create_table("users_data", users_data) diff --git a/docs/index.md b/docs/index.md index 54c0c45..3a625eb 100644 --- a/docs/index.md +++ b/docs/index.md @@ -34,16 +34,13 @@ pip install tea-tasting ```pycon >>> import tea_tasting as tt - >>> data = tt.make_users_data(seed=42) - >>> experiment = tt.Experiment( ... sessions_per_user=tt.Mean("sessions"), ... orders_per_session=tt.RatioOfMeans("orders", "sessions"), ... orders_per_user=tt.Mean("orders"), ... revenue_per_user=tt.Mean("revenue"), ... ) - >>> result = experiment.analyze(data) >>> print(result) metric control treatment rel_effect_size rel_effect_size_ci pvalue diff --git a/docs/multiple-testing.md b/docs/multiple-testing.md index 8de5a69..b3e9456 100644 --- a/docs/multiple-testing.md +++ b/docs/multiple-testing.md @@ -27,7 +27,6 @@ As an example, consider an experiment with three variants, a control and two tre >>> import polars as pl >>> import tea_tasting as tt - >>> data = pl.concat(( ... 
tt.make_users_data( ... seed=42, @@ -75,7 +74,6 @@ Let's calculate the experiment results: ... orders_per_user=tt.Mean("orders"), ... revenue_per_user=tt.Mean("revenue"), ... ) - >>> results = experiment.analyze(data, control=0, all_variants=True) >>> print(results) variants metric control treatment rel_effect_size rel_effect_size_ci pvalue @@ -186,10 +184,8 @@ In the examples above, the methods `adjust_fdr` and `adjust_fwer` received resul ```pycon >>> data1 = tt.make_users_data(seed=42, orders_uplift=0.10, revenue_uplift=0.15) >>> data2 = tt.make_users_data(seed=21, orders_uplift=0.15, revenue_uplift=0.20) - >>> result1 = experiment.analyze(data1) >>> result2 = experiment.analyze(data2) - >>> print(tt.adjust_fdr( ... {"Experiment 1": result1, "Experiment 2": result2}, ... metrics, diff --git a/docs/power-analysis.md b/docs/power-analysis.md index 8e2b3be..e85b644 100644 --- a/docs/power-analysis.md +++ b/docs/power-analysis.md @@ -11,7 +11,6 @@ In this example, **tea-tasting** calculates statistical power given the relative ```pycon >>> import tea_tasting as tt - >>> data = tt.make_users_data( ... seed=42, ... sessions_uplift=0, @@ -19,7 +18,6 @@ In this example, **tea-tasting** calculates statistical power given the relative ... revenue_uplift=0, ... covariates=True, ... ) - >>> orders_per_session = tt.RatioOfMeans("orders", "sessions", rel_effect_size=0.1) >>> print(orders_per_session.solve_power(data, "power")) power effect_size rel_effect_size n_obs @@ -63,7 +61,6 @@ You can analyze power for all metrics in the experiment. Example: ... orders_per_user=tt.Mean("orders", "orders_covariate"), ... revenue_per_user=tt.Mean("revenue", "revenue_covariate"), ... 
) - >>> power_result = experiment.solve_power(data) >>> print(power_result) metric power effect_size rel_effect_size n_obs diff --git a/docs/user-guide.md b/docs/user-guide.md index 368ff24..26b1f41 100644 --- a/docs/user-guide.md +++ b/docs/user-guide.md @@ -15,16 +15,13 @@ Begin with this simple example to understand the basic functionality: ```pycon >>> import tea_tasting as tt - >>> data = tt.make_users_data(seed=42) - >>> experiment = tt.Experiment( ... sessions_per_user=tt.Mean("sessions"), ... orders_per_session=tt.RatioOfMeans("orders", "sessions"), ... orders_per_user=tt.Mean("orders"), ... revenue_per_user=tt.Mean("revenue"), ... ) - >>> result = experiment.analyze(data) >>> print(result) metric control treatment rel_effect_size rel_effect_size_ci pvalue @@ -230,9 +227,7 @@ Example usage: ```pycon >>> import tea_tasting as tt - >>> data = tt.make_users_data(seed=42, covariates=True) - >>> experiment = tt.Experiment( ... sessions_per_user=tt.Mean("sessions", "sessions_covariate"), ... orders_per_session=tt.RatioOfMeans( @@ -244,7 +239,6 @@ Example usage: ... orders_per_user=tt.Mean("orders", "orders_covariate"), ... revenue_per_user=tt.Mean("revenue", "revenue_covariate"), ... ) - >>> result = experiment.analyze(data) >>> print(result) metric control treatment rel_effect_size rel_effect_size_ci pvalue @@ -275,13 +269,11 @@ Example usage: ```pycon >>> import tea_tasting as tt - >>> experiment = tt.Experiment( ... orders_per_user=tt.Mean("orders"), ... revenue_per_user=tt.Mean("revenue"), ... sample_ratio=tt.SampleRatio(), ... 
) - >>> data = tt.make_users_data(seed=42) >>> result = experiment.analyze(data) >>> print(result) @@ -326,7 +318,6 @@ Use [`get_config`](api/config.md#tea_tasting.config.get_config) with the option ```pycon >>> import tea_tasting as tt - >>> print(tt.get_config("equal_var")) False @@ -343,14 +334,13 @@ Use [`set_config`](api/config.md#tea_tasting.config.set_config) to set a global ```pycon >>> tt.set_config(equal_var=True, use_t=False) - >>> experiment = tt.Experiment( ... sessions_per_user=tt.Mean("sessions"), ... orders_per_session=tt.RatioOfMeans("orders", "sessions"), ... orders_per_user=tt.Mean("orders"), ... revenue_per_user=tt.Mean("revenue"), ... ) - +>>> tt.set_config(equal_var=False, use_t=True) >>> print(experiment.metrics["orders_per_user"]) Mean(value='orders', covariate=None, alternative='two-sided', confidence_level=0.95, equal_var=True, use_t=False, alpha=0.05, ratio=1, power=0.8, effect_size=None, rel_effect_size=None, n_obs=None) @@ -359,8 +349,6 @@ Mean(value='orders', covariate=None, alternative='two-sided', confidence_level=0 Use [`config_context`](api/config.md#tea_tasting.config.config_context) to temporarily set a global option value within a context: ```pycon ->>> tt.set_config(equal_var=False, use_t=True) - >>> with tt.config_context(equal_var=True, use_t=False): ... experiment = tt.Experiment( ... sessions_per_user=tt.Mean("sessions"), @@ -368,7 +356,6 @@ Use [`config_context`](api/config.md#tea_tasting.config.config_context) to tempo ... orders_per_user=tt.Mean("orders"), ... revenue_per_user=tt.Mean("revenue"), ... ) - >>> print(tt.get_config("equal_var")) False @@ -398,21 +385,18 @@ Example usage: >>> import polars as pl >>> import tea_tasting as tt - >>> data = pl.concat(( ... tt.make_users_data(seed=42, return_type="polars"), ... tt.make_users_data(seed=21, return_type="polars") ... .filter(pl.col("variant").eq(1)) ... .with_columns(variant=pl.lit(2, pl.Int64)), ... )) - >>> experiment = tt.Experiment( ... 
sessions_per_user=tt.Mean("sessions"), ... orders_per_session=tt.RatioOfMeans("orders", "sessions"), ... orders_per_user=tt.Mean("orders"), ... revenue_per_user=tt.Mean("revenue"), ... ) - >>> results = experiment.analyze(data, control=0, all_variants=True) >>> print(results) variants metric control treatment rel_effect_size rel_effect_size_ci pvalue diff --git a/src/tea_tasting/config.py b/src/tea_tasting/config.py index 062740e..de5bd08 100644 --- a/src/tea_tasting/config.py +++ b/src/tea_tasting/config.py @@ -41,7 +41,6 @@ def get_config(option: str | None = None) -> Any: ```pycon >>> import tea_tasting as tt - >>> print(tt.get_config("equal_var")) False @@ -99,21 +98,17 @@ def set_config( ```pycon >>> import tea_tasting as tt - >>> tt.set_config(equal_var=True, use_t=False) - >>> experiment = tt.Experiment( ... sessions_per_user=tt.Mean("sessions"), ... orders_per_session=tt.RatioOfMeans("orders", "sessions"), ... orders_per_user=tt.Mean("orders"), ... revenue_per_user=tt.Mean("revenue"), ... ) - + >>> tt.set_config(equal_var=False, use_t=True) >>> print(experiment.metrics["orders_per_user"]) Mean(value='orders', covariate=None, alternative='two-sided', confidence_level=0.95, equal_var=True, use_t=False, alpha=0.05, ratio=1, power=0.8, effect_size=None, rel_effect_size=None, n_obs=None) - >>> tt.set_config(equal_var=False, use_t=True) - ``` """ # noqa: E501 params = {k: v for k, v in locals().items() if k != "kwargs"} | kwargs @@ -170,7 +165,6 @@ def config_context( ```pycon >>> import tea_tasting as tt - >>> with tt.config_context(equal_var=True, use_t=False): ... experiment = tt.Experiment( ... sessions_per_user=tt.Mean("sessions"), @@ -178,7 +172,6 @@ def config_context( ... orders_per_user=tt.Mean("orders"), ... revenue_per_user=tt.Mean("revenue"), ... 
) - >>> print(experiment.metrics["orders_per_user"]) Mean(value='orders', covariate=None, alternative='two-sided', confidence_level=0.95, equal_var=True, use_t=False, alpha=0.05, ratio=1, power=0.8, effect_size=None, rel_effect_size=None, n_obs=None) diff --git a/src/tea_tasting/datasets.py b/src/tea_tasting/datasets.py index c6ccfc0..f632bce 100644 --- a/src/tea_tasting/datasets.py +++ b/src/tea_tasting/datasets.py @@ -138,7 +138,6 @@ def make_users_data( ```pycon >>> import tea_tasting as tt - >>> data = tt.make_users_data(seed=42) >>> print(data) pyarrow.Table @@ -357,7 +356,6 @@ def make_sessions_data( ```pycon >>> import tea_tasting as tt - >>> data = tt.make_sessions_data(seed=42) >>> data pyarrow.Table diff --git a/src/tea_tasting/experiment.py b/src/tea_tasting/experiment.py index 3640d8b..bf5b087 100644 --- a/src/tea_tasting/experiment.py +++ b/src/tea_tasting/experiment.py @@ -40,15 +40,12 @@ def to_dicts(self) -> tuple[dict[str, Any], ...]: Examples: ```pycon >>> import pprint - >>> import tea_tasting as tt - >>> experiment = tt.Experiment( ... orders_per_user=tt.Mean("orders"), ... revenue_per_user=tt.Mean("revenue"), ... ) - >>> data = tt.make_users_data(seed=42) >>> result = experiment.analyze(data) >>> pprint.pprint(result.to_dicts()) @@ -140,14 +137,12 @@ def __init__( ```pycon >>> import tea_tasting as tt - >>> experiment = tt.Experiment( ... sessions_per_user=tt.Mean("sessions"), ... orders_per_session=tt.RatioOfMeans("orders", "sessions"), ... orders_per_user=tt.Mean("orders"), ... revenue_per_user=tt.Mean("revenue"), ... ) - >>> data = tt.make_users_data(seed=42) >>> result = experiment.analyze(data) >>> print(result) @@ -168,7 +163,6 @@ def __init__( ... "orders per user": tt.Mean("orders"), ... "revenue per user": tt.Mean("revenue"), ... }) - >>> data = tt.make_users_data(seed=42) >>> result = experiment.analyze(data) >>> print(result) @@ -190,7 +184,6 @@ def __init__( ... revenue_uplift=0, ... covariates=True, ... 
) - >>> with tt.config_context(n_obs=(10_000, 20_000)): ... experiment = tt.Experiment( ... sessions_per_user=tt.Mean("sessions", "sessions_covariate"), @@ -203,7 +196,6 @@ def __init__( ... orders_per_user=tt.Mean("orders", "orders_covariate"), ... revenue_per_user=tt.Mean("revenue", "revenue_covariate"), ... ) - >>> power_result = experiment.solve_power(data) >>> print(power_result) metric power effect_size rel_effect_size n_obs diff --git a/src/tea_tasting/metrics/mean.py b/src/tea_tasting/metrics/mean.py index 2659932..473c73e 100644 --- a/src/tea_tasting/metrics/mean.py +++ b/src/tea_tasting/metrics/mean.py @@ -151,11 +151,9 @@ def __init__( # noqa: PLR0913 ```pycon >>> import tea_tasting as tt - >>> experiment = tt.Experiment( ... orders_per_session=tt.RatioOfMeans("orders", "sessions"), ... ) - >>> data = tt.make_users_data(seed=42) >>> result = experiment.analyze(data) >>> print(result) @@ -175,7 +173,6 @@ def __init__( # noqa: PLR0913 ... "sessions_covariate", ... ), ... ) - >>> data = tt.make_users_data(seed=42, covariates=True) >>> result = experiment.analyze(data) >>> print(result) @@ -194,7 +191,6 @@ def __init__( # noqa: PLR0913 ... revenue_uplift=0, ... covariates=True, ... ) - >>> orders_per_session = tt.RatioOfMeans( ... "orders", ... "sessions", @@ -202,7 +198,8 @@ def __init__( # noqa: PLR0913 ... "sessions_covariate", ... n_obs=(10_000, 20_000), ... ) - >>> print(orders_per_session.solve_power(data)) # Solve for effect size. + >>> # Solve for effect size. + >>> print(orders_per_session.solve_power(data)) power effect_size rel_effect_size n_obs 80% 0.0177 6.8% 10000 80% 0.0125 4.8% 20000 @@ -795,12 +792,10 @@ def __init__( # noqa: PLR0913 ```pycon >>> import tea_tasting as tt - >>> experiment = tt.Experiment( ... orders_per_user=tt.Mean("orders"), ... revenue_per_user=tt.Mean("revenue"), ... ) - >>> data = tt.make_users_data(seed=42) >>> result = experiment.analyze(data) >>> print(result) @@ -817,7 +812,6 @@ def __init__( # noqa: PLR0913 ... 
orders_per_user=tt.Mean("orders", "orders_covariate"), ... revenue_per_user=tt.Mean("revenue", "revenue_covariate"), ... ) - >>> data = tt.make_users_data(seed=42, covariates=True) >>> result = experiment.analyze(data) >>> print(result) @@ -837,13 +831,13 @@ def __init__( # noqa: PLR0913 ... revenue_uplift=0, ... covariates=True, ... ) - >>> orders_per_user = tt.Mean( ... "orders", ... "orders_covariate", ... n_obs=(10_000, 20_000), ... ) - >>> print(orders_per_user.solve_power(data)) # Solve for effect size. + >>> # Solve for effect size. + >>> print(orders_per_user.solve_power(data)) power effect_size rel_effect_size n_obs 80% 0.0374 7.2% 10000 80% 0.0264 5.1% 20000 diff --git a/src/tea_tasting/metrics/proportion.py b/src/tea_tasting/metrics/proportion.py index 5ebf437..b572a5a 100644 --- a/src/tea_tasting/metrics/proportion.py +++ b/src/tea_tasting/metrics/proportion.py @@ -64,11 +64,9 @@ def __init__( ```pycon >>> import tea_tasting as tt - >>> experiment = tt.Experiment( ... sample_ratio=tt.SampleRatio(), ... ) - >>> data = tt.make_users_data(seed=42) >>> result = experiment.analyze(data) >>> print(result.to_string(("metric", "control", "treatment", "pvalue"))) @@ -80,13 +78,9 @@ def __init__( Different expected ratio: ```pycon - >>> import tea_tasting as tt - - >>> experiment = tt.Experiment( ... sample_ratio=tt.SampleRatio(0.5), ... ) - >>> data = tt.make_users_data(seed=42) >>> result = experiment.analyze(data) >>> print(result.to_string(("metric", "control", "treatment", "pvalue"))) diff --git a/src/tea_tasting/metrics/resampling.py b/src/tea_tasting/metrics/resampling.py index 461b415..70be1e7 100644 --- a/src/tea_tasting/metrics/resampling.py +++ b/src/tea_tasting/metrics/resampling.py @@ -113,11 +113,9 @@ def __init__( >>> import numpy as np >>> import tea_tasting as tt - >>> experiment = tt.Experiment( ... orders_per_user=tt.Bootstrap("orders", np.mean, random_state=42), ... 
) - >>> data = tt.make_users_data(seed=42) >>> result = experiment.analyze(data) >>> print(result) @@ -140,7 +138,6 @@ def __init__( ... random_state=42, ... ), ... ) - >>> data = tt.make_users_data(seed=42) >>> result = experiment.analyze(data) >>> print(result) @@ -323,11 +320,9 @@ def __init__( ```pycon >>> import tea_tasting as tt - >>> experiment = tt.Experiment( ... revenue_per_user_p80=tt.Quantile("revenue", 0.8, random_state=42), ... ) - >>> data = tt.make_users_data(seed=42) >>> result = experiment.analyze(data) >>> print(result) diff --git a/src/tea_tasting/multiplicity.py b/src/tea_tasting/multiplicity.py index 1beeb4e..2401678 100644 --- a/src/tea_tasting/multiplicity.py +++ b/src/tea_tasting/multiplicity.py @@ -101,7 +101,6 @@ def adjust_fdr( >>> import polars as pl >>> import tea_tasting as tt - >>> data = pl.concat(( ... tt.make_users_data( ... seed=42, @@ -280,7 +279,6 @@ def adjust_fwer( >>> import polars as pl >>> import tea_tasting as tt - >>> data = pl.concat(( ... tt.make_users_data( ... 
seed=42, From d925c39a7a84de15830ecf0d5f1ba9ce8223f28c Mon Sep 17 00:00:00 2001 From: Evgeny Ivanov Date: Wed, 8 Jan 2025 22:32:51 +0300 Subject: [PATCH 4/4] Update readme --- README.md | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index a3f6f07..3a625eb 100644 --- a/README.md +++ b/README.md @@ -31,26 +31,24 @@ pip install tea-tasting ## Basic example -```python -import tea_tasting as tt - - -data = tt.make_users_data(seed=42) - -experiment = tt.Experiment( - sessions_per_user=tt.Mean("sessions"), - orders_per_session=tt.RatioOfMeans("orders", "sessions"), - orders_per_user=tt.Mean("orders"), - revenue_per_user=tt.Mean("revenue"), -) - -result = experiment.analyze(data) -print(result) -#> metric control treatment rel_effect_size rel_effect_size_ci pvalue -#> sessions_per_user 2.00 1.98 -0.66% [-3.7%, 2.5%] 0.674 -#> orders_per_session 0.266 0.289 8.8% [-0.89%, 19%] 0.0762 -#> orders_per_user 0.530 0.573 8.0% [-2.0%, 19%] 0.118 -#> revenue_per_user 5.24 5.73 9.3% [-2.4%, 22%] 0.123 +```pycon +>>> import tea_tasting as tt + +>>> data = tt.make_users_data(seed=42) +>>> experiment = tt.Experiment( +... sessions_per_user=tt.Mean("sessions"), +... orders_per_session=tt.RatioOfMeans("orders", "sessions"), +... orders_per_user=tt.Mean("orders"), +... revenue_per_user=tt.Mean("revenue"), +... ) +>>> result = experiment.analyze(data) +>>> print(result) + metric control treatment rel_effect_size rel_effect_size_ci pvalue + sessions_per_user 2.00 1.98 -0.66% [-3.7%, 2.5%] 0.674 +orders_per_session 0.266 0.289 8.8% [-0.89%, 19%] 0.0762 + orders_per_user 0.530 0.573 8.0% [-2.0%, 19%] 0.118 + revenue_per_user 5.24 5.73 9.3% [-2.4%, 22%] 0.123 + ``` Learn more in the detailed [user guide](https://tea-tasting.e10v.me/user-guide/). 
Additionally, see the guides on [data backends](https://tea-tasting.e10v.me/data-backends/), [power analysis](https://tea-tasting.e10v.me/power-analysis/), [multiple hypothesis testing](https://tea-tasting.e10v.me/multiple-testing/), and [custom metrics](https://tea-tasting.e10v.me/custom-metrics/).