From fd6bbff391ec3cb273802e13d98dc9fa9af4248d Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Fri, 30 Dec 2022 21:06:09 -0600
Subject: [PATCH 01/43] first draft

---
 requirements.txt   |   1 +
 yasa/__init__.py   |   1 +
 yasa/evaluation.py | 510 +++++++++++++++++++++++++++++++++++++++++++++
 yasa/hypno.py      |  41 ++++
 4 files changed, 553 insertions(+)
 create mode 100644 yasa/evaluation.py

diff --git a/requirements.txt b/requirements.txt
index 476be0b..e2337b0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,3 +15,4 @@ sleepecg>=0.5.0
 joblib
 antropy
 lightgbm
+pingouin>=0.5.3
diff --git a/yasa/__init__.py b/yasa/__init__.py
index 092a8a3..6ae69c1 100644
--- a/yasa/__init__.py
+++ b/yasa/__init__.py
@@ -1,5 +1,6 @@
 import logging
 from .detection import *
+from .evaluation import *
 from .features import *
 from .heart import *
 from .hypno import *
diff --git a/yasa/evaluation.py b/yasa/evaluation.py
new file mode 100644
index 0000000..6b25b2d
--- /dev/null
+++ b/yasa/evaluation.py
@@ -0,0 +1,510 @@
+"""
+YASA code for evaluating the agreement between two sleep-measurement systems.
+
+There are two levels of evaluating staging performance:
+- Comparing two hypnograms (e.g., human vs automated scorer)
+- Comparing summary sleep statistics between two scorers (e.g., PSG vs actigraphy)
+
+Analyses are modeled after the standardized framework proposed in Menghini et al., 2021, SLEEP.
+See the following resources:
+- https://doi.org/10.1093/sleep/zsaa170
+- https://sri-human-sleep.github.io/sleep-trackers-performance
+- https://github.com/SRI-human-sleep/sleep-trackers-performance
+"""
+import logging
+
+import numpy as np
+import pandas as pd
+import pingouin as pg
+from sklearn import metrics
+
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+from yasa.plotting import plot_hypnogram
+
+
+logger = logging.getLogger("yasa")
+
+__all__ = [
+    "EpochByEpochEvaluation",
+    "SleepStatsEvaluation",
+]
+
+
+class EpochByEpochEvaluation:
+    """
+    See :py:meth:`yasa.Hypnogram.evaluate`
+
+    Parameters
+    ----------
+    hypno_ref : :py:class:`yasa.Hypnogram`
+        Reference or ground-truth hypnogram.
+    hypno_test : :py:class:`yasa.Hypnogram`
+        The test or to-be-evaluated hypnogram.
+
+    Notes
+    -----
+    Many steps here are modeled after guidelines proposed in Menghini et al., 2021 [Menghini2021]_.
+    See https://sri-human-sleep.github.io/sleep-trackers-performance/AnalyticalPipeline_v1.0.0.html
+
+    References
+    ----------
+    .. [Menghini2021] Menghini, L., Cellini, N., Goldstone, A., Baker, F. C., & de Zambotti, M.
+                      (2021). A standardized framework for testing the performance of sleep-tracking
+                       technology: step-by-step guidelines and open-source code. Sleep, 44(2),
+                       zsaa170. https://doi.org/10.1093/sleep/zsaa170
+
+    Examples
+    --------
+    >>> import yasa
+    >>> hypno_a = yasa.simulate_hypno(tib=90, seed=8)
+    >>> hypno_b = yasa.simulate_hypno(tib=90, seed=9)
+    >>> hypno_a = yasa.Hypnogram(hypno_a, scorer="RaterA")
+    >>> hypno_b = yasa.Hypnogram(hypno_b, scorer="RaterB")
+    >>> ebe = yasa.EpochByEpochEvaluation(hypno_a, hypno_b)  # or hypno_a.evaluate(hypno_b)
+    >>> ebe.get_confusion_matrix()
+    RaterB  WAKE  N1   N2  N3  REM  ART  UNS  Total
+    RaterA
+    WAKE       1  20   68  12    0    0    0    101
+    N1         1   0    9   0    0    0    0     10
+    N2        15   7   19   0    0    0    0     41
+    N3         0   4   15   0    9    0    0     28
+    REM        0   0    0   0    0    0    0      0
+    ART        0   0    0   0    0    0    0      0
+    UNS        0   0    0   0    0    0    0      0
+    Total     17  31  111  12    9    0    0    180
+
+    >>> ebe.get_agreement().round(3)
+    metric
+    accuracy              0.111
+    kappa                -0.130
+    weighted_jaccard      0.037
+    weighted_precision    0.072
+    weighted_recall       0.111
+    weighted_f1           0.066
+    Name: agreement, dtype: float64
+
+    >>> ebe.get_agreement_by_stage().round(3)
+    stage         WAKE    N1      N2    N3  REM  ART  UNS
+    metric
+    precision    0.059   0.0   0.171   0.0  0.0  0.0  0.0
+    recall       0.010   0.0   0.463   0.0  0.0  0.0  0.0
+    fscore       0.017   0.0   0.250   0.0  0.0  0.0  0.0
+    support    101.000  10.0  41.000  28.0  0.0  0.0  0.0
+    """
+    def __init__(self, hypno_ref, hypno_test):
+        from yasa.hypno import Hypnogram  # Loading here to avoid circular import
+        assert isinstance(hypno_ref, Hypnogram), "`hypno_ref` must be a YASA Hypnogram"
+        assert isinstance(hypno_test, Hypnogram), "`hypno_test` must be a YASA Hypnogram"
+        assert hypno_ref.n_stages == hypno_test.n_stages, (
+            "`hypno_ref` and `hypno_test` must have the same `n_stages`")
+        if (n_ref := hypno_ref.n_epochs) != (n_test := hypno_test.n_epochs):
+            ## NOTE: would be nice to have a Hypnogram.trim() method for moments like this.
+            if n_ref > n_test:
+                kwargs = dict(n_stages=hypno_ref.n_stages, scorer=hypno_ref.scorer)  # keep scorer
+                hypno_ref = Hypnogram(hypno_ref.hypno[:n_test], **kwargs)
+                warn_msg = f"`hypno_ref` longer than `hypno_test`, trimmed to {n_test} epochs"
+            else:
+                kwargs = dict(n_stages=hypno_test.n_stages, scorer=hypno_test.scorer)  # keep scorer
+                hypno_test = Hypnogram(hypno_test.hypno[:n_ref], **kwargs)
+                warn_msg = f"`hypno_test` longer than `hypno_ref`, {n_test - n_ref} epochs trimmed"
+            ## Q: Should be downplayed as INFO?
+            logger.warning(warn_msg)
+        self.hypno_ref = hypno_ref
+        self.hypno_test = hypno_test
+
+    def get_confusion_matrix(self):
+        """
+        Return ``hypno_ref``/``hypno_test``confusion matrix dataframe.
+
+        Returns
+        -------
+        matrix : :py:class:`pandas.DataFrame`
+            A confusion matrix with stages of ``hypno_ref`` as indices and stages of
+            ``hypno_test`` as columns.
+        """
+        # Generate confusion matrix.
+        matrix = pd.crosstab(
+            self.hypno_ref.hypno, self.hypno_test.hypno, margins=True, margins_name="Total"
+        )
+        # Reorder indices in sensible order and to include all stages
+        matrix = matrix.reindex(self.hypno_ref.labels + ["Total"], axis=0)
+        matrix = matrix.reindex(self.hypno_test.labels + ["Total"], axis=1)
+        matrix = matrix.fillna(0).astype(int)
+        return matrix
+
+    def get_agreement(self):
+        """
+        Return a dataframe of ``hypno_ref``/``hypno_test`` performance
+        across all stages as measured by common classifier agreement methods.
+
+        ## Q: Are there better names to differentiate get_agreement vs get_agreement_by_stage?
+        ##    Maybe should be binary vs multiclass?
+        .. seealso:: :py:meth:`yasa.EpochByEpochEvaluation.get_agreement_by_stage`
+
+        Returns
+        -------
+        agreement : :py:class:`pandas.Series`
+            A :py:class:`pandas.Series` with agreement metrics as indices.
+        """
+        true = self.hypno_ref.hypno.to_numpy()
+        pred = self.hypno_test.hypno.to_numpy()
+        accuracy = metrics.accuracy_score(true, pred)
+        kappa = metrics.cohen_kappa_score(true, pred)
+        jaccard = metrics.jaccard_score(true, pred, average="weighted")
+        precision = metrics.precision_score(true, pred, average="weighted", zero_division=0)
+        recall = metrics.recall_score(true, pred, average="weighted", zero_division=0)
+        f1 = metrics.f1_score(true, pred, average="weighted", zero_division=0)
+        scores = {
+            "accuracy": accuracy,
+            "kappa": kappa,
+            "weighted_jaccard": jaccard,
+            "weighted_precision": precision,
+            "weighted_recall": recall,
+            "weighted_f1": f1,
+        }
+        agreement = pd.Series(scores, name="agreement").rename_axis("metric")
+        return agreement
+
+    def get_agreement_by_stage(self):
+        """
+        Return a dataframe of ``hypno_ref``/``hypno_test`` performance
+        for each stage as measured by common classifier agreement methods.
+
+        .. seealso:: :py:meth:`yasa.EpochByEpochEvaluation.get_agreement`
+
+        Returns
+        -------
+        agreement : :py:class:`pandas.DataFrame`
+            A DataFrame with agreement metrics as indices and stages as columns.
+        """
+        true = self.hypno_ref.hypno.to_numpy()
+        pred = self.hypno_test.hypno.to_numpy()
+        labels = self.hypno_ref.labels  # equivalent to hypno_test.labels
+        scores = metrics.precision_recall_fscore_support(
+            true, pred, labels=labels, average=None, zero_division=0
+        )
+        agreement = pd.DataFrame(scores)
+        agreement.index = pd.Index(["precision", "recall", "fscore", "support"], name="metric")
+        agreement.columns = pd.Index(labels, name="stage")
+        return agreement
+
+
+class SleepStatsEvaluation:
+    """
+    Evaluate agreement between two measurement devices by comparing summary sleep statistics across
+    multiple participants or sessions.
+
+    For example, the reference device might be PSG and the test device might be a wearable device.
+
+    Parameters
+    ----------
+    data : :py:class:`pandas.DataFrame`
+        A pandas dataframe with sleep statistics from two different
+        devices for multiple subjects
+    reference : str
+        Name of column containing the reference device sleep statistics.
+    test : str
+        Name of column containing the test device sleep statistics.
+    subject : str
+        Name of column containing the subject ID.
+    statistic : str
+        Name of column containing the name of the sleep statistics.
+
+    Notes
+    -----
+    Many steps here are modeled after guidelines proposed in Menghini et al., 2021 [Menghini2021]_.
+    See https://sri-human-sleep.github.io/sleep-trackers-performance/AnalyticalPipeline_v1.0.0.html
+
+    References
+    ----------
+    .. [Menghini2021] Menghini, L., Cellini, N., Goldstone, A., Baker, F. C., & de Zambotti, M.
+                      (2021). A standardized framework for testing the performance of sleep-tracking
+                       technology: step-by-step guidelines and open-source code. Sleep, 44(2),
+                       zsaa170. https://doi.org/10.1093/sleep/zsaa170
+
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> import yasa
+    >>> results = []
+    >>> for i in range(1, 21):
+    >>>     hypno_a = yasa.simulate_hypnogram(tib=600, scorer="RaterA", seed=i)
+    >>>     hypno_b = hypno_a.simulate_similar(scorer="RaterB", seed=i + 99)
+    >>>     sstats_a = hypno_a.sleep_statistics()
+    >>>     sstats_b = hypno_b.sleep_statistics()
+    >>>     sstats_a["subject"] = f"sub-{i:03d}"
+    >>>     sstats_b["subject"] = f"sub-{i:03d}"
+    >>>     sstats_a["scorer"] = "RaterA"
+    >>>     sstats_b["scorer"] = "RaterB"
+    >>>     results.extend([sstats_a, sstats_b])
+    >>> 
+    >>> df = (pd.DataFrame(results)
+    >>>     .pivot(index="subject", columns="scorer")
+    >>>     .stack(0).rename_axis(["subject", "sstat"]).reset_index().rename_axis(None, axis=1)
+    >>>     .query("sstat.isin(['%N1', '%N2', '%N3', '%REM', 'SOL', 'SE', 'TST'])"))
+    >>>
+    >>> sse = yasa.SleepStatsEvaluation(
+    >>>     data=df, reference="RaterA", test="RaterB", subject="subject", statistic="sstat"
+    >>> )
+    >>>
+    >>> sse.summary(descriptives=False)
+           normal  unbiased  homoscedastic
+    sstat
+    %N1      True      True           True
+    %N2      True      True           True
+    %N3      True      True           True
+    %REM    False      True           True
+    SE       True      True           True
+    SOL     False     False           True
+    TST      True      True           True
+
+    .. plot::
+
+        >>> sse.plot_discrepancies_heatmap()
+
+    .. plot::
+
+        >>> sse.plot_blandaltman()
+    """
+    def __init__(self, data, reference, test, subject, statistic):
+        assert isinstance(data, pd.DataFrame), "`data` must be a pandas DataFrame"
+        for col in [reference, test, subject, statistic]:
+            assert isinstance(col, str) and col in data, f"`{col}` must be a string and a column in `data`"
+        assert data[subject].nunique() > 1, "`data` must include more than one subject"
+        data = data.copy()
+
+        # Get measurement difference between reference and test devices
+        data["difference"] = data[test].sub(data[reference])
+
+        # Check for sleep statistics that have no differences between measurement devices.
+        # This is most likely to occur with TIB but is possible with any, and will break some functions.
+        stats_nodiff = data.groupby(statistic)["difference"].any().loc[lambda x: ~x].index
+        for s in stats_nodiff:
+            data = data.loc[data[statistic] != s]  # mask is safe for any column or stat name
+            logger.warning(f"All {s} differences are zero, removing from evaluation.")
+            ## Q: Should this be logged as just info?
+
+        # Get list of all statistics to be evaluated
+        self.all_sleepstats = data[statistic].unique()
+
+        # Save attributes
+        self.data = data
+        self.reference = reference
+        self.test = test
+        self.subject = subject
+        self.statistic = statistic
+
+        # Run tests
+        self.test_normality()
+        self.test_proportional_bias()
+        self.test_homoscedasticity()
+
+    def test_normality(self):
+        """Test reference data for normality at each sleep statistic."""
+        normality = self.data.groupby(self.statistic)[self.reference].apply(pg.normality)
+        self.normality = normality.droplevel(-1)
+
+    def test_proportional_bias(self):
+        """Test each sleep statistic for proportional bias.
+        
+        For each statistic, regress the device difference score on the reference device score to get
+        proportional bias and residuals that will be used for the later homoscedasticity
+        calculation. Subject-level residuals for each statistic are added to ``data``.
+        """
+        prop_bias_results = []
+        residuals_results = []
+        for ss, ss_df in self.data.groupby(self.statistic):
+            # Regress the difference score on the reference device
+            model = pg.linear_regression(ss_df[self.reference], ss_df["difference"])
+            model.insert(0, self.statistic, ss)
+            # Extract the subject-level residuals
+            resid = pd.DataFrame(
+                {
+                    self.subject: ss_df[self.subject],
+                    self.statistic: ss,
+                    "pbias_residual": model.residuals_
+                }
+            )
+            prop_bias_results.append(model)
+            residuals_results.append(resid)
+        # Add residuals to raw dataframe, used later when testing homoscedasticity
+        residuals = pd.concat(residuals_results)
+        self.data = self.data.merge(residuals, on=[self.subject, self.statistic])
+        # Handle proportional bias results
+        prop_bias = pd.concat(prop_bias_results)
+        # Save all the proportional bias models before removing intercept, for optional user access
+        self.proportional_bias_models_ = prop_bias.reset_index(drop=True)
+        # Remove intercept rows
+        prop_bias = prop_bias.query("names != 'Intercept'").drop(columns="names")
+        # Add True/False passing column for easy access
+        prop_bias["unbiased"] = prop_bias["pval"].ge(0.05)
+        self.proportional_bias = prop_bias.set_index(self.statistic)
+
+    def test_homoscedasticity(self, method="levene"):
+        """Test each statistic for homoscedasticity.
+
+        The ``method`` argument is passed to :py:func:`pingouin.homoscedasticity`.
+
+        .. note:: ``self.test_proportional_bias()`` must be run first.
+        """
+        group = self.data.groupby(self.statistic)
+        columns = [self.reference, "difference", "pbias_residual"]
+        homoscedasticity = group.apply(lambda df: pg.homoscedasticity(df[columns], method=method))
+        self.homoscedasticity = homoscedasticity.droplevel(-1)
+
+    def summary(self, descriptives=True):
+        """Return a summary dataframe highlighting what statistics pass checks."""
+        assert isinstance(descriptives, bool), "descriptives must be True or False"
+        series_list = [
+            self.normality["normal"],
+            self.proportional_bias["unbiased"],
+            self.homoscedasticity["equal_var"].rename("homoscedastic"),
+        ]
+        summary = pd.concat(series_list, axis=1)
+        if descriptives:
+            group = self.data.drop(columns=self.subject).groupby(self.statistic)
+            desc = group.agg(["mean", "std"])
+            desc.columns = desc.columns.map("_".join)
+            summary = summary.join(desc)
+        return summary
+
+    def plot_discrepancies_heatmap(self, sstats_order=None, **kwargs):
+        """Visualize subject-level discrepancies, generally for outlier inspection.
+
+        Parameters
+        ----------
+        sstats_order : list
+            List of sleep statistics to plot. Default (None) is to plot all sleep statistics.
+        kwargs : dict
+            Other keyword arguments are passed through to :py:func:`seaborn.heatmap`.
+
+        Returns
+        -------
+        ax : :py:class:`matplotlib.axes.Axes`
+            Matplotlib Axes
+        """
+        if sstats_order is None:
+            sstats_order = self.all_sleepstats
+        else:
+            assert isinstance(sstats_order, (list, type(None))), "`sstats_order` must be a list"
+
+        # Merge default heatmap arguments with optional input
+        heatmap_kwargs = dict(cmap="binary", annot=True, fmt=".1f", square=False)
+        heatmap_kwargs.update(kwargs)
+        # Pivot for subject-rows and statistic-columns
+        table = self.data.pivot(
+            index=self.subject, columns=self.statistic, values="difference",
+        )
+        # Normalize statistics (i.e., columns) between zero and one
+        table_norm = table.sub(table.min(), axis=1).div(table.apply(np.ptp))
+        # If annotating, replace with raw values for writing.
+        if heatmap_kwargs["annot"]:
+            heatmap_kwargs["annot"] = table[sstats_order].to_numpy()
+        # Draw heatmap
+        ax = sns.heatmap(table_norm[sstats_order], **heatmap_kwargs)
+        return ax
+
+    def plot_discrepancies_dotplot(self, sstats_order=None, palette="winter", **kwargs):
+        """Visualize subject-level discrepancies, generally for outlier inspection.
+
+        Parameters
+        ----------
+        sstats_order : list
+            List of sleep statistics to plot. Default (None) is to plot all sleep statistics.
+        palette : string, list, dict, or :py:class:`matplotlib.colors.Colormap`
+            Color palette passed to :py:class:`seaborn.PairGrid`
+        kwargs : dict
+            Other keyword arguments are passed through to :py:func:`seaborn.stripplot`.
+
+        Returns
+        -------
+        g : :py:class:`seaborn.PairGrid`
+            Seaborn PairGrid
+        """
+        if sstats_order is None:
+            sstats_order = self.all_sleepstats
+        else:
+            assert isinstance(sstats_order, (list, type(None))), "`sstats_order` must be a list"
+
+        # Merge default stripplot arguments with optional input
+        stripplot_kwargs = dict(size=10, linewidth=1, edgecolor="white")
+        stripplot_kwargs.update(kwargs)
+
+        # Pivot data to get subject-rows and statistic-columns
+        table = self.data.pivot(index=self.subject, columns=self.statistic, values="difference")
+
+        # Initialize the PairGrid
+        height = 0.3 * len(table)
+        aspect = 0.6
+        g = sns.PairGrid(
+            table.reset_index(),
+            x_vars=sstats_order,
+            y_vars=[self.subject],
+            hue=self.subject,
+            palette=palette,
+            height=height,
+            aspect=aspect,
+        )
+        # Draw the dots
+        g.map(sns.stripplot, orient="h", jitter=False, **stripplot_kwargs)
+
+        # Adjust aesthetics
+        g.set(xlabel="", ylabel="")
+        for ax, title in zip(g.axes.flat, sstats_order):
+            ax.set(title=title)
+            ax.margins(x=0.3)
+            ax.yaxis.grid(True)
+            ax.tick_params(left=False)
+        sns.despine(left=True, bottom=True)
+
+        return g
+
+    def plot_blandaltman(self, sstats_order=None, facet_kwargs={}, **kwargs):
+        """
+        Parameters
+        ----------
+        sstats_order : list or None
+            List of sleep statistics to plot. Default (None) is to plot all sleep statistics.
+        facet_kwargs : dict
+            Other keyword arguments are passed through to :py:class:`seaborn.FacetGrid`.
+        kwargs : dict
+            Other keyword arguments are passed through to :py:func:`pingouin.plot_blandaltman`.
+
+        Returns
+        -------
+        g : :py:class:`seaborn.FacetGrid`
+            Seaborn FacetGrid
+        """
+        if sstats_order is None:
+            sstats_order = self.all_sleepstats
+        else:
+            assert isinstance(sstats_order, (list, type(None))), "`sstats_order` must be a list"
+
+        # Select scatterplot arguments (passed to blandaltman) and update with optional input
+        blandaltman_kwargs = dict(xaxis="y", annotate=False, edgecolor="black", facecolor="none")
+        blandaltman_kwargs.update(kwargs)
+        # Select FacetGrid arguments and update with optional input
+        col_wrap = 4 if len(sstats_order) > 4 else None
+        facetgrid_kwargs = dict(col_wrap=col_wrap, height=2, aspect=1, sharex=False, sharey=False)
+        facetgrid_kwargs.update(facet_kwargs)
+
+        # Initialize a grid of plots with an Axes for each sleep statistic
+        g = sns.FacetGrid(self.data, col=self.statistic, col_order=sstats_order, **facetgrid_kwargs)
+        # Draw Bland-Altman on each axis
+        g.map(pg.plot_blandaltman, self.test, self.reference, **blandaltman_kwargs)
+
+        # Tidy-up axis limits with symmetric y-axis and minimal ticks
+        for ax in g.axes.flat:
+            bound = max(map(abs, ax.get_ylim()))
+            ax.set_ylim(-bound, bound)
+            ax.yaxis.set_major_locator(plt.MaxNLocator(nbins=2, integer=True, symmetric=True))
+            ax.xaxis.set_major_locator(plt.MaxNLocator(nbins=1, integer=True))
+        # More aesthetics
+        ylabel = " - ".join((self.test, self.reference))
+        g.set_ylabels(ylabel)
+        g.set_titles(col_template="{col_name}")
+        g.tight_layout(w_pad=1, h_pad=2)
+
+        return g
diff --git a/yasa/hypno.py b/yasa/hypno.py
index 81b0c80..908aa9b 100644
--- a/yasa/hypno.py
+++ b/yasa/hypno.py
@@ -9,6 +9,7 @@
 from yasa.io import set_log_level
 from yasa.plotting import plot_hypnogram
 from yasa.sleepstats import transition_matrix
+from yasa.evaluation import EpochByEpochEvaluation
 from pandas.api.types import CategoricalDtype
 
 __all__ = [
@@ -538,6 +539,46 @@ def copy(self):
             scorer=self.scorer,
         )
 
+    def evaluate(self, hypno_test):
+        """Evaluate agreement between two hypnograms.
+
+        Typically the reference hypnogram (i.e., ``self``) is a manually-scored hypnogram and the
+        test hypnogram (i.e., ``hypno_test``) is a hypnogram from an actigraphy/wearable device or
+        automated scorer (e.g., :py:meth:`yasa.SleepStaging.predict`).
+
+        Comparing more than two hypnograms is not currently supported.
+
+        Parameters
+        ----------
+        self : :py:class:`yasa.Hypnogram`
+            Reference or ground-truth hypnogram.
+        hypno_test : :py:class:`yasa.Hypnogram`
+            The test or to-be-evaluated hypnogram.
+            Must have the same ``n_stages`` as the reference hypnogram.
+
+        Returns
+        -------
+        ebe : :py:class:`yasa.EpochByEpochEvaluation`
+            See :py:class:`yasa.EpochByEpochEvaluation` documentation for more detail.
+
+        Examples
+        --------
+        .. plot::
+
+            >>> import yasa
+            >>> hypno_ref = yasa.simulate_hypno(tib=600, seed=11)
+            >>> hypno_ref = yasa.Hypnogram(hypno_ref, scorer="Rater1")
+            >>> _, true_probas = hypno_ref.transition_matrix()
+            >>> hypno_test = yasa.simulate_hypno(tib=600, seed=12, trans_probas=true_probas)
+            >>> hypno_test = yasa.Hypnogram(hypno_test, scorer="Rater2")
+            >>> ebe = hypno_ref.evaluate(hypno_test)
+            >>> conf = ebe.get_confusion_matrix()
+            >>> perf = ebe.get_agreement()
+            >>> # Plot the overlapping hypnograms
+            >>> ebe.plot_hypnograms()
+        """
+        return EpochByEpochEvaluation(self, hypno_test)
+
     def find_periods(self, threshold="5min", equal_length=False):
         """Find sequences of consecutive values exceeding a certain duration in hypnogram.
 

From af22cc0d70bb48d77c91515fdde4eefe7d82ced3 Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Fri, 30 Dec 2022 22:30:07 -0600
Subject: [PATCH 02/43] plot_hypnogramS method

---
 yasa/evaluation.py | 93 ++++++++++++++++++++++++++++++++--------------
 yasa/plotting.py   | 56 +++++++++++++++++-----------
 2 files changed, 99 insertions(+), 50 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index 6b25b2d..f0a1173 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -58,10 +58,8 @@ class EpochByEpochEvaluation:
     Examples
     --------
     >>> import yasa
-    >>> hypno_a = yasa.simulate_hypno(tib=90, seed=8)
-    >>> hypno_b = yasa.simulate_hypno(tib=90, seed=9)
-    >>> hypno_a = yasa.Hypnogram(hypno_a, scorer="RaterA")
-    >>> hypno_b = yasa.Hypnogram(hypno_b, scorer="RaterB")
+    >>> hypno_a = yasa.simulate_hypnogram(tib=90, seed=8, scorer="RaterA")
+    >>> hypno_b = yasa.simulate_hypnogram(tib=90, seed=9, scorer="RaterB")
     >>> ebe = yasa.EpochByEpochEvaluation(hypno_a, hypno_b)  # or hypno_a.evaluate(hypno_b)
     >>> ebe.get_confusion_matrix()
     RaterB  WAKE  N1   N2  N3  REM  ART  UNS  Total
@@ -114,26 +112,6 @@ def __init__(self, hypno_ref, hypno_test):
         self.hypno_ref = hypno_ref
         self.hypno_test = hypno_test
 
-    def get_confusion_matrix(self):
-        """
-        Return ``hypno_ref``/``hypno_test``confusion matrix dataframe.
-
-        Returns
-        -------
-        matrix : :py:class:`pandas.DataFrame`
-            A confusion matrix with stages of ``hypno_ref`` as indices and stages of
-            ``hypno_test`` as columns.
-        """
-        # Generate confusion matrix.
-        matrix = pd.crosstab(
-            self.hypno_ref.hypno, self.hypno_test.hypno, margins=True, margins_name="Total"
-        )
-        # Reorder indices in sensible order and to include all stages
-        matrix = matrix.reindex(self.hypno_ref.labels + ["Total"], axis=0)
-        matrix = matrix.reindex(self.hypno_test.labels + ["Total"], axis=1)
-        matrix = matrix.fillna(0).astype(int)
-        return matrix
-
     def get_agreement(self):
         """
         Return a dataframe of ``hypno_ref``/``hypno_test`` performance
@@ -190,6 +168,65 @@ def get_agreement_by_stage(self):
         agreement.columns = pd.Index(labels, name="stage")
         return agreement
 
+    def get_confusion_matrix(self):
+        """
+        Return ``hypno_ref``/``hypno_test`` confusion matrix dataframe.
+
+        Returns
+        -------
+        matrix : :py:class:`pandas.DataFrame`
+            A confusion matrix with stages of ``hypno_ref`` as indices and stages of
+            ``hypno_test`` as columns.
+        """
+        # Generate confusion matrix.
+        matrix = pd.crosstab(
+            self.hypno_ref.hypno, self.hypno_test.hypno, margins=True, margins_name="Total"
+        )
+        # Reorder indices in sensible order and to include all stages
+        matrix = matrix.reindex(self.hypno_ref.labels + ["Total"], axis=0)
+        matrix = matrix.reindex(self.hypno_test.labels + ["Total"], axis=1)
+        matrix = matrix.fillna(0).astype(int)
+        return matrix
+
+    def plot_hypnograms(
+            self, jitter_test=0.1, legend=True, kwargs_ref={"ls": "dotted"}, kwargs_test={}, ax=None
+        ):
+        """Plot the two hypnograms, ``hypno_test`` overlaid on ``hypno_ref``.
+
+        Parameters
+        ----------
+        jitter_test : float or int
+            Validated as a number but not yet applied to the plot.
+        legend : bool or dict
+            If True, draw a legend. If a dictionary, pass it as keyword arguments to ``ax.legend``.
+        kwargs_ref : dict
+            Keyword arguments passed to :py:func:`yasa.plot_hypnogram` when plotting ``hypno_ref``.
+        kwargs_test : dict
+            Keyword arguments passed to :py:func:`yasa.plot_hypnogram` when plotting ``hypno_test``.
+        ax : :py:class:`matplotlib.axes.Axes` or None
+            Axis on which to draw both hypnograms. Default (None) is the current axis.
+
+        Returns
+        -------
+        ax : :py:class:`matplotlib.axes.Axes`
+            Matplotlib Axes
+        """
+        assert isinstance(legend, (bool, dict)), "`legend` must be True, False, or a dictionary"
+        assert isinstance(jitter_test, (float, int)), "`jitter_test` must be a number"
+        assert isinstance(kwargs_ref, dict), "`kwargs_ref` must be a dictionary"
+        assert isinstance(kwargs_test, dict), "`kwargs_test` must be a dictionary"
+        assert "ax" not in kwargs_ref | kwargs_test, (
+            "ax can't be supplied to `kwargs_ref` or `kwargs_test`, use the `ax` keyword instead")
+        # Merge into new dicts so the shared default arguments are never mutated between calls
+        kwargs_ref = {"label": self.hypno_ref.scorer} | kwargs_ref
+        kwargs_test = {"label": self.hypno_test.scorer} | kwargs_test
+        ax = plt.gca() if ax is None else ax
+        self.hypno_ref.plot_hypnogram(ax=ax, **kwargs_ref)
+        self.hypno_test.plot_hypnogram(ax=ax, **kwargs_test)
+        if legend:
+            ax.legend(**(legend if isinstance(legend, dict) else {}))
+        return ax
+
 
 class SleepStatsEvaluation:
     """
@@ -230,14 +267,14 @@ class SleepStatsEvaluation:
     >>> import yasa
     >>> results = []
     >>> for i in range(1, 21):
-    >>>     hypno_a = yasa.simulate_hypnogram(tib=600, scorer="RaterA", seed=i)
-    >>>     hypno_b = hypno_a.simulate_similar(scorer="RaterB", seed=i + 99)
+    >>>     hypno_a = yasa.simulate_hypnogram(tib=600, scorer="Human", seed=i)
+    >>>     hypno_b = hypno_a.simulate_similar(scorer="YASA", seed=i + 99)
     >>>     sstats_a = hypno_a.sleep_statistics()
     >>>     sstats_b = hypno_b.sleep_statistics()
     >>>     sstats_a["subject"] = f"sub-{i:03d}"
     >>>     sstats_b["subject"] = f"sub-{i:03d}"
-    >>>     sstats_a["scorer"] = "RaterA"
-    >>>     sstats_b["scorer"] = "RaterB"
+    >>>     sstats_a["scorer"] = hypno_a.scorer
+    >>>     sstats_b["scorer"] = hypno_b.scorer
     >>>     results.extend([sstats_a, sstats_b])
     >>> 
     >>> df = (pd.DataFrame(results)
diff --git a/yasa/plotting.py b/yasa/plotting.py
index 87ba2f0..8361122 100644
--- a/yasa/plotting.py
+++ b/yasa/plotting.py
@@ -13,7 +13,7 @@
 __all__ = ["plot_hypnogram", "plot_spectrogram", "topoplot"]
 
 
-def plot_hypnogram(hyp, lw=1.5, highlight="REM", fill_color=None, ax=None):
+def plot_hypnogram(hyp, highlight="REM", fill_color=None, ax=None, **kwargs):
     """
     Plot a hypnogram.
 
@@ -23,14 +23,21 @@ def plot_hypnogram(hyp, lw=1.5, highlight="REM", fill_color=None, ax=None):
     ----------
     hyp : :py:class:`yasa.Hypnogram`
         A YASA hypnogram instance.
-    lw : float
-        Linewidth.
     highlight : str or None
         Optional stage to highlight with alternate color.
+    lw : float
+        Linewidth of the hypnogram line.
+    ls : str
+        Linestyle of the hypnogram line.
+    alpha : float or int
+        Alpha transparency of the hypnogram line.
     fill_color : str or None
         Optional color to fill space above hypnogram line.
     ax : :py:class:`matplotlib.axes.Axes`
         Axis on which to draw the plot, optional.
+    **kwargs : dict
+        Keyword arguments controlling hypnogram line display (e.g., ``linewidth``, ``linestyle``).
+        Passed to :py:func:`matplotlib.pyplot.stairs` and :py:func:`matplotlib.pyplot.hlines`.
 
     Returns
     -------
@@ -74,20 +81,25 @@ def plot_hypnogram(hyp, lw=1.5, highlight="REM", fill_color=None, ax=None):
     old_fontsize = plt.rcParams["font.size"]
     plt.rcParams.update({"font.size": 18})
 
+    # Open the figure
+    if ax is None:
+        ax = plt.gca()
+
     ## Remap stages to be in desired y-axis order ##
     # Start with default of all allowed labels
     stage_order = hyp.labels.copy()
-    stages_present = hyp.hypno.unique()
-    # Remove Art/Uns from stage order, and place back individually at front to be higher on plot
-    art_str = stage_order.pop(stage_order.index("ART"))
-    uns_str = stage_order.pop(stage_order.index("UNS"))
-    if "ART" in stages_present:
-        stage_order.insert(0, art_str)
-    if "UNS" in stages_present:
-        stage_order.insert(0, uns_str)
+    stages_present = hyp.hypno.unique().tolist()
+    # Reverse order so WAKE is highest, and exclude ART/UNS which are always last
+    stage_order = stage_order[:-2][::-1]
+    # Add ART/UNS back above WAKE if they're present in the current hypnogram or existing axis
+    gca_ylabels = [x.get_text() for x in ax.get_yticklabels()]
+    if "ART" in stages_present or "ART" in gca_ylabels:
+        stage_order += ["ART"]
+    if "UNS" in stages_present or "UNS" in gca_ylabels:
+        stage_order += ["UNS"]
     # Put REM after WAKE if all 5 standard stages are allowed
     if hyp.n_stages == 5:
-        stage_order.insert(stage_order.index("WAKE") + 1, stage_order.pop(stage_order.index("REM")))
+        stage_order.insert(stage_order.index("WAKE") - 1, stage_order.pop(stage_order.index("REM")))
     # Reset the Hypnogram mapping so any future returns have this order
     hyp.mapping = {stage: i for i, stage in enumerate(stage_order)}
 
@@ -111,18 +123,19 @@ def plot_hypnogram(hyp, lw=1.5, highlight="REM", fill_color=None, ax=None):
     # Make mask to draw the highlighted stage
     yvals_highlight = np.ma.masked_not_equal(yvalues, hyp.mapping.get(highlight))
 
-    # Open the figure
-    if ax is None:
-        ax = plt.gca()
-
     # Draw background filling
     if fill_color is not None:
-        bline = hyp.mapping["WAKE"]  # len(stage_order) - 1 to fill from bottom
-        ax.stairs(yvalues.clip(bline), bins, baseline=bline, color=fill_color, fill=True, lw=0)
-    # Draw main hypnogram line, highlighted stage line, and Artefact/Unscored line
-    ax.stairs(yvalues, bins, baseline=None, color="black", lw=lw)
+        bline = hyp.mapping["WAKE"]
+        ax.stairs(yvalues.clip(max=bline), bins, baseline=bline, color=fill_color, fill=True, lw=0)
+    # Draw main hypnogram line and highlighted stage line
+    line_kwargs = {"color": "black", "lw": 1.5, "label": hyp.scorer}
+    if "linewidth" in kwargs:
+        line_kwargs["linewidth"] = line_kwargs.pop("lw")
+    line_kwargs.update(kwargs)
+    ax.stairs(yvalues, bins, baseline=None, **line_kwargs)
     if not yvals_highlight.mask.all():
-        ax.hlines(yvals_highlight, xmin=bins[:-1], xmax=bins[1:], color="red", lw=lw)
+        line_kwargs.update({"color": "red", "label": None})
+        ax.hlines(yvals_highlight, xmin=bins[:-1], xmax=bins[1:], **line_kwargs)
 
     # Aesthetics
     ax.use_sticky_edges = False
@@ -131,7 +144,6 @@ def plot_hypnogram(hyp, lw=1.5, highlight="REM", fill_color=None, ax=None):
     ax.set_yticklabels(stage_order)
     ax.set_ylabel("Stage")
     ax.set_xlabel(xlabel)
-    ax.invert_yaxis()
     ax.spines[["right", "top"]].set_visible(False)
     if hyp.start is not None:
         ax.xaxis.set_major_formatter(mdates.DateFormatter("%H:%M"))

From cfa1f6bc4672bade72b69bdeedf040d642224b52 Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Sat, 31 Dec 2022 01:46:27 -0600
Subject: [PATCH 03/43] docstrings examples

---
 yasa/evaluation.py | 105 +++++++++++++++++++++++++++++++--------------
 yasa/plotting.py   |   4 +-
 2 files changed, 74 insertions(+), 35 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index f0a1173..cb1e06b 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -62,34 +62,62 @@ class EpochByEpochEvaluation:
     >>> hypno_b = yasa.simulate_hypnogram(tib=90, seed=9, scorer="RaterB")
     >>> ebe = yasa.EpochByEpochEvaluation(hypno_a, hypno_b)  # or hypno_a.evaluate(hypno_b)
     >>> ebe.get_confusion_matrix()
-    RaterB  WAKE  N1   N2  N3  REM  ART  UNS  Total
+    RaterB  WAKE   N1   N2  N3  REM  ART  UNS  Total
     RaterA
-    WAKE       1  20   68  12    0    0    0    101
-    N1         1   0    9   0    0    0    0     10
-    N2        15   7   19   0    0    0    0     41
-    N3         0   4   15   0    9    0    0     28
-    REM        0   0    0   0    0    0    0      0
-    ART        0   0    0   0    0    0    0      0
-    UNS        0   0    0   0    0    0    0      0
-    Total     17  31  111  12    9    0    0    180
+    WAKE      52   38  126  23   51    0    0    290
+    N1        59    2   27   8   14    0    0    110
+    N2       117   50  105  15   44    0    0    331
+    N3        34   26   62  42   15    0    0    179
+    REM       15   12   13  10    0    0    0     50
+    ART        0    0    0   0    0    0    0      0
+    UNS        0    0    0   0    0    0    0      0
+    Total    277  128  333  98  124    0    0    960
 
     >>> ebe.get_agreement().round(3)
     metric
-    accuracy              0.111
-    kappa                -0.130
-    weighted_jaccard      0.037
-    weighted_precision    0.072
-    weighted_recall       0.111
-    weighted_f1           0.066
+    accuracy              0.209
+    kappa                -0.051
+    weighted_jaccard      0.130
+    weighted_precision    0.247
+    weighted_recall       0.209
+    weighted_f1           0.223
     Name: agreement, dtype: float64
 
     >>> ebe.get_agreement_by_stage().round(3)
-    stage         WAKE    N1      N2    N3  REM  ART  UNS
+    stage         WAKE       N1       N2       N3   REM  ART  UNS
     metric
-    precision    0.059   0.0   0.171   0.0  0.0  0.0  0.0
-    recall       0.010   0.0   0.463   0.0  0.0  0.0  0.0
-    fscore       0.017   0.0   0.250   0.0  0.0  0.0  0.0
-    support    101.000  10.0  41.000  28.0  0.0  0.0  0.0
+    precision    0.188    0.016    0.315    0.429   0.0  0.0  0.0
+    recall       0.179    0.018    0.317    0.235   0.0  0.0  0.0
+    fscore       0.183    0.017    0.316    0.303   0.0  0.0  0.0
+    support    290.000  110.000  331.000  179.000  50.0  0.0  0.0
+
+    .. plot::
+
+        >>> import matplotlib.pyplot as plt
+        >>> fig, ax = plt.subplots(figsize=(6, 3), constrained_layout=True)
+        >>> ebe.plot_hypnograms()
+
+    .. plot::
+
+        >>> fig, ax = plt.subplots(figsize=(6, 3))
+        >>> ebe.plot_hypnograms(ax=ax, kwargs_test={"color": "black", "lw": 2, "ls": "dotted"})
+        >>> plt.tight_layout()
+
+    .. plot::
+
+        >>> fig, ax = plt.subplots(figsize=(6.5, 2.5), constrained_layout=True)
+        >>> style_a = dict(alpha=1, lw=2.5, ls="solid", color="gainsboro", label="Michel")
+        >>> style_b = dict(alpha=1, lw=2.5, ls="solid", color="cornflowerblue", label="Jouvet")
+        >>> legend_style = dict(
+        >>>     title="Scorer", frameon=False, ncol=2, loc="lower center", bbox_to_anchor=(0.5, 0.9)
+        >>> )
+        >>> ax = ebe.plot_hypnograms(
+        >>>     kwargs_ref=style_a, kwargs_test=style_b, legend=legend_style, ax=ax
+        >>> )
+        >>>
+        >>> acc = ebe.get_agreement().multiply(100).at["accuracy"]
+        >>> ax.text(0.01, 1, f"Accuracy = {acc:.0f}%", ha="left", va="bottom", transform=ax.transAxes)
+
     """
     def __init__(self, hypno_ref, hypno_test):
         from yasa.hypno import Hypnogram  # Loading here to avoid circular import
@@ -188,13 +216,19 @@ def get_confusion_matrix(self):
         matrix = matrix.fillna(0).astype(int)
         return matrix
 
-    def plot_hypnograms(
-            self, jitter_test=0.1, legend=True, kwargs_ref={"ls": "dotted"}, kwargs_test={}, ax=None
-        ):
+    def plot_hypnograms(self, legend=True, ax=None, kwargs_ref={}, kwargs_test={}):
         """Plot the two hypnograms, ``hypno_test`` overlaid on ``hypno_ref``.
 
+        .. seealso:: :py:func:`yasa.plot_hypnogram`
+
         Parameters
         ----------
+        legend : bool or dict
+            If True, a legend with default :py:func:`matplotlib.pyplot.legend` arguments is added.
+            If False, no legend is added. If a dictionary, a legend is added and the dictionary is
+            passed as keyword arguments to :py:func:`matplotlib.pyplot.legend`.
+        ax : :py:class:`matplotlib.axes.Axes`
+            Axis on which to draw the plot, optional.
         kwargs_ref : dict
             Keyword arguments passed to :py:func:`yasa.plot_hypnogram` when plotting ``hypno_ref``.
         kwargs_test : dict
@@ -204,23 +238,30 @@ def plot_hypnograms(
         -------
         ax : :py:class:`matplotlib.axes.Axes`
             Matplotlib Axes
+
+        Examples
+        --------
+        .. plot::
+
+            >>> from yasa import simulate_hypnogram
+            >>> hyp = simulate_hypnogram(seed=7)
+            >>> ax = hyp.evaluate(hyp.simulate_similar()).plot_hypnograms()
         """
         assert isinstance(legend, (bool, dict)), "`legend` must be True, False, or a dictionary"
-        assert isinstance(jitter_test, (float, int)), "`jitter_test` must be a number"
         assert isinstance(kwargs_ref, dict), "`kwargs_ref` must be a dictionary"
         assert isinstance(kwargs_test, dict), "`kwargs_test` must be a dictionary"
         assert not "ax" in kwargs_ref | kwargs_test, (
             "ax can't be supplied to `kwargs_ref` or `kwargs_test`, use the `ax` keyword instead"
         )
-        if "label" not in kwargs_ref:
-            kwargs_ref["label"] = self.hypno_ref.scorer
-        if "label" not in kwargs_test:
-            kwargs_test["label"] = self.hypno_test.scorer
+        plot_kwargs_ref = {"highlight": None, "alpha": 0.8}
+        plot_kwargs_test = {"highlight": None, "alpha": 0.8, "color": "darkcyan", "ls": "dashed"}
+        plot_kwargs_ref.update(kwargs_ref)
+        plot_kwargs_test.update(kwargs_test)
         if ax is None:
             ax = plt.gca()
-        self.hypno_ref.plot_hypnogram(ax=ax, **kwargs_ref)
-        self.hypno_test.plot_hypnogram(ax=ax, **kwargs_test)
-        if legend:
+        self.hypno_ref.plot_hypnogram(ax=ax, **plot_kwargs_ref)
+        self.hypno_test.plot_hypnogram(ax=ax, **plot_kwargs_test)
+        if legend and "label" in plot_kwargs_ref | plot_kwargs_test:
             if isinstance(legend, dict):
                 ax.legend(**legend)
             else:
@@ -393,7 +434,7 @@ def test_homoscedasticity(self, method="levene"):
 
     def summary(self, descriptives=True):
         """Return a summary dataframe highlighting what statistics pass checks."""
-        assert isinstance(descriptives, bool), "descriptives must be True or False"
+        assert isinstance(descriptives, bool), "`descriptives` must be True or False"
         series_list = [
             self.normality["normal"],
             self.proportional_bias["unbiased"],
diff --git a/yasa/plotting.py b/yasa/plotting.py
index 8361122..96f735e 100644
--- a/yasa/plotting.py
+++ b/yasa/plotting.py
@@ -128,9 +128,7 @@ def plot_hypnogram(hyp, highlight="REM", fill_color=None, ax=None, **kwargs):
         bline = hyp.mapping["WAKE"]
         ax.stairs(yvalues.clip(max=bline), bins, baseline=bline, color=fill_color, fill=True, lw=0)
     # Draw main hypnogram line and highlighted stage line
-    line_kwargs = {"color": "black", "lw": 1.5, "label": hyp.scorer}
-    if "linewidth" in kwargs:
-        line_kwargs["linewidth"] = line_kwargs.pop("lw")
+    line_kwargs = {"color": "black", "linewidth": 1.5, "label": hyp.scorer}
     line_kwargs.update(kwargs)
     ax.stairs(yvalues, bins, baseline=None, **line_kwargs)
     if not yvals_highlight.mask.all():

From f036da3f7caaed8b2c89918eeab642abdb23f88a Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Sat, 31 Dec 2022 01:48:55 -0600
Subject: [PATCH 04/43] plot_hypnogram lw --> linekwargs

---
 yasa/plotting.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/yasa/plotting.py b/yasa/plotting.py
index 96f735e..f1861fd 100644
--- a/yasa/plotting.py
+++ b/yasa/plotting.py
@@ -25,12 +25,6 @@ def plot_hypnogram(hyp, highlight="REM", fill_color=None, ax=None, **kwargs):
         A YASA hypnogram instance.
     highlight : str or None
         Optional stage to highlight with alternate color.
-    lw : float
-        Linewidth of the hypnogram line.
-    ls : str
-        Linestyle of the hypnogram line.
-    alpha : float or int
-        Alpha transparency of the hypnogram line.
     fill_color : str or None
         Optional color to fill space above hypnogram line.
     ax : :py:class:`matplotlib.axes.Axes`

From 16fc5bd3036d25cc77a1bfaf24669ecbfb9c427f Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Sat, 31 Dec 2022 14:52:16 -0600
Subject: [PATCH 05/43] heatmap colorbar label

---
 yasa/evaluation.py | 34 ++++++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index cb1e06b..1360f4d 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -268,6 +268,24 @@ def plot_hypnograms(self, legend=True, ax=None, kwargs_ref={}, kwargs_test={}):
                 ax.legend()
         return ax
 
+    def plot_roc(self, palette=None, ax=None, **kwargs):
+        """Plot ROC curves for each stage.
+
+        Parameters
+        ----------
+        palette : dict or None
+            If a dictionary, keys are stages and values are corresponding colors.
+        ax : :py:class:`matplotlib.axes.Axes`
+            Axis on which to draw the plot, optional.
+        kwargs : dict
+            Keyword arguments passed to :py:func:`matplotlib.pyplot.plot`
+
+        Returns
+        -------
+        ax : :py:class:`matplotlib.axes.Axes`
+            Matplotlib Axes
+        """
+        raise NotImplementedError("Requires probability/confidence values.")
 
 class SleepStatsEvaluation:
     """
@@ -340,7 +358,10 @@ class SleepStatsEvaluation:
 
     .. plot::
 
-        >>> sse.plot_discrepancies_heatmap()
+        >>> import matplotlib.pyplot as plt
+        >>> ax = sse.plot_discrepancies_heatmap()
+        >>> ax.set_title("Sleep statistic discrepancies")
+        >>> plt.tight_layout()
 
     .. plot::
 
@@ -470,13 +491,14 @@ def plot_discrepancies_heatmap(self, sstats_order=None, **kwargs):
 
         # Merge default heatmap arguments with optional input
         heatmap_kwargs = dict(cmap="binary", annot=True, fmt=".1f", square=False)
+        heatmap_kwargs["cbar_kws"] = dict(label="Normalized discrepancy %")
+        if "cbar_kws" in kwargs:
+            heatmap_kwargs["cbar_kws"].update(kwargs["cbar_kws"])
         heatmap_kwargs.update(kwargs)
         # Pivot for subject-rows and statistic-columns
-        table = self.data.pivot(
-            index=self.subject, columns=self.statistic, values="difference",
-        )
-        # Normalize statistics (i.e., columns) between zero and one
-        table_norm = table.sub(table.min(), axis=1).div(table.apply(np.ptp))
+        table = self.data.pivot(index=self.subject, columns=self.statistic, values="difference")
+        # Normalize statistics (i.e., columns) between zero and one then convert to percentage
+        table_norm = table.sub(table.min(), axis=1).div(table.apply(np.ptp)).multiply(100)
         # If annotating, replace with raw values for writing.
         if heatmap_kwargs["annot"]:
             heatmap_kwargs["annot"] = table[sstats_order].to_numpy()

From c042553680c68e863f7396afdccdd27b6a8f6715 Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Sat, 31 Dec 2022 15:21:38 -0600
Subject: [PATCH 06/43] pass kwargs through to all pingouin calls

---
 yasa/evaluation.py | 84 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 63 insertions(+), 21 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index 1360f4d..a684c63 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -277,8 +277,8 @@ def plot_roc(self, palette=None, ax=None, **kwargs):
             If a dictionary, keys are stages and values are corresponding colors.
         ax : :py:class:`matplotlib.axes.Axes`
             Axis on which to draw the plot, optional.
-        kwargs : dict
-            Keyword arguments passed to :py:func:`matplotlib.pyplot.plot`
+        **kwargs : key, value pairs
+            Additional keyword arguments are passed to the :py:func:`matplotlib.pyplot.plot` call.
 
         Returns
         -------
@@ -356,6 +356,32 @@ class SleepStatsEvaluation:
     SOL     False     False           True
     TST      True      True           True
 
+    Access more detailed statistical output of each test.
+
+    >>> sse.normality
+                  W      pval  normal
+    sstat
+    %N1    0.973407  0.824551    True
+    %N2    0.960684  0.557595    True
+    %N3    0.958591  0.516092    True
+    %REM   0.901733  0.044447   False
+    SE     0.926732  0.133580    True
+    SOL    0.774786  0.000372   False
+    TST    0.926733  0.133584    True
+    WASO   0.924288  0.119843    True
+
+    >>> sse.homoscedasticity.head(2)
+                  W      pval  equal_var
+    sstat
+    %N1    0.684833  0.508274       True
+    %N2    0.080359  0.922890       True
+
+    >>> sse.proportional_bias.round(3).head(2)
+            coef     se      T   pval     r2  adj_r2  CI[2.5%]  CI[97.5%]  unbiased
+    sstat
+    %N1   -0.487  0.314 -1.551  0.138  0.118   0.069    -1.146      0.172      True
+    %N2   -0.107  0.262 -0.409  0.688  0.009  -0.046    -0.658      0.444      True
+
     .. plot::
 
         >>> import matplotlib.pyplot as plt
@@ -396,27 +422,40 @@ def __init__(self, data, reference, test, subject, statistic):
         self.statistic = statistic
 
         # Run tests
-        self.test_normality()
-        self.test_proportional_bias()
-        self.test_homoscedasticity()
+        self.test_normality(method="shapiro", alpha=0.05)
+        self.test_proportional_bias(alpha=0.05)
+        self.test_homoscedasticity(method="levene", alpha=0.05)
 
-    def test_normality(self):
-        """Test reference data for normality at each sleep statistic."""
-        normality = self.data.groupby(self.statistic)[self.reference].apply(pg.normality)
+    def test_normality(self, **kwargs):
+        """Test reference data for normality at each sleep statistic.
+
+        Parameters
+        ----------
+        **kwargs : key, value pairs
+            Additional keyword arguments are passed to the :py:func:`pingouin.normality` call.
+        """
+        normality = self.data.groupby(self.statistic)[self.reference].apply(pg.normality, **kwargs)
         self.normality = normality.droplevel(-1)
 
-    def test_proportional_bias(self):
+    def test_proportional_bias(self, **kwargs):
         """Test each sleep statistic for proportional bias.
         
         For each statistic, regress the device difference score on the reference device score to get
         proportional bias and residuals that will be used for the later homoscedasticity
         calculation. Subject-level residuals for each statistic are added to ``data``.
+
+        Parameters
+        ----------
+        **kwargs : key, value pairs
+            Additional keyword arguments are passed to :py:func:`pingouin.linear_regression`.
         """
+        if "alpha" not in kwargs:
+            kwargs["alpha"] = 0.05
         prop_bias_results = []
         residuals_results = []
         for ss, ss_df in self.data.groupby(self.statistic):
             # Regress the difference score on the reference device
-            model = pg.linear_regression(ss_df[self.reference], ss_df["difference"])
+            model = pg.linear_regression(ss_df[self.reference], ss_df["difference"], **kwargs)
             model.insert(0, self.statistic, ss)
             # Extract the subject-level residuals
             resid = pd.DataFrame(
@@ -438,19 +477,22 @@ def test_proportional_bias(self):
         # Remove intercept rows
         prop_bias = prop_bias.query("names != 'Intercept'").drop(columns="names")
         # Add True/False passing column for easy access
-        prop_bias["unbiased"] = prop_bias["pval"].ge(0.05)
+        prop_bias["unbiased"] = prop_bias["pval"].ge(kwargs["alpha"])
         self.proportional_bias = prop_bias.set_index(self.statistic)
 
-    def test_homoscedasticity(self, method="levene"):
+    def test_homoscedasticity(self, **kwargs):
         """Test each statistic for homoscedasticity.
 
-        The ``method`` argument is passed to :py:func:`pingouin.homoscedasticity`.
+        Parameters
+        ----------
+        **kwargs : key, value pairs
+            Additional keyword arguments are passed to :py:func:`pingouin.homoscedasticity`.
 
         ..note:: ``self.test_proportional_bias()`` must be run first.
         """
         group = self.data.groupby(self.statistic)
         columns = [self.reference, "difference", "pbias_residual"]
-        homoscedasticity = group.apply(lambda df: pg.homoscedasticity(df[columns], method=method))
+        homoscedasticity = group.apply(lambda df: pg.homoscedasticity(df[columns], **kwargs))
         self.homoscedasticity = homoscedasticity.droplevel(-1)
 
     def summary(self, descriptives=True):
@@ -476,8 +518,8 @@ def plot_discrepancies_heatmap(self, sstats_order=None, **kwargs):
         ----------
         sstats_order : list
             List of sleep statistics to plot. Default (None) is to plot all sleep statistics.
-        kwargs : dict
-            Other keyword arguments are passed through to :py:func:`seaborn.heatmap`.
+        **kwargs : key, value pairs
+            Additional keyword arguments are passed to the :py:func:`seaborn.heatmap` call.
 
         Returns
         -------
@@ -515,8 +557,8 @@ def plot_discrepancies_dotplot(self, sstats_order=None, palette="winter", **kwar
             List of sleep statistics to plot. Default (None) is to plot all sleep statistics.
         palette : string, list, dict, or :py:class:`matplotlib.colors.Colormap`
             Color palette passed to :py:class:`seaborn.PairGrid`
-        kwargs : dict
-            Other keyword arguments are passed through to :py:func:`seaborn.stripplot`.
+        **kwargs : key, value pairs
+            Additional keyword arguments are passed to the :py:func:`seaborn.stripplot` call.
 
         Returns
         -------
@@ -568,9 +610,9 @@ def plot_blandaltman(self, sstats_order=None, facet_kwargs={}, **kwargs):
         sstats_order : list or None
             List of sleep statistics to plot. Default (None) is to plot all sleep statistics.
         facet_kwargs : dict
-            Other keyword arguments are passed through to :py:class:`seaborn.FacetGrid`.
-        kwargs : dict
-            Other keyword arguments are passed through to :py:func:`pingouin.plot_blandaltman`.
+            Keyword arguments passed to :py:class:`seaborn.FacetGrid`.
+        **kwargs : key, value pairs
+            Additional keyword arguments are passed to :py:func:`pingouin.plot_blandaltman`.
 
         Returns
         -------

From 70f3627549aaa1f33db1011c3ccc7bbb0cddd5d9 Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Sat, 31 Dec 2022 17:49:45 -0600
Subject: [PATCH 07/43] docstrings examples update

---
 yasa/evaluation.py | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index a684c63..942342d 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -117,7 +117,6 @@ class EpochByEpochEvaluation:
         >>>
         >>> acc = ebe.get_agreement().multiply(100).at["accuracy"]
         >>> ax.text(0.01, 1, f"Accuracy = {acc:.0f}%", ha="left", va="bottom", transform=ax.transAxes)
-
     """
     def __init__(self, hypno_ref, hypno_test):
         from yasa.hypno import Hypnogram  # Loading here to avoid circular import
@@ -324,22 +323,20 @@ class SleepStatsEvaluation:
     --------
     >>> import pandas as pd
     >>> import yasa
-    >>> results = []
+    >>>
+    >>> # For this example, generate a fake dataset of sleep statistics from two different raters
+    >>> data = []
     >>> for i in range(1, 21):
-    >>>     hypno_a = yasa.simulate_hypnogram(tib=600, scorer="Human", seed=i)
-    >>>     hypno_b = hypno_a.simulate_similar(scorer="YASA", seed=i + 99)
-    >>>     sstats_a = hypno_a.sleep_statistics()
-    >>>     sstats_b = hypno_b.sleep_statistics()
-    >>>     sstats_a["subject"] = f"sub-{i:03d}"
-    >>>     sstats_b["subject"] = f"sub-{i:03d}"
-    >>>     sstats_a["scorer"] = hypno_a.scorer
-    >>>     sstats_b["scorer"] = hypno_b.scorer
-    >>>     results.extend([sstats_a, sstats_b])
-    >>> 
-    >>> df = (pd.DataFrame(results)
-    >>>     .pivot(index="subject", columns="scorer")
-    >>>     .stack(0).rename_axis(["subject", "sstat"]).reset_index().rename_axis(None, axis=1)
-    >>>     .query("sstat.isin(['%N1', '%N2', '%N3', '%REM', 'SOL', 'SE', 'TST'])")
+    >>>     hypA = yasa.simulate_hypnogram(tib=600, seed=i)
+    >>>     hypB = hypA.simulate_similar(seed=i)
+    >>>     data.append({"subject": f"sub-{i:03d}", "rater": "RaterA"} | hypA.sleep_statistics())
+    >>>     data.append({"subject": f"sub-{i:03d}", "rater": "RaterB"} | hypB.sleep_statistics())
+    >>> df = (pd.json_normalize(data)
+    >>>     .melt(id_vars=["subject", "rater"], var_name="sstat", value_name="score")
+    >>>     .pivot(index=["subject", "sstat"], columns="rater", values="score")
+    >>>     .reset_index().rename_axis(None, axis=1)
+    >>>     .query("sstat.isin(['SE', 'TST', 'SOL', 'WASO', '%N1', '%N2', '%N3', '%REM'])")
+    >>> )
     >>>
     >>> sse = yasa.SleepStatsEvaluation(
     >>>     data=df, reference="RaterA", test="RaterB", subject="subject", statistic="sstat"

From cfaf8b6559c7df86cf9afe5b794a6250f059ea04 Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Sun, 1 Jan 2023 22:32:29 -0600
Subject: [PATCH 08/43] setting attrs, docstrings, var name changes

---
 yasa/evaluation.py | 267 ++++++++++++++++++++++++++++++---------------
 1 file changed, 181 insertions(+), 86 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index 942342d..8222e7b 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -32,15 +32,20 @@
 ]
 
 
+#############################################################################
+# EPOCH BY EPOCH
+#############################################################################
+
+
 class EpochByEpochEvaluation:
     """
     See :py:meth:`yasa.Hypnogram.evaluate`
 
     Parameters
     ----------
-    hypno_ref : :py:class:`yasa.Hypnogram`
-        Reference or ground-truth hypnogram.
-    hypno_test : :py:class:`yasa.Hypnogram`
+    refr_hyp : :py:class:`yasa.Hypnogram`
+        The reference or ground-truth hypnogram.
+    test_hyp : :py:class:`yasa.Hypnogram`
         The test or to-be-evaluated hypnogram.
 
     Notes
@@ -115,46 +120,88 @@ class EpochByEpochEvaluation:
         >>>     kwargs_ref=style_a, kwargs_test=style_b, legend=legend_style, ax=ax
         >>> )
         >>>
-        >>> acc = ebe.get_agreement().multiply(100).at["accuracy"]
-        >>> ax.text(0.01, 1, f"Accuracy = {acc:.0f}%", ha="left", va="bottom", transform=ax.transAxes)
+        >>> acc = ebe.get_agreement().multiply(100).round(0).at["accuracy"]
+        >>> ax.text(0.01, 1, f"Accuracy = {acc}%", ha="left", va="bottom", transform=ax.transAxes)
     """
-    def __init__(self, hypno_ref, hypno_test):
+    def __init__(self, refr_hyp, test_hyp):
         from yasa.hypno import Hypnogram  # Loading here to avoid circular import
-        assert isinstance(hypno_ref, Hypnogram), "`hypno_ref` must be a YASA Hypnogram"
-        assert isinstance(hypno_test, Hypnogram), "`hypno_test` must be a YASA Hypnogram"
-        assert hypno_ref.n_stages == hypno_test.n_stages, (
-            "`hypno_ref` and `hypno_test` must have the same `n_stages`")
-        if (n_ref := hypno_ref.n_epochs) != (n_test := hypno_test.n_epochs):
+        assert isinstance(refr_hyp, Hypnogram), "`refr_hyp` must be a YASA Hypnogram"
+        assert isinstance(test_hyp, Hypnogram), "`test_hyp` must be a YASA Hypnogram"
+        assert refr_hyp.scorer is not None, "`refr_hyp` must have a scorer label"
+        assert test_hyp.scorer is not None, "`test_hyp` must have a scorer label"
+        assert refr_hyp.scorer != test_hyp.scorer, (
+            "scorer must be unique for `refr_hyp` and `test_hyp`"
+        )
+        assert refr_hyp.n_stages == test_hyp.n_stages, (
+            "`refr_hyp` and `test_hyp` must have the same `n_stages`"
+        )
+        assert refr_hyp.labels == test_hyp.labels
+        assert refr_hyp.mapping == test_hyp.mapping
+        if (n_ref := refr_hyp.n_epochs) != (n_test := test_hyp.n_epochs):
             ## NOTE: would be nice to have a Hypnogram.trim() method for moments like this.
             if n_ref > n_test:
-                hypno_ref = Hypnogram(hypno_ref.hypno[:n_test], n_stages=hypno_ref.n_stages)
+                refr_hyp = Hypnogram(refr_hyp.hypno[:n_test], n_stages=refr_hyp.n_stages)
                 n_trimmed = n_ref - n_test
-                warn_msg = f"`hypno_ref` longer than `hypno_test`, trimmed to {n_test} epochs"
+                warn_msg = f"`refr_hyp` longer than `test_hyp`, trimmed to {n_test} epochs"
             else:
-                hypno_test = Hypnogram(hypno_test.hypno[:n_ref], n_stages=hypno_test.n_stages)
+                test_hyp = Hypnogram(test_hyp.hypno[:n_ref], n_stages=test_hyp.n_stages)
                 n_trimmed = n_test - n_ref
-                warn_msg = f"`hypno_test` longer than `hypno_ref`, {n_trimmed} epochs trimmed"
+                warn_msg = f"`test_hyp` longer than `refr_hyp`, {n_trimmed} epochs trimmed"
             ## Q: Should be downplayed as INFO?
             logger.warning(warn_msg)
-        self.hypno_ref = hypno_ref
-        self.hypno_test = hypno_test
+        
+        # Set attributes
+        self._refr_hyp = refr_hyp.copy()
+        self._test_hyp = test_hyp.copy()
+
+    def __repr__(self):
+        # TODO v0.8: Keep only the text between < and >
+        return (
+            f"<EpochByEpochEvaluation | Test Hypnogram scored by {self.refr_hyp.scorer} evaluated "
+            f"against reference Hypnogram scored by {self.test_hyp.scorer}>\n"
+            " - Use `.get_agreement()` to get agreement measures as a pandas.Series\n"
+            " - Use `.plot_hypnograms()` to plot the two hypnograms overlaid\n"
+            "See the online documentation for more details."
+        )
+
+    def __str__(self):
+        return (
+            f"<EpochByEpochEvaluation | Test Hypnogram scored by {self.refr_hyp.scorer} evaluated "
+            f"against reference Hypnogram scored by {self.test_hyp.scorer}>\n"
+            " - Use `.get_agreement()` to get agreement measures as a pandas.Series\n"
+            " - Use `.plot_hypnograms()` to plot the two hypnograms overlaid\n"
+            "See the online documentation for more details."
+        )
+
+    @property
+    def refr_hyp(self):
+        """The reference Hypnogram."""
+        ## Q: Starting to think there should be a clear convention on what we mean
+        ##    when we say "hypnogram". Should hypnogram mean the Series and Hypnogram
+        ##    mean the YASA object? Similarly for hypno/hyp.
+        return self._refr_hyp
+
+    @property
+    def test_hyp(self):
+        """The test Hypnogram."""
+        return self._test_hyp
 
     def get_agreement(self):
         """
-        Return a dataframe of ``hypno_ref``/``hypno_test`` performance
-        across all stages as measured by common classifier agreement methods.
+        Return a Series of ``refr_hyp``/``test_hyp`` performance across all stages as measured by
+        common classifier agreement methods.
 
+        .. seealso:: :py:meth:`yasa.EpochByEpochEvaluation.get_agreement_by_stage`
         ## Q: Are there better names to differentiate get_agreement vs get_agreement_by_stage?
         ##    Maybe should be binary vs multiclass?
-        .. seealso:: :py:meth:`yasa.EpochByEpochResults.get_agreement_by_stage`
 
         Returns
         -------
         agreement : :py:class:`pandas.Series`
             A :py:class:`pandas.Series` with agreement metrics as indices.
         """
-        true = self.hypno_ref.hypno.to_numpy()
-        pred = self.hypno_test.hypno.to_numpy()
+        true = self.refr_hyp.hypno.to_numpy()
+        pred = self.test_hyp.hypno.to_numpy()
         accuracy = metrics.accuracy_score(true, pred)
         kappa = metrics.cohen_kappa_score(true, pred)
         jaccard = metrics.jaccard_score(true, pred, average="weighted")
@@ -174,8 +221,8 @@ def get_agreement(self):
 
     def get_agreement_by_stage(self):
         """
-        Return a dataframe of ``hypno_ref``/``hypno_test`` performance
-        for each stage as measured by common classifier agreement methods.
+        Return a dataframe of ``refr_hyp``/``test_hyp`` performance for each stage as measured by
+        common classifier agreement methods.
 
         .. seealso:: :py:meth:`yasa.EpochByEpochResults.get_agreement`
 
@@ -184,9 +231,9 @@ def get_agreement_by_stage(self):
         agreement : :py:class:`pandas.DataFrame`
             A DataFrame with agreement metrics as indices and stages as columns.
         """
-        true = self.hypno_ref.hypno.to_numpy()
-        pred = self.hypno_test.hypno.to_numpy()
-        labels = self.hypno_ref.labels  # equivalent to hypno_test.labels
+        true = self.refr_hyp.hypno.to_numpy()
+        pred = self.test_hyp.hypno.to_numpy()
+        labels = self.test_hyp.labels  # Same as refr_hyp.labels
         scores = metrics.precision_recall_fscore_support(
             true, pred, labels=labels, average=None, zero_division=0
         )
@@ -196,42 +243,38 @@ def get_agreement_by_stage(self):
         return agreement
 
     def get_confusion_matrix(self):
-        """
-        Return ``hypno_ref``/``hypno_test``confusion matrix dataframe.
+        """Return a ``refr_hyp``/``test_hyp`` confusion matrix.
 
         Returns
         -------
         matrix : :py:class:`pandas.DataFrame`
-            A confusion matrix with stages of ``hypno_ref`` as indices and stages of
-            ``hypno_test`` as columns.
+            A confusion matrix with ``refr_hyp`` stages as indices and ``test_hyp`` stages as columns.
         """
         # Generate confusion matrix.
         matrix = pd.crosstab(
-            self.hypno_ref.hypno, self.hypno_test.hypno, margins=True, margins_name="Total"
+            self.refr_hyp.hypno, self.test_hyp.hypno, margins=True, margins_name="Total"
         )
         # Reorder indices in sensible order and to include all stages
-        matrix = matrix.reindex(self.hypno_ref.labels + ["Total"], axis=0)
-        matrix = matrix.reindex(self.hypno_test.labels + ["Total"], axis=1)
-        matrix = matrix.fillna(0).astype(int)
-        return matrix
+        matrix = matrix.reindex(labels=self.refr_hyp.labels + ["Total"], fill_value=0)
+        matrix = matrix.reindex(columns=self.test_hyp.labels + ["Total"], fill_value=0)
+        return matrix.astype(int)
 
-    def plot_hypnograms(self, legend=True, ax=None, kwargs_ref={}, kwargs_test={}):
-        """Plot the two hypnograms, ``hypno_test`` overlaid on ``hypno_ref``.
+    def plot_hypnograms(self, legend=True, ax=None, refr_kwargs={}, test_kwargs={}):
+        """Plot the two hypnograms, where ``test_hyp`` is overlaid on ``refr_hyp``.
 
         .. seealso:: :py:func:`yasa.plot_hypnogram`
 
         Parameters
         ----------
-        legend : bool or None
-            If True, a legend with default :py:func:`matplotlib.pyplot.legend` arguments is added.
-            If False, no legend is added. If a dictionary, a legend is added and the dictionary is
-            passed as keyword arguments to :py:func:`matplotlib.pyplot.legend`.
-        ax : :py:class:`matplotlib.axes.Axes`
+        legend : bool or dict
+            If True (default) or a dictionary, a legend is added. If a dictionary, all key/value
+            pairs are passed as keyword arguments to the :py:func:`matplotlib.pyplot.legend` call.
+        ax : :py:class:`matplotlib.axes.Axes` or None
             Axis on which to draw the plot, optional.
-        kwargs_ref : dict
-            Keyword arguments passed to :py:func:`yasa.plot_hypnogram` when plotting ``hypno_ref``.
-        kwargs_test : dict
-            Keyword arguments passed to :py:func:`yasa.plot_hypnogram` when plotting ``hypno_test``.
+        refr_kwargs : dict
+            Keyword arguments passed to :py:func:`yasa.plot_hypnogram` when plotting ``refr_hyp``.
+        test_kwargs : dict
+            Keyword arguments passed to :py:func:`yasa.plot_hypnogram` when plotting ``test_hyp``.
 
         Returns
         -------
@@ -247,20 +290,20 @@ def plot_hypnograms(self, legend=True, ax=None, kwargs_ref={}, kwargs_test={}):
             >>> ax = hyp.evaluate(hyp.simulate_similar()).plot_hypnograms()
         """
         assert isinstance(legend, (bool, dict)), "`legend` must be True, False, or a dictionary"
-        assert isinstance(kwargs_ref, dict), "`kwargs_ref` must be a dictionary"
-        assert isinstance(kwargs_test, dict), "`kwargs_test` must be a dictionary"
-        assert not "ax" in kwargs_ref | kwargs_test, (
-            "ax can't be supplied to `kwargs_ref` or `kwargs_test`, use the `ax` keyword instead"
+        assert isinstance(refr_kwargs, dict), "`refr_kwargs` must be a dictionary"
+        assert isinstance(test_kwargs, dict), "`test_kwargs` must be a dictionary"
+        assert not "ax" in refr_kwargs | test_kwargs, (
+            "ax can't be supplied to `kwargs_ref` or `test_kwargs`, use the `ax` keyword instead"
         )
-        plot_kwargs_ref = {"highlight": None, "alpha": 0.8}
-        plot_kwargs_test = {"highlight": None, "alpha": 0.8, "color": "darkcyan", "ls": "dashed"}
-        plot_kwargs_ref.update(kwargs_ref)
-        plot_kwargs_test.update(kwargs_test)
+        plot_refr_kwargs = {"highlight": None, "alpha": 0.8}
+        plot_test_kwargs = {"highlight": None, "alpha": 0.8, "color": "darkcyan", "ls": "dashed"}
+        plot_refr_kwargs.update(refr_kwargs)
+        plot_test_kwargs.update(test_kwargs)
         if ax is None:
             ax = plt.gca()
-        self.hypno_ref.plot_hypnogram(ax=ax, **plot_kwargs_ref)
-        self.hypno_test.plot_hypnogram(ax=ax, **plot_kwargs_test)
-        if legend and "label" in plot_kwargs_ref | plot_kwargs_test:
+        self.refr_hyp.plot_hypnogram(ax=ax, **plot_refr_kwargs)
+        self.test_hyp.plot_hypnogram(ax=ax, **plot_test_kwargs)
+        if legend and "label" in plot_refr_kwargs | plot_test_kwargs:
             if isinstance(legend, dict):
                 ax.legend(**legend)
             else:
@@ -284,20 +327,27 @@ def plot_roc(self, palette=None, ax=None, **kwargs):
         ax : :py:class:`matplotlib.axes.Axes`
             Matplotlib Axes
         """
+        # assert self.test_hyp.probas is not None
         raise NotImplementedError("Requires probability/confidence values.")
 
+
+#############################################################################
+# SLEEP STATISTICS
+#############################################################################
+
+
 class SleepStatsEvaluation:
     """
-    Evaluate agreement between two measurement devices by comparing summary sleep statistics across
-    multiple participants or sessions.
-
-    For example, the reference device might be PSG and the test device might be a wearable device.
+    Evaluate agreement between two measurement systems (e.g., two different manual scorers or one
+    manual scorer against YASA's automatic staging) by comparing their summary sleep statistics
+    derived from multiple subjects or sessions.
 
     Parameters
     ----------
     data : :py:class:`pandas.DataFrame`
-        A pandas dataframe with sleep statistics from two different
-        devices for multiple subjects
+        A :py:class:`pandas.DataFrame` with sleep statistics from two different measurement systems.
+        Each row contains the two different measurements of a single subject and sleep statistic.
+        Of shape (n_subjects x n_sleep_statistics, 4).
     reference : str
         Name of column containing the reference device sleep statistics.
     test : str
@@ -305,7 +355,7 @@ class SleepStatsEvaluation:
     subject : str
         Name of column containing the subject ID.
     statistic : str
-        Name of column containing the name of the sleep statistics.
+        Name of column containing the name of the sleep statistic.
 
     Notes
     -----
@@ -390,18 +440,24 @@ class SleepStatsEvaluation:
 
         >>> sse.plot_blandaltman()
     """
-    def __init__(self, data, reference, test, subject, statistic):
+    def __init__(self, data, *, reference, test, subject, statistic):
         assert isinstance(data, pd.DataFrame), "`data` must be a pandas DataFrame"
         for col in [reference, test, subject, statistic]:
-            assert isinstance(col, str) and col in data, f"`{col}` must be a string and a column in `data`"
+            assert isinstance(col, str) and col in data, (
+                f"`{col}` must be a string and a column in `data`"
+            )
         assert data[subject].nunique() > 1, "`data` must include more than one subject"
+        assert not data.groupby("subject")["sstat"].count().diff().any(), "same number of sstats for all subjects"
+        assert not data.groupby("subject")["sstat"].nunique().is_unique, "no repeated sstats per subject"
+        
+        # Don't update this, rename to something else like table.
         data = data.copy()
 
         # Get measurement difference between reference and test devices
         data["difference"] = data[test].sub(data[reference])
 
-        # Check for sleep statistics that have no differences between measurement devices.
-        # This is most likely to occur with TIB but is possible with any, and will break some functions.
+        # Remove sleep statistics that have no differences between measurement systems.
+        ## TODO: simplify once not manipulating _data
         stats_nodiff = data.groupby(statistic)["difference"].any().loc[lambda x: ~x].index
         for s in stats_nodiff:
             data = data.query(f"{statistic} != '{s}'")
@@ -411,18 +467,62 @@ def __init__(self, data, reference, test, subject, statistic):
         # Get list of all statistics to be evaluated
         self.all_sleepstats = data[statistic].unique()
 
-        # Save attributes
-        self.data = data
-        self.reference = reference
-        self.test = test
-        self.subject = subject
-        self.statistic = statistic
+        # Set attributes
+        self._data = data
+        self._reference = reference
+        self._test = test
+        self._subject = subject
+        self._statistic = statistic
 
         # Run tests
         self.test_normality(method="shapiro", alpha=0.05)
         self.test_proportional_bias(alpha=0.05)
         self.test_homoscedasticity(method="levene", alpha=0.05)
 
+    @property
+    def data(self):
+        """The summary dataframe of sleep statistics."""
+        return self._data
+
+    @property
+    def reference(self):
+        """The name of the column containing the reference measurement sleep statistics."""
+        return self._reference
+
+    @property
+    def test(self):
+        """The name of the column containing the test measurement sleep statistics."""
+        return self._test
+
+    @property
+    def subject(self):
+        """The name of the column containing the subject identifiers."""
+        return self._subject
+
+    @property
+    def statistic(self):
+        """The name of the column containing the sleep statistic name."""
+        return self._statistic
+
+    def __repr__(self):
+        # TODO v0.8: Keep only the text between < and >
+        return (
+            f"<SleepStatsEvaluation | Test measurement '{self.test}' evaluated against reference "
+            f"measurement '{self.reference}'>\n"
+            " - Use `.summary()` to get pass/fail values from various checks\n"
+            " - Use `.plot_blandaltman()` to get a Bland-Altman-plot grid for sleep statistics\n"
+            "See the online documentation for more details."
+        )
+
+    def __str__(self):
+        return (
+            f"<SleepStatsEvaluation | Test measurement '{self.test}' evaluated against reference "
+            f"measurement '{self.reference}'>\n"
+            " - Use `.summary()` to get pass/fail values from various checks\n"
+            " - Use `.plot_blandaltman()` to get a Bland-Altman-plot grid for sleep statistics\n"
+            "See the online documentation for more details."
+        )
+
     def test_normality(self, **kwargs):
         """Test reference data for normality at each sleep statistic.
 
@@ -485,7 +585,7 @@ def test_homoscedasticity(self, **kwargs):
         **kwargs : key, value pairs
             Additional keyword arguments are passed to :py:func:`pingouin.homoscedasticity`.
 
-        ..note:: ``self.test_proportional_bias()`` must be run first.
+        ..note:: :py:meth:`yasa.SleepStatsEvaluation.test_proportional_bias` must be called first.
         """
         group = self.data.groupby(self.statistic)
         columns = [self.reference, "difference", "pbias_residual"]
@@ -523,13 +623,12 @@ def plot_discrepancies_heatmap(self, sstats_order=None, **kwargs):
         ax : :py:class:`matplotlib.axes.Axes`
             Matplotlib Axes
         """
+        assert isinstance(sstats_order, (list, type(None))), "`sstats_order` must be a list or None"
         if sstats_order is None:
             sstats_order = self.all_sleepstats
-        else:
-            assert isinstance(sstats_order, (list, type(None))), "`sstats_order` must be a list"
 
         # Merge default heatmap arguments with optional input
-        heatmap_kwargs = dict(cmap="binary", annot=True, fmt=".1f", square=False)
+        heatmap_kwargs = {"cmap": "binary", "annot": True, "fmt": ".1f", "square": False}
         heatmap_kwargs["cbar_kws"] = dict(label="Normalized discrepancy %")
         if "cbar_kws" in kwargs:
             heatmap_kwargs["cbar_kws"].update(kwargs["cbar_kws"])
@@ -562,13 +661,12 @@ def plot_discrepancies_dotplot(self, sstats_order=None, palette="winter", **kwar
         g : :py:class:`seaborn.PairGrid`
             Seaborn PairGrid
         """
+        assert isinstance(sstats_order, (list, type(None))), "`sstats_order` must be a list or None"
         if sstats_order is None:
             sstats_order = self.all_sleepstats
-        else:
-            assert isinstance(sstats_order, (list, type(None))), "`sstats_order` must be a list"
 
         # Merge default stripplot arguments with optional input
-        stripplot_kwargs = dict(size=10, linewidth=1, edgecolor="white")
+        stripplot_kwargs = {"size": 10, "linewidth": 1, "edgecolor": "white"}
         stripplot_kwargs.update(kwargs)
 
         # Pivot data to get subject-rows and statistic-columns
@@ -597,7 +695,6 @@ def plot_discrepancies_dotplot(self, sstats_order=None, palette="winter", **kwar
             ax.yaxis.grid(True)
             ax.tick_params(left=False)
         sns.despine(left=True, bottom=True)
-
         return g
 
     def plot_blandaltman(self, sstats_order=None, facet_kwargs={}, **kwargs):
@@ -616,10 +713,9 @@ def plot_blandaltman(self, sstats_order=None, facet_kwargs={}, **kwargs):
         g : :py:class:`seaborn.FacetGrid`
             Seaborn FacetGrid
         """
+        assert isinstance(sstats_order, (list, type(None))), "`sstats_order` must be a list or None"
         if sstats_order is None:
             sstats_order = self.all_sleepstats
-        else:
-            assert isinstance(sstats_order, (list, type(None))), "`sstats_order` must be a list"
 
         # Select scatterplot arguments (passed to blandaltman) and update with optional input
         blandaltman_kwargs = dict(xaxis="y", annotate=False, edgecolor="black", facecolor="none")
@@ -645,5 +741,4 @@ def plot_blandaltman(self, sstats_order=None, facet_kwargs={}, **kwargs):
         g.set_ylabels(ylabel)
         g.set_titles(col_template="{col_name}")
         g.tight_layout(w_pad=1, h_pad=2)
-
         return g

From e490cc990319cb3884f28db5afe51e34f7987a09 Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Sun, 1 Jan 2023 23:38:10 -0600
Subject: [PATCH 09/43] SleepStatsEval takes 2 dataframes as input, reshaping
 is done internally

---
 yasa/evaluation.py | 181 +++++++++++++++++++++++----------------------
 1 file changed, 92 insertions(+), 89 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index 8222e7b..451f130 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -344,18 +344,12 @@ class SleepStatsEvaluation:
 
     Parameters
     ----------
-    data : :py:class:`pandas.DataFrame`
-        A :py:class:`pandas.DataFrame` with sleep statistics from two different measurement systems.
-        Each row contains the two different measurements of a single subject and sleep statistic.
-        Of shape (n_subjects x n_sleep_statistics, 4).
-    reference : str
-        Name of column containing the reference device sleep statistics.
-    test : str
-        Name of column containing the test device sleep statistics.
-    subject : str
-        Name of column containing the subject ID.
-    statistic : str
-        Name of column containing the name of the sleep statistic.
+    refr_data : :py:class:`pandas.DataFrame`
+        A :py:class:`pandas.DataFrame` with sleep statistics from the reference measurement system.
+        Rows are individual subjects and columns are individual sleep statistics.
+    test_data : :py:class:`pandas.DataFrame`
+        A :py:class:`pandas.DataFrame` with sleep statistics from the test measurement system.
+        Shape, indices, and columns must be identical to ``refr_data``.
 
     Notes
     -----
@@ -374,23 +368,14 @@ class SleepStatsEvaluation:
     >>> import pandas as pd
     >>> import yasa
     >>>
-    >>> # For this example, generate a fake dataset of sleep statistics from two different raters
-    >>> data = []
-    >>> for i in range(1, 21):
-    >>>     hypA = yasa.simulate_hypnogram(tib=600, seed=i)
-    >>>     hypB = hypA.simulate_similar(seed=i)
-    >>>     data.append({"subject": f"sub-{i:03d}", "rater": "RaterA"} | hypA.sleep_statistics())
-    >>>     data.append({"subject": f"sub-{i:03d}", "rater": "RaterB"} | hypB.sleep_statistics())
-    >>> df = (pd.json_normalize(data)
-    >>>     .melt(id_vars=["subject", "rater"], var_name="sstat", value_name="score")
-    >>>     .pivot(index=["subject", "sstat"], columns="rater", values="score")
-    >>>     .reset_index().rename_axis(None, axis=1)
-    >>>     .query("sstat.isin(['SE', 'TST', 'SOL', 'WASO', '%N1', '%N2', '%N3', '%REM'])")
-    >>> )
+    >>> # For this example, generate two fake datasets of sleep statistics
+    >>> hypsA = [yasa.simulate_hypnogram(tib=600, seed=i) for i in range(20)]
+    >>> hypsB = [h.simulate_similar(tib=600, seed=i) for i, h in enumerate(hypsA)]
+    >>> sstatsA = pd.Series(hypsA).map(lambda h: h.sleep_statistics()).apply(pd.Series)
+    >>> sstatsB = pd.Series(hypsB).map(lambda h: h.sleep_statistics()).apply(pd.Series)
+    >>> sstatsA.index = sstatsB.index = sstatsA.index.map(lambda x: f"sub-{x+1:03d}")
     >>>
-    >>> sse = yasa.SleepStatsEvaluation(
-    >>>     data=df, reference="RaterA", test="RaterB", subject="subject", statistic="sstat"
-    >>> )
+    >>> sse = yasa.SleepStatsEvaluation(sstatsA, sstatsB)
     >>>
     >>> sse.summary(descriptives=False)
            normal  unbiased  homoscedastic
@@ -440,75 +425,93 @@ class SleepStatsEvaluation:
 
         >>> sse.plot_blandaltman()
     """
-    def __init__(self, data, *, reference, test, subject, statistic):
-        assert isinstance(data, pd.DataFrame), "`data` must be a pandas DataFrame"
-        for col in [reference, test, subject, statistic]:
-            assert isinstance(col, str) and col in data, (
-                f"`{col}` must be a string and a column in `data`"
-            )
-        assert data[subject].nunique() > 1, "`data` must include more than one subject"
-        assert not data.groupby("subject")["sstat"].count().diff().any(), "same number of sstats for all subjects"
-        assert not data.groupby("subject")["sstat"].nunique().is_unique, "no repeated sstats per subject"
-        
-        # Don't update this, rename to something else like table.
-        data = data.copy()
+    def __init__(self, refr_data, test_data, *, refr_name="Reference", test_name="Test"):
+
+        assert isinstance(refr_data, pd.DataFrame), "`refr_data` must be a pandas DataFrame"
+        assert isinstance(test_data, pd.DataFrame), "`test_data` must be a pandas DataFrame"
+        assert np.array_equal(refr_data.index, test_data.index), "`refr_data` and `test_data` indices must be identical"
+        assert np.array_equal(refr_data.columns, test_data.columns), "`refr_data` and `test_data` columns must be identical"
+        assert refr_data.index.name == test_data.index.name, "`refr_data` and `test_data` index names must be identical"
+
+        # Set attributes
+        self._refr_data = refr_data
+        self._test_data = test_data
+        self._refr_name = refr_name
+        self._test_name = test_name
+        self._subj_name = "subject" if refr_data.index.name is None else refr_data.index.name
+
+        # Merge dataframes and reshape wide-to-long format
+        # Add levels to index
+        refr_data.index.name = self._subj_name
+        test_data.index.name = self._subj_name
+        df1 = pd.concat({refr_name: refr_data}, names=["measurement"])
+        df2 = pd.concat({test_name: test_data}, names=["measurement"])
+        df = pd.concat([df1, df2])
+        df = df.melt(var_name="sstat", ignore_index=False).reset_index(
+            ).pivot(columns="measurement", index=[self._subj_name, "sstat"], values="value"
+            ).reset_index().rename_axis(columns=None)
 
         # Get measurement difference between reference and test devices
-        data["difference"] = data[test].sub(data[reference])
+        df["difference"] = df[test_name].sub(df[refr_name])
 
         # Remove sleep statistics that have no differences between measurement systems.
         ## TODO: simplify once not manipulating _data
-        stats_nodiff = data.groupby(statistic)["difference"].any().loc[lambda x: ~x].index
+        stats_nodiff = df.groupby("sstat")["difference"].any().loc[lambda x: ~x].index.tolist()
+        df = df.query(f"~sstat.isin({stats_nodiff})")
         for s in stats_nodiff:
-            data = data.query(f"{statistic} != '{s}'")
             logger.warning(f"All {s} differences are zero, removing from evaluation.")
             ## Q: Should this be logged as just info?
 
+        # Set more attributes
+        self._data = df
         # Get list of all statistics to be evaluated
-        self.all_sleepstats = data[statistic].unique()
-
-        # Set attributes
-        self._data = data
-        self._reference = reference
-        self._test = test
-        self._subject = subject
-        self._statistic = statistic
+        self._all_sleepstats = df["sstat"].unique()
 
         # Run tests
         self.test_normality(method="shapiro", alpha=0.05)
         self.test_proportional_bias(alpha=0.05)
         self.test_homoscedasticity(method="levene", alpha=0.05)
 
+    # @property
+    # def data(self):
+    #     """The summary dataframe of sleep statistics."""
+    #     return self._data
+
+    @property
+    def refr_data(self):
+        """The dataframe of reference measurement sleep statistics."""
+        return self._refr_data
+
     @property
-    def data(self):
-        """The summary dataframe of sleep statistics."""
-        return self._data
+    def test_data(self):
+        """The dataframe of test measurement sleep statistics."""
+        return self._test_data
 
     @property
-    def reference(self):
-        """The name of the column containing the reference measurement sleep statistics."""
-        return self._reference
+    def refr_name(self):
+        """The name of the reference measurement."""
+        return self._refr_name
 
     @property
-    def test(self):
-        """The name of the column containing the test measurement sleep statistics."""
-        return self._test
+    def test_name(self):
+        """The name of the test measurement."""
+        return self._test_name
 
     @property
-    def subject(self):
-        """The name of the column containing the subject identifiers."""
-        return self._subject
+    def subj_name(self):
+        """The name of the subject identifier."""
+        return self._subj_name
 
     @property
-    def statistic(self):
-        """The name of the column containing the sleep statistic name."""
-        return self._statistic
+    def all_sleepstats(self):
+        """A list of all sleep statistics included in analysis."""
+        return self._all_sleepstats
 
     def __repr__(self):
         # TODO v0.8: Keep only the text between < and >
         return (
-            f"<SleepStatsEvaluation | Test measurement '{self.test}' evaluated against reference "
-            f"measurement '{self.reference}'>\n"
+            f"<SleepStatsEvaluation | Test measurement '{self.test_name}' evaluated against "
+            f"reference measurement '{self.refr_name}'>\n"
             " - Use `.summary()` to get pass/fail values from various checks\n"
             " - Use `.plot_blandaltman()` to get a Bland-Altman-plot grid for sleep statistics\n"
             "See the online documentation for more details."
@@ -516,8 +519,8 @@ def __repr__(self):
 
     def __str__(self):
         return (
-            f"<SleepStatsEvaluation | Test measurement '{self.test}' evaluated against reference "
-            f"measurement '{self.reference}'>\n"
+            f"<SleepStatsEvaluation | Test measurement '{self.test_name}' evaluated against "
+            f"reference measurement '{self.refr_name}'>\n"
             " - Use `.summary()` to get pass/fail values from various checks\n"
             " - Use `.plot_blandaltman()` to get a Bland-Altman-plot grid for sleep statistics\n"
             "See the online documentation for more details."
@@ -531,7 +534,7 @@ def test_normality(self, **kwargs):
         **kwargs : key, value pairs
             Additional keyword arguments are passed to the :py:func:`pingouin.normality` call.
         """
-        normality = self.data.groupby(self.statistic)[self.reference].apply(pg.normality, **kwargs)
+        normality = self._data.groupby("sstat")[self.refr_name].apply(pg.normality, **kwargs)
         self.normality = normality.droplevel(-1)
 
     def test_proportional_bias(self, **kwargs):
@@ -550,15 +553,15 @@ def test_proportional_bias(self, **kwargs):
             kwargs["alpha"] = 0.05
         prop_bias_results = []
         residuals_results = []
-        for ss, ss_df in self.data.groupby(self.statistic):
-            # Regress the difference score on the reference device
-            model = pg.linear_regression(ss_df[self.reference], ss_df["difference"], **kwargs)
-            model.insert(0, self.statistic, ss)
+        for ss, ss_df in self._data.groupby("sstat"):
+            # Regress the difference score on the reference measurements
+            model = pg.linear_regression(ss_df[self.refr_name], ss_df["difference"], **kwargs)
+            model.insert(0, "sstat", ss)
             # Extract the subject-level residuals
             resid = pd.DataFrame(
                 {
-                    self.subject: ss_df[self.subject],
-                    self.statistic: ss,
+                    self.subj_name: ss_df[self.subj_name],
+                    "sstat": ss,  # Or ss_df["sstat"]?
                     "pbias_residual": model.residuals_
                 }
             )
@@ -566,7 +569,7 @@ def test_proportional_bias(self, **kwargs):
             residuals_results.append(resid)
         # Add residuals to raw dataframe, used later when testing homoscedasticity
         residuals = pd.concat(residuals_results)
-        self.data = self.data.merge(residuals, on=[self.subject, self.statistic])
+        self._data = self._data.merge(residuals, on=[self.subj_name, "sstat"])
         # Handle proportional bias results
         prop_bias = pd.concat(prop_bias_results)
         # Save all the proportional bias models before removing intercept, for optional user access
@@ -575,7 +578,7 @@ def test_proportional_bias(self, **kwargs):
         prop_bias = prop_bias.query("names != 'Intercept'").drop(columns="names")
         # Add True/False passing column for easy access
         prop_bias["unbiased"] = prop_bias["pval"].ge(kwargs["alpha"])
-        self.proportional_bias = prop_bias.set_index(self.statistic)
+        self.proportional_bias = prop_bias.set_index("sstat")
 
     def test_homoscedasticity(self, **kwargs):
         """Test each statistic for homoscedasticity.
@@ -587,8 +590,8 @@ def test_homoscedasticity(self, **kwargs):
 
         ..note:: :py:meth:`yasa.SleepStatsEvaluation.test_proportional_bias` must be called first.
         """
-        group = self.data.groupby(self.statistic)
-        columns = [self.reference, "difference", "pbias_residual"]
+        group = self._data.groupby("sstat")
+        columns = [self.refr_name, "difference", "pbias_residual"]
         homoscedasticity = group.apply(lambda df: pg.homoscedasticity(df[columns], **kwargs))
         self.homoscedasticity = homoscedasticity.droplevel(-1)
 
@@ -602,7 +605,7 @@ def summary(self, descriptives=True):
         ]
         summary = pd.concat(series_list, axis=1)
         if descriptives:
-            group = self.data.drop(columns=self.subject).groupby(self.statistic)
+            group = self._data.drop(columns=self.subj_name).groupby("sstat")
             desc = group.agg(["mean", "std"])
             desc.columns = desc.columns.map("_".join)
             summary = summary.join(desc)
@@ -634,7 +637,7 @@ def plot_discrepancies_heatmap(self, sstats_order=None, **kwargs):
             heatmap_kwargs["cbar_kws"].update(kwargs["cbar_kws"])
         heatmap_kwargs.update(kwargs)
         # Pivot for subject-rows and statistic-columns
-        table = self.data.pivot(index=self.subject, columns=self.statistic, values="difference")
+        table = self._data.pivot(index=self.subj_name, columns="sstat", values="difference")
         # Normalize statistics (i.e., columns) between zero and one then convert to percentage
         table_norm = table.sub(table.min(), axis=1).div(table.apply(np.ptp)).multiply(100)
         # If annotating, replace with raw values for writing.
@@ -670,7 +673,7 @@ def plot_discrepancies_dotplot(self, sstats_order=None, palette="winter", **kwar
         stripplot_kwargs.update(kwargs)
 
         # Pivot data to get subject-rows and statistic-columns
-        table = self.data.pivot(index=self.subject, columns=self.statistic, values="difference")
+        table = self._data.pivot(index=self.subj_name, columns="sstat", values="difference")
 
         # Initialize the PairGrid
         height = 0.3 * len(table)
@@ -678,8 +681,8 @@ def plot_discrepancies_dotplot(self, sstats_order=None, palette="winter", **kwar
         g = sns.PairGrid(
             table.reset_index(),
             x_vars=sstats_order,
-            y_vars=[self.subject],
-            hue=self.subject,
+            y_vars=[self.subj_name],
+            hue=self.subj_name,
             palette=palette,
             height=height,
             aspect=aspect,
@@ -726,9 +729,9 @@ def plot_blandaltman(self, sstats_order=None, facet_kwargs={}, **kwargs):
         facetgrid_kwargs.update(facet_kwargs)
 
         # Initialize a grid of plots with an Axes for each sleep statistic
-        g = sns.FacetGrid(self.data, col=self.statistic, col_order=sstats_order, **facetgrid_kwargs)
+        g = sns.FacetGrid(self._data, col="sstat", col_order=sstats_order, **facetgrid_kwargs)
         # Draw Bland-Altman on each axis
-        g.map(pg.plot_blandaltman, self.test, self.reference, **blandaltman_kwargs)
+        g.map(pg.plot_blandaltman, self.test_name, self.refr_name, **blandaltman_kwargs)
 
         # Tidy-up axis limits with symmetric y-axis and minimal ticks
         for ax in g.axes.flat:
@@ -737,7 +740,7 @@ def plot_blandaltman(self, sstats_order=None, facet_kwargs={}, **kwargs):
             ax.yaxis.set_major_locator(plt.MaxNLocator(nbins=2, integer=True, symmetric=True))
             ax.xaxis.set_major_locator(plt.MaxNLocator(nbins=1, integer=True))
         # More aesthetics
-        ylabel = " - ".join((self.test, self.reference))
+        ylabel = " - ".join((self.test_name, self.refr_name))
         g.set_ylabels(ylabel)
         g.set_titles(col_template="{col_name}")
         g.tight_layout(w_pad=1, h_pad=2)

From a8a3b1c47134fbaf961f18f8964693dfa16d255b Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Mon, 2 Jan 2023 03:44:16 -0600
Subject: [PATCH 10/43] EpochByEpoch accepts sequences of Hypnograms for group
 evaluation

---
 yasa/evaluation.py | 208 +++++++++++++++++++++++++++++++++------------
 1 file changed, 154 insertions(+), 54 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index 451f130..39406aa 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -123,70 +123,146 @@ class EpochByEpochEvaluation:
         >>> acc = ebe.get_agreement().multiply(100).round(0).at["accuracy"]
         >>> ax.text(0.01, 1, f"Accuracy = {acc}%", ha="left", va="bottom", transform=ax.transAxes)
     """
-    def __init__(self, refr_hyp, test_hyp):
+    def __init__(self, refr_hyps, test_hyps):
         from yasa.hypno import Hypnogram  # Loading here to avoid circular import
-        assert isinstance(refr_hyp, Hypnogram), "`refr_hyp` must be a YASA Hypnogram"
-        assert isinstance(test_hyp, Hypnogram), "`test_hyp` must be a YASA Hypnogram"
-        assert refr_hyp.scorer is not None, "`refr_hyp` must have a scorer label"
-        assert test_hyp.scorer is not None, "`test_hyp` must have a scorer label"
-        assert refr_hyp.scorer != test_hyp.scorer, (
-            "scorer must be unique for `refr_hyp` and `test_hyp`"
+
+        assert isinstance(refr_hyps, Hypnogram) or hasattr(refr_hyps, "__iter__"), (
+            "`refr_hyps` must be a YASA hypnogram or iterable containing multiple YASA hypnograms"
         )
-        assert refr_hyp.n_stages == test_hyp.n_stages, (
-            "`refr_hyp` and `test_hyp` must have the same `n_stages`"
+        assert isinstance(test_hyps, Hypnogram) or hasattr(test_hyps, "__iter__"), (
+            "`test_hyps` must be a YASA hypnogram or iterable containing multiple YASA hypnograms"
         )
-        assert refr_hyp.labels == test_hyp.labels
-        assert refr_hyp.mapping == test_hyp.mapping
-        if (n_ref := refr_hyp.n_epochs) != (n_test := test_hyp.n_epochs):
-            ## NOTE: would be nice to have a Hypnogram.trim() method for moments like this.
-            if n_ref > n_test:
-                refr_hyp = Hypnogram(refr_hyp.hypno[:n_test], n_stages=refr_hyp.n_stages)
-                n_trimmed = n_ref - n_test
-                warn_msg = f"`refr_hyp` longer than `test_hyp`, trimmed to {n_test} epochs"
-            else:
-                test_hyp = Hypnogram(test_hyp.hypno[:n_ref], n_stages=test_hyp.n_stages)
-                n_trimmed = n_test - n_ref
-                warn_msg = f"`test_hyp` longer than `refr_hyp`, {n_trimmed} epochs trimmed"
-            ## Q: Should be downplayed as INFO?
-            logger.warning(warn_msg)
+        assert type(refr_hyps) == type(test_hyps), "`refr_hyps` and `test_hyps` must be same type"
+
+        # Convert solo hypnograms to len==1 lists
+        if isinstance(refr_hyps, Hypnogram):  # As below, picking refr_hyps for checks arbitrarily
+            refr_hyps = [refr_hyps]
+            test_hyps = [test_hyps]
+        else:
+            assert all(isinstance(hyp, Hypnogram) for hyp in refr_hyps)
+            assert all(isinstance(hyp, Hypnogram) for hyp in test_hyps)
+
+        assert len(refr_hyps) == len(test_hyps), "must have same number of subjects"
+
+        if isinstance(refr_hyps, dict):
+            assert list(refr_hyps) == list(test_hyps), "must have same subject identifiers and in same order"
+            subjects, refr_hyps = zip(*refr_hyps.items())
+            # assert all(isinstance(s, str) for s in subjects)
+            test_hyps = tuple(test_hyps.values())  # paired with `subjects` by position, hence the order check
+        else:
+            subjects = 1 + np.arange(len(refr_hyps))
+
+        all_hyps = refr_hyps + test_hyps
+        assert all(h.scorer is not None for h in all_hyps), "all hypnograms must have a scorer"
+        for h1, h2 in zip(all_hyps[:-1], all_hyps[1:]):
+            assert h1.n_stages == h2.n_stages, "all hypnograms must have the same n_stages"
+            assert h1.labels == h2.labels, "all hypnograms must have the same labels"
+            assert h1.mapping == h2.mapping, "all hypnograms must have the same mapping"
+        assert all(h1.scorer == h2.scorer for h1, h2 in zip(refr_hyps[:-1], refr_hyps[1:])), "all `refr_hyps` must have the same scorer"
+        assert all(h1.scorer == h2.scorer for h1, h2 in zip(test_hyps[:-1], test_hyps[1:])), "all `test_hyps` must have the same scorer"
+        assert all(h1.scorer != h2.scorer for h1, h2 in zip(refr_hyps, test_hyps)), "`refr_hyps` and `test_hyps` must have unique scorers"
+        ## Could use set() for those above
+        ## Or set scorer as the first available and check all equal
+
+        ## TODO: trim each hypno
+        # if (n_ref := refr_hyp.n_epochs) != (n_test := test_hyp.n_epochs):
+        #     ## NOTE: would be nice to have a Hypnogram.trim() method for moments like this.
+        #     if n_ref > n_test:
+        #         refr_hyp = Hypnogram(refr_hyp.hypno[:n_test], n_stages=refr_hyp.n_stages)
+        #         n_trimmed = n_ref - n_test
+        #         warn_msg = f"`refr_hyp` longer than `test_hyp`, trimmed to {n_test} epochs"
+        #     else:
+        #         test_hyp = Hypnogram(test_hyp.hypno[:n_ref], n_stages=test_hyp.n_stages)
+        #         n_trimmed = n_test - n_ref
+        #         warn_msg = f"`test_hyp` longer than `refr_hyp`, {n_trimmed} epochs trimmed"
+        #     ## Q: Should be downplayed as INFO?
+        #     logger.warning(warn_msg)
+
+        # Convert to dictionaries with subjects and hypnograms
+        refr_hyps = { s: h for s, h in zip(subjects, refr_hyps) }
+        test_hyps = { s: h for s, h in zip(subjects, test_hyps) }
+
+        # Merge all hypnograms into a single multiindexed dataframe
+        refr = pd.concat(pd.concat({s: h.hypno}, names=["subject"]) for s, h in refr_hyps.items())
+        test = pd.concat(pd.concat({s: h.hypno}, names=["subject"]) for s, h in test_hyps.items())
+        data = pd.concat([refr, test], axis=1)
         
         # Set attributes
-        self._refr_hyp = refr_hyp.copy()
-        self._test_hyp = test_hyp.copy()
+        self._data = data
+        self._subjects = subjects
+        self._n_subjects = len(subjects)
+        self._refr_hyps = refr_hyps
+        self._test_hyps = test_hyps
+        self._refr_name = refr_hyps[subjects[0]].scorer
+        self._test_name = test_hyps[subjects[0]].scorer
+        self._n_stages = refr_hyps[subjects[0]].n_stages
+        self._labels = refr_hyps[subjects[0]].labels
 
     def __repr__(self):
         # TODO v0.8: Keep only the text between < and >
+        text_subjects = f", {self.n_subjects} subject" + ("s" if self.n_subjects > 1 else "")
         return (
-            f"<EpochByEpochEvaluation | Test Hypnogram scored by {self.refr_hyp.scorer} evaluated "
-            f"against reference Hypnogram scored by {self.test_hyp.scorer}>\n"
+            f"<EpochByEpochEvaluation | Test Hypnogram scored by {self.test_name} evaluated "
+            f"against reference Hypnogram scored by {self.refr_name}{text_subjects}>\n"
             " - Use `.get_agreement()` to get agreement measures as a pandas.Series\n"
             " - Use `.plot_hypnograms()` to plot the two hypnograms overlaid\n"
             "See the online documentation for more details."
         )
 
     def __str__(self):
+        text_subjects = f", {self.n_subjects} subject" + ("s" if self.n_subjects > 1 else "")
         return (
-            f"<EpochByEpochEvaluation | Test Hypnogram scored by {self.refr_hyp.scorer} evaluated "
-            f"against reference Hypnogram scored by {self.test_hyp.scorer}>\n"
+            f"<EpochByEpochEvaluation | Test Hypnogram scored by {self.test_name} evaluated "
+            f"against reference Hypnogram scored by {self.refr_name}{text_subjects}>\n"
             " - Use `.get_agreement()` to get agreement measures as a pandas.Series\n"
             " - Use `.plot_hypnograms()` to plot the two hypnograms overlaid\n"
             "See the online documentation for more details."
         )
 
     @property
-    def refr_hyp(self):
-        """The reference Hypnogram."""
+    def data(self):
+        return self._data
+
+    @property
+    def refr_hyps(self):
+        """The reference Hypnograms."""
         ## Q: Starting to think there should be a clear convention on what we mean
         ##    when we say "hypnogram". Should hypnogram mean the Series and Hypnogram
         ##    mean the YASA object? Similarly for hypno/hyp.
-        return self._refr_hyp
+        return self._refr_hyps
+
+    @property
+    def test_hyps(self):
+        """The test Hypnograms."""
+        return self._test_hyps
+
+    @property
+    def subjects(self):
+        return self._subjects
 
     @property
-    def test_hyp(self):
-        """The test Hypnogram."""
-        return self._test_hyp
+    def n_subjects(self):
+        return self._n_subjects
 
-    def get_agreement(self):
+    @property
+    def refr_name(self):
+        """The name of the reference measurement."""
+        return self._refr_name
+
+    @property
+    def test_name(self):
+        """The name of the test measurement."""
+        return self._test_name
+
+    @property
+    def labels(self):
+        return self._labels
+
+    @property
+    def n_stages(self):
+        return self._n_stages
+
+    def get_agreement(self, subject=None):
         """
         Return a dataframe of ``refr_hyp``/``test_hyp`` performance across all stages as measured by
         common classifier agreement methods.
@@ -195,13 +271,23 @@ def get_agreement(self):
         ## Q: Are there better names to differentiate get_agreement vs get_agreement_by_stage?
         ##    Maybe should be binary vs multiclass?
 
+        Parameters
+        ----------
+        self : :py:class:`yasa.EpochByEpochEvaluation`
+            A :py:class:`yasa.EpochByEpochEvaluation` instance.
+        subject : None or a unique subject identifier.
+            Subject identifiers are based on user input, and integers starting from 1 if not provided.
+
         Returns
         -------
         agreement : :py:class:`pandas.Series`
             A :py:class:`pandas.Series` with agreement metrics as indices.
         """
-        true = self.refr_hyp.hypno.to_numpy()
-        pred = self.test_hyp.hypno.to_numpy()
+        true = self.data[self.refr_name]
+        pred = self.data[self.test_name]
+        if subject is not None:  # restrict both series to the requested subject's epochs
+            true = true.loc[subject]
+            pred = pred.loc[subject]
         accuracy = metrics.accuracy_score(true, pred)
         kappa = metrics.cohen_kappa_score(true, pred)
         jaccard = metrics.jaccard_score(true, pred, average="weighted")
@@ -219,7 +305,7 @@ def get_agreement(self):
         agreement = pd.Series(scores, name="agreement").rename_axis("metric")
         return agreement
 
-    def get_agreement_by_stage(self):
+    def get_agreement_by_stage(self, subject=None):
         """
         Return a dataframe of ``refr_hyp``/``test_hyp`` performance for each stage as measured by
         common classifier agreement methods.
@@ -231,18 +317,20 @@ def get_agreement_by_stage(self):
         agreement : :py:class:`pandas.DataFrame`
             A DataFrame with agreement metrics as indices and stages as columns.
         """
-        true = self.refr_hyp.hypno.to_numpy()
-        pred = self.test_hyp.hypno.to_numpy()
-        labels = self.test_hyp.labels  # Same as refr_hyp.labels
+        true = self.data[self.refr_name]
+        pred = self.data[self.test_name]
+        if subject is not None:
+            true = true.loc[subject]
+            pred = pred.loc[subject]
         scores = metrics.precision_recall_fscore_support(
-            true, pred, labels=labels, average=None, zero_division=0
+            true, pred, labels=self.labels, average=None, zero_division=0
         )
         agreement = pd.DataFrame(scores)
         agreement.index = pd.Index(["precision", "recall", "fscore", "support"], name="metric")
-        agreement.columns = pd.Index(labels, name="stage")
+        agreement.columns = pd.Index(self.labels, name="stage")
         return agreement
 
-    def get_confusion_matrix(self):
+    def get_confusion_matrix(self, subject=None):
         """Return a ``refr_hyp``/``test_hyp``confusion matrix.
 
         Returns
@@ -250,16 +338,19 @@ def get_confusion_matrix(self):
         matrix : :py:class:`pandas.DataFrame`
             A confusion matrix with ``refr_hyp`` stages as indices and ``test_hyp`` stages as columns.
         """
+        true = self.data[self.refr_name]
+        pred = self.data[self.test_name]
+        if subject is not None:
+            true = true.loc[subject]
+            pred = pred.loc[subject]
         # Generate confusion matrix.
-        matrix = pd.crosstab(
-            self.refr_hyp.hypno, self.test_hyp.hypno, margins=True, margins_name="Total"
-        )
+        matrix = pd.crosstab(true, pred, margins=True, margins_name="Total")
         # Reorder indices in sensible order and to include all stages
-        matrix = matrix.reindex(labels=self.refr_hyp.labels + ["Total"], fill_value=0)
-        matrix = matrix.reindex(columns=self.test_hyp.labels + ["Total"], fill_value=0)
+        index_col_labels = self.labels + ["Total"]
+        matrix = matrix.reindex(index=index_col_labels, columns=index_col_labels, fill_value=0)
         return matrix.astype(int)
 
-    def plot_hypnograms(self, legend=True, ax=None, refr_kwargs={}, test_kwargs={}):
+    def plot_hypnograms(self, subject=None, legend=True, ax=None, refr_kwargs={}, test_kwargs={}):
         """Plot the two hypnograms, where ``refr_hyp`` is overlaid on ``refr_hyp``.
 
         .. seealso:: :py:func:`yasa.plot_hypnogram`
@@ -289,6 +380,15 @@ def plot_hypnograms(self, legend=True, ax=None, refr_kwargs={}, test_kwargs={}):
             >>> hyp = simulate_hypnogram(seed=7)
             >>> ax = hyp.evaluate(hyp.simulate_similar()).plot_hypnograms()
         """
+        if subject is None:
+            if self.n_subjects == 1:
+                refr_hyp = self.refr_hyps[self.subjects[0]]
+                test_hyp = self.test_hyps[self.subjects[0]]
+            else:
+                raise NotImplementedError("Plotting is currently allowed for only one subject")
+        else:
+            refr_hyp = self.refr_hyps[subject]
+            test_hyp = self.test_hyps[subject]
         assert isinstance(legend, (bool, dict)), "`legend` must be True, False, or a dictionary"
         assert isinstance(refr_kwargs, dict), "`refr_kwargs` must be a dictionary"
         assert isinstance(test_kwargs, dict), "`test_kwargs` must be a dictionary"
@@ -301,8 +401,8 @@ def plot_hypnograms(self, legend=True, ax=None, refr_kwargs={}, test_kwargs={}):
         plot_test_kwargs.update(test_kwargs)
         if ax is None:
             ax = plt.gca()
-        self.refr_hyp.plot_hypnogram(ax=ax, **plot_refr_kwargs)
-        self.test_hyp.plot_hypnogram(ax=ax, **plot_test_kwargs)
+        refr_hyp.plot_hypnogram(ax=ax, **plot_refr_kwargs)
+        test_hyp.plot_hypnogram(ax=ax, **plot_test_kwargs)
         if legend and "label" in plot_refr_kwargs | plot_test_kwargs:
             if isinstance(legend, dict):
                 ax.legend(**legend)
@@ -310,7 +410,7 @@ def plot_hypnograms(self, legend=True, ax=None, refr_kwargs={}, test_kwargs={}):
                 ax.legend()
         return ax
 
-    def plot_roc(self, palette=None, ax=None, **kwargs):
+    def plot_roc(self, subject=None, palette=None, ax=None, **kwargs):
         """Plot ROC curves for each stage.
 
         Parameters

From a9b784af7964ac9ff5b06f4bbff170dd762da783 Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Mon, 2 Jan 2023 15:53:20 -0600
Subject: [PATCH 11/43] EpochByEpoch gets sleep stats

---
 yasa/evaluation.py | 94 ++++++++++++++++++++++++----------------------
 1 file changed, 49 insertions(+), 45 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index 39406aa..2caf99d 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -138,10 +138,6 @@ def __init__(self, refr_hyps, test_hyps):
         if isinstance(refr_hyps, Hypnogram):  # As below, picking refr_hyps for checks arbitrarily
             refr_hyps = [refr_hyps]
             test_hyps = [test_hyps]
-        else:
-            assert all(isinstance(hyp, Hypnogram) for hyp in refr_hyps)
-            assert all(isinstance(hyp, Hypnogram) for hyp in test_hyps)
-
         assert len(refr_hyps) == len(test_hyps), "must have same number of subjects"
 
         if isinstance(refr_hyps, dict):
@@ -153,6 +149,7 @@ def __init__(self, refr_hyps, test_hyps):
             subjects = 1 + np.arange(len(refr_hyps))
 
         all_hyps = refr_hyps + test_hyps
+        assert all(isinstance(hyp, Hypnogram) for hyp in all_hyps), "`refr_hyps` and `test_hyps` must only include hypnograms"
         assert all(h.scorer is not None for h in all_hyps), "all hypnograms must have a scorer"
         for h1, h2 in zip(all_hyps[:-1], all_hyps[1:]):
             assert h1.n_stages == h2.n_stages, "all hypnograms must have the same n_stages"
@@ -160,24 +157,11 @@ def __init__(self, refr_hyps, test_hyps):
             assert h1.mapping == h2.mapping, "all hypnograms must have the same mapping"
         assert all(h1.scorer == h2.scorer for h1, h2 in zip(refr_hyps[:-1], refr_hyps[1:])), "all `refr_hyps` must have the same scorer"
         assert all(h1.scorer == h2.scorer for h1, h2 in zip(test_hyps[:-1], test_hyps[1:])), "all `test_hyps` must have the same scorer"
-        assert all(h1.scorer != h2.scorer for h1, h2 in zip(refr_hyps, test_hyps)), "`refr_hyps` and `test_hyps` must have unique scorers"
+        assert all(h1.scorer != h2.scorer for h1, h2 in zip(refr_hyps, test_hyps)), "each `refr_hyps` and `test_hyps` pair must have unique scorers"
+        assert all(h1.n_epochs == h2.n_epochs for h1, h2 in zip(refr_hyps, test_hyps)), "each `refr_hyps` and `test_hyps` pair must have the same n_epochs"
         ## Could use set() for those above
         ## Or set scorer as the first available and check all equal
 
-        ## TODO: trim each hypno
-        # if (n_ref := refr_hyp.n_epochs) != (n_test := test_hyp.n_epochs):
-        #     ## NOTE: would be nice to have a Hypnogram.trim() method for moments like this.
-        #     if n_ref > n_test:
-        #         refr_hyp = Hypnogram(refr_hyp.hypno[:n_test], n_stages=refr_hyp.n_stages)
-        #         n_trimmed = n_ref - n_test
-        #         warn_msg = f"`refr_hyp` longer than `test_hyp`, trimmed to {n_test} epochs"
-        #     else:
-        #         test_hyp = Hypnogram(test_hyp.hypno[:n_ref], n_stages=test_hyp.n_stages)
-        #         n_trimmed = n_test - n_ref
-        #         warn_msg = f"`test_hyp` longer than `refr_hyp`, {n_trimmed} epochs trimmed"
-        #     ## Q: Should be downplayed as INFO?
-        #     logger.warning(warn_msg)
-
         # Convert to dictionaries with subjects and hypnograms
         refr_hyps = { s: h for s, h in zip(subjects, refr_hyps) }
         test_hyps = { s: h for s, h in zip(subjects, test_hyps) }
@@ -186,6 +170,13 @@ def __init__(self, refr_hyps, test_hyps):
         refr = pd.concat(pd.concat({s: h.hypno}, names=["subject"]) for s, h in refr_hyps.items())
         test = pd.concat(pd.concat({s: h.hypno}, names=["subject"]) for s, h in test_hyps.items())
         data = pd.concat([refr, test], axis=1)
+
+        # Get summary sleep statistics for each measurement.
+        refr_sstats = pd.Series(refr_hyps).map(lambda h: h.sleep_statistics()).apply(pd.Series)
+        test_sstats = pd.Series(test_hyps).map(lambda h: h.sleep_statistics()).apply(pd.Series)
+        refr_sstats = refr_sstats.set_index(pd.Index(subjects, name="subject"))
+        test_sstats = test_sstats.set_index(pd.Index(subjects, name="subject"))
+        # sse = yasa.SleepStatsEvaluation(refr_sstats, test_sstats)
         
         # Set attributes
         self._data = data
@@ -193,6 +184,8 @@ def __init__(self, refr_hyps, test_hyps):
         self._n_subjects = len(subjects)
         self._refr_hyps = refr_hyps
         self._test_hyps = test_hyps
+        self._refr_sstats = refr_sstats
+        self._test_sstats = test_sstats
         self._refr_name = refr_hyps[subjects[0]].scorer
         self._test_name = test_hyps[subjects[0]].scorer
         self._n_stages = refr_hyps[subjects[0]].n_stages
@@ -223,6 +216,14 @@ def __str__(self):
     def data(self):
         return self._data
 
+    @property
+    def refr_sstats(self):
+        return self._refr_sstats
+
+    @property
+    def test_sstats(self):
+        return self._test_sstats
+
     @property
     def refr_hyps(self):
         """The reference Hypnograms."""
@@ -546,36 +547,37 @@ def __init__(self, refr_data, test_data, *, refr_name="Reference", test_name="Te
         test_data.index.name = self._subj_name
         df1 = pd.concat({refr_name: refr_data}, names=["measurement"])
         df2 = pd.concat({test_name: test_data}, names=["measurement"])
-        df = pd.concat([df1, df2])
-        df = df.melt(var_name="sstat", ignore_index=False).reset_index(
-            ).pivot(columns="measurement", index=[self._subj_name, "sstat"], values="value"
-            ).reset_index().rename_axis(columns=None)
-
-        # Get measurement difference between reference and test devices
-        df["difference"] = df[test_name].sub(df[refr_name])
+        df3 = pd.concat({"difference": test_data.sub(refr_data)}, names=["measurement"])
+        data = (pd.concat([df1, df2, df3])
+            .melt(var_name="sstat", ignore_index=False).reset_index()
+            .pivot(columns="measurement", index=[self._subj_name, "sstat"], values="value")
+            .reset_index().rename_axis(columns=None)
+        )
+        # # Get measurement difference between reference and test devices
+        # df["difference"] = df[test_name].sub(df[refr_name])
 
         # Remove sleep statistics that have no differences between measurement systems.
         ## TODO: simplify once not manipulating _data
-        stats_nodiff = df.groupby("sstat")["difference"].any().loc[lambda x: ~x].index.tolist()
-        df = df.query(f"~sstat.isin({stats_nodiff})")
+        stats_nodiff = data.groupby("sstat")["difference"].any().loc[lambda x: ~x].index.tolist()
+        data = data.query(f"~sstat.isin({stats_nodiff})")
         for s in stats_nodiff:
             logger.warning(f"All {s} differences are zero, removing from evaluation.")
             ## Q: Should this be logged as just info?
 
         # Set more attributes
-        self._data = df
+        self._data = data
         # Get list of all statistics to be evaluated
-        self._all_sleepstats = df["sstat"].unique()
+        self._all_sleepstats = data["sstat"].unique()
 
         # Run tests
         self.test_normality(method="shapiro", alpha=0.05)
         self.test_proportional_bias(alpha=0.05)
         self.test_homoscedasticity(method="levene", alpha=0.05)
 
-    # @property
-    # def data(self):
-    #     """The summary dataframe of sleep statistics."""
-    #     return self._data
+    @property
+    def data(self):
+        """The summary dataframe of sleep statistics."""
+        return self._data
 
     @property
     def refr_data(self):
@@ -634,7 +636,7 @@ def test_normality(self, **kwargs):
         **kwargs : key, value pairs
             Additional keyword arguments are passed to the :py:func:`pingouin.normality` call.
         """
-        normality = self._data.groupby("sstat")[self.refr_name].apply(pg.normality, **kwargs)
+        normality = self.data.groupby("sstat")[self.refr_name].apply(pg.normality, **kwargs)
         self.normality = normality.droplevel(-1)
 
     def test_proportional_bias(self, **kwargs):
@@ -653,7 +655,7 @@ def test_proportional_bias(self, **kwargs):
             kwargs["alpha"] = 0.05
         prop_bias_results = []
         residuals_results = []
-        for ss, ss_df in self._data.groupby("sstat"):
+        for ss, ss_df in self.data.groupby("sstat"):
             # Regress the difference score on the reference measurements
             model = pg.linear_regression(ss_df[self.refr_name], ss_df["difference"], **kwargs)
             model.insert(0, "sstat", ss)
@@ -669,7 +671,7 @@ def test_proportional_bias(self, **kwargs):
             residuals_results.append(resid)
         # Add residuals to raw dataframe, used later when testing homoscedasticity
         residuals = pd.concat(residuals_results)
-        self._data = self._data.merge(residuals, on=[self.subj_name, "sstat"])
+        self.residuals_ = self.data.merge(residuals, on=[self.subj_name, "sstat"])
         # Handle proportional bias results
         prop_bias = pd.concat(prop_bias_results)
         # Save all the proportional bias models before removing intercept, for optional user access
@@ -690,7 +692,7 @@ def test_homoscedasticity(self, **kwargs):
 
         ..note:: :py:meth:`yasa.SleepStatsEvaluation.test_proportional_bias` must be called first.
         """
-        group = self._data.groupby("sstat")
+        group = self.residuals_.groupby("sstat")
         columns = [self.refr_name, "difference", "pbias_residual"]
         homoscedasticity = group.apply(lambda df: pg.homoscedasticity(df[columns], **kwargs))
         self.homoscedasticity = homoscedasticity.droplevel(-1)
@@ -705,7 +707,7 @@ def summary(self, descriptives=True):
         ]
         summary = pd.concat(series_list, axis=1)
         if descriptives:
-            group = self._data.drop(columns=self.subj_name).groupby("sstat")
+            group = self.data.drop(columns=self.subj_name).groupby("sstat")
             desc = group.agg(["mean", "std"])
             desc.columns = desc.columns.map("_".join)
             summary = summary.join(desc)
@@ -736,15 +738,16 @@ def plot_discrepancies_heatmap(self, sstats_order=None, **kwargs):
         if "cbar_kws" in kwargs:
             heatmap_kwargs["cbar_kws"].update(kwargs["cbar_kws"])
         heatmap_kwargs.update(kwargs)
-        # Pivot for subject-rows and statistic-columns
-        table = self._data.pivot(index=self.subj_name, columns="sstat", values="difference")
+        # # Pivot for subject-rows and statistic-columns
+        # table = self.data.pivot(index=self.subj_name, columns="sstat", values="difference")
+        table = self.test_data.sub(self.refr_data)[sstats_order]
         # Normalize statistics (i.e., columns) between zero and one then convert to percentage
         table_norm = table.sub(table.min(), axis=1).div(table.apply(np.ptp)).multiply(100)
         # If annotating, replace with raw values for writing.
         if heatmap_kwargs["annot"]:
-            heatmap_kwargs["annot"] = table[sstats_order].to_numpy()
+            heatmap_kwargs["annot"] = table.to_numpy()
         # Draw heatmap
-        ax = sns.heatmap(table_norm[sstats_order], **heatmap_kwargs)
+        ax = sns.heatmap(table_norm, **heatmap_kwargs)
         return ax
 
     def plot_discrepancies_dotplot(self, sstats_order=None, palette="winter", **kwargs):
@@ -773,7 +776,8 @@ def plot_discrepancies_dotplot(self, sstats_order=None, palette="winter", **kwar
         stripplot_kwargs.update(kwargs)
 
         # Pivot data to get subject-rows and statistic-columns
-        table = self._data.pivot(index=self.subj_name, columns="sstat", values="difference")
+        # table = self._data.pivot(index=self.subj_name, columns="sstat", values="difference")
+        table = self.test_data.sub(self.refr_data)#[sstats_order]
 
         # Initialize the PairGrid
         height = 0.3 * len(table)
@@ -829,7 +833,7 @@ def plot_blandaltman(self, sstats_order=None, facet_kwargs={}, **kwargs):
         facetgrid_kwargs.update(facet_kwargs)
 
         # Initialize a grid of plots with an Axes for each sleep statistic
-        g = sns.FacetGrid(self._data, col="sstat", col_order=sstats_order, **facetgrid_kwargs)
+        g = sns.FacetGrid(self.data, col="sstat", col_order=sstats_order, **facetgrid_kwargs)
         # Draw Bland-Altman on each axis
         g.map(pg.plot_blandaltman, self.test_name, self.refr_name, **blandaltman_kwargs)
 

From b4df021b61da2375c18ff72e074be56182ed7bc8 Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Mon, 2 Jan 2023 17:59:58 -0600
Subject: [PATCH 12/43] SleepStats move statistical tests to __init__()

---
 yasa/evaluation.py | 319 +++++++++++++++++++++++----------------------
 1 file changed, 164 insertions(+), 155 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index 2caf99d..1c96242 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -451,6 +451,18 @@ class SleepStatsEvaluation:
     test_data : :py:class:`pandas.DataFrame`
         A :py:class:`pandas.DataFrame` with sleep statistics from the test measurement system.
         Shape, indices, and columns must be identical to ``refr_data``.
+    refr_name : str
+        Name of the reference measurement device, used for labeling.
+    test_name : str
+        Name of the test measurement device, used for labeling.
+    alpha : float
+        Alpha cutoff used for all three tests.
+    kwargs_normality : dict
+        Keyword arguments passed to the :py:func:`pingouin.normality` call.
+    kwargs_regression : dict
+        Keyword arguments passed to the :py:func:`pingouin.linear_regression` call.
+    kwargs_homoscedasticity : dict
+        Keyword arguments passed to the :py:func:`pingouin.homoscedasticity` call.
 
     Notes
     -----
@@ -526,68 +538,113 @@ class SleepStatsEvaluation:
 
         >>> sse.plot_blandaltman()
     """
-    def __init__(self, refr_data, test_data, *, refr_name="Reference", test_name="Test"):
-
+    def __init__(
+        self,
+        refr_data,
+        test_data,
+        *,
+        refr_name="Reference",
+        test_name="Test",
+        kwargs_normality={"alpha": 0.05},
+        kwargs_regression={"alpha": 0.05},
+        kwargs_homoscedasticity={"alpha": 0.05},
+    ):
         assert isinstance(refr_data, pd.DataFrame), "`refr_data` must be a pandas DataFrame"
         assert isinstance(test_data, pd.DataFrame), "`test_data` must be a pandas DataFrame"
         assert np.array_equal(refr_data.index, test_data.index), "`refr_data` and `test_data` indices must be identical"
         assert np.array_equal(refr_data.columns, test_data.columns), "`refr_data` and `test_data` columns must be identical"
         assert refr_data.index.name == test_data.index.name, "`refr_data` and `test_data` index names must be identical"
-
-        # Set attributes
-        self._refr_data = refr_data
-        self._test_data = test_data
-        self._refr_name = refr_name
-        self._test_name = test_name
-        self._subj_name = "subject" if refr_data.index.name is None else refr_data.index.name
-
-        # Merge dataframes and reshape wide-to-long format
-        # Add levels to index
-        refr_data.index.name = self._subj_name
-        test_data.index.name = self._subj_name
-        df1 = pd.concat({refr_name: refr_data}, names=["measurement"])
-        df2 = pd.concat({test_name: test_data}, names=["measurement"])
-        df3 = pd.concat({"difference": test_data.sub(refr_data)}, names=["measurement"])
-        data = (pd.concat([df1, df2, df3])
+        assert isinstance(refr_name, str)
+        assert isinstance(test_name, str)
+        assert refr_name != test_name
+        assert isinstance(kwargs_normality, dict)
+        assert isinstance(kwargs_regression, dict)
+        assert isinstance(kwargs_homoscedasticity, dict)
+        assert "alpha" in kwargs_normality
+        assert "alpha" in kwargs_regression
+        assert "alpha" in kwargs_homoscedasticity
+
+        # Merge dataframes, get differences, and reshape wide-to-long format
+        subj_name = "subject" if refr_data.index.name is None else refr_data.index.name
+        refr_data.index.name = subj_name
+        test_data.index.name = subj_name
+        diff_data = pd.concat({"difference": test_data.sub(refr_data)}, names=["measurement"])
+        refr_data = pd.concat({refr_name: refr_data}, names=["measurement"])
+        test_data = pd.concat({test_name: test_data}, names=["measurement"])
+        data = (pd.concat([refr_data, test_data, diff_data])
             .melt(var_name="sstat", ignore_index=False).reset_index()
-            .pivot(columns="measurement", index=[self._subj_name, "sstat"], values="value")
+            .pivot(columns="measurement", index=[subj_name, "sstat"], values="value")
             .reset_index().rename_axis(columns=None)
         )
-        # # Get measurement difference between reference and test devices
-        # df["difference"] = df[test_name].sub(df[refr_name])
 
-        # Remove sleep statistics that have no differences between measurement systems.
-        ## TODO: simplify once not manipulating _data
+        # Remove sleep statistics that have no differences between measurement systems
+        ## TODO: restructure?
         stats_nodiff = data.groupby("sstat")["difference"].any().loc[lambda x: ~x].index.tolist()
         data = data.query(f"~sstat.isin({stats_nodiff})")
         for s in stats_nodiff:
             logger.warning(f"All {s} differences are zero, removing from evaluation.")
             ## Q: Should this be logged as just info?
 
-        # Set more attributes
-        self._data = data
-        # Get list of all statistics to be evaluated
-        self._all_sleepstats = data["sstat"].unique()
+        ## NORMALITY ## Test reference data for normality at each sleep statistic
+        normality = data.groupby("sstat")[refr_name].apply(pg.normality, **kwargs_normality).droplevel(-1)
 
-        # Run tests
-        self.test_normality(method="shapiro", alpha=0.05)
-        self.test_proportional_bias(alpha=0.05)
-        self.test_homoscedasticity(method="levene", alpha=0.05)
+        ## PROPORTIONAL BIAS ## Test each sleep statistic for proportional bias
+        # Subject-level residuals for each statistic are added to data.
+        prop_bias_results = []
+        residuals_results = []
+        # Collect per-statistic models and residuals, used by the later homoscedasticity tests.
+        for ss_name, ss_df in data.groupby("sstat"):
+            # Regress the difference scores on the reference scores
+            model = pg.linear_regression(ss_df[refr_name], ss_df["difference"], **kwargs_regression)
+            model.insert(0, "sstat", ss_name)
+            # Extract subject-level residuals for later homoscedasticity tests
+            resid_dict = {subj_name: ss_df[subj_name], "sstat": ss_name, "pbias_residual": model.residuals_}
+            resid = pd.DataFrame(resid_dict)
+            prop_bias_results.append(model)
+            residuals_results.append(resid)
+        # Add residuals to raw dataframe, used later when testing homoscedasticity
+        data = data.merge(pd.concat(residuals_results), on=[subj_name, "sstat"])
+        # Handle proportional bias results
+        prop_bias = pd.concat(prop_bias_results)
+        # Save all the proportional bias models before removing intercept, for optional user access
+        prop_bias_full = prop_bias.reset_index(drop=True)
+        # Now remove intercept rows
+        prop_bias = prop_bias.query("names != 'Intercept'").drop(columns="names").set_index("sstat")
+        # Add True/False passing column for easy access
+        prop_bias["unbiased"] = prop_bias["pval"].ge(kwargs_regression["alpha"])
+
+        ## Test each statistic for homoscedasticity ##
+        columns = [refr_name, "difference", "pbias_residual"]
+        homoscedasticity_func = lambda df: pg.homoscedasticity(df[columns], **kwargs_homoscedasticity)
+        homoscedasticity = data.groupby("sstat").apply(homoscedasticity_func).droplevel(-1)
+
+        # Set attributes
+        self._data = data
+        self._normality = normality
+        self._proportional_bias = prop_bias
+        self._proportional_bias_full = prop_bias_full  # Q: Is this worth saving??
+        self._homoscedasticity = homoscedasticity
+        # These will not be set as properties, as they are only needed internally
+        self._refr_name = refr_name
+        self._test_name = test_name
+        self._subj_name = subj_name
+        # Pivot new to not include removed sstats
+        self._diff_data = data.pivot(index=self.subj_name, columns="sstat", values="difference")
+        self._sleepstats = data["sstat"].unique() ## Q: Rename to self._labels??
 
     @property
     def data(self):
-        """The summary dataframe of sleep statistics."""
+        """
+        ``refr_data`` and ``test_data`` combined in a long-format :py:class:`pandas.DataFrame`.
+        Also includes difference scores (``test_data`` minus ``refr_data``).
+        """
         return self._data
 
     @property
-    def refr_data(self):
-        """The dataframe of reference measurement sleep statistics."""
-        return self._refr_data
-
-    @property
-    def test_data(self):
-        """The dataframe of test measurement sleep statistics."""
-        return self._test_data
+    def diff_data(self):
+        """A :py:class:`pandas.DataFrame` of ``test_data`` minus ``refr_data``."""
+        # # Pivot for subject-rows and statistic-columns
+        return self._diff_data
 
     @property
     def refr_name(self):
@@ -605,9 +662,29 @@ def subj_name(self):
         return self._subj_name
 
     @property
-    def all_sleepstats(self):
+    def sleepstats(self):
         """A list of all sleep statistics included in analysis."""
-        return self._all_sleepstats
+        return self._sleepstats
+
+    @property
+    def normality(self):
+        """A :py:class:`pandas.DataFrame` of normality test results for all sleep statistics."""
+        return self._normality
+
+    @property
+    def homoscedasticity(self):
+        """A :py:class:`pandas.DataFrame` of homoscedasticity test results for all sleep statistics."""
+        return self._homoscedasticity
+
+    @property
+    def proportional_bias(self):
+        """A :py:class:`pandas.DataFrame` of proportional bias test results for all sleep statistics."""
+        return self._proportional_bias
+
+    @property
+    def proportional_bias_full(self):
+        """A :py:class:`pandas.DataFrame` of full proportional bias models (including intercepts)."""
+        return self._proportional_bias_full
 
     def __repr__(self):
         # TODO v0.8: Keep only the text between < and >
@@ -628,78 +705,25 @@ def __str__(self):
             "See the online documentation for more details."
         )
 
-    def test_normality(self, **kwargs):
-        """Test reference data for normality at each sleep statistic.
-
-        Parameters
-        ----------
-        **kwargs : key, value pairs
-            Additional keyword arguments are passed to the :py:func:`pingouin.normality` call.
-        """
-        normality = self.data.groupby("sstat")[self.refr_name].apply(pg.normality, **kwargs)
-        self.normality = normality.droplevel(-1)
-
-    def test_proportional_bias(self, **kwargs):
-        """Test each sleep statistic for proportional bias.
-        
-        For each statistic, regress the device difference score on the reference device score to get
-        proportional bias and residuals that will be used for the later homoscedasticity
-        calculation. Subject-level residuals for each statistic are added to ``data``.
-
-        Parameters
-        ----------
-        **kwargs : key, value pairs
-            Additional keyword arguments are passed to :py:func:`pingouin.linear_regression`.
-        """
-        if "alpha" not in kwargs:
-            kwargs["alpha"] = 0.05
-        prop_bias_results = []
-        residuals_results = []
-        for ss, ss_df in self.data.groupby("sstat"):
-            # Regress the difference score on the reference measurements
-            model = pg.linear_regression(ss_df[self.refr_name], ss_df["difference"], **kwargs)
-            model.insert(0, "sstat", ss)
-            # Extract the subject-level residuals
-            resid = pd.DataFrame(
-                {
-                    self.subj_name: ss_df[self.subj_name],
-                    "sstat": ss,  # Or ss_df["sstat"]?
-                    "pbias_residual": model.residuals_
-                }
-            )
-            prop_bias_results.append(model)
-            residuals_results.append(resid)
-        # Add residuals to raw dataframe, used later when testing homoscedasticity
-        residuals = pd.concat(residuals_results)
-        self.residuals_ = self.data.merge(residuals, on=[self.subj_name, "sstat"])
-        # Handle proportional bias results
-        prop_bias = pd.concat(prop_bias_results)
-        # Save all the proportional bias models before removing intercept, for optional user access
-        self.proportional_bias_models_ = prop_bias.reset_index(drop=True)
-        # Remove intercept rows
-        prop_bias = prop_bias.query("names != 'Intercept'").drop(columns="names")
-        # Add True/False passing column for easy access
-        prop_bias["unbiased"] = prop_bias["pval"].ge(kwargs["alpha"])
-        self.proportional_bias = prop_bias.set_index("sstat")
-
-    def test_homoscedasticity(self, **kwargs):
-        """Test each statistic for homoscedasticity.
+    def summary(self, descriptives=True):
+        """Return a summary dataframe highlighting what statistics pass checks.
 
         Parameters
         ----------
-        **kwargs : key, value pairs
-            Additional keyword arguments are passed to :py:func:`pingouin.homoscedasticity`.
+        self : :py:class:`SleepStatsEvaluation`
+            A :py:class:`SleepStatsEvaluation` instance.
+        descriptives : bool or dict
+            If True (default) or a dictionary, also include descriptive statistics for reference and
+            test measurements. If a dictionary, all key/value pairs are passed as keyword arguments
+            to the :py:meth:`pandas.DataFrame.agg` call.
 
-        ..note:: :py:meth:`yasa.SleepStatsEvaluation.test_proportional_bias` must be called first.
+        Returns
+        -------
+        summary : :py:class:`pandas.DataFrame`
+            A :py:class:`pandas.DataFrame` with boolean values indicating the pass/fail status for
+            normality, proportional bias, and homoscedasticity tests (for each sleep statistic).
         """
-        group = self.residuals_.groupby("sstat")
-        columns = [self.refr_name, "difference", "pbias_residual"]
-        homoscedasticity = group.apply(lambda df: pg.homoscedasticity(df[columns], **kwargs))
-        self.homoscedasticity = homoscedasticity.droplevel(-1)
-
-    def summary(self, descriptives=True):
-        """Return a summary dataframe highlighting what statistics pass checks."""
-        assert isinstance(descriptives, bool), "`descriptives` must be True or False"
+        assert isinstance(descriptives, (bool, dict)), "`descriptives` must be True, False, or dict"
         series_list = [
             self.normality["normal"],
             self.proportional_bias["unbiased"],
@@ -707,8 +731,10 @@ def summary(self, descriptives=True):
         ]
         summary = pd.concat(series_list, axis=1)
         if descriptives:
-            group = self.data.drop(columns=self.subj_name).groupby("sstat")
-            desc = group.agg(["mean", "std"])
+            agg_kwargs = {"func": ["mean", "std"]}
+            if isinstance(descriptives, dict):
+                agg_kwargs.update(descriptives)
+            desc = self.data.drop(columns=self.subj_name).groupby("sstat").agg(**agg_kwargs)
             desc.columns = desc.columns.map("_".join)
             summary = summary.join(desc)
         return summary
@@ -730,35 +756,27 @@ def plot_discrepancies_heatmap(self, sstats_order=None, **kwargs):
         """
         assert isinstance(sstats_order, (list, type(None))), "`sstats_order` must be a list or None"
         if sstats_order is None:
-            sstats_order = self.all_sleepstats
-
-        # Merge default heatmap arguments with optional input
+            sstats_order = self.sleepstats
         heatmap_kwargs = {"cmap": "binary", "annot": True, "fmt": ".1f", "square": False}
         heatmap_kwargs["cbar_kws"] = dict(label="Normalized discrepancy %")
         if "cbar_kws" in kwargs:
             heatmap_kwargs["cbar_kws"].update(kwargs["cbar_kws"])
         heatmap_kwargs.update(kwargs)
-        # # Pivot for subject-rows and statistic-columns
-        # table = self.data.pivot(index=self.subj_name, columns="sstat", values="difference")
-        table = self.test_data.sub(self.refr_data)[sstats_order]
+        table = self.diff_data[sstats_order]
         # Normalize statistics (i.e., columns) between zero and one then convert to percentage
         table_norm = table.sub(table.min(), axis=1).div(table.apply(np.ptp)).multiply(100)
-        # If annotating, replace with raw values for writing.
         if heatmap_kwargs["annot"]:
+            # Use raw values for writing
             heatmap_kwargs["annot"] = table.to_numpy()
-        # Draw heatmap
-        ax = sns.heatmap(table_norm, **heatmap_kwargs)
-        return ax
+        return sns.heatmap(table_norm, **heatmap_kwargs)
 
-    def plot_discrepancies_dotplot(self, sstats_order=None, palette="winter", **kwargs):
+    def plot_discrepancies_dotplot(self, kwargs_pairplot={"palette": "winter"}, **kwargs):
         """Visualize subject-level discrepancies, generally for outlier inspection.
 
         Parameters
         ----------
-        sstats_order : list
-            List of sleep statistics to plot. Default (None) is to plot all sleep statistics.
-        palette : string, list, dict, or :py:class:`matplotlib.colors.Colormap`
-            Color palette passed to :py:class:`seaborn.PairGrid`
+        kwargs_pairplot : dict
+            Keywords arguments passed to the :py:class:`seaborn.PairGrid` call.
         **kwargs : key, value pairs
             Additional keyword arguments are passed to the :py:func:`seaborn.stripplot` call.
 
@@ -766,34 +784,30 @@ def plot_discrepancies_dotplot(self, sstats_order=None, palette="winter", **kwar
         -------
         g : :py:class:`seaborn.PairGrid`
             Seaborn PairGrid
+
+        Examples
+        --------
+        To plot a limited subset of sleep statistics, use the ``x_vars`` keyword argument of
+        :py:class:`seaborn.PairGrid`.
+
+        .. plot::
+            ## TODO: Example using x_vars
         """
-        assert isinstance(sstats_order, (list, type(None))), "`sstats_order` must be a list or None"
+        assert isinstance(kwargs_pairplot, dict), "`kwargs_pairplot` must be a dict"
         if sstats_order is None:
-            sstats_order = self.all_sleepstats
-
-        # Merge default stripplot arguments with optional input
+            sstats_order = self.sleepstats
         stripplot_kwargs = {"size": 10, "linewidth": 1, "edgecolor": "white"}
         stripplot_kwargs.update(kwargs)
-
-        # Pivot data to get subject-rows and statistic-columns
-        # table = self._data.pivot(index=self.subj_name, columns="sstat", values="difference")
-        table = self.test_data.sub(self.refr_data)#[sstats_order]
-
         # Initialize the PairGrid
-        height = 0.3 * len(table)
+        height = 0.3 * len(self.diff_data)
         aspect = 0.6
-        g = sns.PairGrid(
-            table.reset_index(),
-            x_vars=sstats_order,
-            y_vars=[self.subj_name],
-            hue=self.subj_name,
-            palette=palette,
-            height=height,
-            aspect=aspect,
+        pairgrid_kwargs = dict(
+            x_vars=sstats_order, hue=self.subj_name, height=height, aspect=aspect
         )
+        pairgrid_kwargs.update(kwargs_pairgrid)
+        g = sns.PairGrid(self.diff_data.reset_index(), y_vars=[self.subj_name], **pairgrid_kwargs)
         # Draw the dots
         g.map(sns.stripplot, orient="h", jitter=False, **stripplot_kwargs)
-
         # Adjust aesthetics
         g.set(xlabel="", ylabel="")
         for ax, title in zip(g.axes.flat, sstats_order):
@@ -822,28 +836,23 @@ def plot_blandaltman(self, sstats_order=None, facet_kwargs={}, **kwargs):
         """
         assert isinstance(sstats_order, (list, type(None))), "`sstats_order` must be a list or None"
         if sstats_order is None:
-            sstats_order = self.all_sleepstats
-
-        # Select scatterplot arguments (passed to blandaltman) and update with optional input
+            sstats_order = self.sleepstats
         blandaltman_kwargs = dict(xaxis="y", annotate=False, edgecolor="black", facecolor="none")
         blandaltman_kwargs.update(kwargs)
-        # Select FacetGrid arguments and update with optional input
-        col_wrap = 4 if len(sstats_order) > 4 else None
+        col_wrap = None if len(sstats_order) <= 4 else 4
         facetgrid_kwargs = dict(col_wrap=col_wrap, height=2, aspect=1, sharex=False, sharey=False)
         facetgrid_kwargs.update(facet_kwargs)
-
         # Initialize a grid of plots with an Axes for each sleep statistic
         g = sns.FacetGrid(self.data, col="sstat", col_order=sstats_order, **facetgrid_kwargs)
-        # Draw Bland-Altman on each axis
+        # Draw Bland-Altman plot on each axis
         g.map(pg.plot_blandaltman, self.test_name, self.refr_name, **blandaltman_kwargs)
-
-        # Tidy-up axis limits with symmetric y-axis and minimal ticks
+        # Adjust aesthetics
         for ax in g.axes.flat:
+            # Tidy-up axis limits with symmetric y-axis and minimal ticks
             bound = max(map(abs, ax.get_ylim()))
             ax.set_ylim(-bound, bound)
             ax.yaxis.set_major_locator(plt.MaxNLocator(nbins=2, integer=True, symmetric=True))
             ax.xaxis.set_major_locator(plt.MaxNLocator(nbins=1, integer=True))
-        # More aesthetics
         ylabel = " - ".join((self.test_name, self.refr_name))
         g.set_ylabels(ylabel)
         g.set_titles(col_template="{col_name}")

From ba06ab1c2c05cc075005a95a96f73a60e9a80bb0 Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Mon, 2 Jan 2023 18:33:00 -0600
Subject: [PATCH 13/43] better plotting flexibility and baked-in
 sleepstats_order

---
 yasa/evaluation.py | 67 +++++++++++++++++++++-------------------------
 1 file changed, 30 insertions(+), 37 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index 1c96242..e8a0cff 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -628,9 +628,9 @@ def __init__(
         self._refr_name = refr_name
         self._test_name = test_name
         self._subj_name = subj_name
+        self._n_subjects = data[subj_name].nunique()
         # Pivot new to not include removed sstats
         self._diff_data = data.pivot(index=self.subj_name, columns="sstat", values="difference")
-        self._sleepstats = data["sstat"].unique() ## Q: Rename to self._labels??
 
     @property
     def data(self):
@@ -662,9 +662,9 @@ def subj_name(self):
         return self._subj_name
 
     @property
-    def sleepstats(self):
-        """A list of all sleep statistics included in analysis."""
-        return self._sleepstats
+    def n_subjects(self):
+        """The number of subjects."""
+        return self._n_subjects
 
     @property
     def normality(self):
@@ -690,7 +690,7 @@ def __repr__(self):
         # TODO v0.8: Keep only the text between < and >
         return (
             f"<SleepStatsEvaluation | Test measurement '{self.test_name}' evaluated against "
-            f"reference measurement '{self.refr_name}'>\n"
+            f"reference measurement '{self.refr_name}', {self.n_subjects} subjects>\n"
             " - Use `.summary()` to get pass/fail values from various checks\n"
             " - Use `.plot_blandaltman()` to get a Bland-Altman-plot grid for sleep statistics\n"
             "See the online documentation for more details."
@@ -699,7 +699,7 @@ def __repr__(self):
     def __str__(self):
         return (
             f"<SleepStatsEvaluation | Test measurement '{self.test_name}' evaluated against "
-            f"reference measurement '{self.refr_name}'>\n"
+            f"reference measurement '{self.refr_name}', {self.n_subjects} subjects>\n"
             " - Use `.summary()` to get pass/fail values from various checks\n"
             " - Use `.plot_blandaltman()` to get a Bland-Altman-plot grid for sleep statistics\n"
             "See the online documentation for more details."
@@ -739,12 +739,12 @@ def summary(self, descriptives=True):
             summary = summary.join(desc)
         return summary
 
-    def plot_discrepancies_heatmap(self, sstats_order=None, **kwargs):
+    def plot_discrepancies_heatmap(self, sleep_stats=None, **kwargs):
         """Visualize subject-level discrepancies, generally for outlier inspection.
 
         Parameters
         ----------
-        sstats_order : list
+        sleep_stats : list or None
             List of sleep statistics to plot. Default (None) is to plot all sleep statistics.
         **kwargs : key, value pairs
             Additional keyword arguments are passed to the :py:func:`seaborn.heatmap` call.
@@ -754,15 +754,15 @@ def plot_discrepancies_heatmap(self, sstats_order=None, **kwargs):
         ax : :py:class:`matplotlib.axes.Axes`
             Matplotlib Axes
         """
-        assert isinstance(sstats_order, (list, type(None))), "`sstats_order` must be a list or None"
-        if sstats_order is None:
-            sstats_order = self.sleepstats
+        assert isinstance(sleep_stats, (list, type(None))), "`sleep_stats` must be a list or None"
+        if sleep_stats is None:
+            sleep_stats = self.data["sstat"].unique()  # All available sleep statistics
         heatmap_kwargs = {"cmap": "binary", "annot": True, "fmt": ".1f", "square": False}
         heatmap_kwargs["cbar_kws"] = dict(label="Normalized discrepancy %")
         if "cbar_kws" in kwargs:
             heatmap_kwargs["cbar_kws"].update(kwargs["cbar_kws"])
         heatmap_kwargs.update(kwargs)
-        table = self.diff_data[sstats_order]
+        table = self.diff_data[sleep_stats]
         # Normalize statistics (i.e., columns) between zero and one then convert to percentage
         table_norm = table.sub(table.min(), axis=1).div(table.apply(np.ptp)).multiply(100)
         if heatmap_kwargs["annot"]:
@@ -770,12 +770,12 @@ def plot_discrepancies_heatmap(self, sstats_order=None, **kwargs):
             heatmap_kwargs["annot"] = table.to_numpy()
         return sns.heatmap(table_norm, **heatmap_kwargs)
 
-    def plot_discrepancies_dotplot(self, kwargs_pairplot={"palette": "winter"}, **kwargs):
+    def plot_discrepancies_dotplot(self, kwargs_pairgrid={"palette": "winter"}, **kwargs):
         """Visualize subject-level discrepancies, generally for outlier inspection.
 
         Parameters
         ----------
-        kwargs_pairplot : dict
+        kwargs_pairgrid : dict
             Keywords arguments passed to the :py:class:`seaborn.PairGrid` call.
         **kwargs : key, value pairs
             Additional keyword arguments are passed to the :py:func:`seaborn.stripplot` call.
@@ -783,7 +783,7 @@ def plot_discrepancies_dotplot(self, kwargs_pairplot={"palette": "winter"}, **kw
         Returns
         -------
         g : :py:class:`seaborn.PairGrid`
-            Seaborn PairGrid
+            A :py:class:`seaborn.PairGrid` with sleep statistics dotplots on each axis.
 
         Examples
         --------
@@ -793,57 +793,50 @@ def plot_discrepancies_dotplot(self, kwargs_pairplot={"palette": "winter"}, **kw
         .. plot::
             ## TODO: Example using x_vars
         """
-        assert isinstance(kwargs_pairplot, dict), "`kwargs_pairplot` must be a dict"
-        if sstats_order is None:
-            sstats_order = self.sleepstats
+        assert isinstance(kwargs_pairgrid, dict), "`kwargs_pairgrid` must be a dict"
         stripplot_kwargs = {"size": 10, "linewidth": 1, "edgecolor": "white"}
         stripplot_kwargs.update(kwargs)
         # Initialize the PairGrid
         height = 0.3 * len(self.diff_data)
         aspect = 0.6
-        pairgrid_kwargs = dict(
-            x_vars=sstats_order, hue=self.subj_name, height=height, aspect=aspect
-        )
+        pairgrid_kwargs = dict(hue=self.subj_name, height=height, aspect=aspect)
         pairgrid_kwargs.update(kwargs_pairgrid)
         g = sns.PairGrid(self.diff_data.reset_index(), y_vars=[self.subj_name], **pairgrid_kwargs)
         # Draw the dots
         g.map(sns.stripplot, orient="h", jitter=False, **stripplot_kwargs)
         # Adjust aesthetics
-        g.set(xlabel="", ylabel="")
-        for ax, title in zip(g.axes.flat, sstats_order):
-            ax.set(title=title)
+        for ax in g.axes.flat:
+            ax.set(title=ax.get_xlabel())
             ax.margins(x=0.3)
             ax.yaxis.grid(True)
             ax.tick_params(left=False)
+        g.set(xlabel="", ylabel="")
         sns.despine(left=True, bottom=True)
         return g
 
-    def plot_blandaltman(self, sstats_order=None, facet_kwargs={}, **kwargs):
+    def plot_blandaltman(self, kwargs_facetgrid={}, **kwargs):
         """
+
+        Use the ``col_order`` key of ``kwargs_facetgrid`` to plot a subset of sleep statistics.
+
         Parameters
         ----------
-        sstats_order : list or None
-            List of sleep statistics to plot. Default (None) is to plot all sleep statistics.
-        facet_kwargs : dict
-            Keyword arguments passed to :py:class:`seaborn.FacetGrid`.
+        kwargs_facetgrid : dict
+            Keyword arguments passed to the :py:class:`seaborn.FacetGrid` call.
         **kwargs : key, value pairs
             Additional keyword arguments are passed to :py:func:`pingouin.plot_blandaltman`.
 
         Returns
         -------
         g : :py:class:`seaborn.FacetGrid`
-            Seaborn FacetGrid
+            A :py:class:`seaborn.FacetGrid` with sleep statistics Bland-Altman plots on each axis.
         """
-        assert isinstance(sstats_order, (list, type(None))), "`sstats_order` must be a list or None"
-        if sstats_order is None:
-            sstats_order = self.sleepstats
+        facetgrid_kwargs = dict(col_wrap=4, height=2, aspect=1, sharex=False, sharey=False)
+        facetgrid_kwargs.update(kwargs_facetgrid)
         blandaltman_kwargs = dict(xaxis="y", annotate=False, edgecolor="black", facecolor="none")
         blandaltman_kwargs.update(kwargs)
-        col_wrap = None if len(sstats_order) <= 4 else 4
-        facetgrid_kwargs = dict(col_wrap=col_wrap, height=2, aspect=1, sharex=False, sharey=False)
-        facetgrid_kwargs.update(facet_kwargs)
         # Initialize a grid of plots with an Axes for each sleep statistic
-        g = sns.FacetGrid(self.data, col="sstat", col_order=sstats_order, **facetgrid_kwargs)
+        g = sns.FacetGrid(self.data, col="sstat", **facetgrid_kwargs)
         # Draw Bland-Altman plot on each axis
         g.map(pg.plot_blandaltman, self.test_name, self.refr_name, **blandaltman_kwargs)
         # Adjust aesthetics

From 1eb95ce2d33f0b31e95c0052e2cae350e8b774c5 Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Tue, 3 Jan 2023 20:46:12 -0600
Subject: [PATCH 14/43] major restructure of attributes/methods and scores
 calculations

---
 yasa/evaluation.py | 405 +++++++++++++++++++++++++++++++--------------
 1 file changed, 279 insertions(+), 126 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index e8a0cff..aa5d291 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -16,7 +16,7 @@
 import numpy as np
 import pandas as pd
 import pingouin as pg
-from sklearn import metrics
+import sklearn.metrics as skm
 
 import seaborn as sns
 import matplotlib.pyplot as plt
@@ -44,9 +44,9 @@ class EpochByEpochEvaluation:
     Parameters
     ----------
     refr_hyp : :py:class:`yasa.Hypnogram`
-        The reference or ground-truth hypnogram.
+        The reference or ground-truth hypnogram, or sequence of hypnograms.
     test_hyp : :py:class:`yasa.Hypnogram`
-        The test or to-be-evaluated hypnogram.
+        The test or to-be-evaluated hypnogram, or sequence of hypnograms.
 
     Notes
     -----
@@ -171,32 +171,82 @@ def __init__(self, refr_hyps, test_hyps):
         test = pd.concat(pd.concat({s: h.hypno}, names=["subject"]) for s, h in test_hyps.items())
         data = pd.concat([refr, test], axis=1)
 
-        # Get summary sleep statistics for each measurement.
-        refr_sstats = pd.Series(refr_hyps).map(lambda h: h.sleep_statistics()).apply(pd.Series)
-        test_sstats = pd.Series(test_hyps).map(lambda h: h.sleep_statistics()).apply(pd.Series)
-        refr_sstats = refr_sstats.set_index(pd.Index(subjects, name="subject"))
-        test_sstats = test_sstats.set_index(pd.Index(subjects, name="subject"))
-        # sse = yasa.SleepStatsEvaluation(refr_sstats, test_sstats)
-        
+        ########################################################################
+        # INDIVIDUAL-LEVEL AGREEMENT
+        ########################################################################
+
+        # Get individual-level averaged/weighted agreement scores
+        # indiv_agree_avg = pd.DataFrame({s: multi_scorer_avg(refr_hyps[s].as_int(), test_hyps[s].as_int()) for s in subjects })
+        indiv_agree_avg = data.groupby(level=0).apply(self.multi_scorer_avg).apply(pd.Series)
+
+        # Get individual-level one-vs-rest/un-weighted agreement scores
+        # Only include stages that appear in the data
+        # labels = data[refr_scorer].cat.remove_unused_categories().cat.categories
+        labels = [l for l in refr_hyps[subjects[0]].hypno.cat.categories if l in data.values]
+        ############ OPTION 1 (uses staticmethod, slower by 500ms)
+        indiv_agree_ovr = (data
+            # Get multiple metrics for each individual
+            .groupby(level=0).apply(self.multi_scorer_ovr, labels=labels)
+            # Unpack metrics results and reshape
+            .apply(pd.Series).stack().apply(pd.Series)
+            # Convert stages to string labels
+            .rename_axis(columns="stage").rename(columns={i: l for i, l in enumerate(labels)})
+            # Reshape so metrics are columns
+            .stack().unstack(level=1)
+            # Swap MultiIndex levels and sort so stages drive the view
+            .swaplevel().sort_index(level="stage", key=lambda x: x.map(lambda y: labels.index(y)))
+        )
+        # ############ OPTION 2 (does NOT use staticmethod, faster by 500ms)
+        # prfs_func = lambda df: skm.precision_recall_fscore_support(
+        #     *df.values.T, labels=labels, average=None, zero_division=0
+        # )
+        # indiv_agree_ovr = (data
+        #     .groupby(level=0).apply(prfs_func)
+        #     .explode().apply(pd.Series)
+        #     .assign(metric=["precision", "recall", "f1", "support"] * len(refr_hyps)).set_index("metric", append=True)
+        #     .rename_axis(columns="stage").rename(columns={i: l for i, l in enumerate(labels)})
+        #     .stack().unstack("metric").rename_axis(columns=None)
+        # )
+        ## Q: Currently both options will leave some all-zero rows, for when a stage is present
+        ##    in some subjects but not others. Prefer to remove?
+        # agr = agr.loc[agr.any(axis=1)]  # or .pipe
+        # And then could drop the label restriction, just passing all labels to preserve order
+
+        # ac_f = lambda s: skm.accuracy_score(*s.values.T)
+        # ka_f = lambda s: skm.cohen_kappa_score(*s.values.T)
+        # ja_f = lambda s: skm.jaccard_score(*s.values.T, average=average)
+        # pr_f = lambda s: skm.precision_score(*s.values.T, average=average, zero_division=zd)
+        # re_f = lambda s: skm.recall_score(*s.values.T, average=average, zero_division=zd)
+        # f1_f = lambda s: skm.f1_score(*s.values.T, average=average, zero_division=zd)
+        # ac = data.groupby(level=0).apply(ac_f)
+        # ka = data.groupby(level=0).apply(ka_f)
+        # ja = data.groupby(level=0).apply(ja_f)
+        # pr = data.groupby(level=0).apply(pr_f)
+        # re = data.groupby(level=0).apply(re_f)
+        # f1 = data.groupby(level=0).apply(f1_f)
+        # agreement_individuals = pd.DataFrame(
+        #     {"accuracy": ac, "kappa": ka, "jaccard": ja, "precision": pr, "recall": re, "f1": f1}
+        # )
+
         # Set attributes
         self._data = data
         self._subjects = subjects
         self._n_subjects = len(subjects)
         self._refr_hyps = refr_hyps
         self._test_hyps = test_hyps
-        self._refr_sstats = refr_sstats
-        self._test_sstats = test_sstats
-        self._refr_name = refr_hyps[subjects[0]].scorer
-        self._test_name = test_hyps[subjects[0]].scorer
-        self._n_stages = refr_hyps[subjects[0]].n_stages
+        self._refr_scorer = refr_hyps[subjects[0]].scorer
+        self._test_scorer = test_hyps[subjects[0]].scorer
         self._labels = refr_hyps[subjects[0]].labels
 
+        self._indiv_agree_avg = indiv_agree_avg
+        self._indiv_agree_ovr = indiv_agree_ovr
+
     def __repr__(self):
         # TODO v0.8: Keep only the text between < and >
         text_subjects = f", {self.n_subjects} subject" + ("s" if self.n_subjects > 1 else "")
         return (
-            f"<EpochByEpochEvaluation | Test Hypnogram scored by {self.refr_name} evaluated "
-            f"against reference Hypnogram scored by {self.test_name}{text_subjects}>\n"
+            f"<EpochByEpochEvaluation | Test Hypnogram scored by {self.refr_scorer} evaluated "
+            f"against reference Hypnogram scored by {self.test_scorer}{text_subjects}>\n"
             " - Use `.get_agreement()` to get agreement measures as a pandas.Series\n"
             " - Use `.plot_hypnograms()` to plot the two hypnograms overlaid\n"
             "See the online documentation for more details."
@@ -205,8 +255,8 @@ def __repr__(self):
     def __str__(self):
         text_subjects = f", {self.n_subjects} subject" + ("s" if self.n_subjects > 1 else "")
         return (
-            f"<EpochByEpochEvaluation | Test Hypnogram scored by {self.refr_name} evaluated "
-            f"against reference Hypnogram scored by {self.test_name}{text_subjects}>\n"
+            f"<EpochByEpochEvaluation | Test Hypnogram scored by {self.refr_scorer} evaluated "
+            f"against reference Hypnogram scored by {self.test_scorer}{text_subjects}>\n"
             " - Use `.get_agreement()` to get agreement measures as a pandas.Series\n"
             " - Use `.plot_hypnograms()` to plot the two hypnograms overlaid\n"
             "See the online documentation for more details."
@@ -214,27 +264,17 @@ def __str__(self):
 
     @property
     def data(self):
+        """A :py:class:`pandas.DataFrame` including all hypnograms."""
         return self._data
 
-    @property
-    def refr_sstats(self):
-        return self._refr_sstats
-
-    @property
-    def test_sstats(self):
-        return self._test_sstats
-
     @property
     def refr_hyps(self):
-        """The reference Hypnograms."""
-        ## Q: Starting to think there should be a clear convention on what we mean
-        ##    when we say "hypnogram". Should hypnogram mean the Series and Hypnogram
-        ##    mean the YASA object? Similarly for hypno/hyp.
+        """The reference YASA hypnograms."""
         return self._refr_hyps
 
     @property
     def test_hyps(self):
-        """The test Hypnograms."""
+        """The test YASA hypnograms."""
         return self._test_hyps
 
     @property
@@ -246,101 +286,212 @@ def n_subjects(self):
         return self._n_subjects
 
     @property
-    def refr_name(self):
-        """The name of the reference measurement."""
-        return self._refr_name
+    def refr_scorer(self):
+        """The name of the reference scorer."""
+        return self._refr_scorer
 
     @property
-    def test_name(self):
-        """The name of the test measurement."""
-        return self._test_name
+    def test_scorer(self):
+        """The name of the test scorer."""
+        return self._test_scorer
 
     @property
     def labels(self):
+        """All available sleep stage labels."""
         return self._labels
 
     @property
-    def n_stages(self):
-        return self._n_stages
+    def indiv_agree_avg(self):
+        """
+        A :py:class:`pandas.DataFrame` of individual-level ``refr_hyp``/``test_hyp`` agreement
+        metrics that average over individual sleep stages.
 
-    def get_agreement(self, subject=None):
+        .. seealso:: :py:attr:`yasa.EpochByEpochEvaluation.indiv_agree_ovr`
+        
+        .. seealso:: :py:meth:`yasa.EpochByEpochEvaluation.summary`
+        """
+        return self._indiv_agree_avg
+
+    @property
+    def indiv_agree_ovr(self):
         """
-        Return a dataframe of ``refr_hyp``/``test_hyp`` performance across all stages as measured by
-        common classifier agreement methods.
+        A :py:class:`pandas.DataFrame` of individual-level ``refr_hyp``/``test_hyp`` "one-vs-rest"
+        agreement metrics. Metrics for each sleep stage are provided.
+
+        .. seealso:: :py:attr:`yasa.EpochByEpochEvaluation.indiv_agree_avg`
+        
+        .. seealso:: :py:meth:`yasa.EpochByEpochEvaluation.summary`
+        """
+        return self._indiv_agree_ovr
+
+    @staticmethod
+    def multi_scorer_avg(df):
+        """Compute multiple agreement scores from a 2-column dataframe.
 
-        .. seealso:: :py:meth:`yasa.EpochByEpochResults.get_agreement_by_stage`
-        ## Q: Are there better names to differentiate get_agreement vs get_agreement_by_stage?
-        ##    Maybe should be binary vs multiclass?
+        This function offers convenience when calculating multiple agreement scores using
+        :py:meth:`pandas.DataFrame.groupby.apply`. Scikit-learn doesn't include a function that
+        returns multiple scores, and the GroupBy implementation of ``apply`` in pandas does not
+        accept multiple functions.
 
         Parameters
         ----------
-        self : :py:class:`yasa.EpochByEpochEvaluation`
-            A :py:class:`yasa.EpochByEpochEvaluation` instance.
-        subject : None or a unique subject identifier.
-            Subject identifiers are based on user input, and integers starting from 1 if not provided.
+        df : :py:class:`pandas.DataFrame`
+            A :py:class:`pandas.DataFrame` with exactly 2 columns and length of *n_samples*.
+            The first column contains true values and second column contains predicted values.
 
         Returns
         -------
-        agreement : :py:class:`pandas.Series`
-            A :py:class:`pandas.Series` with agreement metrics as indices.
+        scores : dict
+            A dictionary with scorer names (``str``) as keys and scores (``float``) as values.
         """
-        true = self.data[self.refr_name]
-        pred = self.data[self.test_name]
-        if subject is not None:
-            true = pred.loc[subject]
-            pred = pred.loc[subject]
-        accuracy = metrics.accuracy_score(true, pred)
-        kappa = metrics.cohen_kappa_score(true, pred)
-        jaccard = metrics.jaccard_score(true, pred, average="weighted")
-        precision = metrics.precision_score(true, pred, average="weighted", zero_division=0)
-        recall = metrics.recall_score(true, pred, average="weighted", zero_division=0)
-        f1 = metrics.f1_score(true, pred, average="weighted", zero_division=0)
-        scores = {
-            "accuracy": accuracy,
-            "kappa": kappa,
-            "weighted_jaccard": jaccard,
-            "weighted_precision": precision,
-            "weighted_recall": recall,
-            "weighted_f1": f1,
+        true, pred = zip(*df.values)  # Same as (df["col1"], df["col2"]) but teensy bit faster
+        ## Q: The dictionary below could be compiled more concisely if we were comfortable accessing
+        ##    "private" attributes. I understand that's a no-no but I'm not exactly sure why.
+        ##     For example:
+        ##     >>> scorers = ["accuracy", "recall"]
+        ##     >>> funcs = { s: skm.__getattribute__(f"{s}_scorer") for s in scorers }
+        ##     >>> scores = { s: f(true, pred) for s, f in funcs.items() }
+        ##     Keywords could be applied as needed by checking f.__kwdefaults__
+        ##     This would offer an easy way for users to add their own scorers with an arg as well.
+        return {
+            "accuracy": skm.accuracy_score(true, pred),
+            "kappa": skm.cohen_kappa_score(true, pred),
+            "micro_jaccard": skm.jaccard_score(true, pred, average="micro"),
+            "macro_jaccard": skm.jaccard_score(true, pred, average="macro"),
+            "weighted_jaccard": skm.jaccard_score(true, pred, average="weighted"),
+            "micro_precision": skm.precision_score(true, pred, average="micro", zero_division=0),
+            "macro_precision": skm.precision_score(true, pred, average="macro", zero_division=0),
+            "weighted_precision": skm.precision_score(true, pred, average="weighted", zero_division=0),
+            "micro_recall": skm.recall_score(true, pred, average="micro", zero_division=0),
+            "macro_recall": skm.recall_score(true, pred, average="macro", zero_division=0),
+            "weighted_recall": skm.recall_score(true, pred, average="weighted", zero_division=0),
+            "micro_f1": skm.f1_score(true, pred, average="micro", zero_division=0),
+            "macro_f1": skm.f1_score(true, pred, average="macro", zero_division=0),
+            "weighted_f1": skm.f1_score(true, pred, average="weighted", zero_division=0),
         }
-        agreement = pd.Series(scores, name="agreement").rename_axis("metric")
-        return agreement
 
-    def get_agreement_by_stage(self, subject=None):
+    @staticmethod
+    def multi_scorer_ovr(df, labels):
+        """Compute multiple one-vs-rest agreement scores from a 2-column dataframe.
+
+        Parameters
+        ----------
+        df : :py:class:`pandas.DataFrame`
+            A :py:class:`pandas.DataFrame` with exactly 2 columns and length of *n_samples*.
+            The first column contains true values and second column contains predicted values.
+        labels : array-like
+            The labels to include in scoring and control the order of returned scores.
+
+        Returns
+        -------
+        scores : dict
+            A dictionary with scorer names (``str``) as keys and scores (``np.ndarray``) as values.
         """
-        Return a dataframe of ``refr_hyp``/``test_hyp`` performance for each stage as measured by
-        common classifier agreement methods.
+        true, pred = zip(*df.values)
+        return {
+            "precision": skm.precision_score(true, pred, labels=labels, average=None, zero_division=0),
+            "recall": skm.recall_score(true, pred, labels=labels, average=None, zero_division=0),
+            "f1": skm.f1_score(true, pred, labels=labels, average=None, zero_division=0),
+            "support": pd.Series(true).value_counts().reindex(labels, fill_value=0).to_numpy(),
+        }
+
+    def summary(self, by_stage=False, **kwargs):
+        """Return group-level agreement scores.
 
-        .. seealso:: :py:meth:`yasa.EpochByEpochResults.get_agreement`
+        Parameters
+        ----------
+        self : :py:class:`yasa.EpochByEpochEvaluation`
+            A :py:class:`yasa.EpochByEpochEvaluation` instance.
+        by_stage : bool
+            If True, returned ``summary`` :py:class:`pandas.DataFrame` will include agreement scores
+            for each sleep stage, derived from one-vs-rest metrics. If False (default), ``summary``
+            will include agreement scores derived from average-based metrics.
+        **kwargs : key, value pairs
+            Additional keyword arguments are passed to :py:meth:`pandas.DataFrame.groupby.agg`.
 
         Returns
         -------
-        agreement : :py:class:`pandas.DataFrame`
-            A DataFrame with agreement metrics as indices and stages as columns.
+        summary : :py:class:`pandas.DataFrame`
+            A :py:class:`pandas.DataFrame` summarizing agreement scores across the entire dataset
+            with descriptive statistics.
+
+            >>> ebe = yasa.EpochByEpochEvaluation(...)
+            >>> ebe.summary()
+
+            This will give a :py:class:`pandas.DataFrame` where each row is an agreement metric and
+            each column is a descriptive statistic (e.g., mean, standard deviation).
+            To control the descriptive statistics included as columns:
+
+            >>> ebe.summary(func=["count", "mean", "sem"])
         """
-        true = self.data[self.refr_name]
-        pred = self.data[self.test_name]
-        if subject is not None:
-            true = true.loc[subject]
-            pred = pred.loc[subject]
-        scores = metrics.precision_recall_fscore_support(
-            true, pred, labels=self.labels, average=None, zero_division=0
-        )
-        agreement = pd.DataFrame(scores)
-        agreement.index = pd.Index(["precision", "recall", "fscore", "support"], name="metric")
-        agreement.columns = pd.Index(self.labels, name="stage")
-        return agreement
+        assert isinstance(by_stage, bool), "`by_stage` must be True or False"
+        agg_kwargs = {"func": ["mean", "std", "min", "median", "max"]} | kwargs
+        if by_stage:
+            summary = (self.indiv_agree_ovr
+                .groupby("stage").agg(**agg_kwargs)
+                .stack(0).rename_axis(["stage", "metric"])
+            )
+        else:
+            summary = self.indiv_agree_avg.agg(**agg_kwargs).T.rename_axis("metric")
+            ## Q: Should we include a column that calculates agreement treating all hypnograms as
+            ##    coming from one individual? Others sometimes report it, though I find it mostly
+            ##    meaningless because of possible n_epochs imbalances between subjects. I vote no.
+            # summary.insert(0, "all", self.multi_scorer_avg(self.data))
+        ## Q: Alternatively, we could remove the `by_stage` parameter and stack these into
+        ##    one merged DataFrame where the results that are *not* by-stage are included
+        ##    with an "all" stage label:
+        # summary = summary.assign(stage="all").set_index("stage", append=True).swaplevel()
+        # summary = pd.concat([summary, summary_ovr]).sort_index()
+        return summary
+
+    def get_sleep_stats(self):
+        """Return a :py:class:`pandas.DataFrame` of sleep statistics for each individual derived
+        from both Reference and Test measurement systems.
+
+        .. seealso:: :py:meth:`yasa.Hypnogram.sleep_statistics`
+
+        .. seealso:: :py:class:`yasa.SleepStatsEvaluation`
+
+        Parameters
+        ----------
+        self : :py:class:`yasa.EpochByEpochEvaluation`
+            A :py:class:`yasa.EpochByEpochEvaluation` instance.
+
+        Returns
+        -------
+        sstats : :py:class:`pandas.DataFrame`
+            A :py:class:`pandas.DataFrame` with sleep statistics as columns and two rows for each
+            individual (one from Reference measurement and another from Test measurement).
+        """
+        # refr_sstats = pd.Series(self.refr_hyps).map(lambda h: h.sleep_statistics()).apply(pd.Series)
+        # test_sstats = pd.Series(self.test_hyps).map(lambda h: h.sleep_statistics()).apply(pd.Series)
+        refr_sstats = pd.DataFrame({s: h.sleep_statistics() for s, h in self.refr_hyps.items()})
+        test_sstats = pd.DataFrame({s: h.sleep_statistics() for s, h in self.test_hyps.items()})
+        refr_sstats = pd.concat({self.refr_scorer: refr_sstats.T.rename_axis("subject")}, names=["measurement"])
+        test_sstats = pd.concat({self.test_scorer: test_sstats.T.rename_axis("subject")}, names=["measurement"])
+        sstats = pd.concat([refr_sstats, test_sstats])
+        return sstats
 
     def get_confusion_matrix(self, subject=None):
         """Return a ``refr_hyp``/``test_hyp``confusion matrix.
 
+        Parameters
+        ----------
+        self : :py:class:`yasa.EpochByEpochEvaluation`
+            A :py:class:`yasa.EpochByEpochEvaluation` instance.
+        subject : None or a valid individual identifier
+            If None (default), cross-tabulation is derived from the entire group dataset.
+            If a valid individual identifier, cross-tabulation is derived using only hypnograms
+            from that individual.
+
         Returns
         -------
         matrix : :py:class:`pandas.DataFrame`
             A confusion matrix with ``refr_hyp`` stages as indices and ``test_hyp`` stages as columns.
         """
-        true = self.data[self.refr_name]
-        pred = self.data[self.test_name]
+        assert subject is None or subject in self.subjects, "`subject` must be None or a valid subject ID"
+        true = self.data[self.refr_scorer]
+        pred = self.data[self.test_scorer]
         if subject is not None:
             true = true.loc[subject]
             pred = pred.loc[subject]
@@ -381,6 +532,13 @@ def plot_hypnograms(self, subject=None, legend=True, ax=None, refr_kwargs={}, te
             >>> hyp = simulate_hypnogram(seed=7)
             >>> ax = hyp.evaluate(hyp.simulate_similar()).plot_hypnograms()
         """
+        assert subject is None or subject in self.subjects, "`subject` must be None or a valid subject ID"
+        assert isinstance(legend, (bool, dict)), "`legend` must be True, False, or a dictionary"
+        assert isinstance(refr_kwargs, dict), "`refr_kwargs` must be a dictionary"
+        assert isinstance(test_kwargs, dict), "`test_kwargs` must be a dictionary"
+        assert not "ax" in refr_kwargs | test_kwargs, (
+            "ax can't be supplied to `kwargs_ref` or `test_kwargs`, use the `ax` keyword instead"
+        )
         if subject is None:
             if self.n_subjects == 1:
                 refr_hyp = self.refr_hyps[self.subjects[0]]
@@ -390,12 +548,6 @@ def plot_hypnograms(self, subject=None, legend=True, ax=None, refr_kwargs={}, te
         else:
             refr_hyp = self.refr_hyps[subject]
             test_hyp = self.test_hyps[subject]
-        assert isinstance(legend, (bool, dict)), "`legend` must be True, False, or a dictionary"
-        assert isinstance(refr_kwargs, dict), "`refr_kwargs` must be a dictionary"
-        assert isinstance(test_kwargs, dict), "`test_kwargs` must be a dictionary"
-        assert not "ax" in refr_kwargs | test_kwargs, (
-            "ax can't be supplied to `kwargs_ref` or `test_kwargs`, use the `ax` keyword instead"
-        )
         plot_refr_kwargs = {"highlight": None, "alpha": 0.8}
         plot_test_kwargs = {"highlight": None, "alpha": 0.8, "color": "darkcyan", "ls": "dashed"}
         plot_refr_kwargs.update(refr_kwargs)
@@ -428,6 +580,7 @@ def plot_roc(self, subject=None, palette=None, ax=None, **kwargs):
         ax : :py:class:`matplotlib.axes.Axes`
             Matplotlib Axes
         """
+        assert subject is None or subject in self.subjects, "`subject` must be None or a valid subject ID"
         # assert self.test_hyp.probas is not None
         raise NotImplementedError("Requires probability/confidence values.")
 
@@ -451,10 +604,10 @@ class SleepStatsEvaluation:
     test_data : :py:class:`pandas.DataFrame`
         A :py:class:`pandas.DataFrame` with sleep statistics from the test measurement system.
         Shape, indices, and columns must be identical to ``refr_data``.
-    refr_name : str
-        Name of the reference measurement device, used for labeling.
-    test_name : str
-        Name of the test measurement device, used for labeling.
+    refr_scorer : str
+        Name of the reference measurement scorer, used for labeling.
+    test_scorer : str
+        Name of the test measurement scorer, used for labeling.
     alpha : float
         Alpha cutoff used for all three tests.
     kwargs_normality : dict
@@ -543,8 +696,8 @@ def __init__(
         refr_data,
         test_data,
         *,
-        refr_name="Reference",
-        test_name="Test",
+        refr_scorer="Reference",
+        test_scorer="Test",
         kwargs_normality={"alpha": 0.05},
         kwargs_regression={"alpha": 0.05},
         kwargs_homoscedasticity={"alpha": 0.05},
@@ -554,9 +707,9 @@ def __init__(
         assert np.array_equal(refr_data.index, test_data.index), "`refr_data` and `test_data` indices must be identical"
         assert np.array_equal(refr_data.columns, test_data.columns), "`refr_data` and `test_data` columns must be identical"
         assert refr_data.index.name == test_data.index.name, "`refr_data` and `test_data` index names must be identical"
-        assert isinstance(refr_name, str)
-        assert isinstance(test_name, str)
-        assert refr_name != test_name
+        assert isinstance(refr_scorer, str)
+        assert isinstance(test_scorer, str)
+        assert refr_scorer != test_scorer
         assert isinstance(kwargs_normality, dict)
         assert isinstance(kwargs_regression, dict)
         assert isinstance(kwargs_homoscedasticity, dict)
@@ -569,8 +722,8 @@ def __init__(
         refr_data.index.name = subj_name
         test_data.index.name = subj_name
         diff_data = pd.concat({"difference": test_data.sub(refr_data)}, names=["measurement"])
-        refr_data = pd.concat({refr_name: refr_data}, names=["measurement"])
-        test_data = pd.concat({test_name: test_data}, names=["measurement"])
+        refr_data = pd.concat({refr_scorer: refr_data}, names=["measurement"])
+        test_data = pd.concat({test_scorer: test_data}, names=["measurement"])
         data = (pd.concat([refr_data, test_data, diff_data])
             .melt(var_name="sstat", ignore_index=False).reset_index()
             .pivot(columns="measurement", index=[subj_name, "sstat"], values="value")
@@ -586,7 +739,7 @@ def __init__(
             ## Q: Should this be logged as just info?
 
         ## NORMALITY ## Test reference data for normality at each sleep statistic
-        normality = data.groupby("sstat")[refr_name].apply(pg.normality, **kwargs_normality).droplevel(-1)
+        normality = data.groupby("sstat")[refr_scorer].apply(pg.normality, **kwargs_normality).droplevel(-1)
 
         ## PROPORTIONAL BIAS ## Test each sleep statistic for proportional bias
         # Subject-level residuals for each statistic are added to data.
@@ -595,7 +748,7 @@ def __init__(
         # proportional bias and residuals that will be used for the later  tests.
         for ss_name, ss_df in data.groupby("sstat"):
             # Regress the difference scores on the reference scores
-            model = pg.linear_regression(ss_df[refr_name], ss_df["difference"], **kwargs_regression)
+            model = pg.linear_regression(ss_df[refr_scorer], ss_df["difference"], **kwargs_regression)
             model.insert(0, "sstat", ss_name)
             # Extract subject-level residuals for later homoscedasticity tests
             resid_dict = {subj_name: ss_df[subj_name], "sstat": ss_name, "pbias_residual": model.residuals_}
@@ -614,7 +767,7 @@ def __init__(
         prop_bias["unbiased"] = prop_bias["pval"].ge(kwargs_regression["alpha"])
 
         ## Test each statistic for homoscedasticity ##
-        columns = [refr_name, "difference", "pbias_residual"]
+        columns = [refr_scorer, "difference", "pbias_residual"]
         homoscedasticity_func = lambda df: pg.homoscedasticity(df[columns], **kwargs_homoscedasticity)
         homoscedasticity = data.groupby("sstat").apply(homoscedasticity_func).droplevel(-1)
 
@@ -625,8 +778,8 @@ def __init__(
         self._proportional_bias_full = prop_bias_full  # Q: Is this worth saving??
         self._homoscedasticity = homoscedasticity
         # These will not be set as properties, as they are only needed internally
-        self._refr_name = refr_name
-        self._test_name = test_name
+        self._refr_scorer = refr_scorer
+        self._test_scorer = test_scorer
         self._subj_name = subj_name
         self._n_subjects = data[subj_name].nunique()
         # Pivot new to not include removed sstats
@@ -647,14 +800,14 @@ def diff_data(self):
         return self._diff_data
 
     @property
-    def refr_name(self):
-        """The name of the reference measurement."""
-        return self._refr_name
+    def refr_scorer(self):
+        """The name of the reference measurement scorer."""
+        return self._refr_scorer
 
     @property
-    def test_name(self):
-        """The name of the test measurement."""
-        return self._test_name
+    def test_scorer(self):
+        """The name of the test measurement scorer."""
+        return self._test_scorer
 
     @property
     def subj_name(self):
@@ -689,8 +842,8 @@ def proportional_bias_full(self):
     def __repr__(self):
         # TODO v0.8: Keep only the text between < and >
         return (
-            f"<SleepStatsEvaluation | Test measurement '{self.test_name}' evaluated against "
-            f"reference measurement '{self.refr_name}', {self.n_subjects} subjects>\n"
+            f"<SleepStatsEvaluation | Test measurement '{self.test_scorer}' evaluated against "
+            f"reference measurement '{self.refr_scorer}', {self.n_subjects} subjects>\n"
             " - Use `.summary()` to get pass/fail values from various checks\n"
             " - Use `.plot_blandaltman()` to get a Bland-Altman-plot grid for sleep statistics\n"
             "See the online documentation for more details."
@@ -698,8 +851,8 @@ def __repr__(self):
 
     def __str__(self):
         return (
-            f"<SleepStatsEvaluation | Test measurement '{self.test_name}' evaluated against "
-            f"reference measurement '{self.refr_name}', {self.n_subjects} subjects>\n"
+            f"<SleepStatsEvaluation | Test measurement '{self.test_scorer}' evaluated against "
+            f"reference measurement '{self.refr_scorer}', {self.n_subjects} subjects>\n"
             " - Use `.summary()` to get pass/fail values from various checks\n"
             " - Use `.plot_blandaltman()` to get a Bland-Altman-plot grid for sleep statistics\n"
             "See the online documentation for more details."
@@ -838,7 +991,7 @@ def plot_blandaltman(self, kwargs_facetgrid={}, **kwargs):
         # Initialize a grid of plots with an Axes for each sleep statistic
         g = sns.FacetGrid(self.data, col="sstat", **facetgrid_kwargs)
         # Draw Bland-Altman plot on each axis
-        g.map(pg.plot_blandaltman, self.test_name, self.refr_name, **blandaltman_kwargs)
+        g.map(pg.plot_blandaltman, self.test_scorer, self.refr_scorer, **blandaltman_kwargs)
         # Adjust aesthetics
         for ax in g.axes.flat:
             # Tidy-up axis limits with symmetric y-axis and minimal ticks
@@ -846,7 +999,7 @@ def plot_blandaltman(self, kwargs_facetgrid={}, **kwargs):
             ax.set_ylim(-bound, bound)
             ax.yaxis.set_major_locator(plt.MaxNLocator(nbins=2, integer=True, symmetric=True))
             ax.xaxis.set_major_locator(plt.MaxNLocator(nbins=1, integer=True))
-        ylabel = " - ".join((self.test_name, self.refr_name))
+        ylabel = " - ".join((self.test_scorer, self.refr_scorer))
         g.set_ylabels(ylabel)
         g.set_titles(col_template="{col_name}")
         g.tight_layout(w_pad=1, h_pad=2)

From 48f1df6dccffe725c568002259e90125945bcd50 Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Tue, 3 Jan 2023 23:59:28 -0600
Subject: [PATCH 15/43] cleanup

---
 yasa/evaluation.py | 504 ++++++++++++++++++++++++---------------------
 yasa/hypno.py      |  13 +-
 2 files changed, 270 insertions(+), 247 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index aa5d291..35273a4 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -1,5 +1,5 @@
 """
-YASA code for evaluating the agreement between two sleep-measurement systems.
+YASA code for evaluating the agreement between two scorers.
 
 There are two levels of evaluating staging performance:
 - Comparing two hypnograms (e.g., human vs automated scorer)
@@ -39,14 +39,14 @@
 
 class EpochByEpochEvaluation:
     """
-    See :py:meth:`yasa.Hypnogram.evaluate`
+    For comparing only 2 hypnograms, use :py:meth:`yasa.Hypnogram.evaluate`.
 
     Parameters
     ----------
-    refr_hyp : :py:class:`yasa.Hypnogram`
-        The reference or ground-truth hypnogram, or sequence of hypnograms.
-    test_hyp : :py:class:`yasa.Hypnogram`
-        The test or to-be-evaluated hypnogram, or sequence of hypnograms.
+    refr_hyps : :py:class:`yasa.Hypnogram`
+        A collection of reference or ground-truth hypnograms.
+    test_hyps : :py:class:`yasa.Hypnogram`
+        A collection of test or to-be-evaluated hypnograms.
 
     Notes
     -----
@@ -63,20 +63,9 @@ class EpochByEpochEvaluation:
     Examples
     --------
     >>> import yasa
-    >>> hypno_a = yasa.simulate_hypnogram(tib=90, seed=8, scorer="RaterA")
-    >>> hypno_b = yasa.simulate_hypnogram(tib=90, seed=9, scorer="RaterB")
-    >>> ebe = yasa.EpochByEpochEvaluation(hypno_a, hypno_b)  # or hypno_a.evaluate(hypno_b)
-    >>> ebe.get_confusion_matrix()
-    RaterB  WAKE   N1   N2  N3  REM  ART  UNS  Total
-    RaterA
-    WAKE      52   38  126  23   51    0    0    290
-    N1        59    2   27   8   14    0    0    110
-    N2       117   50  105  15   44    0    0    331
-    N3        34   26   62  42   15    0    0    179
-    REM       15   12   13  10    0    0    0     50
-    ART        0    0    0   0    0    0    0      0
-    UNS        0    0    0   0    0    0    0      0
-    Total    277  128  333  98  124    0    0    960
+    >>> hyps_a = [yasa.simulate_hypnogram(tib=600, scorer="RaterA", seed=i) for i in range(20)]
+    >>> hyps_b = [h.simulate_similar(scorer="RaterB", seed=i) for i, h in enumerate(hyps_a)]
+    >>> ebe = yasa.EpochByEpochEvaluation(hyps_a, hyps_b)
 
     >>> ebe.get_agreement().round(3)
     metric
@@ -122,53 +111,78 @@ class EpochByEpochEvaluation:
         >>>
         >>> acc = ebe.get_agreement().multiply(100).round(0).at["accuracy"]
         >>> ax.text(0.01, 1, f"Accuracy = {acc}%", ha="left", va="bottom", transform=ax.transAxes)
+
+    When comparing only 2 hypnograms, use the :py:meth:`yasa.Hypnogram.evaluate` method:
+
+    >>> hypno_a = yasa.simulate_hypnogram(tib=90, scorer="RaterA", seed=8)
+    >>> hypno_b = hypno_a.simulate_similar(scorer="RaterB", seed=9)
+    >>> ebe = hypno_a.evaluate(hypno_b)
+
+    >>> ebe.get_confusion_matrix()
+    RaterB  WAKE   N1   N2  N3  REM  ART  UNS  Total
+    RaterA
+    WAKE      52   38  126  23   51    0    0    290
+    N1        59    2   27   8   14    0    0    110
+    N2       117   50  105  15   44    0    0    331
+    N3        34   26   62  42   15    0    0    179
+    REM       15   12   13  10    0    0    0     50
+    ART        0    0    0   0    0    0    0      0
+    UNS        0    0    0   0    0    0    0      0
+    Total    277  128  333  98  124    0    0    960
     """
     def __init__(self, refr_hyps, test_hyps):
-        from yasa.hypno import Hypnogram  # Loading here to avoid circular import
+        from yasa.hypno import Hypnogram  # Avoiding circular import
 
-        assert isinstance(refr_hyps, Hypnogram) or hasattr(refr_hyps, "__iter__"), (
-            "`refr_hyps` must be a YASA hypnogram or iterable containing multiple YASA hypnograms"
-        )
-        assert isinstance(test_hyps, Hypnogram) or hasattr(test_hyps, "__iter__"), (
-            "`test_hyps` must be a YASA hypnogram or iterable containing multiple YASA hypnograms"
-        )
+        assert hasattr(refr_hyps, "__iter__"), "`refr_hyps` must be a an iterable"
+        assert hasattr(test_hyps, "__iter__"), "`test_hyps` must be a an iterable"
         assert type(refr_hyps) == type(test_hyps), "`refr_hyps` and `test_hyps` must be same type"
-
-        # Convert solo hypnograms to len==1 tuples
-        if isinstance(refr_hyps, Hypnogram):  # As below, picking refr_hyps for checks arbitrarily
-            refr_hyps = [refr_hyps]
-            test_hyps = [test_hyps]
-        assert len(refr_hyps) == len(test_hyps), "must have same number of subjects"
+        assert len(refr_hyps) == len(test_hyps), (
+            "`refr_hyps` and `test_hyps` must have the same number of hypnograms"
+        )
 
         if isinstance(refr_hyps, dict):
-            assert refr_hyps.keys() == test_hyps.keys(), "must have same subject identifiers and in same order"
-            subjects, refr_hyps = zip(*refr_hyps.items())
-            # assert all(isinstance(s, str) for s in subjects)
+            # If user provides dictionaries, split into sleep IDs and hypnograms
+            assert refr_hyps.keys() == test_hyps.keys(), (
+                "hypnograms in `refr_hyps` and `test_hyps` must have identical sleep IDs"
+            )
+            sleep_ids, refr_hyps = zip(*refr_hyps.items())
             test_hyps = tuple(test_hyps.values())
         else:
-            subjects = 1 + np.arange(len(refr_hyps))
+            # Otherwise, generate sequential integer sleep IDs starting at 1
+            sleep_ids = tuple(range(1, 1 + len(refr_hyps)))
 
-        all_hyps = refr_hyps + test_hyps
-        assert all(isinstance(hyp, Hypnogram) for hyp in all_hyps), "`refr_hyps` and `test_hyps` must only include hypnograms"
-        assert all(h.scorer is not None for h in all_hyps), "all hypnograms must have a scorer"
-        for h1, h2 in zip(all_hyps[:-1], all_hyps[1:]):
-            assert h1.n_stages == h2.n_stages, "all hypnograms must have the same n_stages"
+        assert all(isinstance(hyp, Hypnogram) for hyp in refr_hyps + test_hyps), (
+            "`refr_hyps` and `test_hyps` must only include YASA hypnograms"
+        )
+        assert all(h.scorer is not None for h in refr_hyps + test_hyps), (
+            "all hypnograms must have a scorer name"
+        )
+        for h1, h2 in zip((refr_hyps + test_hyps)[:-1], (refr_hyps + test_hyps)[1:]):
             assert h1.labels == h2.labels, "all hypnograms must have the same labels"
             assert h1.mapping == h2.mapping, "all hypnograms must have the same mapping"
-        assert all(h1.scorer == h2.scorer for h1, h2 in zip(refr_hyps[:-1], refr_hyps[1:])), "all `refr_hyps` must have the same scorer"
-        assert all(h1.scorer == h2.scorer for h1, h2 in zip(test_hyps[:-1], test_hyps[1:])), "all `test_hyps` must have the same scorer"
-        assert all(h1.scorer != h2.scorer for h1, h2 in zip(refr_hyps, test_hyps)), "each `refr_hyps` and `test_hyps` pair must have unique scorers"
-        assert all(h1.n_epochs == h2.n_epochs for h1, h2 in zip(refr_hyps, test_hyps)), "each `refr_hyps` and `test_hyps` pair must have the same n_epochs"
-        ## Could use set() for those above
-        ## Or set scorer as the first available and check all equal
+            assert h1.n_stages == h2.n_stages, "all hypnograms must have the same n_stages"
+        assert all(h1.scorer == h2.scorer for h1, h2 in zip(refr_hyps[:-1], refr_hyps[1:])), (
+            "all `refr_hyps` must have the same scorer"
+        )
+        assert all(h1.scorer == h2.scorer for h1, h2 in zip(test_hyps[:-1], test_hyps[1:])), (
+            "all `test_hyps` must have the same scorer"
+        )
+        assert all(h1.scorer != h2.scorer for h1, h2 in zip(refr_hyps, test_hyps)), (
+            "each `refr_hyps` and `test_hyps` pair must have unique scorers"
+        )
+        assert all(h1.n_epochs == h2.n_epochs for h1, h2 in zip(refr_hyps, test_hyps)), (
+            "each `refr_hyps` and `test_hyps` pair must have the same n_epochs"
+        )
+        ## Q: Could use set() for those above.
+        ##    Or set scorer as the first available and check all equal.
 
-        # Convert to dictionaries with subjects and hypnograms
-        refr_hyps = { s: h for s, h in zip(subjects, refr_hyps) }
-        test_hyps = { s: h for s, h in zip(subjects, test_hyps) }
+        # Convert to dictionaries with sleep_ids and hypnograms
+        refr_hyps = { s: h for s, h in zip(sleep_ids, refr_hyps) }
+        test_hyps = { s: h for s, h in zip(sleep_ids, test_hyps) }
 
         # Merge all hypnograms into a single multiindexed dataframe
-        refr = pd.concat(pd.concat({s: h.hypno}, names=["subject"]) for s, h in refr_hyps.items())
-        test = pd.concat(pd.concat({s: h.hypno}, names=["subject"]) for s, h in test_hyps.items())
+        refr = pd.concat(pd.concat({s: h.hypno}, names=["sleep_id"]) for s, h in refr_hyps.items())
+        test = pd.concat(pd.concat({s: h.hypno}, names=["sleep_id"]) for s, h in test_hyps.items())
         data = pd.concat([refr, test], axis=1)
 
         ########################################################################
@@ -176,16 +190,16 @@ def __init__(self, refr_hyps, test_hyps):
         ########################################################################
 
         # Get individual-level averaged/weighted agreement scores
-        # indiv_agree_avg = pd.DataFrame({s: multi_scorer_avg(refr_hyps[s].as_int(), test_hyps[s].as_int()) for s in subjects })
         indiv_agree_avg = data.groupby(level=0).apply(self.multi_scorer_avg).apply(pd.Series)
+        ## Q: Check speed against pd.DataFrame({s: multscore(hyps[s], hyps[s]) for s in subjects})
 
         # Get individual-level one-vs-rest/un-weighted agreement scores
         # Only include stages that appear in the data
         # labels = data[refr_scorer].cat.remove_unused_categories().cat.categories
-        labels = [l for l in refr_hyps[subjects[0]].hypno.cat.categories if l in data.values]
+        labels = [l for l in refr_hyps[sleep_ids[0]].hypno.cat.categories if l in data.values]
         ############ OPTION 1 (uses staticmethod, slower by 500ms)
         indiv_agree_ovr = (data
-            # Get multiple metrics for each individual
+            # Get multiple metrics for each individual sleep
             .groupby(level=0).apply(self.multi_scorer_ovr, labels=labels)
             # Unpack metrics results and reshape
             .apply(pd.Series).stack().apply(pd.Series)
@@ -203,7 +217,8 @@ def __init__(self, refr_hyps, test_hyps):
         # indiv_agree_ovr = (data
         #     .groupby(level=0).apply(prfs_func)
         #     .explode().apply(pd.Series)
-        #     .assign(metric=["precision", "recall", "f1", "support"] * len(refr_hyps)).set_index("metric", append=True)
+        #     .assign(metric=["precision", "recall", "f1", "support"] * len(refr_hyps))
+        #     .set_index("metric", append=True)
         #     .rename_axis(columns="stage").rename(columns={i: l for i, l in enumerate(labels)})
         #     .stack().unstack("metric").rename_axis(columns=None)
         # )
@@ -212,55 +227,33 @@ def __init__(self, refr_hyps, test_hyps):
         # agr = agr.loc[agr.any(axis=1)]  # or .pipe
         # And then could drop the label restriction, just passing all labels to preserve order
 
-        # ac_f = lambda s: skm.accuracy_score(*s.values.T)
-        # ka_f = lambda s: skm.cohen_kappa_score(*s.values.T)
-        # ja_f = lambda s: skm.jaccard_score(*s.values.T, average=average)
-        # pr_f = lambda s: skm.precision_score(*s.values.T, average=average, zero_division=zd)
-        # re_f = lambda s: skm.recall_score(*s.values.T, average=average, zero_division=zd)
-        # f1_f = lambda s: skm.f1_score(*s.values.T, average=average, zero_division=zd)
-        # ac = data.groupby(level=0).apply(ac_f)
-        # ka = data.groupby(level=0).apply(ka_f)
-        # ja = data.groupby(level=0).apply(ja_f)
-        # pr = data.groupby(level=0).apply(pr_f)
-        # re = data.groupby(level=0).apply(re_f)
-        # f1 = data.groupby(level=0).apply(f1_f)
-        # agreement_individuals = pd.DataFrame(
-        #     {"accuracy": ac, "kappa": ka, "jaccard": ja, "precision": pr, "recall": re, "f1": f1}
-        # )
-
         # Set attributes
         self._data = data
-        self._subjects = subjects
-        self._n_subjects = len(subjects)
+        self._sleep_ids = sleep_ids
+        self._n_sleeps = len(sleep_ids)
         self._refr_hyps = refr_hyps
         self._test_hyps = test_hyps
-        self._refr_scorer = refr_hyps[subjects[0]].scorer
-        self._test_scorer = test_hyps[subjects[0]].scorer
-        self._labels = refr_hyps[subjects[0]].labels
-
+        self._refr_scorer = refr_hyps[sleep_ids[0]].scorer
+        self._test_scorer = test_hyps[sleep_ids[0]].scorer
+        self._labels = refr_hyps[sleep_ids[0]].labels
         self._indiv_agree_avg = indiv_agree_avg
         self._indiv_agree_ovr = indiv_agree_ovr
+        ## Q: Merge these to one individual agreement dataframe?
 
     def __repr__(self):
         # TODO v0.8: Keep only the text between < and >
-        text_subjects = f", {self.n_subjects} subject" + ("s" if self.n_subjects > 1 else "")
+        s = "s" if self._n_sleeps > 1 else ""
         return (
-            f"<EpochByEpochEvaluation | Test Hypnogram scored by {self.refr_scorer} evaluated "
-            f"against reference Hypnogram scored by {self.test_scorer}{text_subjects}>\n"
+            f"<EpochByEpochEvaluation | Test hypnogram{s} scored by {self.test_scorer} evaluated "
+            f"against reference hypnogram{s} scored by {self.refr_scorer}, {self._n_sleeps} sleep"
+            f"session{s}>\n"
             " - Use `.get_agreement()` to get agreement measures as a pandas.Series\n"
             " - Use `.plot_hypnograms()` to plot the two hypnograms overlaid\n"
             "See the online documentation for more details."
         )
 
     def __str__(self):
-        text_subjects = f", {self.n_subjects} subject" + ("s" if self.n_subjects > 1 else "")
-        return (
-            f"<EpochByEpochEvaluation | Test Hypnogram scored by {self.refr_scorer} evaluated "
-            f"against reference Hypnogram scored by {self.test_scorer}{text_subjects}>\n"
-            " - Use `.get_agreement()` to get agreement measures as a pandas.Series\n"
-            " - Use `.plot_hypnograms()` to plot the two hypnograms overlaid\n"
-            "See the online documentation for more details."
-        )
+        return self.__repr__()
 
     @property
     def data(self):
@@ -269,21 +262,23 @@ def data(self):
 
     @property
     def refr_hyps(self):
-        """The reference YASA hypnograms."""
+        """A dictionary of all reference YASA hypnograms with sleep IDs as keys."""
         return self._refr_hyps
 
     @property
     def test_hyps(self):
-        """The test YASA hypnograms."""
+        """A dictionary of all test YASA hypnograms with sleep IDs as keys."""
         return self._test_hyps
 
     @property
-    def subjects(self):
-        return self._subjects
+    def sleep_ids(self):
+        """A tuple of all sleep IDs."""
+        return self._sleep_ids
 
     @property
-    def n_subjects(self):
-        return self._n_subjects
+    def n_sleeps(self):
+        """The number of unique sleep sessions."""
+        return self._n_sleeps
 
     @property
     def refr_scorer(self):
@@ -303,24 +298,20 @@ def labels(self):
     @property
     def indiv_agree_avg(self):
         """
-        A :py:class:`pandas.DataFrame` of individual-level ``refr_hyp``/``test_hyp`` agreement
-        metrics that average over individual sleep stages.
+        A :py:class:`pandas.DataFrame` of ``refr_hyp``/``test_hyp`` average-based agreement scores
+        for each individual sleep session.
 
-        .. seealso:: :py:attr:`yasa.EpochByEvaluation.individual_agreement_ovr`
-        
-        .. seealso:: :py:attr:`yasa.EpochByEvaluation.group_agreement_avg`
+        .. seealso:: :py:attr:`yasa.EpochByEpochEvaluation.indiv_agree_ovr`
         """
         return self._indiv_agree_avg
 
     @property
     def indiv_agree_ovr(self):
         """
-        A :py:class:`pandas.DataFrame` of individual-level ``refr_hyp``/``test_hyp`` "one-vs-rest"
-        agreement metrics. Metrics for each sleep stage are provided.
+        A :py:class:`pandas.DataFrame` of ``refr_hyp``/``test_hyp`` one-vs-rest agreement scores
+        for each individual sleep session. Agreement scores are provided for each sleep stage.
 
-        .. seealso:: :py:attr:`yasa.EpochByEvaluation.individual_agreement_ovr`
-        
-        .. seealso:: :py:attr:`yasa.EpochByEvaluation.group_agreement_avg`
+        .. seealso:: :py:attr:`yasa.EpochByEpochEvaluation.indiv_agree_avg`
         """
         return self._indiv_agree_ovr
 
@@ -356,18 +347,20 @@ def multi_scorer_avg(df):
         return {
             "accuracy": skm.accuracy_score(true, pred),
             "kappa": skm.cohen_kappa_score(true, pred),
-            "micro_jaccard": skm.jaccard_score(true, pred, average="micro"),
-            "macro_jaccard": skm.jaccard_score(true, pred, average="macro"),
-            "weighted_jaccard": skm.jaccard_score(true, pred, average="weighted"),
-            "micro_precision": skm.precision_score(true, pred, average="micro", zero_division=0),
-            "macro_precision": skm.precision_score(true, pred, average="macro", zero_division=0),
-            "weighted_precision": skm.precision_score(true, pred, average="weighted", zero_division=0),
-            "micro_recall": skm.recall_score(true, pred, average="micro", zero_division=0),
-            "macro_recall": skm.recall_score(true, pred, average="macro", zero_division=0),
-            "weighted_recall": skm.recall_score(true, pred, average="weighted", zero_division=0),
-            "micro_f1": skm.f1_score(true, pred, average="micro", zero_division=0),
-            "macro_f1": skm.f1_score(true, pred, average="macro", zero_division=0),
-            "weighted_f1": skm.f1_score(true, pred, average="weighted", zero_division=0),
+            "jaccard_micro": skm.jaccard_score(true, pred, average="micro"),
+            "jaccard_macro": skm.jaccard_score(true, pred, average="macro"),
+            "jaccard_weighted": skm.jaccard_score(true, pred, average="weighted"),
+            "precision_micro": skm.precision_score(true, pred, average="micro", zero_division=0),
+            "precision_macro": skm.precision_score(true, pred, average="macro", zero_division=0),
+            "precision_weighted": skm.precision_score(
+                true, pred, average="weighted", zero_division=0
+            ),
+            "recall_micro": skm.recall_score(true, pred, average="micro", zero_division=0),
+            "recall_macro": skm.recall_score(true, pred, average="macro", zero_division=0),
+            "recall_weighted": skm.recall_score(true, pred, average="weighted", zero_division=0),
+            "f1_micro": skm.f1_score(true, pred, average="micro", zero_division=0),
+            "f1_macro": skm.f1_score(true, pred, average="macro", zero_division=0),
+            "f1_weighted": skm.f1_score(true, pred, average="weighted", zero_division=0),
         }
 
     @staticmethod
@@ -440,13 +433,14 @@ def summary(self, by_stage=False, **kwargs):
         ## Q: Alternatively, we could remove the `by_stage` parameter and stack these into
         ##    one merged DataFrame where the results that are *not* by-stage are included
         ##    with an "all" stage label:
-        # summary = summary.assign(stage="all").set_index("stage", append=True).swaplevel()
-        # summary = pd.concat([summary, summary_ovr]).sort_index()
+        ## >>> summary = summary.assign(stage="all").set_index("stage", append=True).swaplevel()
+        ## >>> summary = pd.concat([summary, summary_ovr]).sort_index()
         return summary
 
     def get_sleep_stats(self):
-        """Return a :py:class:`pandas.DataFrame` of sleep statistics for each individual derived
-        from both Reference and Test measurement systems.
+        """
+        Return a :py:class:`pandas.DataFrame` of sleep statistics for each individual derived from
+        both reference and test scorers.
 
         .. seealso:: :py:meth:`yasa.Hypnogram.sleep_statistics`
 
@@ -461,63 +455,73 @@ def get_sleep_stats(self):
         -------
         sstats : :py:class:`pandas.DataFrame`
             A :py:class:`pandas.DataFrame` with sleep statistics as columns and two rows for each
-            individual (one from Reference measurement and another from Test measurement).
+            individual (one from reference scorer and another from test scorer).
         """
-        # refr_sstats = pd.Series(self.refr_hyps).map(lambda h: h.sleep_statistics()).apply(pd.Series)
-        # test_sstats = pd.Series(self.test_hyps).map(lambda h: h.sleep_statistics()).apply(pd.Series)
+        # Get all sleep statistics
         refr_sstats = pd.DataFrame({s: h.sleep_statistics() for s, h in self.refr_hyps.items()})
         test_sstats = pd.DataFrame({s: h.sleep_statistics() for s, h in self.test_hyps.items()})
-        refr_sstats = pd.concat({self.refr_scorer: refr_sstats.T.rename_axis("subject")}, names=["measurement"])
-        test_sstats = pd.concat({self.test_scorer: test_sstats.T.rename_axis("subject")}, names=["measurement"])
-        sstats = pd.concat([refr_sstats, test_sstats])
-        return sstats
-
-    def get_confusion_matrix(self, subject=None):
-        """Return a ``refr_hyp``/``test_hyp``confusion matrix.
+        # Reshape and name axis
+        refr_sstats = refr_sstats.T.rename_axis("sleep_id")
+        test_sstats = test_sstats.T.rename_axis("sleep_id")
+        # Convert to MultiIndex with new scorer level
+        refr_sstats = pd.concat({self.refr_scorer: refr_sstats}, names=["scorer"])
+        test_sstats = pd.concat({self.test_scorer: test_sstats}, names=["scorer"])
+        return pd.concat([refr_sstats, test_sstats])
+
+    def get_confusion_matrix(self, sleep_id=None):
+        """
+        Return a ``refr_hyp``/``test_hyp`` confusion matrix from either a single session or all
+        sessions concatenated together.
 
         Parameters
         ----------
         self : :py:class:`yasa.EpochByEvaluation`
             A :py:class:`yasa.EpochByEvaluation` instance.
-        subject : None or a valid individual identifier
+        sleep_id : None or a valid sleep ID
             If None (default), cross-tabulation is derived from the entire group dataset.
-            If a valid individual identifier, cross-tabulation is derived using only hypnograms
-            from that individual.
+            If a valid sleep ID, cross-tabulation is derived using only the reference and test
+            scored hypnograms from that sleep session.
 
         Returns
         -------
         matrix : :py:class:`pandas.DataFrame`
-            A confusion matrix with ``refr_hyp`` stages as indices and ``test_hyp`` stages as columns.
+            A confusion matrix with ``refr_hyp`` stages as indices and ``test_hyp`` stages as
+            columns.
         """
-        assert subject is None or subject in self.subjects, "`subject` must be None or a valid subject ID"
+        assert sleep_id is None or sleep_id in self.sleep_ids, (
+            "`sleep_id` must be None or a valid sleep ID"
+        )
         true = self.data[self.refr_scorer]
         pred = self.data[self.test_scorer]
-        if subject is not None:
-            true = true.loc[subject]
-            pred = pred.loc[subject]
-        # Generate confusion matrix.
+        if sleep_id is not None:
+            true = true.loc[sleep_id]
+            pred = pred.loc[sleep_id]
         matrix = pd.crosstab(true, pred, margins=True, margins_name="Total")
         # Reorder indices in sensible order and to include all stages
         index_col_labels = self.labels + ["Total"]
         matrix = matrix.reindex(index=index_col_labels, columns=index_col_labels, fill_value=0)
         return matrix.astype(int)
 
-    def plot_hypnograms(self, subject=None, legend=True, ax=None, refr_kwargs={}, test_kwargs={}):
-        """Plot the two hypnograms, where ``refr_hyp`` is overlaid on ``refr_hyp``.
+    def plot_hypnograms(self, sleep_id=None, legend=True, ax=None, refr_kwargs={}, test_kwargs={}):
+        """Plot the two hypnograms, where the reference hypnogram is overlaid on the test hypnogram.
 
         .. seealso:: :py:func:`yasa.plot_hypnogram`
 
         Parameters
         ----------
+        sleep_id : None or a valid sleep ID
+            If a valid sleep ID, plot the reference and test hypnograms from one sleep session.
         legend : bool or dict
             If True (default) or a dictionary, a legend is added. If a dictionary, all key/value
             pairs are passed as keyword arguments to the :py:func:`matplotlib.pyplot.legend` call.
         ax : :py:class:`matplotlib.axes.Axes` or None
             Axis on which to draw the plot, optional.
         refr_kwargs : dict
-            Keyword arguments passed to :py:func:`yasa.plot_hypnogram` when plotting ``refr_hyp``.
+            Keyword arguments passed to :py:func:`yasa.plot_hypnogram` when plotting the reference
+            hypnogram.
         test_kwargs : dict
-            Keyword arguments passed to :py:func:`yasa.plot_hypnogram` when plotting ``test_hyp``.
+            Keyword arguments passed to :py:func:`yasa.plot_hypnogram` when plotting the test
+            hypnogram.
 
         Returns
         -------
@@ -532,22 +536,24 @@ def plot_hypnograms(self, subject=None, legend=True, ax=None, refr_kwargs={}, te
             >>> hyp = simulate_hypnogram(seed=7)
             >>> ax = hyp.evaluate(hyp.simulate_similar()).plot_hypnograms()
         """
-        assert subject is None or subject in self.subjects, "`subject` must be None or a valid subject ID"
+        assert sleep_id is None or sleep_id in self.sleep_ids, (
+            "`sleep_id` must be None or a valid sleep ID"
+        )
         assert isinstance(legend, (bool, dict)), "`legend` must be True, False, or a dictionary"
         assert isinstance(refr_kwargs, dict), "`refr_kwargs` must be a dictionary"
         assert isinstance(test_kwargs, dict), "`test_kwargs` must be a dictionary"
         assert not "ax" in refr_kwargs | test_kwargs, (
             "ax can't be supplied to `kwargs_ref` or `test_kwargs`, use the `ax` keyword instead"
         )
-        if subject is None:
-            if self.n_subjects == 1:
-                refr_hyp = self.refr_hyps[self.subjects[0]]
-                test_hyp = self.test_hyps[self.subjects[0]]
+        if sleep_id is None:
+            if self.n_sleeps == 1:
+                refr_hyp = self.refr_hyps[self.sleep_ids[0]]
+                test_hyp = self.test_hyps[self.sleep_ids[0]]
             else:
-                raise NotImplementedError("Plotting is currently allowed for only one subject")
+                raise NotImplementedError("Multi-session plotting is not currently supported")
         else:
-            refr_hyp = self.refr_hyps[subject]
-            test_hyp = self.test_hyps[subject]
+            refr_hyp = self.refr_hyps[sleep_id]
+            test_hyp = self.test_hyps[sleep_id]
         plot_refr_kwargs = {"highlight": None, "alpha": 0.8}
         plot_test_kwargs = {"highlight": None, "alpha": 0.8, "color": "darkcyan", "ls": "dashed"}
         plot_refr_kwargs.update(refr_kwargs)
@@ -563,7 +569,7 @@ def plot_hypnograms(self, subject=None, legend=True, ax=None, refr_kwargs={}, te
                 ax.legend()
         return ax
 
-    def plot_roc(self, subject=None, palette=None, ax=None, **kwargs):
+    def plot_roc(self, sleep_id=None, palette=None, ax=None, **kwargs):
         """Plot ROC curves for each stage.
 
         Parameters
@@ -580,9 +586,10 @@ def plot_roc(self, subject=None, palette=None, ax=None, **kwargs):
         ax : :py:class:`matplotlib.axes.Axes`
             Matplotlib Axes
         """
-        assert subject is None or subject in self.subjects, "`subject` must be None or a valid subject ID"
-        # assert self.test_hyp.probas is not None
-        raise NotImplementedError("Requires probability/confidence values.")
+        assert sleep_id is None or sleep_id in self.sleep_ids, (
+            "`sleep_id` must be None or a valid sleep ID"
+        )
+        raise NotImplementedError("ROC plots will be implemented once YASA hypnograms have probas.")
 
 
 #############################################################################
@@ -592,22 +599,22 @@ def plot_roc(self, subject=None, palette=None, ax=None, **kwargs):
 
 class SleepStatsEvaluation:
     """
-    Evaluate agreement between two measurement systems (e.g., two different manual scorers or one
-    one manual scorer againt YASA's automatic staging) by comparing their summary sleep statistics
-    derived from multiple subjects or sessions.
+    Evaluate agreement between two scorers (e.g., two different manual scorers or one manual scorer
+    and YASA's automatic staging) by comparing their summary sleep statistics derived from multiple
+    subjects or sessions.
 
     Parameters
     ----------
     refr_data : :py:class:`pandas.DataFrame`
-        A :py:class:`pandas.DataFrame` with sleep statistics from the reference measurement system.
-        Rows are individual subjects and columns are individual sleep statistics.
+        A :py:class:`pandas.DataFrame` with sleep statistics from the reference scorer.
+        Rows are individual sleep sessions and columns are individual sleep statistics.
     test_data : :py:class:`pandas.DataFrame`
-        A :py:class:`pandas.DataFrame` with sleep statistics from the test measurement system.
+        A :py:class:`pandas.DataFrame` with sleep statistics from the test scorer.
         Shape, indices, and columns must be identical to ``refr_data``.
     refr_scorer : str
-        Name of the reference measurement scorer, used for labeling.
+        Name of the reference scorer, used for labeling.
     test_scorer : str
-        Name of the test measurement scorer, used for labeling.
+        Name of the test scorer, used for labeling.
     alpha : float
         Alpha cutoff used for all three tests.
     kwargs_normality : dict
@@ -704,59 +711,81 @@ def __init__(
     ):
         assert isinstance(refr_data, pd.DataFrame), "`refr_data` must be a pandas DataFrame"
         assert isinstance(test_data, pd.DataFrame), "`test_data` must be a pandas DataFrame"
-        assert np.array_equal(refr_data.index, test_data.index), "`refr_data` and `test_data` indices must be identical"
-        assert np.array_equal(refr_data.columns, test_data.columns), "`refr_data` and `test_data` columns must be identical"
-        assert refr_data.index.name == test_data.index.name, "`refr_data` and `test_data` index names must be identical"
-        assert isinstance(refr_scorer, str)
-        assert isinstance(test_scorer, str)
-        assert refr_scorer != test_scorer
-        assert isinstance(kwargs_normality, dict)
-        assert isinstance(kwargs_regression, dict)
-        assert isinstance(kwargs_homoscedasticity, dict)
-        assert "alpha" in kwargs_normality
-        assert "alpha" in kwargs_regression
-        assert "alpha" in kwargs_homoscedasticity
-
-        # Merge dataframes, get differences, and reshape wide-to-long format
-        subj_name = "subject" if refr_data.index.name is None else refr_data.index.name
-        refr_data.index.name = subj_name
-        test_data.index.name = subj_name
-        diff_data = pd.concat({"difference": test_data.sub(refr_data)}, names=["measurement"])
-        refr_data = pd.concat({refr_scorer: refr_data}, names=["measurement"])
-        test_data = pd.concat({test_scorer: test_data}, names=["measurement"])
-        data = (pd.concat([refr_data, test_data, diff_data])
+        assert np.array_equal(refr_data.index, test_data.index), (
+            "`refr_data` and `test_data` index values must be identical"
+        )
+        assert refr_data.index.name == test_data.index.name, (
+            "`refr_data` and `test_data` index names must be identical"
+        )
+        assert np.array_equal(refr_data.columns, test_data.columns), (
+            "`refr_data` and `test_data` column values must be identical"
+        )
+        assert isinstance(refr_scorer, str), "`refr_scorer` must be a string"
+        assert isinstance(test_scorer, str), "`test_scorer` must be a string"
+        assert refr_scorer != test_scorer, "`refr_scorer` and `test_scorer` must be unique"
+        assert isinstance(kwargs_normality, dict), "`kwargs_normality` must be a dictionary"
+        assert isinstance(kwargs_regression, dict), "`kwargs_regression` must be a dictionary"
+        assert isinstance(kwargs_homoscedasticity, dict), "`kwargs_homoscedasticity` must be a dict"
+        assert "alpha" in kwargs_normality, "`kwargs_normality` must include 'alpha'"
+        assert "alpha" in kwargs_regression, "`kwargs_regression` must include 'alpha'"
+        assert "alpha" in kwargs_homoscedasticity, "`kwargs_homoscedasticity` must include 'alpha'"
+
+        # If refr_data and test_data indices are unnamed, name them
+        sleep_id_str = "sleep_id" if refr_data.index.name is None else refr_data.index.name
+        refr_data.index.name = sleep_id_str
+        test_data.index.name = sleep_id_str
+
+        # Get scorer differences
+        diff_data = test_data.sub(refr_data)
+
+        # Convert to MultiIndex with new scorer level
+        diff_data = pd.concat({"difference": diff_data}, names=["scorer"])
+        refr_data = pd.concat({refr_scorer: refr_data}, names=["scorer"])
+        test_data = pd.concat({test_scorer: test_data}, names=["scorer"])
+
+        # Merge dataframes and reshape to long format
+        data = pd.concat([refr_data, test_data, diff_data])
+        data = (data
             .melt(var_name="sstat", ignore_index=False).reset_index()
-            .pivot(columns="measurement", index=[subj_name, "sstat"], values="value")
+            .pivot(columns="scorer", index=[sleep_id_str, "sstat"], values="value")
             .reset_index().rename_axis(columns=None)
         )
 
-        # Remove sleep statistics that have no differences between measurement systems
-        ## TODO: restructure?
+        # Remove sleep statistics that have no differences between scorers
         stats_nodiff = data.groupby("sstat")["difference"].any().loc[lambda x: ~x].index.tolist()
         data = data.query(f"~sstat.isin({stats_nodiff})")
         for s in stats_nodiff:
             logger.warning(f"All {s} differences are zero, removing from evaluation.")
-            ## Q: Should this be logged as just info?
 
-        ## NORMALITY ## Test reference data for normality at each sleep statistic
-        normality = data.groupby("sstat")[refr_scorer].apply(pg.normality, **kwargs_normality).droplevel(-1)
+        ## NORMALITY ##
+        # Test reference data for normality at each sleep statistic
+        normality = (data
+            .groupby("sstat")[refr_scorer]
+            .apply(pg.normality, **kwargs_normality)
+            .droplevel(-1)
+        )
 
-        ## PROPORTIONAL BIAS ## Test each sleep statistic for proportional bias
-        # Subject-level residuals for each statistic are added to data.
+        ## PROPORTIONAL BIAS ##
+        # Test each sleep statistic for proportional bias
         prop_bias_results = []
         residuals_results = []
-        # proportional bias and residuals that will be used for the later  tests.
         for ss_name, ss_df in data.groupby("sstat"):
             # Regress the difference scores on the reference scores
-            model = pg.linear_regression(ss_df[refr_scorer], ss_df["difference"], **kwargs_regression)
+            model = pg.linear_regression(
+                ss_df[refr_scorer], ss_df["difference"], **kwargs_regression
+            )
             model.insert(0, "sstat", ss_name)
-            # Extract subject-level residuals for later homoscedasticity tests
-            resid_dict = {subj_name: ss_df[subj_name], "sstat": ss_name, "pbias_residual": model.residuals_}
+            # Extract sleep-level residuals for later homoscedasticity tests
+            resid_dict = {
+                sleep_id_str: ss_df[sleep_id_str],
+                "sstat": ss_name,
+                "pbias_residual": model.residuals_,
+            }
             resid = pd.DataFrame(resid_dict)
             prop_bias_results.append(model)
             residuals_results.append(resid)
         # Add residuals to raw dataframe, used later when testing homoscedasticity
-        data = data.merge(pd.concat(residuals_results), on=[subj_name, "sstat"])
+        data = data.merge(pd.concat(residuals_results), on=[sleep_id_str, "sstat"])
         # Handle proportional bias results
         prop_bias = pd.concat(prop_bias_results)
         # Save all the proportional bias models before removing intercept, for optional user access
@@ -768,98 +797,93 @@ def __init__(
 
         ## Test each statistic for homoscedasticity ##
         columns = [refr_scorer, "difference", "pbias_residual"]
-        homoscedasticity_func = lambda df: pg.homoscedasticity(df[columns], **kwargs_homoscedasticity)
-        homoscedasticity = data.groupby("sstat").apply(homoscedasticity_func).droplevel(-1)
+        homoscedasticity_f = lambda df: pg.homoscedasticity(df[columns], **kwargs_homoscedasticity)
+        homoscedasticity = data.groupby("sstat").apply(homoscedasticity_f).droplevel(-1)
 
         # Set attributes
         self._data = data
         self._normality = normality
         self._proportional_bias = prop_bias
-        self._proportional_bias_full = prop_bias_full  # Q: Is this worth saving??
+        self._proportional_bias_full = prop_bias_full  ## Q: Is this worth saving??
         self._homoscedasticity = homoscedasticity
-        # These will not be set as properties, as they are only needed internally
         self._refr_scorer = refr_scorer
         self._test_scorer = test_scorer
-        self._subj_name = subj_name
-        self._n_subjects = data[subj_name].nunique()
-        # Pivot new to not include removed sstats
-        self._diff_data = data.pivot(index=self.subj_name, columns="sstat", values="difference")
+        self._sleep_id_str = sleep_id_str
+        self._n_sleeps = data[sleep_id_str].nunique()
+        self._diff_data = diff_data.drop(columns=stats_nodiff)
+        # self._diff_data = data.pivot(index=sleep_id_str, columns="sstat", values="difference")
 
     @property
     def data(self):
-        """
-        ``refr_data`` and ``test_data`` combined in a long-format :py:class:`pandas.DataFrame`.
-        Also includes difference scores (``test_data`` minus ``refr_data``).
+        """A :py:class:`pandas.DataFrame` containing all sleep statistics from ``refr_data`` and
+        ``test_data`` as well as their difference scores (``test_data`` minus ``refr_data``).
         """
         return self._data
 
     @property
     def diff_data(self):
         """A :py:class:`pandas.DataFrame` of ``test_data`` minus ``refr_data``."""
-        # # Pivot for subject-rows and statistic-columns
+        # # Pivot for session-rows and statistic-columns
         return self._diff_data
 
     @property
     def refr_scorer(self):
-        """The name of the reference measurement scorer."""
+        """The name of the reference scorer."""
         return self._refr_scorer
 
     @property
     def test_scorer(self):
-        """The name of the test measurement scorer."""
+        """The name of the test scorer."""
         return self._test_scorer
 
     @property
-    def subj_name(self):
-        """The name of the subject identifier."""
-        return self._subj_name
+    def sleep_id_str(self):
+        """The name of the unique sleep session identifier."""
+        return self._sleep_id_str
 
     @property
-    def n_subjects(self):
-        """The number of subjects."""
-        return self._n_subjects
+    def n_sleeps(self):
+        """The number of sleep sessions."""
+        return self._n_sleeps
 
     @property
     def normality(self):
-        """A :py:class:`pandas.DataFrame` of normality test results for all sleep statistics."""
+        """A :py:class:`pandas.DataFrame` of normality results for all sleep statistics."""
         return self._normality
 
     @property
     def homoscedasticity(self):
-        """A :py:class:`pandas.DataFrame` of homoscedasticity test results for all sleep statistics."""
+        """A :py:class:`pandas.DataFrame` of homoscedasticity results for all sleep statistics."""
         return self._homoscedasticity
 
     @property
     def proportional_bias(self):
-        """A :py:class:`pandas.DataFrame` of proportional bias test results for all sleep statistics."""
+        """
+        A :py:class:`pandas.DataFrame` of proportional bias results for all sleep statistics, with
+        intercept terms removed.
+        """
         return self._proportional_bias
 
     @property
     def proportional_bias_full(self):
-        """A :py:class:`pandas.DataFrame` of proportional bias test results for all sleep statistics."""
+        """A :py:class:`pandas.DataFrame` of proportional bias results for all sleep statistics."""
         return self._proportional_bias_full
 
     def __repr__(self):
         # TODO v0.8: Keep only the text between < and >
         return (
-            f"<SleepStatsEvaluation | Test measurement '{self.test_scorer}' evaluated against "
-            f"reference measurement '{self.refr_scorer}', {self.n_subjects} subjects>\n"
+            f"<SleepStatsEvaluation | Test scorer {self.test_scorer} evaluated against reference "
+            f"scorer {self.refr_scorer}, {self.n_sleeps} sleep sessions>\n"
             " - Use `.summary()` to get pass/fail values from various checks\n"
             " - Use `.plot_blandaltman()` to get a Bland-Altman-plot grid for sleep statistics\n"
             "See the online documentation for more details."
         )
 
     def __str__(self):
-        return (
-            f"<SleepStatsEvaluation | Test measurement '{self.test_scorer}' evaluated against "
-            f"reference measurement '{self.refr_scorer}', {self.n_subjects} subjects>\n"
-            " - Use `.summary()` to get pass/fail values from various checks\n"
-            " - Use `.plot_blandaltman()` to get a Bland-Altman-plot grid for sleep statistics\n"
-            "See the online documentation for more details."
-        )
+        return self.__repr__()
 
     def summary(self, descriptives=True):
-        """Return a summary dataframe highlighting what statistics pass checks.
+        """Return a summary dataframe highlighting whether tests passed for each sleep statistic.
 
         Parameters
         ----------
@@ -867,7 +891,7 @@ def summary(self, descriptives=True):
             A :py:class:`SleepStatsEvaluation` instance.
         descriptives : bool or dict
             If True (default) or a dictionary, also include descriptive statistics for reference and
-            test measurements. If a dictionary, all key/value pairs are passed as keyword arguments
+            test scorers. If a dictionary, all key/value pairs are passed as keyword arguments
             to the :py:meth:`pandas.DataFrame.agg` call.
 
         Returns
@@ -887,13 +911,13 @@ def summary(self, descriptives=True):
             agg_kwargs = {"func": ["mean", "std"]}
             if isinstance(descriptives, dict):
                 agg_kwargs.update(descriptives)
-            desc = self.data.drop(columns=self.subj_name).groupby("sstat").agg(**agg_kwargs)
+            desc = self.data.drop(columns=self.sleep_id_str).groupby("sstat").agg(**agg_kwargs)
             desc.columns = desc.columns.map("_".join)
             summary = summary.join(desc)
         return summary
 
     def plot_discrepancies_heatmap(self, sleep_stats=None, **kwargs):
-        """Visualize subject-level discrepancies, generally for outlier inspection.
+        """Visualize session-level discrepancies, generally for outlier inspection.
 
         Parameters
         ----------
@@ -924,7 +948,7 @@ def plot_discrepancies_heatmap(self, sleep_stats=None, **kwargs):
         return sns.heatmap(table_norm, **heatmap_kwargs)
 
     def plot_discrepancies_dotplot(self, kwargs_pairgrid={"palette": "winter"}, **kwargs):
-        """Visualize subject-level discrepancies, generally for outlier inspection.
+        """Visualize session-level discrepancies, generally for outlier inspection.
 
         Parameters
         ----------
@@ -952,9 +976,11 @@ def plot_discrepancies_dotplot(self, kwargs_pairgrid={"palette": "winter"}, **kw
         # Initialize the PairGrid
         height = 0.3 * len(self.diff_data)
         aspect = 0.6
-        pairgrid_kwargs = dict(hue=self.subj_name, height=height, aspect=aspect)
+        pairgrid_kwargs = dict(hue=self.sleep_id_str, height=height, aspect=aspect)
         pairgrid_kwargs.update(kwargs_pairgrid)
-        g = sns.PairGrid(self.diff_data.reset_index(), y_vars=[self.subj_name], **pairgrid_kwargs)
+        g = sns.PairGrid(
+            self.diff_data.reset_index(), y_vars=[self.sleep_id_str], **pairgrid_kwargs
+        )
         # Draw the dots
         g.map(sns.stripplot, orient="h", jitter=False, **stripplot_kwargs)
         # Adjust aesthetics
diff --git a/yasa/hypno.py b/yasa/hypno.py
index 908aa9b..b6c12ef 100644
--- a/yasa/hypno.py
+++ b/yasa/hypno.py
@@ -539,22 +539,19 @@ def copy(self):
             scorer=self.scorer,
         )
 
-    def evaluate(self, hypno_test):
+    def evaluate(self, test_hyp):
         """Evaluate agreement between two hypnograms.
 
         Typically the reference hypnogram (i.e., ``self``) is a manually-scored hypnogram and the
-        test hypnogram (i.e., ``hypno_test``) is a hypnogram from an actigraphy/wearable device or
+        test hypnogram (i.e., ``test_hyp``) is a hypnogram from an actigraphy/wearable device or
         automated scorer (e.g., :py:meth:`yasa.SleepStaging.predict`).
 
-        Comparing more than two hypnograms is not currently supported.
-
         Parameters
         ----------
         self : :py:class:`yasa.Hypnogram`
             Reference or ground-truth hypnogram.
-        hypno_test : :py:class:`yasa.Hypnogram`
+        test_hyp : :py:class:`yasa.Hypnogram`
             The test or to-be-evaluated hypnogram.
-            Must have the same ``n_stages`` as the reference hypnogram.
 
         Returns
         -------
@@ -573,11 +570,11 @@ def evaluate(self, hypno_test):
             >>> hypno_test = yasa.Hypnogram(hypno_test, scorer="Rater2")
             >>> ebe = hypno_ref.evaluate(hypno_test)
             >>> conf = ebe.get_confusion_matrix()
-            >>> perf = ebe.get_agreement()
+            >>> perf = ebe.summary()
             >>> # Plot the overlapping hypnograms
             >>> ebe.plot_hypnograms()
         """
-        return EpochByEpochEvaluation(self, hypno_test)
+        return EpochByEpochEvaluation([self], [test_hyp])
 
     def find_periods(self, threshold="5min", equal_length=False):
         """Find sequences of consecutive values exceeding a certain duration in hypnogram.

From 120b097a692a40a731e276dd2e0747f63b9bcbc5 Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Wed, 4 Jan 2023 00:21:49 -0600
Subject: [PATCH 16/43] alternate ovr agreement implementation

---
 yasa/evaluation.py | 105 +++++++++++++++------------------------------
 1 file changed, 35 insertions(+), 70 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index 35273a4..5dd2279 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -190,42 +190,34 @@ def __init__(self, refr_hyps, test_hyps):
         ########################################################################
 
         # Get individual-level averaged/weighted agreement scores
-        indiv_agree_avg = data.groupby(level=0).apply(self.multi_scorer_avg).apply(pd.Series)
+        indiv_agree_avg = data.groupby(level=0).apply(self.multi_scorer).apply(pd.Series)
         ## Q: Check speed against pd.DataFrame({s: multscore(hyps[s], hyps[s]) for s in subjects})
 
         # Get individual-level one-vs-rest/un-weighted agreement scores
-        # Only include stages that appear in the data
-        # labels = data[refr_scorer].cat.remove_unused_categories().cat.categories
+        # Labels ensures the order of returned scores is known
+        # It also can be used to remove unused labels, but that will be taken care of later anyways
+        # # labels = data[refr_scorer].cat.remove_unused_categories().cat.categories
         labels = [l for l in refr_hyps[sleep_ids[0]].hypno.cat.categories if l in data.values]
-        ############ OPTION 1 (uses staticmethod, slower by 500ms)
+        prfs_wrapper = lambda df: skm.precision_recall_fscore_support(
+            *df.values.T, labels=labels, average=None, zero_division=0
+        )
         indiv_agree_ovr = (data
-            # Get multiple metrics for each individual sleep
-            .groupby(level=0).apply(self.multi_scorer_ovr, labels=labels)
-            # Unpack metrics results and reshape
-            .apply(pd.Series).stack().apply(pd.Series)
-            # Convert stages to string labels
+            # Get precision, recall, f1, and support for each individual sleep session
+            .groupby(level=0).apply(prfs_wrapper)
+            # Unpack arrays
+            .explode().apply(pd.Series)
+            # Add metric labels and prepend to index, creating MultiIndex
+            .assign(metric=["precision", "recall", "f1", "support"] * len(refr_hyps))
+            .set_index("metric", append=True)
+            # Convert stage column names to string labels
             .rename_axis(columns="stage").rename(columns={i: l for i, l in enumerate(labels)})
+            # Remove all-zero rows (i.e., stages that were not present in the hypnogram)
+            .pipe(lambda df: df.loc[:, df.any()])
             # Reshape so metrics are columns
-            .stack().unstack(level=1)
+            .stack().unstack("metric").rename_axis(columns=None)
             # Swap MultiIndex levels and sort so stages drive the view
             .swaplevel().sort_index(level="stage", key=lambda x: x.map(lambda y: labels.index(y)))
         )
-        # ############ OPTION 2 (does NOT use staticmethod, faster by 500ms)
-        # prfs_func = lambda df: skm.precision_recall_fscore_support(
-        #     *df.values.T, labels=labels, average=None, zero_division=0
-        # )
-        # indiv_agree_ovr = (data
-        #     .groupby(level=0).apply(prfs_func)
-        #     .explode().apply(pd.Series)
-        #     .assign(metric=["precision", "recall", "f1", "support"] * len(refr_hyps))
-        #     .set_index("metric", append=True)
-        #     .rename_axis(columns="stage").rename(columns={i: l for i, l in enumerate(labels)})
-        #     .stack().unstack("metric").rename_axis(columns=None)
-        # )
-        ## Q: Currently both options will leave some all-zero rows, for when a stage is present
-        ##    in some subjects but not others. Prefer to remove?
-        # agr = agr.loc[agr.any(axis=1)]  # or .pipe
-        # And then could drop the label restriction, just passing all labels to preserve order
 
         # Set attributes
         self._data = data
@@ -316,7 +308,7 @@ def indiv_agree_ovr(self):
         return self._indiv_agree_ovr
 
     @staticmethod
-    def multi_scorer_avg(df):
+    def multi_scorer(df):
         """Compute multiple agreement scores from a 2-column dataframe.
 
         This function offers convenience when calculating multiple agreement scores using
@@ -335,7 +327,7 @@ def multi_scorer_avg(df):
         scores : dict
             A dictionary with scorer names (``str``) as keys and scores (``float``) as values.
         """
-        true, pred = zip(*df.values)  # Same as (df["col1"], df["col2"]) but teensy bit faster
+        t, p = zip(*df.values)  # Same as (df["col1"], df["col2"]) but teensy bit faster
         ## Q: The dictionary below be compiled more concisely if we were comfortable accessing
         ##    "private" attributes. I understand that's a no-no but I'm not exactly sure why.
         ##     For example:
@@ -345,47 +337,20 @@ def multi_scorer_avg(df):
         ##     Keywords could be applied as needed by checking f.__kwdefaults__
         ##     This would offer an easy way for users to add their own scorers with an arg as well.
         return {
-            "accuracy": skm.accuracy_score(true, pred),
-            "kappa": skm.cohen_kappa_score(true, pred),
-            "jaccard_micro": skm.jaccard_score(true, pred, average="micro"),
-            "jaccard_macro": skm.jaccard_score(true, pred, average="macro"),
-            "jaccard_weighted": skm.jaccard_score(true, pred, average="weighted"),
-            "precision_micro": skm.precision_score(true, pred, average="micro", zero_division=0),
-            "precision_macro": skm.precision_score(true, pred, average="macro", zero_division=0),
-            "precision_weighted": skm.precision_score(
-                true, pred, average="weighted", zero_division=0
-            ),
-            "recall_micro": skm.recall_score(true, pred, average="micro", zero_division=0),
-            "recall_macro": skm.recall_score(true, pred, average="macro", zero_division=0),
-            "recall_weighted": skm.recall_score(true, pred, average="weighted", zero_division=0),
-            "f1_micro": skm.f1_score(true, pred, average="micro", zero_division=0),
-            "f1_macro": skm.f1_score(true, pred, average="macro", zero_division=0),
-            "f1_weighted": skm.f1_score(true, pred, average="weighted", zero_division=0),
-        }
-
-    @staticmethod
-    def multi_scorer_ovr(df, labels):
-        """Compute multiple one-vs-rest agreement scores from a 2-column dataframe.
-
-        Parameters
-        ----------
-        df : :py:class:`pandas.DataFrame`
-            A :py:class:`pandas.DataFrame` with exactly 2 columns and length of *n_samples*.
-            The first column contains true values and second column contains predicted values.
-        labels : array-like
-            The labels to include in scoring and control the order of returned scores.
-
-        Returns
-        -------
-        scores : dict
-            A dictionary with scorer names (``str``) as keys and scores (``np.ndarray``) as values.
-        """
-        true, pred = zip(*df.values)
-        return {
-            "precision": skm.precision_score(true, pred, labels=labels, average=None, zero_division=0),
-            "recall": skm.recall_score(true, pred, labels=labels, average=None, zero_division=0),
-            "f1": skm.f1_score(true, pred, labels=labels, average=None, zero_division=0),
-            "support": pd.Series(true).value_counts().reindex(labels, fill_value=0).to_numpy(),
+            "accuracy": skm.accuracy_score(t, p),
+            "kappa": skm.cohen_kappa_score(t, p),
+            "jaccard_micro": skm.jaccard_score(t, p, average="micro"),
+            "jaccard_macro": skm.jaccard_score(t, p, average="macro"),
+            "jaccard_weighted": skm.jaccard_score(t, p, average="weighted"),
+            "precision_micro": skm.precision_score(t, p, average="micro", zero_division=0),
+            "precision_macro": skm.precision_score(t, p, average="macro", zero_division=0),
+            "precision_weighted": skm.precision_score(t, p, average="weighted", zero_division=0),
+            "recall_micro": skm.recall_score(t, p, average="micro", zero_division=0),
+            "recall_macro": skm.recall_score(t, p, average="macro", zero_division=0),
+            "recall_weighted": skm.recall_score(t, p, average="weighted", zero_division=0),
+            "f1_micro": skm.f1_score(t, p, average="micro", zero_division=0),
+            "f1_macro": skm.f1_score(t, p, average="macro", zero_division=0),
+            "f1_weighted": skm.f1_score(t, p, average="weighted", zero_division=0),
         }
 
     def summary(self, by_stage=False, **kwargs):
@@ -429,7 +394,7 @@ def summary(self, by_stage=False, **kwargs):
             ## Q: Should we include a column that calculates agreement treating all hypnograms as
             ##    coming from one individual? Others sometimes report it, though I find it mostly
             ##    meaningless because of possible n_epochs imbalances between subjects. I vote no.
-            # summary.insert(0, "all", self.multi_scorer_avg(self.data))
+            # summary.insert(0, "all", self.multi_scorer(self.data))
         ## Q: Alternatively, we could remove the `by_stage` parameter and stack these into
         ##    one merged DataFrame where the results that are *not* by-stage are included
         ##    with an "all" stage label:

From 1fcd1846ef7a88004191ac5b3ef0166aaea18158 Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Wed, 4 Jan 2023 01:06:12 -0600
Subject: [PATCH 17/43] use hyp integers instead of strings for big skm
 agreement speed improvements (~2s)

---
 yasa/evaluation.py | 41 ++++++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 19 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index 5dd2279..9e44bce 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -181,8 +181,8 @@ def __init__(self, refr_hyps, test_hyps):
         test_hyps = { s: h for s, h in zip(sleep_ids, test_hyps) }
 
         # Merge all hypnograms into a single multiindexed dataframe
-        refr = pd.concat(pd.concat({s: h.hypno}, names=["sleep_id"]) for s, h in refr_hyps.items())
-        test = pd.concat(pd.concat({s: h.hypno}, names=["sleep_id"]) for s, h in test_hyps.items())
+        refr = pd.concat(pd.concat({s: h.as_int()}, names=["sleep_id"]) for s, h in refr_hyps.items())
+        test = pd.concat(pd.concat({s: h.as_int()}, names=["sleep_id"]) for s, h in test_hyps.items())
         data = pd.concat([refr, test], axis=1)
 
         ########################################################################
@@ -196,10 +196,16 @@ def __init__(self, refr_hyps, test_hyps):
         # Get individual-level one-vs-rest/un-weighted agreement scores
         # Labels ensures the order of returned scores is known
         # It also can be used to remove unused labels, but that will be taken care of later anyways
-        # # labels = data[refr_scorer].cat.remove_unused_categories().cat.categories
-        labels = [l for l in refr_hyps[sleep_ids[0]].hypno.cat.categories if l in data.values]
+        # skm_labels = [l for l in refr_hyps[sleep_ids[0]].hypno.cat.categories if l in data.values]
+        # skm will return an array of results, so mapping must be linear without skips
+        ## Q: Another option is to get Series.cat.codes for ints and use cat.categories for mapping
+        skm_labels = np.unique(data).tolist()
+        skm_mapping = {i: l for i, l in enumerate(skm_labels)}  # skm integers to YASA integers
+        mapping_int = refr_hyps[sleep_ids[0]].mapping_int.copy()  # YASA integers to YASA strings
+        # labels = refr_hyps[sleep_ids[0]].labels.copy()  # To preserve YASA ordering
+        # labels = [v for k, v in mapping_int.items() if k in skm_labels]  # To preserve YASA ordering
         prfs_wrapper = lambda df: skm.precision_recall_fscore_support(
-            *df.values.T, labels=labels, average=None, zero_division=0
+            *df.values.T, labels=skm_labels, average=None, zero_division=0
         )
         indiv_agree_ovr = (data
             # Get precision, recall, f1, and support for each individual sleep session
@@ -210,13 +216,15 @@ def __init__(self, refr_hyps, test_hyps):
             .assign(metric=["precision", "recall", "f1", "support"] * len(refr_hyps))
             .set_index("metric", append=True)
             # Convert stage column names to string labels
-            .rename_axis(columns="stage").rename(columns={i: l for i, l in enumerate(labels)})
+            .rename_axis(columns="stage").rename(columns=skm_mapping).rename(columns=mapping_int)
             # Remove all-zero rows (i.e., stages that were not present in the hypnogram)
             .pipe(lambda df: df.loc[:, df.any()])
             # Reshape so metrics are columns
             .stack().unstack("metric").rename_axis(columns=None)
-            # Swap MultiIndex levels and sort so stages drive the view
-            .swaplevel().sort_index(level="stage", key=lambda x: x.map(lambda y: labels.index(y)))
+            # Swap MultiIndex levels and sort so stages in standard YASA order
+            .swaplevel().sort_index(
+                level="stage", key=lambda x: x.map(lambda y: list(mapping_int.values()).index(y))
+            )
         )
 
         # Set attributes
@@ -227,7 +235,7 @@ def __init__(self, refr_hyps, test_hyps):
         self._test_hyps = test_hyps
         self._refr_scorer = refr_hyps[sleep_ids[0]].scorer
         self._test_scorer = test_hyps[sleep_ids[0]].scorer
-        self._labels = refr_hyps[sleep_ids[0]].labels
+        self._mapping_int = mapping_int
         self._indiv_agree_avg = indiv_agree_avg
         self._indiv_agree_ovr = indiv_agree_ovr
         ## Q: Merge these to one individual agreement dataframe?
@@ -282,11 +290,6 @@ def test_scorer(self):
         """The name of the test scorer."""
         return self._test_scorer
 
-    @property
-    def labels(self):
-        """All available sleep stage labels."""
-        return self._labels
-
     @property
     def indiv_agree_avg(self):
         """
@@ -461,11 +464,11 @@ def get_confusion_matrix(self, sleep_id=None):
         if sleep_id is not None:
             true = true.loc[sleep_id]
             pred = pred.loc[sleep_id]
-        matrix = pd.crosstab(true, pred, margins=True, margins_name="Total")
-        # Reorder indices in sensible order and to include all stages
-        index_col_labels = self.labels + ["Total"]
-        matrix = matrix.reindex(index=index_col_labels, columns=index_col_labels, fill_value=0)
-        return matrix.astype(int)
+        matrix = (pd.crosstab(true, pred, margins=True, margins_name="Total")
+            .rename(index=self._mapping_int, columns=self._mapping_int)
+            .astype(int)
+        )
+        return matrix
 
     def plot_hypnograms(self, sleep_id=None, legend=True, ax=None, refr_kwargs={}, test_kwargs={}):
         """Plot the two hypnograms, where the reference hypnogram is overlaid on the test hypnogram.

From b3a64243c7b6ed4cd1cd74903fdbad7ecb00778e Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Wed, 4 Jan 2023 01:11:33 -0600
Subject: [PATCH 18/43] fmt

---
 yasa/evaluation.py | 151 +++++++++++++++++++++++++--------------------
 1 file changed, 84 insertions(+), 67 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index 9e44bce..67d3826 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -130,59 +130,64 @@ class EpochByEpochEvaluation:
     UNS        0    0    0   0    0    0    0      0
     Total    277  128  333  98  124    0    0    960
     """
+
     def __init__(self, refr_hyps, test_hyps):
         from yasa.hypno import Hypnogram  # Avoiding circular import
 
         assert hasattr(refr_hyps, "__iter__"), "`refr_hyps` must be a an iterable"
         assert hasattr(test_hyps, "__iter__"), "`test_hyps` must be a an iterable"
         assert type(refr_hyps) == type(test_hyps), "`refr_hyps` and `test_hyps` must be same type"
-        assert len(refr_hyps) == len(test_hyps), (
-            "`refr_hyps` and `test_hyps` must have the same number of hypnograms"
-        )
+        assert len(refr_hyps) == len(
+            test_hyps
+        ), "`refr_hyps` and `test_hyps` must have the same number of hypnograms"
 
         if isinstance(refr_hyps, dict):
             # If user provides dictionaries, split into sleep IDs and hypnograms
-            assert refr_hyps.keys() == test_hyps.keys(), (
-                "hypnograms in `refr_hyps` and `test_hyps` must have identical sleep IDs"
-            )
+            assert (
+                refr_hyps.keys() == test_hyps.keys()
+            ), "hypnograms in `refr_hyps` and `test_hyps` must have identical sleep IDs"
             sleep_ids, refr_hyps = zip(*refr_hyps.items())
             test_hyps = tuple(test_hyps.values())
         else:
             # Create hypnogram_ids
             sleep_ids = tuple(range(1, 1 + len(refr_hyps)))
 
-        assert all(isinstance(hyp, Hypnogram) for hyp in refr_hyps + test_hyps), (
-            "`refr_hyps` and `test_hyps` must only include YASA hypnograms"
-        )
-        assert all(h.scorer is not None for h in refr_hyps + test_hyps), (
-            "all hypnograms must have a scorer name"
-        )
+        assert all(
+            isinstance(hyp, Hypnogram) for hyp in refr_hyps + test_hyps
+        ), "`refr_hyps` and `test_hyps` must only include YASA hypnograms"
+        assert all(
+            h.scorer is not None for h in refr_hyps + test_hyps
+        ), "all hypnograms must have a scorer name"
         for h1, h2 in zip((refr_hyps + test_hyps)[:-1], (refr_hyps + test_hyps)[1:]):
             assert h1.labels == h2.labels, "all hypnograms must have the same labels"
             assert h1.mapping == h2.mapping, "all hypnograms must have the same mapping"
             assert h1.n_stages == h2.n_stages, "all hypnograms must have the same n_stages"
-        assert all(h1.scorer == h2.scorer for h1, h2 in zip(refr_hyps[:-1], refr_hyps[1:])), (
-            "all `refr_hyps` must have the same scorer"
-        )
-        assert all(h1.scorer == h2.scorer for h1, h2 in zip(test_hyps[:-1], test_hyps[1:])), (
-            "all `test_hyps` must have the same scorer"
-        )
-        assert all(h1.scorer != h2.scorer for h1, h2 in zip(refr_hyps, test_hyps)), (
-            "each `refr_hyps` and `test_hyps` pair must have unique scorers"
-        )
-        assert all(h1.n_epochs == h2.n_epochs for h1, h2 in zip(refr_hyps, test_hyps)), (
-            "each `refr_hyps` and `test_hyps` pair must have the same n_epochs"
-        )
+        assert all(
+            h1.scorer == h2.scorer for h1, h2 in zip(refr_hyps[:-1], refr_hyps[1:])
+        ), "all `refr_hyps` must have the same scorer"
+        assert all(
+            h1.scorer == h2.scorer for h1, h2 in zip(test_hyps[:-1], test_hyps[1:])
+        ), "all `test_hyps` must have the same scorer"
+        assert all(
+            h1.scorer != h2.scorer for h1, h2 in zip(refr_hyps, test_hyps)
+        ), "each `refr_hyps` and `test_hyps` pair must have unique scorers"
+        assert all(
+            h1.n_epochs == h2.n_epochs for h1, h2 in zip(refr_hyps, test_hyps)
+        ), "each `refr_hyps` and `test_hyps` pair must have the same n_epochs"
         ## Q: Could use set() for those above.
         ##    Or set scorer as the first available and check all equal.
 
         # Convert to dictionaries with sleep_ids and hypnograms
-        refr_hyps = { s: h for s, h in zip(sleep_ids, refr_hyps) }
-        test_hyps = { s: h for s, h in zip(sleep_ids, test_hyps) }
+        refr_hyps = {s: h for s, h in zip(sleep_ids, refr_hyps)}
+        test_hyps = {s: h for s, h in zip(sleep_ids, test_hyps)}
 
-        # Merge all hypnograms into a single multiindexed dataframe
-        refr = pd.concat(pd.concat({s: h.as_int()}, names=["sleep_id"]) for s, h in refr_hyps.items())
-        test = pd.concat(pd.concat({s: h.as_int()}, names=["sleep_id"]) for s, h in test_hyps.items())
+        # Merge all hypnograms into a single MultiIndexed dataframe
+        refr = pd.concat(
+            pd.concat({s: h.as_int()}, names=["sleep_id"]) for s, h in refr_hyps.items()
+        )
+        test = pd.concat(
+            pd.concat({s: h.as_int()}, names=["sleep_id"]) for s, h in test_hyps.items()
+        )
         data = pd.concat([refr, test], axis=1)
 
         ########################################################################
@@ -207,22 +212,30 @@ def __init__(self, refr_hyps, test_hyps):
         prfs_wrapper = lambda df: skm.precision_recall_fscore_support(
             *df.values.T, labels=skm_labels, average=None, zero_division=0
         )
-        indiv_agree_ovr = (data
+        indiv_agree_ovr = (
+            data
             # Get precision, recall, f1, and support for each individual sleep session
-            .groupby(level=0).apply(prfs_wrapper)
+            .groupby(level=0)
+            .apply(prfs_wrapper)
             # Unpack arrays
-            .explode().apply(pd.Series)
+            .explode()
+            .apply(pd.Series)
             # Add metric labels and prepend to index, creating MultiIndex
             .assign(metric=["precision", "recall", "f1", "support"] * len(refr_hyps))
             .set_index("metric", append=True)
             # Convert stage column names to string labels
-            .rename_axis(columns="stage").rename(columns=skm_mapping).rename(columns=mapping_int)
+            .rename_axis(columns="stage")
+            .rename(columns=skm_mapping)
+            .rename(columns=mapping_int)
             # Remove all-zero rows (i.e., stages that were not present in the hypnogram)
             .pipe(lambda df: df.loc[:, df.any()])
             # Reshape so metrics are columns
-            .stack().unstack("metric").rename_axis(columns=None)
+            .stack()
+            .unstack("metric")
+            .rename_axis(columns=None)
             # Swap MultiIndex levels and sort so stages in standard YASA order
-            .swaplevel().sort_index(
+            .swaplevel()
+            .sort_index(
                 level="stage", key=lambda x: x.map(lambda y: list(mapping_int.values()).index(y))
             )
         )
@@ -388,9 +401,11 @@ def summary(self, by_stage=False, **kwargs):
         assert isinstance(by_stage, bool), "`by_stage` must be True or False"
         agg_kwargs = {"func": ["mean", "std", "min", "median", "max"]} | kwargs
         if by_stage:
-            summary = (self.indiv_agree_ovr
-                .groupby("stage").agg(**agg_kwargs)
-                .stack(0).rename_axis(["stage", "metric"])
+            summary = (
+                self.indiv_agree_ovr.groupby("stage")
+                .agg(**agg_kwargs)
+                .stack(0)
+                .rename_axis(["stage", "metric"])
             )
         else:
             summary = self.indiv_agree_avg.agg(**agg_kwargs).T.rename_axis("metric")
@@ -456,15 +471,16 @@ def get_confusion_matrix(self, sleep_id=None):
             A confusion matrix with ``refr_hyp`` stages as indices and ``test_hyp`` stages as
             columns.
         """
-        assert sleep_id is None or sleep_id in self.sleep_ids, (
-            "`sleep_id` must be None or a valid sleep ID"
-        )
+        assert (
+            sleep_id is None or sleep_id in self.sleep_ids
+        ), "`sleep_id` must be None or a valid sleep ID"
         true = self.data[self.refr_scorer]
         pred = self.data[self.test_scorer]
         if sleep_id is not None:
             true = true.loc[sleep_id]
             pred = pred.loc[sleep_id]
-        matrix = (pd.crosstab(true, pred, margins=True, margins_name="Total")
+        matrix = (
+            pd.crosstab(true, pred, margins=True, margins_name="Total")
             .rename(index=self._mapping_int, columns=self._mapping_int)
             .astype(int)
         )
@@ -504,15 +520,15 @@ def plot_hypnograms(self, sleep_id=None, legend=True, ax=None, refr_kwargs={}, t
             >>> hyp = simulate_hypnogram(seed=7)
             >>> ax = hyp.evaluate(hyp.simulate_similar()).plot_hypnograms()
         """
-        assert sleep_id is None or sleep_id in self.sleep_ids, (
-            "`sleep_id` must be None or a valid sleep ID"
-        )
+        assert (
+            sleep_id is None or sleep_id in self.sleep_ids
+        ), "`sleep_id` must be None or a valid sleep ID"
         assert isinstance(legend, (bool, dict)), "`legend` must be True, False, or a dictionary"
         assert isinstance(refr_kwargs, dict), "`refr_kwargs` must be a dictionary"
         assert isinstance(test_kwargs, dict), "`test_kwargs` must be a dictionary"
-        assert not "ax" in refr_kwargs | test_kwargs, (
-            "ax can't be supplied to `kwargs_ref` or `test_kwargs`, use the `ax` keyword instead"
-        )
+        assert (
+            not "ax" in refr_kwargs | test_kwargs
+        ), "ax can't be supplied to `kwargs_ref` or `test_kwargs`, use the `ax` keyword instead"
         if sleep_id is None:
             if self.n_sleeps == 1:
                 refr_hyp = self.refr_hyps[self.sleep_ids[0]]
@@ -554,9 +570,9 @@ def plot_roc(self, sleep_id=None, palette=None, ax=None, **kwargs):
         ax : :py:class:`matplotlib.axes.Axes`
             Matplotlib Axes
         """
-        assert sleep_id is None or sleep_id in self.sleep_ids, (
-            "`sleep_id` must be None or a valid sleep ID"
-        )
+        assert (
+            sleep_id is None or sleep_id in self.sleep_ids
+        ), "`sleep_id` must be None or a valid sleep ID"
         raise NotImplementedError("ROC plots will be implemented once YASA hypnograms have probas.")
 
 
@@ -666,6 +682,7 @@ class SleepStatsEvaluation:
 
         >>> sse.plot_blandaltman()
     """
+
     def __init__(
         self,
         refr_data,
@@ -679,15 +696,15 @@ def __init__(
     ):
         assert isinstance(refr_data, pd.DataFrame), "`refr_data` must be a pandas DataFrame"
         assert isinstance(test_data, pd.DataFrame), "`test_data` must be a pandas DataFrame"
-        assert np.array_equal(refr_data.index, test_data.index), (
-            "`refr_data` and `test_data` index values must be identical"
-        )
-        assert refr_data.index.name == test_data.index.name, (
-            "`refr_data` and `test_data` index names must be identical"
-        )
-        assert np.array_equal(refr_data.columns, test_data.columns), (
-            "`refr_data` and `test_data` column values must be identical"
-        )
+        assert np.array_equal(
+            refr_data.index, test_data.index
+        ), "`refr_data` and `test_data` index values must be identical"
+        assert (
+            refr_data.index.name == test_data.index.name
+        ), "`refr_data` and `test_data` index names must be identical"
+        assert np.array_equal(
+            refr_data.columns, test_data.columns
+        ), "`refr_data` and `test_data` column values must be identical"
         assert isinstance(refr_scorer, str), "`refr_scorer` must be a string"
         assert isinstance(test_scorer, str), "`test_scorer` must be a string"
         assert refr_scorer != test_scorer, "`refr_scorer` and `test_scorer` must be unique"
@@ -713,10 +730,12 @@ def __init__(
 
         # Merge dataframes and reshape to long format
         data = pd.concat([refr_data, test_data, diff_data])
-        data = (data
-            .melt(var_name="sstat", ignore_index=False).reset_index()
+        data = (
+            data.melt(var_name="sstat", ignore_index=False)
+            .reset_index()
             .pivot(columns="scorer", index=[sleep_id_str, "sstat"], values="value")
-            .reset_index().rename_axis(columns=None)
+            .reset_index()
+            .rename_axis(columns=None)
         )
 
         # Remove sleep statistics that have no differences between scorers
@@ -727,10 +746,8 @@ def __init__(
 
         ## NORMALITY ##
         # Test reference data for normality at each sleep statistic
-        normality = (data
-            .groupby("sstat")[refr_scorer]
-            .apply(pg.normality, **kwargs_normality)
-            .droplevel(-1)
+        normality = (
+            data.groupby("sstat")[refr_scorer].apply(pg.normality, **kwargs_normality).droplevel(-1)
         )
 
         ## PROPORTIONAL BIAS ##

From 2cdd817b6517501b07107438e7641c9d90cdc900 Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Wed, 4 Jan 2023 05:00:35 -0600
Subject: [PATCH 19/43] quick comment addresses

---
 yasa/evaluation.py | 90 +++++++++++++++++++++++++++-------------------
 yasa/hypno.py      |  2 +-
 2 files changed, 55 insertions(+), 37 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index 67d3826..3f3ce37 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -38,21 +38,41 @@
 
 
 class EpochByEpochEvaluation:
-    """
-    For comparing only 2 hypnograms, use :py:meth:`yasa.Hynogram.evaluate`.
+    """Evaluate agreement between two collections of hypnograms.
 
-    Parameters
-    ----------
-    refr_hyps : :py:class:`yasa.Hypnogram`
-        A collection of reference or ground-truth hypnograms.
-    test_hyps : :py:class:`yasa.Hypnogram`
-        A collection of test or to-be-evaluated hypnograms.
+    For example, evaluate the agreement between manually-scored hypnograms and automatically-scored
+    hypnograms, or hypnograms derived from actigraphy.
 
-    Notes
-    -----
     Many steps here are modeled after guidelines proposed in Menghini et al., 2021 [Menghini2021]_.
     See https://sri-human-sleep.github.io/sleep-trackers-performance/AnalyticalPipeline_v1.0.0.html
 
+    Parameters
+    ----------
+    refr_hyps : iterable of :py:class:`yasa.Hypnogram`
+        A collection of reference (i.e., ground-truth) hypnograms.
+
+        Each :py:class:`yasa.Hypnogram` in ``refr_hyps`` must have the same
+        :py:attr:`~yasa.Hypnogram.scorer`.
+
+        If a ``dict``, key values are used to generate unique sleep session IDs. If any other
+        iterable (e.g., ``list`` or ``tuple``), then unique sleep session IDs are automatically
+        generated.
+    test_hyps : iterable of :py:class:`yasa.Hypnogram`
+        A collection of test (i.e., to-be-evaluated) hypnograms.
+
+        Each :py:class:`yasa.Hypnogram` in ``test_hyps`` must have the same
+        :py:attr:`~yasa.Hypnogram.scorer`, and this scorer must be different than the scorer of
+        hypnograms in ``refr_hyps``.
+
+        If a ``dict``, key values must match those of ``refr_hyps``.
+
+    .. important::
+        It is assumed that the order of hypnograms is the same in ``refr_hyps`` and ``test_hyps``.
+        For example, the third hypnogram in ``refr_hyps`` and ``test_hyps`` come from the same sleep
+        session, and only differ in that they have different scorers.
+
+    .. seealso:: For comparing just two hypnograms, use :py:meth:`yasa.Hypnogram.evaluate`.
+
     References
     ----------
     .. [Menghini2021] Menghini, L., Cellini, N., Goldstone, A., Baker, F. C., & de Zambotti, M.
@@ -159,6 +179,7 @@ def __init__(self, refr_hyps, test_hyps):
             h.scorer is not None for h in refr_hyps + test_hyps
         ), "all hypnograms must have a scorer name"
         for h1, h2 in zip((refr_hyps + test_hyps)[:-1], (refr_hyps + test_hyps)[1:]):
+            assert h1.freq == h2.freq, "all hypnograms must have the same freq"
             assert h1.labels == h2.labels, "all hypnograms must have the same labels"
             assert h1.mapping == h2.mapping, "all hypnograms must have the same mapping"
             assert h1.n_stages == h2.n_stages, "all hypnograms must have the same n_stages"
@@ -210,7 +231,7 @@ def __init__(self, refr_hyps, test_hyps):
         # labels = refr_hyps[sleep_ids[0]].labels.copy()  # To preserve YASA ordering
         # labels = [v for k, v in mapping_int.items() if k in skm_labels]  # To preserve YASA ordering
         prfs_wrapper = lambda df: skm.precision_recall_fscore_support(
-            *df.values.T, labels=skm_labels, average=None, zero_division=0
+            *df.values.T, beta=1, labels=skm_labels, average=None, zero_division=0
         )
         indiv_agree_ovr = (
             data
@@ -221,7 +242,7 @@ def __init__(self, refr_hyps, test_hyps):
             .explode()
             .apply(pd.Series)
             # Add metric labels and prepend to index, creating MultiIndex
-            .assign(metric=["precision", "recall", "f1", "support"] * len(refr_hyps))
+            .assign(metric=["precision", "recall", "fbeta", "support"] * len(refr_hyps))
             .set_index("metric", append=True)
             # Convert stage column names to string labels
             .rename_axis(columns="stage")
@@ -353,20 +374,13 @@ def multi_scorer(df):
         ##     Keywords could be applied as needed by checking f.__kwdefaults__
         ##     This would offer an easy way for users to add their own scorers with an arg as well.
         return {
-            "accuracy": skm.accuracy_score(t, p),
-            "kappa": skm.cohen_kappa_score(t, p),
-            "jaccard_micro": skm.jaccard_score(t, p, average="micro"),
-            "jaccard_macro": skm.jaccard_score(t, p, average="macro"),
-            "jaccard_weighted": skm.jaccard_score(t, p, average="weighted"),
-            "precision_micro": skm.precision_score(t, p, average="micro", zero_division=0),
-            "precision_macro": skm.precision_score(t, p, average="macro", zero_division=0),
-            "precision_weighted": skm.precision_score(t, p, average="weighted", zero_division=0),
-            "recall_micro": skm.recall_score(t, p, average="micro", zero_division=0),
-            "recall_macro": skm.recall_score(t, p, average="macro", zero_division=0),
-            "recall_weighted": skm.recall_score(t, p, average="weighted", zero_division=0),
-            "f1_micro": skm.f1_score(t, p, average="micro", zero_division=0),
-            "f1_macro": skm.f1_score(t, p, average="macro", zero_division=0),
-            "f1_weighted": skm.f1_score(t, p, average="weighted", zero_division=0),
+            "accuracy": skm.accuracy_score(t, p, normalize=True, sample_weight=None),
+            "balanced_acc": skm.balanced_accuracy_score(t, p, adjusted=False, sample_weight=None),
+            "kappa": skm.cohen_kappa_score(t, p, labels=None, weights=None, sample_weight=None),
+            "mcc": skm.matthews_corrcoef(t, p, sample_weight=None),
+            "precision": skm.precision_score(t, p, average="weighted", zero_division=0),
+            "recall": skm.recall_score(t, p, average="weighted", zero_division=0),
+            "fbeta": skm.fbeta_score(t, p, beta=1, average="weighted", zero_division=0),
         }
 
     def summary(self, by_stage=False, **kwargs):
@@ -451,7 +465,7 @@ def get_sleep_stats(self):
         test_sstats = pd.concat({self.test_scorer: test_sstats}, names=["scorer"])
         return pd.concat([refr_sstats, test_sstats])
 
-    def get_confusion_matrix(self, sleep_id=None):
+    def get_confusion_matrix(self, sleep_id=None, **kwargs):
         """
         Return a ``refr_hyp``/``test_hyp``confusion matrix from either a single session or all
         sessions concatenated together.
@@ -464,12 +478,21 @@ def get_confusion_matrix(self, sleep_id=None):
             If None (default), cross-tabulation is derived from the entire group dataset.
             If a valid sleep ID, cross-tabulation is derived using only the reference and test
             scored hypnograms from that sleep session.
+        **kwargs : key, value pairs
+            Additional keyword arguments are passed to the :py:func:`pandas.crosstab` call.
 
         Returns
         -------
-        matrix : :py:class:`pandas.DataFrame`
-            A confusion matrix with ``refr_hyp`` stages as indices and ``test_hyp`` stages as
-            columns.
+        conf_matr : :py:class:`pandas.DataFrame`
+            A confusion matrix with stages from the reference scorer as indices and stages from the
+            test scorer as columns.
+
+        Examples
+        --------
+        Use ``**kwargs`` to add a "Total" column in the margins.
+
+        >>> ebe = yasa.EpochByEpochEvaluation(...)
+        >>> ebe.get_confusion_matrix(margins=True, margins_name="Total")
         """
         assert (
             sleep_id is None or sleep_id in self.sleep_ids
@@ -479,12 +502,7 @@ def get_confusion_matrix(self, sleep_id=None):
         if sleep_id is not None:
             true = true.loc[sleep_id]
             pred = pred.loc[sleep_id]
-        matrix = (
-            pd.crosstab(true, pred, margins=True, margins_name="Total")
-            .rename(index=self._mapping_int, columns=self._mapping_int)
-            .astype(int)
-        )
-        return matrix
+        return pd.crosstab(true, pred).rename(index=self._mapping_int, columns=self._mapping_int)
 
     def plot_hypnograms(self, sleep_id=None, legend=True, ax=None, refr_kwargs={}, test_kwargs={}):
         """Plot the two hypnograms, where the reference hypnogram is overlaid on the test hypnogram.
diff --git a/yasa/hypno.py b/yasa/hypno.py
index b6c12ef..23871b2 100644
--- a/yasa/hypno.py
+++ b/yasa/hypno.py
@@ -540,7 +540,7 @@ def copy(self):
         )
 
     def evaluate(self, test_hyp):
-        """Evaluate agreement between two hypnograms.
+        """Evaluate agreement between two hypnograms of the same sleep session.
 
         Typically the reference hypnogram (i.e., ``self``) is a manually-scored hypnogram and the
         test hypnogram (i.e., ``test_hyp``) is a hypnogram from an actigraphy/wearable device or

From 678c8c9dad91392af3b2dc4eaaa0cece9eda3b85 Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Wed, 4 Jan 2023 05:14:28 -0600
Subject: [PATCH 20/43] mad

---
 yasa/evaluation.py | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index 3f3ce37..9eed04b 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -413,7 +413,9 @@ def summary(self, by_stage=False, **kwargs):
             >>> ebe.summary(func=["count", "mean", "sem"])
         """
         assert isinstance(by_stage, bool), "`by_stage` must be True or False"
-        agg_kwargs = {"func": ["mean", "std", "min", "median", "max"]} | kwargs
+        mad = lambda df: (df - df.mean()).abs().mean()
+        mad.__name__ = "mad"  # Pandas uses this to name the aggregated column
+        agg_kwargs = {"func": [mad, "mean", "std", "min", "median", "max"]} | kwargs
         if by_stage:
             summary = (
                 self.indiv_agree_ovr.groupby("stage")
@@ -885,17 +887,17 @@ def __repr__(self):
     def __str__(self):
         return __repr__()
 
-    def summary(self, descriptives=True):
+    def summary(self, **kwargs):
         """Return a summary dataframe highlighting whether tests passed for each sleep statistic.
 
         Parameters
         ----------
         self : :py:class:`SleepStatsEvaluation`
             A :py:class:`SleepStatsEvaluation` instance.
-        descriptives : bool or dict
-            If True (default) or a dictionary, also include descriptive statistics for reference and
-            test scorers. If a dictionary, all key/value pairs are passed as keyword arguments
-            to the :py:meth:`pandas.DataFrame.agg` call.
+        **kwargs : key, value pairs
+            Additional keyword arguments are passed to :py:meth:`pandas.DataFrame.groupby.agg`.
+
+            >>> sse.summary(func=["mean", "sem", "min", "max"])
 
         Returns
         -------
@@ -903,21 +905,18 @@ def summary(self, descriptives=True):
             A :py:class:`pandas.DataFrame` with boolean values indicating the pass/fail status for
             normality, proportional bias, and homoscedasticity tests (for each sleep statistic).
         """
-        assert isinstance(descriptives, (bool, dict)), "`descriptives` must be True, False, or dict"
         series_list = [
             self.normality["normal"],
             self.proportional_bias["unbiased"],
             self.homoscedasticity["equal_var"].rename("homoscedastic"),
         ]
         summary = pd.concat(series_list, axis=1)
-        if descriptives:
-            agg_kwargs = {"func": ["mean", "std"]}
-            if isinstance(descriptives, dict):
-                agg_kwargs.update(descriptives)
-            desc = self.data.drop(columns=self.sleep_id_str).groupby("sstat").agg(**agg_kwargs)
-            desc.columns = desc.columns.map("_".join)
-            summary = summary.join(desc)
-        return summary
+        mad = lambda df: (df - df.mean()).abs().mean()
+        mad.__name__ = "mad"  # Pandas uses this to name the aggregated column
+        agg_kwargs = {"func": [mad, "mean", "std"]} | kwargs
+        desc = self.data.drop(columns=self.sleep_id_str).groupby("sstat").agg(**agg_kwargs)
+        desc.columns = desc.columns.map("_".join)
+        return summary.join(desc)
 
     def plot_discrepancies_heatmap(self, sleep_stats=None, **kwargs):
         """Visualize session-level discrepancies, generally for outlier inspection.

From 5ce1776c095f59f568172a6d9147abdee5ae8815 Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Wed, 4 Jan 2023 05:16:49 -0600
Subject: [PATCH 21/43] typo

---
 yasa/evaluation.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index 9eed04b..2195059 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -279,7 +279,7 @@ def __repr__(self):
         s = "s" if self._n_sleeps > 1 else ""
         return (
             f"<EpochByEpochEvaluation | Test hypnogram{s} scored by {self.test_scorer} evaluated "
-            f"against reference hypnogram{s} scored by {self.refr_scorer}, {self._n_sleeps} sleep"
+            f"against reference hypnogram{s} scored by {self.refr_scorer}, {self._n_sleeps} sleep "
             f"session{s}>\n"
             " - Use `.get_agreement()` to get agreement measures as a pandas.Series\n"
             " - Use `.plot_hypnograms()` to plot the two hypnograms overlaid\n"
@@ -877,7 +877,7 @@ def proportional_bias_full(self):
     def __repr__(self):
         # TODO v0.8: Keep only the text between < and >
         return (
-            f"<SleepStatsEvaluation | Test scorer {self.test_scorer} evaluated against reference"
+            f"<SleepStatsEvaluation | Test scorer {self.test_scorer} evaluated against reference "
             f"scorer {self.refr_scorer}, {self.n_sleeps} sleep sessions>\n"
             " - Use `.summary()` to get pass/fail values from various checks\n"
             " - Use `.plot_blandaltman()` to get a Bland-Altman-plot grid for sleep statistics\n"

From 3c83edadb4671df6173c6cae444cbd6a1bce1fc6 Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Sun, 8 Jan 2023 04:27:59 -0600
Subject: [PATCH 22/43] pd.crosstab --> skm.confusion_matrix

---
 yasa/evaluation.py | 83 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 64 insertions(+), 19 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index 2195059..636b3e8 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -269,6 +269,8 @@ def __init__(self, refr_hyps, test_hyps):
         self._test_hyps = test_hyps
         self._refr_scorer = refr_hyps[sleep_ids[0]].scorer
         self._test_scorer = test_hyps[sleep_ids[0]].scorer
+        self._skm_labels = skm_labels
+        self._skm_mapping = skm_mapping
         self._mapping_int = mapping_int
         self._indiv_agree_avg = indiv_agree_avg
         self._indiv_agree_ovr = indiv_agree_ovr
@@ -345,7 +347,7 @@ def indiv_agree_ovr(self):
         return self._indiv_agree_ovr
 
     @staticmethod
-    def multi_scorer(df):
+    def multi_scorer(df, weights=None):
         """Compute multiple agreement scores from a 2-column dataframe.
 
         This function offers convenience when calculating multiple agreement scores using
@@ -359,12 +361,23 @@ def multi_scorer(df):
             A :py:class:`pandas.DataFrame` with exactly 2 columns and length of *n_samples*.
             The first column contains true values and second column contains predicted values.
 
+        weights : None or :py:class:`pandas.Series`
+            Sample weights passed to underlying :py:mod:`sklearn.metrics` functions when possible.
+            If a :py:class:`pandas.Series`, the index must match exactly that of
+            :py:attr:`~yasa.Hypnogram.data`.
+
         Returns
         -------
         scores : dict
             A dictionary with scorer names (``str``) as keys and scores (``float``) as values.
         """
+        assert isinstance(weights, type(None)) or weights in df, "`weights` must be None or a column in `df`"
+        if weights is not None:
+            raise NotImplementedError("Custom `weights` not currently supported")
         t, p = zip(*df.values)  # Same as (df["col1"], df["col2"]) but teensy bit faster
+        # t = df["col1"].to_numpy()
+        # p = df["col2"].to_numpy()
+        w = df["col3"].to_numpy() if weights is not None else weights
         ## Q: The dictionary below be compiled more concisely if we were comfortable accessing
         ##    "private" attributes. I understand that's a no-no but I'm not exactly sure why.
         ##     For example:
@@ -374,13 +387,17 @@ def multi_scorer(df):
         ##     Keywords could be applied as needed by checking f.__kwdefaults__
         ##     This would offer an easy way for users to add their own scorers with an arg as well.
         return {
-            "accuracy": skm.accuracy_score(t, p, normalize=True, sample_weight=None),
-            "balanced_acc": skm.balanced_accuracy_score(t, p, adjusted=False, sample_weight=None),
-            "kappa": skm.cohen_kappa_score(t, p, labels=None, weights=None, sample_weight=None),
-            "mcc": skm.matthews_corrcoef(t, p, sample_weight=None),
-            "precision": skm.precision_score(t, p, average="weighted", zero_division=0),
-            "recall": skm.recall_score(t, p, average="weighted", zero_division=0),
-            "fbeta": skm.fbeta_score(t, p, beta=1, average="weighted", zero_division=0),
+            "accuracy": skm.accuracy_score(t, p, normalize=True, sample_weight=w),
+            "balanced_acc": skm.balanced_accuracy_score(t, p, adjusted=False, sample_weight=w),
+            "kappa": skm.cohen_kappa_score(t, p, labels=None, weights=None, sample_weight=w),
+            "mcc": skm.matthews_corrcoef(t, p, sample_weight=w),
+            "precision": skm.precision_score(
+                t, p, average="weighted", sample_weight=w, zero_division=0
+            ),
+            "recall": skm.recall_score(t, p, average="weighted", sample_weight=w, zero_division=0),
+            "fbeta": skm.fbeta_score(
+                t, p, beta=1, average="weighted", sample_weight=w, zero_division=0
+            ),
         }
 
     def summary(self, by_stage=False, **kwargs):
@@ -467,7 +484,7 @@ def get_sleep_stats(self):
         test_sstats = pd.concat({self.test_scorer: test_sstats}, names=["scorer"])
         return pd.concat([refr_sstats, test_sstats])
 
-    def get_confusion_matrix(self, sleep_id=None, **kwargs):
+    def get_confusion_matrix(self, sleep_id=None, agg_func=None, **kwargs):
         """
         Return a ``refr_hyp``/``test_hyp``confusion matrix from either a single session or all
         sessions concatenated together.
@@ -480,8 +497,17 @@ def get_confusion_matrix(self, sleep_id=None, **kwargs):
             If None (default), cross-tabulation is derived from the entire group dataset.
             If a valid sleep ID, cross-tabulation is derived using only the reference and test
             scored hypnograms from that sleep session.
+        ## Q: This keyword (agg_func) is too complicated, but I wanted your opinion on the best
+        ##    approach. And I wanted you to see the returned value when agg_func=None because it
+        ##    might be best to generate during __init__ to set and access as an attribute.
+        agg_func : str, list, or None
+            If None (default), group results returns a :py:class:`~pandas.DataFrame` complete with
+            all individual sleep session results. If not None, group results returns a
+            :py:class:`~pandas.DataFrame` aggregated across individual sleep sessions where
+            ``agg_func`` is passed as ``func`` parameter in :py:meth:`pandas.DataFrame.groupby.agg`.
+            Ignored if ``sleep_id`` is not None.
         **kwargs : key, value pairs
-            Additional keyword arguments are passed to the :py:func:`pandas.crosstab` call.
+            Additional keyword arguments are passed to :py:func:`sklearn.metrics.confusion_matrix`.
 
         Returns
         -------
@@ -491,20 +517,39 @@ def get_confusion_matrix(self, sleep_id=None, **kwargs):
 
         Examples
         --------
-        Use ``**kwargs`` to add a "Total" column in the margins.
-
         >>> ebe = yasa.EpochByEpochEvaluation(...)
-        >>> ebe.get_confusion_matrix(margins=True, margins_name="Total")
+        >>> ebe.get_confusion_matrix()  # Return results from all individual subjects
+        >>> ebe.get_confusion_matrix(agg_func=["mean", "std"])  # Return summary results
+        >>> ebe.get_confusion_matrix(sleep_id="sub-002")  # Return results from one subject
         """
         assert (
             sleep_id is None or sleep_id in self.sleep_ids
         ), "`sleep_id` must be None or a valid sleep ID"
-        true = self.data[self.refr_scorer]
-        pred = self.data[self.test_scorer]
-        if sleep_id is not None:
-            true = true.loc[sleep_id]
-            pred = pred.loc[sleep_id]
-        return pd.crosstab(true, pred).rename(index=self._mapping_int, columns=self._mapping_int)
+        kwargs = {"labels": self._skm_labels} | kwargs
+        # Get confusion matrix for each individual sleep session
+        ## Q: Should this be done during __init__ and accessible via attribute?
+        conf_mats = (self.data
+            # Get confusion matrix for each individual sleep session
+            .groupby(level=0).apply(lambda df: skm.confusion_matrix(*df.values.T, **kwargs))
+            # Expand results matrix out from single cell
+            .explode().apply(pd.Series)
+            # Convert to MultiIndex with reference scorer as new level
+            .assign(**{self.refr_scorer: self._skm_labels * self.n_sleeps})
+            .set_index(self.refr_scorer, append=True).rename_axis(columns=self.test_scorer)
+            # Convert sleep stage columns and indices to strings
+            .rename(columns=self._skm_mapping).rename(columns=self._mapping_int)
+            .rename(index=self._skm_mapping, level=self.refr_scorer)
+            .rename(index=self._mapping_int, level=self.refr_scorer)
+        )
+        if sleep_id is None:
+            if agg_func is None:
+                mat = conf_mats
+            else:
+                mat = conf_mats.groupby(self.refr_scorer).agg(agg_func)
+                mat.columns = mat.columns.map("_".join).set_names(self.test_scorer)
+        else:
+            mat = conf_mats.loc[sleep_id]
+        return mat
 
     def plot_hypnograms(self, sleep_id=None, legend=True, ax=None, refr_kwargs={}, test_kwargs={}):
         """Plot the two hypnograms, where the reference hypnogram is overlaid on the test hypnogram.

From 0923bf702c3acfe593c14bee23b14a10d3e39756 Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Sun, 8 Jan 2023 05:24:15 -0600
Subject: [PATCH 23/43] 3 group-hypnogram plotting options, need feedback

---
 yasa/evaluation.py | 68 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 67 insertions(+), 1 deletion(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index 636b3e8..70d31ba 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -17,6 +17,7 @@
 import pandas as pd
 import pingouin as pg
 import sklearn.metrics as skm
+from scipy.stats import zscore
 
 import seaborn as sns
 import matplotlib.pyplot as plt
@@ -599,6 +600,8 @@ def plot_hypnograms(self, sleep_id=None, legend=True, ax=None, refr_kwargs={}, t
                 refr_hyp = self.refr_hyps[self.sleep_ids[0]]
                 test_hyp = self.test_hyps[self.sleep_ids[0]]
             else:
+                return self.plot_hypnogram_group
+                return self.plot_group_hypno_hist()
                 raise NotImplementedError("Multi-session plotting is not currently supported")
         else:
             refr_hyp = self.refr_hyps[sleep_id]
@@ -618,6 +621,69 @@ def plot_hypnograms(self, sleep_id=None, legend=True, ax=None, refr_kwargs={}, t
                 ax.legend()
         return ax
 
+    def plot_group_hypnogram_opt1(self, ax=None, **kwargs):
+        if ax is None:
+            ax = plt.gca()
+        palette = {"Inaccurate": "plum", "Accurate": "forestgreen"}
+        hue_order = list(palette)
+        hist_kwargs = dict(multiple="stack", stat="count", element="step", discrete=True, lw=0)
+        ser = self.data[self.refr_scorer].eq(self.data[self.test_scorer])
+        df = ser.rename("acc").replace({True: "Accurate", False: "Inaccurate"}).reset_index()
+        sns.histplot(
+            data=df, x="Epoch", hue="acc", hue_order=hue_order, palette=palette, ax=ax
+        )
+        ax.set_ylabel("Number of unique sleep sessions")
+        ax.set_xlabel("Epochs")
+        ax.margins(x=0, y=0)
+        return ax
+
+    def plot_group_hypnogram_opt2(self, ax=None, **kwargs):
+        from pingouin import compute_bootci
+
+        plot_kwargs = dict(lw=1, color="plum", alpha=1, label="7-epoch rolling average")
+        plot_kwargs.update(kwargs)
+        betw_kwargs = dict(lw=0, alpha=0.3, color=plot_kwargs["color"], label="95% bootstrapped CI")
+        if ax is None:
+            ax = plt.gca()
+        df = self.data[self.refr_scorer].eq(self.data[self.test_scorer]).rename("acc").reset_index()
+        probas =  df.groupby("Epoch")["acc"].mean()
+        ci = df.groupby("Epoch")["acc"].apply(compute_bootci, None, "mean").apply(pd.Series)
+        ci = ci.rename(columns={0: "low", 1: "high"})
+        probas = probas.rolling(10, center=True).mean()
+        ci = ci.rolling(10, center=True).mean()
+        ax.fill_between(ci.index, ci["low"], ci["high"], **betw_kwargs)
+        ax.plot(probas.index, probas, **plot_kwargs)
+        ax.set_ylabel("Accuracy across sleep sessions")
+        ax.set_xlabel("Epochs")
+        ax.set_xlim(0, len(probas))
+        ax.set_ylim(0, 1)
+        ax.legend()
+        return ax
+
+    def plot_group_hypnogram_opt3(self, figsize=(7, 10), **kwargs):
+        imshow_kwargs = dict(cmap="Blues", interpolation="none")
+        imshow_kwargs.update(kwargs)
+        n_rows = self.n_sleeps
+        freq = self.refr_hyps[self.sleep_ids[0]].freq
+        freq_secs = pd.Timedelta(freq).total_seconds()
+        fig, axes = plt.subplots(nrows=n_rows, figsize=figsize, sharex=True, sharey=False)
+        for ax, (subj, data) in zip(axes, self.data.groupby(level=0)):
+            img = data.values.T
+            extent = (0, freq_secs * img.shape[1], img.shape[0]-0.5, -0.5)
+            ax.imshow(img, extent=extent, aspect="auto", origin="upper", **imshow_kwargs)
+            ax.set_yticks([0, 1])
+            ax.set_yticklabels([self.refr_scorer, self.test_scorer])
+            ax.set_ylabel(subj, rotation=0, va="center")
+            ax.spines[["top", "bottom", "left", "right"]].set_visible(False)
+            if not ax.get_subplotspec().is_first_row():
+                ax.tick_params(left=False, labelleft=False)
+            if not ax.get_subplotspec().is_last_row():
+                ax.tick_params(bottom=False)
+                ax.set_xlabel("Time [s]")
+                ax.spines["bottom"].set_visible(False)
+        fig.align_ylabels()
+        return fig
+
     def plot_roc(self, sleep_id=None, palette=None, ax=None, **kwargs):
         """Plot ROC curves for each stage.
 
@@ -785,7 +851,7 @@ def __init__(
         refr_data.index.name = sleep_id_str
         test_data.index.name = sleep_id_str
 
-        # Get scorer differences
+        # Get scorer differences (aka discrepancies)
         diff_data = test_data.sub(refr_data)
 
         # Convert to MultiIndex with new scorer level

From 8f17ec9ccd9caea8f9ff37f1ae609645ca7f4d18 Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Sun, 8 Jan 2023 06:17:35 -0600
Subject: [PATCH 24/43] add verbose arg to SleepStatsEvaluation, minor cleanup

---
 yasa/evaluation.py | 40 ++++++++++++++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 8 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index 70d31ba..c87350f 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -22,6 +22,7 @@
 import seaborn as sns
 import matplotlib.pyplot as plt
 
+from yasa.io import set_log_level
 from yasa.plotting import plot_hypnogram
 
 
@@ -276,6 +277,7 @@ def __init__(self, refr_hyps, test_hyps):
         self._indiv_agree_avg = indiv_agree_avg
         self._indiv_agree_ovr = indiv_agree_ovr
         ## Q: Merge these to one individual agreement dataframe?
+        ##    Setting average="binary" to fill extra column in over dataframe
 
     def __repr__(self):
         # TODO v0.8: Keep only the text between < and >
@@ -600,9 +602,9 @@ def plot_hypnograms(self, sleep_id=None, legend=True, ax=None, refr_kwargs={}, t
                 refr_hyp = self.refr_hyps[self.sleep_ids[0]]
                 test_hyp = self.test_hyps[self.sleep_ids[0]]
             else:
-                return self.plot_hypnogram_group
-                return self.plot_group_hypno_hist()
-                raise NotImplementedError("Multi-session plotting is not currently supported")
+                raise NotImplementedError(
+                    "Multi-session plotting is not currently supported. 3 options being tested!"
+                )
         else:
             refr_hyp = self.refr_hyps[sleep_id]
             test_hyp = self.test_hyps[sleep_id]
@@ -738,6 +740,10 @@ class SleepStatsEvaluation:
         Keywords arguments passed to the :py:func:`pingouin.linear_regression` call.
     kwargs_homoscedasticity : dict
         Keywords arguments passed to the :py:func:`pingouin.homoscedasticity` call.
+    verbose : bool or str
+        Verbose level. Default (False) will only print warning and error messages. The logging
+        levels are 'debug', 'info', 'warning', 'error', and 'critical'. For most users the choice is
+        between 'info' (or ``verbose=True``) and warning (``verbose=False``).
 
     Notes
     -----
@@ -757,14 +763,29 @@ class SleepStatsEvaluation:
     >>> import yasa
     >>>
     >>> # For this example, generate two fake datasets of sleep statistics
-    >>> hypsA = [yasa.simulate_hypnogram(tib=600, seed=i) for i in range(20)]
-    >>> hypsB = [h.simulate_similar(tib=600, seed=i) for i, h in enumerate(hypsA)]
-    >>> sstatsA = pd.Series(hypsA).map(lambda h: h.sleep_statistics()).apply(pd.Series)
-    >>> sstatsB = pd.Series(hypsB).map(lambda h: h.sleep_statistics()).apply(pd.Series)
-    >>> sstatsA.index = sstatsB.index = sstatsA.index.map(lambda x: f"sub-{x+1:03d}")
+    >>> hypsA = [yasa.simulate_hypnogram(tib=600, scorer="Ref", seed=i) for i in range(20)]
+    >>> hypsB = [h.simulate_similar(tib=600, scorer="Test", seed=i) for i, h in enumerate(hypsA)]
+    >>> # sstatsA = pd.Series(hypsA).map(lambda h: h.sleep_statistics()).apply(pd.Series)
+    >>> # sstatsB = pd.Series(hypsB).map(lambda h: h.sleep_statistics()).apply(pd.Series)
+    >>> # sstatsA.index = sstatsB.index = sstatsA.index.map(lambda x: f"sub-{x+1:03d}")
+    >>> ebe = yasa.EpochByEpochEvaluation(hypsA, hypsB)
+    >>> sstats = ebe.get_sleepstats()
+    >>> sstatsA = sstats.loc["Ref"]
+    >>> sstatsB = sstats.loc["Test"]
     >>>
     >>> sse = yasa.SleepStatsEvaluation(sstatsA, sstatsB)
     >>>
+    >>> sse.summary()
+           normal  unbiased  homoscedastic
+    sstat
+    %N1      True      True           True
+    %N2      True      True           True
+    %N3      True      True           True
+    %REM    False      True           True
+    SE       True      True           True
+    SOL     False     False           True
+    TST      True      True           True
+
     >>> sse.summary(descriptives=False)
            normal  unbiased  homoscedastic
     sstat
@@ -824,7 +845,10 @@ def __init__(
         kwargs_normality={"alpha": 0.05},
         kwargs_regression={"alpha": 0.05},
         kwargs_homoscedasticity={"alpha": 0.05},
+        verbose=True,
     ):
+        set_log_level(verbose)
+
         assert isinstance(refr_data, pd.DataFrame), "`refr_data` must be a pandas DataFrame"
         assert isinstance(test_data, pd.DataFrame), "`test_data` must be a pandas DataFrame"
         assert np.array_equal(

From 54958beb0c6dedd5e0956dba4a6fdba2afbd3bf4 Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Sun, 8 Jan 2023 06:21:49 -0600
Subject: [PATCH 25/43] fix normality bug: test differences, not reference data

---
 yasa/evaluation.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index c87350f..a9c890f 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -900,9 +900,9 @@ def __init__(
             logger.warning(f"All {s} differences are zero, removing from evaluation.")
 
         ## NORMALITY ##
-        # Test reference data for normality at each sleep statistic
+        # Test difference data (test - reference) for normality at each sleep statistic
         normality = (
-            data.groupby("sstat")[refr_scorer].apply(pg.normality, **kwargs_normality).droplevel(-1)
+            data.groupby("sstat")["difference"].apply(pg.normality, **kwargs_normality).droplevel(-1)
         )
 
         ## PROPORTIONAL BIAS ##

From b1c0d9a9d2a6b97dc0f4538540e0b6243b0e6105 Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Sun, 8 Jan 2023 06:24:56 -0600
Subject: [PATCH 26/43] black fmt makes my pandas chains lonnggggg

---
 yasa/evaluation.py | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index a9c890f..c664efb 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -374,7 +374,9 @@ def multi_scorer(df, weights=None):
         scores : dict
             A dictionary with scorer names (``str``) as keys and scores (``float``) as values.
         """
-        assert isinstance(weights, type(None)) or weights in df, "`weights` must be None or a column in `df`"
+        assert (
+            isinstance(weights, type(None)) or weights in df
+        ), "`weights` must be None or a column in `df`"
         if weights is not None:
             raise NotImplementedError("Custom `weights` not currently supported")
         t, p = zip(*df.values)  # Same as (df["col1"], df["col2"]) but teensy bit faster
@@ -531,16 +533,21 @@ def get_confusion_matrix(self, sleep_id=None, agg_func=None, **kwargs):
         kwargs = {"labels": self._skm_labels} | kwargs
         # Get confusion matrix for each individual sleep session
         ## Q: Should this be done during __init__ and accessible via attribute?
-        conf_mats = (self.data
+        conf_mats = (
+            self.data
             # Get confusion matrix for each individual sleep session
-            .groupby(level=0).apply(lambda df: skm.confusion_matrix(*df.values.T, **kwargs))
+            .groupby(level=0)
+            .apply(lambda df: skm.confusion_matrix(*df.values.T, **kwargs))
             # Expand results matrix out from single cell
-            .explode().apply(pd.Series)
+            .explode()
+            .apply(pd.Series)
             # Convert to MultiIndex with reference scorer as new level
             .assign(**{self.refr_scorer: self._skm_labels * self.n_sleeps})
-            .set_index(self.refr_scorer, append=True).rename_axis(columns=self.test_scorer)
+            .set_index(self.refr_scorer, append=True)
+            .rename_axis(columns=self.test_scorer)
             # Convert sleep stage columns and indices to strings
-            .rename(columns=self._skm_mapping).rename(columns=self._mapping_int)
+            .rename(columns=self._skm_mapping)
+            .rename(columns=self._mapping_int)
             .rename(index=self._skm_mapping, level=self.refr_scorer)
             .rename(index=self._mapping_int, level=self.refr_scorer)
         )
@@ -631,9 +638,7 @@ def plot_group_hypnogram_opt1(self, ax=None, **kwargs):
         hist_kwargs = dict(multiple="stack", stat="count", element="step", discrete=True, lw=0)
         ser = self.data[self.refr_scorer].eq(self.data[self.test_scorer])
         df = ser.rename("acc").replace({True: "Accurate", False: "Inaccurate"}).reset_index()
-        sns.histplot(
-            data=df, x="Epoch", hue="acc", hue_order=hue_order, palette=palette, ax=ax
-        )
+        sns.histplot(data=df, x="Epoch", hue="acc", hue_order=hue_order, palette=palette, ax=ax)
         ax.set_ylabel("Number of unique sleep sessions")
         ax.set_xlabel("Epochs")
         ax.margins(x=0, y=0)
@@ -648,7 +653,7 @@ def plot_group_hypnogram_opt2(self, ax=None, **kwargs):
         if ax is None:
             ax = plt.gca()
         df = self.data[self.refr_scorer].eq(self.data[self.test_scorer]).rename("acc").reset_index()
-        probas =  df.groupby("Epoch")["acc"].mean()
+        probas = df.groupby("Epoch")["acc"].mean()
         ci = df.groupby("Epoch")["acc"].apply(compute_bootci, None, "mean").apply(pd.Series)
         ci = ci.rename(columns={0: "low", 1: "high"})
         probas = probas.rolling(10, center=True).mean()
@@ -671,7 +676,7 @@ def plot_group_hypnogram_opt3(self, figsize=(7, 10), **kwargs):
         fig, axes = plt.subplots(nrows=n_rows, figsize=figsize, sharex=True, sharey=False)
         for ax, (subj, data) in zip(axes, self.data.groupby(level=0)):
             img = data.values.T
-            extent = (0, freq_secs * img.shape[1], img.shape[0]-0.5, -0.5)
+            extent = (0, freq_secs * img.shape[1], img.shape[0] - 0.5, -0.5)
             ax.imshow(img, extent=extent, aspect="auto", origin="upper", **imshow_kwargs)
             ax.set_yticks([0, 1])
             ax.set_yticklabels([self.refr_scorer, self.test_scorer])
@@ -902,7 +907,9 @@ def __init__(
         ## NORMALITY ##
         # Test difference data (test - reference) for normality at each sleep statistic
         normality = (
-            data.groupby("sstat")["difference"].apply(pg.normality, **kwargs_normality).droplevel(-1)
+            data.groupby("sstat")["difference"]
+            .apply(pg.normality, **kwargs_normality)
+            .droplevel(-1)
         )
 
         ## PROPORTIONAL BIAS ##

From fa7eac6c9f7841baba5a67afbb690ee805a6baf8 Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Sun, 8 Jan 2023 17:38:08 -0600
Subject: [PATCH 27/43] diff_data --> discrepancies

---
 yasa/evaluation.py | 36 ++++++++++++++----------------------
 1 file changed, 14 insertions(+), 22 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index c664efb..0047a81 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -533,6 +533,10 @@ def get_confusion_matrix(self, sleep_id=None, agg_func=None, **kwargs):
         kwargs = {"labels": self._skm_labels} | kwargs
         # Get confusion matrix for each individual sleep session
         ## Q: Should this be done during __init__ and accessible via attribute?
+        ##    I'm a little unsure about what should happen in init and be accessed as a property
+        ##    vs what should require a function. Nothing takes so long that it feels like it
+        ##    couldn't just happen during __init__, leaving mostly just plotting functions as
+        ##    methods. But if that's the case, what's the benefit of being a class? Confused!!
         conf_mats = (
             self.data
             # Get confusion matrix for each individual sleep session
@@ -791,17 +795,6 @@ class SleepStatsEvaluation:
     SOL     False     False           True
     TST      True      True           True
 
-    >>> sse.summary(descriptives=False)
-           normal  unbiased  homoscedastic
-    sstat
-    %N1      True      True           True
-    %N2      True      True           True
-    %N3      True      True           True
-    %REM    False      True           True
-    SE       True      True           True
-    SOL     False     False           True
-    TST      True      True           True
-
     Access more detailed statistical output of each test.
 
     >>> sse.normality
@@ -880,16 +873,16 @@ def __init__(
         refr_data.index.name = sleep_id_str
         test_data.index.name = sleep_id_str
 
-        # Get scorer differences (aka discrepancies)
-        diff_data = test_data.sub(refr_data)
+        # Get scorer discrepancies (i.e., differences, test minus reference)
+        discrepancies = test_data.sub(refr_data)
 
         # Convert to MultiIndex with new scorer level
-        diff_data = pd.concat({"difference": diff_data}, names=["scorer"])
+        discrepancies = pd.concat({"difference": discrepancies}, names=["scorer"])
         refr_data = pd.concat({refr_scorer: refr_data}, names=["scorer"])
         test_data = pd.concat({test_scorer: test_data}, names=["scorer"])
 
         # Merge dataframes and reshape to long format
-        data = pd.concat([refr_data, test_data, diff_data])
+        data = pd.concat([refr_data, test_data, discrepancies])
         data = (
             data.melt(var_name="sstat", ignore_index=False)
             .reset_index()
@@ -957,8 +950,7 @@ def __init__(
         self._test_scorer = test_scorer
         self._sleep_id_str = sleep_id_str
         self._n_sleeps = data[sleep_id_str].nunique()
-        self._diff_data = diff_data.drop(columns=stats_nodiff)
-        # self._diff_data = data.pivot(index=sleep_id_str, columns="sstat", values="difference")
+        self._discrepancies = discrepancies.drop(columns=stats_nodiff)
 
     @property
     def data(self):
@@ -968,10 +960,10 @@ def data(self):
         return self._data
 
     @property
-    def diff_data(self):
+    def discrepancies(self):
         """A :py:class:`pandas.DataFrame` of ``test_data`` minus ``refr_data``."""
         # # Pivot for session-rows and statistic-columns
-        return self._diff_data
+        return self._discrepancies
 
     @property
     def refr_scorer(self):
@@ -1083,7 +1075,7 @@ def plot_discrepancies_heatmap(self, sleep_stats=None, **kwargs):
         if "cbar_kws" in kwargs:
             heatmap_kwargs["cbar_kws"].update(kwargs["cbar_kws"])
         heatmap_kwargs.update(kwargs)
-        table = self.diff_data[sleep_stats]
+        table = self.discrepancies[sleep_stats]
         # Normalize statistics (i.e., columns) between zero and one then convert to percentage
         table_norm = table.sub(table.min(), axis=1).div(table.apply(np.ptp)).multiply(100)
         if heatmap_kwargs["annot"]:
@@ -1118,12 +1110,12 @@ def plot_discrepancies_dotplot(self, kwargs_pairgrid={"palette": "winter"}, **kw
         stripplot_kwargs = {"size": 10, "linewidth": 1, "edgecolor": "white"}
         stripplot_kwargs.update(kwargs)
         # Initialize the PairGrid
-        height = 0.3 * len(self.diff_data)
+        height = 0.3 * len(self.discrepancies)
         aspect = 0.6
         pairgrid_kwargs = dict(hue=self.sleep_id_str, height=height, aspect=aspect)
         pairgrid_kwargs.update(kwargs_pairgrid)
         g = sns.PairGrid(
-            self.diff_data.reset_index(), y_vars=[self.sleep_id_str], **pairgrid_kwargs
+            self.discrepancies.reset_index(), y_vars=[self.sleep_id_str], **pairgrid_kwargs
         )
         # Draw the dots
         g.map(sns.stripplot, orient="h", jitter=False, **stripplot_kwargs)

From aeacf89d274485c62688efc5abd902ec5997b573 Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Sun, 8 Jan 2023 18:01:26 -0600
Subject: [PATCH 28/43] rename kwargs_* params to *_kwargs (trailing, not leading)

---
 yasa/evaluation.py | 83 +++++++++++++++++++++++++++-------------------
 1 file changed, 48 insertions(+), 35 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index 0047a81..5a52105 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -48,6 +48,12 @@ class EpochByEpochEvaluation:
     Many steps here are modeled after guidelines proposed in Menghini et al., 2021 [Menghini2021]_.
     See https://sri-human-sleep.github.io/sleep-trackers-performance/AnalyticalPipeline_v1.0.0.html
 
+    .. warning::
+        :py:class:`yasa.evaluation.EpochByEpochEvaluation` is a new YASA feature and the API is
+        subject to future change.
+
+    .. versionadded:: 0.7.0
+
     Parameters
     ----------
     refr_hyps : iterable of :py:class:`yasa.Hypnogram`
@@ -607,7 +613,7 @@ def plot_hypnograms(self, sleep_id=None, legend=True, ax=None, refr_kwargs={}, t
         assert isinstance(test_kwargs, dict), "`test_kwargs` must be a dictionary"
         assert (
             not "ax" in refr_kwargs | test_kwargs
-        ), "ax can't be supplied to `kwargs_ref` or `test_kwargs`, use the `ax` keyword instead"
+        ), "'ax' can't be supplied to `refr_kwargs` or `test_kwargs`, use the `ax` keyword instead"
         if sleep_id is None:
             if self.n_sleeps == 1:
                 refr_hyp = self.refr_hyps[self.sleep_ids[0]]
@@ -729,6 +735,12 @@ class SleepStatsEvaluation:
     and YASA's automatic staging) by comparing their summary sleep statistics derived from multiple
     subjects or sessions.
 
+    .. warning::
+        :py:class:`yasa.evaluation.SleepStatsEvaluation` is a new YASA feature and the API is
+        subject to future change.
+
+    .. versionadded:: 0.7.0
+
     Parameters
     ----------
     refr_data : :py:class:`pandas.DataFrame`
@@ -743,11 +755,11 @@ class SleepStatsEvaluation:
         Name of the test scorer, used for labeling.
     alpha : float
         Alpha cutoff used for all three tests.
-    kwargs_normality : dict
+    normality_kwargs : dict
         Keywords arguments passed to the :py:func:`pingouin.normality` call.
-    kwargs_regression : dict
+    regression_kwargs : dict
         Keywords arguments passed to the :py:func:`pingouin.linear_regression` call.
-    kwargs_homoscedasticity : dict
+    homoscedasticity_kwargs : dict
         Keywords arguments passed to the :py:func:`pingouin.homoscedasticity` call.
     verbose : bool or str
         Verbose level. Default (False) will only print warning and error messages. The logging
@@ -840,9 +852,9 @@ def __init__(
         *,
         refr_scorer="Reference",
         test_scorer="Test",
-        kwargs_normality={"alpha": 0.05},
-        kwargs_regression={"alpha": 0.05},
-        kwargs_homoscedasticity={"alpha": 0.05},
+        normality_kwargs={"alpha": 0.05},
+        regression_kwargs={"alpha": 0.05},
+        homoscedasticity_kwargs={"alpha": 0.05},
         verbose=True,
     ):
         set_log_level(verbose)
@@ -861,12 +873,12 @@ def __init__(
         assert isinstance(refr_scorer, str), "`refr_scorer` must be a string"
         assert isinstance(test_scorer, str), "`test_scorer` must be a string"
         assert refr_scorer != test_scorer, "`refr_scorer` and `test_scorer` must be unique"
-        assert isinstance(kwargs_normality, dict), "`kwargs_normality` must be a dictionary"
-        assert isinstance(kwargs_regression, dict), "`kwargs_regression` must be a dictionary"
-        assert isinstance(kwargs_homoscedasticity, dict), "`kwargs_homoscedasticity` must be a dict"
-        assert "alpha" in kwargs_normality, "`kwargs_normality` must include 'alpha'"
-        assert "alpha" in kwargs_regression, "`kwargs_regression` must include 'alpha'"
-        assert "alpha" in kwargs_homoscedasticity, "`kwargs_homoscedasticity` must include 'alpha'"
+        assert isinstance(normality_kwargs, dict), "`normality_kwargs` must be a dictionary"
+        assert isinstance(regression_kwargs, dict), "`regression_kwargs` must be a dictionary"
+        assert isinstance(homoscedasticity_kwargs, dict), "`homoscedasticity_kwargs` must be a dict"
+        assert "alpha" in normality_kwargs, "`normality_kwargs` must include 'alpha'"
+        assert "alpha" in regression_kwargs, "`regression_kwargs` must include 'alpha'"
+        assert "alpha" in homoscedasticity_kwargs, "`homoscedasticity_kwargs` must include 'alpha'"
 
         # If refr_data and test_data indices are unnamed, name them
         sleep_id_str = "sleep_id" if refr_data.index.name is None else refr_data.index.name
@@ -900,8 +912,9 @@ def __init__(
         ## NORMALITY ##
         # Test difference data (test - reference) for normality at each sleep statistic
         normality = (
-            data.groupby("sstat")["difference"]
-            .apply(pg.normality, **kwargs_normality)
+            data
+            .groupby("sstat")["difference"]
+            .apply(pg.normality, **normality_kwargs)
             .droplevel(-1)
         )
 
@@ -912,7 +925,7 @@ def __init__(
         for ss_name, ss_df in data.groupby("sstat"):
             # Regress the difference scores on the reference scores
             model = pg.linear_regression(
-                ss_df[refr_scorer], ss_df["difference"], **kwargs_regression
+                ss_df[refr_scorer], ss_df["difference"], **regression_kwargs
             )
             model.insert(0, "sstat", ss_name)
             # Extract sleep-level residuals for later homoscedasticity tests
@@ -933,11 +946,11 @@ def __init__(
         # Now remove intercept rows
         prop_bias = prop_bias.query("names != 'Intercept'").drop(columns="names").set_index("sstat")
         # Add True/False passing column for easy access
-        prop_bias["unbiased"] = prop_bias["pval"].ge(kwargs_regression["alpha"])
+        prop_bias["unbiased"] = prop_bias["pval"].ge(regression_kwargs["alpha"])
 
         ## Test each statistic for homoscedasticity ##
         columns = [refr_scorer, "difference", "pbias_residual"]
-        homoscedasticity_f = lambda df: pg.homoscedasticity(df[columns], **kwargs_homoscedasticity)
+        homoscedasticity_f = lambda df: pg.homoscedasticity(df[columns], **homoscedasticity_kwargs)
         homoscedasticity = data.groupby("sstat").apply(homoscedasticity_f).droplevel(-1)
 
         # Set attributes
@@ -1083,12 +1096,12 @@ def plot_discrepancies_heatmap(self, sleep_stats=None, **kwargs):
             heatmap_kwargs["annot"] = table.to_numpy()
         return sns.heatmap(table_norm, **heatmap_kwargs)
 
-    def plot_discrepancies_dotplot(self, kwargs_pairgrid={"palette": "winter"}, **kwargs):
+    def plot_discrepancies_dotplot(self, pairgrid_kwargs={"palette": "winter"}, **kwargs):
         """Visualize session-level discrepancies, generally for outlier inspection.
 
         Parameters
         ----------
-        kwargs_pairgrid : dict
+        pairgrid_kwargs : dict
             Keywords arguments passed to the :py:class:`seaborn.PairGrid` call.
         **kwargs : key, value pairs
             Additional keyword arguments are passed to the :py:func:`seaborn.stripplot` call.
@@ -1106,19 +1119,19 @@ def plot_discrepancies_dotplot(self, kwargs_pairgrid={"palette": "winter"}, **kw
         .. plot::
             ## TODO: Example using x_vars
         """
-        assert isinstance(kwargs_pairgrid, dict), "`kwargs_pairgrid` must be a dict"
-        stripplot_kwargs = {"size": 10, "linewidth": 1, "edgecolor": "white"}
-        stripplot_kwargs.update(kwargs)
+        assert isinstance(pairgrid_kwargs, dict), "`pairgrid_kwargs` must be a dict"
+        kwargs_stripplot = {"size": 10, "linewidth": 1, "edgecolor": "white"}
+        kwargs_stripplot.update(kwargs)
         # Initialize the PairGrid
         height = 0.3 * len(self.discrepancies)
         aspect = 0.6
-        pairgrid_kwargs = dict(hue=self.sleep_id_str, height=height, aspect=aspect)
-        pairgrid_kwargs.update(kwargs_pairgrid)
+        kwargs_pairgrid = dict(hue=self.sleep_id_str, height=height, aspect=aspect)
+        kwargs_pairgrid.update(pairgrid_kwargs)
         g = sns.PairGrid(
-            self.discrepancies.reset_index(), y_vars=[self.sleep_id_str], **pairgrid_kwargs
+            self.discrepancies.reset_index(), y_vars=[self.sleep_id_str], **kwargs_pairgrid
         )
         # Draw the dots
-        g.map(sns.stripplot, orient="h", jitter=False, **stripplot_kwargs)
+        g.map(sns.stripplot, orient="h", jitter=False, **kwargs_stripplot)
         # Adjust aesthetics
         for ax in g.axes.flat:
             ax.set(title=ax.get_xlabel())
@@ -1129,14 +1142,14 @@ def plot_discrepancies_dotplot(self, kwargs_pairgrid={"palette": "winter"}, **kw
         sns.despine(left=True, bottom=True)
         return g
 
-    def plot_blandaltman(self, kwargs_facetgrid={}, **kwargs):
+    def plot_blandaltman(self, facetgrid_kwargs={}, **kwargs):
         """
 
         **Use col_order=sstats_order for plotting a subset.
 
         Parameters
         ----------
-        kwargs_facetgrid : dict
+        facetgrid_kwargs : dict
             Keyword arguments passed to the :py:class:`seaborn.FacetGrid` call.
         **kwargs : key, value pairs
             Additional keyword arguments are passed to :py:func:`pingouin.plot_blandaltman`.
@@ -1146,14 +1159,14 @@ def plot_blandaltman(self, kwargs_facetgrid={}, **kwargs):
         g : :py:class:`seaborn.FacetGrid`
             A :py:class:`seaborn.FacetGrid` with sleep statistics Bland-Altman plots on each axis.
         """
-        facetgrid_kwargs = dict(col_wrap=4, height=2, aspect=1, sharex=False, sharey=False)
-        facetgrid_kwargs.update(kwargs_facetgrid)
-        blandaltman_kwargs = dict(xaxis="y", annotate=False, edgecolor="black", facecolor="none")
-        blandaltman_kwargs.update(kwargs)
+        kwargs_facetgrid = dict(col_wrap=4, height=2, aspect=1, sharex=False, sharey=False)
+        kwargs_facetgrid.update(facetgrid_kwargs)
+        kwargs_blandaltman = dict(xaxis="y", annotate=False, edgecolor="black", facecolor="none")
+        kwargs_blandaltman.update(kwargs)
         # Initialize a grid of plots with an Axes for each sleep statistic
-        g = sns.FacetGrid(self.data, col="sstat", **facetgrid_kwargs)
+        g = sns.FacetGrid(self.data, col="sstat", **kwargs_facetgrid)
         # Draw Bland-Altman plot on each axis
-        g.map(pg.plot_blandaltman, self.test_scorer, self.refr_scorer, **blandaltman_kwargs)
+        g.map(pg.plot_blandaltman, self.test_scorer, self.refr_scorer, **kwargs_blandaltman)
         # Adjust aesthetics
         for ax in g.axes.flat:
             # Tidy-up axis limits with symmetric y-axis and minimal ticks

From dfd7005923beca90f6473acded69c7c43283ad41 Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Mon, 18 Dec 2023 16:48:58 -0600
Subject: [PATCH 29/43] major EpochByEpoch restructure

---
 yasa/evaluation.py | 1513 +++++++++++++++++++++++++++-----------------
 yasa/hypno.py      |   44 +-
 2 files changed, 941 insertions(+), 616 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index 5a52105..8749229 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -1,9 +1,6 @@
 """
-YASA code for evaluating the agreement between two scorers.
-
-There are two levels of evaluating staging performance:
-- Comparing two hypnograms (e.g., human vs automated scorer)
-- Comparing summary sleep statistics between two scorers (e.g., PSG vs actigraphy)
+YASA code for evaluating the agreement between two scorers (e.g., human vs YASA), either at the
+epoch-by-epoch level or at the level of summary sleep statistics.
 
 Analyses are modeled after the standardized framework proposed in Menghini et al., 2021, SLEEP.
 See the following resources:
@@ -15,118 +12,134 @@
 
 import numpy as np
 import pandas as pd
-import pingouin as pg
 import sklearn.metrics as skm
-from scipy.stats import zscore
+from scipy import stats
 
 import seaborn as sns
 import matplotlib.pyplot as plt
 
-from yasa.io import set_log_level
 from yasa.plotting import plot_hypnogram
 
 
 logger = logging.getLogger("yasa")
 
 __all__ = [
-    "EpochByEpochEvaluation",
-    "SleepStatsEvaluation",
+    "EpochByEpochAgreement",
+    "SleepStatsAgreement",
 ]
 
 
-#############################################################################
+################################################################################
 # EPOCH BY EPOCH
-#############################################################################
-
+################################################################################
 
-class EpochByEpochEvaluation:
-    """Evaluate agreement between two collections of hypnograms.
 
-    For example, evaluate the agreement between manually-scored hypnograms and automatically-scored
-    hypnograms, or hypnograms derived from actigraphy.
+class EpochByEpochAgreement:
+    """Evaluate agreement between two hypnograms or two collections of hypnograms.
 
-    Many steps here are modeled after guidelines proposed in Menghini et al., 2021 [Menghini2021]_.
-    See https://sri-human-sleep.github.io/sleep-trackers-performance/AnalyticalPipeline_v1.0.0.html
-
-    .. warning::
-        :py:class:`yasa.evaluation.EpochByEpochEvaluation` is a new YASA feature and the API is
-        subject to future change.
+    Evaluation includes averaged agreement scores, one-vs-rest agreement scores, agreement scores
+    summarized across all sleep and summarized by sleep stage, and various plotting options to
+    visualize the two hypnograms simultaneously. See examples for more detail.
 
     .. versionadded:: 0.7.0
 
     Parameters
     ----------
-    refr_hyps : iterable of :py:class:`yasa.Hypnogram`
-        A collection of reference (i.e., ground-truth) hypnograms.
+    ref_hyps : iterable of :py:class:`yasa.Hypnogram`
+        A collection of reference hypnograms (i.e., those considered ground-truth).
 
-        Each :py:class:`yasa.Hypnogram` in ``refr_hyps`` must have the same
+        Each :py:class:`yasa.Hypnogram` in ``ref_hyps`` must have the same
         :py:attr:`~yasa.Hypnogram.scorer`.
 
         If a ``dict``, key values are use to generate unique sleep session IDs. If any other
         iterable (e.g., ``list`` or ``tuple``), then unique sleep session IDs are automatically
         generated.
-    test_hyps : iterable of :py:class:`yasa.Hypnogram`
-        A collection of test (i.e., to-be-evaluated) hypnograms.
+    obs_hyps : iterable of :py:class:`yasa.Hypnogram`
+        A collection of observed hypnograms (i.e., those to be evaluated).
 
-        Each :py:class:`yasa.Hypnogram` in ``test_hyps`` must have the same
+        Each :py:class:`yasa.Hypnogram` in ``obs_hyps`` must have the same
         :py:attr:`~yasa.Hypnogram.scorer`, and this scorer must be different than the scorer of
-        hypnograms in ``refr_hyps``.
+        hypnograms in ``ref_hyps``.
 
-        If a ``dict``, key values must match those of ``refr_hyps``.
+        If a ``dict``, key values must match those of ``ref_hyps``.
 
     .. important::
-        It is assumed that the order of hypnograms are the same in ``refr_hyps`` and ``test_hyps``.
-        For example, the third hypnogram in ``refr_hyps`` and ``test_hyps`` come from the same sleep
-        session, and only differ in that they have different scorers.
+        It is assumed that the order of hypnograms is the same in ``ref_hyps`` and ``obs_hyps``.
+        For example, the third hypnogram in ``ref_hyps`` and ``obs_hyps`` must come from the same
+        sleep session, and they must only differ in that they have different scorers.
 
     .. seealso:: For comparing just two hypnograms, use :py:meth:`yasa.Hynogram.evaluate`.
 
+    Notes
+    -----
+    Many steps here are modeled after guidelines proposed in Menghini et al., 2021 [Menghini2021]_.
+    See https://sri-human-sleep.github.io/sleep-trackers-performance/AnalyticalPipeline_v1.0.0.html
+
     References
     ----------
     .. [Menghini2021] Menghini, L., Cellini, N., Goldstone, A., Baker, F. C., & de Zambotti, M.
                       (2021). A standardized framework for testing the performance of sleep-tracking
-                       technology: step-by-step guidelines and open-source code. Sleep, 44(2),
+                       technology: step-by-step guidelines and open-source code. SLEEP, 44(2),
                        zsaa170. https://doi.org/10.1093/sleep/zsaa170
 
     Examples
     --------
     >>> import yasa
-    >>> hyps_a = [yasa.simulate_hypnogram(tib=600, scorer="RaterA", seed=i) for i in range(20)]
-    >>> hyps_b = [h.simulate_similar(scorer="RaterB", seed=i) for i, h in enumerate(refr_hyps)]
-    >>> ebe = yasa.EpochByEpochEvaluation(hyps_a, hyps_b)
-
-    >>> ebe.get_agreement().round(3)
-    metric
-    accuracy              0.209
-    kappa                -0.051
-    weighted_jaccard      0.130
-    weighted_precision    0.247
-    weighted_recall       0.209
-    weighted_f1           0.223
-    Name: agreement, dtype: float64
-
-    >>> ebe.get_agreement_by_stage().round(3)
-    stage         WAKE       N1       N2       N3   REM  ART  UNS
-    metric
-    precision    0.188    0.016    0.315    0.429   0.0  0.0  0.0
-    recall       0.179    0.018    0.317    0.235   0.0  0.0  0.0
-    fscore       0.183    0.017    0.316    0.303   0.0  0.0  0.0
-    support    290.000  110.000  331.000  179.000  50.0  0.0  0.0
+    >>> ref_hyps = [yasa.simulate_hypnogram(tib=600, scorer="Human", seed=i) for i in range(10)]
+    >>> obs_hyps = [h.simulate_similar(scorer="YASA", seed=i) for i, h in enumerate(ref_hyps)]
+    >>> ebe = yasa.EpochByEpochAgreement(ref_hyps, obs_hyps)
+    >>> agr = ebe.get_agreement()
+    >>> agr.head(5).round(2)
+              accuracy  balanced_acc  kappa   mcc  precision  recall  fbeta
+    sleep_id
+    1             0.31          0.26   0.07  0.07       0.31    0.31   0.31
+    2             0.33          0.33   0.14  0.14       0.35    0.33   0.34
+    3             0.35          0.24   0.06  0.06       0.35    0.35   0.35
+    4             0.22          0.21   0.01  0.01       0.21    0.22   0.21
+    5             0.21          0.17  -0.06 -0.06       0.20    0.21   0.21
+
+    >>> ebe.get_agreement_bystage().head(12).round(3)
+                    fbeta  precision  recall  support
+    stage sleep_id
+    WAKE  1         0.391      0.371   0.413    189.0
+          2         0.299      0.276   0.326    184.0
+          3         0.234      0.204   0.275    255.0
+          4         0.268      0.285   0.252    321.0
+          5         0.228      0.230   0.227    181.0
+          6         0.407      0.384   0.433    284.0
+          7         0.362      0.296   0.467    287.0
+          8         0.298      0.519   0.209    263.0
+          9         0.210      0.191   0.233    313.0
+          10        0.369      0.420   0.329    362.0
+    N1    1         0.185      0.185   0.185    124.0
+          2         0.121      0.131   0.112    160.0
+
+    >>> ebe.get_confusion_matrix(sleep_id=1)
+    YASA   WAKE  N1   N2  N3  REM
+    Human
+    WAKE     78  24   50   3   34
+    N1       23  23   43  15   20
+    N2       60  58  183  43  139
+    N3       30  10   50   5   32
+    REM      19   9  121  50   78
 
     .. plot::
 
         >>> import matplotlib.pyplot as plt
         >>> fig, ax = plt.subplots(figsize=(6, 3), constrained_layout=True)
-        >>> ebe.plot_hypnograms()
+        >>> ebe.plot_hypnograms(sleep_id=10)
 
     .. plot::
 
         >>> fig, ax = plt.subplots(figsize=(6, 3))
-        >>> ebe.plot_hypnograms(ax=ax, kwargs_test={"color": "black", "lw": 2, "ls": "dotted"})
+        >>> ebe.plot_hypnograms(
+        >>>     sleep_id=8, ax=ax, obs_kwargs={"color": "red", "lw": 2, "ls": "dotted"}
+        >>> )
         >>> plt.tight_layout()
 
     .. plot::
 
+        >>> session = 8
         >>> fig, ax = plt.subplots(figsize=(6.5, 2.5), constrained_layout=True)
         >>> style_a = dict(alpha=1, lw=2.5, ls="solid", color="gainsboro", label="Michel")
         >>> style_b = dict(alpha=1, lw=2.5, ls="solid", color="cornflowerblue", label="Jouvet")
@@ -134,166 +147,110 @@ class EpochByEpochEvaluation:
         >>>     title="Scorer", frameon=False, ncol=2, loc="lower center", bbox_to_anchor=(0.5, 0.9)
         >>> )
         >>> ax = ebe.plot_hypnograms(
-        >>>     kwargs_ref=style_a, kwargs_test=style_b, legend=legend_style, ax=ax
+        >>>     sleep_id=session, ref_kwargs=style_a, obs_kwargs=style_b, legend=legend_style, ax=ax
+        >>> )
+        >>> acc = ebe.get_agreement().multiply(100).at[session, "accuracy"]
+        >>> ax.text(
+        >>>     0.01, 1, f"Accuracy = {acc:.0f}%", ha="left", va="bottom", transform=ax.transAxes
         >>> )
-        >>>
-        >>> acc = ebe.get_agreement().multiply(100).round(0).at["accuracy"]
-        >>> ax.text(0.01, 1, f"Accuracy = {acc}%", ha="left", va="bottom", transform=ax.transAxes)
 
-    When comparing only 2 hypnograms, use the :py:meth:`yasa.Hynogram.evaluate` method:
+    When comparing only 2 hypnograms, use the :py:meth:`~yasa.Hypnogram.evaluate` method:
 
     >>> hypno_a = yasa.simulate_hypnogram(tib=90, scorer="RaterA", seed=8)
     >>> hypno_b = hypno_a.simulate_similar(scorer="RaterB", seed=9)
     >>> ebe = hypno_a.evaluate(hypno_b)
-
     >>> ebe.get_confusion_matrix()
-    RaterB  WAKE   N1   N2  N3  REM  ART  UNS  Total
+    RaterB  WAKE  N1  N2  N3
     RaterA
-    WAKE      52   38  126  23   51    0    0    290
-    N1        59    2   27   8   14    0    0    110
-    N2       117   50  105  15   44    0    0    331
-    N3        34   26   62  42   15    0    0    179
-    REM       15   12   13  10    0    0    0     50
-    ART        0    0    0   0    0    0    0      0
-    UNS        0    0    0   0    0    0    0      0
-    Total    277  128  333  98  124    0    0    960
+    WAKE      71   2  20   8
+    N1         1   0   9   0
+    N2        12   4  25   0
+    N3        24   0   1   3
     """
 
-    def __init__(self, refr_hyps, test_hyps):
+    def __init__(self, ref_hyps, obs_hyps):
         from yasa.hypno import Hypnogram  # Avoiding circular import
 
-        assert hasattr(refr_hyps, "__iter__"), "`refr_hyps` must be a an iterable"
-        assert hasattr(test_hyps, "__iter__"), "`test_hyps` must be a an iterable"
-        assert type(refr_hyps) == type(test_hyps), "`refr_hyps` and `test_hyps` must be same type"
-        assert len(refr_hyps) == len(
-            test_hyps
-        ), "`refr_hyps` and `test_hyps` must have the same number of hypnograms"
+        assert hasattr(ref_hyps, "__iter__"), "`ref_hyps` must be a an iterable"
+        assert hasattr(obs_hyps, "__iter__"), "`obs_hyps` must be a an iterable"
+        assert type(ref_hyps) == type(obs_hyps), "`ref_hyps` and `obs_hyps` must be the same type"
+        assert len(ref_hyps) == len(
+            obs_hyps
+        ), "`ref_hyps` and `obs_hyps` must have the same number of hypnograms"
 
-        if isinstance(refr_hyps, dict):
+        if isinstance(ref_hyps, dict):
             # If user provides dictionaries, split into sleep IDs and hypnograms
             assert (
-                refr_hyps.keys() == test_hyps.keys()
-            ), "hypnograms in `refr_hyps` and `test_hyps` must have identical sleep IDs"
-            sleep_ids, refr_hyps = zip(*refr_hyps.items())
-            test_hyps = tuple(test_hyps.values())
+                ref_hyps.keys() == obs_hyps.keys()
+            ), "keys in `ref_hyps` must be the same as keys in `obs_hyps`"
+            sleep_ids, ref_hyps = zip(*ref_hyps.items())
+            obs_hyps = tuple(obs_hyps.values())
         else:
             # Create hypnogram_ids
-            sleep_ids = tuple(range(1, 1 + len(refr_hyps)))
+            sleep_ids = tuple(range(1, 1 + len(ref_hyps)))
 
         assert all(
-            isinstance(hyp, Hypnogram) for hyp in refr_hyps + test_hyps
-        ), "`refr_hyps` and `test_hyps` must only include YASA hypnograms"
+            isinstance(hyp, Hypnogram) for hyp in ref_hyps + obs_hyps
+        ), "`ref_hyps` and `obs_hyps` must only contain YASA hypnograms"
         assert all(
-            h.scorer is not None for h in refr_hyps + test_hyps
+            h.scorer is not None for h in ref_hyps + obs_hyps
         ), "all hypnograms must have a scorer name"
-        for h1, h2 in zip((refr_hyps + test_hyps)[:-1], (refr_hyps + test_hyps)[1:]):
+        for h1, h2 in zip((ref_hyps + obs_hyps)[:-1], (ref_hyps + obs_hyps)[1:]):
             assert h1.freq == h2.freq, "all hypnograms must have the same freq"
             assert h1.labels == h2.labels, "all hypnograms must have the same labels"
             assert h1.mapping == h2.mapping, "all hypnograms must have the same mapping"
             assert h1.n_stages == h2.n_stages, "all hypnograms must have the same n_stages"
         assert all(
-            h1.scorer == h2.scorer for h1, h2 in zip(refr_hyps[:-1], refr_hyps[1:])
-        ), "all `refr_hyps` must have the same scorer"
+            h1.scorer == h2.scorer for h1, h2 in zip(ref_hyps[:-1], ref_hyps[1:])
+        ), "all `ref_hyps` must have the same scorer"
         assert all(
-            h1.scorer == h2.scorer for h1, h2 in zip(test_hyps[:-1], test_hyps[1:])
-        ), "all `test_hyps` must have the same scorer"
+            h1.scorer == h2.scorer for h1, h2 in zip(obs_hyps[:-1], obs_hyps[1:])
+        ), "all `obs_hyps` must have the same scorer"
         assert all(
-            h1.scorer != h2.scorer for h1, h2 in zip(refr_hyps, test_hyps)
-        ), "each `refr_hyps` and `test_hyps` pair must have unique scorers"
+            h1.scorer != h2.scorer for h1, h2 in zip(ref_hyps, obs_hyps)
+        ), "each `ref_hyps` and `obs_hyps` pair must have unique scorers"
         assert all(
-            h1.n_epochs == h2.n_epochs for h1, h2 in zip(refr_hyps, test_hyps)
-        ), "each `refr_hyps` and `test_hyps` pair must have the same n_epochs"
-        ## Q: Could use set() for those above.
-        ##    Or set scorer as the first available and check all equal.
+            h1.n_epochs == h2.n_epochs for h1, h2 in zip(ref_hyps, obs_hyps)
+        ), "each `ref_hyps` and `obs_hyps` pair must have the same n_epochs"
 
-        # Convert to dictionaries with sleep_ids and hypnograms
-        refr_hyps = {s: h for s, h in zip(sleep_ids, refr_hyps)}
-        test_hyps = {s: h for s, h in zip(sleep_ids, test_hyps)}
+        # Convert ref_hyps and obs_hyps to dictionaries with sleep_id keys and hypnogram values
+        ref_hyps = {s: h for s, h in zip(sleep_ids, ref_hyps)}
+        obs_hyps = {s: h for s, h in zip(sleep_ids, obs_hyps)}
 
         # Merge all hypnograms into a single MultiIndexed dataframe
-        refr = pd.concat(
-            pd.concat({s: h.as_int()}, names=["sleep_id"]) for s, h in refr_hyps.items()
+        ref = pd.concat(
+            pd.concat({s: h.as_int()}, names=["sleep_id"]) for s, h in ref_hyps.items()
         )
-        test = pd.concat(
-            pd.concat({s: h.as_int()}, names=["sleep_id"]) for s, h in test_hyps.items()
+        obs = pd.concat(
+            pd.concat({s: h.as_int()}, names=["sleep_id"]) for s, h in obs_hyps.items()
         )
-        data = pd.concat([refr, test], axis=1)
+        data = pd.concat([ref, obs], axis=1)
 
-        ########################################################################
-        # INDIVIDUAL-LEVEL AGREEMENT
-        ########################################################################
-
-        # Get individual-level averaged/weighted agreement scores
-        indiv_agree_avg = data.groupby(level=0).apply(self.multi_scorer).apply(pd.Series)
-        ## Q: Check speed against pd.DataFrame({s: multscore(hyps[s], hyps[s]) for s in subjects})
-
-        # Get individual-level one-vs-rest/un-weighted agreement scores
-        # Labels ensures the order of returned scores is known
-        # It also can be used to remove unused labels, but that will be taken care of later anyways
-        # skm_labels = [l for l in refr_hyps[sleep_ids[0]].hypno.cat.categories if l in data.values]
-        # skm will return an array of results, so mapping must be linear without skips
-        ## Q: Another option is to get Series.cat.codes for ints and use cat.categories for mapping
-        skm_labels = np.unique(data).tolist()
-        skm_mapping = {i: l for i, l in enumerate(skm_labels)}  # skm integers to YASA integers
-        mapping_int = refr_hyps[sleep_ids[0]].mapping_int.copy()  # YASA integers to YASA strings
-        # labels = refr_hyps[sleep_ids[0]].labels.copy()  # To preserve YASA ordering
-        # labels = [v for k, v in mapping_int.items() if k in skm_labels]  # To preserve YASA ordering
-        prfs_wrapper = lambda df: skm.precision_recall_fscore_support(
-            *df.values.T, beta=1, labels=skm_labels, average=None, zero_division=0
-        )
-        indiv_agree_ovr = (
-            data
-            # Get precision, recall, f1, and support for each individual sleep session
-            .groupby(level=0)
-            .apply(prfs_wrapper)
-            # Unpack arrays
-            .explode()
-            .apply(pd.Series)
-            # Add metric labels and prepend to index, creating MultiIndex
-            .assign(metric=["precision", "recall", "fbeta", "support"] * len(refr_hyps))
-            .set_index("metric", append=True)
-            # Convert stage column names to string labels
-            .rename_axis(columns="stage")
-            .rename(columns=skm_mapping)
-            .rename(columns=mapping_int)
-            # Remove all-zero rows (i.e., stages that were not present in the hypnogram)
-            .pipe(lambda df: df.loc[:, df.any()])
-            # Reshape so metrics are columns
-            .stack()
-            .unstack("metric")
-            .rename_axis(columns=None)
-            # Swap MultiIndex levels and sort so stages in standard YASA order
-            .swaplevel()
-            .sort_index(
-                level="stage", key=lambda x: x.map(lambda y: list(mapping_int.values()).index(y))
-            )
-        )
+        # Generate some mapping dictionaries to be used later in class methods
+        skm_labels = np.unique(data).tolist()  # all unique YASA integer codes in this hypno
+        skm2yasa_map = {i: l for i, l in enumerate(skm_labels)}  # skm order to YASA integers
+        yasa2yasa_map = ref_hyps[sleep_ids[0]].mapping_int.copy()  # YASA integer to YASA string
 
         # Set attributes
         self._data = data
         self._sleep_ids = sleep_ids
-        self._n_sleeps = len(sleep_ids)
-        self._refr_hyps = refr_hyps
-        self._test_hyps = test_hyps
-        self._refr_scorer = refr_hyps[sleep_ids[0]].scorer
-        self._test_scorer = test_hyps[sleep_ids[0]].scorer
+        self._ref_hyps = ref_hyps
+        self._obs_hyps = obs_hyps
+        self._ref_scorer = ref_hyps[sleep_ids[0]].scorer
+        self._obs_scorer = obs_hyps[sleep_ids[0]].scorer
         self._skm_labels = skm_labels
-        self._skm_mapping = skm_mapping
-        self._mapping_int = mapping_int
-        self._indiv_agree_avg = indiv_agree_avg
-        self._indiv_agree_ovr = indiv_agree_ovr
-        ## Q: Merge these to one individual agreement dataframe?
-        ##    Setting average="binary" to fill extra column in over dataframe
+        self._skm2yasa_map = skm2yasa_map
+        self._yasa2yasa_map = yasa2yasa_map
 
     def __repr__(self):
         # TODO v0.8: Keep only the text between < and >
-        s = "s" if self._n_sleeps > 1 else ""
+        s = "s" if self.n_sleeps > 1 else ""
         return (
-            f"<EpochByEpochEvaluation | Test hypnogram{s} scored by {self.test_scorer} evaluated "
-            f"against reference hypnogram{s} scored by {self.refr_scorer}, {self._n_sleeps} sleep "
-            f"session{s}>\n"
-            " - Use `.get_agreement()` to get agreement measures as a pandas.Series\n"
-            " - Use `.plot_hypnograms()` to plot the two hypnograms overlaid\n"
+            f"<EpochByEpochAgreement | Observed hypnogram{s} scored by {self.obs_scorer} "
+            f"evaluated against reference hypnogram{s} scored by {self.ref_scorer}, "
+            f"{self.n_sleeps} sleep session{s}>\n"
+            " - Use `.get_agreement()` to get agreement measures as a pandas DataFrame or Series\n"
+            " - Use `.plot_hypnograms()` to plot two overlaid hypnograms\n"
             "See the online documentation for more details."
         )
 
@@ -305,125 +262,203 @@ def data(self):
         """A :py:class:`pandas.DataFrame` including all hypnograms."""
         return self._data
 
-    @property
-    def refr_hyps(self):
-        """A dictionary of all reference YASA hypnograms with sleep IDs as keys."""
-        return self._refr_hyps
-
-    @property
-    def test_hyps(self):
-        """A dictionary of all test YASA hypnograms with sleep IDs as keys."""
-        return self._test_hyps
-
-    @property
-    def sleep_ids(self):
-        """A tuple of all sleep IDs."""
-        return self._sleep_ids
-
     @property
     def n_sleeps(self):
         """The number of unique sleep sessions."""
-        return self._n_sleeps
+        return len(self._sleep_ids)
 
     @property
-    def refr_scorer(self):
+    def ref_scorer(self):
         """The name of the reference scorer."""
-        return self._refr_scorer
-
-    @property
-    def test_scorer(self):
-        """The name of the test scorer."""
-        return self._test_scorer
+        return self._ref_scorer
 
     @property
-    def indiv_agree_avg(self):
-        """
-        A :py:class:`pandas.DataFrame` of ``refr_hyp``/``test_hyp`` average-based agreement scores
-        for each individual sleep session.
-
-        .. seealso:: :py:attr:`yasa.EpochByEvaluation.indiv_agree_ovr`
-        """
-        return self._indiv_agree_avg
-
-    @property
-    def indiv_agree_ovr(self):
-        """
-        A :py:class:`pandas.DataFrame` of ``refr_hyp``/``test_hyp`` one-vs-rest agreement scores
-        for each individual sleep session. Agreement scores are provided for each sleep stage.
-
-        .. seealso:: :py:attr:`yasa.EpochByEvaluation.indiv_agree_avg`
-        """
-        return self._indiv_agree_ovr
+    def obs_scorer(self):
+        """The name of the observed scorer."""
+        return self._obs_scorer
 
     @staticmethod
-    def multi_scorer(df, weights=None):
+    def multi_scorer(df, scorers):
         """Compute multiple agreement scores from a 2-column dataframe.
 
         This function offers convenience when calculating multiple agreement scores using
         :py:meth:`pandas.DataFrame.groupby.apply`. Scikit-learn doesn't include a function that
-        return multiple scores, and the GroupBy implementation of ``apply`` in pandas does not
+        returns multiple scores, and the GroupBy implementation of ``apply`` in pandas does not
         accept multiple functions.
 
         Parameters
         ----------
         df : :py:class:`pandas.DataFrame`
-            A :py:class:`pandas.DataFrame` with exactly 2 columns and length of *n_samples*.
-            The first column contains true values and second column contains predicted values.
+            A :py:class:`~pandas.DataFrame` with 2 columns and length of *n_samples*.
+            The first column contains reference values and second column contains observed values.
+            If a third column exists, it must contain sample weights to be passed to underlying
+            :py:mod:`sklearn.metrics` functions as ``sample_weight`` where applicable.
+        scorers : dictionary
+            The scorers to be used for evaluating agreement. A dictionary with scorer names (str) as
+            keys and functions as values.
 
-        weights : None or :py:class:`pandas.Series`
-            Sample weights passed to underlying :py:mod:`sklearn.metrics` functions when possible.
+        Returns
+        -------
+        scores : dict
+            A dictionary with scorer names (``str``) as keys and scores (``float``) as values.
+        """
+        assert isinstance(scorers, dict)
+        assert all(isinstance(k, str) and callable(v) for k, v in scorers.items())
+        if df.shape[1] == 3:
+            true, pred, weights = zip(*df.values)
+        else:
+            true, pred = zip(*df.values)  # Same as (df["col1"], df["col2"]) but teensy bit faster
+            weights = None
+        scores = {s: f(true, pred, weights) for s, f in scorers.items()}
+        return scores
+
+    def get_agreement(self, sample_weight=None, scorers=None):
+        """
+        Return a :py:class:`pandas.DataFrame` of weighted (i.e., averaged) agreement scores.
+
+        Parameters
+        ----------
+        self : :py:class:`~yasa.evaluation.EpochByEpochAgreement`
+            A :py:class:`~yasa.evaluation.EpochByEpochAgreement` instance.
+        sample_weight : None or :py:class:`pandas.Series`
+            Sample weights passed to underlying :py:mod:`sklearn.metrics` functions where possible.
             If a :py:class:`pandas.Series`, the index must match exactly that of
             :py:attr:`~yasa.Hypnogram.data`.
+        scorers : None, list, or dictionary
+            The scorers to be used for evaluating agreement. If None (default), default scorers are
+            used. If a list, the list must contain strings that represent metrics from the sklearn
+            metrics module (e.g., ``accuracy``, ``precision``). If more customization is desired, a
+            dictionary can be passed with scorer names (str) as keys and custom functions as values.
+            The custom functions should take 3 positional arguments (true values, predicted values,
+            and sample weights).
 
         Returns
         -------
-        scores : dict
-            A dictionary with scorer names (``str``) as keys and scores (``float``) as values.
+        agreement : :py:class:`pandas.DataFrame`
+            A :py:class:`~pandas.DataFrame` with agreement metrics as columns and sessions as rows.
         """
         assert (
-            isinstance(weights, type(None)) or weights in df
-        ), "`weights` must be None or a column in `df`"
-        if weights is not None:
-            raise NotImplementedError("Custom `weights` not currently supported")
-        t, p = zip(*df.values)  # Same as (df["col1"], df["col2"]) but teensy bit faster
-        # t = df["col1"].to_numpy()
-        # p = df["col2"].to_numpy()
-        w = df["col3"].to_numpy() if weights is not None else weights
-        ## Q: The dictionary below be compiled more concisely if we were comfortable accessing
-        ##    "private" attributes. I understand that's a no-no but I'm not exactly sure why.
-        ##     For example:
-        ##     >>> scorers = ["accuracy", "recall"]
-        ##     >>> funcs = { s: skm.__getattribute__(f"{s}_scorer") for s in scorers }
-        ##     >>> scores = { s: f(true, pred) for s, f in funcs.items() }
-        ##     Keywords could be applied as needed by checking f.__kwdefaults__
-        ##     This would offer an easy way for users to add their own scorers with an arg as well.
-        return {
-            "accuracy": skm.accuracy_score(t, p, normalize=True, sample_weight=w),
-            "balanced_acc": skm.balanced_accuracy_score(t, p, adjusted=False, sample_weight=w),
-            "kappa": skm.cohen_kappa_score(t, p, labels=None, weights=None, sample_weight=w),
-            "mcc": skm.matthews_corrcoef(t, p, sample_weight=w),
-            "precision": skm.precision_score(
-                t, p, average="weighted", sample_weight=w, zero_division=0
-            ),
-            "recall": skm.recall_score(t, p, average="weighted", sample_weight=w, zero_division=0),
-            "fbeta": skm.fbeta_score(
-                t, p, beta=1, average="weighted", sample_weight=w, zero_division=0
-            ),
-        }
+            isinstance(sample_weight, (type(None), pd.Series))
+        ), "`sample_weight` must be None or pandas Series"
+        assert isinstance(scorers, (type(None), list, dict))
+        if isinstance(scorers, list):
+            assert all(isinstance(x, str) for x in scorers)
+        elif isinstance(scorers, dict):
+            assert all(isinstance(k, str) and callable(v) for k, v in scorers.items())
+        if scorers is None:
+            # Create dictionary of default scorer functions
+            scorers = {
+                "accuracy": lambda t, p, w: skm.accuracy_score(
+                    t, p, normalize=True, sample_weight=w
+                ),
+                "balanced_acc": lambda t, p, w: skm.balanced_accuracy_score(
+                    t, p, adjusted=False, sample_weight=w
+                ),
+                "kappa": lambda t, p, w: skm.cohen_kappa_score(
+                    t, p, labels=None, weights=None, sample_weight=w
+                ),
+                "mcc": lambda t, p, w: skm.matthews_corrcoef(t, p, sample_weight=w),
+                "precision": lambda t, p, w: skm.precision_score(
+                    t, p, average="weighted", sample_weight=w, zero_division=0
+                ),
+                "recall": lambda t, p, w: skm.recall_score(
+                    t, p, average="weighted", sample_weight=w, zero_division=0
+                ),
+                "fbeta": lambda t, p, w: skm.fbeta_score(
+                    t, p, beta=1, average="weighted", sample_weight=w, zero_division=0
+                ),
+            }
+        elif isinstance(scorers, list):
+            # Convert the list to a dictionary of sklearn scorers
+            scorers = {s: skm.__getattribute__(f"{s}_scorer") for s in scorers}
+        # Make a copy of data since weights series might be added to it
+        df = self.data.copy()
+        if sample_weight is not None:
+            assert sample_weight.index == self.data.index, (
+                "If not ``None``, ``sample_weight`` Series must be a pandas Series with same index as `self.data`"
+            )
+            # Add weights as a third column for multi_scorer to use
+            df["weights"] = sample_weight
+        # Get individual-level averaged/weighted agreement scores
+        agreement = df.groupby(level=0).apply(self.multi_scorer, scorers=scorers).apply(pd.Series)
+        # Set attribute for later access
+        self._agreement = agreement
+        # Convert to Series if just one session being evaluated
+        if self.n_sleeps == 1:
+            agreement = agreement.squeeze().rename("agreement")
+        return agreement
+
+    def get_agreement_bystage(self, beta=1.0):
+        """
+        Return a :py:class:`pandas.DataFrame` of unweighted (i.e., one-vs-rest) agreement scores.
+
+        Parameters
+        ----------
+        self : :py:class:`~yasa.evaluation.EpochByEpochAgreement`
+            A :py:class:`~yasa.evaluation.EpochByEpochAgreement` instance.
+        beta : float
+            See :py:func:`sklearn.metrics.precision_recall_fscore_support`.
+
+        Returns
+        -------
+        agreement : :py:class:`pandas.DataFrame`
+            A :py:class:`~pandas.DataFrame` with agreement metrics as columns and a
+            :py:class:`~pandas.MultiIndex` with session and sleep stage as rows.
+        """
+        scorer = lambda df: skm.precision_recall_fscore_support(
+            *df.values.T, beta=beta, labels=self._skm_labels, average=None, zero_division=0
+        )
+        agreement = (
+            self.data
+            # Get precision, recall, f1, and support for each individual sleep session
+            .groupby(level=0)
+            .apply(scorer)
+            # Unpack arrays
+            .explode()
+            .apply(pd.Series)
+            # Add metric labels column and prepend it to index, creating MultiIndex
+            .assign(metric=["precision", "recall", "fbeta", "support"] * self.n_sleeps)
+            .set_index("metric", append=True)
+            # Convert stage column names to string labels
+            .rename_axis(columns="stage")
+            .rename(columns=self._skm2yasa_map)
+            .rename(columns=self._yasa2yasa_map)
+            # Remove all-zero columns (i.e., stages that were not present in the hypnogram)
+            .pipe(lambda df: df.loc[:, df.any()])
+            # Reshape so metrics are columns
+            .stack()
+            .unstack("metric")
+            .rename_axis(columns=None)
+            # Swap MultiIndex levels and sort so stages are in standard YASA order
+            .swaplevel()
+            .sort_index(
+                level="stage",
+                key=lambda x: x.map(lambda y: list(self._yasa2yasa_map.values()).index(y))
+            )
+        )
+        # Set attribute for later access
+        self._agreement_bystage = agreement
+        # Remove the MultiIndex if just one session being evaluated
+        if self.n_sleeps == 1:
+            agreement = agreement.reset_index(level=1, drop=True)
+        return agreement
 
     def summary(self, by_stage=False, **kwargs):
         """Return group-level agreement scores.
 
+        Default aggregated measures are mean absolute deviation, mean, std, min, median, and max.
+
         Parameters
         ----------
-        self : :py:class:`yasa.EpochByEvaluation`
-            A :py:class:`yasa.EpochByEvaluation` instance.
+        self : :py:class:`~yasa.evaluation.EpochByEpochAgreement`
+            A :py:class:`~yasa.evaluation.EpochByEpochAgreement` instance.
         by_stage : bool
-            If True, returned ``summary`` :py:class:`pandas.DataFrame` will include agreement scores
-            for each sleep stage, derived from one-vs-rest metrics. If False (default), ``summary``
-            will include agreement scores derived from average-based metrics.
+            If ``False`` (default), ``summary`` will include agreement scores derived from
+            average-based metrics. If ``True``, returned ``summary`` :py:class:`~pandas.DataFrame`
+            will include agreement scores for each sleep stage, derived from one-vs-rest metrics.
         **kwargs : key, value pairs
             Additional keyword arguments are passed to :py:meth:`pandas.DataFrame.groupby.agg`.
+            This can be used to customize the descriptive statistics returned.
 
         Returns
         -------
@@ -431,92 +466,112 @@ def summary(self, by_stage=False, **kwargs):
             A :py:class:`pandas.DataFrame` summarizing agreement scores across the entire dataset
             with descriptive statistics.
 
-            >>> ebe = yasa.EpochByEpochEvaluation(...)
+            >>> ebe = yasa.EpochByEpochAgreement(...)
+            >>> agreement = ebe.get_agreement()
             >>> ebe.summary()
 
-            This will give a :py:class:`pandas.DataFrame` where each row is an agreement metric and
+            This will give a :py:class:`~pandas.DataFrame` where each row is an agreement metric and
             each column is a descriptive statistic (e.g., mean, standard deviation).
             To control the descriptive statistics included as columns:
 
             >>> ebe.summary(func=["count", "mean", "sem"])
         """
+        assert self.n_sleeps > 1, (
+            "Summary scores can not be computed with only one hypnogram pair."
+        )
         assert isinstance(by_stage, bool), "`by_stage` must be True or False"
+        if by_stage:
+            assert hasattr(self, "_agreement_bystage"), (
+                "Must run `self.get_agreement_bystage` before obtaining by_stage summary results."
+            )
+        else:
+            assert hasattr(self, "_agreement"), (
+                "Must run `self.get_agreement` before obtaining summary results."
+            )
+        # Create a function for getting mean absolute deviation
         mad = lambda df: (df - df.mean()).abs().mean()
-        mad.__name__ = "mad"  # Pandas uses this to name the aggregated column
+        mad.__name__ = "mad"  # Pandas uses this lambda attribute to name the aggregated column
+        # Merge default and user kwargs
         agg_kwargs = {"func": [mad, "mean", "std", "min", "median", "max"]} | kwargs
         if by_stage:
             summary = (
-                self.indiv_agree_ovr.groupby("stage")
+                self
+                .agreement_bystage.groupby("stage")
                 .agg(**agg_kwargs)
-                .stack(0)
+                .stack(level=0)
                 .rename_axis(["stage", "metric"])
             )
         else:
-            summary = self.indiv_agree_avg.agg(**agg_kwargs).T.rename_axis("metric")
+            summary = self._agreement.agg(**agg_kwargs).T.rename_axis("metric")
             ## Q: Should we include a column that calculates agreement treating all hypnograms as
             ##    coming from one individual? Others sometimes report it, though I find it mostly
             ##    meaningless because of possible n_epochs imbalances between subjects. I vote no.
-            # summary.insert(0, "all", self.multi_scorer(self.data))
-        ## Q: Alternatively, we could remove the `by_stage` parameter and stack these into
-        ##    one merged DataFrame where the results that are *not* by-stage are included
-        ##    with an "all" stage label:
-        ## >>> summary = summary.assign(stage="all").set_index("stage", append=True).swaplevel()
-        ## >>> summary = pd.concat([summary, summary_ovr]).sort_index()
+            ##    >>> summary.insert(0, "all", self.multi_scorer(self.data))
+            ##    Alternatively, we could remove the `by_stage` parameter and stack these into
+            ##    one merged DataFrame where the results that are *not* by-stage are included
+            ##    with an "all" stage label:
+            ##    >>> summary = (
+            ##    >>>     summary.assign(stage="all").set_index("stage", append=True).swaplevel()
+            ##    >>> )
+            ##    >>> summary = pd.concat([summary, summary_ovr]).sort_index()
         return summary
 
     def get_sleep_stats(self):
         """
-        Return a :py:class:`pandas.DataFrame` of sleep statistics for each individual derived from
-        both reference and test scorers.
+        Return a :py:class:`pandas.DataFrame` of sleep statistics for each hypnogram derived from
+        both reference and observed scorers.
 
         .. seealso:: :py:meth:`yasa.Hypnogram.sleep_statistics`
 
-        .. seealso:: :py:class:`yasa.SleepStatsEvaluation`
+        .. seealso:: :py:class:`yasa.SleepStatsAgreement`
 
         Parameters
         ----------
-        self : :py:class:`yasa.EpochByEvaluation`
-            A :py:class:`yasa.EpochByEvaluation` instance.
+        self : :py:class:`yasa.EpochByEpochAgreement`
+            A :py:class:`yasa.EpochByEpochAgreement` instance.
 
         Returns
         -------
         sstats : :py:class:`pandas.DataFrame`
-            A :py:class:`pandas.DataFrame` with sleep statistics as columns and two rows for each
-            individual (one from reference scorer and another from test scorer).
+            A :py:class:`~pandas.DataFrame` with sleep statistics as columns and two rows for each
+            individual (one for reference scorer and another for observed scorer).
         """
         # Get all sleep statistics
-        refr_sstats = pd.DataFrame({s: h.sleep_statistics() for s, h in self.refr_hyps.items()})
-        test_sstats = pd.DataFrame({s: h.sleep_statistics() for s, h in self.test_hyps.items()})
+        ref_sstats = pd.DataFrame({s: h.sleep_statistics() for s, h in self._ref_hyps.items()})
+        obs_sstats = pd.DataFrame({s: h.sleep_statistics() for s, h in self._obs_hyps.items()})
         # Reshape and name axis
-        refr_sstats = refr_sstats.T.rename_axis("sleep_id")
-        test_sstats = test_sstats.T.rename_axis("sleep_id")
+        ref_sstats = ref_sstats.T.rename_axis("sleep_id")
+        obs_sstats = obs_sstats.T.rename_axis("sleep_id")
         # Convert to MultiIndex with new scorer level
-        refr_sstats = pd.concat({self.refr_scorer: refr_sstats}, names=["scorer"])
-        test_sstats = pd.concat({self.test_scorer: test_sstats}, names=["scorer"])
-        return pd.concat([refr_sstats, test_sstats])
+        ref_sstats = pd.concat({self.ref_scorer: ref_sstats}, names=["scorer"])
+        obs_sstats = pd.concat({self.obs_scorer: obs_sstats}, names=["scorer"])
+        # Concatenate into one DataFrame
+        sstats = pd.concat([ref_sstats, obs_sstats])
+        # Remove the MultiIndex if just one session being evaluated
+        if self.n_sleeps == 1:
+            sstats = sstats.reset_index(level=1, drop=True)
+        return sstats
 
     def get_confusion_matrix(self, sleep_id=None, agg_func=None, **kwargs):
         """
-        Return a ``refr_hyp``/``test_hyp``confusion matrix from either a single session or all
+        Return a ``ref_hyp``/``obs_hyp`` confusion matrix from either a single session or all
         sessions concatenated together.
 
         Parameters
         ----------
-        self : :py:class:`yasa.EpochByEvaluation`
-            A :py:class:`yasa.EpochByEvaluation` instance.
+        self : :py:class:`yasa.EpochByEpochAgreement`
+            A :py:class:`yasa.EpochByEpochAgreement` instance.
         sleep_id : None or a valid sleep ID
             If None (default), cross-tabulation is derived from the entire group dataset.
-            If a valid sleep ID, cross-tabulation is derived using only the reference and test
+            If a valid sleep ID, cross-tabulation is derived using only the reference and observed
             scored hypnograms from that sleep session.
-        ## Q: This keyword (agg_func) is too complicated, but I wanted your opinion on the best
-        ##    approach. And I wanted you to see the returned value when agg_func=None because it
-        ##    might be best to generate during __init__ to set and access as an attribute.
-        agg_func : str, list, or None
+        agg_func : None or str
             If None (default), group results returns a :py:class:`~pandas.DataFrame` complete with
-            all individual sleep session results. If not None, group results returns a
-            :py:class:`~pandas.DataFrame` aggregated across individual sleep sessions where
-            ``agg_func`` is passed as ``func`` parameter in :py:meth:`pandas.DataFrame.groupby.agg`.
-            Ignored if ``sleep_id`` is not None.
+            all individual session results. If not None, group results returns a
+            :py:class:`~pandas.DataFrame` aggregated across sessions where ``agg_func`` is passed as
+            ``func`` parameter in :py:meth:`pandas.DataFrame.groupby.agg`. For example, set
+            ``agg_func="sum"`` to get a single confusion matrix across all epochs that does not take
+            session into account.
         **kwargs : key, value pairs
             Additional keyword arguments are passed to :py:func:`sklearn.metrics.confusion_matrix`.
 
@@ -528,22 +583,59 @@ def get_confusion_matrix(self, sleep_id=None, agg_func=None, **kwargs):
 
         Examples
         --------
-        >>> ebe = yasa.EpochByEpochEvaluation(...)
-        >>> ebe.get_confusion_matrix()  # Return results from all individual subjects
-        >>> ebe.get_confusion_matrix(agg_func=["mean", "std"])  # Return summary results
-        >>> ebe.get_confusion_matrix(sleep_id="sub-002")  # Return results from one subject
+        >>> import yasa
+        >>> ref_hyps = [yasa.simulate_hypnogram(tib=90, scorer="Rater1", seed=i) for i in range(3)]
+        >>> obs_hyps = [h.simulate_similar(scorer="Rater2", seed=i) for i, h in enumerate(ref_hyps)]
+        >>> ebe = yasa.EpochByEpochAgreement(ref_hyps, obs_hyps)
+        >>> ebe.get_confusion_matrix(sleep_id=2)
+        Rater2  WAKE  N1  N2  N3  REM
+        Rater1
+        WAKE       1   2  23   0    0
+        N1         0   9  13   0    0
+        N2         0   6  71   0    0
+        N3         0  13  42   0    0
+        REM        0   0   0   0    0
+
+        >>> ebe.get_confusion_matrix()
+        Rater2           WAKE  N1  N2  N3  REM
+        sleep_id Rater1
+        1        WAKE      30   0   3   0   35
+                 N1         3   2   7   0    0
+                 N2        21  12   7   0    4
+                 N3         0   0   0   0    0
+                 REM        2   8  29   0   17
+        2        WAKE       1   2  23   0    0
+                 N1         0   9  13   0    0
+                 N2         0   6  71   0    0
+                 N3         0  13  42   0    0
+                 REM        0   0   0   0    0
+        3        WAKE      16   0   7  19   19
+                 N1         0   7   2   0    5
+                 N2         0  10  12   7    5
+                 N3         0   0  16  11    0
+                 REM        0  15  11  18    0
+
+        >>> ebe.get_confusion_matrix(agg_func="sum")
+        Rater2  WAKE  N1  N2  N3  REM
+        Rater1
+        WAKE      47   2  33  19   54
+        N1         3  18  22   0    5
+        N2        21  28  90   7    9
+        N3         0  13  58  11    0
+        REM        2  23  40  18   17
         """
         assert (
-            sleep_id is None or sleep_id in self.sleep_ids
+            sleep_id is None or sleep_id in self._sleep_ids
         ), "`sleep_id` must be None or a valid sleep ID"
+        assert isinstance(agg_func, (type(None), str)), "`agg_func` must be None or a str"
+        assert not ((self.n_sleeps == 1 or sleep_id is not None) and agg_func is not None), (
+            "`agg_func` must be None if plotting a single session."
+        )
         kwargs = {"labels": self._skm_labels} | kwargs
-        # Get confusion matrix for each individual sleep session
-        ## Q: Should this be done during __init__ and accessible via attribute?
-        ##    I'm a little unsure about what should happen in init and be accessed as a property
-        ##    vs what should require a function. Nothing takes so long that it feels like it
-        ##    couldn't just happen during __init__, leaving mostly just plotting functions as
-        ##    methods. But if that's the case, what's the benefit of being a class? Confused!!
-        conf_mats = (
+        # Generate a DataFrame with a confusion matrix for each session
+        #   Seems easier to just generate this whole thing and then either
+        #   extract a single one or aggregate across them all, depending on user request
+        confusion_matrices = (
             self.data
             # Get confusion matrix for each individual sleep session
             .groupby(level=0)
@@ -552,44 +644,51 @@ def get_confusion_matrix(self, sleep_id=None, agg_func=None, **kwargs):
             .explode()
             .apply(pd.Series)
             # Convert to MultiIndex with reference scorer as new level
-            .assign(**{self.refr_scorer: self._skm_labels * self.n_sleeps})
-            .set_index(self.refr_scorer, append=True)
-            .rename_axis(columns=self.test_scorer)
+            .assign(**{self.ref_scorer: self._skm_labels * self.n_sleeps})
+            .set_index(self.ref_scorer, append=True)
+            .rename_axis(columns=self.obs_scorer)
             # Convert sleep stage columns and indices to strings
-            .rename(columns=self._skm_mapping)
-            .rename(columns=self._mapping_int)
-            .rename(index=self._skm_mapping, level=self.refr_scorer)
-            .rename(index=self._mapping_int, level=self.refr_scorer)
+            .rename(columns=self._skm2yasa_map)
+            .rename(columns=self._yasa2yasa_map)
+            .rename(index=self._skm2yasa_map, level=self.ref_scorer)
+            .rename(index=self._yasa2yasa_map, level=self.ref_scorer)
         )
+        if self.n_sleeps == 1:
+            # If just one session, use the only session ID as the key, for simplified returned df
+            sleep_id = self._sleep_ids[0]
         if sleep_id is None:
             if agg_func is None:
-                mat = conf_mats
+                mat = confusion_matrices
             else:
-                mat = conf_mats.groupby(self.refr_scorer).agg(agg_func)
-                mat.columns = mat.columns.map("_".join).set_names(self.test_scorer)
+                mat = confusion_matrices.groupby(self.ref_scorer, sort=False).agg(agg_func)
         else:
-            mat = conf_mats.loc[sleep_id]
+            mat = confusion_matrices.loc[sleep_id]
         return mat
 
-    def plot_hypnograms(self, sleep_id=None, legend=True, ax=None, refr_kwargs={}, test_kwargs={}):
-        """Plot the two hypnograms, where the reference hypnogram is overlaid on the test hypnogram.
+    def plot_hypnograms(self, sleep_id=None, legend=True, ax=None, ref_kwargs={}, obs_kwargs={}):
+        """Plot the two hypnograms of one session overlapping on the same axis.
 
         .. seealso:: :py:func:`yasa.plot_hypnogram`
 
         Parameters
         ----------
-        sleep_id : None or a valid sleep ID
-            If a valid sleep ID, plot the reference and test hypnograms from on sleep session.
+        self : :py:class:`yasa.EpochByEpochAgreement`
+            A :py:class:`yasa.EpochByEpochAgreement` instance.
+        sleep_id : a valid sleep ID or None
+            The sleep session to plot. If multiple sessions are included in the
+            :py:class:`~yasa.EpochByEpochAgreement` instance, a ``sleep_id`` must be provided. If
+            only one session is present, ``None`` (default) will plot the two hypnograms of the
+            only session.
         legend : bool or dict
             If True (default) or a dictionary, a legend is added. If a dictionary, all key/value
             pairs are passed as keyword arguments to the :py:func:`matplotlib.pyplot.legend` call.
         ax : :py:class:`matplotlib.axes.Axes` or None
             Axis on which to draw the plot, optional.
-        refr_kwargs : dict
+        ref_kwargs : dict
             Keyword arguments passed to :py:func:`yasa.plot_hypnogram` when plotting the reference
             hypnogram.
-        test_kwargs : dict
-            Keyword arguments passed to :py:func:`yasa.plot_hypnogram` when plotting the test
+        obs_kwargs : dict
+            Keyword arguments passed to :py:func:`yasa.plot_hypnogram` when plotting the observed
             hypnogram.
 
         Returns
@@ -602,165 +701,118 @@ def plot_hypnograms(self, sleep_id=None, legend=True, ax=None, refr_kwargs={}, t
         .. plot::
 
             >>> from yasa import simulate_hypnogram
-            >>> hyp = simulate_hypnogram(seed=7)
-            >>> ax = hyp.evaluate(hyp.simulate_similar()).plot_hypnograms()
+            >>> hyp = simulate_hypnogram(scorer="Anthony", seed=19)
+            >>> ax = hyp.evaluate(hyp.simulate_similar(scorer="Alan", seed=68)).plot_hypnograms()
         """
         assert (
-            sleep_id is None or sleep_id in self.sleep_ids
+            sleep_id is None or sleep_id in self._sleep_ids
         ), "`sleep_id` must be None or a valid sleep ID"
         assert isinstance(legend, (bool, dict)), "`legend` must be True, False, or a dictionary"
-        assert isinstance(refr_kwargs, dict), "`refr_kwargs` must be a dictionary"
-        assert isinstance(test_kwargs, dict), "`test_kwargs` must be a dictionary"
-        assert (
-            not "ax" in refr_kwargs | test_kwargs
-        ), "'ax' can't be supplied to `refr_kwargs` or `test_kwargs`, use the `ax` keyword instead"
-        if sleep_id is None:
-            if self.n_sleeps == 1:
-                refr_hyp = self.refr_hyps[self.sleep_ids[0]]
-                test_hyp = self.test_hyps[self.sleep_ids[0]]
-            else:
-                raise NotImplementedError(
-                    "Multi-session plotting is not currently supported. 3 options being tested!"
-                )
+        assert isinstance(ref_kwargs, dict), "`ref_kwargs` must be a dictionary"
+        assert isinstance(obs_kwargs, dict), "`obs_kwargs` must be a dictionary"
+        assert not "ax" in ref_kwargs | obs_kwargs, (
+            "'ax' can't be supplied to `ref_kwargs` or `obs_kwargs`, use the `ax` keyword instead"
+        )
+        assert not (sleep_id is None and self.n_sleeps > 1), (
+            "Multi-session plotting is not currently supported. `sleep_id` must not be None when "
+            "multiple sessions are present"
+        )
+        # Select the session hypnograms to plot
+        if sleep_id is None and self.n_sleeps == 1:
+            ref_hyp = self._ref_hyps[self._sleep_ids[0]]
+            obs_hyp = self._obs_hyps[self._sleep_ids[0]]
         else:
-            refr_hyp = self.refr_hyps[sleep_id]
-            test_hyp = self.test_hyps[sleep_id]
-        plot_refr_kwargs = {"highlight": None, "alpha": 0.8}
-        plot_test_kwargs = {"highlight": None, "alpha": 0.8, "color": "darkcyan", "ls": "dashed"}
-        plot_refr_kwargs.update(refr_kwargs)
-        plot_test_kwargs.update(test_kwargs)
-        if ax is None:
-            ax = plt.gca()
-        refr_hyp.plot_hypnogram(ax=ax, **plot_refr_kwargs)
-        test_hyp.plot_hypnogram(ax=ax, **plot_test_kwargs)
-        if legend and "label" in plot_refr_kwargs | plot_test_kwargs:
+            ref_hyp = self._ref_hyps[sleep_id]
+            obs_hyp = self._obs_hyps[sleep_id]
+        # Set default plotting kwargs and merge with user kwargs
+        plot_ref_kwargs = {
+            "label": self.ref_scorer,
+            "highlight": None,
+            "color": "black",
+            "alpha": 0.8,
+        }
+        plot_obs_kwargs = {
+            "label": self.obs_scorer,
+            "highlight": None,
+            "color": "green",
+            "alpha": 0.8,
+            "ls": "dashed",
+        }
+        plot_ref_kwargs.update(ref_kwargs)
+        plot_obs_kwargs.update(obs_kwargs)
+        # Draw the hypnograms
+        ax = ref_hyp.plot_hypnogram(ax=ax, **plot_ref_kwargs)
+        ax = obs_hyp.plot_hypnogram(ax=ax, **plot_obs_kwargs)
+        # Add legend if desired
+        if legend:
             if isinstance(legend, dict):
                 ax.legend(**legend)
             else:
                 ax.legend()
         return ax
 
-    def plot_group_hypnogram_opt1(self, ax=None, **kwargs):
-        if ax is None:
-            ax = plt.gca()
-        palette = {"Inaccurate": "plum", "Accurate": "forestgreen"}
-        hue_order = list(palette)
-        hist_kwargs = dict(multiple="stack", stat="count", element="step", discrete=True, lw=0)
-        ser = self.data[self.refr_scorer].eq(self.data[self.test_scorer])
-        df = ser.rename("acc").replace({True: "Accurate", False: "Inaccurate"}).reset_index()
-        sns.histplot(data=df, x="Epoch", hue="acc", hue_order=hue_order, palette=palette, ax=ax)
-        ax.set_ylabel("Number of unique sleep sessions")
-        ax.set_xlabel("Epochs")
-        ax.margins(x=0, y=0)
-        return ax
 
-    def plot_group_hypnogram_opt2(self, ax=None, **kwargs):
-        from pingouin import compute_bootci
-
-        plot_kwargs = dict(lw=1, color="plum", alpha=1, label="7-epoch rolling average")
-        plot_kwargs.update(kwargs)
-        betw_kwargs = dict(lw=0, alpha=0.3, color=plot_kwargs["color"], label="95% bootstrapped CI")
-        if ax is None:
-            ax = plt.gca()
-        df = self.data[self.refr_scorer].eq(self.data[self.test_scorer]).rename("acc").reset_index()
-        probas = df.groupby("Epoch")["acc"].mean()
-        ci = df.groupby("Epoch")["acc"].apply(compute_bootci, None, "mean").apply(pd.Series)
-        ci = ci.rename(columns={0: "low", 1: "high"})
-        probas = probas.rolling(10, center=True).mean()
-        ci = ci.rolling(10, center=True).mean()
-        ax.fill_between(ci.index, ci["low"], ci["high"], **betw_kwargs)
-        ax.plot(probas.index, probas, **plot_kwargs)
-        ax.set_ylabel("Accuracy across sleep sessions")
-        ax.set_xlabel("Epochs")
-        ax.set_xlim(0, len(probas))
-        ax.set_ylim(0, 1)
-        ax.legend()
-        return ax
+################################################################################
+# SLEEP STATISTICS
+################################################################################
 
-    def plot_group_hypnogram_opt3(self, figsize=(7, 10), **kwargs):
-        imshow_kwargs = dict(cmap="Blues", interpolation="none")
-        imshow_kwargs.update(kwargs)
-        n_rows = self.n_sleeps
-        freq = self.refr_hyps[self.sleep_ids[0]].freq
-        freq_secs = pd.Timedelta(freq).total_seconds()
-        fig, axes = plt.subplots(nrows=n_rows, figsize=figsize, sharex=True, sharey=False)
-        for ax, (subj, data) in zip(axes, self.data.groupby(level=0)):
-            img = data.values.T
-            extent = (0, freq_secs * img.shape[1], img.shape[0] - 0.5, -0.5)
-            ax.imshow(img, extent=extent, aspect="auto", origin="upper", **imshow_kwargs)
-            ax.set_yticks([0, 1])
-            ax.set_yticklabels([self.refr_scorer, self.test_scorer])
-            ax.set_ylabel(subj, rotation=0, va="center")
-            ax.spines[["top", "bottom", "left", "right"]].set_visible(False)
-            if not ax.get_subplotspec().is_first_row():
-                ax.tick_params(left=False, labelleft=False)
-            if not ax.get_subplotspec().is_last_row():
-                ax.tick_params(bottom=False)
-                ax.set_xlabel("Time [s]")
-                ax.spines["bottom"].set_visible(False)
-        fig.align_ylabels()
-        return fig
-
-    def plot_roc(self, sleep_id=None, palette=None, ax=None, **kwargs):
-        """Plot ROC curves for each stage.
 
-        Parameters
-        ----------
-        palette : dict or None
-            If a dictionary, keys are stages and values are corresponding colors.
-        ax : :py:class:`matplotlib.axes.Axes`
-            Axis on which to draw the plot, optional.
-        **kwargs : key, value pairs
-            Additional keyword arguments are passed to the :py:func:`matplotlib.pyplot.plot` call.
+class SleepStatsAgreement:
+    """
+    Evaluate agreement between sleep statistics reported by two different scorers or scoring
+    methods.
 
-        Returns
-        -------
-        ax : :py:class:`matplotlib.axes.Axes`
-            Matplotlib Axes
-        """
-        assert (
-            sleep_id is None or sleep_id in self.sleep_ids
-        ), "`sleep_id` must be None or a valid sleep ID"
-        raise NotImplementedError("ROC plots will be implemented once YASA hypnograms have probas.")
+    Bias and limits-of-agreement (and their confidence intervals) are calculated for each sleep
+    statistic. How these are calculated depends on the sleep statistic's underlying error
+    distribution. See [Menghini2021]_ for details, but in brief:
 
+    * Bias: The difference between the two scorers (observed minus reference).
+        If sleep-statistic differences (observed minus reference) show proportional bias,
+        bias is represented as a regression equation that takes into account changes in bias as
+        a function of measurement value. Otherwise, bias is represented as the standard mean
+        difference.
+    * Limits-of-agreement: If sleep statistic differences show proportional bias, ...
+    * Confidence intervals: If sleep statistic differences follow a normal distribution,
+        confidence intervals are calculated using standard parametric methods. Otherwise,
+        bootstrapped confidence intervals are generated (see also ``bootstrap_cis``).
 
-#############################################################################
-# SLEEP STATISTICS
-#############################################################################
+    Observed sleep statistics can be corrected (i.e., ``calibrated``) to bring them into alignment
+    with the sleep statistics from the reference scorer.
 
+    Bias values are calculated as...
+    LOA ...
+    CI ...
 
-class SleepStatsEvaluation:
-    """
-    Evaluate agreement between two scorers (e.g., two different manual scorers or one manual scorer
-    and YASA's automatic staging) by comparing their summary sleep statistics derived from multiple
-    subjects or sessions.
 
-    .. warning::
-        :py:class:`yasa.evaluation.SleepStatsEvaluation` is a new YASA feature and the API is
-        subject to future change.
+    .. important::
+        Bias, limits-of-agreement, and confidence intervals are all calculated differently depending
+        on assumption violations. See Menghini et al., 2021 [Menghini2021]_ for details.
+
+    .. seealso:: :py:meth:`yasa.Hypnogram.sleep_statistics`
 
     .. versionadded:: 0.7.0
 
     Parameters
     ----------
-    refr_data : :py:class:`pandas.DataFrame`
+    ref_data : :py:class:`pandas.DataFrame`
         A :py:class:`pandas.DataFrame` with sleep statistics from the reference scorer.
-        Rows are individual sleep sessions and columns are individual sleep statistics.
-    test_data : :py:class:`pandas.DataFrame`
-        A :py:class:`pandas.DataFrame` with sleep statistics from the test scorer.
-        Shape, indices, and columns must be identical to ``refr_data``.
-    refr_scorer : str
-        Name of the reference scorer, used for labeling.
-    test_scorer : str
-        Name of the test scorer, used for labeling.
+        Rows are unique observations and columns are unique sleep statistics.
+    obs_data : :py:class:`pandas.DataFrame`
+        A :py:class:`pandas.DataFrame` with sleep statistics from the observed scorer.
+        Rows are unique observations and columns are unique sleep statistics.
+        Shape, index, and columns must be identical to ``ref_data``.
+    ref_scorer : str
+        Name of the reference scorer.
+    obs_scorer : str
+        Name of the observed scorer.
     alpha : float
-        Alpha cutoff used for all three tests.
-    normality_kwargs : dict
-        Keywords arguments passed to the :py:func:`pingouin.normality` call.
-    regression_kwargs : dict
-        Keywords arguments passed to the :py:func:`pingouin.linear_regression` call.
-    homoscedasticity_kwargs : dict
-        Keywords arguments passed to the :py:func:`pingouin.homoscedasticity` call.
+        Alpha cutoff used for all assumption tests.
+
+        .. note:: set ``alpha=1`` to ignore all corrections.
+    bootstrap_all_cis : bool
+        If ``True``, generate all 95% confidence intervals using a bootstrap resampling procedure.
+        Otherwise (``False``, default) use the resampling procedure only when discrepancy values
+        break normality assumptions.
     verbose : bool or str
         Verbose level. Default (False) will only print warning and error messages. The logging
         levels are 'debug', 'info', 'warning', 'error', and 'critical'. For most users the choice is
@@ -775,7 +827,7 @@ class SleepStatsEvaluation:
     ----------
     .. [Menghini2021] Menghini, L., Cellini, N., Goldstone, A., Baker, F. C., & de Zambotti, M.
                       (2021). A standardized framework for testing the performance of sleep-tracking
-                       technology: step-by-step guidelines and open-source code. Sleep, 44(2),
+                       technology: step-by-step guidelines and open-source code. SLEEP, 44(2),
                        zsaa170. https://doi.org/10.1093/sleep/zsaa170
 
     Examples
@@ -794,7 +846,7 @@ class SleepStatsEvaluation:
     >>> sstatsA = sstats.loc["Ref"]
     >>> sstatsB = sstats.loc["Test"]
     >>>
-    >>> sse = yasa.SleepStatsEvaluation(sstatsA, sstatsB)
+    >>> sse = yasa.SleepStatsAgreement(sstatsA, sstatsB)
     >>>
     >>> sse.summary()
            normal  unbiased  homoscedastic
@@ -847,156 +899,185 @@ class SleepStatsEvaluation:
 
     def __init__(
         self,
-        refr_data,
-        test_data,
+        ref_data,
+        obs_data,
         *,
-        refr_scorer="Reference",
-        test_scorer="Test",
-        normality_kwargs={"alpha": 0.05},
-        regression_kwargs={"alpha": 0.05},
-        homoscedasticity_kwargs={"alpha": 0.05},
+        ref_scorer="Reference",
+        obs_scorer="Observed",
+        alpha=0.05,
+        bootstrap_all_cis=False,
         verbose=True,
     ):
-        set_log_level(verbose)
 
-        assert isinstance(refr_data, pd.DataFrame), "`refr_data` must be a pandas DataFrame"
-        assert isinstance(test_data, pd.DataFrame), "`test_data` must be a pandas DataFrame"
+        assert isinstance(ref_data, pd.DataFrame), "`ref_data` must be a pandas DataFrame"
+        assert isinstance(obs_data, pd.DataFrame), "`obs_data` must be a pandas DataFrame"
         assert np.array_equal(
-            refr_data.index, test_data.index
-        ), "`refr_data` and `test_data` index values must be identical"
+            ref_data.index, obs_data.index
+        ), "`ref_data` and `obs_data` index values must be identical"
         assert (
-            refr_data.index.name == test_data.index.name
-        ), "`refr_data` and `test_data` index names must be identical"
+            ref_data.index.name == obs_data.index.name
+        ), "`ref_data` and `obs_data` index names must be identical"
         assert np.array_equal(
-            refr_data.columns, test_data.columns
-        ), "`refr_data` and `test_data` column values must be identical"
-        assert isinstance(refr_scorer, str), "`refr_scorer` must be a string"
-        assert isinstance(test_scorer, str), "`test_scorer` must be a string"
-        assert refr_scorer != test_scorer, "`refr_scorer` and `test_scorer` must be unique"
-        assert isinstance(normality_kwargs, dict), "`normality_kwargs` must be a dictionary"
-        assert isinstance(regression_kwargs, dict), "`regression_kwargs` must be a dictionary"
-        assert isinstance(homoscedasticity_kwargs, dict), "`homoscedasticity_kwargs` must be a dict"
-        assert "alpha" in normality_kwargs, "`normality_kwargs` must include 'alpha'"
-        assert "alpha" in regression_kwargs, "`regression_kwargs` must include 'alpha'"
-        assert "alpha" in homoscedasticity_kwargs, "`homoscedasticity_kwargs` must include 'alpha'"
-
-        # If refr_data and test_data indices are unnamed, name them
-        sleep_id_str = "sleep_id" if refr_data.index.name is None else refr_data.index.name
-        refr_data.index.name = sleep_id_str
-        test_data.index.name = sleep_id_str
-
-        # Get scorer discrepancies (i.e., differences, test minus reference)
-        discrepancies = test_data.sub(refr_data)
-
-        # Convert to MultiIndex with new scorer level
-        discrepancies = pd.concat({"difference": discrepancies}, names=["scorer"])
-        refr_data = pd.concat({refr_scorer: refr_data}, names=["scorer"])
-        test_data = pd.concat({test_scorer: test_data}, names=["scorer"])
-
-        # Merge dataframes and reshape to long format
-        data = pd.concat([refr_data, test_data, discrepancies])
+            ref_data.columns, obs_data.columns
+        ), "`ref_data` and `obs_data` column values must be identical"
+        assert isinstance(ref_scorer, str), "`ref_scorer` must be a string"
+        assert isinstance(obs_scorer, str), "`obs_scorer` must be a string"
+        assert ref_scorer != obs_scorer, "`ref_scorer` and `obs_scorer` must be unique"
+        assert isinstance(alpha, float) and 0 <= alpha <= 1, "`alpha` must be a number between 0 and 1, inclusive"
+        assert isinstance(bootstrap_all_cis, bool), "`bootstrap_all_cis` must be True or False"
+
+        # If `ref_data` and `obs_data` indices are unnamed, name them
+        session_key = "session_id" if ref_data.index.name is None else ref_data.index.name
+        ref_data.index.name = session_key
+        obs_data.index.name = session_key
+
+        # Get scorer differences (i.e., observed minus reference)
+        diff_data = obs_data.sub(ref_data)
+
+        # Prepend a "scorer" level to index of each individual dataframe, making MultiIndex
+        obs_data = pd.concat({obs_scorer: obs_data}, names=["scorer"])
+        ref_data = pd.concat({ref_scorer: ref_data}, names=["scorer"])
+        diff_data = pd.concat({"difference": diff_data}, names=["scorer"])
+        # Merge observed data, reference data, and differences
+        data = pd.concat([obs_data, ref_data, diff_data])
+        # Reshape to long-format with 3 columns (observed, reference, difference)
         data = (
-            data.melt(var_name="sstat", ignore_index=False)
-            .reset_index()
-            .pivot(columns="scorer", index=[sleep_id_str, "sstat"], values="value")
+            data.melt(var_name="sleep_stat", ignore_index=False)
             .reset_index()
+            .pivot(columns="scorer", index=["sleep_stat", session_key], values="value")
             .rename_axis(columns=None)
+            .sort_index()
         )
 
         # Remove sleep statistics that have no differences between scorers
-        stats_nodiff = data.groupby("sstat")["difference"].any().loc[lambda x: ~x].index.tolist()
-        data = data.query(f"~sstat.isin({stats_nodiff})")
-        for s in stats_nodiff:
-            logger.warning(f"All {s} differences are zero, removing from evaluation.")
-
-        ## NORMALITY ##
-        # Test difference data (test - reference) for normality at each sleep statistic
-        normality = (
-            data
-            .groupby("sstat")["difference"]
-            .apply(pg.normality, **normality_kwargs)
-            .droplevel(-1)
+        stats_with_nodiff = diff_data.any().loc[lambda x: ~x].index.tolist()
+        data = data.query(f"~sleep_stat.isin({stats_with_nodiff})")
+        for s in stats_with_nodiff:
+            logger.warning(f"Removed {s} from evaluation because all scorings were identical.")
+
+        ########################################################################
+        # TEST ASSUMPTION VIOLATIONS
+        ########################################################################
+
+        grouper = data.groupby("sleep_stat")  # For convenience
+
+        # Test SYSTEMATIC BIAS between the two scorers for each sleep statistic (do means differ?).
+        # This test is used to determine whether corrections are applied during calibration only.
+        systematic_bias = grouper["difference"].apply(pg.ttest, y=0).droplevel(-1)
+
+        # Test NORMALITY of difference values at each sleep statistic.
+        # This test is used to determine how confidence intervals for Bias and LoA are calculated.
+        normality = grouper["difference"].apply(pg.normality, alpha=alpha).droplevel(-1)
+
+        # Test PROPORTIONAL BIAS at each sleep statistic (do scorer diffs vary with ref measure?)
+        # This test is used to determine how Bias and LoA are calculated.
+        regr_f = lambda df: pg.linear_regression(df[ref_scorer], df[obs_scorer], alpha=alpha)
+        resid_f = lambda df: pd.Series(regr_f(df).residuals_, index=df.index.get_level_values(1))
+        proportional_bias = grouper.apply(regr_f).droplevel(-1).set_index("names", append=True)
+        proportional_bias = proportional_bias.swaplevel().sort_index()
+        residuals = grouper.apply(resid_f).stack().rename("residual")
+
+        # Test HETEROSCEDASTICITY at each sleep statistic.
+        # This test is used to determine how LoAs are calculated.
+        data = data.join(residuals)
+        homosc_columns = [ref_scorer, "difference", "residual"]
+        homosc_f = lambda df: pg.homoscedasticity(df[homosc_columns], alpha=alpha)
+        heteroscedasticity = data.groupby("sleep_stat").apply(homosc_f).droplevel(-1)
+        # Add same test for log-transformed values, also used for determining LoA calculation method
+        log_transform = lambda x: np.log(x + 1e-6)
+        backlog_transform = lambda x: np.exp(x) - 1e-6
+        logdata = data[[ref_scorer, obs_scorer]].applymap(log_transform)
+        logdata["difference"] = logdata[obs_scorer].sub(logdata[ref_scorer])
+        logdata["residual"] = logdata.groupby("sleep_stat").apply(resid_f).stack()#.rename("residual")
+        heteroscedasticity_log = logdata.groupby("sleep_stat").apply(homosc_f).droplevel(-1)
+        # data_exp = logdata[[ref_scorer, obs_scorer, "difference"]].applymap(backlog_transform)
+        # data_exp = logdata["difference"].map(backlog_transformer)
+
+        # Aggregate test results into a dataframe of True/False for later convenience.
+        violations = (
+            systematic_bias["p-val"].lt(alpha).to_frame("is_systematically_biased")
+            .join(~normality["normal"].rename("is_nonnormal"))
+            .join(proportional_bias.loc[ref_scorer, "pval"].lt(alpha).rename("is_proportionally_biased"))
+            .join(~heteroscedasticity["equal_var"].rename("is_heteroscedastic"))
+            .join(~heteroscedasticity_log["equal_var"].rename("is_log_heteroscedastic"))
         )
 
-        ## PROPORTIONAL BIAS ##
-        # Test each sleep statistic for proportional bias
-        prop_bias_results = []
-        residuals_results = []
-        for ss_name, ss_df in data.groupby("sstat"):
-            # Regress the difference scores on the reference scores
-            model = pg.linear_regression(
-                ss_df[refr_scorer], ss_df["difference"], **regression_kwargs
-            )
-            model.insert(0, "sstat", ss_name)
-            # Extract sleep-level residuals for later homoscedasticity tests
-            resid_dict = {
-                sleep_id_str: ss_df[sleep_id_str],
-                "sstat": ss_name,
-                "pbias_residual": model.residuals_,
-            }
-            resid = pd.DataFrame(resid_dict)
-            prop_bias_results.append(model)
-            residuals_results.append(resid)
-        # Add residuals to raw dataframe, used later when testing homoscedasticity
-        data = data.merge(pd.concat(residuals_results), on=[sleep_id_str, "sstat"])
-        # Handle proportional bias results
-        prop_bias = pd.concat(prop_bias_results)
-        # Save all the proportional bias models before removing intercept, for optional user access
-        prop_bias_full = prop_bias.reset_index(drop=True)
-        # Now remove intercept rows
-        prop_bias = prop_bias.query("names != 'Intercept'").drop(columns="names").set_index("sstat")
-        # Add True/False passing column for easy access
-        prop_bias["unbiased"] = prop_bias["pval"].ge(regression_kwargs["alpha"])
-
-        ## Test each statistic for homoscedasticity ##
-        columns = [refr_scorer, "difference", "pbias_residual"]
-        homoscedasticity_f = lambda df: pg.homoscedasticity(df[columns], **homoscedasticity_kwargs)
-        homoscedasticity = data.groupby("sstat").apply(homoscedasticity_f).droplevel(-1)
+        # Get name of method for each calculation.
+        # CI - standard or bootstrap
+        # Bias - standard or modeled
+        # LoA - standard, log_standard, modeled, or residuals
+        get_ci_method = lambda row: "bootstrap" if row.is_nonnormal else "standard"
+        get_bias_method = lambda row: "modeled" if row.is_proportionally_biased else "standard"
+        get_loa_method = lambda row: (
+            "modeled" if row.is_log_heteroscedastic else "log_standard"
+        ) if row.is_heteroscedastic else (
+            "residuals" if row.is_proportionally_biased else "standard"
+        )
+        methods = {
+            "loa": violations.apply(get_loa_method, axis=1),
+            "bias": violations.apply(get_bias_method, axis=1),
+            "ci": violations.apply(get_ci_method, axis=1),
+        }
+        methods = pd.DataFrame(methods)
+        if bootstrap_all_cis:
+            methods["ci"] = ["standard"] * len(violations)
 
-        # Set attributes
+        ########################################################################
+        # ATTRIBUTES
+        ########################################################################
+
+        self._ref_scorer = ref_scorer
+        self._obs_scorer = obs_scorer
+        self._n_sessions = data.index.get_level_values(session_key).nunique()
         self._data = data
+        self._diff_data = diff_data.droplevel(0).drop(columns=stats_with_nodiff)
+        self._systematic_bias = systematic_bias
         self._normality = normality
-        self._proportional_bias = prop_bias
-        self._proportional_bias_full = prop_bias_full  ## Q: Is this worth saving??
-        self._homoscedasticity = homoscedasticity
-        self._refr_scorer = refr_scorer
-        self._test_scorer = test_scorer
-        self._sleep_id_str = sleep_id_str
-        self._n_sleeps = data[sleep_id_str].nunique()
-        self._discrepancies = discrepancies.drop(columns=stats_nodiff)
+        self._proportional_bias = proportional_bias
+        self._heteroscedasticity = heteroscedasticity
+        self._violations = violations
+        self._methods = methods
+        # self._bias = bias
+        # self._bias_vars = bias_vars
+        # self._loas = loas
+        # self._loas_vars = loas_vars
+
 
     @property
     def data(self):
-        """A :py:class:`pandas.DataFrame` containing all sleep statistics from ``refr_data`` and
-        ``test_data`` as well as their difference scores (``test_data`` minus ``refr_data``).
+        """A :py:class:`pandas.DataFrame` containing all sleep statistics from ``ref_data`` and
+        ``obs_data`` as well as their difference scores (``obs_data`` minus ``ref_data``).
         """
         return self._data
 
+    @property
+    def methods(self):
+        return self._methods
+
+    @property
+    def biased(self):
+        return self._biased
+
     @property
     def discrepancies(self):
-        """A :py:class:`pandas.DataFrame` of ``test_data`` minus ``refr_data``."""
+        """A :py:class:`pandas.DataFrame` of ``obs_data`` minus ``ref_data``."""
         # # Pivot for session-rows and statistic-columns
         return self._discrepancies
 
     @property
-    def refr_scorer(self):
+    def ref_scorer(self):
         """The name of the reference scorer."""
-        return self._refr_scorer
-
-    @property
-    def test_scorer(self):
-        """The name of the test scorer."""
-        return self._test_scorer
+        return self._ref_scorer
 
     @property
-    def sleep_id_str(self):
-        """The name of the unique sleep session identifier."""
-        return self._sleep_id_str
+    def obs_scorer(self):
+        """The name of the observed scorer."""
+        return self._obs_scorer
 
     @property
-    def n_sleeps(self):
+    def n_sessions(self):
         """The number of sleep sessions."""
-        return self._n_sleeps
+        return self._n_sessions
 
     @property
     def normality(self):
@@ -1010,22 +1091,14 @@ def homoscedasticity(self):
 
     @property
     def proportional_bias(self):
-        """
-        A :py:class:`pandas.DataFrame` of proportional bias results for all sleep statistics, with
-        intercept terms removed.
-        """
-        return self._proportional_bias
-
-    @property
-    def proportional_bias_full(self):
         """A :py:class:`pandas.DataFrame` of proportional bias results for all sleep statistics."""
-        return self._proportional_bias_full
+        return self._proportional_bias
 
     def __repr__(self):
         # TODO v0.8: Keep only the text between < and >
         return (
-            f"<SleepStatsEvaluation | Test scorer {self.test_scorer} evaluated against reference "
-            f"scorer {self.refr_scorer}, {self.n_sleeps} sleep sessions>\n"
+            f"<SleepStatsAgreement | Observed scorer ('{self.obs_scorer}') evaluated against "
+            f"reference scorer ('{self.ref_scorer}'), {self.n_sessions} sleep sessions>\n"
             " - Use `.summary()` to get pass/fail values from various checks\n"
             " - Use `.plot_blandaltman()` to get a Bland-Altman-plot grid for sleep statistics\n"
             "See the online documentation for more details."
@@ -1034,17 +1107,264 @@ def __repr__(self):
     def __str__(self):
         return __repr__()
 
+    @staticmethod
+    def _get_standard_bias(x):
+        """Wrapper around `np.mean`, for organizational purposes. For internal use."""
+        return x.mean()
+
+    @staticmethod
+    def _get_standard_loas(x, agreement=1.96, std=None):
+        """Return standard lower and upper limits of agreement. For internal use only.
+
+        Parameters
+        ----------
+        x : array_like
+        agreement : float, int
+        std : float, int
+
+        Returns
+        -------
+        loas : :py:class:`numpy.ndarray`
+            A numpy array of shape (2,) where lower LoA is first and upper LoA is second.
+        """
+        if std is None:
+            std = x.std()
+        return x.mean() + np.array([-agreement, agreement]) * std
+
+    @staticmethod
+    def _get_regression_coefficient(x, y, index):
+        """Run linear regression and return a single coefficient.
+        
+        A wrapper to aid in computing CIs (with pg.compute_bootci). For internal use only.
+
+        Parameters
+        ----------
+        x : array_like
+            Predictor values
+        y : array_like
+            Outcome values
+        index: int
+            0 to get coefficient of intercept, N to get coefficient of Nth predictor
+
+        Returns
+        -------
+        coef: float
+            Regression coefficient of the term selected by `index` (0 is the intercept).
+        """
+        ## Q: Jump straight to np.lstsq for speed?
+        return pg.linear_regression(x, y, add_intercept=True).at[index, "coef"]
+
+    @staticmethod
+    def _get_standard_bias_ci(x, confidence=0.95):
+        """Return standard confidence intervals for bias."""
+        n = x.size
+        dof = x.size - 1
+        avg = x.mean()
+        std = x.std()
+        sem = np.sqrt(std**2 / n)
+        low, high = stats.t.interval(confidence, dof, loc=avg, scale=sem)
+        return low, high
+
+    @staticmethod
+    def _get_standard_loas_cis(x, agreement=1.96, std=None, confidence=0.95):
+        """Return standard confidence intervals for both lower LoA and upper LoA.
+
+        Parameters
+        ----------
+        x : array_like
+        agreement : float, int
+        std : float, int
+        confidence : float
+
+        Returns
+        -------
+        cis : dict
+            A dictionary of length 2, with keys "lower" and "upper" LoA, and values of tuples
+            containing "lower" and "upper" confidence intervals for each.
+        """
+        n = x.size
+        dof = x.size - 1
+        if std is None:
+            std = x.std()
+        lower, upper = DiscrepancyEvaluation._get_standard_loas(x, agreement)
+        sem = np.sqrt(3 * std**2 / n)
+        lower_lo, lower_hi = stats.t.interval(confidence, dof, loc=lower, scale=sem)
+        upper_lo, upper_hi = stats.t.interval(confidence, dof, loc=upper, scale=sem)
+        return {"lower": (lower_lo, lower_hi), "upper": (upper_lo, upper_hi)}
+
+    def get_bias(self, alpha=0.05, **bootci_kwargs):
+        results = []
+        for sstat, row in self.methods.iterrows():
+            # Extract difference values once for convenience.
+            diffs = self.data.loc[sstat, "difference"].to_numpy()
+
+            # Identify the method that will be used.
+            if self._violations.at[sstat, "is_proportionally_biased"]:
+                bias_method = "modeled"
+            else:
+                bias_method = "standard"
+
+            if self._violations.at[sstat, "is_nonnormal"]:
+                ci_method = "bootstrap"
+            else:
+                ci_method = "standard"
+
+            # Initialize dictionary to hold row information.
+            metadata = {"sleep_stat": sstat, "method": bias_method}
+
+            # Calculate necessary variables to get bias (either bias or b0 and b1).
+            if bias_method == "modeled":
+                # Systematic bias and constant bias present, model based on constant bias regression.
+                # x, y = self.data.loc[sstat, [self.ref_scorer, "difference"]].T.to_numpy()
+                ref = self.data.loc[sstat, self.ref_scorer].to_numpy()
+                b0 = self._get_regression_coefficient(ref, diffs, index=0)
+                b1 = self._get_regression_coefficient(ref, diffs, index=1)
+                # Confidence intervals for b0 and b1
+                if ci_method == "bootstrap":
+                    b0_lo, b0_hi = pg.compute_bootci(
+                        ref,
+                        diffs,
+                        func=lambda x, y: self._get_regression_coefficient(x, y, index=0),
+                        **bootci_kwargs,
+                    )
+                    b1_lo, b1_hi = pg.compute_bootci(
+                        ref,
+                        diffs,
+                        func=lambda x, y: self._get_regression_coefficient(x, y, index=1),
+                        **bootci_kwargs,
+                    )
+                elif ci_method == "standard":
+                    col1 = "CI[{:.1f}%]".format((1 - alpha / 2) * 100) 
+                    col2 = "CI[{:.1f}%]".format(alpha / 2 * 100) 
+                    b0_lo, b0_hi, b1_lo, b1_hi = pg.linear_regression(
+                        ref, diffs, alpha=alpha
+                    ).loc[[0, 1], [col1, col2]].to_numpy().flatten()
+
+            elif bias_method == "standard":
+                b0 = self._get_standard_bias(diffs)
+                if ci_method == "bootstrap":
+                    b0_lo, b0_hi = pg.compute_bootci(
+                        diffs, func=self._get_standard_bias, **bootci_kwargs
+                    )
+                elif ci_method == "standard":
+                    b0_lo, b0_hi = self._get_standard_bias_ci(diffs)
+            else:
+                raise ValueError(f"Unexpected bias method {bias_method}.")
+
+            results.append(dict(variable="b0", mean=b0, ci_lower=b0_lo, ci_upper=b0_hi, **metadata))
+            if bias_method == "modeled":
+                results.append(dict(variable="b1", mean=b1, ci_lower=b1_lo, ci_upper=b1_hi, **metadata))
+
+        df = pd.json_normalize(results).set_index(["method", "sleep_stat", "variable"]).sort_index()
+        self._bias_values = df
+
+    def get_loa(self, alpha=0.05, **bootci_kwargs):
+        results = []
+        for sstat, row in self.methods.iterrows():
+            # Extract difference values once for convenience.
+            diffs = self.data.loc[sstat, "difference"].to_numpy()
+
+            # Identify the method that will be used.
+            if self._violations.at[sstat, "is_heteroscedastic"]:
+                if self._violations.at[sstat, "is_log_heteroscedastic"]:
+                    loa_method = "modeled"
+                else:
+                    loa_method = "log_standard"
+            else:
+                if self._violations.at[sstat, "is_proportionally_biased"]:
+                    loa_method = "residuals"
+                else:
+                    loa_method = "standard"
+
+            if self._violations.at[sstat, "is_nonnormal"]:
+                ci_method = "bootstrap"
+            else:
+                ci_method = "standard"
+
+            metadata = {"sleep_stat": sstat, "method": loa_method}
+            if loa_method in ["standard", "residuals"]:
+                # Get standard deviation of calibrated (i.e., bias-adjusted) observed values
+                # calibration_func = lambda x: x - (b0 + b1 * x)  # b0 and b1 were generated this iteration above
+                # Get standard deviation of residuals?
+                if loa_method == "residuals":
+                    std = self.data.loc[sstat, "residual"].std()
+                else:
+                    std = diffs.std()  # dof=1
+                lower, upper = self._get_standard_loas(diffs, std=std)
+                if ci_method == "bootstrap":
+                    lower_lo, lower_hi = pg.compute_bootci(diffs, func=lambda x: self._get_standard_loas(x, std=std)[0], **bootci_kwargs)
+                    upper_lo, upper_hi = pg.compute_bootci(diffs, func=lambda x: self._get_standard_loas(x, std=std)[1], **bootci_kwargs)
+                elif ci_method == "standard":
+                    cis = self._get_standard_loas_cis(diffs, std=std)
+                    lower_lo, lower_hi = cis["lower"]
+                    upper_lo, upper_hi = cis["upper"]
+
+                results.append(dict(variable="lower", mean=lower, ci_lower=lower_lo, ci_upper=lower_hi, **metadata))
+                results.append(dict(variable="upper", mean=upper, ci_lower=upper_lo, ci_upper=upper_hi, **metadata))
+            elif loa_method == "modeled":
+                x, y = self.data.loc[sstat, [obs_scorer, "residual"]].T.values
+                c0 = self._get_regression_coefficient(x, y, index=0)
+                c1 = self._get_regression_coefficient(x, y, index=1)
+                if ci_method == "bootstrap":
+                    c0_lo, c0_hi = pg.compute_bootci(x, y, func=lambda x, y: self._get_regression_coefficient(x, y, index=0), **ci_kwargs)
+                    c1_lo, c1_hi = pg.compute_bootci(x, y, func=lambda x, y: self._get_regression_coefficient(x, y, index=1), **ci_kwargs)
+                elif ci_method == "standard":
+                    col1 = "CI[{:.1f}%]".format((1 - alpha / 2) * 100) 
+                    col2 = "CI[{:.1f}%]".format(alpha / 2 * 100) 
+                    c0_lo, c0_hi, c1_lo, c1_hi = pg.linear_regression(
+                        x, y, alpha=alpha
+                    ).loc[[0, 1], [col1, col2]].to_numpy().flatten()
+                else:
+                    raise ValueError(f"Unknown CI method {ci_method}.")
+                results.append(dict(variable="c0", mean=lower, ci_lower=lower_lo, ci_upper=lower_hi, **metadata))
+                results.append(dict(variable="c1", mean=upper, ci_lower=upper_lo, ci_upper=upper_hi, **metadata))
+            else:
+                raise ValueError(f"Unexpected LoA method {loa_method}.")
+        df = pd.json_normalize(results).set_index(["method", "sleep_stat", "variable"]).sort_index()
+        self._loa_values = df
+
+    def get_text_summary(self, fmt_dict=None):
+        """Return formatted bias and limits-of-agreement strings for each sleep statistic.
+        """
+        results = {}
+        # Bias
+        for (meth, sstat), df in self._bias_values.groupby(["method", "sleep_stat"]):
+            if meth == "standard":
+                fstr = "{mean:.2f} [{ci_lower:.2f}, {ci_upper:.2f}]"
+                bias = df.droplevel([0,1]).apply(lambda r: fstr.format(**r), axis=1).loc["b0"]
+            elif meth == "modeled":
+                fstr = "{b0_mean:.2f} [{b0_ci_lower:.2f}, {b0_ci_upper:.2f}] + {b1_mean:.2f} [{b1_ci_lower:.2f}, {b1_ci_upper:.2f}] x ref"
+                temp = df.unstack("variable").swaplevel(axis=1)
+                temp.columns = temp.columns.map("_".join)
+                bias = temp.apply(lambda r: fstr.format(**r), axis=1)[0]
+            results[sstat] = dict(bias=bias)
+        # LoA
+        for (meth, sstat), df in self._loa_values.groupby(["method", "sleep_stat"]):
+            if meth in ["standard", "residuals"]:
+                fstr = "{mean:.2f} [{ci_lower:.2f}, {ci_upper:.2f}]"
+                lower, upper = df.droplevel([0,1]).apply(lambda r: fstr.format(**r), axis=1).loc[["lower", "upper"]]
+            else:
+                fstr = "{c0_mean:.2f} [{c0_ci_lower:.2f}, {c0_ci_upper:.2f}] + {c1_mean:.2f} [{c1_ci_lower:.2f}, {c1_ci_upper:.2f}] x ref"
+                temp = df.unstack("variable").swaplevel(axis=1)
+                temp.columns = temp.columns.map("_".join)
+                lower = temp.apply(lambda r: fstr.format(**r), axis=1)[0]
+                upper = lower.copy()
+            results[sstat].update({"lower": lower, "upper": upper})
+
+        df = pd.DataFrame(results).T.rename_axis("sleep_stat")
+        return df
+
     def summary(self, **kwargs):
         """Return a summary dataframe highlighting whether tests passed for each sleep statistic.
 
         Parameters
         ----------
-        self : :py:class:`SleepStatsEvaluation`
-            A :py:class:`SleepStatsEvaluation` instance.
+        self : :py:class:`yasa.SleepStatsAgreement`
+            A :py:class:`yasa.SleepStatsAgreement` instance.
         **kwargs : key, value pairs
             Additional keyword arguments are passed to :py:meth:`pandas.DataFrame.groupby.agg`.
 
-            >>> ebe.summary(func=["mean", "sem", "min", "max"])
+            >>> ssa.summary(func=["mean", "sem", "min", "max"])
 
         Returns
         -------
@@ -1053,15 +1373,16 @@ def summary(self, **kwargs):
             normality, proportional bias, and homoscedasticity tests (for each sleep statistic).
         """
         series_list = [
+            self.bias["biased"],
             self.normality["normal"],
-            self.proportional_bias["unbiased"],
+            self.proportional_bias["bias_constant"],
             self.homoscedasticity["equal_var"].rename("homoscedastic"),
         ]
         summary = pd.concat(series_list, axis=1)
         mad = lambda df: (df - df.mean()).abs().mean()
         mad.__name__ = "mad"  # Pandas uses this to name the aggregated column
         agg_kwargs = {"func": [mad, "mean", "std"]} | kwargs
-        desc = self.data.drop(columns=self.sleep_id_str).groupby("sstat").agg(**agg_kwargs)
+        desc = self.data.groupby("sleep_stat").agg(**agg_kwargs)
         desc.columns = desc.columns.map("_".join)
         return summary.join(desc)
 
@@ -1082,13 +1403,13 @@ def plot_discrepancies_heatmap(self, sleep_stats=None, **kwargs):
         """
         assert isinstance(sleep_stats, (list, type(None))), "`sleep_stats` must be a list or None"
         if sleep_stats is None:
-            sleep_stats = self.data["sstat"].unique()  # All available sleep statistics
+            sleep_stats = self.data.index.get_level_values("sleep_stat").unique()
         heatmap_kwargs = {"cmap": "binary", "annot": True, "fmt": ".1f", "square": False}
         heatmap_kwargs["cbar_kws"] = dict(label="Normalized discrepancy %")
         if "cbar_kws" in kwargs:
             heatmap_kwargs["cbar_kws"].update(kwargs["cbar_kws"])
         heatmap_kwargs.update(kwargs)
-        table = self.discrepancies[sleep_stats]
+        table = self._diff_data[sleep_stats]
         # Normalize statistics (i.e., columns) between zero and one then convert to percentage
         table_norm = table.sub(table.min(), axis=1).div(table.apply(np.ptp)).multiply(100)
         if heatmap_kwargs["annot"]:
@@ -1123,12 +1444,12 @@ def plot_discrepancies_dotplot(self, pairgrid_kwargs={"palette": "winter"}, **kw
         kwargs_stripplot = {"size": 10, "linewidth": 1, "edgecolor": "white"}
         kwargs_stripplot.update(kwargs)
         # Initialize the PairGrid
-        height = 0.3 * len(self.discrepancies)
+        height = 0.3 * len(self._diff_data)
         aspect = 0.6
         kwargs_pairgrid = dict(hue=self.sleep_id_str, height=height, aspect=aspect)
         kwargs_pairgrid.update(pairgrid_kwargs)
         g = sns.PairGrid(
-            self.discrepancies.reset_index(), y_vars=[self.sleep_id_str], **kwargs_pairgrid
+            self._diff_data.reset_index(), y_vars=[self.sleep_id_str], **kwargs_pairgrid
         )
         # Draw the dots
         g.map(sns.stripplot, orient="h", jitter=False, **kwargs_stripplot)
@@ -1164,9 +1485,9 @@ def plot_blandaltman(self, facetgrid_kwargs={}, **kwargs):
         kwargs_blandaltman = dict(xaxis="y", annotate=False, edgecolor="black", facecolor="none")
         kwargs_blandaltman.update(kwargs)
         # Initialize a grid of plots with an Axes for each sleep statistic
-        g = sns.FacetGrid(self.data, col="sstat", **kwargs_facetgrid)
+        g = sns.FacetGrid(self.data.reset_index(), col="sleep_stat", **kwargs_facetgrid)
         # Draw Bland-Altman plot on each axis
-        g.map(pg.plot_blandaltman, self.test_scorer, self.refr_scorer, **kwargs_blandaltman)
+        g.map(pg.plot_blandaltman, self.obs_scorer, self.ref_scorer, **kwargs_blandaltman)
         # Adjust aesthetics
         for ax in g.axes.flat:
             # Tidy-up axis limits with symmetric y-axis and minimal ticks
@@ -1174,7 +1495,7 @@ def plot_blandaltman(self, facetgrid_kwargs={}, **kwargs):
             ax.set_ylim(-bound, bound)
             ax.yaxis.set_major_locator(plt.MaxNLocator(nbins=2, integer=True, symmetric=True))
             ax.xaxis.set_major_locator(plt.MaxNLocator(nbins=1, integer=True))
-        ylabel = " - ".join((self.test_scorer, self.refr_scorer))
+        ylabel = " - ".join((self.obs_scorer, self.ref_scorer))
         g.set_ylabels(ylabel)
         g.set_titles(col_template="{col_name}")
         g.tight_layout(w_pad=1, h_pad=2)
diff --git a/yasa/hypno.py b/yasa/hypno.py
index 23871b2..59b04a0 100644
--- a/yasa/hypno.py
+++ b/yasa/hypno.py
@@ -9,7 +9,7 @@
 from yasa.io import set_log_level
 from yasa.plotting import plot_hypnogram
 from yasa.sleepstats import transition_matrix
-from yasa.evaluation import EpochByEpochEvaluation
+from yasa.evaluation import EpochByEpochAgreement
 from pandas.api.types import CategoricalDtype
 
 __all__ = [
@@ -539,42 +539,46 @@ def copy(self):
             scorer=self.scorer,
         )
 
-    def evaluate(self, test_hyp):
+    def evaluate(self, obs_hyp):
         """Evaluate agreement between two hypnograms of the same sleep session.
 
-        Typically the reference hypnogram (i.e., ``self``) is a manually-scored hypnogram and the
-        test hypnogram (i.e., ``test_hyp``) is a hypnogram from an actigraphy/wearable device or
-        automated scorer (e.g., :py:meth:`yasa.SleepStaging.predict`).
+        For example, the reference hypnogram (i.e., ``self``) might be a manually-scored hypnogram
+        and the observed hypnogram (i.e., ``obs_hyp``) might be a hypnogram from actigraphy, a
+        wearable device, or an automated scorer (e.g., :py:meth:`yasa.SleepStaging.predict`).
 
         Parameters
         ----------
         self : :py:class:`yasa.Hypnogram`
             Reference or ground-truth hypnogram.
-        test_hyp : :py:class:`yasa.Hypnogram`
-            The test or to-be-evaluated hypnogram.
+        obs_hyp : :py:class:`yasa.Hypnogram`
+            The observed or to-be-evaluated hypnogram.
 
         Returns
         -------
-        ebe : :py:class:`yasa.EpochByEpochEvaluation`
-            See :py:class:`yasa.EpochByEpochEvaluation` documentation for more detail.
+        ebe : :py:class:`yasa.EpochByEpochAgreement`
+            See :py:class:`~yasa.EpochByEpochAgreement` documentation for more detail.
 
         Examples
         --------
+        >>> from yasa import simulate_hypnogram
+        >>> hyp_a = simulate_hypnogram(tib=90, scorer="AASM", seed=8)
+        >>> hyp_b = hyp_a.simulate_similar(scorer="YASA", seed=9)
+        >>> ebe = hyp_a.evaluate(hyp_b)
+        >>> ebe.get_agreement().round(3)
+        accuracy        0.550
+        balanced_acc    0.355
+        kappa           0.227
+        mcc             0.231
+        precision       0.515
+        recall          0.550
+        fbeta           0.524
+        Name: agreement, dtype: float64
+
         .. plot::
 
-            >>> import yasa
-            >>> hypno_ref = yasa.simulate_hypno(tib=600, seed=11)
-            >>> hypno_ref = yasa.Hypnogram(hypno_ref, scorer="Rater1")
-            >>> _, true_probas = hypno_ref.transition_matrix()
-            >>> hypno_test = yasa.simulate_hypno(tib=600, seed=12, trans_probas=true_probas)
-            >>> hypno_test = yasa.Hypnogram(hypno_test, scorer="Rater2")
-            >>> ebe = hypno_ref.evaluate(hypno_test)
-            >>> conf = ebe.get_confusion_matrix()
-            >>> perf = ebe.summary()
-            >>> # Plot the overlapping hypnograms
             >>> ebe.plot_hypnograms()
         """
-        return EpochByEpochEvaluation([self], [test_hyp])
+        return EpochByEpochAgreement([self], [obs_hyp])
 
     def find_periods(self, threshold="5min", equal_length=False):
         """Find sequences of consecutive values exceeding a certain duration in hypnogram.

From 349bfe7883785a89bc84c3967992f569010bcd61 Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Mon, 18 Dec 2023 16:51:59 -0600
Subject: [PATCH 30/43] class methods alphabetical order

---
 yasa/evaluation.py | 207 +++++++++++++++++++++------------------------
 1 file changed, 98 insertions(+), 109 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index 8749229..da75a4a 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -443,115 +443,6 @@ def get_agreement_bystage(self, beta=1.0):
             agreement = agreement.reset_index(level=1, drop=True)
         return agreement
 
-    def summary(self, by_stage=False, **kwargs):
-        """Return group-level agreement scores.
-
-        Default aggregated measures are
-
-        Parameters
-        ----------
-        self : :py:class:`~yasa.evaluation.EpochByEpochAgreement`
-            A :py:class:`~yasa.evaluation.EpochByEpochAgreement` instance.
-        by_stage : bool
-            If ``False`` (default), ``summary`` will include agreement scores derived from
-            average-based metrics. If ``True``, returned ``summary`` :py:class:`~pandas.DataFrame`
-            will include agreement scores for each sleep stage, derived from one-vs-rest metrics.
-        **kwargs : key, value pairs
-            Additional keyword arguments are passed to :py:meth:`pandas.DataFrame.groupby.agg`.
-            This can be used to customize the descriptive statistics returned.
-
-        Returns
-        -------
-        summary : :py:class:`pandas.DataFrame`
-            A :py:class:`pandas.DataFrame` summarizing agreement scores across the entire dataset
-            with descriptive statistics.
-
-            >>> ebe = yasa.EpochByEpochAgreement(...)
-            >>> agreement = ebe.get_agreement()
-            >>> ebe.summary()
-
-            This will give a :py:class:`~pandas.DataFrame` where each row is an agreement metric and
-            each column is a descriptive statistic (e.g., mean, standard deviation).
-            To control the descriptive statistics included as columns:
-
-            >>> ebe.summary(func=["count", "mean", "sem"])
-        """
-        assert self.n_sleeps > 1, (
-            "Summary scores can not be computed with only one hypnogram pair."
-        )
-        assert isinstance(by_stage, bool), "`by_stage` must be True or False"
-        if by_stage:
-            assert hasattr(self, "_agreement_bystage"), (
-                "Must run `self.get_agreement_bystage` before obtaining by_stage summary results."
-            )
-        else:
-            assert hasattr(self, "_agreement"), (
-                "Must run `self.get_agreement` before obtaining summary results."
-            )
-        # Create a function for getting mean absolute deviation
-        mad = lambda df: (df - df.mean()).abs().mean()
-        mad.__name__ = "mad"  # Pandas uses this lambda attribute to name the aggregated column
-        # Merge default and user kwargs
-        agg_kwargs = {"func": [mad, "mean", "std", "min", "median", "max"]} | kwargs
-        if by_stage:
-            summary = (
-                self
-                .agreement_bystage.groupby("stage")
-                .agg(**agg_kwargs)
-                .stack(level=0)
-                .rename_axis(["stage", "metric"])
-            )
-        else:
-            summary = self._agreement.agg(**agg_kwargs).T.rename_axis("metric")
-            ## Q: Should we include a column that calculates agreement treating all hypnograms as
-            ##    coming from one individual? Others sometimes report it, though I find it mostly
-            ##    meaningless because of possible n_epochs imbalances between subjects. I vote no.
-            ##    >> summary.insert(0, "all", self.multi_scorer(self.data))
-            ##    Alternatively, we could remove the `by_stage` parameter and stack these into
-            ##    one merged DataFrame where the results that are *not* by-stage are included
-            ##    with an "all" stage label:
-            ##    >>> summary = (
-            ##    >>>     summary.assign(stage="all").set_index("stage", append=True).swaplevel()
-            ##    >>> )
-            ##    >>> summary = pd.concat([summary, summary_ovr]).sort_index()
-        return summary
-
-    def get_sleep_stats(self):
-        """
-        Return a :py:class:`pandas.DataFrame` of sleep statistics for each hypnogram derived from
-        both reference and observed scorers.
-
-        .. seealso:: :py:meth:`yasa.Hypnogram.sleep_statistics`
-
-        .. seealso:: :py:class:`yasa.SleepStatsAgreement`
-
-        Parameters
-        ----------
-        self : :py:class:`yasa.EpochByEpochAgreement`
-            A :py:class:`yasa.EpochByEpochAgreement` instance.
-
-        Returns
-        -------
-        sstats : :py:class:`pandas.DataFrame`
-            A :py:class:`~pandas.DataFrame` with sleep statistics as columns and two rows for each
-            individual (one for reference scorer and another for test scorer).
-        """
-        # Get all sleep statistics
-        ref_sstats = pd.DataFrame({s: h.sleep_statistics() for s, h in self._ref_hyps.items()})
-        obs_sstats = pd.DataFrame({s: h.sleep_statistics() for s, h in self._obs_hyps.items()})
-        # Reshape and name axis
-        ref_sstats = ref_sstats.T.rename_axis("sleep_id")
-        obs_sstats = obs_sstats.T.rename_axis("sleep_id")
-        # Convert to MultiIndex with new scorer level
-        ref_sstats = pd.concat({self.ref_scorer: ref_sstats}, names=["scorer"])
-        obs_sstats = pd.concat({self.obs_scorer: obs_sstats}, names=["scorer"])
-        # Concatenate into one DataFrame
-        sstats = pd.concat([ref_sstats, obs_sstats])
-        # Remove the MultiIndex if just one session being evaluated
-        if self.n_sleeps == 1:
-            sstats = sstats.reset_index(level=1, drop=True)
-        return sstats
-
     def get_confusion_matrix(self, sleep_id=None, agg_func=None, **kwargs):
         """
         Return a ``ref_hyp``/``obs_hyp``confusion matrix from either a single session or all
@@ -665,6 +556,42 @@ def get_confusion_matrix(self, sleep_id=None, agg_func=None, **kwargs):
             mat = confusion_matrices.loc[sleep_id]
         return mat
 
+    def get_sleep_stats(self):
+        """
+        Return a :py:class:`pandas.DataFrame` of sleep statistics for each hypnogram derived from
+        both reference and observed scorers.
+
+        .. seealso:: :py:meth:`yasa.Hypnogram.sleep_statistics`
+
+        .. seealso:: :py:class:`yasa.SleepStatsAgreement`
+
+        Parameters
+        ----------
+        self : :py:class:`yasa.EpochByEpochAgreement`
+            A :py:class:`yasa.EpochByEpochAgreement` instance.
+
+        Returns
+        -------
+        sstats : :py:class:`pandas.DataFrame`
+            A :py:class:`~pandas.DataFrame` with sleep statistics as columns and two rows for each
+            individual (one for reference scorer and another for observed scorer).
+        """
+        # Get all sleep statistics
+        ref_sstats = pd.DataFrame({s: h.sleep_statistics() for s, h in self._ref_hyps.items()})
+        obs_sstats = pd.DataFrame({s: h.sleep_statistics() for s, h in self._obs_hyps.items()})
+        # Reshape and name axis
+        ref_sstats = ref_sstats.T.rename_axis("sleep_id")
+        obs_sstats = obs_sstats.T.rename_axis("sleep_id")
+        # Convert to MultiIndex with new scorer level
+        ref_sstats = pd.concat({self.ref_scorer: ref_sstats}, names=["scorer"])
+        obs_sstats = pd.concat({self.obs_scorer: obs_sstats}, names=["scorer"])
+        # Concatenate into one DataFrame
+        sstats = pd.concat([ref_sstats, obs_sstats])
+        # Remove the MultiIndex if just one session being evaluated
+        if self.n_sleeps == 1:
+            sstats = sstats.reset_index(level=1, drop=True)
+        return sstats
+
     def plot_hypnograms(self, sleep_id=None, legend=True, ax=None, ref_kwargs={}, obs_kwargs={}):
         """Plot the two hypnograms of one session overlapping on the same axis.
 
@@ -751,6 +678,68 @@ def plot_hypnograms(self, sleep_id=None, legend=True, ax=None, ref_kwargs={}, ob
                 ax.legend()
         return ax
 
+    def summary(self, by_stage=False, **kwargs):
+        """Return group-level agreement scores.
+
+        Default aggregated measures are mean absolute deviation, mean, std, min, median, and max.
+
+        Parameters
+        ----------
+        self : :py:class:`~yasa.evaluation.EpochByEpochAgreement`
+            A :py:class:`~yasa.evaluation.EpochByEpochAgreement` instance.
+        by_stage : bool
+            If ``False`` (default), ``summary`` will include agreement scores derived from
+            average-based metrics. If ``True``, returned ``summary`` :py:class:`~pandas.DataFrame`
+            will include agreement scores for each sleep stage, derived from one-vs-rest metrics.
+        **kwargs : key, value pairs
+            Additional keyword arguments are passed to :py:meth:`pandas.DataFrame.groupby.agg`.
+            This can be used to customize the descriptive statistics returned.
+
+        Returns
+        -------
+        summary : :py:class:`pandas.DataFrame`
+            A :py:class:`pandas.DataFrame` summarizing agreement scores across the entire dataset
+            with descriptive statistics.
+
+            >>> ebe = yasa.EpochByEpochAgreement(...)
+            >>> agreement = ebe.get_agreement()
+            >>> ebe.summary()
+
+            This will give a :py:class:`~pandas.DataFrame` where each row is an agreement metric and
+            each column is a descriptive statistic (e.g., mean, standard deviation).
+            To control the descriptive statistics included as columns:
+
+            >>> ebe.summary(func=["count", "mean", "sem"])
+        """
+        assert self.n_sleeps > 1, (
+            "Summary scores can not be computed with only one hypnogram pair."
+        )
+        assert isinstance(by_stage, bool), "`by_stage` must be True or False"
+        if by_stage:
+            assert hasattr(self, "_agreement_bystage"), (
+                "Must run `self.get_agreement_bystage` before obtaining by_stage summary results."
+            )
+        else:
+            assert hasattr(self, "_agreement"), (
+                "Must run `self.get_agreement` before obtaining summary results."
+            )
+        # Create a function for getting mean absolute deviation
+        mad = lambda df: (df - df.mean()).abs().mean()
+        mad.__name__ = "mad"  # Pandas uses this lambda attribute to name the aggregated column
+        # Merge default and user kwargs
+        agg_kwargs = {"func": [mad, "mean", "std", "min", "median", "max"]} | kwargs
+        if by_stage:
+            summary = (
+                self
+                .agreement_bystage.groupby("stage")
+                .agg(**agg_kwargs)
+                .stack(level=0)
+                .rename_axis(["stage", "metric"])
+            )
+        else:
+            summary = self._agreement.agg(**agg_kwargs).T.rename_axis("metric")
+        return summary
+
 
 ################################################################################
 # SLEEP STATISTICS

From e0ff2fe855e5af0775f1cf01d903774c6cf8778e Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Sun, 11 Feb 2024 14:19:52 -0500
Subject: [PATCH 31/43] SleepStatsAgreement major restructure

---
 yasa/evaluation.py | 1093 +++++++++++++++++++++-----------------------
 1 file changed, 517 insertions(+), 576 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index da75a4a..3911c9c 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -2,7 +2,7 @@
 YASA code for evaluating the agreement between two scorers (e.g., human vs YASA), either at the
 epoch-by-epoch level or at the level of summary sleep statistics.
 
-Analyses are modeled after the standardized framework proposed in Menghini et al., 2021, SLEEP.
+Analyses are influenced by the standardized framework proposed in Menghini et al., 2021, SLEEP.
 See the following resources:
 - https://doi.org/10.1093/sleep/zsaa170
 - https://sri-human-sleep.github.io/sleep-trackers-performance
@@ -12,6 +12,7 @@
 
 import numpy as np
 import pandas as pd
+import pingouin as pg
 import sklearn.metrics as skm
 from scipy import stats
 
@@ -72,15 +73,15 @@ class EpochByEpochAgreement:
 
     Notes
     -----
-    Many steps here are modeled after guidelines proposed in Menghini et al., 2021 [Menghini2021]_.
+    Many steps here are influenced by guidelines proposed in Menghini et al., 2021 [Menghini2021]_.
     See https://sri-human-sleep.github.io/sleep-trackers-performance/AnalyticalPipeline_v1.0.0.html
 
     References
     ----------
     .. [Menghini2021] Menghini, L., Cellini, N., Goldstone, A., Baker, F. C., & de Zambotti, M.
                       (2021). A standardized framework for testing the performance of sleep-tracking
-                       technology: step-by-step guidelines and open-source code. SLEEP, 44(2),
-                       zsaa170. https://doi.org/10.1093/sleep/zsaa170
+                      technology: step-by-step guidelines and open-source code. SLEEP, 44(2),
+                      zsaa170. https://doi.org/10.1093/sleep/zsaa170
 
     Examples
     --------
@@ -748,34 +749,21 @@ def summary(self, by_stage=False, **kwargs):
 
 class SleepStatsAgreement:
     """
-    Evaluate agreement between sleep statistics reported by two different scorers or scoring
-    methods.
-
-    Bias and limits-of-agreement (and their confidence intervals) are calcualted for each sleep
-    statistic. How these are calculated depends on the sleep statistic's underlying error
-    distribution. See [Menghini2021]_ for details, but in brief:
-
-    * Bias: The difference between the two scorers (observed minus reference).
-        If sleep-statistic differences (observed minus reference) show proportional bias,
-        bias is represented as a regression equation that takes into account changes in bias as
-        a function of measurement value. Otherwise, bias is represented as the standard mean
-        difference.
-    * Limits-of-agreement: If sleep statistic differences show proportional bias, ...
-    * Confidence intervals: If sleep statistic differences follow a normal distribution,
-        confidence intervals are calculated using standard parametric methods. Otherwise,
-        bootstrapped confidence intervals are generated (see also ``bootstrap_cis``).
-
-    Observed sleep statistics can be corrected (i.e., ``calibrated``) to bring them into alignment
-    with the sleep statistics from the reference scorer.
-
-    Bias values are calculated as...
-    LOA ...
-    CI ...
-
-
-    .. important::
-        Bias, limits-of-agreement, and confidence intervals are all calculated differently depending
-        on assumption violations. See Menghini et al., 2021 [Menghini2021]_ for details.
+    Evaluate agreement between sleep statistics reported by two different scorers.
+
+    Evaluation includes bias and limits of agreement (as well as their confidence intervals),
+    various plotting options, and calibration functions for correcting biased values from the
+    observed scorer.
+    Features include:
+
+    * Get summary calculations of bias, limits of agreement, and their confidence intervals.
+    * Test statistical assumptions of bias, limits of agreement, and their confidence intervals,
+      and apply corrective procedures when the assumptions are not met.
+    * Get bias and limits of agreement in a string-formatted table.
+    * Calibrate new data to correct for biases in observed data.
+    * Return individual calibration functions.
+    * Visualize discrepancies for outlier inspection.
+    * Visualize Bland-Altman plots.
 
     .. seealso:: :py:meth:`yasa.Hypnogram.sleep_statistics`
 
@@ -794,14 +782,17 @@ class SleepStatsAgreement:
         Name of the reference scorer.
     obs_scorer : str
         Name of the observed scorer.
+    agreement : float
+        Multiple of the standard deviation to plot agreement limits. The default is 1.96, which
+        corresponds to a 95% confidence interval if the differences are normally distributed.
+
+        .. note:: ``agreement`` gets adjusted for regression-modeled limits of agreement.
+    confidence : float
+        The confidence level (a proportion between 0 and 1) of the confidence intervals that are
+        applied to bias and limits of agreement. The same confidence level is applied to both
+        standard and bootstrapped confidence intervals.
     alpha : float
         Alpha cutoff used for all assumption tests.
-
-        .. note:: set ``alpha=1`` to ignore all corrections.
-    bootstrap_all_cis : bool
-        If ``True``, generate all 95% confidence intervals using a bootstrap resampling procedure.
-        Otherwise (``False``, default) use the resampling procedure only when discrepancy values
-        break normality assumptions.
     verbose : bool or str
         Verbose level. Default (False) will only print warning and error messages. The logging
         levels are 'debug', 'info', 'warning', 'error', and 'critical'. For most users the choice is
@@ -809,81 +800,103 @@ class SleepStatsAgreement:
 
     Notes
     -----
-    Many steps here are modeled after guidelines proposed in Menghini et al., 2021 [Menghini2021]_.
+    Sleep statistics that are identical between scorers are removed from analysis.
+
+    Many steps here are influenced by guidelines proposed in Menghini et al., 2021 [Menghini2021]_.
     See https://sri-human-sleep.github.io/sleep-trackers-performance/AnalyticalPipeline_v1.0.0.html
 
     References
     ----------
     .. [Menghini2021] Menghini, L., Cellini, N., Goldstone, A., Baker, F. C., & de Zambotti, M.
                       (2021). A standardized framework for testing the performance of sleep-tracking
-                       technology: step-by-step guidelines and open-source code. SLEEP, 44(2),
-                       zsaa170. https://doi.org/10.1093/sleep/zsaa170
+                      technology: step-by-step guidelines and open-source code. SLEEP, 44(2),
+                      zsaa170. https://doi.org/10.1093/sleep/zsaa170
 
     Examples
     --------
     >>> import pandas as pd
     >>> import yasa
     >>>
-    >>> # For this example, generate two fake datasets of sleep statistics
-    >>> hypsA = [yasa.simulate_hypnogram(tib=600, scorer="Ref", seed=i) for i in range(20)]
-    >>> hypsB = [h.simulate_similar(tib=600, scorer="Test", seed=i) for i, h in enumerate(hypsA)]
-    >>> # sstatsA = pd.Series(hypsA).map(lambda h: h.sleep_statistics()).apply(pd.Series)
-    >>> # sstatsB = pd.Series(hypsB).map(lambda h: h.sleep_statistics()).apply(pd.Series)
-    >>> # sstatsA.index = sstatsB.index = sstatsA.index.map(lambda x: f"sub-{x+1:03d}")
-    >>> ebe = yasa.EpochByEpochEvaluation(hypsA, hypsB)
-    >>> sstats = ebe.get_sleepstats()
-    >>> sstatsA = sstats.loc["Ref"]
-    >>> sstatsB = sstats.loc["Test"]
-    >>>
-    >>> sse = yasa.SleepStatsAgreement(sstatsA, sstatsB)
-    >>>
-    >>> sse.summary()
-           normal  unbiased  homoscedastic
-    sstat
-    %N1      True      True           True
-    %N2      True      True           True
-    %N3      True      True           True
-    %REM    False      True           True
-    SE       True      True           True
-    SOL     False     False           True
-    TST      True      True           True
-
-    Access more detailed statistical output of each test.
-
-    >>> sse.normality
-                  W      pval  normal
-    sstat
-    %N1    0.973407  0.824551    True
-    %N2    0.960684  0.557595    True
-    %N3    0.958591  0.516092    True
-    %REM   0.901733  0.044447   False
-    SE     0.926732  0.133580    True
-    SOL    0.774786  0.000372   False
-    TST    0.926733  0.133584    True
-    WASO   0.924288  0.119843    True
-
-    >>> sse.homoscedasticity.head(2)
-                  W      pval  equal_var
-    sstat
-    %N1    0.684833  0.508274       True
-    %N2    0.080359  0.922890       True
-
-    >>> sse.proportional_bias.round(3).head(2)
-            coef     se      T   pval     r2  adj_r2  CI[2.5%]  CI[97.5%]  unbiased
-    sstat
-    %N1   -0.487  0.314 -1.551  0.138  0.118   0.069    -1.146      0.172      True
-    %N2   -0.107  0.262 -0.409  0.688  0.009  -0.046    -0.658      0.444      True
+    >>> # Generate fake reference and observed datasets with similar sleep statistics
+    >>> ref_scorer = "Henri"
+    >>> obs_scorer = "Piéron"
+    >>> ref_hyps = [yasa.simulate_hypnogram(tib=600, scorer=ref_scorer, seed=i) for i in range(20)]
+    >>> obs_hyps = [h.simulate_similar(tib=600, scorer=obs_scorer, seed=i) for i, h in enumerate(ref_hyps)]
+    >>> # Generate sleep statistics from hypnograms using EpochByEpochAgreement
+    >>> eea = yasa.EpochByEpochAgreement(ref_hyps, obs_hyps)
+    >>> sstats = eea.get_sleep_stats()
+    >>> ref_sstats = sstats.loc[ref_scorer]
+    >>> obs_sstats = sstats.loc[obs_scorer]
+    >>> # Create SleepStatsAgreement instance
+    >>> ssa = yasa.SleepStatsAgreement(ref_sstats, obs_sstats)
+    >>> ssa.summary().round(1).head(3)
+    variable   bias_intercept             ...   uloa_parm
+    interval           center lower upper ...      center lower upper
+    sleep_stat                            ...
+    %N1                  -5.4 -13.9   3.2 ...         6.1   3.7   8.5
+    %N2                 -27.3 -49.1  -5.6 ...        12.4   7.2  17.6
+    %N3                  -9.1 -23.8   5.5 ...        20.4  12.6  28.3
+
+    >>> ssa.get_table().head(3)[["bias", "loa"]]
+                          bias                            loa
+    sleep_stat
+    %N1                   0.25  Bias ± 2.46 * (-0.00 + 1.00x)
+    %N2         -27.34 + 0.55x   Bias ± 2.46 * (0.00 + 1.00x)
+    %N3                   1.38   Bias ± 2.46 * (0.00 + 1.00x)
+
+    >>> ssa.assumptions.head(3)
+                unbiased  normal  constant_bias  homoscedastic
+    sleep_stat
+    %N1             True    True           True          False
+    %N2             True    True          False          False
+    %N3             True    True           True          False
+
+    >>> ssa.auto_methods.head(3)
+                bias   loa    ci
+    sleep_stat
+    %N1         parm  regr  parm
+    %N2         regr  regr  parm
+    %N3         parm  regr  parm
+
+    >>> ssa.get_table(bias_method="parm", loa_method="parm").head(3)[["bias", "loa"]]
+                 bias            loa
+    sleep_stat
+    %N1          0.25    -5.55, 6.06
+    %N2         -0.23  -12.87, 12.40
+    %N3          1.38  -17.67, 20.44
+
+    Generate a new observed dataset and calibrate it based on bias in the original observed data.
+
+    >>> new_hyps = [h.simulate_similar(tib=600, scorer="Kelly", seed=i) for i, h in enumerate(obs_hyps)]
+    >>> new_sstats = pd.Series(new_hyps).map(lambda h: h.sleep_statistics()).apply(pd.Series)
+    >>> new_sstats = new_sstats[["N1", "TST", "WASO"]]
+    >>> new_sstats.round(1).head(5)
+         N1    TST   WASO
+    0  42.5  439.5  147.5
+    1  84.0  550.0   38.5
+    2  53.5  489.0  103.0
+    3  57.0  469.5  120.0
+    4  71.0  531.0   69.0
+
+    >>> new_stats_calibrated = ssa.calibrate_stats(new_sstats, bias_method="auto")
+    >>> new_stats_calibrated.round(1).head(5)
+         N1    TST   WASO
+    0  42.9  433.8  150.0
+    1  84.4  544.2   41.0
+    2  53.9  483.2  105.5
+    3  57.4  463.8  122.5
+    4  71.4  525.2   71.5
 
     .. plot::
 
         >>> import matplotlib.pyplot as plt
-        >>> ax = sse.plot_discrepancies_heatmap()
+        >>> ax = ssa.plot_discrepancies_heatmap()
         >>> ax.set_title("Sleep statistic discrepancies")
         >>> plt.tight_layout()
 
     .. plot::
 
-        >>> sse.plot_blandaltman()
+        >>> ssa.plot_blandaltman()
     """
 
     def __init__(
@@ -893,9 +906,11 @@ def __init__(
         *,
         ref_scorer="Reference",
         obs_scorer="Observed",
+        agreement=1.96,
+        confidence=0.95,
         alpha=0.05,
-        bootstrap_all_cis=False,
         verbose=True,
+        bootstrap_kwargs={},
     ):
 
         assert isinstance(ref_data, pd.DataFrame), "`ref_data` must be a pandas DataFrame"
@@ -912,146 +927,138 @@ def __init__(
         assert isinstance(ref_scorer, str), "`ref_scorer` must be a string"
         assert isinstance(obs_scorer, str), "`obs_scorer` must be a string"
         assert ref_scorer != obs_scorer, "`ref_scorer` and `obs_scorer` must be unique"
-        assert isinstance(alpha, float) and 0 <= alpha <= 1, "`alpha` must be a number between 0 and 1, inclusive"
-        assert isinstance(bootstrap_all_cis, bool), "`bootstrap_all_cis` must be True or False"
+        assert isinstance(agreement, (float, int)) and agreement > 0, "`agreement` must be a number greater than 0"
+        assert isinstance(confidence, (float, int)) and 0 < confidence < 1, "`confidence` must be a number between 0 and 1"
+        assert isinstance(alpha, (float, int)) and 0 < alpha < 1, "`alpha` must be a number between 0 and 1"
+        assert isinstance(bootstrap_kwargs, dict), "`bootstrap_kwargs` must be a dictionary"
+        restricted_bootstrap_kwargs = ["confidence_level", "vectorized", "paired"]
+        assert all(k not in restricted_bootstrap_kwargs for k in bootstrap_kwargs), f"None of {restricted_bootstrap_kwargs} can be set by the user"
 
         # If `ref_data` and `obs_data` indices are unnamed, name them
         session_key = "session_id" if ref_data.index.name is None else ref_data.index.name
         ref_data.index.name = session_key
         obs_data.index.name = session_key
 
-        # Get scorer differences (i.e., observed minus reference)
-        diff_data = obs_data.sub(ref_data)
-
-        # Prepend a "scorer" level to index of each individual dataframe, making MultiIndex
-        obs_data = pd.concat({obs_scorer: obs_data}, names=["scorer"])
-        ref_data = pd.concat({ref_scorer: ref_data}, names=["scorer"])
-        diff_data = pd.concat({"difference": diff_data}, names=["scorer"])
-        # Merge observed data, reference data, and differences
-        data = pd.concat([obs_data, ref_data, diff_data])
-        # Reshape to long-format with 3 columns (observed, reference, difference)
+        # Reshape to long format DataFrame with 2 columns (observed, reference) and MultiIndex
         data = (
-            data.melt(var_name="sleep_stat", ignore_index=False)
-            .reset_index()
-            .pivot(columns="scorer", index=["sleep_stat", session_key], values="value")
+            pd.concat([obs_data, ref_data], keys=[obs_scorer, ref_scorer], names=["scorer"])
+            .melt(var_name="sleep_stat", ignore_index=False)
+            .pivot_table(index=["sleep_stat", session_key], columns="scorer", values="value")
             .rename_axis(columns=None)
             .sort_index()
         )
 
+        # Get scorer differences (i.e., observed minus reference)
+        data["difference"] = data[obs_scorer].sub(data[ref_scorer])
+
         # Remove sleep statistics that have no differences between scorers
-        stats_with_nodiff = diff_data.any().loc[lambda x: ~x].index.tolist()
-        data = data.query(f"~sleep_stat.isin({stats_with_nodiff})")
-        for s in stats_with_nodiff:
+        stats_rm = data.groupby("sleep_stat")["difference"].any().loc[lambda x: ~x].index.tolist()
+        data = data.drop(labels=stats_rm)
+        for s in stats_rm:
             logger.warning(f"Removed {s} from evaluation because all scorings were identical.")
 
+        # Create grouper variable for convenience
+        grouper = data.groupby("sleep_stat")
+
         ########################################################################
-        # TEST ASSUMPTION VIOLATIONS
+        # Generate parametric Bias and LoA for all sleep stats
         ########################################################################
-
-        grouper = data.groupby("sleep_stat")  # For convenience
-
-        # Test SYSTEMATIC BIAS between the two scorers for each sleep statistic (do means differ?).
-        # This test is used to determine whether corrections are applied during calibration only.
-        systematic_bias = grouper["difference"].apply(pg.ttest, y=0).droplevel(-1)
-
-        # Test NORMALITY of difference values at each sleep statistic.
-        # This test is used to determine how confidence intervals for Bias and LoA are calculated.
-        normality = grouper["difference"].apply(pg.normality, alpha=alpha).droplevel(-1)
-
-        # Test PROPORTIONAL BIAS at each sleep statistic (do scorer diffs vary as with ref measure?)
-        # This test is used to determine how Bias and LoA are calculated.
-        regr_f = lambda df: pg.linear_regression(df[ref_scorer], df[obs_scorer], alpha=alpha)
-        resid_f = lambda df: pd.Series(regr_f(df).residuals_, index=df.index.get_level_values(1))
-        proportional_bias = grouper.apply(regr_f).droplevel(-1).set_index("names", append=True)
-        proportional_bias = proportional_bias.swaplevel().sort_index()
-        residuals = grouper.apply(resid_f).stack().rename("residual")
-
-        # Test HETEROSCEDASTICITY at each sleep statistic.
-        # This test is used to determine how LoAs are calculated.
-        data = data.join(residuals)
-        homosc_columns = [ref_scorer, "difference", "residual"]
-        homosc_f = lambda df: pg.homoscedasticity(df[homosc_columns], alpha=alpha)
-        heteroscedasticity = data.groupby("sleep_stat").apply(homosc_f).droplevel(-1)
-        # Add same test for log-transformed values, also used for determining LoA calculation method
-        log_transform = lambda x: np.log(x + 1e-6)
-        backlog_transform = lambda x: np.exp(x) - 1e-6
-        logdata = data[[ref_scorer, obs_scorer]].applymap(log_transform)
-        logdata["difference"] = logdata[obs_scorer].sub(logdata[ref_scorer])
-        logdata["residual"] = logdata.groupby("sleep_stat").apply(resid_f).stack()#.rename("residual")
-        heteroscedasticity_log = logdata.groupby("sleep_stat").apply(homosc_f).droplevel(-1)
-        # data_exp = logdata[[ref_scorer, obs_scorer, "difference"]].applymap(backlog_transform)
-        # data_exp = logdata["difference"].map(backlog_transformer)
-
-        # Aggregate test results into a dataframe of True/False for later convenience.
-        violations = (
-            systematic_bias["p-val"].lt(alpha).to_frame("is_systematically_biased")
-            .join(~normality["normal"].rename("is_nonnormal"))
-            .join(proportional_bias.loc[ref_scorer, "pval"].lt(alpha).rename("is_proportionally_biased"))
-            .join(~heteroscedasticity["equal_var"].rename("is_heteroscedastic"))
-            .join(~heteroscedasticity_log["equal_var"].rename("is_log_heteroscedastic"))
+        n_sessions = data.index.get_level_values(session_key).nunique()
+        # Parametric Bias
+        parm_vals = grouper["difference"].mean().to_frame("bias_parm")
+        # Parametric LoA
+        parm_vals["lloa_parm"], parm_vals["uloa_parm"] = zip(
+            *grouper["difference"].apply(self._arr_to_loa, agreement=agreement)
         )
 
-        # Get name of method for each calculation.
-        # CI - standard or bootstrap
-        # Bias - standard or modeled
-        # LoA - standard, log_standard, modeled, or residuals
-        get_ci_method = lambda row: "bootstrap" if row.is_nonnormal else "standard"
-        get_bias_method = lambda row: "modeled" if row.is_proportionally_biased else "standard"
-        get_loa_method = lambda row: (
-            "modeled" if row.is_log_heteroscedastic else "log_standard"
-        ) if row.is_heteroscedastic else (
-            "residuals" if row.is_proportionally_biased else "standard"
-        )
-        methods = {
-            "loa": violations.apply(get_loa_method, axis=1),
-            "bias": violations.apply(get_bias_method, axis=1),
-            "ci": violations.apply(get_ci_method, axis=1),
-        }
-        methods = pd.DataFrame(methods)
-        if bootstrap_all_cis:
-            methods["ci"] = ["standard"] * len(violations)
-
         ########################################################################
-        # ATTRIBUTES
+        # Generate standard CIs for standard Bias and LoA for all sleep stats
         ########################################################################
+        t_parm = stats.t.ppf((1 + confidence) / 2, n_sessions - 1)
+        sem = grouper["difference"].sem(ddof=1)
+        # Parametric CIs for parametric Bias and LoA
+        parm_ci = pd.DataFrame({
+            "bias_parm-lower": parm_vals["bias_parm"] - sem * t_parm,
+            "bias_parm-upper": parm_vals["bias_parm"] + sem * t_parm,
+            "lloa_parm-lower": parm_vals["lloa_parm"] - sem * t_parm * np.sqrt(3),
+            "lloa_parm-upper": parm_vals["lloa_parm"] + sem * t_parm * np.sqrt(3),
+            "uloa_parm-lower": parm_vals["uloa_parm"] - sem * t_parm * np.sqrt(3),
+            "uloa_parm-upper": parm_vals["uloa_parm"] + sem * t_parm * np.sqrt(3),
+        })
 
-        self._ref_scorer = ref_scorer
-        self._obs_scorer = obs_scorer
-        self._n_sessions = data.index.get_level_values(session_key).nunique()
-        self._data = data
-        self._diff_data = diff_data.droplevel(0).drop(columns=stats_with_nodiff)
-        self._systematic_bias = systematic_bias
-        self._normality = normality
-        self._proportional_bias = proportional_bias
-        self._heteroscedasticity = heteroscedasticity
-        self._violations = violations
-        self._methods = methods
-        # self._bias = bias
-        # self._bias_vars = bias_vars
-        # self._loas = loas
-        # self._loas_vars = loas_vars
+        ########################################################################
+        # Generate regression/modeled (slope and intercept) Bias and LoA for all sleep stats
+        ########################################################################
+        # Run regression used to (a) model bias and (b) test for proportional/constant bias
+        bias_regr = grouper[[ref_scorer, "difference"]].apply(self._get_linregress_as_dict).apply(pd.Series)
+        # Get residuals from this regression, bc they are needed to run the next regression for homoscedasticity test
+        idx = data.index.get_level_values("sleep_stat")
+        slopes = bias_regr.loc[idx, "slope"].to_numpy()
+        intercepts = bias_regr.loc[idx, "intercept"].to_numpy()
+        predicted_values = data[ref_scorer].to_numpy() * slopes + intercepts
+        data["residuals"] = data[obs_scorer].to_numpy() - predicted_values
+        # Run regression used to (a) model LoA and (b) test for heteroscedasticity/homoscedasticity
+        data["residuals_abs"] = data["residuals"].abs()
+        loa_regr = grouper[[ref_scorer, "residuals_abs"]].apply(self._get_linregress_as_dict).apply(pd.Series)
+        # Stack the two regression dataframes together
+        regr = pd.concat({"bias": bias_regr, "loa": loa_regr}, axis=0)
 
+        ########################################################################
+        # Generate parametric CIs for regression/modeled Bias and LoA for all sleep stats
+        ########################################################################
+        t_regr = stats.t.ppf((1 + confidence) / 2, n_sessions - 2)  # dof=n-2 for regression
+        # Parametric CIs for modeled Bias and LoA
+        regr_ci = pd.DataFrame({
+            "intercept-lower": regr["intercept"] - regr["intercept_stderr"] * t_regr,
+            "intercept-upper": regr["intercept"] + regr["intercept_stderr"] * t_regr,
+            "slope-lower": regr["slope"] - regr["stderr"] * t_regr,
+            "slope-upper": regr["slope"] + regr["stderr"] * t_regr,
+        })
 
-    @property
-    def data(self):
-        """A :py:class:`pandas.DataFrame` containing all sleep statistics from ``ref_data`` and
-        ``obs_data`` as well as their difference scores (``obs_data`` minus ``ref_data``).
-        """
-        return self._data
+        ########################################################################
+        # Test all statistical assumptions
+        ########################################################################
+        assumptions = pd.DataFrame({
+            "unbiased": grouper["difference"].apply(lambda a: stats.ttest_1samp(a, 0).pvalue).ge(alpha),
+            "normal": grouper["difference"].apply(lambda a: stats.shapiro(a).pvalue).ge(alpha),
+            # "normal": grouper["difference"].apply(stats.shapiro).str[1].ge(alpha),
+            "constant_bias": bias_regr["pvalue"].ge(alpha),
+            "homoscedastic": loa_regr["pvalue"].ge(alpha),
+        })
 
-    @property
-    def methods(self):
-        return self._methods
+        ########################################################################
+        # Setting attributes
+        ########################################################################
 
-    @property
-    def biased(self):
-        return self._biased
+        # Merge the parametric and regression values for Bias and LoA
+        regr_vals = regr.unstack(0)[["slope", "intercept"]]
+        regr_vals.columns = regr_vals.columns.swaplevel().map("_".join)
+        vals = parm_vals.join(regr_vals).rename_axis("variable", axis=1)
+
+        # Merge the two CI dataframes for easier access
+        regr_ci = regr_ci.unstack(0)
+        regr_ci.columns = regr_ci.columns.swaplevel().map("_".join)
+        ci = parm_ci.join(regr_ci)
+        ci.columns = pd.MultiIndex.from_tuples(
+            tuples=ci.columns.str.split("-", expand=True), names=["variable", "interval"],
+        )
+        ci = pd.concat({"parm": ci, "boot": pd.DataFrame().reindex_like(ci)}, names=["ci_method"], axis=1)
+        ci = ci.sort_index(axis=1)  # Sort MultiIndex columns for cleanliness
 
-    @property
-    def discrepancies(self):
-        """A :py:class:`pandas.DataFrame` of ``obs_data`` minus ``ref_data``."""
-        # # Pivot for session-rows and statistic-columns
-        return self._discrepancies
+        self._agreement = agreement
+        self._confidence = confidence
+        self._bootstrap_kwargs = bootstrap_kwargs
+        self._ref_scorer = ref_scorer
+        self._obs_scorer = obs_scorer
+        self._n_sessions = n_sessions
+        self._data = data
+        self._assumptions = assumptions
+        self._regr = regr
+        self._vals = vals
+        self._ci = ci
+        self._bias_method_opts = ["parm", "regr", "auto"]
+        self._loa_method_opts = ["parm", "regr", "auto"]
+        self._ci_method_opts = ["parm", "boot", "auto"]
 
     @property
     def ref_scorer(self):
@@ -1069,26 +1076,47 @@ def n_sessions(self):
         return self._n_sessions
 
     @property
-    def normality(self):
-        """A :py:class:`pandas.DataFrame` of normality results for all sleep statistics."""
-        return self._normality
+    def data(self):
+        """A :py:class:`pandas.DataFrame` containing all sleep statistics from ``ref_data`` and
+        ``obs_data``. The derived difference and residual columns are dropped before returning.
+        Long format.
+        """
+        return self._data.drop(columns=["difference", "residuals", "residuals_abs"])
+
+    @property
+    def assumptions(self):
+        """A :py:class:`pandas.DataFrame` of boolean values, one row per sleep statistic, indicating
+        whether each tested assumption holds (p-value of the test is at or above ``alpha``).
+        """
+        return self._assumptions
 
     @property
-    def homoscedasticity(self):
-        """A :py:class:`pandas.DataFrame` of homoscedasticity results for all sleep statistics."""
-        return self._homoscedasticity
+    def sleep_statistics(self):
+        """Return a list of all sleep stats included in the agreement analyses."""
+        return self.data.index.get_level_values("sleep_stat").unique().to_list()
 
     @property
-    def proportional_bias(self):
-        """A :py:class:`pandas.DataFrame` of proportional bias results for all sleep statistics."""
-        return self._proportional_bias
+    def auto_methods(self):
+        """
+        A :py:class:`pandas.DataFrame` containing the methods applied when ``'auto'`` is selected.
+        """
+        return pd.concat(
+            [
+                self.assumptions["constant_bias"].map({True: "parm", False: "regr"}).rename("bias"),
+                self.assumptions["homoscedastic"].map({True: "parm", False: "regr"}).rename("loa"),
+                self.assumptions["normal"].map({True: "parm", False: "boot"}).rename("ci"),
+                self.assumptions["unbiased"].map({True: "calibrate", False: "uncalibrated"}).rename("calibration"),
+            ],
+            axis=1,
+        )
 
     def __repr__(self):
         # TODO v0.8: Keep only the text between < and >
         return (
             f"<SleepStatsAgreement | Observed scorer ('{self.obs_scorer}') evaluated against "
             f"reference scorer ('{self.ref_scorer}'), {self.n_sessions} sleep sessions>\n"
-            " - Use `.summary()` to get pass/fail values from various checks\n"
+            " - Use `.summary()` to get a dataframe of bias and limits of agreement for each sleep "
+            "statistic\n"
             " - Use `.plot_blandaltman()` to get a Bland-Altman-plot grid for sleep statistics\n"
             "See the online documentation for more details."
         )
@@ -1096,396 +1124,309 @@ def __repr__(self):
     def __str__(self):
         return __repr__()
 
-    @staticmethod
-    def _get_standard_bias(x):
-        """Wrapper around `np.mean`, for organizational purposes. For internal use."""
-        return x.mean()
+    ############################################################################
+    # Define some utility functions, mostly to aid with the use of df.apply and stats.bootstrap
+    ############################################################################
 
     @staticmethod
-    def _get_standard_loas(x, agreement=1.96, std=None):
-        """Return standard lower and upper limits of agreement. For internal use only.
-
-        Parameters
-        ----------
-        x : array_like
-        agreement : float, int
-        std : float, int
-
-        Returns
-        -------
-        loas : py:class:`numpy.ndarray`
-            A numpy array of shape (2,) where lower LoA is first and upper LoA is second.
-        """
-        if std is None:
-            std = x.std()
-        return x.mean() + np.array([-agreement, agreement]) * std
+    def _arr_to_loa(x, agreement):
+        """Return (lower, upper) limits of agreement: mean -/+ ``agreement`` sample SDs (ddof=1)."""
+        center, half_width = np.mean(x), agreement * np.std(x, ddof=1)
+        return center - half_width, center + half_width
 
     @staticmethod
-    def _get_regression_coefficient(x, y, index):
-        """Run linear regression and return a single coefficient.
-        
-        A wrapper to aid in computing CIs (with pg.compute_bootci). For internal use only.
-
-        Parameters
-        ----------
-        x : array_like
-            Predictor values
-        y : array_like
-            Outcome values
-        index: int
-            0 to get coefficient of intercept, N to get coefficient of Nth predictor
-
-        Returns
-        -------
-        coef: float
-            Regression coefficient of the effect of `b`.
+    def _get_linregress_as_dict(*args, **kwargs):
         """
-        ## Q: Jump straight to np.lstsq for speed?
-        return pg.linear_regression(x, y, add_intercept=True).at[index, "coef"]
+        Run :py:func:`scipy.stats.linregress` and return the result as a dictionary instead of a
+        named tuple. `intercept_stderr` is an extra attribute that is lost when the result is
+        converted as a tuple, so fields are read by name to keep it (e.g., when using groupby).
+        """
+        fit = stats.linregress(*args, **kwargs)
+        field_names = (
+            "slope",
+            "intercept",
+            "rvalue",
+            "pvalue",
+            "stderr",
+            "intercept_stderr",
+        )
+        return {name: getattr(fit, name) for name in field_names}
 
-    @staticmethod
-    def _get_standard_bias_ci(x, confidence=0.95):
-        """Return standard confidence intervals for bias."""
-        n = x.size
-        dof = x.size - 1
-        avg = x.mean()
-        std = x.std()
-        sem = np.sqrt(std**2 / n)
-        low, high = stats.t.interval(confidence, dof, loc=avg, scale=sem)
-        return low, high
+    def _generate_bootstrap_ci(self, sleep_stats):
+        """
+        Generate bootstrapped confidence intervals for bias and limits of agreement. This operates
+        in-place by filling missing values of the ``"boot"`` columns of ``self._ci`` with the
+        bootstrapped CIs (parametric CIs are calculated by default during initialization).
+        ``sleep_stats`` must be a list of unique strings found in ``self.sleep_statistics``.
+        Keywords in ``self._bootstrap_kwargs`` override the bootstrap defaults set below.
+        """
+        assert isinstance(sleep_stats, list), "`sleep_stats` must be a list"
+        assert len(sleep_stats) == len(set(sleep_stats)), "elements of `sleep_stats` must be unique"
+        assert all(isinstance(ss, str) for ss in sleep_stats), "elements of `sleep_stats` must be strings"
+        assert all(ss in self.sleep_statistics for ss in sleep_stats)
+        # Default bootstrap keyword arguments, overridden by any user-supplied keywords
+        bs_kwargs = {
+            "n_resamples": 1000,
+            "method": "BCa",
+            "confidence_level": self._confidence,  # should not change from parametric confidence level
+            "vectorized": False,  # should stay False
+            "paired": True,  # should be True, especially if method is BCa
+        } | self._bootstrap_kwargs
+
+        def boot_stats(ref_arr, diff_arr, rabs_arr):
+            # Wrap around all the stats to bootstrap, to avoid redundant scipy.stats.bootstrap calls
+            # Order of arrays is dependent on the column order used when calling grouper.apply
+            bias_parm = np.mean(diff_arr)
+            lloa_parm, uloa_parm = self._arr_to_loa(diff_arr, self._agreement)
+            bias_slope, bias_intercept = stats.linregress(ref_arr, diff_arr)[:2]
+            # Note this is not recalculating residuals each time for the next regression
+            loa_slope, loa_intercept = stats.linregress(ref_arr, rabs_arr)[:2]
+            return bias_parm, lloa_parm, uloa_parm, bias_intercept, bias_slope, loa_intercept, loa_slope
+
+        # !! Column order MUST match the order of arrays boot_stats expects as INPUT
+        # !! Variable order MUST match the order of floats boot_stats returns as OUTPUT
+        interval_order = ["lower", "upper"]
+        column_order = ["Reference", "difference", "residuals_abs"]
+        variable_order = [
+            "bias_parm",
+            "lloa_parm",
+            "uloa_parm",
+            "bias_intercept",
+            "bias_slope",
+            "loa_intercept",
+            "loa_slope",
+        ]
+        boot_ci = (self._data
+            .loc[sleep_stats, column_order]  # Extract the relevant sleep stats and columns
+            .groupby("sleep_stat")  # Group so the bootstrapping is applied once to each sleep stat
+            # Apply the bootstrap function, where tuple(df.to_numpy().T) converts the 3 columns
+            # of the passed dataframe to a tuple of 3 1D arrays
+            .apply(lambda df: stats.bootstrap(tuple(df.to_numpy().T), boot_stats, **bs_kwargs))
+            .map(lambda res: res.confidence_interval)  # Pull high/low CIs out of the results object
+            .explode()  # Break high and low CIs into separate rows
+            .to_frame("value")  # Convert to dataframe and name column
+            .assign(interval=interval_order * len(sleep_stats))  # Add a column indicating interval
+            .explode("value")  # Break low CI variables and high CI variables out of arrays
+            .assign(variable=variable_order * len(sleep_stats) * 2)  # Add a column indicating variable
+            .pivot(columns=["variable", "interval"], values="value")  # Go long to wide format
+            .sort_index(axis=1)  # Sort MultiIndex columns for cleanliness
+        )
+        # Merge with existing CI dataframe (only missing bootstrap values are filled)
+        self._ci["boot"] = self._ci["boot"].fillna(boot_ci)
 
-    @staticmethod
-    def _get_standard_loas_cis(x, agreement=1.96, std=None, confidence=0.95):
-        """Return standard confidence intervals for both lower LoA and upper LoA.
+    def get_table(self, bias_method="auto", loa_method="auto", ci_method="auto", fstrings={}):
+        """Return a pandas dataframe with bias, loa, bias_ci, loa_ci as string equations.
+        For all sleep stats, then index later what you want.
 
         Parameters
         ----------
-        x : array_like
-        agreement : float, int
-        std : float, int
-        confidence : float
+        bias_method : str
+            If ``'parm'`` (i.e., parametric), bias is always represented as the mean difference
+            (observed minus reference).
+            If ``'regr'`` (i.e., regression), bias is always represented as a regression equation.
+            If ``'auto'`` (default), bias is represented as a regression equation for sleep
+            statistics where the score differences are proportionally biased and as the mean
+            difference otherwise.
+        loa_method : str
+            If ``'parm'`` (i.e., parametric), limits of agreement are always represented as
+            bias +/- 1.96 standard deviations (where 1.96 can be adjusted through the ``agreement``
+            parameter). If ``'regr'`` (i.e., regression), they are always represented as a
+            regression equation. If ``'auto'`` (default), they are represented as a regression
+            equation for sleep statistics where the score differences are heteroscedastic and as
+            bias +/- 1.96 standard deviations otherwise.
+        ci_method : str
+            If ``'parm'`` (i.e., parametric), confidence intervals always come from a standard
+            t-distribution. If ``'boot'`` (i.e., bootstrap), they always come from a bootstrap
+            resampling procedure. If ``'auto'`` (default), the bootstrap is used for sleep
+            statistics where the distribution of score differences is non-normal and the
+            t-distribution otherwise.
+        fstrings : dict
+            Optional format strings used to render each table cell. Keys name the cell type (e.g.,
+            ``'bias_parm'``, ``'loa_regr_ci'``) and values are :py:meth:`str.format` templates.
+            If empty (default), built-in templates are used.
 
         Returns
         -------
-        cis : dict
-            A dictionary of length 2, with keys "lower" and "upper" LoA, and values of tuples
-            containing "lower" and "upper" confidence intervals for each.
-        """
-        n = x.size
-        dof = x.size - 1
-        if std is None:
-            std = x.std()
-        lower, upper = DiscrepancyEvaluation._get_standard_loas(x, agreement)
-        sem = np.sqrt(3 * std**2 / n)
-        lower_lo, lower_hi = stats.t.interval(confidence, dof, loc=lower, scale=sem)
-        upper_lo, upper_hi = stats.t.interval(confidence, dof, loc=upper, scale=sem)
-        return {"lower": (lower_lo, lower_hi), "upper": (upper_lo, upper_hi)}
-
-    def get_bias(self, alpha=0.05, **bootci_kwargs):
-        results = []
-        for sstat, row in self.methods.iterrows():
-            # Extract difference values once for convenience.
-            diffs = self.data.loc[sstat, "difference"].to_numpy()
-
-            # Identify the method that will be used.
-            if self._violations.at[sstat, "is_proportionally_biased"]:
-                bias_method = "modeled"
-            else:
-                bias_method = "standard"
-
-            if self._violations.at[sstat, "is_nonnormal"]:
-                ci_method = "bootstrap"
-            else:
-                ci_method = "standard"
-
-            # Initialize dictionary to hold row information.
-            metadata = {"sleep_stat": sstat, "method": bias_method}
-
-            # Calculate necessary variables to get bias (either bias or b0 and b1).
-            if bias_method == "modeled":
-                # Systematic bias and constant bias present, model based on constant bias regression.
-                # x, y = self.data.loc[sstat, [self.ref_scorer, "difference"]].T.to_numpy()
-                ref = self.data.loc[sstat, self.ref_scorer].to_numpy()
-                b0 = self._get_regression_coefficient(ref, diffs, index=0)
-                b1 = self._get_regression_coefficient(ref, diffs, index=1)
-                # Confidence intervals for b0 and b1
-                if ci_method == "bootstrap":
-                    b0_lo, b0_hi = pg.compute_bootci(
-                        ref,
-                        diffs,
-                        func=lambda x, y: self._get_regression_coefficient(x, y, index=0),
-                        **bootci_kwargs,
-                    )
-                    b1_lo, b1_hi = pg.compute_bootci(
-                        ref,
-                        diffs,
-                        func=lambda x, y: self._get_regression_coefficient(x, y, index=1),
-                        **bootci_kwargs,
-                    )
-                elif ci_method == "standard":
-                    col1 = "CI[{:.1f}%]".format((1 - alpha / 2) * 100) 
-                    col2 = "CI[{:.1f}%]".format(alpha / 2 * 100) 
-                    b0_lo, b0_hi, b1_lo, b1_hi = pg.linear_regression(
-                        ref, diffs, alpha=alpha
-                    ).loc[[0, 1], [col1, col2]].to_numpy().flatten()
-
-            elif bias_method == "standard":
-                b0 = self._get_standard_bias(diffs)
-                if ci_method == "bootstrap":
-                    b0_lo, b0_hi = pg.compute_bootci(
-                        diffs, func=self._get_standard_bias, **bootci_kwargs
-                    )
-                elif ci_method == "standard":
-                    b0_lo, b0_hi = self._get_standard_bias_ci(diffs)
-            else:
-                raise ValueError(f"Unexpected bias method {bias_method}.")
-
-            results.append(dict(variable="b0", mean=b0, ci_lower=b0_lo, ci_upper=b0_hi, **metadata))
-            if bias_method == "modeled":
-                results.append(dict(variable="b1", mean=b1, ci_lower=b1_lo, ci_upper=b1_hi, **metadata))
-
-        df = pd.json_normalize(results).set_index(["method", "sleep_stat", "variable"]).sort_index()
-        self._bias_values = df
-
-    def get_loa(self, alpha=0.05, **bootci_kwargs):
-        results = []
-        for sstat, row in self.methods.iterrows():
-            # Extract difference values once for convenience.
-            diffs = self.data.loc[sstat, "difference"].to_numpy()
-
-            # Identify the method that will be used.
-            if self._violations.at[sstat, "is_heteroscedastic"]:
-                if self._violations.at[sstat, "is_log_heteroscedastic"]:
-                    loa_method = "modeled"
-                else:
-                    loa_method = "log_standard"
-            else:
-                if self._violations.at[sstat, "is_proportionally_biased"]:
-                    loa_method = "residuals"
-                else:
-                    loa_method = "standard"
+        table : :py:class:`pandas.DataFrame`
+            A :py:class:`~pandas.DataFrame` of string representations of bias, limits of agreement,
+            and their confidence intervals for all sleep statistics.
 
-            if self._violations.at[sstat, "is_nonnormal"]:
-                ci_method = "bootstrap"
-            else:
-                ci_method = "standard"
-
-            metadata = {"sleep_stat": sstat, "method": loa_method}
-            if loa_method in ["standard", "residuals"]:
-                # Get standard deviation of calibrated (i.e., bias-adjusted) observed values
-                # calibration_func = lambda x: x - (b0 + b1 * x)  # b0 and b1 were generated this iteration above
-                # Get standard deviation of residuals?
-                if loa_method == "residuals":
-                    std = self.data.loc[sstat, "residual"].std()
-                else:
-                    std = diffs.std()  # dof=1
-                lower, upper = self._get_standard_loas(diffs, std=std)
-                if ci_method == "bootstrap":
-                    lower_lo, lower_hi = pg.compute_bootci(diffs, func=lambda x: self._get_standard_loas(x, std=std)[0], **bootci_kwargs)
-                    upper_lo, upper_hi = pg.compute_bootci(diffs, func=lambda x: self._get_standard_loas(x, std=std)[1], **bootci_kwargs)
-                elif ci_method == "standard":
-                    cis = self._get_standard_loas_cis(diffs, std=std)
-                    lower_lo, lower_hi = cis["lower"]
-                    upper_lo, upper_hi = cis["upper"]
-
-                results.append(dict(variable="lower", mean=lower, ci_lower=lower_lo, ci_upper=lower_hi, **metadata))
-                results.append(dict(variable="upper", mean=upper, ci_lower=upper_lo, ci_upper=upper_hi, **metadata))
-            elif loa_method == "modeled":
-                x, y = self.data.loc[sstat, [obs_scorer, "residual"]].T.values
-                c0 = self._get_regression_coefficient(x, y, index=0)
-                c1 = self._get_regression_coefficient(x, y, index=1)
-                if ci_method == "bootstrap":
-                    c0_lo, c0_hi = pg.compute_bootci(x, y, func=lambda x, y: self._get_regression_coefficient(x, y, index=0), **ci_kwargs)
-                    c1_lo, c1_hi = pg.compute_bootci(x, y, func=lambda x, y: self._get_regression_coefficient(x, y, index=1), **ci_kwargs)
-                elif ci_method == "standard":
-                    col1 = "CI[{:.1f}%]".format((1 - alpha / 2) * 100) 
-                    col2 = "CI[{:.1f}%]".format(alpha / 2 * 100) 
-                    c0_lo, c0_hi, c1_lo, c1_hi = pg.linear_regression(
-                        x, y, alpha=alpha
-                    ).loc[[0, 1], [col1, col2]].to_numpy().flatten()
-                else:
-                    raise ValueError(f"Unknown CI method {ci_method}.")
-                results.append(dict(variable="c0", mean=lower, ci_lower=lower_lo, ci_upper=lower_hi, **metadata))
-                results.append(dict(variable="c1", mean=upper, ci_lower=upper_lo, ci_upper=upper_hi, **metadata))
-            else:
-                raise ValueError(f"Unexpected LoA method {loa_method}.")
-        df = pd.json_normalize(results).set_index(["method", "sleep_stat", "variable"]).sort_index()
-        self._loa_values = df
+        Examples
+        --------
 
-    def get_text_summary(self, fmt_dict=None):
         """
+        assert isinstance(bias_method, str), "`bias_method` must be a string"
+        assert bias_method in self._bias_method_opts, f"`bias_method` must be one of {self._bias_method_opts}"
+        assert isinstance(loa_method, str), "`loa_method` must be a string"
+        assert loa_method in self._loa_method_opts, f"`loa_method` must be one of {self._loa_method_opts}"
+        assert isinstance(fstrings, dict), "`fstrings` must be a dictionary"
+        # Agreement gets adjusted when LoA is modeled
+        loa_regr_agreement = self._agreement * np.sqrt(np.pi / 2)
+        default_fstrings = {
+            "bias_parm": "{bias_parm_center:.2f}",
+            "bias_regr": "{bias_intercept_center:.2f} + {bias_slope_center:.2f}x",
+            "loa_parm": "{lloa_parm_center:.2f}, {uloa_parm_center:.2f}",
+            "loa_regr": (
+                "Bias \u00B1 {loa_regr_agreement:.2f} * "
+                "({loa_intercept_center:.2f} + {loa_slope_center:.2f}x)"
+            ),
+            "bias_parm_ci": "[{bias_parm_lower:.2f}, {bias_parm_upper:.2f}]",
+            "bias_regr_ci": (
+                "[{bias_intercept_lower:.2f}, {bias_intercept_upper:.2f}], "
+                "[{bias_slope_lower:.2f}, {bias_slope_upper:.2f}]"
+            ),
+            "loa_parm_ci": (
+                "[{lloa_parm_lower:.2f}, {lloa_parm_upper:.2f}], "
+                "[{uloa_parm_lower:.2f}, {uloa_parm_upper:.2f}]"
+            ),
+            "loa_regr_ci": (
+                "[{loa_intercept_lower:.2f}, {loa_intercept_upper:.2f}], "
+                "[{loa_slope_lower:.2f}, {loa_slope_upper:.2f}]"
+            ),
+        }
+        # User templates override defaults key by key, so a partial dict no longer raises KeyError
+        fstrings = default_fstrings | fstrings
+        values = self.summary(ci_method=ci_method)
+        values.columns = values.columns.map("_".join)  # Convert MultiIndex columns to Index
+        # Expose the adjusted agreement as a column so templates can reference it by name
+        values["loa_regr_agreement"] = loa_regr_agreement
+        all_strings = values.apply(
+            lambda row: {var: fstr.format(**row) for var, fstr in fstrings.items()}, axis=1
+        ).apply(pd.Series)
+        # For each family (bias, LoA), find the sleep statistics shown in parametric form
+        all_stats = self.sleep_statistics
+        parm_idx = {}
+        for kind, method in (("bias", bias_method), ("loa", loa_method)):
+            if method == "auto":
+                parm_idx[kind] = self.auto_methods.query(f"{kind} == 'parm'").index.tolist()
+            else:
+                parm_idx[kind] = all_stats if method == "parm" else []
+        # Stack the parametric and regression rows of each family under shared column names
+        pieces = []
+        for kind in ("bias", "loa"):
+            regr_idx = [ss for ss in all_stats if ss not in parm_idx[kind]]
+            parm = all_strings.loc[parm_idx[kind], [f"{kind}_parm", f"{kind}_parm_ci"]]
+            regr = all_strings.loc[regr_idx, [f"{kind}_regr", f"{kind}_regr_ci"]]
+            # Each frame is renamed from its OWN columns (regr was previously renamed from parm's)
+            parm.columns = parm.columns.str.replace("_parm", "")
+            regr.columns = regr.columns.str.replace("_regr", "")
+            pieces.append(pd.concat([parm, regr]))
+        bias, loa = pieces
+        return bias.join(loa, validate="1:1").sort_index(axis=0)
+
+    def summary(self, ci_method="auto"):
+        """
+        Return a dataframe that merges all the center values with their lower and upper confidence
+        interval bounds. Each variable has two CI options (parametric and bootstrapped), so this
+        is a convenient way to retrieve ALL values with the requested lower/upper bounds.
+        Returns a :py:class:`pandas.DataFrame` with 2-level MultiIndex columns: variable (e.g.,
+        ``bias_parm``) and interval (``center``, ``lower``, ``upper``).
+        """
-        results = {}
-        # Bias
-        for (meth, sstat), df in self._bias_values.groupby(["method", "sleep_stat"]):
-            if meth == "standard":
-                fstr = "{mean:.2f} [{ci_lower:.2f}, {ci_upper:.2f}]"
-                bias = df.droplevel([0,1]).apply(lambda r: fstr.format(**r), axis=1).loc["b0"]
-            elif meth == "modeled":
-                fstr = "{b0_mean:.2f} [{b0_ci_lower:.2f}, {b0_ci_upper:.2f}] + {b1_mean:.2f} [{b1_ci_lower:.2f}, {b1_ci_upper:.2f}] x ref"
-                temp = df.unstack("variable").swaplevel(axis=1)
-                temp.columns = temp.columns.map("_".join)
-                bias = temp.apply(lambda r: fstr.format(**r), axis=1)[0]
-            results[sstat] = dict(bias=bias)
-        # LoA
-        for (meth, sstat), df in self._loa_values.groupby(["method", "sleep_stat"]):
-            if meth in ["standard", "residuals"]:
-                fstr = "{mean:.2f} [{ci_lower:.2f}, {ci_upper:.2f}]"
-                lower, upper = df.droplevel([0,1]).apply(lambda r: fstr.format(**r), axis=1).loc[["lower", "upper"]]
-            else:
-                fstr = "{c0_mean:.2f} [{c0_ci_lower:.2f}, {c0_ci_upper:.2f}] + {c1_mean:.2f} [{c1_ci_lower:.2f}, {c1_ci_upper:.2f}] x ref"
-                temp = df.unstack("variable").swaplevel(axis=1)
-                temp.columns = temp.columns.map("_".join)
-                lower = temp.apply(lambda r: fstr.format(**r), axis=1)[0]
-                upper = lower.copy()
-            results[sstat].update({"lower": lower, "upper": upper})
-
-        df = pd.DataFrame(results).T.rename_axis("sleep_stat")
+        assert isinstance(ci_method, str), "`ci_method` must be a string"
+        assert ci_method in self._ci_method_opts, f"`ci_method` must be one of {self._ci_method_opts}"
+        # Sleep statistics whose CIs must come from the bootstrap under the requested method
+        if ci_method == "boot":
+            idx_boot = list(self.sleep_statistics)
+        elif ci_method == "auto":
+            idx_boot = self.auto_methods.query("ci == 'boot'").index.tolist()
+        else:
+            idx_boot = []  # parametric CIs need no bootstrapping
+        if idx_boot:
+            # Skip stats that already have bootstrapped CIs (e.g., if user calls "auto" and then "boot")
+            sleep_stats_booted = self._ci["boot"].dropna().index
+            sleep_stats_to_boot = [s for s in idx_boot if s not in sleep_stats_booted]
+            if sleep_stats_to_boot:
+                self._generate_bootstrap_ci(sleep_stats=sleep_stats_to_boot)
+        if ci_method == "auto":
+            # Split by membership; unpacking a groupby fails when only one CI method is present
+            idx_parm = [s for s in self.sleep_statistics if s not in idx_boot]
+            ci_vals = pd.concat([self._ci.loc[idx_parm, "parm"], self._ci.loc[idx_boot, "boot"]])
+        else:
+            ci_vals = self._ci[ci_method]
+        # Add an extra level to values columns, indicating they are the center interval
+        center_vals = pd.concat({"center": self._vals}, names=["interval"], axis=1).swaplevel(axis=1)
+        df = center_vals.join(ci_vals, how="left", validate="1:1").astype(float).sort_index(axis=1)
         return df
 
-    def summary(self, **kwargs):
-        """Return a summary dataframe highlighting whether tests passed for each sleep statistic.
+    def calibrate(self, sstats_c, bias_method="auto"):
+        """Return a dataframe of sleep statistics adjusted for the bias of the observed scorer.
+
+        ``sstats_c`` is a dataframe of sleep statistics (one column per sleep statistic) that is
+        adjusted according to the biases found in the observed scorer relative to the reference.
 
         Parameters
         ----------
-        self : :py:class:`yasa.SleepStatsAgreement`
-            A :py:class:`yasa.SleepStatsAgreement` instance.
-        **kwargs : key, value pairs
-            Additional keyword arguments are passed to :py:meth:`pandas.DataFrame.groupby.agg`.
-
-            >>> ssa.summary(func=["mean", "sem", "min", "max"])
+        sstats_c : :py:class:`pandas.DataFrame`
+            A :py:class:`pandas.DataFrame` with sleep statistics from an observed scorer.
+            Rows are unique observations and columns are unique sleep statistics.
+            All columns must be sleep statistics included in the agreement analyses.
+        bias_method : str
+            How bias is represented: ``'parm'``, ``'regr'``, or ``'auto'`` (default).
 
         Returns
         -------
-        summary : :py:class:`pandas.DataFrame`
-            A :py:class:`pandas.DataFrame` with boolean values indicating the pass/fail status for
-            normality, proportional bias, and homoscedasticity tests (for each sleep statistic).
-        """
-        series_list = [
-            self.bias["biased"],
-            self.normality["normal"],
-            self.proportional_bias["bias_constant"],
-            self.homoscedasticity["equal_var"].rename("homoscedastic"),
-        ]
-        summary = pd.concat(series_list, axis=1)
-        mad = lambda df: (df - df.mean()).abs().mean()
-        mad.__name__ = "mad"  # Pandas uses this to name the aggregated column
-        agg_kwargs = {"func": [mad, "mean", "std"]} | kwargs
-        desc = self.data.groupby("sleep_stat").agg(**agg_kwargs)
-        desc.columns = desc.columns.map("_".join)
-        return summary.join(desc)
-
-    def plot_discrepancies_heatmap(self, sleep_stats=None, **kwargs):
-        """Visualize session-level discrepancies, generally for outlier inspection.
+        obs_data_calibrated : :py:class:`pandas.DataFrame`
+            A :py:class:`pandas.DataFrame` with calibrated sleep statistics from an observed scorer.
 
-        Parameters
-        ----------
-        sleep_stats : list or None
-            List of sleep statistics to plot. Default (None) is to plot all sleep statistics.
-        **kwargs : key, value pairs
-            Additional keyword arguments are passed to the :py:func:`seaborn.heatmap` call.
+        .. seealso:: :py:meth:`~yasa.SleepStatsAgreement.get_calibration_func`
 
-        Returns
+        Example
         -------
-        ax : :py:class:`matplotlib.axes.Axes`
-            Matplotlib Axes
+        >>> hyps_a = [yasa.simulate_hypnogram(tib=600, scorer="Henri", seed=i) for i in range(20)]
+        >>> hyps_b = [h.simulate_similar(tib=600, scorer="Piéron", seed=i) for i in range(20)]
+        >>> hyps_c = [h.simulate_similar(tib=600, scorer="Piéron", seed=i) for i in range(10)]
+        # sstats_a = pd.Series(hyps_a).map(lambda h: h.sleep_statistics()).apply(pd.Series)
+        # sstats_b = pd.Series(hyps_b).map(lambda h: h.sleep_statistics()).apply(pd.Series)
+        # sstats_c = pd.Series(hyps_c).map(lambda h: h.sleep_statistics()).apply(pd.Series)
+        # sstats_a.index = sstats_b.index = sstats_a.index.map(lambda x: f"sub-{x+1:03d}")
+        >>> agr = yasa.SleepStatsAgreement(sstats_a, sstats_b)
+        >>> sstats_c_calibrated = agr.calibrate(sstats_c)
+        >>> print(sstats_c_calibrated.round(2).head(5))
+        """
+        assert isinstance(sstats_c, pd.DataFrame)
+        assert all(col in self.sleep_statistics for col in sstats_c)
+        assert isinstance(bias_method, str)
+        assert bias_method in self._bias_method_opts
+        # NOTE(review): bias is ADDED here; if bias is observed minus reference, confirm the sign
+        adjusted = {
+            "parm": sstats_c + self._vals["bias_parm"],
+            "regr": sstats_c * self._vals["bias_slope"] + self._vals["bias_intercept"],
+        }
+        if bias_method == "auto":
+            parm_idx = self.auto_methods.query("bias == 'parm'").index.to_list()
+            regr_idx = [ss for ss in self.sleep_statistics if ss not in parm_idx]
+            return adjusted["parm"][parm_idx].join(adjusted["regr"][regr_idx]).dropna(axis=1)
+        return adjusted.get(bias_method)
+
+    def get_calibration_func(sleep_stat):
         """
-        assert isinstance(sleep_stats, (list, type(None))), "`sleep_stats` must be a list or None"
-        if sleep_stats is None:
-            sleep_stats = self.data.index.get_level_values("sleep_stat").unique()
-        heatmap_kwargs = {"cmap": "binary", "annot": True, "fmt": ".1f", "square": False}
-        heatmap_kwargs["cbar_kws"] = dict(label="Normalized discrepancy %")
-        if "cbar_kws" in kwargs:
-            heatmap_kwargs["cbar_kws"].update(kwargs["cbar_kws"])
-        heatmap_kwargs.update(kwargs)
-        table = self._diff_data[sleep_stats]
-        # Normalize statistics (i.e., columns) between zero and one then convert to percentage
-        table_norm = table.sub(table.min(), axis=1).div(table.apply(np.ptp)).multiply(100)
-        if heatmap_kwargs["annot"]:
-            # Use raw values for writing
-            heatmap_kwargs["annot"] = table.to_numpy()
-        return sns.heatmap(table_norm, **heatmap_kwargs)
-
-    def plot_discrepancies_dotplot(self, pairgrid_kwargs={"palette": "winter"}, **kwargs):
-        """Visualize session-level discrepancies, generally for outlier inspection.
-
-        Parameters
-        ----------
-        pairgrid_kwargs : dict
-            Keywords arguments passed to the :py:class:`seaborn.PairGrid` call.
-        **kwargs : key, value pairs
-            Additional keyword arguments are passed to the :py:func:`seaborn.stripplot` call.
 
-        Returns
-        -------
-        g : :py:class:`seaborn.PairGrid`
-            A :py:class:`seaborn.FacetGrid` with sleep statistics dotplots on each axis.
+        .. seealso:: :py:meth:`~yasa.SleepStatsAgreement.calibrate`
 
         Examples
         --------
-        To plot a limited subset of sleep statistics, use the ``x_vars`` keyword argument of
-        :py:class:`seaborn.PairGrid`.
-
-        .. plot::
-            ## TODO: Example using x_vars
-        """
-        assert isinstance(pairgrid_kwargs, dict), "`pairgrid_kwargs` must be a dict"
-        kwargs_stripplot = {"size": 10, "linewidth": 1, "edgecolor": "white"}
-        kwargs_stripplot.update(kwargs)
-        # Initialize the PairGrid
-        height = 0.3 * len(self._diff_data)
-        aspect = 0.6
-        kwargs_pairgrid = dict(hue=self.sleep_id_str, height=height, aspect=aspect)
-        kwargs_pairgrid.update(pairgrid_kwargs)
-        g = sns.PairGrid(
-            self._diff_data.reset_index(), y_vars=[self.sleep_id_str], **kwargs_pairgrid
-        )
-        # Draw the dots
-        g.map(sns.stripplot, orient="h", jitter=False, **kwargs_stripplot)
-        # Adjust aesthetics
-        for ax in g.axes.flat:
-            ax.set(title=ax.get_xlabel())
-            ax.margins(x=0.3)
-            ax.yaxis.grid(True)
-            ax.tick_params(left=False)
-        g.set(xlabel="", ylabel="")
-        sns.despine(left=True, bottom=True)
-        return g
-
-    def plot_blandaltman(self, facetgrid_kwargs={}, **kwargs):
-        """
-
-        **Use col_order=sstats_order for plotting a subset.
-
-        Parameters
-        ----------
-        facetgrid_kwargs : dict
-            Keyword arguments passed to the :py:class:`seaborn.FacetGrid` call.
-        **kwargs : key, value pairs
-            Additional keyword arguments are passed to :py:func:`pingouin.plot_blandaltman`.
-
-        Returns
-        -------
-        g : :py:class:`seaborn.FacetGrid`
-            A :py:class:`seaborn.FacetGrid` with sleep statistics Bland-Altman plots on each axis.
+        >>> ssa = yasa.SleepStatsAgreement(...)
+        >>> calibrate_rem = ssa.get_calibration_func("REM")
+        >>> new_obs_rem_vals = np.array([50, 40, 30, 20])
+        >>> calibrate_rem(new_obs_rem_vals)
+        >>> calibrate_rem(new_obs_rem_vals)
+        array([50, 40, 30, 20])
+        >>> calibrate_rem(new_obs_rem_vals, bias_test=False)
+        array([42.825, 32.825, 22.825, 12.825])
+        >>> calibrate_rem(new_obs_rem_vals, bias_test=False, method="regr")
+        array([ -9.33878878,  -9.86815607, -10.39752335, -10.92689064])
         """
-        kwargs_facetgrid = dict(col_wrap=4, height=2, aspect=1, sharex=False, sharey=False)
-        kwargs_facetgrid.update(facetgrid_kwargs)
-        kwargs_blandaltman = dict(xaxis="y", annotate=False, edgecolor="black", facecolor="none")
-        kwargs_blandaltman.update(kwargs)
-        # Initialize a grid of plots with an Axes for each sleep statistic
-        g = sns.FacetGrid(self.data.reset_index(), col="sleep_stat", **kwargs_facetgrid)
-        # Draw Bland-Altman plot on each axis
-        g.map(pg.plot_blandaltman, self.obs_scorer, self.ref_scorer, **kwargs_blandaltman)
-        # Adjust aesthetics
-        for ax in g.axes.flat:
-            # Tidy-up axis limits with symmetric y-axis and minimal ticks
-            bound = max(map(abs, ax.get_ylim()))
-            ax.set_ylim(-bound, bound)
-            ax.yaxis.set_major_locator(plt.MaxNLocator(nbins=2, integer=True, symmetric=True))
-            ax.xaxis.set_major_locator(plt.MaxNLocator(nbins=1, integer=True))
-        ylabel = " - ".join((self.obs_scorer, self.ref_scorer))
-        g.set_ylabels(ylabel)
-        g.set_titles(col_template="{col_name}")
-        g.tight_layout(w_pad=1, h_pad=2)
-        return g
+        assert isinstance(sleep_stat, str)
+        assert sleep_stat in self.sleep_statistics
+        parm, slope, intercept = ssa._vals.loc[ss, ["bias_parm", "bias_slope", "bias_intercept"]].to_numpy()
+        auto_method = ssa.auto_methods.at[ss, "bias"]
+        not_biased = ssa.assumptions.at[ss, "unbiased"]
+        def calibration_func(x, method="auto", bias_test=True):
+            x = np.array(x)
+            method = auto_method if method == "auto" else method
+            if bias_test and not_biased:  # If sleep stat is not statistically biased, don't calibrate
+                return x
+            elif method == "parm":
+                return x + parm
+            elif method == "regr":
+                return x * slope + intercept
+        return calibration_func

From 44869c2397e96330b29da548db279728d932dcb5 Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Fri, 30 Dec 2022 21:06:09 -0600
Subject: [PATCH 32/43] first draft

---
 requirements.txt   |   1 +
 yasa/__init__.py   |   1 +
 yasa/evaluation.py | 510 +++++++++++++++++++++++++++++++++++++++++++++
 yasa/hypno.py      |  41 ++++
 4 files changed, 553 insertions(+)
 create mode 100644 yasa/evaluation.py

diff --git a/requirements.txt b/requirements.txt
index 2fb7b1b..1fd261a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,3 +14,4 @@ sleepecg>=0.5.0
 joblib
 antropy
 lightgbm
+pingouin>=0.5.3
diff --git a/yasa/__init__.py b/yasa/__init__.py
index 7bdff0e..7177a2f 100644
--- a/yasa/__init__.py
+++ b/yasa/__init__.py
@@ -1,5 +1,6 @@
 import logging
 from .detection import *
+from .evaluation import *
 from .features import *
 from .heart import *
 from .hypno import *
diff --git a/yasa/evaluation.py b/yasa/evaluation.py
new file mode 100644
index 0000000..6b25b2d
--- /dev/null
+++ b/yasa/evaluation.py
@@ -0,0 +1,510 @@
+"""
+YASA code for evaluating the agreement between two sleep-measurement systems.
+
+There are two levels of evaluating staging performance:
+- Comparing two hypnograms (e.g., human vs automated scorer)
+- Comparing summary sleep statistics between two scorers (e.g., PSG vs actigraphy)
+
+Analyses are modeled after the standardized framework proposed in Menghini et al., 2021, SLEEP.
+See the following resources:
+- https://doi.org/10.1093/sleep/zsaa170
+- https://sri-human-sleep.github.io/sleep-trackers-performance
+- https://github.com/SRI-human-sleep/sleep-trackers-performance
+"""
+import logging
+
+import numpy as np
+import pandas as pd
+import pingouin as pg
+from sklearn import metrics
+
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+from yasa.plotting import plot_hypnogram
+
+
+logger = logging.getLogger("yasa")
+
+__all__ = [
+    "EpochByEpochEvaluation",
+    "SleepStatsEvaluation",
+]
+
+
+class EpochByEpochEvaluation:
+    """
+    See :py:meth:`yasa.Hypnogram.evaluate`
+
+    Parameters
+    ----------
+    hypno_ref : :py:class:`yasa.Hypnogram`
+        Reference or ground-truth hypnogram.
+    hypno_test : :py:class:`yasa.Hypnogram`
+        The test or to-be-evaluated hypnogram.
+
+    Notes
+    -----
+    Many steps here are modeled after guidelines proposed in Menghini et al., 2021 [Menghini2021]_.
+    See https://sri-human-sleep.github.io/sleep-trackers-performance/AnalyticalPipeline_v1.0.0.html
+
+    References
+    ----------
+    .. [Menghini2021] Menghini, L., Cellini, N., Goldstone, A., Baker, F. C., & de Zambotti, M.
+                      (2021). A standardized framework for testing the performance of sleep-tracking
+                       technology: step-by-step guidelines and open-source code. Sleep, 44(2),
+                       zsaa170. https://doi.org/10.1093/sleep/zsaa170
+
+    Examples
+    --------
+    >>> import yasa
+    >>> hypno_a = yasa.simulate_hypno(tib=90, seed=8)
+    >>> hypno_b = yasa.simulate_hypno(tib=90, seed=9)
+    >>> hypno_a = yasa.Hypnogram(hypno_a, scorer="RaterA")
+    >>> hypno_b = yasa.Hypnogram(hypno_b, scorer="RaterB")
+    >>> ebe = yasa.EpochByEpochEvaluation(hypno_a, hypno_b)  # or hypno_a.evaluate(hypno_b)
+    >>> ebe.get_confusion_matrix()
+    RaterB  WAKE  N1   N2  N3  REM  ART  UNS  Total
+    RaterA
+    WAKE       1  20   68  12    0    0    0    101
+    N1         1   0    9   0    0    0    0     10
+    N2        15   7   19   0    0    0    0     41
+    N3         0   4   15   0    9    0    0     28
+    REM        0   0    0   0    0    0    0      0
+    ART        0   0    0   0    0    0    0      0
+    UNS        0   0    0   0    0    0    0      0
+    Total     17  31  111  12    9    0    0    180
+
+    >>> ebe.get_agreement().round(3)
+    metric
+    accuracy              0.111
+    kappa                -0.130
+    weighted_jaccard      0.037
+    weighted_precision    0.072
+    weighted_recall       0.111
+    weighted_f1           0.066
+    Name: agreement, dtype: float64
+
+    >>> ebe.get_agreement_by_stage().round(3)
+    stage         WAKE    N1      N2    N3  REM  ART  UNS
+    metric
+    precision    0.059   0.0   0.171   0.0  0.0  0.0  0.0
+    recall       0.010   0.0   0.463   0.0  0.0  0.0  0.0
+    fscore       0.017   0.0   0.250   0.0  0.0  0.0  0.0
+    support    101.000  10.0  41.000  28.0  0.0  0.0  0.0
+    """
+    def __init__(self, hypno_ref, hypno_test):
+        from yasa.hypno import Hypnogram  # Loading here to avoid circular import
+        assert isinstance(hypno_ref, Hypnogram), "`hypno_ref` must be a YASA Hypnogram"
+        assert isinstance(hypno_test, Hypnogram), "`hypno_test` must be a YASA Hypnogram"
+        assert hypno_ref.n_stages == hypno_test.n_stages, (
+            "`hypno_ref` and `hypno_test` must have the same `n_stages`")
+        if (n_ref := hypno_ref.n_epochs) != (n_test := hypno_test.n_epochs):
+            # Trim the longer hypnogram to the shorter one, carrying over its `scorer` as well.
+            if n_ref > n_test:
+                attrs = dict(n_stages=hypno_ref.n_stages, scorer=hypno_ref.scorer)
+                hypno_ref = Hypnogram(hypno_ref.hypno[:n_test], **attrs)
+                warn_msg = f"`hypno_ref` longer than `hypno_test`, {n_ref - n_test} epochs trimmed"
+            else:
+                attrs = dict(n_stages=hypno_test.n_stages, scorer=hypno_test.scorer)
+                hypno_test = Hypnogram(hypno_test.hypno[:n_ref], **attrs)
+                warn_msg = f"`hypno_test` longer than `hypno_ref`, {n_test - n_ref} epochs trimmed"
+            # Q: Should be downplayed as INFO? NOTE: a Hypnogram.trim() method would help here.
+            logger.warning(warn_msg)
+        self.hypno_ref = hypno_ref
+        self.hypno_test = hypno_test
+
+    def get_confusion_matrix(self):
+        """
+        Return ``hypno_ref``/``hypno_test`` confusion matrix dataframe.
+
+        Returns
+        -------
+        matrix : :py:class:`pandas.DataFrame`
+            A confusion matrix with stages of ``hypno_ref`` as indices and stages of
+            ``hypno_test`` as columns, plus a "Total" margin row and column.
+        """
+        # Cross-tabulate the two hypnograms; `margins=True` appends the "Total" row/column.
+        matrix = pd.crosstab(
+            self.hypno_ref.hypno, self.hypno_test.hypno, margins=True, margins_name="Total"
+        )
+        # Reorder to the label order and include all stages (absent ones become NaN, then 0)
+        matrix = matrix.reindex(self.hypno_ref.labels + ["Total"], axis=0)
+        matrix = matrix.reindex(self.hypno_test.labels + ["Total"], axis=1)
+        matrix = matrix.fillna(0).astype(int)
+        return matrix
+
+    def get_agreement(self):
+        """
+        Return a series of ``hypno_ref``/``hypno_test`` performance
+        across all stages as measured by common classifier agreement methods.
+
+        .. Q: better names for get_agreement vs get_agreement_by_stage (binary vs multiclass)?
+
+        .. seealso:: :py:meth:`yasa.EpochByEpochEvaluation.get_agreement_by_stage`
+
+        Returns
+        -------
+        agreement : :py:class:`pandas.Series`
+            A :py:class:`pandas.Series` with agreement metrics as indices.
+        """
+        true = self.hypno_ref.hypno.to_numpy()
+        pred = self.hypno_test.hypno.to_numpy()
+        accuracy = metrics.accuracy_score(true, pred)
+        kappa = metrics.cohen_kappa_score(true, pred)
+        jaccard = metrics.jaccard_score(true, pred, average="weighted")
+        precision = metrics.precision_score(true, pred, average="weighted", zero_division=0)
+        recall = metrics.recall_score(true, pred, average="weighted", zero_division=0)
+        f1 = metrics.f1_score(true, pred, average="weighted", zero_division=0)
+        scores = {
+            "accuracy": accuracy,
+            "kappa": kappa,
+            "weighted_jaccard": jaccard,
+            "weighted_precision": precision,
+            "weighted_recall": recall,
+            "weighted_f1": f1,
+        }
+        agreement = pd.Series(scores, name="agreement").rename_axis("metric")
+        return agreement
+
+    def get_agreement_by_stage(self):
+        """
+        Return a dataframe of ``hypno_ref``/``hypno_test`` performance
+        for each stage as measured by common classifier agreement methods.
+
+        .. seealso:: :py:meth:`yasa.EpochByEpochEvaluation.get_agreement`
+
+        Returns
+        -------
+        agreement : :py:class:`pandas.DataFrame`
+            A DataFrame with precision, recall, fscore and support as indices, stages as columns.
+        """
+        true = self.hypno_ref.hypno.to_numpy()
+        pred = self.hypno_test.hypno.to_numpy()
+        labels = self.hypno_ref.labels  # equivalent to hypno_test.labels
+        scores = metrics.precision_recall_fscore_support(
+            true, pred, labels=labels, average=None, zero_division=0
+        )
+        agreement = pd.DataFrame(scores)
+        agreement.index = pd.Index(["precision", "recall", "fscore", "support"], name="metric")
+        agreement.columns = pd.Index(labels, name="stage")
+        return agreement
+
+
+class SleepStatsEvaluation:
+    """
+    Evaluate agreement between two measurement devices by comparing summary sleep statistics across
+    multiple participants or sessions.
+
+    For example, the reference device might be PSG and the test device might be a wearable device.
+
+    Parameters
+    ----------
+    data : :py:class:`pandas.DataFrame`
+        A pandas dataframe with sleep statistics from two different
+        devices for multiple subjects
+    reference : str
+        Name of column containing the reference device sleep statistics.
+    test : str
+        Name of column containing the test device sleep statistics.
+    subject : str
+        Name of column containing the subject ID.
+    statistic : str
+        Name of column containing the name of the sleep statistics.
+
+    Notes
+    -----
+    Many steps here are modeled after guidelines proposed in Menghini et al., 2021 [Menghini2021]_.
+    See https://sri-human-sleep.github.io/sleep-trackers-performance/AnalyticalPipeline_v1.0.0.html
+
+    References
+    ----------
+    .. [Menghini2021] Menghini, L., Cellini, N., Goldstone, A., Baker, F. C., & de Zambotti, M.
+                      (2021). A standardized framework for testing the performance of sleep-tracking
+                       technology: step-by-step guidelines and open-source code. Sleep, 44(2),
+                       zsaa170. https://doi.org/10.1093/sleep/zsaa170
+
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> import yasa
+    >>> results = []
+    >>> for i in range(1, 21):
+    >>>     hypno_a = yasa.simulate_hypnogram(tib=600, scorer="RaterA", seed=i)
+    >>>     hypno_b = hypno_a.simulate_similar(scorer="RaterB", seed=i + 99)
+    >>>     sstats_a = hypno_a.sleep_statistics()
+    >>>     sstats_b = hypno_b.sleep_statistics()
+    >>>     sstats_a["subject"] = f"sub-{i:03d}"
+    >>>     sstats_b["subject"] = f"sub-{i:03d}"
+    >>>     sstats_a["scorer"] = "RaterA"
+    >>>     sstats_b["scorer"] = "RaterB"
+    >>>     results.extend([sstats_a, sstats_b])
+    >>> 
+    >>> df = (pd.DataFrame(results)
+    >>>     .pivot(index="subject", columns="scorer")
+    >>>     .stack(0).rename_axis(["subject", "sstat"]).reset_index().rename_axis(None, axis=1)
+    >>>     .query("sstat.isin(['%N1', '%N2', '%N3', '%REM', 'SOL', 'SE', 'TST'])"))
+    >>>
+    >>> sse = yasa.SleepStatsEvaluation(
+    >>>     data=df, reference="RaterA", test="RaterB", subject="subject", statistic="sstat"
+    >>> )
+    >>>
+    >>> sse.summary(descriptives=False)
+           normal  unbiased  homoscedastic
+    sstat
+    %N1      True      True           True
+    %N2      True      True           True
+    %N3      True      True           True
+    %REM    False      True           True
+    SE       True      True           True
+    SOL     False     False           True
+    TST      True      True           True
+
+    .. plot::
+
+        >>> sse.plot_discrepancies_heatmap()
+
+    .. plot::
+
+        >>> sse.plot_blandaltman()
+    """
+    def __init__(self, data, reference, test, subject, statistic):
+        assert isinstance(data, pd.DataFrame), "`data` must be a pandas DataFrame"
+        for col in [reference, test, subject, statistic]:
+            assert isinstance(col, str) and col in data, f"`{col}` must be a string and a column in `data`"
+        assert data[subject].nunique() > 1, "`data` must include more than one subject"
+        data = data.copy()
+
+        # Get measurement difference between reference and test devices
+        data["difference"] = data[test].sub(data[reference])
+
+        # Check for sleep statistics that have no differences between measurement devices.
+        # This is most likely to occur with TIB but is possible with any, and will break some functions.
+        stats_nodiff = data.groupby(statistic)["difference"].any().loc[lambda x: ~x].index
+        for s in stats_nodiff:  # Q: Should this be logged as just info?
+            logger.warning(f"All {s} differences are zero, removing from evaluation.")
+        # Filter with a boolean mask; a `query` string breaks on names with spaces or quotes.
+        data = data[~data[statistic].isin(stats_nodiff)]
+
+        # Get list of all statistics to be evaluated
+        self.all_sleepstats = data[statistic].unique()
+
+        # Save attributes
+        self.data = data
+        self.reference = reference
+        self.test = test
+        self.subject = subject
+        self.statistic = statistic
+
+        # Run tests
+        self.test_normality()
+        self.test_proportional_bias()
+        self.test_homoscedasticity()
+
+    def test_normality(self):
+        """Run a normality test on the reference scores of every sleep statistic."""
+        ref_by_stat = self.data.groupby(self.statistic)[self.reference]
+        self.normality = ref_by_stat.apply(pg.normality).droplevel(-1)
+
+    def test_proportional_bias(self):
+        """Test each sleep statistic for proportional bias.
+
+        For each statistic, regress the device difference score on the reference device score to get
+        proportional bias and residuals that will be used for the later homoscedasticity
+        calculation. Subject-level residuals for each statistic are added to ``data``; calling
+        this method again replaces the residuals from the previous call.
+        """
+        prop_bias_results = []
+        residuals_results = []
+        for ss, ss_df in self.data.groupby(self.statistic):
+            # Regress the difference score on the reference device
+            model = pg.linear_regression(ss_df[self.reference], ss_df["difference"])
+            model.insert(0, self.statistic, ss)
+            # Extract the subject-level residuals
+            resid = ss_df[[self.subject]].copy()
+            resid[self.statistic] = ss
+            resid["pbias_residual"] = model.residuals_
+            prop_bias_results.append(model)
+            residuals_results.append(resid)
+        # Add residuals to raw dataframe, used later when testing homoscedasticity.
+        # Residuals from an earlier call are dropped first; otherwise the merge would produce
+        # suffixed duplicate columns and `pbias_residual` would no longer exist.
+        residuals = pd.concat(residuals_results)
+        data = self.data.drop(columns="pbias_residual", errors="ignore")
+        self.data = data.merge(residuals, on=[self.subject, self.statistic])
+        # Handle proportional bias results
+        prop_bias = pd.concat(prop_bias_results)
+        # Save all the proportional bias models before removing intercept, for optional user access
+        self.proportional_bias_models_ = prop_bias.reset_index(drop=True)
+        # Remove intercept rows
+        prop_bias = prop_bias.query("names != 'Intercept'").drop(columns="names")
+        # Add True/False passing column for easy access
+        prop_bias["unbiased"] = prop_bias["pval"].ge(0.05)
+        self.proportional_bias = prop_bias.set_index(self.statistic)
+
+    def test_homoscedasticity(self, method="levene"):
+        """Test each statistic for homoscedasticity.
+
+        The ``method`` argument is passed to :py:func:`pingouin.homoscedasticity`.
+
+        .. note:: ``self.test_proportional_bias()`` must be run first.
+        """
+        group = self.data.groupby(self.statistic)
+        columns = [self.reference, "difference", "pbias_residual"]
+        homoscedasticity = group.apply(lambda df: pg.homoscedasticity(df[columns], method=method))
+        self.homoscedasticity = homoscedasticity.droplevel(-1)
+
+    def summary(self, descriptives=True):
+        """Tabulate, per sleep statistic, which assumption checks pass (plus optional mean/std)."""
+        assert isinstance(descriptives, bool), "descriptives must be True or False"
+        # One boolean column per check, each indexed by sleep statistic
+        normal = self.normality["normal"]
+        unbiased = self.proportional_bias["unbiased"]
+        homoscedastic = self.homoscedasticity["equal_var"].rename("homoscedastic")
+        checks = pd.concat([normal, unbiased, homoscedastic], axis=1)
+        if not descriptives:
+            return checks
+        # Mean and standard deviation of every remaining column, flattened to `<column>_<agg>`
+        values = self.data.drop(columns=self.subject)
+        desc = values.groupby(self.statistic).agg(["mean", "std"])
+        desc.columns = desc.columns.map("_".join)
+        return checks.join(desc)
+
+    def plot_discrepancies_heatmap(self, sstats_order=None, **kwargs):
+        """Visualize subject-level discrepancies, generally for outlier inspection.
+
+        Parameters
+        ----------
+        sstats_order : list or None
+            List of sleep statistics to plot. Default (None) is to plot all sleep statistics.
+        kwargs : dict
+            Other keyword arguments are passed through to :py:func:`seaborn.heatmap`. If ``annot``
+            is an array it is used as given; if True, cells show the raw (unscaled) differences.
+
+        Returns
+        -------
+        ax : :py:class:`matplotlib.axes.Axes`
+            Matplotlib Axes
+        """
+        assert isinstance(sstats_order, (list, type(None))), "`sstats_order` must be a list"
+        if sstats_order is None:
+            sstats_order = self.all_sleepstats
+
+        # Merge default heatmap arguments with optional input
+        heatmap_kwargs = dict(cmap="binary", annot=True, fmt=".1f", square=False)
+        heatmap_kwargs.update(kwargs)
+        # Pivot for subject-rows and statistic-columns
+        table = self.data.pivot(
+            index=self.subject, columns=self.statistic, values="difference",
+        )
+        # Normalize statistics (i.e., columns) between zero and one
+        table_norm = table.sub(table.min(), axis=1).div(table.apply(np.ptp))
+        # If annotating, replace with raw values for writing. Only a scalar flag is replaced;
+        # testing the truth of a user-supplied annotation array would raise a ValueError.
+        if np.ndim(heatmap_kwargs["annot"]) == 0 and heatmap_kwargs["annot"]:
+            heatmap_kwargs["annot"] = table[sstats_order].to_numpy()
+        # Draw heatmap (colors from normalized values, annotations from raw values)
+        return sns.heatmap(table_norm[sstats_order], **heatmap_kwargs)
+
+    def plot_discrepancies_dotplot(self, sstats_order=None, palette="winter", **kwargs):
+        """Visualize subject-level discrepancies, generally for outlier inspection.
+
+        Parameters
+        ----------
+        sstats_order : list or None
+            List of sleep statistics to plot. Default (None) is to plot all sleep statistics.
+        palette : string, list, dict, or :py:class:`matplotlib.colors.Colormap`
+            Color palette passed to :py:class:`seaborn.PairGrid`
+        kwargs : dict
+            Other keyword arguments are passed through to :py:func:`seaborn.stripplot`.
+
+        Returns
+        -------
+        g : :py:class:`seaborn.PairGrid`
+            A PairGrid with one column of dots per sleep statistic and one row per subject.
+        """
+        if sstats_order is None:
+            sstats_order = self.all_sleepstats
+        else:
+            assert isinstance(sstats_order, (list, type(None))), "`sstats_order` must be a list"
+
+        # Merge default stripplot arguments with optional input
+        stripplot_kwargs = dict(size=10, linewidth=1, edgecolor="white")
+        stripplot_kwargs.update(kwargs)
+
+        # Pivot data to get subject-rows and statistic-columns
+        table = self.data.pivot(index=self.subject, columns=self.statistic, values="difference")
+
+        # Initialize the PairGrid; the figure height scales with the number of subjects
+        height = 0.3 * len(table)
+        aspect = 0.6
+        g = sns.PairGrid(
+            table.reset_index(),
+            x_vars=sstats_order,
+            y_vars=[self.subject],
+            hue=self.subject,
+            palette=palette,
+            height=height,
+            aspect=aspect,
+        )
+        # Draw the dots
+        g.map(sns.stripplot, orient="h", jitter=False, **stripplot_kwargs)
+
+        # Adjust aesthetics: clear axis labels and title each axis with its sleep statistic
+        g.set(xlabel="", ylabel="")
+        for ax, title in zip(g.axes.flat, sstats_order):
+            ax.set(title=title)
+            ax.margins(x=0.3)
+            ax.yaxis.grid(True)
+            ax.tick_params(left=False)
+        sns.despine(left=True, bottom=True)
+
+        return g
+
+    def plot_blandaltman(self, sstats_order=None, facet_kwargs=None, **kwargs):
+        """Draw a Bland-Altman plot of the test and reference scores for each sleep statistic.
+
+        Parameters
+        ----------
+        sstats_order : list or None
+            List of sleep statistics to plot. Default (None) is to plot all sleep statistics.
+        facet_kwargs : dict or None
+            Other keyword arguments are passed through to :py:class:`seaborn.FacetGrid`.
+        kwargs : dict
+            Other keyword arguments are passed through to :py:func:`pingouin.plot_blandaltman`.
+
+        Returns
+        -------
+        g : :py:class:`seaborn.FacetGrid`
+            Seaborn FacetGrid
+        """
+        assert isinstance(sstats_order, (list, type(None))), "`sstats_order` must be a list"
+        if sstats_order is None:
+            sstats_order = self.all_sleepstats
+
+        # Select scatterplot arguments (passed to blandaltman) and update with optional input
+        blandaltman_kwargs = dict(xaxis="y", annotate=False, edgecolor="black", facecolor="none")
+        blandaltman_kwargs.update(kwargs)
+        # Select FacetGrid arguments and update with optional input (None means no overrides)
+        col_wrap = 4 if len(sstats_order) > 4 else None
+        facetgrid_kwargs = dict(col_wrap=col_wrap, height=2, aspect=1, sharex=False, sharey=False)
+        facetgrid_kwargs.update(facet_kwargs or {})
+
+        # Initialize a grid of plots with an Axes for each sleep statistic
+        g = sns.FacetGrid(self.data, col=self.statistic, col_order=sstats_order, **facetgrid_kwargs)
+        # Draw Bland-Altman on each axis
+        g.map(pg.plot_blandaltman, self.test, self.reference, **blandaltman_kwargs)
+
+        # Tidy-up axis limits with symmetric y-axis and minimal ticks
+        for ax in g.axes.flat:
+            bound = max(map(abs, ax.get_ylim()))
+            ax.set_ylim(-bound, bound)
+            ax.yaxis.set_major_locator(plt.MaxNLocator(nbins=2, integer=True, symmetric=True))
+            ax.xaxis.set_major_locator(plt.MaxNLocator(nbins=1, integer=True))
+        # More aesthetics
+        ylabel = " - ".join((self.test, self.reference))
+        g.set_ylabels(ylabel)
+        g.set_titles(col_template="{col_name}")
+        g.tight_layout(w_pad=1, h_pad=2)
+
+        return g
diff --git a/yasa/hypno.py b/yasa/hypno.py
index d67bf82..a50eeaf 100644
--- a/yasa/hypno.py
+++ b/yasa/hypno.py
@@ -10,6 +10,7 @@
 from yasa.io import set_log_level
 from yasa.plotting import plot_hypnogram
 from yasa.sleepstats import transition_matrix
+from yasa.evaluation import EpochByEpochEvaluation
 from pandas.api.types import CategoricalDtype
 
 __all__ = [
@@ -570,6 +571,46 @@ def copy(self):
             scorer=self.scorer,
         )
 
+    def evaluate(self, hypno_test):
+        """Evaluate epoch-by-epoch agreement between this hypnogram and a test hypnogram.
+
+        Typically the reference hypnogram (i.e., ``self``) is a manually-scored hypnogram and the
+        test hypnogram (i.e., ``hypno_test``) is a hypnogram from an actigraphy/wearable device or
+        automated scorer (e.g., :py:meth:`yasa.SleepStaging.predict`).
+
+        Comparing more than two hypnograms is not currently supported.
+
+        Parameters
+        ----------
+        self : :py:class:`yasa.Hypnogram`
+            Reference or ground-truth hypnogram.
+        hypno_test : :py:class:`yasa.Hypnogram`
+            The test or to-be-evaluated hypnogram.
+            Must have the same ``n_stages`` as the reference hypnogram.
+
+        Returns
+        -------
+        ebe : :py:class:`yasa.EpochByEpochEvaluation`
+            See :py:class:`yasa.EpochByEpochEvaluation` documentation for more detail.
+
+        Examples
+        --------
+        .. plot::
+
+            >>> import yasa
+            >>> hypno_ref = yasa.simulate_hypno(tib=600, seed=11)
+            >>> hypno_ref = yasa.Hypnogram(hypno_ref, scorer="Rater1")
+            >>> _, true_probas = hypno_ref.transition_matrix()
+            >>> hypno_test = yasa.simulate_hypno(tib=600, seed=12, trans_probas=true_probas)
+            >>> hypno_test = yasa.Hypnogram(hypno_test, scorer="Rater2")
+            >>> ebe = hypno_ref.evaluate(hypno_test)
+            >>> conf = ebe.get_confusion_matrix()
+            >>> perf = ebe.get_agreement()
+            >>> # Plot the overlapping hypnograms
+            >>> ebe.plot_hypnograms()
+        """
+        return EpochByEpochEvaluation(self, hypno_test)
+
     def find_periods(self, threshold="5min", equal_length=False):
         """Find sequences of consecutive values exceeding a certain duration in hypnogram.
 

From dea705016b3b09257235cfa38f3f2e4b1df8a186 Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Fri, 30 Dec 2022 22:30:07 -0600
Subject: [PATCH 33/43] plot_hypnogramS method

docstrings examples

plot_hypnogram lw --> linekwargs

heatmap colorbar label

pass kwargs through to all pingouin calls

docstrings examples update

setting attrs, docstrings, var name changes

SleepStatsEval takes 2 dataframes as input, reshaping is done internally

EpochByEpoch accepts sequences of Hypnograms for group evaluation

EpochByEpoch gets sleep stats

SleepStats move statistical tests to __init__()

better plotting flexibility and baked-in sleepstats_order
---
 yasa/evaluation.py | 881 +++++++++++++++++++++++++++++++--------------
 yasa/plotting.py   |  48 +--
 2 files changed, 638 insertions(+), 291 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index 6b25b2d..e8a0cff 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -32,15 +32,20 @@
 ]
 
 
+#############################################################################
+# EPOCH BY EPOCH
+#############################################################################
+
+
 class EpochByEpochEvaluation:
     """
     See :py:meth:`yasa.Hypnogram.evaluate`
 
     Parameters
     ----------
-    hypno_ref : :py:class:`yasa.Hypnogram`
-        Reference or ground-truth hypnogram.
-    hypno_test : :py:class:`yasa.Hypnogram`
+    refr_hyp : :py:class:`yasa.Hypnogram`
+        The reference or ground-truth hypnogram.
+    test_hyp : :py:class:`yasa.Hypnogram`
         The test or to-be-evaluated hypnogram.
 
     Notes
@@ -58,98 +63,232 @@ class EpochByEpochEvaluation:
     Examples
     --------
     >>> import yasa
-    >>> hypno_a = yasa.simulate_hypno(tib=90, seed=8)
-    >>> hypno_b = yasa.simulate_hypno(tib=90, seed=9)
-    >>> hypno_a = yasa.Hypnogram(hypno_a, scorer="RaterA")
-    >>> hypno_b = yasa.Hypnogram(hypno_b, scorer="RaterB")
+    >>> hypno_a = yasa.simulate_hypnogram(tib=90, seed=8, scorer="RaterA")
+    >>> hypno_b = yasa.simulate_hypnogram(tib=90, seed=9, scorer="RaterB")
     >>> ebe = yasa.EpochByEpochEvaluation(hypno_a, hypno_b)  # or hypno_a.evaluate(hypno_b)
     >>> ebe.get_confusion_matrix()
-    RaterB  WAKE  N1   N2  N3  REM  ART  UNS  Total
+    RaterB  WAKE   N1   N2  N3  REM  ART  UNS  Total
     RaterA
-    WAKE       1  20   68  12    0    0    0    101
-    N1         1   0    9   0    0    0    0     10
-    N2        15   7   19   0    0    0    0     41
-    N3         0   4   15   0    9    0    0     28
-    REM        0   0    0   0    0    0    0      0
-    ART        0   0    0   0    0    0    0      0
-    UNS        0   0    0   0    0    0    0      0
-    Total     17  31  111  12    9    0    0    180
+    WAKE      52   38  126  23   51    0    0    290
+    N1        59    2   27   8   14    0    0    110
+    N2       117   50  105  15   44    0    0    331
+    N3        34   26   62  42   15    0    0    179
+    REM       15   12   13  10    0    0    0     50
+    ART        0    0    0   0    0    0    0      0
+    UNS        0    0    0   0    0    0    0      0
+    Total    277  128  333  98  124    0    0    960
 
     >>> ebe.get_agreement().round(3)
     metric
-    accuracy              0.111
-    kappa                -0.130
-    weighted_jaccard      0.037
-    weighted_precision    0.072
-    weighted_recall       0.111
-    weighted_f1           0.066
+    accuracy              0.209
+    kappa                -0.051
+    weighted_jaccard      0.130
+    weighted_precision    0.247
+    weighted_recall       0.209
+    weighted_f1           0.223
     Name: agreement, dtype: float64
 
     >>> ebe.get_agreement_by_stage().round(3)
-    stage         WAKE    N1      N2    N3  REM  ART  UNS
+    stage         WAKE       N1       N2       N3   REM  ART  UNS
     metric
-    precision    0.059   0.0   0.171   0.0  0.0  0.0  0.0
-    recall       0.010   0.0   0.463   0.0  0.0  0.0  0.0
-    fscore       0.017   0.0   0.250   0.0  0.0  0.0  0.0
-    support    101.000  10.0  41.000  28.0  0.0  0.0  0.0
+    precision    0.188    0.016    0.315    0.429   0.0  0.0  0.0
+    recall       0.179    0.018    0.317    0.235   0.0  0.0  0.0
+    fscore       0.183    0.017    0.316    0.303   0.0  0.0  0.0
+    support    290.000  110.000  331.000  179.000  50.0  0.0  0.0
+
+    .. plot::
+
+        >>> import matplotlib.pyplot as plt
+        >>> fig, ax = plt.subplots(figsize=(6, 3), constrained_layout=True)
+        >>> ebe.plot_hypnograms()
+
+    .. plot::
+
+        >>> fig, ax = plt.subplots(figsize=(6, 3))
+        >>> ebe.plot_hypnograms(ax=ax, test_kwargs={"color": "black", "lw": 2, "ls": "dotted"})
+        >>> plt.tight_layout()
+
+    .. plot::
+
+        >>> fig, ax = plt.subplots(figsize=(6.5, 2.5), constrained_layout=True)
+        >>> style_a = dict(alpha=1, lw=2.5, ls="solid", color="gainsboro", label="Michel")
+        >>> style_b = dict(alpha=1, lw=2.5, ls="solid", color="cornflowerblue", label="Jouvet")
+        >>> legend_style = dict(
+        >>>     title="Scorer", frameon=False, ncol=2, loc="lower center", bbox_to_anchor=(0.5, 0.9)
+        >>> )
+        >>> ax = ebe.plot_hypnograms(
+        >>>     refr_kwargs=style_a, test_kwargs=style_b, legend=legend_style, ax=ax
+        >>> )
+        >>>
+        >>> acc = ebe.get_agreement().multiply(100).round(0).at["accuracy"]
+        >>> ax.text(0.01, 1, f"Accuracy = {acc}%", ha="left", va="bottom", transform=ax.transAxes)
     """
-    def __init__(self, hypno_ref, hypno_test):
+    def __init__(self, refr_hyps, test_hyps):
         from yasa.hypno import Hypnogram  # Loading here to avoid circular import
-        assert isinstance(hypno_ref, Hypnogram), "`hypno_ref` must be a YASA Hypnogram"
-        assert isinstance(hypno_test, Hypnogram), "`hypno_test` must be a YASA Hypnogram"
-        assert hypno_ref.n_stages == hypno_test.n_stages, (
-            "`hypno_ref` and `hypno_test` must have the same `n_stages`")
-        if (n_ref := hypno_ref.n_epochs) != (n_test := hypno_test.n_epochs):
-            ## NOTE: would be nice to have a Hypnogram.trim() method for moments like this.
-            if n_ref > n_test:
-                hypno_ref = Hypnogram(hypno_ref.hypno[:n_test], n_stages=hypno_ref.n_stages)
-                n_trimmed = n_ref - n_test
-                warn_msg = f"`hypno_ref` longer than `hypno_test`, trimmed to {n_test} epochs"
-            else:
-                hypno_test = Hypnogram(hypno_test.hypno[:n_ref], n_stages=hypno_test.n_stages)
-                n_trimmed = n_test - n_ref
-                warn_msg = f"`hypno_test` longer than `hypno_ref`, {n_trimmed} epochs trimmed"
-            ## Q: Should be downplayed as INFO?
-            logger.warning(warn_msg)
-        self.hypno_ref = hypno_ref
-        self.hypno_test = hypno_test
-
-    def get_confusion_matrix(self):
-        """
-        Return ``hypno_ref``/``hypno_test``confusion matrix dataframe.
 
-        Returns
-        -------
-        matrix : :py:class:`pandas.DataFrame`
-            A confusion matrix with stages of ``hypno_ref`` as indices and stages of
-            ``hypno_test`` as columns.
-        """
-        # Generate confusion matrix.
-        matrix = pd.crosstab(
-            self.hypno_ref.hypno, self.hypno_test.hypno, margins=True, margins_name="Total"
+        assert isinstance(refr_hyps, Hypnogram) or hasattr(refr_hyps, "__iter__"), (
+            "`refr_hyps` must be a YASA hypnogram or iterable containing multiple YASA hypnograms"
+        )
+        assert isinstance(test_hyps, Hypnogram) or hasattr(test_hyps, "__iter__"), (
+            "`test_hyps` must be a YASA hypnogram or iterable containing multiple YASA hypnograms"
+        )
+        assert type(refr_hyps) == type(test_hyps), "`refr_hyps` and `test_hyps` must be same type"
+
+        # Convert solo hypnograms to len==1 lists
+        if isinstance(refr_hyps, Hypnogram):  # As below, picking refr_hyps for checks arbitrarily
+            refr_hyps = [refr_hyps]
+            test_hyps = [test_hyps]
+        assert len(refr_hyps) == len(test_hyps), "must have same number of subjects"
+
+        if isinstance(refr_hyps, dict):
+            assert refr_hyps.keys() == test_hyps.keys(), "must have same subject identifiers and in same order"
+            subjects, refr_hyps = zip(*refr_hyps.items())
+            # assert all(isinstance(s, str) for s in subjects)
+            test_hyps = tuple(test_hyps.values())
+        else:
+            subjects = 1 + np.arange(len(refr_hyps))
+
+        all_hyps = refr_hyps + test_hyps
+        assert all(isinstance(hyp, Hypnogram) for hyp in all_hyps), "`refr_hyps` and `test_hyps` must only include hypnograms"
+        assert all(h.scorer is not None for h in all_hyps), "all hypnograms must have a scorer"
+        for h1, h2 in zip(all_hyps[:-1], all_hyps[1:]):
+            assert h1.n_stages == h2.n_stages, "all hypnograms must have the same n_stages"
+            assert h1.labels == h2.labels, "all hypnograms must have the same labels"
+            assert h1.mapping == h2.mapping, "all hypnograms must have the same mapping"
+        assert all(h1.scorer == h2.scorer for h1, h2 in zip(refr_hyps[:-1], refr_hyps[1:])), "all `refr_hyps` must have the same scorer"
+        assert all(h1.scorer == h2.scorer for h1, h2 in zip(test_hyps[:-1], test_hyps[1:])), "all `test_hyps` must have the same scorer"
+        assert all(h1.scorer != h2.scorer for h1, h2 in zip(refr_hyps, test_hyps)), "each `refr_hyps` and `test_hyps` pair must have unique scorers"
+        assert all(h1.n_epochs == h2.n_epochs for h1, h2 in zip(refr_hyps, test_hyps)), "each `refr_hyps` and `test_hyps` pair must have the same n_epochs"
+        ## Could use set() for those above
+        ## Or set scorer as the first available and check all equal
+
+        # Convert to dictionaries with subjects and hypnograms
+        refr_hyps = { s: h for s, h in zip(subjects, refr_hyps) }
+        test_hyps = { s: h for s, h in zip(subjects, test_hyps) }
+
+        # Merge all hypnograms into a single multiindexed dataframe
+        refr = pd.concat(pd.concat({s: h.hypno}, names=["subject"]) for s, h in refr_hyps.items())
+        test = pd.concat(pd.concat({s: h.hypno}, names=["subject"]) for s, h in test_hyps.items())
+        data = pd.concat([refr, test], axis=1)
+
+        # Get summary sleep statistics for each measurement.
+        refr_sstats = pd.Series(refr_hyps).map(lambda h: h.sleep_statistics()).apply(pd.Series)
+        test_sstats = pd.Series(test_hyps).map(lambda h: h.sleep_statistics()).apply(pd.Series)
+        refr_sstats = refr_sstats.set_index(pd.Index(subjects, name="subject"))
+        test_sstats = test_sstats.set_index(pd.Index(subjects, name="subject"))
+        # sse = yasa.SleepStatsEvaluation(refr_sstats, test_sstats)
+        
+        # Set attributes
+        self._data = data
+        self._subjects = subjects
+        self._n_subjects = len(subjects)
+        self._refr_hyps = refr_hyps
+        self._test_hyps = test_hyps
+        self._refr_sstats = refr_sstats
+        self._test_sstats = test_sstats
+        self._refr_name = refr_hyps[subjects[0]].scorer
+        self._test_name = test_hyps[subjects[0]].scorer
+        self._n_stages = refr_hyps[subjects[0]].n_stages
+        self._labels = refr_hyps[subjects[0]].labels
+
+    def __repr__(self):
+        # TODO v0.8: Keep only the text between < and >
+        text_subjects = f", {self.n_subjects} subject" + ("s" if self.n_subjects > 1 else "")
+        return (
+            f"<EpochByEpochEvaluation | Test Hypnogram scored by {self.refr_name} evaluated "
+            f"against reference Hypnogram scored by {self.test_name}{text_subjects}>\n"
+            " - Use `.get_agreement()` to get agreement measures as a pandas.Series\n"
+            " - Use `.plot_hypnograms()` to plot the two hypnograms overlaid\n"
+            "See the online documentation for more details."
         )
-        # Reorder indices in sensible order and to include all stages
-        matrix = matrix.reindex(self.hypno_ref.labels + ["Total"], axis=0)
-        matrix = matrix.reindex(self.hypno_test.labels + ["Total"], axis=1)
-        matrix = matrix.fillna(0).astype(int)
-        return matrix
 
-    def get_agreement(self):
+    def __str__(self):
+        text_subjects = f", {self.n_subjects} subject" + ("s" if self.n_subjects > 1 else "")
+        return (
+            f"<EpochByEpochEvaluation | Test Hypnogram scored by {self.refr_name} evaluated "
+            f"against reference Hypnogram scored by {self.test_name}{text_subjects}>\n"
+            " - Use `.get_agreement()` to get agreement measures as a pandas.Series\n"
+            " - Use `.plot_hypnograms()` to plot the two hypnograms overlaid\n"
+            "See the online documentation for more details."
+        )
+
+    @property
+    def data(self):
+        return self._data
+
+    @property
+    def refr_sstats(self):
+        return self._refr_sstats
+
+    @property
+    def test_sstats(self):
+        return self._test_sstats
+
+    @property
+    def refr_hyps(self):
+        """The reference Hypnograms."""
+        ## Q: Starting to think there should be a clear convention on what we mean
+        ##    when we say "hypnogram". Should hypnogram mean the Series and Hypnogram
+        ##    mean the YASA object? Similarly for hypno/hyp.
+        return self._refr_hyps
+
+    @property
+    def test_hyps(self):
+        """The test Hypnograms."""
+        return self._test_hyps
+
+    @property
+    def subjects(self):
+        return self._subjects
+
+    @property
+    def n_subjects(self):
+        return self._n_subjects
+
+    @property
+    def refr_name(self):
+        """The name of the reference measurement."""
+        return self._refr_name
+
+    @property
+    def test_name(self):
+        """The name of the test measurement."""
+        return self._test_name
+
+    @property
+    def labels(self):
+        return self._labels
+
+    @property
+    def n_stages(self):
+        return self._n_stages
+
+    def get_agreement(self, subject=None):
         """
-        Return a dataframe of ``hypno_ref``/``hypno_test`` performance
-        across all stages as measured by common classifier agreement methods.
+        Return a dataframe of ``refr_hyp``/``test_hyp`` performance across all stages as measured by
+        common classifier agreement methods.
 
+        .. seealso:: :py:meth:`yasa.EpochByEpochEvaluation.get_agreement_by_stage`
         ## Q: Are there better names to differentiate get_agreement vs get_agreement_by_stage?
         ##    Maybe should be binary vs multiclass?
-        .. seealso:: :py:meth:`yasa.EpochByEpochResults.get_agreement_by_stage`
+
+        Parameters
+        ----------
+        self : :py:class:`yasa.EpochByEpochEvaluation`
+            A :py:class:`yasa.EpochByEpochEvaluation` instance.
+        subject : None or a unique subject identifier.
+            Subject identifiers are based on user input, and integers starting from 1 if not provided.
 
         Returns
         -------
         agreement : :py:class:`pandas.Series`
             A :py:class:`pandas.Series` with agreement metrics as indices.
         """
-        true = self.hypno_ref.hypno.to_numpy()
-        pred = self.hypno_test.hypno.to_numpy()
+        true = self.data[self.refr_name]
+        pred = self.data[self.test_name]
+        if subject is not None:
+            true = pred.loc[subject]
+            pred = pred.loc[subject]
         accuracy = metrics.accuracy_score(true, pred)
         kappa = metrics.cohen_kappa_score(true, pred)
         jaccard = metrics.jaccard_score(true, pred, average="weighted")
@@ -167,10 +306,10 @@ def get_agreement(self):
         agreement = pd.Series(scores, name="agreement").rename_axis("metric")
         return agreement
 
-    def get_agreement_by_stage(self):
+    def get_agreement_by_stage(self, subject=None):
         """
-        Return a dataframe of ``hypno_ref``/``hypno_test`` performance
-        for each stage as measured by common classifier agreement methods.
+        Return a dataframe of ``refr_hyp``/``test_hyp`` performance for each stage as measured by
+        common classifier agreement methods.
 
         .. seealso:: :py:meth:`yasa.EpochByEpochResults.get_agreement`
 
@@ -179,38 +318,151 @@ def get_agreement_by_stage(self):
         agreement : :py:class:`pandas.DataFrame`
             A DataFrame with agreement metrics as indices and stages as columns.
         """
-        true = self.hypno_ref.hypno.to_numpy()
-        pred = self.hypno_test.hypno.to_numpy()
-        labels = self.hypno_ref.labels  # equivalent to hypno_test.labels
+        true = self.data[self.refr_name]
+        pred = self.data[self.test_name]
+        if subject is not None:
+            true = true.loc[subject]
+            pred = pred.loc[subject]
         scores = metrics.precision_recall_fscore_support(
-            true, pred, labels=labels, average=None, zero_division=0
+            true, pred, labels=self.labels, average=None, zero_division=0
         )
         agreement = pd.DataFrame(scores)
         agreement.index = pd.Index(["precision", "recall", "fscore", "support"], name="metric")
-        agreement.columns = pd.Index(labels, name="stage")
+        agreement.columns = pd.Index(self.labels, name="stage")
         return agreement
 
+    def get_confusion_matrix(self, subject=None):
+        """Return a ``refr_hyp``/``test_hyp`` confusion matrix.
+
+        Returns
+        -------
+        matrix : :py:class:`pandas.DataFrame`
+            A confusion matrix with ``refr_hyp`` stages as indices and ``test_hyp`` stages as columns.
+        """
+        true = self.data[self.refr_name]
+        pred = self.data[self.test_name]
+        if subject is not None:
+            true = true.loc[subject]
+            pred = pred.loc[subject]
+        # Generate confusion matrix.
+        matrix = pd.crosstab(true, pred, margins=True, margins_name="Total")
+        # Reorder indices in sensible order and to include all stages
+        index_col_labels = self.labels + ["Total"]
+        matrix = matrix.reindex(index=index_col_labels, columns=index_col_labels, fill_value=0)
+        return matrix.astype(int)
+
+    def plot_hypnograms(self, subject=None, legend=True, ax=None, refr_kwargs={}, test_kwargs={}):
+        """Plot the two hypnograms, where ``test_hyp`` is overlaid on ``refr_hyp``.
+
+        .. seealso:: :py:func:`yasa.plot_hypnogram`
+
+        Parameters
+        ----------
+        legend : bool or dict
+            If True (default) or a dictionary, a legend is added. If a dictionary, all key/value
+            pairs are passed as keyword arguments to the :py:func:`matplotlib.pyplot.legend` call.
+        ax : :py:class:`matplotlib.axes.Axes` or None
+            Axis on which to draw the plot, optional.
+        refr_kwargs : dict
+            Keyword arguments passed to :py:func:`yasa.plot_hypnogram` when plotting ``refr_hyp``.
+        test_kwargs : dict
+            Keyword arguments passed to :py:func:`yasa.plot_hypnogram` when plotting ``test_hyp``.
+
+        Returns
+        -------
+        ax : :py:class:`matplotlib.axes.Axes`
+            Matplotlib Axes
+
+        Examples
+        --------
+        .. plot::
+
+            >>> from yasa import simulate_hypnogram
+            >>> hyp = simulate_hypnogram(seed=7)
+            >>> ax = hyp.evaluate(hyp.simulate_similar()).plot_hypnograms()
+        """
+        if subject is None:
+            if self.n_subjects == 1:
+                refr_hyp = self.refr_hyps[self.subjects[0]]
+                test_hyp = self.test_hyps[self.subjects[0]]
+            else:
+                raise NotImplementedError("Plotting is currently allowed for only one subject")
+        else:
+            refr_hyp = self.refr_hyps[subject]
+            test_hyp = self.test_hyps[subject]
+        assert isinstance(legend, (bool, dict)), "`legend` must be True, False, or a dictionary"
+        assert isinstance(refr_kwargs, dict), "`refr_kwargs` must be a dictionary"
+        assert isinstance(test_kwargs, dict), "`test_kwargs` must be a dictionary"
+        assert not "ax" in refr_kwargs | test_kwargs, (
+            "ax can't be supplied to `kwargs_ref` or `test_kwargs`, use the `ax` keyword instead"
+        )
+        plot_refr_kwargs = {"highlight": None, "alpha": 0.8}
+        plot_test_kwargs = {"highlight": None, "alpha": 0.8, "color": "darkcyan", "ls": "dashed"}
+        plot_refr_kwargs.update(refr_kwargs)
+        plot_test_kwargs.update(test_kwargs)
+        if ax is None:
+            ax = plt.gca()
+        refr_hyp.plot_hypnogram(ax=ax, **plot_refr_kwargs)
+        test_hyp.plot_hypnogram(ax=ax, **plot_test_kwargs)
+        if legend and "label" in plot_refr_kwargs | plot_test_kwargs:
+            if isinstance(legend, dict):
+                ax.legend(**legend)
+            else:
+                ax.legend()
+        return ax
+
+    def plot_roc(self, subject=None, palette=None, ax=None, **kwargs):
+        """Plot ROC curves for each stage.
+
+        Parameters
+        ----------
+        palette : dict or None
+            If a dictionary, keys are stages and values are corresponding colors.
+        ax : :py:class:`matplotlib.axes.Axes`
+            Axis on which to draw the plot, optional.
+        **kwargs : key, value pairs
+            Additional keyword arguments are passed to the :py:func:`matplotlib.pyplot.plot` call.
+
+        Returns
+        -------
+        ax : :py:class:`matplotlib.axes.Axes`
+            Matplotlib Axes
+        """
+        # assert self.test_hyp.probas is not None
+        raise NotImplementedError("Requires probability/confidence values.")
+
+
+#############################################################################
+# SLEEP STATISTICS
+#############################################################################
+
 
 class SleepStatsEvaluation:
     """
-    Evaluate agreement between two measurement devices by comparing summary sleep statistics across
-    multiple participants or sessions.
-
-    For example, the reference device might be PSG and the test device might be a wearable device.
+    Evaluate agreement between two measurement systems (e.g., two different manual scorers or one
+    manual scorer against YASA's automatic staging) by comparing their summary sleep statistics
+    derived from multiple subjects or sessions.
 
     Parameters
     ----------
-    data : :py:class:`pandas.DataFrame`
-        A pandas dataframe with sleep statistics from two different
-        devices for multiple subjects
-    reference : str
-        Name of column containing the reference device sleep statistics.
-    test : str
-        Name of column containing the test device sleep statistics.
-    subject : str
-        Name of column containing the subject ID.
-    statistic : str
-        Name of column containing the name of the sleep statistics.
+    refr_data : :py:class:`pandas.DataFrame`
+        A :py:class:`pandas.DataFrame` with sleep statistics from the reference measurement system.
+        Rows are individual subjects and columns are individual sleep statistics.
+    test_data : :py:class:`pandas.DataFrame`
+        A :py:class:`pandas.DataFrame` with sleep statistics from the test measurement system.
+        Shape, indices, and columns must be identical to ``refr_data``.
+    refr_name : str
+        Name of the reference measurement device, used for labeling.
+    test_name : str
+        Name of the test measurement device, used for labeling.
+    alpha : float
+        Alpha cutoff used for all three tests.
+    kwargs_normality : dict
+        Keyword arguments passed to the :py:func:`pingouin.normality` call.
+    kwargs_regression : dict
+        Keyword arguments passed to the :py:func:`pingouin.linear_regression` call.
+    kwargs_homoscedasticity : dict
+        Keyword arguments passed to the :py:func:`pingouin.homoscedasticity` call.
 
     Notes
     -----
@@ -228,26 +480,15 @@ class SleepStatsEvaluation:
     --------
     >>> import pandas as pd
     >>> import yasa
-    >>> results = []
-    >>> for i in range(1, 21):
-    >>>     hypno_a = yasa.simulate_hypnogram(tib=600, scorer="RaterA", seed=i)
-    >>>     hypno_b = hypno_a.simulate_similar(scorer="RaterB", seed=i + 99)
-    >>>     sstats_a = hypno_a.sleep_statistics()
-    >>>     sstats_b = hypno_b.sleep_statistics()
-    >>>     sstats_a["subject"] = f"sub-{i:03d}"
-    >>>     sstats_b["subject"] = f"sub-{i:03d}"
-    >>>     sstats_a["scorer"] = "RaterA"
-    >>>     sstats_b["scorer"] = "RaterB"
-    >>>     results.extend([sstats_a, sstats_b])
-    >>> 
-    >>> df = (pd.DataFrame(results)
-    >>>     .pivot(index="subject", columns="scorer")
-    >>>     .stack(0).rename_axis(["subject", "sstat"]).reset_index().rename_axis(None, axis=1)
-    >>>     .query("sstat.isin(['%N1', '%N2', '%N3', '%REM', 'SOL', 'SE', 'TST'])")
     >>>
-    >>> sse = yasa.SleepStatsEvaluation(
-    >>>     data=df, reference="RaterA", test="RaterB", subject="subject", statistic="sstat"
-    >>> )
+    >>> # For this example, generate two fake datasets of sleep statistics
+    >>> hypsA = [yasa.simulate_hypnogram(tib=600, seed=i) for i in range(20)]
+    >>> hypsB = [h.simulate_similar(tib=600, seed=i) for i, h in enumerate(hypsA)]
+    >>> sstatsA = pd.Series(hypsA).map(lambda h: h.sleep_statistics()).apply(pd.Series)
+    >>> sstatsB = pd.Series(hypsB).map(lambda h: h.sleep_statistics()).apply(pd.Series)
+    >>> sstatsA.index = sstatsB.index = sstatsA.index.map(lambda x: f"sub-{x+1:03d}")
+    >>>
+    >>> sse = yasa.SleepStatsEvaluation(sstatsA, sstatsB)
     >>>
     >>> sse.summary(descriptives=False)
            normal  unbiased  homoscedastic
@@ -260,103 +501,229 @@ class SleepStatsEvaluation:
     SOL     False     False           True
     TST      True      True           True
 
+    Access more detailed statistical output of each test.
+
+    >>> sse.normality
+                  W      pval  normal
+    sstat
+    %N1    0.973407  0.824551    True
+    %N2    0.960684  0.557595    True
+    %N3    0.958591  0.516092    True
+    %REM   0.901733  0.044447   False
+    SE     0.926732  0.133580    True
+    SOL    0.774786  0.000372   False
+    TST    0.926733  0.133584    True
+    WASO   0.924288  0.119843    True
+
+    >>> sse.homoscedasticity.head(2)
+                  W      pval  equal_var
+    sstat
+    %N1    0.684833  0.508274       True
+    %N2    0.080359  0.922890       True
+
+    >>> sse.proportional_bias.round(3).head(2)
+            coef     se      T   pval     r2  adj_r2  CI[2.5%]  CI[97.5%]  unbiased
+    sstat
+    %N1   -0.487  0.314 -1.551  0.138  0.118   0.069    -1.146      0.172      True
+    %N2   -0.107  0.262 -0.409  0.688  0.009  -0.046    -0.658      0.444      True
+
     .. plot::
 
-        >>> sse.plot_discrepancies_heatmap()
+        >>> import matplotlib.pyplot as plt
+        >>> ax = sse.plot_discrepancies_heatmap()
+        >>> ax.set_title("Sleep statistic discrepancies")
+        >>> plt.tight_layout()
 
     .. plot::
 
         >>> sse.plot_blandaltman()
     """
-    def __init__(self, data, reference, test, subject, statistic):
-        assert isinstance(data, pd.DataFrame), "`data` must be a pandas DataFrame"
-        for col in [reference, test, subject, statistic]:
-            assert isinstance(col, str) and col in data, f"`{col}` must be a string and a column in `data`"
-        assert data[subject].nunique() > 1, "`data` must include more than one subject"
-        data = data.copy()
-
-        # Get measurement difference between reference and test devices
-        data["difference"] = data[test].sub(data[reference])
-
-        # Check for sleep statistics that have no differences between measurement devices.
-        # This is most likely to occur with TIB but is possible with any, and will break some functions.
-        stats_nodiff = data.groupby(statistic)["difference"].any().loc[lambda x: ~x].index
+    def __init__(
+        self,
+        refr_data,
+        test_data,
+        *,
+        refr_name="Reference",
+        test_name="Test",
+        kwargs_normality={"alpha": 0.05},
+        kwargs_regression={"alpha": 0.05},
+        kwargs_homoscedasticity={"alpha": 0.05},
+    ):
+        assert isinstance(refr_data, pd.DataFrame), "`refr_data` must be a pandas DataFrame"
+        assert isinstance(test_data, pd.DataFrame), "`test_data` must be a pandas DataFrame"
+        assert np.array_equal(refr_data.index, test_data.index), "`refr_data` and `test_data` indices must be identical"
+        assert np.array_equal(refr_data.columns, test_data.columns), "`refr_data` and `test_data` columns must be identical"
+        assert refr_data.index.name == test_data.index.name, "`refr_data` and `test_data` index names must be identical"
+        assert isinstance(refr_name, str)
+        assert isinstance(test_name, str)
+        assert refr_name != test_name
+        assert isinstance(kwargs_normality, dict)
+        assert isinstance(kwargs_regression, dict)
+        assert isinstance(kwargs_homoscedasticity, dict)
+        assert "alpha" in kwargs_normality
+        assert "alpha" in kwargs_regression
+        assert "alpha" in kwargs_homoscedasticity
+
+        # Merge dataframes, get differences, and reshape wide-to-long format
+        subj_name = "subject" if refr_data.index.name is None else refr_data.index.name
+        refr_data.index.name = subj_name
+        test_data.index.name = subj_name
+        diff_data = pd.concat({"difference": test_data.sub(refr_data)}, names=["measurement"])
+        refr_data = pd.concat({refr_name: refr_data}, names=["measurement"])
+        test_data = pd.concat({test_name: test_data}, names=["measurement"])
+        data = (pd.concat([refr_data, test_data, diff_data])
+            .melt(var_name="sstat", ignore_index=False).reset_index()
+            .pivot(columns="measurement", index=[subj_name, "sstat"], values="value")
+            .reset_index().rename_axis(columns=None)
+        )
+
+        # Remove sleep statistics that have no differences between measurement systems
+        ## TODO: restructure?
+        stats_nodiff = data.groupby("sstat")["difference"].any().loc[lambda x: ~x].index.tolist()
+        data = data.query(f"~sstat.isin({stats_nodiff})")
         for s in stats_nodiff:
-            data = data.query(f"{statistic} != '{s}'")
             logger.warning(f"All {s} differences are zero, removing from evaluation.")
             ## Q: Should this be logged as just info?
 
-        # Get list of all statistics to be evaluated
-        self.all_sleepstats = data[statistic].unique()
-
-        # Save attributes
-        self.data = data
-        self.reference = reference
-        self.test = test
-        self.subject = subject
-        self.statistic = statistic
-
-        # Run tests
-        self.test_normality()
-        self.test_proportional_bias()
-        self.test_homoscedasticity()
+        ## NORMALITY ## Test reference data for normality at each sleep statistic
+        normality = data.groupby("sstat")[refr_name].apply(pg.normality, **kwargs_normality).droplevel(-1)
 
-    def test_normality(self):
-        """Test reference data for normality at each sleep statistic."""
-        normality = self.data.groupby(self.statistic)[self.reference].apply(pg.normality)
-        self.normality = normality.droplevel(-1)
-
-    def test_proportional_bias(self):
-        """Test each sleep statistic for proportional bias.
-        
-        For each statistic, regress the device difference score on the reference device score to get
-        proportional bias and residuals that will be used for the later homoscedasticity
-        calculation. Subject-level residuals for each statistic are added to ``data``.
-        """
+        ## PROPORTIONAL BIAS ## Test each sleep statistic for proportional bias
+        # Subject-level residuals for each statistic are added to data.
         prop_bias_results = []
         residuals_results = []
-        for ss, ss_df in self.data.groupby(self.statistic):
-            # Regress the difference score on the reference device
-            model = pg.linear_regression(ss_df[self.reference], ss_df["difference"])
-            model.insert(0, self.statistic, ss)
-            # Extract the subject-level residuals
-            resid = pd.DataFrame(
-                {
-                    self.subject: ss_df[self.subject],
-                    self.statistic: ss,
-                    "pbias_residual": model.residuals_
-                }
-            )
+        # Each regression yields the proportional bias and the residuals used for the later tests.
+        for ss_name, ss_df in data.groupby("sstat"):
+            # Regress the difference scores on the reference scores
+            model = pg.linear_regression(ss_df[refr_name], ss_df["difference"], **kwargs_regression)
+            model.insert(0, "sstat", ss_name)
+            # Extract subject-level residuals for later homoscedasticity tests
+            resid_dict = {subj_name: ss_df[subj_name], "sstat": ss_name, "pbias_residual": model.residuals_}
+            resid = pd.DataFrame(resid_dict)
             prop_bias_results.append(model)
             residuals_results.append(resid)
         # Add residuals to raw dataframe, used later when testing homoscedasticity
-        residuals = pd.concat(residuals_results)
-        self.data = self.data.merge(residuals, on=[self.subject, self.statistic])
+        data = data.merge(pd.concat(residuals_results), on=[subj_name, "sstat"])
         # Handle proportional bias results
         prop_bias = pd.concat(prop_bias_results)
         # Save all the proportional bias models before removing intercept, for optional user access
-        self.proportional_bias_models_ = prop_bias.reset_index(drop=True)
-        # Remove intercept rows
-        prop_bias = prop_bias.query("names != 'Intercept'").drop(columns="names")
+        prop_bias_full = prop_bias.reset_index(drop=True)
+        # Now remove intercept rows
+        prop_bias = prop_bias.query("names != 'Intercept'").drop(columns="names").set_index("sstat")
         # Add True/False passing column for easy access
-        prop_bias["unbiased"] = prop_bias["pval"].ge(0.05)
-        self.proportional_bias = prop_bias.set_index(self.statistic)
+        prop_bias["unbiased"] = prop_bias["pval"].ge(kwargs_regression["alpha"])
+
+        ## Test each statistic for homoscedasticity ##
+        columns = [refr_name, "difference", "pbias_residual"]
+        homoscedasticity_func = lambda df: pg.homoscedasticity(df[columns], **kwargs_homoscedasticity)
+        homoscedasticity = data.groupby("sstat").apply(homoscedasticity_func).droplevel(-1)
+
+        # Set attributes
+        self._data = data
+        self._normality = normality
+        self._proportional_bias = prop_bias
+        self._proportional_bias_full = prop_bias_full  # Q: Is this worth saving??
+        self._homoscedasticity = homoscedasticity
+        # These will not be set as properties, as they are only needed internally
+        self._refr_name = refr_name
+        self._test_name = test_name
+        self._subj_name = subj_name
+        self._n_subjects = data[subj_name].nunique()
+        # Pivot new to not include removed sstats
+        self._diff_data = data.pivot(index=self.subj_name, columns="sstat", values="difference")
+
+    @property
+    def data(self):
+        """
+        ``refr_data`` and ``test_data`` combined in a long-format :py:class:`pandas.DataFrame`.
+        Also includes difference scores (``test_data`` minus ``refr_data``).
+        """
+        return self._data
+
+    @property
+    def diff_data(self):
+        """A :py:class:`pandas.DataFrame` of ``test_data`` minus ``refr_data``."""
+        # # Pivot for subject-rows and statistic-columns
+        return self._diff_data
+
+    @property
+    def refr_name(self):
+        """The name of the reference measurement."""
+        return self._refr_name
+
+    @property
+    def test_name(self):
+        """The name of the test measurement."""
+        return self._test_name
+
+    @property
+    def subj_name(self):
+        """The name of the subject identifier."""
+        return self._subj_name
+
+    @property
+    def n_subjects(self):
+        """The number of subjects."""
+        return self._n_subjects
+
+    @property
+    def normality(self):
+        """A :py:class:`pandas.DataFrame` of normality test results for all sleep statistics."""
+        return self._normality
+
+    @property
+    def homoscedasticity(self):
+        """A :py:class:`pandas.DataFrame` of homoscedasticity test results for all sleep statistics."""
+        return self._homoscedasticity
+
+    @property
+    def proportional_bias(self):
+        """A :py:class:`pandas.DataFrame` of proportional bias test results for all sleep statistics."""
+        return self._proportional_bias
+
+    @property
+    def proportional_bias_full(self):
+        """A :py:class:`pandas.DataFrame` of all proportional bias model terms, intercepts included."""
+        return self._proportional_bias_full
+
+    def __repr__(self):
+        # TODO v0.8: Keep only the text between < and >
+        return (
+            f"<SleepStatsEvaluation | Test measurement '{self.test_name}' evaluated against "
+            f"reference measurement '{self.refr_name}', {self.n_subjects} subjects>\n"
+            " - Use `.summary()` to get pass/fail values from various checks\n"
+            " - Use `.plot_blandaltman()` to get a Bland-Altman-plot grid for sleep statistics\n"
+            "See the online documentation for more details."
+        )
 
-    def test_homoscedasticity(self, method="levene"):
-        """Test each statistic for homoscedasticity.
+    def __str__(self):
+        return (
+            f"<SleepStatsEvaluation | Test measurement '{self.test_name}' evaluated against "
+            f"reference measurement '{self.refr_name}', {self.n_subjects} subjects>\n"
+            " - Use `.summary()` to get pass/fail values from various checks\n"
+            " - Use `.plot_blandaltman()` to get a Bland-Altman-plot grid for sleep statistics\n"
+            "See the online documentation for more details."
+        )
 
-        The ``method`` argument is passed to :py:func:`pingouin.homoscedasticity`.
+    def summary(self, descriptives=True):
+        """Return a summary dataframe highlighting what statistics pass checks.
 
-        ..note:: ``self.test_proportional_bias()`` must be run first.
-        """
-        group = self.data.groupby(self.statistic)
-        columns = [self.reference, "difference", "pbias_residual"]
-        homoscedasticity = group.apply(lambda df: pg.homoscedasticity(df[columns], method=method))
-        self.homoscedasticity = homoscedasticity.droplevel(-1)
+        Parameters
+        ----------
+        self : :py:class:`SleepStatsEvaluation`
+            A :py:class:`SleepStatsEvaluation` instance.
+        descriptives : bool or dict
+            If True (default) or a dictionary, also include descriptive statistics for reference and
+            test measurements. If a dictionary, all key/value pairs are passed as keyword arguments
+            to the :py:meth:`pandas.DataFrame.agg` call.
 
-    def summary(self, descriptives=True):
-        """Return a summary dataframe highlighting what statistics pass checks."""
-        assert isinstance(descriptives, bool), "descriptives must be True or False"
+        Returns
+        -------
+        summary : :py:class:`pandas.DataFrame`
+            A :py:class:`pandas.DataFrame` with boolean values indicating the pass/fail status for
+            normality, proportional bias, and homoscedasticity tests (for each sleep statistic).
+        """
+        assert isinstance(descriptives, (bool, dict)), "`descriptives` must be True, False, or dict"
         series_list = [
             self.normality["normal"],
             self.proportional_bias["unbiased"],
@@ -364,147 +731,123 @@ def summary(self, descriptives=True):
         ]
         summary = pd.concat(series_list, axis=1)
         if descriptives:
-            group = self.data.drop(columns=self.subject).groupby(self.statistic)
-            desc = group.agg(["mean", "std"])
+            agg_kwargs = {"func": ["mean", "std"]}
+            if isinstance(descriptives, dict):
+                agg_kwargs.update(descriptives)
+            desc = self.data.drop(columns=self.subj_name).groupby("sstat").agg(**agg_kwargs)
             desc.columns = desc.columns.map("_".join)
             summary = summary.join(desc)
         return summary
 
-    def plot_discrepancies_heatmap(self, sstats_order=None, **kwargs):
+    def plot_discrepancies_heatmap(self, sleep_stats=None, **kwargs):
         """Visualize subject-level discrepancies, generally for outlier inspection.
 
         Parameters
         ----------
-        sstats_order : list
+        sleep_stats : list or None
             List of sleep statistics to plot. Default (None) is to plot all sleep statistics.
-        kwargs : dict
-            Other keyword arguments are passed through to :py:func:`seaborn.heatmap`.
+        **kwargs : key, value pairs
+            Additional keyword arguments are passed to the :py:func:`seaborn.heatmap` call.
 
         Returns
         -------
         ax : :py:class:`matplotlib.axes.Axes`
             Matplotlib Axes
         """
-        if sstats_order is None:
-            sstats_order = self.all_sleepstats
-        else:
-            assert isinstance(sstats_order, (list, type(None))), "`sstats_order` must be a list"
-
-        # Merge default heatmap arguments with optional input
-        heatmap_kwargs = dict(cmap="binary", annot=True, fmt=".1f", square=False)
+        assert isinstance(sleep_stats, (list, type(None))), "`sleep_stats` must be a list or None"
+        if sleep_stats is None:
+            sleep_stats = self.data["sstat"].unique()  # All available sleep statistics
+        heatmap_kwargs = {"cmap": "binary", "annot": True, "fmt": ".1f", "square": False}
+        heatmap_kwargs["cbar_kws"] = dict(label="Normalized discrepancy %")
+        if "cbar_kws" in kwargs:
+            heatmap_kwargs["cbar_kws"].update(kwargs["cbar_kws"])
         heatmap_kwargs.update(kwargs)
-        # Pivot for subject-rows and statistic-columns
-        table = self.data.pivot(
-            index=self.subject, columns=self.statistic, values="difference",
-        )
-        # Normalize statistics (i.e., columns) between zero and one
-        table_norm = table.sub(table.min(), axis=1).div(table.apply(np.ptp))
-        # If annotating, replace with raw values for writing.
+        table = self.diff_data[sleep_stats]
+        # Normalize statistics (i.e., columns) between zero and one then convert to percentage
+        table_norm = table.sub(table.min(), axis=1).div(table.apply(np.ptp)).multiply(100)
         if heatmap_kwargs["annot"]:
-            heatmap_kwargs["annot"] = table[sstats_order].to_numpy()
-        # Draw heatmap
-        ax = sns.heatmap(table_norm[sstats_order], **heatmap_kwargs)
-        return ax
+            # Use raw values for writing
+            heatmap_kwargs["annot"] = table.to_numpy()
+        return sns.heatmap(table_norm, **heatmap_kwargs)
 
-    def plot_discrepancies_dotplot(self, sstats_order=None, palette="winter", **kwargs):
+    def plot_discrepancies_dotplot(self, kwargs_pairgrid={"palette": "winter"}, **kwargs):
         """Visualize subject-level discrepancies, generally for outlier inspection.
 
         Parameters
         ----------
-        sstats_order : list
-            List of sleep statistics to plot. Default (None) is to plot all sleep statistics.
-        palette : string, list, dict, or :py:class:`matplotlib.colors.Colormap`
-            Color palette passed to :py:class:`seaborn.PairGrid`
-        kwargs : dict
-            Other keyword arguments are passed through to :py:func:`seaborn.stripplot`.
+        kwargs_pairgrid : dict
+            Keyword arguments passed to the :py:class:`seaborn.PairGrid` call.
+        **kwargs : key, value pairs
+            Additional keyword arguments are passed to the :py:func:`seaborn.stripplot` call.
 
         Returns
         -------
         g : :py:class:`seaborn.PairGrid`
-            Seaborn PairGrid
-        """
-        if sstats_order is None:
-            sstats_order = self.all_sleepstats
-        else:
-            assert isinstance(sstats_order, (list, type(None))), "`sstats_order` must be a list"
+            A :py:class:`seaborn.PairGrid` with sleep statistics dotplots on each axis.
 
-        # Merge default stripplot arguments with optional input
-        stripplot_kwargs = dict(size=10, linewidth=1, edgecolor="white")
-        stripplot_kwargs.update(kwargs)
-
-        # Pivot data to get subject-rows and statistic-columns
-        table = self.data.pivot(index=self.subject, columns=self.statistic, values="difference")
+        Examples
+        --------
+        To plot a limited subset of sleep statistics, use the ``x_vars`` keyword argument of
+        :py:class:`seaborn.PairGrid`.
 
+        .. plot::
+            ## TODO: Example using x_vars
+        """
+        assert isinstance(kwargs_pairgrid, dict), "`kwargs_pairgrid` must be a dict"
+        stripplot_kwargs = {"size": 10, "linewidth": 1, "edgecolor": "white"}
+        stripplot_kwargs.update(kwargs)
         # Initialize the PairGrid
-        height = 0.3 * len(table)
+        height = 0.3 * len(self.diff_data)
         aspect = 0.6
-        g = sns.PairGrid(
-            table.reset_index(),
-            x_vars=sstats_order,
-            y_vars=[self.subject],
-            hue=self.subject,
-            palette=palette,
-            height=height,
-            aspect=aspect,
-        )
+        pairgrid_kwargs = dict(hue=self.subj_name, height=height, aspect=aspect)
+        pairgrid_kwargs.update(kwargs_pairgrid)
+        g = sns.PairGrid(self.diff_data.reset_index(), y_vars=[self.subj_name], **pairgrid_kwargs)
         # Draw the dots
         g.map(sns.stripplot, orient="h", jitter=False, **stripplot_kwargs)
-
         # Adjust aesthetics
-        g.set(xlabel="", ylabel="")
-        for ax, title in zip(g.axes.flat, sstats_order):
-            ax.set(title=title)
+        for ax in g.axes.flat:
+            ax.set(title=ax.get_xlabel())
             ax.margins(x=0.3)
             ax.yaxis.grid(True)
             ax.tick_params(left=False)
+        g.set(xlabel="", ylabel="")
         sns.despine(left=True, bottom=True)
-
         return g
 
-    def plot_blandaltman(self, sstats_order=None, facet_kwargs={}, **kwargs):
+    def plot_blandaltman(self, kwargs_facetgrid={}, **kwargs):
         """
+
+        Use ``col_order`` in ``kwargs_facetgrid`` to plot a subset of sleep statistics.
+
         Parameters
         ----------
-        sstats_order : list or None
-            List of sleep statistics to plot. Default (None) is to plot all sleep statistics.
-        facet_kwargs : dict
-            Other keyword arguments are passed through to :py:class:`seaborn.FacetGrid`.
-        kwargs : dict
-            Other keyword arguments are passed through to :py:func:`pingouin.plot_blandaltman`.
+        kwargs_facetgrid : dict
+            Keyword arguments passed to the :py:class:`seaborn.FacetGrid` call.
+        **kwargs : key, value pairs
+            Additional keyword arguments are passed to :py:func:`pingouin.plot_blandaltman`.
 
         Returns
         -------
         g : :py:class:`seaborn.FacetGrid`
-            Seaborn FacetGrid
+            A :py:class:`seaborn.FacetGrid` with sleep statistics Bland-Altman plots on each axis.
         """
-        if sstats_order is None:
-            sstats_order = self.all_sleepstats
-        else:
-            assert isinstance(sstats_order, (list, type(None))), "`sstats_order` must be a list"
-
-        # Select scatterplot arguments (passed to blandaltman) and update with optional input
+        facetgrid_kwargs = dict(col_wrap=4, height=2, aspect=1, sharex=False, sharey=False)
+        facetgrid_kwargs.update(kwargs_facetgrid)
         blandaltman_kwargs = dict(xaxis="y", annotate=False, edgecolor="black", facecolor="none")
         blandaltman_kwargs.update(kwargs)
-        # Select FacetGrid arguments and update with optional input
-        col_wrap = 4 if len(sstats_order) > 4 else None
-        facetgrid_kwargs = dict(col_wrap=col_wrap, height=2, aspect=1, sharex=False, sharey=False)
-        facetgrid_kwargs.update(facet_kwargs)
-
         # Initialize a grid of plots with an Axes for each sleep statistic
-        g = sns.FacetGrid(self.data, col=self.statistic, col_order=sstats_order, **facetgrid_kwargs)
-        # Draw Bland-Altman on each axis
-        g.map(pg.plot_blandaltman, self.test, self.reference, **blandaltman_kwargs)
-
-        # Tidy-up axis limits with symmetric y-axis and minimal ticks
+        g = sns.FacetGrid(self.data, col="sstat", **facetgrid_kwargs)
+        # Draw Bland-Altman plot on each axis
+        g.map(pg.plot_blandaltman, self.test_name, self.refr_name, **blandaltman_kwargs)
+        # Adjust aesthetics
         for ax in g.axes.flat:
+            # Tidy-up axis limits with symmetric y-axis and minimal ticks
             bound = max(map(abs, ax.get_ylim()))
             ax.set_ylim(-bound, bound)
             ax.yaxis.set_major_locator(plt.MaxNLocator(nbins=2, integer=True, symmetric=True))
             ax.xaxis.set_major_locator(plt.MaxNLocator(nbins=1, integer=True))
-        # More aesthetics
-        ylabel = " - ".join((self.test, self.reference))
+        ylabel = " - ".join((self.test_name, self.refr_name))
         g.set_ylabels(ylabel)
         g.set_titles(col_template="{col_name}")
         g.tight_layout(w_pad=1, h_pad=2)
-
         return g
diff --git a/yasa/plotting.py b/yasa/plotting.py
index d8c6b05..1eb0003 100644
--- a/yasa/plotting.py
+++ b/yasa/plotting.py
@@ -13,7 +13,7 @@
 __all__ = ["plot_hypnogram", "plot_spectrogram", "topoplot"]
 
 
-def plot_hypnogram(hyp, lw=1.5, highlight="REM", fill_color=None, ax=None):
+def plot_hypnogram(hyp, highlight="REM", fill_color=None, ax=None, **kwargs):
     """
     Plot a hypnogram.
 
@@ -23,14 +23,15 @@ def plot_hypnogram(hyp, lw=1.5, highlight="REM", fill_color=None, ax=None):
     ----------
     hyp : :py:class:`yasa.Hypnogram`
         A YASA hypnogram instance.
-    lw : float
-        Linewidth.
     highlight : str or None
         Optional stage to highlight with alternate color.
     fill_color : str or None
         Optional color to fill space above hypnogram line.
     ax : :py:class:`matplotlib.axes.Axes`
         Axis on which to draw the plot, optional.
+    **kwargs : dict
+        Keyword arguments controlling hypnogram line display (e.g., ``linewidth``, ``linestyle``).
+        Passed to :py:func:`matplotlib.pyplot.stairs` and :py:func:`matplotlib.pyplot.hlines`.
 
     Returns
     -------
@@ -76,20 +77,25 @@ def plot_hypnogram(hyp, lw=1.5, highlight="REM", fill_color=None, ax=None):
     old_fontsize = plt.rcParams["font.size"]
     plt.rcParams.update({"font.size": 18})
 
+    # Open the figure
+    if ax is None:
+        ax = plt.gca()
+
     ## Remap stages to be in desired y-axis order ##
     # Start with default of all allowed labels
     stage_order = hyp.labels.copy()
-    stages_present = hyp.hypno.unique()
-    # Remove Art/Uns from stage order, and place back individually at front to be higher on plot
-    art_str = stage_order.pop(stage_order.index("ART"))
-    uns_str = stage_order.pop(stage_order.index("UNS"))
-    if "ART" in stages_present:
-        stage_order.insert(0, art_str)
-    if "UNS" in stages_present:
-        stage_order.insert(0, uns_str)
+    stages_present = hyp.hypno.unique().tolist()
+    # Reverse order so WAKE is highest, and exclude ART/UNS which are always last
+    stage_order = stage_order[:-2][::-1]
+    # Add ART/UNS back above WAKE if they're present in the current hypnogram or existing axis
+    gca_ylabels = [x.get_text() for x in ax.get_yticklabels()]
+    if "ART" in stages_present or "ART" in gca_ylabels:
+        stage_order += ["ART"]
+    if "UNS" in stages_present or "UNS" in gca_ylabels:
+        stage_order += ["UNS"]
     # Put REM after WAKE if all 5 standard stages are allowed
     if hyp.n_stages == 5:
-        stage_order.insert(stage_order.index("WAKE") + 1, stage_order.pop(stage_order.index("REM")))
+        stage_order.insert(stage_order.index("WAKE") - 1, stage_order.pop(stage_order.index("REM")))
     # Reset the Hypnogram mapping so any future returns have this order
     hyp.mapping = {stage: i for i, stage in enumerate(stage_order)}
 
@@ -113,18 +119,17 @@ def plot_hypnogram(hyp, lw=1.5, highlight="REM", fill_color=None, ax=None):
     # Make mask to draw the highlighted stage
     yvals_highlight = np.ma.masked_not_equal(yvalues, hyp.mapping.get(highlight))
 
-    # Open the figure
-    if ax is None:
-        ax = plt.gca()
-
     # Draw background filling
     if fill_color is not None:
-        bline = hyp.mapping["WAKE"]  # len(stage_order) - 1 to fill from bottom
-        ax.stairs(yvalues.clip(bline), bins, baseline=bline, color=fill_color, fill=True, lw=0)
-    # Draw main hypnogram line, highlighted stage line, and Artefact/Unscored line
-    ax.stairs(yvalues, bins, baseline=None, color="black", lw=lw)
+        bline = hyp.mapping["WAKE"]
+        ax.stairs(yvalues.clip(max=bline), bins, baseline=bline, color=fill_color, fill=True, lw=0)
+    # Draw main hypnogram line and highlighted stage line
+    line_kwargs = {"color": "black", "linewidth": 1.5, "label": hyp.scorer}
+    line_kwargs.update(kwargs)
+    ax.stairs(yvalues, bins, baseline=None, **line_kwargs)
     if not yvals_highlight.mask.all():
-        ax.hlines(yvals_highlight, xmin=bins[:-1], xmax=bins[1:], color="red", lw=lw)
+        line_kwargs.update({"color": "red", "label": None})
+        ax.hlines(yvals_highlight, xmin=bins[:-1], xmax=bins[1:], **line_kwargs)
 
     # Aesthetics
     ax.use_sticky_edges = False
@@ -133,7 +138,6 @@ def plot_hypnogram(hyp, lw=1.5, highlight="REM", fill_color=None, ax=None):
     ax.set_yticklabels(stage_order)
     ax.set_ylabel("Stage")
     ax.set_xlabel(xlabel)
-    ax.invert_yaxis()
     ax.spines[["right", "top"]].set_visible(False)
     if hyp.start is not None:
         ax.xaxis.set_major_formatter(mdates.DateFormatter("%H:%M"))

From b47385b3dab34a16044e6989fdfcc904c9214e8b Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Tue, 3 Jan 2023 20:46:12 -0600
Subject: [PATCH 34/43] major restructure of attributes/methods and scores
 calculations

cleanup
---
 yasa/evaluation.py | 729 ++++++++++++++++++++++++++++-----------------
 yasa/hypno.py      |  13 +-
 2 files changed, 459 insertions(+), 283 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index e8a0cff..35273a4 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -1,5 +1,5 @@
 """
-YASA code for evaluating the agreement between two sleep-measurement systems.
+YASA code for evaluating the agreement between two scorers.
 
 There are two levels of evaluating staging performance:
 - Comparing two hypnograms (e.g., human vs automated scorer)
@@ -16,7 +16,7 @@
 import numpy as np
 import pandas as pd
 import pingouin as pg
-from sklearn import metrics
+import sklearn.metrics as skm
 
 import seaborn as sns
 import matplotlib.pyplot as plt
@@ -39,14 +39,14 @@
 
 class EpochByEpochEvaluation:
     """
-    See :py:meth:`yasa.Hypnogram.evaluate`
+    For comparing only 2 hypnograms, use :py:meth:`yasa.Hypnogram.evaluate`.
 
     Parameters
     ----------
-    refr_hyp : :py:class:`yasa.Hypnogram`
-        The reference or ground-truth hypnogram.
-    test_hyp : :py:class:`yasa.Hypnogram`
-        The test or to-be-evaluated hypnogram.
+    refr_hyps : :py:class:`yasa.Hypnogram`
+        A collection of reference or ground-truth hypnograms.
+    test_hyps : :py:class:`yasa.Hypnogram`
+        A collection of test or to-be-evaluated hypnograms.
 
     Notes
     -----
@@ -63,20 +63,9 @@ class EpochByEpochEvaluation:
     Examples
     --------
     >>> import yasa
-    >>> hypno_a = yasa.simulate_hypnogram(tib=90, seed=8, scorer="RaterA")
-    >>> hypno_b = yasa.simulate_hypnogram(tib=90, seed=9, scorer="RaterB")
-    >>> ebe = yasa.EpochByEpochEvaluation(hypno_a, hypno_b)  # or hypno_a.evaluate(hypno_b)
-    >>> ebe.get_confusion_matrix()
-    RaterB  WAKE   N1   N2  N3  REM  ART  UNS  Total
-    RaterA
-    WAKE      52   38  126  23   51    0    0    290
-    N1        59    2   27   8   14    0    0    110
-    N2       117   50  105  15   44    0    0    331
-    N3        34   26   62  42   15    0    0    179
-    REM       15   12   13  10    0    0    0     50
-    ART        0    0    0   0    0    0    0      0
-    UNS        0    0    0   0    0    0    0      0
-    Total    277  128  333  98  124    0    0    960
+    >>> hyps_a = [yasa.simulate_hypnogram(tib=600, scorer="RaterA", seed=i) for i in range(20)]
+    >>> hyps_b = [h.simulate_similar(scorer="RaterB", seed=i) for i, h in enumerate(hyps_a)]
+    >>> ebe = yasa.EpochByEpochEvaluation(hyps_a, hyps_b)
 
     >>> ebe.get_agreement().round(3)
     metric
@@ -122,251 +111,417 @@ class EpochByEpochEvaluation:
         >>>
         >>> acc = ebe.get_agreement().multiply(100).round(0).at["accuracy"]
         >>> ax.text(0.01, 1, f"Accuracy = {acc}%", ha="left", va="bottom", transform=ax.transAxes)
+
+    When comparing only 2 hypnograms, use the :py:meth:`yasa.Hypnogram.evaluate` method:
+
+    >>> hypno_a = yasa.simulate_hypnogram(tib=90, scorer="RaterA", seed=8)
+    >>> hypno_b = hypno_a.simulate_similar(scorer="RaterB", seed=9)
+    >>> ebe = hypno_a.evaluate(hypno_b)
+
+    >>> ebe.get_confusion_matrix()
+    RaterB  WAKE   N1   N2  N3  REM  ART  UNS  Total
+    RaterA
+    WAKE      52   38  126  23   51    0    0    290
+    N1        59    2   27   8   14    0    0    110
+    N2       117   50  105  15   44    0    0    331
+    N3        34   26   62  42   15    0    0    179
+    REM       15   12   13  10    0    0    0     50
+    ART        0    0    0   0    0    0    0      0
+    UNS        0    0    0   0    0    0    0      0
+    Total    277  128  333  98  124    0    0    960
     """
     def __init__(self, refr_hyps, test_hyps):
-        from yasa.hypno import Hypnogram  # Loading here to avoid circular import
+        from yasa.hypno import Hypnogram  # Avoiding circular import
 
-        assert isinstance(refr_hyps, Hypnogram) or hasattr(refr_hyps, "__iter__"), (
-            "`refr_hyps` must be a YASA hypnogram or iterable containing multiple YASA hypnograms"
-        )
-        assert isinstance(test_hyps, Hypnogram) or hasattr(test_hyps, "__iter__"), (
-            "`test_hyps` must be a YASA hypnogram or iterable containing multiple YASA hypnograms"
-        )
+        assert hasattr(refr_hyps, "__iter__"), "`refr_hyps` must be a an iterable"
+        assert hasattr(test_hyps, "__iter__"), "`test_hyps` must be a an iterable"
         assert type(refr_hyps) == type(test_hyps), "`refr_hyps` and `test_hyps` must be same type"
-
-        # Convert solo hypnograms to len==1 tuples
-        if isinstance(refr_hyps, Hypnogram):  # As below, picking refr_hyps for checks arbitrarily
-            refr_hyps = [refr_hyps]
-            test_hyps = [test_hyps]
-        assert len(refr_hyps) == len(test_hyps), "must have same number of subjects"
+        assert len(refr_hyps) == len(test_hyps), (
+            "`refr_hyps` and `test_hyps` must have the same number of hypnograms"
+        )
 
         if isinstance(refr_hyps, dict):
-            assert refr_hyps.keys() == test_hyps.keys(), "must have same subject identifiers and in same order"
-            subjects, refr_hyps = zip(*refr_hyps.items())
-            # assert all(isinstance(s, str) for s in subjects)
+            # If user provides dictionaries, split into sleep IDs and hypnograms
+            assert refr_hyps.keys() == test_hyps.keys(), (
+                "hypnograms in `refr_hyps` and `test_hyps` must have identical sleep IDs"
+            )
+            sleep_ids, refr_hyps = zip(*refr_hyps.items())
             test_hyps = tuple(test_hyps.values())
         else:
-            subjects = 1 + np.arange(len(refr_hyps))
+            # Create hypnogram_ids
+            sleep_ids = tuple(range(1, 1 + len(refr_hyps)))
 
-        all_hyps = refr_hyps + test_hyps
-        assert all(isinstance(hyp, Hypnogram) for hyp in all_hyps), "`refr_hyps` and `test_hyps` must only include hypnograms"
-        assert all(h.scorer is not None for h in all_hyps), "all hypnograms must have a scorer"
-        for h1, h2 in zip(all_hyps[:-1], all_hyps[1:]):
-            assert h1.n_stages == h2.n_stages, "all hypnograms must have the same n_stages"
+        assert all(isinstance(hyp, Hypnogram) for hyp in refr_hyps + test_hyps), (
+            "`refr_hyps` and `test_hyps` must only include YASA hypnograms"
+        )
+        assert all(h.scorer is not None for h in refr_hyps + test_hyps), (
+            "all hypnograms must have a scorer name"
+        )
+        for h1, h2 in zip((refr_hyps + test_hyps)[:-1], (refr_hyps + test_hyps)[1:]):
             assert h1.labels == h2.labels, "all hypnograms must have the same labels"
             assert h1.mapping == h2.mapping, "all hypnograms must have the same mapping"
-        assert all(h1.scorer == h2.scorer for h1, h2 in zip(refr_hyps[:-1], refr_hyps[1:])), "all `refr_hyps` must have the same scorer"
-        assert all(h1.scorer == h2.scorer for h1, h2 in zip(test_hyps[:-1], test_hyps[1:])), "all `test_hyps` must have the same scorer"
-        assert all(h1.scorer != h2.scorer for h1, h2 in zip(refr_hyps, test_hyps)), "each `refr_hyps` and `test_hyps` pair must have unique scorers"
-        assert all(h1.n_epochs == h2.n_epochs for h1, h2 in zip(refr_hyps, test_hyps)), "each `refr_hyps` and `test_hyps` pair must have the same n_epochs"
-        ## Could use set() for those above
-        ## Or set scorer as the first available and check all equal
+            assert h1.n_stages == h2.n_stages, "all hypnograms must have the same n_stages"
+        assert all(h1.scorer == h2.scorer for h1, h2 in zip(refr_hyps[:-1], refr_hyps[1:])), (
+            "all `refr_hyps` must have the same scorer"
+        )
+        assert all(h1.scorer == h2.scorer for h1, h2 in zip(test_hyps[:-1], test_hyps[1:])), (
+            "all `test_hyps` must have the same scorer"
+        )
+        assert all(h1.scorer != h2.scorer for h1, h2 in zip(refr_hyps, test_hyps)), (
+            "each `refr_hyps` and `test_hyps` pair must have unique scorers"
+        )
+        assert all(h1.n_epochs == h2.n_epochs for h1, h2 in zip(refr_hyps, test_hyps)), (
+            "each `refr_hyps` and `test_hyps` pair must have the same n_epochs"
+        )
+        ## Q: Could use set() for those above.
+        ##    Or set scorer as the first available and check all equal.
 
-        # Convert to dictionaries with subjects and hypnograms
-        refr_hyps = { s: h for s, h in zip(subjects, refr_hyps) }
-        test_hyps = { s: h for s, h in zip(subjects, test_hyps) }
+        # Convert to dictionaries with sleep_ids and hypnograms
+        refr_hyps = { s: h for s, h in zip(sleep_ids, refr_hyps) }
+        test_hyps = { s: h for s, h in zip(sleep_ids, test_hyps) }
 
         # Merge all hypnograms into a single multiindexed dataframe
-        refr = pd.concat(pd.concat({s: h.hypno}, names=["subject"]) for s, h in refr_hyps.items())
-        test = pd.concat(pd.concat({s: h.hypno}, names=["subject"]) for s, h in test_hyps.items())
+        refr = pd.concat(pd.concat({s: h.hypno}, names=["sleep_id"]) for s, h in refr_hyps.items())
+        test = pd.concat(pd.concat({s: h.hypno}, names=["sleep_id"]) for s, h in test_hyps.items())
         data = pd.concat([refr, test], axis=1)
 
-        # Get summary sleep statistics for each measurement.
-        refr_sstats = pd.Series(refr_hyps).map(lambda h: h.sleep_statistics()).apply(pd.Series)
-        test_sstats = pd.Series(test_hyps).map(lambda h: h.sleep_statistics()).apply(pd.Series)
-        refr_sstats = refr_sstats.set_index(pd.Index(subjects, name="subject"))
-        test_sstats = test_sstats.set_index(pd.Index(subjects, name="subject"))
-        # sse = yasa.SleepStatsEvaluation(refr_sstats, test_sstats)
-        
+        ########################################################################
+        # INDIVIDUAL-LEVEL AGREEMENT
+        ########################################################################
+
+        # Get individual-level averaged/weighted agreement scores
+        indiv_agree_avg = data.groupby(level=0).apply(self.multi_scorer_avg).apply(pd.Series)
+        ## Q: Check speed against pd.DataFrame({s: multscore(hyps[s], hyps[s]) for s in subjects})
+
+        # Get individual-level one-vs-rest/un-weighted agreement scores
+        # Only include stages that appear in the data
+        # labels = data[refr_scorer].cat.remove_unused_categories().cat.categories
+        labels = [l for l in refr_hyps[sleep_ids[0]].hypno.cat.categories if l in data.values]
+        ############ OPTION 1 (uses staticmethod, slower by 500ms)
+        indiv_agree_ovr = (data
+            # Get multiple metrics for each individual sleep
+            .groupby(level=0).apply(self.multi_scorer_ovr, labels=labels)
+            # Unpack metrics results and reshape
+            .apply(pd.Series).stack().apply(pd.Series)
+            # Convert stages to string labels
+            .rename_axis(columns="stage").rename(columns={i: l for i, l in enumerate(labels)})
+            # Reshape so metrics are columns
+            .stack().unstack(level=1)
+            # Swap MultiIndex levels and sort so stages drive the view
+            .swaplevel().sort_index(level="stage", key=lambda x: x.map(lambda y: labels.index(y)))
+        )
+        # ############ OPTION 2 (does NOT use staticmethod, faster by 500ms)
+        # prfs_func = lambda df: skm.precision_recall_fscore_support(
+        #     *df.values.T, labels=labels, average=None, zero_division=0
+        # )
+        # indiv_agree_ovr = (data
+        #     .groupby(level=0).apply(prfs_func)
+        #     .explode().apply(pd.Series)
+        #     .assign(metric=["precision", "recall", "f1", "support"] * len(refr_hyps))
+        #     .set_index("metric", append=True)
+        #     .rename_axis(columns="stage").rename(columns={i: l for i, l in enumerate(labels)})
+        #     .stack().unstack("metric").rename_axis(columns=None)
+        # )
+        ## Q: Currently both options will leave some all-zero rows, for when a stage is present
+        ##    in some subjects but not others. Prefer to remove?
+        # agr = agr.loc[agr.any(axis=1)]  # or .pipe
+        # And then could drop the label restriction, just passing all labels to preserve order
+
         # Set attributes
         self._data = data
-        self._subjects = subjects
-        self._n_subjects = len(subjects)
+        self._sleep_ids = sleep_ids
+        self._n_sleeps = len(sleep_ids)
         self._refr_hyps = refr_hyps
         self._test_hyps = test_hyps
-        self._refr_sstats = refr_sstats
-        self._test_sstats = test_sstats
-        self._refr_name = refr_hyps[subjects[0]].scorer
-        self._test_name = test_hyps[subjects[0]].scorer
-        self._n_stages = refr_hyps[subjects[0]].n_stages
-        self._labels = refr_hyps[subjects[0]].labels
+        self._refr_scorer = refr_hyps[sleep_ids[0]].scorer
+        self._test_scorer = test_hyps[sleep_ids[0]].scorer
+        self._labels = refr_hyps[sleep_ids[0]].labels
+        self._indiv_agree_avg = indiv_agree_avg
+        self._indiv_agree_ovr = indiv_agree_ovr
+        ## Q: Merge these to one individual agreement dataframe?
 
     def __repr__(self):
         # TODO v0.8: Keep only the text between < and >
-        text_subjects = f", {self.n_subjects} subject" + ("s" if self.n_subjects > 1 else "")
+        s = "s" if self._n_sleeps > 1 else ""
         return (
-            f"<EpochByEpochEvaluation | Test Hypnogram scored by {self.refr_name} evaluated "
-            f"against reference Hypnogram scored by {self.test_name}{text_subjects}>\n"
+            f"<EpochByEpochEvaluation | Test hypnogram{s} scored by {self.test_scorer} evaluated "
+            f"against reference hypnogram{s} scored by {self.refr_scorer}, {self._n_sleeps} sleep"
+            f" session{s}>\n"
             " - Use `.get_agreement()` to get agreement measures as a pandas.Series\n"
             " - Use `.plot_hypnograms()` to plot the two hypnograms overlaid\n"
             "See the online documentation for more details."
         )
 
     def __str__(self):
-        text_subjects = f", {self.n_subjects} subject" + ("s" if self.n_subjects > 1 else "")
-        return (
-            f"<EpochByEpochEvaluation | Test Hypnogram scored by {self.refr_name} evaluated "
-            f"against reference Hypnogram scored by {self.test_name}{text_subjects}>\n"
-            " - Use `.get_agreement()` to get agreement measures as a pandas.Series\n"
-            " - Use `.plot_hypnograms()` to plot the two hypnograms overlaid\n"
-            "See the online documentation for more details."
-        )
+        return self.__repr__()
 
     @property
     def data(self):
+        """A :py:class:`pandas.DataFrame` including all hypnograms."""
         return self._data
 
-    @property
-    def refr_sstats(self):
-        return self._refr_sstats
-
-    @property
-    def test_sstats(self):
-        return self._test_sstats
-
     @property
     def refr_hyps(self):
-        """The reference Hypnograms."""
-        ## Q: Starting to think there should be a clear convention on what we mean
-        ##    when we say "hypnogram". Should hypnogram mean the Series and Hypnogram
-        ##    mean the YASA object? Similarly for hypno/hyp.
+        """A dictionary of all reference YASA hypnograms with sleep IDs as keys."""
         return self._refr_hyps
 
     @property
     def test_hyps(self):
-        """The test Hypnograms."""
+        """A dictionary of all test YASA hypnograms with sleep IDs as keys."""
         return self._test_hyps
 
     @property
-    def subjects(self):
-        return self._subjects
+    def sleep_ids(self):
+        """A tuple of all sleep IDs."""
+        return self._sleep_ids
 
     @property
-    def n_subjects(self):
-        return self._n_subjects
+    def n_sleeps(self):
+        """The number of unique sleep sessions."""
+        return self._n_sleeps
 
     @property
-    def refr_name(self):
-        """The name of the reference measurement."""
-        return self._refr_name
+    def refr_scorer(self):
+        """The name of the reference scorer."""
+        return self._refr_scorer
 
     @property
-    def test_name(self):
-        """The name of the test measurement."""
-        return self._test_name
+    def test_scorer(self):
+        """The name of the test scorer."""
+        return self._test_scorer
 
     @property
     def labels(self):
+        """All available sleep stage labels."""
         return self._labels
 
     @property
-    def n_stages(self):
-        return self._n_stages
+    def indiv_agree_avg(self):
+        """
+        A :py:class:`pandas.DataFrame` of ``refr_hyp``/``test_hyp`` average-based agreement scores
+        for each individual sleep session.
+
+        .. seealso:: :py:attr:`yasa.EpochByEpochEvaluation.indiv_agree_ovr`
+        """
+        return self._indiv_agree_avg
+
+    @property
+    def indiv_agree_ovr(self):
+        """
+        A :py:class:`pandas.DataFrame` of ``refr_hyp``/``test_hyp`` one-vs-rest agreement scores
+        for each individual sleep session. Agreement scores are provided for each sleep stage.
 
-    def get_agreement(self, subject=None):
+        .. seealso:: :py:attr:`yasa.EpochByEpochEvaluation.indiv_agree_avg`
         """
-        Return a dataframe of ``refr_hyp``/``test_hyp`` performance across all stages as measured by
-        common classifier agreement methods.
+        return self._indiv_agree_ovr
 
-        .. seealso:: :py:meth:`yasa.EpochByEpochResults.get_agreement_by_stage`
-        ## Q: Are there better names to differentiate get_agreement vs get_agreement_by_stage?
-        ##    Maybe should be binary vs multiclass?
+    @staticmethod
+    def multi_scorer_avg(df):
+        """Compute multiple agreement scores from a 2-column dataframe.
+
+        This function offers convenience when calculating multiple agreement scores using
+        :py:meth:`pandas.DataFrame.groupby.apply`. Scikit-learn doesn't include a function that
+        returns multiple scores, and the GroupBy implementation of ``apply`` in pandas does not
+        accept multiple functions.
 
         Parameters
         ----------
-        self : :py:class:`yasa.EpochByEpochEvaluation`
-            A :py:class:`yasa.EpochByEpochEvaluation` instance.
-        subject : None or a unique subject identifier.
-            Subject identifiers are based on user input, and integers starting from 1 if not provided.
+        df : :py:class:`pandas.DataFrame`
+            A :py:class:`pandas.DataFrame` with exactly 2 columns and length of *n_samples*.
+            The first column contains true values and second column contains predicted values.
 
         Returns
         -------
-        agreement : :py:class:`pandas.Series`
-            A :py:class:`pandas.Series` with agreement metrics as indices.
+        scores : dict
+            A dictionary with scorer names (``str``) as keys and scores (``float``) as values.
         """
-        true = self.data[self.refr_name]
-        pred = self.data[self.test_name]
-        if subject is not None:
-            true = pred.loc[subject]
-            pred = pred.loc[subject]
-        accuracy = metrics.accuracy_score(true, pred)
-        kappa = metrics.cohen_kappa_score(true, pred)
-        jaccard = metrics.jaccard_score(true, pred, average="weighted")
-        precision = metrics.precision_score(true, pred, average="weighted", zero_division=0)
-        recall = metrics.recall_score(true, pred, average="weighted", zero_division=0)
-        f1 = metrics.f1_score(true, pred, average="weighted", zero_division=0)
-        scores = {
-            "accuracy": accuracy,
-            "kappa": kappa,
-            "weighted_jaccard": jaccard,
-            "weighted_precision": precision,
-            "weighted_recall": recall,
-            "weighted_f1": f1,
+        true, pred = zip(*df.values)  # Same as (df["col1"], df["col2"]) but teensy bit faster
+        ## Q: The dictionary below could be compiled more concisely if we were comfortable accessing
+        ##    "private" attributes. I understand that's a no-no but I'm not exactly sure why.
+        ##     For example:
+        ##     >>> scorers = ["accuracy", "recall"]
+        ##     >>> funcs = { s: skm.__getattribute__(f"{s}_scorer") for s in scorers }
+        ##     >>> scores = { s: f(true, pred) for s, f in funcs.items() }
+        ##     Keywords could be applied as needed by checking f.__kwdefaults__
+        ##     This would offer an easy way for users to add their own scorers with an arg as well.
+        return {
+            "accuracy": skm.accuracy_score(true, pred),
+            "kappa": skm.cohen_kappa_score(true, pred),
+            "jaccard_micro": skm.jaccard_score(true, pred, average="micro"),
+            "jaccard_macro": skm.jaccard_score(true, pred, average="macro"),
+            "jaccard_weighted": skm.jaccard_score(true, pred, average="weighted"),
+            "precision_micro": skm.precision_score(true, pred, average="micro", zero_division=0),
+            "precision_macro": skm.precision_score(true, pred, average="macro", zero_division=0),
+            "precision_weighted": skm.precision_score(
+                true, pred, average="weighted", zero_division=0
+            ),
+            "recall_micro": skm.recall_score(true, pred, average="micro", zero_division=0),
+            "recall_macro": skm.recall_score(true, pred, average="macro", zero_division=0),
+            "recall_weighted": skm.recall_score(true, pred, average="weighted", zero_division=0),
+            "f1_micro": skm.f1_score(true, pred, average="micro", zero_division=0),
+            "f1_macro": skm.f1_score(true, pred, average="macro", zero_division=0),
+            "f1_weighted": skm.f1_score(true, pred, average="weighted", zero_division=0),
         }
-        agreement = pd.Series(scores, name="agreement").rename_axis("metric")
-        return agreement
 
-    def get_agreement_by_stage(self, subject=None):
+    @staticmethod
+    def multi_scorer_ovr(df, labels):
+        """Compute multiple one-vs-rest agreement scores from a 2-column dataframe.
+
+        Parameters
+        ----------
+        df : :py:class:`pandas.DataFrame`
+            A :py:class:`pandas.DataFrame` with exactly 2 columns and length of *n_samples*.
+            The first column contains true values and second column contains predicted values.
+        labels : array-like
+            The labels to include in scoring and control the order of returned scores.
+
+        Returns
+        -------
+        scores : dict
+            A dictionary with scorer names (``str``) as keys and scores (``np.ndarray``) as values.
         """
-        Return a dataframe of ``refr_hyp``/``test_hyp`` performance for each stage as measured by
-        common classifier agreement methods.
+        true, pred = zip(*df.values)
+        return {
+            "precision": skm.precision_score(true, pred, labels=labels, average=None, zero_division=0),
+            "recall": skm.recall_score(true, pred, labels=labels, average=None, zero_division=0),
+            "f1": skm.f1_score(true, pred, labels=labels, average=None, zero_division=0),
+            "support": pd.Series(true).value_counts().reindex(labels, fill_value=0).to_numpy(),
+        }
 
-        .. seealso:: :py:meth:`yasa.EpochByEpochResults.get_agreement`
+    def summary(self, by_stage=False, **kwargs):
+        """Return group-level agreement scores.
+
+        Parameters
+        ----------
+        self : :py:class:`yasa.EpochByEpochEvaluation`
+            A :py:class:`yasa.EpochByEpochEvaluation` instance.
+        by_stage : bool
+            If True, returned ``summary`` :py:class:`pandas.DataFrame` will include agreement scores
+            for each sleep stage, derived from one-vs-rest metrics. If False (default), ``summary``
+            will include agreement scores derived from average-based metrics.
+        **kwargs : key, value pairs
+            Additional keyword arguments are passed to :py:meth:`pandas.DataFrame.groupby.agg`.
 
         Returns
         -------
-        agreement : :py:class:`pandas.DataFrame`
-            A DataFrame with agreement metrics as indices and stages as columns.
+        summary : :py:class:`pandas.DataFrame`
+            A :py:class:`pandas.DataFrame` summarizing agreement scores across the entire dataset
+            with descriptive statistics.
+
+            >>> ebe = yasa.EpochByEpochEvaluation(...)
+            >>> ebe.summary()
+
+            This will give a :py:class:`pandas.DataFrame` where each row is an agreement metric and
+            each column is a descriptive statistic (e.g., mean, standard deviation).
+            To control the descriptive statistics included as columns:
+
+            >>> ebe.summary(func=["count", "mean", "sem"])
         """
-        true = self.data[self.refr_name]
-        pred = self.data[self.test_name]
-        if subject is not None:
-            true = true.loc[subject]
-            pred = pred.loc[subject]
-        scores = metrics.precision_recall_fscore_support(
-            true, pred, labels=self.labels, average=None, zero_division=0
-        )
-        agreement = pd.DataFrame(scores)
-        agreement.index = pd.Index(["precision", "recall", "fscore", "support"], name="metric")
-        agreement.columns = pd.Index(self.labels, name="stage")
-        return agreement
+        assert isinstance(by_stage, bool), "`by_stage` must be True or False"
+        agg_kwargs = {"func": ["mean", "std", "min", "median", "max"]} | kwargs
+        if by_stage:
+            summary = (self.indiv_agree_ovr
+                .groupby("stage").agg(**agg_kwargs)
+                .stack(0).rename_axis(["stage", "metric"])
+            )
+        else:
+            summary = self.indiv_agree_avg.agg(**agg_kwargs).T.rename_axis("metric")
+            ## Q: Should we include a column that calculates agreement treating all hypnograms as
+            ##    coming from one individual? Others sometimes report it, though I find it mostly
+            ##    meaningless because of possible n_epochs imbalances between subjects. I vote no.
+            # summary.insert(0, "all", self.multi_scorer_avg(self.data))
+        ## Q: Alternatively, we could remove the `by_stage` parameter and stack these into
+        ##    one merged DataFrame where the results that are *not* by-stage are included
+        ##    with an "all" stage label:
+        ## >>> summary = summary.assign(stage="all").set_index("stage", append=True).swaplevel()
+        ## >>> summary = pd.concat([summary, summary_ovr]).sort_index()
+        return summary
+
+    def get_sleep_stats(self):
+        """
+        Return a :py:class:`pandas.DataFrame` of sleep statistics for each individual derived from
+        both reference and test scorers.
 
-    def get_confusion_matrix(self, subject=None):
-        """Return a ``refr_hyp``/``test_hyp``confusion matrix.
+        .. seealso:: :py:meth:`yasa.Hypnogram.sleep_statistics`
+
+        .. seealso:: :py:class:`yasa.SleepStatsEvaluation`
+
+        Parameters
+        ----------
+        self : :py:class:`yasa.EpochByEpochEvaluation`
+            A :py:class:`yasa.EpochByEpochEvaluation` instance.
+
+        Returns
+        -------
+        sstats : :py:class:`pandas.DataFrame`
+            A :py:class:`pandas.DataFrame` with sleep statistics as columns and two rows for each
+            individual (one from reference scorer and another from test scorer).
+        """
+        # Get all sleep statistics
+        refr_sstats = pd.DataFrame({s: h.sleep_statistics() for s, h in self.refr_hyps.items()})
+        test_sstats = pd.DataFrame({s: h.sleep_statistics() for s, h in self.test_hyps.items()})
+        # Reshape and name axis
+        refr_sstats = refr_sstats.T.rename_axis("sleep_id")
+        test_sstats = test_sstats.T.rename_axis("sleep_id")
+        # Convert to MultiIndex with new scorer level
+        refr_sstats = pd.concat({self.refr_scorer: refr_sstats}, names=["scorer"])
+        test_sstats = pd.concat({self.test_scorer: test_sstats}, names=["scorer"])
+        return pd.concat([refr_sstats, test_sstats])
+
+    def get_confusion_matrix(self, sleep_id=None):
+        """
+        Return a ``refr_hyp``/``test_hyp`` confusion matrix from either a single session or all
+        sessions concatenated together.
+
+        Parameters
+        ----------
+        self : :py:class:`yasa.EpochByEpochEvaluation`
+            A :py:class:`yasa.EpochByEpochEvaluation` instance.
+        sleep_id : None or a valid sleep ID
+            If None (default), cross-tabulation is derived from the entire group dataset.
+            If a valid sleep ID, cross-tabulation is derived using only the reference and test
+            scored hypnograms from that sleep session.
 
         Returns
         -------
         matrix : :py:class:`pandas.DataFrame`
-            A confusion matrix with ``refr_hyp`` stages as indices and ``test_hyp`` stages as columns.
+            A confusion matrix with ``refr_hyp`` stages as indices and ``test_hyp`` stages as
+            columns.
         """
-        true = self.data[self.refr_name]
-        pred = self.data[self.test_name]
-        if subject is not None:
-            true = true.loc[subject]
-            pred = pred.loc[subject]
-        # Generate confusion matrix.
+        assert sleep_id is None or sleep_id in self.sleep_ids, (
+            "`sleep_id` must be None or a valid sleep ID"
+        )
+        true = self.data[self.refr_scorer]
+        pred = self.data[self.test_scorer]
+        if sleep_id is not None:
+            true = true.loc[sleep_id]
+            pred = pred.loc[sleep_id]
         matrix = pd.crosstab(true, pred, margins=True, margins_name="Total")
         # Reorder indices in sensible order and to include all stages
         index_col_labels = self.labels + ["Total"]
         matrix = matrix.reindex(index=index_col_labels, columns=index_col_labels, fill_value=0)
         return matrix.astype(int)
 
-    def plot_hypnograms(self, subject=None, legend=True, ax=None, refr_kwargs={}, test_kwargs={}):
-        """Plot the two hypnograms, where ``refr_hyp`` is overlaid on ``refr_hyp``.
+    def plot_hypnograms(self, sleep_id=None, legend=True, ax=None, refr_kwargs={}, test_kwargs={}):
+        """Plot the two hypnograms, where the reference hypnogram is overlaid on the test hypnogram.
 
         .. seealso:: :py:func:`yasa.plot_hypnogram`
 
         Parameters
         ----------
+        sleep_id : None or a valid sleep ID
+            If a valid sleep ID, plot the reference and test hypnograms from that sleep session.
         legend : bool or dict
             If True (default) or a dictionary, a legend is added. If a dictionary, all key/value
             pairs are passed as keyword arguments to the :py:func:`matplotlib.pyplot.legend` call.
         ax : :py:class:`matplotlib.axes.Axes` or None
             Axis on which to draw the plot, optional.
         refr_kwargs : dict
-            Keyword arguments passed to :py:func:`yasa.plot_hypnogram` when plotting ``refr_hyp``.
+            Keyword arguments passed to :py:func:`yasa.plot_hypnogram` when plotting the reference
+            hypnogram.
         test_kwargs : dict
-            Keyword arguments passed to :py:func:`yasa.plot_hypnogram` when plotting ``test_hyp``.
+            Keyword arguments passed to :py:func:`yasa.plot_hypnogram` when plotting the test
+            hypnogram.
 
         Returns
         -------
@@ -381,21 +536,24 @@ def plot_hypnograms(self, subject=None, legend=True, ax=None, refr_kwargs={}, te
             >>> hyp = simulate_hypnogram(seed=7)
             >>> ax = hyp.evaluate(hyp.simulate_similar()).plot_hypnograms()
         """
-        if subject is None:
-            if self.n_subjects == 1:
-                refr_hyp = self.refr_hyps[self.subjects[0]]
-                test_hyp = self.test_hyps[self.subjects[0]]
-            else:
-                raise NotImplementedError("Plotting is currently allowed for only one subject")
-        else:
-            refr_hyp = self.refr_hyps[subject]
-            test_hyp = self.test_hyps[subject]
+        assert sleep_id is None or sleep_id in self.sleep_ids, (
+            "`sleep_id` must be None or a valid sleep ID"
+        )
         assert isinstance(legend, (bool, dict)), "`legend` must be True, False, or a dictionary"
         assert isinstance(refr_kwargs, dict), "`refr_kwargs` must be a dictionary"
         assert isinstance(test_kwargs, dict), "`test_kwargs` must be a dictionary"
         assert not "ax" in refr_kwargs | test_kwargs, (
             "ax can't be supplied to `kwargs_ref` or `test_kwargs`, use the `ax` keyword instead"
         )
+        if sleep_id is None:
+            if self.n_sleeps == 1:
+                refr_hyp = self.refr_hyps[self.sleep_ids[0]]
+                test_hyp = self.test_hyps[self.sleep_ids[0]]
+            else:
+                raise NotImplementedError("Multi-session plotting is not currently supported")
+        else:
+            refr_hyp = self.refr_hyps[sleep_id]
+            test_hyp = self.test_hyps[sleep_id]
         plot_refr_kwargs = {"highlight": None, "alpha": 0.8}
         plot_test_kwargs = {"highlight": None, "alpha": 0.8, "color": "darkcyan", "ls": "dashed"}
         plot_refr_kwargs.update(refr_kwargs)
@@ -411,7 +569,7 @@ def plot_hypnograms(self, subject=None, legend=True, ax=None, refr_kwargs={}, te
                 ax.legend()
         return ax
 
-    def plot_roc(self, subject=None, palette=None, ax=None, **kwargs):
+    def plot_roc(self, sleep_id=None, palette=None, ax=None, **kwargs):
         """Plot ROC curves for each stage.
 
         Parameters
@@ -428,8 +586,10 @@ def plot_roc(self, subject=None, palette=None, ax=None, **kwargs):
         ax : :py:class:`matplotlib.axes.Axes`
             Matplotlib Axes
         """
-        # assert self.test_hyp.probas is not None
-        raise NotImplementedError("Requires probability/confidence values.")
+        assert sleep_id is None or sleep_id in self.sleep_ids, (
+            "`sleep_id` must be None or a valid sleep ID"
+        )
+        raise NotImplementedError("ROC plots will be implemented once YASA hypnograms have probas.")
 
 
 #############################################################################
@@ -439,22 +599,22 @@ def plot_roc(self, subject=None, palette=None, ax=None, **kwargs):
 
 class SleepStatsEvaluation:
     """
-    Evaluate agreement between two measurement systems (e.g., two different manual scorers or one
-    one manual scorer againt YASA's automatic staging) by comparing their summary sleep statistics
-    derived from multiple subjects or sessions.
+    Evaluate agreement between two scorers (e.g., two different manual scorers or one manual scorer
+    and YASA's automatic staging) by comparing their summary sleep statistics derived from multiple
+    subjects or sessions.
 
     Parameters
     ----------
     refr_data : :py:class:`pandas.DataFrame`
-        A :py:class:`pandas.DataFrame` with sleep statistics from the reference measurement system.
-        Rows are individual subjects and columns are individual sleep statistics.
+        A :py:class:`pandas.DataFrame` with sleep statistics from the reference scorer.
+        Rows are individual sleep sessions and columns are individual sleep statistics.
     test_data : :py:class:`pandas.DataFrame`
-        A :py:class:`pandas.DataFrame` with sleep statistics from the test measurement system.
+        A :py:class:`pandas.DataFrame` with sleep statistics from the test scorer.
         Shape, indices, and columns must be identical to ``refr_data``.
-    refr_name : str
-        Name of the reference measurement device, used for labeling.
-    test_name : str
-        Name of the test measurement device, used for labeling.
+    refr_scorer : str
+        Name of the reference scorer, used for labeling.
+    test_scorer : str
+        Name of the test scorer, used for labeling.
     alpha : float
         Alpha cutoff used for all three tests.
     kwargs_normality : dict
@@ -543,67 +703,89 @@ def __init__(
         refr_data,
         test_data,
         *,
-        refr_name="Reference",
-        test_name="Test",
+        refr_scorer="Reference",
+        test_scorer="Test",
         kwargs_normality={"alpha": 0.05},
         kwargs_regression={"alpha": 0.05},
         kwargs_homoscedasticity={"alpha": 0.05},
     ):
         assert isinstance(refr_data, pd.DataFrame), "`refr_data` must be a pandas DataFrame"
         assert isinstance(test_data, pd.DataFrame), "`test_data` must be a pandas DataFrame"
-        assert np.array_equal(refr_data.index, test_data.index), "`refr_data` and `test_data` indices must be identical"
-        assert np.array_equal(refr_data.columns, test_data.columns), "`refr_data` and `test_data` columns must be identical"
-        assert refr_data.index.name == test_data.index.name, "`refr_data` and `test_data` index names must be identical"
-        assert isinstance(refr_name, str)
-        assert isinstance(test_name, str)
-        assert refr_name != test_name
-        assert isinstance(kwargs_normality, dict)
-        assert isinstance(kwargs_regression, dict)
-        assert isinstance(kwargs_homoscedasticity, dict)
-        assert "alpha" in kwargs_normality
-        assert "alpha" in kwargs_regression
-        assert "alpha" in kwargs_homoscedasticity
-
-        # Merge dataframes, get differences, and reshape wide-to-long format
-        subj_name = "subject" if refr_data.index.name is None else refr_data.index.name
-        refr_data.index.name = subj_name
-        test_data.index.name = subj_name
-        diff_data = pd.concat({"difference": test_data.sub(refr_data)}, names=["measurement"])
-        refr_data = pd.concat({refr_name: refr_data}, names=["measurement"])
-        test_data = pd.concat({test_name: test_data}, names=["measurement"])
-        data = (pd.concat([refr_data, test_data, diff_data])
+        assert np.array_equal(refr_data.index, test_data.index), (
+            "`refr_data` and `test_data` index values must be identical"
+        )
+        assert refr_data.index.name == test_data.index.name, (
+            "`refr_data` and `test_data` index names must be identical"
+        )
+        assert np.array_equal(refr_data.columns, test_data.columns), (
+            "`refr_data` and `test_data` column values must be identical"
+        )
+        assert isinstance(refr_scorer, str), "`refr_scorer` must be a string"
+        assert isinstance(test_scorer, str), "`test_scorer` must be a string"
+        assert refr_scorer != test_scorer, "`refr_scorer` and `test_scorer` must be unique"
+        assert isinstance(kwargs_normality, dict), "`kwargs_normality` must be a dictionary"
+        assert isinstance(kwargs_regression, dict), "`kwargs_regression` must be a dictionary"
+        assert isinstance(kwargs_homoscedasticity, dict), "`kwargs_homoscedasticity` must be a dict"
+        assert "alpha" in kwargs_normality, "`kwargs_normality` must include 'alpha'"
+        assert "alpha" in kwargs_regression, "`kwargs_regression` must include 'alpha'"
+        assert "alpha" in kwargs_homoscedasticity, "`kwargs_homoscedasticity` must include 'alpha'"
+
+        # If refr_data and test_data indices are unnamed, name them
+        sleep_id_str = "sleep_id" if refr_data.index.name is None else refr_data.index.name
+        refr_data.index.name = sleep_id_str
+        test_data.index.name = sleep_id_str
+
+        # Get scorer differences
+        diff_data = test_data.sub(refr_data)
+
+        # Convert to MultiIndex with new scorer level
+        diff_data = pd.concat({"difference": diff_data}, names=["scorer"])
+        refr_data = pd.concat({refr_scorer: refr_data}, names=["scorer"])
+        test_data = pd.concat({test_scorer: test_data}, names=["scorer"])
+
+        # Merge dataframes and reshape to long format
+        data = pd.concat([refr_data, test_data, diff_data])
+        data = (data
             .melt(var_name="sstat", ignore_index=False).reset_index()
-            .pivot(columns="measurement", index=[subj_name, "sstat"], values="value")
+            .pivot(columns="scorer", index=[sleep_id_str, "sstat"], values="value")
             .reset_index().rename_axis(columns=None)
         )
 
-        # Remove sleep statistics that have no differences between measurement systems
-        ## TODO: restructure?
+        # Remove sleep statistics that have no differences between scorers
         stats_nodiff = data.groupby("sstat")["difference"].any().loc[lambda x: ~x].index.tolist()
         data = data.query(f"~sstat.isin({stats_nodiff})")
         for s in stats_nodiff:
             logger.warning(f"All {s} differences are zero, removing from evaluation.")
-            ## Q: Should this be logged as just info?
 
-        ## NORMALITY ## Test reference data for normality at each sleep statistic
-        normality = data.groupby("sstat")[refr_name].apply(pg.normality, **kwargs_normality).droplevel(-1)
+        ## NORMALITY ##
+        # Test reference data for normality at each sleep statistic
+        normality = (data
+            .groupby("sstat")[refr_scorer]
+            .apply(pg.normality, **kwargs_normality)
+            .droplevel(-1)
+        )
 
-        ## PROPORTIONAL BIAS ## Test each sleep statistic for proportional bias
-        # Subject-level residuals for each statistic are added to data.
+        ## PROPORTIONAL BIAS ##
+        # Test each sleep statistic for proportional bias
         prop_bias_results = []
         residuals_results = []
-        # proportional bias and residuals that will be used for the later  tests.
         for ss_name, ss_df in data.groupby("sstat"):
             # Regress the difference scores on the reference scores
-            model = pg.linear_regression(ss_df[refr_name], ss_df["difference"], **kwargs_regression)
+            model = pg.linear_regression(
+                ss_df[refr_scorer], ss_df["difference"], **kwargs_regression
+            )
             model.insert(0, "sstat", ss_name)
-            # Extract subject-level residuals for later homoscedasticity tests
-            resid_dict = {subj_name: ss_df[subj_name], "sstat": ss_name, "pbias_residual": model.residuals_}
+            # Extract sleep-level residuals for later homoscedasticity tests
+            resid_dict = {
+                sleep_id_str: ss_df[sleep_id_str],
+                "sstat": ss_name,
+                "pbias_residual": model.residuals_,
+            }
             resid = pd.DataFrame(resid_dict)
             prop_bias_results.append(model)
             residuals_results.append(resid)
         # Add residuals to raw dataframe, used later when testing homoscedasticity
-        data = data.merge(pd.concat(residuals_results), on=[subj_name, "sstat"])
+        data = data.merge(pd.concat(residuals_results), on=[sleep_id_str, "sstat"])
         # Handle proportional bias results
         prop_bias = pd.concat(prop_bias_results)
         # Save all the proportional bias models before removing intercept, for optional user access
@@ -614,99 +796,94 @@ def __init__(
         prop_bias["unbiased"] = prop_bias["pval"].ge(kwargs_regression["alpha"])
 
         ## Test each statistic for homoscedasticity ##
-        columns = [refr_name, "difference", "pbias_residual"]
-        homoscedasticity_func = lambda df: pg.homoscedasticity(df[columns], **kwargs_homoscedasticity)
-        homoscedasticity = data.groupby("sstat").apply(homoscedasticity_func).droplevel(-1)
+        columns = [refr_scorer, "difference", "pbias_residual"]
+        homoscedasticity_f = lambda df: pg.homoscedasticity(df[columns], **kwargs_homoscedasticity)
+        homoscedasticity = data.groupby("sstat").apply(homoscedasticity_f).droplevel(-1)
 
         # Set attributes
         self._data = data
         self._normality = normality
         self._proportional_bias = prop_bias
-        self._proportional_bias_full = prop_bias_full  # Q: Is this worth saving??
+        self._proportional_bias_full = prop_bias_full  ## Q: Is this worth saving??
         self._homoscedasticity = homoscedasticity
-        # These will not be set as properties, as they are only needed internally
-        self._refr_name = refr_name
-        self._test_name = test_name
-        self._subj_name = subj_name
-        self._n_subjects = data[subj_name].nunique()
-        # Pivot new to not include removed sstats
-        self._diff_data = data.pivot(index=self.subj_name, columns="sstat", values="difference")
+        self._refr_scorer = refr_scorer
+        self._test_scorer = test_scorer
+        self._sleep_id_str = sleep_id_str
+        self._n_sleeps = data[sleep_id_str].nunique()
+        self._diff_data = diff_data.drop(columns=stats_nodiff)
+        # self._diff_data = data.pivot(index=sleep_id_str, columns="sstat", values="difference")
 
     @property
     def data(self):
-        """
-        ``refr_data`` and ``test_data`` combined in a long-format :py:class:`pandas.DataFrame`.
-        Also includes difference scores (``test_data`` minus ``refr_data``).
+        """A :py:class:`pandas.DataFrame` containing all sleep statistics from ``refr_data`` and
+        ``test_data`` as well as their difference scores (``test_data`` minus ``refr_data``).
         """
         return self._data
 
     @property
     def diff_data(self):
         """A :py:class:`pandas.DataFrame` of ``test_data`` minus ``refr_data``."""
-        # # Pivot for subject-rows and statistic-columns
+        # # Pivot for session-rows and statistic-columns
         return self._diff_data
 
     @property
-    def refr_name(self):
-        """The name of the reference measurement."""
-        return self._refr_name
+    def refr_scorer(self):
+        """The name of the reference scorer."""
+        return self._refr_scorer
 
     @property
-    def test_name(self):
-        """The name of the test measurement."""
-        return self._test_name
+    def test_scorer(self):
+        """The name of the test scorer."""
+        return self._test_scorer
 
     @property
-    def subj_name(self):
-        """The name of the subject identifier."""
-        return self._subj_name
+    def sleep_id_str(self):
+        """The name of the unique sleep session identifier."""
+        return self._sleep_id_str
 
     @property
-    def n_subjects(self):
-        """The number of subjects."""
-        return self._n_subjects
+    def n_sleeps(self):
+        """The number of sleep sessions."""
+        return self._n_sleeps
 
     @property
     def normality(self):
-        """A :py:class:`pandas.DataFrame` of normality test results for all sleep statistics."""
+        """A :py:class:`pandas.DataFrame` of normality results for all sleep statistics."""
         return self._normality
 
     @property
     def homoscedasticity(self):
-        """A :py:class:`pandas.DataFrame` of homoscedasticity test results for all sleep statistics."""
+        """A :py:class:`pandas.DataFrame` of homoscedasticity results for all sleep statistics."""
         return self._homoscedasticity
 
     @property
     def proportional_bias(self):
-        """A :py:class:`pandas.DataFrame` of proportional bias test results for all sleep statistics."""
+        """
+        A :py:class:`pandas.DataFrame` of proportional bias results for all sleep statistics, with
+        intercept terms removed.
+        """
         return self._proportional_bias
 
     @property
     def proportional_bias_full(self):
-        """A :py:class:`pandas.DataFrame` of proportional bias test results for all sleep statistics."""
+        """A :py:class:`pandas.DataFrame` of proportional bias results for all sleep statistics."""
         return self._proportional_bias_full
 
     def __repr__(self):
         # TODO v0.8: Keep only the text between < and >
         return (
-            f"<SleepStatsEvaluation | Test measurement '{self.test_name}' evaluated against "
-            f"reference measurement '{self.refr_name}', {self.n_subjects} subjects>\n"
+            f"<SleepStatsEvaluation | Test scorer {self.test_scorer} evaluated against reference"
+            f"scorer {self.refr_scorer}, {self.n_sleeps} sleep sessions>\n"
             " - Use `.summary()` to get pass/fail values from various checks\n"
             " - Use `.plot_blandaltman()` to get a Bland-Altman-plot grid for sleep statistics\n"
             "See the online documentation for more details."
         )
 
     def __str__(self):
-        return (
-            f"<SleepStatsEvaluation | Test measurement '{self.test_name}' evaluated against "
-            f"reference measurement '{self.refr_name}', {self.n_subjects} subjects>\n"
-            " - Use `.summary()` to get pass/fail values from various checks\n"
-            " - Use `.plot_blandaltman()` to get a Bland-Altman-plot grid for sleep statistics\n"
-            "See the online documentation for more details."
-        )
+        return __repr__()
 
     def summary(self, descriptives=True):
-        """Return a summary dataframe highlighting what statistics pass checks.
+        """Return a summary dataframe highlighting whether tests passed for each sleep statistic.
 
         Parameters
         ----------
@@ -714,7 +891,7 @@ def summary(self, descriptives=True):
             A :py:class:`SleepStatsEvaluation` instance.
         descriptives : bool or dict
             If True (default) or a dictionary, also include descriptive statistics for reference and
-            test measurements. If a dictionary, all key/value pairs are passed as keyword arguments
+            test scorers. If a dictionary, all key/value pairs are passed as keyword arguments
             to the :py:meth:`pandas.DataFrame.agg` call.
 
         Returns
@@ -734,13 +911,13 @@ def summary(self, descriptives=True):
             agg_kwargs = {"func": ["mean", "std"]}
             if isinstance(descriptives, dict):
                 agg_kwargs.update(descriptives)
-            desc = self.data.drop(columns=self.subj_name).groupby("sstat").agg(**agg_kwargs)
+            desc = self.data.drop(columns=self.sleep_id_str).groupby("sstat").agg(**agg_kwargs)
             desc.columns = desc.columns.map("_".join)
             summary = summary.join(desc)
         return summary
 
     def plot_discrepancies_heatmap(self, sleep_stats=None, **kwargs):
-        """Visualize subject-level discrepancies, generally for outlier inspection.
+        """Visualize session-level discrepancies, generally for outlier inspection.
 
         Parameters
         ----------
@@ -771,7 +948,7 @@ def plot_discrepancies_heatmap(self, sleep_stats=None, **kwargs):
         return sns.heatmap(table_norm, **heatmap_kwargs)
 
     def plot_discrepancies_dotplot(self, kwargs_pairgrid={"palette": "winter"}, **kwargs):
-        """Visualize subject-level discrepancies, generally for outlier inspection.
+        """Visualize session-level discrepancies, generally for outlier inspection.
 
         Parameters
         ----------
@@ -799,9 +976,11 @@ def plot_discrepancies_dotplot(self, kwargs_pairgrid={"palette": "winter"}, **kw
         # Initialize the PairGrid
         height = 0.3 * len(self.diff_data)
         aspect = 0.6
-        pairgrid_kwargs = dict(hue=self.subj_name, height=height, aspect=aspect)
+        pairgrid_kwargs = dict(hue=self.sleep_id_str, height=height, aspect=aspect)
         pairgrid_kwargs.update(kwargs_pairgrid)
-        g = sns.PairGrid(self.diff_data.reset_index(), y_vars=[self.subj_name], **pairgrid_kwargs)
+        g = sns.PairGrid(
+            self.diff_data.reset_index(), y_vars=[self.sleep_id_str], **pairgrid_kwargs
+        )
         # Draw the dots
         g.map(sns.stripplot, orient="h", jitter=False, **stripplot_kwargs)
         # Adjust aesthetics
@@ -838,7 +1017,7 @@ def plot_blandaltman(self, kwargs_facetgrid={}, **kwargs):
         # Initialize a grid of plots with an Axes for each sleep statistic
         g = sns.FacetGrid(self.data, col="sstat", **facetgrid_kwargs)
         # Draw Bland-Altman plot on each axis
-        g.map(pg.plot_blandaltman, self.test_name, self.refr_name, **blandaltman_kwargs)
+        g.map(pg.plot_blandaltman, self.test_scorer, self.refr_scorer, **blandaltman_kwargs)
         # Adjust aesthetics
         for ax in g.axes.flat:
             # Tidy-up axis limits with symmetric y-axis and minimal ticks
@@ -846,7 +1025,7 @@ def plot_blandaltman(self, kwargs_facetgrid={}, **kwargs):
             ax.set_ylim(-bound, bound)
             ax.yaxis.set_major_locator(plt.MaxNLocator(nbins=2, integer=True, symmetric=True))
             ax.xaxis.set_major_locator(plt.MaxNLocator(nbins=1, integer=True))
-        ylabel = " - ".join((self.test_name, self.refr_name))
+        ylabel = " - ".join((self.test_scorer, self.refr_scorer))
         g.set_ylabels(ylabel)
         g.set_titles(col_template="{col_name}")
         g.tight_layout(w_pad=1, h_pad=2)
diff --git a/yasa/hypno.py b/yasa/hypno.py
index a50eeaf..37ae1ad 100644
--- a/yasa/hypno.py
+++ b/yasa/hypno.py
@@ -571,22 +571,19 @@ def copy(self):
             scorer=self.scorer,
         )
 
-    def evaluate(self, hypno_test):
+    def evaluate(self, test_hyp):
         """Evaluate agreement between two hypnograms.
 
         Typically the reference hypnogram (i.e., ``self``) is a manually-scored hypnogram and the
-        test hypnogram (i.e., ``hypno_test``) is a hypnogram from an actigraphy/wearable device or
+        test hypnogram (i.e., ``test_hyp``) is a hypnogram from an actigraphy/wearable device or
         automated scorer (e.g., :py:meth:`yasa.SleepStaging.predict`).
 
-        Comparing more than two hypnograms is not currently supported.
-
         Parameters
         ----------
         self : :py:class:`yasa.Hypnogram`
             Reference or ground-truth hypnogram.
-        hypno_test : :py:class:`yasa.Hypnogram`
+        test_hyp : :py:class:`yasa.Hypnogram`
             The test or to-be-evaluated hypnogram.
-            Must have the same ``n_stages`` as the reference hypnogram.
 
         Returns
         -------
@@ -605,11 +602,11 @@ def evaluate(self, hypno_test):
             >>> hypno_test = yasa.Hypnogram(hypno_test, scorer="Rater2")
             >>> ebe = hypno_ref.evaluate(hypno_test)
             >>> conf = ebe.get_confusion_matrix()
-            >>> perf = ebe.get_agreement()
+            >>> perf = ebe.summary()
             >>> # Plot the overlapping hypnograms
             >>> ebe.plot_hypnograms()
         """
-        return EpochByEpochEvaluation(self, hypno_test)
+        return EpochByEpochEvaluation([self], [test_hyp])
 
     def find_periods(self, threshold="5min", equal_length=False):
         """Find sequences of consecutive values exceeding a certain duration in hypnogram.

From 28f268b2f9b1208d854c6d6c38310646d24650fb Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Wed, 4 Jan 2023 00:21:49 -0600
Subject: [PATCH 35/43] alternate ovr agreement implementation

use hyp integers instead of strings for big skm agreement speed improvements (~2s)

fmt

quick comment addresses

mad

typo

pd.crosstab --> skm.confusion_matrix
---
 yasa/evaluation.py | 407 +++++++++++++++++++++++++--------------------
 yasa/hypno.py      |   2 +-
 2 files changed, 228 insertions(+), 181 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index 35273a4..636b3e8 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -38,21 +38,41 @@
 
 
 class EpochByEpochEvaluation:
-    """
-    For comparing only 2 hypnograms, use :py:meth:`yasa.Hynogram.evaluate`.
+    """Evaluate agreement between two collections of hypnograms.
 
-    Parameters
-    ----------
-    refr_hyps : :py:class:`yasa.Hypnogram`
-        A collection of reference or ground-truth hypnograms.
-    test_hyps : :py:class:`yasa.Hypnogram`
-        A collection of test or to-be-evaluated hypnograms.
+    For example, evaluate the agreement between manually-scored hypnograms and automatically-scored
+    hypnograms, or hypnograms derived from actigraphy.
 
-    Notes
-    -----
     Many steps here are modeled after guidelines proposed in Menghini et al., 2021 [Menghini2021]_.
     See https://sri-human-sleep.github.io/sleep-trackers-performance/AnalyticalPipeline_v1.0.0.html
 
+    Parameters
+    ----------
+    refr_hyps : iterable of :py:class:`yasa.Hypnogram`
+        A collection of reference (i.e., ground-truth) hypnograms.
+
+        Each :py:class:`yasa.Hypnogram` in ``refr_hyps`` must have the same
+        :py:attr:`~yasa.Hypnogram.scorer`.
+
+        If a ``dict``, key values are used to generate unique sleep session IDs. If any other
+        iterable (e.g., ``list`` or ``tuple``), then unique sleep session IDs are automatically
+        generated.
+    test_hyps : iterable of :py:class:`yasa.Hypnogram`
+        A collection of test (i.e., to-be-evaluated) hypnograms.
+
+        Each :py:class:`yasa.Hypnogram` in ``test_hyps`` must have the same
+        :py:attr:`~yasa.Hypnogram.scorer`, and this scorer must be different than the scorer of
+        hypnograms in ``refr_hyps``.
+
+        If a ``dict``, key values must match those of ``refr_hyps``.
+
+    .. important::
+        It is assumed that the order of hypnograms is the same in ``refr_hyps`` and ``test_hyps``.
+        For example, the third hypnogram in ``refr_hyps`` and ``test_hyps`` come from the same sleep
+        session, and only differ in that they have different scorers.
+
+    .. seealso:: For comparing just two hypnograms, use :py:meth:`yasa.Hypnogram.evaluate`.
+
     References
     ----------
     .. [Menghini2021] Menghini, L., Cellini, N., Goldstone, A., Baker, F. C., & de Zambotti, M.
@@ -130,59 +150,65 @@ class EpochByEpochEvaluation:
     UNS        0    0    0   0    0    0    0      0
     Total    277  128  333  98  124    0    0    960
     """
+
     def __init__(self, refr_hyps, test_hyps):
         from yasa.hypno import Hypnogram  # Avoiding circular import
 
         assert hasattr(refr_hyps, "__iter__"), "`refr_hyps` must be a an iterable"
         assert hasattr(test_hyps, "__iter__"), "`test_hyps` must be a an iterable"
         assert type(refr_hyps) == type(test_hyps), "`refr_hyps` and `test_hyps` must be same type"
-        assert len(refr_hyps) == len(test_hyps), (
-            "`refr_hyps` and `test_hyps` must have the same number of hypnograms"
-        )
+        assert len(refr_hyps) == len(
+            test_hyps
+        ), "`refr_hyps` and `test_hyps` must have the same number of hypnograms"
 
         if isinstance(refr_hyps, dict):
             # If user provides dictionaries, split into sleep IDs and hypnograms
-            assert refr_hyps.keys() == test_hyps.keys(), (
-                "hypnograms in `refr_hyps` and `test_hyps` must have identical sleep IDs"
-            )
+            assert (
+                refr_hyps.keys() == test_hyps.keys()
+            ), "hypnograms in `refr_hyps` and `test_hyps` must have identical sleep IDs"
             sleep_ids, refr_hyps = zip(*refr_hyps.items())
             test_hyps = tuple(test_hyps.values())
         else:
             # Create hypnogram_ids
             sleep_ids = tuple(range(1, 1 + len(refr_hyps)))
 
-        assert all(isinstance(hyp, Hypnogram) for hyp in refr_hyps + test_hyps), (
-            "`refr_hyps` and `test_hyps` must only include YASA hypnograms"
-        )
-        assert all(h.scorer is not None for h in refr_hyps + test_hyps), (
-            "all hypnograms must have a scorer name"
-        )
+        assert all(
+            isinstance(hyp, Hypnogram) for hyp in refr_hyps + test_hyps
+        ), "`refr_hyps` and `test_hyps` must only include YASA hypnograms"
+        assert all(
+            h.scorer is not None for h in refr_hyps + test_hyps
+        ), "all hypnograms must have a scorer name"
         for h1, h2 in zip((refr_hyps + test_hyps)[:-1], (refr_hyps + test_hyps)[1:]):
+            assert h1.freq == h2.freq, "all hypnograms must have the same freq"
             assert h1.labels == h2.labels, "all hypnograms must have the same labels"
             assert h1.mapping == h2.mapping, "all hypnograms must have the same mapping"
             assert h1.n_stages == h2.n_stages, "all hypnograms must have the same n_stages"
-        assert all(h1.scorer == h2.scorer for h1, h2 in zip(refr_hyps[:-1], refr_hyps[1:])), (
-            "all `refr_hyps` must have the same scorer"
-        )
-        assert all(h1.scorer == h2.scorer for h1, h2 in zip(test_hyps[:-1], test_hyps[1:])), (
-            "all `test_hyps` must have the same scorer"
-        )
-        assert all(h1.scorer != h2.scorer for h1, h2 in zip(refr_hyps, test_hyps)), (
-            "each `refr_hyps` and `test_hyps` pair must have unique scorers"
-        )
-        assert all(h1.n_epochs == h2.n_epochs for h1, h2 in zip(refr_hyps, test_hyps)), (
-            "each `refr_hyps` and `test_hyps` pair must have the same n_epochs"
-        )
+        assert all(
+            h1.scorer == h2.scorer for h1, h2 in zip(refr_hyps[:-1], refr_hyps[1:])
+        ), "all `refr_hyps` must have the same scorer"
+        assert all(
+            h1.scorer == h2.scorer for h1, h2 in zip(test_hyps[:-1], test_hyps[1:])
+        ), "all `test_hyps` must have the same scorer"
+        assert all(
+            h1.scorer != h2.scorer for h1, h2 in zip(refr_hyps, test_hyps)
+        ), "each `refr_hyps` and `test_hyps` pair must have unique scorers"
+        assert all(
+            h1.n_epochs == h2.n_epochs for h1, h2 in zip(refr_hyps, test_hyps)
+        ), "each `refr_hyps` and `test_hyps` pair must have the same n_epochs"
         ## Q: Could use set() for those above.
         ##    Or set scorer as the first available and check all equal.
 
         # Convert to dictionaries with sleep_ids and hypnograms
-        refr_hyps = { s: h for s, h in zip(sleep_ids, refr_hyps) }
-        test_hyps = { s: h for s, h in zip(sleep_ids, test_hyps) }
+        refr_hyps = {s: h for s, h in zip(sleep_ids, refr_hyps)}
+        test_hyps = {s: h for s, h in zip(sleep_ids, test_hyps)}
 
-        # Merge all hypnograms into a single multiindexed dataframe
-        refr = pd.concat(pd.concat({s: h.hypno}, names=["sleep_id"]) for s, h in refr_hyps.items())
-        test = pd.concat(pd.concat({s: h.hypno}, names=["sleep_id"]) for s, h in test_hyps.items())
+        # Merge all hypnograms into a single MultiIndexed dataframe
+        refr = pd.concat(
+            pd.concat({s: h.as_int()}, names=["sleep_id"]) for s, h in refr_hyps.items()
+        )
+        test = pd.concat(
+            pd.concat({s: h.as_int()}, names=["sleep_id"]) for s, h in test_hyps.items()
+        )
         data = pd.concat([refr, test], axis=1)
 
         ########################################################################
@@ -190,42 +216,50 @@ def __init__(self, refr_hyps, test_hyps):
         ########################################################################
 
         # Get individual-level averaged/weighted agreement scores
-        indiv_agree_avg = data.groupby(level=0).apply(self.multi_scorer_avg).apply(pd.Series)
+        indiv_agree_avg = data.groupby(level=0).apply(self.multi_scorer).apply(pd.Series)
         ## Q: Check speed against pd.DataFrame({s: multscore(hyps[s], hyps[s]) for s in subjects})
 
         # Get individual-level one-vs-rest/un-weighted agreement scores
-        # Only include stages that appear in the data
-        # labels = data[refr_scorer].cat.remove_unused_categories().cat.categories
-        labels = [l for l in refr_hyps[sleep_ids[0]].hypno.cat.categories if l in data.values]
-        ############ OPTION 1 (uses staticmethod, slower by 500ms)
-        indiv_agree_ovr = (data
-            # Get multiple metrics for each individual sleep
-            .groupby(level=0).apply(self.multi_scorer_ovr, labels=labels)
-            # Unpack metrics results and reshape
-            .apply(pd.Series).stack().apply(pd.Series)
-            # Convert stages to string labels
-            .rename_axis(columns="stage").rename(columns={i: l for i, l in enumerate(labels)})
+        # Labels ensures the order of returned scores is known
+        # It also can be used to remove unused labels, but that will be taken care of later anyways
+        # skm_labels = [l for l in refr_hyps[sleep_ids[0]].hypno.cat.categories if l in data.values]
+        # skm will return an array of results, so mapping must be linear without skips
+        ## Q: Another option is to get Series.cat.codes for ints and use cat.categories for mapping
+        skm_labels = np.unique(data).tolist()
+        skm_mapping = {i: l for i, l in enumerate(skm_labels)}  # skm integers to YASA integers
+        mapping_int = refr_hyps[sleep_ids[0]].mapping_int.copy()  # YASA integers to YASA strings
+        # labels = refr_hyps[sleep_ids[0]].labels.copy()  # To preserve YASA ordering
+        # labels = [v for k, v in mapping_int.items() if k in skm_labels]  # To preserve YASA ordering
+        prfs_wrapper = lambda df: skm.precision_recall_fscore_support(
+            *df.values.T, beta=1, labels=skm_labels, average=None, zero_division=0
+        )
+        indiv_agree_ovr = (
+            data
+            # Get precision, recall, f1, and support for each individual sleep session
+            .groupby(level=0)
+            .apply(prfs_wrapper)
+            # Unpack arrays
+            .explode()
+            .apply(pd.Series)
+            # Add metric labels and prepend to index, creating MultiIndex
+            .assign(metric=["precision", "recall", "fbeta", "support"] * len(refr_hyps))
+            .set_index("metric", append=True)
+            # Convert stage column names to string labels
+            .rename_axis(columns="stage")
+            .rename(columns=skm_mapping)
+            .rename(columns=mapping_int)
+            # Remove all-zero rows (i.e., stages that were not present in the hypnogram)
+            .pipe(lambda df: df.loc[:, df.any()])
             # Reshape so metrics are columns
-            .stack().unstack(level=1)
-            # Swap MultiIndex levels and sort so stages drive the view
-            .swaplevel().sort_index(level="stage", key=lambda x: x.map(lambda y: labels.index(y)))
+            .stack()
+            .unstack("metric")
+            .rename_axis(columns=None)
+            # Swap MultiIndex levels and sort so stages are in standard YASA order
+            .swaplevel()
+            .sort_index(
+                level="stage", key=lambda x: x.map(lambda y: list(mapping_int.values()).index(y))
+            )
         )
-        # ############ OPTION 2 (does NOT use staticmethod, faster by 500ms)
-        # prfs_func = lambda df: skm.precision_recall_fscore_support(
-        #     *df.values.T, labels=labels, average=None, zero_division=0
-        # )
-        # indiv_agree_ovr = (data
-        #     .groupby(level=0).apply(prfs_func)
-        #     .explode().apply(pd.Series)
-        #     .assign(metric=["precision", "recall", "f1", "support"] * len(refr_hyps))
-        #     .set_index("metric", append=True)
-        #     .rename_axis(columns="stage").rename(columns={i: l for i, l in enumerate(labels)})
-        #     .stack().unstack("metric").rename_axis(columns=None)
-        # )
-        ## Q: Currently both options will leave some all-zero rows, for when a stage is present
-        ##    in some subjects but not others. Prefer to remove?
-        # agr = agr.loc[agr.any(axis=1)]  # or .pipe
-        # And then could drop the label restriction, just passing all labels to preserve order
 
         # Set attributes
         self._data = data
@@ -235,7 +269,9 @@ def __init__(self, refr_hyps, test_hyps):
         self._test_hyps = test_hyps
         self._refr_scorer = refr_hyps[sleep_ids[0]].scorer
         self._test_scorer = test_hyps[sleep_ids[0]].scorer
-        self._labels = refr_hyps[sleep_ids[0]].labels
+        self._skm_labels = skm_labels
+        self._skm_mapping = skm_mapping
+        self._mapping_int = mapping_int
         self._indiv_agree_avg = indiv_agree_avg
         self._indiv_agree_ovr = indiv_agree_ovr
         ## Q: Merge these to one individual agreement dataframe?
@@ -245,7 +281,7 @@ def __repr__(self):
         s = "s" if self._n_sleeps > 1 else ""
         return (
             f"<EpochByEpochEvaluation | Test hypnogram{s} scored by {self.test_scorer} evaluated "
-            f"against reference hypnogram{s} scored by {self.refr_scorer}, {self._n_sleeps} sleep"
+            f"against reference hypnogram{s} scored by {self.refr_scorer}, {self._n_sleeps} sleep "
             f"session{s}>\n"
             " - Use `.get_agreement()` to get agreement measures as a pandas.Series\n"
             " - Use `.plot_hypnograms()` to plot the two hypnograms overlaid\n"
@@ -290,11 +326,6 @@ def test_scorer(self):
         """The name of the test scorer."""
         return self._test_scorer
 
-    @property
-    def labels(self):
-        """All available sleep stage labels."""
-        return self._labels
-
     @property
     def indiv_agree_avg(self):
         """
@@ -316,7 +347,7 @@ def indiv_agree_ovr(self):
         return self._indiv_agree_ovr
 
     @staticmethod
-    def multi_scorer_avg(df):
+    def multi_scorer(df, weights=None):
         """Compute multiple agreement scores from a 2-column dataframe.
 
         This function offers convenience when calculating multiple agreement scores using
@@ -330,12 +361,23 @@ def multi_scorer_avg(df):
             A :py:class:`pandas.DataFrame` with exactly 2 columns and length of *n_samples*.
             The first column contains true values and second column contains predicted values.
 
+        weights : None or column label
+            Sample weights passed to underlying :py:mod:`sklearn.metrics` functions when possible.
+            Must be ``None`` (default) or the label of a column in ``df``. Custom weights are not
+            yet supported, so passing a column label raises :py:class:`NotImplementedError`.
+
         Returns
         -------
         scores : dict
             A dictionary with scorer names (``str``) as keys and scores (``float``) as values.
         """
-        true, pred = zip(*df.values)  # Same as (df["col1"], df["col2"]) but teensy bit faster
+        assert isinstance(weights, type(None)) or weights in df, "`weights` must be None or a column in `df`"
+        if weights is not None:
+            raise NotImplementedError("Custom `weights` not currently supported")
+        t, p = zip(*df.values)  # Same as (df["col1"], df["col2"]) but teensy bit faster
+        # t = df["col1"].to_numpy()
+        # p = df["col2"].to_numpy()
+        w = df["col3"].to_numpy() if weights is not None else weights
         ## Q: The dictionary below be compiled more concisely if we were comfortable accessing
         ##    "private" attributes. I understand that's a no-no but I'm not exactly sure why.
         ##     For example:
@@ -345,47 +387,17 @@ def multi_scorer_avg(df):
         ##     Keywords could be applied as needed by checking f.__kwdefaults__
         ##     This would offer an easy way for users to add their own scorers with an arg as well.
         return {
-            "accuracy": skm.accuracy_score(true, pred),
-            "kappa": skm.cohen_kappa_score(true, pred),
-            "jaccard_micro": skm.jaccard_score(true, pred, average="micro"),
-            "jaccard_macro": skm.jaccard_score(true, pred, average="macro"),
-            "jaccard_weighted": skm.jaccard_score(true, pred, average="weighted"),
-            "precision_micro": skm.precision_score(true, pred, average="micro", zero_division=0),
-            "precision_macro": skm.precision_score(true, pred, average="macro", zero_division=0),
-            "precision_weighted": skm.precision_score(
-                true, pred, average="weighted", zero_division=0
+            "accuracy": skm.accuracy_score(t, p, normalize=True, sample_weight=w),
+            "balanced_acc": skm.balanced_accuracy_score(t, p, adjusted=False, sample_weight=w),
+            "kappa": skm.cohen_kappa_score(t, p, labels=None, weights=None, sample_weight=w),
+            "mcc": skm.matthews_corrcoef(t, p, sample_weight=w),
+            "precision": skm.precision_score(
+                t, p, average="weighted", sample_weight=w, zero_division=0
+            ),
+            "recall": skm.recall_score(t, p, average="weighted", sample_weight=w, zero_division=0),
+            "fbeta": skm.fbeta_score(
+                t, p, beta=1, average="weighted", sample_weight=w, zero_division=0
             ),
-            "recall_micro": skm.recall_score(true, pred, average="micro", zero_division=0),
-            "recall_macro": skm.recall_score(true, pred, average="macro", zero_division=0),
-            "recall_weighted": skm.recall_score(true, pred, average="weighted", zero_division=0),
-            "f1_micro": skm.f1_score(true, pred, average="micro", zero_division=0),
-            "f1_macro": skm.f1_score(true, pred, average="macro", zero_division=0),
-            "f1_weighted": skm.f1_score(true, pred, average="weighted", zero_division=0),
-        }
-
-    @staticmethod
-    def multi_scorer_ovr(df, labels):
-        """Compute multiple one-vs-rest agreement scores from a 2-column dataframe.
-
-        Parameters
-        ----------
-        df : :py:class:`pandas.DataFrame`
-            A :py:class:`pandas.DataFrame` with exactly 2 columns and length of *n_samples*.
-            The first column contains true values and second column contains predicted values.
-        labels : array-like
-            The labels to include in scoring and control the order of returned scores.
-
-        Returns
-        -------
-        scores : dict
-            A dictionary with scorer names (``str``) as keys and scores (``np.ndarray``) as values.
-        """
-        true, pred = zip(*df.values)
-        return {
-            "precision": skm.precision_score(true, pred, labels=labels, average=None, zero_division=0),
-            "recall": skm.recall_score(true, pred, labels=labels, average=None, zero_division=0),
-            "f1": skm.f1_score(true, pred, labels=labels, average=None, zero_division=0),
-            "support": pd.Series(true).value_counts().reindex(labels, fill_value=0).to_numpy(),
         }
 
     def summary(self, by_stage=False, **kwargs):
@@ -418,18 +430,22 @@ def summary(self, by_stage=False, **kwargs):
             >>> ebe.summary(func=["count", "mean", "sem"])
         """
         assert isinstance(by_stage, bool), "`by_stage` must be True or False"
-        agg_kwargs = {"func": ["mean", "std", "min", "median", "max"]} | kwargs
+        mad = lambda df: (df - df.mean()).abs().mean()
+        mad.__name__ = "mad"  # Pandas uses this to name the aggregated column
+        agg_kwargs = {"func": [mad, "mean", "std", "min", "median", "max"]} | kwargs
         if by_stage:
-            summary = (self.indiv_agree_ovr
-                .groupby("stage").agg(**agg_kwargs)
-                .stack(0).rename_axis(["stage", "metric"])
+            summary = (
+                self.indiv_agree_ovr.groupby("stage")
+                .agg(**agg_kwargs)
+                .stack(0)
+                .rename_axis(["stage", "metric"])
             )
         else:
             summary = self.indiv_agree_avg.agg(**agg_kwargs).T.rename_axis("metric")
             ## Q: Should we include a column that calculates agreement treating all hypnograms as
             ##    coming from one individual? Others sometimes report it, though I find it mostly
             ##    meaningless because of possible n_epochs imbalances between subjects. I vote no.
-            # summary.insert(0, "all", self.multi_scorer_avg(self.data))
+            # summary.insert(0, "all", self.multi_scorer(self.data))
         ## Q: Alternatively, we could remove the `by_stage` parameter and stack these into
         ##    one merged DataFrame where the results that are *not* by-stage are included
         ##    with an "all" stage label:
@@ -468,7 +484,7 @@ def get_sleep_stats(self):
         test_sstats = pd.concat({self.test_scorer: test_sstats}, names=["scorer"])
         return pd.concat([refr_sstats, test_sstats])
 
-    def get_confusion_matrix(self, sleep_id=None):
+    def get_confusion_matrix(self, sleep_id=None, agg_func=None, **kwargs):
         """
         Return a ``refr_hyp``/``test_hyp``confusion matrix from either a single session or all
         sessions concatenated together.
@@ -481,26 +497,59 @@ def get_confusion_matrix(self, sleep_id=None):
             If None (default), cross-tabulation is derived from the entire group dataset.
             If a valid sleep ID, cross-tabulation is derived using only the reference and test
             scored hypnograms from that sleep session.
+        ## Q: This keyword (agg_func) is too complicated, but I wanted your opinion on the best
+        ##    approach. And I wanted you to see the returned value when agg_func=None because it
+        ##    might be best to generate during __init__ to set and access as an attribute.
+        agg_func : str, list, or None
+            If None (default), group results returns a :py:class:`~pandas.DataFrame` complete with
+            all individual sleep session results. If not None, group results returns a
+            :py:class:`~pandas.DataFrame` aggregated across individual sleep sessions where
+            ``agg_func`` is passed as ``func`` parameter in :py:meth:`pandas.DataFrame.groupby.agg`.
+            Ignored if ``sleep_id`` is not None.
+        **kwargs : key, value pairs
+            Additional keyword arguments are passed to :py:func:`sklearn.metrics.confusion_matrix`.
 
         Returns
         -------
-        matrix : :py:class:`pandas.DataFrame`
-            A confusion matrix with ``refr_hyp`` stages as indices and ``test_hyp`` stages as
-            columns.
+        conf_matr : :py:class:`pandas.DataFrame`
+            A confusion matrix with stages from the reference scorer as indices and stages from the
+            test scorer as columns.
+
+        Examples
+        --------
+        >>> ebe = yasa.EpochByEpochEvaluation(...)
+        >>> ebe.get_confusion_matrix()  # Return results from all individual subjects
+        >>> ebe.get_confusion_matrix(agg_func=["mean", "std"])  # Return summary results
+        >>> ebe.get_confusion_matrix(sleep_id="sub-002")  # Return results from one subject
         """
-        assert sleep_id is None or sleep_id in self.sleep_ids, (
-            "`sleep_id` must be None or a valid sleep ID"
+        assert (
+            sleep_id is None or sleep_id in self.sleep_ids
+        ), "`sleep_id` must be None or a valid sleep ID"
+        kwargs = {"labels": self._skm_labels} | kwargs
+        # Get confusion matrix for each individual sleep session
+        ## Q: Should this be done during __init__ and accessible via attribute?
+        conf_mats = (self.data
+            # Get confusion matrix for each individual sleep session
+            .groupby(level=0).apply(lambda df: skm.confusion_matrix(*df.values.T, **kwargs))
+            # Expand results matrix out from single cell
+            .explode().apply(pd.Series)
+            # Convert to MultiIndex with reference scorer as new level
+            .assign(**{self.refr_scorer: self._skm_labels * self.n_sleeps})
+            .set_index(self.refr_scorer, append=True).rename_axis(columns=self.test_scorer)
+            # Convert sleep stage columns and indices to strings
+            .rename(columns=self._skm_mapping).rename(columns=self._mapping_int)
+            .rename(index=self._skm_mapping, level=self.refr_scorer)
+            .rename(index=self._mapping_int, level=self.refr_scorer)
         )
-        true = self.data[self.refr_scorer]
-        pred = self.data[self.test_scorer]
-        if sleep_id is not None:
-            true = true.loc[sleep_id]
-            pred = pred.loc[sleep_id]
-        matrix = pd.crosstab(true, pred, margins=True, margins_name="Total")
-        # Reorder indices in sensible order and to include all stages
-        index_col_labels = self.labels + ["Total"]
-        matrix = matrix.reindex(index=index_col_labels, columns=index_col_labels, fill_value=0)
-        return matrix.astype(int)
+        if sleep_id is None:
+            if agg_func is None:
+                mat = conf_mats
+            else:
+                mat = conf_mats.groupby(self.refr_scorer).agg(agg_func)
+                mat.columns = mat.columns.map("_".join).set_names(self.test_scorer)
+        else:
+            mat = conf_mats.loc[sleep_id]
+        return mat
 
     def plot_hypnograms(self, sleep_id=None, legend=True, ax=None, refr_kwargs={}, test_kwargs={}):
         """Plot the two hypnograms, where the reference hypnogram is overlaid on the test hypnogram.
@@ -536,15 +585,15 @@ def plot_hypnograms(self, sleep_id=None, legend=True, ax=None, refr_kwargs={}, t
             >>> hyp = simulate_hypnogram(seed=7)
             >>> ax = hyp.evaluate(hyp.simulate_similar()).plot_hypnograms()
         """
-        assert sleep_id is None or sleep_id in self.sleep_ids, (
-            "`sleep_id` must be None or a valid sleep ID"
-        )
+        assert (
+            sleep_id is None or sleep_id in self.sleep_ids
+        ), "`sleep_id` must be None or a valid sleep ID"
         assert isinstance(legend, (bool, dict)), "`legend` must be True, False, or a dictionary"
         assert isinstance(refr_kwargs, dict), "`refr_kwargs` must be a dictionary"
         assert isinstance(test_kwargs, dict), "`test_kwargs` must be a dictionary"
-        assert not "ax" in refr_kwargs | test_kwargs, (
-            "ax can't be supplied to `kwargs_ref` or `test_kwargs`, use the `ax` keyword instead"
-        )
+        assert (
+            not "ax" in refr_kwargs | test_kwargs
+        ), "ax can't be supplied to `kwargs_ref` or `test_kwargs`, use the `ax` keyword instead"
         if sleep_id is None:
             if self.n_sleeps == 1:
                 refr_hyp = self.refr_hyps[self.sleep_ids[0]]
@@ -586,9 +635,9 @@ def plot_roc(self, sleep_id=None, palette=None, ax=None, **kwargs):
         ax : :py:class:`matplotlib.axes.Axes`
             Matplotlib Axes
         """
-        assert sleep_id is None or sleep_id in self.sleep_ids, (
-            "`sleep_id` must be None or a valid sleep ID"
-        )
+        assert (
+            sleep_id is None or sleep_id in self.sleep_ids
+        ), "`sleep_id` must be None or a valid sleep ID"
         raise NotImplementedError("ROC plots will be implemented once YASA hypnograms have probas.")
 
 
@@ -698,6 +747,7 @@ class SleepStatsEvaluation:
 
         >>> sse.plot_blandaltman()
     """
+
     def __init__(
         self,
         refr_data,
@@ -711,15 +761,15 @@ def __init__(
     ):
         assert isinstance(refr_data, pd.DataFrame), "`refr_data` must be a pandas DataFrame"
         assert isinstance(test_data, pd.DataFrame), "`test_data` must be a pandas DataFrame"
-        assert np.array_equal(refr_data.index, test_data.index), (
-            "`refr_data` and `test_data` index values must be identical"
-        )
-        assert refr_data.index.name == test_data.index.name, (
-            "`refr_data` and `test_data` index names must be identical"
-        )
-        assert np.array_equal(refr_data.columns, test_data.columns), (
-            "`refr_data` and `test_data` column values must be identical"
-        )
+        assert np.array_equal(
+            refr_data.index, test_data.index
+        ), "`refr_data` and `test_data` index values must be identical"
+        assert (
+            refr_data.index.name == test_data.index.name
+        ), "`refr_data` and `test_data` index names must be identical"
+        assert np.array_equal(
+            refr_data.columns, test_data.columns
+        ), "`refr_data` and `test_data` column values must be identical"
         assert isinstance(refr_scorer, str), "`refr_scorer` must be a string"
         assert isinstance(test_scorer, str), "`test_scorer` must be a string"
         assert refr_scorer != test_scorer, "`refr_scorer` and `test_scorer` must be unique"
@@ -745,10 +795,12 @@ def __init__(
 
         # Merge dataframes and reshape to long format
         data = pd.concat([refr_data, test_data, diff_data])
-        data = (data
-            .melt(var_name="sstat", ignore_index=False).reset_index()
+        data = (
+            data.melt(var_name="sstat", ignore_index=False)
+            .reset_index()
             .pivot(columns="scorer", index=[sleep_id_str, "sstat"], values="value")
-            .reset_index().rename_axis(columns=None)
+            .reset_index()
+            .rename_axis(columns=None)
         )
 
         # Remove sleep statistics that have no differences between scorers
@@ -759,10 +811,8 @@ def __init__(
 
         ## NORMALITY ##
         # Test reference data for normality at each sleep statistic
-        normality = (data
-            .groupby("sstat")[refr_scorer]
-            .apply(pg.normality, **kwargs_normality)
-            .droplevel(-1)
+        normality = (
+            data.groupby("sstat")[refr_scorer].apply(pg.normality, **kwargs_normality).droplevel(-1)
         )
 
         ## PROPORTIONAL BIAS ##
@@ -872,7 +922,7 @@ def proportional_bias_full(self):
     def __repr__(self):
         # TODO v0.8: Keep only the text between < and >
         return (
-            f"<SleepStatsEvaluation | Test scorer {self.test_scorer} evaluated against reference"
+            f"<SleepStatsEvaluation | Test scorer {self.test_scorer} evaluated against reference "
             f"scorer {self.refr_scorer}, {self.n_sleeps} sleep sessions>\n"
             " - Use `.summary()` to get pass/fail values from various checks\n"
             " - Use `.plot_blandaltman()` to get a Bland-Altman-plot grid for sleep statistics\n"
@@ -882,17 +932,17 @@ def __repr__(self):
     def __str__(self):
         return __repr__()
 
-    def summary(self, descriptives=True):
+    def summary(self, **kwargs):
         """Return a summary dataframe highlighting whether tests passed for each sleep statistic.
 
         Parameters
         ----------
         self : :py:class:`SleepStatsEvaluation`
             A :py:class:`SleepStatsEvaluation` instance.
-        descriptives : bool or dict
-            If True (default) or a dictionary, also include descriptive statistics for reference and
-            test scorers. If a dictionary, all key/value pairs are passed as keyword arguments
-            to the :py:meth:`pandas.DataFrame.agg` call.
+        **kwargs : key, value pairs
+            Additional keyword arguments are passed to :py:meth:`pandas.DataFrame.groupby.agg`.
+
+            >>> sse.summary(func=["mean", "sem", "min", "max"])
 
         Returns
         -------
@@ -900,21 +950,18 @@ def summary(self, descriptives=True):
             A :py:class:`pandas.DataFrame` with boolean values indicating the pass/fail status for
             normality, proportional bias, and homoscedasticity tests (for each sleep statistic).
         """
-        assert isinstance(descriptives, (bool, dict)), "`descriptives` must be True, False, or dict"
         series_list = [
             self.normality["normal"],
             self.proportional_bias["unbiased"],
             self.homoscedasticity["equal_var"].rename("homoscedastic"),
         ]
         summary = pd.concat(series_list, axis=1)
-        if descriptives:
-            agg_kwargs = {"func": ["mean", "std"]}
-            if isinstance(descriptives, dict):
-                agg_kwargs.update(descriptives)
-            desc = self.data.drop(columns=self.sleep_id_str).groupby("sstat").agg(**agg_kwargs)
-            desc.columns = desc.columns.map("_".join)
-            summary = summary.join(desc)
-        return summary
+        mad = lambda df: (df - df.mean()).abs().mean()
+        mad.__name__ = "mad"  # Pandas uses this to name the aggregated column
+        agg_kwargs = {"func": [mad, "mean", "std"]} | kwargs
+        desc = self.data.drop(columns=self.sleep_id_str).groupby("sstat").agg(**agg_kwargs)
+        desc.columns = desc.columns.map("_".join)
+        return summary.join(desc)
 
     def plot_discrepancies_heatmap(self, sleep_stats=None, **kwargs):
         """Visualize session-level discrepancies, generally for outlier inspection.
diff --git a/yasa/hypno.py b/yasa/hypno.py
index 37ae1ad..5d28afa 100644
--- a/yasa/hypno.py
+++ b/yasa/hypno.py
@@ -572,7 +572,7 @@ def copy(self):
         )
 
     def evaluate(self, test_hyp):
-        """Evaluate agreement between two hypnograms.
+        """Evaluate agreement between two hypnograms of the same sleep session.
 
         Typically the reference hypnogram (i.e., ``self``) is a manually-scored hypnogram and the
         test hypnogram (i.e., ``test_hyp``) is a hypnogram from an actigraphy/wearable device or

From 510b4a66381bf348cb60a8c61ae62ba2365bcdc7 Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Sun, 8 Jan 2023 05:24:15 -0600
Subject: [PATCH 36/43] 3 group-hypnogram plotting options, need feedback

minor

normality bug

black fmt makes my pandas chains lonnggggg

diff_data --> discrepancies

trailing not leading _kwargs
---
 yasa/evaluation.py | 218 +++++++++++++++++++++++++++++++++------------
 1 file changed, 160 insertions(+), 58 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index 636b3e8..5a52105 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -17,10 +17,12 @@
 import pandas as pd
 import pingouin as pg
 import sklearn.metrics as skm
+from scipy.stats import zscore
 
 import seaborn as sns
 import matplotlib.pyplot as plt
 
+from yasa.io import set_log_level
 from yasa.plotting import plot_hypnogram
 
 
@@ -46,6 +48,12 @@ class EpochByEpochEvaluation:
     Many steps here are modeled after guidelines proposed in Menghini et al., 2021 [Menghini2021]_.
     See https://sri-human-sleep.github.io/sleep-trackers-performance/AnalyticalPipeline_v1.0.0.html
 
+    .. warning::
+        :py:class:`yasa.evaluation.EpochByEpochEvaluation` is a new YASA feature and the API is
+        subject to future change.
+
+    .. versionadded:: 0.7.0
+
     Parameters
     ----------
     refr_hyps : iterable of :py:class:`yasa.Hypnogram`
@@ -275,6 +283,7 @@ def __init__(self, refr_hyps, test_hyps):
         self._indiv_agree_avg = indiv_agree_avg
         self._indiv_agree_ovr = indiv_agree_ovr
         ## Q: Merge these to one individual agreement dataframe?
+        ##    Setting average="binary" to fill extra column in over dataframe
 
     def __repr__(self):
         # TODO v0.8: Keep only the text between < and >
@@ -371,7 +380,9 @@ def multi_scorer(df, weights=None):
         scores : dict
             A dictionary with scorer names (``str``) as keys and scores (``float``) as values.
         """
-        assert isinstance(weights, type(None)) or weights in df, "`weights` must be None or a column in `df`"
+        assert (
+            isinstance(weights, type(None)) or weights in df
+        ), "`weights` must be None or a column in `df`"
         if weights is not None:
             raise NotImplementedError("Custom `weights` not currently supported")
         t, p = zip(*df.values)  # Same as (df["col1"], df["col2"]) but teensy bit faster
@@ -528,16 +539,25 @@ def get_confusion_matrix(self, sleep_id=None, agg_func=None, **kwargs):
         kwargs = {"labels": self._skm_labels} | kwargs
         # Get confusion matrix for each individual sleep session
         ## Q: Should this be done during __init__ and accessible via attribute?
-        conf_mats = (self.data
+        ##    I'm a little unsure about what should happen in init and be accessed as a property
+        ##    vs what should require a function. Nothing takes so long that it feels like it
+        ##    couldn't just happen during __init__, leaving mostly just plotting functions as
+        ##    methods. But if that's the case, what's the benefit of being a class? Confused!!
+        conf_mats = (
+            self.data
             # Get confusion matrix for each individual sleep session
-            .groupby(level=0).apply(lambda df: skm.confusion_matrix(*df.values.T, **kwargs))
+            .groupby(level=0)
+            .apply(lambda df: skm.confusion_matrix(*df.values.T, **kwargs))
             # Expand results matrix out from single cell
-            .explode().apply(pd.Series)
+            .explode()
+            .apply(pd.Series)
             # Convert to MultiIndex with reference scorer as new level
             .assign(**{self.refr_scorer: self._skm_labels * self.n_sleeps})
-            .set_index(self.refr_scorer, append=True).rename_axis(columns=self.test_scorer)
+            .set_index(self.refr_scorer, append=True)
+            .rename_axis(columns=self.test_scorer)
             # Convert sleep stage columns and indices to strings
-            .rename(columns=self._skm_mapping).rename(columns=self._mapping_int)
+            .rename(columns=self._skm_mapping)
+            .rename(columns=self._mapping_int)
             .rename(index=self._skm_mapping, level=self.refr_scorer)
             .rename(index=self._mapping_int, level=self.refr_scorer)
         )
@@ -593,13 +613,15 @@ def plot_hypnograms(self, sleep_id=None, legend=True, ax=None, refr_kwargs={}, t
         assert isinstance(test_kwargs, dict), "`test_kwargs` must be a dictionary"
         assert (
             not "ax" in refr_kwargs | test_kwargs
-        ), "ax can't be supplied to `kwargs_ref` or `test_kwargs`, use the `ax` keyword instead"
+        ), "'ax' can't be supplied to `refr_kwargs` or `test_kwargs`, use the `ax` keyword instead"
         if sleep_id is None:
             if self.n_sleeps == 1:
                 refr_hyp = self.refr_hyps[self.sleep_ids[0]]
                 test_hyp = self.test_hyps[self.sleep_ids[0]]
             else:
-                raise NotImplementedError("Multi-session plotting is not currently supported")
+                raise NotImplementedError(
+                    "Multi-session plotting is not currently supported. 3 options being tested!"
+                )
         else:
             refr_hyp = self.refr_hyps[sleep_id]
             test_hyp = self.test_hyps[sleep_id]
@@ -618,6 +640,67 @@ def plot_hypnograms(self, sleep_id=None, legend=True, ax=None, refr_kwargs={}, t
                 ax.legend()
         return ax
 
+    def plot_group_hypnogram_opt1(self, ax=None, **kwargs):
+        if ax is None:
+            ax = plt.gca()
+        palette = {"Inaccurate": "plum", "Accurate": "forestgreen"}
+        hue_order = list(palette)
+        hist_kwargs = dict(multiple="stack", stat="count", element="step", discrete=True, lw=0)
+        ser = self.data[self.refr_scorer].eq(self.data[self.test_scorer])
+        df = ser.rename("acc").replace({True: "Accurate", False: "Inaccurate"}).reset_index()
+        sns.histplot(data=df, x="Epoch", hue="acc", hue_order=hue_order, palette=palette, ax=ax)
+        ax.set_ylabel("Number of unique sleep sessions")
+        ax.set_xlabel("Epochs")
+        ax.margins(x=0, y=0)
+        return ax
+
+    def plot_group_hypnogram_opt2(self, ax=None, **kwargs):
+        from pingouin import compute_bootci
+
+        plot_kwargs = dict(lw=1, color="plum", alpha=1, label="7-epoch rolling average")
+        plot_kwargs.update(kwargs)
+        betw_kwargs = dict(lw=0, alpha=0.3, color=plot_kwargs["color"], label="95% bootstrapped CI")
+        if ax is None:
+            ax = plt.gca()
+        df = self.data[self.refr_scorer].eq(self.data[self.test_scorer]).rename("acc").reset_index()
+        probas = df.groupby("Epoch")["acc"].mean()
+        ci = df.groupby("Epoch")["acc"].apply(compute_bootci, None, "mean").apply(pd.Series)
+        ci = ci.rename(columns={0: "low", 1: "high"})
+        probas = probas.rolling(10, center=True).mean()
+        ci = ci.rolling(10, center=True).mean()
+        ax.fill_between(ci.index, ci["low"], ci["high"], **betw_kwargs)
+        ax.plot(probas.index, probas, **plot_kwargs)
+        ax.set_ylabel("Accuracy across sleep sessions")
+        ax.set_xlabel("Epochs")
+        ax.set_xlim(0, len(probas))
+        ax.set_ylim(0, 1)
+        ax.legend()
+        return ax
+
+    def plot_group_hypnogram_opt3(self, figsize=(7, 10), **kwargs):
+        imshow_kwargs = dict(cmap="Blues", interpolation="none")
+        imshow_kwargs.update(kwargs)
+        n_rows = self.n_sleeps
+        freq = self.refr_hyps[self.sleep_ids[0]].freq
+        freq_secs = pd.Timedelta(freq).total_seconds()
+        fig, axes = plt.subplots(nrows=n_rows, figsize=figsize, sharex=True, sharey=False)
+        for ax, (subj, data) in zip(axes, self.data.groupby(level=0)):
+            img = data.values.T
+            extent = (0, freq_secs * img.shape[1], img.shape[0] - 0.5, -0.5)
+            ax.imshow(img, extent=extent, aspect="auto", origin="upper", **imshow_kwargs)
+            ax.set_yticks([0, 1])
+            ax.set_yticklabels([self.refr_scorer, self.test_scorer])
+            ax.set_ylabel(subj, rotation=0, va="center")
+            ax.spines[["top", "bottom", "left", "right"]].set_visible(False)
+            if not ax.get_subplotspec().is_first_row():
+                ax.tick_params(left=False, labelleft=False)
+            if not ax.get_subplotspec().is_last_row():
+                ax.tick_params(bottom=False)
+                ax.set_xlabel("Time [s]")
+                ax.spines["bottom"].set_visible(False)
+        fig.align_ylabels()
+        return fig
+
     def plot_roc(self, sleep_id=None, palette=None, ax=None, **kwargs):
         """Plot ROC curves for each stage.
 
@@ -652,6 +735,12 @@ class SleepStatsEvaluation:
     and YASA's automatic staging) by comparing their summary sleep statistics derived from multiple
     subjects or sessions.
 
+    .. warning::
+        :py:class:`yasa.evaluation.SleepStatsEvaluation` is a new YASA feature and the API is
+        subject to future change.
+
+    .. versionadded:: 0.7.0
+
     Parameters
     ----------
     refr_data : :py:class:`pandas.DataFrame`
@@ -666,12 +755,16 @@ class SleepStatsEvaluation:
         Name of the test scorer, used for labeling.
     alpha : float
         Alpha cutoff used for all three tests.
-    kwargs_normality : dict
+    normality_kwargs : dict
         Keywords arguments passed to the :py:func:`pingouin.normality` call.
-    kwargs_regression : dict
+    regression_kwargs : dict
         Keywords arguments passed to the :py:func:`pingouin.linear_regression` call.
-    kwargs_homoscedasticity : dict
+    homoscedasticity_kwargs : dict
         Keywords arguments passed to the :py:func:`pingouin.homoscedasticity` call.
+    verbose : bool or str
+        Verbose level. Default (True) logs 'info' messages; False logs only warnings and errors.
+        The logging levels are 'debug', 'info', 'warning', 'error', and 'critical'. For most users
+        the choice is between 'info' (``verbose=True``) and 'warning' (``verbose=False``).
 
     Notes
     -----
@@ -691,15 +784,19 @@ class SleepStatsEvaluation:
     >>> import yasa
     >>>
     >>> # For this example, generate two fake datasets of sleep statistics
-    >>> hypsA = [yasa.simulate_hypnogram(tib=600, seed=i) for i in range(20)]
-    >>> hypsB = [h.simulate_similar(tib=600, seed=i) for i, h in enumerate(hypsA)]
-    >>> sstatsA = pd.Series(hypsA).map(lambda h: h.sleep_statistics()).apply(pd.Series)
-    >>> sstatsB = pd.Series(hypsB).map(lambda h: h.sleep_statistics()).apply(pd.Series)
-    >>> sstatsA.index = sstatsB.index = sstatsA.index.map(lambda x: f"sub-{x+1:03d}")
+    >>> hypsA = [yasa.simulate_hypnogram(tib=600, scorer="Ref", seed=i) for i in range(20)]
+    >>> hypsB = [h.simulate_similar(tib=600, scorer="Test", seed=i) for i, h in enumerate(hypsA)]
+    >>> # sstatsA = pd.Series(hypsA).map(lambda h: h.sleep_statistics()).apply(pd.Series)
+    >>> # sstatsB = pd.Series(hypsB).map(lambda h: h.sleep_statistics()).apply(pd.Series)
+    >>> # sstatsA.index = sstatsB.index = sstatsA.index.map(lambda x: f"sub-{x+1:03d}")
+    >>> ebe = yasa.EpochByEpochEvaluation(hypsA, hypsB)
+    >>> sstats = ebe.get_sleep_stats()
+    >>> sstatsA = sstats.loc["Ref"]
+    >>> sstatsB = sstats.loc["Test"]
     >>>
     >>> sse = yasa.SleepStatsEvaluation(sstatsA, sstatsB)
     >>>
-    >>> sse.summary(descriptives=False)
+    >>> sse.summary()
            normal  unbiased  homoscedastic
     sstat
     %N1      True      True           True
@@ -755,10 +852,13 @@ def __init__(
         *,
         refr_scorer="Reference",
         test_scorer="Test",
-        kwargs_normality={"alpha": 0.05},
-        kwargs_regression={"alpha": 0.05},
-        kwargs_homoscedasticity={"alpha": 0.05},
+        normality_kwargs={"alpha": 0.05},
+        regression_kwargs={"alpha": 0.05},
+        homoscedasticity_kwargs={"alpha": 0.05},
+        verbose=True,
     ):
+        set_log_level(verbose)
+
         assert isinstance(refr_data, pd.DataFrame), "`refr_data` must be a pandas DataFrame"
         assert isinstance(test_data, pd.DataFrame), "`test_data` must be a pandas DataFrame"
         assert np.array_equal(
@@ -773,28 +873,28 @@ def __init__(
         assert isinstance(refr_scorer, str), "`refr_scorer` must be a string"
         assert isinstance(test_scorer, str), "`test_scorer` must be a string"
         assert refr_scorer != test_scorer, "`refr_scorer` and `test_scorer` must be unique"
-        assert isinstance(kwargs_normality, dict), "`kwargs_normality` must be a dictionary"
-        assert isinstance(kwargs_regression, dict), "`kwargs_regression` must be a dictionary"
-        assert isinstance(kwargs_homoscedasticity, dict), "`kwargs_homoscedasticity` must be a dict"
-        assert "alpha" in kwargs_normality, "`kwargs_normality` must include 'alpha'"
-        assert "alpha" in kwargs_regression, "`kwargs_regression` must include 'alpha'"
-        assert "alpha" in kwargs_homoscedasticity, "`kwargs_homoscedasticity` must include 'alpha'"
+        assert isinstance(normality_kwargs, dict), "`normality_kwargs` must be a dictionary"
+        assert isinstance(regression_kwargs, dict), "`regression_kwargs` must be a dictionary"
+        assert isinstance(homoscedasticity_kwargs, dict), "`homoscedasticity_kwargs` must be a dict"
+        assert "alpha" in normality_kwargs, "`normality_kwargs` must include 'alpha'"
+        assert "alpha" in regression_kwargs, "`regression_kwargs` must include 'alpha'"
+        assert "alpha" in homoscedasticity_kwargs, "`homoscedasticity_kwargs` must include 'alpha'"
 
         # If refr_data and test_data indices are unnamed, name them
         sleep_id_str = "sleep_id" if refr_data.index.name is None else refr_data.index.name
         refr_data.index.name = sleep_id_str
         test_data.index.name = sleep_id_str
 
-        # Get scorer differences
-        diff_data = test_data.sub(refr_data)
+        # Get scorer discrepancies (i.e., differences, test minus reference)
+        discrepancies = test_data.sub(refr_data)
 
         # Convert to MultiIndex with new scorer level
-        diff_data = pd.concat({"difference": diff_data}, names=["scorer"])
+        discrepancies = pd.concat({"difference": discrepancies}, names=["scorer"])
         refr_data = pd.concat({refr_scorer: refr_data}, names=["scorer"])
         test_data = pd.concat({test_scorer: test_data}, names=["scorer"])
 
         # Merge dataframes and reshape to long format
-        data = pd.concat([refr_data, test_data, diff_data])
+        data = pd.concat([refr_data, test_data, discrepancies])
         data = (
             data.melt(var_name="sstat", ignore_index=False)
             .reset_index()
@@ -810,9 +910,12 @@ def __init__(
             logger.warning(f"All {s} differences are zero, removing from evaluation.")
 
         ## NORMALITY ##
-        # Test reference data for normality at each sleep statistic
+        # Test difference data (test - reference) for normality at each sleep statistic
         normality = (
-            data.groupby("sstat")[refr_scorer].apply(pg.normality, **kwargs_normality).droplevel(-1)
+            data
+            .groupby("sstat")["difference"]
+            .apply(pg.normality, **normality_kwargs)
+            .droplevel(-1)
         )
 
         ## PROPORTIONAL BIAS ##
@@ -822,7 +925,7 @@ def __init__(
         for ss_name, ss_df in data.groupby("sstat"):
             # Regress the difference scores on the reference scores
             model = pg.linear_regression(
-                ss_df[refr_scorer], ss_df["difference"], **kwargs_regression
+                ss_df[refr_scorer], ss_df["difference"], **regression_kwargs
             )
             model.insert(0, "sstat", ss_name)
             # Extract sleep-level residuals for later homoscedasticity tests
@@ -843,11 +946,11 @@ def __init__(
         # Now remove intercept rows
         prop_bias = prop_bias.query("names != 'Intercept'").drop(columns="names").set_index("sstat")
         # Add True/False passing column for easy access
-        prop_bias["unbiased"] = prop_bias["pval"].ge(kwargs_regression["alpha"])
+        prop_bias["unbiased"] = prop_bias["pval"].ge(regression_kwargs["alpha"])
 
         ## Test each statistic for homoscedasticity ##
         columns = [refr_scorer, "difference", "pbias_residual"]
-        homoscedasticity_f = lambda df: pg.homoscedasticity(df[columns], **kwargs_homoscedasticity)
+        homoscedasticity_f = lambda df: pg.homoscedasticity(df[columns], **homoscedasticity_kwargs)
         homoscedasticity = data.groupby("sstat").apply(homoscedasticity_f).droplevel(-1)
 
         # Set attributes
@@ -860,8 +963,7 @@ def __init__(
         self._test_scorer = test_scorer
         self._sleep_id_str = sleep_id_str
         self._n_sleeps = data[sleep_id_str].nunique()
-        self._diff_data = diff_data.drop(columns=stats_nodiff)
-        # self._diff_data = data.pivot(index=sleep_id_str, columns="sstat", values="difference")
+        self._discrepancies = discrepancies.drop(columns=stats_nodiff)
 
     @property
     def data(self):
@@ -871,10 +973,10 @@ def data(self):
         return self._data
 
     @property
-    def diff_data(self):
+    def discrepancies(self):
         """A :py:class:`pandas.DataFrame` of ``test_data`` minus ``refr_data``."""
         # # Pivot for session-rows and statistic-columns
-        return self._diff_data
+        return self._discrepancies
 
     @property
     def refr_scorer(self):
@@ -986,7 +1088,7 @@ def plot_discrepancies_heatmap(self, sleep_stats=None, **kwargs):
         if "cbar_kws" in kwargs:
             heatmap_kwargs["cbar_kws"].update(kwargs["cbar_kws"])
         heatmap_kwargs.update(kwargs)
-        table = self.diff_data[sleep_stats]
+        table = self.discrepancies[sleep_stats]
         # Normalize statistics (i.e., columns) between zero and one then convert to percentage
         table_norm = table.sub(table.min(), axis=1).div(table.apply(np.ptp)).multiply(100)
         if heatmap_kwargs["annot"]:
@@ -994,12 +1096,12 @@ def plot_discrepancies_heatmap(self, sleep_stats=None, **kwargs):
             heatmap_kwargs["annot"] = table.to_numpy()
         return sns.heatmap(table_norm, **heatmap_kwargs)
 
-    def plot_discrepancies_dotplot(self, kwargs_pairgrid={"palette": "winter"}, **kwargs):
+    def plot_discrepancies_dotplot(self, pairgrid_kwargs={"palette": "winter"}, **kwargs):
         """Visualize session-level discrepancies, generally for outlier inspection.
 
         Parameters
         ----------
-        kwargs_pairgrid : dict
+        pairgrid_kwargs : dict
             Keywords arguments passed to the :py:class:`seaborn.PairGrid` call.
         **kwargs : key, value pairs
             Additional keyword arguments are passed to the :py:func:`seaborn.stripplot` call.
@@ -1017,19 +1119,19 @@ def plot_discrepancies_dotplot(self, kwargs_pairgrid={"palette": "winter"}, **kw
         .. plot::
             ## TODO: Example using x_vars
         """
-        assert isinstance(kwargs_pairgrid, dict), "`kwargs_pairgrid` must be a dict"
-        stripplot_kwargs = {"size": 10, "linewidth": 1, "edgecolor": "white"}
-        stripplot_kwargs.update(kwargs)
+        assert isinstance(pairgrid_kwargs, dict), "`pairgrid_kwargs` must be a dict"
+        kwargs_stripplot = {"size": 10, "linewidth": 1, "edgecolor": "white"}
+        kwargs_stripplot.update(kwargs)
         # Initialize the PairGrid
-        height = 0.3 * len(self.diff_data)
+        height = 0.3 * len(self.discrepancies)
         aspect = 0.6
-        pairgrid_kwargs = dict(hue=self.sleep_id_str, height=height, aspect=aspect)
-        pairgrid_kwargs.update(kwargs_pairgrid)
+        kwargs_pairgrid = dict(hue=self.sleep_id_str, height=height, aspect=aspect)
+        kwargs_pairgrid.update(pairgrid_kwargs)
         g = sns.PairGrid(
-            self.diff_data.reset_index(), y_vars=[self.sleep_id_str], **pairgrid_kwargs
+            self.discrepancies.reset_index(), y_vars=[self.sleep_id_str], **kwargs_pairgrid
         )
         # Draw the dots
-        g.map(sns.stripplot, orient="h", jitter=False, **stripplot_kwargs)
+        g.map(sns.stripplot, orient="h", jitter=False, **kwargs_stripplot)
         # Adjust aesthetics
         for ax in g.axes.flat:
             ax.set(title=ax.get_xlabel())
@@ -1040,14 +1142,14 @@ def plot_discrepancies_dotplot(self, kwargs_pairgrid={"palette": "winter"}, **kw
         sns.despine(left=True, bottom=True)
         return g
 
-    def plot_blandaltman(self, kwargs_facetgrid={}, **kwargs):
+    def plot_blandaltman(self, facetgrid_kwargs={}, **kwargs):
         """
 
         **Use col_order=sstats_order for plotting a subset.
 
         Parameters
         ----------
-        kwargs_facetgrid : dict
+        facetgrid_kwargs : dict
             Keyword arguments passed to the :py:class:`seaborn.FacetGrid` call.
         **kwargs : key, value pairs
             Additional keyword arguments are passed to :py:func:`pingouin.plot_blandaltman`.
@@ -1057,14 +1159,14 @@ def plot_blandaltman(self, kwargs_facetgrid={}, **kwargs):
         g : :py:class:`seaborn.FacetGrid`
             A :py:class:`seaborn.FacetGrid` with sleep statistics Bland-Altman plots on each axis.
         """
-        facetgrid_kwargs = dict(col_wrap=4, height=2, aspect=1, sharex=False, sharey=False)
-        facetgrid_kwargs.update(kwargs_facetgrid)
-        blandaltman_kwargs = dict(xaxis="y", annotate=False, edgecolor="black", facecolor="none")
-        blandaltman_kwargs.update(kwargs)
+        kwargs_facetgrid = dict(col_wrap=4, height=2, aspect=1, sharex=False, sharey=False)
+        kwargs_facetgrid.update(facetgrid_kwargs)
+        kwargs_blandaltman = dict(xaxis="y", annotate=False, edgecolor="black", facecolor="none")
+        kwargs_blandaltman.update(kwargs)
         # Initialize a grid of plots with an Axes for each sleep statistic
-        g = sns.FacetGrid(self.data, col="sstat", **facetgrid_kwargs)
+        g = sns.FacetGrid(self.data, col="sstat", **kwargs_facetgrid)
         # Draw Bland-Altman plot on each axis
-        g.map(pg.plot_blandaltman, self.test_scorer, self.refr_scorer, **blandaltman_kwargs)
+        g.map(pg.plot_blandaltman, self.test_scorer, self.refr_scorer, **kwargs_blandaltman)
         # Adjust aesthetics
         for ax in g.axes.flat:
             # Tidy-up axis limits with symmetric y-axis and minimal ticks

From 7d3fd1539598ea16aef59fdb98e28e43682488ea Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Mon, 18 Dec 2023 16:48:58 -0600
Subject: [PATCH 37/43] major EpochByEpoch restructure

class methods alphabetical order
---
 yasa/evaluation.py | 1564 ++++++++++++++++++++++++++------------------
 yasa/hypno.py      |   44 +-
 2 files changed, 961 insertions(+), 647 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index 5a52105..da75a4a 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -1,9 +1,6 @@
 """
-YASA code for evaluating the agreement between two scorers.
-
-There are two levels of evaluating staging performance:
-- Comparing two hypnograms (e.g., human vs automated scorer)
-- Comparing summary sleep statistics between two scorers (e.g., PSG vs actigraphy)
+YASA code for evaluating the agreement between two scorers (e.g., human vs YASA), either at the
+epoch-by-epoch level or at the level of summary sleep statistics.
 
 Analyses are modeled after the standardized framework proposed in Menghini et al., 2021, SLEEP.
 See the following resources:
@@ -15,118 +12,134 @@
 
 import numpy as np
 import pandas as pd
-import pingouin as pg
 import sklearn.metrics as skm
-from scipy.stats import zscore
+from scipy import stats
 
 import seaborn as sns
 import matplotlib.pyplot as plt
 
-from yasa.io import set_log_level
 from yasa.plotting import plot_hypnogram
 
 
 logger = logging.getLogger("yasa")
 
 __all__ = [
-    "EpochByEpochEvaluation",
-    "SleepStatsEvaluation",
+    "EpochByEpochAgreement",
+    "SleepStatsAgreement",
 ]
 
 
-#############################################################################
+################################################################################
 # EPOCH BY EPOCH
-#############################################################################
-
-
-class EpochByEpochEvaluation:
-    """Evaluate agreement between two collections of hypnograms.
+################################################################################
 
-    For example, evaluate the agreement between manually-scored hypnograms and automatically-scored
-    hypnograms, or hypnograms derived from actigraphy.
 
-    Many steps here are modeled after guidelines proposed in Menghini et al., 2021 [Menghini2021]_.
-    See https://sri-human-sleep.github.io/sleep-trackers-performance/AnalyticalPipeline_v1.0.0.html
+class EpochByEpochAgreement:
+    """Evaluate agreement between two hypnograms or two collections of hypnograms.
 
-    .. warning::
-        :py:class:`yasa.evaluation.EpochByEpochEvaluation` is a new YASA feature and the API is
-        subject to future change.
+    Evaluation includes averaged agreement scores, one-vs-rest agreement scores, agreement scores
+    summarized across all sleep and summarized by sleep stage, and various plotting options to
+    visualize the two hypnograms simultaneously. See examples for more detail.
 
     .. versionadded:: 0.7.0
 
     Parameters
     ----------
-    refr_hyps : iterable of :py:class:`yasa.Hypnogram`
-        A collection of reference (i.e., ground-truth) hypnograms.
+    ref_hyps : iterable of :py:class:`yasa.Hypnogram`
+        A collection of reference hypnograms (i.e., those considered ground-truth).
 
-        Each :py:class:`yasa.Hypnogram` in ``refr_hyps`` must have the same
+        Each :py:class:`yasa.Hypnogram` in ``ref_hyps`` must have the same
         :py:attr:`~yasa.Hypnogram.scorer`.
 
         If a ``dict``, key values are use to generate unique sleep session IDs. If any other
         iterable (e.g., ``list`` or ``tuple``), then unique sleep session IDs are automatically
         generated.
-    test_hyps : iterable of :py:class:`yasa.Hypnogram`
-        A collection of test (i.e., to-be-evaluated) hypnograms.
+    obs_hyps : iterable of :py:class:`yasa.Hypnogram`
+        A collection of observed hypnograms (i.e., those to be evaluated).
 
-        Each :py:class:`yasa.Hypnogram` in ``test_hyps`` must have the same
+        Each :py:class:`yasa.Hypnogram` in ``obs_hyps`` must have the same
         :py:attr:`~yasa.Hypnogram.scorer`, and this scorer must be different than the scorer of
-        hypnograms in ``refr_hyps``.
+        hypnograms in ``ref_hyps``.
 
-        If a ``dict``, key values must match those of ``refr_hyps``.
+        If a ``dict``, key values must match those of ``ref_hyps``.
 
     .. important::
-        It is assumed that the order of hypnograms are the same in ``refr_hyps`` and ``test_hyps``.
-        For example, the third hypnogram in ``refr_hyps`` and ``test_hyps`` come from the same sleep
-        session, and only differ in that they have different scorers.
+        It is assumed that the order of hypnograms is the same in ``ref_hyps`` and ``obs_hyps``.
+        For example, the third hypnogram in ``ref_hyps`` and ``obs_hyps`` must come from the same
+        sleep session, and they must only differ in that they have different scorers.
 
     .. seealso:: For comparing just two hypnograms, use :py:meth:`yasa.Hynogram.evaluate`.
 
+    Notes
+    -----
+    Many steps here are modeled after guidelines proposed in Menghini et al., 2021 [Menghini2021]_.
+    See https://sri-human-sleep.github.io/sleep-trackers-performance/AnalyticalPipeline_v1.0.0.html
+
     References
     ----------
     .. [Menghini2021] Menghini, L., Cellini, N., Goldstone, A., Baker, F. C., & de Zambotti, M.
                       (2021). A standardized framework for testing the performance of sleep-tracking
-                       technology: step-by-step guidelines and open-source code. Sleep, 44(2),
+                       technology: step-by-step guidelines and open-source code. SLEEP, 44(2),
                        zsaa170. https://doi.org/10.1093/sleep/zsaa170
 
     Examples
     --------
     >>> import yasa
-    >>> hyps_a = [yasa.simulate_hypnogram(tib=600, scorer="RaterA", seed=i) for i in range(20)]
-    >>> hyps_b = [h.simulate_similar(scorer="RaterB", seed=i) for i, h in enumerate(refr_hyps)]
-    >>> ebe = yasa.EpochByEpochEvaluation(hyps_a, hyps_b)
-
-    >>> ebe.get_agreement().round(3)
-    metric
-    accuracy              0.209
-    kappa                -0.051
-    weighted_jaccard      0.130
-    weighted_precision    0.247
-    weighted_recall       0.209
-    weighted_f1           0.223
-    Name: agreement, dtype: float64
-
-    >>> ebe.get_agreement_by_stage().round(3)
-    stage         WAKE       N1       N2       N3   REM  ART  UNS
-    metric
-    precision    0.188    0.016    0.315    0.429   0.0  0.0  0.0
-    recall       0.179    0.018    0.317    0.235   0.0  0.0  0.0
-    fscore       0.183    0.017    0.316    0.303   0.0  0.0  0.0
-    support    290.000  110.000  331.000  179.000  50.0  0.0  0.0
+    >>> ref_hyps = [yasa.simulate_hypnogram(tib=600, scorer="Human", seed=i) for i in range(10)]
+    >>> obs_hyps = [h.simulate_similar(scorer="YASA", seed=i) for i, h in enumerate(ref_hyps)]
+    >>> ebe = yasa.EpochByEpochAgreement(ref_hyps, obs_hyps)
+    >>> agr = ebe.get_agreement()
+    >>> agr.head(5).round(2)
+              accuracy  balanced_acc  kappa   mcc  precision  recall  fbeta
+    sleep_id
+    1             0.31          0.26   0.07  0.07       0.31    0.31   0.31
+    2             0.33          0.33   0.14  0.14       0.35    0.33   0.34
+    3             0.35          0.24   0.06  0.06       0.35    0.35   0.35
+    4             0.22          0.21   0.01  0.01       0.21    0.22   0.21
+    5             0.21          0.17  -0.06 -0.06       0.20    0.21   0.21
+
+    >>> ebe.get_agreement_bystage().head(12).round(3)
+                    fbeta  precision  recall  support
+    stage sleep_id
+    WAKE  1         0.391      0.371   0.413    189.0
+          2         0.299      0.276   0.326    184.0
+          3         0.234      0.204   0.275    255.0
+          4         0.268      0.285   0.252    321.0
+          5         0.228      0.230   0.227    181.0
+          6         0.407      0.384   0.433    284.0
+          7         0.362      0.296   0.467    287.0
+          8         0.298      0.519   0.209    263.0
+          9         0.210      0.191   0.233    313.0
+          10        0.369      0.420   0.329    362.0
+    N1    1         0.185      0.185   0.185    124.0
+          2         0.121      0.131   0.112    160.0
+
+    >>> ebe.get_confusion_matrix(sleep_id=1)
+    YASA   WAKE  N1   N2  N3  REM
+    Human
+    WAKE     78  24   50   3   34
+    N1       23  23   43  15   20
+    N2       60  58  183  43  139
+    N3       30  10   50   5   32
+    REM      19   9  121  50   78
 
     .. plot::
 
         >>> import matplotlib.pyplot as plt
         >>> fig, ax = plt.subplots(figsize=(6, 3), constrained_layout=True)
-        >>> ebe.plot_hypnograms()
+        >>> ebe.plot_hypnograms(sleep_id=10)
 
     .. plot::
 
         >>> fig, ax = plt.subplots(figsize=(6, 3))
-        >>> ebe.plot_hypnograms(ax=ax, kwargs_test={"color": "black", "lw": 2, "ls": "dotted"})
+        >>> ebe.plot_hypnograms(
+        >>>     sleep_id=8, ax=ax, obs_kwargs={"color": "red", "lw": 2, "ls": "dotted"}
+        >>> )
         >>> plt.tight_layout()
 
     .. plot::
 
+        >>> session = 8
         >>> fig, ax = plt.subplots(figsize=(6.5, 2.5), constrained_layout=True)
         >>> style_a = dict(alpha=1, lw=2.5, ls="solid", color="gainsboro", label="Michel")
         >>> style_b = dict(alpha=1, lw=2.5, ls="solid", color="cornflowerblue", label="Jouvet")
@@ -134,166 +147,110 @@ class EpochByEpochEvaluation:
         >>>     title="Scorer", frameon=False, ncol=2, loc="lower center", bbox_to_anchor=(0.5, 0.9)
         >>> )
         >>> ax = ebe.plot_hypnograms(
-        >>>     kwargs_ref=style_a, kwargs_test=style_b, legend=legend_style, ax=ax
+        >>>     sleep_id=session, ref_kwargs=style_a, obs_kwargs=style_b, legend=legend_style, ax=ax
+        >>> )
+        >>> acc = ebe.get_agreement().multiply(100).at[session, "accuracy"]
+        >>> ax.text(
+        >>>     0.01, 1, f"Accuracy = {acc:.0f}%", ha="left", va="bottom", transform=ax.transAxes
         >>> )
-        >>>
-        >>> acc = ebe.get_agreement().multiply(100).round(0).at["accuracy"]
-        >>> ax.text(0.01, 1, f"Accuracy = {acc}%", ha="left", va="bottom", transform=ax.transAxes)
 
-    When comparing only 2 hypnograms, use the :py:meth:`yasa.Hynogram.evaluate` method:
+    When comparing only 2 hypnograms, use the :py:meth:`~yasa.Hypnogram.evaluate` method:
 
     >>> hypno_a = yasa.simulate_hypnogram(tib=90, scorer="RaterA", seed=8)
     >>> hypno_b = hypno_a.simulate_similar(scorer="RaterB", seed=9)
     >>> ebe = hypno_a.evaluate(hypno_b)
-
     >>> ebe.get_confusion_matrix()
-    RaterB  WAKE   N1   N2  N3  REM  ART  UNS  Total
+    RaterB  WAKE  N1  N2  N3
     RaterA
-    WAKE      52   38  126  23   51    0    0    290
-    N1        59    2   27   8   14    0    0    110
-    N2       117   50  105  15   44    0    0    331
-    N3        34   26   62  42   15    0    0    179
-    REM       15   12   13  10    0    0    0     50
-    ART        0    0    0   0    0    0    0      0
-    UNS        0    0    0   0    0    0    0      0
-    Total    277  128  333  98  124    0    0    960
+    WAKE      71   2  20   8
+    N1         1   0   9   0
+    N2        12   4  25   0
+    N3        24   0   1   3
     """
 
-    def __init__(self, refr_hyps, test_hyps):
+    def __init__(self, ref_hyps, obs_hyps):
         from yasa.hypno import Hypnogram  # Avoiding circular import
 
-        assert hasattr(refr_hyps, "__iter__"), "`refr_hyps` must be a an iterable"
-        assert hasattr(test_hyps, "__iter__"), "`test_hyps` must be a an iterable"
-        assert type(refr_hyps) == type(test_hyps), "`refr_hyps` and `test_hyps` must be same type"
-        assert len(refr_hyps) == len(
-            test_hyps
-        ), "`refr_hyps` and `test_hyps` must have the same number of hypnograms"
+        assert hasattr(ref_hyps, "__iter__"), "`ref_hyps` must be a an iterable"
+        assert hasattr(obs_hyps, "__iter__"), "`obs_hyps` must be a an iterable"
+        assert type(ref_hyps) == type(obs_hyps), "`ref_hyps` and `obs_hyps` must be the same type"
+        assert len(ref_hyps) == len(
+            obs_hyps
+        ), "`ref_hyps` and `obs_hyps` must have the same number of hypnograms"
 
-        if isinstance(refr_hyps, dict):
+        if isinstance(ref_hyps, dict):
             # If user provides dictionaries, split into sleep IDs and hypnograms
             assert (
-                refr_hyps.keys() == test_hyps.keys()
-            ), "hypnograms in `refr_hyps` and `test_hyps` must have identical sleep IDs"
-            sleep_ids, refr_hyps = zip(*refr_hyps.items())
-            test_hyps = tuple(test_hyps.values())
+                ref_hyps.keys() == obs_hyps.keys()
+            ), "keys in `ref_hyps` must be the same as keys in `obs_hyps`"
+            sleep_ids, ref_hyps = zip(*ref_hyps.items())
+            obs_hyps = tuple(obs_hyps.values())
         else:
             # Create hypnogram_ids
-            sleep_ids = tuple(range(1, 1 + len(refr_hyps)))
+            sleep_ids = tuple(range(1, 1 + len(ref_hyps)))
 
         assert all(
-            isinstance(hyp, Hypnogram) for hyp in refr_hyps + test_hyps
-        ), "`refr_hyps` and `test_hyps` must only include YASA hypnograms"
+            isinstance(hyp, Hypnogram) for hyp in ref_hyps + obs_hyps
+        ), "`ref_hyps` and `obs_hyps` must only contain YASA hypnograms"
         assert all(
-            h.scorer is not None for h in refr_hyps + test_hyps
+            h.scorer is not None for h in ref_hyps + obs_hyps
         ), "all hypnograms must have a scorer name"
-        for h1, h2 in zip((refr_hyps + test_hyps)[:-1], (refr_hyps + test_hyps)[1:]):
+        for h1, h2 in zip((ref_hyps + obs_hyps)[:-1], (ref_hyps + obs_hyps)[1:]):
             assert h1.freq == h2.freq, "all hypnograms must have the same freq"
             assert h1.labels == h2.labels, "all hypnograms must have the same labels"
             assert h1.mapping == h2.mapping, "all hypnograms must have the same mapping"
             assert h1.n_stages == h2.n_stages, "all hypnograms must have the same n_stages"
         assert all(
-            h1.scorer == h2.scorer for h1, h2 in zip(refr_hyps[:-1], refr_hyps[1:])
-        ), "all `refr_hyps` must have the same scorer"
+            h1.scorer == h2.scorer for h1, h2 in zip(ref_hyps[:-1], ref_hyps[1:])
+        ), "all `ref_hyps` must have the same scorer"
         assert all(
-            h1.scorer == h2.scorer for h1, h2 in zip(test_hyps[:-1], test_hyps[1:])
-        ), "all `test_hyps` must have the same scorer"
+            h1.scorer == h2.scorer for h1, h2 in zip(obs_hyps[:-1], obs_hyps[1:])
+        ), "all `obs_hyps` must have the same scorer"
         assert all(
-            h1.scorer != h2.scorer for h1, h2 in zip(refr_hyps, test_hyps)
-        ), "each `refr_hyps` and `test_hyps` pair must have unique scorers"
+            h1.scorer != h2.scorer for h1, h2 in zip(ref_hyps, obs_hyps)
+        ), "each `ref_hyps` and `obs_hyps` pair must have unique scorers"
         assert all(
-            h1.n_epochs == h2.n_epochs for h1, h2 in zip(refr_hyps, test_hyps)
-        ), "each `refr_hyps` and `test_hyps` pair must have the same n_epochs"
-        ## Q: Could use set() for those above.
-        ##    Or set scorer as the first available and check all equal.
+            h1.n_epochs == h2.n_epochs for h1, h2 in zip(ref_hyps, obs_hyps)
+        ), "each `ref_hyps` and `obs_hyps` pair must have the same n_epochs"
 
-        # Convert to dictionaries with sleep_ids and hypnograms
-        refr_hyps = {s: h for s, h in zip(sleep_ids, refr_hyps)}
-        test_hyps = {s: h for s, h in zip(sleep_ids, test_hyps)}
+        # Convert ref_hyps and obs_hyps to dictionaries with sleep_id keys and hypnogram values
+        ref_hyps = {s: h for s, h in zip(sleep_ids, ref_hyps)}
+        obs_hyps = {s: h for s, h in zip(sleep_ids, obs_hyps)}
 
         # Merge all hypnograms into a single MultiIndexed dataframe
-        refr = pd.concat(
-            pd.concat({s: h.as_int()}, names=["sleep_id"]) for s, h in refr_hyps.items()
+        ref = pd.concat(
+            pd.concat({s: h.as_int()}, names=["sleep_id"]) for s, h in ref_hyps.items()
         )
-        test = pd.concat(
-            pd.concat({s: h.as_int()}, names=["sleep_id"]) for s, h in test_hyps.items()
+        obs = pd.concat(
+            pd.concat({s: h.as_int()}, names=["sleep_id"]) for s, h in obs_hyps.items()
         )
-        data = pd.concat([refr, test], axis=1)
+        data = pd.concat([ref, obs], axis=1)
 
-        ########################################################################
-        # INDIVIDUAL-LEVEL AGREEMENT
-        ########################################################################
-
-        # Get individual-level averaged/weighted agreement scores
-        indiv_agree_avg = data.groupby(level=0).apply(self.multi_scorer).apply(pd.Series)
-        ## Q: Check speed against pd.DataFrame({s: multscore(hyps[s], hyps[s]) for s in subjects})
-
-        # Get individual-level one-vs-rest/un-weighted agreement scores
-        # Labels ensures the order of returned scores is known
-        # It also can be used to remove unused labels, but that will be taken care of later anyways
-        # skm_labels = [l for l in refr_hyps[sleep_ids[0]].hypno.cat.categories if l in data.values]
-        # skm will return an array of results, so mapping must be linear without skips
-        ## Q: Another option is to get Series.cat.codes for ints and use cat.categories for mapping
-        skm_labels = np.unique(data).tolist()
-        skm_mapping = {i: l for i, l in enumerate(skm_labels)}  # skm integers to YASA integers
-        mapping_int = refr_hyps[sleep_ids[0]].mapping_int.copy()  # YASA integers to YASA strings
-        # labels = refr_hyps[sleep_ids[0]].labels.copy()  # To preserve YASA ordering
-        # labels = [v for k, v in mapping_int.items() if k in skm_labels]  # To preserve YASA ordering
-        prfs_wrapper = lambda df: skm.precision_recall_fscore_support(
-            *df.values.T, beta=1, labels=skm_labels, average=None, zero_division=0
-        )
-        indiv_agree_ovr = (
-            data
-            # Get precision, recall, f1, and support for each individual sleep session
-            .groupby(level=0)
-            .apply(prfs_wrapper)
-            # Unpack arrays
-            .explode()
-            .apply(pd.Series)
-            # Add metric labels and prepend to index, creating MultiIndex
-            .assign(metric=["precision", "recall", "fbeta", "support"] * len(refr_hyps))
-            .set_index("metric", append=True)
-            # Convert stage column names to string labels
-            .rename_axis(columns="stage")
-            .rename(columns=skm_mapping)
-            .rename(columns=mapping_int)
-            # Remove all-zero rows (i.e., stages that were not present in the hypnogram)
-            .pipe(lambda df: df.loc[:, df.any()])
-            # Reshape so metrics are columns
-            .stack()
-            .unstack("metric")
-            .rename_axis(columns=None)
-            # Swap MultiIndex levels and sort so stages in standard YASA order
-            .swaplevel()
-            .sort_index(
-                level="stage", key=lambda x: x.map(lambda y: list(mapping_int.values()).index(y))
-            )
-        )
+        # Generate some mapping dictionaries to be used later in class methods
+        skm_labels = np.unique(data).tolist()  # all unique YASA integer codes in this hypno
+        skm2yasa_map = {i: l for i, l in enumerate(skm_labels)}  # skm order to YASA integers
+        yasa2yasa_map = ref_hyps[sleep_ids[0]].mapping_int.copy()  # YASA integer to YASA string
 
         # Set attributes
         self._data = data
         self._sleep_ids = sleep_ids
-        self._n_sleeps = len(sleep_ids)
-        self._refr_hyps = refr_hyps
-        self._test_hyps = test_hyps
-        self._refr_scorer = refr_hyps[sleep_ids[0]].scorer
-        self._test_scorer = test_hyps[sleep_ids[0]].scorer
+        self._ref_hyps = ref_hyps
+        self._obs_hyps = obs_hyps
+        self._ref_scorer = ref_hyps[sleep_ids[0]].scorer
+        self._obs_scorer = obs_hyps[sleep_ids[0]].scorer
         self._skm_labels = skm_labels
-        self._skm_mapping = skm_mapping
-        self._mapping_int = mapping_int
-        self._indiv_agree_avg = indiv_agree_avg
-        self._indiv_agree_ovr = indiv_agree_ovr
-        ## Q: Merge these to one individual agreement dataframe?
-        ##    Setting average="binary" to fill extra column in over dataframe
+        self._skm2yasa_map = skm2yasa_map
+        self._yasa2yasa_map = yasa2yasa_map
 
     def __repr__(self):
         # TODO v0.8: Keep only the text between < and >
-        s = "s" if self._n_sleeps > 1 else ""
+        s = "s" if self.n_sleeps > 1 else ""
         return (
-            f"<EpochByEpochEvaluation | Test hypnogram{s} scored by {self.test_scorer} evaluated "
-            f"against reference hypnogram{s} scored by {self.refr_scorer}, {self._n_sleeps} sleep "
-            f"session{s}>\n"
-            " - Use `.get_agreement()` to get agreement measures as a pandas.Series\n"
-            " - Use `.plot_hypnograms()` to plot the two hypnograms overlaid\n"
+            f"<EpochByEpochAgreement | Observed hypnogram{s} scored by {self.obs_scorer} "
+            f"evaluated against reference hypnogram{s} scored by {self.ref_scorer}, "
+            f"{self.n_sleeps} sleep session{s}>\n"
+            " - Use `.get_agreement()` to get agreement measures as a pandas DataFrame or Series\n"
+            " - Use `.plot_hypnograms()` to plot two overlaid hypnograms\n"
             "See the online documentation for more details."
         )
 
@@ -305,218 +262,207 @@ def data(self):
         """A :py:class:`pandas.DataFrame` including all hypnograms."""
         return self._data
 
-    @property
-    def refr_hyps(self):
-        """A dictionary of all reference YASA hypnograms with sleep IDs as keys."""
-        return self._refr_hyps
-
-    @property
-    def test_hyps(self):
-        """A dictionary of all test YASA hypnograms with sleep IDs as keys."""
-        return self._test_hyps
-
-    @property
-    def sleep_ids(self):
-        """A tuple of all sleep IDs."""
-        return self._sleep_ids
-
     @property
     def n_sleeps(self):
         """The number of unique sleep sessions."""
-        return self._n_sleeps
+        return len(self._sleep_ids)
 
     @property
-    def refr_scorer(self):
+    def ref_scorer(self):
         """The name of the reference scorer."""
-        return self._refr_scorer
+        return self._ref_scorer
 
     @property
-    def test_scorer(self):
-        """The name of the test scorer."""
-        return self._test_scorer
-
-    @property
-    def indiv_agree_avg(self):
-        """
-        A :py:class:`pandas.DataFrame` of ``refr_hyp``/``test_hyp`` average-based agreement scores
-        for each individual sleep session.
-
-        .. seealso:: :py:attr:`yasa.EpochByEvaluation.indiv_agree_ovr`
-        """
-        return self._indiv_agree_avg
-
-    @property
-    def indiv_agree_ovr(self):
-        """
-        A :py:class:`pandas.DataFrame` of ``refr_hyp``/``test_hyp`` one-vs-rest agreement scores
-        for each individual sleep session. Agreement scores are provided for each sleep stage.
-
-        .. seealso:: :py:attr:`yasa.EpochByEvaluation.indiv_agree_avg`
-        """
-        return self._indiv_agree_ovr
+    def obs_scorer(self):
+        """The name of the observed scorer."""
+        return self._obs_scorer
 
     @staticmethod
-    def multi_scorer(df, weights=None):
+    def multi_scorer(df, scorers):
         """Compute multiple agreement scores from a 2-column dataframe.
 
         This function offers convenience when calculating multiple agreement scores using
         :py:meth:`pandas.DataFrame.groupby.apply`. Scikit-learn doesn't include a function that
-        return multiple scores, and the GroupBy implementation of ``apply`` in pandas does not
+        returns multiple scores, and the GroupBy implementation of ``apply`` in pandas does not
         accept multiple functions.
 
         Parameters
         ----------
         df : :py:class:`pandas.DataFrame`
-            A :py:class:`pandas.DataFrame` with exactly 2 columns and length of *n_samples*.
-            The first column contains true values and second column contains predicted values.
-
-        weights : None or :py:class:`pandas.Series`
-            Sample weights passed to underlying :py:mod:`sklearn.metrics` functions when possible.
-            If a :py:class:`pandas.Series`, the index must match exactly that of
-            :py:attr:`~yasa.Hypnogram.data`.
+            A :py:class:`~pandas.DataFrame` with 2 columns and length of *n_samples*.
+            The first column contains reference values and second column contains observed values.
+            If a third column is present, it must contain sample weights to be passed to underlying
+            :py:mod:`sklearn.metrics` functions as ``sample_weight`` where applicable.
+        scorers : dictionary
+            The scorers to be used for evaluating agreement. A dictionary with scorer names (str) as
+            keys and functions as values.
 
         Returns
         -------
         scores : dict
             A dictionary with scorer names (``str``) as keys and scores (``float``) as values.
         """
-        assert (
-            isinstance(weights, type(None)) or weights in df
-        ), "`weights` must be None or a column in `df`"
-        if weights is not None:
-            raise NotImplementedError("Custom `weights` not currently supported")
-        t, p = zip(*df.values)  # Same as (df["col1"], df["col2"]) but teensy bit faster
-        # t = df["col1"].to_numpy()
-        # p = df["col2"].to_numpy()
-        w = df["col3"].to_numpy() if weights is not None else weights
-        ## Q: The dictionary below be compiled more concisely if we were comfortable accessing
-        ##    "private" attributes. I understand that's a no-no but I'm not exactly sure why.
-        ##     For example:
-        ##     >>> scorers = ["accuracy", "recall"]
-        ##     >>> funcs = { s: skm.__getattribute__(f"{s}_scorer") for s in scorers }
-        ##     >>> scores = { s: f(true, pred) for s, f in funcs.items() }
-        ##     Keywords could be applied as needed by checking f.__kwdefaults__
-        ##     This would offer an easy way for users to add their own scorers with an arg as well.
-        return {
-            "accuracy": skm.accuracy_score(t, p, normalize=True, sample_weight=w),
-            "balanced_acc": skm.balanced_accuracy_score(t, p, adjusted=False, sample_weight=w),
-            "kappa": skm.cohen_kappa_score(t, p, labels=None, weights=None, sample_weight=w),
-            "mcc": skm.matthews_corrcoef(t, p, sample_weight=w),
-            "precision": skm.precision_score(
-                t, p, average="weighted", sample_weight=w, zero_division=0
-            ),
-            "recall": skm.recall_score(t, p, average="weighted", sample_weight=w, zero_division=0),
-            "fbeta": skm.fbeta_score(
-                t, p, beta=1, average="weighted", sample_weight=w, zero_division=0
-            ),
-        }
+        assert isinstance(scorers, dict)
+        assert all(isinstance(k, str) and callable(v) for k, v in scorers.items())
+        if df.shape[1] == 3:
+            true, pred, weights = zip(*df.values)
+        else:
+            true, pred = zip(*df.values)  # Same as (df["col1"], df["col2"]) but teensy bit faster
+            weights = None
+        scores = {s: f(true, pred, weights) for s, f in scorers.items()}
+        return scores
 
-    def summary(self, by_stage=False, **kwargs):
-        """Return group-level agreement scores.
+    def get_agreement(self, sample_weight=None, scorers=None):
+        """
+        Return a :py:class:`pandas.DataFrame` of weighted (i.e., averaged) agreement scores.
 
         Parameters
         ----------
-        self : :py:class:`yasa.EpochByEvaluation`
-            A :py:class:`yasa.EpochByEvaluation` instance.
-        by_stage : bool
-            If True, returned ``summary`` :py:class:`pandas.DataFrame` will include agreement scores
-            for each sleep stage, derived from one-vs-rest metrics. If False (default), ``summary``
-            will include agreement scores derived from average-based metrics.
-        **kwargs : key, value pairs
-            Additional keyword arguments are passed to :py:meth:`pandas.DataFrame.groupby.agg`.
+        self : :py:class:`yasa.EpochByEpochAgreement`
+            A :py:class:`yasa.EpochByEpochAgreement` instance.
+        sample_weight : None or :py:class:`pandas.Series`
+            Sample weights passed to underlying :py:mod:`sklearn.metrics` functions where possible.
+            If a :py:class:`pandas.Series`, the index must match exactly that of
+            :py:attr:`~yasa.Hypnogram.data`.
+        scorers : None, list, or dictionary
+            The scorers to be used for evaluating agreement. If None (default), default scorers are
+            used. If a list, the list must contain strings that represent metrics from the sklearn
+            metrics module (e.g., ``accuracy``, ``precision``). If more customization is desired, a
+            dictionary can be passed with scorer names (str) as keys and custom functions as values.
+            The custom functions should take 3 positional arguments (true values, predicted values,
+            and sample weights).
 
         Returns
         -------
-        summary : :py:class:`pandas.DataFrame`
-            A :py:class:`pandas.DataFrame` summarizing agreement scores across the entire dataset
-            with descriptive statistics.
-
-            >>> ebe = yasa.EpochByEpochEvaluation(...)
-            >>> ebe.summary()
-
-            This will give a :py:class:`pandas.DataFrame` where each row is an agreement metric and
-            each column is a descriptive statistic (e.g., mean, standard deviation).
-            To control the descriptive statistics included as columns:
-
-            >>> ebe.summary(func=["count", "mean", "sem"])
+        agreement : :py:class:`pandas.DataFrame`
+            A :py:class:`~pandas.DataFrame` with agreement metrics as columns and sessions as rows.
         """
-        assert isinstance(by_stage, bool), "`by_stage` must be True or False"
-        mad = lambda df: (df - df.mean()).abs().mean()
-        mad.__name__ = "mad"  # Pandas uses this to name the aggregated column
-        agg_kwargs = {"func": [mad, "mean", "std", "min", "median", "max"]} | kwargs
-        if by_stage:
-            summary = (
-                self.indiv_agree_ovr.groupby("stage")
-                .agg(**agg_kwargs)
-                .stack(0)
-                .rename_axis(["stage", "metric"])
+        assert (
+            isinstance(sample_weight, (type(None), pd.Series))
+        ), "`sample_weight` must be None or pandas Series"
+        assert isinstance(scorers, (type(None), list, dict))
+        if isinstance(scorers, list):
+            assert all(isinstance(x, str) for x in scorers)
+        elif isinstance(scorers, dict):
+            assert all(isinstance(k, str) and callable(v) for k, v in scorers.items())
+        if scorers is None:
+            # Create dictionary of default scorer functions
+            scorers = {
+                "accuracy": lambda t, p, w: skm.accuracy_score(
+                    t, p, normalize=True, sample_weight=w
+                ),
+                "balanced_acc": lambda t, p, w: skm.balanced_accuracy_score(
+                    t, p, adjusted=False, sample_weight=w
+                ),
+                "kappa": lambda t, p, w: skm.cohen_kappa_score(
+                    t, p, labels=None, weights=None, sample_weight=w
+                ),
+                "mcc": lambda t, p, w: skm.matthews_corrcoef(t, p, sample_weight=w),
+                "precision": lambda t, p, w: skm.precision_score(
+                    t, p, average="weighted", sample_weight=w, zero_division=0
+                ),
+                "recall": lambda t, p, w: skm.recall_score(
+                    t, p, average="weighted", sample_weight=w, zero_division=0
+                ),
+                "fbeta": lambda t, p, w: skm.fbeta_score(
+                    t, p, beta=1, average="weighted", sample_weight=w, zero_division=0
+                ),
+            }
+        elif isinstance(scorers, list):
+            # Convert the list to a dictionary of sklearn scorers
+            scorers = {s: skm.__getattribute__(f"{s}_scorer") for s in scorers}
+        # Make a copy of data since weights series might be added to it
+        df = self.data.copy()
+        if sample_weight is not None:
+            assert sample_weight.index == self.data.index, (
+                "If not ``None``, ``sample_weight`` Series must be a pandas Series with same index as `self.data`"
             )
-        else:
-            summary = self.indiv_agree_avg.agg(**agg_kwargs).T.rename_axis("metric")
-            ## Q: Should we include a column that calculates agreement treating all hypnograms as
-            ##    coming from one individual? Others sometimes report it, though I find it mostly
-            ##    meaningless because of possible n_epochs imbalances between subjects. I vote no.
-            # summary.insert(0, "all", self.multi_scorer(self.data))
-        ## Q: Alternatively, we could remove the `by_stage` parameter and stack these into
-        ##    one merged DataFrame where the results that are *not* by-stage are included
-        ##    with an "all" stage label:
-        ## >>> summary = summary.assign(stage="all").set_index("stage", append=True).swaplevel()
-        ## >>> summary = pd.concat([summary, summary_ovr]).sort_index()
-        return summary
-
-    def get_sleep_stats(self):
+            # Add weights as a third column for multi_scorer to use
+            df["weights"] = sample_weight
+        # Get individual-level averaged/weighted agreement scores
+        agreement = df.groupby(level=0).apply(self.multi_scorer, scorers=scorers).apply(pd.Series)
+        # Set attribute for later access
+        self._agreement = agreement
+        # Convert to Series if just one session being evaluated
+        if self.n_sleeps == 1:
+            agreement = agreement.squeeze().rename("agreement")
+        return agreement
+
+    def get_agreement_bystage(self, beta=1.0):
         """
-        Return a :py:class:`pandas.DataFrame` of sleep statistics for each individual derived from
-        both reference and test scorers.
-
-        .. seealso:: :py:meth:`yasa.Hypnogram.sleep_statistics`
-
-        .. seealso:: :py:class:`yasa.SleepStatsEvaluation`
+        Return a :py:class:`pandas.DataFrame` of unweighted (i.e., one-vs-rest) agreement scores.
 
         Parameters
         ----------
-        self : :py:class:`yasa.EpochByEvaluation`
-            A :py:class:`yasa.EpochByEvaluation` instance.
+        self : :py:class:`yasa.EpochByEpochAgreement`
+            A :py:class:`yasa.EpochByEpochAgreement` instance.
+        beta : float
+            See :py:func:`sklearn.metrics.precision_recall_fscore_support`.
 
         Returns
         -------
-        sstats : :py:class:`pandas.DataFrame`
-            A :py:class:`pandas.DataFrame` with sleep statistics as columns and two rows for each
-            individual (one from reference scorer and another from test scorer).
+        agreement : :py:class:`pandas.DataFrame`
+            A :py:class:`~pandas.DataFrame` with agreement metrics as columns and a
+            :py:class:`~pandas.MultiIndex` with session and sleep stage as rows.
         """
-        # Get all sleep statistics
-        refr_sstats = pd.DataFrame({s: h.sleep_statistics() for s, h in self.refr_hyps.items()})
-        test_sstats = pd.DataFrame({s: h.sleep_statistics() for s, h in self.test_hyps.items()})
-        # Reshape and name axis
-        refr_sstats = refr_sstats.T.rename_axis("sleep_id")
-        test_sstats = test_sstats.T.rename_axis("sleep_id")
-        # Convert to MultiIndex with new scorer level
-        refr_sstats = pd.concat({self.refr_scorer: refr_sstats}, names=["scorer"])
-        test_sstats = pd.concat({self.test_scorer: test_sstats}, names=["scorer"])
-        return pd.concat([refr_sstats, test_sstats])
+        scorer = lambda df: skm.precision_recall_fscore_support(
+            *df.values.T, beta=beta, labels=self._skm_labels, average=None, zero_division=0
+        )
+        agreement = (
+            self.data
+            # Get precision, recall, f1, and support for each individual sleep session
+            .groupby(level=0)
+            .apply(scorer)
+            # Unpack arrays
+            .explode()
+            .apply(pd.Series)
+            # Add metric labels column and prepend it to index, creating MultiIndex
+            .assign(metric=["precision", "recall", "fbeta", "support"] * self.n_sleeps)
+            .set_index("metric", append=True)
+            # Convert stage column names to string labels
+            .rename_axis(columns="stage")
+            .rename(columns=self._skm2yasa_map)
+            .rename(columns=self._yasa2yasa_map)
+            # Remove all-zero columns (i.e., stages that were not present in the hypnogram)
+            .pipe(lambda df: df.loc[:, df.any()])
+            # Reshape so metrics are columns
+            .stack()
+            .unstack("metric")
+            .rename_axis(columns=None)
+            # Swap MultiIndex levels and sort so stages are in standard YASA order
+            .swaplevel()
+            .sort_index(
+                level="stage",
+                key=lambda x: x.map(lambda y: list(self._yasa2yasa_map.values()).index(y))
+            )
+        )
+        # Set attribute for later access
+        self._agreement_bystage = agreement
+        # Remove the MultiIndex if just one session being evaluated
+        if self.n_sleeps == 1:
+            agreement = agreement.reset_index(level=1, drop=True)
+        return agreement
 
     def get_confusion_matrix(self, sleep_id=None, agg_func=None, **kwargs):
         """
-        Return a ``refr_hyp``/``test_hyp``confusion matrix from either a single session or all
+        Return a ``ref_hyp``/``obs_hyp`` confusion matrix from either a single session or all
         sessions concatenated together.
 
         Parameters
         ----------
-        self : :py:class:`yasa.EpochByEvaluation`
-            A :py:class:`yasa.EpochByEvaluation` instance.
+        self : :py:class:`yasa.EpochByEpochAgreement`
+            A :py:class:`yasa.EpochByEpochAgreement` instance.
         sleep_id : None or a valid sleep ID
             If None (default), cross-tabulation is derived from the entire group dataset.
-            If a valid sleep ID, cross-tabulation is derived using only the reference and test
+            If a valid sleep ID, cross-tabulation is derived using only the reference and observed
             scored hypnograms from that sleep session.
-        ## Q: This keyword (agg_func) is too complicated, but I wanted your opinion on the best
-        ##    approach. And I wanted you to see the returned value when agg_func=None because it
-        ##    might be best to generate during __init__ to set and access as an attribute.
-        agg_func : str, list, or None
+        agg_func : None or str
             If None (default), group results returns a :py:class:`~pandas.DataFrame` complete with
-            all individual sleep session results. If not None, group results returns a
-            :py:class:`~pandas.DataFrame` aggregated across individual sleep sessions where
-            ``agg_func`` is passed as ``func`` parameter in :py:meth:`pandas.DataFrame.groupby.agg`.
-            Ignored if ``sleep_id`` is not None.
+            all individual session results. If not None, group results returns a
+            :py:class:`~pandas.DataFrame` aggregated across sessions where ``agg_func`` is passed as
+            ``func`` parameter in :py:meth:`pandas.DataFrame.groupby.agg`. For example, set
+            ``agg_func="sum"`` to get a single confusion matrix across all epochs that does not take
+            session into account.
         **kwargs : key, value pairs
             Additional keyword arguments are passed to :py:func:`sklearn.metrics.confusion_matrix`.
 
@@ -528,22 +474,59 @@ def get_confusion_matrix(self, sleep_id=None, agg_func=None, **kwargs):
 
         Examples
         --------
-        >>> ebe = yasa.EpochByEpochEvaluation(...)
-        >>> ebe.get_confusion_matrix()  # Return results from all individual subjects
-        >>> ebe.get_confusion_matrix(agg_func=["mean", "std"])  # Return summary results
-        >>> ebe.get_confusion_matrix(sleep_id="sub-002")  # Return results from one subject
+        >>> import yasa
+        >>> ref_hyps = [yasa.simulate_hypnogram(tib=90, scorer="Rater1", seed=i) for i in range(3)]
+        >>> obs_hyps = [h.simulate_similar(scorer="Rater2", seed=i) for i, h in enumerate(ref_hyps)]
+        >>> ebe = yasa.EpochByEpochAgreement(ref_hyps, obs_hyps)
+        >>> ebe.get_confusion_matrix(sleep_id=2)
+        Rater2  WAKE  N1  N2  N3  REM
+        Rater1
+        WAKE       1   2  23   0    0
+        N1         0   9  13   0    0
+        N2         0   6  71   0    0
+        N3         0  13  42   0    0
+        REM        0   0   0   0    0
+
+        >>> ebe.get_confusion_matrix()
+        Rater2           WAKE  N1  N2  N3  REM
+        sleep_id Rater1
+        1        WAKE      30   0   3   0   35
+                 N1         3   2   7   0    0
+                 N2        21  12   7   0    4
+                 N3         0   0   0   0    0
+                 REM        2   8  29   0   17
+        2        WAKE       1   2  23   0    0
+                 N1         0   9  13   0    0
+                 N2         0   6  71   0    0
+                 N3         0  13  42   0    0
+                 REM        0   0   0   0    0
+        3        WAKE      16   0   7  19   19
+                 N1         0   7   2   0    5
+                 N2         0  10  12   7    5
+                 N3         0   0  16  11    0
+                 REM        0  15  11  18    0
+
+        >>> ebe.get_confusion_matrix(agg_func="sum")
+        Rater2  WAKE  N1  N2  N3  REM
+        Rater1
+        WAKE      47   2  33  19   54
+        N1         3  18  22   0    5
+        N2        21  28  90   7    9
+        N3         0  13  58  11    0
+        REM        2  23  40  18   17
         """
         assert (
-            sleep_id is None or sleep_id in self.sleep_ids
+            sleep_id is None or sleep_id in self._sleep_ids
         ), "`sleep_id` must be None or a valid sleep ID"
+        assert isinstance(agg_func, (type(None), str)), "`agg_func` must be None or a str"
+        assert not ((self.n_sleeps == 1 or sleep_id is not None) and agg_func is not None), (
+            "`agg_func` must be None if plotting a single session."
+        )
         kwargs = {"labels": self._skm_labels} | kwargs
-        # Get confusion matrix for each individual sleep session
-        ## Q: Should this be done during __init__ and accessible via attribute?
-        ##    I'm a little unsure about what should happen in init and be accessed as a property
-        ##    vs what should require a function. Nothing takes so long that it feels like it
-        ##    couldn't just happen during __init__, leaving mostly just plotting functions as
-        ##    methods. But if that's the case, what's the benefit of being a class? Confused!!
-        conf_mats = (
+        # Generate a DataFrame with a confusion matrix for each session
+        #   Seems easier to just generate this whole thing and then either
+        #   extract a single one or aggregate across them all, depending on user request
+        confusion_matrices = (
             self.data
             # Get confusion matrix for each individual sleep session
             .groupby(level=0)
@@ -552,44 +535,87 @@ def get_confusion_matrix(self, sleep_id=None, agg_func=None, **kwargs):
             .explode()
             .apply(pd.Series)
             # Convert to MultiIndex with reference scorer as new level
-            .assign(**{self.refr_scorer: self._skm_labels * self.n_sleeps})
-            .set_index(self.refr_scorer, append=True)
-            .rename_axis(columns=self.test_scorer)
+            .assign(**{self.ref_scorer: self._skm_labels * self.n_sleeps})
+            .set_index(self.ref_scorer, append=True)
+            .rename_axis(columns=self.obs_scorer)
             # Convert sleep stage columns and indices to strings
-            .rename(columns=self._skm_mapping)
-            .rename(columns=self._mapping_int)
-            .rename(index=self._skm_mapping, level=self.refr_scorer)
-            .rename(index=self._mapping_int, level=self.refr_scorer)
+            .rename(columns=self._skm2yasa_map)
+            .rename(columns=self._yasa2yasa_map)
+            .rename(index=self._skm2yasa_map, level=self.ref_scorer)
+            .rename(index=self._yasa2yasa_map, level=self.ref_scorer)
         )
+        if self.n_sleeps == 1:
+            # If just one session, use the only session ID as the key, for simplified returned df
+            sleep_id = self._sleep_ids[0]
         if sleep_id is None:
             if agg_func is None:
-                mat = conf_mats
+                mat = confusion_matrices
             else:
-                mat = conf_mats.groupby(self.refr_scorer).agg(agg_func)
-                mat.columns = mat.columns.map("_".join).set_names(self.test_scorer)
+                mat = confusion_matrices.groupby(self.ref_scorer, sort=False).agg(agg_func)
         else:
-            mat = conf_mats.loc[sleep_id]
+            mat = confusion_matrices.loc[sleep_id]
         return mat
 
-    def plot_hypnograms(self, sleep_id=None, legend=True, ax=None, refr_kwargs={}, test_kwargs={}):
-        """Plot the two hypnograms, where the reference hypnogram is overlaid on the test hypnogram.
+    def get_sleep_stats(self):
+        """
+        Return a :py:class:`pandas.DataFrame` of sleep statistics for each hypnogram derived from
+        both reference and observed scorers.
+
+        .. seealso:: :py:meth:`yasa.Hypnogram.sleep_statistics`
+
+        .. seealso:: :py:class:`yasa.SleepStatsAgreement`
+
+        Parameters
+        ----------
+        self : :py:class:`yasa.EpochByEpochAgreement`
+            A :py:class:`yasa.EpochByEpochAgreement` instance.
+
+        Returns
+        -------
+        sstats : :py:class:`pandas.DataFrame`
+            A :py:class:`~pandas.DataFrame` with sleep statistics as columns and two rows for each
+            individual (one for reference scorer and another for observed scorer).
+        """
+        # Get all sleep statistics
+        ref_sstats = pd.DataFrame({s: h.sleep_statistics() for s, h in self._ref_hyps.items()})
+        obs_sstats = pd.DataFrame({s: h.sleep_statistics() for s, h in self._obs_hyps.items()})
+        # Reshape and name axis
+        ref_sstats = ref_sstats.T.rename_axis("sleep_id")
+        obs_sstats = obs_sstats.T.rename_axis("sleep_id")
+        # Convert to MultiIndex with new scorer level
+        ref_sstats = pd.concat({self.ref_scorer: ref_sstats}, names=["scorer"])
+        obs_sstats = pd.concat({self.obs_scorer: obs_sstats}, names=["scorer"])
+        # Concatenate into one DataFrame
+        sstats = pd.concat([ref_sstats, obs_sstats])
+        # Remove the MultiIndex if just one session being evaluated
+        if self.n_sleeps == 1:
+            sstats = sstats.reset_index(level=1, drop=True)
+        return sstats
+
+    def plot_hypnograms(self, sleep_id=None, legend=True, ax=None, ref_kwargs={}, obs_kwargs={}):
+        """Plot the two hypnograms of one session overlapping on the same axis.
 
         .. seealso:: :py:func:`yasa.plot_hypnogram`
 
         Parameters
         ----------
-        sleep_id : None or a valid sleep ID
-            If a valid sleep ID, plot the reference and test hypnograms from on sleep session.
+        self : :py:class:`yasa.EpochByEpochAgreement`
+            A :py:class:`yasa.EpochByEpochAgreement` instance.
+        sleep_id : a valid sleep ID or None
+            The sleep session to plot. If multiple sessions are included in the
+            :py:class:`~yasa.EpochByEpochAgreement` instance, a ``sleep_id`` must be provided. If
+            only one session is present, ``None`` (default) will plot the two hypnograms of the
+            only session.
         legend : bool or dict
             If True (default) or a dictionary, a legend is added. If a dictionary, all key/value
             pairs are passed as keyword arguments to the :py:func:`matplotlib.pyplot.legend` call.
         ax : :py:class:`matplotlib.axes.Axes` or None
             Axis on which to draw the plot, optional.
-        refr_kwargs : dict
+        ref_kwargs : dict
             Keyword arguments passed to :py:func:`yasa.plot_hypnogram` when plotting the reference
             hypnogram.
-        test_kwargs : dict
-            Keyword arguments passed to :py:func:`yasa.plot_hypnogram` when plotting the test
+        obs_kwargs : dict
+            Keyword arguments passed to :py:func:`yasa.plot_hypnogram` when plotting the observed
             hypnogram.
 
         Returns
@@ -602,165 +628,180 @@ def plot_hypnograms(self, sleep_id=None, legend=True, ax=None, refr_kwargs={}, t
         .. plot::
 
             >>> from yasa import simulate_hypnogram
-            >>> hyp = simulate_hypnogram(seed=7)
-            >>> ax = hyp.evaluate(hyp.simulate_similar()).plot_hypnograms()
+            >>> hyp = simulate_hypnogram(scorer="Anthony", seed=19)
+            >>> ax = hyp.evaluate(hyp.simulate_similar(scorer="Alan", seed=68)).plot_hypnograms()
         """
         assert (
-            sleep_id is None or sleep_id in self.sleep_ids
+            sleep_id is None or sleep_id in self._sleep_ids
         ), "`sleep_id` must be None or a valid sleep ID"
         assert isinstance(legend, (bool, dict)), "`legend` must be True, False, or a dictionary"
-        assert isinstance(refr_kwargs, dict), "`refr_kwargs` must be a dictionary"
-        assert isinstance(test_kwargs, dict), "`test_kwargs` must be a dictionary"
-        assert (
-            not "ax" in refr_kwargs | test_kwargs
-        ), "'ax' can't be supplied to `refr_kwargs` or `test_kwargs`, use the `ax` keyword instead"
-        if sleep_id is None:
-            if self.n_sleeps == 1:
-                refr_hyp = self.refr_hyps[self.sleep_ids[0]]
-                test_hyp = self.test_hyps[self.sleep_ids[0]]
-            else:
-                raise NotImplementedError(
-                    "Multi-session plotting is not currently supported. 3 options being tested!"
-                )
+        assert isinstance(ref_kwargs, dict), "`ref_kwargs` must be a dictionary"
+        assert isinstance(obs_kwargs, dict), "`obs_kwargs` must be a dictionary"
+        assert not "ax" in ref_kwargs | obs_kwargs, (
+            "'ax' can't be supplied to `ref_kwargs` or `obs_kwargs`, use the `ax` keyword instead"
+        )
+        assert not (sleep_id is None and self.n_sleeps > 1), (
+            "Multi-session plotting is not currently supported. `sleep_id` must not be None when "
+            "multiple sessions are present"
+        )
+        # Select the session hypnograms to plot
+        if sleep_id is None and self.n_sleeps == 1:
+            ref_hyp = self._ref_hyps[self._sleep_ids[0]]
+            obs_hyp = self._obs_hyps[self._sleep_ids[0]]
         else:
-            refr_hyp = self.refr_hyps[sleep_id]
-            test_hyp = self.test_hyps[sleep_id]
-        plot_refr_kwargs = {"highlight": None, "alpha": 0.8}
-        plot_test_kwargs = {"highlight": None, "alpha": 0.8, "color": "darkcyan", "ls": "dashed"}
-        plot_refr_kwargs.update(refr_kwargs)
-        plot_test_kwargs.update(test_kwargs)
-        if ax is None:
-            ax = plt.gca()
-        refr_hyp.plot_hypnogram(ax=ax, **plot_refr_kwargs)
-        test_hyp.plot_hypnogram(ax=ax, **plot_test_kwargs)
-        if legend and "label" in plot_refr_kwargs | plot_test_kwargs:
+            ref_hyp = self._ref_hyps[sleep_id]
+            obs_hyp = self._obs_hyps[sleep_id]
+        # Set default plotting kwargs and merge with user kwargs
+        plot_ref_kwargs = {
+            "label": self.ref_scorer,
+            "highlight": None,
+            "color": "black",
+            "alpha": 0.8,
+        }
+        plot_obs_kwargs = {
+            "label": self.obs_scorer,
+            "highlight": None,
+            "color": "green",
+            "alpha": 0.8,
+            "ls": "dashed",
+        }
+        plot_ref_kwargs.update(ref_kwargs)
+        plot_obs_kwargs.update(obs_kwargs)
+        # Draw the hypnograms
+        ax = ref_hyp.plot_hypnogram(ax=ax, **plot_ref_kwargs)
+        ax = obs_hyp.plot_hypnogram(ax=ax, **plot_obs_kwargs)
+        # Add legend if desired
+        if legend:
             if isinstance(legend, dict):
                 ax.legend(**legend)
             else:
                 ax.legend()
         return ax
 
-    def plot_group_hypnogram_opt1(self, ax=None, **kwargs):
-        if ax is None:
-            ax = plt.gca()
-        palette = {"Inaccurate": "plum", "Accurate": "forestgreen"}
-        hue_order = list(palette)
-        hist_kwargs = dict(multiple="stack", stat="count", element="step", discrete=True, lw=0)
-        ser = self.data[self.refr_scorer].eq(self.data[self.test_scorer])
-        df = ser.rename("acc").replace({True: "Accurate", False: "Inaccurate"}).reset_index()
-        sns.histplot(data=df, x="Epoch", hue="acc", hue_order=hue_order, palette=palette, ax=ax)
-        ax.set_ylabel("Number of unique sleep sessions")
-        ax.set_xlabel("Epochs")
-        ax.margins(x=0, y=0)
-        return ax
-
-    def plot_group_hypnogram_opt2(self, ax=None, **kwargs):
-        from pingouin import compute_bootci
-
-        plot_kwargs = dict(lw=1, color="plum", alpha=1, label="7-epoch rolling average")
-        plot_kwargs.update(kwargs)
-        betw_kwargs = dict(lw=0, alpha=0.3, color=plot_kwargs["color"], label="95% bootstrapped CI")
-        if ax is None:
-            ax = plt.gca()
-        df = self.data[self.refr_scorer].eq(self.data[self.test_scorer]).rename("acc").reset_index()
-        probas = df.groupby("Epoch")["acc"].mean()
-        ci = df.groupby("Epoch")["acc"].apply(compute_bootci, None, "mean").apply(pd.Series)
-        ci = ci.rename(columns={0: "low", 1: "high"})
-        probas = probas.rolling(10, center=True).mean()
-        ci = ci.rolling(10, center=True).mean()
-        ax.fill_between(ci.index, ci["low"], ci["high"], **betw_kwargs)
-        ax.plot(probas.index, probas, **plot_kwargs)
-        ax.set_ylabel("Accuracy across sleep sessions")
-        ax.set_xlabel("Epochs")
-        ax.set_xlim(0, len(probas))
-        ax.set_ylim(0, 1)
-        ax.legend()
-        return ax
+    def summary(self, by_stage=False, **kwargs):
+        """Return group-level agreement scores.
 
-    def plot_group_hypnogram_opt3(self, figsize=(7, 10), **kwargs):
-        imshow_kwargs = dict(cmap="Blues", interpolation="none")
-        imshow_kwargs.update(kwargs)
-        n_rows = self.n_sleeps
-        freq = self.refr_hyps[self.sleep_ids[0]].freq
-        freq_secs = pd.Timedelta(freq).total_seconds()
-        fig, axes = plt.subplots(nrows=n_rows, figsize=figsize, sharex=True, sharey=False)
-        for ax, (subj, data) in zip(axes, self.data.groupby(level=0)):
-            img = data.values.T
-            extent = (0, freq_secs * img.shape[1], img.shape[0] - 0.5, -0.5)
-            ax.imshow(img, extent=extent, aspect="auto", origin="upper", **imshow_kwargs)
-            ax.set_yticks([0, 1])
-            ax.set_yticklabels([self.refr_scorer, self.test_scorer])
-            ax.set_ylabel(subj, rotation=0, va="center")
-            ax.spines[["top", "bottom", "left", "right"]].set_visible(False)
-            if not ax.get_subplotspec().is_first_row():
-                ax.tick_params(left=False, labelleft=False)
-            if not ax.get_subplotspec().is_last_row():
-                ax.tick_params(bottom=False)
-                ax.set_xlabel("Time [s]")
-                ax.spines["bottom"].set_visible(False)
-        fig.align_ylabels()
-        return fig
-
-    def plot_roc(self, sleep_id=None, palette=None, ax=None, **kwargs):
-        """Plot ROC curves for each stage.
+        Default aggregated measures are mean absolute deviation, mean, std, min, median, and max.
 
         Parameters
         ----------
-        palette : dict or None
-            If a dictionary, keys are stages and values are corresponding colors.
-        ax : :py:class:`matplotlib.axes.Axes`
-            Axis on which to draw the plot, optional.
+        self : :py:class:`~yasa.evaluation.EpochByEpochAgreement`
+            A :py:class:`~yasa.evaluation.EpochByEpochAgreement` instance.
+        by_stage : bool
+            If ``False`` (default), ``summary`` will include agreement scores derived from
+            average-based metrics. If ``True``, returned ``summary`` :py:class:`~pandas.DataFrame`
+            will include agreement scores for each sleep stage, derived from one-vs-rest metrics.
         **kwargs : key, value pairs
-            Additional keyword arguments are passed to the :py:func:`matplotlib.pyplot.plot` call.
+            Additional keyword arguments are passed to :py:meth:`pandas.DataFrame.groupby.agg`.
+            This can be used to customize the descriptive statistics returned.
 
         Returns
         -------
-        ax : :py:class:`matplotlib.axes.Axes`
-            Matplotlib Axes
+        summary : :py:class:`pandas.DataFrame`
+            A :py:class:`pandas.DataFrame` summarizing agreement scores across the entire dataset
+            with descriptive statistics.
+
+            >>> ebe = yasa.EpochByEpochAgreement(...)
+            >>> agreement = ebe.get_agreement()
+            >>> ebe.summary()
+
+            This will give a :py:class:`~pandas.DataFrame` where each row is an agreement metric and
+            each column is a descriptive statistic (e.g., mean, standard deviation).
+            To control the descriptive statistics included as columns:
+
+            >>> ebe.summary(func=["count", "mean", "sem"])
         """
-        assert (
-            sleep_id is None or sleep_id in self.sleep_ids
-        ), "`sleep_id` must be None or a valid sleep ID"
-        raise NotImplementedError("ROC plots will be implemented once YASA hypnograms have probas.")
+        assert self.n_sleeps > 1, (
+            "Summary scores can not be computed with only one hypnogram pair."
+        )
+        assert isinstance(by_stage, bool), "`by_stage` must be True or False"
+        if by_stage:
+            assert hasattr(self, "_agreement_bystage"), (
+                "Must run `self.get_agreement_bystage` before obtaining by_stage summary results."
+            )
+        else:
+            assert hasattr(self, "_agreement"), (
+                "Must run `self.get_agreement` before obtaining summary results."
+            )
+        # Create a function for getting mean absolute deviation
+        mad = lambda df: (df - df.mean()).abs().mean()
+        mad.__name__ = "mad"  # Pandas uses this lambda attribute to name the aggregated column
+        # Merge default and user kwargs
+        agg_kwargs = {"func": [mad, "mean", "std", "min", "median", "max"]} | kwargs
+        if by_stage:
+            summary = (
+                self
+                .agreement_bystage.groupby("stage")
+                .agg(**agg_kwargs)
+                .stack(level=0)
+                .rename_axis(["stage", "metric"])
+            )
+        else:
+            summary = self._agreement.agg(**agg_kwargs).T.rename_axis("metric")
+        return summary
 
 
-#############################################################################
+################################################################################
 # SLEEP STATISTICS
-#############################################################################
+################################################################################
 
 
-class SleepStatsEvaluation:
+class SleepStatsAgreement:
     """
-    Evaluate agreement between two scorers (e.g., two different manual scorers or one manual scorer
-    and YASA's automatic staging) by comparing their summary sleep statistics derived from multiple
-    subjects or sessions.
+    Evaluate agreement between sleep statistics reported by two different scorers or scoring
+    methods.
+
+    Bias and limits-of-agreement (and their confidence intervals) are calculated for each sleep
+    statistic. How these are calculated depends on the sleep statistic's underlying error
+    distribution. See [Menghini2021]_ for details, but in brief:
 
-    .. warning::
-        :py:class:`yasa.evaluation.SleepStatsEvaluation` is a new YASA feature and the API is
-        subject to future change.
+    * Bias: The difference between the two scorers (observed minus reference).
+        If sleep-statistic differences (observed minus reference) show proportional bias,
+        bias is represented as a regression equation that takes into account changes in bias as
+        a function of measurement value. Otherwise, bias is represented as the standard mean
+        difference.
+    * Limits-of-agreement: If sleep statistic differences show proportional bias, ...
+    * Confidence intervals: If sleep statistic differences follow a normal distribution,
+        confidence intervals are calculated using standard parametric methods. Otherwise,
+        bootstrapped confidence intervals are generated (see also ``bootstrap_all_cis``).
+
+    Observed sleep statistics can be corrected (i.e., ``calibrated``) to bring them into alignment
+    with the sleep statistics from the reference scorer.
+
+    Bias values are calculated as...
+    LOA ...
+    CI ...
+
+
+    .. important::
+        Bias, limits-of-agreement, and confidence intervals are all calculated differently depending
+        on assumption violations. See Menghini et al., 2021 [Menghini2021]_ for details.
+
+    .. seealso:: :py:meth:`yasa.Hypnogram.sleep_statistics`
 
     .. versionadded:: 0.7.0
 
     Parameters
     ----------
-    refr_data : :py:class:`pandas.DataFrame`
+    ref_data : :py:class:`pandas.DataFrame`
         A :py:class:`pandas.DataFrame` with sleep statistics from the reference scorer.
-        Rows are individual sleep sessions and columns are individual sleep statistics.
-    test_data : :py:class:`pandas.DataFrame`
-        A :py:class:`pandas.DataFrame` with sleep statistics from the test scorer.
-        Shape, indices, and columns must be identical to ``refr_data``.
-    refr_scorer : str
-        Name of the reference scorer, used for labeling.
-    test_scorer : str
-        Name of the test scorer, used for labeling.
+        Rows are unique observations and columns are unique sleep statistics.
+    obs_data : :py:class:`pandas.DataFrame`
+        A :py:class:`pandas.DataFrame` with sleep statistics from the observed scorer.
+        Rows are unique observations and columns are unique sleep statistics.
+        Shape, index, and columns must be identical to ``ref_data``.
+    ref_scorer : str
+        Name of the reference scorer.
+    obs_scorer : str
+        Name of the observed scorer.
     alpha : float
-        Alpha cutoff used for all three tests.
-    normality_kwargs : dict
-        Keywords arguments passed to the :py:func:`pingouin.normality` call.
-    regression_kwargs : dict
-        Keywords arguments passed to the :py:func:`pingouin.linear_regression` call.
-    homoscedasticity_kwargs : dict
-        Keywords arguments passed to the :py:func:`pingouin.homoscedasticity` call.
+        Alpha cutoff used for all assumption tests.
+
+        .. note:: set ``alpha=1`` to ignore all corrections.
+    bootstrap_all_cis : bool
+        If ``True``, generate all 95% confidence intervals using a bootstrap resampling procedure.
+        Otherwise (``False``, default) use the resampling procedure only when discrepancy values
+        break normality assumptions.
     verbose : bool or str
         Verbose level. Default (False) will only print warning and error messages. The logging
         levels are 'debug', 'info', 'warning', 'error', and 'critical'. For most users the choice is
@@ -775,7 +816,7 @@ class SleepStatsEvaluation:
     ----------
     .. [Menghini2021] Menghini, L., Cellini, N., Goldstone, A., Baker, F. C., & de Zambotti, M.
                       (2021). A standardized framework for testing the performance of sleep-tracking
-                       technology: step-by-step guidelines and open-source code. Sleep, 44(2),
+                       technology: step-by-step guidelines and open-source code. SLEEP, 44(2),
                        zsaa170. https://doi.org/10.1093/sleep/zsaa170
 
     Examples
@@ -794,7 +835,7 @@ class SleepStatsEvaluation:
     >>> sstatsA = sstats.loc["Ref"]
     >>> sstatsB = sstats.loc["Test"]
     >>>
-    >>> sse = yasa.SleepStatsEvaluation(sstatsA, sstatsB)
+    >>> sse = yasa.SleepStatsAgreement(sstatsA, sstatsB)
     >>>
     >>> sse.summary()
            normal  unbiased  homoscedastic
@@ -847,156 +888,185 @@ class SleepStatsEvaluation:
 
     def __init__(
         self,
-        refr_data,
-        test_data,
+        ref_data,
+        obs_data,
         *,
-        refr_scorer="Reference",
-        test_scorer="Test",
-        normality_kwargs={"alpha": 0.05},
-        regression_kwargs={"alpha": 0.05},
-        homoscedasticity_kwargs={"alpha": 0.05},
+        ref_scorer="Reference",
+        obs_scorer="Observed",
+        alpha=0.05,
+        bootstrap_all_cis=False,
         verbose=True,
     ):
-        set_log_level(verbose)
 
-        assert isinstance(refr_data, pd.DataFrame), "`refr_data` must be a pandas DataFrame"
-        assert isinstance(test_data, pd.DataFrame), "`test_data` must be a pandas DataFrame"
+        assert isinstance(ref_data, pd.DataFrame), "`ref_data` must be a pandas DataFrame"
+        assert isinstance(obs_data, pd.DataFrame), "`obs_data` must be a pandas DataFrame"
         assert np.array_equal(
-            refr_data.index, test_data.index
-        ), "`refr_data` and `test_data` index values must be identical"
+            ref_data.index, obs_data.index
+        ), "`ref_data` and `obs_data` index values must be identical"
         assert (
-            refr_data.index.name == test_data.index.name
-        ), "`refr_data` and `test_data` index names must be identical"
+            ref_data.index.name == obs_data.index.name
+        ), "`ref_data` and `obs_data` index names must be identical"
         assert np.array_equal(
-            refr_data.columns, test_data.columns
-        ), "`refr_data` and `test_data` column values must be identical"
-        assert isinstance(refr_scorer, str), "`refr_scorer` must be a string"
-        assert isinstance(test_scorer, str), "`test_scorer` must be a string"
-        assert refr_scorer != test_scorer, "`refr_scorer` and `test_scorer` must be unique"
-        assert isinstance(normality_kwargs, dict), "`normality_kwargs` must be a dictionary"
-        assert isinstance(regression_kwargs, dict), "`regression_kwargs` must be a dictionary"
-        assert isinstance(homoscedasticity_kwargs, dict), "`homoscedasticity_kwargs` must be a dict"
-        assert "alpha" in normality_kwargs, "`normality_kwargs` must include 'alpha'"
-        assert "alpha" in regression_kwargs, "`regression_kwargs` must include 'alpha'"
-        assert "alpha" in homoscedasticity_kwargs, "`homoscedasticity_kwargs` must include 'alpha'"
-
-        # If refr_data and test_data indices are unnamed, name them
-        sleep_id_str = "sleep_id" if refr_data.index.name is None else refr_data.index.name
-        refr_data.index.name = sleep_id_str
-        test_data.index.name = sleep_id_str
-
-        # Get scorer discrepancies (i.e., differences, test minus reference)
-        discrepancies = test_data.sub(refr_data)
-
-        # Convert to MultiIndex with new scorer level
-        discrepancies = pd.concat({"difference": discrepancies}, names=["scorer"])
-        refr_data = pd.concat({refr_scorer: refr_data}, names=["scorer"])
-        test_data = pd.concat({test_scorer: test_data}, names=["scorer"])
-
-        # Merge dataframes and reshape to long format
-        data = pd.concat([refr_data, test_data, discrepancies])
+            ref_data.columns, obs_data.columns
+        ), "`ref_data` and `obs_data` column values must be identical"
+        assert isinstance(ref_scorer, str), "`ref_scorer` must be a string"
+        assert isinstance(obs_scorer, str), "`obs_scorer` must be a string"
+        assert ref_scorer != obs_scorer, "`ref_scorer` and `obs_scorer` must be unique"
+        assert isinstance(alpha, (int, float)) and 0 <= alpha <= 1, "`alpha` must be a number between 0 and 1, inclusive"
+        assert isinstance(bootstrap_all_cis, bool), "`bootstrap_all_cis` must be True or False"
+
+        # If `ref_data` and `obs_data` indices are unnamed, name them
+        session_key = "session_id" if ref_data.index.name is None else ref_data.index.name
+        ref_data.index.name = session_key
+        obs_data.index.name = session_key
+
+        # Get scorer differences (i.e., observed minus reference)
+        diff_data = obs_data.sub(ref_data)
+
+        # Prepend a "scorer" level to index of each individual dataframe, making MultiIndex
+        obs_data = pd.concat({obs_scorer: obs_data}, names=["scorer"])
+        ref_data = pd.concat({ref_scorer: ref_data}, names=["scorer"])
+        diff_data = pd.concat({"difference": diff_data}, names=["scorer"])
+        # Merge observed data, reference data, and differences
+        data = pd.concat([obs_data, ref_data, diff_data])
+        # Reshape to long-format with 3 columns (observed, reference, difference)
         data = (
-            data.melt(var_name="sstat", ignore_index=False)
-            .reset_index()
-            .pivot(columns="scorer", index=[sleep_id_str, "sstat"], values="value")
+            data.melt(var_name="sleep_stat", ignore_index=False)
             .reset_index()
+            .pivot(columns="scorer", index=["sleep_stat", session_key], values="value")
             .rename_axis(columns=None)
+            .sort_index()
         )
 
         # Remove sleep statistics that have no differences between scorers
-        stats_nodiff = data.groupby("sstat")["difference"].any().loc[lambda x: ~x].index.tolist()
-        data = data.query(f"~sstat.isin({stats_nodiff})")
-        for s in stats_nodiff:
-            logger.warning(f"All {s} differences are zero, removing from evaluation.")
-
-        ## NORMALITY ##
-        # Test difference data (test - reference) for normality at each sleep statistic
-        normality = (
-            data
-            .groupby("sstat")["difference"]
-            .apply(pg.normality, **normality_kwargs)
-            .droplevel(-1)
+        stats_with_nodiff = diff_data.any().loc[lambda x: ~x].index.tolist()
+        data = data.query(f"~sleep_stat.isin({stats_with_nodiff})")
+        for s in stats_with_nodiff:
+            logger.warning(f"Removed {s} from evaluation because all scorings were identical.")
+
+        ########################################################################
+        # TEST ASSUMPTION VIOLATIONS
+        ########################################################################
+
+        grouper = data.groupby("sleep_stat")  # For convenience
+
+        # Test SYSTEMATIC BIAS between the two scorers for each sleep statistic (do means differ?).
+        # This test is used to determine whether corrections are applied during calibration only.
+        systematic_bias = grouper["difference"].apply(pg.ttest, y=0).droplevel(-1)
+
+        # Test NORMALITY of difference values at each sleep statistic.
+        # This test is used to determine how confidence intervals for Bias and LoA are calculated.
+        normality = grouper["difference"].apply(pg.normality, alpha=alpha).droplevel(-1)
+
+        # Test PROPORTIONAL BIAS at each sleep statistic (do scorer diffs vary with ref measure?)
+        # This test is used to determine how Bias and LoA are calculated.
+        regr_f = lambda df: pg.linear_regression(df[ref_scorer], df["difference"], alpha=alpha)
+        resid_f = lambda df: pd.Series(regr_f(df).residuals_, index=df.index.get_level_values(1))
+        proportional_bias = grouper.apply(regr_f).droplevel(-1).set_index("names", append=True)
+        proportional_bias = proportional_bias.swaplevel().sort_index()
+        residuals = grouper.apply(resid_f).stack().rename("residual")
+
+        # Test HETEROSCEDASTICITY at each sleep statistic.
+        # This test is used to determine how LoAs are calculated.
+        data = data.join(residuals)
+        homosc_columns = [ref_scorer, "difference", "residual"]
+        homosc_f = lambda df: pg.homoscedasticity(df[homosc_columns], alpha=alpha)
+        heteroscedasticity = data.groupby("sleep_stat").apply(homosc_f).droplevel(-1)
+        # Add same test for log-transformed values, also used for determining LoA calculation method
+        log_transform = lambda x: np.log(x + 1e-6)
+        backlog_transform = lambda x: np.exp(x) - 1e-6
+        logdata = data[[ref_scorer, obs_scorer]].applymap(log_transform)
+        logdata["difference"] = logdata[obs_scorer].sub(logdata[ref_scorer])
+        logdata["residual"] = logdata.groupby("sleep_stat").apply(resid_f).stack()#.rename("residual")
+        heteroscedasticity_log = logdata.groupby("sleep_stat").apply(homosc_f).droplevel(-1)
+        # data_exp = logdata[[ref_scorer, obs_scorer, "difference"]].applymap(backlog_transform)
+        # data_exp = logdata["difference"].map(backlog_transformer)
+
+        # Aggregate test results into a dataframe of True/False for later convenience.
+        violations = (
+            systematic_bias["p-val"].lt(alpha).to_frame("is_systematically_biased")
+            .join(~normality["normal"].rename("is_nonnormal"))
+            .join(proportional_bias.loc[ref_scorer, "pval"].lt(alpha).rename("is_proportionally_biased"))
+            .join(~heteroscedasticity["equal_var"].rename("is_heteroscedastic"))
+            .join(~heteroscedasticity_log["equal_var"].rename("is_log_heteroscedastic"))
         )
 
-        ## PROPORTIONAL BIAS ##
-        # Test each sleep statistic for proportional bias
-        prop_bias_results = []
-        residuals_results = []
-        for ss_name, ss_df in data.groupby("sstat"):
-            # Regress the difference scores on the reference scores
-            model = pg.linear_regression(
-                ss_df[refr_scorer], ss_df["difference"], **regression_kwargs
-            )
-            model.insert(0, "sstat", ss_name)
-            # Extract sleep-level residuals for later homoscedasticity tests
-            resid_dict = {
-                sleep_id_str: ss_df[sleep_id_str],
-                "sstat": ss_name,
-                "pbias_residual": model.residuals_,
-            }
-            resid = pd.DataFrame(resid_dict)
-            prop_bias_results.append(model)
-            residuals_results.append(resid)
-        # Add residuals to raw dataframe, used later when testing homoscedasticity
-        data = data.merge(pd.concat(residuals_results), on=[sleep_id_str, "sstat"])
-        # Handle proportional bias results
-        prop_bias = pd.concat(prop_bias_results)
-        # Save all the proportional bias models before removing intercept, for optional user access
-        prop_bias_full = prop_bias.reset_index(drop=True)
-        # Now remove intercept rows
-        prop_bias = prop_bias.query("names != 'Intercept'").drop(columns="names").set_index("sstat")
-        # Add True/False passing column for easy access
-        prop_bias["unbiased"] = prop_bias["pval"].ge(regression_kwargs["alpha"])
-
-        ## Test each statistic for homoscedasticity ##
-        columns = [refr_scorer, "difference", "pbias_residual"]
-        homoscedasticity_f = lambda df: pg.homoscedasticity(df[columns], **homoscedasticity_kwargs)
-        homoscedasticity = data.groupby("sstat").apply(homoscedasticity_f).droplevel(-1)
+        # Get name of method for each calculation.
+        # CI - standard or bootstrap
+        # Bias - standard or modeled
+        # LoA - standard, log_standard, modeled, or residuals
+        get_ci_method = lambda row: "bootstrap" if row.is_nonnormal else "standard"
+        get_bias_method = lambda row: "modeled" if row.is_proportionally_biased else "standard"
+        get_loa_method = lambda row: (
+            "modeled" if row.is_log_heteroscedastic else "log_standard"
+        ) if row.is_heteroscedastic else (
+            "residuals" if row.is_proportionally_biased else "standard"
+        )
+        methods = {
+            "loa": violations.apply(get_loa_method, axis=1),
+            "bias": violations.apply(get_bias_method, axis=1),
+            "ci": violations.apply(get_ci_method, axis=1),
+        }
+        methods = pd.DataFrame(methods)
+        if bootstrap_all_cis:
+            methods["ci"] = "bootstrap"  # Override the normality-based choice for every statistic
 
-        # Set attributes
+        ########################################################################
+        # ATTRIBUTES
+        ########################################################################
+
+        self._ref_scorer = ref_scorer
+        self._obs_scorer = obs_scorer
+        self._n_sessions = data.index.get_level_values(session_key).nunique()
         self._data = data
+        self._diff_data = diff_data.droplevel(0).drop(columns=stats_with_nodiff)
+        self._systematic_bias = systematic_bias
         self._normality = normality
-        self._proportional_bias = prop_bias
-        self._proportional_bias_full = prop_bias_full  ## Q: Is this worth saving??
-        self._homoscedasticity = homoscedasticity
-        self._refr_scorer = refr_scorer
-        self._test_scorer = test_scorer
-        self._sleep_id_str = sleep_id_str
-        self._n_sleeps = data[sleep_id_str].nunique()
-        self._discrepancies = discrepancies.drop(columns=stats_nodiff)
+        self._proportional_bias = proportional_bias
+        self._heteroscedasticity = heteroscedasticity
+        self._violations = violations
+        self._methods = methods
+        # self._bias = bias
+        # self._bias_vars = bias_vars
+        # self._loas = loas
+        # self._loas_vars = loas_vars
+
 
     @property
     def data(self):
-        """A :py:class:`pandas.DataFrame` containing all sleep statistics from ``refr_data`` and
-        ``test_data`` as well as their difference scores (``test_data`` minus ``refr_data``).
+        """A :py:class:`pandas.DataFrame` containing all sleep statistics from ``ref_data`` and
+        ``obs_data`` as well as their difference scores (``obs_data`` minus ``ref_data``).
         """
         return self._data
 
+    @property
+    def methods(self):
+        return self._methods
+
+    @property
+    def biased(self):
+        return self._biased
+
     @property
     def discrepancies(self):
-        """A :py:class:`pandas.DataFrame` of ``test_data`` minus ``refr_data``."""
+        """A :py:class:`pandas.DataFrame` of ``obs_data`` minus ``ref_data``."""
         # # Pivot for session-rows and statistic-columns
         return self._discrepancies
 
     @property
-    def refr_scorer(self):
+    def ref_scorer(self):
         """The name of the reference scorer."""
-        return self._refr_scorer
-
-    @property
-    def test_scorer(self):
-        """The name of the test scorer."""
-        return self._test_scorer
+        return self._ref_scorer
 
     @property
-    def sleep_id_str(self):
-        """The name of the unique sleep session identifier."""
-        return self._sleep_id_str
+    def obs_scorer(self):
+        """The name of the observed scorer."""
+        return self._obs_scorer
 
     @property
-    def n_sleeps(self):
+    def n_sessions(self):
         """The number of sleep sessions."""
-        return self._n_sleeps
+        return self._n_sessions
 
     @property
     def normality(self):
@@ -1010,22 +1080,14 @@ def homoscedasticity(self):
 
     @property
     def proportional_bias(self):
-        """
-        A :py:class:`pandas.DataFrame` of proportional bias results for all sleep statistics, with
-        intercept terms removed.
-        """
-        return self._proportional_bias
-
-    @property
-    def proportional_bias_full(self):
         """A :py:class:`pandas.DataFrame` of proportional bias results for all sleep statistics."""
-        return self._proportional_bias_full
+        return self._proportional_bias
 
     def __repr__(self):
         # TODO v0.8: Keep only the text between < and >
         return (
-            f"<SleepStatsEvaluation | Test scorer {self.test_scorer} evaluated against reference "
-            f"scorer {self.refr_scorer}, {self.n_sleeps} sleep sessions>\n"
+            f"<SleepStatsAgreement | Observed scorer ('{self.obs_scorer}') evaluated against "
+            f"reference scorer ('{self.ref_scorer}'), {self.n_sessions} sleep sessions>\n"
             " - Use `.summary()` to get pass/fail values from various checks\n"
             " - Use `.plot_blandaltman()` to get a Bland-Altman-plot grid for sleep statistics\n"
             "See the online documentation for more details."
@@ -1034,17 +1096,264 @@ def __repr__(self):
     def __str__(self):
         return __repr__()
 
+    @staticmethod
+    def _get_standard_bias(x):
+        """Wrapper around `np.mean`, for organizational purposes. For internal use."""
+        return x.mean()
+
+    @staticmethod
+    def _get_standard_loas(x, agreement=1.96, std=None):
+        """Return standard lower and upper limits of agreement. For internal use only.
+
+        Parameters
+        ----------
+        x : array_like
+        agreement : float, int
+        std : float, int
+
+        Returns
+        -------
+        loas : py:class:`numpy.ndarray`
+            A numpy array of shape (2,) where lower LoA is first and upper LoA is second.
+        """
+        if std is None:
+            std = x.std()
+        return x.mean() + np.array([-agreement, agreement]) * std
+
+    @staticmethod
+    def _get_regression_coefficient(x, y, index):
+        """Run linear regression and return a single coefficient.
+        
+        A wrapper to aid in computing CIs (with pg.compute_bootci). For internal use only.
+
+        Parameters
+        ----------
+        x : array_like
+            Predictor values
+        y : array_like
+            Outcome values
+        index: int
+            0 to get coefficient of intercept, N to get coefficient of Nth predictor
+
+        Returns
+        -------
+        coef: float
+            Regression coefficient of the effect of `b`.
+        """
+        ## Q: Jump straight to np.lstsq for speed?
+        return pg.linear_regression(x, y, add_intercept=True).at[index, "coef"]
+
+    @staticmethod
+    def _get_standard_bias_ci(x, confidence=0.95):
+        """Return standard confidence intervals for bias."""
+        n = x.size
+        dof = x.size - 1
+        avg = x.mean()
+        std = x.std()
+        sem = np.sqrt(std**2 / n)
+        low, high = stats.t.interval(confidence, dof, loc=avg, scale=sem)
+        return low, high
+
+    @staticmethod
+    def _get_standard_loas_cis(x, agreement=1.96, std=None, confidence=0.95):
+        """Return standard confidence intervals for both lower LoA and upper LoA.
+
+        Parameters
+        ----------
+        x : array_like
+        agreement : float, int
+        std : float, int
+        confidence : float
+
+        Returns
+        -------
+        cis : dict
+            A dictionary of length 2, with keys "lower" and "upper" LoA, and values of tuples
+            containing "lower" and "upper" confidence intervals for each.
+        """
+        n = x.size
+        dof = x.size - 1
+        if std is None:
+            std = x.std()
+        lower, upper = DiscrepancyEvaluation._get_standard_loas(x, agreement)
+        sem = np.sqrt(3 * std**2 / n)
+        lower_lo, lower_hi = stats.t.interval(confidence, dof, loc=lower, scale=sem)
+        upper_lo, upper_hi = stats.t.interval(confidence, dof, loc=upper, scale=sem)
+        return {"lower": (lower_lo, lower_hi), "upper": (upper_lo, upper_hi)}
+
+    def get_bias(self, alpha=0.05, **bootci_kwargs):
+        results = []
+        for sstat, row in self.methods.iterrows():
+            # Extract difference values once for convenience.
+            diffs = self.data.loc[sstat, "difference"].to_numpy()
+
+            # Identify the method that will be used.
+            if self._violations.at[sstat, "is_proportionally_biased"]:
+                bias_method = "modeled"
+            else:
+                bias_method = "standard"
+
+            if self._violations.at[sstat, "is_nonnormal"]:
+                ci_method = "bootstrap"
+            else:
+                ci_method = "standard"
+
+            # Initialize dictionary to hold row information.
+            metadata = {"sleep_stat": sstat, "method": bias_method}
+
+            # Calculate necessary variables to get bias (either bias or b0 and b1).
+            if bias_method == "modeled":
+                # Systematic bias and constant bias present, model based on constant bias regression.
+                # x, y = self.data.loc[sstat, [self.ref_scorer, "difference"]].T.to_numpy()
+                ref = self.data.loc[sstat, self.ref_scorer].to_numpy()
+                b0 = self._get_regression_coefficient(ref, diffs, index=0)
+                b1 = self._get_regression_coefficient(ref, diffs, index=1)
+                # Confidence intervals for b0 and b1
+                if ci_method == "bootstrap":
+                    b0_lo, b0_hi = pg.compute_bootci(
+                        ref,
+                        diffs,
+                        func=lambda x, y: self._get_regression_coefficient(x, y, index=0),
+                        **bootci_kwargs,
+                    )
+                    b1_lo, b1_hi = pg.compute_bootci(
+                        ref,
+                        diffs,
+                        func=lambda x, y: self._get_regression_coefficient(x, y, index=1),
+                        **bootci_kwargs,
+                    )
+                elif ci_method == "standard":
+                    col1 = "CI[{:.1f}%]".format((1 - alpha / 2) * 100) 
+                    col2 = "CI[{:.1f}%]".format(alpha / 2 * 100) 
+                    b0_lo, b0_hi, b1_lo, b1_hi = pg.linear_regression(
+                        ref, diffs, alpha=alpha
+                    ).loc[[0, 1], [col1, col2]].to_numpy().flatten()
+
+            elif bias_method == "standard":
+                b0 = self._get_standard_bias(diffs)
+                if ci_method == "bootstrap":
+                    b0_lo, b0_hi = pg.compute_bootci(
+                        diffs, func=self._get_standard_bias, **bootci_kwargs
+                    )
+                elif ci_method == "standard":
+                    b0_lo, b0_hi = self._get_standard_bias_ci(diffs)
+            else:
+                raise ValueError(f"Unexpected bias method {bias_method}.")
+
+            results.append(dict(variable="b0", mean=b0, ci_lower=b0_lo, ci_upper=b0_hi, **metadata))
+            if bias_method == "modeled":
+                results.append(dict(variable="b1", mean=b1, ci_lower=b1_lo, ci_upper=b1_hi, **metadata))
+
+        df = pd.json_normalize(results).set_index(["method", "sleep_stat", "variable"]).sort_index()
+        self._bias_values = df
+
+    def get_loa(self, alpha=0.05, **bootci_kwargs):
+        results = []
+        for sstat, row in self.methods.iterrows():
+            # Extract difference values once for convenience.
+            diffs = self.data.loc[sstat, "difference"].to_numpy()
+
+            # Identify the method that will be used.
+            if self._violations.at[sstat, "is_heteroscedastic"]:
+                if self._violations.at[sstat, "is_log_heteroscedastic"]:
+                    loa_method = "modeled"
+                else:
+                    loa_method = "log_standard"
+            else:
+                if self._violations.at[sstat, "is_proportionally_biased"]:
+                    loa_method = "residuals"
+                else:
+                    loa_method = "standard"
+
+            if self._violations.at[sstat, "is_nonnormal"]:
+                ci_method = "bootstrap"
+            else:
+                ci_method = "standard"
+
+            metadata = {"sleep_stat": sstat, "method": loa_method}
+            if loa_method in ["standard", "residuals"]:
+                # Get standard deviation of calibrated (i.e., bias-adjusted) observed values
+                # calibration_func = lambda x: x - (b0 + b1 * x)  # b0 and b1 were generated this iteration above
+                # Get standard deviation of residuals?
+                if loa_method == "residuals":
+                    std = self.data.loc[sstat, "residual"].std()
+                else:
+                    std = diffs.std()  # dof=1
+                lower, upper = self._get_standard_loas(diffs, std=std)
+                if ci_method == "bootstrap":
+                    lower_lo, lower_hi = pg.compute_bootci(diffs, func=lambda x: self._get_standard_loas(x, std=std)[0], **bootci_kwargs)
+                    upper_lo, upper_hi = pg.compute_bootci(diffs, func=lambda x: self._get_standard_loas(x, std=std)[1], **bootci_kwargs)
+                elif ci_method == "standard":
+                    cis = self._get_standard_loas_cis(diffs, std=std)
+                    lower_lo, lower_hi = cis["lower"]
+                    upper_lo, upper_hi = cis["upper"]
+
+                results.append(dict(variable="lower", mean=lower, ci_lower=lower_lo, ci_upper=lower_hi, **metadata))
+                results.append(dict(variable="upper", mean=upper, ci_lower=upper_lo, ci_upper=upper_hi, **metadata))
+            elif loa_method == "modeled":
+                x, y = self.data.loc[sstat, [obs_scorer, "residual"]].T.values
+                c0 = self._get_regression_coefficient(x, y, index=0)
+                c1 = self._get_regression_coefficient(x, y, index=1)
+                if ci_method == "bootstrap":
+                    c0_lo, c0_hi = pg.compute_bootci(x, y, func=lambda x, y: self._get_regression_coefficient(x, y, index=0), **ci_kwargs)
+                    c1_lo, c1_hi = pg.compute_bootci(x, y, func=lambda x, y: self._get_regression_coefficient(x, y, index=1), **ci_kwargs)
+                elif ci_method == "standard":
+                    col1 = "CI[{:.1f}%]".format((1 - alpha / 2) * 100) 
+                    col2 = "CI[{:.1f}%]".format(alpha / 2 * 100) 
+                    c0_lo, c0_hi, c1_lo, c1_hi = pg.linear_regression(
+                        x, y, alpha=alpha
+                    ).loc[[0, 1], [col1, col2]].to_numpy().flatten()
+                else:
+                    raise ValueError(f"Unknown CI method {ci_method}.")
+                results.append(dict(variable="c0", mean=lower, ci_lower=lower_lo, ci_upper=lower_hi, **metadata))
+                results.append(dict(variable="c1", mean=upper, ci_lower=upper_lo, ci_upper=upper_hi, **metadata))
+            else:
+                raise ValueError(f"Unexpected LoA method {loa_method}.")
+        df = pd.json_normalize(results).set_index(["method", "sleep_stat", "variable"]).sort_index()
+        self._loa_values = df
+
+    def get_text_summary(self, fmt_dict=None):
+        """
+        """
+        results = {}
+        # Bias
+        for (meth, sstat), df in self._bias_values.groupby(["method", "sleep_stat"]):
+            if meth == "standard":
+                fstr = "{mean:.2f} [{ci_lower:.2f}, {ci_upper:.2f}]"
+                bias = df.droplevel([0,1]).apply(lambda r: fstr.format(**r), axis=1).loc["b0"]
+            elif meth == "modeled":
+                fstr = "{b0_mean:.2f} [{b0_ci_lower:.2f}, {b0_ci_upper:.2f}] + {b1_mean:.2f} [{b1_ci_lower:.2f}, {b1_ci_upper:.2f}] x ref"
+                temp = df.unstack("variable").swaplevel(axis=1)
+                temp.columns = temp.columns.map("_".join)
+                bias = temp.apply(lambda r: fstr.format(**r), axis=1)[0]
+            results[sstat] = dict(bias=bias)
+        # LoA
+        for (meth, sstat), df in self._loa_values.groupby(["method", "sleep_stat"]):
+            if meth in ["standard", "residuals"]:
+                fstr = "{mean:.2f} [{ci_lower:.2f}, {ci_upper:.2f}]"
+                lower, upper = df.droplevel([0,1]).apply(lambda r: fstr.format(**r), axis=1).loc[["lower", "upper"]]
+            else:
+                fstr = "{c0_mean:.2f} [{c0_ci_lower:.2f}, {c0_ci_upper:.2f}] + {c1_mean:.2f} [{c1_ci_lower:.2f}, {c1_ci_upper:.2f}] x ref"
+                temp = df.unstack("variable").swaplevel(axis=1)
+                temp.columns = temp.columns.map("_".join)
+                lower = temp.apply(lambda r: fstr.format(**r), axis=1)[0]
+                upper = lower.copy()
+            results[sstat].update({"lower": lower, "upper": upper})
+
+        df = pd.DataFrame(results).T.rename_axis("sleep_stat")
+        return df
+
     def summary(self, **kwargs):
         """Return a summary dataframe highlighting whether tests passed for each sleep statistic.
 
         Parameters
         ----------
-        self : :py:class:`SleepStatsEvaluation`
-            A :py:class:`SleepStatsEvaluation` instance.
+        self : :py:class:`yasa.SleepStatsAgreement`
+            A :py:class:`yasa.SleepStatsAgreement` instance.
         **kwargs : key, value pairs
             Additional keyword arguments are passed to :py:meth:`pandas.DataFrame.groupby.agg`.
 
-            >>> ebe.summary(func=["mean", "sem", "min", "max"])
+            >>> ssa.summary(func=["mean", "sem", "min", "max"])
 
         Returns
         -------
@@ -1053,15 +1362,16 @@ def summary(self, **kwargs):
             normality, proportional bias, and homoscedasticity tests (for each sleep statistic).
         """
         series_list = [
+            self.bias["biased"],
             self.normality["normal"],
-            self.proportional_bias["unbiased"],
+            self.proportional_bias["bias_constant"],
             self.homoscedasticity["equal_var"].rename("homoscedastic"),
         ]
         summary = pd.concat(series_list, axis=1)
         mad = lambda df: (df - df.mean()).abs().mean()
         mad.__name__ = "mad"  # Pandas uses this to name the aggregated column
         agg_kwargs = {"func": [mad, "mean", "std"]} | kwargs
-        desc = self.data.drop(columns=self.sleep_id_str).groupby("sstat").agg(**agg_kwargs)
+        desc = self.data.groupby("sleep_stat").agg(**agg_kwargs)
         desc.columns = desc.columns.map("_".join)
         return summary.join(desc)
 
@@ -1082,13 +1392,13 @@ def plot_discrepancies_heatmap(self, sleep_stats=None, **kwargs):
         """
         assert isinstance(sleep_stats, (list, type(None))), "`sleep_stats` must be a list or None"
         if sleep_stats is None:
-            sleep_stats = self.data["sstat"].unique()  # All available sleep statistics
+            sleep_stats = self.data.index.get_level_values("sleep_stat").unique()
         heatmap_kwargs = {"cmap": "binary", "annot": True, "fmt": ".1f", "square": False}
         heatmap_kwargs["cbar_kws"] = dict(label="Normalized discrepancy %")
         if "cbar_kws" in kwargs:
             heatmap_kwargs["cbar_kws"].update(kwargs["cbar_kws"])
         heatmap_kwargs.update(kwargs)
-        table = self.discrepancies[sleep_stats]
+        table = self._diff_data[sleep_stats]
         # Normalize statistics (i.e., columns) between zero and one then convert to percentage
         table_norm = table.sub(table.min(), axis=1).div(table.apply(np.ptp)).multiply(100)
         if heatmap_kwargs["annot"]:
@@ -1123,12 +1433,12 @@ def plot_discrepancies_dotplot(self, pairgrid_kwargs={"palette": "winter"}, **kw
         kwargs_stripplot = {"size": 10, "linewidth": 1, "edgecolor": "white"}
         kwargs_stripplot.update(kwargs)
         # Initialize the PairGrid
-        height = 0.3 * len(self.discrepancies)
+        height = 0.3 * len(self._diff_data)
         aspect = 0.6
         kwargs_pairgrid = dict(hue=self.sleep_id_str, height=height, aspect=aspect)
         kwargs_pairgrid.update(pairgrid_kwargs)
         g = sns.PairGrid(
-            self.discrepancies.reset_index(), y_vars=[self.sleep_id_str], **kwargs_pairgrid
+            self._diff_data.reset_index(), y_vars=[self.sleep_id_str], **kwargs_pairgrid
         )
         # Draw the dots
         g.map(sns.stripplot, orient="h", jitter=False, **kwargs_stripplot)
@@ -1164,9 +1474,9 @@ def plot_blandaltman(self, facetgrid_kwargs={}, **kwargs):
         kwargs_blandaltman = dict(xaxis="y", annotate=False, edgecolor="black", facecolor="none")
         kwargs_blandaltman.update(kwargs)
         # Initialize a grid of plots with an Axes for each sleep statistic
-        g = sns.FacetGrid(self.data, col="sstat", **kwargs_facetgrid)
+        g = sns.FacetGrid(self.data.reset_index(), col="sleep_stat", **kwargs_facetgrid)
         # Draw Bland-Altman plot on each axis
-        g.map(pg.plot_blandaltman, self.test_scorer, self.refr_scorer, **kwargs_blandaltman)
+        g.map(pg.plot_blandaltman, self.obs_scorer, self.ref_scorer, **kwargs_blandaltman)
         # Adjust aesthetics
         for ax in g.axes.flat:
             # Tidy-up axis limits with symmetric y-axis and minimal ticks
@@ -1174,7 +1484,7 @@ def plot_blandaltman(self, facetgrid_kwargs={}, **kwargs):
             ax.set_ylim(-bound, bound)
             ax.yaxis.set_major_locator(plt.MaxNLocator(nbins=2, integer=True, symmetric=True))
             ax.xaxis.set_major_locator(plt.MaxNLocator(nbins=1, integer=True))
-        ylabel = " - ".join((self.test_scorer, self.refr_scorer))
+        ylabel = " - ".join((self.obs_scorer, self.ref_scorer))
         g.set_ylabels(ylabel)
         g.set_titles(col_template="{col_name}")
         g.tight_layout(w_pad=1, h_pad=2)
diff --git a/yasa/hypno.py b/yasa/hypno.py
index 5d28afa..28a83c3 100644
--- a/yasa/hypno.py
+++ b/yasa/hypno.py
@@ -10,7 +10,7 @@
 from yasa.io import set_log_level
 from yasa.plotting import plot_hypnogram
 from yasa.sleepstats import transition_matrix
-from yasa.evaluation import EpochByEpochEvaluation
+from yasa.evaluation import EpochByEpochAgreement
 from pandas.api.types import CategoricalDtype
 
 __all__ = [
@@ -571,42 +571,46 @@ def copy(self):
             scorer=self.scorer,
         )
 
-    def evaluate(self, test_hyp):
+    def evaluate(self, obs_hyp):
         """Evaluate agreement between two hypnograms of the same sleep session.
 
-        Typically the reference hypnogram (i.e., ``self``) is a manually-scored hypnogram and the
-        test hypnogram (i.e., ``test_hyp``) is a hypnogram from an actigraphy/wearable device or
-        automated scorer (e.g., :py:meth:`yasa.SleepStaging.predict`).
+        For example, the reference hypnogram (i.e., ``self``) might be a manually-scored hypnogram
+        and the observed hypnogram (i.e., ``obs_hyp``) might be a hypnogram from actigraphy, a
+        wearable device, or an automated scorer (e.g., :py:meth:`yasa.SleepStaging.predict`).
 
         Parameters
         ----------
         self : :py:class:`yasa.Hypnogram`
             Reference or ground-truth hypnogram.
-        test_hyp : :py:class:`yasa.Hypnogram`
-            The test or to-be-evaluated hypnogram.
+        obs_hyp : :py:class:`yasa.Hypnogram`
+            The observed or to-be-evaluated hypnogram.
 
         Returns
         -------
-        ebe : :py:class:`yasa.EpochByEpochEvaluation`
-            See :py:class:`yasa.EpochByEpochEvaluation` documentation for more detail.
+        ebe : :py:class:`yasa.EpochByEpochAgreement`
+            See :py:class:`~yasa.EpochByEpochAgreement` documentation for more detail.
 
         Examples
         --------
+        >>> from yasa import simulate_hypnogram
+        >>> hyp_a = simulate_hypnogram(tib=90, scorer="AASM", seed=8)
+        >>> hyp_b = hyp_a.simulate_similar(scorer="YASA", seed=9)
+        >>> ebe = hyp_a.evaluate(hyp_b)
+        >>> ebe.get_agreement().round(3)
+        accuracy        0.550
+        balanced_acc    0.355
+        kappa           0.227
+        mcc             0.231
+        precision       0.515
+        recall          0.550
+        fbeta           0.524
+        Name: agreement, dtype: float64
+
         .. plot::
 
-            >>> import yasa
-            >>> hypno_ref = yasa.simulate_hypno(tib=600, seed=11)
-            >>> hypno_ref = yasa.Hypnogram(hypno_ref, scorer="Rater1")
-            >>> _, true_probas = hypno_ref.transition_matrix()
-            >>> hypno_test = yasa.simulate_hypno(tib=600, seed=12, trans_probas=true_probas)
-            >>> hypno_test = yasa.Hypnogram(hypno_test, scorer="Rater2")
-            >>> ebe = hypno_ref.evaluate(hypno_test)
-            >>> conf = ebe.get_confusion_matrix()
-            >>> perf = ebe.summary()
-            >>> # Plot the overlapping hypnograms
             >>> ebe.plot_hypnograms()
         """
-        return EpochByEpochEvaluation([self], [test_hyp])
+        return EpochByEpochAgreement([self], [obs_hyp])
 
     def find_periods(self, threshold="5min", equal_length=False):
         """Find sequences of consecutive values exceeding a certain duration in hypnogram.

From b643d92a119552185cbbf0774479b94b031985fe Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Sun, 11 Feb 2024 14:19:52 -0500
Subject: [PATCH 38/43] SleepStatsAgreement major restructure

---
 yasa/evaluation.py | 1093 +++++++++++++++++++++-----------------------
 1 file changed, 517 insertions(+), 576 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index da75a4a..3911c9c 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -2,7 +2,7 @@
 YASA code for evaluating the agreement between two scorers (e.g., human vs YASA), either at the
 epoch-by-epoch level or at the level of summary sleep statistics.
 
-Analyses are modeled after the standardized framework proposed in Menghini et al., 2021, SLEEP.
+Analyses are influenced by the standardized framework proposed in Menghini et al., 2021, SLEEP.
 See the following resources:
 - https://doi.org/10.1093/sleep/zsaa170
 - https://sri-human-sleep.github.io/sleep-trackers-performance
@@ -12,6 +12,7 @@
 
 import numpy as np
 import pandas as pd
+import pingouin as pg
 import sklearn.metrics as skm
 from scipy import stats
 
@@ -72,15 +73,15 @@ class EpochByEpochAgreement:
 
     Notes
     -----
-    Many steps here are modeled after guidelines proposed in Menghini et al., 2021 [Menghini2021]_.
+    Many steps here are influenced by guidelines proposed in Menghini et al., 2021 [Menghini2021]_.
     See https://sri-human-sleep.github.io/sleep-trackers-performance/AnalyticalPipeline_v1.0.0.html
 
     References
     ----------
     .. [Menghini2021] Menghini, L., Cellini, N., Goldstone, A., Baker, F. C., & de Zambotti, M.
                       (2021). A standardized framework for testing the performance of sleep-tracking
-                       technology: step-by-step guidelines and open-source code. SLEEP, 44(2),
-                       zsaa170. https://doi.org/10.1093/sleep/zsaa170
+                      technology: step-by-step guidelines and open-source code. SLEEP, 44(2),
+                      zsaa170. https://doi.org/10.1093/sleep/zsaa170
 
     Examples
     --------
@@ -748,34 +749,21 @@ def summary(self, by_stage=False, **kwargs):
 
 class SleepStatsAgreement:
     """
-    Evaluate agreement between sleep statistics reported by two different scorers or scoring
-    methods.
-
-    Bias and limits-of-agreement (and their confidence intervals) are calcualted for each sleep
-    statistic. How these are calculated depends on the sleep statistic's underlying error
-    distribution. See [Menghini2021]_ for details, but in brief:
-
-    * Bias: The difference between the two scorers (observed minus reference).
-        If sleep-statistic differences (observed minus reference) show proportional bias,
-        bias is represented as a regression equation that takes into account changes in bias as
-        a function of measurement value. Otherwise, bias is represented as the standard mean
-        difference.
-    * Limits-of-agreement: If sleep statistic differences show proportional bias, ...
-    * Confidence intervals: If sleep statistic differences follow a normal distribution,
-        confidence intervals are calculated using standard parametric methods. Otherwise,
-        bootstrapped confidence intervals are generated (see also ``bootstrap_cis``).
-
-    Observed sleep statistics can be corrected (i.e., ``calibrated``) to bring them into alignment
-    with the sleep statistics from the reference scorer.
-
-    Bias values are calculated as...
-    LOA ...
-    CI ...
-
-
-    .. important::
-        Bias, limits-of-agreement, and confidence intervals are all calculated differently depending
-        on assumption violations. See Menghini et al., 2021 [Menghini2021]_ for details.
+    Evaluate agreement between sleep statistics reported by two different scorers.
+
+    Features include:
+    Evaluation includes bias and limits of agreement (as well as confidence intervals for both),
+    various plotting options, and calibration functions for correcting biased values from the
+    observed scorer.
+
+    * Get summary calculations of bias, limits of agreement, and their confidence intervals.
+    * Test statistical assumptions of bias, limits of agreement, and their confidence intervals,
+      and apply corrective procedures when the assumptions are not met.
+    * Get bias and limits of agreement in a string-formatted table.
+    * Calibrate new data to correct for biases in observed data.
+    * Return individual calibration functions.
+    * Visualize discrepancies for outlier inspection.
+    * Visualize Bland-Altman plots.
 
     .. seealso:: :py:meth:`yasa.Hypnogram.sleep_statistics`
 
@@ -794,14 +782,17 @@ class SleepStatsAgreement:
         Name of the reference scorer.
     obs_scorer : str
         Name of the observed scorer.
+    agreement : float
+        Multiple of the standard deviation to plot agreement limits. The default is 1.96, which
+        corresponds to a 95% confidence interval if the differences are normally distributed.
+
+        .. note:: ``agreement`` gets adjusted for regression-modeled limits of agreement.
+    confidence : float
+        The confidence level (between 0 and 1; e.g., 0.95 for 95%) of the confidence intervals that
+        are calculated for bias and limits of agreement. The same confidence level is applied to
+        both standard and bootstrapped confidence intervals.
     alpha : float
         Alpha cutoff used for all assumption tests.
-
-        .. note:: set ``alpha=1`` to ignore all corrections.
-    bootstrap_all_cis : bool
-        If ``True``, generate all 95% confidence intervals using a bootstrap resampling procedure.
-        Otherwise (``False``, default) use the resampling procedure only when discrepancy values
-        break normality assumptions.
     verbose : bool or str
         Verbose level. Default (False) will only print warning and error messages. The logging
         levels are 'debug', 'info', 'warning', 'error', and 'critical'. For most users the choice is
@@ -809,81 +800,103 @@ class SleepStatsAgreement:
 
     Notes
     -----
-    Many steps here are modeled after guidelines proposed in Menghini et al., 2021 [Menghini2021]_.
+    Sleep statistics that are identical between scorers are removed from analysis.
+
+    Many steps here are influenced by guidelines proposed in Menghini et al., 2021 [Menghini2021]_.
     See https://sri-human-sleep.github.io/sleep-trackers-performance/AnalyticalPipeline_v1.0.0.html
 
     References
     ----------
     .. [Menghini2021] Menghini, L., Cellini, N., Goldstone, A., Baker, F. C., & de Zambotti, M.
                       (2021). A standardized framework for testing the performance of sleep-tracking
-                       technology: step-by-step guidelines and open-source code. SLEEP, 44(2),
-                       zsaa170. https://doi.org/10.1093/sleep/zsaa170
+                      technology: step-by-step guidelines and open-source code. SLEEP, 44(2),
+                      zsaa170. https://doi.org/10.1093/sleep/zsaa170
 
     Examples
     --------
     >>> import pandas as pd
     >>> import yasa
     >>>
-    >>> # For this example, generate two fake datasets of sleep statistics
-    >>> hypsA = [yasa.simulate_hypnogram(tib=600, scorer="Ref", seed=i) for i in range(20)]
-    >>> hypsB = [h.simulate_similar(tib=600, scorer="Test", seed=i) for i, h in enumerate(hypsA)]
-    >>> # sstatsA = pd.Series(hypsA).map(lambda h: h.sleep_statistics()).apply(pd.Series)
-    >>> # sstatsB = pd.Series(hypsB).map(lambda h: h.sleep_statistics()).apply(pd.Series)
-    >>> # sstatsA.index = sstatsB.index = sstatsA.index.map(lambda x: f"sub-{x+1:03d}")
-    >>> ebe = yasa.EpochByEpochEvaluation(hypsA, hypsB)
-    >>> sstats = ebe.get_sleepstats()
-    >>> sstatsA = sstats.loc["Ref"]
-    >>> sstatsB = sstats.loc["Test"]
-    >>>
-    >>> sse = yasa.SleepStatsAgreement(sstatsA, sstatsB)
-    >>>
-    >>> sse.summary()
-           normal  unbiased  homoscedastic
-    sstat
-    %N1      True      True           True
-    %N2      True      True           True
-    %N3      True      True           True
-    %REM    False      True           True
-    SE       True      True           True
-    SOL     False     False           True
-    TST      True      True           True
-
-    Access more detailed statistical output of each test.
-
-    >>> sse.normality
-                  W      pval  normal
-    sstat
-    %N1    0.973407  0.824551    True
-    %N2    0.960684  0.557595    True
-    %N3    0.958591  0.516092    True
-    %REM   0.901733  0.044447   False
-    SE     0.926732  0.133580    True
-    SOL    0.774786  0.000372   False
-    TST    0.926733  0.133584    True
-    WASO   0.924288  0.119843    True
-
-    >>> sse.homoscedasticity.head(2)
-                  W      pval  equal_var
-    sstat
-    %N1    0.684833  0.508274       True
-    %N2    0.080359  0.922890       True
-
-    >>> sse.proportional_bias.round(3).head(2)
-            coef     se      T   pval     r2  adj_r2  CI[2.5%]  CI[97.5%]  unbiased
-    sstat
-    %N1   -0.487  0.314 -1.551  0.138  0.118   0.069    -1.146      0.172      True
-    %N2   -0.107  0.262 -0.409  0.688  0.009  -0.046    -0.658      0.444      True
+    >>> # Generate fake reference and observed datasets with similar sleep statistics
+    >>> ref_scorer = "Henri"
+    >>> obs_scorer = "Piéron"
+    >>> ref_hyps = [yasa.simulate_hypnogram(tib=600, scorer=ref_scorer, seed=i) for i in range(20)]
+    >>> obs_hyps = [h.simulate_similar(tib=600, scorer=obs_scorer, seed=i) for i, h in enumerate(ref_hyps)]
+    >>> # Generate sleep statistics from hypnograms using EpochByEpochAgreement
+    >>> eea = yasa.EpochByEpochAgreement(ref_hyps, obs_hyps)
+    >>> sstats = eea.get_sleep_stats()
+    >>> ref_sstats = sstats.loc[ref_scorer]
+    >>> obs_sstats = sstats.loc[obs_scorer]
+    >>> # Create SleepStatsAgreement instance
+    >>> ssa = yasa.SleepStatsAgreement(ref_sstats, obs_sstats)
+    >>> ssa.summary().round(1).head(3)
+    variable   bias_intercept             ...   uloa_parm
+    interval           center lower upper ...      center lower upper
+    sleep_stat                            ...
+    %N1                  -5.4 -13.9   3.2 ...         6.1   3.7   8.5
+    %N2                 -27.3 -49.1  -5.6 ...        12.4   7.2  17.6
+    %N3                  -9.1 -23.8   5.5 ...        20.4  12.6  28.3
+
+    >>> ssa.get_table().head(3)[["bias", "loa"]]
+                          bias                            loa
+    sleep_stat
+    %N1                   0.25  Bias ± 2.46 * (-0.00 + 1.00x)
+    %N2         -27.34 + 0.55x   Bias ± 2.46 * (0.00 + 1.00x)
+    %N3                   1.38   Bias ± 2.46 * (0.00 + 1.00x)
+
+    >>> ssa.assumptions.head(3)
+                unbiased  normal  constant_bias  homoscedastic
+    sleep_stat
+    %N1             True    True           True          False
+    %N2             True    True          False          False
+    %N3             True    True           True          False
+
+    >>> ssa.auto_methods.head(3)
+                bias   loa    ci
+    sleep_stat
+    %N1         parm  regr  parm
+    %N2         regr  regr  parm
+    %N3         parm  regr  parm
+
+    >>> ssa.get_table(bias_method="parm", loa_method="parm").head(3)[["bias", "loa"]]
+                 bias            loa
+    sleep_stat
+    %N1          0.25    -5.55, 6.06
+    %N2         -0.23  -12.87, 12.40
+    %N3          1.38  -17.67, 20.44
+
+    Generate a new observed dataset and calibrate its values based on the bias in the original one.
+
+    >>> new_hyps = [h.simulate_similar(tib=600, scorer="Kelly", seed=i) for i, h in enumerate(obs_hyps)]
+    >>> new_sstats = pd.Series(new_hyps).map(lambda h: h.sleep_statistics()).apply(pd.Series)
+    >>> new_sstats = new_sstats[["N1", "TST", "WASO"]]
+    >>> new_sstats.round(1).head(5)
+         N1    TST   WASO
+    0  42.5  439.5  147.5
+    1  84.0  550.0   38.5
+    2  53.5  489.0  103.0
+    3  57.0  469.5  120.0
+    4  71.0  531.0   69.0
+
+    >>> new_stats_calibrated = ssa.calibrate_stats(new_sstats, bias_method="auto")
+    >>> new_stats_calibrated.round(1).head(5)
+         N1    TST   WASO
+    0  42.9  433.8  150.0
+    1  84.4  544.2   41.0
+    2  53.9  483.2  105.5
+    3  57.4  463.8  122.5
+    4  71.4  525.2   71.5
 
     .. plot::
 
         >>> import matplotlib.pyplot as plt
-        >>> ax = sse.plot_discrepancies_heatmap()
+        >>> ax = ssa.plot_discrepancies_heatmap()
         >>> ax.set_title("Sleep statistic discrepancies")
         >>> plt.tight_layout()
 
     .. plot::
 
-        >>> sse.plot_blandaltman()
+        >>> ssa.plot_blandaltman()
     """
 
     def __init__(
@@ -893,9 +906,11 @@ def __init__(
         *,
         ref_scorer="Reference",
         obs_scorer="Observed",
+        agreement=1.96,
+        confidence=0.95,
         alpha=0.05,
-        bootstrap_all_cis=False,
         verbose=True,
+        bootstrap_kwargs={},
     ):
 
         assert isinstance(ref_data, pd.DataFrame), "`ref_data` must be a pandas DataFrame"
@@ -912,146 +927,138 @@ def __init__(
         assert isinstance(ref_scorer, str), "`ref_scorer` must be a string"
         assert isinstance(obs_scorer, str), "`obs_scorer` must be a string"
         assert ref_scorer != obs_scorer, "`ref_scorer` and `obs_scorer` must be unique"
-        assert isinstance(alpha, float) and 0 <= alpha <= 1, "`alpha` must be a number between 0 and 1, inclusive"
-        assert isinstance(bootstrap_all_cis, bool), "`bootstrap_all_cis` must be True or False"
+        assert isinstance(agreement, (float, int)) and agreement > 0, "`agreement` must be a number greater than 0"
+        assert isinstance(confidence, (float, int)) and 0 < alpha < 1, "`confidence` must be a number between 0 and 1"
+        assert isinstance(alpha, (float, int)) and 0 < alpha < 1, "`alpha` must be a number between 0 and 1"
+        assert isinstance(bootstrap_kwargs, dict), "`bootstrap_kwargs` must be a dictionary"
+        restricted_bootstrap_kwargs = ["confidence_level", "vectorized", "paired"]
+        assert all(k not in restricted_bootstrap_kwargs for k in bootstrap_kwargs), f"None of {restricted_bootstrap_kwargs} can be set by the user"
 
         # If `ref_data` and `obs_data` indices are unnamed, name them
         session_key = "session_id" if ref_data.index.name is None else ref_data.index.name
         ref_data.index.name = session_key
         obs_data.index.name = session_key
 
-        # Get scorer differences (i.e., observed minus reference)
-        diff_data = obs_data.sub(ref_data)
-
-        # Prepend a "scorer" level to index of each individual dataframe, making MultiIndex
-        obs_data = pd.concat({obs_scorer: obs_data}, names=["scorer"])
-        ref_data = pd.concat({ref_scorer: ref_data}, names=["scorer"])
-        diff_data = pd.concat({"difference": diff_data}, names=["scorer"])
-        # Merge observed data, reference data, and differences
-        data = pd.concat([obs_data, ref_data, diff_data])
-        # Reshape to long-format with 3 columns (observed, reference, difference)
+        # Reshape to long format DataFrame with 2 columns (observed, reference) and MultiIndex
         data = (
-            data.melt(var_name="sleep_stat", ignore_index=False)
-            .reset_index()
-            .pivot(columns="scorer", index=["sleep_stat", session_key], values="value")
+            pd.concat([obs_data, ref_data], keys=[obs_scorer, ref_scorer], names=["scorer"])
+            .melt(var_name="sleep_stat", ignore_index=False)
+            .pivot_table(index=["sleep_stat", session_key], columns="scorer", values="value")
             .rename_axis(columns=None)
             .sort_index()
         )
 
+        # Get scorer differences (i.e., observed minus reference)
+        data["difference"] = data[obs_scorer].sub(data[ref_scorer])
+
         # Remove sleep statistics that have no differences between scorers
-        stats_with_nodiff = diff_data.any().loc[lambda x: ~x].index.tolist()
-        data = data.query(f"~sleep_stat.isin({stats_with_nodiff})")
-        for s in stats_with_nodiff:
+        stats_rm = data.groupby("sleep_stat")["difference"].any().loc[lambda x: ~x].index.tolist()
+        data = data.drop(labels=stats_rm)
+        for s in stats_rm:
             logger.warning(f"Removed {s} from evaluation because all scorings were identical.")
 
+        # Create grouper variable for convenience
+        grouper = data.groupby("sleep_stat")
+
         ########################################################################
-        # TEST ASSUMPTION VIOLATIONS
+        # Generate parametric Bias and LoA for all sleep stats
         ########################################################################
-
-        grouper = data.groupby("sleep_stat")  # For convenience
-
-        # Test SYSTEMATIC BIAS between the two scorers for each sleep statistic (do means differ?).
-        # This test is used to determine whether corrections are applied during calibration only.
-        systematic_bias = grouper["difference"].apply(pg.ttest, y=0).droplevel(-1)
-
-        # Test NORMALITY of difference values at each sleep statistic.
-        # This test is used to determine how confidence intervals for Bias and LoA are calculated.
-        normality = grouper["difference"].apply(pg.normality, alpha=alpha).droplevel(-1)
-
-        # Test PROPORTIONAL BIAS at each sleep statistic (do scorer diffs vary as with ref measure?)
-        # This test is used to determine how Bias and LoA are calculated.
-        regr_f = lambda df: pg.linear_regression(df[ref_scorer], df[obs_scorer], alpha=alpha)
-        resid_f = lambda df: pd.Series(regr_f(df).residuals_, index=df.index.get_level_values(1))
-        proportional_bias = grouper.apply(regr_f).droplevel(-1).set_index("names", append=True)
-        proportional_bias = proportional_bias.swaplevel().sort_index()
-        residuals = grouper.apply(resid_f).stack().rename("residual")
-
-        # Test HETEROSCEDASTICITY at each sleep statistic.
-        # This test is used to determine how LoAs are calculated.
-        data = data.join(residuals)
-        homosc_columns = [ref_scorer, "difference", "residual"]
-        homosc_f = lambda df: pg.homoscedasticity(df[homosc_columns], alpha=alpha)
-        heteroscedasticity = data.groupby("sleep_stat").apply(homosc_f).droplevel(-1)
-        # Add same test for log-transformed values, also used for determining LoA calculation method
-        log_transform = lambda x: np.log(x + 1e-6)
-        backlog_transform = lambda x: np.exp(x) - 1e-6
-        logdata = data[[ref_scorer, obs_scorer]].applymap(log_transform)
-        logdata["difference"] = logdata[obs_scorer].sub(logdata[ref_scorer])
-        logdata["residual"] = logdata.groupby("sleep_stat").apply(resid_f).stack()#.rename("residual")
-        heteroscedasticity_log = logdata.groupby("sleep_stat").apply(homosc_f).droplevel(-1)
-        # data_exp = logdata[[ref_scorer, obs_scorer, "difference"]].applymap(backlog_transform)
-        # data_exp = logdata["difference"].map(backlog_transformer)
-
-        # Aggregate test results into a dataframe of True/False for later convenience.
-        violations = (
-            systematic_bias["p-val"].lt(alpha).to_frame("is_systematically_biased")
-            .join(~normality["normal"].rename("is_nonnormal"))
-            .join(proportional_bias.loc[ref_scorer, "pval"].lt(alpha).rename("is_proportionally_biased"))
-            .join(~heteroscedasticity["equal_var"].rename("is_heteroscedastic"))
-            .join(~heteroscedasticity_log["equal_var"].rename("is_log_heteroscedastic"))
+        n_sessions = data.index.get_level_values(session_key).nunique()
+        # Parametric Bias
+        parm_vals = grouper["difference"].mean().to_frame("bias_parm")
+        # Parametric LoA
+        parm_vals["lloa_parm"], parm_vals["uloa_parm"] = zip(
+            *grouper["difference"].apply(self._arr_to_loa, agreement=agreement)
         )
 
-        # Get name of method for each calculation.
-        # CI - standard or bootstrap
-        # Bias - standard or modeled
-        # LoA - standard, log_standard, modeled, or residuals
-        get_ci_method = lambda row: "bootstrap" if row.is_nonnormal else "standard"
-        get_bias_method = lambda row: "modeled" if row.is_proportionally_biased else "standard"
-        get_loa_method = lambda row: (
-            "modeled" if row.is_log_heteroscedastic else "log_standard"
-        ) if row.is_heteroscedastic else (
-            "residuals" if row.is_proportionally_biased else "standard"
-        )
-        methods = {
-            "loa": violations.apply(get_loa_method, axis=1),
-            "bias": violations.apply(get_bias_method, axis=1),
-            "ci": violations.apply(get_ci_method, axis=1),
-        }
-        methods = pd.DataFrame(methods)
-        if bootstrap_all_cis:
-            methods["ci"] = ["standard"] * len(violations)
-
         ########################################################################
-        # ATTRIBUTES
+        # Generate standard CIs for standard Bias and LoA for all sleep stats
         ########################################################################
+        t_parm = stats.t.ppf((1 + confidence) / 2, n_sessions - 1)
+        sem = grouper["difference"].sem(ddof=1)
+        # Parametric CIs for parametric Bias and LoA
+        parm_ci = pd.DataFrame({
+            "bias_parm-lower": parm_vals["bias_parm"] - sem * t_parm,
+            "bias_parm-upper": parm_vals["bias_parm"] + sem * t_parm,
+            "lloa_parm-lower": parm_vals["lloa_parm"] - sem * t_parm * np.sqrt(3),
+            "lloa_parm-upper": parm_vals["lloa_parm"] + sem * t_parm * np.sqrt(3),
+            "uloa_parm-lower": parm_vals["uloa_parm"] - sem * t_parm * np.sqrt(3),
+            "uloa_parm-upper": parm_vals["uloa_parm"] + sem * t_parm * np.sqrt(3),
+        })
 
-        self._ref_scorer = ref_scorer
-        self._obs_scorer = obs_scorer
-        self._n_sessions = data.index.get_level_values(session_key).nunique()
-        self._data = data
-        self._diff_data = diff_data.droplevel(0).drop(columns=stats_with_nodiff)
-        self._systematic_bias = systematic_bias
-        self._normality = normality
-        self._proportional_bias = proportional_bias
-        self._heteroscedasticity = heteroscedasticity
-        self._violations = violations
-        self._methods = methods
-        # self._bias = bias
-        # self._bias_vars = bias_vars
-        # self._loas = loas
-        # self._loas_vars = loas_vars
+        ########################################################################
+        # Generate regression/modeled (slope and intercept) Bias and LoA for all sleep stats
+        ########################################################################
+        # Run regression used to (a) model bias and (b) test for proportional/constant bias
+        bias_regr = grouper[[ref_scorer, "difference"]].apply(self._get_linregress_as_dict).apply(pd.Series)
+        # Get residuals from this regression, bc they are needed to run the next regression for homoscedasticity test
+        idx = data.index.get_level_values("sleep_stat")
+        slopes = bias_regr.loc[idx, "slope"].to_numpy()
+        intercepts = bias_regr.loc[idx, "intercept"].to_numpy()
+        predicted_values = data[ref_scorer].to_numpy() * slopes + intercepts
+        data["residuals"] = data[obs_scorer].to_numpy() - predicted_values
+        # Run regression used to (b) model LoA and (b) test for heteroscedasticity/homoscedasticity
+        data["residuals_abs"] = data["residuals"].abs()
+        loa_regr = grouper[[ref_scorer, "residuals_abs"]].apply(self._get_linregress_as_dict).apply(pd.Series)
+        # Stack the two regression dataframes together
+        regr = pd.concat({"bias": bias_regr, "loa": loa_regr}, axis=0)
 
+        ########################################################################
+        # Generate parametric CIs for regression/modeled Bias and LoA for all sleep stats
+        ########################################################################
+        t_regr = stats.t.ppf((1 + confidence) / 2, n_sessions - 2)  # dof=n-2 for regression
+        # Parametric CIs for modeled Bias and LoA
+        regr_ci = pd.DataFrame({
+            "intercept-lower": regr["intercept"] - regr["intercept_stderr"] * t_regr,
+            "intercept-upper": regr["intercept"] + regr["intercept_stderr"] * t_regr,
+            "slope-lower": regr["slope"] - regr["stderr"] * t_regr,
+            "slope-upper": regr["slope"] + regr["stderr"] * t_regr,
+        })
 
-    @property
-    def data(self):
-        """A :py:class:`pandas.DataFrame` containing all sleep statistics from ``ref_data`` and
-        ``obs_data`` as well as their difference scores (``obs_data`` minus ``ref_data``).
-        """
-        return self._data
+        ########################################################################
+        # Test all statistical assumptions
+        ########################################################################
+        assumptions = pd.DataFrame({
+            "unbiased": grouper["difference"].apply(lambda a: stats.ttest_1samp(a, 0).pvalue).ge(alpha),
+            "normal": grouper["difference"].apply(lambda a: stats.shapiro(a).pvalue).ge(alpha),
+            # "normal": grouper["difference"].apply(stats.shapiro).str[1].ge(alpha),
+            "constant_bias": bias_regr["pvalue"].ge(alpha),
+            "homoscedastic": loa_regr["pvalue"].ge(alpha),
+        })
 
-    @property
-    def methods(self):
-        return self._methods
+        ########################################################################
+        # Setting attributes
+        ########################################################################
 
-    @property
-    def biased(self):
-        return self._biased
+        # Merge the parametric and regression values for Bias and LoA
+        regr_vals = regr.unstack(0)[["slope", "intercept"]]
+        regr_vals.columns = regr_vals.columns.swaplevel().map("_".join)
+        vals = parm_vals.join(regr_vals).rename_axis("variable", axis=1)
+
+        # Merge the two CI dataframes for easier access
+        regr_ci = regr_ci.unstack(0)
+        regr_ci.columns = regr_ci.columns.swaplevel().map("_".join)
+        ci = parm_ci.join(regr_ci)
+        ci.columns = pd.MultiIndex.from_tuples(
+            tuples=ci.columns.str.split("-", expand=True), names=["variable", "interval"],
+        )
+        ci = pd.concat({"parm": ci, "boot": pd.DataFrame().reindex_like(ci)}, names=["ci_method"], axis=1)
+        ci = ci.sort_index(axis=1)  # Sort MultiIndex columns for cleanliness
 
-    @property
-    def discrepancies(self):
-        """A :py:class:`pandas.DataFrame` of ``obs_data`` minus ``ref_data``."""
-        # # Pivot for session-rows and statistic-columns
-        return self._discrepancies
+        self._agreement = agreement
+        self._confidence = confidence
+        self._bootstrap_kwargs = bootstrap_kwargs
+        self._ref_scorer = ref_scorer
+        self._obs_scorer = obs_scorer
+        self._n_sessions = n_sessions
+        self._data = data
+        self._assumptions = assumptions
+        self._regr = regr
+        self._vals = vals
+        self._ci = ci
+        self._bias_method_opts = ["parm", "regr", "auto"]
+        self._loa_method_opts = ["parm", "regr", "auto"]
+        self._ci_method_opts = ["parm", "boot", "auto"]
 
     @property
     def ref_scorer(self):
@@ -1069,26 +1076,47 @@ def n_sessions(self):
         return self._n_sessions
 
     @property
-    def normality(self):
-        """A :py:class:`pandas.DataFrame` of normality results for all sleep statistics."""
-        return self._normality
+    def data(self):
+        """A :py:class:`pandas.DataFrame` containing all sleep statistics from ``ref_data`` and
+        ``obs_data`` as well as their difference scores (``obs_data`` minus ``ref_data``).
+        Long format.
+        """
+        return self._data.drop(columns=["difference", "residuals", "residuals_abs"])
+
+    @property
+    def assumptions(self):
+        """A :py:class:`pandas.DataFrame` containing boolean values for all statistical tests used
+        to test assumptions.
+        """
+        return self._assumptions
 
     @property
-    def homoscedasticity(self):
-        """A :py:class:`pandas.DataFrame` of homoscedasticity results for all sleep statistics."""
-        return self._homoscedasticity
+    def sleep_statistics(self):
+        """Return a list of all sleep stats included in the agreement analyses."""
+        return self.data.index.get_level_values("sleep_stat").unique().to_list()
 
     @property
-    def proportional_bias(self):
-        """A :py:class:`pandas.DataFrame` of proportional bias results for all sleep statistics."""
-        return self._proportional_bias
+    def auto_methods(self):
+        """
+        A :py:class:`pandas.DataFrame` containing the methods applied when ``'auto'`` is selected.
+        """
+        return pd.concat(
+            [
+                self.assumptions["constant_bias"].map({True: "parm", False: "regr"}).rename("bias"),
+                self.assumptions["homoscedastic"].map({True: "parm", False: "regr"}).rename("loa"),
+                self.assumptions["normal"].map({True: "parm", False: "boot"}).rename("ci"),
+                self.assumptions["unbiased"].map({True: "calibrate", False: "uncalibrated"}).rename("calibration"),
+            ],
+            axis=1,
+        )
 
     def __repr__(self):
         # TODO v0.8: Keep only the text between < and >
         return (
             f"<SleepStatsAgreement | Observed scorer ('{self.obs_scorer}') evaluated against "
             f"reference scorer ('{self.ref_scorer}'), {self.n_sessions} sleep sessions>\n"
-            " - Use `.summary()` to get pass/fail values from various checks\n"
+            " - Use `.summary()` to get a dataframe of bias and limits of agreement for each sleep "
+            "statistic\n"
             " - Use `.plot_blandaltman()` to get a Bland-Altman-plot grid for sleep statistics\n"
             "See the online documentation for more details."
         )
@@ -1096,396 +1124,309 @@ def __repr__(self):
     def __str__(self):
         return __repr__()
 
-    @staticmethod
-    def _get_standard_bias(x):
-        """Wrapper around `np.mean`, for organizational purposes. For internal use."""
-        return x.mean()
+    ############################################################################
+    # Define some utility functions, mostly to aid with the use of df.apply and stats.bootstrap
+    ############################################################################
 
     @staticmethod
-    def _get_standard_loas(x, agreement=1.96, std=None):
-        """Return standard lower and upper limits of agreement. For internal use only.
-
-        Parameters
-        ----------
-        x : array_like
-        agreement : float, int
-        std : float, int
-
-        Returns
-        -------
-        loas : py:class:`numpy.ndarray`
-            A numpy array of shape (2,) where lower LoA is first and upper LoA is second.
-        """
-        if std is None:
-            std = x.std()
-        return x.mean() + np.array([-agreement, agreement]) * std
+    def _arr_to_loa(x, agreement):
+        mean = np.mean(x)
+        bound = agreement * np.std(x, ddof=1)
+        return mean-bound, mean+bound
 
     @staticmethod
-    def _get_regression_coefficient(x, y, index):
-        """Run linear regression and return a single coefficient.
-        
-        A wrapper to aid in computing CIs (with pg.compute_bootci). For internal use only.
-
-        Parameters
-        ----------
-        x : array_like
-            Predictor values
-        y : array_like
-            Outcome values
-        index: int
-            0 to get coefficient of intercept, N to get coefficient of Nth predictor
-
-        Returns
-        -------
-        coef: float
-            Regression coefficient of the effect of `b`.
+    def _get_linregress_as_dict(*args, **kwargs):
         """
-        ## Q: Jump straight to np.lstsq for speed?
-        return pg.linear_regression(x, y, add_intercept=True).at[index, "coef"]
+        A wrapper around :py:func:`scipy.stats.linregress` that returns a dictionary instead of a
+        named tuple. In the normally returned object, `intercept_stderr` is an extra field that is
+        not included when converting the named tuple, so this allows it to be included when using
+        something like groupby.
+        """
+        regr = stats.linregress(*args, **kwargs)
+        return {
+            "slope": regr.slope,
+            "intercept": regr.intercept,
+            "rvalue": regr.rvalue,
+            "pvalue": regr.pvalue,
+            "stderr": regr.stderr,
+            "intercept_stderr": regr.intercept_stderr,
+        }
 
-    @staticmethod
-    def _get_standard_bias_ci(x, confidence=0.95):
-        """Return standard confidence intervals for bias."""
-        n = x.size
-        dof = x.size - 1
-        avg = x.mean()
-        std = x.std()
-        sem = np.sqrt(std**2 / n)
-        low, high = stats.t.interval(confidence, dof, loc=avg, scale=sem)
-        return low, high
+    def _generate_bootstrap_ci(self, sleep_stats):
+        """
+        Generate bootstrapped confidence intervals for bias and limits of agreement. This operates
+        in-place by concatenating bootstrapped CIs to existing parametric CIs (the latter are
+        calculated by default during initialization).
+        """
+        assert isinstance(sleep_stats, list), "`sleep_stats` must be a list"
+        assert len(sleep_stats) == len(set(sleep_stats)), "elements of `sleep_stats` must be unique"
+        assert all(isinstance(ss, str) for ss in sleep_stats), "elements of `sleep_stats` must be strings"
+        assert all(ss in self.sleep_statistics for ss in sleep_stats)
+        # sleep_stats_to_boot = pd.Index(sleep_stats).difference(sleep_stats_booted)
+        # grouper = self._data.loc[sleep_stats_to_boot].groupby("sleep_stat")
+        # Update bootstrap keywords arguments with defaults
+        bs_kwargs = {
+            "n_resamples": 1000,
+            "method": "BCa",
+            "confidence_level": self._confidence,  # should not change from parametric confidence level
+            "vectorized": False,  # should stay False
+            "paired": True,  # should be True, especially if method is BCa
+        } | self._bootstrap_kwargs
+
+        def boot_stats(ref_arr, diff_arr, rabs_arr):
+            # Wrap around all the stats to bootstrap, to avoid redundant scipy.stats.bootstrap calls
+            # Order of arrays is dependent on the column order used when calling grouper.apply
+            bias_parm = np.mean(diff_arr)
+            lloa_parm, uloa_parm = self._arr_to_loa(diff_arr, self._agreement)
+            bias_slope, bias_intercept = stats.linregress(ref_arr, diff_arr)[:2]
+            # Note this is not recalculating residuals each time for the next regression
+            loa_slope, loa_intercept = stats.linregress(ref_arr, rabs_arr)[:2]
+            return bias_parm, lloa_parm, uloa_parm, bias_intercept, bias_slope, loa_intercept, loa_slope
+
+        # !! Column order MUST match the order of arrays boot_stats expects as INPUT
+        # !! Variable order MUST match the order of floats boot_stats returns as OUTPUT
+        interval_order = ["lower", "upper"]
+        column_order = ["Reference", "difference", "residuals_abs"]
+        variable_order = [
+            "bias_parm",
+            "lloa_parm",
+            "uloa_parm",
+            "bias_intercept",
+            "bias_slope",
+            "loa_intercept",
+            "loa_slope",
+        ]
+        boot_ci = (self._data
+            .loc[sleep_stats, column_order]  # Extract the relevant sleep stats and columns
+            .groupby("sleep_stat")  # Group so the bootstrapping is applied once to each sleep stat
+            # Apply the bootstrap function, where tuple(df.to_numpy().T) convert the 3 columns
+            # of the passed dataframe to a tuple of 3 1D arrays
+            .apply(lambda df: stats.bootstrap(tuple(df.to_numpy().T), boot_stats, **bs_kwargs))
+            .map(lambda res: res.confidence_interval)  # Pull high/low CIs out of the results object
+            .explode()  # Break high and low CIs into separate rows
+            .to_frame("value")  # Convert to dataframe and name column
+            .assign(interval=interval_order * len(sleep_stats))  # Add a column indicating interval
+            .explode("value")  # Break low CI variables and high CI variables out of arrays
+            .assign(variable=variable_order * len(sleep_stats) * 2)  # Add a column indicating variable
+            .pivot(columns=["variable", "interval"], values="value")  # Go long to wide format
+            .sort_index(axis=1)  # Sort MultiIndex columns for cleanliness
+        )
+        # Merge with existing CI dataframe
+        self._ci["boot"] = self._ci["boot"].fillna(boot_ci)
 
-    @staticmethod
-    def _get_standard_loas_cis(x, agreement=1.96, std=None, confidence=0.95):
-        """Return standard confidence intervals for both lower LoA and upper LoA.
+    def get_table(self, bias_method="auto", loa_method="auto", ci_method="auto", fstrings={}):
+        """Return a pandas dataframe with bias, loa, bias_ci, loa_ci as string equations.
+        For all sleep stats, then index later what you want.
 
         Parameters
         ----------
-        x : array_like
-        agreement : float, int
-        std : float, int
-        confidence : float
+        bias_method : str
+            If ``'parm'`` (i.e., parametric), bias is always represented as the mean difference
+            (observed minus reference).
+            If ``'regr'`` (i.e., regression), bias is always represented as a regression equation.
+            If ``'auto'`` (default), bias is represented as a regression equation for sleep
+            statistics where the score differences are proportionally biased and as the mean
+            difference otherwise.
+        loa_method : str
+            If ``'parm'`` (i.e., parametric), limits of agreement are always represented as
+            bias +/- 1.96 standard deviations (where 1.96 can be adjusted through the ``agreement``
+            parameter).
+            If ``'regr'`` (i.e., regression), limits of agreement are always represented as a
+            regression equation.
+            If ``'auto'`` (default), limits of agreement are represented as a regression equation
+            for sleep statistics where the score differences are proportionally biased and as
+            bias +/- 1.96 standard deviation otherwise.
+        ci_method : str
+            If ``'parm'`` (i.e., parametric), confidence intervals are always represented using a
+            standard t-distribution.
+            If ``'boot'`` (i.e., bootstrap), confidence intervals are always represented using a
+            bootstrap resampling procedure.
+            If  ``'auto'`` (default), confidence intervals are represented using a bootstrap
+            resampling procedure for sleep statistics where the distribution of score differences is
+            non-normal and using a standard t-distribution otherwise.
 
         Returns
         -------
-        cis : dict
-            A dictionary of length 2, with keys "lower" and "upper" LoA, and values of tuples
-            containing "lower" and "upper" confidence intervals for each.
-        """
-        n = x.size
-        dof = x.size - 1
-        if std is None:
-            std = x.std()
-        lower, upper = DiscrepancyEvaluation._get_standard_loas(x, agreement)
-        sem = np.sqrt(3 * std**2 / n)
-        lower_lo, lower_hi = stats.t.interval(confidence, dof, loc=lower, scale=sem)
-        upper_lo, upper_hi = stats.t.interval(confidence, dof, loc=upper, scale=sem)
-        return {"lower": (lower_lo, lower_hi), "upper": (upper_lo, upper_hi)}
-
-    def get_bias(self, alpha=0.05, **bootci_kwargs):
-        results = []
-        for sstat, row in self.methods.iterrows():
-            # Extract difference values once for convenience.
-            diffs = self.data.loc[sstat, "difference"].to_numpy()
-
-            # Identify the method that will be used.
-            if self._violations.at[sstat, "is_proportionally_biased"]:
-                bias_method = "modeled"
-            else:
-                bias_method = "standard"
-
-            if self._violations.at[sstat, "is_nonnormal"]:
-                ci_method = "bootstrap"
-            else:
-                ci_method = "standard"
-
-            # Initialize dictionary to hold row information.
-            metadata = {"sleep_stat": sstat, "method": bias_method}
-
-            # Calculate necessary variables to get bias (either bias or b0 and b1).
-            if bias_method == "modeled":
-                # Systematic bias and constant bias present, model based on constant bias regression.
-                # x, y = self.data.loc[sstat, [self.ref_scorer, "difference"]].T.to_numpy()
-                ref = self.data.loc[sstat, self.ref_scorer].to_numpy()
-                b0 = self._get_regression_coefficient(ref, diffs, index=0)
-                b1 = self._get_regression_coefficient(ref, diffs, index=1)
-                # Confidence intervals for b0 and b1
-                if ci_method == "bootstrap":
-                    b0_lo, b0_hi = pg.compute_bootci(
-                        ref,
-                        diffs,
-                        func=lambda x, y: self._get_regression_coefficient(x, y, index=0),
-                        **bootci_kwargs,
-                    )
-                    b1_lo, b1_hi = pg.compute_bootci(
-                        ref,
-                        diffs,
-                        func=lambda x, y: self._get_regression_coefficient(x, y, index=1),
-                        **bootci_kwargs,
-                    )
-                elif ci_method == "standard":
-                    col1 = "CI[{:.1f}%]".format((1 - alpha / 2) * 100) 
-                    col2 = "CI[{:.1f}%]".format(alpha / 2 * 100) 
-                    b0_lo, b0_hi, b1_lo, b1_hi = pg.linear_regression(
-                        ref, diffs, alpha=alpha
-                    ).loc[[0, 1], [col1, col2]].to_numpy().flatten()
-
-            elif bias_method == "standard":
-                b0 = self._get_standard_bias(diffs)
-                if ci_method == "bootstrap":
-                    b0_lo, b0_hi = pg.compute_bootci(
-                        diffs, func=self._get_standard_bias, **bootci_kwargs
-                    )
-                elif ci_method == "standard":
-                    b0_lo, b0_hi = self._get_standard_bias_ci(diffs)
-            else:
-                raise ValueError(f"Unexpected bias method {bias_method}.")
-
-            results.append(dict(variable="b0", mean=b0, ci_lower=b0_lo, ci_upper=b0_hi, **metadata))
-            if bias_method == "modeled":
-                results.append(dict(variable="b1", mean=b1, ci_lower=b1_lo, ci_upper=b1_hi, **metadata))
-
-        df = pd.json_normalize(results).set_index(["method", "sleep_stat", "variable"]).sort_index()
-        self._bias_values = df
-
-    def get_loa(self, alpha=0.05, **bootci_kwargs):
-        results = []
-        for sstat, row in self.methods.iterrows():
-            # Extract difference values once for convenience.
-            diffs = self.data.loc[sstat, "difference"].to_numpy()
-
-            # Identify the method that will be used.
-            if self._violations.at[sstat, "is_heteroscedastic"]:
-                if self._violations.at[sstat, "is_log_heteroscedastic"]:
-                    loa_method = "modeled"
-                else:
-                    loa_method = "log_standard"
-            else:
-                if self._violations.at[sstat, "is_proportionally_biased"]:
-                    loa_method = "residuals"
-                else:
-                    loa_method = "standard"
+        table : :py:class:`pandas.DataFrame`
+            A :py:class:`~pandas.DataFrame` of string representations of bias, limits of agreement,
+            and their confidence intervals for all sleep statistics.
 
-            if self._violations.at[sstat, "is_nonnormal"]:
-                ci_method = "bootstrap"
-            else:
-                ci_method = "standard"
-
-            metadata = {"sleep_stat": sstat, "method": loa_method}
-            if loa_method in ["standard", "residuals"]:
-                # Get standard deviation of calibrated (i.e., bias-adjusted) observed values
-                # calibration_func = lambda x: x - (b0 + b1 * x)  # b0 and b1 were generated this iteration above
-                # Get standard deviation of residuals?
-                if loa_method == "residuals":
-                    std = self.data.loc[sstat, "residual"].std()
-                else:
-                    std = diffs.std()  # dof=1
-                lower, upper = self._get_standard_loas(diffs, std=std)
-                if ci_method == "bootstrap":
-                    lower_lo, lower_hi = pg.compute_bootci(diffs, func=lambda x: self._get_standard_loas(x, std=std)[0], **bootci_kwargs)
-                    upper_lo, upper_hi = pg.compute_bootci(diffs, func=lambda x: self._get_standard_loas(x, std=std)[1], **bootci_kwargs)
-                elif ci_method == "standard":
-                    cis = self._get_standard_loas_cis(diffs, std=std)
-                    lower_lo, lower_hi = cis["lower"]
-                    upper_lo, upper_hi = cis["upper"]
-
-                results.append(dict(variable="lower", mean=lower, ci_lower=lower_lo, ci_upper=lower_hi, **metadata))
-                results.append(dict(variable="upper", mean=upper, ci_lower=upper_lo, ci_upper=upper_hi, **metadata))
-            elif loa_method == "modeled":
-                x, y = self.data.loc[sstat, [obs_scorer, "residual"]].T.values
-                c0 = self._get_regression_coefficient(x, y, index=0)
-                c1 = self._get_regression_coefficient(x, y, index=1)
-                if ci_method == "bootstrap":
-                    c0_lo, c0_hi = pg.compute_bootci(x, y, func=lambda x, y: self._get_regression_coefficient(x, y, index=0), **ci_kwargs)
-                    c1_lo, c1_hi = pg.compute_bootci(x, y, func=lambda x, y: self._get_regression_coefficient(x, y, index=1), **ci_kwargs)
-                elif ci_method == "standard":
-                    col1 = "CI[{:.1f}%]".format((1 - alpha / 2) * 100) 
-                    col2 = "CI[{:.1f}%]".format(alpha / 2 * 100) 
-                    c0_lo, c0_hi, c1_lo, c1_hi = pg.linear_regression(
-                        x, y, alpha=alpha
-                    ).loc[[0, 1], [col1, col2]].to_numpy().flatten()
-                else:
-                    raise ValueError(f"Unknown CI method {ci_method}.")
-                results.append(dict(variable="c0", mean=lower, ci_lower=lower_lo, ci_upper=lower_hi, **metadata))
-                results.append(dict(variable="c1", mean=upper, ci_lower=upper_lo, ci_upper=upper_hi, **metadata))
-            else:
-                raise ValueError(f"Unexpected LoA method {loa_method}.")
-        df = pd.json_normalize(results).set_index(["method", "sleep_stat", "variable"]).sort_index()
-        self._loa_values = df
+        Examples
+        --------
 
-    def get_text_summary(self, fmt_dict=None):
         """
+        assert isinstance(bias_method, str), "`bias_method` must be a string"
+        assert bias_method in self._bias_method_opts, f"`bias_method` must be one of {self._bias_method_opts}"
+        assert isinstance(loa_method, str), "`loa_method` must be a string"
+        assert loa_method in self._loa_method_opts, f"`loa_method` must be one of {self._loa_method_opts}"
+        assert isinstance(fstrings, dict), "`fstrings` must be a dictionary"
+        loa_regr_agreement = self._agreement * np.sqrt(np.pi / 2)  # Agreement gets adjusted when LoA is modeled
+        if not fstrings:
+            fstrings = {
+                "bias_parm": "{bias_parm_center:.2f}",
+                "bias_regr": "{bias_intercept_center:.2f} + {bias_slope_center:.2f}x",
+                "loa_parm": "{lloa_parm_center:.2f}, {uloa_parm_center:.2f}",
+                "loa_regr": "Bias \u00B1 {loa_regr_agreement:.2f} * ({loa_intercept_center:.2f} + {loa_slope_center:.2f}x)",
+                "bias_parm_ci": (
+                    "[{bias_parm_lower:.2f}, {bias_parm_upper:.2f}]"
+                ),
+                "bias_regr_ci": (
+                    "[{bias_intercept_lower:.2f}, {bias_intercept_upper:.2f}], [{bias_slope_lower:.2f}, {bias_slope_upper:.2f}]"
+                ),
+                "loa_parm_ci": (
+                    "[{lloa_parm_lower:.2f}, {lloa_parm_upper:.2f}], [{uloa_parm_lower:.2f}, {uloa_parm_upper:.2f}]"
+                ),
+                "loa_regr_ci": (
+                    "[{loa_intercept_lower:.2f}, {loa_intercept_upper:.2f}], [{loa_slope_lower:.2f}, {loa_slope_upper:.2f}]"
+                ),
+            }
+        # fstrings["loa_regr"] = fstrings["loa_regr"].replace("loa_regr_agreement", str(loa_regr_agreement))
+        values = self.summary(ci_method=ci_method)
+        values.columns = values.columns.map("_".join)  # Convert MultiIndex columns to Index
+        values["loa_regr_agreement"] = loa_regr_agreement  # Add a column of regr agreement so it can be used as variable
+        def return_all_the_strings(row, fstrings_dict):
+            return {var: fstr.format(**row) for var, fstr in fstrings_dict.items()}
+        all_strings = values.apply(return_all_the_strings, fstrings_dict=fstrings, axis=1).apply(pd.Series)
+        if bias_method == "auto":
+            bias_parm_idx = self.auto_methods.query("bias == 'parm'").index.tolist()
+        elif bias_method == "parm":
+            bias_parm_idx = self.sleep_statistics
+        elif bias_method == "regr":
+            bias_parm_idx = []
+        if loa_method == "auto":
+            loa_parm_idx = self.auto_methods.query("loa == 'parm'").index.tolist()
+        elif loa_method == "parm":
+            loa_parm_idx = self.sleep_statistics
+        elif loa_method == "regr":
+            loa_parm_idx = []
+        bias_regr_idx = [ss for ss in self.sleep_statistics if ss not in bias_parm_idx]
+        loa_regr_idx = [ss for ss in self.sleep_statistics if ss not in loa_parm_idx]
+        bias_parm = all_strings.loc[bias_parm_idx, ["bias_parm", "bias_parm_ci"]]
+        bias_regr = all_strings.loc[bias_regr_idx, ["bias_regr", "bias_regr_ci"]]
+        bias_parm.columns = bias_parm.columns.str.replace("_parm", "")
+        bias_regr.columns = bias_parm.columns.str.replace("_regr", "")
+        bias = pd.concat([bias_parm, bias_regr])
+        # bias = bias_parm.reindex(self.sleep_statistics).fillna(bias_regr)
+        loa_parm = all_strings.loc[loa_parm_idx, ["loa_parm", "loa_parm_ci"]]
+        loa_regr = all_strings.loc[loa_regr_idx, ["loa_regr", "loa_regr_ci"]]
+        loa_parm.columns = loa_parm.columns.str.replace("_parm", "")
+        loa_regr.columns = loa_regr.columns.str.replace("_regr", "")
+        loa = pd.concat([loa_parm, loa_regr])
+        return bias.join(loa, validate="1:1").sort_index(axis=0)
+
+    def summary(self, ci_method="auto"):
+        """
+        Return a dataframe that merges all the center values with their upper and lower confidence intervals.
+        There are always 2 options for CIs, so this is a convenient method to easily retrieve a set
+        of ALL values with their requested upper/lower bounds.
+        Returns a pandas DataFrame with 2-level multiindex as columns. with variable (bias) and 
+        interval (center, lower, upper)
         """
-        results = {}
-        # Bias
-        for (meth, sstat), df in self._bias_values.groupby(["method", "sleep_stat"]):
-            if meth == "standard":
-                fstr = "{mean:.2f} [{ci_lower:.2f}, {ci_upper:.2f}]"
-                bias = df.droplevel([0,1]).apply(lambda r: fstr.format(**r), axis=1).loc["b0"]
-            elif meth == "modeled":
-                fstr = "{b0_mean:.2f} [{b0_ci_lower:.2f}, {b0_ci_upper:.2f}] + {b1_mean:.2f} [{b1_ci_lower:.2f}, {b1_ci_upper:.2f}] x ref"
-                temp = df.unstack("variable").swaplevel(axis=1)
-                temp.columns = temp.columns.map("_".join)
-                bias = temp.apply(lambda r: fstr.format(**r), axis=1)[0]
-            results[sstat] = dict(bias=bias)
-        # LoA
-        for (meth, sstat), df in self._loa_values.groupby(["method", "sleep_stat"]):
-            if meth in ["standard", "residuals"]:
-                fstr = "{mean:.2f} [{ci_lower:.2f}, {ci_upper:.2f}]"
-                lower, upper = df.droplevel([0,1]).apply(lambda r: fstr.format(**r), axis=1).loc[["lower", "upper"]]
-            else:
-                fstr = "{c0_mean:.2f} [{c0_ci_lower:.2f}, {c0_ci_upper:.2f}] + {c1_mean:.2f} [{c1_ci_lower:.2f}, {c1_ci_upper:.2f}] x ref"
-                temp = df.unstack("variable").swaplevel(axis=1)
-                temp.columns = temp.columns.map("_".join)
-                lower = temp.apply(lambda r: fstr.format(**r), axis=1)[0]
-                upper = lower.copy()
-            results[sstat].update({"lower": lower, "upper": upper})
-
-        df = pd.DataFrame(results).T.rename_axis("sleep_stat")
+        assert isinstance(ci_method, str), "`ci_method` must be a string"
+        assert ci_method in self._ci_method_opts, f"`ci_method` must be one of {self._ci_method_opts}"
+        # Make sure relevant sleep statistics have bootstrapped CIs, generate them if not
+        if ci_method in ["boot", "auto"]:
+            if ci_method == "boot":
+                sleep_stats_to_boot = self.sleep_statistics
+            elif ci_method == "auto":
+                sleep_stats_to_boot = self.auto_methods.query("ci == 'boot'").index.tolist()
+            # Check if any of the sleep stats already have bootstrapped CIs (e.g., if user calls "auto" and then "boot")
+            sleep_stats_booted = self._ci["boot"].dropna().index
+            sleep_stats_to_boot = [s for s in sleep_stats_to_boot if s not in sleep_stats_booted]
+            if sleep_stats_to_boot:
+                self._generate_bootstrap_ci(sleep_stats=sleep_stats_to_boot)
+        if ci_method == "auto":
+            idx_boot, idx_parm = self.auto_methods.reset_index().groupby("ci", sort=True)["sleep_stat"].apply(list)
+            parm_vals = self._ci.loc[idx_parm, "parm"]
+            boot_vals = self._ci.loc[idx_boot, "boot"]
+            ci_vals = pd.concat([parm_vals, boot_vals])
+        else:
+            ci_vals = self._ci[ci_method]
+        # Add an extra level to values columns, indicating they are the center interval
+        center_vals = pd.concat({"center": self._vals}, names=["interval"], axis=1).swaplevel(axis=1)
+        df = center_vals.join(ci_vals, how="left", validate="1:1").astype(float).sort_index(axis=1)
         return df
 
-    def summary(self, **kwargs):
-        """Return a summary dataframe highlighting whether tests passed for each sleep statistic.
+    def calibrate(self, sstats_c, bias_method="auto"):
+        """Return a Series of adjusted sleep stats.
+        # input should be a dataframe like sstats_a and sstats_b
+        Sleep stats input are adjusted according to observed biases in observed relative to reference
+        Return adjusted sleep stats.
 
         Parameters
         ----------
-        self : :py:class:`yasa.SleepStatsAgreement`
-            A :py:class:`yasa.SleepStatsAgreement` instance.
-        **kwargs : key, value pairs
-            Additional keyword arguments are passed to :py:meth:`pandas.DataFrame.groupby.agg`.
-
-            >>> ssa.summary(func=["mean", "sem", "min", "max"])
+        sstats_c : :py:class:`pandas.DataFrame`
+            A :py:class:`pandas.DataFrame` with sleep statistics from an observed scorer.
+            Rows are unique observations and columns are unique sleep statistics.
+            Shape, index, and columns must be identical to ``ref_data`` and ``obs_data``.
+        bias_method : str
+            Bias method used for calibration: ``"parm"``, ``"regr"``, or ``"auto"`` (default).
 
         Returns
         -------
-        summary : :py:class:`pandas.DataFrame`
-            A :py:class:`pandas.DataFrame` with boolean values indicating the pass/fail status for
-            normality, proportional bias, and homoscedasticity tests (for each sleep statistic).
-        """
-        series_list = [
-            self.bias["biased"],
-            self.normality["normal"],
-            self.proportional_bias["bias_constant"],
-            self.homoscedasticity["equal_var"].rename("homoscedastic"),
-        ]
-        summary = pd.concat(series_list, axis=1)
-        mad = lambda df: (df - df.mean()).abs().mean()
-        mad.__name__ = "mad"  # Pandas uses this to name the aggregated column
-        agg_kwargs = {"func": [mad, "mean", "std"]} | kwargs
-        desc = self.data.groupby("sleep_stat").agg(**agg_kwargs)
-        desc.columns = desc.columns.map("_".join)
-        return summary.join(desc)
-
-    def plot_discrepancies_heatmap(self, sleep_stats=None, **kwargs):
-        """Visualize session-level discrepancies, generally for outlier inspection.
+        obs_data_calibrated : :py:class:`pandas.DataFrame`
+            A :py:class:`pandas.DataFrame` with calibrated sleep statistics from an observed scorer.
 
-        Parameters
-        ----------
-        sleep_stats : list or None
-            List of sleep statistics to plot. Default (None) is to plot all sleep statistics.
-        **kwargs : key, value pairs
-            Additional keyword arguments are passed to the :py:func:`seaborn.heatmap` call.
+        .. seealso:: :py:meth:`~yasa.SleepStatsAgreement.get_calibration_func`
 
-        Returns
+        Example
         -------
-        ax : :py:class:`matplotlib.axes.Axes`
-            Matplotlib Axes
+        >>> hyps_a = [yasa.simulate_hypnogram(tib=600, scorer="Henri", seed=i) for i in range(20)]
+        >>> hyps_b = [h.simulate_similar(tib=600, scorer="Piéron", seed=i) for i in range(20)]
+        >>> hyps_c = [h.simulate_similar(tib=600, scorer="Piéron", seed=i) for i in range(10)]
+        # sstats_a = pd.Series(hyps_a).map(lambda h: h.sleep_statistics()).apply(pd.Series)
+        # sstats_b = pd.Series(hyps_b).map(lambda h: h.sleep_statistics()).apply(pd.Series)
+        # sstats_c = pd.Series(hyps_c).map(lambda h: h.sleep_statistics()).apply(pd.Series)
+        # sstats_a.index = sstats_b.index = sstats_a.index.map(lambda x: f"sub-{x+1:03d}")
+        >>> agr = yasa.SleepStatsAgreement(sstats_a, sstats_b)
+        >>> sstats_c_calibrated = agr.calibrate(sstats_c)
+        >>> print(sstats_c_calibrated.round(2).head(5))
+        """
+        assert isinstance(sstats_c, pd.DataFrame)
+        assert all(col in self.sleep_statistics for col in sstats_c)
+        assert isinstance(bias_method, str)
+        assert bias_method in self._bias_method_opts
+        parm_adjusted = sstats_c + self._vals["bias_parm"]
+        regr_adjusted = sstats_c * self._vals["bias_slope"] + self._vals["bias_intercept"]
+        if bias_method == "parm":
+            return parm_adjusted
+        elif bias_method == "regr":
+            return regr_adjusted
+        elif bias_method == "auto":
+            parm_idx = self.auto_methods.query("bias == 'parm'").index.to_list()
+            bias_idx = [ss for ss in self.sleep_statistics if ss not in parm_idx]
+            return parm_adjusted[parm_idx].join(regr_adjusted[bias_idx]).dropna(axis=1)
+
+    def get_calibration_func(sleep_stat):
         """
-        assert isinstance(sleep_stats, (list, type(None))), "`sleep_stats` must be a list or None"
-        if sleep_stats is None:
-            sleep_stats = self.data.index.get_level_values("sleep_stat").unique()
-        heatmap_kwargs = {"cmap": "binary", "annot": True, "fmt": ".1f", "square": False}
-        heatmap_kwargs["cbar_kws"] = dict(label="Normalized discrepancy %")
-        if "cbar_kws" in kwargs:
-            heatmap_kwargs["cbar_kws"].update(kwargs["cbar_kws"])
-        heatmap_kwargs.update(kwargs)
-        table = self._diff_data[sleep_stats]
-        # Normalize statistics (i.e., columns) between zero and one then convert to percentage
-        table_norm = table.sub(table.min(), axis=1).div(table.apply(np.ptp)).multiply(100)
-        if heatmap_kwargs["annot"]:
-            # Use raw values for writing
-            heatmap_kwargs["annot"] = table.to_numpy()
-        return sns.heatmap(table_norm, **heatmap_kwargs)
-
-    def plot_discrepancies_dotplot(self, pairgrid_kwargs={"palette": "winter"}, **kwargs):
-        """Visualize session-level discrepancies, generally for outlier inspection.
-
-        Parameters
-        ----------
-        pairgrid_kwargs : dict
-            Keywords arguments passed to the :py:class:`seaborn.PairGrid` call.
-        **kwargs : key, value pairs
-            Additional keyword arguments are passed to the :py:func:`seaborn.stripplot` call.
 
-        Returns
-        -------
-        g : :py:class:`seaborn.PairGrid`
-            A :py:class:`seaborn.FacetGrid` with sleep statistics dotplots on each axis.
+        .. seealso:: :py:meth:`~yasa.SleepStatsAgreement.calibrate`
 
         Examples
         --------
-        To plot a limited subset of sleep statistics, use the ``x_vars`` keyword argument of
-        :py:class:`seaborn.PairGrid`.
-
-        .. plot::
-            ## TODO: Example using x_vars
-        """
-        assert isinstance(pairgrid_kwargs, dict), "`pairgrid_kwargs` must be a dict"
-        kwargs_stripplot = {"size": 10, "linewidth": 1, "edgecolor": "white"}
-        kwargs_stripplot.update(kwargs)
-        # Initialize the PairGrid
-        height = 0.3 * len(self._diff_data)
-        aspect = 0.6
-        kwargs_pairgrid = dict(hue=self.sleep_id_str, height=height, aspect=aspect)
-        kwargs_pairgrid.update(pairgrid_kwargs)
-        g = sns.PairGrid(
-            self._diff_data.reset_index(), y_vars=[self.sleep_id_str], **kwargs_pairgrid
-        )
-        # Draw the dots
-        g.map(sns.stripplot, orient="h", jitter=False, **kwargs_stripplot)
-        # Adjust aesthetics
-        for ax in g.axes.flat:
-            ax.set(title=ax.get_xlabel())
-            ax.margins(x=0.3)
-            ax.yaxis.grid(True)
-            ax.tick_params(left=False)
-        g.set(xlabel="", ylabel="")
-        sns.despine(left=True, bottom=True)
-        return g
-
-    def plot_blandaltman(self, facetgrid_kwargs={}, **kwargs):
-        """
-
-        **Use col_order=sstats_order for plotting a subset.
-
-        Parameters
-        ----------
-        facetgrid_kwargs : dict
-            Keyword arguments passed to the :py:class:`seaborn.FacetGrid` call.
-        **kwargs : key, value pairs
-            Additional keyword arguments are passed to :py:func:`pingouin.plot_blandaltman`.
-
-        Returns
-        -------
-        g : :py:class:`seaborn.FacetGrid`
-            A :py:class:`seaborn.FacetGrid` with sleep statistics Bland-Altman plots on each axis.
+        >>> ssa = yasa.SleepStatsAgreement(...)
+        >>> calibrate_rem = ssa.get_calibration_func("REM")
+        >>> new_obs_rem_vals = np.array([50, 40, 30, 20])
+        >>> calibrate_rem(new_obs_rem_vals)
+        >>> calibrate_rem(new_obs_rem_vals)
+        array([50, 40, 30, 20])
+        >>> calibrate_rem(new_obs_rem_vals, bias_test=False)
+        array([42.825, 32.825, 22.825, 12.825])
+        >>> calibrate_rem(new_obs_rem_vals, bias_test=False, method="regr")
+        array([ -9.33878878,  -9.86815607, -10.39752335, -10.92689064])
         """
-        kwargs_facetgrid = dict(col_wrap=4, height=2, aspect=1, sharex=False, sharey=False)
-        kwargs_facetgrid.update(facetgrid_kwargs)
-        kwargs_blandaltman = dict(xaxis="y", annotate=False, edgecolor="black", facecolor="none")
-        kwargs_blandaltman.update(kwargs)
-        # Initialize a grid of plots with an Axes for each sleep statistic
-        g = sns.FacetGrid(self.data.reset_index(), col="sleep_stat", **kwargs_facetgrid)
-        # Draw Bland-Altman plot on each axis
-        g.map(pg.plot_blandaltman, self.obs_scorer, self.ref_scorer, **kwargs_blandaltman)
-        # Adjust aesthetics
-        for ax in g.axes.flat:
-            # Tidy-up axis limits with symmetric y-axis and minimal ticks
-            bound = max(map(abs, ax.get_ylim()))
-            ax.set_ylim(-bound, bound)
-            ax.yaxis.set_major_locator(plt.MaxNLocator(nbins=2, integer=True, symmetric=True))
-            ax.xaxis.set_major_locator(plt.MaxNLocator(nbins=1, integer=True))
-        ylabel = " - ".join((self.obs_scorer, self.ref_scorer))
-        g.set_ylabels(ylabel)
-        g.set_titles(col_template="{col_name}")
-        g.tight_layout(w_pad=1, h_pad=2)
-        return g
+        assert isinstance(sleep_stat, str)
+        assert sleep_stat in self.sleep_statistics
+        parm, slope, intercept = ssa._vals.loc[ss, ["bias_parm", "bias_slope", "bias_intercept"]].to_numpy()
+        auto_method = ssa.auto_methods.at[ss, "bias"]
+        not_biased = ssa.assumptions.at[ss, "unbiased"]
+        def calibration_func(x, method="auto", bias_test=True):
+            x = np.array(x)
+            method = auto_method if method == "auto" else method
+            if bias_test and not_biased:  # If sleep stat is not statistically biased, don't calibrate
+                return x
+            elif method == "parm":
+                return x + parm
+            elif method == "regr":
+                return x * slope + intercept
+        return calibration_func

From 8e575bd2357f2694928fd98293e9d27397ac82ea Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Mon, 12 Feb 2024 09:21:21 -0500
Subject: [PATCH 39/43] formatting

---
 yasa/evaluation.py | 393 ++++++++++++++++++++++++++-------------------
 1 file changed, 231 insertions(+), 162 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index 3911c9c..adde5aa 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -12,15 +12,9 @@
 
 import numpy as np
 import pandas as pd
-import pingouin as pg
 import sklearn.metrics as skm
 from scipy import stats
 
-import seaborn as sns
-import matplotlib.pyplot as plt
-
-from yasa.plotting import plot_hypnogram
-
 
 logger = logging.getLogger("yasa")
 
@@ -174,7 +168,7 @@ def __init__(self, ref_hyps, obs_hyps):
 
         assert hasattr(ref_hyps, "__iter__"), "`ref_hyps` must be a an iterable"
         assert hasattr(obs_hyps, "__iter__"), "`obs_hyps` must be a an iterable"
-        assert type(ref_hyps) == type(obs_hyps), "`ref_hyps` and `obs_hyps` must be the same type"
+        assert type(ref_hyps) is type(obs_hyps), "`ref_hyps` and `obs_hyps` must be the same type"
         assert len(ref_hyps) == len(
             obs_hyps
         ), "`ref_hyps` and `obs_hyps` must have the same number of hypnograms"
@@ -376,7 +370,8 @@ def get_agreement(self, sample_weight=None, scorers=None):
         df = self.data.copy()
         if sample_weight is not None:
             assert sample_weight.index == self.data.index, (
-                "If not ``None``, ``sample_weight`` Series must be a pandas Series with same index as `self.data`"
+                "If not `None`, `sample_weight` Series must be a pandas Series with the same index "
+                "as `self.data`"
             )
             # Add weights as a third column for multi_scorer to use
             df["weights"] = sample_weight
@@ -406,9 +401,10 @@ def get_agreement_bystage(self, beta=1.0):
             A :py:class:`~pandas.DataFrame` with agreement metrics as columns and a
             :py:class:`~pandas.MultiIndex` with session and sleep stage as rows.
         """
-        scorer = lambda df: skm.precision_recall_fscore_support(
-            *df.values.T, beta=beta, labels=self._skm_labels, average=None, zero_division=0
-        )
+        def scorer(df):
+            return skm.precision_recall_fscore_support(
+                *df.values.T, beta=beta, labels=self._skm_labels, average=None, zero_division=0
+            )
         agreement = (
             self.data
             # Get precision, recall, f1, and support for each individual sleep session
@@ -638,7 +634,7 @@ def plot_hypnograms(self, sleep_id=None, legend=True, ax=None, ref_kwargs={}, ob
         assert isinstance(legend, (bool, dict)), "`legend` must be True, False, or a dictionary"
         assert isinstance(ref_kwargs, dict), "`ref_kwargs` must be a dictionary"
         assert isinstance(obs_kwargs, dict), "`obs_kwargs` must be a dictionary"
-        assert not "ax" in ref_kwargs | obs_kwargs, (
+        assert "ax" not in ref_kwargs | obs_kwargs, (
             "'ax' can't be supplied to `ref_kwargs` or `obs_kwargs`, use the `ax` keyword instead"
         )
         assert not (sleep_id is None and self.n_sleeps > 1), (
@@ -724,9 +720,11 @@ def summary(self, by_stage=False, **kwargs):
             assert hasattr(self, "_agreement"), (
                 "Must run `self.get_agreement` before obtaining summary results."
             )
+
         # Create a function for getting mean absolute deviation
-        mad = lambda df: (df - df.mean()).abs().mean()
-        mad.__name__ = "mad"  # Pandas uses this lambda attribute to name the aggregated column
+        def mad(df):
+            return (df - df.mean()).abs().mean()
+
         # Merge default and user kwargs
         agg_kwargs = {"func": [mad, "mean", "std", "min", "median", "max"]} | kwargs
         if by_stage:
@@ -750,12 +748,11 @@ def summary(self, by_stage=False, **kwargs):
 class SleepStatsAgreement:
     """
     Evaluate agreement between sleep statistics reported by two different scorers.
-
-    Features include:
     Evaluation includes bias and limits of agreement (as well as both their confidence intervals),
     various plotting options, and calibration functions for correcting biased values from the
     observed scorer.
 
+    Features include:
     * Get summary calculations of bias, limits of agreement, and their confidence intervals.
     * Test statistical assumptions of bias, limits of agreement, and their confidence intervals,
     and apply corrective procedures when the assumptions are not met.
@@ -821,7 +818,7 @@ class SleepStatsAgreement:
     >>> ref_scorer = "Henri"
     >>> obs_scorer = "Piéron"
     >>> ref_hyps = [yasa.simulate_hypnogram(tib=600, scorer=ref_scorer, seed=i) for i in range(20)]
-    >>> obs_hyps = [h.simulate_similar(tib=600, scorer=obs_scorer, seed=i) for i, h in enumerate(ref_hyps)]
+    >>> obs_hyps = [h.simulate_similar(scorer=obs_scorer, seed=i) for i, h in enumerate(ref_hyps)]
     >>> # Generate sleep statistics from hypnograms using EpochByEpochAgreement
     >>> eea = yasa.EpochByEpochAgreement(ref_hyps, obs_hyps)
     >>> sstats = eea.get_sleep_stats()
@@ -865,9 +862,7 @@ class SleepStatsAgreement:
     %N2         -0.23  -12.87, 12.40
     %N3          1.38  -17.67, 20.44
 
-    Generate a new observed dataset and calibrate the values based on bias present in original observed
-
-    >>> new_hyps = [h.simulate_similar(tib=600, scorer="Kelly", seed=i) for i, h in enumerate(obs_hyps)]
+    >>> new_hyps = [h.simulate_similar(scorer="Kelly", seed=i) for i, h in enumerate(obs_hyps)]
     >>> new_sstats = pd.Series(new_hyps).map(lambda h: h.sleep_statistics()).apply(pd.Series)
     >>> new_sstats = new_sstats[["N1", "TST", "WASO"]]
     >>> new_sstats.round(1).head(5)
@@ -913,31 +908,39 @@ def __init__(
         bootstrap_kwargs={},
     ):
 
+        restricted_bootstrap_kwargs = ["confidence_level", "vectorized", "paired"]
+
         assert isinstance(ref_data, pd.DataFrame), "`ref_data` must be a pandas DataFrame"
         assert isinstance(obs_data, pd.DataFrame), "`obs_data` must be a pandas DataFrame"
-        assert np.array_equal(
-            ref_data.index, obs_data.index
-        ), "`ref_data` and `obs_data` index values must be identical"
-        assert (
-            ref_data.index.name == obs_data.index.name
-        ), "`ref_data` and `obs_data` index names must be identical"
-        assert np.array_equal(
-            ref_data.columns, obs_data.columns
-        ), "`ref_data` and `obs_data` column values must be identical"
+        assert np.array_equal(ref_data.index, obs_data.index), (
+            "`ref_data` and `obs_data` index values must be identical"
+        )
+        assert ref_data.index.name == obs_data.index.name, (
+            "`ref_data` and `obs_data` index names must be identical"
+        )
+        assert np.array_equal(ref_data.columns, obs_data.columns), (
+            "`ref_data` and `obs_data` column values must be identical"
+        )
         assert isinstance(ref_scorer, str), "`ref_scorer` must be a string"
         assert isinstance(obs_scorer, str), "`obs_scorer` must be a string"
         assert ref_scorer != obs_scorer, "`ref_scorer` and `obs_scorer` must be unique"
-        assert isinstance(agreement, (float, int)) and agreement > 0, "`agreement` must be a number greater than 0"
-        assert isinstance(confidence, (float, int)) and 0 < alpha < 1, "`confidence` must be a number between 0 and 1"
-        assert isinstance(alpha, (float, int)) and 0 < alpha < 1, "`alpha` must be a number between 0 and 1"
+        assert isinstance(agreement, (float, int)) and agreement > 0, (
+            "`agreement` must be a number greater than 0"
+        )
+        assert isinstance(confidence, (float, int)) and 0 < alpha < 1, (
+            "`confidence` must be a number between 0 and 1"
+        )
+        assert isinstance(alpha, (float, int)) and 0 <= alpha <= 1, (
+            "`alpha` must be a number between 0 and 1 inclusive"
+        )
         assert isinstance(bootstrap_kwargs, dict), "`bootstrap_kwargs` must be a dictionary"
-        restricted_bootstrap_kwargs = ["confidence_level", "vectorized", "paired"]
-        assert all(k not in restricted_bootstrap_kwargs for k in bootstrap_kwargs), f"None of {restricted_bootstrap_kwargs} can be set by the user"
+        assert all(k not in restricted_bootstrap_kwargs for k in bootstrap_kwargs), (
+            f"None of {restricted_bootstrap_kwargs} can be set by the user"
+        )
 
         # If `ref_data` and `obs_data` indices are unnamed, name them
         session_key = "session_id" if ref_data.index.name is None else ref_data.index.name
-        ref_data.index.name = session_key
-        obs_data.index.name = session_key
+        ref_data.index.name = obs_data.index.name = session_key
 
         # Reshape to long format DataFrame with 2 columns (observed, reference) and MultiIndex
         data = (
@@ -949,7 +952,7 @@ def __init__(
         )
 
         # Get scorer differences (i.e., observed minus reference)
-        data["difference"] = data[obs_scorer].sub(data[ref_scorer])
+        data["difference"] = data[obs_scorer] - data[ref_scorer]
 
         # Remove sleep statistics that have no differences between scorers
         stats_rm = data.groupby("sleep_stat")["difference"].any().loc[lambda x: ~x].index.tolist()
@@ -957,13 +960,13 @@ def __init__(
         for s in stats_rm:
             logger.warning(f"Removed {s} from evaluation because all scorings were identical.")
 
-        # Create grouper variable for convenience
+        # Create grouper and n_sessions variables for convenience
         grouper = data.groupby("sleep_stat")
+        n_sessions = data.index.get_level_values(session_key).nunique()
 
         ########################################################################
         # Generate parametric Bias and LoA for all sleep stats
         ########################################################################
-        n_sessions = data.index.get_level_values(session_key).nunique()
         # Parametric Bias
         parm_vals = grouper["difference"].mean().to_frame("bias_parm")
         # Parametric LoA
@@ -972,8 +975,9 @@ def __init__(
         )
 
         ########################################################################
-        # Generate standard CIs for standard Bias and LoA for all sleep stats
+        # Generate standard CIs for parametric Bias and LoA for all sleep stats
         ########################################################################
+        # Get critical t and standard error used to calculate parametric CIs for parametric Bias/LoA
         t_parm = stats.t.ppf((1 + confidence) / 2, n_sessions - 1)
         sem = grouper["difference"].sem(ddof=1)
         # Parametric CIs for parametric Bias and LoA
@@ -990,22 +994,23 @@ def __init__(
         # Generate regression/modeled (slope and intercept) Bias and LoA for all sleep stats
         ########################################################################
         # Run regression used to (a) model bias and (b) test for proportional/constant bias
-        bias_regr = grouper[[ref_scorer, "difference"]].apply(self._get_linregress_as_dict).apply(pd.Series)
-        # Get residuals from this regression, bc they are needed to run the next regression for homoscedasticity test
+        bias_regr = grouper[[ref_scorer, "difference"]].apply(self._linregr_dict).apply(pd.Series)
+        # Get absolute residuals from this regression bc they are used in the next regression
         idx = data.index.get_level_values("sleep_stat")
         slopes = bias_regr.loc[idx, "slope"].to_numpy()
         intercepts = bias_regr.loc[idx, "intercept"].to_numpy()
         predicted_values = data[ref_scorer].to_numpy() * slopes + intercepts
         data["residuals"] = data[obs_scorer].to_numpy() - predicted_values
-        # Run regression used to (b) model LoA and (b) test for heteroscedasticity/homoscedasticity
         data["residuals_abs"] = data["residuals"].abs()
-        loa_regr = grouper[[ref_scorer, "residuals_abs"]].apply(self._get_linregress_as_dict).apply(pd.Series)
+        # Run regression used to (b) model LoA and (b) test for heteroscedasticity/homoscedasticity
+        loa_regr = grouper[[ref_scorer, "residuals_abs"]].apply(self._linregr_dict).apply(pd.Series)
         # Stack the two regression dataframes together
         regr = pd.concat({"bias": bias_regr, "loa": loa_regr}, axis=0)
 
         ########################################################################
         # Generate parametric CIs for regression/modeled Bias and LoA for all sleep stats
         ########################################################################
+        # Get critical t used to calculate parametric CIs for regression Bias/LoA
         t_regr = stats.t.ppf((1 + confidence) / 2, n_sessions - 2)  # dof=n-2 for regression
         # Parametric CIs for modeled Bias and LoA
         regr_ci = pd.DataFrame({
@@ -1019,9 +1024,10 @@ def __init__(
         # Test all statistical assumptions
         ########################################################################
         assumptions = pd.DataFrame({
-            "unbiased": grouper["difference"].apply(lambda a: stats.ttest_1samp(a, 0).pvalue).ge(alpha),
+            "unbiased": (
+                grouper["difference"].apply(lambda a: stats.ttest_1samp(a, 0).pvalue).ge(alpha)
+            ),
             "normal": grouper["difference"].apply(lambda a: stats.shapiro(a).pvalue).ge(alpha),
-            # "normal": grouper["difference"].apply(stats.shapiro).str[1].ge(alpha),
             "constant_bias": bias_regr["pvalue"].ge(alpha),
             "homoscedastic": loa_regr["pvalue"].ge(alpha),
         })
@@ -1042,15 +1048,17 @@ def __init__(
         ci.columns = pd.MultiIndex.from_tuples(
             tuples=ci.columns.str.split("-", expand=True), names=["variable", "interval"],
         )
-        ci = pd.concat({"parm": ci, "boot": pd.DataFrame().reindex_like(ci)}, names=["ci_method"], axis=1)
+        empty_df = pd.DataFrame().reindex_like(ci)
+        ci = pd.concat({"parm": ci, "boot": empty_df}, names=["ci_method"], axis=1)
         ci = ci.sort_index(axis=1)  # Sort MultiIndex columns for cleanliness
 
+        # Set attributes
         self._agreement = agreement
         self._confidence = confidence
         self._bootstrap_kwargs = bootstrap_kwargs
+        self._n_sessions = n_sessions
         self._ref_scorer = ref_scorer
         self._obs_scorer = obs_scorer
-        self._n_sessions = n_sessions
         self._data = data
         self._assumptions = assumptions
         self._regr = regr
@@ -1072,40 +1080,38 @@ def obs_scorer(self):
 
     @property
     def n_sessions(self):
-        """The number of sleep sessions."""
+        """The number of sessions."""
         return self._n_sessions
 
     @property
     def data(self):
-        """A :py:class:`pandas.DataFrame` containing all sleep statistics from ``ref_data`` and
-        ``obs_data`` as well as their difference scores (``obs_data`` minus ``ref_data``).
-        Long format.
+        """A long-format :py:class:`pandas.DataFrame` containing all raw sleep statistics from
+        ``ref_data`` and ``obs_data``.
         """
         return self._data.drop(columns=["difference", "residuals", "residuals_abs"])
 
+    @property
+    def sleep_statistics(self):
+        """Return a list of all sleep statistics included in the agreement analyses."""
+        return self.data.index.get_level_values("sleep_stat").unique().to_list()
+
     @property
     def assumptions(self):
-        """A :py:class:`pandas.DataFrame` containing boolean values for all statistical tests used
-        to test assumptions.
+        """A :py:class:`pandas.DataFrame` containing boolean values indicating the pass/fail status
+        of all statistical tests performed to test assumptions.
         """
         return self._assumptions
 
-    @property
-    def sleep_statistics(self):
-        """Return a list of all sleep stats included in the agreement analyses."""
-        return self.data.index.get_level_values("sleep_stat").unique().to_list()
-
     @property
     def auto_methods(self):
-        """
-        A :py:class:`pandas.DataFrame` containing the methods applied when ``'auto'`` is selected.
+        """A :py:class:`pandas.DataFrame` containing the methods applied when ``'auto'`` is
+        selected.
         """
         return pd.concat(
             [
                 self.assumptions["constant_bias"].map({True: "parm", False: "regr"}).rename("bias"),
                 self.assumptions["homoscedastic"].map({True: "parm", False: "regr"}).rename("loa"),
                 self.assumptions["normal"].map({True: "parm", False: "boot"}).rename("ci"),
-                self.assumptions["unbiased"].map({True: "calibrate", False: "uncalibrated"}).rename("calibration"),
             ],
             axis=1,
         )
@@ -1122,7 +1128,7 @@ def __repr__(self):
         )
 
     def __str__(self):
-        return __repr__()
+        return self.__repr__()
 
     ############################################################################
     # Define some utility functions, mostly to aid with the use of df.apply and stats.bootstrap
@@ -1130,15 +1136,16 @@ def __str__(self):
 
     @staticmethod
     def _arr_to_loa(x, agreement):
+        """Return a tuple with lower and upper limits of agreement."""
         mean = np.mean(x)
         bound = agreement * np.std(x, ddof=1)
-        return mean-bound, mean+bound
+        return mean - bound, mean + bound
 
     @staticmethod
-    def _get_linregress_as_dict(*args, **kwargs):
+    def _linregr_dict(*args, **kwargs):
         """
         A wrapper around :py:func:`scipy.stats.linregress` that returns a dictionary instead of a
-        named tuple. In the normally returned object, `intercept_stderr` is an extra field that is
+        named tuple. In the normally returned object, ``intercept_stderr`` is an extra field that is
         not included when converting the named tuple, so this allows it to be included when using
         something like groupby.
         """
@@ -1154,34 +1161,40 @@ def _get_linregress_as_dict(*args, **kwargs):
 
     def _generate_bootstrap_ci(self, sleep_stats):
         """
-        Generate bootstrapped confidence intervals for bias and limits of agreement. This operates
-        in-place by concatenating bootstrapped CIs to existing parametric CIs (the latter are
-        calculated by default during initialization).
+        Internal method to generate bootstrapped confidence intervals for bias and LoA.
+        This operates in-place by concatenating bootstrapped CIs to existing parametric CIs.
+        Note that parametric CIs are generated by default during init (bc they are quicker).
+
+        Parameters
+        ----------
+        sleep_stats : list
+            A list of sleep statistics to bootstrap confidence intervals for.
         """
         assert isinstance(sleep_stats, list), "`sleep_stats` must be a list"
         assert len(sleep_stats) == len(set(sleep_stats)), "elements of `sleep_stats` must be unique"
-        assert all(isinstance(ss, str) for ss in sleep_stats), "elements of `sleep_stats` must be strings"
-        assert all(ss in self.sleep_statistics for ss in sleep_stats)
-        # sleep_stats_to_boot = pd.Index(sleep_stats).difference(sleep_stats_booted)
-        # grouper = self._data.loc[sleep_stats_to_boot].groupby("sleep_stat")
-        # Update bootstrap keywords arguments with defaults
+        assert all(isinstance(ss, str) for ss in sleep_stats), (
+            "all elements of `sleep_stats` must be strings"
+        )
+        assert all(ss in self.sleep_statistics for ss in sleep_stats), (
+            f"all elements of `sleep_stats` must be one of {self.sleep_statistics}"
+        )
+        # Update bootstrap keyword arguments with defaults
         bs_kwargs = {
             "n_resamples": 1000,
             "method": "BCa",
-            "confidence_level": self._confidence,  # should not change from parametric confidence level
-            "vectorized": False,  # should stay False
-            "paired": True,  # should be True, especially if method is BCa
+            "confidence_level": self._confidence,  # should not change from parametric level
+            "vectorized": False,  # should stay False, bc of how the custom get_vars function works
+            "paired": True,  # should stay True, especially if method is BCa
         } | self._bootstrap_kwargs
 
-        def boot_stats(ref_arr, diff_arr, rabs_arr):
-            # Wrap around all the stats to bootstrap, to avoid redundant scipy.stats.bootstrap calls
-            # Order of arrays is dependent on the column order used when calling grouper.apply
+        def get_vars(ref_arr, diff_arr, rabs_arr):
+            """A function to get all variables at once and avoid redundant stats.bootstrap calls."""
             bias_parm = np.mean(diff_arr)
             lloa_parm, uloa_parm = self._arr_to_loa(diff_arr, self._agreement)
-            bias_slope, bias_intercept = stats.linregress(ref_arr, diff_arr)[:2]
-            # Note this is not recalculating residuals each time for the next regression
-            loa_slope, loa_intercept = stats.linregress(ref_arr, rabs_arr)[:2]
-            return bias_parm, lloa_parm, uloa_parm, bias_intercept, bias_slope, loa_intercept, loa_slope
+            bias_slope, bias_inter = stats.linregress(ref_arr, diff_arr)[:2]
+            # Note this is NOT recalculating residuals each time for the next regression
+            loa_slope, loa_inter = stats.linregress(ref_arr, rabs_arr)[:2]
+            return bias_parm, lloa_parm, uloa_parm, bias_inter, bias_slope, loa_inter, loa_slope
 
         # !! Column order MUST match the order of arrays boot_stats expects as INPUT
         # !! Variable order MUST match the order of floats boot_stats returns as OUTPUT
@@ -1196,18 +1209,19 @@ def boot_stats(ref_arr, diff_arr, rabs_arr):
             "loa_intercept",
             "loa_slope",
         ]
-        boot_ci = (self._data
+        boot_ci = (
+            self._data
             .loc[sleep_stats, column_order]  # Extract the relevant sleep stats and columns
             .groupby("sleep_stat")  # Group so the bootstrapping is applied once to each sleep stat
             # Apply the bootstrap function, where tuple(df.to_numpy().T) convert the 3 columns
             # of the passed dataframe to a tuple of 3 1D arrays
-            .apply(lambda df: stats.bootstrap(tuple(df.to_numpy().T), boot_stats, **bs_kwargs))
+            .apply(lambda df: stats.bootstrap(tuple(df.to_numpy().T), get_vars, **bs_kwargs))
             .map(lambda res: res.confidence_interval)  # Pull high/low CIs out of the results object
             .explode()  # Break high and low CIs into separate rows
             .to_frame("value")  # Convert to dataframe and name column
             .assign(interval=interval_order * len(sleep_stats))  # Add a column indicating interval
             .explode("value")  # Break low CI variables and high CI variables out of arrays
-            .assign(variable=variable_order * len(sleep_stats) * 2)  # Add a column indicating variable
+            .assign(variable=variable_order * len(sleep_stats) * 2)  # Add a column for variable
             .pivot(columns=["variable", "interval"], values="value")  # Go long to wide format
             .sort_index(axis=1)  # Sort MultiIndex columns for cleanliness
         )
@@ -1215,8 +1229,8 @@ def boot_stats(ref_arr, diff_arr, rabs_arr):
         self._ci["boot"] = self._ci["boot"].fillna(boot_ci)
 
     def get_table(self, bias_method="auto", loa_method="auto", ci_method="auto", fstrings={}):
-        """Return a pandas dataframe with bias, loa, bias_ci, loa_ci as string equations.
-        For all sleep stats, then index later what you want.
+        """
+        Return a :py:class:`~pandas.DataFrame` with bias, loa, bias_ci, loa_ci as string equations.
 
         Parameters
         ----------
@@ -1244,49 +1258,60 @@ def get_table(self, bias_method="auto", loa_method="auto", ci_method="auto", fst
             If  ``'auto'`` (default), confidence intervals are represented using a bootstrap
             resampling procedure for sleep statistics where the distribution of score differences is
             non-normal and using a standard t-distribution otherwise.
+        fstrings : dict
+            Optional custom strings for formatting cells.
 
         Returns
         -------
         table : :py:class:`pandas.DataFrame`
-            A :py:class:`~pandas.DataFrame` of string representations of bias, limits of agreement,
+            A :py:class:`~pandas.DataFrame` of string representations for bias, limits of agreement,
             and their confidence intervals for all sleep statistics.
-
-        Examples
-        --------
-
         """
         assert isinstance(bias_method, str), "`bias_method` must be a string"
-        assert bias_method in self._bias_method_opts, f"`bias_method` must be one of {self._bias_method_opts}"
+        assert bias_method in self._bias_method_opts, (
+            f"`bias_method` must be one of {self._bias_method_opts}"
+        )
         assert isinstance(loa_method, str), "`loa_method` must be a string"
-        assert loa_method in self._loa_method_opts, f"`loa_method` must be one of {self._loa_method_opts}"
+        assert loa_method in self._loa_method_opts, (
+            f"`loa_method` must be one of {self._loa_method_opts}"
+        )
         assert isinstance(fstrings, dict), "`fstrings` must be a dictionary"
-        loa_regr_agreement = self._agreement * np.sqrt(np.pi / 2)  # Agreement gets adjusted when LoA is modeled
+        # Agreement gets adjusted when LoA is modeled
+        loa_regr_agreement = self._agreement * np.sqrt(np.pi / 2)
         if not fstrings:
             fstrings = {
                 "bias_parm": "{bias_parm_center:.2f}",
                 "bias_regr": "{bias_intercept_center:.2f} + {bias_slope_center:.2f}x",
                 "loa_parm": "{lloa_parm_center:.2f}, {uloa_parm_center:.2f}",
-                "loa_regr": "Bias \u00B1 {loa_regr_agreement:.2f} * ({loa_intercept_center:.2f} + {loa_slope_center:.2f}x)",
+                "loa_regr": (
+                    "Bias \u00B1 {loa_regr_agreement:.2f} "
+                    "* ({loa_intercept_center:.2f} + {loa_slope_center:.2f}x)"
+                ),
                 "bias_parm_ci": (
                     "[{bias_parm_lower:.2f}, {bias_parm_upper:.2f}]"
                 ),
                 "bias_regr_ci": (
-                    "[{bias_intercept_lower:.2f}, {bias_intercept_upper:.2f}], [{bias_slope_lower:.2f}, {bias_slope_upper:.2f}]"
+                    "[{bias_intercept_lower:.2f}, {bias_intercept_upper:.2f}], "
+                    "[{bias_slope_lower:.2f}, {bias_slope_upper:.2f}]"
                 ),
                 "loa_parm_ci": (
-                    "[{lloa_parm_lower:.2f}, {lloa_parm_upper:.2f}], [{uloa_parm_lower:.2f}, {uloa_parm_upper:.2f}]"
+                    "[{lloa_parm_lower:.2f}, {lloa_parm_upper:.2f}], "
+                    "[{uloa_parm_lower:.2f}, {uloa_parm_upper:.2f}]"
                 ),
                 "loa_regr_ci": (
-                    "[{loa_intercept_lower:.2f}, {loa_intercept_upper:.2f}], [{loa_slope_lower:.2f}, {loa_slope_upper:.2f}]"
+                    "[{loa_intercept_lower:.2f}, {loa_intercept_upper:.2f}], "
+                    "[{loa_slope_lower:.2f}, {loa_slope_upper:.2f}]"
                 ),
             }
-        # fstrings["loa_regr"] = fstrings["loa_regr"].replace("loa_regr_agreement", str(loa_regr_agreement))
         values = self.summary(ci_method=ci_method)
         values.columns = values.columns.map("_".join)  # Convert MultiIndex columns to Index
-        values["loa_regr_agreement"] = loa_regr_agreement  # Add a column of regr agreement so it can be used as variable
-        def return_all_the_strings(row, fstrings_dict):
+        # Add a column of regr agreement so it can be used as variable
+        values["loa_regr_agreement"] = loa_regr_agreement
+
+        def format_all_str(row, fstrings_dict):
             return {var: fstr.format(**row) for var, fstr in fstrings_dict.items()}
-        all_strings = values.apply(return_all_the_strings, fstrings_dict=fstrings, axis=1).apply(pd.Series)
+
+        all_strings = values.apply(format_all_str, fstrings_dict=fstrings, axis=1).apply(pd.Series)
         if bias_method == "auto":
             bias_parm_idx = self.auto_methods.query("bias == 'parm'").index.tolist()
         elif bias_method == "parm":
@@ -1306,99 +1331,124 @@ def return_all_the_strings(row, fstrings_dict):
         bias_parm.columns = bias_parm.columns.str.replace("_parm", "")
         bias_regr.columns = bias_parm.columns.str.replace("_regr", "")
         bias = pd.concat([bias_parm, bias_regr])
-        # bias = bias_parm.reindex(self.sleep_statistics).fillna(bias_regr)
         loa_parm = all_strings.loc[loa_parm_idx, ["loa_parm", "loa_parm_ci"]]
         loa_regr = all_strings.loc[loa_regr_idx, ["loa_regr", "loa_regr_ci"]]
         loa_parm.columns = loa_parm.columns.str.replace("_parm", "")
         loa_regr.columns = loa_regr.columns.str.replace("_regr", "")
         loa = pd.concat([loa_parm, loa_regr])
-        return bias.join(loa, validate="1:1").sort_index(axis=0)
+        table = bias.join(loa, validate="1:1").sort_index(axis=0)
+        return table
 
     def summary(self, ci_method="auto"):
         """
-        Return a dataframe that merges all the center values with their upper and lower confidence intervals.
-        There are always 2 options for CIs, so this is a convenient method to easily retrieve a set
-        of ALL values with their requested upper/lower bounds.
-        Returns a pandas DataFrame with 2-level multiindex as columns. with variable (bias) and 
-        interval (center, lower, upper)
+        Return a :py:class:`~pandas.DataFrame` that includes all calculated metrics:
+        * Parametric bias
+        * Parametric lower and upper limits of agreement
+        * Regression intercept and slope for modeled bias
+        * Regression intercept and slope for modeled limits of agreement
+        * Lower and upper confidence intervals for all metrics
+
+        Parameters
+        ----------
+        ci_method : str
+            If ``'parm'`` (i.e., parametric), confidence intervals are always represented using a
+            standard t-distribution.
+            If ``'boot'`` (i.e., bootstrap), confidence intervals are always represented using a
+            bootstrap resampling procedure.
+            If  ``'auto'`` (default), confidence intervals are represented using a bootstrap
+            resampling procedure for sleep statistics where the distribution of score differences is
+            non-normal and using a standard t-distribution otherwise.
+
+        Returns
+        -------
+        summary : :py:class:`pandas.DataFrame`
+            A :py:class:`~pandas.DataFrame` of float values for bias, limits of agreement,
+            and their confidence intervals for all sleep statistics.
         """
         assert isinstance(ci_method, str), "`ci_method` must be a string"
-        assert ci_method in self._ci_method_opts, f"`ci_method` must be one of {self._ci_method_opts}"
-        # Make sure relevant sleep statistics have bootstrapped CIs, generate them if not
+        assert ci_method in self._ci_method_opts, f"`ci_method` must be in {self._ci_method_opts}"
+        # Make sure relevant sleep statistics have bootstrapped CIs, and generate them if not
         if ci_method in ["boot", "auto"]:
             if ci_method == "boot":
                 sleep_stats_to_boot = self.sleep_statistics
             elif ci_method == "auto":
                 sleep_stats_to_boot = self.auto_methods.query("ci == 'boot'").index.tolist()
-            # Check if any of the sleep stats already have bootstrapped CIs (e.g., if user calls "auto" and then "boot")
+            # Remove sleep stats that already have bootstrapped CIs (eg "boot" called after "auto")
             sleep_stats_booted = self._ci["boot"].dropna().index
             sleep_stats_to_boot = [s for s in sleep_stats_to_boot if s not in sleep_stats_booted]
             if sleep_stats_to_boot:
                 self._generate_bootstrap_ci(sleep_stats=sleep_stats_to_boot)
         if ci_method == "auto":
-            idx_boot, idx_parm = self.auto_methods.reset_index().groupby("ci", sort=True)["sleep_stat"].apply(list)
-            parm_vals = self._ci.loc[idx_parm, "parm"]
-            boot_vals = self._ci.loc[idx_boot, "boot"]
+            parm_idx = self.auto_methods.query("ci == 'parm'").index.to_list()
+            boot_idx = [ss for ss in self.sleep_statistics if ss not in parm_idx]
+            parm_vals = self._ci.loc[parm_idx, "parm"]
+            boot_vals = self._ci.loc[boot_idx, "boot"]
             ci_vals = pd.concat([parm_vals, boot_vals])
         else:
             ci_vals = self._ci[ci_method]
         # Add an extra level to values columns, indicating they are the center interval
-        center_vals = pd.concat({"center": self._vals}, names=["interval"], axis=1).swaplevel(axis=1)
-        df = center_vals.join(ci_vals, how="left", validate="1:1").astype(float).sort_index(axis=1)
-        return df
+        centr_vals = pd.concat({"center": self._vals}, names=["interval"], axis=1).swaplevel(axis=1)
+        summary = centr_vals.join(ci_vals, how="left", validate="1:1").astype(float)
+        return summary.sort_index(axis=1)
 
-    def calibrate(self, sstats_c, bias_method="auto"):
-        """Return a Series of adjusted sleep stats.
-        # input should be a dataframe like sstats_a and sstats_b
-        Sleep stats input are adjusted according to observed biases in observed relative to reference
-        Return adjusted sleep stats.
+    def calibrate(self, data, bias_method="auto", adjust_all=False):
+        """
+        Calibrate a :py:class:`~pandas.DataFrame` of sleep statistics from a new scorer based on
+        observed biases in ``obs_data``/``obs_scorer``.
 
         Parameters
         ----------
-        obs_data : :py:class:`pandas.DataFrame`
+        data : :py:class:`pandas.DataFrame`
             A :py:class:`pandas.DataFrame` with sleep statistics from an observed scorer.
             Rows are unique observations and columns are unique sleep statistics.
-            Shape, index, and columns must be identical to ``ref_data`` and ``obs_data``.
         bias_method : str
-            Name of the reference scorer.
+            If ``'parm'``, sleep statistics are always adjusted based on parametric bias.
+            If ``'regr'``, sleep statistics are always adjusted based on regression-modeled bias.
+            If ``'auto'`` (default), sleep statistics are adjusted by either ``'parm'`` or
+            ``'regr'``, depending on assumption violations.
+
+            .. seealso:: :py:meth:`~yasa.SleepStatsAgreement.summary`
+
+        adjust_all: bool
+            If False (default), only adjust values for sleep statistics that showed a statistically
+            significant bias in the ``obs_data``. If True, adjust values for all sleep statistics.
 
         Returns
         -------
-        obs_data_calibrated : :py:class:`pandas.DataFrame`
-            A :py:class:`pandas.DataFrame` with calibrated sleep statistics from an observed scorer.
+        calibrated_data : :py:class:`pandas.DataFrame`
+            A :py:class:`~pandas.DataFrame` with calibrated sleep statistics.
 
         .. seealso:: :py:meth:`~yasa.SleepStatsAgreement.calibrate`
-
-        Example
-        -------
-        >>> hyps_a = [yasa.simulate_hypnogram(tib=600, scorer="Henri", seed=i) for i in range(20)]
-        >>> hyps_b = [h.simulate_similar(tib=600, scorer="Piéron", seed=i) for i in range(20)]
-        >>> hyps_c = [h.simulate_similar(tib=600, scorer="Piéron", seed=i) for i in range(10)]
-        # sstats_a = pd.Series(hyps_a).map(lambda h: h.sleep_statistics()).apply(pd.Series)
-        # sstats_b = pd.Series(hyps_b).map(lambda h: h.sleep_statistics()).apply(pd.Series)
-        # sstats_c = pd.Series(hyps_c).map(lambda h: h.sleep_statistics()).apply(pd.Series)
-        # sstats_a.index = sstats_b.index = sstats_a.index.map(lambda x: f"sub-{x+1:03d}")
-        >>> agr = yasa.SleepStatsAgreement(sstats_a, sstats_b)
-        >>> sstats_c_calibrated = agr.calibrate(sstats_c)
-        >>> print(sstats_c_calibrated.round(2).head(5))
         """
-        assert isinstance(sstats_c, pd.DataFrame)
-        assert all(col in self.sleep_statistics for col in sstats_c)
-        assert isinstance(bias_method, str)
-        assert bias_method in self._bias_method_opts
-        parm_adjusted = sstats_c + self._vals["bias_parm"]
-        regr_adjusted = sstats_c * self._vals["bias_slope"] + self._vals["bias_intercept"]
+        assert isinstance(data, pd.DataFrame), "`data` must be a pandas DataFrame"
+        assert all(col in self.sleep_statistics for col in data), (
+            f"all columns of `data` must be valid sleep statistics: {self.sleep_statistics}"
+        )
+        assert isinstance(bias_method, str), "`bias_method` must be a string"
+        assert bias_method in self._bias_method_opts, (
+            f"`bias_method` must be one of {self._bias_method_opts}"
+        )
+        assert isinstance(adjust_all, bool), "`adjust_all` must be True or False"
+        parm_adjusted = data + self._vals["bias_parm"]
+        regr_adjusted = data * self._vals["bias_slope"] + self._vals["bias_intercept"]
         if bias_method == "parm":
-            return parm_adjusted
+            calibrated_data = parm_adjusted
         elif bias_method == "regr":
-            return regr_adjusted
+            calibrated_data = regr_adjusted
         elif bias_method == "auto":
             parm_idx = self.auto_methods.query("bias == 'parm'").index.to_list()
-            bias_idx = [ss for ss in self.sleep_statistics if ss not in parm_idx]
-            return parm_adjusted[parm_idx].join(regr_adjusted[bias_idx]).dropna(axis=1)
-
-    def get_calibration_func(sleep_stat):
+            regr_idx = [ss for ss in self.sleep_statistics if ss not in parm_idx]
+            calibrated_data = parm_adjusted[parm_idx].join(regr_adjusted[regr_idx]).dropna(axis=1)
+        if not adjust_all:
+            # Put the raw values back for sleep stats that don't show statistical bias
+            unbiased_sstats = self.assumptions.query("unbiased == True").index.to_list()
+            calibrated_data[unbiased_sstats] = data[unbiased_sstats]
+        return calibrated_data
+
+    def get_calibration_func(self, sleep_stat):
         """
+        Return a function for calibrating a specific sleep statistic, based on observed biases in
+        ``obs_data``/``obs_scorer``.
 
         .. seealso:: :py:meth:`~yasa.SleepStatsAgreement.calibrate`
 
@@ -1417,16 +1467,35 @@ def get_calibration_func(sleep_stat):
         """
         assert isinstance(sleep_stat, str)
         assert sleep_stat in self.sleep_statistics
-        parm, slope, intercept = ssa._vals.loc[ss, ["bias_parm", "bias_slope", "bias_intercept"]].to_numpy()
-        auto_method = ssa.auto_methods.at[ss, "bias"]
-        not_biased = ssa.assumptions.at[ss, "unbiased"]
-        def calibration_func(x, method="auto", bias_test=True):
-            x = np.array(x)
+        columns = ["bias_parm", "bias_slope", "bias_intercept"]
+        parm, slope, intercept = self._vals.loc[sleep_stat, columns]
+        auto_method = self.auto_methods.at[sleep_stat, "bias"]
+        not_biased = self.assumptions.at[sleep_stat, "unbiased"]
+
+        def calibration_func(x, method="auto", adjust_all=False):
+            """Calibrate values for sleep statistic.
+
+            Parameters
+            ----------
+            x : array
+                Values to be calibrated
+            method: str
+                Method of bias calculation for calibration (``'parm'``, ``'regr'``, or ``'auto'``).
+            adjust_all : bool
+                If False, only adjust sleep stat if observed bias was statistically significant.
+
+            Returns
+            -------
+            x_calibrated : :py:class:`numpy.array`
+                An array of calibrated x values.
+            """
+            x = np.asarray(x)
             method = auto_method if method == "auto" else method
-            if bias_test and not_biased:  # If sleep stat is not statistically biased, don't calibrate
+            if not_biased and not adjust_all:  # Return input if not statistically biased
                 return x
             elif method == "parm":
                 return x + parm
             elif method == "regr":
                 return x * slope + intercept
+
         return calibration_func

From 8dbfff2ce22f755b356d7a49318cd222464b8be0 Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Mon, 26 Feb 2024 10:40:59 -0500
Subject: [PATCH 40/43] remove pingouin requirement

---
 requirements.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 1fd261a..2fb7b1b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,4 +14,3 @@ sleepecg>=0.5.0
 joblib
 antropy
 lightgbm
-pingouin>=0.5.3

From 0447edc4fe0c458d0f4cf3916481d65e9096a57c Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Mon, 26 Feb 2024 10:41:22 -0500
Subject: [PATCH 41/43] addressed minor PR review comments

---
 yasa/evaluation.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index adde5aa..b93e6b6 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -85,7 +85,7 @@ class EpochByEpochAgreement:
     >>> ebe = yasa.EpochByEpochAgreement(ref_hyps, obs_hyps)
     >>> agr = ebe.get_agreement()
     >>> agr.head(5).round(2)
-              accuracy  balanced_acc  kappa   mcc  precision  recall  fbeta
+              accuracy  balanced_acc  kappa   mcc  precision  recall     f1
     sleep_id
     1             0.31          0.26   0.07  0.07       0.31    0.31   0.31
     2             0.33          0.33   0.14  0.14       0.35    0.33   0.34
@@ -297,11 +297,15 @@ def multi_scorer(df, scorers):
         scores : dict
             A dictionary with scorer names (``str``) as keys and scores (``float``) as values.
         """
-        assert isinstance(scorers, dict)
-        assert all(isinstance(k, str) and callable(v) for k, v in scorers.items())
+        assert isinstance(df, pd.DataFrame), "`df` must be a pandas DataFrame"
+        assert df.shape[1] in [2, 3], "`df` must have either 2 or 3 columns"
+        assert isinstance(scorers, dict), "`scorers` must be a dictionary"
+        assert all(isinstance(k, str) and callable(v) for k, v in scorers.items()), (
+            "Each key of `scorers` must be a string, and each value must be a callable function"
+        )
         if df.shape[1] == 3:
             true, pred, weights = zip(*df.values)
-        else:
+        elif df.shape[1] == 2:
             true, pred = zip(*df.values)  # Same as (df["col1"], df["col2"]) but teensy bit faster
             weights = None
         scores = {s: f(true, pred, weights) for s, f in scorers.items()}
@@ -359,8 +363,8 @@ def get_agreement(self, sample_weight=None, scorers=None):
                 "recall": lambda t, p, w: skm.recall_score(
                     t, p, average="weighted", sample_weight=w, zero_division=0
                 ),
-                "fbeta": lambda t, p, w: skm.fbeta_score(
-                    t, p, beta=1, average="weighted", sample_weight=w, zero_division=0
+                "f1": lambda t, p, w: skm.f1_score(
+                    t, p, average="weighted", sample_weight=w, zero_division=0
                 ),
             }
         elif isinstance(scorers, list):

From ada6ab0fc3ab1ffc0fbdcefa2d6fd861989a0eab Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Mon, 26 Feb 2024 11:33:12 -0500
Subject: [PATCH 42/43] formatting

---
 yasa/evaluation.py | 95 +++++++++++++++++++++++-----------------------
 1 file changed, 48 insertions(+), 47 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index b93e6b6..fc4d9c0 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -12,8 +12,8 @@
 
 import numpy as np
 import pandas as pd
+import scipy.stats as sps
 import sklearn.metrics as skm
-from scipy import stats
 
 
 logger = logging.getLogger("yasa")
@@ -164,50 +164,49 @@ class EpochByEpochAgreement:
     """
 
     def __init__(self, ref_hyps, obs_hyps):
-        from yasa.hypno import Hypnogram  # Avoiding circular import
+        from yasa.hypno import Hypnogram  # Avoiding circular import, bc hypno imports this class
 
         assert hasattr(ref_hyps, "__iter__"), "`ref_hyps` must be a an iterable"
         assert hasattr(obs_hyps, "__iter__"), "`obs_hyps` must be a an iterable"
         assert type(ref_hyps) is type(obs_hyps), "`ref_hyps` and `obs_hyps` must be the same type"
-        assert len(ref_hyps) == len(
-            obs_hyps
-        ), "`ref_hyps` and `obs_hyps` must have the same number of hypnograms"
+        assert len(ref_hyps) == len(obs_hyps), (
+            "`ref_hyps` and `obs_hyps` must have the same number of hypnograms"
+        )
 
         if isinstance(ref_hyps, dict):
             # If user provides dictionaries, split into sleep IDs and hypnograms
-            assert (
-                ref_hyps.keys() == obs_hyps.keys()
-            ), "keys in `ref_hyps` must be the same as keys in `obs_hyps`"
+            assert ref_hyps.keys() == obs_hyps.keys(), (
+                "keys in `ref_hyps` must be the same as keys in `obs_hyps`"
+            )
             sleep_ids, ref_hyps = zip(*ref_hyps.items())
             obs_hyps = tuple(obs_hyps.values())
         else:
             # Create hypnogram_ids
             sleep_ids = tuple(range(1, 1 + len(ref_hyps)))
 
-        assert all(
-            isinstance(hyp, Hypnogram) for hyp in ref_hyps + obs_hyps
-        ), "`ref_hyps` and `obs_hyps` must only contain YASA hypnograms"
-        assert all(
-            h.scorer is not None for h in ref_hyps + obs_hyps
-        ), "all hypnograms must have a scorer name"
+        assert all(isinstance(hyp, Hypnogram) for hyp in ref_hyps + obs_hyps), (
+            "`ref_hyps` and `obs_hyps` must only contain YASA hypnograms"
+        )
+        assert all(h.scorer is not None for h in ref_hyps + obs_hyps), (
+            "all hypnograms in `ref_hyps` and `obs_hyps` must have a scorer name"
+        )
         for h1, h2 in zip((ref_hyps + obs_hyps)[:-1], (ref_hyps + obs_hyps)[1:]):
             assert h1.freq == h2.freq, "all hypnograms must have the same freq"
             assert h1.labels == h2.labels, "all hypnograms must have the same labels"
             assert h1.mapping == h2.mapping, "all hypnograms must have the same mapping"
             assert h1.n_stages == h2.n_stages, "all hypnograms must have the same n_stages"
-        assert all(
-            h1.scorer == h2.scorer for h1, h2 in zip(ref_hyps[:-1], ref_hyps[1:])
-        ), "all `ref_hyps` must have the same scorer"
-        assert all(
-            h1.scorer == h2.scorer for h1, h2 in zip(obs_hyps[:-1], obs_hyps[1:])
-        ), "all `obs_hyps` must have the same scorer"
-        assert all(
-            h1.scorer != h2.scorer for h1, h2 in zip(ref_hyps, obs_hyps)
-        ), "each `ref_hyps` and `obs_hyps` pair must have unique scorers"
-        assert all(
-            h1.n_epochs == h2.n_epochs for h1, h2 in zip(ref_hyps, obs_hyps)
-        ), "each `ref_hyps` and `obs_hyps` pair must have the same n_epochs"
-
+        assert all(h1.scorer == h2.scorer for h1, h2 in zip(ref_hyps[:-1], ref_hyps[1:])), (
+            "all `ref_hyps` must have the same scorer"
+        )
+        assert all(h1.scorer == h2.scorer for h1, h2 in zip(obs_hyps[:-1], obs_hyps[1:])), (
+            "all `obs_hyps` must have the same scorer"
+        )
+        assert all(h1.scorer != h2.scorer for h1, h2 in zip(ref_hyps, obs_hyps)), (
+            "each `ref_hyps` and `obs_hyps` pair must have unique scorers"
+        )
+        assert all(h1.n_epochs == h2.n_epochs for h1, h2 in zip(ref_hyps, obs_hyps)), (
+            "each `ref_hyps` and `obs_hyps` pair must have the same n_epochs"
+        )
         # Convert ref_hyps and obs_hyps to dictionaries with sleep_id keys and hypnogram values
         ref_hyps = {s: h for s, h in zip(sleep_ids, ref_hyps)}
         obs_hyps = {s: h for s, h in zip(sleep_ids, obs_hyps)}
@@ -274,7 +273,9 @@ def obs_scorer(self):
 
     @staticmethod
     def multi_scorer(df, scorers):
-        """Compute multiple agreement scores from a 2-column dataframe.
+        """
+        Compute multiple agreement scores from a 2-column dataframe (an optional 3rd column may
+        contain sample weights).
 
         This function offers convenience when calculating multiple agreement scores using
         :py:meth:`pandas.DataFrame.groupby.apply`. Scikit-learn doesn't include a function that
@@ -336,9 +337,9 @@ def get_agreement(self, sample_weight=None, scorers=None):
         agreement : :py:class:`pandas.DataFrame`
             A :py:class:`~pandas.DataFrame` with agreement metrics as columns and sessions as rows.
         """
-        assert (
-            isinstance(sample_weight, (type(None), pd.Series))
-        ), "`sample_weight` must be None or pandas Series"
+        assert isinstance(sample_weight, (type(None), pd.Series)), (
+            "`sample_weight` must be None or pandas Series"
+        )
         assert isinstance(scorers, (type(None), list, dict))
         if isinstance(scorers, list):
             assert all(isinstance(x, str) for x in scorers)
@@ -516,9 +517,9 @@ def get_confusion_matrix(self, sleep_id=None, agg_func=None, **kwargs):
         N3         0  13  58  11    0
         REM        2  23  40  18   17
         """
-        assert (
-            sleep_id is None or sleep_id in self._sleep_ids
-        ), "`sleep_id` must be None or a valid sleep ID"
+        assert sleep_id is None or sleep_id in self._sleep_ids, (
+            "`sleep_id` must be None or a valid sleep ID"
+        )
         assert isinstance(agg_func, (type(None), str)), "`agg_func` must be None or a str"
         assert not ((self.n_sleeps == 1 or sleep_id is not None) and agg_func is not None), (
             "`agg_func` must be None if plotting a single session."
@@ -982,7 +983,7 @@ def __init__(
         # Generate standard CIs for parametric Bias and LoA for all sleep stats
         ########################################################################
         # Get critical t and standard error used to calculate parametric CIs for parametric Bias/LoA
-        t_parm = stats.t.ppf((1 + confidence) / 2, n_sessions - 1)
+        t_parm = sps.t.ppf((1 + confidence) / 2, n_sessions - 1)
         sem = grouper["difference"].sem(ddof=1)
         # Parametric CIs for parametric Bias and LoA
         parm_ci = pd.DataFrame({
@@ -1006,7 +1007,7 @@ def __init__(
         predicted_values = data[ref_scorer].to_numpy() * slopes + intercepts
         data["residuals"] = data[obs_scorer].to_numpy() - predicted_values
         data["residuals_abs"] = data["residuals"].abs()
-        # Run regression used to (b) model LoA and (b) test for heteroscedasticity/homoscedasticity
+        # Run regression used to (a) model LoA and (b) test for heteroscedasticity/homoscedasticity
         loa_regr = grouper[[ref_scorer, "residuals_abs"]].apply(self._linregr_dict).apply(pd.Series)
         # Stack the two regression dataframes together
         regr = pd.concat({"bias": bias_regr, "loa": loa_regr}, axis=0)
@@ -1015,7 +1016,7 @@ def __init__(
         # Generate parametric CIs for regression/modeled Bias and LoA for all sleep stats
         ########################################################################
         # Get critical t used used to calculate parametric CIs for regression Bias/LoA
-        t_regr = stats.t.ppf((1 + confidence) / 2, n_sessions - 2)  # dof=n-2 for regression
+        t_regr = sps.t.ppf((1 + confidence) / 2, n_sessions - 2)  # dof=n-2 for regression
         # Parametric CIs for modeled Bias and LoA
         regr_ci = pd.DataFrame({
             "intercept-lower": regr["intercept"] - regr["intercept_stderr"] * t_regr,
@@ -1029,9 +1030,9 @@ def __init__(
         ########################################################################
         assumptions = pd.DataFrame({
             "unbiased": (
-                grouper["difference"].apply(lambda a: stats.ttest_1samp(a, 0).pvalue).ge(alpha)
+                grouper["difference"].apply(lambda a: sps.ttest_1samp(a, 0).pvalue).ge(alpha)
             ),
-            "normal": grouper["difference"].apply(lambda a: stats.shapiro(a).pvalue).ge(alpha),
+            "normal": grouper["difference"].apply(lambda a: sps.shapiro(a).pvalue).ge(alpha),
             "constant_bias": bias_regr["pvalue"].ge(alpha),
             "homoscedastic": loa_regr["pvalue"].ge(alpha),
         })
@@ -1108,8 +1109,8 @@ def assumptions(self):
 
     @property
     def auto_methods(self):
-        """A :py:class:`pandas.DataFrame` containing the methods applied when ``'auto'`` is
-        selected.
+        """
+        A :py:class:`pandas.DataFrame` containing the methods applied when ``'auto'`` is selected.
         """
         return pd.concat(
             [
@@ -1153,7 +1154,7 @@ def _linregr_dict(*args, **kwargs):
         not included when converting the named tuple, so this allows it to be included when using
         something like groupby.
         """
-        regr = stats.linregress(*args, **kwargs)
+        regr = sps.linregress(*args, **kwargs)
         return {
             "slope": regr.slope,
             "intercept": regr.intercept,
@@ -1195,9 +1196,9 @@ def get_vars(ref_arr, diff_arr, rabs_arr):
             """A function to get all variables at once and avoid redundant stats.bootstrap calls."""
             bias_parm = np.mean(diff_arr)
             lloa_parm, uloa_parm = self._arr_to_loa(diff_arr, self._agreement)
-            bias_slope, bias_inter = stats.linregress(ref_arr, diff_arr)[:2]
+            bias_slope, bias_inter = sps.linregress(ref_arr, diff_arr)[:2]
             # Note this is NOT recalculating residuals each time for the next regression
-            loa_slope, loa_inter = stats.linregress(ref_arr, rabs_arr)[:2]
+            loa_slope, loa_inter = sps.linregress(ref_arr, rabs_arr)[:2]
             return bias_parm, lloa_parm, uloa_parm, bias_inter, bias_slope, loa_inter, loa_slope
 
         # !! Column order MUST match the order of arrays boot_stats expects as INPUT
@@ -1219,7 +1220,7 @@ def get_vars(ref_arr, diff_arr, rabs_arr):
             .groupby("sleep_stat")  # Group so the bootstrapping is applied once to each sleep stat
             # Apply the bootstrap function, where tuple(df.to_numpy().T) convert the 3 columns
             # of the passed dataframe to a tuple of 3 1D arrays
-            .apply(lambda df: stats.bootstrap(tuple(df.to_numpy().T), get_vars, **bs_kwargs))
+            .apply(lambda df: sps.bootstrap(tuple(df.to_numpy().T), get_vars, **bs_kwargs))
             .map(lambda res: res.confidence_interval)  # Pull high/low CIs out of the results object
             .explode()  # Break high and low CIs into separate rows
             .to_frame("value")  # Convert to dataframe and name column
@@ -1469,8 +1470,8 @@ def get_calibration_func(self, sleep_stat):
         >>> calibrate_rem(new_obs_rem_vals, bias_test=False, method="regr")
         array([ -9.33878878,  -9.86815607, -10.39752335, -10.92689064])
         """
-        assert isinstance(sleep_stat, str)
-        assert sleep_stat in self.sleep_statistics
+        assert isinstance(sleep_stat, str), "`sleep_stat` must be a string"
+        assert sleep_stat in self.sleep_statistics, "`sleep_stat` must be a valid sleep statistic"
         columns = ["bias_parm", "bias_slope", "bias_intercept"]
         parm, slope, intercept = self._vals.loc[sleep_stat, columns]
         auto_method = self.auto_methods.at[sleep_stat, "bias"]

From f3ed7aad6ffc8b6e419c0cd8d99f097255f3202e Mon Sep 17 00:00:00 2001
From: remrama <mallett.remy@gmail.com>
Date: Mon, 4 Mar 2024 13:27:58 -0600
Subject: [PATCH 43/43] black formatting

---
 yasa/evaluation.py | 250 +++++++++++++++++++++++----------------------
 1 file changed, 126 insertions(+), 124 deletions(-)

diff --git a/yasa/evaluation.py b/yasa/evaluation.py
index fc4d9c0..007d4db 100644
--- a/yasa/evaluation.py
+++ b/yasa/evaluation.py
@@ -8,6 +8,7 @@
 - https://sri-human-sleep.github.io/sleep-trackers-performance
 - https://github.com/SRI-human-sleep/sleep-trackers-performance
 """
+
 import logging
 
 import numpy as np
@@ -169,55 +170,51 @@ def __init__(self, ref_hyps, obs_hyps):
         assert hasattr(ref_hyps, "__iter__"), "`ref_hyps` must be a an iterable"
         assert hasattr(obs_hyps, "__iter__"), "`obs_hyps` must be a an iterable"
         assert type(ref_hyps) is type(obs_hyps), "`ref_hyps` and `obs_hyps` must be the same type"
-        assert len(ref_hyps) == len(obs_hyps), (
-            "`ref_hyps` and `obs_hyps` must have the same number of hypnograms"
-        )
+        assert len(ref_hyps) == len(
+            obs_hyps
+        ), "`ref_hyps` and `obs_hyps` must have the same number of hypnograms"
 
         if isinstance(ref_hyps, dict):
             # If user provides dictionaries, split into sleep IDs and hypnograms
-            assert ref_hyps.keys() == obs_hyps.keys(), (
-                "keys in `ref_hyps` must be the same as keys in `obs_hyps`"
-            )
+            assert (
+                ref_hyps.keys() == obs_hyps.keys()
+            ), "keys in `ref_hyps` must be the same as keys in `obs_hyps`"
             sleep_ids, ref_hyps = zip(*ref_hyps.items())
             obs_hyps = tuple(obs_hyps.values())
         else:
             # Create hypnogram_ids
             sleep_ids = tuple(range(1, 1 + len(ref_hyps)))
 
-        assert all(isinstance(hyp, Hypnogram) for hyp in ref_hyps + obs_hyps), (
-            "`ref_hyps` and `obs_hyps` must only contain YASA hypnograms"
-        )
-        assert all(h.scorer is not None for h in ref_hyps + obs_hyps), (
-            "all hypnograms in `ref_hyps` and `obs_hyps` must have a scorer name"
-        )
+        assert all(
+            isinstance(hyp, Hypnogram) for hyp in ref_hyps + obs_hyps
+        ), "`ref_hyps` and `obs_hyps` must only contain YASA hypnograms"
+        assert all(
+            h.scorer is not None for h in ref_hyps + obs_hyps
+        ), "all hypnograms in `ref_hyps` and `obs_hyps` must have a scorer name"
         for h1, h2 in zip((ref_hyps + obs_hyps)[:-1], (ref_hyps + obs_hyps)[1:]):
             assert h1.freq == h2.freq, "all hypnograms must have the same freq"
             assert h1.labels == h2.labels, "all hypnograms must have the same labels"
             assert h1.mapping == h2.mapping, "all hypnograms must have the same mapping"
             assert h1.n_stages == h2.n_stages, "all hypnograms must have the same n_stages"
-        assert all(h1.scorer == h2.scorer for h1, h2 in zip(ref_hyps[:-1], ref_hyps[1:])), (
-            "all `ref_hyps` must have the same scorer"
-        )
-        assert all(h1.scorer == h2.scorer for h1, h2 in zip(obs_hyps[:-1], obs_hyps[1:])), (
-            "all `obs_hyps` must have the same scorer"
-        )
-        assert all(h1.scorer != h2.scorer for h1, h2 in zip(ref_hyps, obs_hyps)), (
-            "each `ref_hyps` and `obs_hyps` pair must have unique scorers"
-        )
-        assert all(h1.n_epochs == h2.n_epochs for h1, h2 in zip(ref_hyps, obs_hyps)), (
-            "each `ref_hyps` and `obs_hyps` pair must have the same n_epochs"
-        )
+        assert all(
+            h1.scorer == h2.scorer for h1, h2 in zip(ref_hyps[:-1], ref_hyps[1:])
+        ), "all `ref_hyps` must have the same scorer"
+        assert all(
+            h1.scorer == h2.scorer for h1, h2 in zip(obs_hyps[:-1], obs_hyps[1:])
+        ), "all `obs_hyps` must have the same scorer"
+        assert all(
+            h1.scorer != h2.scorer for h1, h2 in zip(ref_hyps, obs_hyps)
+        ), "each `ref_hyps` and `obs_hyps` pair must have unique scorers"
+        assert all(
+            h1.n_epochs == h2.n_epochs for h1, h2 in zip(ref_hyps, obs_hyps)
+        ), "each `ref_hyps` and `obs_hyps` pair must have the same n_epochs"
         # Convert ref_hyps and obs_hyps to dictionaries with sleep_id keys and hypnogram values
         ref_hyps = {s: h for s, h in zip(sleep_ids, ref_hyps)}
         obs_hyps = {s: h for s, h in zip(sleep_ids, obs_hyps)}
 
         # Merge all hypnograms into a single MultiIndexed dataframe
-        ref = pd.concat(
-            pd.concat({s: h.as_int()}, names=["sleep_id"]) for s, h in ref_hyps.items()
-        )
-        obs = pd.concat(
-            pd.concat({s: h.as_int()}, names=["sleep_id"]) for s, h in obs_hyps.items()
-        )
+        ref = pd.concat(pd.concat({s: h.as_int()}, names=["sleep_id"]) for s, h in ref_hyps.items())
+        obs = pd.concat(pd.concat({s: h.as_int()}, names=["sleep_id"]) for s, h in obs_hyps.items())
         data = pd.concat([ref, obs], axis=1)
 
         # Generate some mapping dictionaries to be used later in class methods
@@ -301,9 +298,9 @@ def multi_scorer(df, scorers):
         assert isinstance(df, pd.DataFrame), "`df` must be a pandas DataFrame"
         assert df.shape[1] in [2, 3], "`df` must have either 2 or 3 columns"
         assert isinstance(scorers, dict), "`scorers` must be a dictionary"
-        assert all(isinstance(k, str) and callable(v) for k, v in scorers.items()), (
-            "Each key of `scorers` must be a string, and each value must be a callable function"
-        )
+        assert all(
+            isinstance(k, str) and callable(v) for k, v in scorers.items()
+        ), "Each key of `scorers` must be a string, and each value must be a callable function"
         if df.shape[1] == 3:
             true, pred, weights = zip(*df.values)
         elif df.shape[1] == 2:
@@ -337,9 +334,9 @@ def get_agreement(self, sample_weight=None, scorers=None):
         agreement : :py:class:`pandas.DataFrame`
             A :py:class:`~pandas.DataFrame` with agreement metrics as columns and sessions as rows.
         """
-        assert isinstance(sample_weight, (type(None), pd.Series)), (
-            "`sample_weight` must be None or pandas Series"
-        )
+        assert isinstance(
+            sample_weight, (type(None), pd.Series)
+        ), "`sample_weight` must be None or pandas Series"
         assert isinstance(scorers, (type(None), list, dict))
         if isinstance(scorers, list):
             assert all(isinstance(x, str) for x in scorers)
@@ -406,10 +403,12 @@ def get_agreement_bystage(self, beta=1.0):
             A :py:class:`~pandas.DataFrame` with agreement metrics as columns and a
             :py:class:`~pandas.MultiIndex` with session and sleep stage as rows.
         """
+
         def scorer(df):
             return skm.precision_recall_fscore_support(
                 *df.values.T, beta=beta, labels=self._skm_labels, average=None, zero_division=0
             )
+
         agreement = (
             self.data
             # Get precision, recall, f1, and support for each individual sleep session
@@ -435,7 +434,7 @@ def scorer(df):
             .swaplevel()
             .sort_index(
                 level="stage",
-                key=lambda x: x.map(lambda y: list(self._yasa2yasa_map.values()).index(y))
+                key=lambda x: x.map(lambda y: list(self._yasa2yasa_map.values()).index(y)),
             )
         )
         # Set attribute for later access
@@ -517,13 +516,13 @@ def get_confusion_matrix(self, sleep_id=None, agg_func=None, **kwargs):
         N3         0  13  58  11    0
         REM        2  23  40  18   17
         """
-        assert sleep_id is None or sleep_id in self._sleep_ids, (
-            "`sleep_id` must be None or a valid sleep ID"
-        )
+        assert (
+            sleep_id is None or sleep_id in self._sleep_ids
+        ), "`sleep_id` must be None or a valid sleep ID"
         assert isinstance(agg_func, (type(None), str)), "`agg_func` must be None or a str"
-        assert not ((self.n_sleeps == 1 or sleep_id is not None) and agg_func is not None), (
-            "`agg_func` must be None if plotting a single session."
-        )
+        assert not (
+            (self.n_sleeps == 1 or sleep_id is not None) and agg_func is not None
+        ), "`agg_func` must be None if plotting a single session."
         kwargs = {"labels": self._skm_labels} | kwargs
         # Generate a DataFrame with a confusion matrix for each session
         #   Seems easier to just generate this whole thing and then either
@@ -639,9 +638,9 @@ def plot_hypnograms(self, sleep_id=None, legend=True, ax=None, ref_kwargs={}, ob
         assert isinstance(legend, (bool, dict)), "`legend` must be True, False, or a dictionary"
         assert isinstance(ref_kwargs, dict), "`ref_kwargs` must be a dictionary"
         assert isinstance(obs_kwargs, dict), "`obs_kwargs` must be a dictionary"
-        assert "ax" not in ref_kwargs | obs_kwargs, (
-            "'ax' can't be supplied to `ref_kwargs` or `obs_kwargs`, use the `ax` keyword instead"
-        )
+        assert (
+            "ax" not in ref_kwargs | obs_kwargs
+        ), "'ax' can't be supplied to `ref_kwargs` or `obs_kwargs`, use the `ax` keyword instead"
         assert not (sleep_id is None and self.n_sleeps > 1), (
             "Multi-session plotting is not currently supported. `sleep_id` must not be None when "
             "multiple sessions are present"
@@ -713,18 +712,16 @@ def summary(self, by_stage=False, **kwargs):
 
             >>> ebe.summary(func=["count", "mean", "sem"])
         """
-        assert self.n_sleeps > 1, (
-            "Summary scores can not be computed with only one hypnogram pair."
-        )
+        assert self.n_sleeps > 1, "Summary scores can not be computed with only one hypnogram pair."
         assert isinstance(by_stage, bool), "`by_stage` must be True or False"
         if by_stage:
-            assert hasattr(self, "_agreement_bystage"), (
-                "Must run `self.get_agreement_bystage` before obtaining by_stage summary results."
-            )
+            assert hasattr(
+                self, "_agreement_bystage"
+            ), "Must run `self.get_agreement_bystage` before obtaining by_stage summary results."
         else:
-            assert hasattr(self, "_agreement"), (
-                "Must run `self.get_agreement` before obtaining summary results."
-            )
+            assert hasattr(
+                self, "_agreement"
+            ), "Must run `self.get_agreement` before obtaining summary results."
 
         # Create a function for getting mean absolute deviation
         def mad(df):
@@ -734,8 +731,7 @@ def mad(df):
         agg_kwargs = {"func": [mad, "mean", "std", "min", "median", "max"]} | kwargs
         if by_stage:
             summary = (
-                self
-                .agreement_bystage.groupby("stage")
+                self.agreement_bystage.groupby("stage")
                 .agg(**agg_kwargs)
                 .stack(level=0)
                 .rename_axis(["stage", "metric"])
@@ -917,31 +913,31 @@ def __init__(
 
         assert isinstance(ref_data, pd.DataFrame), "`ref_data` must be a pandas DataFrame"
         assert isinstance(obs_data, pd.DataFrame), "`obs_data` must be a pandas DataFrame"
-        assert np.array_equal(ref_data.index, obs_data.index), (
-            "`ref_data` and `obs_data` index values must be identical"
-        )
-        assert ref_data.index.name == obs_data.index.name, (
-            "`ref_data` and `obs_data` index names must be identical"
-        )
-        assert np.array_equal(ref_data.columns, obs_data.columns), (
-            "`ref_data` and `obs_data` column values must be identical"
-        )
+        assert np.array_equal(
+            ref_data.index, obs_data.index
+        ), "`ref_data` and `obs_data` index values must be identical"
+        assert (
+            ref_data.index.name == obs_data.index.name
+        ), "`ref_data` and `obs_data` index names must be identical"
+        assert np.array_equal(
+            ref_data.columns, obs_data.columns
+        ), "`ref_data` and `obs_data` column values must be identical"
         assert isinstance(ref_scorer, str), "`ref_scorer` must be a string"
         assert isinstance(obs_scorer, str), "`obs_scorer` must be a string"
         assert ref_scorer != obs_scorer, "`ref_scorer` and `obs_scorer` must be unique"
-        assert isinstance(agreement, (float, int)) and agreement > 0, (
-            "`agreement` must be a number greater than 0"
-        )
-        assert isinstance(confidence, (float, int)) and 0 < alpha < 1, (
-            "`confidence` must be a number between 0 and 1"
-        )
-        assert isinstance(alpha, (float, int)) and 0 <= alpha <= 1, (
-            "`alpha` must be a number between 0 and 1 inclusive"
-        )
+        assert (
+            isinstance(agreement, (float, int)) and agreement > 0
+        ), "`agreement` must be a number greater than 0"
+        assert (
+            isinstance(confidence, (float, int)) and 0 < confidence < 1
+        ), "`confidence` must be a number between 0 and 1"
+        assert (
+            isinstance(alpha, (float, int)) and 0 <= alpha <= 1
+        ), "`alpha` must be a number between 0 and 1 inclusive"
         assert isinstance(bootstrap_kwargs, dict), "`bootstrap_kwargs` must be a dictionary"
-        assert all(k not in restricted_bootstrap_kwargs for k in bootstrap_kwargs), (
-            f"None of {restricted_bootstrap_kwargs} can be set by the user"
-        )
+        assert all(
+            k not in restricted_bootstrap_kwargs for k in bootstrap_kwargs
+        ), f"None of {restricted_bootstrap_kwargs} can be set by the user"
 
         # If `ref_data` and `obs_data` indices are unnamed, name them
         session_key = "session_id" if ref_data.index.name is None else ref_data.index.name
@@ -986,14 +982,16 @@ def __init__(
         t_parm = sps.t.ppf((1 + confidence) / 2, n_sessions - 1)
         sem = grouper["difference"].sem(ddof=1)
         # Parametric CIs for parametric Bias and LoA
-        parm_ci = pd.DataFrame({
-            "bias_parm-lower": parm_vals["bias_parm"] - sem * t_parm,
-            "bias_parm-upper": parm_vals["bias_parm"] + sem * t_parm,
-            "lloa_parm-lower": parm_vals["lloa_parm"] - sem * t_parm * np.sqrt(3),
-            "lloa_parm-upper": parm_vals["lloa_parm"] + sem * t_parm * np.sqrt(3),
-            "uloa_parm-lower": parm_vals["uloa_parm"] - sem * t_parm * np.sqrt(3),
-            "uloa_parm-upper": parm_vals["uloa_parm"] + sem * t_parm * np.sqrt(3),
-        })
+        parm_ci = pd.DataFrame(
+            {
+                "bias_parm-lower": parm_vals["bias_parm"] - sem * t_parm,
+                "bias_parm-upper": parm_vals["bias_parm"] + sem * t_parm,
+                "lloa_parm-lower": parm_vals["lloa_parm"] - sem * t_parm * np.sqrt(3),
+                "lloa_parm-upper": parm_vals["lloa_parm"] + sem * t_parm * np.sqrt(3),
+                "uloa_parm-lower": parm_vals["uloa_parm"] - sem * t_parm * np.sqrt(3),
+                "uloa_parm-upper": parm_vals["uloa_parm"] + sem * t_parm * np.sqrt(3),
+            }
+        )
 
         ########################################################################
         # Generate regression/modeled (slope and intercept) Bias and LoA for all sleep stats
@@ -1018,24 +1016,28 @@ def __init__(
         # Get critical t used used to calculate parametric CIs for regression Bias/LoA
         t_regr = sps.t.ppf((1 + confidence) / 2, n_sessions - 2)  # dof=n-2 for regression
         # Parametric CIs for modeled Bias and LoA
-        regr_ci = pd.DataFrame({
-            "intercept-lower": regr["intercept"] - regr["intercept_stderr"] * t_regr,
-            "intercept-upper": regr["intercept"] + regr["intercept_stderr"] * t_regr,
-            "slope-lower": regr["slope"] - regr["stderr"] * t_regr,
-            "slope-upper": regr["slope"] + regr["stderr"] * t_regr,
-        })
+        regr_ci = pd.DataFrame(
+            {
+                "intercept-lower": regr["intercept"] - regr["intercept_stderr"] * t_regr,
+                "intercept-upper": regr["intercept"] + regr["intercept_stderr"] * t_regr,
+                "slope-lower": regr["slope"] - regr["stderr"] * t_regr,
+                "slope-upper": regr["slope"] + regr["stderr"] * t_regr,
+            }
+        )
 
         ########################################################################
         # Test all statistical assumptions
         ########################################################################
-        assumptions = pd.DataFrame({
-            "unbiased": (
-                grouper["difference"].apply(lambda a: sps.ttest_1samp(a, 0).pvalue).ge(alpha)
-            ),
-            "normal": grouper["difference"].apply(lambda a: sps.shapiro(a).pvalue).ge(alpha),
-            "constant_bias": bias_regr["pvalue"].ge(alpha),
-            "homoscedastic": loa_regr["pvalue"].ge(alpha),
-        })
+        assumptions = pd.DataFrame(
+            {
+                "unbiased": (
+                    grouper["difference"].apply(lambda a: sps.ttest_1samp(a, 0).pvalue).ge(alpha)
+                ),
+                "normal": grouper["difference"].apply(lambda a: sps.shapiro(a).pvalue).ge(alpha),
+                "constant_bias": bias_regr["pvalue"].ge(alpha),
+                "homoscedastic": loa_regr["pvalue"].ge(alpha),
+            }
+        )
 
         ########################################################################
         # Setting attributes
@@ -1051,7 +1053,8 @@ def __init__(
         regr_ci.columns = regr_ci.columns.swaplevel().map("_".join)
         ci = parm_ci.join(regr_ci)
         ci.columns = pd.MultiIndex.from_tuples(
-            tuples=ci.columns.str.split("-", expand=True), names=["variable", "interval"],
+            tuples=ci.columns.str.split("-", expand=True),
+            names=["variable", "interval"],
         )
         empty_df = pd.DataFrame().reindex_like(ci)
         ci = pd.concat({"parm": ci, "boot": empty_df}, names=["ci_method"], axis=1)
@@ -1177,12 +1180,12 @@ def _generate_bootstrap_ci(self, sleep_stats):
         """
         assert isinstance(sleep_stats, list), "`sleep_stats` must be a list"
         assert len(sleep_stats) == len(set(sleep_stats)), "elements of `sleep_stats` must be unique"
-        assert all(isinstance(ss, str) for ss in sleep_stats), (
-            "all elements of `sleep_stats` must be strings"
-        )
-        assert all(ss in self.sleep_statistics for ss in sleep_stats), (
-            f"all elements of `sleep_stats` must be one of {self.sleep_statistics}"
-        )
+        assert all(
+            isinstance(ss, str) for ss in sleep_stats
+        ), "all elements of `sleep_stats` must be strings"
+        assert all(
+            ss in self.sleep_statistics for ss in sleep_stats
+        ), f"all elements of `sleep_stats` must be one of {self.sleep_statistics}"
         # Update bootstrap keyword arguments with defaults
         bs_kwargs = {
             "n_resamples": 1000,
@@ -1215,8 +1218,9 @@ def get_vars(ref_arr, diff_arr, rabs_arr):
             "loa_slope",
         ]
         boot_ci = (
-            self._data
-            .loc[sleep_stats, column_order]  # Extract the relevant sleep stats and columns
+            self._data.loc[
+                sleep_stats, column_order
+            ]  # Extract the relevant sleep stats and columns
             .groupby("sleep_stat")  # Group so the bootstrapping is applied once to each sleep stat
             # Apply the bootstrap function, where tuple(df.to_numpy().T) convert the 3 columns
             # of the passed dataframe to a tuple of 3 1D arrays
@@ -1273,13 +1277,13 @@ def get_table(self, bias_method="auto", loa_method="auto", ci_method="auto", fst
             and their confidence intervals for all sleep statistics.
         """
         assert isinstance(bias_method, str), "`bias_method` must be a string"
-        assert bias_method in self._bias_method_opts, (
-            f"`bias_method` must be one of {self._bias_method_opts}"
-        )
+        assert (
+            bias_method in self._bias_method_opts
+        ), f"`bias_method` must be one of {self._bias_method_opts}"
         assert isinstance(loa_method, str), "`loa_method` must be a string"
-        assert loa_method in self._loa_method_opts, (
-            f"`loa_method` must be one of {self._loa_method_opts}"
-        )
+        assert (
+            loa_method in self._loa_method_opts
+        ), f"`loa_method` must be one of {self._loa_method_opts}"
         assert isinstance(fstrings, dict), "`fstrings` must be a dictionary"
         # Agreement gets adjusted when LoA is modeled
         loa_regr_agreement = self._agreement * np.sqrt(np.pi / 2)
@@ -1292,9 +1296,7 @@ def get_table(self, bias_method="auto", loa_method="auto", ci_method="auto", fst
                     "Bias \u00B1 {loa_regr_agreement:.2f} "
                     "* ({loa_intercept_center:.2f} + {loa_slope_center:.2f}x)"
                 ),
-                "bias_parm_ci": (
-                    "[{bias_parm_lower:.2f}, {bias_parm_upper:.2f}]"
-                ),
+                "bias_parm_ci": ("[{bias_parm_lower:.2f}, {bias_parm_upper:.2f}]"),
                 "bias_regr_ci": (
                     "[{bias_intercept_lower:.2f}, {bias_intercept_upper:.2f}], "
                     "[{bias_slope_lower:.2f}, {bias_slope_upper:.2f}]"
@@ -1426,13 +1428,13 @@ def calibrate(self, data, bias_method="auto", adjust_all=False):
         .. seealso:: :py:meth:`~yasa.SleepStatsAgreement.calibrate`
         """
         assert isinstance(data, pd.DataFrame), "`data` must be a pandas DataFrame"
-        assert all(col in self.sleep_statistics for col in data), (
-            f"all columns of `data` must be valid sleep statistics: {self.sleep_statistics}"
-        )
+        assert all(
+            col in self.sleep_statistics for col in data
+        ), f"all columns of `data` must be valid sleep statistics: {self.sleep_statistics}"
         assert isinstance(bias_method, str), "`bias_method` must be a string"
-        assert bias_method in self._bias_method_opts, (
-            f"`bias_method` must be one of {self._bias_method_opts}"
-        )
+        assert (
+            bias_method in self._bias_method_opts
+        ), f"`bias_method` must be one of {self._bias_method_opts}"
         assert isinstance(adjust_all, bool), "`adjust_all` must be True or False"
         parm_adjusted = data + self._vals["bias_parm"]
         regr_adjusted = data * self._vals["bias_slope"] + self._vals["bias_intercept"]