Adds uncertainty plots which allow uncertainty on the prevalence of groups to be assessed (#8)

Includes all code, tests, docs and examples for the uncertainty plot
raeslab · May 29, 2024 · 5142ad3 · 5142ad3
1 parent d8bd8a2
commit 5142ad3
Show file tree

Hide file tree

Showing 14 changed files with 306 additions and 21 deletions.
diff --git a/README.md b/README.md
@@ -153,6 +153,30 @@ plt.show()
 
 ![Loreplot with a confounder](https://raw.githubusercontent.com/raeslab/lorepy/main/docs/img/loreplot_confounder.png)
 
+### Assess uncertainty
+
+A loreplot on its own does not show how certain we are of the prevalence of each group across the range. To
+assess this, use the ```uncertainty_plot``` function as shown below. It uses
+```resampling``` (or ```jackknifing```) to determine the 50% and 95% intervals of the predicted values and shows these in a
+multi-panel plot with one panel per category.
+
+```python
+from lorepy import uncertainty_plot
+
+uncertainty_plot(
+    data=iris_df,
+    x="sepal width (cm)",
+    y="species",
+)
+plt.savefig("./docs/img/uncertainty_default.png", dpi=150)
+plt.show()
+```
+
+![Default uncertainty plot](https://raw.githubusercontent.com/raeslab/lorepy/main/docs/img/uncertainty_default.png)
+
+This also supports custom colors, ranges and classifiers. More examples are available in ```example_uncertainty.py```.
+
+
 ## Development
 
 Additional [documentation for developers](./docs/dev_docs.md) is included with details on running tests, building and deploying to PyPi.

diff --git a/docs/dev_docs.md b/docs/dev_docs.md
@@ -3,19 +3,25 @@
 ## Setting up the environment
 
 To recreate the environment used by the devs, you can get a [requirements.txt](./dev/requirements.txt) file that has the
-same versions we have been using pinned.
+same versions we have been using pinned. To install these after creating a virtual environment, run the command below
+(from the root of the project):
+
+```bash
+pip install -r ./docs/dev/requirements.txt
+```
+
 
 ## Running tests
 
 Lorepy is fully covered with unit tests. To run them you need the pytest package installed (```pip install pytest pytest-cov```).
 Next, run the command below to run the test suite. Note: if you use the environment listed above, these packages are already installed.
 
-```python
+```bash
 pytest
 ```
 To enable coverage stats run the command below.
 
-```python
+```bash
 pytest --exitfirst --verbose --failed-first --cov=src
 ```
 

diff --git a/docs/img/uncertainty_confounder.png b/docs/img/uncertainty_confounder.png
diff --git a/docs/img/uncertainty_custom_classifier.png b/docs/img/uncertainty_custom_classifier.png
diff --git a/docs/img/uncertainty_custom_color.png b/docs/img/uncertainty_custom_color.png
diff --git a/docs/img/uncertainty_default.png b/docs/img/uncertainty_default.png
diff --git a/docs/img/uncertainty_jackknife.png b/docs/img/uncertainty_jackknife.png
diff --git a/example_uncertainty.py b/example_uncertainty.py
@@ -0,0 +1,70 @@
+from lorepy import uncertainty_plot
+
+from sklearn.datasets import load_iris
+import matplotlib.pyplot as plt
+import pandas as pd
+
+# Load iris dataset and convert to dataframe
+iris_obj = load_iris()
+iris_df = pd.DataFrame(iris_obj.data, columns=iris_obj.feature_names)
+
+# Replace the numeric target with the species name so it can be used as the categorical y
+iris_df["species"] = [iris_obj.target_names[s] for s in iris_obj.target]
+
+# Default uncertainty plot
+uncertainty_plot(data=iris_df, x="sepal width (cm)", y="species", iterations=100)
+plt.savefig("./docs/img/uncertainty_default.png", dpi=150)
+plt.show()
+
+# Using jackknife instead of resample to assess uncertainty
+# NOTE: mode has to be set explicitly, jackknife_fraction alone does not switch
+# the sampling method (the default mode is "resample")
+uncertainty_plot(
+    data=iris_df,
+    x="sepal width (cm)",
+    y="species",
+    iterations=100,
+    mode="jackknife",
+    jackknife_fraction=0.8,
+)
+plt.savefig("./docs/img/uncertainty_jackknife.png", dpi=150)
+plt.show()
+
+# Uncertainty plot with custom colors
+from matplotlib.colors import ListedColormap
+
+colormap = ListedColormap(["red", "green", "blue"])
+uncertainty_plot(
+    data=iris_df,
+    x="sepal width (cm)",
+    y="species",
+    iterations=100,
+    mode="resample",
+    colormap=colormap,
+)
+plt.savefig("./docs/img/uncertainty_custom_color.png", dpi=150)
+plt.show()
+
+# Uncertainty plot with a confounder
+uncertainty_plot(
+    data=iris_df,
+    x="sepal width (cm)",
+    y="species",
+    iterations=100,
+    mode="resample",
+    confounders=[("petal width (cm)", 1)],
+)
+plt.savefig("./docs/img/uncertainty_confounder.png", dpi=150)
+plt.show()
+
+# Uncertainty plot with a custom classifier
+from sklearn.svm import SVC
+
+# probability=True is required so the classifier offers predict_proba()
+svc = SVC(probability=True)
+
+uncertainty_plot(
+    data=iris_df,
+    x="sepal width (cm)",
+    y="species",
+    iterations=100,
+    mode="resample",
+    clf=svc,
+)
+plt.savefig("./docs/img/uncertainty_custom_classifier.png", dpi=150)
+plt.show()
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name="lorepy",
-    version="0.3.0",
+    version="0.4.0",
     author="Sebastian Proost",
     author_email="sebastian.proost@gmail.com",
     description="Draw Logistic Regression Plots in Python",

diff --git a/src/lorepy/__init__.py b/src/lorepy/__init__.py
@@ -1 +1,2 @@
 from .lorepy import loreplot
+from .uncertainty import uncertainty_plot
diff --git a/src/lorepy/lorepy.py b/src/lorepy/lorepy.py
@@ -6,12 +6,24 @@
 from typing import Optional, Tuple
 
 
-def _get_area_df(lg, x_feature, x_range, deconfound=[]) -> DataFrame:
+def _prepare_data(data, x, y, confounders):
+    """
+    Extract the arrays needed to fit a classifier, dropping rows with missing values
+
+    :param data: Pandas dataframe with data
+    :param x: Name of the numerical feature, always the first column of the returned feature array
+    :param y: Name of the categorical feature
+    :param confounders: List of tuples, the first element of each is a feature name added after x
+    :return: Tuple with the feature array, the label array and the (min, max) of the x column
+    """
+    feature_columns = [x, *(entry[0] for entry in confounders)]
+
+    # Only rows complete for every used column are kept
+    complete_rows = data[feature_columns + [y]].dropna()
+    X_reg = np.array(complete_rows[feature_columns])
+    y_reg = np.array(complete_rows[y])
+
+    x_values = X_reg[:, 0]
+    return X_reg, y_reg, (x_values.min(), x_values.max())
+
+
+def _get_area_df(lg, x_feature, x_range, confounders=[]) -> DataFrame:
     values = np.linspace(x_range[0], x_range[1], num=200)
 
     predict_df = pd.DataFrame({"values": values})
 
-    for k, v in deconfound:
+    for k, v in confounders:
         predict_df[k] = v
 
     proba = lg.predict_proba(predict_df.values)
@@ -25,14 +37,14 @@ def _get_area_df(lg, x_feature, x_range, deconfound=[]) -> DataFrame:
 def _get_dots_df(X, y, lg, y_feature, confounders=[]) -> DataFrame:
     output = []
 
-    for v, s in zip(X, y):
-        proba = lg.predict_proba([v] + [i[1] for i in confounders])
+    for x, s in zip(X, y):
+        proba = lg.predict_proba([x] + [i[1] for i in confounders])
         i = list(lg.classes_).index(s)
         min_value = sum(proba[0][:i])
         max_value = sum(proba[0][: i + 1])
         margin = (max_value - min_value) / 10
         ypos = np.random.uniform(low=min_value + margin, high=max_value - margin)
-        output.append({y_feature: s, "x": v[0], "y": ypos})
+        output.append({y_feature: s, "x": x[0], "y": ypos})
 
     return DataFrame(output)
 
@@ -47,16 +59,16 @@ def loreplot(
     ax=None,
     clf=None,
     confounders=[],
-    **kwargs
+    **kwargs,
 ):
     """
-    Code to create a loreplot with a numerical feature on the x-axis and categorical y from a pandas dataset
+    Code to create a loreplot with a numerical feature on the x-axis and categorical y from a pandas dataset
 
     :param data: Pandas dataframe with data
     :param x: Needs to be a numerical feature
     :param y: Categorical feature
     :param add_dots: Shows where true samples are in the plot (cannot be enabled when deconfounding for additional variables)
-    :param x_range: Either None (range will be selected automatically) or a tuple with min and max value for the x-axis
+    :param x_range: Either None (range will be selected automatically) or a tuple with min and max value for the x-axis
     :param scatter_kws: Dictionary with keyword arguments to pass to the scatter function
     :param ax: subplot to draw on, in case lorepy is used in a subplot
     :param clf: provide a different scikit-learn classifier for the function. Should implement the predict_proba() and fit()
@@ -66,22 +78,18 @@ def loreplot(
     if ax is None:
         ax = plt.gca()
 
-    x_features = [x] + [i[0] for i in confounders]
-
-    tmp_df = data[x_features + [y]].dropna()
-    X_reg = np.array(tmp_df[x_features])
-    y_reg = np.array(tmp_df[y])
+    X_reg, y_reg, r = _prepare_data(data, x, y, confounders)
 
     if x_range is None:
-        x_range = (X_reg[:, 0].min(), X_reg[:, 0].max())
+        x_range = r
 
     lg = LogisticRegression(multi_class="multinomial") if clf is None else clf
     lg.fit(X_reg, y_reg)
 
     if "linestyle" not in kwargs.keys():
         kwargs["linestyle"] = "None"
 
-    area_df = _get_area_df(lg, x, x_range, deconfound=confounders)
+    area_df = _get_area_df(lg, x, x_range, confounders=confounders)
     area_df.plot.area(ax=ax, **kwargs)
 
     if add_dots and len(confounders) == 0:

diff --git a/src/lorepy/uncertainty.py b/src/lorepy/uncertainty.py
@@ -0,0 +1,130 @@
+import numpy as np
+import pandas as pd
+from matplotlib import pyplot as plt
+from pandas import DataFrame
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+from sklearn.utils import resample
+
+from lorepy.lorepy import _get_area_df, _prepare_data
+
+
+def _get_uncertainty_data(
+    x: str,
+    X_reg,
+    y_reg,
+    x_range,
+    mode="resample",
+    jackknife_fraction: float = 0.8,
+    iterations: int = 100,
+    confounders=None,
+    clf=None,
+):
+    """
+    Refit the classifier on repeated sub-samples of the data and summarise, per x value and category, the spread
+    of the predicted values
+
+    :param x: Name of the x feature, this is the column holding the x values in the output
+    :param X_reg: Array with features to sample rows from
+    :param y_reg: Array with the labels matching X_reg
+    :param x_range: Tuple with min and max value of the range to predict over
+    :param mode: Sampling method, either "resample" (sampling with replacement) or "jackknife" (default = "resample")
+    :param jackknife_fraction: Fraction of data to retain for each jackknife sample (default = 0.8)
+    :param iterations: Number of times the data is sampled and the classifier fitted, at least 1 (default = 100)
+    :param confounders: List of tuples with the feature and reference value, None is treated as no confounders
+    :param clf: Classifier implementing fit() and predict_proba(), note it is refitted in place every iteration. If
+        None a LogisticRegression will be used.
+    :return: Dataframe with one row per x value and category ("variable") and the columns min, mean, max, low_95,
+        high_95, low_50 and high_50
+    :raises NotImplementedError: If mode is not "jackknife" or "resample"
+    :raises ValueError: If iterations is smaller than 1
+    """
+    # Validate up front so a wrong mode is reported regardless of the number of iterations
+    if mode not in ("jackknife", "resample"):
+        raise NotImplementedError(
+            f"Mode {mode} is unsupported, only jackknife and resample are valid modes"
+        )
+    if iterations < 1:
+        raise ValueError("iterations must be at least 1")
+
+    confounders = [] if confounders is None else confounders
+
+    areas = []
+    for _ in range(iterations):
+        if mode == "jackknife":
+            X_keep, _, y_keep, _ = train_test_split(
+                X_reg, y_reg, train_size=jackknife_fraction
+            )
+        else:
+            X_keep, y_keep = resample(X_reg, y_reg, replace=True)
+
+        lg = LogisticRegression(multi_class="multinomial") if clf is None else clf
+        lg.fit(X_keep, y_keep)
+        new_area = _get_area_df(lg, x, x_range, confounders=confounders).reset_index()
+
+        areas.append(new_area)
+
+    # Long format: one row per iteration, x value and category
+    long_df = pd.concat(areas).melt(id_vars=[x]).sort_values(x)
+
+    # 2.5-97.5 and 25-75 percentiles bound the central 95% and 50% of the fitted values
+    output = (
+        long_df.groupby([x, "variable"])
+        .agg(
+            min=pd.NamedAgg(column="value", aggfunc="min"),
+            mean=pd.NamedAgg(column="value", aggfunc="mean"),
+            max=pd.NamedAgg(column="value", aggfunc="max"),
+            low_95=pd.NamedAgg(column="value", aggfunc=lambda v: np.percentile(v, 2.5)),
+            high_95=pd.NamedAgg(
+                column="value", aggfunc=lambda v: np.percentile(v, 97.5)
+            ),
+            low_50=pd.NamedAgg(column="value", aggfunc=lambda v: np.percentile(v, 25)),
+            high_50=pd.NamedAgg(column="value", aggfunc=lambda v: np.percentile(v, 75)),
+        )
+        .reset_index()
+    )
+
+    return output
+
+
+def uncertainty_plot(
+    data: DataFrame,
+    x: str,
+    y: str,
+    x_range=None,
+    mode="resample",
+    jackknife_fraction=0.8,
+    iterations=100,
+    confounders=None,
+    colormap=None,
+    clf=None,
+):
+    """
+    Code to create a multi-panel plot, one panel for each category, with the prevalence of that category across the
+    range of x-values, along with the uncertainty (intervals containing 50% and 95% of the samples are shown)
+
+    :param data: Pandas dataframe with data
+    :param x: Needs to be a numerical feature
+    :param y: Categorical feature
+    :param x_range: Either None (range will be selected automatically) or a tuple with min and max value for the x-axis
+    :param mode: Sampling method, either "resample" (bootstrap) or "jackknife" (default = "resample")
+    :param jackknife_fraction: Fraction of data to retain for each jackknife sample (default = 0.8)
+    :param iterations: Number of iterations for resampling or jackknife (default = 100)
+    :param confounders: List of tuples with the feature and reference value e.g., [("BMI", 25)] will use a reference of 25 for plots. None (default) means no confounders.
+    :param colormap: Colormap to use for the plot, it is called with the index of the panel to get the color. Default is None in which case matplotlib's "tab10" will be used
+    :param clf: Provide a different scikit-learn classifier for the function. Should implement the predict_proba() and fit(). If None a LogisticRegression will be used.
+    :return: A tuple containing the figure and axes objects
+    """
+    confounders = [] if confounders is None else confounders
+
+    X_reg, y_reg, r = _prepare_data(data, x, y, confounders)
+
+    if x_range is None:
+        x_range = r
+
+    plot_df = _get_uncertainty_data(
+        x,
+        X_reg,
+        y_reg,
+        x_range,
+        mode=mode,
+        jackknife_fraction=jackknife_fraction,
+        iterations=iterations,
+        confounders=confounders,
+        clf=clf,
+    )
+
+    categories = plot_df.variable.unique()
+
+    fig, axs = plt.subplots(ncols=len(categories), sharex=True, sharey=True)
+
+    # With a single panel plt.subplots returns a bare Axes instead of an array,
+    # wrap it so panels can always be indexed (axs itself is returned untouched)
+    panels = np.atleast_1d(axs)
+
+    cmap = plt.get_cmap("tab10") if colormap is None else colormap
+
+    for idx, category in enumerate(categories):
+        cat_df = plot_df[plot_df.variable == category]
+        panel = panels[idx]
+        color = cmap(idx)
+
+        # Wide, faint band for the 95% interval with the narrower 50% interval on top
+        panel.fill_between(
+            cat_df[x], cat_df["low_95"], cat_df["high_95"], alpha=0.1, color=color
+        )
+        panel.fill_between(
+            cat_df[x], cat_df["low_50"], cat_df["high_50"], alpha=0.2, color=color
+        )
+        panel.plot(cat_df[x], cat_df["mean"], color=color)
+        panel.set_title(category)
+        panel.set_xlabel(x)
+
+        panel.set_xlim(*x_range)
+        panel.set_ylim(0, 1)
+
+    return fig, axs
diff --git a/tests/test_plot.py b/tests/test_plot.py
@@ -17,7 +17,7 @@
 
 # Test case for loreplot with default parameters
 def test_loreplot_default():
-    loreplot(df, "x", "y")  ## first test without specifying the axis
+    loreplot(df, "x", "y")  # first test without specifying the axis
 
     fig, ax = plt.subplots()
     loreplot(df, "x", "y", ax=ax)
@@ -30,7 +30,7 @@ def test_loreplot_default():
 def test_loreplot_default():
     loreplot(
         df, "x", "y", confounders=[("z", 1)]
-    )  ## first test without specifying the axis
+    )  # first test without specifying the axis
 
     fig, ax = plt.subplots()
     loreplot(df, "x", "y", ax=ax)
Original file line number	Diff line number	Diff line change
		@@ -1 +1,2 @@
		from .lorepy import loreplot
		from .uncertainty import uncertainty_plot