Skip to content

Commit

Permalink
Adds uncertainty plots which allow uncertainty on the prevalence of g…
Browse files Browse the repository at this point in the history
…roups to be assessed (#8)

Includes all code, tests, docs and examples for the uncertainty plot
  • Loading branch information
sepro authored May 29, 2024
1 parent d8bd8a2 commit 5142ad3
Show file tree
Hide file tree
Showing 14 changed files with 306 additions and 21 deletions.
24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,30 @@ plt.show()

![Loreplot with a confounder](https://raw.githubusercontent.com/raeslab/lorepy/main/docs/img/loreplot_confounder.png)

### Assess uncertainty

From loreplots it isn't possible to assess how certain we are of the prevalence of each group across the range. To
provide a view into this there is a function ```uncertainty_plot```, which can be used as shown below. This will use
```resampling``` (or ```jackknifing```) to determine the 50% and 95% intervals of predicted values and show these in a
multi-panel plot with one plot per category.

```python
from lorepy import uncertainty_plot

uncertainty_plot(
data=iris_df,
x="sepal width (cm)",
y="species",
)
plt.savefig("./docs/img/uncertainty_default.png", dpi=150)
plt.show()
```

![Default uncertainty plot](https://raw.githubusercontent.com/raeslab/lorepy/main/docs/img/uncertainty_default.png)

This also supports custom colors, ranges and classifiers. More examples are available in ```example_uncertainty.py```.


## Development

Additional [documentation for developers](./docs/dev_docs.md) is included with details on running tests, building and deploying to PyPi.
Expand Down
12 changes: 9 additions & 3 deletions docs/dev_docs.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,25 @@
## Setting up the environment

To recreate the environment used by the devs, you can get a [requirements.txt](./dev/requirements.txt) file that has the
same versions we have been using pinned.
same versions we have been using pinned. To install these after creating a virtual environment use the command below
(from the root of the project)

```bash
pip install -r ./docs/dev/requirements.txt
```


## Running tests

Lorepy is fully covered with unit tests. To run them you need the pytest package installed (```pip install pytest pytest-cov```).
Next, run the command below to run the test suite. Note: if you use the environment listed above you will get these.

```python
```bash
pytest
```
To enable coverage stats run the command below.

```python
```bash
pytest --exitfirst --verbose --failed-first --cov=src
```

Expand Down
Binary file added docs/img/uncertainty_confounder.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/img/uncertainty_custom_classifier.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/img/uncertainty_custom_color.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/img/uncertainty_default.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/img/uncertainty_jackknife.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
70 changes: 70 additions & 0 deletions example_uncertainty.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Example script for lorepy's uncertainty_plot. Each section draws one variant
# of the plot on the iris dataset and saves it under ./docs/img (run from the
# root of the project so the relative paths resolve).
from lorepy import uncertainty_plot

from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
import pandas as pd

# Load iris dataset and convert to dataframe
iris_obj = load_iris()
iris_df = pd.DataFrame(iris_obj.data, columns=iris_obj.feature_names)

iris_df["species"] = [iris_obj.target_names[s] for s in iris_obj.target]

# Default uncertainty plot
uncertainty_plot(data=iris_df, x="sepal width (cm)", y="species", iterations=100)
plt.savefig("./docs/img/uncertainty_default.png", dpi=150)
plt.show()

# Using jackknife instead of resample to assess uncertainty.
# mode defaults to "resample", so it has to be set explicitly here; without it
# jackknife_fraction is ignored and this would just be another bootstrap plot.
uncertainty_plot(
    data=iris_df,
    x="sepal width (cm)",
    y="species",
    iterations=100,
    mode="jackknife",
    jackknife_fraction=0.8,
)
plt.savefig("./docs/img/uncertainty_jackknife.png", dpi=150)
plt.show()

# Uncertainty plot with custom colors
from matplotlib.colors import ListedColormap

colormap = ListedColormap(["red", "green", "blue"])
uncertainty_plot(
    data=iris_df,
    x="sepal width (cm)",
    y="species",
    iterations=100,
    mode="resample",
    colormap=colormap,
)
plt.savefig("./docs/img/uncertainty_custom_color.png", dpi=150)
plt.show()

# Uncertainty plot with a confounder
uncertainty_plot(
    data=iris_df,
    x="sepal width (cm)",
    y="species",
    iterations=100,
    mode="resample",
    confounders=[("petal width (cm)", 1)],
)
plt.savefig("./docs/img/uncertainty_confounder.png", dpi=150)
plt.show()

# Uncertainty plot with a custom classifier
from sklearn.svm import SVC

# probability=True is needed so the SVC exposes predict_proba()
svc = SVC(probability=True)

uncertainty_plot(
    data=iris_df,
    x="sepal width (cm)",
    y="species",
    iterations=100,
    mode="resample",
    clf=svc,
)
plt.savefig("./docs/img/uncertainty_custom_classifier.png", dpi=150)
plt.show()
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setup(
name="lorepy",
version="0.3.0",
version="0.4.0",
author="Sebastian Proost",
author_email="sebastian.proost@gmail.com",
description="Draw Logistic Regression Plots in Python",
Expand Down
1 change: 1 addition & 0 deletions src/lorepy/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from .lorepy import loreplot
from .uncertainty import uncertainty_plot
38 changes: 23 additions & 15 deletions src/lorepy/lorepy.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,24 @@
from typing import Optional, Tuple


def _get_area_df(lg, x_feature, x_range, deconfound=[]) -> DataFrame:
def _prepare_data(
    data: DataFrame, x: str, y: str, confounders
) -> Tuple[np.ndarray, np.ndarray, tuple]:
    """
    Extract the regression inputs for a plot from a dataframe

    Rows with a missing value in x, y or any confounder column are dropped before conversion.

    :param data: Pandas dataframe with data
    :param x: Name of the numerical feature on the x-axis, becomes the first column of the feature matrix
    :param y: Name of the categorical feature
    :param confounders: List of tuples with the feature and reference value, only the feature names are used here
    :return: Tuple with the feature matrix (x first, then the confounders), the labels and the (min, max) of x
    :raises ValueError: When no complete rows remain after dropping missing values
    """
    x_features = [x] + [confounder[0] for confounder in confounders]

    complete_rows = data[x_features + [y]].dropna()
    if complete_rows.empty:
        # Fail here with a readable message instead of numpy's "zero-size array" error on min()/max() below
        raise ValueError(
            f"No rows without missing values found for columns {x_features + [y]}"
        )

    X_reg = np.array(complete_rows[x_features])
    y_reg = np.array(complete_rows[y])

    x_values = X_reg[:, 0]
    x_range = (x_values.min(), x_values.max())

    return X_reg, y_reg, x_range


def _get_area_df(lg, x_feature, x_range, confounders=[]) -> DataFrame:
values = np.linspace(x_range[0], x_range[1], num=200)

predict_df = pd.DataFrame({"values": values})

for k, v in deconfound:
for k, v in confounders:
predict_df[k] = v

proba = lg.predict_proba(predict_df.values)
Expand All @@ -25,14 +37,14 @@ def _get_area_df(lg, x_feature, x_range, deconfound=[]) -> DataFrame:
def _get_dots_df(X, y, lg, y_feature, confounders=[]) -> DataFrame:
output = []

for v, s in zip(X, y):
proba = lg.predict_proba([v] + [i[1] for i in confounders])
for x, s in zip(X, y):
proba = lg.predict_proba([x] + [i[1] for i in confounders])
i = list(lg.classes_).index(s)
min_value = sum(proba[0][:i])
max_value = sum(proba[0][: i + 1])
margin = (max_value - min_value) / 10
ypos = np.random.uniform(low=min_value + margin, high=max_value - margin)
output.append({y_feature: s, "x": v[0], "y": ypos})
output.append({y_feature: s, "x": x[0], "y": ypos})

return DataFrame(output)

Expand All @@ -47,16 +59,16 @@ def loreplot(
ax=None,
clf=None,
confounders=[],
**kwargs
**kwargs,
):
"""
Code to create a loreplot with a numerical feature on the x-axis and categorical y from a pandas dataset
Code to create a loreplot with a numerical feature on the x-axis and categorical y from a pandas dataset
:param data: Pandas dataframe with data
:param x: Needs to be a numerical feature
:param y: Categorical feature
:param add_dots: Shows where true samples are in the plot (cannot be enabled when deconfounding for additional variables)
:param x_range: Either None (range will be selected automatically) or a tuple with min and max value for the x-axis
:param x_range: Either None (range will be selected automatically) or a tuple with min and max value for the x-axis
:param scatter_kws: Dictionary with keyword arguments to pass to the scatter function
:param ax: subplot to draw on, in case lorepy is used in a subplot
:param clf: provide a different scikit-learn classifier for the function. Should implement the predict_proba() and fit()
Expand All @@ -66,22 +78,18 @@ def loreplot(
if ax is None:
ax = plt.gca()

x_features = [x] + [i[0] for i in confounders]

tmp_df = data[x_features + [y]].dropna()
X_reg = np.array(tmp_df[x_features])
y_reg = np.array(tmp_df[y])
X_reg, y_reg, r = _prepare_data(data, x, y, confounders)

if x_range is None:
x_range = (X_reg[:, 0].min(), X_reg[:, 0].max())
x_range = r

lg = LogisticRegression(multi_class="multinomial") if clf is None else clf
lg.fit(X_reg, y_reg)

if "linestyle" not in kwargs.keys():
kwargs["linestyle"] = "None"

area_df = _get_area_df(lg, x, x_range, deconfound=confounders)
area_df = _get_area_df(lg, x, x_range, confounders=confounders)
area_df.plot.area(ax=ax, **kwargs)

if add_dots and len(confounders) == 0:
Expand Down
130 changes: 130 additions & 0 deletions src/lorepy/uncertainty.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from pandas import DataFrame
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

from lorepy.lorepy import _get_area_df, _prepare_data


def _get_uncertainty_data(
    x: str,
    X_reg,
    y_reg,
    x_range,
    mode="resample",
    jackknife_fraction: float = 0.8,
    iterations: int = 100,
    confounders=None,
    clf=None,
):
    """
    Refit the classifier on repeated subsamples of the data and summarise the spread of the predicted values

    :param x: Name of the feature on the x-axis, used as the id column when the per-iteration results are combined
    :param X_reg: Feature matrix that is subsampled and passed to fit()
    :param y_reg: Labels matching X_reg
    :param x_range: Tuple with min and max value for the x-axis, forwarded to _get_area_df
    :param mode: Sampling method, either "resample" (sampling with replacement) or "jackknife" (default = "resample")
    :param jackknife_fraction: Fraction of data to retain for each jackknife sample, ignored in resample mode (default = 0.8)
    :param iterations: Number of iterations for resampling or jackknife, needs to be at least 1 (default = 100)
    :param confounders: List of tuples with the feature and reference value, forwarded to _get_area_df. None is treated as no confounders
    :param clf: Classifier to fit. If None a new LogisticRegression is created for each iteration, a provided classifier is refit in each iteration
    :return: Dataframe with one row per combination of x-value and category ("variable") and the columns min, mean, max, low_95, high_95, low_50 and high_50
    :raises NotImplementedError: When mode is neither "jackknife" nor "resample"
    :raises ValueError: When iterations is smaller than 1
    """
    # Validate before doing any work, so an invalid mode is also reported when the loop below would not run
    if mode not in ("jackknife", "resample"):
        raise NotImplementedError(
            f"Mode {mode} is unsupported, only jackknife and resample are valid modes"
        )
    if iterations < 1:
        # pd.concat would otherwise fail on an empty list with an unrelated message
        raise ValueError("iterations must be at least 1")

    confounders = [] if confounders is None else confounders

    areas = []
    for _ in range(iterations):
        if mode == "jackknife":
            # Only the training part of the split is kept, the remainder is discarded
            X_keep, _, y_keep, _ = train_test_split(
                X_reg, y_reg, train_size=jackknife_fraction
            )
        else:
            X_keep, y_keep = resample(X_reg, y_reg, replace=True)

        lg = LogisticRegression(multi_class="multinomial") if clf is None else clf
        lg.fit(X_keep, y_keep)

        # presumably _get_area_df returns a frame indexed by x, reset_index turns it into the column melt() needs
        new_area = _get_area_df(lg, x, x_range, confounders=confounders).reset_index()

        areas.append(new_area)

    long_df = pd.concat(areas).melt(id_vars=[x]).sort_values(x)

    output = (
        long_df.groupby([x, "variable"])
        .agg(
            min=pd.NamedAgg(column="value", aggfunc="min"),
            mean=pd.NamedAgg(column="value", aggfunc="mean"),
            max=pd.NamedAgg(column="value", aggfunc="max"),
            low_95=pd.NamedAgg(column="value", aggfunc=lambda v: np.percentile(v, 2.5)),
            high_95=pd.NamedAgg(
                column="value", aggfunc=lambda v: np.percentile(v, 97.5)
            ),
            low_50=pd.NamedAgg(column="value", aggfunc=lambda v: np.percentile(v, 25)),
            high_50=pd.NamedAgg(column="value", aggfunc=lambda v: np.percentile(v, 75)),
        )
        .reset_index()
    )

    return output


def uncertainty_plot(
    data: DataFrame,
    x: str,
    y: str,
    x_range=None,
    mode="resample",
    jackknife_fraction=0.8,
    iterations=100,
    confounders=None,
    colormap=None,
    clf=None,
):
    """
    Code to create a multi-panel plot, one panel for each category, with the prevalence of that category across the
    range of x-values, along with the uncertainty (intervals containing 50% and 95% of the samples are shown)

    :param data: Pandas dataframe with data
    :param x: Needs to be a numerical feature
    :param y: Categorical feature
    :param x_range: Either None (range will be selected automatically) or a tuple with min and max value for the x-axis
    :param mode: Sampling method, either "resample" (bootstrap) or "jackknife" (default = "resample")
    :param jackknife_fraction: Fraction of data to retain for each jackknife sample, only used when mode is "jackknife" (default = 0.8)
    :param iterations: Number of iterations for resampling or jackknife (default = 100)
    :param confounders: List of tuples with the feature and reference value e.g., [("BMI", 25)] will use a reference of 25 for plots. None (default) means no confounders
    :param colormap: Colormap (or name of a registered colormap) to use for the plot, default is None in which case "tab10" will be used
    :param clf: Provide a different scikit-learn classifier for the function. Should implement the predict_proba() and fit(). If None a LogisticRegression will be used.
    :return: A tuple containing the figure and axes objects
    """
    confounders = [] if confounders is None else confounders

    X_reg, y_reg, r = _prepare_data(data, x, y, confounders)

    if x_range is None:
        x_range = r

    plot_df = _get_uncertainty_data(
        x,
        X_reg,
        y_reg,
        x_range,
        mode=mode,
        jackknife_fraction=jackknife_fraction,
        iterations=iterations,
        confounders=confounders,
        clf=clf,
    )

    categories = plot_df.variable.unique()

    fig, axs = plt.subplots(ncols=len(categories), sharex=True, sharey=True)

    # subplots() returns a bare Axes instead of an array for a single column, wrap it so indexing always works
    panels = np.atleast_1d(axs)

    # get_cmap accepts both names and Colormap instances (an instance is returned as is)
    cmap = plt.get_cmap("tab10" if colormap is None else colormap)

    for idx, (panel, category) in enumerate(zip(panels, categories)):
        cat_df = plot_df[plot_df.variable == category]
        color = cmap(idx)

        # Wide, faint band for the 95% interval with a narrower, darker band for the 50% interval on top
        panel.fill_between(
            cat_df[x], cat_df["low_95"], cat_df["high_95"], alpha=0.1, color=color
        )
        panel.fill_between(
            cat_df[x], cat_df["low_50"], cat_df["high_50"], alpha=0.2, color=color
        )
        panel.plot(cat_df[x], cat_df["mean"], color=color)
        panel.set_title(category)
        panel.set_xlabel(x)

        panel.set_xlim(*x_range)
        panel.set_ylim(0, 1)

    return fig, axs
4 changes: 2 additions & 2 deletions tests/test_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

# Test case for loreplot with default parameters
def test_loreplot_default():
loreplot(df, "x", "y") ## first test without specifying the axis
loreplot(df, "x", "y") # first test without specifying the axis

fig, ax = plt.subplots()
loreplot(df, "x", "y", ax=ax)
Expand All @@ -30,7 +30,7 @@ def test_loreplot_default():
def test_loreplot_default():
loreplot(
df, "x", "y", confounders=[("z", 1)]
) ## first test without specifying the axis
) # first test without specifying the axis

fig, ax = plt.subplots()
loreplot(df, "x", "y", ax=ax)
Expand Down
Loading

0 comments on commit 5142ad3

Please sign in to comment.