Skip to content

Commit

Permalink
Merge pull request #208 from hjkgrp/active_learning
Browse files Browse the repository at this point in the history
Active learning
  • Loading branch information
ralf-meyer authored Apr 17, 2024
2 parents 7d32159 + 0d6a64f commit 78edc26
Show file tree
Hide file tree
Showing 3 changed files with 382 additions and 0 deletions.
Empty file.
269 changes: 269 additions & 0 deletions molSimplify/Informatics/active_learning/expected_improvement.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,269 @@
import numpy as np
from scipy.stats import norm
from typing import Tuple


def get_2D_pareto_indices(points: np.ndarray) -> np.ndarray:
    """Calculate the indices of the points that make up the 2D pareto front.

    Parameters
    ----------
    points : array_like, shape (N,2)
        two dimensional array of all observations

    Returns
    -------
    pareto_indices : ndarray of int, shape (M,)
        list of indices that define the pareto front
    """
    # Sweep-line algorithm, see
    # https://en.wikipedia.org/wiki/Maxima_of_a_point_set#Two_dimensions
    # adapted here for *minimization* of both dimensions: walk the points in
    # increasing x order and keep every point whose y value lies below the
    # smallest y value encountered so far.
    front = []
    best_y = np.inf
    for idx in np.argsort(points[:, 0]):
        y = points[idx, 1]
        if y < best_y:
            # New non-dominated point; its y becomes the bound to beat.
            front.append(idx)
            best_y = y
    return np.array(front)


def get_2D_EI(pred_mean: np.ndarray, pred_std: np.ndarray,
              pareto_points: np.ndarray, method: str = "aug") -> np.ndarray:
    """Calculates the two dimensional expected improvement following equation
    (16) in A. J. Keane, AIAA Journal, 44, 4, 2006, https://doi.org/10.2514/1.16875

    Parameters
    ----------
    pred_mean : array_like, shape (N,2)
        the predicted mean from a ML model for both target properties
    pred_std : array_like, shape (N,2)
        the predicted std from a ML model for both target properties
    pareto_points : array_like, shape (M,2)
        the points on the current Pareto front, must be correctly ordered
    method: str, default = "aug"
        use the probability distribution that augments ("aug") or dominates ("dom")
        the current Pareto front. "mix" is an average of the two and corresponds to
        straight line connections between the points on the Pareto front.

    Returns
    -------
    np.ndarray, shape (N,)
        array of the expected improvement values
    """
    PI, centroid = get_2D_PI_and_centroid(
        pred_mean, pred_std, pareto_points, method=method)

    # Distance from every centroid to its nearest Pareto point: broadcasting
    # yields an (N, M, 2) difference array, the Euclidean norm over the last
    # axis gives (N, M) distances, and the minimum over M selects the closest
    # front point for each candidate.
    diffs = centroid[:, np.newaxis, :] - pareto_points[np.newaxis, :, :]
    min_dist = np.linalg.norm(diffs, axis=2).min(axis=1)
    # Equation (16): EI = probability of improvement times the distance from
    # the centroid of the improving region to the nearest Pareto point.
    return PI * min_dist


def get_2D_PI_and_centroid(pred_mean: np.ndarray, pred_std: np.ndarray,
                           pareto_points: np.ndarray, method: str = "aug"
                           ) -> Tuple[np.ndarray, np.ndarray]:
    """Calculates the two dimensional probability of improvement and the
    first moments of that probability distribution according to equations (14) / (16)
    and (18) / (19) in A. J. Keane, AIAA Journal, 44, 4, 2006,
    https://doi.org/10.2514/1.16875

    Note that there is a mistake in the final term in equations (18) and (19) of the
    paper.

    Parameters
    ----------
    pred_mean : array_like, shape (N,2)
        the predicted mean from a ML model for both target properties
    pred_std : array_like, shape (N,2)
        the predicted std from a ML model for both target properties
    pareto_points : array_like, shape (M,2)
        the points on the current Pareto front, must be correctly ordered
    method: str, default = "aug"
        use the probability distribution that augments ("aug") or dominates ("dom")
        the current Pareto front. "mix" is an average of the two and corresponds to
        straight line connections between the points on the Pareto front.

    Returns
    -------
    np.ndarray, shape (N,)
        array of the probability of improvement values
    np.ndarray, shape (N,2)
        array of the centroids
    """
    # The integration below relies on a validly ordered Pareto front:
    # x coordinates ascending, y coordinates descending.
    if np.any(np.diff(pareto_points[:, 0]) < 0.0):
        raise ValueError("The x values of the Pareto front must be in ascending order")
    if np.any(np.diff(pareto_points[:, 1]) > 0.0):
        raise ValueError("The y values of the Pareto front must be in descending order")

    # Unpack into names matching the notation in the paper (the star on the
    # Pareto front coordinates is dropped for brevity).
    mu_1, mu_2 = pred_mean[:, 0], pred_mean[:, 1]
    s_1, s_2 = pred_std[:, 0], pred_std[:, 1]
    f_1e, f_2e = pareto_points[:, 0], pareto_points[:, 1]

    def _cdf(f_e, mu, s):
        # Phi((f_e - mu) / s): probability mass below f_e.
        return norm.cdf((f_e - mu) / s)

    def _pdf(f_e, mu, s):
        # phi((f_e - mu) / s): standard normal density at the scaled point.
        return norm.pdf((f_e - mu) / s)

    def _first_moment(f_e, mu, s):
        # Integral of y * N(y; mu, s) dy from -inf to f_e, obtained by
        # integration by parts: mu * Phi(t) - s * phi(t) with t = (f_e - mu)/s.
        t = (f_e - mu) / s
        return mu * norm.cdf(t) - s * norm.pdf(t)

    # NOTE: instead of the integration variables y_1 and y_2 used in the paper,
    # the comments refer to the two dimensions simply as x and y.
    # Leftmost slab: integral dx from -inf to the x coordinate of the first
    # Pareto point, times the integral dy over the whole axis (which is 1).
    PI = _cdf(f_1e[0], mu_1, s_1)

    centroid = np.zeros_like(pred_mean)
    # x moment of the leftmost slab (the dy factor is again 1).
    centroid[:, 0] = _first_moment(f_1e[0], mu_1, s_1)
    # y moment: the full-axis first moment of y is simply mu_2, multiplied by
    # the same dx integral as in PI.
    centroid[:, 1] = _cdf(f_1e[0], mu_1, s_1) * mu_2

    # Interior slabs between consecutive Pareto points.
    for i in range(len(pareto_points) - 1):
        # dx integral from Pareto point i to Pareto point i+1, and the
        # corresponding x moment.
        int_dx = _cdf(f_1e[i + 1], mu_1, s_1) - _cdf(f_1e[i], mu_1, s_1)
        cent_1_dx = (_first_moment(f_1e[i + 1], mu_1, s_1)
                     - _first_moment(f_1e[i], mu_1, s_1))
        # The dy integral depends on which improvement region is used.
        if method == "aug":
            # Equation (14): region augmenting the front.
            int_dy = _cdf(f_2e[i], mu_2, s_2)
            cent_2_dy = _first_moment(f_2e[i], mu_2, s_2)
        elif method == "dom":
            # Equation (15): region dominating the front.
            int_dy = _cdf(f_2e[i + 1], mu_2, s_2)
            cent_2_dy = _first_moment(f_2e[i + 1], mu_2, s_2)
        elif method == "mix":
            # Average of (14) and (15), as discussed below equation (15).
            int_dy = (_cdf(f_2e[i], mu_2, s_2)
                      + _cdf(f_2e[i + 1], mu_2, s_2)) / 2
            cent_2_dy = (_first_moment(f_2e[i], mu_2, s_2)
                         + _first_moment(f_2e[i + 1], mu_2, s_2)) / 2
        else:
            raise NotImplementedError(f"Unknown method {method}")
        # Accumulate the product of the two one-dimensional integrals.
        PI += int_dx * int_dy
        centroid[:, 0] += cent_1_dx * int_dy
        centroid[:, 1] += int_dx * cent_2_dy

    # Rightmost slab: dx from the last Pareto point to +inf, dy from -inf to
    # the y coordinate of the last Pareto point.
    tail_dx = 1 - _cdf(f_1e[-1], mu_1, s_1)
    tail_dy = _cdf(f_2e[-1], mu_2, s_2)
    PI += tail_dx * tail_dy
    # x moment of the tail by integration by parts over [f_1e[-1], inf).
    # NOTE: the paper mistakenly uses mu * CDF here; the correct expression is
    # mu * (1 - CDF) + s * PDF.
    centroid[:, 0] += (mu_1 * tail_dx + s_1 * _pdf(f_1e[-1], mu_1, s_1)) * tail_dy
    # y moment of the tail: same dx integral as in PI, times the y first moment.
    centroid[:, 1] += tail_dx * _first_moment(f_2e[-1], mu_2, s_2)
    # Normalize the moments by the total probability; the floor guards against
    # division by (numerically) zero PI.
    centroid /= np.maximum(PI[:, np.newaxis], 1e-10)
    return PI, centroid


def get_2D_EHVI(pred_mean: np.ndarray, pred_std: np.ndarray,
                pareto_points: np.ndarray, r: np.ndarray):
    """Calculates the two dimensional expected hypervolume improvement following
    M. Emmerich, K. Yang, A. Deutz, H. Wang, and C. M. Fonseca, "A Multicriteria
    Generalization of Bayesian Global Optimization":
    https://doi.org/10.1007/978-3-319-29975-4_12

    Parameters
    ----------
    pred_mean : array_like, shape (N,2)
        the predicted mean from a ML model for both target properties
    pred_std : array_like, shape (N,2)
        the predicted std from a ML model for both target properties
    pareto_points : array_like, shape (M,2)
        the points on the current Pareto front, must be correctly ordered
    r : array_like, shape (2,)
        reference point for hypervolume calculation

    Returns
    -------
    np.ndarray, shape (N,)
        array of the expected hypervolume improvement values
    """
    # The cell decomposition below relies on a validly ordered Pareto front:
    # x coordinates ascending, y coordinates descending.
    if np.any(np.diff(pareto_points[:, 0]) < 0.0):
        raise ValueError("The x values of the Pareto front must be in ascending order")
    if np.any(np.diff(pareto_points[:, 1]) > 0.0):
        raise ValueError("The y values of the Pareto front must be in descending order")

    # Unpack into names matching the notation in the paper.
    mu_1, mu_2 = pred_mean[:, 0], pred_mean[:, 1]
    s_1, s_2 = pred_std[:, 0], pred_std[:, 1]

    # Augment the reversed front with two sentinel points: (r_x, -inf) on the
    # left and (-inf, r_y) on the right, giving len(front) + 1 integration
    # cells between consecutive rows of P.
    P = np.zeros([len(pareto_points) + 2, 2])
    P[0] = (r[0], -np.inf)
    P[1:-1] = pareto_points[::-1]
    P[-1] = (-np.inf, r[1])

    def psi(a, b, mu, s):
        # Helper function psi(a, b, mu, s) = s*phi(t) + (a - mu)*Phi(t)
        # with t = (b - mu) / s, as defined in the paper.
        t = (b - mu) / s
        return s * norm.pdf(t) + (a - mu) * norm.cdf(t)

    ehvi = np.zeros(len(pred_mean))
    n_cells = len(pareto_points) + 1
    # NOTE: the integration order here runs from right to left, opposite to
    # the one used in get_2D_PI_and_centroid. Indices are shifted by one
    # relative to the paper for Python indexing.
    for k in range(n_cells):
        left, right = P[k], P[k + 1]
        if k != n_cells - 1:
            # Skip the last cell here: right[0] is -inf and the product would
            # evaluate to inf * 0 = nan.
            # Equation (17) in the paper.
            ehvi += ((left[0] - right[0])
                     * norm.cdf((right[0] - mu_1) / s_1)
                     * psi(right[1], right[1], mu_2, s_2))
        # Equation (18) in the paper.
        ehvi += (psi(left[0], left[0], mu_1, s_1)
                 - psi(left[0], right[0], mu_1, s_1)
                 ) * psi(right[1], right[1], mu_2, s_2)
    return ehvi
113 changes: 113 additions & 0 deletions tests/informatics/test_active_learning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import pytest
import numpy as np
from molSimplify.Informatics.active_learning.expected_improvement import (
get_2D_pareto_indices,
get_2D_PI_and_centroid,
get_2D_EI,
get_2D_EHVI,
)


def test_get_pareto_indices():
    """Check the Pareto indices for a fixed random cloud in all four
    min/max orientations of the two objectives."""
    points = np.random.default_rng(0).normal(size=(500, 2))

    # Default orientation: minimize both dimensions.
    np.testing.assert_equal(
        get_2D_pareto_indices(points),
        [239, 119, 308, 95, 49, 230, 79, 206, 151])

    # Flip dimension 0: maximize dimension 0, minimize dimension 1.
    points[:, 0] = -points[:, 0]
    np.testing.assert_equal(
        get_2D_pareto_indices(points),
        [135, 331, 376, 247, 34, 151])

    # Flip dimension 1 as well: maximize both dimensions.
    points[:, 1] = -points[:, 1]
    np.testing.assert_equal(
        get_2D_pareto_indices(points),
        [135, 258, 23, 489, 466, 123, 109])

    # Flip dimension 0 back: minimize dimension 0, maximize dimension 1.
    points[:, 0] = -points[:, 0]
    np.testing.assert_equal(
        get_2D_pareto_indices(points),
        [239, 69, 175, 201, 389, 109])


@pytest.mark.parametrize(
    "method, ref_PI",
    [
        ("aug", [0.331885, 0.75219, 0.853196]),
        ("dom", [0.062788, 0.185605, 0.640678]),
        ("mix", [0.197337, 0.468897, 0.746937]),
    ]
)
def test_get_2D_PI_and_centroid(method, ref_PI, atol=1e-6):
    """PI must match the reference values and be symmetric under swapping
    the two objectives; the centroids must swap axes accordingly."""
    mean = np.array([[5.0, 5.0], [3.0, 6.0], [3.0, 3.0]])
    std = np.array([[1.2, 1.4], [1.1, 0.8], [2.6, 1.1]])
    front = np.array([[2.0, 7.0], [4.0, 4.0], [6.0, 2.0], [8.0, 1.0]])

    PI_xy, centroid_xy = get_2D_PI_and_centroid(mean, std, front, method=method)
    # Swap the objective axes. NOTE: the Pareto front must also be reversed so
    # that the ordering constraints (x ascending, y descending) still hold.
    PI_yx, centroid_yx = get_2D_PI_and_centroid(
        mean[:, ::-1], std[:, ::-1], front[::-1, ::-1], method=method)

    np.testing.assert_allclose(PI_xy, PI_yx, atol=atol)
    np.testing.assert_allclose(centroid_xy, centroid_yx[:, ::-1], atol=atol)
    np.testing.assert_allclose(PI_xy, ref_PI, atol=atol)


@pytest.mark.parametrize(
    "method, ref_EI",
    [
        ("aug", [0.033024, 0.966226, 1.684631]),
        ("dom", [0.066208, 0.220915, 1.708273]),
        ("mix", [0.017599, 0.551167, 1.693937]),
    ]
)
def test_get_2D_EI(method, ref_EI, atol=1e-6):
    """EI must match the reference values and be invariant under swapping
    the two objectives."""
    mean = np.array([[5.0, 5.0], [3.0, 6.0], [3.0, 3.0]])
    std = np.array([[1.2, 1.4], [1.1, 0.8], [2.6, 1.1]])
    front = np.array([[2.0, 7.0], [4.0, 4.0], [6.0, 2.0], [8.0, 1.0]])

    EI_xy = get_2D_EI(mean, std, front, method=method)
    # Swap the objective axes. NOTE: the Pareto front must also be reversed so
    # that the ordering constraints still hold.
    EI_yx = get_2D_EI(mean[:, ::-1], std[:, ::-1], front[::-1, ::-1],
                      method=method)

    np.testing.assert_allclose(EI_xy, EI_yx, atol=atol)
    np.testing.assert_allclose(EI_xy, ref_EI, atol=atol)


@pytest.mark.parametrize(
    "pred_mean, pred_std, pareto_points, r, ref_EHVI",
    [
        # Single candidate centered on the front with small uncertainty.
        (np.array([[0.0, 0.0]]),
         np.array([[0.1, 0.1]]),
         np.array([[0.0, 2.0], [1.0, 1.0], [2.0, 0.0]]),
         np.array([3.0, 3.0]),
         [3.07978846]),
        # Single candidate far from the front with large uncertainty.
        (np.array([[10.0, 10.0]]),
         np.array([[4.0, 4.0]]),
         np.array([[1.0, 2.0], [2.0, 1.0]]),
         np.array([11.0, 11.0]),
         [0.0726813]),
        # Several candidates against a four point front.
        (np.array([[5.0, 5.0], [3.0, 6.0], [3.0, 3.0]]),
         np.array([[1.2, 1.4], [1.1, 0.8], [2.6, 1.1]]),
         np.array([[2.0, 7.0], [4.0, 4.0], [6.0, 2.0], [8.0, 1.0]]),
         np.array([10.0, 10.0]),
         [0.4975774, 1.4780866, 10.2390386]),
    ]
)
def test_get_2D_EHVI(pred_mean, pred_std, pareto_points, r, ref_EHVI, atol=1e-6):
    """EHVI must match the reference values and be invariant under swapping
    the two objectives."""
    EHVI_xy = get_2D_EHVI(pred_mean, pred_std, pareto_points, r)
    # Swap the objective axes. NOTE: the Pareto front must also be reversed so
    # that the ordering constraints still hold.
    EHVI_yx = get_2D_EHVI(pred_mean[:, ::-1], pred_std[:, ::-1],
                          pareto_points[::-1, ::-1], r[::-1])

    np.testing.assert_allclose(EHVI_xy, EHVI_yx, atol=atol)
    np.testing.assert_allclose(EHVI_xy, ref_EHVI, atol=atol)

0 comments on commit 78edc26

Please sign in to comment.