Skip to content

Commit

Permalink
Merge pull request #208 from hjkgrp/active_learning
Browse files Browse the repository at this point in the history
Active learning
  • Loading branch information
ralf-meyer authored Apr 17, 2024
2 parents 7d32159 + 0d6a64f commit 78edc26
Show file tree
Hide file tree
Showing 3 changed files with 382 additions and 0 deletions.
Empty file.
269 changes: 269 additions & 0 deletions molSimplify/Informatics/active_learning/expected_improvement.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,269 @@
import numpy as np
from scipy.stats import norm
from typing import Tuple


def get_2D_pareto_indices(points: np.ndarray) -> np.ndarray:
    """Calculate the indices of the points that make up the 2D pareto front.

    Parameters
    ----------
    points : array_like, shape (N,2)
        two dimensional array of all observations

    Returns
    -------
    pareto_indices : ndarray of int, shape (M,)
        list of indices that define the pareto front
    """
    # Sweep-line algorithm, see
    # https://en.wikipedia.org/wiki/Maxima_of_a_point_set#Two_dimensions
    # adapted here for *minimization* of both dimensions: walk the points in
    # increasing x order and keep every point whose y value lies below the
    # smallest y value encountered so far.
    front = []
    best_y = np.inf
    for idx in np.argsort(points[:, 0]):
        y = points[idx, 1]
        if y < best_y:
            # New non-dominated point; its y becomes the bound to beat.
            front.append(idx)
            best_y = y
    return np.array(front)


def get_2D_EI(pred_mean: np.ndarray, pred_std: np.ndarray,
              pareto_points: np.ndarray, method: str = "aug") -> np.ndarray:
    """Calculates the two dimensional expected improvement following equation
    (16) in A. J. Keane, AIAA Journal, 44, 4, 2006, https://doi.org/10.2514/1.16875

    Parameters
    ----------
    pred_mean : array_like, shape (N,2)
        the predicted mean from a ML model for both target properties
    pred_std : array_like, shape (N,2)
        the predicted std from a ML model for both target properties
    pareto_points : array_like, shape (M,2)
        the points on the current Pareto front, must be correctly ordered
    method: str, default = "aug"
        use the probability distribution that augments ("aug") or dominates ("dom")
        the current Pareto front. "mix" is an average of the two and corresponds to
        straight line connections between the points on the Pareto front.

    Returns
    -------
    np.ndarray, shape (N,)
        array of the expected improvement values
    """
    PI, centroid = get_2D_PI_and_centroid(
        pred_mean, pred_std, pareto_points, method=method)

    # Distance from every centroid to its nearest Pareto point: broadcasting
    # yields an (N, M, 2) difference array, the Euclidean norm over the last
    # axis gives (N, M) distances, and the minimum over M selects the closest
    # front point for each candidate.
    diffs = centroid[:, np.newaxis, :] - pareto_points[np.newaxis, :, :]
    min_dist = np.linalg.norm(diffs, axis=2).min(axis=1)
    # Equation (16): EI = probability of improvement times the distance from
    # the centroid of the improving region to the nearest Pareto point.
    return PI * min_dist


def get_2D_PI_and_centroid(pred_mean: np.ndarray, pred_std: np.ndarray,
                           pareto_points: np.ndarray, method: str = "aug"
                           ) -> Tuple[np.ndarray, np.ndarray]:
    """Calculates the two dimensional probability of improvement and the
    first moments of that probability distribution according to equations (14) / (16)
    and (18) / (19) in A. J. Keane, AIAA Journal, 44, 4, 2006,
    https://doi.org/10.2514/1.16875

    Note that there is a mistake in the final term in equations (18) and (19) of the
    paper.

    Parameters
    ----------
    pred_mean : array_like, shape (N,2)
        the predicted mean from a ML model for both target properties
    pred_std : array_like, shape (N,2)
        the predicted std from a ML model for both target properties
    pareto_points : array_like, shape (M,2)
        the points on the current Pareto front, must be correctly ordered
    method: str, default = "aug"
        use the probability distribution that augments ("aug") or dominates ("dom")
        the current Pareto front. "mix" is an average of the two and corresponds to
        straight line connections between the points on the Pareto front.

    Returns
    -------
    np.ndarray, shape (N,)
        array of the probability of improvement values
    np.ndarray, shape (N,2)
        array of the centroids
    """
    # The integration below relies on a validly ordered Pareto front:
    # x coordinates ascending, y coordinates descending.
    if np.any(np.diff(pareto_points[:, 0]) < 0.0):
        raise ValueError("The x values of the Pareto front must be in ascending order")
    if np.any(np.diff(pareto_points[:, 1]) > 0.0):
        raise ValueError("The y values of the Pareto front must be in descending order")

    # Unpack into names matching the notation in the paper (the star on the
    # Pareto front coordinates is dropped for brevity).
    mu_1, mu_2 = pred_mean[:, 0], pred_mean[:, 1]
    s_1, s_2 = pred_std[:, 0], pred_std[:, 1]
    f_1e, f_2e = pareto_points[:, 0], pareto_points[:, 1]

    def _cdf(f_e, mu, s):
        # Phi((f_e - mu) / s): probability mass below f_e.
        return norm.cdf((f_e - mu) / s)

    def _pdf(f_e, mu, s):
        # phi((f_e - mu) / s): standard normal density at the scaled point.
        return norm.pdf((f_e - mu) / s)

    def _first_moment(f_e, mu, s):
        # Integral of y * N(y; mu, s) dy from -inf to f_e, obtained by
        # integration by parts: mu * Phi(t) - s * phi(t) with t = (f_e - mu)/s.
        t = (f_e - mu) / s
        return mu * norm.cdf(t) - s * norm.pdf(t)

    # NOTE: instead of the integration variables y_1 and y_2 used in the paper,
    # the comments refer to the two dimensions simply as x and y.
    # Leftmost slab: integral dx from -inf to the x coordinate of the first
    # Pareto point, times the integral dy over the whole axis (which is 1).
    PI = _cdf(f_1e[0], mu_1, s_1)

    centroid = np.zeros_like(pred_mean)
    # x moment of the leftmost slab (the dy factor is again 1).
    centroid[:, 0] = _first_moment(f_1e[0], mu_1, s_1)
    # y moment: the full-axis first moment of y is simply mu_2, multiplied by
    # the same dx integral as in PI.
    centroid[:, 1] = _cdf(f_1e[0], mu_1, s_1) * mu_2

    # Interior slabs between consecutive Pareto points.
    for i in range(len(pareto_points) - 1):
        # dx integral from Pareto point i to Pareto point i+1, and the
        # corresponding x moment.
        int_dx = _cdf(f_1e[i + 1], mu_1, s_1) - _cdf(f_1e[i], mu_1, s_1)
        cent_1_dx = (_first_moment(f_1e[i + 1], mu_1, s_1)
                     - _first_moment(f_1e[i], mu_1, s_1))
        # The dy integral depends on which improvement region is used.
        if method == "aug":
            # Equation (14): region augmenting the front.
            int_dy = _cdf(f_2e[i], mu_2, s_2)
            cent_2_dy = _first_moment(f_2e[i], mu_2, s_2)
        elif method == "dom":
            # Equation (15): region dominating the front.
            int_dy = _cdf(f_2e[i + 1], mu_2, s_2)
            cent_2_dy = _first_moment(f_2e[i + 1], mu_2, s_2)
        elif method == "mix":
            # Average of (14) and (15), as discussed below equation (15).
            int_dy = (_cdf(f_2e[i], mu_2, s_2)
                      + _cdf(f_2e[i + 1], mu_2, s_2)) / 2
            cent_2_dy = (_first_moment(f_2e[i], mu_2, s_2)
                         + _first_moment(f_2e[i + 1], mu_2, s_2)) / 2
        else:
            raise NotImplementedError(f"Unknown method {method}")
        # Accumulate the product of the two one-dimensional integrals.
        PI += int_dx * int_dy
        centroid[:, 0] += cent_1_dx * int_dy
        centroid[:, 1] += int_dx * cent_2_dy

    # Rightmost slab: dx from the last Pareto point to +inf, dy from -inf to
    # the y coordinate of the last Pareto point.
    tail_dx = 1 - _cdf(f_1e[-1], mu_1, s_1)
    tail_dy = _cdf(f_2e[-1], mu_2, s_2)
    PI += tail_dx * tail_dy
    # x moment of the tail by integration by parts over [f_1e[-1], inf).
    # NOTE: the paper mistakenly uses mu * CDF here; the correct expression is
    # mu * (1 - CDF) + s * PDF.
    centroid[:, 0] += (mu_1 * tail_dx + s_1 * _pdf(f_1e[-1], mu_1, s_1)) * tail_dy
    # y moment of the tail: same dx integral as in PI, times the y first moment.
    centroid[:, 1] += tail_dx * _first_moment(f_2e[-1], mu_2, s_2)
    # Normalize the moments by the total probability; the floor guards against
    # division by (numerically) zero PI.
    centroid /= np.maximum(PI[:, np.newaxis], 1e-10)
    return PI, centroid


def get_2D_EHVI(pred_mean: np.ndarray, pred_std: np.ndarray,
                pareto_points: np.ndarray, r: np.ndarray):
    """Calculates the two dimensional expected hypervolume improvement following
    M. Emmerich, K. Yang, A. Deutz, H. Wang, and C. M. Fonseca, "A Multicriteria
    Generalization of Bayesian Global Optimization":
    https://doi.org/10.1007/978-3-319-29975-4_12

    Parameters
    ----------
    pred_mean : array_like, shape (N,2)
        the predicted mean from a ML model for both target properties
    pred_std : array_like, shape (N,2)
        the predicted std from a ML model for both target properties
    pareto_points : array_like, shape (M,2)
        the points on the current Pareto front, must be correctly ordered
    r : array_like, shape (2,)
        reference point for hypervolume calculation

    Returns
    -------
    np.ndarray, shape (N,)
        array of the expected hypervolume improvement values
    """
    # The cell decomposition below relies on a validly ordered Pareto front:
    # x coordinates ascending, y coordinates descending.
    if np.any(np.diff(pareto_points[:, 0]) < 0.0):
        raise ValueError("The x values of the Pareto front must be in ascending order")
    if np.any(np.diff(pareto_points[:, 1]) > 0.0):
        raise ValueError("The y values of the Pareto front must be in descending order")

    # Unpack into names matching the notation in the paper.
    mu_1, mu_2 = pred_mean[:, 0], pred_mean[:, 1]
    s_1, s_2 = pred_std[:, 0], pred_std[:, 1]

    # Augment the reversed front with two sentinel points: (r_x, -inf) on the
    # left and (-inf, r_y) on the right, giving len(front) + 1 integration
    # cells between consecutive rows of P.
    P = np.zeros([len(pareto_points) + 2, 2])
    P[0] = (r[0], -np.inf)
    P[1:-1] = pareto_points[::-1]
    P[-1] = (-np.inf, r[1])

    def psi(a, b, mu, s):
        # Helper function psi(a, b, mu, s) = s*phi(t) + (a - mu)*Phi(t)
        # with t = (b - mu) / s, as defined in the paper.
        t = (b - mu) / s
        return s * norm.pdf(t) + (a - mu) * norm.cdf(t)

    ehvi = np.zeros(len(pred_mean))
    n_cells = len(pareto_points) + 1
    # NOTE: the integration order here runs from right to left, opposite to
    # the one used in get_2D_PI_and_centroid. Indices are shifted by one
    # relative to the paper for Python indexing.
    for k in range(n_cells):
        left, right = P[k], P[k + 1]
        if k != n_cells - 1:
            # Skip the last cell here: right[0] is -inf and the product would
            # evaluate to inf * 0 = nan.
            # Equation (17) in the paper.
            ehvi += ((left[0] - right[0])
                     * norm.cdf((right[0] - mu_1) / s_1)
                     * psi(right[1], right[1], mu_2, s_2))
        # Equation (18) in the paper.
        ehvi += (psi(left[0], left[0], mu_1, s_1)
                 - psi(left[0], right[0], mu_1, s_1)
                 ) * psi(right[1], right[1], mu_2, s_2)
    return ehvi
113 changes: 113 additions & 0 deletions tests/informatics/test_active_learning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import pytest
import numpy as np
from molSimplify.Informatics.active_learning.expected_improvement import (
get_2D_pareto_indices,
get_2D_PI_and_centroid,
get_2D_EI,
get_2D_EHVI,
)


def test_get_pareto_indices():
    """Check the Pareto indices for a fixed random cloud in all four
    min/max orientations of the two objectives."""
    points = np.random.default_rng(0).normal(size=(500, 2))

    # Default orientation: minimize both dimensions.
    np.testing.assert_equal(
        get_2D_pareto_indices(points),
        [239, 119, 308, 95, 49, 230, 79, 206, 151])

    # Flip dimension 0: maximize dimension 0, minimize dimension 1.
    points[:, 0] = -points[:, 0]
    np.testing.assert_equal(
        get_2D_pareto_indices(points),
        [135, 331, 376, 247, 34, 151])

    # Flip dimension 1 as well: maximize both dimensions.
    points[:, 1] = -points[:, 1]
    np.testing.assert_equal(
        get_2D_pareto_indices(points),
        [135, 258, 23, 489, 466, 123, 109])

    # Flip dimension 0 back: minimize dimension 0, maximize dimension 1.
    points[:, 0] = -points[:, 0]
    np.testing.assert_equal(
        get_2D_pareto_indices(points),
        [239, 69, 175, 201, 389, 109])


@pytest.mark.parametrize(
    "method, ref_PI",
    [
        ("aug", [0.331885, 0.75219, 0.853196]),
        ("dom", [0.062788, 0.185605, 0.640678]),
        ("mix", [0.197337, 0.468897, 0.746937]),
    ]
)
def test_get_2D_PI_and_centroid(method, ref_PI, atol=1e-6):
    """PI must match the reference values and be symmetric under swapping
    the two objectives; the centroids must swap axes accordingly."""
    mean = np.array([[5.0, 5.0], [3.0, 6.0], [3.0, 3.0]])
    std = np.array([[1.2, 1.4], [1.1, 0.8], [2.6, 1.1]])
    front = np.array([[2.0, 7.0], [4.0, 4.0], [6.0, 2.0], [8.0, 1.0]])

    PI_xy, centroid_xy = get_2D_PI_and_centroid(mean, std, front, method=method)
    # Swap the objective axes. NOTE: the Pareto front must also be reversed so
    # that the ordering constraints (x ascending, y descending) still hold.
    PI_yx, centroid_yx = get_2D_PI_and_centroid(
        mean[:, ::-1], std[:, ::-1], front[::-1, ::-1], method=method)

    np.testing.assert_allclose(PI_xy, PI_yx, atol=atol)
    np.testing.assert_allclose(centroid_xy, centroid_yx[:, ::-1], atol=atol)
    np.testing.assert_allclose(PI_xy, ref_PI, atol=atol)


@pytest.mark.parametrize(
    "method, ref_EI",
    [
        ("aug", [0.033024, 0.966226, 1.684631]),
        ("dom", [0.066208, 0.220915, 1.708273]),
        ("mix", [0.017599, 0.551167, 1.693937]),
    ]
)
def test_get_2D_EI(method, ref_EI, atol=1e-6):
    """EI must match the reference values and be invariant under swapping
    the two objectives."""
    mean = np.array([[5.0, 5.0], [3.0, 6.0], [3.0, 3.0]])
    std = np.array([[1.2, 1.4], [1.1, 0.8], [2.6, 1.1]])
    front = np.array([[2.0, 7.0], [4.0, 4.0], [6.0, 2.0], [8.0, 1.0]])

    EI_xy = get_2D_EI(mean, std, front, method=method)
    # Swap the objective axes. NOTE: the Pareto front must also be reversed so
    # that the ordering constraints still hold.
    EI_yx = get_2D_EI(mean[:, ::-1], std[:, ::-1], front[::-1, ::-1],
                      method=method)

    np.testing.assert_allclose(EI_xy, EI_yx, atol=atol)
    np.testing.assert_allclose(EI_xy, ref_EI, atol=atol)


@pytest.mark.parametrize(
    "pred_mean, pred_std, pareto_points, r, ref_EHVI",
    [
        # Single candidate centered on the front with small uncertainty.
        (np.array([[0.0, 0.0]]),
         np.array([[0.1, 0.1]]),
         np.array([[0.0, 2.0], [1.0, 1.0], [2.0, 0.0]]),
         np.array([3.0, 3.0]),
         [3.07978846]),
        # Single candidate far from the front with large uncertainty.
        (np.array([[10.0, 10.0]]),
         np.array([[4.0, 4.0]]),
         np.array([[1.0, 2.0], [2.0, 1.0]]),
         np.array([11.0, 11.0]),
         [0.0726813]),
        # Several candidates against a four point front.
        (np.array([[5.0, 5.0], [3.0, 6.0], [3.0, 3.0]]),
         np.array([[1.2, 1.4], [1.1, 0.8], [2.6, 1.1]]),
         np.array([[2.0, 7.0], [4.0, 4.0], [6.0, 2.0], [8.0, 1.0]]),
         np.array([10.0, 10.0]),
         [0.4975774, 1.4780866, 10.2390386]),
    ]
)
def test_get_2D_EHVI(pred_mean, pred_std, pareto_points, r, ref_EHVI, atol=1e-6):
    """EHVI must match the reference values and be invariant under swapping
    the two objectives."""
    EHVI_xy = get_2D_EHVI(pred_mean, pred_std, pareto_points, r)
    # Swap the objective axes. NOTE: the Pareto front must also be reversed so
    # that the ordering constraints still hold.
    EHVI_yx = get_2D_EHVI(pred_mean[:, ::-1], pred_std[:, ::-1],
                          pareto_points[::-1, ::-1], r[::-1])

    np.testing.assert_allclose(EHVI_xy, EHVI_yx, atol=atol)
    np.testing.assert_allclose(EHVI_xy, ref_EHVI, atol=atol)

0 comments on commit 78edc26

Please sign in to comment.