local_weighted_learning.py: fix mypy errors and more (#8073)

tianyizheng02 · web-flow · commit 8102424950f2 · 2023-05-17T12:05:55.000+12:00
diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py
@@ -1,14 +1,55 @@
+"""
+Locally weighted linear regression, also called local regression, is a type of
+non-parametric linear regression that prioritizes data closest to a given
+prediction point. The algorithm estimates the vector of model coefficients β
+using weighted least squares regression:
+
+β = (XᵀWX)⁻¹(XᵀWy),
+
+where X is the design matrix, y is the response vector, and W is the diagonal
+weight matrix.
+
+This implementation calculates wᵢ, the weight of the ith training sample, using
+the Gaussian weight:
+
+wᵢ = exp(-‖xᵢ - x‖²/(2τ²)),
+
+where xᵢ is the ith training sample, x is the prediction point, τ is the
+"bandwidth", and ‖x‖ is the Euclidean norm (also called the 2-norm or the L²
+norm). The bandwidth τ controls how quickly the weight of a training sample
+decreases as its distance from the prediction point increases. One can think of
+the Gaussian weight as a bell curve centered around the prediction point: a
+training sample is weighted lower if it's farther from the center, and τ
+controls the spread of the bell curve.
+
+Other types of locally weighted regression such as locally estimated scatterplot
+smoothing (LOESS) typically use different weight functions.
+
+References:
+    - https://en.wikipedia.org/wiki/Local_regression
+    - https://en.wikipedia.org/wiki/Weighted_least_squares
+    - https://cs229.stanford.edu/notes2022fall/main_notes.pdf
+"""
+
 import matplotlib.pyplot as plt
 import numpy as np
 
 
-def weighted_matrix(
-    point: np.array, training_data_x: np.array, bandwidth: float
-) -> np.array:
+def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndarray:
     """
-    Calculate the weight for every point in the data set.
-    point --> the x value at which we want to make predictions
-    >>> weighted_matrix(
+    Calculate the weight of every point in the training data around a given
+    prediction point
+
+    Args:
+        point: x-value at which the prediction is being made
+        x_train: ndarray of x-values for training
+        tau: bandwidth value, controls how quickly the weight of training values
+            decreases as the distance from the prediction point increases
+
+    Returns:
+        m x m diagonal weight matrix around the prediction point, where m is the
+        size of the training set
+    >>> weight_matrix(
     ...     np.array([1., 1.]),
     ...     np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]),
     ...     0.6
@@ -17,25 +58,30 @@ def weighted_matrix(
            [0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
            [0.00000000e+000, 0.00000000e+000, 0.00000000e+000]])
     """
-    m, _ = np.shape(training_data_x)  # m is the number of training samples
-    weights = np.eye(m)  # Initializing weights as identity matrix
-
-    # calculating weights for all training examples [x(i)'s]
+    m = len(x_train)  # Number of training samples
+    weights = np.eye(m)  # Initialize weights as identity matrix
     for j in range(m):
-        diff = point - training_data_x[j]
-        weights[j, j] = np.exp(diff @ diff.T / (-2.0 * bandwidth**2))
+        diff = point - x_train[j]
+        weights[j, j] = np.exp(diff @ diff.T / (-2.0 * tau**2))
+
     return weights
 
 
 def local_weight(
-    point: np.array,
-    training_data_x: np.array,
-    training_data_y: np.array,
-    bandwidth: float,
-) -> np.array:
+    point: np.ndarray, x_train: np.ndarray, y_train: np.ndarray, tau: float
+) -> np.ndarray:
     """
-    Calculate the local weights using the weight_matrix function on training data.
-    Return the weighted matrix.
+    Calculate the local weights (model coefficients β) at a given prediction
+    point using the weight matrix for that point
+
+    Args:
+        point: x-value at which the prediction is being made
+        x_train: ndarray of x-values for training
+        y_train: ndarray of y-values for training
+        tau: bandwidth value, controls how quickly the weight of training values
+            decreases as the distance from the prediction point increases
+    Returns:
+        ndarray of local weights, i.e. β = (XᵀWX)⁻¹(XᵀWy) for this point
     >>> local_weight(
     ...     np.array([1., 1.]),
     ...     np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]),
@@ -45,97 +91,86 @@ def local_weight(
     array([[0.00873174],
            [0.08272556]])
     """
-    weight = weighted_matrix(point, training_data_x, bandwidth)
-    w = np.linalg.inv(training_data_x.T @ (weight @ training_data_x)) @ (
-        training_data_x.T @ weight @ training_data_y.T
+    weight_mat = weight_matrix(point, x_train, tau)
+    weight = np.linalg.inv(x_train.T @ weight_mat @ x_train) @ (
+        x_train.T @ weight_mat @ y_train.T
     )
 
-    return w
+    return weight
 
 
 def local_weight_regression(
-    training_data_x: np.array, training_data_y: np.array, bandwidth: float
-) -> np.array:
+    x_train: np.ndarray, y_train: np.ndarray, tau: float
+) -> np.ndarray:
     """
-    Calculate predictions for each data point on axis
+    Calculate predictions for each point in the training data
+
+    Args:
+        x_train: ndarray of x-values for training
+        y_train: ndarray of y-values for training
+        tau: bandwidth value, controls how quickly the weight of training values
+            decreases as the distance from the prediction point increases
+
+    Returns:
+        ndarray of predictions
     >>> local_weight_regression(
     ...     np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]),
     ...     np.array([[1.01, 1.66, 3.5]]),
     ...     0.6
     ... )
     array([1.07173261, 1.65970737, 3.50160179])
     """
-    m, _ = np.shape(training_data_x)
-    ypred = np.zeros(m)
+    y_pred = np.zeros(len(x_train))  # Initialize array of predictions
+    for i, item in enumerate(x_train):
+        y_pred[i] = item @ local_weight(item, x_train, y_train, tau)
 
-    for i, item in enumerate(training_data_x):
-        ypred[i] = item @ local_weight(
-            item, training_data_x, training_data_y, bandwidth
-        )
-
-    return ypred
+    return y_pred
 
 
 def load_data(
-    dataset_name: str, cola_name: str, colb_name: str
-) -> tuple[np.array, np.array, np.array, np.array]:
+    dataset_name: str, x_name: str, y_name: str
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
     """
     Load data from seaborn and split it into x and y points
+    >>> pass    # No doctests, function is for demo purposes only
     """
     import seaborn as sns
 
     data = sns.load_dataset(dataset_name)
-    col_a = np.array(data[cola_name])  # total_bill
-    col_b = np.array(data[colb_name])  # tip
-
-    mcol_a = col_a.copy()
-    mcol_b = col_b.copy()
-
-    one = np.ones(np.shape(mcol_b)[0], dtype=int)
+    x_data = np.array(data[x_name])
+    y_data = np.array(data[y_name])
 
-    # pairing elements of one and mcol_a
-    training_data_x = np.column_stack((one, mcol_a))
+    one = np.ones(len(y_data))
 
-    return training_data_x, mcol_b, col_a, col_b
+    # Pair each x-value with a leading 1 to build the two-column x_train matrix
+    x_train = np.column_stack((one, x_data))
 
-
-def get_preds(training_data_x: np.array, mcol_b: np.array, tau: float) -> np.array:
-    """
-    Get predictions with minimum error for each training data
-    >>> get_preds(
-    ...     np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]),
-    ...     np.array([[1.01, 1.66, 3.5]]),
-    ...     0.6
-    ... )
-    array([1.07173261, 1.65970737, 3.50160179])
-    """
-    ypred = local_weight_regression(training_data_x, mcol_b, tau)
-    return ypred
+    return x_train, x_data, y_data
 
 
 def plot_preds(
-    training_data_x: np.array,
-    predictions: np.array,
-    col_x: np.array,
-    col_y: np.array,
-    cola_name: str,
-    colb_name: str,
-) -> plt.plot:
+    x_train: np.ndarray,
+    preds: np.ndarray,
+    x_data: np.ndarray,
+    y_data: np.ndarray,
+    x_name: str,
+    y_name: str,
+) -> None:
     """
     Plot predictions and display the graph
+    >>> pass    # No doctests, function is for demo purposes only
     """
-    xsort = training_data_x.copy()
-    xsort.sort(axis=0)
-    plt.scatter(col_x, col_y, color="blue")
+    x_train_sorted = np.sort(x_train, axis=0)
+    plt.scatter(x_data, y_data, color="blue")
     plt.plot(
-        xsort[:, 1],
-        predictions[training_data_x[:, 1].argsort(0)],
+        x_train_sorted[:, 1],
+        preds[x_train[:, 1].argsort(0)],
         color="yellow",
         linewidth=5,
     )
     plt.title("Local Weighted Regression")
-    plt.xlabel(cola_name)
-    plt.ylabel(colb_name)
+    plt.xlabel(x_name)
+    plt.ylabel(y_name)
     plt.show()
 
 
@@ -144,6 +179,7 @@ def plot_preds(
 
     doctest.testmod()
 
-    training_data_x, mcol_b, col_a, col_b = load_data("tips", "total_bill", "tip")
-    predictions = get_preds(training_data_x, mcol_b, 0.5)
-    plot_preds(training_data_x, predictions, col_a, col_b, "total_bill", "tip")
+    # Demo with a dataset from the seaborn module
+    training_data_x, total_bill, tip = load_data("tips", "total_bill", "tip")
+    predictions = local_weight_regression(training_data_x, tip, 5)
+    plot_preds(training_data_x, predictions, total_bill, tip, "total_bill", "tip")