merge main

TDAmeritrade · Sep 2, 2022 · dbaccab · dbaccab
2 parents e105710 + 11bb86d
commit dbaccab
Show file tree

Hide file tree

Showing 10 changed files with 198 additions and 111 deletions.
diff --git a/docs/conf.py b/docs/conf.py
@@ -111,6 +111,8 @@
     ],
 }
 
+html_context = {"default_mode": "light"}
+
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".

diff --git a/stumpy/aamp_motifs.py b/stumpy/aamp_motifs.py
@@ -97,7 +97,7 @@ def _aamp_motifs(
     motif_distances = []
 
     candidate_idx = np.argmin(P[-1])
-    for i in range(l):
+    for _ in range(l):
         if len(motif_indices) >= max_motifs:
             break
 
@@ -357,29 +357,17 @@ def aamp_match(
         to `Q` are less than or equal to `max_distance`, sorted by distance (lowest to
         highest). The second column consists of the corresponding indices in `T`.
     """
+    if np.any(np.isnan(Q)) or np.any(np.isinf(Q)):  # pragma: no cover
+        raise ValueError("Q contains illegal values (NaN or inf)")
+
     if len(Q.shape) == 1:
         Q = Q[np.newaxis, :]
     if len(T.shape) == 1:
         T = T[np.newaxis, :]
 
     d, n = T.shape
     m = Q.shape[1]
-
     excl_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM))
-    if max_matches is None:  # pragma: no cover
-        max_matches = np.inf
-
-    if np.any(np.isnan(Q)) or np.any(np.isinf(Q)):  # pragma: no cover
-        raise ValueError("Q contains illegal values (NaN or inf)")
-
-    if max_distance is None:  # pragma: no cover
-
-        def max_distance(D):
-            D_copy = D.copy().astype(np.float64)
-            D_copy[np.isinf(D_copy)] = np.nan
-            return np.nanmax(
-                [np.nanmean(D_copy) - 2.0 * np.nanstd(D_copy), np.nanmin(D_copy)]
-            )
 
     if T_subseq_isfinite is None:
         T, T_subseq_isfinite = core.preprocess_non_normalized(T, m)
@@ -389,28 +377,13 @@ def max_distance(D):
     D = np.empty((d, n - m + 1))
     for i in range(d):
         D[i, :] = core.mass_absolute(Q[i], T[i], T_subseq_isfinite[i], p=p)
-
     D = np.mean(D, axis=0)
-    if not isinstance(max_distance, float):
-        max_distance = max_distance(D)
-
-    matches = []
-
-    if query_idx is not None:
-        candidate_idx = query_idx
-    else:
-        candidate_idx = np.argmin(D)
-
-    for i in range(len(D)):
-        if (
-            D[candidate_idx] > atol + max_distance
-            or ~np.isfinite(D[candidate_idx])
-            or len(matches) >= max_matches
-        ):
-            break
-
-        matches.append([D[candidate_idx], candidate_idx])
-        core.apply_exclusion_zone(D, candidate_idx, excl_zone, np.inf)
-        candidate_idx = np.argmin(D)
 
-    return np.array(matches, dtype=object)
+    return core._find_matches(
+        D,
+        excl_zone,
+        max_distance=max_distance,
+        max_matches=max_matches,
+        query_idx=query_idx,
+        atol=atol,
+    )
diff --git a/stumpy/core.py b/stumpy/core.py
@@ -1465,11 +1465,11 @@ def _mass_distance_matrix(Q, T, m, distance_matrix, μ_Q, σ_Q, M_T, Σ_T):
     distance_matrix : numpy.ndarray
         The full output distance matrix. This is mandatory since it may be reused.
 
-    μ_Q : float
-        Mean of `Q`
+    μ_Q : numpy.ndarray
+        Sliding mean of `Q`
 
-    σ_Q : float
-        Standard deviation of `Q`
+    σ_Q : numpy.ndarray
+        Sliding standard deviation of `Q`
 
     M_T : numpy.ndarray
         Sliding mean of `T`
@@ -2542,7 +2542,7 @@ def _select_P_ABBA_value(P_ABBA, k, custom_func=None):
 
     k : int
         Specify the `k`th value in the concatenated matrix profiles to return. This
-        parameter is ignored when `k_func` is not None.
+        parameter is ignored when `custom_func` is not None.
 
     custom_func : object, default None
         A custom user defined function for selecting the desired value from the
@@ -2592,3 +2592,95 @@ def _check_P(P, threshold=1e-6):
     if are_distances_too_small(P, threshold=threshold):  # pragma: no cover
         logger.warning(f"A large number of values in `P` are smaller than {threshold}.")
         logger.warning("For a self-join, try setting `ignore_trivial=True`.")
+
+
+def _find_matches(
+    D, excl_zone, max_distance=None, max_matches=None, query_idx=None, atol=1e-8
+):
+    """
+    Find all matches of a query `Q` whose distance profile with `T` is `D`.
+
+    Parameters
+    ----------
+    D : numpy.ndarray
+        The distance profile of `Q` with `T`. It is a 1D numpy array of size
+        `len(T)-len(Q)+1`, where `D[i]` is the distance between query `Q` and
+        `T[i : i + len(Q)]`. The input is copied, so the caller's array is unchanged.
+
+    excl_zone : int
+        Size of the exclusion zone. That is, after finding the next-best-match
+        located at index `idx`, we ignore subsequences with start index in range
+        (idx - excl_zone, idx + excl_zone + 1).
+
+    max_distance : float or function, default None
+        Maximum distance between `Q` and a subsequence `S` for `S` to be considered a
+        match. Plain numbers (`int` or `float`) are used as the threshold as-is.
+        If a function, then it has to be a function of one argument `D`, which will be
+        the distance profile of `Q` with `T` (a 1D numpy array of size `n-m+1`).
+        If None, this defaults to
+        `np.nanmax([np.nanmean(D) - 2 * np.nanstd(D), np.nanmin(D)])` (i.e. at
+        least the closest match will be returned).
+
+    max_matches : int, default None
+        The maximum number of similar occurrences to be returned. The resulting
+        occurrences are sorted by distance, so a value of `10` means that the
+        indices of the most similar `10` subsequences are returned. If `None`, then
+        all occurrences are returned.
+
+    query_idx : int, default None
+        This is the index position along the time series, `T`, where the query
+        subsequence, `Q`, is located.
+        `query_idx` should only be used when the matrix profile is a self-join and
+        should be set to `None` for matrix profiles computed from AB-joins.
+        If `query_idx` is set to a specific integer value, then this will help ensure
+        that the self-match will be returned first.
+
+    atol : float, default 1e-8
+        The absolute tolerance parameter. This value will be added to `max_distance`
+        when comparing distances between subsequences.
+
+    Returns
+    -------
+    out : numpy.ndarray
+        The first column consists of values selected from `D`. These are the distances
+        of subsequences of `T` whose distances to `Q` are less than or equal to
+        `max_distance`, sorted by distance (lowest to highest). The second column
+        consists of the corresponding indices in `D`, i.e. the start indices of the
+        subsequences in `T` selected as matches of `Q`.
+    """
+    D = D.copy()
+    if max_distance is None:
+
+        def max_distance(D):
+            D_copy = D.copy().astype(np.float64)
+            D_copy[np.isinf(D_copy)] = np.nan
+            return np.nanmax(
+                [np.nanmean(D_copy) - 2.0 * np.nanstd(D_copy), np.nanmin(D_copy)]
+            )
+
+    # Test for `callable` (not "is not a float") so integer thresholds also work
+    if callable(max_distance):
+        max_distance = max_distance(D)
+
+    if max_matches is None:
+        max_matches = np.inf
+
+    if query_idx is not None:
+        candidate_idx = query_idx
+    else:
+        candidate_idx = np.argmin(D)
+
+    matches = []
+    for _ in range(len(D)):
+        if (
+            D[candidate_idx] > atol + max_distance
+            or not np.isfinite(D[candidate_idx])
+            or len(matches) >= max_matches
+        ):
+            break
+
+        matches.append([D[candidate_idx], candidate_idx])
+        apply_exclusion_zone(D, candidate_idx, excl_zone, np.inf)
+        candidate_idx = np.argmin(D)
+
+    return np.array(matches, dtype=object)
diff --git a/stumpy/motifs.py b/stumpy/motifs.py
@@ -96,7 +96,7 @@ def _motifs(
     motif_distances = []
 
     candidate_idx = np.argmin(P[-1])
-    for i in range(l):
+    for _ in range(l):
         if len(motif_indices) >= max_motifs:
             break
 
@@ -422,29 +422,17 @@ def match(
     Q = core._preprocess(Q)
     T = core._preprocess(T)
 
+    if np.any(np.isnan(Q)) or np.any(np.isinf(Q)):  # pragma: no cover
+        raise ValueError("Q contains illegal values (NaN or inf)")
+
     if len(Q.shape) == 1:
         Q = Q[np.newaxis, :]
     if len(T.shape) == 1:
         T = T[np.newaxis, :]
 
     d, n = T.shape
     m = Q.shape[1]
-
     excl_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM))
-    if max_matches is None:  # pragma: no cover
-        max_matches = np.inf
-
-    if np.any(np.isnan(Q)) or np.any(np.isinf(Q)):  # pragma: no cover
-        raise ValueError("Q contains illegal values (NaN or inf)")
-
-    if max_distance is None:  # pragma: no cover
-
-        def max_distance(D):
-            D_copy = D.copy().astype(np.float64)
-            D_copy[np.isinf(D_copy)] = np.nan
-            return np.nanmax(
-                [np.nanmean(D_copy) - 2.0 * np.nanstd(D_copy), np.nanmin(D_copy)]
-            )
 
     if M_T is None or Σ_T is None:  # pragma: no cover
         T, M_T, Σ_T = core.preprocess(T, m)
@@ -456,28 +444,13 @@ def max_distance(D):
     D = np.empty((d, n - m + 1))
     for i in range(d):
         D[i, :] = core.mass(Q[i], T[i], M_T[i], Σ_T[i])
-
     D = np.mean(D, axis=0)
-    if not isinstance(max_distance, float):
-        max_distance = max_distance(D)
-
-    matches = []
-
-    if query_idx is not None:
-        candidate_idx = query_idx
-    else:
-        candidate_idx = np.argmin(D)
-
-    for i in range(len(D)):
-        if (
-            D[candidate_idx] > atol + max_distance
-            or ~np.isfinite(D[candidate_idx])
-            or len(matches) >= max_matches
-        ):
-            break
-
-        matches.append([D[candidate_idx], candidate_idx])
-        core.apply_exclusion_zone(D, candidate_idx, excl_zone, np.inf)
-        candidate_idx = np.argmin(D)
 
-    return np.array(matches, dtype=object)
+    return core._find_matches(
+        D,
+        excl_zone,
+        max_distance=max_distance,
+        max_matches=max_matches,
+        query_idx=query_idx,
+        atol=atol,
+    )
diff --git a/stumpy/mpdist.py b/stumpy/mpdist.py
@@ -192,13 +192,14 @@ def _mpdist_vect(
         Time series or sequence
 
     m : int
-        Window size
+        Window size that will be used for calculating the mpdist between Q and
+        any subsequence in T (of size `len(Q)`)
 
-    μ_Q : float
-        Mean of `Q`
+    μ_Q : numpy.ndarray
+        Sliding mean of `Q`
 
-    σ_Q : float
-        Standard deviation of `Q`
+    σ_Q : numpy.ndarray
+        Sliding standard deviation of `Q`
 
     M_T : numpy.ndarray
         Sliding mean of `T`
@@ -222,6 +223,13 @@ def _mpdist_vect(
         and should take `P_ABBA` as its only input parameter and return a single
         `MPdist` value. The `percentage` and `k` parameters are ignored when
         `custom_func` is not None.
+
+    Returns
+    -------
+    MPdist_vect : numpy.ndarray
+        The mpdist-based distance profile of `Q` with `T`. It is a 1D array of
+        size `len(T) - len(Q) + 1`. MPdist_vect[i] is the mpdist distance between
+        `Q` and subsequence `T[i : i + len(Q)]`.
     """
     j = Q.shape[0] - m + 1  # `k` is reserved for `P_ABBA` selection
     l = T.shape[0] - m + 1

diff --git a/stumpy/scraamp.py b/stumpy/scraamp.py
@@ -325,7 +325,10 @@ def prescraamp(T_A, m, T_B=None, s=None, p=2.0):
     l = n_A - m + 1
 
     if s is None:  # pragma: no cover
-        s = excl_zone
+        if excl_zone is not None:  # self-join
+            s = excl_zone
+        else:  # AB-join
+            s = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM))
 
     indices = np.random.permutation(range(0, l, s)).astype(np.int64)
     P, I = _prescraamp(
@@ -509,9 +512,11 @@ def __init__(
         self._I[:, :] = -1
 
         self._excl_zone = int(np.ceil(self._m / config.STUMPY_EXCL_ZONE_DENOM))
-
         if s is None:
-            s = self._excl_zone
+            if self._excl_zone is not None:  # self-join
+                s = self._excl_zone
+            else:  # pragma: no cover  # AB-join
+                s = int(np.ceil(self._m / config.STUMPY_EXCL_ZONE_DENOM))
 
         if pre_scraamp:
             if self._ignore_trivial:

diff --git a/stumpy/scrump.py b/stumpy/scrump.py
@@ -340,7 +340,10 @@ def prescrump(T_A, m, T_B=None, s=None, normalize=True, p=2.0):
     l = n_A - m + 1
 
     if s is None:  # pragma: no cover
-        s = excl_zone
+        if excl_zone is not None:  # self-join
+            s = excl_zone
+        else:  # AB-join
+            s = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM))
 
     indices = np.random.permutation(range(0, l, s)).astype(np.int64)
     P, I = _prescrump(
@@ -578,9 +581,11 @@ def __init__(
         self._I[:, :] = -1
 
         self._excl_zone = int(np.ceil(self._m / config.STUMPY_EXCL_ZONE_DENOM))
-
         if s is None:
-            s = self._excl_zone
+            if self._excl_zone is not None:  # self-join
+                s = self._excl_zone
+            else:  # pragma: no cover  # AB-join
+                s = int(np.ceil(self._m / config.STUMPY_EXCL_ZONE_DENOM))
 
         if pre_scrump:
             if self._ignore_trivial:

diff --git a/tests/naive.py b/tests/naive.py
@@ -1759,3 +1759,25 @@ def _total_diagonal_ndists(tile_lower_diag, tile_upper_diag, tile_height, tile_w
         )
 
     return total_ndists
+
+
+def find_matches(D, excl_zone, max_distance, max_matches=None):
+    """
+    Naive reference: greedily pick the closest subsequences in distance profile
+    `D` that lie within `max_distance`, discarding any candidate that falls in
+    the exclusion zone of an already-selected, closer match.
+    """
+    limit = len(D) if max_matches is None else max_matches
+
+    # Candidate start indices within the threshold, closest first (stable sort)
+    candidates = [i for i in range(D.size) if D[i] <= max_distance]
+    candidates.sort(key=lambda i: D[i])
+
+    selected = []
+    while candidates:
+        best = candidates[0]
+        selected.append([D[best], best])
+        lo, hi = best - excl_zone, best + excl_zone
+        candidates = [i for i in candidates if i < lo or i > hi]
+
+    return np.array(selected[:limit], dtype=object)