
Fix hierarchical_topics(...) when the distances between three clusters are the same #1929

Merged
merged 9 commits
Jun 13, 2024
7 changes: 6 additions & 1 deletion bertopic/_bertopic.py
@@ -54,7 +54,7 @@
from bertopic.cluster._utils import hdbscan_delegator, is_supported_hdbscan
from bertopic._utils import (
MyLogger, check_documents_type, check_embeddings_shape,
check_is_fitted, validate_distance_matrix
check_is_fitted, validate_distance_matrix, get_unique_distances
)
import bertopic._save_utils as save_utils

@@ -979,6 +979,11 @@ def hierarchical_topics(self,
# Use the 1-D condensed distance matrix as an input instead of the raw distance matrix
Z = linkage_function(X)

# Ensure that the distances between clusters are unique; otherwise, flattening the hierarchy with
# `sch.fcluster(...)` would produce incorrect values for "Topics" for these clusters
if len(Z[:, 2]) != len(np.unique(Z[:, 2])):
Z[:, 2] = get_unique_distances(Z[:, 2])

# Calculate basic bag-of-words to be iteratively merged later
documents = pd.DataFrame({"Document": docs,
"ID": range(len(docs)),
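The guard above matters because `sch.fcluster` cuts the dendrogram at merge heights, and tied heights can make the cut assign inconsistent topic labels. A minimal standalone sketch (an illustration, not code from this PR) of how ties arise in a linkage matrix:

```python
import numpy as np
import scipy.cluster.hierarchy as sch

# Three equally spaced 1-D points: both single-linkage merges happen at
# height 1.0, so the distance column Z[:, 2] contains a duplicate.
X = np.array([[0.0], [1.0], [2.0]])
Z = sch.linkage(X, method="single")

# This is the same duplicate check the patch performs before perturbing.
has_ties = len(Z[:, 2]) != len(np.unique(Z[:, 2]))
print(has_ties)  # True
```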
44 changes: 44 additions & 0 deletions bertopic/_utils.py
@@ -1,6 +1,7 @@
import numpy as np
import pandas as pd
import logging
from typing import Union
from collections.abc import Iterable
from scipy.sparse import csr_matrix
from scipy.spatial.distance import squareform
@@ -147,3 +148,46 @@ def validate_distance_matrix(X, n_samples):
raise ValueError("Distance matrix cannot contain negative values.")

return X


def get_unique_distances(dists: np.ndarray, noise_max=1e-7) -> np.ndarray:
    """Check if consecutive elements in the distance array are the same. If so, add a small
    noise to one of them so that the array does not contain duplicates.

    Arguments:
        dists: distance array, sorted in increasing order.
        noise_max: the maximal magnitude of the noise to be added.

    Returns:
        Unique distances sorted in increasing order, preserving the original ordering.

    Raises:
        ValueError: If the distance array is not sorted in increasing order.
    """

    def get_next_diff_value(array: np.ndarray, ix: int) -> Union[float, None]:
        """Get the next value in `array` that differs from `array[ix]`."""
        for j in range(ix + 1, array.shape[0]):
            if array[j] != array[ix]:
                return array[j]
        return None

    if not np.all(np.diff(dists) >= 0):
        raise ValueError("The distances must be sorted in increasing order")

    dists_cp = dists.copy()

    for i in range(dists.shape[0] - 1):
        if dists[i] == dists[i + 1]:
            next_unique_dist = get_next_diff_value(dists, i)
            # If there is no different distance further in the array, `next_unique_dist` is set
            # to be slightly larger than the current (also the maximal) distance in the array.
            next_unique_dist = dists[i] + noise_max if next_unique_dist is None else next_unique_dist

            # When the added noise is smaller than `curr_max_noise`, the order is preserved.
            # `dists_cp` must be used since it contains the noise-added values.
            curr_max_noise = min(noise_max, next_unique_dist - dists_cp[i])
            dists_cp[i + 1] = np.random.uniform(
                low=dists_cp[i] + curr_max_noise / 2,
                high=dists_cp[i] + curr_max_noise
            )
    return dists_cp
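For intuition, the perturbation idea can be sketched in a standalone, simplified form (a hypothetical variant for illustration, not the PR's exact code): walk the sorted array and lift any element that does not strictly exceed its predecessor.

```python
import numpy as np

def make_distances_unique(dists: np.ndarray, noise_max: float = 1e-7) -> np.ndarray:
    # Hypothetical simplified variant of the idea above: compare each element
    # against its (possibly already perturbed) predecessor and, on a tie or
    # inversion, lift it by a small random amount so the result is strictly
    # increasing while staying within noise_max of the original value.
    out = dists.astype(float).copy()
    for i in range(1, len(out)):
        if out[i] <= out[i - 1]:
            out[i] = out[i - 1] + np.random.uniform(noise_max / 2, noise_max)
    return out

d = make_distances_unique(np.array([0.0, 0.0, 0.5, 1.0, 1.0]))
```

The result keeps the same length and order as the input, but every value is distinct.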
17 changes: 16 additions & 1 deletion tests/test_utils.py
@@ -1,7 +1,7 @@
import pytest
import logging
import numpy as np
from bertopic._utils import check_documents_type, check_embeddings_shape, MyLogger
from bertopic._utils import check_documents_type, check_embeddings_shape, MyLogger, get_unique_distances


def test_logger():
@@ -32,3 +32,18 @@ def test_check_embeddings_shape():
embeddings = np.array([[1, 2, 3],
[2, 3, 4]])
check_embeddings_shape(embeddings, docs)


def test_make_unique_distances():
    def check_dists(dists, noise_max: float):
        unique_dists = get_unique_distances(np.array(dists, dtype=float), noise_max=noise_max)
        assert len(unique_dists) == len(dists), "The number of elements must be the same"
        assert len(dists) == len(np.unique(unique_dists)), "The distances must be unique"

    check_dists([0, 0, 0.5, 0.75, 1, 1], noise_max=1e-7)
Owner:
Have you checked the actual values of the updated distance list? When I run it, I get the following updated values:

[0.00000000e+00 8.32483552e-08 5.00000000e-01 7.50000000e-01
 1.00000000e+00 2.00000008e+00]

The last value is twice as big, which should not happen. I have a feeling the code for get_unique_distances could be simplified a bit. What about simply doing something like this:

def get_unique_distances(dists):
    increment =  np.random.uniform(low=1e-5, high=1e-6)
    last_val = -float('inf')
    return [last_val := max(dist, last_val + increment) for dist in dists]

my_list = [0, 0, 0, 0.5, 0.75, 1, 1]
get_unique_distances(my_list)
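One detail in the snippet above: `np.random.uniform(low=1e-5, high=1e-6)` appears to have its bounds swapped (NumPy does not raise on `low > high`, it just samples from the reversed interval). A sketch of the same one-pass idea with the bounds in conventional order (an illustration, not the merged code):

```python
import numpy as np

def unique_monotone(dists, noise_max=1e-5):
    # One pass over the sorted distances: each output is either the input
    # value or the previous output plus a small random increment,
    # whichever is larger, so the result is strictly increasing.
    increment = np.random.uniform(low=1e-6, high=noise_max)
    last = -float("inf")
    out = []
    for d in dists:
        last = max(d, last + increment)
        out.append(last)
    return np.array(out)

res = unique_monotone([0, 0, 0, 0.5, 0.75, 1, 1])
```

Values that already strictly exceed their predecessor pass through unchanged; only ties get bumped.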

Contributor Author:

This is a nice simplification.

Are we ok with changing distances that do not have a duplicate?

E.g. check_dists([0, 0, 0, 0, 0, 0, 0, 1e-7], noise_max=1e-7) changes the last value; otherwise the distances would not be in increasing order.

I had a bug in the code (should assign and not add), that's why the last value was 2.00000008e+00.

Owner:

Are we ok with changing distances that do not have a duplicate?

Hmmm, my preference would indeed be to keep them as is as long as it requires no more than one or two lines of code. I would like to simplify this as much as possible.

Contributor Author:

I simplified the code. Please have a look and let me know if you have any ideas.

Owner:

Thanks for the changes! I just tested it a bunch of times and it all looks good to me. Thanks for simplifying the code. I'll re-run the workflow to check whether everything passes. If it does, I will go ahead and merge the PR.

Owner:

The tests failed, but I believe that's because you used list[float], which is not supported in Python 3.8. Removing that should make the tests pass, I think.

Contributor Author:

Ah, yeah, you are right! I just changed it! Thank you!


# test whether the distances remain sorted in ascending order when the noise is extremely high
check_dists([0, 0, 0, 0.5, 0.75, 1, 1], noise_max=20)

# test whether the distances are sorted in ascending order when the distances are all the same
check_dists([0, 0, 0, 0, 0, 0, 0], noise_max=1e-7)