Fix hierarchical_topics(...) when the distances between three clusters are the same #1929

Merged: 9 commits, Jun 13, 2024
8 changes: 7 additions & 1 deletion bertopic/_bertopic.py
@@ -54,7 +54,8 @@
 from bertopic.cluster._utils import hdbscan_delegator, is_supported_hdbscan
 from bertopic._utils import (
     MyLogger, check_documents_type, check_embeddings_shape,
-    check_is_fitted, validate_distance_matrix, select_topic_representation
+    check_is_fitted, validate_distance_matrix, select_topic_representation,
+    get_unique_distances
 )
 import bertopic._save_utils as save_utils

@@ -986,6 +987,11 @@ def hierarchical_topics(self,
         # Use the 1-D condensed distance matrix as an input instead of the raw distance matrix
         Z = linkage_function(X)
 
+        # Ensure that the distances between clusters are unique; otherwise, flattening the hierarchy
+        # with `sch.fcluster(...)` would produce incorrect "Topics" values for these clusters
+        if len(Z[:, 2]) != len(np.unique(Z[:, 2])):
+            Z[:, 2] = get_unique_distances(Z[:, 2])
+
         # Calculate basic bag-of-words to be iteratively merged later
         documents = pd.DataFrame({"Document": docs,
                                   "ID": range(len(docs)),
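A minimal sketch of how such ties arise (illustration only, not part of the diff): three pairwise-equidistant points make single linkage merge twice at exactly the same height, which is the case the check above guards against.

```python
import numpy as np
import scipy.cluster.hierarchy as sch

# Three pairwise-equidistant points (an equilateral triangle with side 1):
# single linkage performs both merges at exactly the same height.
points = np.array([[0.0, 0.0], [1.0, 0.0], [0.5, np.sqrt(3) / 2]])
Z = sch.linkage(points, method="single")

print(Z[:, 2])                                  # [1. 1.] -> duplicate merge distances
print(len(Z[:, 2]) != len(np.unique(Z[:, 2])))  # True -> the new check would trigger
```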
25 changes: 25 additions & 0 deletions bertopic/_utils.py
@@ -152,6 +152,31 @@ def validate_distance_matrix(X, n_samples):
     return X
 
 
+def get_unique_distances(dists: np.ndarray, noise_max=1e-7) -> np.ndarray:
+    """Check if consecutive elements in the distance array are the same. If so, add a small
+    amount of noise to one of the elements so that the array contains no duplicates.
+
+    Arguments:
+        dists: distance array sorted in increasing order.
+        noise_max: the maximal magnitude of the noise to be added.
+
+    Returns:
+        Unique distances, still sorted in increasing order.
+    """
+    dists_cp = dists.copy()
+
+    for i in range(dists.shape[0] - 1):
+        if dists[i] == dists[i + 1]:
+            # take the next unique distance, or the current distance plus the maximal noise
+            next_unique_dist = next((d for d in dists[i + 1:] if d != dists[i]), dists[i] + noise_max)
+
+            # the noise can never be larger than the difference between the next unique distance and the current one
+            curr_max_noise = min(noise_max, next_unique_dist - dists_cp[i])
+            dists_cp[i + 1] = np.random.uniform(low=dists_cp[i] + curr_max_noise / 2, high=dists_cp[i] + curr_max_noise)
+    return dists_cp


 def select_topic_representation(
     ctfidf_embeddings: Optional[Union[np.ndarray, csr_matrix]] = None,
     embeddings: Optional[Union[np.ndarray, csr_matrix]] = None,
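A quick usage sketch of the new helper (illustration only; the exact output varies because the noise is drawn at random):

```python
import numpy as np
from bertopic._utils import get_unique_distances

# Tied values are nudged apart by at most noise_max, already-unique values
# are left untouched, and the increasing order is preserved.
dists = np.array([0.0, 0.0, 0.5, 0.75, 1.0, 1.0])
print(get_unique_distances(dists))
# e.g. [0.0e+00 8.3e-08 5.0e-01 7.5e-01 1.0e+00 1.00000008e+00]
```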
19 changes: 18 additions & 1 deletion tests/test_utils.py
@@ -1,9 +1,11 @@
 import pytest
 import logging
 import numpy as np
-from bertopic._utils import check_documents_type, check_embeddings_shape, MyLogger, select_topic_representation
+from typing import List
+from bertopic._utils import check_documents_type, check_embeddings_shape, MyLogger, select_topic_representation, get_unique_distances
 from scipy.sparse import csr_matrix
 
 
 def test_logger():
     logger = MyLogger()
     logger.configure("DEBUG")
@@ -34,8 +36,23 @@ def test_check_embeddings_shape():
     embeddings = np.array([[1, 2, 3],
                            [2, 3, 4]])
     check_embeddings_shape(embeddings, docs)
+
+
+def test_make_unique_distances():
+    def check_dists(dists: List[float], noise_max: float):
+        unique_dists = get_unique_distances(np.array(dists, dtype=float), noise_max=noise_max)
+        assert len(unique_dists) == len(dists), "The number of elements must be the same"
+        assert len(dists) == len(np.unique(unique_dists)), "The distances must be unique"
+
+    check_dists([0, 0, 0.5, 0.75, 1, 1], noise_max=1e-7)
Owner:

Have you checked the actual values of the updated distance list? When I run it, I get the following updated values:

```
[0.00000000e+00 8.32483552e-08 5.00000000e-01 7.50000000e-01
 1.00000000e+00 2.00000008e+00]
```

The last value is twice as big, which should not happen. I have a feeling the code for get_unique_distances could be simplified a bit. What about simply doing something like this:

```python
def get_unique_distances(dists):
    increment = np.random.uniform(low=1e-5, high=1e-6)
    last_val = -float('inf')
    return [last_val := max(dist, last_val + increment) for dist in dists]

my_list = [0, 0, 0, 0.5, 0.75, 1, 1]
get_unique_distances(my_list)
```

Contributor (author):

This is a nice simplification.

Are we ok with changing distances that do not have a duplicate?

E.g., check_dists([0, 0, 0, 0, 0, 0, 0, 1e-7], noise_max=1e-7) changes the last value; otherwise the distances would not be in increasing order.

I had a bug in the code (it should assign, not add); that's why the last value was 2.00000008e+00.

Owner:

> Are we ok with changing distances that do not have a duplicate?

Hmmm, my preference would indeed be to keep them as is, as long as it requires no more than one or two lines of code. I would like to simplify this as much as possible.

Contributor (author):

I simplified the code. Please have a look and let me know if you have any ideas.

Owner:

Thanks for the changes! I just tested it a bunch of times and it all looks good to me. Thanks for simplifying the code. I'll re-run the workflow to check whether everything passes. If it does, I will go ahead and merge the PR.

Owner:

The tests failed, but I believe that is because you used list[float], which is not supported in Python 3.8. Removing that should make the tests pass, I think.

Contributor (author):

Ah, yeah, you are right! I just changed it! Thank you!
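For reference, built-in generics such as list[float] in annotations require Python 3.9+ (PEP 585); on Python 3.8 they raise a TypeError when the function is defined. A minimal sketch of the 3.8-compatible form the test switches to:

```python
from typing import List  # required on Python 3.8; list[float] would fail at definition time

def check_dists(dists: List[float], noise_max: float):
    ...
```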


+    # test whether the distances are sorted in ascending order when the noise is extremely high
+    check_dists([0, 0, 0, 0.5, 0.75, 1, 1], noise_max=20)
+
+    # test whether the distances are sorted in ascending order when the distances are all the same
+    check_dists([0, 0, 0, 0, 0, 0, 0], noise_max=1e-7)


 def test_select_topic_representation():
     ctfidf_embeddings = np.array([[1, 1, 1]])
     ctfidf_embeddings_sparse = csr_matrix(