diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py
index 9d08bb05..56004c74 100644
--- a/bertopic/_bertopic.py
+++ b/bertopic/_bertopic.py
@@ -54,7 +54,8 @@
 from bertopic.cluster._utils import hdbscan_delegator, is_supported_hdbscan
 from bertopic._utils import (
     MyLogger, check_documents_type, check_embeddings_shape,
-    check_is_fitted, validate_distance_matrix, select_topic_representation
+    check_is_fitted, validate_distance_matrix, select_topic_representation,
+    get_unique_distances
 )
 import bertopic._save_utils as save_utils
 
@@ -986,6 +987,11 @@ def hierarchical_topics(self,
         # Use the 1-D condensed distance matrix as an input instead of the raw distance matrix
         Z = linkage_function(X)
 
+        # Ensure that the distances between clusters are unique; otherwise, flattening the hierarchy with
+        # `sch.fcluster(...)` would produce incorrect values for "Topics" for these clusters
+        if len(Z[:, 2]) != len(np.unique(Z[:, 2])):
+            Z[:, 2] = get_unique_distances(Z[:, 2])
+
         # Calculate basic bag-of-words to be iteratively merged later
         documents = pd.DataFrame({"Document": docs,
                                   "ID": range(len(docs)),
diff --git a/bertopic/_utils.py b/bertopic/_utils.py
index a44bf4e9..f8a88f11 100644
--- a/bertopic/_utils.py
+++ b/bertopic/_utils.py
@@ -152,6 +152,31 @@ def validate_distance_matrix(X, n_samples):
 
     return X
 
+
+def get_unique_distances(dists: np.array, noise_max=1e-7) -> np.array:
+    """Check if consecutive elements in the distance array are the same. If so, a small amount of noise
+    is added to one of the elements to make sure that the array does not contain duplicates.
+
+    Arguments:
+        dists: distance array sorted in increasing order.
+        noise_max: the maximal magnitude of the noise to be added.
+
+    Returns:
+        Unique distances with the increasing order preserved.
+    """
+    dists_cp = dists.copy()
+
+    for i in range(dists.shape[0] - 1):
+        if dists[i] == dists[i + 1]:
+            # the next unique distance or, if there is none, the current distance plus the maximum noise
+            next_unique_dist = next((d for d in dists[i + 1:] if d != dists[i]), dists[i] + noise_max)
+
+            # the noise can never be larger than the difference between the next unique distance and the current one
+            curr_max_noise = min(noise_max, next_unique_dist - dists_cp[i])
+            dists_cp[i + 1] = np.random.uniform(low=dists_cp[i] + curr_max_noise / 2, high=dists_cp[i] + curr_max_noise)
+    return dists_cp
+
+
 def select_topic_representation(
     ctfidf_embeddings: Optional[Union[np.ndarray, csr_matrix]] = None,
     embeddings: Optional[Union[np.ndarray, csr_matrix]] = None,
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 8024b3d6..5827c017 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,9 +1,11 @@
 import pytest
 import logging
 import numpy as np
-from bertopic._utils import check_documents_type, check_embeddings_shape, MyLogger, select_topic_representation
+from typing import List
+from bertopic._utils import check_documents_type, check_embeddings_shape, MyLogger, select_topic_representation, get_unique_distances
 from scipy.sparse import csr_matrix
 
+
 def test_logger():
     logger = MyLogger()
     logger.configure("DEBUG")
@@ -34,8 +36,23 @@ def test_check_embeddings_shape():
     embeddings = np.array([[1, 2, 3], [2, 3, 4]])
     check_embeddings_shape(embeddings, docs)
+
+
+def test_make_unique_distances():
+    def check_dists(dists: List[float], noise_max: float):
+        unique_dists = get_unique_distances(np.array(dists, dtype=float), noise_max=noise_max)
+        assert len(unique_dists) == len(dists), "The number of elements must be the same"
+        assert len(dists) == len(np.unique(unique_dists)), "The distances must be unique"
+
+    check_dists([0, 0, 0.5, 0.75, 1, 1], noise_max=1e-7)
+
+    # test whether the distances stay sorted in ascending order when the noise is extremely high
+    check_dists([0, 0, 0, 0.5, 0.75, 1, 1], noise_max=20)
+    # test whether the distances stay sorted in ascending order when the distances are all the same
+    check_dists([0, 0, 0, 0, 0, 0, 0], noise_max=1e-7)
+
 
 def test_select_topic_representation():
     ctfidf_embeddings = np.array([[1, 1, 1]])
     ctfidf_embeddings_sparse = csr_matrix(