Light-weight installation without UMAP and HDBSCAN (#2289)
MaartenGr authored Feb 28, 2025
1 parent 68cc1a7 commit 0c930d2
Showing 14 changed files with 169 additions and 634 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -63,6 +63,8 @@ pip install bertopic[flair,gensim,spacy,use]
pip install bertopic[vision]
```

For a *light-weight installation* without transformers, UMAP and/or HDBSCAN (for training with Model2Vec or perhaps for inference), see [this tutorial](https://maartengr.github.io/BERTopic/getting_started/tips_and_tricks/tips_and_tricks.html#lightweight-installation).

## Getting Started
For an in-depth overview of the features of BERTopic
you can check the [**full documentation**](https://maartengr.github.io/BERTopic/) or you can follow along
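
The installation note above can be exercised without `umap-learn` or `hdbscan` by passing the sub-models explicitly. A minimal sketch of such a lightweight setup — the corpus and embeddings are synthetic placeholders, and scikit-learn >= 1.3 is assumed for its `HDBSCAN`:

```python
import numpy as np
from sklearn.cluster import HDBSCAN
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA

from bertopic import BERTopic

# Placeholder corpus and pre-computed embeddings (in practice these could come
# from Model2Vec); synthetic blobs keep the sketch self-contained.
docs = [f"document number {i}" for i in range(500)]
embeddings, _ = make_blobs(n_samples=500, n_features=128, centers=5, random_state=42)

# Mirror the fallback sub-models from this change: PCA instead of UMAP and
# scikit-learn's HDBSCAN instead of the optional hdbscan package.
topic_model = BERTopic(
    umap_model=PCA(n_components=5),
    hdbscan_model=HDBSCAN(min_cluster_size=10),
)
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)
```
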
91 changes: 58 additions & 33 deletions bertopic/_bertopic.py
@@ -37,11 +37,18 @@
from typing import List, Tuple, Union, Mapping, Any, Callable, Iterable

# Models
import hdbscan
from umap import UMAP
try:
    from hdbscan import HDBSCAN

    HAS_HDBSCAN = True
except (ImportError, ModuleNotFoundError):
    HAS_HDBSCAN = False
    from sklearn.cluster import HDBSCAN as SK_HDBSCAN

from sklearn.preprocessing import normalize
from sklearn import __version__ as sklearn_version
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

@@ -143,8 +150,8 @@ def __init__(
zeroshot_topic_list: List[str] = None,
zeroshot_min_similarity: float = 0.7,
embedding_model=None,
umap_model: UMAP = None,
hdbscan_model: hdbscan.HDBSCAN = None,
umap_model=None,
hdbscan_model=None,
vectorizer_model: CountVectorizer = None,
ctfidf_model: TfidfTransformer = None,
representation_model: BaseRepresentation = None,
@@ -247,22 +254,38 @@ def __init__(
self.representation_model = representation_model

# UMAP or another algorithm that has .fit and .transform functions
self.umap_model = umap_model or UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    low_memory=self.low_memory,
)
if umap_model is not None:
    self.umap_model = umap_model
else:
    try:
        from umap import UMAP

        self.umap_model = UMAP(
            n_neighbors=15,
            n_components=5,
            min_dist=0.0,
            metric="cosine",
            low_memory=self.low_memory,
        )
    except (ImportError, ModuleNotFoundError):
        self.umap_model = PCA(n_components=5)

# HDBSCAN or another clustering algorithm that has .fit and .predict functions and
# the .labels_ variable to extract the labels
self.hdbscan_model = hdbscan_model or hdbscan.HDBSCAN(
    min_cluster_size=self.min_topic_size,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
)

if hdbscan_model is not None:
    self.hdbscan_model = hdbscan_model
elif HAS_HDBSCAN:
    self.hdbscan_model = HDBSCAN(
        min_cluster_size=self.min_topic_size,
        metric="euclidean",
        cluster_selection_method="eom",
        prediction_data=True,
    )
else:
    self.hdbscan_model = SK_HDBSCAN(
        min_cluster_size=self.min_topic_size, metric="euclidean", cluster_selection_method="eom", n_jobs=-1
    )

# Public attributes
self.topics_ = None
@@ -326,7 +349,7 @@ def fit(
images: List[str] = None,
y: Union[List[int], np.ndarray] = None,
):
"""Fit the models (Bert, UMAP, and, HDBSCAN) on a collection of documents and generate topics.
"""Fit the models on a collection of documents and generate topics.
Arguments:
documents: A list of documents to fit on
@@ -684,9 +707,7 @@ def partial_fit(
# Checks
check_embeddings_shape(embeddings, documents)
if not hasattr(self.hdbscan_model, "partial_fit"):
    raise ValueError(
        "In order to use `.partial_fit`, the cluster model should have " "a `.partial_fit` function."
    )
    raise ValueError("In order to use `.partial_fit`, the cluster model should have a `.partial_fit` function.")

# Prepare documents
if isinstance(documents, str):
@@ -1524,7 +1545,7 @@ def update_topics(

if top_n_words > 100:
    logger.warning(
        "Note that extracting more than 100 words from a sparse " "can slow down computation quite a bit."
        "Note that extracting more than 100 words from a sparse matrix can slow down computation quite a bit."
    )
self.top_n_words = top_n_words
self.vectorizer_model = vectorizer_model or CountVectorizer(ngram_range=n_gram_range)
@@ -2007,7 +2028,7 @@ def set_topic_labels(self, topic_labels: Union[List[str], Mapping[int, str]]) ->
    custom_labels = topic_labels
else:
    raise ValueError(
        "Make sure that `topic_labels` contains the same number " "of labels as there are topics."
        "Make sure that `topic_labels` contains the same number of labels as there are topics."
    )

self.custom_labels_ = custom_labels
@@ -2124,9 +2145,7 @@ def merge_topics(
        for topic in topic_group:
            mapping[topic] = topic_group[0]
else:
    raise ValueError(
        "Make sure that `topics_to_merge` is either" "a list of topics or a list of list of topics."
    )
    raise ValueError("Make sure that `topics_to_merge` is either a list of topics or a list of list of topics.")

# Track mappings and sizes of topics for merging topic embeddings
mappings = defaultdict(list)
@@ -3769,7 +3788,7 @@ def _cluster_embeddings(
partial_fit: bool = False,
y: np.ndarray = None,
) -> Tuple[pd.DataFrame, np.ndarray]:
"""Cluster UMAP embeddings with HDBSCAN.
"""Cluster UMAP reduced embeddings with HDBSCAN.
Arguments:
umap_embeddings: The reduced sentence embeddings with UMAP
@@ -4473,12 +4492,18 @@ def _auto_reduce_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False)
    self.c_tf_idf_, self.topic_embeddings_, use_ctfidf, output_ndarray=True
)[0]
norm_data = normalize(embeddings, norm="l2")
predictions = hdbscan.HDBSCAN(
    min_cluster_size=2,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
).fit_predict(norm_data[self._outliers :])

if HAS_HDBSCAN:
    predictions = HDBSCAN(
        min_cluster_size=2,
        metric="euclidean",
        cluster_selection_method="eom",
        prediction_data=True,
    ).fit_predict(norm_data[self._outliers :])
else:
    predictions = SK_HDBSCAN(
        min_cluster_size=2, metric="euclidean", cluster_selection_method="eom", n_jobs=-1
    ).fit_predict(norm_data[self._outliers :])

# Map similar topics
mapped_topics = {
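
The `partial_fit` check earlier in this file only requires sub-models that themselves implement `partial_fit`. A minimal sketch of such an online setup, using scikit-learn's incremental estimators and BERTopic's `OnlineCountVectorizer` (model choices, parameters, and the placeholder corpus are illustrative, and an embedding backend such as sentence-transformers is assumed to be installed):

```python
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import IncrementalPCA

from bertopic import BERTopic
from bertopic.vectorizers import OnlineCountVectorizer

# Every sub-model exposes `partial_fit`, which `BERTopic.partial_fit` checks for.
topic_model = BERTopic(
    umap_model=IncrementalPCA(n_components=5),
    hdbscan_model=MiniBatchKMeans(n_clusters=10, random_state=0),
    vectorizer_model=OnlineCountVectorizer(stop_words="english"),
)

docs = [f"document number {i}" for i in range(1000)]  # placeholder corpus
for i in range(0, len(docs), 100):
    topic_model.partial_fit(docs[i : i + 100])  # feed the corpus in chunks
```
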
30 changes: 22 additions & 8 deletions bertopic/_save_utils.py
@@ -461,22 +461,36 @@ def get_package_versions():
try:
    import platform
    from numpy import __version__ as np_version
    from pandas import __version__ as pandas_version
    from sklearn import __version__ as sklearn_version
    from plotly import __version__ as plotly_version

    try:
        from importlib.metadata import version

        hdbscan_version = version("hdbscan")
    except: # noqa: E722
    except (ImportError, ModuleNotFoundError):
        hdbscan_version = None

    from umap import __version__ as umap_version
    from pandas import __version__ as pandas_version
    from sklearn import __version__ as sklearn_version
    from sentence_transformers import __version__ as sbert_version
    from numba import __version__ as numba_version
    from transformers import __version__ as transformers_version
    try:
        from umap import __version__ as umap_version
    except (ImportError, ModuleNotFoundError):
        umap_version = None

    from plotly import __version__ as plotly_version
    try:
        from sentence_transformers import __version__ as sbert_version
    except (ImportError, ModuleNotFoundError):
        sbert_version = None

    try:
        from numba import __version__ as numba_version
    except (ImportError, ModuleNotFoundError):
        numba_version = None

    try:
        from transformers import __version__ as transformers_version
    except (ImportError, ModuleNotFoundError):
        transformers_version = None

    return {
        "Numpy": np_version,
9 changes: 3 additions & 6 deletions bertopic/_utils.py
@@ -74,10 +74,7 @@ def check_is_fitted(topic_model):
Raises:
ValueError: If the matches were not found.
"""
msg = (
    "This %(name)s instance is not fitted yet. Call 'fit' with "
    "appropriate arguments before using this estimator."
)
msg = "This %(name)s instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."

if topic_model.topics_ is None:
    raise ValueError(msg % {"name": type(topic_model).__name__})
@@ -131,11 +128,11 @@ def validate_distance_matrix(X, n_samples):
    # check it has correct size
    n = s[0]
    if n != (n_samples * (n_samples - 1) / 2):
        raise ValueError("The condensed distance matrix must have " "shape (n*(n-1)/2,).")
        raise ValueError("The condensed distance matrix must have shape (n*(n-1)/2,).")
elif len(s) == 2:
    # check it has correct size
    if (s[0] != n_samples) or (s[1] != n_samples):
        raise ValueError("The distance matrix must be of shape " "(n, n) where n is the number of samples.")
        raise ValueError("The distance matrix must be of shape (n, n) where n is the number of samples.")
    # force zero diagonal and convert to condensed
    np.fill_diagonal(X, 0)
    X = squareform(X)
11 changes: 10 additions & 1 deletion bertopic/cluster/_utils.py
@@ -1,4 +1,3 @@
import hdbscan
import numpy as np


@@ -15,6 +14,11 @@ def hdbscan_delegator(model, func: str, embeddings: np.ndarray = None):
embeddings: Input embeddings for "approximate_predict"
and "membership_vector"
"""
try:
    import hdbscan
except (ImportError, ModuleNotFoundError):
    # Stand-in namespace so the `isinstance` checks below simply evaluate to
    # False when the optional hdbscan package is not installed.
    hdbscan = type("hdbscan", (), {"HDBSCAN": type("HDBSCAN", (), {})})()

# Approximate predict
if func == "approximate_predict":
    if isinstance(model, hdbscan.HDBSCAN):
@@ -62,6 +66,11 @@ def hdbscan_delegator(model, func: str, embeddings: np.ndarray = None):

def is_supported_hdbscan(model):
"""Check whether the input model is a supported HDBSCAN-like model."""
try:
import hdbscan
except (ImportError, ModuleNotFoundError):
hdbscan = type("hdbscan", (), {"HDBSCAN": None})()

if isinstance(model, hdbscan.HDBSCAN):
return True

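A short sketch of how these helpers distinguish models once hdbscan becomes optional — scikit-learn's `HDBSCAN` is not treated as a supported hdbscan-like model, so callers fall back to its fitted labels (the data and model below are illustrative):

```python
import numpy as np
from sklearn.cluster import HDBSCAN as SKHDBSCAN

from bertopic.cluster._utils import hdbscan_delegator, is_supported_hdbscan

embeddings = np.random.RandomState(0).rand(200, 5)
model = SKHDBSCAN(min_cluster_size=5).fit(embeddings)

if is_supported_hdbscan(model):
    # Only reached for hdbscan.HDBSCAN (or cuML) models with prediction data.
    labels, _ = hdbscan_delegator(model, "approximate_predict", embeddings)
else:
    labels = model.labels_  # scikit-learn's HDBSCAN: reuse the fitted labels
```
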
2 changes: 1 addition & 1 deletion bertopic/plotting/_approximate_distribution.py
@@ -75,7 +75,7 @@ def visualize_approximate_distribution(
df = pd.DataFrame(topic_token_distribution).T

df.columns = [f"{token}_{i}" for i, token in enumerate(tokens)]
df.columns = [f"{token}{' '*i}" for i, token in enumerate(tokens)]
df.columns = [f"{token}{' ' * i}" for i, token in enumerate(tokens)]
df.index = list(topic_model.topic_labels_.values())[topic_model._outliers :]
df = df.loc[(df.sum(axis=1) != 0), :]

12 changes: 9 additions & 3 deletions bertopic/plotting/_datamap.py
@@ -1,7 +1,6 @@
import numpy as np
import pandas as pd
from typing import List, Union
from umap import UMAP
from warnings import warn

try:
@@ -122,8 +121,15 @@ def visualize_document_datamap(

# Reduce input embeddings
if reduced_embeddings is None:
    umap_model = UMAP(n_neighbors=15, n_components=2, min_dist=0.15, metric="cosine").fit(embeddings_to_reduce)
    embeddings_2d = umap_model.embedding_
    try:
        from umap import UMAP

        umap_model = UMAP(n_neighbors=15, n_components=2, min_dist=0.15, metric="cosine").fit(embeddings_to_reduce)
        embeddings_2d = umap_model.embedding_
    except (ImportError, ModuleNotFoundError):
        raise ModuleNotFoundError(
            "UMAP is required if the embeddings are not yet reduced in dimensionality. Please install it using `pip install umap-learn`."
        )
else:
    embeddings_2d = reduced_embeddings

12 changes: 9 additions & 3 deletions bertopic/plotting/_documents.py
@@ -2,7 +2,6 @@
import pandas as pd
import plotly.graph_objects as go

from umap import UMAP
from typing import List, Union


@@ -120,8 +119,15 @@ def visualize_documents(

# Reduce input embeddings
if reduced_embeddings is None:
    umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine").fit(embeddings_to_reduce)
    embeddings_2d = umap_model.embedding_
    try:
        from umap import UMAP

        umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine").fit(embeddings_to_reduce)
        embeddings_2d = umap_model.embedding_
    except (ImportError, ModuleNotFoundError):
        raise ModuleNotFoundError(
            "UMAP is required if the embeddings are not yet reduced in dimensionality. Please install it using `pip install umap-learn`."
        )
elif sample is not None and reduced_embeddings is not None:
    embeddings_2d = reduced_embeddings[indices]
elif sample is None and reduced_embeddings is not None:
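
As the branch above shows, supplying `reduced_embeddings` sidesteps the UMAP requirement for this plot entirely. A minimal sketch using PCA for the 2D reduction, reusing the `topic_model`, `docs`, and `embeddings` from the lightweight sketch after the README diff above (PCA here is an illustrative choice):

```python
from sklearn.decomposition import PCA

# Reduce the document embeddings to 2D up front so no UMAP import is needed.
reduced_embeddings = PCA(n_components=2).fit_transform(embeddings)

fig = topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)
fig.show()
```
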
2 changes: 1 addition & 1 deletion bertopic/plotting/_heatmap.py
@@ -77,7 +77,7 @@ def visualize_heatmap(
sorted_topics = topics
if n_clusters:
    if n_clusters >= len(set(topics)):
        raise ValueError("Make sure to set `n_clusters` lower than " "the total number of unique topics.")
        raise ValueError("Make sure to set `n_clusters` lower than the total number of unique topics.")

    distance_matrix = cosine_similarity(embeddings[topics])
    Z = linkage(distance_matrix, "ward")