Light-weight installation without UMAP and HDBSCAN (#2289)
MaartenGr authored Feb 28, 2025
1 parent 68cc1a7 commit 0c930d2
Showing 14 changed files with 169 additions and 634 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -63,6 +63,8 @@ pip install bertopic[flair,gensim,spacy,use]
pip install bertopic[vision]
```

For a *light-weight installation* without transformers, UMAP and/or HDBSCAN (for training with Model2Vec or perhaps for inference), see [this tutorial](https://maartengr.github.io/BERTopic/getting_started/tips_and_tricks/tips_and_tricks.html#lightweight-installation).

## Getting Started
For an in-depth overview of the features of BERTopic
you can check the [**full documentation**](https://maartengr.github.io/BERTopic/) or you can follow along
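
The installation note above can be exercised without `umap-learn` or `hdbscan` by passing the sub-models explicitly. A minimal sketch of such a lightweight setup — the corpus and embeddings are synthetic placeholders, and scikit-learn >= 1.3 is assumed for its `HDBSCAN`:

```python
import numpy as np
from sklearn.cluster import HDBSCAN
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA

from bertopic import BERTopic

# Placeholder corpus and pre-computed embeddings (in practice these could come
# from Model2Vec); synthetic blobs keep the sketch self-contained.
docs = [f"document number {i}" for i in range(500)]
embeddings, _ = make_blobs(n_samples=500, n_features=128, centers=5, random_state=42)

# Mirror the fallback sub-models from this change: PCA instead of UMAP and
# scikit-learn's HDBSCAN instead of the optional hdbscan package.
topic_model = BERTopic(
    umap_model=PCA(n_components=5),
    hdbscan_model=HDBSCAN(min_cluster_size=10),
)
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)
```
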
91 changes: 58 additions & 33 deletions bertopic/_bertopic.py
@@ -37,11 +37,18 @@
from typing import List, Tuple, Union, Mapping, Any, Callable, Iterable

# Models
import hdbscan
from umap import UMAP
try:
    from hdbscan import HDBSCAN

    HAS_HDBSCAN = True
except (ImportError, ModuleNotFoundError):
    HAS_HDBSCAN = False
    from sklearn.cluster import HDBSCAN as SK_HDBSCAN

from sklearn.preprocessing import normalize
from sklearn import __version__ as sklearn_version
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

@@ -143,8 +150,8 @@ def __init__(
zeroshot_topic_list: List[str] = None,
zeroshot_min_similarity: float = 0.7,
embedding_model=None,
umap_model: UMAP = None,
hdbscan_model: hdbscan.HDBSCAN = None,
umap_model=None,
hdbscan_model=None,
vectorizer_model: CountVectorizer = None,
ctfidf_model: TfidfTransformer = None,
representation_model: BaseRepresentation = None,
@@ -247,22 +254,38 @@ def __init__(
self.representation_model = representation_model

# UMAP or another algorithm that has .fit and .transform functions
self.umap_model = umap_model or UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    low_memory=self.low_memory,
)
if umap_model is not None:
    self.umap_model = umap_model
else:
    try:
        from umap import UMAP

        self.umap_model = UMAP(
            n_neighbors=15,
            n_components=5,
            min_dist=0.0,
            metric="cosine",
            low_memory=self.low_memory,
        )
    except (ImportError, ModuleNotFoundError):
        self.umap_model = PCA(n_components=5)

# HDBSCAN or another clustering algorithm that has .fit and .predict functions and
# the .labels_ variable to extract the labels
self.hdbscan_model = hdbscan_model or hdbscan.HDBSCAN(
    min_cluster_size=self.min_topic_size,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
)

if hdbscan_model is not None:
    self.hdbscan_model = hdbscan_model
elif HAS_HDBSCAN:
    self.hdbscan_model = HDBSCAN(
        min_cluster_size=self.min_topic_size,
        metric="euclidean",
        cluster_selection_method="eom",
        prediction_data=True,
    )
else:
    self.hdbscan_model = SK_HDBSCAN(
        min_cluster_size=self.min_topic_size, metric="euclidean", cluster_selection_method="eom", n_jobs=-1
    )

# Public attributes
self.topics_ = None
@@ -326,7 +349,7 @@ def fit(
images: List[str] = None,
y: Union[List[int], np.ndarray] = None,
):
"""Fit the models (Bert, UMAP, and, HDBSCAN) on a collection of documents and generate topics.
"""Fit the models on a collection of documents and generate topics.
Arguments:
documents: A list of documents to fit on
@@ -684,9 +707,7 @@ def partial_fit(
# Checks
check_embeddings_shape(embeddings, documents)
if not hasattr(self.hdbscan_model, "partial_fit"):
    raise ValueError(
        "In order to use `.partial_fit`, the cluster model should have " "a `.partial_fit` function."
    )
    raise ValueError("In order to use `.partial_fit`, the cluster model should have a `.partial_fit` function.")

# Prepare documents
if isinstance(documents, str):
@@ -1524,7 +1545,7 @@ def update_topics(

if top_n_words > 100:
    logger.warning(
        "Note that extracting more than 100 words from a sparse " "can slow down computation quite a bit."
        "Note that extracting more than 100 words from a sparse matrix can slow down computation quite a bit."
    )
self.top_n_words = top_n_words
self.vectorizer_model = vectorizer_model or CountVectorizer(ngram_range=n_gram_range)
@@ -2007,7 +2028,7 @@ def set_topic_labels(self, topic_labels: Union[List[str], Mapping[int, str]]) ->
    custom_labels = topic_labels
else:
    raise ValueError(
        "Make sure that `topic_labels` contains the same number " "of labels as there are topics."
        "Make sure that `topic_labels` contains the same number of labels as there are topics."
    )

self.custom_labels_ = custom_labels
@@ -2124,9 +2145,7 @@ def merge_topics(
        for topic in topic_group:
            mapping[topic] = topic_group[0]
else:
    raise ValueError(
        "Make sure that `topics_to_merge` is either" "a list of topics or a list of list of topics."
    )
    raise ValueError("Make sure that `topics_to_merge` is either a list of topics or a list of list of topics.")

# Track mappings and sizes of topics for merging topic embeddings
mappings = defaultdict(list)
@@ -3769,7 +3788,7 @@ def _cluster_embeddings(
partial_fit: bool = False,
y: np.ndarray = None,
) -> Tuple[pd.DataFrame, np.ndarray]:
"""Cluster UMAP embeddings with HDBSCAN.
"""Cluster UMAP reduced embeddings with HDBSCAN.
Arguments:
umap_embeddings: The reduced sentence embeddings with UMAP
@@ -4473,12 +4492,18 @@ def _auto_reduce_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False)
    self.c_tf_idf_, self.topic_embeddings_, use_ctfidf, output_ndarray=True
)[0]
norm_data = normalize(embeddings, norm="l2")
predictions = hdbscan.HDBSCAN(
    min_cluster_size=2,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
).fit_predict(norm_data[self._outliers :])

if HAS_HDBSCAN:
    predictions = HDBSCAN(
        min_cluster_size=2,
        metric="euclidean",
        cluster_selection_method="eom",
        prediction_data=True,
    ).fit_predict(norm_data[self._outliers :])
else:
    predictions = SK_HDBSCAN(
        min_cluster_size=2, metric="euclidean", cluster_selection_method="eom", n_jobs=-1
    ).fit_predict(norm_data[self._outliers :])

# Map similar topics
mapped_topics = {
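
The `partial_fit` check earlier in this file only requires sub-models that themselves implement `partial_fit`. A minimal sketch of such an online setup, using scikit-learn's incremental estimators and BERTopic's `OnlineCountVectorizer` (model choices, parameters, and the placeholder corpus are illustrative, and an embedding backend such as sentence-transformers is assumed to be installed):

```python
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import IncrementalPCA

from bertopic import BERTopic
from bertopic.vectorizers import OnlineCountVectorizer

# Every sub-model exposes `partial_fit`, which `BERTopic.partial_fit` checks for.
topic_model = BERTopic(
    umap_model=IncrementalPCA(n_components=5),
    hdbscan_model=MiniBatchKMeans(n_clusters=10, random_state=0),
    vectorizer_model=OnlineCountVectorizer(stop_words="english"),
)

docs = [f"document number {i}" for i in range(1000)]  # placeholder corpus
for i in range(0, len(docs), 100):
    topic_model.partial_fit(docs[i : i + 100])  # feed the corpus in chunks
```
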
30 changes: 22 additions & 8 deletions bertopic/_save_utils.py
@@ -461,22 +461,36 @@ def get_package_versions():
try:
    import platform
    from numpy import __version__ as np_version
    from pandas import __version__ as pandas_version
    from sklearn import __version__ as sklearn_version
    from plotly import __version__ as plotly_version

    try:
        from importlib.metadata import version

        hdbscan_version = version("hdbscan")
    except: # noqa: E722
    except (ImportError, ModuleNotFoundError):
        hdbscan_version = None

    from umap import __version__ as umap_version
    from pandas import __version__ as pandas_version
    from sklearn import __version__ as sklearn_version
    from sentence_transformers import __version__ as sbert_version
    from numba import __version__ as numba_version
    from transformers import __version__ as transformers_version
    try:
        from umap import __version__ as umap_version
    except (ImportError, ModuleNotFoundError):
        umap_version = None

    from plotly import __version__ as plotly_version
    try:
        from sentence_transformers import __version__ as sbert_version
    except (ImportError, ModuleNotFoundError):
        sbert_version = None

    try:
        from numba import __version__ as numba_version
    except (ImportError, ModuleNotFoundError):
        numba_version = None

    try:
        from transformers import __version__ as transformers_version
    except (ImportError, ModuleNotFoundError):
        transformers_version = None

    return {
        "Numpy": np_version,
9 changes: 3 additions & 6 deletions bertopic/_utils.py
@@ -74,10 +74,7 @@ def check_is_fitted(topic_model):
Raises:
ValueError: If the matches were not found.
"""
msg = (
    "This %(name)s instance is not fitted yet. Call 'fit' with "
    "appropriate arguments before using this estimator."
)
msg = "This %(name)s instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."

if topic_model.topics_ is None:
    raise ValueError(msg % {"name": type(topic_model).__name__})
@@ -131,11 +128,11 @@ def validate_distance_matrix(X, n_samples):
    # check it has correct size
    n = s[0]
    if n != (n_samples * (n_samples - 1) / 2):
        raise ValueError("The condensed distance matrix must have " "shape (n*(n-1)/2,).")
        raise ValueError("The condensed distance matrix must have shape (n*(n-1)/2,).")
elif len(s) == 2:
    # check it has correct size
    if (s[0] != n_samples) or (s[1] != n_samples):
        raise ValueError("The distance matrix must be of shape " "(n, n) where n is the number of samples.")
        raise ValueError("The distance matrix must be of shape (n, n) where n is the number of samples.")
    # force zero diagonal and convert to condensed
    np.fill_diagonal(X, 0)
    X = squareform(X)
11 changes: 10 additions & 1 deletion bertopic/cluster/_utils.py
@@ -1,4 +1,3 @@
import hdbscan
import numpy as np


@@ -15,6 +14,11 @@ def hdbscan_delegator(model, func: str, embeddings: np.ndarray = None):
embeddings: Input embeddings for "approximate_predict"
and "membership_vector"
"""
try:
    import hdbscan
except (ImportError, ModuleNotFoundError):
    # Stand-in namespace so the `isinstance` checks below simply evaluate to
    # False when the optional hdbscan package is not installed.
    hdbscan = type("hdbscan", (), {"HDBSCAN": type("HDBSCAN", (), {})})()

# Approximate predict
if func == "approximate_predict":
    if isinstance(model, hdbscan.HDBSCAN):
@@ -62,6 +66,11 @@ def hdbscan_delegator(model, func: str, embeddings: np.ndarray = None):

def is_supported_hdbscan(model):
"""Check whether the input model is a supported HDBSCAN-like model."""
try:
import hdbscan
except (ImportError, ModuleNotFoundError):
hdbscan = type("hdbscan", (), {"HDBSCAN": None})()

if isinstance(model, hdbscan.HDBSCAN):
return True

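A short sketch of how these helpers distinguish models once hdbscan becomes optional — scikit-learn's `HDBSCAN` is not treated as a supported hdbscan-like model, so callers fall back to its fitted labels (the data and model below are illustrative):

```python
import numpy as np
from sklearn.cluster import HDBSCAN as SKHDBSCAN

from bertopic.cluster._utils import hdbscan_delegator, is_supported_hdbscan

embeddings = np.random.RandomState(0).rand(200, 5)
model = SKHDBSCAN(min_cluster_size=5).fit(embeddings)

if is_supported_hdbscan(model):
    # Only reached for hdbscan.HDBSCAN (or cuML) models with prediction data.
    labels, _ = hdbscan_delegator(model, "approximate_predict", embeddings)
else:
    labels = model.labels_  # scikit-learn's HDBSCAN: reuse the fitted labels
```
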
2 changes: 1 addition & 1 deletion bertopic/plotting/_approximate_distribution.py
@@ -75,7 +75,7 @@ def visualize_approximate_distribution(
df = pd.DataFrame(topic_token_distribution).T

df.columns = [f"{token}_{i}" for i, token in enumerate(tokens)]
df.columns = [f"{token}{' '*i}" for i, token in enumerate(tokens)]
df.columns = [f"{token}{' ' * i}" for i, token in enumerate(tokens)]
df.index = list(topic_model.topic_labels_.values())[topic_model._outliers :]
df = df.loc[(df.sum(axis=1) != 0), :]

12 changes: 9 additions & 3 deletions bertopic/plotting/_datamap.py
@@ -1,7 +1,6 @@
import numpy as np
import pandas as pd
from typing import List, Union
from umap import UMAP
from warnings import warn

try:
@@ -122,8 +121,15 @@ def visualize_document_datamap(

# Reduce input embeddings
if reduced_embeddings is None:
    umap_model = UMAP(n_neighbors=15, n_components=2, min_dist=0.15, metric="cosine").fit(embeddings_to_reduce)
    embeddings_2d = umap_model.embedding_
    try:
        from umap import UMAP

        umap_model = UMAP(n_neighbors=15, n_components=2, min_dist=0.15, metric="cosine").fit(embeddings_to_reduce)
        embeddings_2d = umap_model.embedding_
    except (ImportError, ModuleNotFoundError):
        raise ModuleNotFoundError(
            "UMAP is required if the embeddings are not yet reduced in dimensionality. Please install it using `pip install umap-learn`."
        )
else:
    embeddings_2d = reduced_embeddings

12 changes: 9 additions & 3 deletions bertopic/plotting/_documents.py
@@ -2,7 +2,6 @@
import pandas as pd
import plotly.graph_objects as go

from umap import UMAP
from typing import List, Union


@@ -120,8 +119,15 @@ def visualize_documents(

# Reduce input embeddings
if reduced_embeddings is None:
    umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine").fit(embeddings_to_reduce)
    embeddings_2d = umap_model.embedding_
    try:
        from umap import UMAP

        umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine").fit(embeddings_to_reduce)
        embeddings_2d = umap_model.embedding_
    except (ImportError, ModuleNotFoundError):
        raise ModuleNotFoundError(
            "UMAP is required if the embeddings are not yet reduced in dimensionality. Please install it using `pip install umap-learn`."
        )
elif sample is not None and reduced_embeddings is not None:
    embeddings_2d = reduced_embeddings[indices]
elif sample is None and reduced_embeddings is not None:
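
As the branch above shows, supplying `reduced_embeddings` sidesteps the UMAP requirement for this plot entirely. A minimal sketch using PCA for the 2D reduction, reusing the `topic_model`, `docs`, and `embeddings` from the lightweight sketch after the README diff above (PCA here is an illustrative choice):

```python
from sklearn.decomposition import PCA

# Reduce the document embeddings to 2D up front so no UMAP import is needed.
reduced_embeddings = PCA(n_components=2).fit_transform(embeddings)

fig = topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)
fig.show()
```
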
2 changes: 1 addition & 1 deletion bertopic/plotting/_heatmap.py
@@ -77,7 +77,7 @@ def visualize_heatmap(
sorted_topics = topics
if n_clusters:
    if n_clusters >= len(set(topics)):
        raise ValueError("Make sure to set `n_clusters` lower than " "the total number of unique topics.")
        raise ValueError("Make sure to set `n_clusters` lower than the total number of unique topics.")

    distance_matrix = cosine_similarity(embeddings[topics])
    Z = linkage(distance_matrix, "ward")