MaartenGr · MaartenGr · Jul 1, 2024 · Jun 24, 2024 · Jun 25, 2024 · Jun 26, 2024
diff --git a/.flake8 b/.flake8
diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py
diff --git a/bertopic/_save_utils.py b/bertopic/_save_utils.py
@@ -135,9 +135,7 @@ def push_to_hf_hub(
         save_ctfidf: Whether to save c-TF-IDF information
     """
     if not _has_hf_hub:
-        raise ValueError(
-            "Make sure you have the huggingface hub installed via `pip install --upgrade huggingface_hub`"
-        )
+        raise ValueError("Make sure you have the huggingface hub installed via `pip install --upgrade huggingface_hub`")
 
     # Create repo if it doesn't exist yet and infer complete repo_id
     repo_url = create_repo(repo_id, token=token, private=private, exist_ok=True)
@@ -156,9 +154,7 @@ def push_to_hf_hub(
 
         # Add README if it does not exist
         try:
-            get_hf_file_metadata(
-                hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision)
-            )
+            get_hf_file_metadata(hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision))
         except:  # noqa: E722
             if model_card:
                 readme_text = generate_readme(model, repo_id)
@@ -241,13 +237,9 @@ def load_files_from_hf(path):
 
     # c-TF-IDF
     try:
-        ctfidf_config = load_cfg_from_json(
-            hf_hub_download(path, CTFIDF_CFG_NAME, revision=None)
-        )
+        ctfidf_config = load_cfg_from_json(hf_hub_download(path, CTFIDF_CFG_NAME, revision=None))
         try:
-            ctfidf_tensors = hf_hub_download(
-                path, CTFIDF_SAFE_WEIGHTS_NAME, revision=None
-            )
+            ctfidf_tensors = hf_hub_download(path, CTFIDF_SAFE_WEIGHTS_NAME, revision=None)
             ctfidf_tensors = load_safetensors(ctfidf_tensors)
         except:  # noqa: E722
             ctfidf_tensors = hf_hub_download(path, CTFIDF_WEIGHTS_NAME, revision=None)
@@ -268,9 +260,7 @@ def load_files_from_hf(path):
             topic_list = list(topics["topic_representations"].keys())
             images = {}
             for topic in topic_list:
-                image = Image.open(
-                    hf_hub_download(path, f"images/{topic}.jpg", revision=None)
-                )
+                image = Image.open(hf_hub_download(path, f"images/{topic}.jpg", revision=None))
                 images[int(topic)] = image
 
     return topics, params, tensors, ctfidf_tensors, ctfidf_config, images
@@ -283,11 +273,7 @@ def generate_readme(model, repo_id: str):
 
     # Get Statistics
     model_name = repo_id.split("/")[-1]
-    params = {
-        param: value
-        for param, value in model.get_params().items()
-        if "model" not in param
-    }
+    params = {param: value for param, value in model.get_params().items() if "model" not in param}
     params = "\n".join([f"* {param}: {value}" for param, value in params.items()])
     topics = sorted(list(set(model.topics_)))
     nr_topics = str(len(set(model.topics_)))
@@ -298,23 +284,15 @@ def generate_readme(model, repo_id: str):
         nr_documents = ""
 
     # Topic information
-    topic_keywords = [
-        " - ".join(list(zip(*model.get_topic(topic)))[0][:5]) for topic in topics
-    ]
+    topic_keywords = [" - ".join(list(zip(*model.get_topic(topic)))[0][:5]) for topic in topics]
     topic_freq = [model.get_topic_freq(topic) for topic in topics]
-    topic_labels = (
-        model.custom_labels_
-        if model.custom_labels_
-        else [model.topic_labels_[topic] for topic in topics]
-    )
+    topic_labels = model.custom_labels_ if model.custom_labels_ else [model.topic_labels_[topic] for topic in topics]
     topics = [
         f"| {topic} | {topic_keywords[index]} | {topic_freq[topic]} | {topic_labels[index]} | \n"
         for index, topic in enumerate(topics)
     ]
     topics = topic_table_head + "".join(topics)
-    frameworks = "\n".join(
-        [f"* {param}: {value}" for param, value in get_package_versions().items()]
-    )
+    frameworks = "\n".join([f"* {param}: {value}" for param, value in get_package_versions().items()])
 
     # Fill Statistics into model card
     model_card = model_card.replace("{MODEL_NAME}", model_name)
@@ -330,9 +308,7 @@ def generate_readme(model, repo_id: str):
     if not has_visual_aspect:
         model_card = model_card.replace("{PIPELINE_TAG}", "text-classification")
     else:
-        model_card = model_card.replace(
-            "pipeline_tag: {PIPELINE_TAG}\n", ""
-        )  # TODO add proper tag for this instance
+        model_card = model_card.replace("pipeline_tag: {PIPELINE_TAG}\n", "")  # TODO add proper tag for this instance
 
     return model_card
 

diff --git a/bertopic/_utils.py b/bertopic/_utils.py
@@ -45,20 +45,14 @@ def check_documents_type(documents):
         if not any([isinstance(doc, str) for doc in documents]):
             raise TypeError("Make sure that the iterable only contains strings.")
     else:
-        raise TypeError(
-            "Make sure that the documents variable is an iterable containing strings only."
-        )
+        raise TypeError("Make sure that the documents variable is an iterable containing strings only.")
 
 
 def check_embeddings_shape(embeddings, docs):
     """Check if the embeddings have the correct shape."""
     if embeddings is not None:
-        if not any(
-            [isinstance(embeddings, np.ndarray), isinstance(embeddings, csr_matrix)]
-        ):
-            raise ValueError(
-                "Make sure to input embeddings as a numpy array or scipy.sparse.csr.csr_matrix. "
-            )
+        if not any([isinstance(embeddings, np.ndarray), isinstance(embeddings, csr_matrix)]):
+            raise ValueError("Make sure to input embeddings as a numpy array or scipy.sparse.csr.csr_matrix. ")
         else:
             if embeddings.shape[0] != len(docs):
                 raise ValueError(
@@ -137,16 +131,11 @@ def validate_distance_matrix(X, n_samples):
         # check it has correct size
         n = s[0]
         if n != (n_samples * (n_samples - 1) / 2):
-            raise ValueError(
-                "The condensed distance matrix must have " "shape (n*(n-1)/2,)."
-            )
+            raise ValueError("The condensed distance matrix must have " "shape (n*(n-1)/2,).")
     elif len(s) == 2:
         # check it has correct size
         if (s[0] != n_samples) or (s[1] != n_samples):
-            raise ValueError(
-                "The distance matrix must be of shape "
-                "(n, n) where n is the number of samples."
-            )
+            raise ValueError("The distance matrix must be of shape " "(n, n) where n is the number of samples.")
         # force zero diagonal and convert to condensed
         np.fill_diagonal(X, 0)
         X = squareform(X)
@@ -182,15 +171,11 @@ def get_unique_distances(dists: np.array, noise_max=1e-7) -> np.array:
     for i in range(dists.shape[0] - 1):
         if dists[i] == dists[i + 1]:
             # returns the next unique distance or the current distance with the added noise
-            next_unique_dist = next(
-                (d for d in dists[i + 1 :] if d != dists[i]), dists[i] + noise_max
-            )
+            next_unique_dist = next((d for d in dists[i + 1 :] if d != dists[i]), dists[i] + noise_max)
 
             # the noise can never be large then the difference between the next unique distance and the current one
             curr_max_noise = min(noise_max, next_unique_dist - dists_cp[i])
-            dists_cp[i + 1] = np.random.uniform(
-                low=dists_cp[i] + curr_max_noise / 2, high=dists_cp[i] + curr_max_noise
-            )
+            dists_cp[i + 1] = np.random.uniform(low=dists_cp[i] + curr_max_noise / 2, high=dists_cp[i] + curr_max_noise)
     return dists_cp
 
 

diff --git a/bertopic/backend/_flair.py b/bertopic/backend/_flair.py
@@ -67,9 +67,7 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
         embeddings = []
         for document in tqdm(documents, disable=not verbose):
             try:
-                sentence = (
-                    Sentence(document) if document else Sentence("an empty document")
-                )
+                sentence = Sentence(document) if document else Sentence("an empty document")
                 self.embedding_model.embed(sentence)
             except RuntimeError:
                 sentence = Sentence("an empty document")

diff --git a/bertopic/backend/_gensim.py b/bertopic/backend/_gensim.py
@@ -48,9 +48,7 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
             Document/words embeddings with shape (n, m) with `n` documents/words
             that each have an embeddings size of `m`
         """
-        vector_shape = self.embedding_model.get_vector(
-            list(self.embedding_model.index_to_key)[0]
-        ).shape[0]
+        vector_shape = self.embedding_model.get_vector(list(self.embedding_model.index_to_key)[0]).shape[0]
         empty_vector = np.zeros(vector_shape)
 
         # Extract word embeddings and pool to document-level

diff --git a/bertopic/backend/_hftransformers.py b/bertopic/backend/_hftransformers.py
@@ -58,9 +58,7 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
 
         embeddings = []
         for document, features in tqdm(
-            zip(
-                documents, self.embedding_model(dataset, truncation=True, padding=True)
-            ),
+            zip(documents, self.embedding_model(dataset, truncation=True, padding=True)),
             total=len(dataset),
             disable=not verbose,
         ):
@@ -79,12 +77,10 @@ def _embed(self, document: str, features: np.ndarray) -> np.ndarray:
         https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2#usage-huggingface-transformers
         """
         token_embeddings = np.array(features)
-        attention_mask = self.embedding_model.tokenizer(
-            document, truncation=True, padding=True, return_tensors="np"
-        )["attention_mask"]
-        input_mask_expanded = np.broadcast_to(
-            np.expand_dims(attention_mask, -1), token_embeddings.shape
-        )
+        attention_mask = self.embedding_model.tokenizer(document, truncation=True, padding=True, return_tensors="np")[
+            "attention_mask"
+        ]
+        input_mask_expanded = np.broadcast_to(np.expand_dims(attention_mask, -1), token_embeddings.shape)
         sum_embeddings = np.sum(token_embeddings * input_mask_expanded, 1)
         sum_mask = np.clip(
             input_mask_expanded.sum(1),

diff --git a/bertopic/backend/_multimodal.py b/bertopic/backend/_multimodal.py
@@ -84,9 +84,7 @@ def __init__(
         except:  # noqa: E722
             self.tokenizer = None
 
-    def embed(
-        self, documents: List[str], images: List[str] = None, verbose: bool = False
-    ) -> np.ndarray:
+    def embed(self, documents: List[str], images: List[str] = None, verbose: bool = False) -> np.ndarray:
         """Embed a list of n documents/words or images into an n-dimensional
         matrix of embeddings.
 
@@ -124,9 +122,7 @@ def embed(
         elif image_embeddings is not None:
             return image_embeddings
 
-    def embed_documents(
-        self, documents: List[str], verbose: bool = False
-    ) -> np.ndarray:
+    def embed_documents(self, documents: List[str], verbose: bool = False) -> np.ndarray:
         """Embed a list of n documents/words into an n-dimensional
         matrix of embeddings.
 
@@ -139,9 +135,7 @@ def embed_documents(
             that each have an embeddings size of `m`
         """
         truncated_docs = [self._truncate_document(doc) for doc in documents]
-        embeddings = self.embedding_model.encode(
-            truncated_docs, show_progress_bar=verbose
-        )
+        embeddings = self.embedding_model.encode(truncated_docs, show_progress_bar=verbose)
         return embeddings
 
     def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:
@@ -170,15 +164,12 @@ def embed_images(self, images, verbose):
                 end_index = (i * self.batch_size) + self.batch_size
 
                 images_to_embed = [
-                    Image.open(image) if isinstance(image, str) else image
-                    for image in images[start_index:end_index]
+                    Image.open(image) if isinstance(image, str) else image for image in images[start_index:end_index]
                 ]
                 if self.image_model is not None:
                     img_emb = self.image_model.encode(images_to_embed)
                 else:
-                    img_emb = self.embedding_model.encode(
-                        images_to_embed, show_progress_bar=False
-                    )
+                    img_emb = self.embedding_model.encode(images_to_embed, show_progress_bar=False)
                 embeddings.extend(img_emb.tolist())
 
                 # Close images
@@ -191,9 +182,7 @@ def embed_images(self, images, verbose):
             if self.image_model is not None:
                 embeddings = self.image_model.encode(images_to_embed)
             else:
-                embeddings = self.embedding_model.encode(
-                    images_to_embed, show_progress_bar=False
-                )
+                embeddings = self.embedding_model.encode(images_to_embed, show_progress_bar=False)
         return embeddings
 
     def _truncate_document(self, document):

diff --git a/bertopic/backend/_openai.py b/bertopic/backend/_openai.py
@@ -70,9 +70,7 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
         if self.batch_size is not None:
             embeddings = []
             for batch in tqdm(self._chunks(prepared_documents), disable=not verbose):
-                response = self.client.embeddings.create(
-                    input=batch, **self.generator_kwargs
-                )
+                response = self.client.embeddings.create(input=batch, **self.generator_kwargs)
                 embeddings.extend([r.embedding for r in response.data])
 
                 # Delay subsequent calls
@@ -81,9 +79,7 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
 
         # Extract embeddings all at once
         else:
-            response = self.client.embeddings.create(
-                input=prepared_documents, **self.generator_kwargs
-            )
+            response = self.client.embeddings.create(input=prepared_documents, **self.generator_kwargs)
             embeddings = [r.embedding for r in response.data]
         return np.array(embeddings)
 

diff --git a/bertopic/backend/_use.py b/bertopic/backend/_use.py
@@ -50,9 +50,6 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
             that each have an embeddings size of `m`
         """
         embeddings = np.array(
-            [
-                self.embedding_model([doc]).cpu().numpy()[0]
-                for doc in tqdm(documents, disable=not verbose)
-            ]
+            [self.embedding_model([doc]).cpu().numpy()[0] for doc in tqdm(documents, disable=not verbose)]
         )
         return embeddings
diff --git a/bertopic/backend/_utils.py b/bertopic/backend/_utils.py
@@ -68,9 +68,7 @@
 ]
 
 
-def select_backend(
-    embedding_model, language: str = None, verbose: bool = False
-) -> BaseEmbedder:
+def select_backend(embedding_model, language: str = None, verbose: bool = False) -> BaseEmbedder:
     """Select an embedding model based on language or a specific provided model.
     When selecting a language, we choose all-MiniLM-L6-v2 for English and
     paraphrase-multilingual-MiniLM-L12-v2 for all other languages as it support 100+ languages.
@@ -115,9 +113,7 @@ def select_backend(
         return USEBackend(embedding_model)
 
     # Sentence Transformer embeddings
-    if "sentence_transformers" in str(type(embedding_model)) or isinstance(
-        embedding_model, str
-    ):
+    if "sentence_transformers" in str(type(embedding_model)) or isinstance(embedding_model, str):
         from ._sentencetransformers import SentenceTransformerBackend
 
         return SentenceTransformerBackend(embedding_model)
@@ -134,13 +130,9 @@ def select_backend(
             from ._sentencetransformers import SentenceTransformerBackend
 
             if language.lower() in ["English", "english", "en"]:
-                return SentenceTransformerBackend(
-                    "sentence-transformers/all-MiniLM-L6-v2"
-                )
+                return SentenceTransformerBackend("sentence-transformers/all-MiniLM-L6-v2")
             elif language.lower() in languages or language == "multilingual":
-                return SentenceTransformerBackend(
-                    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
-                )
+                return SentenceTransformerBackend("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
             else:
                 raise ValueError(
                     f"{language} is currently not supported. However, you can "

diff --git a/bertopic/cluster/_utils.py b/bertopic/cluster/_utils.py
@@ -25,9 +25,7 @@ def hdbscan_delegator(model, func: str, embeddings: np.ndarray = None):
         if "cuml" in str_type_model and "hdbscan" in str_type_model:
             from cuml.cluster import hdbscan as cuml_hdbscan
 
-            predictions, probabilities = cuml_hdbscan.approximate_predict(
-                model, embeddings
-            )
+            predictions, probabilities = cuml_hdbscan.approximate_predict(model, embeddings)
             return predictions, probabilities
 
         predictions = model.predict(embeddings)

diff --git a/bertopic/plotting/_approximate_distribution.py b/bertopic/plotting/_approximate_distribution.py
@@ -86,9 +86,7 @@ def text_color(val):
 
     def highligh_color(data, color="white"):
         attr = "background-color: {}".format(color)
-        return pd.DataFrame(
-            np.where(data == 0, attr, ""), index=data.index, columns=data.columns
-        )
+        return pd.DataFrame(np.where(data == 0, attr, ""), index=data.index, columns=data.columns)
 
     if len(df) == 0:
         return df