diff --git a/.flake8 b/.flake8 index 1fd48933..01f47754 100644 --- a/.flake8 +++ b/.flake8 @@ -1,2 +1,2 @@ -[flake8] +[flake8] max-line-length = 160 diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index c2ce2f83..edfca558 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -25,7 +25,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip + python -m pip install --upgrade pip pip install -e ".[test]" - name: Run Checking Mechanisms run: make check diff --git a/.gitignore b/.gitignore index a8e93cbf..30b09012 100644 --- a/.gitignore +++ b/.gitignore @@ -73,6 +73,7 @@ ENV/ env.bak/ venv.bak/ +# Artifacts .idea .idea/ .vscode diff --git a/LICENSE b/LICENSE index 27f9436e..6bd3f051 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2022, Maarten P. Grootendorst +Copyright (c) 2023, Maarten P. Grootendorst Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 00031023..e5bb153e 100644 --- a/README.md +++ b/README.md @@ -15,13 +15,16 @@ allowing for easily interpretable topics whilst keeping important words in the t BERTopic supports [**guided**](https://maartengr.github.io/BERTopic/getting_started/guided/guided.html), -(semi-) [**supervised**](https://maartengr.github.io/BERTopic/getting_started/supervised/supervised.html), +[**supervised**](https://maartengr.github.io/BERTopic/getting_started/supervised/supervised.html), +[**semi-supervised**](https://maartengr.github.io/BERTopic/getting_started/semisupervised/semisupervised.html), +[**manual**](https://maartengr.github.io/BERTopic/getting_started/manual/manual.html), +[**long-document**](https://maartengr.github.io/BERTopic/getting_started/distribution/distribution.html), [**hierarchical**](https://maartengr.github.io/BERTopic/getting_started/hierarchicaltopics/hierarchicaltopics.html), +[**class-based**](https://maartengr.github.io/BERTopic/getting_started/topicsperclass/topicsperclass.html), [**dynamic**](https://maartengr.github.io/BERTopic/getting_started/topicsovertime/topicsovertime.html), and [**online**](https://maartengr.github.io/BERTopic/getting_started/online/online.html) topic modeling. It even supports visualizations similar to LDAvis! -Corresponding medium posts can be found [here](https://towardsdatascience.com/topic-modeling-with-bert-779f7db187e6?source=friends_link&sk=0b5a470c006d1842ad4c8a3057063a99) -and [here](https://towardsdatascience.com/interactive-topic-modeling-with-bertopic-1ea55e7d73d8?sk=03c2168e9e74b6bda2a1f3ed953427e4). For a more detailed overview, you can read the [paper](https://arxiv.org/abs/2203.05794). +Corresponding medium posts can be found [here](https://towardsdatascience.com/topic-modeling-with-bert-779f7db187e6?source=friends_link&sk=0b5a470c006d1842ad4c8a3057063a99), [here](https://towardsdatascience.com/interactive-topic-modeling-with-bertopic-1ea55e7d73d8?sk=03c2168e9e74b6bda2a1f3ed953427e4) and [here](https://towardsdatascience.com/using-whisper-and-bertopic-to-model-kurzgesagts-videos-7d8a63139bdf?sk=b1e0fd46f70cb15e8422b4794a81161d). For a more detailed overview, you can read the [paper](https://arxiv.org/abs/2203.05794) or see a [brief overview](https://maartengr.github.io/BERTopic/algorithm/algorithm.html). 
## Installation @@ -31,8 +34,7 @@ Installation, with sentence-transformers, can be done using [pypi](https://pypi. pip install bertopic ``` -You may want to install more depending on the transformers and language backends that you will be using. -The possible installations are: +If you want to install BERTopic with other embedding models, you can choose one of the following: ```bash pip install bertopic[flair] @@ -82,8 +84,8 @@ Topic Count Name 3 381 22_key_encryption_keys_encrypted ``` --1 refers to all outliers and should typically be ignored. Next, let's take a look at the most -frequent topic that was generated, topic 0: +The `-1` topic refers to all outlier documents and are typically ignored. Next, let's take a look at the most +frequent topic that was generated: ```python >>> topic_model.get_topic(0) @@ -100,7 +102,22 @@ frequent topic that was generated, topic 0: ('pc', 0.003047105930670237)] ``` -**NOTE**: Use `BERTopic(language="multilingual")` to select a model that supports 50+ languages. +Using `.get_document_info`, we can also extract information on a document level, such as their corresponding topics, probabilities, whether they are representative documents for a topic, etc.: + +```python +>>> topic_model.get_document_info(docs) + +Document Topic Name Top_n_words Probability ... +I am sure some bashers of Pens... 0 0_game_team_games_season game - team - games... 0.200010 ... +My brother is in the market for... -1 -1_can_your_will_any can - your - will... 0.420668 ... +Finally you said what you dream... -1 -1_can_your_will_any can - your - will... 0.807259 ... +Think! It's the SCSI card doing... 49 49_windows_drive_dos_file windows - drive - docs... 0.071746 ... +1) I have an old Jasmine drive... 49 49_windows_drive_dos_file windows - drive - docs... 0.038983 ... +``` + +> **Note** +> +> Use `BERTopic(language="multilingual")` to select a model that supports 50+ languages. ## Visualize Topics After having trained our BERTopic model, we can iteratively go through hundreds of topics to get a good @@ -114,51 +131,19 @@ topic_model.visualize_topics() -We can create an overview of the most frequent topics in a way that they are easily interpretable. -Horizontal barcharts typically convey information rather well and allow for an intuitive representation -of the topics: - -```python -topic_model.visualize_barchart() -``` - - - - Find all possible visualizations with interactive examples in the documentation [here](https://maartengr.github.io/BERTopic/getting_started/visualization/visualization.html). -## Embedding Models -BERTopic supports many embedding models that can be used to embed the documents and words: -* Sentence-Transformers -* 🤗 Transformers -* Flair -* Spacy -* Gensim -* USE -[**Sentence-Transformers**](https://github.com/UKPLab/sentence-transformers) is typically used as it has shown great results embedding documents -meant for semantic similarity. Simply select any from their documentation -[here](https://www.sbert.net/docs/pretrained_models.html) and pass it to BERTopic: +## Modularity +By default, the main steps for topic modeling with BERTopic are sentence-transformers, UMAP, HDBSCAN, and c-TF-IDF run in sequence. However, it assumes some independence between these steps which makes BERTopic quite modular. 
In other words, BERTopic not only allows you to build your own topic model but to explore several topic modeling techniques on top of your customized topic model: -```python -topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2") -``` +https://user-images.githubusercontent.com/25746895/205490350-cd9833e7-9cd5-44fa-8752-407d748de633.mp4 -Similarly, you can choose any [**🤗 Transformers**](https://huggingface.co/models) model and pass it to BERTopic: - -```python -from transformers.pipelines import pipeline +You can swap out any of these models or even remove them entirely. Starting with the embedding step, you can find out how to do this [here](https://maartengr.github.io/BERTopic/getting_started/embeddings/embeddings.html) and more about the underlying algorithm and assumptions [here](https://maartengr.github.io/BERTopic/algorithm/algorithm.html). -embedding_model = pipeline("feature-extraction", model="distilbert-base-cased") -topic_model = BERTopic(embedding_model=embedding_model) -``` - -Click [here](https://maartengr.github.io/BERTopic/getting_started/embeddings/embeddings.html) -for a full overview of all supported embedding models. - -## Overview -BERTopic has quite a number of functions that quickly can become overwhelming. To alleviate this issue, you will find an overview +## Functionality +BERTopic has many functions that quickly can become overwhelming. To alleviate this issue, you will find an overview of all methods and a short description of its purpose. ### Common @@ -173,12 +158,14 @@ Below, you will find an overview of common functions in BERTopic. | Access all topics | `.get_topics()` | | Get topic freq | `.get_topic_freq()` | | Get all topic information| `.get_topic_info()` | +| Get all document information| `.get_document_info(docs)` | | Get representative docs per topic | `.get_representative_docs()` | | Update topic representation | `.update_topics(docs, n_gram_range=(1, 3))` | | Generate topic labels | `.generate_topic_labels()` | | Set topic labels | `.set_topic_labels(my_custom_labels)` | | Merge topics | `.merge_topics(docs, topics_to_merge)` | | Reduce nr of topics | `.reduce_topics(docs, nr_topics=30)` | +| Reduce outliers | `.reduce_outliers(docs, topics)` | | Find topics | `.find_topics("vehicle")` | | Save model | `.save("my_model")` | | Load model | `BERTopic.load("my_model")` | @@ -186,35 +173,39 @@ Below, you will find an overview of common functions in BERTopic. ### Attributes -After having trained your BERTopic model, a number of attributes are saved within your model. These attributes, in part, +After having trained your BERTopic model, several attributes are saved within your model. These attributes, in part, refer to how model information is stored on an estimator during fitting. The attributes that you see below all end in `_` and are public attributes that can be used to access model information. | Attribute | Description | |------------------------|---------------------------------------------------------------------------------------------| -| topics_ | The topics that are generated for each document after training or updating the topic model. | -| probabilities_ | The probabilities that are generated for each document if HDBSCAN is used. | -| topic_sizes_ | The size of each topic | -| topic_mapper_ | A class for tracking topics and their mappings anytime they are merged/reduced. | -| topic_representations_ | The top *n* terms per topic and their respective c-TF-IDF values. | -| c_tf_idf_ | The topic-term matrix as calculated through c-TF-IDF. 
| -| topic_labels_ | The default labels for each topic. | -| custom_labels_ | Custom labels for each topic as generated through `.set_topic_labels`. | -| topic_embeddings_ | The embeddings for each topic if `embedding_model` was used. | -| representative_docs_ | The representative documents for each topic if HDBSCAN is used. | +| `.topics_` | The topics that are generated for each document after training or updating the topic model. | +| `.probabilities_` | The probabilities that are generated for each document if HDBSCAN is used. | +| `.topic_sizes_` | The size of each topic | +| `.topic_mapper_` | A class for tracking topics and their mappings anytime they are merged/reduced. | +| `.topic_representations_` | The top *n* terms per topic and their respective c-TF-IDF values. | +| `.c_tf_idf_` | The topic-term matrix as calculated through c-TF-IDF. | +| `.topic_labels_` | The default labels for each topic. | +| `.custom_labels_` | Custom labels for each topic as generated through `.set_topic_labels`. | +| `.topic_embeddings_` | The embeddings for each topic if `embedding_model` was used. | +| `.representative_docs_` | The representative documents for each topic if HDBSCAN is used. | ### Variations -There are many different use cases in which topic modeling can be used. As such, a number of -variations of BERTopic have been developed such that one package can be used across across many use cases. +There are many different use cases in which topic modeling can be used. As such, several variations of BERTopic have been developed such that one package can be used across many use cases. | Method | Code | |-----------------------|---| -| (semi-) Supervised Topic Modeling | `.fit(docs, y=y)` | -| Topic Modeling per Class | `.topics_per_class(docs, classes)` | -| Dynamic Topic Modeling | `.topics_over_time(docs, timestamps)` | -| Hierarchical Topic Modeling | `.hierarchical_topics(docs)` | -| Guided Topic Modeling | `BERTopic(seed_topic_list=seed_topic_list)` | +| [Topic Distribution Approximation](https://maartengr.github.io/BERTopic/getting_started/distribution/distribution.html) | `.approximate_distribution(docs)` | +| [Online Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/online/online.html) | `.partial_fit(doc)` | +| [Semi-supervised Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/semisupervised/semisupervised.html) | `.fit(docs, y=y)` | +| [Supervised Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/supervised/supervised.html) | `.fit(docs, y=y)` | +| [Manual Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/manual/manual.html) | `.fit(docs, y=y)` | +| [Topic Modeling per Class](https://maartengr.github.io/BERTopic/getting_started/topicsperclass/topicsperclass.html) | `.topics_per_class(docs, classes)` | +| [Dynamic Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/topicsovertime/topicsovertime.html) | `.topics_over_time(docs, timestamps)` | +| [Hierarchical Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/hierarchicaltopics/hierarchicaltopics.html) | `.hierarchical_topics(docs)` | +| [Guided Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/guided/guided.html) | `BERTopic(seed_topic_list=seed_topic_list)` | + ### Visualizations Evaluating topic models can be rather difficult due to the somewhat subjective nature of evaluation. 
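
To see how a few of the additions above fit together, here is a minimal sketch that assumes the `docs`, `topics`, and fitted `topic_model` from the quick start. It combines topic distribution approximation, outlier reduction, and document-level information:

```python
# Approximate topic distributions for all documents (works with any cluster model)
topic_distr, _ = topic_model.approximate_distribution(docs)

# Re-assign outlier (-1) documents to their closest topic, using topic distributions by default
new_topics = topic_model.reduce_outliers(docs, topics, strategy="distributions")

# Inspect per-document details such as topic, name, and representative-document flag
document_info = topic_model.get_document_info(docs)
```
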
diff --git a/bertopic/__init__.py b/bertopic/__init__.py index 2d18e0cb..b2e745e7 100644 --- a/bertopic/__init__.py +++ b/bertopic/__init__.py @@ -1,6 +1,6 @@ from bertopic._bertopic import BERTopic -__version__ = "0.12.0" +__version__ = "0.13.0" __all__ = [ "BERTopic", diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py index 1cbf7152..ef0da5e1 100644 --- a/bertopic/_bertopic.py +++ b/bertopic/_bertopic.py @@ -9,6 +9,7 @@ pass import re +import math import joblib import inspect import numpy as np @@ -17,6 +18,7 @@ from packaging import version from scipy.sparse import csr_matrix from scipy.cluster import hierarchy as sch +from scipy.spatial.distance import squareform from typing import List, Tuple, Union, Mapping, Any, Callable, Iterable # Models @@ -31,7 +33,9 @@ from bertopic import plotting from bertopic._mmr import mmr from bertopic.vectorizers import ClassTfidfTransformer +from bertopic.backend import BaseEmbedder from bertopic.backend._utils import select_backend +from bertopic.cluster._utils import hdbscan_delegator, is_supported_hdbscan from bertopic._utils import MyLogger, check_documents_type, check_embeddings_shape, check_is_fitted # Visualization @@ -132,6 +136,7 @@ def __init__(self, CountVectorizer. min_topic_size: The minimum size of the topic. Increasing this value will lead to a lower number of clusters/topics. + NOTE: This param will not be used if you are not using HDBSCAN. nr_topics: Specifying the number of topics will reduce the initial number of topics to the value specified. This reduction can take a while as each reduction in topics (-1) activates a c-TF-IDF @@ -176,6 +181,7 @@ def __init__(self, # Topic-based parameters if top_n_words > 30: raise ValueError("top_n_words should be lower or equal to 30. The preferred value is 10.") + self.top_n_words = top_n_words self.min_topic_size = min_topic_size self.nr_topics = nr_topics @@ -217,7 +223,7 @@ def __init__(self, self.topic_embeddings_ = None self.topic_labels_ = None self.custom_labels_ = None - self.representative_docs_ = None + self.representative_docs_ = {} self.c_tf_idf_ = None # Private attributes for internal tracking purposes @@ -345,7 +351,7 @@ def fit_transform(self, umap_embeddings = self._reduce_dimensionality(embeddings, y) # Cluster reduced embeddings - documents, probabilities = self._cluster_embeddings(umap_embeddings, documents) + documents, probabilities = self._cluster_embeddings(umap_embeddings, documents, y=y) # Sort and Map Topic IDs by their frequency if not self.nr_topics: @@ -358,7 +364,11 @@ def fit_transform(self, if self.nr_topics: documents = self._reduce_topics(documents) - self._map_representative_docs(original_topics=True) + if isinstance(self.hdbscan_model, hdbscan.HDBSCAN): + self._map_representative_docs(original_topics=True) + else: + self._save_representative_docs(documents) + self.probabilities_ = self._map_probabilities(probabilities, original_topics=True) predictions = documents.Topic.to_list() @@ -423,15 +433,14 @@ def transform(self, umap_embeddings = self.umap_model.transform(embeddings) logger.info("Reduced dimensionality") - # Extract predictions and probabilities if it is a HDBSCAN model - if isinstance(self.hdbscan_model, hdbscan.HDBSCAN): - predictions, probabilities = hdbscan.approximate_predict(self.hdbscan_model, umap_embeddings) + # Extract predictions and probabilities if it is a HDBSCAN-like model + if is_supported_hdbscan(self.hdbscan_model): + predictions, probabilities = hdbscan_delegator(self.hdbscan_model, "approximate_predict", umap_embeddings) # 
Calculate probabilities - if self.calculate_probabilities: + if self.calculate_probabilities and isinstance(self.hdbscan_model, hdbscan.HDBSCAN): probabilities = hdbscan.membership_vector(self.hdbscan_model, umap_embeddings) logger.info("Calculated probabilities with HDBSCAN") - else: predictions = self.hdbscan_model.predict(umap_embeddings) probabilities = None @@ -744,9 +753,9 @@ def topics_per_class(self, Arguments: docs: The documents you used when calling either `fit` or `fit_transform` classes: The class of each document. This can be either a list of strings or ints. - global_tuning: Fine-tune each topic representation at timestamp t by averaging its c-TF-IDF matrix - with the global c-TF-IDF matrix. Turn this off if you want to prevent words in - topic representations that could not be found in the documents at timestamp t. + global_tuning: Fine-tune each topic representation for class c t by averaging its c-TF-IDF matrix + with the global c-TF-IDF matrix. Turn this off if you want to prevent words in + topic representations that could not be found in the documents for class c. Returns: topics_per_class: A dataframe that contains the topic, words, and frequency of topics @@ -851,9 +860,15 @@ def hierarchical_topics(self, if linkage_function is None: linkage_function = lambda x: sch.linkage(x, 'ward', optimal_ordering=True) - # Calculate linkage + # Calculate distance embeddings = self.c_tf_idf_[self._outliers:] X = distance_function(embeddings) + + # Make sure it is the 1-D condensed distance matrix with zeros on the diagonal + np.fill_diagonal(X, 0) + X = squareform(X) + + # Use the 1-D condensed distance matrix as an input instead of the raw distance matrix Z = linkage_function(X) # Calculate basic bag-of-words to be iteratively merged later @@ -934,6 +949,227 @@ def hierarchical_topics(self, return hier_topics + def approximate_distribution(self, + documents: Union[str, List[str]], + window: int = 4, + stride: int = 1, + min_similarity: float = 0.1, + batch_size: int = 1000, + padding: bool = False, + use_embedding_model: bool = False, + calculate_tokens: bool = False, + separator: str = " ") -> Tuple[np.ndarray, + Union[List[np.ndarray], None]]: + """ A post-hoc approximation of topic distributions across documents. + + In order to perform this approximation, each document is split into tokens + according to the provided tokenizer in the `CountVectorizer`. Then, a + sliding window is applied on each document creating subsets of the document. + For example, with a window size of 3 and stride of 1, the sentence: + + `Solving the right problem is difficult.` + + can be split up into `solving the right`, `the right problem`, `right problem is`, + and `problem is difficult`. These are called tokensets. For each of these + tokensets, we calculate their c-TF-IDF representation and find out + how similar they are to the previously generated topics. Then, the + similarities to the topics for each tokenset are summed in order to + create a topic distribution for the entire document. + + We can also dive into this a bit deeper by then splitting these tokensets + up into individual tokens and calculate how much a word, in a specific sentence, + contributes to the topics found in that document. This can be enabled by + setting `calculate_tokens=True` which can be used for visualization purposes + in `topic_model.visualize_approximate_distribution`. 
+ + The main output, `topic_distributions`, can also be used directly in + `.visualize_distribution(topic_distributions[index])` by simply selecting + a single distribution. + + Arguments: + documents: A single document or a list of documents for which we + approximate their topic distributions + window: Size of the moving window which indicates the number of + tokens being considered. + stride: How far the window should move at each step. + min_similarity: The minimum similarity of a document's tokenset + with respect to the topics. + batch_size: The number of documents to process at a time. If None, + then all documents are processed at once. + NOTE: With a large number of documents, it is not + advised to process all documents at once. + padding: Whether to pad the beginning and ending of a document with + empty tokens. + use_embedding_model: Whether to use the topic model's embedding + model to calculate the similarity between + tokensets and topics instead of using c-TF-IDF. + calculate_tokens: Calculate the similarity of tokens with all topics. + NOTE: This is computation-wise more expensive and + can require more memory. Using this over batches of + documents might be preferred. + separator: The separator used to merge tokens into tokensets. + + Returns: + topic_distributions: A `n` x `m` matrix containing the topic distributions + for all input documents with `n` being the documents + and `m` the topics. + topic_token_distributions: A list of `t` x `m` arrays with `t` being the + number of tokens for the respective document + and `m` the topics. + + Examples: + + After fitting the model, the topic distributions can be calculated regardless + of the clustering model and regardless of whether the documents were previously + seen or not: + + ```python + topic_distr, _ = topic_model.approximate_distribution(docs) + ``` + + As a result, the topic distributions are calculated in `topic_distr` for the + entire document based on token set with a specific window size and stride. + + If you want to calculate the topic distributions on a token-level: + + ```python + topic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True) + ``` + + The `topic_token_distr` then contains, for each token, the best fitting topics. + As with `topic_distr`, it can contain multiple topics for a single token. 
+ """ + if isinstance(documents, str): + documents = [documents] + + if batch_size is None: + batch_size = len(documents) + batches = 1 + else: + batches = math.ceil(len(documents)/batch_size) + + topic_distributions = [] + topic_token_distributions = [] + + for i in tqdm(range(batches), disable=not self.verbose): + doc_set = documents[i*batch_size: (i+1) * batch_size] + + # Extract tokens + analyzer = self.vectorizer_model.build_tokenizer() + tokens = [analyzer(document) for document in doc_set] + + # Extract token sets + all_sentences = [] + all_indices = [0] + all_token_sets_ids = [] + + for tokenset in tokens: + if len(tokenset) < window: + token_sets = [tokenset] + token_sets_ids = [list(range(len(tokenset)))] + else: + + # Extract tokensets using window and stride parameters + stride_indices = list(range(len(tokenset)))[::stride] + token_sets = [] + token_sets_ids = [] + for stride_index in stride_indices: + selected_tokens = tokenset[stride_index: stride_index+window] + + if padding or len(selected_tokens) == window: + token_sets.append(selected_tokens) + token_sets_ids.append(list(range(stride_index, stride_index+len(selected_tokens)))) + + # Add empty tokens at the beginning and end of a document + if padding: + padded = [] + padded_ids = [] + t = math.ceil(window / stride) - 1 + for i in range(math.ceil(window / stride) - 1): + padded.append(tokenset[:window - ((t-i) * stride)]) + padded_ids.append(list(range(0, window - ((t-i) * stride)))) + + token_sets = padded + token_sets + token_sets_ids = padded_ids + token_sets_ids + + # Join the tokens + sentences = [separator.join(token) for token in token_sets] + all_sentences.extend(sentences) + all_token_sets_ids.extend(token_sets_ids) + all_indices.append(all_indices[-1] + len(sentences)) + + # Calculate similarity between embeddings of token sets and the topics + if use_embedding_model: + embeddings = self._extract_embeddings(all_sentences, method="document", verbose=True) + similarity = cosine_similarity(embeddings, self.topic_embeddings_[self._outliers:]) + + # Calculate similarity between c-TF-IDF of token sets and the topics + else: + bow_doc = self.vectorizer_model.transform(all_sentences) + c_tf_idf_doc = self.ctfidf_model.transform(bow_doc) + similarity = cosine_similarity(c_tf_idf_doc, self.c_tf_idf_[self._outliers:]) + + # Only keep similarities that exceed the minimum + similarity[similarity < min_similarity] = 0 + + # Aggregate results on an individual token level + if calculate_tokens: + topic_distribution = [] + topic_token_distribution = [] + for index, token in enumerate(tokens): + start = all_indices[index] + end = all_indices[index+1] + + if start == end: + end = end + 1 + + # Assign topics to individual tokens + token_id = [i for i in range(len(token))] + token_val = {index: [] for index in token_id} + for sim, token_set in zip(similarity[start:end], all_token_sets_ids[start:end]): + for token in token_set: + if token in token_val: + token_val[token].append(sim) + + matrix = [] + for _, value in token_val.items(): + matrix.append(np.add.reduce(value)) + + # Take empty documents into account + matrix = np.array(matrix) + if len(matrix.shape) == 1: + matrix = np.zeros((1, len(self.topic_labels_) - self._outliers)) + + topic_token_distribution.append(np.array(matrix)) + topic_distribution.append(np.add.reduce(matrix)) + + topic_distribution = normalize(topic_distribution, norm='l1', axis=1) + + # Aggregate on a tokenset level indicated by the window and stride + else: + topic_distribution = [] + for index in 
range(len(all_indices)-1): + start = all_indices[index] + end = all_indices[index+1] + + if start == end: + end = end + 1 + group = similarity[start:end].sum(axis=0) + topic_distribution.append(group) + topic_distribution = normalize(np.array(topic_distribution), norm='l1', axis=1) + topic_token_distribution = None + + # Combine results + topic_distributions.append(topic_distribution) + if topic_token_distribution is None: + topic_token_distributions = None + else: + topic_token_distributions.extend(topic_token_distribution) + + topic_distributions = np.vstack(topic_distributions) + + return topic_distributions, topic_token_distributions + def find_topics(self, search_term: str, top_n: int = 5) -> Tuple[List[int], List[float]]: @@ -1170,8 +1406,109 @@ def get_topic_freq(self, topic: int = None) -> Union[pd.DataFrame, int]: return pd.DataFrame(self.topic_sizes_.items(), columns=['Topic', 'Count']).sort_values("Count", ascending=False) + def get_document_info(self, + docs: List[str], + df: pd.DataFrame = None, + metadata: Mapping[str, Any] = None) -> pd.DataFrame: + """ Get information about the documents on which the topic was trained + including the documents themselves, their respective topics, the name + of each topic, the top n words of each topic, whether it is a + representative document, and probability of the clustering if the cluster + model supports it. + + There are also options to include other meta data, such as the topic + distributions or the x and y coordinates of the reduced embeddings. + + Arguments: + docs: The documents on which the topic model was trained. + df: A dataframe containing the metadata and the documents on which + the topic model was originally trained on. + metadata: A dictionary with meta data for each document in the form + of column name (key) and the respective values (value). + + Returns: + document_info: A dataframe with several statistics regarding + the documents on which the topic model was trained. + + Usage: + + To get the document info, you will only need to pass the documents on which + the topic model was trained: + + ```python + document_info = topic_model.get_document_info(docs) + ``` + + There are additionally options to include meta data, such as the topic + distributions. 
Moreover, we can pass the original dataframe that contains + the documents and extend it with the information retrieved from BERTopic: + + ```python + from sklearn.datasets import fetch_20newsgroups + + # The original data in a dataframe format to include the target variable + data= fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')) + df = pd.DataFrame({"Document": data['data'], "Class": data['target']}) + + # Add information about the percentage of the document that relates to the topic + topic_distr, _ = topic_model.approximate_distribution(docs, batch_size=1000) + distributions = [distr[topic] if topic != -1 else 0 for topic, distr in zip(topics, topic_distr)] + + # Create our documents dataframe using the original dataframe and meta data about + # the topic distributions + document_info = topic_model.get_document_info(docs, df=df, + metadata={"Topic_distribution": distributions}) + ``` + """ + if df is not None: + document_info = df.copy() + document_info["Document"] = docs + document_info["Topic"] = self.topics_ + else: + document_info = pd.DataFrame({"Document": docs, "Topic": self.topics_}) + + # Add topic info through `.get_topic_info()` + topic_info = self.get_topic_info().drop("Count", axis=1) + document_info = pd.merge(document_info, topic_info, on="Topic", how="left") + + # Add top n words + top_n_words = {topic: " - ".join(list(zip(*self.get_topic(topic)))[0]) for topic in set(self.topics_)} + document_info["Top_n_words"] = document_info.Topic.map(top_n_words) + + # Add flat probabilities + if self.probabilities_ is not None: + if len(self.probabilities_.shape) == 1: + document_info["Probability"] = self.probabilities_ + else: + document_info["Probability"] = [max(probs) if topic != -1 else 1-sum(probs) + for topic, probs in zip(self.topics_, self.probabilities_)] + + # Add representative document labels + repr_docs = [repr_doc for repr_docs in self.representative_docs_.values() for repr_doc in repr_docs] + document_info["Representative_document"] = False + document_info.loc[document_info.Document.isin(repr_docs), "Representative_document"] = True + + # Add custom meta data provided by the user + if metadata is not None: + for column, values in metadata.items(): + document_info[column] = values + return document_info + def get_representative_docs(self, topic: int = None) -> List[str]: - """ Extract representative documents per topic + """ Extract the best representing documents per topic. + + NOTE: + This does not extract all documents per topic as all documents + are not saved within BERTopic. 
To get all documents, please + run the following: + + ```python + # When you used `.fit_transform`: + df = pd.DataFrame({"Document": docs, "Topic": topic}) + + # When you used `.fit`: + df = pd.DataFrame({"Document": docs, "Topic": topic_model.topics_}) + ``` Arguments: topic: A specific topic for which you want @@ -1196,7 +1533,10 @@ def get_representative_docs(self, topic: int = None) -> List[str]: """ check_is_fitted(self) if isinstance(topic, int): - return self.representative_docs_[topic] + if self.representative_docs_.get(topic): + return self.representative_docs_[topic] + else: + return None else: return self.representative_docs_ @@ -1517,6 +1857,141 @@ def reduce_topics(self, return self + def reduce_outliers(self, + documents: List[str], + topics: List[int], + strategy: str = "distributions", + probabilities: np.ndarray = None, + threshold: int = 0, + embeddings: np.ndarray = None, + distributions_params: Mapping[str, Any] = {}) -> List[int]: + """ Reduce outliers by merging them with their nearest topic according + to one of several strategies. + + When using HDBSCAN, DBSCAN, or OPTICS, a number of outlier documents might be created + that do not fall within any of the created topics. These are labeled as -1. + This function allows the user to match outlier documents with their nearest topic + using one of the following strategies using the `strategy` parameter: + * "probabilities" + This uses the soft-clustering as performed by HDBSCAN to find the + best matching topic for each outlier document. To use this, make + sure to calculate the `probabilities` beforehand by instantiating + BERTopic with `calculate_probabilities=True`. + * "distributions" + Use the topic distributions, as calculated with `.approximate_distribution` + to find the most frequent topic in each outlier document. You can use the + `distributions_params` variable to tweak the parameters of + `.approximate_distribution`. + * "c-tf-idf" + Calculate the c-TF-IDF representation for each outlier document and + find the best matching c-TF-IDF topic representation using + cosine similarity. + * "embeddings" + Using the embeddings of each outlier documents, find the best + matching topic embedding using cosine similarity. + + Arguments: + documents: A list of documents for which we reduce or remove the outliers. + topics: The topics that correspond to the documents + strategy: The strategy used for reducing outliers. + Options: + * "probabilities" + This uses the soft-clustering as performed by HDBSCAN + to find the best matching topic for each outlier document. + + * "distributions" + Use the topic distributions, as calculated with `.approximate_distribution` + to find the most frequent topic in each outlier document. + + * "c-tf-idf" + Calculate the c-TF-IDF representation for outlier documents and + find the best matching c-TF-IDF topic representation. + + * "embeddings" + Calculate the embeddings for outlier documents and + find the best matching topic embedding. + threshold: The threshold for assigning topics to outlier documents. This value + represents the minimum probability when `strategy="probabilities"`. + For all other strategies, it represents the minimum similarity. + embeddings: The pre-computed embeddings to be used when `strategy="embeddings"`. + If this is None, then it will compute the embeddings for the outlier documents. + distributions_params: The parameters used in `.approximate_distribution` when using + the strategy `"distributions"`. 
+ + Returns: + new_topics: The updated topics + + Usage: + + The default settings uses the `"distributions"` strategy: + + ```python + new_topics = topic_model.reduce_outliers(docs, topics) + ``` + + When you use the `"probabilities"` strategy, make sure to also pass the probabilities + as generated through HDBSCAN: + + ```python + from bertopic import BERTopic + topic_model = BERTopic(calculate_probabilities=True) + topics, probs = topic_model.fit_transform(docs) + + new_topics = topic_model.reduce_outliers(docs, topics, probabilities=probs, strategy="probabilities") + ``` + """ + + # Check correct use of parameters + if strategy.lower() == "probabilities" and probabilities is None: + raise ValueError("Make sure to pass in `probabilities` in order to use the probabilities strategy") + + # Reduce outliers by extracting most likely topics through the topic-term probability matrix + if strategy.lower() == "probabilities": + new_topics = [np.argmax(prob) if max(prob) >= threshold and topic == -1 else topic + for topic, prob in zip(topics, probabilities)] + + # Reduce outliers by extracting most frequent topics through calculating of Topic Distributions + elif strategy.lower() == "distributions": + outlier_ids = [index for index, topic in enumerate(topics) if topic == -1] + outlier_docs = [documents[index] for index in outlier_ids] + topic_distr, _ = self.approximate_distribution(outlier_docs, min_similarity=threshold, **distributions_params) + outlier_topics = iter([np.argmax(prob) if sum(prob) > 0 else -1 for prob in topic_distr]) + new_topics = [topic if topic != -1 else next(outlier_topics) for topic in topics] + + # Reduce outliers by finding the most similar c-TF-IDF representations + elif strategy.lower() == "c-tf-idf": + outlier_ids = [index for index, topic in enumerate(topics) if topic == -1] + outlier_docs = [documents[index] for index in outlier_ids] + + # Calculate c-TF-IDF of outlier documents with all topics + bow_doc = self.vectorizer_model.transform(outlier_docs) + c_tf_idf_doc = self.ctfidf_model.transform(bow_doc) + similarity = cosine_similarity(c_tf_idf_doc, self.c_tf_idf_[self._outliers:]) + + # Update topics + similarity[similarity < threshold] = 0 + outlier_topics = iter([np.argmax(sim) if sum(sim) > 0 else -1 for sim in similarity]) + new_topics = [topic if topic != -1 else next(outlier_topics) for topic in topics] + + # Reduce outliers by finding the most similar topic embeddings + elif strategy.lower() == "embeddings": + outlier_ids = [index for index, topic in enumerate(topics) if topic == -1] + outlier_docs = [documents[index] for index in outlier_ids] + + # Extract or calculate embeddings for outlier documents + if embeddings is not None: + outlier_embeddings = np.array([embeddings[index] for index in outlier_ids]) + else: + outlier_embeddings = self.embedding_model.embed_documents(outlier_docs) + similarity = cosine_similarity(outlier_embeddings, self.topic_embeddings_[self._outliers:]) + + # Update topics + similarity[similarity < threshold] = 0 + outlier_topics = iter([np.argmax(sim) if sum(sim) > 0 else -1 for sim in similarity]) + new_topics = [topic if topic != -1 else next(outlier_topics) for topic in topics] + + return new_topics + def visualize_topics(self, topics: List[int] = None, top_n_topics: int = None, @@ -1951,6 +2426,57 @@ def visualize_distribution(self, width=width, height=height) + def visualize_approximate_distribution(self, + document: str, + topic_token_distribution: np.ndarray, + normalize: bool = False): + """ Visualize the topic 
distribution calculated by `.approximate_distribution`
+        on a token level, thereby indicating the extent to which a certain word or phrase belongs
+        to a specific topic. The assumption here is that a single word can belong to multiple
+        similar topics and as such gives information about the broader set of topics within
+        a single document.
+
+        Arguments:
+            document: The document for which you want to visualize
+                      the approximated topic distribution.
+            topic_token_distribution: The topic-token distribution of the document as
+                                      extracted by `.approximate_distribution`
+            normalize: Whether to normalize, between 0 and 1 (summing to 1), the
+                       topic distribution values.
+
+        Returns:
+            df: A stylized dataframe indicating the best fitting topics
+                for each token.
+
+        Examples:
+
+        ```python
+        # Calculate the topic distributions on a token level
+        # Note that we need to have `calculate_tokens=True`
+        topic_distr, topic_token_distr = topic_model.approximate_distribution(
+                docs, calculate_tokens=True
+        )
+
+        # Visualize the approximated topic distributions
+        df = topic_model.visualize_approximate_distribution(docs[0], topic_token_distr[0])
+        df
+        ```
+
+        To revert this stylized dataframe back to a regular dataframe,
+        you can run the following:
+
+        ```python
+        df.data.columns = [column.strip() for column in df.data.columns]
+        df = df.data
+        ```
+        """
+        check_is_fitted(self)
+        return plotting.visualize_approximate_distribution(self,
+                                                           document=document,
+                                                           topic_token_distribution=topic_token_distribution,
+                                                           normalize=normalize)
+
     def visualize_hierarchy(self,
                             orientation: str = "left",
                             topics: List[int] = None,
@@ -2106,7 +2632,7 @@ def visualize_barchart(self,
             topics: A selection of topics to visualize.
             top_n_topics: Only select the top n most frequent topics.
             n_words: Number of words to show in a topic
-            custom_labels: Whether to use custom topic labels that were defined using 
+            custom_labels: Whether to use custom topic labels that were defined using
                            `topic_model.set_topic_labels`.
             title: Title of the plot.
             width: The width of each figure.
@@ -2146,6 +2672,11 @@ def save(self,
              save_embedding_model: bool = True) -> None:
         """ Saves the model to the specified path

+        When saving the model, make sure to also keep track of the versions
+        of dependencies and Python used. Loading and saving the model should
+        be done using the same dependencies and Python. Moreover, models
+        saved in one version of BERTopic should not be loaded in other versions.
+ Arguments: path: the location and name of the file you want to save save_embedding_model: Whether to save the embedding model in this class @@ -2306,8 +2837,9 @@ def _reduce_dimensionality(self, def _cluster_embeddings(self, umap_embeddings: np.ndarray, documents: pd.DataFrame, - partial_fit: bool = False) -> Tuple[pd.DataFrame, - np.ndarray]: + partial_fit: bool = False, + y: np.ndarray = None) -> Tuple[pd.DataFrame, + np.ndarray]: """ Cluster UMAP embeddings with HDBSCAN Arguments: @@ -2326,8 +2858,15 @@ def _cluster_embeddings(self, documents['Topic'] = labels self.topics_ = labels else: - self.hdbscan_model.fit(umap_embeddings) - labels = self.hdbscan_model.labels_ + try: + self.hdbscan_model.fit(umap_embeddings, y=y) + except TypeError: + self.hdbscan_model.fit(umap_embeddings) + + try: + labels = self.hdbscan_model.labels_ + except AttributeError: + labels = y documents['Topic'] = labels self._update_topic_size(documents) @@ -2336,14 +2875,17 @@ def _cluster_embeddings(self, # track if there are outlier labels and act accordingly when slicing. self._outliers = 1 if -1 in set(labels) else 0 - # Save representative docs and calculate probabilities if it is a HDBSCAN model + # Save representative docs if isinstance(self.hdbscan_model, hdbscan.HDBSCAN): - probabilities = self.hdbscan_model.probabilities_ self._save_representative_docs(documents) - if self.calculate_probabilities: - probabilities = hdbscan.all_points_membership_vectors(self.hdbscan_model) - else: - probabilities = None + + # Extract probabilities + probabilities = None + if hasattr(self.hdbscan_model, "probabilities_"): + probabilities = self.hdbscan_model.probabilities_ + + if self.calculate_probabilities and is_supported_hdbscan(self.hdbscan_model): + probabilities = hdbscan_delegator(self.hdbscan_model, "all_points_membership_vectors") if not partial_fit: self.topic_mapper_ = TopicMapper(self.topics_) @@ -2417,30 +2959,52 @@ def _save_representative_docs(self, documents: pd.DataFrame): Arguments: documents: Dataframe with documents and their corresponding IDs """ - # Prepare the condensed tree and luf clusters beneath a given cluster - condensed_tree = self.hdbscan_model.condensed_tree_ - raw_tree = condensed_tree._raw_tree - clusters = sorted(condensed_tree._select_clusters()) - cluster_tree = raw_tree[raw_tree['child_size'] > 1] - - # Find the points with maximum lambda value in each leaf - representative_docs = {} - for topic in documents['Topic'].unique(): - if topic != -1: - leaves = hdbscan.plots._recurse_leaf_dfs(cluster_tree, clusters[topic]) - - result = np.array([]) - for leaf in leaves: - max_lambda = raw_tree['lambda_val'][raw_tree['parent'] == leaf].max() - points = raw_tree['child'][(raw_tree['parent'] == leaf) & (raw_tree['lambda_val'] == max_lambda)] - result = np.hstack((result, points)) - - representative_docs[topic] = list(np.random.choice(result, 3, replace=False).astype(int)) - - # Convert indices to documents - self.representative_docs_ = {topic: [documents.iloc[doc_id].Document for doc_id in doc_ids] - for topic, doc_ids in - representative_docs.items()} + smallest_cluster_size = min(self.topic_sizes_.items(), key=lambda x: x[1])[1] + if smallest_cluster_size < 3: + top_n_representative_docs = smallest_cluster_size + else: + top_n_representative_docs = 3 + + if isinstance(self.hdbscan_model, hdbscan.HDBSCAN): + # Prepare the condensed tree and luf clusters beneath a given cluster + condensed_tree = self.hdbscan_model.condensed_tree_ + raw_tree = condensed_tree._raw_tree + clusters = 
sorted(condensed_tree._select_clusters()) + cluster_tree = raw_tree[raw_tree['child_size'] > 1] + + # Find the points with maximum lambda value in each leaf + representative_docs = {} + for topic in documents['Topic'].unique(): + if topic != -1: + leaves = hdbscan.plots._recurse_leaf_dfs(cluster_tree, clusters[topic]) + + result = np.array([]) + for leaf in leaves: + max_lambda = raw_tree['lambda_val'][raw_tree['parent'] == leaf].max() + points = raw_tree['child'][(raw_tree['parent'] == leaf) & (raw_tree['lambda_val'] == max_lambda)] + result = np.hstack((result, points)) + + representative_docs[topic] = list(np.random.choice(result, top_n_representative_docs, replace=False).astype(int)) + + # Convert indices to documents + self.representative_docs_ = {topic: [documents.iloc[doc_id].Document for doc_id in doc_ids] + for topic, doc_ids in + representative_docs.items()} + else: + documents_per_topic = documents.groupby('Topic').sample(n=500, replace=True, random_state=42).drop_duplicates() + self.representative_docs_ = {} + for topic in documents['Topic'].unique(): + + # Calculate similarity + selected_docs = documents_per_topic.loc[documents_per_topic.Topic == topic, "Document"].values + bow = self.vectorizer_model.transform(selected_docs) + ctfidf = self.ctfidf_model.transform(bow) + sim_matrix = cosine_similarity(ctfidf, self.c_tf_idf_[topic + self._outliers]) + + # Extract top 3 most representative documents + indices = np.argpartition(sim_matrix.reshape(1, -1)[0], + -top_n_representative_docs)[-top_n_representative_docs:] + self.representative_docs_[topic] = [selected_docs[index] for index in indices] def _map_representative_docs(self, original_topics: bool = False): """ Map the representative docs per topic to the correct topics @@ -2455,19 +3019,21 @@ def _map_representative_docs(self, original_topics: bool = False): original topics to the most recent topics or from the second-most recent topics. 
""" - if isinstance(self.hdbscan_model, hdbscan.HDBSCAN): - mappings = self.topic_mapper_.get_mappings(original_topics) + mappings = self.topic_mapper_.get_mappings(original_topics) + if self.representative_docs_ is not None: representative_docs = self.representative_docs_.copy() + else: + representative_docs = {} - # Update the representative documents - updated_representative_docs = {mappings[old_topic]: [] - for old_topic, _ in representative_docs.items()} - for old_topic, docs in representative_docs.items(): - new_topic = mappings[old_topic] - updated_representative_docs[new_topic].extend(docs) + # Update the representative documents + updated_representative_docs = {mappings[old_topic]: [] + for old_topic, _ in representative_docs.items()} + for old_topic, docs in representative_docs.items(): + new_topic = mappings[old_topic] + updated_representative_docs[new_topic].extend(docs) - self.representative_docs_ = updated_representative_docs - self.representative_docs_.pop(-1, None) + self.representative_docs_ = updated_representative_docs + self.representative_docs_.pop(-1, None) def _create_topic_vectors(self): """ Creates embeddings per topics based on their topic representation @@ -2481,7 +3047,7 @@ def _create_topic_vectors(self): a sentence-transformer model to be used or there are custom embeddings but it is allowed to use a different multi-lingual sentence-transformer model """ - if self.embedding_model is not None: + if self.embedding_model is not None and type(self.embedding_model) is not BaseEmbedder: topic_list = list(self.topic_representations_.keys()) topic_list.sort() n = self.top_n_words diff --git a/bertopic/_utils.py b/bertopic/_utils.py index afe90b62..65a0890e 100644 --- a/bertopic/_utils.py +++ b/bertopic/_utils.py @@ -1,7 +1,7 @@ import numpy as np import logging from collections.abc import Iterable -from scipy.sparse.csr import csr_matrix +from scipy.sparse import csr_matrix class MyLogger: diff --git a/bertopic/backend/_gensim.py b/bertopic/backend/_gensim.py index ca5c5a98..7ceb603d 100644 --- a/bertopic/backend/_gensim.py +++ b/bertopic/backend/_gensim.py @@ -48,19 +48,19 @@ def embed(self, Document/words embeddings with shape (n, m) with `n` documents/words that each have an embeddings size of `m` """ + vector_shape = self.embedding_model.get_vector(list(self.embedding_model.index_to_key)[0]).shape[0] + empty_vector = np.zeros(vector_shape) + + # Extract word embeddings and pool to document-level embeddings = [] for doc in tqdm(documents, disable=not verbose, position=0, leave=True): - # Extract word embeddings and pool to document-level - embeddings.append( - np.mean( - [ - self.embedding_model.get_vector(word) - for word in doc.split() - if word in self.embedding_model.key_to_index - ], - axis=0, - ) - ) + embedding = [self.embedding_model.get_vector(word) for word in doc.split() + if word in self.embedding_model.key_to_index] + + if len(embedding) > 0: + embeddings.append(np.mean(embedding, axis=0)) + else: + embeddings.append(empty_vector) embeddings = np.array(embeddings) return embeddings diff --git a/bertopic/backend/_spacy.py b/bertopic/backend/_spacy.py index 2fd0a3e3..900109a7 100644 --- a/bertopic/backend/_spacy.py +++ b/bertopic/backend/_spacy.py @@ -75,7 +75,7 @@ def embed(self, """ # Handle empty documents, spaCy models automatically map # empty strings to the zero vector - empty_document = "" + empty_document = " " # Extract embeddings from a transformer model if "transformer" in self.embedding_model.component_names: @@ -91,4 +91,4 @@ def embed(self, 
embeddings.append(self.embedding_model(doc or empty_document).vector) embeddings = np.array(embeddings) - return embeddings + return embeddings \ No newline at end of file diff --git a/bertopic/backend/_utils.py b/bertopic/backend/_utils.py index a48c1cc2..9268a975 100644 --- a/bertopic/backend/_utils.py +++ b/bertopic/backend/_utils.py @@ -1,10 +1,13 @@ from ._base import BaseEmbedder -from ._sentencetransformers import SentenceTransformerBackend -from ._hftransformers import HFTransformerBackend -from ._sklearn import SklearnEmbedder -from transformers.pipelines import Pipeline + +# Imports for light-weight variant of BERTopic +from bertopic.backend._sklearn import SklearnEmbedder +from sklearn.pipeline import make_pipeline +from sklearn.decomposition import TruncatedSVD +from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.pipeline import Pipeline as ScikitPipeline + languages = ['afrikaans', 'albanian', 'amharic', 'arabic', 'armenian', 'assamese', 'azerbaijani', 'basque', 'belarusian', 'bengali', 'bengali romanize', 'bosnian', 'breton', 'bulgarian', 'burmese', 'burmese zawgyi font', 'catalan', @@ -35,6 +38,7 @@ def select_backend(embedding_model, if isinstance(embedding_model, BaseEmbedder): return embedding_model + # Scikit-learn backend if isinstance(embedding_model, ScikitPipeline): return SklearnEmbedder(embedding_model) @@ -59,27 +63,33 @@ def select_backend(embedding_model, return USEBackend(embedding_model) # Sentence Transformer embeddings - if "sentence_transformers" in str(type(embedding_model)): - return SentenceTransformerBackend(embedding_model) - - # Create a Sentence Transformer model based on a string - if isinstance(embedding_model, str): + if "sentence_transformers" in str(type(embedding_model)) or isinstance(embedding_model, str): + from ._sentencetransformers import SentenceTransformerBackend return SentenceTransformerBackend(embedding_model) # Hugging Face embeddings - if isinstance(embedding_model, Pipeline): + if "transformers" and "pipeline" in str(type(embedding_model)): + from ._hftransformers import HFTransformerBackend return HFTransformerBackend(embedding_model) # Select embedding model based on language if language: - if language.lower() in ["English", "english", "en"]: - return SentenceTransformerBackend("all-MiniLM-L6-v2") - elif language.lower() in languages or language == "multilingual": - return SentenceTransformerBackend("paraphrase-multilingual-MiniLM-L12-v2") - else: - raise ValueError(f"{language} is currently not supported. However, you can " - f"create any embeddings yourself and pass it through fit_transform(docs, embeddings)\n" - "Else, please select a language from the following list:\n" - f"{languages}") + try: + from ._sentencetransformers import SentenceTransformerBackend + if language.lower() in ["English", "english", "en"]: + return SentenceTransformerBackend("all-MiniLM-L6-v2") + elif language.lower() in languages or language == "multilingual": + return SentenceTransformerBackend("paraphrase-multilingual-MiniLM-L12-v2") + else: + raise ValueError(f"{language} is currently not supported. 
However, you can " + f"create any embeddings yourself and pass it through fit_transform(docs, embeddings)\n" + "Else, please select a language from the following list:\n" + f"{languages}") + + # Only for light-weight installation + except ModuleNotFoundError: + pipe = make_pipeline(TfidfVectorizer(), TruncatedSVD(100)) + return SklearnEmbedder(pipe) + from ._sentencetransformers import SentenceTransformerBackend return SentenceTransformerBackend("all-MiniLM-L6-v2") diff --git a/bertopic/cluster/__init__.py b/bertopic/cluster/__init__.py new file mode 100644 index 00000000..afe75878 --- /dev/null +++ b/bertopic/cluster/__init__.py @@ -0,0 +1,5 @@ +from ._base import BaseCluster + +__all__ = [ + "BaseCluster", +] diff --git a/bertopic/cluster/_base.py b/bertopic/cluster/_base.py new file mode 100644 index 00000000..dc8412f0 --- /dev/null +++ b/bertopic/cluster/_base.py @@ -0,0 +1,41 @@ +import numpy as np + + +class BaseCluster: + """ The Base Cluster class + + Using this class directly in BERTopic will make it skip + over the cluster step. As a result, topics need to be passed + to BERTopic in the form of its `y` parameter in order to create + topic representations. + + Examples: + + This will skip over the cluster step in BERTopic: + + ```python + from bertopic import BERTopic + from bertopic.dimensionality import BaseCluster + + empty_cluster_model = BaseCluster() + + topic_model = BERTopic(hdbscan_model=empty_cluster_model) + ``` + + Then, this class can be used to perform manual topic modeling. + That is, topic modeling on a topics that were already generated before + without the need to learn them: + + ```python + topic_model.fit(docs, y=y) + ``` + """ + def fit(self, X, y=None): + if y is not None: + self.labels_ = y + else: + self.labels_ = None + return self + + def transform(self, X: np.ndarray) -> np.ndarray: + return X diff --git a/bertopic/cluster/_utils.py b/bertopic/cluster/_utils.py new file mode 100644 index 00000000..5a987346 --- /dev/null +++ b/bertopic/cluster/_utils.py @@ -0,0 +1,48 @@ +import hdbscan +import numpy as np + + +def hdbscan_delegator(model, func: str, embeddings: np.ndarray = None): + """ Function used to select the HDBSCAN-like model for generating + predictions and probabilities. + + Arguments: + model: The cluster model. + func: The function to use. 
Options: + - "approximate_predict" + - "all_points_membership_vectors" + embeddings: Input embeddings for "approximate_predict" + """ + + # Approximate predict + if func == "approximate_predict": + if isinstance(model, hdbscan.HDBSCAN): + predictions, probabilities = hdbscan.approximate_predict(model, embeddings) + return predictions, probabilities + elif "cuml" and "hdbscan" in str(type(model)).lower(): + from cuml.cluster import hdbscan as cuml_hdbscan + predictions, probabilities = cuml_hdbscan.approximate_predict(model, embeddings) + return predictions, probabilities + else: + predictions = model.predict(embeddings) + return predictions, None + + # All points membership + if func == "all_points_membership_vectors": + if isinstance(model, hdbscan.HDBSCAN): + return hdbscan.all_points_membership_vectors(model) + elif "cuml" and "hdbscan" in str(type(model)).lower(): + from cuml.cluster import hdbscan as cuml_hdbscan + return cuml_hdbscan.all_points_membership_vectors(model) + else: + return None + + +def is_supported_hdbscan(model): + """ Check whether the input model is a supported HDBSCAN-like model """ + if isinstance(model, hdbscan.HDBSCAN): + return True + elif "cuml" and "hdbscan" in str(type(model)).lower(): + return True + else: + return False diff --git a/bertopic/dimensionality/__init__.py b/bertopic/dimensionality/__init__.py new file mode 100644 index 00000000..ca349f40 --- /dev/null +++ b/bertopic/dimensionality/__init__.py @@ -0,0 +1,5 @@ +from ._base import BaseDimensionalityReduction + +__all__ = [ + "BaseDimensionalityReduction", +] diff --git a/bertopic/dimensionality/_base.py b/bertopic/dimensionality/_base.py new file mode 100644 index 00000000..7b39c3b4 --- /dev/null +++ b/bertopic/dimensionality/_base.py @@ -0,0 +1,26 @@ +import numpy as np + + +class BaseDimensionalityReduction: + """ The Base Dimensionality Reduction class + + You can use this to skip over the dimensionality reduction step in BERTopic. 
+ + Examples: + + This will skip over the reduction step in BERTopic: + + ```python + from bertopic import BERTopic + from bertopic.dimensionality import BaseDimensionalityReduction + + empty_reduction_model = BaseDimensionalityReduction() + + topic_model = BERTopic(umap_model=empty_reduction_model) + ``` + """ + def fit(self, X: np.ndarray = None): + return self + + def transform(self, X: np.ndarray) -> np.ndarray: + return X diff --git a/bertopic/plotting/__init__.py b/bertopic/plotting/__init__.py index 3cc61cc5..b6702901 100644 --- a/bertopic/plotting/__init__.py +++ b/bertopic/plotting/__init__.py @@ -8,6 +8,7 @@ from ._topics_over_time import visualize_topics_over_time from ._topics_per_class import visualize_topics_per_class from ._hierarchical_documents import visualize_hierarchical_documents +from ._approximate_distribution import visualize_approximate_distribution __all__ = [ @@ -20,5 +21,6 @@ "visualize_distribution", "visualize_topics_over_time", "visualize_topics_per_class", - "visualize_hierarchical_documents" + "visualize_hierarchical_documents", + "visualize_approximate_distribution" ] diff --git a/bertopic/plotting/_approximate_distribution.py b/bertopic/plotting/_approximate_distribution.py new file mode 100644 index 00000000..5c7e0f5d --- /dev/null +++ b/bertopic/plotting/_approximate_distribution.py @@ -0,0 +1,99 @@ +import numpy as np +import pandas as pd + +try: + from pandas.io.formats.style import Styler + HAS_JINJA = True +except (ModuleNotFoundError, ImportError): + HAS_JINJA = False + + +def visualize_approximate_distribution(topic_model, + document: str, + topic_token_distribution: np.ndarray, + normalize: bool = False): + """ Visualize the topic distribution calculated by `.approximate_topic_distribution` + on a token level. Thereby indicating the extend to which a certain word or phrases belong + to a specific topic. The assumption here is that a single word can belong to multiple + similar topics and as such give information about the broader set of topics within + a single document. + + NOTE: + This fuction will return a stylized pandas dataframe if Jinja2 is installed. If not, + it will only return a pandas dataframe without color highlighting. To install jinja: + + `pip install jinja2` + + Arguments: + topic_model: A fitted BERTopic instance. + document: The document for which you want to visualize + the approximated topic distribution. + topic_token_distribution: The topic-token distribution of the document as + extracted by `.approximate_topic_distribution` + normalize: Whether to normalize, between 0 and 1 (summing to 1), the + topic distribution values. + + Returns: + df: A stylized dataframe indicating the best fitting topics + for each token. 
+ + Examples: + + ```python + # Calculate the topic distributions on a token level + # Note that we need to have `calculate_token_level=True` + topic_distr, topic_token_distr = topic_model.approximate_distribution( + docs, calculate_token_level=True + ) + + # Visualize the approximated topic distributions + df = topic_model.visualize_approximate_distribution(docs[0], topic_token_distr[0]) + df + ``` + + To revert this stylized dataframe back to a regular dataframe, + you can run the following: + + ```python + df.data.columns = [column.strip() for column in df.data.columns] + df = df.data + ``` + """ + # Tokenize document + analyzer = topic_model.vectorizer_model.build_tokenizer() + tokens = analyzer(document) + + if len(tokens) == 0: + raise ValueError("Make sure that your document contains at least 1 token.") + + # Prepare dataframe with results + if normalize: + df = pd.DataFrame(topic_token_distribution / topic_token_distribution.sum()).T + else: + df = pd.DataFrame(topic_token_distribution).T + + df.columns = [f"{token}_{i}" for i, token in enumerate(tokens)] + df.columns = [f"{token}{' '*i}" for i, token in enumerate(tokens)] + df.index = list(topic_model.topic_labels_.values())[topic_model._outliers:] + df = df.loc[(df.sum(axis=1) != 0), :] + + # Style the resulting dataframe + def text_color(val): + color = 'white' if val == 0 else 'black' + return 'color: %s' % color + + def highligh_color(data, color='white'): + attr = 'background-color: {}'.format(color) + return pd.DataFrame(np.where(data == 0, attr, ''), index=data.index, columns=data.columns) + + if len(df) == 0: + return df + elif HAS_JINJA: + df = ( + df.style + .format("{:.3f}") + .background_gradient(cmap='Blues', axis=None) + .applymap(lambda x: text_color(x)) + .apply(highligh_color, axis=None) + ) + return df diff --git a/bertopic/plotting/_distribution.py b/bertopic/plotting/_distribution.py index f9809bbc..f13172a9 100644 --- a/bertopic/plotting/_distribution.py +++ b/bertopic/plotting/_distribution.py @@ -44,9 +44,6 @@ def visualize_distribution(topic_model, if len(probabilities[probabilities > min_probability]) == 0: raise ValueError("There are no values where `min_probability` is higher than the " "probabilities that were supplied. Lower `min_probability` to prevent this error.") - if not topic_model.calculate_probabilities: - raise ValueError("This visualization cannot be used if you have set `calculate_probabilities` to False " - "as it uses the topic probabilities. 
") # Get values and indices equal or exceed the minimum probability labels_idx = np.argwhere(probabilities >= min_probability).flatten() diff --git a/bertopic/plotting/_heatmap.py b/bertopic/plotting/_heatmap.py index b4358f48..545d24cb 100644 --- a/bertopic/plotting/_heatmap.py +++ b/bertopic/plotting/_heatmap.py @@ -54,9 +54,9 @@ def visualize_heatmap(topic_model, # Select topic embeddings if topic_model.topic_embeddings_ is not None: - embeddings = np.array(topic_model.topic_embeddings_) + embeddings = np.array(topic_model.topic_embeddings_)[topic_model._outliers:] else: - embeddings = topic_model.c_tf_idf_ + embeddings = topic_model.c_tf_idf_[topic_model._outliers:] # Select topics based on top_n and topics args freq_df = topic_model.get_topic_freq() @@ -69,13 +69,13 @@ def visualize_heatmap(topic_model, topics = sorted(freq_df.Topic.to_list()) # Order heatmap by similar clusters of topics + sorted_topics = topics if n_clusters: if n_clusters >= len(set(topics)): raise ValueError("Make sure to set `n_clusters` lower than " "the total number of unique topics.") - embeddings = embeddings[[topic + topic_model._outliers for topic in topics]] - distance_matrix = cosine_similarity(embeddings) + distance_matrix = cosine_similarity(embeddings[topics]) Z = linkage(distance_matrix, 'ward') clusters = fcluster(Z, t=n_clusters, criterion='maxclust') @@ -85,8 +85,6 @@ def visualize_heatmap(topic_model, mapping[cluster].append(topic) mapping = [cluster for cluster in mapping.values()] sorted_topics = [topic for cluster in mapping for topic in cluster] - else: - sorted_topics = topics # Select embeddings indices = np.array([topics.index(topic) for topic in sorted_topics]) diff --git a/bertopic/plotting/_hierarchy.py b/bertopic/plotting/_hierarchy.py index 237ae678..364c582c 100644 --- a/bertopic/plotting/_hierarchy.py +++ b/bertopic/plotting/_hierarchy.py @@ -3,6 +3,7 @@ from typing import Callable, List from scipy.sparse import csr_matrix from scipy.cluster import hierarchy as sch +from scipy.spatial.distance import squareform from sklearn.metrics.pairwise import cosine_similarity import plotly.graph_objects as go @@ -225,8 +226,14 @@ def _get_annotations(topic_model, """ df = hierarchical_topics.loc[hierarchical_topics.Parent_Name != "Top", :] - # Calculate linkage + # Calculate distance X = distance_function(embeddings) + + # Make sure it is the 1-D condensed distance matrix with zeros on the diagonal + np.fill_diagonal(X, 0) + X = squareform(X) + + # Calculate linkage and generate dendrogram Z = linkage_function(X) P = sch.dendrogram(Z, orientation=orientation, no_plot=True) diff --git a/docs/algorithm/algorithm.md b/docs/algorithm/algorithm.md index 1f5c49d7..45594f88 100644 --- a/docs/algorithm/algorithm.md +++ b/docs/algorithm/algorithm.md @@ -5,173 +5,21 @@ hide: # The Algorithm -Below, you will find different types of overviews of each step in BERTopic's main algorithm. Each successive overview will be more in-depth than the previous overview. The aim of this approach is to make the underlying algorithm as intuitive as possible for a wide range of users. +Below, you will find different types of overviews of each step in BERTopic's main algorithm. Each successive overview will be more in-depth than the previous overview. This approach aims to make the underlying algorithm as intuitive as possible for a wide range of users. 
## **Visual Overview** -This visual overview reduces BERTopic to four main steps, namely the embedding of documents, the clustering of documents, the topic extraction, and the topic diversification. Each step contains one or more sub-steps that you can read a bit more about below. -

-(Removed HTML overview figure; its four panels described the following steps.)
-Embed documents: We start by converting our documents to vector representations through the use of language models (SBERT, 🤗 Transformers, spaCy).
-Cluster embeddings: The vector representations are reduced in dimensionality (UMAP, PCA, Truncated SVD) so that clustering algorithms (HDBSCAN, k-Means, BIRCH) have an easier time finding semantically similar documents.
-Topic Representation: We tokenize each topic into a bag-of-words representation (CountVectorizer) and calculate words that are interesting to each topic with a class-based TF-IDF procedure called c-TF-IDF (optionally BM-25 weighted).
-(Optional) Topic Diversity: Maximal Marginal Relevance (MMR) is used to diversify words in each topic, which removes repeating and similar words.
- - +BERTopic can be viewed as a sequence of steps to create its topic representations. There are five steps to this process: + + + +Although these steps are the default, there is some modularity to BERTopic. Each step in this process was carefully selected such that they are all somewhat independent from one another. For example, the tokenization step is not directly influenced by the embedding model that was used to convert the documents which allow us to be creative in how we perform the tokenization step. + +This effect is especially strong in the clustering step. Models like HDBSCAN assume that clusters can have different shapes and forms. As a result, using a centroid-based technique to model the topic representations would not be beneficial since the centroid is not always representative of these types of clusters. A bag-of-words representation, however, makes very few assumptions concerning the shape and form of a cluster. + +As a result, BERTopic is quite modular and can maintain its quality of topic generation throughout a variety of sub-models. In other words, BERTopic essentially allows you to **build your own topic model**: + + ## **Code Overview** After going through the visual overview, this code overview demonstrates the algorithm using BERTopic. An advantage of using BERTopic is each major step in its algorithm can be explicitly defined, thereby making the process not only transparent but also more intuitive. @@ -218,13 +66,13 @@ This overview describes each step in more detail such that you can get an intuit ### **1. Embed documents** We start by converting our documents to numerical representations. Although there are many methods for doing so the default in BERTopic is [sentence-transformers](https://github.com/UKPLab/sentence-transformers). These models are often optimized for semantic similarity which helps tremendously in our clustering task. Moreover, they are great for creating either document- or sentence-embeddings.
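For example, a specific sentence-transformers model can be passed to BERTopic through the `embedding_model` parameter. A minimal sketch, assuming `sentence-transformers` is installed (the model name is only an illustration):

```python
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Load a sentence-transformers model and hand it to BERTopic
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
topic_model = BERTopic(embedding_model=embedding_model)
```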
-In BERTopic, you can choose any sentence-transformers model but there are two models that are set as defaults: +In BERTopic, you can choose any sentence-transformers model but two models are set as defaults: * `"all-MiniLM-L6-v2"` * `"paraphrase-multilingual-MiniLM-L12-v2"` -The first is an English language model trained specifically for semantic similarity tasks which work quite -well for most use-cases. The second model is very similar to the first with one major difference is that the +The first is an English language model trained specifically for semantic similarity tasks which works quite +well for most use cases. The second model is very similar to the first with one major difference being that the `multilingual` models work for 50+ languages. This model is quite a bit larger than the first and is only selected if you select any language other than English. @@ -245,7 +93,7 @@ After having created our numerical representations of the documents we have to r and customizing your model. ### **3. Cluster Documents** -After having reduced our embeddings, we can start clustering our data. For that, we leverage a density-based clustering technique, HDBSCAN. It can find clusters of different shapes and has the nice feature of identifying outliers where possible. As a result, we do not force documents in a cluster where they might note belong. This will improve the resulting topic representation as there is less noise to draw from. +After having reduced our embeddings, we can start clustering our data. For that, we leverage a density-based clustering technique, HDBSCAN. It can find clusters of different shapes and has the nice feature of identifying outliers where possible. As a result, we do not force documents into a cluster where they might not belong. This will improve the resulting topic representation as there is less noise to draw from. !!! tip Cluster models @@ -254,16 +102,16 @@ After having reduced our embeddings, we can start clustering our data. For that, and customizing your model. ### **4. Bag-of-words** -Before we can start creating the topic representation we first need to select a technique that allows for modularity in BERTopic's algorithm. When we use HDBSCAN as a cluster model, we may assume that our clusters having different degrees of density and different shapes. This means that a centroid-based topic representation technique might not be the best fitting model. In other words, we want a topic representation technique that makes little to no assumption on the expected structure of the clusters. +Before we can start creating the topic representation we first need to select a technique that allows for modularity in BERTopic's algorithm. When we use HDBSCAN as a cluster model, we may assume that our clusters have different degrees of density and different shapes. This means that a centroid-based topic representation technique might not be the best-fitting model. In other words, we want a topic representation technique that makes little to no assumption on the expected structure of the clusters.
-To do this, we first combine all documents in a cluster into a single document. That, very long, document then represents the cluster. Then, we can count how often each word appears in each cluster. This generates something called a bag-of-words representation in which resides the frequency of each word in each cluster. This bag-of-words representation is therefore on a cluster-level and not on a document-level. This distinction is important as we are interested in words on a topic-level (i.e., cluster-level). By using a bag-of-words representation, no assumption is made with respect to the structure of the clusters. Moreover, the bag-of-words representation is L1-normalized to account for clusters that have different sizes. +To do this, we first combine all documents in a cluster into a single document. That, very long, document then represents the cluster. Then, we can count how often each word appears in each cluster. This generates something called a bag-of-words representation in which the frequency of each word in each cluster can be found. This bag-of-words representation is therefore on a cluster level and not on a document level. This distinction is important as we are interested in words on a topic level (i.e., cluster level). By using a bag-of-words representation, no assumption is made concerning the structure of the clusters. Moreover, the bag-of-words representation is L1-normalized to account for clusters that have different sizes. !!! tip Bag-of-words and tokenization There are many ways you can tune or change the bag-of-words step. This step allows for processing the documents however you want without affecting the first step, embedding the documents. You can follow the guide [here](https://maartengr.github.io/BERTopic/getting_started/countvectorizer/countvectorizer.html) for more information about tokenization options in BERTopic. ### **5. Topic representation** -From the generated bag-of-words representation, we want to know what makes one cluster different from another? Which words are typical for cluster 1 and not so much for all other clusters? To solve this, we need to modify TF-IDF such that it considers topics (i.e., clusters) instead of documents. +From the generated bag-of-words representation, we want to know what makes one cluster different from another. Which words are typical for cluster 1 and not so much for all other clusters? To solve this, we need to modify TF-IDF such that it considers topics (i.e., clusters) instead of documents.
When you apply TF-IDF as usual on a set of documents, what you are doing is comparing the importance of words between documents. Now, what if we instead treat all documents in a single category (e.g., a cluster) as a single document and then apply TF-IDF? The result would be importance scores for words within a cluster. The more important a word is within a cluster, the more representative it is of that topic. In other words, if we extract the most important words per cluster, we get descriptions of **topics**! This model is called **class-based TF-IDF**:
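Written out as a formula (a reconstruction based on the description below, not a rendering of the original figure), the c-TF-IDF weight of word `x` in class `c` is:

```latex
W_{x,c} = \mathrm{tf}_{x,c} \cdot \log\left(1 + \frac{A}{f_x}\right)
```

where `tf_{x,c}` is the frequency of word `x` in class `c`, `f_x` is the frequency of word `x` across all classes, and `A` is the average number of words per class.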
Each cluster is converted to a single document instead of a set of documents. Then, we extract the frequency of word `x` in class `c`, where `c` refers to the cluster we created before. This results in our class-based `tf` representation. This representation is L1-normalized to account for the differences in topic sizes.

-Then, we take take the logarithm of one plus the average number of words per class `A` divided by the frequency of word `x` across all classes. We add plus one within the logarithm to force values to be positive. This results in our class-based `idf` representation. Like with the classic TF-IDF, we then multiply `tf` with `idf` to get the importance score per word in each class. In other words, the classical TF-IDF procedure is **not** used here but a modified version of the algorithm that allows for a much better representation. +Then, we take the logarithm of one plus the average number of words per class `A` divided by the frequency of word `x` across all classes. We add plus one within the logarithm to force values to be positive. This results in our class-based `idf` representation. Like with the classic TF-IDF, we then multiply `tf` with `idf` to get the importance score per word in each class. In other words, the classical TF-IDF procedure is **not** used here but a modified version of the algorithm that allows for a much better representation. !!! tip c-TF-IDF parameters diff --git a/docs/algorithm/default.svg b/docs/algorithm/default.svg new file mode 100644 index 00000000..206e10d8 --- /dev/null +++ b/docs/algorithm/default.svg @@ -0,0 +1,41 @@ + + + + + + +SBERT + + + + + +UMAP + + + + + +HDBSCAN + + + + + +CountVectorizer + + + + + +c-TF-IDF +Weighting scheme +Tokenizer +Clustering +Dimensionality Reduction +Embeddings + + + + + diff --git a/docs/algorithm/modularity.svg b/docs/algorithm/modularity.svg new file mode 100644 index 00000000..a8596c70 --- /dev/null +++ b/docs/algorithm/modularity.svg @@ -0,0 +1,228 @@ + + + + + + +SBERT + + + + + +SpaCy + + + + + +Transformers + + + + + + + + + + + + + + + + + + + + + + + + +UMAP + + + + + +PCA + + + + + +TruncatedSVD + + + + + + + + +HDBSCAN + + + + + +CountVectorizer + + + + + +Jieba + + + + + +POS + + + + + +k-Means + + + + + +BIRCH + + + + + + + + + + + + +Embeddings +Dimensionality reduction +Clustering +Tokenizer +Weighting scheme + + + + + +SpaCy + + + + + +PCA + + + + + +k-Means + + + + + +CountVectorizer + + + + + +c-TF-IDF + + + + + +c-TF-IDF + + + + + +c-TF-IDF + MMR + + + + + +c-TF-IDF + BM25 + + + + + +TF-IDF + + + + + +TruncatedSVD + + + + + +BIRCH + + + + + +CountVectorizer + + + + + +c-TF-IDF + MMR + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/api/cluster/base.md b/docs/api/cluster/base.md new file mode 100644 index 00000000..466bcc32 --- /dev/null +++ b/docs/api/cluster/base.md @@ -0,0 +1,3 @@ +# `BaseCluster` + +::: bertopic.cluster._base.BaseCluster diff --git a/docs/api/dimensionality/base.md b/docs/api/dimensionality/base.md new file mode 100644 index 00000000..060dea6b --- /dev/null +++ b/docs/api/dimensionality/base.md @@ -0,0 +1,3 @@ +# `BaseDimensionalityReduction` + +::: bertopic.dimensionality._base.BaseDimensionalityReduction diff --git a/docs/changelog.md b/docs/changelog.md index 7a075816..1e51fdcf 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -5,6 +5,183 @@ hide: # Changelog +## **Version 0.13.0** +*Release date: 4 January, 2023* + +

+**Highlights:**

+ +* Calculate [topic distributions](https://maartengr.github.io/BERTopic/getting_started/distribution/distribution.html) with `.approximate_distribution` regardless of the cluster model used + * Generates topic distributions on a document- and token-levels + * Can be used for any document regardless of its size! +* [Fully supervised BERTopic](https://maartengr.github.io/BERTopic/getting_started/supervised/supervised.html) + * You can now use a classification model for the clustering step instead to create a fully supervised topic model +* [Manual topic modeling](https://maartengr.github.io/BERTopic/getting_started/manual/manual.html) + * Generate topic representations from labels directly + * Allows for skipping the embedding and clustering steps in order to go directly to the topic representation step +* [Reduce outliers](https://maartengr.github.io/BERTopic/getting_started/outlier_reduction/outlier_reduction.html) with 4 different strategies using `.reduce_outliers` +* Install BERTopic without `SentenceTransformers` for a [lightweight package](https://maartengr.github.io/BERTopic/getting_started/tips_and_tricks/tips_and_tricks.html#lightweight-installation): + * `pip install --no-deps bertopic` + * `pip install --upgrade numpy hdbscan umap-learn pandas scikit-learn tqdm plotly pyyaml` +* Get meta data of trained documents such as topics and probabilities using `.get_document_info(docs)` +* Added more support for cuML's HDBSCAN + * Calculate and predict probabilities during `fit_transform` and `transform` respectively + * This should give a major speed-up when setting `calculate_probabilities=True` +* More images to the documentation and a lot of changes/updates/clarifications +* Get representative documents for non-HDBSCAN models by comparing document and topic c-TF-IDF representations +* Sklearn Pipeline [Embedder](https://maartengr.github.io/BERTopic/getting_started/embeddings/embeddings.html#scikit-learn-embeddings) by [@koaning](https://github.com/koaning) in [#791](https://github.com/MaartenGr/BERTopic/pull/791) + +

+**Fixes:**

+ +* Improve `.partial_fit` documentation ([#837](https://github.com/MaartenGr/BERTopic/issues/837)) +* Fixed scipy linkage usage ([#807](https://github.com/MaartenGr/BERTopic/issues/807)) +* Fixed shifted heatmap ([#782](https://github.com/MaartenGr/BERTopic/issues/782)) +* Fixed SpaCy backend ([#744](https://github.com/MaartenGr/BERTopic/issues/744)) +* Fixed representative docs with small clusters (<3) ([#703](https://github.com/MaartenGr/BERTopic/issues/703)) +* Typo fixed by [@timpal0l](https://github.com/timpal0l) in [#734](https://github.com/MaartenGr/BERTopic/pull/734) +* Typo fixed by [@srulikbd](https://github.com/timpal0l) in [#842](https://github.com/MaartenGr/BERTopic/pull/842) +* Correcting iframe urls by [@Mustapha-AJEGHRIR](https://github.com/Mustapha-AJEGHRIR) in [#798](https://github.com/MaartenGr/BERTopic/pull/798) +* Refactor embedding methods by [@zachschillaci27](https://github.com/zachschillaci27) in [#855](https://github.com/MaartenGr/BERTopic/pull/855) +* Added diversity parameter to update_topics() function by [@anubhabdaserrr](https://github.com/anubhabdaserrr) in [#887](https://github.com/MaartenGr/BERTopic/pull/887) + +

+**Documentation**

+ +Personally, I believe that documentation can be seen as a feature and is an often underestimated aspect of open-source. So I went a bit overboard😅... and created an animation about the three pillars of BERTopic using Manim. There are many other visualizations added, one of each variation of BERTopic, and many smaller changes. + + + +

+**Topic Distributions**

+ +The difficulty with a cluster-based topic modeling technique is that it does not directly consider that documents may contain multiple topics. With the new release, we can now model the distributions of topics! We even consider that a single word might be related to multiple topics. If a document is a mixture of topics, what is preventing a single word to be the same? + +To do so, we approximate the distribution of topics in a document by calculating and summing the similarities of tokensets (achieved by applying a sliding window) with the topics: + +```python +# After fitting your model run the following for either your trained documents or even unseen documents +topic_distr, _ = topic_model.approximate_distribution(docs) +``` + +To calculate and visualize the topic distributions in a document on a token-level, we can run the following: + +```python +# We need to calculate the topic distributions on a token level +topic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True) + +# Create a visualization using a styled dataframe if Jinja2 is installed +df = topic_model.visualize_approximate_distribution(docs[0], topic_token_distr[0]); df +``` + +
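Since this release also drops the `calculate_probabilities` requirement from `visualize_distribution` (see the change to `bertopic/plotting/_distribution.py` above), the document-level distribution can presumably be plotted with the existing visualization as well; a minimal sketch:

```python
# Visualize the approximated topic distribution of a single document
topic_model.visualize_distribution(topic_distr[0])
```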

+**Supervised Topic Modeling**

+ +BERTopic now supports fully-supervised classification! Instead of using a clustering algorithm, like HDBSCAN, we can replace it with a classifier, like Logistic Regression: + +```python +from bertopic import BERTopic +from bertopic.dimensionality import BaseDimensionalityReduction +from sklearn.datasets import fetch_20newsgroups +from sklearn.linear_model import LogisticRegression + +# Get labeled data +data= fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')) +docs = data['data'] +y = data['target'] + +# Allows us to skip over the dimensionality reduction step +empty_dimensionality_model = BaseDimensionalityReduction() + +# Create a classifier to be used instead of the cluster model +clf= LogisticRegression() + +# Create a fully supervised BERTopic instance +topic_model= BERTopic( + umap_model=empty_dimensionality_model, + hdbscan_model=clf +) +topics, probs = topic_model.fit_transform(docs, y=y) +``` + +
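Because the `hdbscan_delegator` helper added in `bertopic/cluster/_utils.py` falls back to `model.predict` for non-HDBSCAN models, the fitted classifier should also be usable on unseen documents; a sketch, where `new_docs` is a placeholder list of raw documents:

```python
# Predict the topic of documents that were not seen during training
new_docs = ["A document that was not part of the training data."]
topics, _ = topic_model.transform(new_docs)
```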

+**Manual Topic Modeling**

+ +When you already have a bunch of labels and simply want to extract topic representations from them, you might not need to actually learn how those can be predicted. We can bypass the `embeddings -> dimensionality reduction -> clustering` steps and go straight to the c-TF-IDF representation of our labels: + +```python +from bertopic import BERTopic +from bertopic.backend import BaseEmbedder +from bertopic.cluster import BaseCluster +from bertopic.dimensionality import BaseDimensionalityReduction + +# Prepare our empty sub-models +empty_embedding_model = BaseEmbedder() +empty_dimensionality_model = BaseDimensionalityReduction() +empty_cluster_model = BaseCluster() + +# Fit BERTopic without actually performing any clustering +topic_model = BERTopic( + embedding_model=empty_embedding_model, + umap_model=empty_dimensionality_model, + hdbscan_model=empty_cluster_model, +) +topics, probs = topic_model.fit_transform(docs, y=y) +``` + +

+**Outlier Reduction**

+ +Outlier reduction is a frequently discussed topic in BERTopic as its default cluster model, HDBSCAN, has a tendency to generate many outliers. This often helps in the topic representation steps, as we do not consider documents that are less relevant, but you might still want to assign those outliers to actual topics. In the modular philosophy of BERTopic, keeping training times in mind, it is now possible to perform outlier reduction **after** having trained your topic model. This allows for ease of iteration and prevents having to train BERTopic many times to find the parameters you are searching for. There are 4 different strategies that you can use, so make sure to check out the [documentation](https://maartengr.github.io/BERTopic/getting_started/outlier_reduction/outlier_reduction.html)! + +Using it is rather straightforward: + +```python +new_topics = topic_model.reduce_outliers(docs, topics) +``` + +

+**Lightweight BERTopic**

+ +The default embedding model in BERTopic is one of the amazing sentence-transformers models, namely `"all-MiniLM-L6-v2"`. Although this model performs well out of the box, it typically needs a GPU to transform the documents into embeddings in a reasonable time. Moreover, the installation requires `pytorch` which often results in a rather large environment, memory-wise. + +Fortunately, it is possible to install BERTopic without `sentence-transformers` and use it as a lightweight solution instead. The installation can be done as follows: + +```bash +pip install --no-deps bertopic +pip install --upgrade numpy hdbscan umap-learn pandas scikit-learn tqdm plotly pyyaml +``` + +Then, we can use BERTopic without `sentence-transformers` as follows using a CPU-based embedding technique: + +```python +from sklearn.pipeline import make_pipeline +from sklearn.decomposition import TruncatedSVD +from sklearn.feature_extraction.text import TfidfVectorizer + +pipe = make_pipeline( + TfidfVectorizer(), + TruncatedSVD(100) +) + +topic_model = BERTopic(embedding_model=pipe) +``` + +As a result, the entire package and resulting model can be run quickly on the CPU and no GPU is necessary! + +

+**Document Information**

+ +Get information about the documents on which the topic was trained including the documents themselves, their respective topics, the name of each topic, the top n words of each topic, whether it is a representative document, and the probability of the clustering if the cluster model supports it. There are also options to include other metadata, such as the topic distributions or the x and y coordinates of the reduced embeddings that you can learn more about here. + +To get the document info, you will only need to pass the documents on which the topic model was trained: + + +```python +>>> topic_model.get_document_info(docs) + +Document Topic Name Top_n_words Probability ... +I am sure some bashers of Pens... 0 0_game_team_games_season game - team - games... 0.200010 ... +My brother is in the market for... -1 -1_can_your_will_any can - your - will... 0.420668 ... +Finally you said what you dream... -1 -1_can_your_will_any can - your - will... 0.807259 ... +Think! It is the SCSI card doing... 49 49_windows_drive_dos_file windows - drive - docs... 0.071746 ... +1) I have an old Jasmine drive... 49 49_windows_drive_dos_file windows - drive - docs... 0.038983 ... +``` + + + ## **Version 0.12.0** *Release date: 5 September, 2022* diff --git a/docs/faq.md b/docs/faq.md index 320ce017..ca45654b 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -6,9 +6,7 @@ hide: # Frequently Asked Questions ## **Why are the results not consistent between runs?** -Due to the stochastic nature of UMAP, the results from BERTopic might differ even if you run the same code -multiple times. Using custom embeddings allows you to try out BERTopic several times until you find the -topics that suit you best. You only need to generate the embeddings itself once and run BERTopic several times +Due to the stochastic nature of UMAP, the results from BERTopic might differ even if you run the same code multiple times. Using custom embeddings allows you to try out BERTopic several times until you find the topics that suit you best. You only need to generate the embeddings themselves once and run BERTopic several times with different parameters. If you want to reproduce the results, at the expense of [performance](https://umap-learn.readthedocs.io/en/latest/reproducibility.html), you can set a `random_state` in UMAP to prevent @@ -24,12 +22,9 @@ topic_model = BERTopic(umap_model=umap_model) ``` ## **Which embedding model should I choose?** -Unfortunately, there is not a definitive list of the best models for each language, this highly depends -on your data, the model, and your specific use-case. However, the default model in BERTopic -(`"all-MiniLM-L6-v2"`) works great for **English** documents. In contrast, for **multi-lingual** -documents or any other language, `"paraphrase-multilingual-MiniLM-L12-v2""` has shown great performance. +Unfortunately, there is not a definitive list of the best models for each language, this highly depends on your data, the model, and your specific use case. However, the default model in BERTopic (`"all-MiniLM-L6-v2"`) works great for **English** documents. In contrast, for **multi-lingual** documents or any other language, `"paraphrase-multilingual-MiniLM-L12-v2"` has shown great performance. -If you want to use a model that provides a higher quality, but takes more compute time, then I would advise using `all-mpnet-base-v2` and `paraphrase-multilingual-mpnet-base-v2` instead. 
+If you want to use a model that provides a higher quality, but takes more computing time, then I would advise using `all-mpnet-base-v2` and `paraphrase-multilingual-mpnet-base-v2` instead. **SentenceTransformers** [SentenceTransformers](https://www.sbert.net/docs/pretrained_models.html#sentence-embedding-models) work typically quite well @@ -37,15 +32,13 @@ and are the preferred models to use. They are great at generating document embed multi-lingual versions available. **🤗 transformers** -BERTopic allows you to use any 🤗 transformers model. These models are typically embeddings created on -a word/sentence level but can easily be pooled using Flair (see Guides/Embeddings). If you have a -specific language for which you want to generate embeddings, you can choose the model [here](https://huggingface.co/models). +BERTopic allows you to use any 🤗 transformers model. These models are typically embeddings created on a word/sentence level but can easily be pooled using Flair (see Guides/Embeddings). If you have a specific language for which you want to generate embeddings, you can choose the model [here](https://huggingface.co/models). ## **How do I reduce topic outliers?** -There are three ways in reducing outliers. +There are several ways we can reduce outliers. First, the amount of datapoint classified as outliers is handled by the `min_samples` parameters in HDBSCAN. This value is automatically set to the -same value of `min_cluster_size`. However, you can set it indepedently if you want to reduce the number of generated outliers. Lowering this value will +same value of `min_cluster_size`. However, you can set it independently if you want to reduce the number of generated outliers. Lowering this value will result in less noise being generated. ```python @@ -62,22 +55,18 @@ topics, probs = topic_model.fit_transform(docs) Although this will lower outliers found in the data, this might force outliers to be put into topics where they do not belong. So make sure to strike a balance between keeping noise and reducing outliers. -Second, after training our BERTopic model, we can assign outliers to topics. By setting `calculate_probabilities=True`, we calculate the probability -of a document belonging to any topic. That way, we can select, for each document, the topic with the the highest probability. Thus, although we do -generate an outlier class in our BERTopic model, we can assign documents to an actual topic. - -To do this, we can set a probability threshold and assign each document to a topic based on their probabilities: +Second, after training our BERTopic model, we can assign outliers to topics by making use of the `.reduce_outliers` function in BERTopic. An advantage of using this approach is that there are four built in strategies one can choose for reducing outliers. Moreover, this technique allows the user to experiment with reducing outliers across a number of strategies and parameters without actually having to re-train the topic model each time. You can learn more about the `.reduce_outlier` function [here](https://maartengr.github.io/BERTopic/getting_started/outlier_reduction/outlier_reduction.html). The following is a minimal example of how to use this function: ```python -import numpy as np -probability_threshold = 0.01 -new_topics = [np.argmax(prob) if max(prob) >= probability_threshold else -1 for prob in probs] -``` +from bertopic import BERTopic -!!! note "Note" - The topics assigned using the above method can result in topics different from using `.fit_transform()`. 
This is expected - behavior as HDBSCAN is merely trying to imitate soft clustering after fitting the model and it is not a core component - of assigning points to clusters. +# Train your BERTopic model +topic_model = BERTopic() +topics, probs = topic_model.fit_transform(docs) + +# Reduce outliers +new_topics = topic_model.reduce_outliers(docs, topics) +``` Third, we can replace HDBSCAN with any other clustering algorithm that we want. So we can choose a clustering algorithm, like k-Means, that does not produce any outliers at all. Using k-Means instead of HDBSCAN is straightforward: @@ -90,18 +79,58 @@ cluster_model = KMeans(n_clusters=50) topic_model = BERTopic(hdbscan_model=cluster_model) ``` + +## **How do I remove stop words?** +At times, stop words might end up in our topic representations. This is something we typically want to avoid as they contribute little to the interpretation of the topics. However, removing stop words as a preprocessing step is not advised as the transformer-based embedding models that we use need the full context to create accurate embeddings. + +Instead, we can use the `CountVectorizer` to preprocess our documents **after** having generated embeddings and clustered +our documents. I have found almost no disadvantages to using the `CountVectorizer` to remove stop words and +it is something I would strongly advise to try out: + +```python +from bertopic import BERTopic +from sklearn.feature_extraction.text import CountVectorizer + +vectorizer_model = CountVectorizer(stop_words="english") +topic_model = BERTopic(vectorizer_model=vectorizer_model) +``` + +We can also use the `ClassTfidfTransformer` to reduce the impact of frequent words. The result is very similar to explicitly removing stop words but this process does this automatically: + +```python +from bertopic import BERTopic +from bertopic.vectorizers import ClassTfidfTransformer + +ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) +topic_model = BERTopic(ctfidf_model=ctfidf_model) +``` + ## **How can I speed up BERTopic?** You can speed up BERTopic by either generating your embeddings beforehand or by -setting `calculate_probabilities` to False. Calculating the probabilities is quite expensive and can -significantly increase the computation time. Thus, only use it if you do not mind waiting a bit before -the model is done running or if you have less than 50_000 documents. +setting `calculate_probabilities` to False. Calculating the probabilities is quite expensive and can significantly increase the computation time. Thus, only use it if you do not mind waiting a bit before the model is done running or if you have less than a couple of hundred thousand documents. + +Also, make sure to use a GPU when extracting the sentence/document embeddings. Transformer models typically require a GPU and using only a CPU can slow down computation time quite a lot. However, if you do not have access to a GPU, looking into quantization might help. 
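A minimal sketch of the first suggestion, precomputing the embeddings once and reusing them across runs (assuming `docs` holds your documents and `sentence-transformers` is installed):

```python
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Embed the documents once, ideally on a GPU
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=False)

# Reuse the precomputed embeddings for every BERTopic run
topic_model = BERTopic(calculate_probabilities=False)
topics, _ = topic_model.fit_transform(docs, embeddings)
```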
+ +Lastly, it is also possible to speed up BERTopic with [cuML's](https://rapids.ai/start.html#rapids-release-selector) GPU acceleration of UMAP and HDBSCAN: + + +```python +from bertopic import BERTopic +from cuml.cluster import HDBSCAN +from cuml.manifold import UMAP + +# Create instances of GPU-accelerated UMAP and HDBSCAN +umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0) +hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True) + +# Pass the above models to be used in BERTopic +topic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model) +``` -Also, make sure to use a GPU when extracting the sentence/document embeddings. Transformer models -typically require a GPU and using only a CPU can slow down computation time quite a lot. -However, if you do not have access to a GPU, looking into quantization might help. ## **I am facing memory issues. Help!** -There are several ways to perform computation with large datasets. +There are several ways to perform computation with large datasets: + * First, you can set `low_memory` to True when instantiating BERTopic. This may prevent blowing up the memory in UMAP. @@ -124,8 +153,7 @@ parameter is used to indicate the minimum frequency of words. Setting this value * Fourth, you can use online topic modeling instead to use BERTopic on big data by training the model in chunks -If the problem persists, then this could be an issue related to your available memory. The processing of -millions of documents is quite computationally expensive and sufficient RAM is necessary. +If the problem persists, then this could be an issue related to your available memory. The processing of millions of documents is quite computationally expensive and sufficient RAM is necessary. ## **I have only a few topics, how do I increase them?** There are several reasons why your topic model may result in only a few topics: @@ -139,34 +167,27 @@ the minimum size of topics, then you are much more likely to increase the number You could also decrease the `n_neighbors` parameter used in `UMAP` if this does not work. * Third, although this does not happen very often, there simply aren't that many topics to be found -in your documents. You can often see this when you have many `-1` topics, which is actually not a topic +in your documents. You can often see this when you have many `-1` topics, which is not a topic but a category of outliers. ## **I have too many topics, how do I decrease them?** -If you have a large dataset, then it is possible to generate thousands of topics. Especially with large -datasets, there is a good chance they actually contain many small topics. In practice, you might want -a few hundred topics at most in order to interpret them nicely. +If you have a large dataset, then it is possible to generate thousands of topics. Especially with large datasets, there is a good chance they contain many small topics. In practice, you might want a few hundred topics at most to interpret them nicely. There are a few ways of increasing the number of generated topics: -* First, we can set the `min_topic_size` in the BERTopic initialization much higher (e.g., 300) -to make sure that those small clusters will not be generated. This is a HDBSCAN parameter that -specifies what the minimum number of documents are needed in a cluster. More documents in a cluster -means less topics will be generated. 
+* First, we can set the `min_topic_size` in the BERTopic initialization much higher (e.g., 300) to make sure that those small clusters will not be generated. This is an HDBSCAN parameter that specifies the minimum number of documents needed in a cluster. More documents in a cluster mean fewer topics will be generated. -* Second, you can create a custom UMAP model and set `n_neighbors` much higher than the default 15 (e.g., 200). -This also prevents those micro clusters to be generated as it will needs quite a number of neighboring -documents to create a cluster. +* Second, you can create a custom UMAP model and set `n_neighbors` much higher than the default 15 (e.g., 200). This also prevents those micro clusters to be generated as it will need many neighboring documents to create a cluster. * Third, we can set `nr_topics` to a value that seems logical to the user. Do note that topics are forced -to merge together which might result in a lower quality of topics. In practice, I would advise using -`nr_topic="auto"` as that will merge topics together that are very similar. Dissimilar topics will +to merge which might result in a lower quality of topics. In practice, I would advise using +`nr_topic="auto"` as that will merge topics that are very similar. Dissimilar topics will therefore remain separated. ## **How do I calculate the probabilities of all topics in a document?** Although it is possible to calculate all the probabilities, the process of doing so is quite computationally inefficient and might significantly increase the computation time. To prevent this, the probabilities are -not calculated as a default. In order to calculate, you will have to set `calculate_probabilities` to True: +not calculated as a default. To calculate them, you will have to set `calculate_probabilities` to True: ```python from bertopic import BERTopic @@ -174,11 +195,13 @@ topic_model = BERTopic(calculate_probabilities=True) topics, probs = topic_model.fit_transform(docs) ``` +!!! note + The `calculate_probabilties` parameter is only used when using HDBSCAN or cuML's HDBSCAN model. In other words, this will not work when using a model other than HDBSCAN. Instead, we can approximate the topic distributions across all documents with [`.approximate_distribution`](https://maartengr.github.io/BERTopic/getting_started/distribution/distribution.html). + ## **Numpy gives me an error when running BERTopic** -With the release of Numpy 1.20.0, there have been significant issues with using that version (and previous) due -to compilation issues and pypi. +With the release of Numpy 1.20.0, there have been significant issues with using that version (and previous ones) due to compilation issues and pypi. -This is a known issue with the order of install using pypi. You can find more details about this issue +This is a known issue with the order of installation using pypi. You can find more details about this issue [here](https://github.com/lmcinnes/umap/issues/567) and [here](https://github.com/scikit-learn-contrib/hdbscan/issues/457). 
I would suggest doing one of the following: @@ -230,8 +253,7 @@ topic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model) topics, probs = topic_model.fit_transform(docs) ``` -Depending on the embeddings you are using, you might want to normalize them first in order to -force a cosine-related distance metric in UMAP: +Depending on the embeddings you are using, you might want to normalize them first to force a cosine-related distance metric in UMAP: ```python from cuml.preprocessing import normalize @@ -240,7 +262,7 @@ embeddings = normalize(embeddings) ## **How can I use BERTopic with Chinese documents?** Currently, CountVectorizer tokenizes text by splitting whitespace which does not work for Chinese. -In order to get it to work, you will have to create a custom `CountVectorizer` with `jieba`: +To get it to work, you will have to create a custom `CountVectorizer` with `jieba`: ```python from sklearn.feature_extraction.text import CountVectorizer @@ -272,7 +294,7 @@ issue can be found [here](https://github.com/lmcinnes/umap/issues/631). ## **Should I preprocess the data?** No. By using document embeddings there is typically no need to preprocess the data as all parts of a document -are important in understanding the general topic of the document. Although this holds true in 99% of cases, if you +are important in understanding the general topic of the document. Although this holds in 99% of cases, if you have data that contains a lot of noise, for example, HTML-tags, then it would be best to remove them. HTML-tags typically do not contribute to the meaning of a document and should therefore be removed. However, if you apply topic modeling to HTML-code to extract topics of code, then it becomes important. \ No newline at end of file diff --git a/docs/getting_started/clustering/clustering.md b/docs/getting_started/clustering/clustering.md index ce8aed8a..7f847cdd 100644 --- a/docs/getting_started/clustering/clustering.md +++ b/docs/getting_started/clustering/clustering.md @@ -1,9 +1,14 @@ -After reducing the dimensionality of our input embeddings, we need to cluster them into groups of similar embeddings in order to extract our topics. +After reducing the dimensionality of our input embeddings, we need to cluster them into groups of similar embeddings to extract our topics. This process of clustering is quite important because the more performant our clustering technique the more accurate our topic representations are. -In BERTopic, we typically use HDBSCAN as it is quite capable of capturing structures with different densities. However, there is not perfect -clustering model and you might want to be using something entirely different for you use case. Moreover, what if a new state-of-the-art model -is released tomorrow? We would like to able to use that in BERTopic, right? +In BERTopic, we typically use HDBSCAN as it is quite capable of capturing structures with different densities. However, there is not one perfect +clustering model and you might want to be using something entirely different for your use case. Moreover, what if a new state-of-the-art model +is released tomorrow? We would like to be able to use that in BERTopic, right? Since BERTopic assumes some independence among steps, we can allow for this modularity: + +
+ ![Image title](clustering.svg) +
+
As a result, the `hdbscan_model` parameter in BERTopic now allows for a variety of clustering models. To do so, the class should have the following attributes: @@ -28,7 +33,7 @@ class ClusterModel: return X ``` -In this tutorial, I will show you how to use several clustering algorithms in BERTopic. +In this section, we will go through several examples of clustering algorithms and how they can be implemented. ## **HDBSCAN** @@ -43,12 +48,12 @@ hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selecti topic_model = BERTopic(hdbscan_model=hdbscan_model) ``` -Here, we can define any parameters in HDBSCAN to optimize for the best performance based on whatever validation metrics that you are using. +Here, we can define any parameters in HDBSCAN to optimize for the best performance based on whatever validation metrics you are using. ## **k-Means** Although HDBSCAN works quite well in BERTopic and is typically advised, you might want to be using k-Means instead. It allows you to select how many clusters you would like and forces every single point to be in a cluster. Therefore, no -outliers will be created. This has also has disadvantages. When you force every single point in a cluster, it will mean +outliers will be created. This also has disadvantages. When you force every single point in a cluster, it will mean that the cluster is highly likely to contain noise which can hurt the topic representations. As a small tip, using the `vectorizer_model=CountVectorizer(stop_words="english")` helps quite a bit to then improve the topic representation. @@ -64,7 +69,7 @@ topic_model = BERTopic(hdbscan_model=cluster_model) !!! note As you might have noticed, the `cluster_model` is passed to `hdbscan_model` which might be a bit confusing considering - you are not passing a HDBSCAN model. For now, the name of the parameter is kept the same to adhere to the current + you are not passing an HDBSCAN model. For now, the name of the parameter is kept the same to adhere to the current state of the API. Changing the name could lead to deprecation issues, which I want to prevent as much as possible. ## **Agglomerative Clustering** @@ -81,4 +86,38 @@ from sklearn.cluster import AgglomerativeClustering cluster_model = AgglomerativeClustering(n_clusters=50) topic_model = BERTopic(hdbscan_model=cluster_model) -``` \ No newline at end of file +``` + + +## **cuML HDBSCAN** + +Although the original HDBSCAN implementation is an amazing technique, it may have difficulty handling large amounts of data. Instead, +we can use [cuML](https://rapids.ai/start.html#rapids-release-selector) to speed up HDBSCAN through GPU acceleration: + +```python +from bertopic import BERTopic +from cuml.cluster import HDBSCAN + +hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True) +topic_model = BERTopic(hdbscan_model=hdbscan_model) +``` + +The great thing about using cuML's HDBSCAN implementation is that it supports many features of the original implementation. In other words, +`calculate_probabilities=True` also works! + +!!! note + As of the v0.13 release, it is not yet possible to calculate the topic-document probability matrix for unseen data (i.e., `.transform`) using cuML's HDBSCAN. + However, it is still possible to calculate the topic-document probability matrix for the data on which the model was trained (i.e., `.fit` and `.fit_transform`). + +!!! 
note + If you want to install cuML together with BERTopic using Google Colab, you can run the following code: + + ```bash + !pip install bertopic + !pip install cudf-cu11 dask-cudf-cu11 --extra-index-url=https://pypi.ngc.nvidia.com + !pip install cuml-cu11 --extra-index-url=https://pypi.ngc.nvidia.com + !pip install cugraph-cu11 --extra-index-url=https://pypi.ngc.nvidia.com + !pip uninstall cupy-cuda115 -y + !pip uninstall cupy-cuda11x -y + !pip install cupy-cuda11x -f https://pip.cupy.dev/aarch64 + ``` diff --git a/docs/getting_started/clustering/clustering.svg b/docs/getting_started/clustering/clustering.svg new file mode 100644 index 00000000..bf36ce61 --- /dev/null +++ b/docs/getting_started/clustering/clustering.svg @@ -0,0 +1,47 @@ + + + + + + +SBERT + + + + + +UMAP + + + + + +HDBSCAN + + + + + +CountVectorizer + + + + + +c-TF-IDF + + + + + +k-Means + + + + + +BIRCH + + + + diff --git a/docs/getting_started/ctfidf/ctfidf.md b/docs/getting_started/ctfidf/ctfidf.md index 9c3dada6..b14efdc6 100644 --- a/docs/getting_started/ctfidf/ctfidf.md +++ b/docs/getting_started/ctfidf/ctfidf.md @@ -1,13 +1,20 @@ # c-TF-IDF -In BERTopic, in order to get an accurate representation of the topics from our bag-of-words matrix, TF-IDF was adjusted to work on a cluster/categorical/topic-level instead of a document-level. This adjusted TF-IDF representation is called **c-TF-IDF** takes into account what makes the documents in once cluster different from documents in another cluster: +In BERTopic, in order to get an accurate representation of the topics from our bag-of-words matrix, TF-IDF was adjusted to work on a cluster/categorical/topic level instead of a document level. This adjusted TF-IDF representation is called **c-TF-IDF** and takes into account what makes the documents in one cluster different from documents in another cluster:
Each cluster is converted to a single document instead of a set of documents. Then, we extract the frequency of word `x` in class `c`, where `c` refers to the cluster we created before. This results in our class-based `tf` representation. This representation is L1-normalized to account for the differences in topic sizes.

-Then, we take take the logarithm of one plus the average number of words per class `A` divided by the frequency of word `x` across all classes. We add plus one within the logarithm to force values to be positive. This results in our class-based `idf` representation. Like with the classic TF-IDF, we then multiply `tf` with `idf` to get the importance score per word in each class. In other words, the classical TF-IDF procedure is **not** used here but a modified version of the algorithm that allows for a much better representation. +Then, we take the logarithm of one plus the average number of words per class `A` divided by the frequency of word `x` across all classes. We add plus one within the logarithm to force values to be positive. This results in our class-based `idf` representation. Like with the classic TF-IDF, we then multiply `tf` with `idf` to get the importance score per word in each class. In other words, the classical TF-IDF procedure is **not** used here but a modified version of the algorithm that allows for a much better representation. + +Since the topic representation is somewhat independent of the clustering step, we can change how the c-TF-IDF representation will look like. This can be in the form of parameter tuning, different weighting schemes, or using a diversity metric on top of it. This allows for some modularity concerning the weighting scheme: + +
+ ![Image title](ctfidf.svg) +
+
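To make the weighting above concrete, the following is a small NumPy sketch of the class-based `tf` and `idf` described on this page. It is an illustrative reconstruction of the formula, not the exact implementation that ships with BERTopic:

```python
import numpy as np

def class_tfidf(tf):
    """Sketch of c-TF-IDF: tf is a (n_classes, n_words) matrix of word counts per class."""
    A = tf.sum(axis=1).mean()                     # average number of words per class
    f = tf.sum(axis=0)                            # frequency of each word across all classes
    idf = np.log(1 + A / f)                       # class-based idf
    tf_norm = tf / tf.sum(axis=1, keepdims=True)  # L1-normalized class-based tf
    return tf_norm * idf

# Toy example: 2 classes and 3 words
tf = np.array([[10., 0., 2.],
               [1., 5., 4.]])
print(class_tfidf(tf))
```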
This class-based TF-IDF representation is enabled by default in BERTopic. However, we can explicitly pass it to BERTopic through the `ctfidf_model` allowing for parameter tuning and the customization of the topic extraction technique: @@ -28,7 +35,7 @@ There are two parameters worth exploring in the `ClassTfidfTransformer`, namely The `bm25_weighting` is a boolean parameter that indicates whether a class-based BM-25 weighting measure is used instead of the default method as defined in the formula at the beginning of this page. -Instead of the using the following weighting scheme: +Instead of using the following weighting scheme: @@ -56,7 +63,7 @@ Instead of the default term frequency: -we take the square root of the term frequency after applying normalizing the frequency matrix: +we take the square root of the term frequency after normalizing the frequency matrix: diff --git a/docs/getting_started/ctfidf/ctfidf.svg b/docs/getting_started/ctfidf/ctfidf.svg new file mode 100644 index 00000000..ef5299c3 --- /dev/null +++ b/docs/getting_started/ctfidf/ctfidf.svg @@ -0,0 +1,47 @@ + + + + + + +SBERT + + + + + +UMAP + + + + + +HDBSCAN + + + + + +c-TF-IDF + 
BM25 + + + + + +CountVectorizer + + + + + +c-TF-IDF + + + + + +c-TF-IDF + MMR + + + + diff --git a/docs/getting_started/dim_reduction/default_pipeline.svg b/docs/getting_started/dim_reduction/default_pipeline.svg new file mode 100644 index 00000000..30759e66 --- /dev/null +++ b/docs/getting_started/dim_reduction/default_pipeline.svg @@ -0,0 +1,18 @@ + + + +SBERT +UMAP +HDBSCAN +c-TF-IDF +Embeddings + +Dimensionality reduction + + +Clustering + + +Topic representation + + diff --git a/docs/getting_started/dim_reduction/dim_reduction.md b/docs/getting_started/dim_reduction/dim_reduction.md index 4a211c2d..201dda45 100644 --- a/docs/getting_started/dim_reduction/dim_reduction.md +++ b/docs/getting_started/dim_reduction/dim_reduction.md @@ -1,13 +1,17 @@ -One important aspect of BERTopic is dimensionality reduction of the embeddings. Typically, embeddings are at least 384 in length and -many clustering algorithms have difficulty clustering in such a high dimensional space. A solution is to reduce the dimensionality -of the embeddings to a workable dimensional space (e.g., 5) for clustering algorithms to work with. +An important aspect of BERTopic is the dimensionality reduction of the input embeddings. As embeddings are often high in dimensionality, clustering becomes difficult due to the curse of dimensionality. + +A solution is to reduce the dimensionality of the embeddings to a workable dimensional space (e.g., 5) for clustering algorithms to work with. +UMAP is used as a default in BERTopic since it can capture both the local and global high-dimensional space in lower dimensions. +However, there are other solutions out there, such as PCA, that users might be interested in trying out. Since BERTopic assumes some independence between steps, we can +use any other dimensionality reduction algorithm. The image below illustrates this modularity: + + +
+ ![Image title](dimensionality.svg) +
+
-In BERTopic, we typically use UMAP as it is able to capture both the local and global high-dimensional space in lower dimensions. -However, there are other solutions out there, such as PCA that users might be interested in trying out. -We have seen that developments in the artificial intelligence fields are quite fast and that whatever mights be state-of-the-art now, -could be different a year or even months later. Therefore, BERTopic allows you to use any dimensionality reduction algorithm that -you would like to be using. As a result, the `umap_model` parameter in BERTopic now allows for a variety of dimensionality reduction models. To do so, the class should have the following attributes: @@ -28,7 +32,7 @@ class DimensionalityReduction: return X ``` -In this tutorial, I will show you how to use several dimensionality reduction algorithms in BERTopic. +In this section, we will go through several examples of dimensionality reduction techniques and how they can be implemented. ## **UMAP** @@ -43,11 +47,11 @@ umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine') topic_model = BERTopic(umap_model=umap_model) ``` -Here, we can define any parameters in UMAP to optimize for the best performance based on whatever validation metrics that you are using. +Here, we can define any parameters in UMAP to optimize for the best performance based on whatever validation metrics you are using. ## **PCA** -Although UMAP works quite well in BERTopic and is typically advised, you might want to be using PCA instead. It can be faster to train and to perform -inference with. To use PCA, we can simply import it from `sklearn` and pass it to the `umap_model` parameter: +Although UMAP works quite well in BERTopic and is typically advised, you might want to be using PCA instead. It can be faster to train and perform +inference. To use PCA, we can simply import it from `sklearn` and pass it to the `umap_model` parameter: ```python @@ -77,4 +81,60 @@ from sklearn.decomposition import TruncatedSVD dim_model = TruncatedSVD(n_components=5) topic_model = BERTopic(umap_model=dim_model) -``` \ No newline at end of file +``` + +## **cuML UMAP** + +Although the original UMAP implementation is an amazing technique, it may have difficulty handling large amounts of data. Instead, +we can use [cuML](https://rapids.ai/start.html#rapids-release-selector) to speed up UMAP through GPU acceleration: + +```python +from bertopic import BERTopic +from cuml.manifold import UMAP + +umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0) +topic_model = BERTopic(umap_model=umap_model) +``` + +!!! note + If you want to install cuML together with BERTopic using Google Colab, you can run the following code: + + ```bash + !pip install bertopic + !pip install cudf-cu11 dask-cudf-cu11 --extra-index-url=https://pypi.ngc.nvidia.com + !pip install cuml-cu11 --extra-index-url=https://pypi.ngc.nvidia.com + !pip install cugraph-cu11 --extra-index-url=https://pypi.ngc.nvidia.com + !pip uninstall cupy-cuda115 -y + !pip uninstall cupy-cuda11x -y + !pip install cupy-cuda11x -f https://pip.cupy.dev/aarch64 + ``` + + +## **Skip dimensionality reduction** +Although BERTopic applies dimensionality reduction as a default in its pipeline, this is a step that you might want to skip. 
We generate an "empty" model that simply returns the data pass it to: + +```python +from bertopic import BERTopic +from bertopic.dimensionality import BaseDimensionalityReduction + +# Fit BERTopic without actually performing any dimensionality reduction +empty_dimensionality_model = BaseDimensionalityReduction() +topic_model = BERTopic(umap_model=empty_dimensionality_model) +``` + +In other words, we go from this pipeline: + +
+
+--8<-- "docs/getting_started/dim_reduction/default_pipeline.svg" +
+
+ +To the following pipeline: + +
+
+--8<-- "docs/getting_started/dim_reduction/no_dimensionality.svg" +
+ +
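If you skip the reduction step because you prefer to reduce the embeddings yourself, one possible workflow looks like this. This is only a sketch under that assumption; any external reducer can take the place of UMAP here:

```python
from umap import UMAP
from bertopic import BERTopic
from bertopic.dimensionality import BaseDimensionalityReduction
from sklearn.datasets import fetch_20newsgroups
from sentence_transformers import SentenceTransformer

# Pre-compute document embeddings and reduce them outside of BERTopic
docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=True)
reduced_embeddings = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine').fit_transform(embeddings)

# Skip dimensionality reduction inside BERTopic and cluster the reduced embeddings directly
empty_dimensionality_model = BaseDimensionalityReduction()
topic_model = BERTopic(embedding_model=sentence_model, umap_model=empty_dimensionality_model)
topics, probs = topic_model.fit_transform(docs, embeddings=reduced_embeddings)
```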
\ No newline at end of file diff --git a/docs/getting_started/dim_reduction/dimensionality.svg b/docs/getting_started/dim_reduction/dimensionality.svg new file mode 100644 index 00000000..58413491 --- /dev/null +++ b/docs/getting_started/dim_reduction/dimensionality.svg @@ -0,0 +1,47 @@ + + + + + + +SBERT + + + + + +UMAP + + + + + +PCA + + + + + +TruncatedSVD + + + + + + + + +HDBSCAN + + + + + +CountVectorizer + + + + + +c-TF-IDF + diff --git a/docs/getting_started/dim_reduction/no_dimensionality.svg b/docs/getting_started/dim_reduction/no_dimensionality.svg new file mode 100644 index 00000000..8a31b6f6 --- /dev/null +++ b/docs/getting_started/dim_reduction/no_dimensionality.svg @@ -0,0 +1,14 @@ + + + +SBERT +HDBSCAN +c-TF-IDF +Embeddings + +Clustering + + +Topic representation + + diff --git a/docs/getting_started/distribution/approximate_distribution.svg b/docs/getting_started/distribution/approximate_distribution.svg new file mode 100644 index 00000000..3b00794c --- /dev/null +++ b/docs/getting_started/distribution/approximate_distribution.svg @@ -0,0 +1,123 @@ + +the +right +problem +is +difficult +Solving + + + + + + +right +the +Solving + + + +the +right +problem + + + +right +problem +is + + + +problem +is +difficult + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +create token sets +topic-token set similarity +document-topic distribution +multi-topic assignment on a token level + + + + +solving +topic 2 +topic 1 +topic 3 + +topic 4 +the +right +problem +is +difficult + +0.75 +0.32 +0.16 + + + + + +0.21 +0.29 +0.81 +0.47 +0.26 + + +0.12 +0.33 + diff --git a/docs/getting_started/distribution/distribution.md b/docs/getting_started/distribution/distribution.md new file mode 100644 index 00000000..becee01b --- /dev/null +++ b/docs/getting_started/distribution/distribution.md @@ -0,0 +1,107 @@ +BERTopic approaches topic modeling as a cluster task and attempts to cluster semantically similar documents to extract common topics. A disadvantage of using such a method is that each document is assigned to a single cluster and therefore also a single topic. In practice, documents may contain a mixture of topics. This can be accounted for by splitting up the documents into sentences and feeding those to BERTopic. + +Another option is to use a cluster model that can perform soft clustering, like HDBSCAN. As BERTopic focuses on modularity, we may still want to model that mixture of topics even when we are using a hard-clustering model, like k-Means without the need to split up our documents. This is where `.approximate_distribution` comes in! + +
+
+--8<-- "docs/getting_started/distribution/approximate_distribution.svg" +
+
+ +To perform this approximation, each document is split into tokens according to the provided tokenizer in the `CountVectorizer`. Then, a **sliding window** is applied on each document creating subsets of the document. For example, with a window size of 3 and stride of 1, the document: + +> Solving the right problem is difficult. + +can be split up into `solving the right`, `the right problem`, `right problem is`, and `problem is difficult`. These are called token sets. +For each of these token sets, we calculate their c-TF-IDF representation and find out how similar they are to the previously generated topics. +Then, the similarities to the topics for each token set are summed to create a topic distribution for the entire document. + +Although it is often said that documents can contain a mixture of topics, these are often modeled by assigning each word to a single topic. +With this approach, we take into account that there may be multiple topics for a single word. + +We can make this multiple-topic word assignment a bit more accurate by then splitting these token sets up into individual tokens and assigning +the topic distributions for each token set to each individual token. That way, we can visualize the extent to which a certain word contributes +to a document's topic distribution. + +## **Example** + +To calculate our topic distributions, we first need to fit a basic topic model: + +```python +from bertopic import BERTopic +from sklearn.datasets import fetch_20newsgroups + +docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] +topic_model = BERTopic().fit(docs) +``` + +After doing so, we can approximate the topic distributions for your documents: + +```python +topic_distr, _ = topic_model.approximate_distribution(docs) +``` + +The resulting `topic_distr` is a *n* x *m* matrix where *n* are the topics and *m* the documents. We can then visualize the distribution +of topics in a document: + +```python +topic_model.visualize_distribution(topic_distr[1]) +``` + + + +Although a topic distribution is nice, we may want to see how each token contributes to a specific topic. To do so, we need to first +calculate topic distributions on a token level and then visualize the results: + +```python +# Calculate the topic distributions on a token-level +topic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True) + +# Visualize the token-level distributions +df = topic_model.visualize_approximate_distribution(docs[1], topic_token_distr[1]) +df +``` + +

+ +

+ +!!! tip +    You can also approximate the topic distributions for unseen documents. It will not be as accurate as `.transform` but it is quite fast and can serve you well in a production setting. + +!!! note +    To get the stylized dataframe for `.visualize_approximate_distribution` you will need to have Jinja installed. If you do not have this installed, an unstylized dataframe will be returned instead. You can install Jinja via `pip install jinja2`. + +## **Parameters** +There are a few parameters that are of interest which will be discussed below. + + +### **batch_size** +Creating token sets for each document can result in quite a large list of token sets. The similarity of these token sets with the topics can result in a large matrix that might not fit into memory anymore. To circumvent this, we can process batches of documents instead to minimize the memory overload. The value for `batch_size` indicates the number of documents that will be processed at once: + +```python +topic_distr, _ = topic_model.approximate_distribution(docs, batch_size=500) +``` + +### **window** +The number of tokens that are combined into token sets is defined by the `window` parameter. Seeing as we are performing a sliding window, we can change the size of the window. A larger window takes more tokens into account but setting it too large can result in considering too much information. Personally, I like to have this window between 4 and 8: + +```python +topic_distr, _ = topic_model.approximate_distribution(docs, window=4) +``` + +### **stride** +The sliding window that is performed on a document shifts, as a default, 1 token to the right each time to create its token sets. As a result, especially with large windows, a single token gets judged several times. We can use the `stride` parameter to increase the number of tokens the window shifts to the right. By increasing +this value, we are judging each token less frequently which often results in a much faster calculation. Combining this parameter with `window` is preferred. For example, if we have a very large dataset, we can set `stride=4` and `window=8` to judge token sets that contain 8 tokens but that are shifted with 4 steps +each time. As a result, this increases the computational speed quite a bit: + +```python +topic_distr, _ = topic_model.approximate_distribution(docs, window=8, stride=4) +``` + +### **use_embedding_model** +As a default, we compare the c-TF-IDF calculations between the token sets and all topics. Due to its bag-of-words representation, this is quite fast. However, you might want to use the selected `embedding_model` instead to do this comparison. Do note that due to the many token sets, it is often computationally quite a bit slower: + +```python +topic_distr, _ = topic_model.approximate_distribution(docs, use_embedding_model=True) +``` diff --git a/docs/getting_started/distribution/distribution.png new file mode 100644 index 00000000..74e78c41 Binary files /dev/null and b/docs/getting_started/distribution/distribution.png differ diff --git a/docs/getting_started/distribution/distribution_viz.html new file mode 100644 index 00000000..a4b7346d --- /dev/null +++ b/docs/getting_started/distribution/distribution_viz.html @@ -0,0 +1,7 @@ + + +
+
+ + \ No newline at end of file diff --git a/docs/getting_started/embeddings/embeddings.md b/docs/getting_started/embeddings/embeddings.md index 714cbf77..0840f598 100644 --- a/docs/getting_started/embeddings/embeddings.md +++ b/docs/getting_started/embeddings/embeddings.md @@ -1,7 +1,18 @@ # Embedding Models -In this tutorial, we will be going through the embedding models that can be used in BERTopic. -Having the option to choose embedding models allows you to leverage pre-trained embeddings that suit your use case. -Moreover, it helps to create a topic when you have little data available. +BERTopic starts with transforming our input documents into numerical representations. Although there are many ways this can be achieved, we typically use sentence-transformers (`"all-MiniLM-L6-v2"`) as it is quite capable of capturing the semantic similarity between documents. + +However, there is not one perfect +embedding model and you might want to be using something entirely different for your use case. Since BERTopic assumes some independence among steps, we can allow for this modularity: + +
+ ![Image title](embeddings.svg) +
+
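As a concrete illustration of this modularity, swapping in a different embedding model is a one-line change. This is only a sketch; the model name below is one of many options, and the sections that follow describe the supported backends in detail:

```python
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Any supported embedding backend can be passed here; a multilingual model is used as an example
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
topic_model = BERTopic(embedding_model=embedding_model)
```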
+ + +This modularity allows us not only to choose any embedding model to convert our documents into numerical representations, we can use essentially any data to perform our clustering. +When new state-of-the-art pre-trained embedding models are released, BERTopic will be able to use them. As a result, BERTopic grows with any new models being released. +Out of the box, BERTopic supports several embedding techniques. In this section, we will go through several of them and how they can be implemented. ### **Sentence Transformers** You can select any model from sentence-transformers [here](https://www.sbert.net/docs/pretrained_models.html) @@ -22,7 +33,7 @@ topic_model = BERTopic(embedding_model=sentence_model) ``` !!! tip "Tip!" - This embedding back-end was put here first for a reason, sentence-transformers works amazing out-of-the-box! Playing around with different models can give you great results. Also, make sure to frequently visit [this](https://www.sbert.net/docs/pretrained_models.html) page as new models are often released. + This embedding back-end was put here first for a reason, sentence-transformers works amazing out of the box! Playing around with different models can give you great results. Also, make sure to frequently visit [this](https://www.sbert.net/docs/pretrained_models.html) page as new models are often released. ### 🤗 Hugging Face Transformers To use a Hugging Face transformers model, load in a pipeline and point @@ -36,8 +47,7 @@ topic_model = BERTopic(embedding_model=embedding_model) ``` !!! tip "Tip!" - These transformers also work quite well using `sentence-transformers` which has a number of - optimizations tricks that make using it a bit faster. + These transformers also work quite well using `sentence-transformers` which has great optimizations tricks that make using it a bit faster. ### **Flair** [Flair](https://github.com/flairNLP/flair) allows you to choose almost any embedding model that @@ -54,7 +64,7 @@ You can select any 🤗 transformers model [here](https://huggingface.co/models) Moreover, you can also use Flair to use word embeddings and pool them to create document embeddings. Under the hood, Flair simply averages all word embeddings in a document. Then, we can easily -pass it to BERTopic in order to use those word embeddings as document embeddings: +pass it to BERTopic to use those word embeddings as document embeddings: ```python from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings @@ -133,8 +143,40 @@ topic_model = BERTopic(embedding_model=ft) Gensim is primarily used for Word Embedding models. This works typically best for short documents since the word embeddings are pooled. + +### **Scikit-Learn Embeddings** +Scikit-Learn is a framework for more than just machine learning. +It offers many preprocessing tools, some of which can be used to create representations +for text. Many of these tools are relatively lightweight and do not require a GPU. +While the representations may be less expressive than many BERT models, the fact that +it runs much faster can make it a relevant candidate to consider. + +If you have a scikit-learn compatible pipeline that you'd like to use to embed +text then you can also pass this to BERTopic. + +```python +from sklearn.pipeline import make_pipeline +from sklearn.decomposition import TruncatedSVD +from sklearn.feature_extraction.text import TfidfVectorizer + +pipe = make_pipeline( + TfidfVectorizer(), + TruncatedSVD(100) +) + +topic_model = BERTopic(embedding_model=pipe) +``` + +!!! 
Warning + One caveat to be aware of is that scikit-learns base `Pipeline` class does not + support the `.partial_fit()`-API. If you have a pipeline that theoretically should + be able to support online learning then you might want to explore + the [scikit-partial](https://github.com/koaning/scikit-partial) project. + Moreover, since this backend does not generate representations on a word level, + it does not support the `diversity` parameter. + ### **Word + Document Embeddings** -You might want to be using different language models for creating document- and word-embeddings. For example, +You might want to be using different language models for creating document and word embeddings. For example, while SentenceTransformers might be great in embedding sentences and documents, you might prefer to use FastText to create the word embeddings. @@ -186,7 +228,7 @@ topic_model = BERTopic(embedding_model=custom_embedder) ### **Custom Embeddings** The base models in BERTopic are BERT-based models that work well with document similarity tasks. Your documents, -however, might be too specific for a general pre-trained model to be used. Fortunately, you can use embedding +however, might be too specific for a general pre-trained model to be used. Fortunately, you can use the embedding model in BERTopic to create document features. You only need to prepare the document embeddings yourself and pass them through `fit_transform` of BERTopic: @@ -208,7 +250,7 @@ As you can see above, we used a SentenceTransformer model to create the embeddin `🤗 transformers`, `Doc2Vec`, or any other embedding method. #### **TF-IDF** -As mentioned above, any embedding technique can be used. However, when running umap, the typical distance metric is +As mentioned above, any embedding technique can be used. However, when running UMAP, the typical distance metric is `cosine` which does not work quite well for a TF-IDF matrix. Instead, BERTopic will recognize that a sparse matrix is passed and use `hellinger` instead which works quite well for the similarity between probability distributions. @@ -231,42 +273,3 @@ topics, probs = topic_model.fit_transform(docs, embeddings) Here, you will probably notice that creating the embeddings is quite fast whereas `fit_transform` is quite slow. This is to be expected as reducing the dimensionality of a large sparse matrix takes some time. The inverse of using transformer embeddings is true: creating the embeddings is slow whereas `fit_transform` is quite fast. - -#### **Scikit-Learn Embeddings** -Scikit-Learn is a framework for more than just machine learning. -It offers many preprocessing tools, some of which can be used to create representations -for text. Many of these tools are relatively lightweight and don't require a GPU. -While the representations may be less expressive as many BERT models, the fact that -it runs much faster can make it a relevant candidate to consider. - -If you have a scikit-learn compatible pipeline that you'd like to use to embed -text then you can also pass this to BERTopic. - -```python -from sklearn.pipeline import make_pipeline -from sklearn.decomposition import TruncatedSVD -from sklearn.feature_extraction.text import TfidfVectorizer - -pipe = make_pipeline( - TfidfVectorizer(), - TruncatedSVD(100) -) - -topic_model = BERTopic(embedding_model=pipe) -``` - -Internally, this uses the `SklearnEmbedder` that ensures the scikit-learn -pipeline is compatible. 
- -```python -from bertopic.backend import SklearnEmbedder - -sklearn_embedder = SklearnEmbedder(pipe) -topic_model = BERTopic(embedding_model=sklearn_embedder) -``` - -!!! Warning - One caveat to be aware of is that scikit-learns base `Pipeline` class does not - support the `.partial_fit()`-API. If you have a pipeline that theoretically should - be able to support online learning then you might want to explore - the [scikit-partial](https://github.com/koaning/scikit-partial) project. diff --git a/docs/getting_started/embeddings/embeddings.svg b/docs/getting_started/embeddings/embeddings.svg new file mode 100644 index 00000000..96cfae15 --- /dev/null +++ b/docs/getting_started/embeddings/embeddings.svg @@ -0,0 +1,99 @@ + + + + + + +SpaCy + + + + + +SBERT + + + + + +Transformers + + + + + + + + + + + + + + + + + + + + + + + + +UMAP + + + + + +HDBSCAN + + + + + +CountVectorizer + + + + + +c-TF-IDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/getting_started/guided/guided.md b/docs/getting_started/guided/guided.md index ffbdc6f7..9233ac41 100644 --- a/docs/getting_started/guided/guided.md +++ b/docs/getting_started/guided/guided.md @@ -1,32 +1,29 @@ -Guided Topic Modeling or Seeded Topic Modeling is a collection of techniques that guides the topic modeling approach -by setting a number of seed topics in which the model will converge to. These techniques allow the user to set a -pre-defined number of topic representations that are sure to be in documents. For example, take an IT-business -that has a ticket system for the software their clients use. Those tickets may typically contain information about -a specific bug regarding login issues that the IT-business is aware off. +Guided Topic Modeling or Seeded Topic Modeling is a collection of techniques that guides the topic modeling approach by setting several seed topics to which the model will converge to. These techniques allow the user to set a predefined number of topic representations that are sure to be in documents. For example, take an IT business that has a ticket system for the software their clients use. Those tickets may typically contain information about a specific bug regarding login issues that the IT business is aware of. To model that bug, we can create a seed topic representation containing the words `bug`, `login`, `password`, and `username`. By defining those words, a Guided Topic Modeling approach will try to converge at least one topic to those words. +
+
+--8<-- "docs/getting_started/guided/guided.svg" +
+
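For the ticket-system example above, a minimal sketch could look as follows. The seed words are the hypothetical ones from the description, and `docs` is assumed to contain the ticket texts; a full 20 Newsgroups example is worked out below:

```python
from bertopic import BERTopic

# Hypothetical seed topic for the login bug described above
seed_topic_list = [["bug", "login", "password", "username"]]

topic_model = BERTopic(seed_topic_list=seed_topic_list)
topics, probs = topic_model.fit_transform(docs)
```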
+ Guided BERTopic has two main steps: -First, we create embeddings for each seeded topics by joining them and passing them through the document embedder. -These embeddings will be compared with the existing document embeddings through cosine similarity and assigned a label. -If the document is most similar to a seeded topic, then it will get that topic's label. +First, we create embeddings for each seeded topic by joining them and passing them through the document embedder. These embeddings will be compared with the existing document embeddings through cosine similarity and assigned a label. If the document is most similar to a seeded topic, then it will get that topic's label. If it is most similar to the average document embedding, it will get the -1 label. These labels are then passed through UMAP to create a semi-supervised approach that should nudge the topic creation to the seeded topics. Second, we take all words in seed_topic_list and assign them a multiplier larger than 1. Those multipliers will be used to increase the IDF values of the words across all topics thereby increasing -the likelihood that a seeded topic word will appear in a topic. This does, however, also increase the chance of an -irrelevant topic having unrelated words. In practice, this should not be an issue since the IDF value is likely -to remain low regardless of the multiplier. The multiplier is now a fixed value but may change to something -more elegant, like taking the distribution of IDF values and its position into account when defining the multiplier. +the likelihood that a seeded topic word will appear in a topic. This does, however, also increase the chance of an irrelevant topic having unrelated words. In practice, this should not be an issue since the IDF value is likely to remain low regardless of the multiplier. The multiplier is now a fixed value but may change to something more elegant, like taking the distribution of IDF values and its position into account when defining the multiplier. ### **Example** To demonstrate Guided BERTopic, we use the 20 Newsgroups dataset as our example. We have frequently used this -dataset in BERTopic examples and we sometimes see a topic generated about health with words as `drug` and `cancer` -being important. However, due to the stocastisch nature of UMAP this topic is not always found. +dataset in BERTopic examples and we sometimes see a topic generated about health with words such as `drug` and `cancer` +being important. However, due to the stochastic nature of UMAP, this topic is not always found. In order to guide BERTopic to that topic, we create a seed topic list that we pass through our model. However, there may be several other topics that we know should be in the documents. Let's also initialize those: @@ -45,7 +42,7 @@ topic_model = BERTopic(seed_topic_list=seed_topic_list) topics, probs = topic_model.fit_transform(docs) ``` -AS you can see above, the `seed_topic_list` contains a list of topic representations. By defining the above topics +As you can see above, the `seed_topic_list` contains a list of topic representations. By defining the above topics BERTopic is more likely to model the defined seeded topics. However, BERTopic is merely nudged towards creating those topics. In practice, if the seeded topics do not exist or might be divided into smaller topics, then they will -not be modeled. Thus, seed topics need to be accurate in order to accurately converge towards them. \ No newline at end of file +not be modeled. 
Thus, seed topics need to be accurate to accurately converge towards them. \ No newline at end of file diff --git a/docs/getting_started/guided/guided.svg b/docs/getting_started/guided/guided.svg new file mode 100644 index 00000000..2f8de3be --- /dev/null +++ b/docs/getting_started/guided/guided.svg @@ -0,0 +1,152 @@ + + + + + + +"drug cancer drugs doctor" + + + + + +"windows drive dos file" + + + + + +"space launch orbit lunar" +Concatenate and embed the keywords/keyphrases using the embedding model. +For each document, generate labels by finding which seeded topic fits best based on cosine similarity between embeddings. +Average the embedding of each document with the selected seeded topic. +Define seed topics through keywords or keyphrases. +"drug", "cancer", "drugs", "doctor" +Seed topic 1 +Seed topic 2 +Seed topic 3 + +"windows", "drive", "dos", "file" + +"space", "launch", "orbit", "lunar" + + + + + + + + + + + + + + + + + + + + + +Seed topic 3 + + + + + + + + + + + + + +Seed topic 2 + + + + + + + + + + + + + +No seed topic match found + + + + + + + + + + + + + +Seed topic 2 + + + + + + + + +seed topic embedding + + + + + +document embedding ++ +2 + +Mutiply the IDF values of the seeded keywords across all topics with 1.2. + + + +Word +IDF +Multiplier +Adjusted IDF + +drug +1.2 +.55 +.66 + +1.2 +doctor +.78 +.94 + +cat +1 +.22 +.22 + +1 +dog +.11 +.11 + +space +1.2 +.35 +.42 + +1.2 +launch +.89 +1.07 + diff --git a/docs/getting_started/hierarchicaltopics/hierarchical.svg b/docs/getting_started/hierarchicaltopics/hierarchical.svg new file mode 100644 index 00000000..a0998b09 --- /dev/null +++ b/docs/getting_started/hierarchicaltopics/hierarchical.svg @@ -0,0 +1,166 @@ + +Create a distance matrix by calculating the cosine similarity between c-TF-IDF representations of each topic. +Apply a linkage function of choice on the distance matrix to model the hierarchical structure of topics. + + + + + + + + + + + + + + + +Topic 26 + + + + + + + + + + + + + + + +Topic 1 + + + + + + + + + + + + + + + +Topic 38 + + + + + + + + + + + + + + + +Topic 42 + + + + + +re-calculate c-TF-IDF +Update the c-TF-IDF representation based on the collection of documents across the merged topics. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Topic +1 +.12 +.12 +.53 +.53 +.74 +.74 +.89 +.89 +.24 +.24 +.01 +.01 +1 +1 +1 +1 +... +... +... +... +... +... +... +... +1 +2 +3 +1 +2 +3 +n +... +. +. +. +n + + + + + + + + + + + + + + + diff --git a/docs/getting_started/hierarchicaltopics/hierarchicaltopics.md b/docs/getting_started/hierarchicaltopics/hierarchicaltopics.md index 29ed1821..839e28a0 100644 --- a/docs/getting_started/hierarchicaltopics/hierarchicaltopics.md +++ b/docs/getting_started/hierarchicaltopics/hierarchicaltopics.md @@ -1,20 +1,16 @@ -When tweaking your topic model, the number of topics that are generated has a large effect on the quality of the topic representations. -Some topics could be merged together and having an understanding of the effect will help you understand which topics should and which -should not be merged. +When tweaking your topic model, the number of topics that are generated has a large effect on the quality of the topic representations. Some topics could be merged and having an understanding of the effect will help you understand which topics should and which should not be merged. -That is where hierarchical topic modeling comes in. 
It tries to model the possible hierarchical nature of the topics you have created -in order to understand which topics are similar to each other. Moreover, you will have more insight into sub-topics that might -exist in your data. +That is where hierarchical topic modeling comes in. It tries to model the possible hierarchical nature of the topics you have created to understand which topics are similar to each other. Moreover, you will have more insight into sub-topics that might exist in your data. -In BERTopic, we can approximate this potential hierarchy by making use of our topic-term matrix (c-TF-IDF matrix). This matrix -contains information about the importance of every word in every topic and makes for a nice numerical representation of our topics. -The smaller the distance between two c-TF-IDF representations, the more similar we assume they are. In practice, this process of merging -topics is done through the hierarchical clustering capabilities of `scipy` (see [here](https://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html)). -It allows for several linkage methods through which we can approximate our topic hierarchy. As a default, we are using the [ward](https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.ward.html#scipy.cluster.hierarchy.ward) but many others are availabe. +
+
+--8<-- "docs/getting_started/hierarchicaltopics/hierarchical.svg" +
+
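In code, the procedure sketched above typically boils down to the following minimal example. It assumes `docs` are the documents the model was trained on; the custom `linkage_function` follows the scipy-based approach described on this page and can be left out to use the default:

```python
from scipy.cluster import hierarchy as sch
from bertopic import BERTopic

topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs)

# Model the topic hierarchy; a custom scipy linkage function can be supplied
linkage_function = lambda x: sch.linkage(x, "ward", optimal_ordering=True)
hierarchical_topics = topic_model.hierarchical_topics(docs, linkage_function=linkage_function)

topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
```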
-Whenever we merge two topics, we can calculate the c-TF-IDF representation of these two merged by summing their bag-of-words representation. -We assume that two sets of topics are merged and that all others are kept the same, regardless of their location in the hierarchy. This helps -us isolate the potential effect of merging sets of topics. As a result, we can see the topic representation at each level in the tree. +In BERTopic, we can approximate this potential hierarchy by making use of our topic-term matrix (c-TF-IDF matrix). This matrix contains information about the importance of every word in every topic and makes for a nice numerical representation of our topics. The smaller the distance between two c-TF-IDF representations, the more similar we assume they are. In practice, this process of merging topics is done through the hierarchical clustering capabilities of `scipy` (see [here](https://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html)). It allows for several linkage methods through which we can approximate our topic hierarchy. As a default, we are using the [ward](https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.ward.html#scipy.cluster.hierarchy.ward) but many others are available. + +Whenever we merge two topics, we can calculate the c-TF-IDF representation of these two merged by summing their bag-of-words representation. We assume that two sets of topics are merged and that all others are kept the same, regardless of their location in the hierarchy. This helps us isolate the potential effect of merging sets of topics. As a result, we can see the topic representation at each level in the tree. ## **Example** To demonstrate hierarchical topic modeling with BERTopic, we use the 20 Newsgroups dataset to see how the topics that we uncover are represented in the 20 categories of documents. @@ -68,7 +64,7 @@ topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics) If you **hover** over the black circles, you will see the topic representation at that level of the hierarchy. These representations -help you understand the effect of merging certain topics together. Some might be logical to merge whilst others might not. Moreover, +help you understand the effect of merging certain topics. Some might be logical to merge whilst others might not. Moreover, we can now see which sub-topics can be found within certain larger themes. Although this gives a nice overview of the potential hierarchy, hovering over all black circles can be tiresome. Instead, we can diff --git a/docs/getting_started/manual/manual.md b/docs/getting_started/manual/manual.md new file mode 100644 index 00000000..709c4370 --- /dev/null +++ b/docs/getting_started/manual/manual.md @@ -0,0 +1,89 @@ +Although topic modeling is typically done by discovering topics in an unsupervised manner, there might be times when you already have a bunch of clusters or classes from which you want to model the topics. For example, the often used [20 NewsGroups dataset](https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html) is already split up into 20 classes. Here, we might want to see how we can transform those 20 classes into 20 topics. Instead of using BERTopic to discover previously unknown topics, we are now going to manually pass them to BERTopic without actually learning them. + +We can view this as a manual topic modeling approach. There is no underlying algorithm for detecting these topics since you already have done that before. 
That may simply be because the labels are already available, like with the 20 NewsGroups dataset, or because you have created clusters of documents beforehand using packages like [human-learn](https://github.com/koaning/human-learn), [bulk](https://github.com/koaning/bulk), [thisnotthat](https://github.com/TutteInstitute/thisnotthat), or something else entirely. + +In other words, we can pass our labels to BERTopic and it will try to transform those labels into topics by running the c-TF-IDF representations on the set of documents within each label. This process allows us to model the topics themselves and still gives us the option to use everything BERTopic has to offer. + +
+
+--8<-- "docs/getting_started/manual/pipeline.svg" +
+
+ +To do so, we need to skip over the dimensionality reduction and clustering steps since we already know the labels for our documents. We can use the documents and labels from the 20 NewsGroups dataset to create topics from those 20 labels: + + +```python +from sklearn.datasets import fetch_20newsgroups + +# Get labeled data +data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')) +docs = data['data'] +y = data['target'] +``` + +Then, we make sure to create empty instances of the dimensionality reduction and clustering steps. We pass those to BERTopic to simply skip over them and go to the topic representation process: + + +```python +from bertopic import BERTopic +from bertopic.backend import BaseEmbedder +from bertopic.cluster import BaseCluster +from bertopic.vectorizers import ClassTfidfTransformer +from bertopic.dimensionality import BaseDimensionalityReduction + +# Prepare our empty sub-models and reduce frequent words while we are at it. +empty_embedding_model = BaseEmbedder() +empty_dimensionality_model = BaseDimensionalityReduction() +empty_cluster_model = BaseCluster() +ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) + +# Fit BERTopic without actually performing any clustering +topic_model= BERTopic( + embedding_model=empty_embedding_model, + umap_model=empty_dimensionality_model, + hdbscan_model=empty_cluster_model, + ctfidf_model=ctfidf_model +) +topics, probs = topic_model.fit_transform(docs, y=y) +``` + +Let's take a look at a few topics that we get out of training this way by running `topic_model.get_topic_info()`: + +
+
+--8<-- "docs/getting_started/manual/table.svg" +
+
+ +We can see several interesting topics appearing here. They seem to relate to the 20 classes we had as input. Now, let's map those topics to our original classes to view their relationship: + +```python +# Map input `y` to topics +mappings = topic_model.topic_mapper_.get_mappings() +mappings = {value: data["target_names"][key] for key, value in mappings.items()} + +# Assign original classes to our topics +df = topic_model.get_topic_info() +df["Class"] = df.Topic.map(mappings) +df +``` + +
+
+--8<-- "docs/getting_started/manual/table_classes.svg" +
+
+ +We can see that the c-TF-IDF representations nicely extract the words that give a nice representation of our input classes. This is all done without actually embedding and clustering the data. + +As a result, the entire "training" process only takes a couple of seconds. Moreover, we can still perform BERTopic-specific features like dynamic topic modeling, topics per class, hierarchical topic modeling, modeling topic distributions, etc. + +!!! note + The resulting `topics` may be a different mapping from the `y` labels. To map `y` to `topics`, we can run the following: + + + ```python + mappings = topic_model.topic_mapper_.get_mappings() + y_mapped = [mappings[val] for val in y] + ``` diff --git a/docs/getting_started/manual/pipeline.svg b/docs/getting_started/manual/pipeline.svg new file mode 100644 index 00000000..c58b1e6a --- /dev/null +++ b/docs/getting_started/manual/pipeline.svg @@ -0,0 +1,12 @@ + + +Documents + +Labels + +c-TF-IDF + + + + + diff --git a/docs/getting_started/manual/table.svg b/docs/getting_started/manual/table.svg new file mode 100644 index 00000000..4f0b5ae0 --- /dev/null +++ b/docs/getting_started/manual/table.svg @@ -0,0 +1,52 @@ + + + +Topic +Count +Name + +0 +0 +999 +0_game_hockey_team_25 + +1_god_church_jesus_christ +997 +1 +1 + +2 +2 +996 +2_bike_dod_ride_bikes + +3_baseball_game_he_year +994 +3 +3 + +4 +4 +991 +4_key_encryption_db_clipper + +5_car_cars_engine_ford +990 +5 +5 + +6 +6 +990 +6_medical_patients_cancer_disease + +7_window_server_widget_motif +988 +7 +7 + +8 +8 +988 +8_space_launch_nasa_orbit + diff --git a/docs/getting_started/manual/table_classes.svg b/docs/getting_started/manual/table_classes.svg new file mode 100644 index 00000000..856f12ef --- /dev/null +++ b/docs/getting_started/manual/table_classes.svg @@ -0,0 +1,62 @@ + + + +Topic +Count +Name +Class + +0 +0 +999 +0_game_hockey_team_25 +rec.sport.hockey + +1_god_church_jesus_christ +997 +1 +1 + +2 +2 +996 +2_bike_dod_ride_bikes + +3_baseball_game_he_year +994 +3 +3 + +4 +4 +991 +4_key_encryption_db_clipper + +5_car_cars_engine_ford +990 +5 +5 + +6 +6 +990 +6_medical_patients_cancer_disease + +7_window_server_widget_motif +988 +7 +7 + +8 +8 +988 +8_space_launch_nasa_orbit +sci.space +comp.windows.x +sci.med +rec.autos +sci.crypt +rec.sport.baseball +rec.motorcycles +soc.religion.christian + diff --git a/docs/getting_started/online/online.md b/docs/getting_started/online/online.md index 5fa4bc2d..deb7451b 100644 --- a/docs/getting_started/online/online.md +++ b/docs/getting_started/online/online.md @@ -1,4 +1,4 @@ -Online topic modeling (sometimes called "incremental topic modeling") is the ability to learn incrementally from a mini-batch of instances. Essentially, it is a way to update your topic model with data on which it was not trained on before. In Scikit-Learn, this technique is often modeled through a `.partial_fit` function, which is also used in BERTopic. +Online topic modeling (sometimes called "incremental topic modeling") is the ability to learn incrementally from a mini-batch of instances. Essentially, it is a way to update your topic model with data on which it was not trained before. In Scikit-Learn, this technique is often modeled through a `.partial_fit` function, which is also used in BERTopic. In BERTopic, there are three main goals for using this technique. @@ -15,16 +15,22 @@ In BERTopic, online topic modeling can be a bit tricky as there are several step 5. Extract topic words 6. Diversify topic words -For some steps, an online variant is more important than others. 
Typically, in step 1 we use pre-trained language models that are in less need for continuous updates. This means that we can use an embedding model like Sentence-Transformers for extracting the embeddings and still use it in an online setting. Similarly, step 5 and 6 do not necessarily need online variants since they are built upon step 4, the tokenization. If that tokenization is by itself incremental, then so will steps 5 and 6. +For some steps, an online variant is more important than others. Typically, in step 1 we use pre-trained language models that are in less need of continuous updates. This means that we can use an embedding model like Sentence-Transformers for extracting the embeddings and still use it in an online setting. Similarly, steps 5 and 6 do not necessarily need online variants since they are built upon step 4, tokenization. If that tokenization is by itself incremental, then so will steps 5 and 6. + +
+
+--8<-- "docs/getting_started/online/online.svg" +
+
This means that we will need online variants for steps 2 through 4. Steps 2 and 3, dimensionality reduction and clustering, can be modeled through the use of Scikit-Learn's `.partial_fit` function. In other words, it supports any algorithm that can be trained using `.partial_fit` since these algorithms can be trained incrementally. For example, incremental dimensionality reduction can be achieved using Scikit-Learn's `IncrementalPCA` and incremental clustering with `MiniBatchKMeans`. -Lastly, we need to develop an online variant for step 5, tokenization. In this step, a Bag-of-words representation is created through the `CountVectorizer`. However, as new data comes in, its vocabulary will need to be updated. For that purpose, `bertopic.vectorizers.OnlineCountVectorizer` was created that not only updates out-of-vocabulary words but also implements decay and cleaning functions to prevent the sparse bag-of-words matrix to become too large in size. Most notably, the `decay` parameter is a value between 0 and 1 to weight the percentage of frequencies that the previous bag-of-words matrix should be reduced to. For example, a value of `.1` will decrease the frequencies in the bag-of-words matrix with 10% at each iteration. This will make sure that recent data has more weight than previously iterations. Similarly, `delete_min_df` will remove certain words from its vocabulary if its frequency is lower than a set value. This ties together with the `decay` parameter as some words will decay over time if not used. For more information regarding the `OnlineCountVectorizer`, please see the [vectorizers documentation](https://maartengr.github.io/BERTopic/getting_started/vectorizers/vectorizers.html#onlinecountvectorizer). +Lastly, we need to develop an online variant for step 5, tokenization. In this step, a Bag-of-words representation is created through the `CountVectorizer`. However, as new data comes in, its vocabulary will need to be updated. For that purpose, `bertopic.vectorizers.OnlineCountVectorizer` was created that not only updates out-of-vocabulary words but also implements decay and cleaning functions to prevent the sparse bag-of-words matrix to become too large. Most notably, the `decay` parameter is a value between 0 and 1 to weigh the percentage of frequencies that the previous bag-of-words matrix should be reduced to. For example, a value of `.1` will decrease the frequencies in the bag-of-words matrix by 10% at each iteration. This will make sure that recent data has more weight than previous iterations. Similarly, `delete_min_df` will remove certain words from its vocabulary if their frequency is lower than a set value. This ties together with the `decay` parameter as some words will decay over time if not used. For more information regarding the `OnlineCountVectorizer`, please see the [vectorizers documentation](https://maartengr.github.io/BERTopic/getting_started/vectorizers/vectorizers.html#onlinecountvectorizer). ## **Example** -Online topic modeling in BERTopic is rather straightforward. We first need to have our documents in split in chunks such that we can train and update our topic model incrementally. +Online topic modeling in BERTopic is rather straightforward. We first need to have our documents split into chunks such that we can train and update our topic model incrementally. 
```python from sklearn.datasets import fetch_20newsgroups @@ -34,7 +40,7 @@ all_docs = fetch_20newsgroups(subset=subset, remove=('headers', 'footers', 'quo doc_chunks = [all_docs[i:i+1000] for i in range(0, len(all_docs), 1000)] ``` -Here, we created chunks of 1000 documents to be fed in BERTopic. Then, we will need to define a number of sub-models that support online learning. Specifically, we are going to be using `IncrementalPCA`, `MiniBatchKMeans`, and the `OnlineCountVectorizer`: +Here, we created chunks of 1000 documents to be fed in BERTopic. Then, we will need to define several sub-models that support online learning. Specifically, we are going to be using `IncrementalPCA`, `MiniBatchKMeans`, and the `OnlineCountVectorizer`: ```python from sklearn.cluster import MiniBatchKMeans @@ -47,9 +53,6 @@ cluster_model = MiniBatchKMeans(n_clusters=50, random_state=0) vectorizer_model = OnlineCountVectorizer(stop_words="english", decay=.01) ``` -!!! tip Tip - You can use any other dimensionality reduction and clustering algorithm as long as they have a `.partial_fit` function. Moreover, you can use dimensionality reduction algorithms that do not support `.partial_fit` functions but do have a `.fit` function to first train it on a large amount of data and then continously add documents. The dimensionality reduction will not be updated but may be trained sufficiently to properly reduce the embeddings without the need to continuously add documents. - After having defined our sub-models, we can start training our topic model incrementally by looping over our document chunks: ```python @@ -64,20 +67,27 @@ for docs in doc_chunks: topic_model.partial_fit(docs) ``` -And that is it! During each iteration, you can access the predicted topics through the `.topics_` attribute. Do note though that only the most recent batch of documents are tracked. If you want to be using online topic modeling for low-memory use cases, then it is advised to also update the `.topics_` attribute. Otherwise, variations such as hierarchical topic model will not work. +And that is it! During each iteration, you can access the predicted topics through the `.topics_` attribute. -```python -# Incrementally fit the topic model by training on 1000 documents at a time and track the topics in each iteration -topics = [] -for docs in doc_chunks: - topic_model.partial_fit(docs) - topics.extend(topic_model.topics_) +!!! note + Do note that in BERTopic it is not possible to use `.partial_fit` after the `.fit` as they work quite differently concerning internally updating topics, frequencies, representations, etc. -topic_model.topics_ = topics -``` +!!! tip Tip + You can use any other dimensionality reduction and clustering algorithm as long as they have a `.partial_fit` function. Moreover, you can use dimensionality reduction algorithms that do not support `.partial_fit` functions but do have a `.fit` function to first train it on a large amount of data and then continuously add documents. The dimensionality reduction will not be updated but may be trained sufficiently to properly reduce the embeddings without the need to continuously add documents. + +!!! warning + Only the most recent batch of documents is tracked. If you want to be using online topic modeling for low-memory use cases, then it is advised to also update the `.topics_` attribute. Otherwise, variations such as **hierarchical topic modeling** will not work. 
+ + ```python + # Incrementally fit the topic model by training on 1000 documents at a time and track the topics in each iteration + topics = [] + for docs in doc_chunks: + topic_model.partial_fit(docs) + topics.extend(topic_model.topics_) + + topic_model.topics_ = topics + ``` -!!! note - Do note that in BERTopic it is not possible to use `.partial_fit` after the `.fit` as they work quite differently with respect to internally updating topics, frequencies, representations, etc. ## **River** diff --git a/docs/getting_started/online/online.svg b/docs/getting_started/online/online.svg new file mode 100644 index 00000000..e3ab13e0 --- /dev/null +++ b/docs/getting_started/online/online.svg @@ -0,0 +1,26 @@ + + + +SBERT +IncrementalPCA +MiniBatchKMeans +Online CountVectorizer +Embeddings + +Dimensionality reduction + + +Clustering + + +Incremental Bag-of-Words + +c-TF-IDF + +Topic representation + + +Online variants of these steps in the main BERTopic pipeline are needed in order to enable incremental learning. + + + diff --git a/docs/getting_started/outlier_reduction/fig_base.html b/docs/getting_started/outlier_reduction/fig_base.html new file mode 100644 index 00000000..b1964816 --- /dev/null +++ b/docs/getting_started/outlier_reduction/fig_base.html @@ -0,0 +1,7 @@ + + + +
+
+ + \ No newline at end of file diff --git a/docs/getting_started/outlier_reduction/fig_reduced.html b/docs/getting_started/outlier_reduction/fig_reduced.html new file mode 100644 index 00000000..695e540f --- /dev/null +++ b/docs/getting_started/outlier_reduction/fig_reduced.html @@ -0,0 +1,7 @@ + + + +
+
+ + \ No newline at end of file diff --git a/docs/getting_started/outlier_reduction/outlier_reduction.md b/docs/getting_started/outlier_reduction/outlier_reduction.md new file mode 100644 index 00000000..b40944ca --- /dev/null +++ b/docs/getting_started/outlier_reduction/outlier_reduction.md @@ -0,0 +1,193 @@ +When using HDBSCAN, DBSCAN, or OPTICS, a number of outlier documents might be created +that do not fall within any of the created topics. These are labeled as -1. Depending on your use case, you might want +to decrease the number of documents that are labeled as outliers. Fortunately, there are a number of strategies one might +use to reduce the number of outliers after you have trained your BERTopic model. + +The main way to reduce your outliers in BERTopic is by using the `.reduce_outliers` function. To make it work without too much tweaking, you will only need to pass the `docs` and their corresponding `topics`. You can pass outlier and non-outlier documents together since it will only try to reduce outlier documents and label them to a non-outlier topic. + +The following is a minimal example: + +```python +from bertopic import BERTopic + +# Train your BERTopic model +topic_model = BERTopic() +topics, probs = topic_model.fit_transform(docs) + +# Reduce outliers +new_topics = topic_model.reduce_outliers(docs, topics) +``` + +!!! note + You can use the `threshold` parameter to select the minimum distance or similarity when matching outlier documents with non-outlier topics. This allows the user to change the amount of outlier documents are assigned to non-outlier topics. + + +## **Strategies** + +The default method for reducing outliers is by calculating the c-TF-IDF representations of outlier documents and assigning them +to the best matching c-TF-IDF representations of non-outlier topics. + +However, there are a number of other strategies one can use, either seperately or in conjunction that are worthwhile to explore: +* Using the topic-document probabilities to assign topics +* Using the topic-document distributions to assign topics +* Using c-TF-IDF representations to assign topics +* Using document and topic embeddings to assign topics + +### **Probabilities** +This strategy uses the soft-clustering as performed by HDBSCAN to find the +best matching topic for each outlier document. To use this, make +sure to calculate the `probabilities` beforehand by instantiating +BERTopic with `calculate_probabilities=True`. + +```python +from bertopic import BERTopic + +# Train your BERTopic model and calculate the document-topic probabilities +topic_model = BERTopic(calculate_probabilities=True) +topics, probs = topic_model.fit_transform(docs) + +# Reduce outliers using the `probabilities` strategy +new_topics = topic_model.reduce_outliers(docs, topics, probabilities=probs, strategy="probabilities") +``` + +### **Topic Distributions** +Use the topic distributions, as calculated with `.approximate_distribution` +to find the most frequent topic in each outlier document. You can use the +`distributions_params` variable to tweak the parameters of +`.approximate_distribution`. 
+ +```python +from bertopic import BERTopic + +# Train your BERTopic model +topic_model = BERTopic() +topics, probs = topic_model.fit_transform(docs) + +# Reduce outliers using the `distributions` strategy +new_topics = topic_model.reduce_outliers(docs, topics, strategy="distributions") +``` + +### **c-TF-IDF** +Calculate the c-TF-IDF representation for each outlier document and +find the best matching c-TF-IDF topic representation using +cosine similarity. + +```python +from bertopic import BERTopic + +# Train your BERTopic model +topic_model = BERTopic() +topics, probs = topic_model.fit_transform(docs) + +# Reduce outliers using the `c-tf-idf` strategy +new_topics = topic_model.reduce_outliers(docs, topics, strategy="c-tf-idf") +``` + +### **Embeddings** +Using the embedding of each outlier document, find the best +matching topic embedding using cosine similarity. + +```python +from bertopic import BERTopic + +# Train your BERTopic model +topic_model = BERTopic() +topics, probs = topic_model.fit_transform(docs) + +# Reduce outliers using the `embeddings` strategy +new_topics = topic_model.reduce_outliers(docs, topics, strategy="embeddings") +``` + +!!! note + If you have pre-calculated the document embeddings, you can speed up the outlier + reduction process for the `"embeddings"` strategy as it prevents re-calculating + the document embeddings. + +## **Update Topics** + +After generating our updated topics, we can feed them back into BERTopic in one of two ways. We can either update the topic representations themselves based on the documents that now belong to new topics or we can only update the topic frequency without updating the topic representations themselves. + +!!! warning + In both cases, it is important to realize that + updating the topics this way may lead to errors if topic reduction or topic merging techniques are used afterwards. The reason for this is that when you assign a -1 document to topic 1 and another -1 document to topic 2, it is unclear how you map the -1 documents. Should they be matched to topic 1 or topic 2? + + +### **Update Topic Representation** + +When outlier documents are generated, they are not used when modeling the topic representations. These documents are completely ignored when finding good descriptions of topics. Thus, after having reduced the number of outliers in your topic model, you might want to update the topic representations with the documents that now belong to actual topics. To do so, we can make use of the `.update_topics` function: + +```python +topic_model.update_topics(docs, topics=new_topics) +``` + +As seen above, you will only need to pass the documents on which the model was trained, including the new topics that were generated using one of the above four strategies. + + +### **Update Topic Frequency** + +If you only want the topic frequencies to reflect the new topic assignments, without updating the topic representations, you can overwrite the topics and recalculate the topic sizes: + +```python +import pandas as pd +topic_model.topics_ = new_topics +documents = pd.DataFrame({"Document": docs, "Topic": new_topics}) +topic_model._update_topic_size(documents) + +topic_model.get_topic_info() +``` + + +### **Exploration** + +When you are reducing the number of topics, it might be worthwhile to iteratively visualize the results in order to get an intuitive understanding of the effect of the above four strategies. Making use of `.visualize_documents`, we can quickly iterate over the different strategies and view their effects. Here, an example will be shown on how to approach such a pipeline. 
+ +First, we train our model: + +```python +from umap import UMAP +from bertopic import BERTopic +from sklearn.datasets import fetch_20newsgroups +from sentence_transformers import SentenceTransformer +from sklearn.feature_extraction.text import CountVectorizer + +# Prepare data, extract embeddings, and prepare sub-models +docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] +umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42) +vectorizer_model = CountVectorizer(stop_words="english") +sentence_model = SentenceTransformer("all-MiniLM-L6-v2") +embeddings = sentence_model.encode(docs, show_progress_bar=True) + +# We reduce our embeddings to 2D as it will allow us to quickly iterate later on +reduced_embeddings = UMAP(n_neighbors=10, n_components=2, + min_dist=0.0, metric='cosine').fit_transform(embeddings) + +# Train our topic model +topic_model = BERTopic(embedding_model=sentence_model, umap_model=umap_model, + vectorizer_model=vectorizer_model, calculate_probabilities=True, nr_topics=40) +topics, probs = topic_model.fit_transform(docs, embeddings) +``` + +After having trained our model, let us take a look at the 2D representation of the generated topics: + +```python +topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings, + hide_document_hover=True, hide_annotations=True) +``` + + + + +Next, we reduce the number of outliers using the `probabilities` strategy: + +```python +new_topics = topic_model.reduce_outliers(docs, topics, probabilities=probs, + threshold=0.05, strategy="probabilities") +topic_model.update_topics(docs, topics=new_topics) +``` + +And finally, we visualize the results: + +```python +topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings, + hide_document_hover=True, hide_annotations=True) +``` + + diff --git a/docs/getting_started/parameter tuning/parametertuning.md b/docs/getting_started/parameter tuning/parametertuning.md index e906f050..d38b44f4 100644 --- a/docs/getting_started/parameter tuning/parametertuning.md +++ b/docs/getting_started/parameter tuning/parametertuning.md @@ -1,12 +1,12 @@ # Hyperparameter Tuning -Although BERTopic works quite well out of the box, there are a number of hyperparameters to tune according to your use-case. -This section will focus on important parameters directly accessable in BERTopic but also hyperparameter optimization in sub-models +Although BERTopic works quite well out of the box, there are a number of hyperparameters to tune according to your use case. +This section will focus on important parameters directly accessible in BERTopic but also hyperparameter optimization in sub-models such as HDBSCAN and UMAP. ## **BERTopic** -When instantiating BERTopic, there are a number of hyperparameters that you can directly adjust that could significantly improve the performance of your topic model. In this section, we will go through the most impactful parameters in BERTopic and directions on how to optimize them. +When instantiating BERTopic, there are several hyperparameters that you can directly adjust that could significantly improve the performance of your topic model. In this section, we will go through the most impactful parameters in BERTopic and directions on how to optimize them. ### **language** The `language` parameter is used to simplify the selection of models for those who are not familiar with sentence-transformers models. 
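For example, a minimal sketch of relying on the `language` parameter instead of selecting an embedding model yourself:

```python
from bertopic import BERTopic

# "english" is the default; "multilingual" selects a model that supports 50+ languages
topic_model = BERTopic(language="multilingual")
topics, probs = topic_model.fit_transform(docs)
```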
@@ -22,30 +22,30 @@ The multilingual model is "paraphrase-multilingual-MiniLM-L12-v2" and supports o ### **top_n_words** -`top_n_words` refers to the number of words per topic that you want extracted. In practice, I would advise you to keep this value below 30 and preferably between 10 and 20. The reasoning for this is that the more words you put in a topic the less coherent it can become. The top words are the most representative for the topic and should be focused on. +`top_n_words` refers to the number of words per topic that you want to be extracted. In practice, I would advise you to keep this value below 30 and preferably between 10 and 20. The reasoning for this is that the more words you put in a topic the less coherent it can become. The top words are the most representative of the topic and should be focused on. ### **n_gram_range** -The `n_gram_range` parameter refers to the CountVectorizer used when creating the topic representation. It relates to the number of words you want in your topic representation. For example, "New" and "York" are two seperate words but are often used as "New York" which represents an n-gram of 2. Thus, the `n_gram_range` should be set to (1, 2) if you want "New York" in your topic representation. +The `n_gram_range` parameter refers to the CountVectorizer used when creating the topic representation. It relates to the number of words you want in your topic representation. For example, "New" and "York" are two separate words but are often used as "New York" which represents an n-gram of 2. Thus, the `n_gram_range` should be set to (1, 2) if you want "New York" in your topic representation. ### **min_topic_size** -`min_topic_size` is an important parameter! It is used to specify what the minimum size of a topic can be. The lower this value the more topics are created. If you set this value too high, then it is possible that simply no topics will be created! Set this value too low and you will get many micro clusters. +`min_topic_size` is an important parameter! It is used to specify what the minimum size of a topic can be. The lower this value the more topics are created. If you set this value too high, then it is possible that simply no topics will be created! Set this value too low and you will get many microclusters. -It is advised to play around with this value depending on the size of the your dataset. If it nears a million documents, then it advised to set it much higher than the default of 10, for example 100 or even 500. +It is advised to play around with this value depending on the size of your dataset. If it nears a million documents, then it is advised to set it much higher than the default of 10, for example, 100 or even 500. ### **nr_topics** -`nr_topics` can be a tricky parameter. It specifies, after training the topic model, the number of topics that will be reduced to. For example, if your topic model results in 100 topics but you have set `nr_topics` to 20 then the topic model will try to reduce the number of topics from 100 to 20. +`nr_topics` can be a tricky parameter. It specifies, after training the topic model, the number of topics that will be reduced. For example, if your topic model results in 100 topics but you have set `nr_topics` to 20 then the topic model will try to reduce the number of topics from 100 to 20. -This reduction can take awhile as each reduction in topics activates a c-TF-IDF calculation. If this is set to None, no reduction is applied. Use "auto" to automatically reduce topics that using HDBSCAN. 
+This reduction can take a while as each reduction in topics activates a c-TF-IDF calculation. If this is set to None, no reduction is applied. Use "auto" to automatically reduce topics using HDBSCAN. ### **low_memory** -`low_memory` sets UMAP's `low_memory` to True to make sure that less memory is used in computation. This slows down computation but allows UMAP to be ran on low memory machines. +`low_memory` sets UMAP's `low_memory` to True to make sure that less memory is used in the computation. This slows down computation but allows UMAP to be run on low-memory machines. ### **calculate_probabilities** -`calculate_probabilities` lets you calculate the probabilities of each topic to each document. This is computationally quite expensive and is turned off by default. +`calculate_probabilities` lets you calculate the probabilities of each topic in each document. This is computationally quite expensive and is turned off by default. ## **UMAP** -UMAP is an amazing technique for dimensionality reduction. In BERTopic, it is used to reduce the dimensionality of document embedding into something that is easier to use with HDBSCAN in order to create good clusters. +UMAP is an amazing technique for dimensionality reduction. In BERTopic, it is used to reduce the dimensionality of document embedding into something easier to use with HDBSCAN to create good clusters. However, it does has a significant number of parameters you could take into account. As exposing all parameters in BERTopic would be difficult to manage, we can instantiate our UMAP model and pass it to BERTopic: @@ -57,26 +57,26 @@ topic_model = BERTopic(umap_model=umap_model).fit(docs) ``` ### **n_neighbors** -`n_neighbors` is the numer of neighboring sample points used when making the manifold approximation. Increasing this value typically results in a +`n_neighbors` is the number of neighboring sample points used when making the manifold approximation. Increasing this value typically results in a more global view of the embedding structure whilst smaller values result in a more local view. Increasing this value often results in larger clusters being created. ### **n_components** -`n_components` refers to the dimensionality of the embeddings after reducing them. This is set as a default to `5` in order to reduce dimensionality -as much as possible whilst trying to maximize the information kept in the resulting embeddings. Although lowering or increasing this value has an influence on the quality of embeddings, its effect is largest on the performance of HDBSCAN. Increasing this value too much and HDBSCAN will have a -hard time clustering the high-dimensional embeddings. Lower this value too much and too little information in the resulting embeddings is available +`n_components` refers to the dimensionality of the embeddings after reducing them. This is set as a default to `5` to reduce dimensionality +as much as possible whilst trying to maximize the information kept in the resulting embeddings. Although lowering or increasing this value influences the quality of embeddings, its effect is largest on the performance of HDBSCAN. Increasing this value too much and HDBSCAN will have a +hard time clustering the high-dimensional embeddings. Lower this value too much and too little information in the resulting embeddings are available to create proper clusters. If you want to increase this value, I would advise setting using a metric for HDBSCAN that works well in high dimensional data. 
### **metric** `metric` refers to the method used to compute the distances in high dimensional space. The default is `cosine` as we are dealing with high dimensional data. However, BERTopic is also able to use any input, even regular tabular data, to cluster the documents. Thus, you might want to change the metric -to something that fits with your use case. +to something that fits your use case. ### **low_memory** `low_memory` is used when datasets may consume a lot of memory. Using millions of documents can lead to memory issues and setting this value to `True` might alleviate some of the issues. ## **HDBSCAN** -After reducing the embeddings with UMAP, we use HDBSCAN to cluster our documents into clusters of similar documents. Similar to UMAP, HDBSCAN has many parameters that could be tweaked in order to improve the cluster's quality. +After reducing the embeddings with UMAP, we use HDBSCAN to cluster our documents into clusters of similar documents. Similar to UMAP, HDBSCAN has many parameters that could be tweaked to improve the cluster's quality. ```python from hdbscan import HDBSCAN @@ -86,13 +86,13 @@ topic_model = BERTopic(hdbscan_model=hdbscan_model).fit(docs) ``` ### **min_cluster_size** -`min_cluster_size` is arguably the most important parameter in HDBSCAN. It controls the minimum size of a cluster and thereby the amount of clusters -that will be generated. It is set to `10` as a default. Increasing this value results in less clusters but of larger size whereas decreasing this value -results in more micro clusters being generated. Typically, I would advise on increasing this value rather than decreasing it. +`min_cluster_size` is arguably the most important parameter in HDBSCAN. It controls the minimum size of a cluster and thereby the number of clusters +that will be generated. It is set to `10` as a default. Increasing this value results in fewer clusters but of larger size whereas decreasing this value +results in more micro clusters being generated. Typically, I would advise increasing this value rather than decreasing it. ### **min_samples** -`min_samples` is automatically set to `min_cluster_size` and controls the amount of outliers are generated. Setting this value significantly lower than -`min_cluster_size` might help you reduce the amount of noise you will get. Do note that outliers are typically to be expected and forcing the output +`min_samples` is automatically set to `min_cluster_size` and controls the number of outliers generated. Setting this value significantly lower than +`min_cluster_size` might help you reduce the amount of noise you will get. Do note that outliers are to be expected and forcing the output to have no outliers may not properly represent the data. ### **metric** @@ -102,4 +102,4 @@ metrics that work with high dimensional data. ### **prediction_data** Make sure you always set this value to `True` as it is needed to predict new points later on. You can set this to False if you do not wish to predict -any unseen datapoints. \ No newline at end of file +any unseen data points. 
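To tie the advice above together, a sketch of a custom HDBSCAN configuration might look as follows (the exact values are illustrative and should be tuned to your dataset):

```python
from hdbscan import HDBSCAN
from bertopic import BERTopic

# A larger min_cluster_size gives fewer, larger topics; setting min_samples
# lower than min_cluster_size reduces the number of outliers. prediction_data
# is kept True so that unseen documents can still be predicted later on.
hdbscan_model = HDBSCAN(min_cluster_size=100, min_samples=20,
                        metric='euclidean', prediction_data=True)
topic_model = BERTopic(hdbscan_model=hdbscan_model).fit(docs)
```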
\ No newline at end of file diff --git a/docs/getting_started/quickstart/quickstart.md b/docs/getting_started/quickstart/quickstart.md index 7a9e5575..3edaf581 100644 --- a/docs/getting_started/quickstart/quickstart.md +++ b/docs/getting_started/quickstart/quickstart.md @@ -60,6 +60,19 @@ frequent topic that was generated, topic 0: ('pc', 0.003047105930670237)] ``` +Using `.get_document_info`, we can also extract information on a document level, such as their corresponding topics, probabilities, whether they are representative documents for a topic, etc.: + +```python +>>> topic_model.get_document_info(docs) + +Document Topic Name Top_n_words Probability ... +I am sure some bashers of Pens... 0 0_game_team_games_season game - team - games... 0.200010 ... +My brother is in the market for... -1 -1_can_your_will_any can - your - will... 0.420668 ... +Finally you said what you dream... -1 -1_can_your_will_any can - your - will... 0.807259 ... +Think! It is the SCSI card doing... 49 49_windows_drive_dos_file windows - drive - docs... 0.071746 ... +1) I have an old Jasmine drive... 49 49_windows_drive_dos_file windows - drive - docs... 0.038983 ... +``` + !!! Tip "Tip!" Use `BERTopic(language="multilingual")` to select a model that supports 50+ languages. @@ -91,4 +104,9 @@ topic_model = BERTopic.load("my_model") !!! Tip "Tip!" If you do not want to save the embedding model because it is loaded from the cloud, simply run `model.save("my_model", save_embedding_model=False)` instead. Then, you can load in the model - with `BERTopic.load("my_model", embedding_model="whatever_model_you_used")`. \ No newline at end of file + with `BERTopic.load("my_model", embedding_model="whatever_model_you_used")`. + +!!! Warning "Warning" + When saving the model, make sure to also keep track of the versions of dependencies and Python used. + Loading and saving the model should be done using the same dependencies and Python. Moreover, models + saved in one version of BERTopic should not be loaded in other versions. diff --git a/docs/getting_started/search/search.md b/docs/getting_started/search/search.md index f03583cf..83d1c868 100644 --- a/docs/getting_started/search/search.md +++ b/docs/getting_started/search/search.md @@ -32,8 +32,8 @@ search term "motor". Then, we extract the most similar topic and check the resul ('advice', 0.005534544418830091)] ``` -It definitely seems that a topic was found that closely matches with "motor". The topic seems to be motorcycle -related and therefore matches with our "motor" input. You can use the `similarity` variable to see how similar +It definitely seems that a topic was found that closely matches "motor". The topic seems to be motorcycle +related and therefore matches our "motor" input. You can use the `similarity` variable to see how similar the extracted topics are to the search term. !!! note diff --git a/docs/getting_started/semisupervised/semisupervised.md b/docs/getting_started/semisupervised/semisupervised.md new file mode 100644 index 00000000..2a5707f0 --- /dev/null +++ b/docs/getting_started/semisupervised/semisupervised.md @@ -0,0 +1,88 @@ +In BERTopic, you have several options to nudge the creation of topics toward certain pre-specified topics. Here, we will be looking at semi-supervised topic modeling with BERTopic. + +Semi-supervised modeling allows us to steer the dimensionality reduction of the embeddings into a space that closely follows any labels you might already have. + +
+
+--8<-- "docs/getting_started/semisupervised/semisupervised.svg" +
+
+ +In other words, we use a semi-supervised UMAP instance to reduce the dimensionality of embeddings before clustering the documents +with HDBSCAN. + +First, let us prepare the data needed for our topic model: + +```python +from bertopic import BERTopic +from sklearn.datasets import fetch_20newsgroups + +data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')) +docs = data["data"] +categories = data["target"] +category_names = data["target_names"] +``` + +We are using the popular 20 Newsgroups dataset which contains roughly 18000 newsgroups posts that are each +assigned to one of 20 categories. Using this dataset, we can try to extract its corresponding topic model whilst +taking its underlying categories into account. These categories are stored in the `categories` variable. + +Each document can be put into one of the following categories: + +```python +>>> category_names + +['alt.atheism', + 'comp.graphics', + 'comp.os.ms-windows.misc', + 'comp.sys.ibm.pc.hardware', + 'comp.sys.mac.hardware', + 'comp.windows.x', + 'misc.forsale', + 'rec.autos', + 'rec.motorcycles', + 'rec.sport.baseball', + 'rec.sport.hockey', + 'sci.crypt', + 'sci.electronics', + 'sci.med', + 'sci.space', + 'soc.religion.christian', + 'talk.politics.guns', + 'talk.politics.mideast', + 'talk.politics.misc', + 'talk.religion.misc'] +``` + +To perform this semi-supervised approach, we can take in some pre-defined topics and simply pass those to the `y` parameter when fitting BERTopic. These labels can be pre-defined topics or simply documents that you feel belong together regardless of their content. BERTopic will nudge the creation of topics toward these categories +using the pre-defined labels. + +To perform supervised topic modeling, we simply use all categories: + +```python +topic_model = BERTopic(verbose=True).fit(docs, y=categories) +``` + +The topic model will be much more attuned to the categories that were defined previously. However, this does not mean that only topics for these categories will be found. BERTopic is likely to find more specific topics in those you have already defined. This allows you to discover previously unknown topics! + +## **Partial labels** + +At times, you might only have labels for a subset of documents. Fortunately, we can still use those labels to at least nudge the documents for which those labels exist. The documents for which we do not have labels are assigned a -1. For this example, imagine we only have the labels of categories that are related to computers and we want to create a topic model using semi-supervised modeling: + +```python +labels_to_add = ['comp.graphics', 'comp.os.ms-windows.misc', + 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', + 'comp.windows.x',] +indices = [category_names.index(label) for label in labels_to_add] +y = [label if label in indices else -1 for label in categories] +``` + +The `y` variable contains many -1 values since we do not know all the categories. + +Next, we use those newly constructed labels to again run BERTopic semi-supervised: + +```python +topic_model = BERTopic(verbose=True).fit(docs, y=y) +``` + +And that is it! By defining certain classes for our documents, we can steer the topic modeling towards modeling the pre-defined categories. 
diff --git a/docs/getting_started/semisupervised/semisupervised.svg b/docs/getting_started/semisupervised/semisupervised.svg new file mode 100644 index 00000000..e341e30e --- /dev/null +++ b/docs/getting_started/semisupervised/semisupervised.svg @@ -0,0 +1,21 @@ + + + +SBERT +UMAP +HDBSCAN +c-TF-IDF +Embeddings + +Dimensionality reduction + +Labels + + +Clustering + + +Topic representation + + + diff --git a/docs/getting_started/supervised/classification_pipeline.svg b/docs/getting_started/supervised/classification_pipeline.svg new file mode 100644 index 00000000..efd21fb8 --- /dev/null +++ b/docs/getting_started/supervised/classification_pipeline.svg @@ -0,0 +1,14 @@ + + + +SBERT +Logistic Regression +c-TF-IDF +Embeddings + +Classifier + + +Topic representation + + diff --git a/docs/getting_started/supervised/default_pipeline.svg b/docs/getting_started/supervised/default_pipeline.svg new file mode 100644 index 00000000..30759e66 --- /dev/null +++ b/docs/getting_started/supervised/default_pipeline.svg @@ -0,0 +1,18 @@ + + + +SBERT +UMAP +HDBSCAN +c-TF-IDF +Embeddings + +Dimensionality reduction + + +Clustering + + +Topic representation + + diff --git a/docs/getting_started/supervised/square.svg b/docs/getting_started/supervised/square.svg new file mode 100644 index 00000000..c2935526 --- /dev/null +++ b/docs/getting_started/supervised/square.svg @@ -0,0 +1,16 @@ + + + + + + + \ No newline at end of file diff --git a/docs/getting_started/supervised/supervised.md b/docs/getting_started/supervised/supervised.md index 703c1a66..8ed7ed3b 100644 --- a/docs/getting_started/supervised/supervised.md +++ b/docs/getting_started/supervised/supervised.md @@ -1,86 +1,120 @@ -In this tutorial, we will be looking at a new feature of BERTopic, namely (semi)-supervised topic modeling! -This allows us to steer the dimensionality reduction of the embeddings into a space that closely follows any labels you might already have. -In other words, we use a semi-supervised UMAP instance to reduce the dimensionality of embeddings before clustering the documents -with HDBSCAN. +Although topic modeling is typically done by discovering topics in an unsupervised manner, there might be times when you already have a bunch of clusters or classes from which you want to model the topics. For example, the often used [20 NewsGroups dataset](https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html) is already split up into 20 classes. Similarly, you might already have created some labels yourself through packages like [human-learn](https://github.com/koaning/human-learn), [bulk](https://github.com/koaning/bulk), [thisnotthat](https://github.com/TutteInstitute/thisnotthat) or something entirely different. + +Instead of using BERTopic to discover previously unknown topics, we are now going to manually pass them to BERTopic and try to learn the relationship between those topics and the input documents. + +> In other words, we are going to be performing classification instead! + +We can view this as a supervised topic modeling approach. Instead of using a clustering algorithm, we are going to be using a classification algorithm instead. + +Generally, we have the following pipeline: + +
+
+--8<-- "docs/getting_started/supervised/default_pipeline.svg" +
+
+ +Instead, we are now going to skip over the dimensionality reduction step and replace the clustering step with a classification model: + +
+
+--8<-- "docs/getting_started/supervised/classification_pipeline.svg" +
+
+ +In other words, we can pass our labels to BERTopic and it will not only learn how to predict labels for new instances, but it also transforms those labels into topics by running the c-TF-IDF representations on the set of documents within each label. This process allows us to model the topics themselves and similarly gives us the option to use everything BERTopic has to offer. + +To do so, we need to skip over the dimensionality reduction step and replace the clustering step with a classification algorithm. We can use the documents and labels from the 20 NewsGroups dataset to create topics from those 20 labels: -First, let us prepare the data needed for our topic model: ```python -from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups +# Get labeled data data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')) -docs = data["data"] -categories = data["target"] -category_names = data["target_names"] +docs = data['data'] +y = data['target'] ``` -We are using the popular 20 Newsgroups dataset which contains roughly 18000 newsgroups posts that each is -assigned to one of 20 categories. Using this dataset we can try to extract its corresponding topic model whilst -taking its underlying categories into account. These categories are here the variable `targets`. +Then, we make sure to create empty instances of the dimensionality reduction and clustering steps. We pass those to BERTopic to simply skip over them and go to the topic representation process: -Each document can be put into one of the following categories: ```python ->>> category_names - -['alt.atheism', - 'comp.graphics', - 'comp.os.ms-windows.misc', - 'comp.sys.ibm.pc.hardware', - 'comp.sys.mac.hardware', - 'comp.windows.x', - 'misc.forsale', - 'rec.autos', - 'rec.motorcycles', - 'rec.sport.baseball', - 'rec.sport.hockey', - 'sci.crypt', - 'sci.electronics', - 'sci.med', - 'sci.space', - 'soc.religion.christian', - 'talk.politics.guns', - 'talk.politics.mideast', - 'talk.politics.misc', - 'talk.religion.misc'] +from bertopic import BERTopic +from bertopic.vectorizers import ClassTfidfTransformer +from bertopic.dimensionality import BaseDimensionalityReduction +from sklearn.linear_model import LogisticRegression + +# Get labeled data +data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')) +docs = data['data'] +y = data['target'] + +# Skip over dimensionality reduction, replace cluster model with classifier, +# and reduce frequent words while we are at it. +empty_dimensionality_model = BaseDimensionalityReduction() +clf = LogisticRegression() +ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) + +# Create a fully supervised BERTopic instance +topic_model= BERTopic( + umap_model=empty_dimensionality_model, + hdbscan_model=clf, + ctfidf_model=ctfidf_model +) +topics, probs = topic_model.fit_transform(docs, y=y) ``` -## **Semi-supervised Topic Modeling** -In semi-supervised topic modeling, we only have some labels for our documents. The documents for which we do have labels -are used to somewhat guide BERTopic to the extraction of topics for those labels. The documents for which we do not have -labels are assigned a -1. 
For this example, imagine we only the labels of categories that are related to computers -and we want to create a topic model using semi-supervised modeling: -```python -labels_to_add = ['comp.graphics', 'comp.os.ms-windows.misc', - 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', - 'comp.windows.x',] -indices = [category_names.index(label) for label in labels_to_add] -y = [label if label in indices else -1 for label in categories] -``` +Let's take a look at a few topics that we get out of training this way by running `topic_model.get_topic_info()`: -The `y` variable contains many -1 values since we do not know all the categories. +
+
+--8<-- "docs/getting_started/supervised/table.svg" +
+
-Next, we use those newly constructed labels to again BERTopic semi-supervised: +We can see several interesting topics appearing here. They seem to relate to the 20 classes we had as input. Now, let's map those topics to our original classes to view their relationship: ```python -topic_model = BERTopic(verbose=True).fit(docs, y=y) +# Map input `y` to topics +mappings = topic_model.topic_mapper_.get_mappings() +mappings = {value: data["target_names"][key] for key, value in mappings.items()} + +# Assign original classes to our topics +df = topic_model.get_topic_info() +df["Class"] = df.Topic.map(mappings) +df ``` +
+--8<-- "docs/getting_started/supervised/table_classes.svg" +
-And that is it! By defining certain classes for our documents, we can steer the topic modeling towards modeling the -pre-defined categories. +
-## **Supervised Topic Modeling** -In supervised topic modeling, we have labels for all our documents. This can be pre-defined topics or simply documents -that you feel belong together regardless of their content. BERTopic will nudge the creation of topics towards these categories -using the pre-defined labels. - -To perform supervised topic modeling, we simply use all categories: +We can see that the c-TF-IDF representations extract the words that give a good representation of our input classes. This is all done directly from the labeling. A welcome side-effect is that we now have a classification algorithm that allows us to predict the topics of unseen data: ```python -topic_model = BERTopic(verbose=True).fit(docs, y=categories) +>>> topic, _ = topic_model.transform("this is a document about cars") +>>> topic_model.get_topic(topic) +[('car', 0.4407600315538472), + ('cars', 0.32348015696446325), + ('engine', 0.28032518444946686), + ('ford', 0.2500224508115155), + ('oil', 0.2325984913598611), + ('dealer', 0.2310723968585826), + ('my', 0.22045777551991935), + ('it', 0.21327993649430219), + ('tires', 0.20420842634292657), + ('brake', 0.20246902481367085)] ``` -The topic model will be much more attuned to the categories that were defined previously. However, this does not mean -that only topics for these categories will be found. BERTopic is likely to find more specific topics in those you -have already defined. This allows you to discover previously unknown topics! \ No newline at end of file + Moreover, we can still perform BERTopic-specific features like dynamic topic modeling, topics per class, hierarchical topic modeling, modeling topic distributions, etc. + +!!! note + The resulting `topics` may be a different mapping from the `y` labels. To map `y` to `topics`, we can run the following: + + + ```python + mappings = topic_model.topic_mapper_.get_mappings() + y_mapped = [mappings[val] for val in y] + ``` diff --git a/docs/getting_started/supervised/table.svg b/docs/getting_started/supervised/table.svg new file mode 100644 index 00000000..4f0b5ae0 --- /dev/null +++ b/docs/getting_started/supervised/table.svg @@ -0,0 +1,52 @@ + + + +Topic +Count +Name + +0 +0 +999 +0_game_hockey_team_25 + +1_god_church_jesus_christ +997 +1 +1 + +2 +2 +996 +2_bike_dod_ride_bikes + +3_baseball_game_he_year +994 +3 +3 + +4 +4 +991 +4_key_encryption_db_clipper + +5_car_cars_engine_ford +990 +5 +5 + +6 +6 +990 +6_medical_patients_cancer_disease + +7_window_server_widget_motif +988 +7 +7 + +8 +8 +988 +8_space_launch_nasa_orbit + diff --git a/docs/getting_started/supervised/table_classes.svg b/docs/getting_started/supervised/table_classes.svg new file mode 100644 index 00000000..856f12ef --- /dev/null +++ b/docs/getting_started/supervised/table_classes.svg @@ -0,0 +1,62 @@ + + + +Topic +Count +Name +Class + +0 +0 +999 +0_game_hockey_team_25 +rec.sport.hockey + +1_god_church_jesus_christ +997 +1 +1 + +2 +2 +996 +2_bike_dod_ride_bikes + +3_baseball_game_he_year +994 +3 +3 + +4 +4 +991 +4_key_encryption_db_clipper + +5_car_cars_engine_ford +990 +5 +5 + +6 +6 +990 +6_medical_patients_cancer_disease + +7_window_server_widget_motif +988 +7 +7 + +8 +8 +988 +8_space_launch_nasa_orbit +sci.space +comp.windows.x +sci.med +rec.autos +sci.crypt +rec.sport.baseball +rec.motorcycles +soc.religion.christian + diff --git a/docs/getting_started/tips_and_tricks/tips_and_tricks.md b/docs/getting_started/tips_and_tricks/tips_and_tricks.md index 91c1755f..11db646b 100644 --- a/docs/getting_started/tips_and_tricks/tips_and_tricks.md 
+++ b/docs/getting_started/tips_and_tricks/tips_and_tricks.md @@ -21,6 +21,17 @@ vectorizer_model = CountVectorizer(stop_words="english") topic_model = BERTopic(vectorizer_model=vectorizer_model) ``` +We can also use the `ClassTfidfTransformer` to reduce the impact of frequent words. The end result is very similar to explicitly removing stop words, but this process does it automatically: + +```python +from bertopic import BERTopic +from bertopic.vectorizers import ClassTfidfTransformer + +ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) +topic_model = BERTopic(ctfidf_model=ctfidf_model) +``` + + ## **Diversify topic representation** After having calculated our top *n* words per topic there might be many words that essentially mean the same thing. As a little bonus, we can use the `diversity` parameter in BERTopic to @@ -152,6 +163,53 @@ from cuml.preprocessing import normalize embeddings = normalize(embeddings) ``` +!!! note + As of the v0.13 release, it is not yet possible to calculate the topic-document probability matrix for unseen data (i.e., `.transform`) using cuML's HDBSCAN. + However, it is still possible to calculate the topic-document probability matrix for the data on which the model was trained (i.e., `.fit` and `.fit_transform`). + +!!! note + If you want to install cuML together with BERTopic using Google Colab, you can run the following code: + + ```bash + !pip install bertopic + !pip install cudf-cu11 dask-cudf-cu11 --extra-index-url=https://pypi.ngc.nvidia.com + !pip install cuml-cu11 --extra-index-url=https://pypi.ngc.nvidia.com + !pip install cugraph-cu11 --extra-index-url=https://pypi.ngc.nvidia.com + !pip uninstall cupy-cuda115 -y + !pip uninstall cupy-cuda11x -y + !pip install cupy-cuda11x -f https://pip.cupy.dev/aarch64 + ``` + + +## **Lightweight installation** + +The default embedding model in BERTopic is one of the amazing sentence-transformers models, namely `"all-MiniLM-L6-v2"`. Although this model performs well out of the box, it typically needs a GPU to transform the documents into embeddings in a reasonable time. Moreover, the installation requires `pytorch` which often results in a rather large environment, memory-wise. + +Fortunately, it is possible to install BERTopic without `sentence-transformers` and use it as a lightweight solution instead. The installation can be done as follows: + +```bash +pip install --no-deps bertopic +pip install --upgrade numpy hdbscan umap-learn pandas scikit-learn tqdm plotly pyyaml +``` + +Then, we can use BERTopic without `sentence-transformers` by using a CPU-based embedding technique as follows: + +```python +from sklearn.pipeline import make_pipeline +from sklearn.decomposition import TruncatedSVD +from sklearn.feature_extraction.text import TfidfVectorizer + +pipe = make_pipeline( + TfidfVectorizer(), + TruncatedSVD(100) +) + +topic_model = BERTopic(embedding_model=pipe) +``` + +As a result, the entire package and resulting model can be run quickly on the CPU and no GPU is necessary! + + ## **Finding similar topics between models** Whenever you have trained seperate BERTopic models on different datasets, it might diff --git a/docs/getting_started/topicreduction/topicreduction.md index b42249c3..d9762d37 100644 --- a/docs/getting_started/topicreduction/topicreduction.md +++ b/docs/getting_started/topicreduction/topicreduction.md @@ -4,8 +4,7 @@ Instead, we can try to reduce the number of topics that have been created. Below so. 
### **Manual Topic Reduction** -Each resulting topic has its own -feature vector constructed from c-TF-IDF. Using those feature vectors, we can find the most similar +Each resulting topic has its feature vector constructed from c-TF-IDF. Using those feature vectors, we can find the most similar topics and merge them. If we do this iteratively, starting from the least frequent topic, we can reduce the number of topics quite easily. We do this until we reach the value of `nr_topics`: ```python @@ -44,9 +43,7 @@ topic_model = BERTopic(nr_topics="auto") ``` ### **Topic Reduction after Training** -Finally, we can also reduce the number of topics after having trained a BERTopic model. The advantage of doing so, -is that you can decide the number of topics after knowing how many are created. It is difficult to -predict before training your model how many topics that are in your documents and how many will be extracted. +Finally, we can also reduce the number of topics after having trained a BERTopic model. The advantage of doing so is that you can decide the number of topics after knowing how many are created. It is difficult to predict before training your model how many topics are in your documents and how many will be extracted. Instead, we can decide afterward how many topics seem realistic: ```python diff --git a/docs/getting_started/topicrepresentation/topicrepresentation.md index 91f3f630..ce515aa9 100644 --- a/docs/getting_started/topicrepresentation/topicrepresentation.md +++ b/docs/getting_started/topicrepresentation/topicrepresentation.md @@ -34,7 +34,7 @@ From the model created above, one of the most frequent topics is the following: ('amanda intercon com', 0.002585262048515583)] ``` -Although there does seem to be some relation between words, it is difficult, at least for me, to intuitively understand +Although there does seem to be some relation between words, it is difficult, at least for me, to intuitively understand what the topic is about. Instead, let's simplify the topic representation by setting `n_gram_range` to (1, 3) to also allow for single words. @@ -64,14 +64,14 @@ topic_model.update_topics(docs, vectorizer_model=vectorizer_model) ``` !!! Tip "Tip!" - If you want to change the topics to something else, whether that is merging them or removing outliers, you can pass in - a custom list of topics in order to update them: `topic_model.update_topics(docs, topics=my_updated_topics)` + If you want to change the topics to something else, whether that is merging them or removing outliers, you can pass + a custom list of topics to update them: `topic_model.update_topics(docs, topics=my_updated_topics)` ### **Custom labels** The topic labels are currently automatically generated by taking the top 3 words and combining them using the `_` separator. Although this is an informative label, in practice, this is definitely not the prettiest nor necessarily the most accurate label. For example, although the topic label -`1_space_nasa_orbit` is informative, we would prefer to have a bit more intuitive label, such as +`1_space_nasa_orbit` is informative, we would prefer to have a slightly more intuitive label, such as `space travel`. The difficulty with creating such topic labels is that much of the interpretation is left to the user. Would `space travel` be more accurate or perhaps `space explorations`? To truly understand which labels are most suited, going into some of the documents in topics is especially helpful. 
Although we can go through every single topic ourselves and try to label them, we can start by creating an overview of labels that have the length and number of words that we are looking for. To do so, we can generate our list of topic labels with `.get_topic_labels` and define the number of words, the separator, word length, etc: diff --git a/docs/getting_started/topicsovertime/topicsovertime.md b/docs/getting_started/topicsovertime/topicsovertime.md index 3d19e1a9..ee9afa35 100644 --- a/docs/getting_started/topicsovertime/topicsovertime.md +++ b/docs/getting_started/topicsovertime/topicsovertime.md @@ -5,21 +5,20 @@ topic itself remains the same, environmental awareness, the exact representation BERTopic allows for DTM by calculating the topic representation at each timestep without the need to run the entire model several times. To do this, we first need to fit BERTopic as if there were no temporal -aspect in the data. Thus, a general topic model will be created. We use the global representation as to the main -topics that can be found at, most likely, different timesteps. For each topic and timestep, we calculate the c-TF-IDF -representation. This will result in a specific topic representation at each timestep without the need to create -clusters from embeddings as they were already created. +aspect in the data. Thus, a general topic model will be created. We use the global representation as to the main topics that can be found at, most likely, different timesteps. For each topic and timestep, we calculate the c-TF-IDF representation. This will result in a specific topic representation at each timestep without the need to create clusters from embeddings as they were already created. + +
+
+--8<-- "docs/getting_started/topicsovertime/topicsovertime.svg" +
+
Next, there are two main ways to further fine-tune these specific topic representations, namely **globally** and **evolutionary**. -A topic representation at timestep *t* can fine-tuned **globally** by averaging its c-TF-IDF representation with -that of the global representation. This allows each topic representation to move slightly towards the global -representation whilst still keeping some its specific words. +A topic representation at timestep *t* can be fine-tuned **globally** by averaging its c-TF-IDF representation with that of the global representation. This allows each topic representation to move slightly towards the global representation whilst still keeping some of its specific words. -A topic representation at timestep *t* can be fine-tuned **evolutionary** by averaging its c-TF-IDF representation -with that of the c-TF-IDF representation at timestep *t-1*. This is done for each topic representation allowing for -the representations to evolve over time. +A topic representation at timestep *t* can be fine-tuned **evolutionary** by averaging its c-TF-IDF representation with that of the c-TF-IDF representation at timestep *t-1*. This is done for each topic representation allowing for the representations to evolve over time. Both fine-tuning methods are set to `True` as a default and allow for interesting representations to be created. @@ -29,7 +28,7 @@ modeling on Twitter data. We can analyze how certain people have talked about ce they have been on Twitter. Due to the controversial nature of his tweets, we are going to be using all tweets by Donald Trump. -First, we need to load in the data and do some very basic cleaning. For example, I am not interested in his +First, we need to load the data and do some very basic cleaning. For example, I am not interested in his re-tweets for this use-case: ```python @@ -56,7 +55,7 @@ topics, probs = topic_model.fit_transform(tweets) ``` From these topics, we are going to generate the topic representations at each timestamp for each topic. We do this -by simply calling `topics_over_time` and pass in his tweets, the corresponding timestamps, and the related topics: +by simply calling `topics_over_time` and passing the tweets, the corresponding timestamps, and the related topics: ```python topics_over_time = topic_model.topics_over_time(tweets, timestamps, nr_bins=20) @@ -99,7 +98,7 @@ topics_over_time = topic_model.topics_over_time(tweets, timestamps, datetime_for ## **Visualization** To me, DTM becomes truly interesting when you have a good way of visualizing how topics have changed over time. -A nice way of doing so is leveraging the interactive abilities of Plotly. Plotly allows us to show the frequency +A nice way of doing so is by leveraging the interactive abilities of Plotly. Plotly allows us to show the frequency of topics over time whilst giving the option of hovering over the points to show the time-specific topic representations. 
Simply call `visualize_topics_over_time` with the newly created topics over time: diff --git a/docs/getting_started/topicsovertime/topicsovertime.svg b/docs/getting_started/topicsovertime/topicsovertime.svg new file mode 100644 index 00000000..4c9d21e6 --- /dev/null +++ b/docs/getting_started/topicsovertime/topicsovertime.svg @@ -0,0 +1,115 @@ + + + 1 +Topic + +Timestep +1 + + m +Timestep + +Timestep +1 + +Timestep +m + + n +Topic + + + + + + + + + + + + + + + + + + + + + + + + + + + + +c-TF-IDF +c-TF-IDF +c-TF-IDF +c-TF-IDF + + + + + + + + + + + + + + + + + + + + + + + + + + +topic c-TF-IDF + + + + + +c-TF-IDF at t ++ +2 + + + + + + +c-TF-IDF at t + + + + + +c-TF-IDF at t-1 ++ +2 + + +Global Tuning +Split documents by topic +Split documents by topic and timestep +Apply pre-fitted c-TF-IDF on each subset of documents. +Tune the c-TF-IDF at each timestep t by either averaging the representations with the global representation or with the representation at t-1. +Evolutionary Tuning + + + + +Optional tuning of representations + diff --git a/docs/getting_started/topicsperclass/class_modeling.svg b/docs/getting_started/topicsperclass/class_modeling.svg new file mode 100644 index 00000000..999b428e --- /dev/null +++ b/docs/getting_started/topicsperclass/class_modeling.svg @@ -0,0 +1,76 @@ + + + 1 +Topic + + 1 +Class + + m +Class + + 1 +Class + + m +Class + + n +Topic + + + + + + + + + + + + + + + + + + + + + + + + + + + + +c-TF-IDF +c-TF-IDF +c-TF-IDF +c-TF-IDF + + + + + + + + + + + + + + + + + + + + + +Split documents by topic +Split documents by topic and class +Apply pre-fitted c-TF-IDF on each subset of documents. + diff --git a/docs/getting_started/topicsperclass/topicsperclass.md b/docs/getting_started/topicsperclass/topicsperclass.md index 024ea9de..0b1d41b8 100644 --- a/docs/getting_started/topicsperclass/topicsperclass.md +++ b/docs/getting_started/topicsperclass/topicsperclass.md @@ -1,12 +1,16 @@ In some cases, you might be interested in how certain topics are represented over certain categories. Perhaps there are specific groups of users for which you want to see how they talk about certain topics. -Instead of running the topic model per class, we can simply create a topic model and then extract, for each topic, -its representation per class. This allows you to see how certain topics, calculated over all documents, are represented -for certain subgroups. +Instead of running the topic model per class, we can simply create a topic model and then extract, for each topic, its representation per class. This allows you to see how certain topics, calculated over all documents, are represented for certain subgroups. -To do so, we use the 20 Newsgroups dataset to see how the topics that we uncover are represented in the 20 categories of -documents. +
+
+--8<-- "docs/getting_started/topicsperclass/class_modeling.svg" +
+
+ + +To do so, we use the 20 Newsgroups dataset to see how the topics that we uncover are represented in the 20 categories of documents. First, let's prepare the data: diff --git a/docs/getting_started/vectorizers/vectorizers.md b/docs/getting_started/vectorizers/vectorizers.md index f7191e1b..f79abcf2 100644 --- a/docs/getting_started/vectorizers/vectorizers.md +++ b/docs/getting_started/vectorizers/vectorizers.md @@ -1,14 +1,20 @@ -# Vectorizers +In topic modeling, the quality of the topic representations is key for interpreting the topics, communicating results, and understanding patterns. It is of utmost +importance to make sure that the topic representations fit with your use case. -In topic modeling, the quality of the topic representations are key for interpreting the topics, communicating results, and understanding patterns. It is of utmost -importance to make sure that the topic representations fits with your use case. - -In practice, there is not one correct way of creating topic representations. Some use cases might opt more higher n-grams, whereas others might focus more on single +In practice, there is not one correct way of creating topic representations. Some use cases might opt for higher n-grams, whereas others might focus more on single words without any stop words. The diversity in use cases also means that we need to have some flexibility in BERTopic to make sure it can be used across most use cases. +The image below illustrates this modularity: + +
+ ![Image title](vectorizers.svg) +
+
+ +In this section, we will go through several examples of vectorization algorithms and how they can be implemented. ## **CountVectorizer** -One often understimated component of BERTopic is the `CountVectorizer` and `c-TF-IDF` calculation. Together, they are responsible for creating the topic representations and luckily +One often underestimated component of BERTopic is the `CountVectorizer` and `c-TF-IDF` calculation. Together, they are responsible for creating the topic representations and luckily can be quite flexible in parameter tuning. Here, we will go through tips and tricks for tuning your `CountVectorizer` and see how they might affect the topic representations. Before starting, it should be noted that you can pass the `CountVectorizer` before and after training your topic model. Passing it before training allows you to @@ -49,7 +55,7 @@ on fine-tuning our topic representations after training our model. ### **Basic Usage** -First, let's start with defining our documents and train our topic model: +First, let's start with defining our documents and training our topic model: ```python from bertopic import BERTopic @@ -86,7 +92,7 @@ them to the topic representation above. ### **Parameters** -There are a number of basic parameters in the CountVectorizer that we can use to improve upon the quality of the resulting topic representations. +There are several basic parameters in the CountVectorizer that we can use to improve upon the quality of the resulting topic representations. #### ngram_range @@ -140,7 +146,7 @@ Although they look very similar, if we zoom in on topic 8, we can see longer wor #### stop_words In some of the topics, we can see stop words appearing like `he` or `the`. -Stop words is something we typically want to prevent in our topic representations as they do not give additional information to the topic. +Stop words are something we typically want to prevent in our topic representations as they do not give additional information to the topic. To prevent those stop words, we can use the `stop_words` parameter in the `CountVectorizer` to remove them from the representations: ```python @@ -174,11 +180,11 @@ We can also pass in a list of stop words if you have multiple languages to take One important parameter to keep in mind is the `min_df`. This is typically an integer representing how frequent a word must be before being added to our representation. You can imagine that if we have a million documents and a certain word only appears a single time across all of them, then -it would be highly unlikely to be representive of a topic. Typically, the `c-TF-IDF` calculation removes that word from the topic representation but when -you have millions of documents, that will also lead to very large topic-term matrix. To prevent a huge vocabulary, we can set the `min_df` to only accept +it would be highly unlikely to be representative of a topic. Typically, the `c-TF-IDF` calculation removes that word from the topic representation but when +you have millions of documents, that will also lead to a very large topic-term matrix. To prevent a huge vocabulary, we can set the `min_df` to only accept words that have a minimum frequency. 
-When you have millions of documents, or error issues, I would advise increasing the value of `min_df` as long as the topic representations might sense: +When you have millions of documents or run into errors, I would advise increasing the value of `min_df` as long as the topic representations still make sense: ```python from sklearn.feature_extraction.text import CountVectorizer @@ -203,14 +209,14 @@ With the following topic representation: 10 9 174 9_audio_condition_stereo_asking ``` -As you can see, the output is nearly the same which is actually what we would like to achieve. All words that appear less than 10 times are now removed +As you can see, the output is nearly the same which is what we would like to achieve. All words that appear less than 10 times are now removed from our topic-term matrix (i.e., `c-TF-IDF` matrix) which drastically lowers the matrix in size. #### max_features A parameter similar to `min_df` is `max_features` which allows you to select the top n most frequent words to be used in the topic representation. -Setting this to, for example, `10_000` creates a topic-term matrix with `10_000` terms. This helps you control the size of the topic-term matrix +Setting this, for example, to `10_000` creates a topic-term matrix with `10_000` terms. This helps you control the size of the topic-term matrix directly without having to fiddle around with the `min_df` parameter: ```python @@ -255,7 +261,7 @@ def tokenize_zh(text): vectorizer = CountVectorizer(tokenizer=tokenize_zh) ``` -Then, we can simply pass the vectorizer update our topic representations: +Then, we can simply pass the vectorizer to update our topic representations: ```python topic_model.update_topics(docs, vectorizer_model=vectorizer_model) @@ -264,7 +270,7 @@ topic_model.update_topics(docs, vectorizer_model=vectorizer_model) ## **OnlineCountVectorizer** -When using the online/incremental variant of BERTopic, we need a `CountVectorizer` than can incrementally update its representation. For that purpose, `OnlineCountVectorizer` was created that not only updates out-of-vocabulary words but also implements decay and cleaning functions to prevent the sparse bag-of-words matrix to become too large in size. It is a class that can be found in `bertopic.vectorizers` which extends `sklearn.feature_extraction.text.CountVectorizer`. In other words, you can use the exact same parameter in `OnlineCountVectorizer` as found in Scikit-Learn's `CountVectorizer`. We can use it as follows: +When using the online/incremental variant of BERTopic, we need a `CountVectorizer` that can incrementally update its representation. For that purpose, `OnlineCountVectorizer` was created that not only updates out-of-vocabulary words but also implements decay and cleaning functions to prevent the sparse bag-of-words matrix from becoming too large. It is a class that can be found in `bertopic.vectorizers` which extends `sklearn.feature_extraction.text.CountVectorizer`. In other words, you can use the exact same parameters in `OnlineCountVectorizer` as found in Scikit-Learn's `CountVectorizer`. We can use it as follows: ```python from bertopic import BERTopic @@ -281,13 +287,13 @@ Other than parameters found in `CountVectorizer`, such as `stop_words` and `ngr #### decay -At each iteration, we sum the bag-of-words representation of the new documents with the bag-of-words representation of all documents processed thus far. In other words, the bag-of-words matrix keeps increasing with each iteration. 
However, especially in a streaming setting, older documents might become less and less relevant as time goes on. Therefore, a `decay` parameter was implemented that decays the bag-of-words's frequencies at each iteration before adding the document frequencies of new documents. The `decay` parameter is a value between 0 and 1 and indicates the percentage of frequencies the previous bag-of-words matrix should be reduced to. For example, a value of `.1` will decrease the frequencies in the bag-of-words matrix with 10% at each iteration before adding the new bag-of-words matrix. This will make sure that recent data has more weight than previously iterations. +At each iteration, we sum the bag-of-words representation of the new documents with the bag-of-words representation of all documents processed thus far. In other words, the bag-of-words matrix keeps increasing with each iteration. However, especially in a streaming setting, older documents might become less and less relevant as time goes on. Therefore, a `decay` parameter was implemented that decays the bag-of-words' frequencies at each iteration before adding the document frequencies of new documents. The `decay` parameter is a value between 0 and 1 and indicates the percentage of frequencies the previous bag-of-words matrix should be reduced to. For example, a value of `.1` will decrease the frequencies in the bag-of-words matrix by 10% at each iteration before adding the new bag-of-words matrix. This will make sure that recent data has more weight than previous iterations. #### delete_min_df -In BERTopic, we might want to remove words from the topic representation that ppear infrequently. The `min_df` in the `CountVectorizer` works quite well for that. However, when have a streaming setting, the `min_df` does not work as well since a word's frequency might start below `min_df` but will end up higher than that over time. Setting that value high might not always be advised. +In BERTopic, we might want to remove words from the topic representation that appear infrequently. The `min_df` in the `CountVectorizer` works quite well for that. However, when we have a streaming setting, the `min_df` does not work as well since a word's frequency might start below `min_df` but will end up higher than that over time. Setting that value high might not always be advised. -As a result, the vocabulary of the resulting bag-of-words matrix can become quite large. Similarly, if we implement the `decay` parameter, then some values will actually decrease over time until they are below `min_df`. For these reasons, the `delete_min_df` parameter was implemented. The parameter takes positive integers and indicates, at each iteration, which words will be removed. If the value is set to 5, it will check after each iteration if the total frequency of a word is exceed by that value. If so, the word will be removed in its entirety from the bag-of-words matrix. This helps to keep the bag-of-words matrix of a manageble size. +As a result, the vocabulary of the resulting bag-of-words matrix can become quite large. Similarly, if we implement the `decay` parameter, then some values will decrease over time until they are below `min_df`. For these reasons, the `delete_min_df` parameter was implemented. The parameter takes positive integers and indicates, at each iteration, which words will be removed. If the value is set to 5, it will check after each iteration if the total frequency of a word is exceeded by that value. 
If so, the word will be removed in its entirety from the bag-of-words matrix. This helps to keep the bag-of-words matrix at a manageable size.
 
!!! note
-    Although the `delete_min_df` parameter removes words from the bag-of-words matrix, it is not permament. If new documents come in where those previously deleted words are used frequently, they get added back to the matrix.
\ No newline at end of file
+    Although the `delete_min_df` parameter removes words from the bag-of-words matrix, it is not permanent. If new documents come in where those previously deleted words are used frequently, they get added back to the matrix.
\ No newline at end of file
diff --git a/docs/getting_started/vectorizers/vectorizers.svg b/docs/getting_started/vectorizers/vectorizers.svg
new file mode 100644
index 00000000..be3ee4ab
--- /dev/null
+++ b/docs/getting_started/vectorizers/vectorizers.svg
@@ -0,0 +1,47 @@
[SVG figure added (47 lines of markup omitted): pipeline diagram with the components SBERT, UMAP, HDBSCAN, CountVectorizer, Jieba, c-TF-IDF, and POS]
diff --git a/docs/getting_started/visualization/visualization.md b/docs/getting_started/visualization/visualization.md
index c3efebe0..d1add600 100644
--- a/docs/getting_started/visualization/visualization.md
+++ b/docs/getting_started/visualization/visualization.md
@@ -1,10 +1,10 @@
-Visualizing BERTopic and its derivatives is important in understanding the model, how it works, but more importantly, where it works.
+Visualizing BERTopic and its derivatives is important in understanding the model, how it works, and more importantly, where it works.
Since topic modeling can be quite a subjective field it is difficult for users to validate their models. Looking at the topics and seeing if they make sense is an important factor in alleviating this issue.
 
## **Visualize Topics**
After having trained our `BERTopic` model, we can iteratively go through hundreds of topics to get a good
-understanding of the topics that were extract. However, that takes quite some time and lacks a global representation.
+understanding of the topics that were extracted. However, that takes quite some time and lacks a global representation.
Instead, we can visualize the topics that were generated in a way very similar to
[LDAvis](https://github.com/cpsievert/LDAvis).
 
@@ -22,7 +22,7 @@ topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs)
```
 
-Then, we can use call `.visualize_topics` to create a 2D representation of your topics. The resulting graph is a
+Then, we can call `.visualize_topics` to create a 2D representation of your topics. The resulting graph is a
plotly interactive graph which can be converted to HTML:
 
```python
@@ -72,10 +72,18 @@ topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)
as saving all those documents in the visualization can be quite expensive and result in large files.
However, it might be interesting to set `hide_document_hover=False` in order to hover
over the points and see the content of the documents.
 
+### **Custom Hover**
+
+When you visualize the documents, you might not always want to see the complete document on hover. Many documents come with shorter information, such as a title, that might be more interesting to visualize. To create the hover based on a document's title instead of its content, you can simply pass a variable (`titles`) containing the title for each document:
+
+```python
+topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings)
+```
+
## **Visualize Topic Hierarchy**
The topics that were created can be hierarchically reduced. In order to understand the potential hierarchical
structure of the topics, we can use `scipy.cluster.hierarchy` to create clusters and visualize how
-they relate to one another. This might help selecting an appropriate `nr_topics` when reducing the number
+they relate to one another. This might help to select an appropriate `nr_topics` when reducing the number
of topics that you have created. To visualize this hierarchy, run the following:
 
```python
@@ -117,7 +125,7 @@ topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
 
If you **hover** over the black circles, you will see the topic representation at that level of the hierarchy. These representations
-help you understand the effect of merging certain topics together. Some might be logical to merge whilst others might not. Moreover,
+help you understand the effect of merging certain topics. Some might be logical to merge whilst others might not. Moreover,
we can now see which sub-topics can be found within certain larger themes.
 
### **Text-based topic tree**
@@ -397,7 +405,7 @@ to view, we can see better which topics could be logically merged:
 
## **Visualize Hierarchical Documents**
We can extend the previous method by calculating the topic representation at different levels of the hierarchy and
-plotting them on a 2D-plane. To do so, we first need to calculate the hierarchical topics:
+plotting them on a 2D plane. To do so, we first need to calculate the hierarchical topics:
 
```python
from sklearn.datasets import fetch_20newsgroups
@@ -431,7 +439,7 @@ topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_
!!! note
    The visualization above was generated with the additional parameter `hide_document_hover=True` which disables the option to hover over the individual points and see the content of the documents. This makes the resulting visualization
-    smaller and fit into your RAM. However, it might be interesting to set `hide_document_hover=False` in order to hover
+    smaller and fit into your RAM. However, it might be interesting to set `hide_document_hover=False` to hover
    over the points and see the content of the documents.
 
## **Visualize Terms**
@@ -567,29 +575,57 @@ topic_model.visualize_topics_per_class(topics_per_class)
 
-## **Visualize Probablities**
-We can also calculate the probabilities of topics found in a document. In order to do so, we have to
-set `calculate_probabilities` to True as calculating them can be quite computationally expensive.
-Then, we use the variable `probabilities` that is returned from `transform()` or `fit_transform()`
-to understand how confident BERTopic is that certain topics can be found in a document:
+## **Visualize Probabilities or Distribution**
+
+We can generate the topic-document probability matrix by simply setting `calculate_probabilities=True` if an HDBSCAN model is used:
 
```python
from bertopic import BERTopic
-from sklearn.datasets import fetch_20newsgroups
-
-docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']
 
topic_model = BERTopic(calculate_probabilities=True)
-topics, probabilities = topic_model.fit_transform(docs)
+topics, probs = topic_model.fit_transform(docs)
+```
+
+The resulting `probs` variable contains the soft-clustering as done through HDBSCAN.
+
+If a non-HDBSCAN model is used, we can estimate the topic distributions after training our model:
+
+```python
+from bertopic import BERTopic
+
+topic_model = BERTopic()
+topics, _ = topic_model.fit_transform(docs)
+topic_distr, _ = topic_model.approximate_distribution(docs, min_similarity=0)
```
 
-To visualize the distributions, run the following:
+Then, we pass either the `probs` or the `topic_distr` variable to `.visualize_distribution` to visualize either the probability distributions or the topic distributions:
 
```python
-topic_model.visualize_distribution(probabilities[0])
+# To visualize the probabilities of topic assignment
+topic_model.visualize_distribution(probs[0])
+
+# To visualize the topic distributions in a document
+topic_model.visualize_distribution(topic_distr[0])
```
 
+Although a topic distribution is nice, we may want to see how each token contributes to a specific topic. To do so, we need to first calculate topic distributions on a token level and then visualize the results:
+
+```python
+# Calculate the topic distributions on a token-level
+topic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True)
+
+# Visualize the token-level distributions
+df = topic_model.visualize_approximate_distribution(docs[1], topic_token_distr[1])
+df
+```
+
+
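To make the distributions above a bit more concrete, the sketch below ranks the strongest topics for a single document directly from the `topic_distr` matrix computed earlier. It assumes, as in the snippets above, that `topic_distr` has one column per non-outlier topic in ascending topic order:

```python
import numpy as np

# Rank topics for one document by their approximated distribution weight
doc_id = 1
top_columns = np.argsort(topic_distr[doc_id])[::-1][:3]

for column in top_columns:
    topic_id = int(column)
    top_words = [word for word, _ in topic_model.get_topic(topic_id)][:5]
    print(topic_id, round(float(topic_distr[doc_id, column]), 3), top_words)
```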

+ +

+ +!!! note + To get the stylized dataframe for `.visualize_approximate_distribution` you will need to have Jinja installed. If you do not have this installed, an unstylized dataframe will be returned instead. You can install Jinja via `pip install jinja2` !!! note The distribution of the probabilities does not give an indication to diff --git a/docs/index.md b/docs/index.md index 739cddcf..b96ff873 100644 --- a/docs/index.md +++ b/docs/index.md @@ -12,11 +12,16 @@ allowing for easily interpretable topics whilst keeping important words in the t BERTopic supports [**guided**](https://maartengr.github.io/BERTopic/getting_started/guided/guided.html), -(semi-) [**supervised**](https://maartengr.github.io/BERTopic/getting_started/supervised/supervised.html), -and [**dynamic**](https://maartengr.github.io/BERTopic/getting_started/topicsovertime/topicsovertime.html) topic modeling. It even supports visualizations similar to LDAvis! +[**supervised**](https://maartengr.github.io/BERTopic/getting_started/supervised/supervised.html), +[**semi-supervised**](https://maartengr.github.io/BERTopic/getting_started/semisupervised/semisupervised.html), +[**manual**](https://maartengr.github.io/BERTopic/getting_started/manual/manual.html), +[**long-document**](https://maartengr.github.io/BERTopic/getting_started/distribution/distribution.html), +[**hierarchical**](https://maartengr.github.io/BERTopic/getting_started/hierarchicaltopics/hierarchicaltopics.html), +[**class-based**](https://maartengr.github.io/BERTopic/getting_started/topicsperclass/topicsperclass.html), +[**dynamic**](https://maartengr.github.io/BERTopic/getting_started/topicsovertime/topicsovertime.html), and +[**online**](https://maartengr.github.io/BERTopic/getting_started/online/online.html) topic modeling. It even supports visualizations similar to LDAvis! -Corresponding medium posts can be found [here](https://towardsdatascience.com/topic-modeling-with-bert-779f7db187e6?source=friends_link&sk=0b5a470c006d1842ad4c8a3057063a99) -and [here](https://towardsdatascience.com/interactive-topic-modeling-with-bertopic-1ea55e7d73d8?sk=03c2168e9e74b6bda2a1f3ed953427e4). +Corresponding medium posts can be found [here](https://towardsdatascience.com/topic-modeling-with-bert-779f7db187e6?source=friends_link&sk=0b5a470c006d1842ad4c8a3057063a99), [here](https://towardsdatascience.com/interactive-topic-modeling-with-bertopic-1ea55e7d73d8?sk=03c2168e9e74b6bda2a1f3ed953427e4) and [here](https://towardsdatascience.com/using-whisper-and-bertopic-to-model-kurzgesagts-videos-7d8a63139bdf?sk=b1e0fd46f70cb15e8422b4794a81161d). For a more detailed overview, you can read the [paper](https://arxiv.org/abs/2203.05794) or see a [brief overview](https://maartengr.github.io/BERTopic/algorithm/algorithm.html). ## **Installation** @@ -78,13 +83,35 @@ frequent topic that was generated, topic 0: ('software', 0.0034415334250699077), ('email', 0.0034239554442333257), ('pc', 0.003047105930670237)] -``` +``` + +Using `.get_document_info`, we can also extract information on a document level, such as their corresponding topics, probabilities, whether they are representative documents for a topic, etc.: + +```python +>>> topic_model.get_document_info(docs) + +Document Topic Name Top_n_words Probability ... +I am sure some bashers of Pens... 0 0_game_team_games_season game - team - games... 0.200010 ... +My brother is in the market for... -1 -1_can_your_will_any can - your - will... 0.420668 ... +Finally you said what you dream... -1 -1_can_your_will_any can - your - will... 
0.807259 ...
+Think! It is the SCSI card doing...	49	49_windows_drive_dos_file	windows - drive - docs...	0.071746 ...
+1) I have an old Jasmine drive...	49	49_windows_drive_dos_file	windows - drive - docs...	0.038983 ...
+```
 
 **NOTE**: Use `BERTopic(language="multilingual")` to select a model that supports 50+ languages.
 
+## **Modularity**
+
+By default, the main steps for topic modeling with BERTopic are sentence-transformers, UMAP, HDBSCAN, and c-TF-IDF run in sequence. However, it assumes some independence between these steps which makes BERTopic quite modular. In other words, BERTopic not only allows you to build your own topic model but to explore several topic modeling techniques on top of your customized topic model:
+
+
+
+You can swap out any of these models or even remove them entirely. Starting with the embedding step, you can find out how to do this [here](https://maartengr.github.io/BERTopic/getting_started/embeddings/embeddings.html) and more about the underlying algorithm and assumptions [here](https://maartengr.github.io/BERTopic/algorithm/algorithm.html).
+
 ## **Overview**
-BERTopic has quite a number of functions that quickly can become overwhelming. To alleviate this issue, you will find an overview
+BERTopic has many functions that can quickly become overwhelming. To alleviate this issue, you will find an overview
 of all methods and a short description of its purpose.
 
 ### Common
@@ -99,12 +126,14 @@ Below, you will find an overview of common functions in BERTopic.
 | Access all topics              |   `.get_topics()`  |
 | Get topic freq                 |   `.get_topic_freq()`  |
 | Get all topic information| `.get_topic_info()` |
+| Get all document information| `.get_document_info(docs)` |
 | Get representative docs per topic |  `.get_representative_docs()` |
 | Update topic representation | `.update_topics(docs, n_gram_range=(1, 3))` |
 | Generate topic labels | `.generate_topic_labels()` |
 | Set topic labels | `.set_topic_labels(my_custom_labels)` |
 | Merge topics | `.merge_topics(docs, topics_to_merge)` |
 | Reduce nr of topics | `.reduce_topics(docs, nr_topics=30)` |
+| Reduce outliers | `.reduce_outliers(docs, topics)` |
 | Find topics | `.find_topics("vehicle")` |
 | Save model | `.save("my_model")` |
 | Load model | `BERTopic.load("my_model")` |
@@ -112,7 +141,7 @@
 ### Attributes
 
-After having trained your BERTopic model, a number of attributes are saved within your model. These attributes, in part,
+After having trained your BERTopic model, several attributes are saved within your model. These attributes, in part,
 refer to how model information is stored on an estimator during fitting. The attributes that you see below all end in `_` and are
 public attributes that can be used to access model information.
 
@@ -131,16 +160,19 @@ public attributes that can be used to access model information.
 
 ### Variations
 
-There are many different use cases in which topic modeling can be used. As such, a number of
-variations of BERTopic have been developed such that one package can be used across across many use cases.
+There are many different use cases in which topic modeling can be used. As such, several variations of BERTopic have been developed such that one package can be used across many use cases.
| Method | Code | |-----------------------|---| -| (semi-) Supervised Topic Modeling | `.fit(docs, y=y)` | -| Topic Modeling per Class | `.topics_per_class(docs, classes)` | -| Dynamic Topic Modeling | `.topics_over_time(docs, timestamps)` | -| Hierarchical Topic Modeling | `.hierarchical_topics(docs)` | -| Guided Topic Modeling | `BERTopic(seed_topic_list=seed_topic_list)` | +| [Topic Distribution Approximation](https://maartengr.github.io/BERTopic/getting_started/distribution/distribution.html) | `.approximate_distribution(docs)` | +| [Online Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/online/online.html) | `.partial_fit(doc)` | +| [Semi-supervised Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/semisupervised/semisupervised.html) | `.fit(docs, y=y)` | +| [Supervised Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/supervised/supervised.html) | `.fit(docs, y=y)` | +| [Manual Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/manual/manual.html) | `.fit(docs, y=y)` | +| [Topic Modeling per Class](https://maartengr.github.io/BERTopic/getting_started/topicsperclass/topicsperclass.html) | `.topics_per_class(docs, classes)` | +| [Dynamic Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/topicsovertime/topicsovertime.html) | `.topics_over_time(docs, timestamps)` | +| [Hierarchical Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/hierarchicaltopics/hierarchicaltopics.html) | `.hierarchical_topics(docs)` | +| [Guided Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/guided/guided.html) | `BERTopic(seed_topic_list=seed_topic_list)` | ### Visualizations Evaluating topic models can be rather difficult due to the somewhat subjective nature of evaluation. 
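To illustrate the modularity described in the `docs/index.md` changes above, here is a minimal sketch of swapping out sub-models. It uses the same `umap_model`/`hdbscan_model` arguments that the test fixtures later in this diff rely on; PCA and k-means stand in for UMAP and HDBSCAN purely as an example, and `docs` is assumed to be a list of documents as in the quickstart:

```python
from bertopic import BERTopic
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Swap the default dimensionality reduction and clustering steps;
# the tokenizer and c-TF-IDF weighting stay unchanged
dim_model = PCA(n_components=5)
cluster_model = KMeans(n_clusters=20)

topic_model = BERTopic(umap_model=dim_model, hdbscan_model=cluster_model)
topics, _ = topic_model.fit_transform(docs)
```

Because k-means assigns every document to a cluster, no `-1` outlier topic is produced in this setup.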
diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css
index e0eabae5..fcfddce2 100644
--- a/docs/stylesheets/extra.css
+++ b/docs/stylesheets/extra.css
@@ -6,4 +6,21 @@
 :root>* {
   --md-typeset-a-color: #016198;
   --md-text-link-color: #000000;
-}
\ No newline at end of file
+}
+
+body[data-md-color-primary="black"] .svg_image svg {
+    filter: invert(100%) hue-rotate(180deg);
+}
+
+body[data-md-color-primary="black"] .svg_image svg rect {
+    fill: transparent;
+}
+
+.svg_image {
+  text-align: center;
+}
+
+.center {
+  display: block;
+  margin: 0 auto;
+}
diff --git a/images/clusters.png b/images/clusters.png
deleted file mode 100644
index 80d2d32a..00000000
Binary files a/images/clusters.png and /dev/null differ
diff --git a/images/modularity.svg b/images/modularity.svg
new file mode 100644
index 00000000..fb137685
--- /dev/null
+++ b/images/modularity.svg
@@ -0,0 +1,240 @@
[SVG figure added (240 lines of markup omitted): modularity diagram pairing each pipeline step with interchangeable sub-models — Embeddings (SBERT, SpaCy, Transformers), Dimensionality Reduction (UMAP, PCA, TruncatedSVD), Clustering (HDBSCAN, k-Means, BIRCH), Tokenizer (CountVectorizer, Jieba, POS), and Weighting scheme (c-TF-IDF, c-TF-IDF + MMR, c-TF-IDF + BM25, TF-IDF)]
diff --git a/images/probabilities.png b/images/probabilities.png
deleted file mode 100644
index 124ce60c..00000000
Binary files a/images/probabilities.png and /dev/null differ
diff --git a/images/topics.png b/images/topics.png
deleted file mode 100644
index e1b109e0..00000000
Binary files a/images/topics.png and /dev/null differ
diff --git a/mkdocs.yml b/mkdocs.yml
index 9dfb4d7c..680edc9f 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -12,35 +12,44 @@ nav:
   - The Algorithm: algorithm/algorithm.md
   - Getting Started:
     - getting_started/quickstart/quickstart.md
-    - Embedding Models: getting_started/embeddings/embeddings.md
     - Topic Visualization: getting_started/visualization/visualization.md
     - Topic Reduction: getting_started/topicreduction/topicreduction.md
     - Topic Representation: getting_started/topicrepresentation/topicrepresentation.md
     - Search Topics: getting_started/search/search.md
     - Parameter tuning: getting_started/parameter tuning/parametertuning.md
+    - Outlier reduction: getting_started/outlier_reduction/outlier_reduction.md
     - Tips & Tricks: getting_started/tips_and_tricks/tips_and_tricks.md
     - Sub-models:
+      - Embeddings: getting_started/embeddings/embeddings.md
       - Dimensionality Reduction: getting_started/dim_reduction/dim_reduction.md
       - Clustering: getting_started/clustering/clustering.md
       - Vectorizers: getting_started/vectorizers/vectorizers.md
       - c-TF-IDF: getting_started/ctfidf/ctfidf.md
     - Variations:
+      - Topic Distributions: getting_started/distribution/distribution.md
       - Topics per Class: getting_started/topicsperclass/topicsperclass.md
-      - (semi)-Supervised Topic Modeling: getting_started/supervised/supervised.md
+      - Supervised Topic Modeling: getting_started/supervised/supervised.md
+      - Semi-supervised Topic Modeling: getting_started/semisupervised/semisupervised.md
      - Dynamic Topic Modeling: getting_started/topicsovertime/topicsovertime.md
      - Guided Topic 
Modeling: getting_started/guided/guided.md - Hierarchical Topic Modeling: getting_started/hierarchicaltopics/hierarchicaltopics.md - Online Topic Modeling: getting_started/online/online.md + - Manual Topic Modeling: getting_started/manual/manual.md - FAQ: faq.md - API: - BERTopic: api/bertopic.md - MMR: api/mmr.md - - Vectorizers: - - cTFIDF: api/ctfidf.md - - OnlineCountVectorizer: api/onlinecv.md - - Backends: - - Base: api/backends/base.md - - Word Doc: api/backends/word_doc.md + - Sub-models: + - Backends: + - Base: api/backends/base.md + - Word Doc: api/backends/word_doc.md + - Dimensionality Reduction: + - Base: api/dimensionality/base.md + - Clustering: + - Base: api/cluster/base.md + - Vectorizers: + - cTFIDF: api/ctfidf.md + - OnlineCountVectorizer: api/onlinecv.md - Plotting: - Barchart: api/plotting/barchart.md - Documents: api/plotting/documents.md @@ -81,19 +90,24 @@ theme: - navigation.tracking - toc.follow palette: - - scheme: default - toggle: - icon: material/weather-sunny - name: Switch to dark mode - - scheme: slate - toggle: - icon: material/weather-night - name: Switch to light mode + - media: "(prefers-color-scheme: light)" + scheme: black + toggle: + icon: material/weather-sunny + name: Switch to dark mode + - media: "(prefers-color-scheme: dark)" + scheme: slate + primary: black + toggle: + icon: material/weather-night + name: Switch to light mode markdown_extensions: - admonition + - md_in_html - pymdownx.details - pymdownx.highlight - pymdownx.superfences + - pymdownx.snippets - toc: permalink: true diff --git a/setup.py b/setup.py index 320b01ba..dac87073 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ base_packages = [ "numpy>=1.20.0", - "hdbscan>=0.8.28", + "hdbscan>=0.8.29", "umap-learn>=0.5.0", "pandas>=1.1.5", "scikit-learn>=0.22.2.post1", @@ -53,7 +53,7 @@ setup( name="bertopic", packages=find_packages(exclude=["notebooks", "docs"]), - version="0.12.0", + version="0.13.0", author="Maarten P. 
Grootendorst", author_email="maartengrootendorst@gmail.com", description="BERTopic performs topic Modeling with state-of-the-art transformer models.", diff --git a/tests/conftest.py b/tests/conftest.py index b4d45cdb..a775a6bc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8,6 +8,9 @@ from sklearn.cluster import KMeans, MiniBatchKMeans from sklearn.decomposition import PCA, IncrementalPCA from bertopic.vectorizers import OnlineCountVectorizer +from bertopic.cluster import BaseCluster +from bertopic.dimensionality import BaseDimensionalityReduction +from sklearn.linear_model import LogisticRegression @pytest.fixture(scope="session") @@ -34,6 +37,13 @@ def documents(): return newsgroup_docs +@pytest.fixture(scope="session") +def targets(): + data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')) + y = data['target'][:500] + return y + + @pytest.fixture(scope="session") def base_topic_model(documents, document_embeddings, embedding_model): model = BERTopic(embedding_model=embedding_model, calculate_probabilities=True) @@ -81,6 +91,19 @@ def kmeans_pca_topic_model(documents, document_embeddings): return model +@pytest.fixture(scope="session") +def supervised_topic_model(documents, document_embeddings, embedding_model, targets): + empty_dimensionality_model = BaseDimensionalityReduction() + clf = LogisticRegression() + + model = BERTopic( + embedding_model=embedding_model, + umap_model=empty_dimensionality_model, + hdbscan_model=clf, + ).fit(documents, embeddings=document_embeddings, y=targets) + return model + + @pytest.fixture(scope="session") def online_topic_model(documents, document_embeddings, embedding_model): umap_model = IncrementalPCA(n_components=5) diff --git a/tests/test_bertopic.py b/tests/test_bertopic.py index ba24d0e4..3b6d4f47 100644 --- a/tests/test_bertopic.py +++ b/tests/test_bertopic.py @@ -2,7 +2,7 @@ import pytest -@pytest.mark.parametrize('model', [('kmeans_pca_topic_model'), ('custom_topic_model'), ('merged_topic_model'), ('reduced_topic_model'), ('online_topic_model')]) +@pytest.mark.parametrize('model', [('kmeans_pca_topic_model'), ('custom_topic_model'), ('merged_topic_model'), ('reduced_topic_model'), ('online_topic_model'), ('supervised_topic_model')]) def test_full_model(model, documents, request): """ Tests the entire pipeline in one go. This serves as a sanity check to see if the default settings result in a good separation of topics. @@ -23,6 +23,10 @@ def test_full_model(model, documents, request): assert len(topic_model.get_topic_freq()) > 2 assert len(topic_model.get_topics()) == len(topic_model.get_topic_freq()) + # Test extraction of document info + document_info = topic_model.get_document_info(documents) + assert len(document_info) == len(documents) + # Test transform doc = "This is a new document to predict." 
topics_test, probs_test = topic_model.transform([doc]) @@ -87,3 +91,14 @@ def test_full_model(model, documents, request): topics_to_merge = [0, 1] topic_model.merge_topics(documents, topics_to_merge) assert freq < topic_model.get_topic_freq(0) + + # Test reduction of outliers + if -1 in topics: + new_topics = topic_model.reduce_outliers(documents, topics, threshold=0.0) + nr_outliers_topic_model = sum([1 for topic in topic_model.topics_ if topic == -1]) + nr_outliers_new_topics = sum([1 for topic in new_topics if topic == -1]) + + if topic_model._outliers == 1: + assert nr_outliers_topic_model > nr_outliers_new_topics + + \ No newline at end of file diff --git a/tests/test_plotting/test_approximate.py b/tests/test_plotting/test_approximate.py new file mode 100644 index 00000000..c9a1e6b3 --- /dev/null +++ b/tests/test_plotting/test_approximate.py @@ -0,0 +1,28 @@ +import copy +import pytest + +@pytest.mark.parametrize("batch_size", [50, None]) +@pytest.mark.parametrize("padding", [True, False]) +@pytest.mark.parametrize('model', [('kmeans_pca_topic_model'), + ('base_topic_model'), + ('custom_topic_model'), + ('merged_topic_model'), + ('reduced_topic_model')]) +def test_approximate_distribution(batch_size, padding, model, documents, request): + topic_model = copy.deepcopy(request.getfixturevalue(model)) + + # Calculate only on a document-level based on tokensets + topic_distr, _ = topic_model.approximate_distribution(documents, padding=padding, batch_size=batch_size) + assert topic_distr.shape[1] == len(topic_model.topic_labels_) - topic_model._outliers + + # Use the distribution visualization + for i in range(3): + topic_model.visualize_distribution(topic_distr[i]) + + # Calculate distribution on a token-level + topic_distr, topic_token_distr = topic_model.approximate_distribution(documents[:100], calculate_tokens=True) + assert topic_distr.shape[1] == len(topic_model.topic_labels_) - topic_model._outliers + assert len(topic_token_distr) == len(documents[:100]) + + for token_distr in topic_token_distr: + assert token_distr.shape[1] == len(topic_model.topic_labels_) - topic_model._outliers diff --git a/tests/test_vectorizers/test_ctfidf.py b/tests/test_vectorizers/test_ctfidf.py index f3ba7cef..703067ff 100644 --- a/tests/test_vectorizers/test_ctfidf.py +++ b/tests/test_vectorizers/test_ctfidf.py @@ -4,7 +4,7 @@ import numpy as np import pandas as pd from packaging import version -from scipy.sparse.csr import csr_matrix +from scipy.sparse import csr_matrix from sklearn import __version__ as sklearn_version from sklearn.feature_extraction.text import CountVectorizer from bertopic.vectorizers import ClassTfidfTransformer
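The last hunk above swaps `from scipy.sparse.csr import csr_matrix` for the public `from scipy.sparse import csr_matrix`; the private `scipy.sparse.csr` module path has been deprecated in recent SciPy releases, so the public import is the portable choice. A minimal check that the public path exposes the same class the tests need:

```python
from scipy.sparse import csr_matrix

# Build an empty 3x4 sparse matrix via the public import path
matrix = csr_matrix((3, 4))
print(type(matrix).__name__, matrix.shape)
```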