diff --git a/.flake8 b/.flake8
deleted file mode 100644
index 01f47754..00000000
--- a/.flake8
+++ /dev/null
@@ -1,2 +0,0 @@
-[flake8]
-max-line-length = 160
diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py
index 5682f40e..7ef1efbb 100644
--- a/bertopic/_bertopic.py
+++ b/bertopic/_bertopic.py
@@ -221,8 +221,7 @@ def __init__(
# Topic-based parameters
if top_n_words > 100:
logger.warning(
- "Note that extracting more than 100 words from a sparse "
- "can slow down computation quite a bit."
+ "Note that extracting more than 100 words from a sparse can slow down computation quite a bit."
)
self.top_n_words = top_n_words
@@ -241,9 +240,7 @@ def __init__(
# Vectorizer
self.n_gram_range = n_gram_range
- self.vectorizer_model = vectorizer_model or CountVectorizer(
- ngram_range=self.n_gram_range
- )
+ self.vectorizer_model = vectorizer_model or CountVectorizer(ngram_range=self.n_gram_range)
self.ctfidf_model = ctfidf_model or ClassTfidfTransformer()
# Representation model
@@ -364,9 +361,7 @@ def fit(
topic_model = BERTopic().fit(docs, embeddings)
```
"""
- self.fit_transform(
- documents=documents, embeddings=embeddings, y=y, images=images
- )
+ self.fit_transform(documents=documents, embeddings=embeddings, y=y, images=images)
return self
def fit_transform(
@@ -427,16 +422,12 @@ def fit_transform(
check_embeddings_shape(embeddings, documents)
doc_ids = range(len(documents)) if documents is not None else range(len(images))
- documents = pd.DataFrame(
- {"Document": documents, "ID": doc_ids, "Topic": None, "Image": images}
- )
+ documents = pd.DataFrame({"Document": documents, "ID": doc_ids, "Topic": None, "Image": images})
# Extract embeddings
if embeddings is None:
logger.info("Embedding - Transforming documents to embeddings.")
- self.embedding_model = select_backend(
- self.embedding_model, language=self.language, verbose=self.verbose
- )
+ self.embedding_model = select_backend(self.embedding_model, language=self.language, verbose=self.verbose)
embeddings = self._extract_embeddings(
documents.Document.values.tolist(),
images=images,
@@ -446,9 +437,7 @@ def fit_transform(
logger.info("Embedding - Completed \u2713")
else:
if self.embedding_model is not None:
- self.embedding_model = select_backend(
- self.embedding_model, language=self.language
- )
+ self.embedding_model = select_backend(self.embedding_model, language=self.language)
# Guided Topic Modeling
if self.seed_topic_list is not None and self.embedding_model is not None:
@@ -459,17 +448,15 @@ def fit_transform(
# Zero-shot Topic Modeling
if self._is_zeroshot():
- documents, embeddings, assigned_documents, assigned_embeddings = (
- self._zeroshot_topic_modeling(documents, embeddings)
+ documents, embeddings, assigned_documents, assigned_embeddings = self._zeroshot_topic_modeling(
+ documents, embeddings
)
# Filter UMAP embeddings to only non-assigned embeddings to be used for clustering
umap_embeddings = self.umap_model.transform(embeddings)
if len(documents) > 0: # No zero-shot topics matched
# Cluster reduced embeddings
- documents, probabilities = self._cluster_embeddings(
- umap_embeddings, documents, y=y
- )
+ documents, probabilities = self._cluster_embeddings(umap_embeddings, documents, y=y)
if self._is_zeroshot() and len(assigned_documents) > 0:
documents, embeddings = self._combine_zeroshot_topics(
documents, embeddings, assigned_documents, assigned_embeddings
@@ -526,9 +513,7 @@ def fit_transform(
]
# Resulting output
- self.probabilities_ = self._map_probabilities(
- probabilities, original_topics=True
- )
+ self.probabilities_ = self._map_probabilities(probabilities, original_topics=True)
predictions = documents.Topic.to_list()
return predictions, self.probabilities_
@@ -588,9 +573,7 @@ def transform(
documents = [documents]
if embeddings is None:
- embeddings = self._extract_embeddings(
- documents, images=images, method="document", verbose=self.verbose
- )
+ embeddings = self._extract_embeddings(documents, images=images, method="document", verbose=self.verbose)
# Check if an embedding model was found
if embeddings is None:
@@ -602,9 +585,7 @@ def transform(
# Transform without hdbscan_model and umap_model using only cosine similarity
elif type(self.hdbscan_model) == BaseCluster:
- logger.info(
- "Predicting topic assignments through cosine similarity of topic and document embeddings."
- )
+ logger.info("Predicting topic assignments through cosine similarity of topic and document embeddings.")
sim_matrix = cosine_similarity(embeddings, np.array(self.topic_embeddings_))
predictions = np.argmax(sim_matrix, axis=1) - self._outliers
@@ -628,12 +609,8 @@ def transform(
# Calculate probabilities
if self.calculate_probabilities:
- logger.info(
- "Probabilities - Start calculation of probabilities with HDBSCAN"
- )
- probabilities = hdbscan_delegator(
- self.hdbscan_model, "membership_vector", umap_embeddings
- )
+ logger.info("Probabilities - Start calculation of probabilities with HDBSCAN")
+ probabilities = hdbscan_delegator(self.hdbscan_model, "membership_vector", umap_embeddings)
logger.info("Probabilities - Completed \u2713")
else:
predictions = self.hdbscan_model.predict(umap_embeddings)
@@ -712,16 +689,13 @@ def partial_fit(
check_embeddings_shape(embeddings, documents)
if not hasattr(self.hdbscan_model, "partial_fit"):
raise ValueError(
- "In order to use `.partial_fit`, the cluster model should have "
- "a `.partial_fit` function."
+ "In order to use `.partial_fit`, the cluster model should have " "a `.partial_fit` function."
)
# Prepare documents
if isinstance(documents, str):
documents = [documents]
- documents = pd.DataFrame(
- {"Document": documents, "ID": range(len(documents)), "Topic": None}
- )
+ documents = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": None})
# Extract embeddings
if embeddings is None:
@@ -746,9 +720,7 @@ def partial_fit(
umap_embeddings = self._reduce_dimensionality(embeddings, y, partial_fit=True)
# Cluster reduced embeddings
- documents, self.probabilities_ = self._cluster_embeddings(
- umap_embeddings, documents, partial_fit=True
- )
+ documents, self.probabilities_ = self._cluster_embeddings(umap_embeddings, documents, partial_fit=True)
topics = documents.Topic.to_list()
# Map and find new topics
@@ -756,10 +728,7 @@ def partial_fit(
self.topic_mapper_ = TopicMapper(topics)
mappings = self.topic_mapper_.get_mappings()
new_topics = set(topics).difference(set(mappings.keys()))
- new_topic_ids = {
- topic: max(mappings.values()) + index + 1
- for index, topic in enumerate(new_topics)
- }
+ new_topic_ids = {topic: max(mappings.values()) + index + 1 for index, topic in enumerate(new_topics)}
self.topic_mapper_.add_new_topics(new_topic_ids)
updated_mappings = self.topic_mapper_.get_mappings()
updated_topics = [updated_mappings[topic] for topic in topics]
@@ -767,25 +736,19 @@ def partial_fit(
# Add missing topics (topics that were originally created but are now missing)
if self.topic_representations_:
- missing_topics = set(self.topic_representations_.keys()).difference(
- set(updated_topics)
- )
+ missing_topics = set(self.topic_representations_.keys()).difference(set(updated_topics))
for missing_topic in missing_topics:
documents.loc[len(documents), :] = [" ", len(documents), missing_topic]
else:
missing_topics = {}
# Prepare documents
- documents_per_topic = documents.sort_values("Topic").groupby(
- ["Topic"], as_index=False
- )
+ documents_per_topic = documents.sort_values("Topic").groupby(["Topic"], as_index=False)
updated_topics = documents_per_topic.first().Topic.astype(int)
documents_per_topic = documents_per_topic.agg({"Document": " ".join})
# Update topic representations
- self.c_tf_idf_, updated_words = self._c_tf_idf(
- documents_per_topic, partial_fit=True
- )
+ self.c_tf_idf_, updated_words = self._c_tf_idf(documents_per_topic, partial_fit=True)
self.topic_representations_ = self._extract_words_per_topic(
updated_words, documents, self.c_tf_idf_, calculate_aspects=False
)
@@ -801,10 +764,7 @@ def partial_fit(
sizes = documents.groupby(["Topic"], as_index=False).count()
for _, row in sizes.iterrows():
topic = int(row.Topic)
- if (
- self.topic_sizes_.get(topic) is not None
- and topic not in missing_topics
- ):
+ if self.topic_sizes_.get(topic) is not None and topic not in missing_topics:
self.topic_sizes_[topic] += int(row.Document)
elif self.topic_sizes_.get(topic) is None:
self.topic_sizes_[topic] = int(row.Document)
@@ -879,9 +839,7 @@ def topics_over_time(
check_is_fitted(self)
check_documents_type(docs)
selected_topics = topics if topics else self.topics_
- documents = pd.DataFrame(
- {"Document": docs, "Topic": selected_topics, "Timestamps": timestamps}
- )
+ documents = pd.DataFrame({"Document": docs, "Topic": selected_topics, "Timestamps": timestamps})
global_c_tf_idf = normalize(self.c_tf_idf_, axis=1, norm="l1", copy=False)
all_topics = sorted(list(documents.Topic.unique()))
@@ -930,9 +888,7 @@ def topics_over_time(
list(set(previous_topics).intersection(set(current_topics))) # noqa: F821
)
- current_overlap_idx = [
- current_topics.index(topic) for topic in overlapping_topics
- ]
+ current_overlap_idx = [current_topics.index(topic) for topic in overlapping_topics]
previous_overlap_idx = [
previous_topics.index(topic) # noqa: F821
for topic in overlapping_topics
@@ -940,8 +896,7 @@ def topics_over_time(
c_tf_idf.tolil()[current_overlap_idx] = (
(
- c_tf_idf[current_overlap_idx]
- + previous_c_tf_idf[previous_overlap_idx] # noqa: F821
+ c_tf_idf[current_overlap_idx] + previous_c_tf_idf[previous_overlap_idx] # noqa: F821
)
/ 2.0
).tolil()
@@ -949,16 +904,11 @@ def topics_over_time(
# Fine-tune the timestamp c-TF-IDF representation based on the global c-TF-IDF representation
# by simply taking the average of the two
if global_tuning:
- selected_topics = [
- all_topics_indices[topic]
- for topic in documents_per_topic.Topic.values
- ]
+ selected_topics = [all_topics_indices[topic] for topic in documents_per_topic.Topic.values]
c_tf_idf = (global_c_tf_idf[selected_topics] + c_tf_idf) / 2.0
# Extract the words per topic
- words_per_topic = self._extract_words_per_topic(
- words, selection, c_tf_idf, calculate_aspects=False
- )
+ words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False)
topic_frequency = pd.Series(
documents_per_topic.Timestamps.values, index=documents_per_topic.Topic
).to_dict()
@@ -979,9 +929,7 @@ def topics_over_time(
previous_topics = sorted(list(documents_per_topic.Topic.values)) # noqa: F841
previous_c_tf_idf = c_tf_idf.copy() # noqa: F841
- return pd.DataFrame(
- topics_over_time, columns=["Topic", "Words", "Frequency", "Timestamp"]
- )
+ return pd.DataFrame(topics_over_time, columns=["Topic", "Words", "Frequency", "Timestamp"])
def topics_per_class(
self,
@@ -1023,9 +971,7 @@ def topics_per_class(
```
"""
check_documents_type(docs)
- documents = pd.DataFrame(
- {"Document": docs, "Topic": self.topics_, "Class": classes}
- )
+ documents = pd.DataFrame({"Document": docs, "Topic": self.topics_, "Class": classes})
global_c_tf_idf = normalize(self.c_tf_idf_, axis=1, norm="l1", copy=False)
# For each unique timestamp, create topic representations
@@ -1042,18 +988,11 @@ def topics_per_class(
# by simply taking the average of the two
if global_tuning:
c_tf_idf = normalize(c_tf_idf, axis=1, norm="l1", copy=False)
- c_tf_idf = (
- global_c_tf_idf[documents_per_topic.Topic.values + self._outliers]
- + c_tf_idf
- ) / 2.0
+ c_tf_idf = (global_c_tf_idf[documents_per_topic.Topic.values + self._outliers] + c_tf_idf) / 2.0
# Extract the words per topic
- words_per_topic = self._extract_words_per_topic(
- words, selection, c_tf_idf, calculate_aspects=False
- )
- topic_frequency = pd.Series(
- documents_per_topic.Class.values, index=documents_per_topic.Topic
- ).to_dict()
+ words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False)
+ topic_frequency = pd.Series(documents_per_topic.Class.values, index=documents_per_topic.Topic).to_dict()
# Fill dataframe with results
topics_at_class = [
@@ -1067,9 +1006,7 @@ def topics_per_class(
]
topics_per_class.extend(topics_at_class)
- topics_per_class = pd.DataFrame(
- topics_per_class, columns=["Topic", "Words", "Frequency", "Class"]
- )
+ topics_per_class = pd.DataFrame(topics_per_class, columns=["Topic", "Words", "Frequency", "Class"])
return topics_per_class
@@ -1138,9 +1075,9 @@ def hierarchical_topics(
linkage_function = lambda x: sch.linkage(x, "ward", optimal_ordering=True)
# Calculate distance
- embeddings = select_topic_representation(
- self.c_tf_idf_, self.topic_embeddings_, use_ctfidf
- )[0][self._outliers :]
+ embeddings = select_topic_representation(self.c_tf_idf_, self.topic_embeddings_, use_ctfidf)[0][
+ self._outliers :
+ ]
X = distance_function(embeddings)
X = validate_distance_matrix(X, embeddings.shape[0])
@@ -1153,15 +1090,9 @@ def hierarchical_topics(
Z[:, 2] = get_unique_distances(Z[:, 2])
# Calculate basic bag-of-words to be iteratively merged later
- documents = pd.DataFrame(
- {"Document": docs, "ID": range(len(docs)), "Topic": self.topics_}
- )
- documents_per_topic = documents.groupby(["Topic"], as_index=False).agg(
- {"Document": " ".join}
- )
- documents_per_topic = documents_per_topic.loc[
- documents_per_topic.Topic != -1, :
- ]
+ documents = pd.DataFrame({"Document": docs, "ID": range(len(docs)), "Topic": self.topics_})
+ documents_per_topic = documents.groupby(["Topic"], as_index=False).agg({"Document": " ".join})
+ documents_per_topic = documents_per_topic.loc[documents_per_topic.Topic != -1, :]
clean_documents = self._preprocess_text(documents_per_topic.Document.values)
# Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0
@@ -1187,9 +1118,7 @@ def hierarchical_topics(
)
for index in tqdm(range(len(Z))):
# Find clustered documents
- clusters = (
- sch.fcluster(Z, t=Z[index][2], criterion="distance") - self._outliers
- )
+ clusters = sch.fcluster(Z, t=Z[index][2], criterion="distance") - self._outliers
nr_clusters = len(clusters)
# Extract first topic we find to get the set of topics in a merged topic
@@ -1200,18 +1129,14 @@ def hierarchical_topics(
topic = int(val)
else:
val = Z[int(val - len(clusters))][0]
- clustered_topics = [
- i for i, x in enumerate(clusters) if x == clusters[topic]
- ]
+ clustered_topics = [i for i, x in enumerate(clusters) if x == clusters[topic]]
# Group bow per cluster, calculate c-TF-IDF and extract words
grouped = csr_matrix(bow[clustered_topics].sum(axis=0))
c_tf_idf = self.ctfidf_model.transform(grouped)
selection = documents.loc[documents.Topic.isin(clustered_topics), :]
selection.Topic = 0
- words_per_topic = self._extract_words_per_topic(
- words, selection, c_tf_idf, calculate_aspects=False
- )
+ words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False)
# Extract parent's name and ID
parent_id = index + len(clusters)
@@ -1398,9 +1323,7 @@ def approximate_distribution(
t = math.ceil(window / stride) - 1
for i in range(math.ceil(window / stride) - 1):
padded.append(tokenset[: window - ((t - i) * stride)])
- padded_ids.append(
- list(range(0, window - ((t - i) * stride)))
- )
+ padded_ids.append(list(range(0, window - ((t - i) * stride))))
token_sets = padded + token_sets
token_sets_ids = padded_ids + token_sets_ids
@@ -1413,20 +1336,14 @@ def approximate_distribution(
# Calculate similarity between embeddings of token sets and the topics
if use_embedding_model:
- embeddings = self._extract_embeddings(
- all_sentences, method="document", verbose=True
- )
- similarity = cosine_similarity(
- embeddings, self.topic_embeddings_[self._outliers :]
- )
+ embeddings = self._extract_embeddings(all_sentences, method="document", verbose=True)
+ similarity = cosine_similarity(embeddings, self.topic_embeddings_[self._outliers :])
# Calculate similarity between c-TF-IDF of token sets and the topics
else:
bow_doc = self.vectorizer_model.transform(all_sentences)
c_tf_idf_doc = self.ctfidf_model.transform(bow_doc)
- similarity = cosine_similarity(
- c_tf_idf_doc, self.c_tf_idf_[self._outliers :]
- )
+ similarity = cosine_similarity(c_tf_idf_doc, self.c_tf_idf_[self._outliers :])
# Only keep similarities that exceed the minimum
similarity[similarity < min_similarity] = 0
@@ -1445,9 +1362,7 @@ def approximate_distribution(
# Assign topics to individual tokens
token_id = [i for i in range(len(token))]
token_val = {index: [] for index in token_id}
- for sim, token_set in zip(
- similarity[start:end], all_token_sets_ids[start:end]
- ):
+ for sim, token_set in zip(similarity[start:end], all_token_sets_ids[start:end]):
for token in token_set:
if token in token_val:
token_val[token].append(sim)
@@ -1477,9 +1392,7 @@ def approximate_distribution(
end = end + 1
group = similarity[start:end].sum(axis=0)
topic_distribution.append(group)
- topic_distribution = normalize(
- np.array(topic_distribution), norm="l1", axis=1
- )
+ topic_distribution = normalize(np.array(topic_distribution), norm="l1", axis=1)
topic_token_distribution = None
# Combine results
@@ -1493,9 +1406,7 @@ def approximate_distribution(
return topic_distributions, topic_token_distributions
- def find_topics(
- self, search_term: str = None, image: str = None, top_n: int = 5
- ) -> Tuple[List[int], List[float]]:
+ def find_topics(self, search_term: str = None, image: str = None, top_n: int = 5) -> Tuple[List[int], List[float]]:
"""Find topics most similar to a search_term.
Creates an embedding for a search query and compares that with
@@ -1529,25 +1440,19 @@ def find_topics(
search_term consists of a phrase or multiple words.
"""
if self.embedding_model is None:
- raise Exception(
- "This method can only be used if you did not use custom embeddings."
- )
+ raise Exception("This method can only be used if you did not use custom embeddings.")
topic_list = list(self.topic_representations_.keys())
topic_list.sort()
# Extract search_term embeddings and compare with topic embeddings
if search_term is not None:
- search_embedding = self._extract_embeddings(
- [search_term], method="word", verbose=False
- ).flatten()
+ search_embedding = self._extract_embeddings([search_term], method="word", verbose=False).flatten()
elif image is not None:
search_embedding = self._extract_embeddings(
[None], images=[image], method="document", verbose=False
).flatten()
- sims = cosine_similarity(
- search_embedding.reshape(1, -1), self.topic_embeddings_
- ).flatten()
+ sims = cosine_similarity(search_embedding.reshape(1, -1), self.topic_embeddings_).flatten()
# Extract topics most similar to search_term
ids = np.argsort(sims)[-top_n:]
@@ -1623,13 +1528,10 @@ def update_topics(
if top_n_words > 100:
logger.warning(
- "Note that extracting more than 100 words from a sparse "
- "can slow down computation quite a bit."
+ "Note that extracting more than 100 words from a sparse " "can slow down computation quite a bit."
)
self.top_n_words = top_n_words
- self.vectorizer_model = vectorizer_model or CountVectorizer(
- ngram_range=n_gram_range
- )
+ self.vectorizer_model = vectorizer_model or CountVectorizer(ngram_range=n_gram_range)
self.ctfidf_model = ctfidf_model or ClassTfidfTransformer()
self.representation_model = representation_model
@@ -1644,12 +1546,8 @@ def update_topics(
"c-TF-IDF embeddings instead of centroid embeddings."
)
- documents = pd.DataFrame(
- {"Document": docs, "Topic": topics, "ID": range(len(docs)), "Image": images}
- )
- documents_per_topic = documents.groupby(["Topic"], as_index=False).agg(
- {"Document": " ".join}
- )
+ documents = pd.DataFrame({"Document": docs, "Topic": topics, "ID": range(len(docs)), "Image": images})
+ documents_per_topic = documents.groupby(["Topic"], as_index=False).agg({"Document": " ".join})
# Update topic sizes and assignments
self._update_topic_size(documents)
@@ -1697,9 +1595,7 @@ def get_topics(self, full: bool = False) -> Mapping[str, Tuple[str, float]]:
else:
return self.topic_representations_
- def get_topic(
- self, topic: int, full: bool = False
- ) -> Union[Mapping[str, Tuple[str, float]], bool]:
+ def get_topic(self, topic: int, full: bool = False) -> Union[Mapping[str, Tuple[str, float]], bool]:
"""Return top n words for a specific topic and their c-TF-IDF scores.
Arguments:
@@ -1719,10 +1615,7 @@ def get_topic(
if topic in self.topic_representations_:
if full:
representations = {"Main": self.topic_representations_[topic]}
- aspects = {
- aspect: representations[topic]
- for aspect, representations in self.topic_aspects_.items()
- }
+ aspects = {aspect: representations[topic] for aspect, representations in self.topic_aspects_.items()}
representations.update(aspects)
return representations
else:
@@ -1746,25 +1639,17 @@ def get_topic_info(self, topic: int = None) -> pd.DataFrame:
"""
check_is_fitted(self)
- info = pd.DataFrame(
- self.topic_sizes_.items(), columns=["Topic", "Count"]
- ).sort_values("Topic")
+ info = pd.DataFrame(self.topic_sizes_.items(), columns=["Topic", "Count"]).sort_values("Topic")
info["Name"] = info.Topic.map(self.topic_labels_)
# Custom label
if self.custom_labels_ is not None:
if len(self.custom_labels_) == len(info):
- labels = {
- topic - self._outliers: label
- for topic, label in enumerate(self.custom_labels_)
- }
+ labels = {topic - self._outliers: label for topic, label in enumerate(self.custom_labels_)}
info["CustomName"] = info["Topic"].map(labels)
# Main Keywords
- values = {
- topic: list(list(zip(*values))[0])
- for topic, values in self.topic_representations_.items()
- }
+ values = {topic: list(list(zip(*values))[0]) for topic, values in self.topic_representations_.items()}
info["Representation"] = info["Topic"].map(values)
# Extract all topic aspects
@@ -1774,24 +1659,16 @@ def get_topic_info(self, topic: int = None) -> pd.DataFrame:
if isinstance(list(values.values())[-1][0], tuple) or isinstance(
list(values.values())[-1][0], list
):
- values = {
- topic: list(list(zip(*value))[0])
- for topic, value in values.items()
- }
+ values = {topic: list(list(zip(*value))[0]) for topic, value in values.items()}
elif isinstance(list(values.values())[-1][0], str):
- values = {
- topic: " ".join(value).strip()
- for topic, value in values.items()
- }
+ values = {topic: " ".join(value).strip() for topic, value in values.items()}
info[aspect] = info["Topic"].map(values)
# Representative Docs / Images
if self.representative_docs_ is not None:
info["Representative_Docs"] = info["Topic"].map(self.representative_docs_)
if self.representative_images_ is not None:
- info["Representative_Images"] = info["Topic"].map(
- self.representative_images_
- )
+ info["Representative_Images"] = info["Topic"].map(self.representative_images_)
# Select specific topic to return
if topic is not None:
@@ -1826,9 +1703,9 @@ def get_topic_freq(self, topic: int = None) -> Union[pd.DataFrame, int]:
if isinstance(topic, int):
return self.topic_sizes_[topic]
else:
- return pd.DataFrame(
- self.topic_sizes_.items(), columns=["Topic", "Count"]
- ).sort_values("Count", ascending=False)
+ return pd.DataFrame(self.topic_sizes_.items(), columns=["Topic", "Count"]).sort_values(
+ "Count", ascending=False
+ )
def get_document_info(
self,
@@ -1899,10 +1776,7 @@ def get_document_info(
document_info = pd.merge(document_info, topic_info, on="Topic", how="left")
# Add top n words
- top_n_words = {
- topic: " - ".join(list(zip(*self.get_topic(topic)))[0])
- for topic in set(self.topics_)
- }
+ top_n_words = {topic: " - ".join(list(zip(*self.get_topic(topic)))[0]) for topic in set(self.topics_)}
document_info["Top_n_words"] = document_info.Topic.map(top_n_words)
# Add flat probabilities
@@ -1916,15 +1790,9 @@ def get_document_info(
]
# Add representative document labels
- repr_docs = [
- repr_doc
- for repr_docs in self.representative_docs_.values()
- for repr_doc in repr_docs
- ]
+ repr_docs = [repr_doc for repr_docs in self.representative_docs_.values() for repr_doc in repr_docs]
document_info["Representative_document"] = False
- document_info.loc[
- document_info.Document.isin(repr_docs), "Representative_document"
- ] = True
+ document_info.loc[document_info.Document.isin(repr_docs), "Representative_document"] = True
# Add custom meta data provided by the user
if metadata is not None:
@@ -2028,12 +1896,8 @@ def get_topic_tree(
max_original_topic = hier_topics.Parent_ID.astype(int).min() - 1
# Extract mapping from ID to name
- topic_to_name = dict(
- zip(hier_topics.Child_Left_ID, hier_topics.Child_Left_Name)
- )
- topic_to_name.update(
- dict(zip(hier_topics.Child_Right_ID, hier_topics.Child_Right_Name))
- )
+ topic_to_name = dict(zip(hier_topics.Child_Left_ID, hier_topics.Child_Left_Name))
+ topic_to_name.update(dict(zip(hier_topics.Child_Right_ID, hier_topics.Child_Right_Name)))
topic_to_name = {topic: name[:100] for topic, name in topic_to_name.items()}
# Create tree
@@ -2051,8 +1915,7 @@ def get_tree(start, tree):
def _tree(to_print, start, parent, tree, grandpa=None, indent=""):
# Get distance between merged topics
distance = hier_topics.loc[
- (hier_topics.Child_Left_ID == parent)
- | (hier_topics.Child_Right_ID == parent),
+ (hier_topics.Child_Left_ID == parent) | (hier_topics.Child_Right_ID == parent),
"Distance",
]
distance = distance.values[0] if len(distance) > 0 else 10
@@ -2064,12 +1927,7 @@ def _tree(to_print, start, parent, tree, grandpa=None, indent=""):
if int(parent) <= max_original_topic:
# Do not append topic ID if they are not merged
if distance < max_distance:
- to_print += (
- "■──"
- + topic_to_name[parent]
- + f" ── Topic: {parent}"
- + "\n"
- )
+ to_print += "■──" + topic_to_name[parent] + f" ── Topic: {parent}" + "\n"
else:
to_print += "O \n"
else:
@@ -2080,15 +1938,11 @@ def _tree(to_print, start, parent, tree, grandpa=None, indent=""):
for child in tree[parent][:-1]:
to_print += indent + "├" + "─"
- to_print = _tree(
- to_print, start, child, tree, parent, indent + "│" + " " * width
- )
+ to_print = _tree(to_print, start, child, tree, parent, indent + "│" + " " * width)
child = tree[parent][-1]
to_print += indent + "└" + "─"
- to_print = _tree(
- to_print, start, child, tree, parent, indent + " " * (width + 1)
- )
+ to_print = _tree(to_print, start, child, tree, parent, indent + " " * (width + 1))
return to_print
@@ -2099,9 +1953,7 @@ def _tree(to_print, start, parent, tree, grandpa=None, indent=""):
start = str(hier_topics.Parent_ID.astype(int).max())
return get_tree(start, tree)
- def set_topic_labels(
- self, topic_labels: Union[List[str], Mapping[int, str]]
- ) -> None:
+ def set_topic_labels(self, topic_labels: Union[List[str], Mapping[int, str]]) -> None:
"""Set custom topic labels in your fitted BERTopic model.
Arguments:
@@ -2145,17 +1997,12 @@ def set_topic_labels(
if isinstance(topic_labels, dict):
if self.custom_labels_ is not None:
- original_labels = {
- topic: label
- for topic, label in zip(unique_topics, self.custom_labels_)
- }
+ original_labels = {topic: label for topic, label in zip(unique_topics, self.custom_labels_)}
else:
info = self.get_topic_info()
original_labels = dict(zip(info.Topic, info.Name))
custom_labels = [
- topic_labels.get(topic)
- if topic_labels.get(topic)
- else original_labels[topic]
+ topic_labels.get(topic) if topic_labels.get(topic) else original_labels[topic]
for topic in unique_topics
]
@@ -2164,8 +2011,7 @@ def set_topic_labels(
custom_labels = topic_labels
else:
raise ValueError(
- "Make sure that `topic_labels` contains the same number "
- "of labels as there are topics."
+ "Make sure that `topic_labels` contains the same number " "of labels as there are topics."
)
self.custom_labels_ = custom_labels
@@ -2283,8 +2129,7 @@ def merge_topics(
mapping[topic] = topic_group[0]
else:
raise ValueError(
- "Make sure that `topics_to_merge` is either"
- "a list of topics or a list of list of topics."
+ "Make sure that `topics_to_merge` is either" "a list of topics or a list of list of topics."
)
# Track mappings and sizes of topics for merging topic embeddings
@@ -2472,9 +2317,7 @@ def reduce_outliers(
# Check correct use of parameters
if strategy.lower() == "probabilities" and probabilities is None:
- raise ValueError(
- "Make sure to pass in `probabilities` in order to use the probabilities strategy"
- )
+ raise ValueError("Make sure to pass in `probabilities` in order to use the probabilities strategy")
# Reduce outliers by extracting most likely topics through the topic-term probability matrix
if strategy.lower() == "probabilities":
@@ -2490,12 +2333,8 @@ def reduce_outliers(
topic_distr, _ = self.approximate_distribution(
outlier_docs, min_similarity=threshold, **distributions_params
)
- outlier_topics = iter(
- [np.argmax(prob) if sum(prob) > 0 else -1 for prob in topic_distr]
- )
- new_topics = [
- topic if topic != -1 else next(outlier_topics) for topic in topics
- ]
+ outlier_topics = iter([np.argmax(prob) if sum(prob) > 0 else -1 for prob in topic_distr])
+ new_topics = [topic if topic != -1 else next(outlier_topics) for topic in topics]
# Reduce outliers by finding the most similar c-TF-IDF representations
elif strategy.lower() == "c-tf-idf":
@@ -2505,18 +2344,12 @@ def reduce_outliers(
# Calculate c-TF-IDF of outlier documents with all topics
bow_doc = self.vectorizer_model.transform(outlier_docs)
c_tf_idf_doc = self.ctfidf_model.transform(bow_doc)
- similarity = cosine_similarity(
- c_tf_idf_doc, self.c_tf_idf_[self._outliers :]
- )
+ similarity = cosine_similarity(c_tf_idf_doc, self.c_tf_idf_[self._outliers :])
# Update topics
similarity[similarity < threshold] = 0
- outlier_topics = iter(
- [np.argmax(sim) if sum(sim) > 0 else -1 for sim in similarity]
- )
- new_topics = [
- topic if topic != -1 else next(outlier_topics) for topic in topics
- ]
+ outlier_topics = iter([np.argmax(sim) if sum(sim) > 0 else -1 for sim in similarity])
+ new_topics = [topic if topic != -1 else next(outlier_topics) for topic in topics]
# Reduce outliers by finding the most similar topic embeddings
elif strategy.lower() == "embeddings":
@@ -2533,28 +2366,18 @@ def reduce_outliers(
# Extract or calculate embeddings for outlier documents
if embeddings is not None:
- outlier_embeddings = np.array(
- [embeddings[index] for index in outlier_ids]
- )
+ outlier_embeddings = np.array([embeddings[index] for index in outlier_ids])
elif images is not None:
outlier_images = [images[index] for index in outlier_ids]
- outlier_embeddings = self.embedding_model.embed_images(
- outlier_images, verbose=self.verbose
- )
+ outlier_embeddings = self.embedding_model.embed_images(outlier_images, verbose=self.verbose)
else:
outlier_embeddings = self.embedding_model.embed_documents(outlier_docs)
- similarity = cosine_similarity(
- outlier_embeddings, self.topic_embeddings_[self._outliers :]
- )
+ similarity = cosine_similarity(outlier_embeddings, self.topic_embeddings_[self._outliers :])
# Update topics
similarity[similarity < threshold] = 0
- outlier_topics = iter(
- [np.argmax(sim) if sum(sim) > 0 else -1 for sim in similarity]
- )
- new_topics = [
- topic if topic != -1 else next(outlier_topics) for topic in topics
- ]
+ outlier_topics = iter([np.argmax(sim) if sum(sim) > 0 else -1 for sim in similarity])
+ new_topics = [topic if topic != -1 else next(outlier_topics) for topic in topics]
return new_topics
@@ -3507,9 +3330,7 @@ def save(
)
# Minimal
- save_utils.save_hf(
- model=self, save_directory=save_directory, serialization=serialization
- )
+ save_utils.save_hf(model=self, save_directory=save_directory, serialization=serialization)
save_utils.save_topics(model=self, path=save_directory / "topics.json")
save_utils.save_images(model=self, path=save_directory / "images")
save_utils.save_config(
@@ -3525,9 +3346,7 @@ def save(
save_directory=save_directory,
serialization=serialization,
)
- save_utils.save_ctfidf_config(
- model=self, path=save_directory / "ctfidf_config.json"
- )
+ save_utils.save_ctfidf_config(model=self, path=save_directory / "ctfidf_config.json")
@classmethod
def load(cls, path: str, embedding_model=None):
@@ -3557,22 +3376,16 @@ def load(cls, path: str, embedding_model=None):
with open(file_or_dir, "rb") as file:
if embedding_model:
topic_model = joblib.load(file)
- topic_model.embedding_model = select_backend(
- embedding_model, verbose=topic_model.verbose
- )
+ topic_model.embedding_model = select_backend(embedding_model, verbose=topic_model.verbose)
else:
topic_model = joblib.load(file)
return topic_model
# Load from directory or HF
if file_or_dir.is_dir():
- topics, params, tensors, ctfidf_tensors, ctfidf_config, images = (
- save_utils.load_local_files(file_or_dir)
- )
+ topics, params, tensors, ctfidf_tensors, ctfidf_config, images = save_utils.load_local_files(file_or_dir)
elif "/" in str(path):
- topics, params, tensors, ctfidf_tensors, ctfidf_config, images = (
- save_utils.load_files_from_hf(path)
- )
+ topics, params, tensors, ctfidf_tensors, ctfidf_config, images = save_utils.load_files_from_hf(path)
else:
raise ValueError("Make sure to either pass a valid directory or HF model.")
topic_model = _create_model_from_files(
@@ -3587,9 +3400,7 @@ def load(cls, path: str, embedding_model=None):
# Replace embedding model if one is specifically chosen
if embedding_model is not None:
- topic_model.embedding_model = select_backend(
- embedding_model, verbose=topic_model.verbose
- )
+ topic_model.embedding_model = select_backend(embedding_model, verbose=topic_model.verbose)
return topic_model
@@ -3645,9 +3456,7 @@ def merge_models(cls, models, min_similarity: float = 0.7, embedding_model=None)
all_topics, all_params, all_tensors = [], [], []
for index, model in enumerate(models):
model.save(tmpdir, serialization="pytorch")
- topics, params, tensors, _, _, _ = save_utils.load_local_files(
- Path(tmpdir)
- )
+ topics, params, tensors, _, _, _ = save_utils.load_local_files(Path(tmpdir))
all_topics.append(topics)
all_params.append(params)
all_tensors.append(np.array(tensors["topic_embeddings"]))
@@ -3666,11 +3475,7 @@ def merge_models(cls, models, min_similarity: float = 0.7, embedding_model=None)
# Extract new topics
new_topics = sorted(
- [
- index - selected_topics["_outliers"]
- for index, sim in enumerate(sims)
- if sim < min_similarity
- ]
+ [index - selected_topics["_outliers"] for index, sim in enumerate(sims) if sim < min_similarity]
)
max_topic = max(set(merged_topics["topics"]))
@@ -3680,12 +3485,10 @@ def merge_models(cls, models, min_similarity: float = 0.7, embedding_model=None)
if new_topic != -1:
max_topic += 1
new_topics_dict[new_topic] = max_topic
- merged_topics["topic_representations"][str(max_topic)] = (
- selected_topics["topic_representations"][str(new_topic)]
- )
- merged_topics["topic_labels"][str(max_topic)] = selected_topics[
- "topic_labels"
- ][str(new_topic)]
+ merged_topics["topic_representations"][str(max_topic)] = selected_topics["topic_representations"][
+ str(new_topic)
+ ]
+ merged_topics["topic_labels"][str(max_topic)] = selected_topics["topic_labels"][str(new_topic)]
# Add new aspects
if selected_topics["topic_aspects"]:
@@ -3698,27 +3501,19 @@ def merge_models(cls, models, min_similarity: float = 0.7, embedding_model=None)
# If the original model does not have topic aspects but the to be added model does
if not merged_topics.get("topic_aspects"):
- merged_topics["topic_aspects"] = selected_topics[
- "topic_aspects"
- ]
+ merged_topics["topic_aspects"] = selected_topics["topic_aspects"]
# If they both contain topic aspects, add to the existing set of aspects
else:
- for aspect, values in selected_topics[
- "topic_aspects"
- ].items():
- merged_topics["topic_aspects"][aspect][
- str(max_topic)
- ] = values[str(new_topic)]
+ for aspect, values in selected_topics["topic_aspects"].items():
+ merged_topics["topic_aspects"][aspect][str(max_topic)] = values[str(new_topic)]
# Add new embeddings
new_tensors = tensors[new_topic + selected_topics["_outliers"]]
merged_tensors = np.vstack([merged_tensors, new_tensors])
# Topic Mapper
- merged_topics["topic_mapper"] = TopicMapper(
- list(range(-1, max_topic + 1, 1))
- ).mappings_
+ merged_topics["topic_mapper"] = TopicMapper(list(range(-1, max_topic + 1, 1))).mappings_
# Find similar topics and re-assign those from the new models
sims_idx = np.argmax(sim_matrix, axis=1)
@@ -3749,13 +3544,8 @@ def merge_models(cls, models, min_similarity: float = 0.7, embedding_model=None)
# Replace embedding model if one is specifically chosen
verbose = any([model.verbose for model in models])
- if (
- embedding_model is not None
- and type(merged_model.embedding_model) == BaseEmbedder
- ):
- merged_model.embedding_model = select_backend(
- embedding_model, verbose=verbose
- )
+ if embedding_model is not None and type(merged_model.embedding_model) == BaseEmbedder:
+ merged_model.embedding_model = select_backend(embedding_model, verbose=verbose)
return merged_model
def push_to_hf_hub(
@@ -3874,17 +3664,11 @@ def _extract_embeddings(
documents = [documents]
if images is not None and hasattr(self.embedding_model, "embed_images"):
- embeddings = self.embedding_model.embed(
- documents=documents, images=images, verbose=verbose
- )
+ embeddings = self.embedding_model.embed(documents=documents, images=images, verbose=verbose)
elif method == "word":
- embeddings = self.embedding_model.embed_words(
- words=documents, verbose=verbose
- )
+ embeddings = self.embedding_model.embed_words(words=documents, verbose=verbose)
elif method == "document":
- embeddings = self.embedding_model.embed_documents(
- documents, verbose=verbose
- )
+ embeddings = self.embedding_model.embed_documents(documents, verbose=verbose)
elif documents[0] is None and images is None:
raise ValueError(
"Make sure to use an embedding model that can either embed documents"
@@ -3897,9 +3681,7 @@ def _extract_embeddings(
)
return embeddings
- def _images_to_text(
- self, documents: pd.DataFrame, embeddings: np.ndarray
- ) -> pd.DataFrame:
+ def _images_to_text(self, documents: pd.DataFrame, embeddings: np.ndarray) -> pd.DataFrame:
"""Convert images to text."""
logger.info("Images - Converting images to text. This might take a while.")
if isinstance(self.representation_model, dict):
@@ -3912,19 +3694,14 @@ def _images_to_text(
documents = tuner.image_to_text(documents, embeddings)
elif isinstance(self.representation_model, BaseRepresentation):
if getattr(self.representation_model, "image_to_text_model", False):
- documents = self.representation_model.image_to_text(
- documents, embeddings
- )
+ documents = self.representation_model.image_to_text(documents, embeddings)
logger.info("Images - Completed \u2713")
return documents
def _map_predictions(self, predictions: List[int]) -> List[int]:
"""Map predictions to the correct topics if topics were reduced."""
mappings = self.topic_mapper_.get_mappings(original_topics=True)
- mapped_predictions = [
- mappings[prediction] if prediction in mappings else -1
- for prediction in predictions
- ]
+ mapped_predictions = [mappings[prediction] if prediction in mappings else -1 for prediction in predictions]
return mapped_predictions
def _reduce_dimensionality(
@@ -4008,12 +3785,8 @@ def _cluster_embeddings(
if hasattr(self.hdbscan_model, "probabilities_"):
probabilities = self.hdbscan_model.probabilities_
- if self.calculate_probabilities and is_supported_hdbscan(
- self.hdbscan_model
- ):
- probabilities = hdbscan_delegator(
- self.hdbscan_model, "all_points_membership_vectors"
- )
+ if self.calculate_probabilities and is_supported_hdbscan(self.hdbscan_model):
+ probabilities = hdbscan_delegator(self.hdbscan_model, "all_points_membership_vectors")
if not partial_fit:
self.topic_mapper_ = TopicMapper(self.topics_)
@@ -4037,23 +3810,15 @@ def _zeroshot_topic_modeling(
documents: The leftover documents that were not assigned to any topic
embeddings: The leftover embeddings that were not assigned to any topic
"""
- logger.info(
- "Zeroshot Step 1 - Finding documents that could be assigned to either one of the zero-shot topics"
- )
+ logger.info("Zeroshot Step 1 - Finding documents that could be assigned to either one of the zero-shot topics")
# Similarity between document and zero-shot topic embeddings
zeroshot_embeddings = self._extract_embeddings(self.zeroshot_topic_list)
cosine_similarities = cosine_similarity(embeddings, zeroshot_embeddings)
assignment = np.argmax(cosine_similarities, 1)
assignment_vals = np.max(cosine_similarities, 1)
- assigned_ids = [
- index
- for index, value in enumerate(assignment_vals)
- if value >= self.zeroshot_min_similarity
- ]
+ assigned_ids = [index for index, value in enumerate(assignment_vals) if value >= self.zeroshot_min_similarity]
non_assigned_ids = [
- index
- for index, value in enumerate(assignment_vals)
- if value < self.zeroshot_min_similarity
+ index for index, value in enumerate(assignment_vals) if value < self.zeroshot_min_similarity
]
# Assign topics
@@ -4117,32 +3882,22 @@ def _combine_zeroshot_topics(
documents: DataFrame with all the original documents with their topic assignments
embeddings: np.ndarray of embeddings aligned with the documents
"""
- logger.info(
- "Zeroshot Step 2 - Combining topics from zero-shot topic modeling with topics from clustering..."
- )
+ logger.info("Zeroshot Step 2 - Combining topics from zero-shot topic modeling with topics from clustering...")
# Combine Zero-shot topics with topics from clustering
zeroshot_topic_idx_to_topic_id = {
zeroshot_topic_id: new_topic_id
- for new_topic_id, zeroshot_topic_id in enumerate(
- set(assigned_documents.Topic)
- )
+ for new_topic_id, zeroshot_topic_id in enumerate(set(assigned_documents.Topic))
}
self._topic_id_to_zeroshot_topic_idx = {
new_topic_id: zeroshot_topic_id
- for new_topic_id, zeroshot_topic_id in enumerate(
- set(assigned_documents.Topic)
- )
+ for new_topic_id, zeroshot_topic_id in enumerate(set(assigned_documents.Topic))
}
- assigned_documents.Topic = assigned_documents.Topic.map(
- zeroshot_topic_idx_to_topic_id
- )
+ assigned_documents.Topic = assigned_documents.Topic.map(zeroshot_topic_idx_to_topic_id)
num_zeroshot_topics = len(zeroshot_topic_idx_to_topic_id)
# Insert zeroshot topics between outlier cluster and other clusters
documents.Topic = documents.Topic.apply(
- lambda topic_id: topic_id + num_zeroshot_topics
- if topic_id != -1
- else topic_id
+ lambda topic_id: topic_id + num_zeroshot_topics if topic_id != -1 else topic_id
)
# Combine the clustered documents/embeddings with assigned documents/embeddings in the original order
@@ -4159,9 +3914,7 @@ def _combine_zeroshot_topics(
logger.info("Zeroshot Step 2 - Completed \u2713")
return documents, embeddings
- def _guided_topic_modeling(
- self, embeddings: np.ndarray
- ) -> Tuple[List[int], np.array]:
+ def _guided_topic_modeling(self, embeddings: np.ndarray) -> Tuple[List[int], np.array]:
"""Apply Guided Topic Modeling.
We transform the seeded topics to embeddings using the
@@ -4185,12 +3938,8 @@ def _guided_topic_modeling(
logger.info("Guided - Find embeddings highly related to seeded topics.")
# Create embeddings from the seeded topics
seed_topic_list = [" ".join(seed_topic) for seed_topic in self.seed_topic_list]
- seed_topic_embeddings = self._extract_embeddings(
- seed_topic_list, verbose=self.verbose
- )
- seed_topic_embeddings = np.vstack(
- [seed_topic_embeddings, embeddings.mean(axis=0)]
- )
+ seed_topic_embeddings = self._extract_embeddings(seed_topic_list, verbose=self.verbose)
+ seed_topic_embeddings = np.vstack([seed_topic_embeddings, embeddings.mean(axis=0)])
# Label documents that are most similar to one of the seeded topics
sim_matrix = cosine_similarity(embeddings, seed_topic_embeddings)
@@ -4201,9 +3950,7 @@ def _guided_topic_modeling(
# embedding of the seeded topic to force the documents in a cluster
for seed_topic in range(len(seed_topic_list)):
indices = [index for index, topic in enumerate(y) if topic == seed_topic]
- embeddings[indices] = np.average(
- [embeddings[indices], seed_topic_embeddings[seed_topic]], weights=[3, 1]
- )
+ embeddings[indices] = np.average([embeddings[indices], seed_topic_embeddings[seed_topic]], weights=[3, 1])
logger.info("Guided - Completed \u2713")
return y, embeddings
@@ -4226,17 +3973,11 @@ def _extract_topics(
c_tf_idf: The resulting matrix giving a value (importance score) for each word per topic
"""
if verbose:
- logger.info(
- "Representation - Extracting topics from clusters using representation models."
- )
- documents_per_topic = documents.groupby(["Topic"], as_index=False).agg(
- {"Document": " ".join}
- )
+ logger.info("Representation - Extracting topics from clusters using representation models.")
+ documents_per_topic = documents.groupby(["Topic"], as_index=False).agg({"Document": " ".join})
self.c_tf_idf_, words = self._c_tf_idf(documents_per_topic)
self.topic_representations_ = self._extract_words_per_topic(words, documents)
- self._create_topic_vectors(
- documents=documents, embeddings=embeddings, mappings=mappings
- )
+ self._create_topic_vectors(documents=documents, embeddings=embeddings, mappings=mappings)
if verbose:
logger.info("Representation - Completed \u2713")
@@ -4310,11 +4051,7 @@ def _extract_representative_docs(
selected_docs_ids = selection.index.tolist()
# Calculate similarity
- nr_docs = (
- nr_repr_docs
- if len(selected_docs) > nr_repr_docs
- else len(selected_docs)
- )
+ nr_docs = nr_repr_docs if len(selected_docs) > nr_repr_docs else len(selected_docs)
bow = self.vectorizer_model.transform(selected_docs)
ctfidf = self.ctfidf_model.transform(bow)
sim_matrix = cosine_similarity(ctfidf, c_tf_idf[index])
@@ -4331,28 +4068,14 @@ def _extract_representative_docs(
# Extract top n most representative documents
else:
- indices = np.argpartition(sim_matrix.reshape(1, -1)[0], -nr_docs)[
- -nr_docs:
- ]
+ indices = np.argpartition(sim_matrix.reshape(1, -1)[0], -nr_docs)[-nr_docs:]
docs = [selected_docs[index] for index in indices]
- doc_ids = [
- selected_docs_ids[index]
- for index, doc in enumerate(selected_docs)
- if doc in docs
- ]
+ doc_ids = [selected_docs_ids[index] for index, doc in enumerate(selected_docs) if doc in docs]
repr_docs_ids.append(doc_ids)
repr_docs.extend(docs)
- repr_docs_indices.append(
- [
- repr_docs_indices[-1][-1] + i + 1 if index != 0 else i
- for i in range(nr_docs)
- ]
- )
- repr_docs_mappings = {
- topic: repr_docs[i[0] : i[-1] + 1]
- for topic, i in zip(topics.keys(), repr_docs_indices)
- }
+ repr_docs_indices.append([repr_docs_indices[-1][-1] + i + 1 if index != 0 else i for i in range(nr_docs)])
+ repr_docs_mappings = {topic: repr_docs[i[0] : i[-1] + 1] for topic, i in zip(topics.keys(), repr_docs_indices)}
return repr_docs_mappings, repr_docs, repr_docs_indices, repr_docs_ids
@@ -4393,30 +4116,22 @@ def _create_topic_vectors(
topic_ids = topics_from["topics_from"]
topic_sizes = topics_from["topic_sizes"]
if topic_ids:
- embds = np.array(self.topic_embeddings_)[
- np.array(topic_ids) + self._outliers
- ]
+ embds = np.array(self.topic_embeddings_)[np.array(topic_ids) + self._outliers]
topic_embedding = np.average(embds, axis=0, weights=topic_sizes)
topic_embeddings_dict[topic_to] = topic_embedding
# Re-order topic embeddings
topics_to_map = {
- topic_mapping[0]: topic_mapping[1]
- for topic_mapping in np.array(self.topic_mapper_.mappings_)[:, -2:]
+ topic_mapping[0]: topic_mapping[1] for topic_mapping in np.array(self.topic_mapper_.mappings_)[:, -2:]
}
topic_embeddings = {}
for topic, embds in topic_embeddings_dict.items():
topic_embeddings[topics_to_map[topic]] = embds
unique_topics = sorted(list(topic_embeddings.keys()))
- self.topic_embeddings_ = np.array(
- [topic_embeddings[topic] for topic in unique_topics]
- )
+ self.topic_embeddings_ = np.array([topic_embeddings[topic] for topic in unique_topics])
# Topic embeddings based on keyword representations
- elif (
- self.embedding_model is not None
- and type(self.embedding_model) is not BaseEmbedder
- ):
+ elif self.embedding_model is not None and type(self.embedding_model) is not BaseEmbedder:
topic_list = list(self.topic_representations_.keys())
topic_list.sort()
@@ -4428,9 +4143,7 @@ def _create_topic_vectors(
# Extract embeddings for all words in all topics
topic_words = [self.get_topic(topic) for topic in topic_list]
topic_words = [word[0] for topic in topic_words for word in topic]
- word_embeddings = self._extract_embeddings(
- topic_words, method="word", verbose=False
- )
+ word_embeddings = self._extract_embeddings(topic_words, method="word", verbose=False)
# Take the weighted average of word embeddings in a topic based on their c-TF-IDF value
# The embeddings var is a single numpy matrix and therefore slicing is necessary to
@@ -4488,33 +4201,16 @@ def _c_tf_idf(
if self.ctfidf_model.seed_words and self.seed_topic_list:
seed_topic_list = [seed for seeds in self.seed_topic_list for seed in seeds]
multiplier = np.array(
- [
- self.ctfidf_model.seed_multiplier
- if word in self.ctfidf_model.seed_words
- else 1
- for word in words
- ]
- )
- multiplier = np.array(
- [
- 1.2 if word in seed_topic_list else value
- for value, word in zip(multiplier, words)
- ]
+ [self.ctfidf_model.seed_multiplier if word in self.ctfidf_model.seed_words else 1 for word in words]
)
+ multiplier = np.array([1.2 if word in seed_topic_list else value for value, word in zip(multiplier, words)])
elif self.ctfidf_model.seed_words:
multiplier = np.array(
- [
- self.ctfidf_model.seed_multiplier
- if word in self.ctfidf_model.seed_words
- else 1
- for word in words
- ]
+ [self.ctfidf_model.seed_multiplier if word in self.ctfidf_model.seed_words else 1 for word in words]
)
elif self.seed_topic_list:
seed_topic_list = [seed for seeds in self.seed_topic_list for seed in seeds]
- multiplier = np.array(
- [1.2 if word in seed_topic_list else 1 for word in words]
- )
+ multiplier = np.array([1.2 if word in seed_topic_list else 1 for word in words])
if fit:
self.ctfidf_model = self.ctfidf_model.fit(X, multiplier=multiplier)
@@ -4572,9 +4268,7 @@ def _extract_words_per_topic(
# Get top 30 words per topic based on c-TF-IDF score
base_topics = {
label: [
- (words[word_index], score)
- if word_index is not None and score > 0
- else ("", 0.00001)
+ (words[word_index], score) if word_index is not None and score > 0 else ("", 0.00001)
for word_index, score in zip(indices[index][::-1], scores[index][::-1])
]
for index, label in enumerate(labels)
@@ -4584,40 +4278,27 @@ def _extract_words_per_topic(
topics = base_topics.copy()
if not self.representation_model:
# Default representation: c_tf_idf + top_n_words
- topics = {
- label: values[: self.top_n_words] for label, values in topics.items()
- }
+ topics = {label: values[: self.top_n_words] for label, values in topics.items()}
elif isinstance(self.representation_model, list):
for tuner in self.representation_model:
topics = tuner.extract_topics(self, documents, c_tf_idf, topics)
elif isinstance(self.representation_model, BaseRepresentation):
- topics = self.representation_model.extract_topics(
- self, documents, c_tf_idf, topics
- )
+ topics = self.representation_model.extract_topics(self, documents, c_tf_idf, topics)
elif isinstance(self.representation_model, dict):
if self.representation_model.get("Main"):
main_model = self.representation_model["Main"]
if isinstance(main_model, BaseRepresentation):
- topics = main_model.extract_topics(
- self, documents, c_tf_idf, topics
- )
+ topics = main_model.extract_topics(self, documents, c_tf_idf, topics)
elif isinstance(main_model, list):
for tuner in main_model:
topics = tuner.extract_topics(self, documents, c_tf_idf, topics)
else:
- raise TypeError(
- f"unsupported type {type(main_model).__name__} for representation_model['Main']"
- )
+ raise TypeError(f"unsupported type {type(main_model).__name__} for representation_model['Main']")
else:
# Default representation: c_tf_idf + top_n_words
- topics = {
- label: values[: self.top_n_words]
- for label, values in topics.items()
- }
+ topics = {label: values[: self.top_n_words] for label, values in topics.items()}
else:
- raise TypeError(
- f"unsupported type {type(self.representation_model).__name__} for representation_model"
- )
+ raise TypeError(f"unsupported type {type(self.representation_model).__name__} for representation_model")
# Extract additional topic aspects
if calculate_aspects and isinstance(self.representation_model, dict):
@@ -4626,19 +4307,12 @@ def _extract_words_per_topic(
aspects = base_topics.copy()
if not aspect_model:
# Default representation: c_tf_idf + top_n_words
- aspects = {
- label: values[: self.top_n_words]
- for label, values in aspects.items()
- }
+ aspects = {label: values[: self.top_n_words] for label, values in aspects.items()}
if isinstance(aspect_model, list):
for tuner in aspect_model:
- aspects = tuner.extract_topics(
- self, documents, c_tf_idf, aspects
- )
+ aspects = tuner.extract_topics(self, documents, c_tf_idf, aspects)
elif isinstance(aspect_model, BaseRepresentation):
- aspects = aspect_model.extract_topics(
- self, documents, c_tf_idf, aspects
- )
+ aspects = aspect_model.extract_topics(self, documents, c_tf_idf, aspects)
else:
raise TypeError(
f"unsupported type {type(aspect_model).__name__} for representation_model[{repr(aspect)}]"
@@ -4647,9 +4321,7 @@ def _extract_words_per_topic(
return topics
- def _reduce_topics(
- self, documents: pd.DataFrame, use_ctfidf: bool = False
- ) -> pd.DataFrame:
+ def _reduce_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False) -> pd.DataFrame:
"""Reduce topics to self.nr_topics.
Arguments:
@@ -4676,9 +4348,7 @@ def _reduce_topics(
)
return documents
- def _reduce_to_n_topics(
- self, documents: pd.DataFrame, use_ctfidf: bool = False
- ) -> pd.DataFrame:
+ def _reduce_to_n_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False) -> pd.DataFrame:
"""Reduce topics to self.nr_topics.
Arguments:
@@ -4700,9 +4370,7 @@ def _reduce_to_n_topics(
# Cluster the topic embeddings using AgglomerativeClustering
if version.parse(sklearn_version) >= version.parse("1.4.0"):
- cluster = AgglomerativeClustering(
- self.nr_topics - self._outliers, metric="precomputed", linkage="average"
- )
+ cluster = AgglomerativeClustering(self.nr_topics - self._outliers, metric="precomputed", linkage="average")
else:
cluster = AgglomerativeClustering(
self.nr_topics - self._outliers,
@@ -4713,9 +4381,7 @@ def _reduce_to_n_topics(
new_topics = [cluster.labels_[topic] if topic != -1 else -1 for topic in topics]
# Track mappings and sizes of topics for merging topic embeddings
- mapped_topics = {
- from_topic: to_topic for from_topic, to_topic in zip(topics, new_topics)
- }
+ mapped_topics = {from_topic: to_topic for from_topic, to_topic in zip(topics, new_topics)}
basic_mappings = defaultdict(list)
for key, val in sorted(mapped_topics.items()):
basic_mappings[val].append(key)
@@ -4742,8 +4408,7 @@ def _reduce_to_n_topics(
if self._is_zeroshot():
new_topic_id_to_zeroshot_topic_idx = {}
topics_to_map = {
- topic_mapping[0]: topic_mapping[1]
- for topic_mapping in np.array(self.topic_mapper_.mappings_)[:, -2:]
+ topic_mapping[0]: topic_mapping[1] for topic_mapping in np.array(self.topic_mapper_.mappings_)[:, -2:]
}
for topic_to, topics_from in basic_mappings.items():
@@ -4753,9 +4418,7 @@ def _reduce_to_n_topics(
# which of the original topics are zero-shot
zeroshot_topic_ids = [
- topic_id
- for topic_id in topics_from
- if topic_id in self._topic_id_to_zeroshot_topic_idx
+ topic_id for topic_id in topics_from if topic_id in self._topic_id_to_zeroshot_topic_idx
]
if len(zeroshot_topic_ids) == 0:
continue
@@ -4763,9 +4426,7 @@ def _reduce_to_n_topics(
# If any of the original topics are zero-shot, take the best fitting zero-shot label
# if the cosine similarity with the new topic exceeds the zero-shot threshold
zeroshot_labels = [
- self.zeroshot_topic_list[
- self._topic_id_to_zeroshot_topic_idx[topic_id]
- ]
+ self.zeroshot_topic_list[self._topic_id_to_zeroshot_topic_idx[topic_id]]
for topic_id in zeroshot_topic_ids
]
zeroshot_embeddings = self._extract_embeddings(zeroshot_labels)
@@ -4775,18 +4436,14 @@ def _reduce_to_n_topics(
best_zeroshot_topic_idx = np.argmax(cosine_similarities)
best_cosine_similarity = cosine_similarities[best_zeroshot_topic_idx]
if best_cosine_similarity >= self.zeroshot_min_similarity:
- new_topic_id_to_zeroshot_topic_idx[topic_to] = zeroshot_topic_ids[
- best_zeroshot_topic_idx
- ]
+ new_topic_id_to_zeroshot_topic_idx[topic_to] = zeroshot_topic_ids[best_zeroshot_topic_idx]
self._topic_id_to_zeroshot_topic_idx = new_topic_id_to_zeroshot_topic_idx
self._update_topic_size(documents)
return documents
- def _auto_reduce_topics(
- self, documents: pd.DataFrame, use_ctfidf: bool = False
- ) -> pd.DataFrame:
+ def _auto_reduce_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False) -> pd.DataFrame:
"""Reduce the number of topics automatically using HDBSCAN.
Arguments:
@@ -4819,13 +4476,8 @@ def _auto_reduce_topics(
for index, prediction in enumerate(predictions)
if prediction != -1
}
- documents.Topic = (
- documents.Topic.map(mapped_topics).fillna(documents.Topic).astype(int)
- )
- mapped_topics = {
- from_topic: to_topic
- for from_topic, to_topic in zip(topics, documents.Topic.tolist())
- }
+ documents.Topic = documents.Topic.map(mapped_topics).fillna(documents.Topic).astype(int)
+ mapped_topics = {from_topic: to_topic for from_topic, to_topic in zip(topics, documents.Topic.tolist())}
# Track mappings and sizes of topics for merging topic embeddings
mappings = defaultdict(list)
@@ -4873,17 +4525,13 @@ def _sort_mappings_by_frequency(self, documents: pd.DataFrame) -> pd.DataFrame:
self._update_topic_size(documents)
# Map topics based on frequency
- df = pd.DataFrame(
- self.topic_sizes_.items(), columns=["Old_Topic", "Size"]
- ).sort_values("Size", ascending=False)
+ df = pd.DataFrame(self.topic_sizes_.items(), columns=["Old_Topic", "Size"]).sort_values("Size", ascending=False)
df = df[df.Old_Topic != -1]
sorted_topics = {**{-1: -1}, **dict(zip(df.Old_Topic, range(len(df))))}
self.topic_mapper_.add_mappings(sorted_topics)
# Map documents
- documents.Topic = (
- documents.Topic.map(sorted_topics).fillna(documents.Topic).astype(int)
- )
+ documents.Topic = documents.Topic.map(sorted_topics).fillna(documents.Topic).astype(int)
self._update_topic_size(documents)
return documents
@@ -4918,9 +4566,7 @@ def _map_probabilities(
)
for from_topic, to_topic in mappings.items():
if to_topic != -1 and from_topic != -1:
- mapped_probabilities[:, to_topic] += probabilities[
- :, from_topic
- ]
+ mapped_probabilities[:, to_topic] += probabilities[:, from_topic]
return mapped_probabilities
@@ -4936,12 +4582,8 @@ def _preprocess_text(self, documents: np.ndarray) -> List[str]:
cleaned_documents = [doc.replace("\n", " ") for doc in documents]
cleaned_documents = [doc.replace("\t", " ") for doc in cleaned_documents]
if self.language == "english":
- cleaned_documents = [
- re.sub(r"[^A-Za-z0-9 ]+", "", doc) for doc in cleaned_documents
- ]
- cleaned_documents = [
- doc if doc != "" else "emptydoc" for doc in cleaned_documents
- ]
+ cleaned_documents = [re.sub(r"[^A-Za-z0-9 ]+", "", doc) for doc in cleaned_documents]
+ cleaned_documents = [doc if doc != "" else "emptydoc" for doc in cleaned_documents]
return cleaned_documents
@staticmethod
@@ -4961,13 +4603,8 @@ def _top_n_idx_sparse(matrix: csr_matrix, n: int) -> np.ndarray:
indices = []
for le, ri in zip(matrix.indptr[:-1], matrix.indptr[1:]):
n_row_pick = min(n, ri - le)
- values = matrix.indices[
- le + np.argpartition(matrix.data[le:ri], -n_row_pick)[-n_row_pick:]
- ]
- values = [
- values[index] if len(values) >= index + 1 else None
- for index in range(n)
- ]
+ values = matrix.indices[le + np.argpartition(matrix.data[le:ri], -n_row_pick)[-n_row_pick:]]
+ values = [values[index] if len(values) >= index + 1 else None for index in range(n)]
indices.append(values)
return np.array(indices)
@@ -4984,9 +4621,7 @@ def _top_n_values_sparse(matrix: csr_matrix, indices: np.ndarray) -> np.ndarray:
"""
top_values = []
for row, values in enumerate(indices):
- scores = np.array(
- [matrix[row, value] if value is not None else 0 for value in values]
- )
+ scores = np.array([matrix[row, value] if value is not None else 0 for value in values])
top_values.append(scores)
return np.array(top_values)
@@ -4999,11 +4634,7 @@ def _get_param_names(cls):
"""
init_signature = inspect.signature(cls.__init__)
parameters = sorted(
- [
- p.name
- for p in init_signature.parameters.values()
- if p.name != "self" and p.kind != p.VAR_KEYWORD
- ]
+ [p.name for p in init_signature.parameters.values() if p.name != "self" and p.kind != p.VAR_KEYWORD]
)
return parameters
@@ -5173,22 +4804,16 @@ def _create_model_from_files(
**params,
)
topic_model.topic_embeddings_ = tensors["topic_embeddings"].numpy()
- topic_model.topic_representations_ = {
- int(key): val for key, val in topics["topic_representations"].items()
- }
+ topic_model.topic_representations_ = {int(key): val for key, val in topics["topic_representations"].items()}
topic_model.topics_ = topics["topics"]
- topic_model.topic_sizes_ = {
- int(key): val for key, val in topics["topic_sizes"].items()
- }
+ topic_model.topic_sizes_ = {int(key): val for key, val in topics["topic_sizes"].items()}
topic_model.custom_labels_ = topics["custom_labels"]
if topics.get("topic_aspects"):
topic_aspects = {}
for aspect, values in topics["topic_aspects"].items():
if aspect != "Visual_Aspect":
- topic_aspects[aspect] = {
- int(topic): value for topic, value in values.items()
- }
+ topic_aspects[aspect] = {int(topic): value for topic, value in values.items()}
topic_model.topic_aspects_ = topic_aspects
if images is not None:
@@ -5209,20 +4834,12 @@ def _create_model_from_files(
)
# CountVectorizer
- topic_model.vectorizer_model = CountVectorizer(
- **ctfidf_config["vectorizer_model"]["params"]
- )
- topic_model.vectorizer_model.vocabulary_ = ctfidf_config["vectorizer_model"][
- "vocab"
- ]
+ topic_model.vectorizer_model = CountVectorizer(**ctfidf_config["vectorizer_model"]["params"])
+ topic_model.vectorizer_model.vocabulary_ = ctfidf_config["vectorizer_model"]["vocab"]
# ClassTfidfTransformer
- topic_model.ctfidf_model.reduce_frequent_words = ctfidf_config["ctfidf_model"][
- "reduce_frequent_words"
- ]
- topic_model.ctfidf_model.bm25_weighting = ctfidf_config["ctfidf_model"][
- "bm25_weighting"
- ]
+ topic_model.ctfidf_model.reduce_frequent_words = ctfidf_config["ctfidf_model"]["reduce_frequent_words"]
+ topic_model.ctfidf_model.bm25_weighting = ctfidf_config["ctfidf_model"]["bm25_weighting"]
idf = ctfidf_tensors["diag"].numpy()
topic_model.ctfidf_model._idf_diag = sp.diags(
idf, offsets=0, shape=(len(idf), len(idf)), format="csr", dtype=np.float64
diff --git a/bertopic/_save_utils.py b/bertopic/_save_utils.py
index a01ba691..845e0f75 100644
--- a/bertopic/_save_utils.py
+++ b/bertopic/_save_utils.py
@@ -135,9 +135,7 @@ def push_to_hf_hub(
save_ctfidf: Whether to save c-TF-IDF information
"""
if not _has_hf_hub:
- raise ValueError(
- "Make sure you have the huggingface hub installed via `pip install --upgrade huggingface_hub`"
- )
+ raise ValueError("Make sure you have the huggingface hub installed via `pip install --upgrade huggingface_hub`")
# Create repo if it doesn't exist yet and infer complete repo_id
repo_url = create_repo(repo_id, token=token, private=private, exist_ok=True)
@@ -156,9 +154,7 @@ def push_to_hf_hub(
# Add README if it does not exist
try:
- get_hf_file_metadata(
- hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision)
- )
+ get_hf_file_metadata(hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision))
except: # noqa: E722
if model_card:
readme_text = generate_readme(model, repo_id)
@@ -241,13 +237,9 @@ def load_files_from_hf(path):
# c-TF-IDF
try:
- ctfidf_config = load_cfg_from_json(
- hf_hub_download(path, CTFIDF_CFG_NAME, revision=None)
- )
+ ctfidf_config = load_cfg_from_json(hf_hub_download(path, CTFIDF_CFG_NAME, revision=None))
try:
- ctfidf_tensors = hf_hub_download(
- path, CTFIDF_SAFE_WEIGHTS_NAME, revision=None
- )
+ ctfidf_tensors = hf_hub_download(path, CTFIDF_SAFE_WEIGHTS_NAME, revision=None)
ctfidf_tensors = load_safetensors(ctfidf_tensors)
except: # noqa: E722
ctfidf_tensors = hf_hub_download(path, CTFIDF_WEIGHTS_NAME, revision=None)
@@ -268,9 +260,7 @@ def load_files_from_hf(path):
topic_list = list(topics["topic_representations"].keys())
images = {}
for topic in topic_list:
- image = Image.open(
- hf_hub_download(path, f"images/{topic}.jpg", revision=None)
- )
+ image = Image.open(hf_hub_download(path, f"images/{topic}.jpg", revision=None))
images[int(topic)] = image
return topics, params, tensors, ctfidf_tensors, ctfidf_config, images
@@ -283,11 +273,7 @@ def generate_readme(model, repo_id: str):
# Get Statistics
model_name = repo_id.split("/")[-1]
- params = {
- param: value
- for param, value in model.get_params().items()
- if "model" not in param
- }
+ params = {param: value for param, value in model.get_params().items() if "model" not in param}
params = "\n".join([f"* {param}: {value}" for param, value in params.items()])
topics = sorted(list(set(model.topics_)))
nr_topics = str(len(set(model.topics_)))
@@ -298,23 +284,15 @@ def generate_readme(model, repo_id: str):
nr_documents = ""
# Topic information
- topic_keywords = [
- " - ".join(list(zip(*model.get_topic(topic)))[0][:5]) for topic in topics
- ]
+ topic_keywords = [" - ".join(list(zip(*model.get_topic(topic)))[0][:5]) for topic in topics]
topic_freq = [model.get_topic_freq(topic) for topic in topics]
- topic_labels = (
- model.custom_labels_
- if model.custom_labels_
- else [model.topic_labels_[topic] for topic in topics]
- )
+ topic_labels = model.custom_labels_ if model.custom_labels_ else [model.topic_labels_[topic] for topic in topics]
topics = [
f"| {topic} | {topic_keywords[index]} | {topic_freq[topic]} | {topic_labels[index]} | \n"
for index, topic in enumerate(topics)
]
topics = topic_table_head + "".join(topics)
- frameworks = "\n".join(
- [f"* {param}: {value}" for param, value in get_package_versions().items()]
- )
+ frameworks = "\n".join([f"* {param}: {value}" for param, value in get_package_versions().items()])
# Fill Statistics into model card
model_card = model_card.replace("{MODEL_NAME}", model_name)
@@ -330,9 +308,7 @@ def generate_readme(model, repo_id: str):
if not has_visual_aspect:
model_card = model_card.replace("{PIPELINE_TAG}", "text-classification")
else:
- model_card = model_card.replace(
- "pipeline_tag: {PIPELINE_TAG}\n", ""
- ) # TODO add proper tag for this instance
+ model_card = model_card.replace("pipeline_tag: {PIPELINE_TAG}\n", "") # TODO add proper tag for this instance
return model_card
diff --git a/bertopic/_utils.py b/bertopic/_utils.py
index 0695b7cf..6c859041 100644
--- a/bertopic/_utils.py
+++ b/bertopic/_utils.py
@@ -45,20 +45,14 @@ def check_documents_type(documents):
if not any([isinstance(doc, str) for doc in documents]):
raise TypeError("Make sure that the iterable only contains strings.")
else:
- raise TypeError(
- "Make sure that the documents variable is an iterable containing strings only."
- )
+ raise TypeError("Make sure that the documents variable is an iterable containing strings only.")
def check_embeddings_shape(embeddings, docs):
"""Check if the embeddings have the correct shape."""
if embeddings is not None:
- if not any(
- [isinstance(embeddings, np.ndarray), isinstance(embeddings, csr_matrix)]
- ):
- raise ValueError(
- "Make sure to input embeddings as a numpy array or scipy.sparse.csr.csr_matrix. "
- )
+ if not any([isinstance(embeddings, np.ndarray), isinstance(embeddings, csr_matrix)]):
+ raise ValueError("Make sure to input embeddings as a numpy array or scipy.sparse.csr.csr_matrix. ")
else:
if embeddings.shape[0] != len(docs):
raise ValueError(
@@ -137,16 +131,11 @@ def validate_distance_matrix(X, n_samples):
# check it has correct size
n = s[0]
if n != (n_samples * (n_samples - 1) / 2):
- raise ValueError(
- "The condensed distance matrix must have " "shape (n*(n-1)/2,)."
- )
+ raise ValueError("The condensed distance matrix must have " "shape (n*(n-1)/2,).")
elif len(s) == 2:
# check it has correct size
if (s[0] != n_samples) or (s[1] != n_samples):
- raise ValueError(
- "The distance matrix must be of shape "
- "(n, n) where n is the number of samples."
- )
+ raise ValueError("The distance matrix must be of shape " "(n, n) where n is the number of samples.")
# force zero diagonal and convert to condensed
np.fill_diagonal(X, 0)
X = squareform(X)
@@ -182,15 +171,11 @@ def get_unique_distances(dists: np.array, noise_max=1e-7) -> np.array:
for i in range(dists.shape[0] - 1):
if dists[i] == dists[i + 1]:
# returns the next unique distance or the current distance with the added noise
- next_unique_dist = next(
- (d for d in dists[i + 1 :] if d != dists[i]), dists[i] + noise_max
- )
+ next_unique_dist = next((d for d in dists[i + 1 :] if d != dists[i]), dists[i] + noise_max)
# the noise can never be larger than the difference between the next unique distance and the current one
curr_max_noise = min(noise_max, next_unique_dist - dists_cp[i])
- dists_cp[i + 1] = np.random.uniform(
- low=dists_cp[i] + curr_max_noise / 2, high=dists_cp[i] + curr_max_noise
- )
+ dists_cp[i + 1] = np.random.uniform(low=dists_cp[i] + curr_max_noise / 2, high=dists_cp[i] + curr_max_noise)
return dists_cp
diff --git a/bertopic/backend/_flair.py b/bertopic/backend/_flair.py
index 2abeec49..f6e27fea 100644
--- a/bertopic/backend/_flair.py
+++ b/bertopic/backend/_flair.py
@@ -67,9 +67,7 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
embeddings = []
for document in tqdm(documents, disable=not verbose):
try:
- sentence = (
- Sentence(document) if document else Sentence("an empty document")
- )
+ sentence = Sentence(document) if document else Sentence("an empty document")
self.embedding_model.embed(sentence)
except RuntimeError:
sentence = Sentence("an empty document")
diff --git a/bertopic/backend/_gensim.py b/bertopic/backend/_gensim.py
index 3727e04d..d76fff17 100644
--- a/bertopic/backend/_gensim.py
+++ b/bertopic/backend/_gensim.py
@@ -48,9 +48,7 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
Document/words embeddings with shape (n, m) with `n` documents/words
that each have an embeddings size of `m`
"""
- vector_shape = self.embedding_model.get_vector(
- list(self.embedding_model.index_to_key)[0]
- ).shape[0]
+ vector_shape = self.embedding_model.get_vector(list(self.embedding_model.index_to_key)[0]).shape[0]
empty_vector = np.zeros(vector_shape)
# Extract word embeddings and pool to document-level
diff --git a/bertopic/backend/_hftransformers.py b/bertopic/backend/_hftransformers.py
index 8de9cc2a..344412e9 100644
--- a/bertopic/backend/_hftransformers.py
+++ b/bertopic/backend/_hftransformers.py
@@ -58,9 +58,7 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
embeddings = []
for document, features in tqdm(
- zip(
- documents, self.embedding_model(dataset, truncation=True, padding=True)
- ),
+ zip(documents, self.embedding_model(dataset, truncation=True, padding=True)),
total=len(dataset),
disable=not verbose,
):
@@ -79,12 +77,10 @@ def _embed(self, document: str, features: np.ndarray) -> np.ndarray:
https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2#usage-huggingface-transformers
"""
token_embeddings = np.array(features)
- attention_mask = self.embedding_model.tokenizer(
- document, truncation=True, padding=True, return_tensors="np"
- )["attention_mask"]
- input_mask_expanded = np.broadcast_to(
- np.expand_dims(attention_mask, -1), token_embeddings.shape
- )
+ attention_mask = self.embedding_model.tokenizer(document, truncation=True, padding=True, return_tensors="np")[
+ "attention_mask"
+ ]
+ input_mask_expanded = np.broadcast_to(np.expand_dims(attention_mask, -1), token_embeddings.shape)
sum_embeddings = np.sum(token_embeddings * input_mask_expanded, 1)
sum_mask = np.clip(
input_mask_expanded.sum(1),
diff --git a/bertopic/backend/_multimodal.py b/bertopic/backend/_multimodal.py
index 846efc41..e1aac8d3 100644
--- a/bertopic/backend/_multimodal.py
+++ b/bertopic/backend/_multimodal.py
@@ -84,9 +84,7 @@ def __init__(
except: # noqa: E722
self.tokenizer = None
- def embed(
- self, documents: List[str], images: List[str] = None, verbose: bool = False
- ) -> np.ndarray:
+ def embed(self, documents: List[str], images: List[str] = None, verbose: bool = False) -> np.ndarray:
"""Embed a list of n documents/words or images into an n-dimensional
matrix of embeddings.
@@ -124,9 +122,7 @@ def embed(
elif image_embeddings is not None:
return image_embeddings
- def embed_documents(
- self, documents: List[str], verbose: bool = False
- ) -> np.ndarray:
+ def embed_documents(self, documents: List[str], verbose: bool = False) -> np.ndarray:
"""Embed a list of n documents/words into an n-dimensional
matrix of embeddings.
@@ -139,9 +135,7 @@ def embed_documents(
that each have an embeddings size of `m`
"""
truncated_docs = [self._truncate_document(doc) for doc in documents]
- embeddings = self.embedding_model.encode(
- truncated_docs, show_progress_bar=verbose
- )
+ embeddings = self.embedding_model.encode(truncated_docs, show_progress_bar=verbose)
return embeddings
def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:
@@ -170,15 +164,12 @@ def embed_images(self, images, verbose):
end_index = (i * self.batch_size) + self.batch_size
images_to_embed = [
- Image.open(image) if isinstance(image, str) else image
- for image in images[start_index:end_index]
+ Image.open(image) if isinstance(image, str) else image for image in images[start_index:end_index]
]
if self.image_model is not None:
img_emb = self.image_model.encode(images_to_embed)
else:
- img_emb = self.embedding_model.encode(
- images_to_embed, show_progress_bar=False
- )
+ img_emb = self.embedding_model.encode(images_to_embed, show_progress_bar=False)
embeddings.extend(img_emb.tolist())
# Close images
@@ -191,9 +182,7 @@ def embed_images(self, images, verbose):
if self.image_model is not None:
embeddings = self.image_model.encode(images_to_embed)
else:
- embeddings = self.embedding_model.encode(
- images_to_embed, show_progress_bar=False
- )
+ embeddings = self.embedding_model.encode(images_to_embed, show_progress_bar=False)
return embeddings
def _truncate_document(self, document):
diff --git a/bertopic/backend/_openai.py b/bertopic/backend/_openai.py
index 19d18268..7a4cc6b3 100644
--- a/bertopic/backend/_openai.py
+++ b/bertopic/backend/_openai.py
@@ -70,9 +70,7 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
if self.batch_size is not None:
embeddings = []
for batch in tqdm(self._chunks(prepared_documents), disable=not verbose):
- response = self.client.embeddings.create(
- input=batch, **self.generator_kwargs
- )
+ response = self.client.embeddings.create(input=batch, **self.generator_kwargs)
embeddings.extend([r.embedding for r in response.data])
# Delay subsequent calls
@@ -81,9 +79,7 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
# Extract embeddings all at once
else:
- response = self.client.embeddings.create(
- input=prepared_documents, **self.generator_kwargs
- )
+ response = self.client.embeddings.create(input=prepared_documents, **self.generator_kwargs)
embeddings = [r.embedding for r in response.data]
return np.array(embeddings)
diff --git a/bertopic/backend/_use.py b/bertopic/backend/_use.py
index c33c76fc..a17a87d1 100644
--- a/bertopic/backend/_use.py
+++ b/bertopic/backend/_use.py
@@ -50,9 +50,6 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
that each have an embeddings size of `m`
"""
embeddings = np.array(
- [
- self.embedding_model([doc]).cpu().numpy()[0]
- for doc in tqdm(documents, disable=not verbose)
- ]
+ [self.embedding_model([doc]).cpu().numpy()[0] for doc in tqdm(documents, disable=not verbose)]
)
return embeddings
diff --git a/bertopic/backend/_utils.py b/bertopic/backend/_utils.py
index 7c78d32e..4190bd4e 100644
--- a/bertopic/backend/_utils.py
+++ b/bertopic/backend/_utils.py
@@ -68,9 +68,7 @@
]
-def select_backend(
- embedding_model, language: str = None, verbose: bool = False
-) -> BaseEmbedder:
+def select_backend(embedding_model, language: str = None, verbose: bool = False) -> BaseEmbedder:
"""Select an embedding model based on language or a specific provided model.
When selecting a language, we choose all-MiniLM-L6-v2 for English and
paraphrase-multilingual-MiniLM-L12-v2 for all other languages as it supports 100+ languages.
@@ -115,9 +113,7 @@ def select_backend(
return USEBackend(embedding_model)
# Sentence Transformer embeddings
- if "sentence_transformers" in str(type(embedding_model)) or isinstance(
- embedding_model, str
- ):
+ if "sentence_transformers" in str(type(embedding_model)) or isinstance(embedding_model, str):
from ._sentencetransformers import SentenceTransformerBackend
return SentenceTransformerBackend(embedding_model)
@@ -134,13 +130,9 @@ def select_backend(
from ._sentencetransformers import SentenceTransformerBackend
if language.lower() in ["English", "english", "en"]:
- return SentenceTransformerBackend(
- "sentence-transformers/all-MiniLM-L6-v2"
- )
+ return SentenceTransformerBackend("sentence-transformers/all-MiniLM-L6-v2")
elif language.lower() in languages or language == "multilingual":
- return SentenceTransformerBackend(
- "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
- )
+ return SentenceTransformerBackend("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
else:
raise ValueError(
f"{language} is currently not supported. However, you can "
diff --git a/bertopic/cluster/_utils.py b/bertopic/cluster/_utils.py
index 82f243c6..375a15b3 100644
--- a/bertopic/cluster/_utils.py
+++ b/bertopic/cluster/_utils.py
@@ -25,9 +25,7 @@ def hdbscan_delegator(model, func: str, embeddings: np.ndarray = None):
if "cuml" in str_type_model and "hdbscan" in str_type_model:
from cuml.cluster import hdbscan as cuml_hdbscan
- predictions, probabilities = cuml_hdbscan.approximate_predict(
- model, embeddings
- )
+ predictions, probabilities = cuml_hdbscan.approximate_predict(model, embeddings)
return predictions, probabilities
predictions = model.predict(embeddings)
diff --git a/bertopic/plotting/_approximate_distribution.py b/bertopic/plotting/_approximate_distribution.py
index a6380273..d5c0bd60 100644
--- a/bertopic/plotting/_approximate_distribution.py
+++ b/bertopic/plotting/_approximate_distribution.py
@@ -86,9 +86,7 @@ def text_color(val):
def highligh_color(data, color="white"):
attr = "background-color: {}".format(color)
- return pd.DataFrame(
- np.where(data == 0, attr, ""), index=data.index, columns=data.columns
- )
+ return pd.DataFrame(np.where(data == 0, attr, ""), index=data.index, columns=data.columns)
if len(df) == 0:
return df
diff --git a/bertopic/plotting/_barchart.py b/bertopic/plotting/_barchart.py
index 417e2c0f..a6e614cb 100644
--- a/bertopic/plotting/_barchart.py
+++ b/bertopic/plotting/_barchart.py
@@ -52,9 +52,7 @@ def visualize_barchart(
"""
- colors = itertools.cycle(
- ["#D55E00", "#0072B2", "#CC79A7", "#E69F00", "#56B4E9", "#009E73", "#F0E442"]
- )
+ colors = itertools.cycle(["#D55E00", "#0072B2", "#CC79A7", "#E69F00", "#56B4E9", "#009E73", "#F0E442"])
# Select topics based on top_n and topics args
freq_df = topic_model.get_topic_freq()
@@ -68,21 +66,11 @@ def visualize_barchart(
# Initialize figure
if isinstance(custom_labels, str):
- subplot_titles = [
- [[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic]
- for topic in topics
- ]
- subplot_titles = [
- "_".join([label[0] for label in labels[:4]]) for labels in subplot_titles
- ]
- subplot_titles = [
- label if len(label) < 30 else label[:27] + "..." for label in subplot_titles
- ]
+ subplot_titles = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in topics]
+ subplot_titles = ["_".join([label[0] for label in labels[:4]]) for labels in subplot_titles]
+ subplot_titles = [label if len(label) < 30 else label[:27] + "..." for label in subplot_titles]
elif topic_model.custom_labels_ is not None and custom_labels:
- subplot_titles = [
- topic_model.custom_labels_[topic + topic_model._outliers]
- for topic in topics
- ]
+ subplot_titles = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in topics]
else:
subplot_titles = [f"Topic {topic}" for topic in topics]
columns = 4
@@ -100,9 +88,7 @@ def visualize_barchart(
row = 1
column = 1
for topic in topics:
- words = [word + " " for word, _ in topic_model.get_topic(topic)][:n_words][
- ::-1
- ]
+ words = [word + " " for word, _ in topic_model.get_topic(topic)][:n_words][::-1]
scores = [score for _, score in topic_model.get_topic(topic)][:n_words][::-1]
fig.add_trace(
diff --git a/bertopic/plotting/_datamap.py b/bertopic/plotting/_datamap.py
index a793e4fc..a0e02c18 100644
--- a/bertopic/plotting/_datamap.py
+++ b/bertopic/plotting/_datamap.py
@@ -106,17 +106,13 @@ def visualize_document_datamap(
# Extract embeddings if not already done
if embeddings is None and reduced_embeddings is None:
- embeddings_to_reduce = topic_model._extract_embeddings(
- df.doc.to_list(), method="document"
- )
+ embeddings_to_reduce = topic_model._extract_embeddings(df.doc.to_list(), method="document")
else:
embeddings_to_reduce = embeddings
# Reduce input embeddings
if reduced_embeddings is None:
- umap_model = UMAP(
- n_neighbors=15, n_components=2, min_dist=0.15, metric="cosine"
- ).fit(embeddings_to_reduce)
+ umap_model = UMAP(n_neighbors=15, n_components=2, min_dist=0.15, metric="cosine").fit(embeddings_to_reduce)
embeddings_2d = umap_model.embedding_
else:
embeddings_2d = reduced_embeddings
@@ -125,27 +121,18 @@ def visualize_document_datamap(
# Prepare text and names
if isinstance(custom_labels, str):
- names = [
- [[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic]
- for topic in unique_topics
- ]
+ names = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in unique_topics]
names = [" ".join([label[0] for label in labels[:4]]) for labels in names]
names = [label if len(label) < 30 else label[:27] + "..." for label in names]
elif topic_model.custom_labels_ is not None and custom_labels:
- names = [
- topic_model.custom_labels_[topic + topic_model._outliers]
- for topic in unique_topics
- ]
+ names = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in unique_topics]
else:
names = [
- f"Topic-{topic}: "
- + " ".join([word for word, value in topic_model.get_topic(topic)][:3])
+ f"Topic-{topic}: " + " ".join([word for word, value in topic_model.get_topic(topic)][:3])
for topic in unique_topics
]
- topic_name_mapping = {
- topic_num: topic_name for topic_num, topic_name in zip(unique_topics, names)
- }
+ topic_name_mapping = {topic_num: topic_name for topic_num, topic_name in zip(unique_topics, names)}
topic_name_mapping[-1] = "Unlabelled"
# If a set of topics is chosen, set everything else to "Unlabelled"
diff --git a/bertopic/plotting/_distribution.py b/bertopic/plotting/_distribution.py
index d04d140b..c04a851b 100644
--- a/bertopic/plotting/_distribution.py
+++ b/bertopic/plotting/_distribution.py
@@ -60,17 +60,11 @@ def visualize_distribution(
# Create labels
if isinstance(custom_labels, str):
- labels = [
- [[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic]
- for topic in labels_idx
- ]
+ labels = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in labels_idx]
labels = ["_".join([label[0] for label in l[:4]]) for l in labels] # noqa: E741
labels = [label if len(label) < 30 else label[:27] + "..." for label in labels]
elif topic_model.custom_labels_ is not None and custom_labels:
- labels = [
- topic_model.custom_labels_[idx + topic_model._outliers]
- for idx in labels_idx
- ]
+ labels = [topic_model.custom_labels_[idx + topic_model._outliers] for idx in labels_idx]
else:
labels = []
for idx in labels_idx:
diff --git a/bertopic/plotting/_documents.py b/bertopic/plotting/_documents.py
index 0c5287b4..e1a3f1d3 100644
--- a/bertopic/plotting/_documents.py
+++ b/bertopic/plotting/_documents.py
@@ -109,24 +109,18 @@ def visualize_documents(
# Extract embeddings if not already done
if sample is None:
if embeddings is None and reduced_embeddings is None:
- embeddings_to_reduce = topic_model._extract_embeddings(
- df.doc.to_list(), method="document"
- )
+ embeddings_to_reduce = topic_model._extract_embeddings(df.doc.to_list(), method="document")
else:
embeddings_to_reduce = embeddings
else:
if embeddings is not None:
embeddings_to_reduce = embeddings[indices]
elif embeddings is None and reduced_embeddings is None:
- embeddings_to_reduce = topic_model._extract_embeddings(
- df.doc.to_list(), method="document"
- )
+ embeddings_to_reduce = topic_model._extract_embeddings(df.doc.to_list(), method="document")
# Reduce input embeddings
if reduced_embeddings is None:
- umap_model = UMAP(
- n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine"
- ).fit(embeddings_to_reduce)
+ umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine").fit(embeddings_to_reduce)
embeddings_2d = umap_model.embedding_
elif sample is not None and reduced_embeddings is not None:
embeddings_2d = reduced_embeddings[indices]
@@ -143,21 +137,14 @@ def visualize_documents(
# Prepare text and names
if isinstance(custom_labels, str):
- names = [
- [[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic]
- for topic in unique_topics
- ]
+ names = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in unique_topics]
names = ["_".join([label[0] for label in labels[:4]]) for labels in names]
names = [label if len(label) < 30 else label[:27] + "..." for label in names]
elif topic_model.custom_labels_ is not None and custom_labels:
- names = [
- topic_model.custom_labels_[topic + topic_model._outliers]
- for topic in unique_topics
- ]
+ names = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in unique_topics]
else:
names = [
- f"{topic}_"
- + "_".join([word for word, value in topic_model.get_topic(topic)][:3])
+ f"{topic}_" + "_".join([word for word, value in topic_model.get_topic(topic)][:3])
for topic in unique_topics
]
@@ -248,12 +235,8 @@ def visualize_documents(
y1=sum(y_range) / 2,
line=dict(color="#9E9E9E", width=2),
)
- fig.add_annotation(
- x=x_range[0], y=sum(y_range) / 2, text="D1", showarrow=False, yshift=10
- )
- fig.add_annotation(
- y=y_range[1], x=sum(x_range) / 2, text="D2", showarrow=False, xshift=10
- )
+ fig.add_annotation(x=x_range[0], y=sum(y_range) / 2, text="D1", showarrow=False, yshift=10)
+ fig.add_annotation(y=y_range[1], x=sum(x_range) / 2, text="D2", showarrow=False, xshift=10)
# Stylize layout
fig.update_layout(
diff --git a/bertopic/plotting/_heatmap.py b/bertopic/plotting/_heatmap.py
index ad9f0664..9e51f13e 100644
--- a/bertopic/plotting/_heatmap.py
+++ b/bertopic/plotting/_heatmap.py
@@ -59,9 +59,9 @@ def visualize_heatmap(
"""
- embeddings = select_topic_representation(
- topic_model.c_tf_idf_, topic_model.topic_embeddings_, use_ctfidf
- )[0][topic_model._outliers :]
+ embeddings = select_topic_representation(topic_model.c_tf_idf_, topic_model.topic_embeddings_, use_ctfidf)[0][
+ topic_model._outliers :
+ ]
# Select topics based on top_n and topics args
freq_df = topic_model.get_topic_freq()
@@ -77,10 +77,7 @@ def visualize_heatmap(
sorted_topics = topics
if n_clusters:
if n_clusters >= len(set(topics)):
- raise ValueError(
- "Make sure to set `n_clusters` lower than "
- "the total number of unique topics."
- )
+ raise ValueError("Make sure to set `n_clusters` lower than " "the total number of unique topics.")
distance_matrix = cosine_similarity(embeddings[topics])
Z = linkage(distance_matrix, "ward")
@@ -101,31 +98,16 @@ def visualize_heatmap(
# Create labels
if isinstance(custom_labels, str):
new_labels = [
- [[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic]
- for topic in sorted_topics
- ]
- new_labels = [
- "_".join([label[0] for label in labels[:4]]) for labels in new_labels
- ]
- new_labels = [
- label if len(label) < 30 else label[:27] + "..." for label in new_labels
+ [[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in sorted_topics
]
+ new_labels = ["_".join([label[0] for label in labels[:4]]) for labels in new_labels]
+ new_labels = [label if len(label) < 30 else label[:27] + "..." for label in new_labels]
elif topic_model.custom_labels_ is not None and custom_labels:
- new_labels = [
- topic_model.custom_labels_[topic + topic_model._outliers]
- for topic in sorted_topics
- ]
+ new_labels = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in sorted_topics]
else:
- new_labels = [
- [[str(topic), None]] + topic_model.get_topic(topic)
- for topic in sorted_topics
- ]
- new_labels = [
- "_".join([label[0] for label in labels[:4]]) for labels in new_labels
- ]
- new_labels = [
- label if len(label) < 30 else label[:27] + "..." for label in new_labels
- ]
+ new_labels = [[[str(topic), None]] + topic_model.get_topic(topic) for topic in sorted_topics]
+ new_labels = ["_".join([label[0] for label in labels[:4]]) for labels in new_labels]
+ new_labels = [label if len(label) < 30 else label[:27] + "..." for label in new_labels]
fig = px.imshow(
distance_matrix,
diff --git a/bertopic/plotting/_hierarchical_documents.py b/bertopic/plotting/_hierarchical_documents.py
index 5501c8b7..2da9c83b 100644
--- a/bertopic/plotting/_hierarchical_documents.py
+++ b/bertopic/plotting/_hierarchical_documents.py
@@ -133,24 +133,18 @@ def visualize_hierarchical_documents(
# Extract embeddings if not already done
if sample is None:
if embeddings is None and reduced_embeddings is None:
- embeddings_to_reduce = topic_model._extract_embeddings(
- df.doc.to_list(), method="document"
- )
+ embeddings_to_reduce = topic_model._extract_embeddings(df.doc.to_list(), method="document")
else:
embeddings_to_reduce = embeddings
else:
if embeddings is not None:
embeddings_to_reduce = embeddings[indices]
elif embeddings is None and reduced_embeddings is None:
- embeddings_to_reduce = topic_model._extract_embeddings(
- df.doc.to_list(), method="document"
- )
+ embeddings_to_reduce = topic_model._extract_embeddings(df.doc.to_list(), method="document")
# Reduce input embeddings
if reduced_embeddings is None:
- umap_model = UMAP(
- n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine"
- ).fit(embeddings_to_reduce)
+ umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine").fit(embeddings_to_reduce)
embeddings_2d = umap_model.embedding_
elif sample is not None and reduced_embeddings is not None:
embeddings_2d = reduced_embeddings[indices]
@@ -179,8 +173,7 @@ def visualize_hierarchical_documents(
max_distances = [distances[i] for i in log_indices]
elif level_scale == "lin" or level_scale == "linear":
max_distances = [
- distances[indices[-1]]
- for indices in np.array_split(range(len(hierarchical_topics)), nr_levels)
+ distances[indices[-1]] for indices in np.array_split(range(len(hierarchical_topics)), nr_levels)
][::-1]
else:
raise ValueError("level_scale needs to be one of 'log' or 'linear'")
@@ -188,9 +181,7 @@ def visualize_hierarchical_documents(
for index, max_distance in enumerate(max_distances):
# Get topics below `max_distance`
mapping = {topic: topic for topic in df.topic.unique()}
- selection = hierarchical_topics.loc[
- hierarchical_topics.Distance <= max_distance, :
- ]
+ selection = hierarchical_topics.loc[hierarchical_topics.Distance <= max_distance, :]
selection.Parent_ID = selection.Parent_ID.astype(int)
selection = selection.sort_values("Parent_ID")
@@ -219,18 +210,12 @@ def visualize_hierarchical_documents(
if topic_model.get_topic(topic):
if isinstance(custom_labels, str):
trace_name = f"{topic}_" + "_".join(
- list(zip(*topic_model.topic_aspects_[custom_labels][topic]))[0][
- :3
- ]
+ list(zip(*topic_model.topic_aspects_[custom_labels][topic]))[0][:3]
)
elif topic_model.custom_labels_ is not None and custom_labels:
- trace_name = topic_model.custom_labels_[
- topic + topic_model._outliers
- ]
+ trace_name = topic_model.custom_labels_[topic + topic_model._outliers]
else:
- trace_name = f"{topic}_" + "_".join(
- [word[:20] for word, _ in topic_model.get_topic(topic)][:3]
- )
+ trace_name = f"{topic}_" + "_".join([word[:20] for word, _ in topic_model.get_topic(topic)][:3])
topic_names[topic] = {
"trace_name": trace_name[:40],
"plot_text": trace_name[:40],
@@ -239,9 +224,7 @@ def visualize_hierarchical_documents(
else:
trace_name = (
f"{topic}_"
- + hierarchical_topics.loc[
- hierarchical_topics.Parent_ID == str(topic), "Parent_Name"
- ].values[0]
+ + hierarchical_topics.loc[hierarchical_topics.Parent_ID == str(topic), "Parent_Name"].values[0]
)
plot_text = "_".join([name[:20] for name in trace_name.split("_")[:3]])
topic_names[topic] = {
@@ -264,9 +247,7 @@ def visualize_hierarchical_documents(
mode="markers+text",
name="other",
hoverinfo="text",
- hovertext=df.loc[(df[f"level_{level+1}"] == -1), "doc"]
- if not hide_document_hover
- else None,
+ hovertext=df.loc[(df[f"level_{level+1}"] == -1), "doc"] if not hide_document_hover else None,
showlegend=False,
marker=dict(color="#CFD8DC", size=5, opacity=0.5),
)
@@ -275,20 +256,14 @@ def visualize_hierarchical_documents(
# Selected topics
if topics:
selection = df.loc[(df.topic.isin(topics)), :]
- unique_topics = sorted(
- [int(topic) for topic in selection[f"level_{level+1}"].unique()]
- )
+ unique_topics = sorted([int(topic) for topic in selection[f"level_{level+1}"].unique()])
else:
- unique_topics = sorted(
- [int(topic) for topic in df[f"level_{level+1}"].unique()]
- )
+ unique_topics = sorted([int(topic) for topic in df[f"level_{level+1}"].unique()])
for topic in unique_topics:
if topic != -1:
if topics:
- selection = df.loc[
- (df[f"level_{level+1}"] == topic) & (df.topic.isin(topics)), :
- ]
+ selection = df.loc[(df[f"level_{level+1}"] == topic) & (df.topic.isin(topics)), :]
else:
selection = df.loc[df[f"level_{level+1}"] == topic, :]
@@ -297,9 +272,7 @@ def visualize_hierarchical_documents(
selection["text"] = ""
selection.loc[len(selection) - 1, "x"] = selection.x.mean()
selection.loc[len(selection) - 1, "y"] = selection.y.mean()
- selection.loc[len(selection) - 1, "text"] = topic_names[int(topic)][
- "plot_text"
- ]
+ selection.loc[len(selection) - 1, "text"] = topic_names[int(topic)]["plot_text"]
traces.append(
go.Scattergl(
@@ -373,12 +346,8 @@ def visualize_hierarchical_documents(
y1=sum(y_range) / 2,
line=dict(color="#9E9E9E", width=2),
)
- fig.add_annotation(
- x=x_range[0], y=sum(y_range) / 2, text="D1", showarrow=False, yshift=10
- )
- fig.add_annotation(
- y=y_range[1], x=sum(x_range) / 2, text="D2", showarrow=False, xshift=10
- )
+ fig.add_annotation(x=x_range[0], y=sum(y_range) / 2, text="D1", showarrow=False, yshift=10)
+ fig.add_annotation(y=y_range[1], x=sum(x_range) / 2, text="D2", showarrow=False, xshift=10)
# Stylize layout
fig.update_layout(
diff --git a/bertopic/plotting/_hierarchy.py b/bertopic/plotting/_hierarchy.py
index 6faa1bc4..2e6e6b23 100644
--- a/bertopic/plotting/_hierarchy.py
+++ b/bertopic/plotting/_hierarchy.py
@@ -123,9 +123,9 @@ def visualize_hierarchy(
indices = np.array([all_topics.index(topic) for topic in topics])
# Select topic embeddings
- embeddings = select_topic_representation(
- topic_model.c_tf_idf_, topic_model.topic_embeddings_, use_ctfidf
- )[0][indices]
+ embeddings = select_topic_representation(topic_model.c_tf_idf_, topic_model.topic_embeddings_, use_ctfidf)[0][
+ indices
+ ]
# Annotations
if hierarchical_topics is not None and len(topics) == len(freq_df.Topic.to_list()):
@@ -142,9 +142,7 @@ def visualize_hierarchy(
annotations = None
# wrap distance function to validate input and return a condensed distance matrix
- distance_function_viz = lambda x: validate_distance_matrix(
- distance_function(x), embeddings.shape[0]
- )
+ distance_function_viz = lambda x: validate_distance_matrix(distance_function(x), embeddings.shape[0])
# Create dendrogram
fig = ff.create_dendrogram(
embeddings,
@@ -159,31 +157,20 @@ def visualize_hierarchy(
axis = "yaxis" if orientation == "left" else "xaxis"
if isinstance(custom_labels, str):
new_labels = [
- [[str(x), None]] + topic_model.topic_aspects_[custom_labels][x]
- for x in fig.layout[axis]["ticktext"]
- ]
- new_labels = [
- "_".join([label[0] for label in labels[:4]]) for labels in new_labels
- ]
- new_labels = [
- label if len(label) < 30 else label[:27] + "..." for label in new_labels
+ [[str(x), None]] + topic_model.topic_aspects_[custom_labels][x] for x in fig.layout[axis]["ticktext"]
]
+ new_labels = ["_".join([label[0] for label in labels[:4]]) for labels in new_labels]
+ new_labels = [label if len(label) < 30 else label[:27] + "..." for label in new_labels]
elif topic_model.custom_labels_ is not None and custom_labels:
new_labels = [
- topic_model.custom_labels_[topics[int(x)] + topic_model._outliers]
- for x in fig.layout[axis]["ticktext"]
+ topic_model.custom_labels_[topics[int(x)] + topic_model._outliers] for x in fig.layout[axis]["ticktext"]
]
else:
new_labels = [
- [[str(topics[int(x)]), None]] + topic_model.get_topic(topics[int(x)])
- for x in fig.layout[axis]["ticktext"]
- ]
- new_labels = [
- "_".join([label[0] for label in labels[:4]]) for labels in new_labels
- ]
- new_labels = [
- label if len(label) < 30 else label[:27] + "..." for label in new_labels
+ [[str(topics[int(x)]), None]] + topic_model.get_topic(topics[int(x)]) for x in fig.layout[axis]["ticktext"]
]
+ new_labels = ["_".join([label[0] for label in labels[:4]]) for labels in new_labels]
+ new_labels = [label if len(label) < 30 else label[:27] + "..." for label in new_labels]
# Stylize layout
fig.update_layout(
@@ -222,21 +209,9 @@ def visualize_hierarchy(
if hierarchical_topics is not None:
for index in [0, 3]:
axis = "x" if orientation == "left" else "y"
- xs = [
- data["x"][index]
- for data in fig.data
- if (data["text"] and data[axis][index] > 0)
- ]
- ys = [
- data["y"][index]
- for data in fig.data
- if (data["text"] and data[axis][index] > 0)
- ]
- hovertext = [
- data["text"][index]
- for data in fig.data
- if (data["text"] and data[axis][index] > 0)
- ]
+ xs = [data["x"][index] for data in fig.data if (data["text"] and data[axis][index] > 0)]
+ ys = [data["y"][index] for data in fig.data if (data["text"] and data[axis][index] > 0)]
+ hovertext = [data["text"][index] for data in fig.data if (data["text"] and data[axis][index] > 0)]
fig.add_trace(
go.Scatter(
@@ -322,18 +297,12 @@ def _get_annotations(
if len(fst_topic) == 1:
if isinstance(custom_labels, str):
fst_name = f"{fst_topic[0]}_" + "_".join(
- list(zip(*topic_model.topic_aspects_[custom_labels][fst_topic[0]]))[
- 0
- ][:3]
+ list(zip(*topic_model.topic_aspects_[custom_labels][fst_topic[0]]))[0][:3]
)
elif topic_model.custom_labels_ is not None and custom_labels:
- fst_name = topic_model.custom_labels_[
- fst_topic[0] + topic_model._outliers
- ]
+ fst_name = topic_model.custom_labels_[fst_topic[0] + topic_model._outliers]
else:
- fst_name = "_".join(
- [word for word, _ in topic_model.get_topic(fst_topic[0])][:5]
- )
+ fst_name = "_".join([word for word, _ in topic_model.get_topic(fst_topic[0])][:5])
else:
for key, value in parent_topic.items():
if set(value) == set(fst_topic):
@@ -342,18 +311,12 @@ def _get_annotations(
if len(scnd_topic) == 1:
if isinstance(custom_labels, str):
scnd_name = f"{scnd_topic[0]}_" + "_".join(
- list(
- zip(*topic_model.topic_aspects_[custom_labels][scnd_topic[0]])
- )[0][:3]
+ list(zip(*topic_model.topic_aspects_[custom_labels][scnd_topic[0]]))[0][:3]
)
elif topic_model.custom_labels_ is not None and custom_labels:
- scnd_name = topic_model.custom_labels_[
- scnd_topic[0] + topic_model._outliers
- ]
+ scnd_name = topic_model.custom_labels_[scnd_topic[0] + topic_model._outliers]
else:
- scnd_name = "_".join(
- [word for word, _ in topic_model.get_topic(scnd_topic[0])][:5]
- )
+ scnd_name = "_".join([word for word, _ in topic_model.get_topic(scnd_topic[0])][:5])
else:
for key, value in parent_topic.items():
if set(value) == set(scnd_topic):
diff --git a/bertopic/plotting/_term_rank.py b/bertopic/plotting/_term_rank.py
index 5dc98a23..4043692b 100644
--- a/bertopic/plotting/_term_rank.py
+++ b/bertopic/plotting/_term_rank.py
@@ -69,9 +69,7 @@ def visualize_term_rank(
topic_words = [topic_model.get_topic(topic) for topic in topic_ids]
values = np.array([[value[1] for value in values] for values in topic_words])
- indices = np.array(
- [[value + 1 for value in range(len(values))] for values in topic_words]
- )
+ indices = np.array([[value + 1 for value in range(len(values))] for values in topic_words])
# Create figure
lines = []
@@ -79,15 +77,11 @@ def visualize_term_rank(
if not any(y > 1.5):
# labels
if isinstance(custom_labels, str):
- label = f"{topic}_" + "_".join(
- list(zip(*topic_model.topic_aspects_[custom_labels][topic]))[0][:3]
- )
+ label = f"{topic}_" + "_".join(list(zip(*topic_model.topic_aspects_[custom_labels][topic]))[0][:3])
elif topic_model.custom_labels_ is not None and custom_labels:
label = topic_model.custom_labels_[topic + topic_model._outliers]
else:
- label = f"Topic {topic}:" + "_".join(
- [word[0] for word in topic_model.get_topic(topic)]
- )
+ label = f"Topic {topic}:" + "_".join([word[0] for word in topic_model.get_topic(topic)])
label = label[:50]
# line parameters
diff --git a/bertopic/plotting/_topics.py b/bertopic/plotting/_topics.py
index 8a14a34d..2e477d05 100644
--- a/bertopic/plotting/_topics.py
+++ b/bertopic/plotting/_topics.py
@@ -65,22 +65,13 @@ def visualize_topics(
topic_list = sorted(topics)
frequencies = [topic_model.topic_sizes_[topic] for topic in topic_list]
if isinstance(custom_labels, str):
- words = [
- [[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic]
- for topic in topic_list
- ]
+ words = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in topic_list]
words = ["_".join([label[0] for label in labels[:4]]) for labels in words]
words = [label if len(label) < 30 else label[:27] + "..." for label in words]
elif custom_labels and topic_model.custom_labels_ is not None:
- words = [
- topic_model.custom_labels_[topic + topic_model._outliers]
- for topic in topic_list
- ]
+ words = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in topic_list]
else:
- words = [
- " | ".join([word[0] for word in topic_model.get_topic(topic)[:5]])
- for topic in topic_list
- ]
+ words = [" | ".join([word[0] for word in topic_model.get_topic(topic)[:5]]) for topic in topic_list]
# Embed c-TF-IDF into 2D
all_topics = sorted(list(topic_model.get_topics().keys()))
@@ -96,13 +87,9 @@ def visualize_topics(
if c_tfidf_used:
embeddings = MinMaxScaler().fit_transform(embeddings)
- embeddings = UMAP(
- n_neighbors=2, n_components=2, metric="hellinger", random_state=42
- ).fit_transform(embeddings)
+ embeddings = UMAP(n_neighbors=2, n_components=2, metric="hellinger", random_state=42).fit_transform(embeddings)
else:
- embeddings = UMAP(
- n_neighbors=2, n_components=2, metric="cosine", random_state=42
- ).fit_transform(embeddings)
+ embeddings = UMAP(n_neighbors=2, n_components=2, metric="cosine", random_state=42).fit_transform(embeddings)
# Visualize with plotly
df = pd.DataFrame(
@@ -117,18 +104,14 @@ def visualize_topics(
return _plotly_topic_visualization(df, topic_list, title, width, height)
-def _plotly_topic_visualization(
- df: pd.DataFrame, topic_list: List[str], title: str, width: int, height: int
-):
+def _plotly_topic_visualization(df: pd.DataFrame, topic_list: List[str], title: str, width: int, height: int):
"""Create plotly-based visualization of topics with a slider for topic selection."""
def get_color(topic_selected):
if topic_selected == -1:
marker_color = ["#B0BEC5" for _ in topic_list]
else:
- marker_color = [
- "red" if topic == topic_selected else "#B0BEC5" for topic in topic_list
- ]
+ marker_color = ["red" if topic == topic_selected else "#B0BEC5" for topic in topic_list]
return [{"marker.color": [marker_color]}]
# Prepare figure range
@@ -152,9 +135,7 @@ def get_color(topic_selected):
labels={"x": "", "y": ""},
hover_data={"Topic": True, "Words": True, "Size": True, "x": False, "y": False},
)
- fig.update_traces(
- marker=dict(color="#B0BEC5", line=dict(width=2, color="DarkSlateGrey"))
- )
+ fig.update_traces(marker=dict(color="#B0BEC5", line=dict(width=2, color="DarkSlateGrey")))
# Update hover order
fig.update_traces(
@@ -168,10 +149,7 @@ def get_color(topic_selected):
)
# Create a slider for topic selection
- steps = [
- dict(label=f"Topic {topic}", method="update", args=get_color(topic))
- for topic in topic_list
- ]
+ steps = [dict(label=f"Topic {topic}", method="update", args=get_color(topic)) for topic in topic_list]
sliders = [dict(active=0, pad={"t": 50}, steps=steps)]
# Stylize layout
@@ -213,12 +191,8 @@ def get_color(topic_selected):
y1=sum(y_range) / 2,
line=dict(color="#9E9E9E", width=2),
)
- fig.add_annotation(
- x=x_range[0], y=sum(y_range) / 2, text="D1", showarrow=False, yshift=10
- )
- fig.add_annotation(
- y=y_range[1], x=sum(x_range) / 2, text="D2", showarrow=False, xshift=10
- )
+ fig.add_annotation(x=x_range[0], y=sum(y_range) / 2, text="D1", showarrow=False, yshift=10)
+ fig.add_annotation(y=y_range[1], x=sum(x_range) / 2, text="D2", showarrow=False, xshift=10)
fig.data = fig.data[::-1]
return fig
diff --git a/bertopic/plotting/_topics_over_time.py b/bertopic/plotting/_topics_over_time.py
index 625a8cce..b8254421 100644
--- a/bertopic/plotting/_topics_over_time.py
+++ b/bertopic/plotting/_topics_over_time.py
@@ -73,34 +73,20 @@ def visualize_topics_over_time(
# Prepare data
if isinstance(custom_labels, str):
- topic_names = [
- [[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic]
- for topic in topics
- ]
- topic_names = [
- "_".join([label[0] for label in labels[:4]]) for labels in topic_names
- ]
- topic_names = [
- label if len(label) < 30 else label[:27] + "..." for label in topic_names
- ]
- topic_names = {
- key: topic_names[index]
- for index, key in enumerate(topic_model.topic_labels_.keys())
- }
+ topic_names = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in topics]
+ topic_names = ["_".join([label[0] for label in labels[:4]]) for labels in topic_names]
+ topic_names = [label if len(label) < 30 else label[:27] + "..." for label in topic_names]
+ topic_names = {key: topic_names[index] for index, key in enumerate(topic_model.topic_labels_.keys())}
elif topic_model.custom_labels_ is not None and custom_labels:
topic_names = {
- key: topic_model.custom_labels_[key + topic_model._outliers]
- for key, _ in topic_model.topic_labels_.items()
+ key: topic_model.custom_labels_[key + topic_model._outliers] for key, _ in topic_model.topic_labels_.items()
}
else:
topic_names = {
- key: value[:40] + "..." if len(value) > 40 else value
- for key, value in topic_model.topic_labels_.items()
+ key: value[:40] + "..." if len(value) > 40 else value for key, value in topic_model.topic_labels_.items()
}
topics_over_time["Name"] = topics_over_time.Topic.map(topic_names)
- data = topics_over_time.loc[
- topics_over_time.Topic.isin(selected_topics), :
- ].sort_values(["Topic", "Timestamp"])
+ data = topics_over_time.loc[topics_over_time.Topic.isin(selected_topics), :].sort_values(["Topic", "Timestamp"])
# Add traces
fig = go.Figure()
diff --git a/bertopic/plotting/_topics_per_class.py b/bertopic/plotting/_topics_per_class.py
index 5bb8cef4..cdf02ebb 100644
--- a/bertopic/plotting/_topics_per_class.py
+++ b/bertopic/plotting/_topics_per_class.py
@@ -73,29 +73,17 @@ def visualize_topics_per_class(
# Prepare data
if isinstance(custom_labels, str):
- topic_names = [
- [[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic]
- for topic in topics
- ]
- topic_names = [
- "_".join([label[0] for label in labels[:4]]) for labels in topic_names
- ]
- topic_names = [
- label if len(label) < 30 else label[:27] + "..." for label in topic_names
- ]
- topic_names = {
- key: topic_names[index]
- for index, key in enumerate(topic_model.topic_labels_.keys())
- }
+ topic_names = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in topics]
+ topic_names = ["_".join([label[0] for label in labels[:4]]) for labels in topic_names]
+ topic_names = [label if len(label) < 30 else label[:27] + "..." for label in topic_names]
+ topic_names = {key: topic_names[index] for index, key in enumerate(topic_model.topic_labels_.keys())}
elif topic_model.custom_labels_ is not None and custom_labels:
topic_names = {
- key: topic_model.custom_labels_[key + topic_model._outliers]
- for key, _ in topic_model.topic_labels_.items()
+ key: topic_model.custom_labels_[key + topic_model._outliers] for key, _ in topic_model.topic_labels_.items()
}
else:
topic_names = {
- key: value[:40] + "..." if len(value) > 40 else value
- for key, value in topic_model.topic_labels_.items()
+ key: value[:40] + "..." if len(value) > 40 else value for key, value in topic_model.topic_labels_.items()
}
topics_per_class["Name"] = topics_per_class.Topic.map(topic_names)
data = topics_per_class.loc[topics_per_class.Topic.isin(selected_topics), :]
diff --git a/bertopic/representation/__init__.py b/bertopic/representation/__init__.py
index 3c18305f..da0c6365 100644
--- a/bertopic/representation/__init__.py
+++ b/bertopic/representation/__init__.py
@@ -24,9 +24,7 @@
from bertopic.representation._zeroshot import ZeroShotClassification
except ModuleNotFoundError:
msg = "`pip install bertopic` without `--no-deps` \n\n"
- ZeroShotClassification = NotInstalled(
- "ZeroShotClassification", "transformers", custom_msg=msg
- )
+ ZeroShotClassification = NotInstalled("ZeroShotClassification", "transformers", custom_msg=msg)
# OpenAI Generator
try:
diff --git a/bertopic/representation/_cohere.py b/bertopic/representation/_cohere.py
index 64511daf..8ca31c8f 100644
--- a/bertopic/representation/_cohere.py
+++ b/bertopic/representation/_cohere.py
@@ -151,13 +151,8 @@ def extract_topics(
# Generate using Cohere's Language Model
updated_topics = {}
- for topic, docs in tqdm(
- repr_docs_mappings.items(), disable=not topic_model.verbose
- ):
- truncated_docs = [
- truncate_document(topic_model, self.doc_length, self.tokenizer, doc)
- for doc in docs
- ]
+ for topic, docs in tqdm(repr_docs_mappings.items(), disable=not topic_model.verbose):
+ truncated_docs = [truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs]
prompt = self._create_prompt(truncated_docs, topic, topics)
self.prompts_.append(prompt)
diff --git a/bertopic/representation/_keybert.py b/bertopic/representation/_keybert.py
index 7d9d19e2..f91c01cc 100644
--- a/bertopic/representation/_keybert.py
+++ b/bertopic/representation/_keybert.py
@@ -84,10 +84,8 @@ def extract_topics(
updated_topics: Updated topic representations
"""
# We extract the top n representative documents per class
- _, representative_docs, repr_doc_indices, _ = (
- topic_model._extract_representative_docs(
- c_tf_idf, documents, topics, self.nr_samples, self.nr_repr_docs
- )
+ _, representative_docs, repr_doc_indices, _ = topic_model._extract_representative_docs(
+ c_tf_idf, documents, topics, self.nr_samples, self.nr_repr_docs
)
# We extract the top n words per class
@@ -95,9 +93,7 @@ def extract_topics(
# We calculate the similarity between word and document embeddings and create
# topic embeddings from the representative document embeddings
- sim_matrix, words = self._extract_embeddings(
- topic_model, topics, representative_docs, repr_doc_indices
- )
+ sim_matrix, words = self._extract_embeddings(topic_model, topics, representative_docs, repr_doc_indices)
# Find the best matching words based on the similarity matrix for each topic
updated_topics = self._extract_top_words(words, topics, sim_matrix)
@@ -139,17 +135,12 @@ def _extract_candidate_words(
# Get top 30 words per topic based on c-TF-IDF score
topics = {
label: [
- (words[word_index], score)
- if word_index is not None and score > 0
- else ("", 0.00001)
+ (words[word_index], score) if word_index is not None and score > 0 else ("", 0.00001)
for word_index, score in zip(indices[index][::-1], scores[index][::-1])
]
for index, label in enumerate(labels)
}
- topics = {
- label: list(zip(*values[: self.nr_candidate_words]))[0]
- for label, values in topics.items()
- }
+ topics = {label: list(zip(*values[: self.nr_candidate_words]))[0] for label, values in topics.items()}
return topics
@@ -177,18 +168,12 @@ def _extract_embeddings(
vocab: The complete vocabulary of input documents
"""
# Calculate representative docs embeddings and create topic embeddings
- repr_embeddings = topic_model._extract_embeddings(
- representative_docs, method="document", verbose=False
- )
- topic_embeddings = [
- np.mean(repr_embeddings[i[0] : i[-1] + 1], axis=0) for i in repr_doc_indices
- ]
+ repr_embeddings = topic_model._extract_embeddings(representative_docs, method="document", verbose=False)
+ topic_embeddings = [np.mean(repr_embeddings[i[0] : i[-1] + 1], axis=0) for i in repr_doc_indices]
# Calculate word embeddings and extract best matching with updated topic_embeddings
vocab = list(set([word for words in topics.values() for word in words]))
- word_embeddings = topic_model._extract_embeddings(
- vocab, method="document", verbose=False
- )
+ word_embeddings = topic_model._extract_embeddings(vocab, method="document", verbose=False)
sim = cosine_similarity(topic_embeddings, word_embeddings)
return sim, vocab
@@ -216,14 +201,9 @@ def _extract_top_words(
for i, topic in enumerate(labels):
indices = [vocab.index(word) for word in topics[topic]]
values = sim[:, indices][i]
- word_indices = [
- indices[index] for index in np.argsort(values)[-self.top_n_words :]
- ]
+ word_indices = [indices[index] for index in np.argsort(values)[-self.top_n_words :]]
updated_topics[topic] = [
- (vocab[index], val)
- for val, index in zip(
- np.sort(values)[-self.top_n_words :], word_indices
- )
+ (vocab[index], val) for val, index in zip(np.sort(values)[-self.top_n_words :], word_indices)
][::-1]
return updated_topics
diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py
index ad92aef1..df5c4839 100644
--- a/bertopic/representation/_langchain.py
+++ b/bertopic/representation/_langchain.py
@@ -180,11 +180,7 @@ def extract_topics(
# Generate label using langchain's batch functionality
chain_docs: List[List[Document]] = [
[
- Document(
- page_content=truncate_document(
- topic_model, self.doc_length, self.tokenizer, doc
- )
- )
+ Document(page_content=truncate_document(topic_model, self.doc_length, self.tokenizer, doc))
for doc in docs
]
for docs in repr_docs_mappings.values()
@@ -199,16 +195,10 @@ def extract_topics(
prompt = self.prompt.replace("[KEYWORDS]", ", ".join(keywords))
prompts.append(prompt)
- inputs = [
- {"input_documents": docs, "question": prompt}
- for docs, prompt in zip(chain_docs, prompts)
- ]
+ inputs = [{"input_documents": docs, "question": prompt} for docs, prompt in zip(chain_docs, prompts)]
else:
- inputs = [
- {"input_documents": docs, "question": self.prompt}
- for docs in chain_docs
- ]
+ inputs = [{"input_documents": docs, "question": self.prompt} for docs in chain_docs]
# `self.chain` must return a dict with an `output_text` key
# same output key as the `StuffDocumentsChain` returned by `load_qa_chain`
@@ -216,8 +206,7 @@ def extract_topics(
labels = [output["output_text"].strip() for output in outputs]
updated_topics = {
- topic: [(label, 1)] + [("", 0) for _ in range(9)]
- for topic, label in zip(repr_docs_mappings.keys(), labels)
+ topic: [(label, 1)] + [("", 0) for _ in range(9)] for topic, label in zip(repr_docs_mappings.keys(), labels)
}
return updated_topics
diff --git a/bertopic/representation/_llamacpp.py b/bertopic/representation/_llamacpp.py
index fa573463..83b18952 100644
--- a/bertopic/representation/_llamacpp.py
+++ b/bertopic/representation/_llamacpp.py
@@ -143,28 +143,18 @@ def extract_topics(
)
updated_topics = {}
- for topic, docs in tqdm(
- repr_docs_mappings.items(), disable=not topic_model.verbose
- ):
+ for topic, docs in tqdm(repr_docs_mappings.items(), disable=not topic_model.verbose):
# Prepare prompt
- truncated_docs = [
- truncate_document(topic_model, self.doc_length, self.tokenizer, doc)
- for doc in docs
- ]
+ truncated_docs = [truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs]
prompt = self._create_prompt(truncated_docs, topic, topics)
self.prompts_.append(prompt)
# Extract result from generator and use that as label
topic_description = self.model(prompt, **self.pipeline_kwargs)["choices"]
- topic_description = [
- (description["text"].replace(prompt, ""), 1)
- for description in topic_description
- ]
+ topic_description = [(description["text"].replace(prompt, ""), 1) for description in topic_description]
if len(topic_description) < 10:
- topic_description += [
- ("", 0) for _ in range(10 - len(topic_description))
- ]
+ topic_description += [("", 0) for _ in range(10 - len(topic_description))]
updated_topics[topic] = topic_description
diff --git a/bertopic/representation/_mmr.py b/bertopic/representation/_mmr.py
index 07a8dd13..b3b1b232 100644
--- a/bertopic/representation/_mmr.py
+++ b/bertopic/representation/_mmr.py
@@ -68,12 +68,10 @@ def extract_topics(
updated_topics = {}
for topic, topic_words in topics.items():
words = [word[0] for word in topic_words]
- word_embeddings = topic_model._extract_embeddings(
- words, method="word", verbose=False
+ word_embeddings = topic_model._extract_embeddings(words, method="word", verbose=False)
+ topic_embedding = topic_model._extract_embeddings(" ".join(words), method="word", verbose=False).reshape(
+ 1, -1
)
- topic_embedding = topic_model._extract_embeddings(
- " ".join(words), method="word", verbose=False
- ).reshape(1, -1)
topic_words = mmr(
topic_embedding,
word_embeddings,
@@ -81,9 +79,7 @@ def extract_topics(
self.diversity,
self.top_n_words,
)
- updated_topics[topic] = [
- (word, value) for word, value in topics[topic] if word in topic_words
- ]
+ updated_topics[topic] = [(word, value) for word, value in topics[topic] if word in topic_words]
return updated_topics
@@ -119,14 +115,10 @@ def mmr(
# Extract similarities within candidates and
# between candidates and selected keywords/phrases
candidate_similarities = word_doc_similarity[candidates_idx, :]
- target_similarities = np.max(
- word_similarity[candidates_idx][:, keywords_idx], axis=1
- )
+ target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)
# Calculate MMR
- mmr = (
- 1 - diversity
- ) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
+ mmr = (1 - diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
mmr_idx = candidates_idx[np.argmax(mmr)]
# Update keywords & candidates
diff --git a/bertopic/representation/_openai.py b/bertopic/representation/_openai.py
index 35bdf1da..8fd25a1b 100644
--- a/bertopic/representation/_openai.py
+++ b/bertopic/representation/_openai.py
@@ -205,13 +205,8 @@ def extract_topics(
# Generate using OpenAI's Language Model
updated_topics = {}
- for topic, docs in tqdm(
- repr_docs_mappings.items(), disable=not topic_model.verbose
- ):
- truncated_docs = [
- truncate_document(topic_model, self.doc_length, self.tokenizer, doc)
- for doc in docs
- ]
+ for topic, docs in tqdm(repr_docs_mappings.items(), disable=not topic_model.verbose):
+ truncated_docs = [truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs]
prompt = self._create_prompt(truncated_docs, topic, topics)
self.prompts_.append(prompt)
@@ -237,11 +232,7 @@ def extract_topics(
# Check whether content was actually generated
# Addresses #1570 for potential issues with OpenAI's content filter
if hasattr(response.choices[0].message, "content"):
- label = (
- response.choices[0]
- .message.content.strip()
- .replace("topic: ", "")
- )
+ label = response.choices[0].message.content.strip().replace("topic: ", "")
else:
label = "No label returned"
else:
@@ -253,9 +244,7 @@ def extract_topics(
**self.generator_kwargs,
)
else:
- response = self.client.completions.create(
- model=self.model, prompt=prompt, **self.generator_kwargs
- )
+ response = self.client.completions.create(model=self.model, prompt=prompt, **self.generator_kwargs)
label = response.choices[0].text.strip()
updated_topics[topic] = [(label, 1)]
diff --git a/bertopic/representation/_pos.py b/bertopic/representation/_pos.py
index 08139b53..3ac2815f 100644
--- a/bertopic/representation/_pos.py
+++ b/bertopic/representation/_pos.py
@@ -120,9 +120,7 @@ def extract_topics(
candidate_documents = []
for keyword in keywords:
selection = documents.loc[documents.Topic == topic, :]
- selection = selection.loc[
- selection.Document.str.contains(keyword), "Document"
- ]
+ selection = selection.loc[selection.Document.str.contains(keyword), "Document"]
if len(selection) > 0:
for document in selection[:2]:
candidate_documents.append(document)
@@ -150,27 +148,14 @@ def extract_topics(
for topic, candidate_keywords in candidate_topics.items():
word_indices = np.sort(
- [
- words_lookup.get(keyword)
- for keyword in candidate_keywords
- if keyword in words_lookup
- ]
+ [words_lookup.get(keyword) for keyword in candidate_keywords if keyword in words_lookup]
)
vals = topic_model.c_tf_idf_[:, word_indices][topic + topic_model._outliers]
- indices = np.argsort(np.array(vals.todense().reshape(1, -1))[0])[
- -self.top_n_words :
- ][::-1]
- vals = np.sort(np.array(vals.todense().reshape(1, -1))[0])[
- -self.top_n_words :
- ][::-1]
- topic_words = [
- (words[word_indices[index]], val) for index, val in zip(indices, vals)
- ]
+ indices = np.argsort(np.array(vals.todense().reshape(1, -1))[0])[-self.top_n_words :][::-1]
+ vals = np.sort(np.array(vals.todense().reshape(1, -1))[0])[-self.top_n_words :][::-1]
+ topic_words = [(words[word_indices[index]], val) for index, val in zip(indices, vals)]
updated_topics[topic] = topic_words
if len(updated_topics[topic]) < self.top_n_words:
- updated_topics[topic] += [
- ("", 0)
- for _ in range(self.top_n_words - len(updated_topics[topic]))
- ]
+ updated_topics[topic] += [("", 0) for _ in range(self.top_n_words - len(updated_topics[topic]))]
return updated_topics
diff --git a/bertopic/representation/_textgeneration.py b/bertopic/representation/_textgeneration.py
index 3bc3853a..b028e575 100644
--- a/bertopic/representation/_textgeneration.py
+++ b/bertopic/representation/_textgeneration.py
@@ -142,15 +142,10 @@ def extract_topics(
repr_docs_mappings = {topic: None for topic in topics.keys()}
updated_topics = {}
- for topic, docs in tqdm(
- repr_docs_mappings.items(), disable=not topic_model.verbose
- ):
+ for topic, docs in tqdm(repr_docs_mappings.items(), disable=not topic_model.verbose):
# Prepare prompt
truncated_docs = (
- [
- truncate_document(topic_model, self.doc_length, self.tokenizer, doc)
- for doc in docs
- ]
+ [truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs]
if docs is not None
else docs
)
@@ -160,14 +155,11 @@ def extract_topics(
# Extract result from generator and use that as label
topic_description = self.model(prompt, **self.pipeline_kwargs)
topic_description = [
- (description["generated_text"].replace(prompt, ""), 1)
- for description in topic_description
+ (description["generated_text"].replace(prompt, ""), 1) for description in topic_description
]
if len(topic_description) < 10:
- topic_description += [
- ("", 0) for _ in range(10 - len(topic_description))
- ]
+ topic_description += [("", 0) for _ in range(10 - len(topic_description))]
updated_topics[topic] = topic_description
diff --git a/bertopic/representation/_utils.py b/bertopic/representation/_utils.py
index 00f157a5..2a99fd1f 100644
--- a/bertopic/representation/_utils.py
+++ b/bertopic/representation/_utils.py
@@ -85,9 +85,7 @@ def wrapper(*args, **kwargs):
# Check if max retries has been reached
if num_retries > max_retries:
- raise Exception(
- f"Maximum number of retries ({max_retries}) exceeded."
- )
+ raise Exception(f"Maximum number of retries ({max_retries}) exceeded.")
# Increment the delay
delay *= exponential_base * (1 + jitter * random.random())
diff --git a/bertopic/representation/_visual.py b/bertopic/representation/_visual.py
index 897d7c9d..07968596 100644
--- a/bertopic/representation/_visual.py
+++ b/bertopic/representation/_visual.py
@@ -63,9 +63,7 @@ def __init__(
if isinstance(image_to_text_model, Pipeline):
self.image_to_text_model = image_to_text_model
elif isinstance(image_to_text_model, str):
- self.image_to_text_model = pipeline(
- "image-to-text", model=image_to_text_model
- )
+ self.image_to_text_model = pipeline("image-to-text", model=image_to_text_model)
elif image_to_text_model is None:
self.image_to_text_model = None
else:
@@ -109,23 +107,17 @@ def extract_topics(
for topic in tqdm(unique_topics):
            # Get and order representative images
sliced_examplars = repr_docs_ids[topic + topic_model._outliers]
- sliced_examplars = [
- sliced_examplars[i : i + 3] for i in range(0, len(sliced_examplars), 3)
- ]
+ sliced_examplars = [sliced_examplars[i : i + 3] for i in range(0, len(sliced_examplars), 3)]
images_to_combine = [
[
- Image.open(images[index])
- if isinstance(images[index], str)
- else images[index]
+ Image.open(images[index]) if isinstance(images[index], str) else images[index]
for index in sub_indices
]
for sub_indices in sliced_examplars
]
# Concatenate representative images
- representative_image = get_concat_tile_resize(
- images_to_combine, self.image_height, self.image_squares
- )
+ representative_image = get_concat_tile_resize(images_to_combine, self.image_height, self.image_squares)
representative_images[topic] = representative_image
# Make sure to properly close images
@@ -136,9 +128,7 @@ def extract_topics(
return representative_images
- def _convert_image_to_text(
- self, images: List[str], verbose: bool = False
- ) -> List[str]:
+ def _convert_image_to_text(self, images: List[str], verbose: bool = False) -> List[str]:
"""Convert a list of images to captions.
Arguments:
@@ -163,9 +153,7 @@ def _convert_image_to_text(
return documents
- def image_to_text(
- self, documents: pd.DataFrame, embeddings: np.ndarray
- ) -> pd.DataFrame:
+ def image_to_text(self, documents: pd.DataFrame, embeddings: np.ndarray) -> pd.DataFrame:
"""Convert images to text."""
# Create image topic embeddings
topics = documents.Topic.values.tolist()
@@ -193,10 +181,7 @@ def image_to_text(
current_id = 0
for topic, image_ids in tqdm(image_centroids.items()):
selected_images = [
- Image.open(images[index])
- if isinstance(images[index], str)
- else images[index]
- for index in image_ids
+ Image.open(images[index]) if isinstance(images[index], str) else images[index] for index in image_ids
]
text = self._convert_image_to_text(selected_images)
@@ -243,10 +228,7 @@ def get_concat_v_multi_resize(im_list):
"""Code adapted from: https://note.nkmk.me/en/python-pillow-concat-images/."""
min_width = min(im.width for im in im_list)
min_width = max(im.width for im in im_list)
- im_list_resize = [
- im.resize((min_width, int(im.height * min_width / im.width)), resample=0)
- for im in im_list
- ]
+ im_list_resize = [im.resize((min_width, int(im.height * min_width / im.width)), resample=0) for im in im_list]
total_height = sum(im.height for im in im_list_resize)
dst = Image.new("RGB", (min_width, total_height), (255, 255, 255))
pos_y = 0
@@ -264,9 +246,7 @@ def get_concat_tile_resize(im_list_2d, image_height=600, image_squares=False):
if image_squares:
width = int(image_height / 3)
height = int(image_height / 3)
- images = [
- [image.resize((width, height)) for image in images] for images in im_list_2d
- ]
+ images = [[image.resize((width, height)) for image in images] for images in im_list_2d]
# Resize images based on minimum size
else:
@@ -280,9 +260,7 @@ def get_concat_tile_resize(im_list_2d, image_height=600, image_squares=False):
resample=0,
)
elif img.width > img.height:
- images[i][j] = img.resize(
- (min_width, int(img.height * min_width / img.width)), resample=0
- )
+ images[i][j] = img.resize((min_width, int(img.height * min_width / img.width)), resample=0)
else:
images[i][j] = img.resize((min_width, min_width))
diff --git a/bertopic/representation/_zeroshot.py b/bertopic/representation/_zeroshot.py
index 7dff499b..5f67de9a 100644
--- a/bertopic/representation/_zeroshot.py
+++ b/bertopic/representation/_zeroshot.py
@@ -75,12 +75,8 @@ def extract_topics(
updated_topics: Updated topic representations
"""
# Classify topics
- topic_descriptions = [
- " ".join(list(zip(*topics[topic]))[0]) for topic in topics.keys()
- ]
- classifications = self.model(
- topic_descriptions, self.candidate_topics, **self.pipeline_kwargs
- )
+ topic_descriptions = [" ".join(list(zip(*topics[topic]))[0]) for topic in topics.keys()]
+ classifications = self.model(topic_descriptions, self.candidate_topics, **self.pipeline_kwargs)
# Extract labels
updated_topics = {}
@@ -90,25 +86,19 @@ def extract_topics(
# Multi-label assignment
if self.pipeline_kwargs.get("multi_label"):
topic_description = []
- for label, score in zip(
- classification["labels"], classification["scores"]
- ):
+ for label, score in zip(classification["labels"], classification["scores"]):
if score > self.min_prob:
topic_description.append((label, score))
# Single label assignment
elif classification["scores"][0] > self.min_prob:
- topic_description = [
- (classification["labels"][0], classification["scores"][0])
- ]
+ topic_description = [(classification["labels"][0], classification["scores"][0])]
# Make sure that 10 items are returned
if len(topic_description) == 0:
topic_description = topics[topic]
elif len(topic_description) < 10:
- topic_description += [
- ("", 0) for _ in range(10 - len(topic_description))
- ]
+ topic_description += [("", 0) for _ in range(10 - len(topic_description))]
updated_topics[topic] = topic_description
return updated_topics
diff --git a/bertopic/vectorizers/_online_cv.py b/bertopic/vectorizers/_online_cv.py
index fedb363c..27387fa2 100644
--- a/bertopic/vectorizers/_online_cv.py
+++ b/bertopic/vectorizers/_online_cv.py
@@ -121,15 +121,11 @@ def update_bow(self, raw_documents: List[str]) -> csr_matrix:
X = self.transform(raw_documents)
# Add empty columns if new words are found
- columns = csr_matrix(
- (self.X_.shape[0], X.shape[1] - self.X_.shape[1]), dtype=int
- )
+ columns = csr_matrix((self.X_.shape[0], X.shape[1] - self.X_.shape[1]), dtype=int)
self.X_ = sparse.hstack([self.X_, columns])
# Add empty rows if new topics are found
- rows = csr_matrix(
- (X.shape[0] - self.X_.shape[0], self.X_.shape[1]), dtype=int
- )
+ rows = csr_matrix((X.shape[0] - self.X_.shape[0], self.X_.shape[1]), dtype=int)
self.X_ = sparse.vstack([self.X_, rows])
# Decay of BoW matrix
diff --git a/pyproject.toml b/pyproject.toml
index d0c1abfe..2dce9bc3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -98,7 +98,7 @@ include = ["bertopic*"]
exclude = ["tests"]
[tool.ruff]
-target-version = "py38"
+line-length = 120
[tool.ruff.lint]
select = [
diff --git a/tests/conftest.py b/tests/conftest.py
index 95bcf738..3d8d49db 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -27,17 +27,15 @@ def document_embeddings(documents, embedding_model):
@pytest.fixture(scope="session")
def reduced_embeddings(document_embeddings):
- reduced_embeddings = UMAP(
- n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine"
- ).fit_transform(document_embeddings)
+ reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine").fit_transform(
+ document_embeddings
+ )
return reduced_embeddings
@pytest.fixture(scope="session")
def documents():
- newsgroup_docs = fetch_20newsgroups(
- subset="all", remove=("headers", "footers", "quotes")
- )["data"][:1000]
+ newsgroup_docs = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"))["data"][:1000]
return newsgroup_docs
@@ -74,9 +72,7 @@ def zeroshot_topic_model(documents, document_embeddings, embedding_model):
@pytest.fixture(scope="session")
def custom_topic_model(documents, document_embeddings, embedding_model):
- umap_model = UMAP(
- n_neighbors=15, n_components=6, min_dist=0.0, metric="cosine", random_state=42
- )
+ umap_model = UMAP(n_neighbors=15, n_components=6, min_dist=0.0, metric="cosine", random_state=42)
hdbscan_model = HDBSCAN(
min_cluster_size=3,
metric="euclidean",
@@ -94,9 +90,7 @@ def custom_topic_model(documents, document_embeddings, embedding_model):
@pytest.fixture(scope="session")
def representation_topic_model(documents, document_embeddings, embedding_model):
- umap_model = UMAP(
- n_neighbors=15, n_components=6, min_dist=0.0, metric="cosine", random_state=42
- )
+ umap_model = UMAP(n_neighbors=15, n_components=6, min_dist=0.0, metric="cosine", random_state=42)
hdbscan_model = HDBSCAN(
min_cluster_size=3,
metric="euclidean",
@@ -177,9 +171,7 @@ def online_topic_model(documents, document_embeddings, embedding_model):
topics = []
for index in range(0, len(documents), 50):
- model.partial_fit(
- documents[index : index + 50], document_embeddings[index : index + 50]
- )
+ model.partial_fit(documents[index : index + 50], document_embeddings[index : index + 50])
topics.extend(model.topics_)
model.topics_ = topics
return model
diff --git a/tests/test_bertopic.py b/tests/test_bertopic.py
index 73614e1b..3bcc6cbb 100644
--- a/tests/test_bertopic.py
+++ b/tests/test_bertopic.py
@@ -75,13 +75,9 @@ def test_full_model(model, documents, request):
# Test zero-shot topic modeling
if topic_model._is_zeroshot():
if topic_model._outliers:
- assert set(topic_model.topic_labels_.keys()) == set(
- range(-1, len(topic_model.topic_labels_) - 1)
- )
+ assert set(topic_model.topic_labels_.keys()) == set(range(-1, len(topic_model.topic_labels_) - 1))
else:
- assert set(topic_model.topic_labels_.keys()) == set(
- range(len(topic_model.topic_labels_))
- )
+ assert set(topic_model.topic_labels_.keys()) == set(range(len(topic_model.topic_labels_)))
# Test topics over time
timestamps = [i % 10 for i in range(len(documents))]
@@ -130,9 +126,7 @@ def test_full_model(model, documents, request):
assert topic != original_topic
# Test updating topic labels
- topic_labels = topic_model.generate_topic_labels(
- nr_words=3, topic_prefix=False, word_length=10, separator=", "
- )
+ topic_labels = topic_model.generate_topic_labels(nr_words=3, topic_prefix=False, word_length=10, separator=", ")
assert len(topic_labels) == len(set(topic_model.topics_))
# Test setting topic labels
@@ -148,9 +142,7 @@ def test_full_model(model, documents, request):
# Test reduction of outliers
if -1 in topics:
new_topics = topic_model.reduce_outliers(documents, topics, threshold=0.0)
- nr_outliers_topic_model = sum(
- [1 for topic in topic_model.topics_ if topic == -1]
- )
+ nr_outliers_topic_model = sum([1 for topic in topic_model.topics_ if topic == -1])
nr_outliers_new_topics = sum([1 for topic in new_topics if topic == -1])
if topic_model._outliers == 1:
diff --git a/tests/test_plotting/test_approximate.py b/tests/test_plotting/test_approximate.py
index 2de86848..1b0a78eb 100644
--- a/tests/test_plotting/test_approximate.py
+++ b/tests/test_plotting/test_approximate.py
@@ -18,28 +18,17 @@ def test_approximate_distribution(batch_size, padding, model, documents, request
topic_model = copy.deepcopy(request.getfixturevalue(model))
# Calculate only on a document-level based on tokensets
- topic_distr, _ = topic_model.approximate_distribution(
- documents, padding=padding, batch_size=batch_size
- )
- assert (
- topic_distr.shape[1] == len(topic_model.topic_labels_) - topic_model._outliers
- )
+ topic_distr, _ = topic_model.approximate_distribution(documents, padding=padding, batch_size=batch_size)
+ assert topic_distr.shape[1] == len(topic_model.topic_labels_) - topic_model._outliers
# Use the distribution visualization
for i in range(3):
topic_model.visualize_distribution(topic_distr[i])
# Calculate distribution on a token-level
- topic_distr, topic_token_distr = topic_model.approximate_distribution(
- documents[:100], calculate_tokens=True
- )
- assert (
- topic_distr.shape[1] == len(topic_model.topic_labels_) - topic_model._outliers
- )
+ topic_distr, topic_token_distr = topic_model.approximate_distribution(documents[:100], calculate_tokens=True)
+ assert topic_distr.shape[1] == len(topic_model.topic_labels_) - topic_model._outliers
assert len(topic_token_distr) == len(documents[:100])
for token_distr in topic_token_distr:
- assert (
- token_distr.shape[1]
- == len(topic_model.topic_labels_) - topic_model._outliers
- )
+ assert token_distr.shape[1] == len(topic_model.topic_labels_) - topic_model._outliers
diff --git a/tests/test_plotting/test_documents.py b/tests/test_plotting/test_documents.py
index 81acbe4c..8d94767b 100644
--- a/tests/test_plotting/test_documents.py
+++ b/tests/test_plotting/test_documents.py
@@ -17,8 +17,6 @@ def test_documents(model, reduced_embeddings, documents, request):
topics = set(topic_model.topics_)
if -1 in topics:
topics.remove(-1)
- fig = topic_model.visualize_documents(
- documents, embeddings=reduced_embeddings, hide_document_hover=True
- )
+ fig = topic_model.visualize_documents(documents, embeddings=reduced_embeddings, hide_document_hover=True)
fig_topics = [int(data["name"].split("_")[0]) for data in fig.to_dict()["data"][1:]]
assert set(fig_topics) == topics
diff --git a/tests/test_plotting/test_dynamic.py b/tests/test_plotting/test_dynamic.py
index 361702b1..6551da52 100644
--- a/tests/test_plotting/test_dynamic.py
+++ b/tests/test_plotting/test_dynamic.py
@@ -19,7 +19,4 @@ def test_dynamic(model, documents, request):
topics_over_time = topic_model.topics_over_time(documents, timestamps)
fig = topic_model.visualize_topics_over_time(topics_over_time)
- assert (
- len(fig.to_dict()["data"])
- == len(set(topic_model.topics_)) - topic_model._outliers
- )
+ assert len(fig.to_dict()["data"]) == len(set(topic_model.topics_)) - topic_model._outliers
diff --git a/tests/test_plotting/test_term_rank.py b/tests/test_plotting/test_term_rank.py
index 318d7d3c..67015d05 100644
--- a/tests/test_plotting/test_term_rank.py
+++ b/tests/test_plotting/test_term_rank.py
@@ -2,9 +2,7 @@
import pytest
-@pytest.mark.parametrize(
- "model", [("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model")]
-)
+@pytest.mark.parametrize("model", [("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model")])
def test_term_rank(model, request):
topic_model = copy.deepcopy(request.getfixturevalue(model))
topic_model.visualize_term_rank()
diff --git a/tests/test_reduction/test_merge.py b/tests/test_reduction/test_merge.py
index b69ee3cd..67bf9934 100644
--- a/tests/test_reduction/test_merge.py
+++ b/tests/test_reduction/test_merge.py
@@ -19,9 +19,7 @@ def test_merge(model, documents, request):
topics_to_merge = [1, 2]
topic_model.merge_topics(documents, topics_to_merge)
- mappings = topic_model.topic_mapper_.get_mappings(
- list(topic_model.hdbscan_model.labels_)
- )
+ mappings = topic_model.topic_mapper_.get_mappings(list(topic_model.hdbscan_model.labels_))
mapped_labels = [mappings[label] for label in topic_model.hdbscan_model.labels_]
assert nr_topics == len(set(topic_model.topics_)) + 1
@@ -33,9 +31,7 @@ def test_merge(model, documents, request):
topics_to_merge = [1, 2]
topic_model.merge_topics(documents, topics_to_merge)
- mappings = topic_model.topic_mapper_.get_mappings(
- list(topic_model.hdbscan_model.labels_)
- )
+ mappings = topic_model.topic_mapper_.get_mappings(list(topic_model.hdbscan_model.labels_))
mapped_labels = [mappings[label] for label in topic_model.hdbscan_model.labels_]
assert nr_topics == len(set(topic_model.topics_)) + 2
diff --git a/tests/test_representation/test_representations.py b/tests/test_representation/test_representations.py
index 98b8f4dd..7c819964 100644
--- a/tests/test_representation/test_representations.py
+++ b/tests/test_representation/test_representations.py
@@ -151,9 +151,7 @@ def test_topic_reduction_edge_cases(model, documents, request):
topic_model.nr_topics = 100
nr_topics = 5
topics = np.random.randint(-1, nr_topics - 1, len(documents))
- old_documents = pd.DataFrame(
- {"Document": documents, "ID": range(len(documents)), "Topic": topics}
- )
+ old_documents = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": topics})
topic_model._update_topic_size(old_documents)
topic_model._extract_topics(old_documents)
old_freq = topic_model.get_topic_freq()
diff --git a/tests/test_sub_models/test_cluster.py b/tests/test_sub_models/test_cluster.py
index 6115d08e..265f6f78 100644
--- a/tests/test_sub_models/test_cluster.py
+++ b/tests/test_sub_models/test_cluster.py
@@ -21,13 +21,9 @@
],
)
def test_hdbscan_cluster_embeddings(cluster_model, samples, features, centers):
- embeddings, _ = make_blobs(
- n_samples=samples, centers=centers, n_features=features, random_state=42
- )
+ embeddings, _ = make_blobs(n_samples=samples, centers=centers, n_features=features, random_state=42)
documents = [str(i + 1) for i in range(embeddings.shape[0])]
- old_df = pd.DataFrame(
- {"Document": documents, "ID": range(len(documents)), "Topic": None}
- )
+ old_df = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": None})
if cluster_model == "kmeans":
cluster_model = KMeans(n_clusters=centers)
@@ -44,9 +40,7 @@ def test_hdbscan_cluster_embeddings(cluster_model, samples, features, centers):
assert len(new_df.Topic.unique()) == centers
assert "Topic" in new_df.columns
- pd.testing.assert_frame_equal(
- old_df.drop("Topic", axis=1), new_df.drop("Topic", axis=1)
- )
+ pd.testing.assert_frame_equal(old_df.drop("Topic", axis=1), new_df.drop("Topic", axis=1))
@pytest.mark.parametrize("cluster_model", ["hdbscan", "kmeans"])
@@ -62,13 +56,9 @@ def test_hdbscan_cluster_embeddings(cluster_model, samples, features, centers):
],
)
def test_custom_hdbscan_cluster_embeddings(cluster_model, samples, features, centers):
- embeddings, _ = make_blobs(
- n_samples=samples, centers=centers, n_features=features, random_state=42
- )
+ embeddings, _ = make_blobs(n_samples=samples, centers=centers, n_features=features, random_state=42)
documents = [str(i + 1) for i in range(embeddings.shape[0])]
- old_df = pd.DataFrame(
- {"Document": documents, "ID": range(len(documents)), "Topic": None}
- )
+ old_df = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": None})
if cluster_model == "kmeans":
cluster_model = KMeans(n_clusters=centers)
else:
@@ -84,6 +74,4 @@ def test_custom_hdbscan_cluster_embeddings(cluster_model, samples, features, cen
assert len(new_df.Topic.unique()) == centers
assert "Topic" in new_df.columns
- pd.testing.assert_frame_equal(
- old_df.drop("Topic", axis=1), new_df.drop("Topic", axis=1)
- )
+ pd.testing.assert_frame_equal(old_df.drop("Topic", axis=1), new_df.drop("Topic", axis=1))
diff --git a/tests/test_sub_models/test_embeddings.py b/tests/test_sub_models/test_embeddings.py
index 22f53539..75735607 100644
--- a/tests/test_sub_models/test_embeddings.py
+++ b/tests/test_sub_models/test_embeddings.py
@@ -19,9 +19,7 @@
def test_extract_embeddings(model, request):
topic_model = copy.deepcopy(request.getfixturevalue(model))
single_embedding = topic_model._extract_embeddings("a document")
- multiple_embeddings = topic_model._extract_embeddings(
- ["something different", "another document"]
- )
+ multiple_embeddings = topic_model._extract_embeddings(["something different", "another document"])
sim_matrix = cosine_similarity(single_embedding, multiple_embeddings)[0]
assert single_embedding.shape[0] == 1
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 2974b1b6..90876e76 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -41,15 +41,9 @@ def test_check_embeddings_shape():
def test_make_unique_distances():
def check_dists(dists: List[float], noise_max: float):
- unique_dists = get_unique_distances(
- np.array(dists, dtype=float), noise_max=noise_max
- )
- assert len(unique_dists) == len(
- dists
- ), "The number of elements must be the same"
- assert len(dists) == len(
- np.unique(unique_dists)
- ), "The distances must be unique"
+ unique_dists = get_unique_distances(np.array(dists, dtype=float), noise_max=noise_max)
+ assert len(unique_dists) == len(dists), "The number of elements must be the same"
+ assert len(dists) == len(np.unique(unique_dists)), "The distances must be unique"
check_dists([0, 0, 0.5, 0.75, 1, 1], noise_max=1e-7)
@@ -69,44 +63,32 @@ def test_select_topic_representation():
topic_embeddings = np.array([[2, 2, 2]])
# Use topic embeddings
- repr_, ctfidf_used = select_topic_representation(
- ctfidf_embeddings, topic_embeddings, use_ctfidf=False
- )
+ repr_, ctfidf_used = select_topic_representation(ctfidf_embeddings, topic_embeddings, use_ctfidf=False)
np.testing.assert_array_equal(topic_embeddings, repr_)
assert not ctfidf_used
# Fallback to c-TF-IDF
- repr_, ctfidf_used = select_topic_representation(
- ctfidf_embeddings, None, use_ctfidf=False
- )
+ repr_, ctfidf_used = select_topic_representation(ctfidf_embeddings, None, use_ctfidf=False)
np.testing.assert_array_equal(ctfidf_embeddings, repr_)
assert ctfidf_used
# Use c-TF-IDF
- repr_, ctfidf_used = select_topic_representation(
- ctfidf_embeddings, topic_embeddings, use_ctfidf=True
- )
+ repr_, ctfidf_used = select_topic_representation(ctfidf_embeddings, topic_embeddings, use_ctfidf=True)
np.testing.assert_array_equal(ctfidf_embeddings, repr_)
assert ctfidf_used
# Fallback to topic embeddings
- repr_, ctfidf_used = select_topic_representation(
- None, topic_embeddings, use_ctfidf=True
- )
+ repr_, ctfidf_used = select_topic_representation(None, topic_embeddings, use_ctfidf=True)
np.testing.assert_array_equal(topic_embeddings, repr_)
assert not ctfidf_used
# `scipy.sparse.csr_matrix` can be used as c-TF-IDF embeddings
np.testing.assert_array_equal(
ctfidf_embeddings,
- select_topic_representation(
- ctfidf_embeddings_sparse, None, use_ctfidf=True, output_ndarray=True
- )[0],
+ select_topic_representation(ctfidf_embeddings_sparse, None, use_ctfidf=True, output_ndarray=True)[0],
)
    # check that `csr_matrix` is not cast to `np.ndarray` when `output_ndarray` is False
- repr_ = select_topic_representation(
- ctfidf_embeddings_sparse, None, output_ndarray=False
- )[0]
+ repr_ = select_topic_representation(ctfidf_embeddings_sparse, None, output_ndarray=False)[0]
assert isinstance(repr_, csr_matrix)
diff --git a/tests/test_variations/test_class.py b/tests/test_variations/test_class.py
index a94c108d..5c969b51 100644
--- a/tests/test_variations/test_class.py
+++ b/tests/test_variations/test_class.py
@@ -18,12 +18,8 @@
)
def test_class(model, documents, request):
topic_model = copy.deepcopy(request.getfixturevalue(model))
- topics_per_class_global = topic_model.topics_per_class(
- documents, classes=classes, global_tuning=True
- )
- topics_per_class_local = topic_model.topics_per_class(
- documents, classes=classes, global_tuning=False
- )
+ topics_per_class_global = topic_model.topics_per_class(documents, classes=classes, global_tuning=True)
+ topics_per_class_local = topic_model.topics_per_class(documents, classes=classes, global_tuning=False)
assert topics_per_class_global.Frequency.sum() == len(documents)
assert topics_per_class_local.Frequency.sum() == len(documents)
diff --git a/tests/test_variations/test_hierarchy.py b/tests/test_variations/test_hierarchy.py
index cdfdaf8d..1ac7091d 100644
--- a/tests/test_variations/test_hierarchy.py
+++ b/tests/test_variations/test_hierarchy.py
@@ -36,9 +36,7 @@ def test_hierarchy(model, documents, request):
def test_linkage(model, documents, request):
topic_model = copy.deepcopy(request.getfixturevalue(model))
linkage_function = lambda x: sch.linkage(x, "single", optimal_ordering=True)
- hierarchical_topics = topic_model.hierarchical_topics(
- documents, linkage_function=linkage_function
- )
+ hierarchical_topics = topic_model.hierarchical_topics(documents, linkage_function=linkage_function)
merged_topics = set([v for vals in hierarchical_topics.Topics.values for v in vals])
tree = topic_model.get_topic_tree(hierarchical_topics)
@@ -61,9 +59,7 @@ def test_linkage(model, documents, request):
def test_tree(model, documents, request):
topic_model = copy.deepcopy(request.getfixturevalue(model))
linkage_function = lambda x: sch.linkage(x, "single", optimal_ordering=True)
- hierarchical_topics = topic_model.hierarchical_topics(
- documents, linkage_function=linkage_function
- )
+ hierarchical_topics = topic_model.hierarchical_topics(documents, linkage_function=linkage_function)
merged_topics = set([v for vals in hierarchical_topics.Topics.values for v in vals])
tree = topic_model.get_topic_tree(hierarchical_topics)
diff --git a/tests/test_vectorizers/test_ctfidf.py b/tests/test_vectorizers/test_ctfidf.py
index a6cedccd..5d2626b6 100644
--- a/tests/test_vectorizers/test_ctfidf.py
+++ b/tests/test_vectorizers/test_ctfidf.py
@@ -23,12 +23,8 @@
def test_ctfidf(model, documents, request):
topic_model = copy.deepcopy(request.getfixturevalue(model))
topics = topic_model.topics_
- documents = pd.DataFrame(
- {"Document": documents, "ID": range(len(documents)), "Topic": topics}
- )
- documents_per_topic = documents.groupby(["Topic"], as_index=False).agg(
- {"Document": " ".join}
- )
+ documents = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": topics})
+ documents_per_topic = documents.groupby(["Topic"], as_index=False).agg({"Document": " ".join})
documents = topic_model._preprocess_text(documents_per_topic.Document.values)
count = topic_model.vectorizer_model.fit(documents)
@@ -74,12 +70,8 @@ def test_ctfidf_custom_cv(model, documents, request):
topic_model = copy.deepcopy(request.getfixturevalue(model))
topic_model.vectorizer_model = cv
topics = topic_model.topics_
- documents = pd.DataFrame(
- {"Document": documents, "ID": range(len(documents)), "Topic": topics}
- )
- documents_per_topic = documents.groupby(["Topic"], as_index=False).agg(
- {"Document": " ".join}
- )
+ documents = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": topics})
+ documents_per_topic = documents.groupby(["Topic"], as_index=False).agg({"Document": " ".join})
documents = topic_model._preprocess_text(documents_per_topic.Document.values)
count = topic_model.vectorizer_model.fit(documents)