diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 01f47754..00000000 --- a/.flake8 +++ /dev/null @@ -1,2 +0,0 @@ -[flake8] -max-line-length = 160 diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py index 5682f40e..7ef1efbb 100644 --- a/bertopic/_bertopic.py +++ b/bertopic/_bertopic.py @@ -221,8 +221,7 @@ def __init__( # Topic-based parameters if top_n_words > 100: logger.warning( - "Note that extracting more than 100 words from a sparse " - "can slow down computation quite a bit." + "Note that extracting more than 100 words from a sparse can slow down computation quite a bit." ) self.top_n_words = top_n_words @@ -241,9 +240,7 @@ def __init__( # Vectorizer self.n_gram_range = n_gram_range - self.vectorizer_model = vectorizer_model or CountVectorizer( - ngram_range=self.n_gram_range - ) + self.vectorizer_model = vectorizer_model or CountVectorizer(ngram_range=self.n_gram_range) self.ctfidf_model = ctfidf_model or ClassTfidfTransformer() # Representation model @@ -364,9 +361,7 @@ def fit( topic_model = BERTopic().fit(docs, embeddings) ``` """ - self.fit_transform( - documents=documents, embeddings=embeddings, y=y, images=images - ) + self.fit_transform(documents=documents, embeddings=embeddings, y=y, images=images) return self def fit_transform( @@ -427,16 +422,12 @@ def fit_transform( check_embeddings_shape(embeddings, documents) doc_ids = range(len(documents)) if documents is not None else range(len(images)) - documents = pd.DataFrame( - {"Document": documents, "ID": doc_ids, "Topic": None, "Image": images} - ) + documents = pd.DataFrame({"Document": documents, "ID": doc_ids, "Topic": None, "Image": images}) # Extract embeddings if embeddings is None: logger.info("Embedding - Transforming documents to embeddings.") - self.embedding_model = select_backend( - self.embedding_model, language=self.language, verbose=self.verbose - ) + self.embedding_model = select_backend(self.embedding_model, language=self.language, verbose=self.verbose) embeddings = self._extract_embeddings( documents.Document.values.tolist(), images=images, @@ -446,9 +437,7 @@ def fit_transform( logger.info("Embedding - Completed \u2713") else: if self.embedding_model is not None: - self.embedding_model = select_backend( - self.embedding_model, language=self.language - ) + self.embedding_model = select_backend(self.embedding_model, language=self.language) # Guided Topic Modeling if self.seed_topic_list is not None and self.embedding_model is not None: @@ -459,17 +448,15 @@ def fit_transform( # Zero-shot Topic Modeling if self._is_zeroshot(): - documents, embeddings, assigned_documents, assigned_embeddings = ( - self._zeroshot_topic_modeling(documents, embeddings) + documents, embeddings, assigned_documents, assigned_embeddings = self._zeroshot_topic_modeling( + documents, embeddings ) # Filter UMAP embeddings to only non-assigned embeddings to be used for clustering umap_embeddings = self.umap_model.transform(embeddings) if len(documents) > 0: # No zero-shot topics matched # Cluster reduced embeddings - documents, probabilities = self._cluster_embeddings( - umap_embeddings, documents, y=y - ) + documents, probabilities = self._cluster_embeddings(umap_embeddings, documents, y=y) if self._is_zeroshot() and len(assigned_documents) > 0: documents, embeddings = self._combine_zeroshot_topics( documents, embeddings, assigned_documents, assigned_embeddings @@ -526,9 +513,7 @@ def fit_transform( ] # Resulting output - self.probabilities_ = self._map_probabilities( - probabilities, original_topics=True - ) + self.probabilities_ = self._map_probabilities(probabilities, original_topics=True) predictions = documents.Topic.to_list() return predictions, self.probabilities_ @@ -588,9 +573,7 @@ def transform( documents = [documents] if embeddings is None: - embeddings = self._extract_embeddings( - documents, images=images, method="document", verbose=self.verbose - ) + embeddings = self._extract_embeddings(documents, images=images, method="document", verbose=self.verbose) # Check if an embedding model was found if embeddings is None: @@ -602,9 +585,7 @@ def transform( # Transform without hdbscan_model and umap_model using only cosine similarity elif type(self.hdbscan_model) == BaseCluster: - logger.info( - "Predicting topic assignments through cosine similarity of topic and document embeddings." - ) + logger.info("Predicting topic assignments through cosine similarity of topic and document embeddings.") sim_matrix = cosine_similarity(embeddings, np.array(self.topic_embeddings_)) predictions = np.argmax(sim_matrix, axis=1) - self._outliers @@ -628,12 +609,8 @@ def transform( # Calculate probabilities if self.calculate_probabilities: - logger.info( - "Probabilities - Start calculation of probabilities with HDBSCAN" - ) - probabilities = hdbscan_delegator( - self.hdbscan_model, "membership_vector", umap_embeddings - ) + logger.info("Probabilities - Start calculation of probabilities with HDBSCAN") + probabilities = hdbscan_delegator(self.hdbscan_model, "membership_vector", umap_embeddings) logger.info("Probabilities - Completed \u2713") else: predictions = self.hdbscan_model.predict(umap_embeddings) @@ -712,16 +689,13 @@ def partial_fit( check_embeddings_shape(embeddings, documents) if not hasattr(self.hdbscan_model, "partial_fit"): raise ValueError( - "In order to use `.partial_fit`, the cluster model should have " - "a `.partial_fit` function." + "In order to use `.partial_fit`, the cluster model should have " "a `.partial_fit` function." ) # Prepare documents if isinstance(documents, str): documents = [documents] - documents = pd.DataFrame( - {"Document": documents, "ID": range(len(documents)), "Topic": None} - ) + documents = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": None}) # Extract embeddings if embeddings is None: @@ -746,9 +720,7 @@ def partial_fit( umap_embeddings = self._reduce_dimensionality(embeddings, y, partial_fit=True) # Cluster reduced embeddings - documents, self.probabilities_ = self._cluster_embeddings( - umap_embeddings, documents, partial_fit=True - ) + documents, self.probabilities_ = self._cluster_embeddings(umap_embeddings, documents, partial_fit=True) topics = documents.Topic.to_list() # Map and find new topics @@ -756,10 +728,7 @@ def partial_fit( self.topic_mapper_ = TopicMapper(topics) mappings = self.topic_mapper_.get_mappings() new_topics = set(topics).difference(set(mappings.keys())) - new_topic_ids = { - topic: max(mappings.values()) + index + 1 - for index, topic in enumerate(new_topics) - } + new_topic_ids = {topic: max(mappings.values()) + index + 1 for index, topic in enumerate(new_topics)} self.topic_mapper_.add_new_topics(new_topic_ids) updated_mappings = self.topic_mapper_.get_mappings() updated_topics = [updated_mappings[topic] for topic in topics] @@ -767,25 +736,19 @@ def partial_fit( # Add missing topics (topics that were originally created but are now missing) if self.topic_representations_: - missing_topics = set(self.topic_representations_.keys()).difference( - set(updated_topics) - ) + missing_topics = set(self.topic_representations_.keys()).difference(set(updated_topics)) for missing_topic in missing_topics: documents.loc[len(documents), :] = [" ", len(documents), missing_topic] else: missing_topics = {} # Prepare documents - documents_per_topic = documents.sort_values("Topic").groupby( - ["Topic"], as_index=False - ) + documents_per_topic = documents.sort_values("Topic").groupby(["Topic"], as_index=False) updated_topics = documents_per_topic.first().Topic.astype(int) documents_per_topic = documents_per_topic.agg({"Document": " ".join}) # Update topic representations - self.c_tf_idf_, updated_words = self._c_tf_idf( - documents_per_topic, partial_fit=True - ) + self.c_tf_idf_, updated_words = self._c_tf_idf(documents_per_topic, partial_fit=True) self.topic_representations_ = self._extract_words_per_topic( updated_words, documents, self.c_tf_idf_, calculate_aspects=False ) @@ -801,10 +764,7 @@ def partial_fit( sizes = documents.groupby(["Topic"], as_index=False).count() for _, row in sizes.iterrows(): topic = int(row.Topic) - if ( - self.topic_sizes_.get(topic) is not None - and topic not in missing_topics - ): + if self.topic_sizes_.get(topic) is not None and topic not in missing_topics: self.topic_sizes_[topic] += int(row.Document) elif self.topic_sizes_.get(topic) is None: self.topic_sizes_[topic] = int(row.Document) @@ -879,9 +839,7 @@ def topics_over_time( check_is_fitted(self) check_documents_type(docs) selected_topics = topics if topics else self.topics_ - documents = pd.DataFrame( - {"Document": docs, "Topic": selected_topics, "Timestamps": timestamps} - ) + documents = pd.DataFrame({"Document": docs, "Topic": selected_topics, "Timestamps": timestamps}) global_c_tf_idf = normalize(self.c_tf_idf_, axis=1, norm="l1", copy=False) all_topics = sorted(list(documents.Topic.unique())) @@ -930,9 +888,7 @@ def topics_over_time( list(set(previous_topics).intersection(set(current_topics))) # noqa: F821 ) - current_overlap_idx = [ - current_topics.index(topic) for topic in overlapping_topics - ] + current_overlap_idx = [current_topics.index(topic) for topic in overlapping_topics] previous_overlap_idx = [ previous_topics.index(topic) # noqa: F821 for topic in overlapping_topics @@ -940,8 +896,7 @@ def topics_over_time( c_tf_idf.tolil()[current_overlap_idx] = ( ( - c_tf_idf[current_overlap_idx] - + previous_c_tf_idf[previous_overlap_idx] # noqa: F821 + c_tf_idf[current_overlap_idx] + previous_c_tf_idf[previous_overlap_idx] # noqa: F821 ) / 2.0 ).tolil() @@ -949,16 +904,11 @@ def topics_over_time( # Fine-tune the timestamp c-TF-IDF representation based on the global c-TF-IDF representation # by simply taking the average of the two if global_tuning: - selected_topics = [ - all_topics_indices[topic] - for topic in documents_per_topic.Topic.values - ] + selected_topics = [all_topics_indices[topic] for topic in documents_per_topic.Topic.values] c_tf_idf = (global_c_tf_idf[selected_topics] + c_tf_idf) / 2.0 # Extract the words per topic - words_per_topic = self._extract_words_per_topic( - words, selection, c_tf_idf, calculate_aspects=False - ) + words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False) topic_frequency = pd.Series( documents_per_topic.Timestamps.values, index=documents_per_topic.Topic ).to_dict() @@ -979,9 +929,7 @@ def topics_over_time( previous_topics = sorted(list(documents_per_topic.Topic.values)) # noqa: F841 previous_c_tf_idf = c_tf_idf.copy() # noqa: F841 - return pd.DataFrame( - topics_over_time, columns=["Topic", "Words", "Frequency", "Timestamp"] - ) + return pd.DataFrame(topics_over_time, columns=["Topic", "Words", "Frequency", "Timestamp"]) def topics_per_class( self, @@ -1023,9 +971,7 @@ def topics_per_class( ``` """ check_documents_type(docs) - documents = pd.DataFrame( - {"Document": docs, "Topic": self.topics_, "Class": classes} - ) + documents = pd.DataFrame({"Document": docs, "Topic": self.topics_, "Class": classes}) global_c_tf_idf = normalize(self.c_tf_idf_, axis=1, norm="l1", copy=False) # For each unique timestamp, create topic representations @@ -1042,18 +988,11 @@ def topics_per_class( # by simply taking the average of the two if global_tuning: c_tf_idf = normalize(c_tf_idf, axis=1, norm="l1", copy=False) - c_tf_idf = ( - global_c_tf_idf[documents_per_topic.Topic.values + self._outliers] - + c_tf_idf - ) / 2.0 + c_tf_idf = (global_c_tf_idf[documents_per_topic.Topic.values + self._outliers] + c_tf_idf) / 2.0 # Extract the words per topic - words_per_topic = self._extract_words_per_topic( - words, selection, c_tf_idf, calculate_aspects=False - ) - topic_frequency = pd.Series( - documents_per_topic.Class.values, index=documents_per_topic.Topic - ).to_dict() + words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False) + topic_frequency = pd.Series(documents_per_topic.Class.values, index=documents_per_topic.Topic).to_dict() # Fill dataframe with results topics_at_class = [ @@ -1067,9 +1006,7 @@ def topics_per_class( ] topics_per_class.extend(topics_at_class) - topics_per_class = pd.DataFrame( - topics_per_class, columns=["Topic", "Words", "Frequency", "Class"] - ) + topics_per_class = pd.DataFrame(topics_per_class, columns=["Topic", "Words", "Frequency", "Class"]) return topics_per_class @@ -1138,9 +1075,9 @@ def hierarchical_topics( linkage_function = lambda x: sch.linkage(x, "ward", optimal_ordering=True) # Calculate distance - embeddings = select_topic_representation( - self.c_tf_idf_, self.topic_embeddings_, use_ctfidf - )[0][self._outliers :] + embeddings = select_topic_representation(self.c_tf_idf_, self.topic_embeddings_, use_ctfidf)[0][ + self._outliers : + ] X = distance_function(embeddings) X = validate_distance_matrix(X, embeddings.shape[0]) @@ -1153,15 +1090,9 @@ def hierarchical_topics( Z[:, 2] = get_unique_distances(Z[:, 2]) # Calculate basic bag-of-words to be iteratively merged later - documents = pd.DataFrame( - {"Document": docs, "ID": range(len(docs)), "Topic": self.topics_} - ) - documents_per_topic = documents.groupby(["Topic"], as_index=False).agg( - {"Document": " ".join} - ) - documents_per_topic = documents_per_topic.loc[ - documents_per_topic.Topic != -1, : - ] + documents = pd.DataFrame({"Document": docs, "ID": range(len(docs)), "Topic": self.topics_}) + documents_per_topic = documents.groupby(["Topic"], as_index=False).agg({"Document": " ".join}) + documents_per_topic = documents_per_topic.loc[documents_per_topic.Topic != -1, :] clean_documents = self._preprocess_text(documents_per_topic.Document.values) # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0 @@ -1187,9 +1118,7 @@ def hierarchical_topics( ) for index in tqdm(range(len(Z))): # Find clustered documents - clusters = ( - sch.fcluster(Z, t=Z[index][2], criterion="distance") - self._outliers - ) + clusters = sch.fcluster(Z, t=Z[index][2], criterion="distance") - self._outliers nr_clusters = len(clusters) # Extract first topic we find to get the set of topics in a merged topic @@ -1200,18 +1129,14 @@ def hierarchical_topics( topic = int(val) else: val = Z[int(val - len(clusters))][0] - clustered_topics = [ - i for i, x in enumerate(clusters) if x == clusters[topic] - ] + clustered_topics = [i for i, x in enumerate(clusters) if x == clusters[topic]] # Group bow per cluster, calculate c-TF-IDF and extract words grouped = csr_matrix(bow[clustered_topics].sum(axis=0)) c_tf_idf = self.ctfidf_model.transform(grouped) selection = documents.loc[documents.Topic.isin(clustered_topics), :] selection.Topic = 0 - words_per_topic = self._extract_words_per_topic( - words, selection, c_tf_idf, calculate_aspects=False - ) + words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False) # Extract parent's name and ID parent_id = index + len(clusters) @@ -1398,9 +1323,7 @@ def approximate_distribution( t = math.ceil(window / stride) - 1 for i in range(math.ceil(window / stride) - 1): padded.append(tokenset[: window - ((t - i) * stride)]) - padded_ids.append( - list(range(0, window - ((t - i) * stride))) - ) + padded_ids.append(list(range(0, window - ((t - i) * stride)))) token_sets = padded + token_sets token_sets_ids = padded_ids + token_sets_ids @@ -1413,20 +1336,14 @@ def approximate_distribution( # Calculate similarity between embeddings of token sets and the topics if use_embedding_model: - embeddings = self._extract_embeddings( - all_sentences, method="document", verbose=True - ) - similarity = cosine_similarity( - embeddings, self.topic_embeddings_[self._outliers :] - ) + embeddings = self._extract_embeddings(all_sentences, method="document", verbose=True) + similarity = cosine_similarity(embeddings, self.topic_embeddings_[self._outliers :]) # Calculate similarity between c-TF-IDF of token sets and the topics else: bow_doc = self.vectorizer_model.transform(all_sentences) c_tf_idf_doc = self.ctfidf_model.transform(bow_doc) - similarity = cosine_similarity( - c_tf_idf_doc, self.c_tf_idf_[self._outliers :] - ) + similarity = cosine_similarity(c_tf_idf_doc, self.c_tf_idf_[self._outliers :]) # Only keep similarities that exceed the minimum similarity[similarity < min_similarity] = 0 @@ -1445,9 +1362,7 @@ def approximate_distribution( # Assign topics to individual tokens token_id = [i for i in range(len(token))] token_val = {index: [] for index in token_id} - for sim, token_set in zip( - similarity[start:end], all_token_sets_ids[start:end] - ): + for sim, token_set in zip(similarity[start:end], all_token_sets_ids[start:end]): for token in token_set: if token in token_val: token_val[token].append(sim) @@ -1477,9 +1392,7 @@ def approximate_distribution( end = end + 1 group = similarity[start:end].sum(axis=0) topic_distribution.append(group) - topic_distribution = normalize( - np.array(topic_distribution), norm="l1", axis=1 - ) + topic_distribution = normalize(np.array(topic_distribution), norm="l1", axis=1) topic_token_distribution = None # Combine results @@ -1493,9 +1406,7 @@ def approximate_distribution( return topic_distributions, topic_token_distributions - def find_topics( - self, search_term: str = None, image: str = None, top_n: int = 5 - ) -> Tuple[List[int], List[float]]: + def find_topics(self, search_term: str = None, image: str = None, top_n: int = 5) -> Tuple[List[int], List[float]]: """Find topics most similar to a search_term. Creates an embedding for a search query and compares that with @@ -1529,25 +1440,19 @@ def find_topics( search_term consists of a phrase or multiple words. """ if self.embedding_model is None: - raise Exception( - "This method can only be used if you did not use custom embeddings." - ) + raise Exception("This method can only be used if you did not use custom embeddings.") topic_list = list(self.topic_representations_.keys()) topic_list.sort() # Extract search_term embeddings and compare with topic embeddings if search_term is not None: - search_embedding = self._extract_embeddings( - [search_term], method="word", verbose=False - ).flatten() + search_embedding = self._extract_embeddings([search_term], method="word", verbose=False).flatten() elif image is not None: search_embedding = self._extract_embeddings( [None], images=[image], method="document", verbose=False ).flatten() - sims = cosine_similarity( - search_embedding.reshape(1, -1), self.topic_embeddings_ - ).flatten() + sims = cosine_similarity(search_embedding.reshape(1, -1), self.topic_embeddings_).flatten() # Extract topics most similar to search_term ids = np.argsort(sims)[-top_n:] @@ -1623,13 +1528,10 @@ def update_topics( if top_n_words > 100: logger.warning( - "Note that extracting more than 100 words from a sparse " - "can slow down computation quite a bit." + "Note that extracting more than 100 words from a sparse " "can slow down computation quite a bit." ) self.top_n_words = top_n_words - self.vectorizer_model = vectorizer_model or CountVectorizer( - ngram_range=n_gram_range - ) + self.vectorizer_model = vectorizer_model or CountVectorizer(ngram_range=n_gram_range) self.ctfidf_model = ctfidf_model or ClassTfidfTransformer() self.representation_model = representation_model @@ -1644,12 +1546,8 @@ def update_topics( "c-TF-IDF embeddings instead of centroid embeddings." ) - documents = pd.DataFrame( - {"Document": docs, "Topic": topics, "ID": range(len(docs)), "Image": images} - ) - documents_per_topic = documents.groupby(["Topic"], as_index=False).agg( - {"Document": " ".join} - ) + documents = pd.DataFrame({"Document": docs, "Topic": topics, "ID": range(len(docs)), "Image": images}) + documents_per_topic = documents.groupby(["Topic"], as_index=False).agg({"Document": " ".join}) # Update topic sizes and assignments self._update_topic_size(documents) @@ -1697,9 +1595,7 @@ def get_topics(self, full: bool = False) -> Mapping[str, Tuple[str, float]]: else: return self.topic_representations_ - def get_topic( - self, topic: int, full: bool = False - ) -> Union[Mapping[str, Tuple[str, float]], bool]: + def get_topic(self, topic: int, full: bool = False) -> Union[Mapping[str, Tuple[str, float]], bool]: """Return top n words for a specific topic and their c-TF-IDF scores. Arguments: @@ -1719,10 +1615,7 @@ def get_topic( if topic in self.topic_representations_: if full: representations = {"Main": self.topic_representations_[topic]} - aspects = { - aspect: representations[topic] - for aspect, representations in self.topic_aspects_.items() - } + aspects = {aspect: representations[topic] for aspect, representations in self.topic_aspects_.items()} representations.update(aspects) return representations else: @@ -1746,25 +1639,17 @@ def get_topic_info(self, topic: int = None) -> pd.DataFrame: """ check_is_fitted(self) - info = pd.DataFrame( - self.topic_sizes_.items(), columns=["Topic", "Count"] - ).sort_values("Topic") + info = pd.DataFrame(self.topic_sizes_.items(), columns=["Topic", "Count"]).sort_values("Topic") info["Name"] = info.Topic.map(self.topic_labels_) # Custom label if self.custom_labels_ is not None: if len(self.custom_labels_) == len(info): - labels = { - topic - self._outliers: label - for topic, label in enumerate(self.custom_labels_) - } + labels = {topic - self._outliers: label for topic, label in enumerate(self.custom_labels_)} info["CustomName"] = info["Topic"].map(labels) # Main Keywords - values = { - topic: list(list(zip(*values))[0]) - for topic, values in self.topic_representations_.items() - } + values = {topic: list(list(zip(*values))[0]) for topic, values in self.topic_representations_.items()} info["Representation"] = info["Topic"].map(values) # Extract all topic aspects @@ -1774,24 +1659,16 @@ def get_topic_info(self, topic: int = None) -> pd.DataFrame: if isinstance(list(values.values())[-1][0], tuple) or isinstance( list(values.values())[-1][0], list ): - values = { - topic: list(list(zip(*value))[0]) - for topic, value in values.items() - } + values = {topic: list(list(zip(*value))[0]) for topic, value in values.items()} elif isinstance(list(values.values())[-1][0], str): - values = { - topic: " ".join(value).strip() - for topic, value in values.items() - } + values = {topic: " ".join(value).strip() for topic, value in values.items()} info[aspect] = info["Topic"].map(values) # Representative Docs / Images if self.representative_docs_ is not None: info["Representative_Docs"] = info["Topic"].map(self.representative_docs_) if self.representative_images_ is not None: - info["Representative_Images"] = info["Topic"].map( - self.representative_images_ - ) + info["Representative_Images"] = info["Topic"].map(self.representative_images_) # Select specific topic to return if topic is not None: @@ -1826,9 +1703,9 @@ def get_topic_freq(self, topic: int = None) -> Union[pd.DataFrame, int]: if isinstance(topic, int): return self.topic_sizes_[topic] else: - return pd.DataFrame( - self.topic_sizes_.items(), columns=["Topic", "Count"] - ).sort_values("Count", ascending=False) + return pd.DataFrame(self.topic_sizes_.items(), columns=["Topic", "Count"]).sort_values( + "Count", ascending=False + ) def get_document_info( self, @@ -1899,10 +1776,7 @@ def get_document_info( document_info = pd.merge(document_info, topic_info, on="Topic", how="left") # Add top n words - top_n_words = { - topic: " - ".join(list(zip(*self.get_topic(topic)))[0]) - for topic in set(self.topics_) - } + top_n_words = {topic: " - ".join(list(zip(*self.get_topic(topic)))[0]) for topic in set(self.topics_)} document_info["Top_n_words"] = document_info.Topic.map(top_n_words) # Add flat probabilities @@ -1916,15 +1790,9 @@ def get_document_info( ] # Add representative document labels - repr_docs = [ - repr_doc - for repr_docs in self.representative_docs_.values() - for repr_doc in repr_docs - ] + repr_docs = [repr_doc for repr_docs in self.representative_docs_.values() for repr_doc in repr_docs] document_info["Representative_document"] = False - document_info.loc[ - document_info.Document.isin(repr_docs), "Representative_document" - ] = True + document_info.loc[document_info.Document.isin(repr_docs), "Representative_document"] = True # Add custom meta data provided by the user if metadata is not None: @@ -2028,12 +1896,8 @@ def get_topic_tree( max_original_topic = hier_topics.Parent_ID.astype(int).min() - 1 # Extract mapping from ID to name - topic_to_name = dict( - zip(hier_topics.Child_Left_ID, hier_topics.Child_Left_Name) - ) - topic_to_name.update( - dict(zip(hier_topics.Child_Right_ID, hier_topics.Child_Right_Name)) - ) + topic_to_name = dict(zip(hier_topics.Child_Left_ID, hier_topics.Child_Left_Name)) + topic_to_name.update(dict(zip(hier_topics.Child_Right_ID, hier_topics.Child_Right_Name))) topic_to_name = {topic: name[:100] for topic, name in topic_to_name.items()} # Create tree @@ -2051,8 +1915,7 @@ def get_tree(start, tree): def _tree(to_print, start, parent, tree, grandpa=None, indent=""): # Get distance between merged topics distance = hier_topics.loc[ - (hier_topics.Child_Left_ID == parent) - | (hier_topics.Child_Right_ID == parent), + (hier_topics.Child_Left_ID == parent) | (hier_topics.Child_Right_ID == parent), "Distance", ] distance = distance.values[0] if len(distance) > 0 else 10 @@ -2064,12 +1927,7 @@ def _tree(to_print, start, parent, tree, grandpa=None, indent=""): if int(parent) <= max_original_topic: # Do not append topic ID if they are not merged if distance < max_distance: - to_print += ( - "■──" - + topic_to_name[parent] - + f" ── Topic: {parent}" - + "\n" - ) + to_print += "■──" + topic_to_name[parent] + f" ── Topic: {parent}" + "\n" else: to_print += "O \n" else: @@ -2080,15 +1938,11 @@ def _tree(to_print, start, parent, tree, grandpa=None, indent=""): for child in tree[parent][:-1]: to_print += indent + "├" + "─" - to_print = _tree( - to_print, start, child, tree, parent, indent + "│" + " " * width - ) + to_print = _tree(to_print, start, child, tree, parent, indent + "│" + " " * width) child = tree[parent][-1] to_print += indent + "└" + "─" - to_print = _tree( - to_print, start, child, tree, parent, indent + " " * (width + 1) - ) + to_print = _tree(to_print, start, child, tree, parent, indent + " " * (width + 1)) return to_print @@ -2099,9 +1953,7 @@ def _tree(to_print, start, parent, tree, grandpa=None, indent=""): start = str(hier_topics.Parent_ID.astype(int).max()) return get_tree(start, tree) - def set_topic_labels( - self, topic_labels: Union[List[str], Mapping[int, str]] - ) -> None: + def set_topic_labels(self, topic_labels: Union[List[str], Mapping[int, str]]) -> None: """Set custom topic labels in your fitted BERTopic model. Arguments: @@ -2145,17 +1997,12 @@ def set_topic_labels( if isinstance(topic_labels, dict): if self.custom_labels_ is not None: - original_labels = { - topic: label - for topic, label in zip(unique_topics, self.custom_labels_) - } + original_labels = {topic: label for topic, label in zip(unique_topics, self.custom_labels_)} else: info = self.get_topic_info() original_labels = dict(zip(info.Topic, info.Name)) custom_labels = [ - topic_labels.get(topic) - if topic_labels.get(topic) - else original_labels[topic] + topic_labels.get(topic) if topic_labels.get(topic) else original_labels[topic] for topic in unique_topics ] @@ -2164,8 +2011,7 @@ def set_topic_labels( custom_labels = topic_labels else: raise ValueError( - "Make sure that `topic_labels` contains the same number " - "of labels as there are topics." + "Make sure that `topic_labels` contains the same number " "of labels as there are topics." ) self.custom_labels_ = custom_labels @@ -2283,8 +2129,7 @@ def merge_topics( mapping[topic] = topic_group[0] else: raise ValueError( - "Make sure that `topics_to_merge` is either" - "a list of topics or a list of list of topics." + "Make sure that `topics_to_merge` is either" "a list of topics or a list of list of topics." ) # Track mappings and sizes of topics for merging topic embeddings @@ -2472,9 +2317,7 @@ def reduce_outliers( # Check correct use of parameters if strategy.lower() == "probabilities" and probabilities is None: - raise ValueError( - "Make sure to pass in `probabilities` in order to use the probabilities strategy" - ) + raise ValueError("Make sure to pass in `probabilities` in order to use the probabilities strategy") # Reduce outliers by extracting most likely topics through the topic-term probability matrix if strategy.lower() == "probabilities": @@ -2490,12 +2333,8 @@ def reduce_outliers( topic_distr, _ = self.approximate_distribution( outlier_docs, min_similarity=threshold, **distributions_params ) - outlier_topics = iter( - [np.argmax(prob) if sum(prob) > 0 else -1 for prob in topic_distr] - ) - new_topics = [ - topic if topic != -1 else next(outlier_topics) for topic in topics - ] + outlier_topics = iter([np.argmax(prob) if sum(prob) > 0 else -1 for prob in topic_distr]) + new_topics = [topic if topic != -1 else next(outlier_topics) for topic in topics] # Reduce outliers by finding the most similar c-TF-IDF representations elif strategy.lower() == "c-tf-idf": @@ -2505,18 +2344,12 @@ def reduce_outliers( # Calculate c-TF-IDF of outlier documents with all topics bow_doc = self.vectorizer_model.transform(outlier_docs) c_tf_idf_doc = self.ctfidf_model.transform(bow_doc) - similarity = cosine_similarity( - c_tf_idf_doc, self.c_tf_idf_[self._outliers :] - ) + similarity = cosine_similarity(c_tf_idf_doc, self.c_tf_idf_[self._outliers :]) # Update topics similarity[similarity < threshold] = 0 - outlier_topics = iter( - [np.argmax(sim) if sum(sim) > 0 else -1 for sim in similarity] - ) - new_topics = [ - topic if topic != -1 else next(outlier_topics) for topic in topics - ] + outlier_topics = iter([np.argmax(sim) if sum(sim) > 0 else -1 for sim in similarity]) + new_topics = [topic if topic != -1 else next(outlier_topics) for topic in topics] # Reduce outliers by finding the most similar topic embeddings elif strategy.lower() == "embeddings": @@ -2533,28 +2366,18 @@ def reduce_outliers( # Extract or calculate embeddings for outlier documents if embeddings is not None: - outlier_embeddings = np.array( - [embeddings[index] for index in outlier_ids] - ) + outlier_embeddings = np.array([embeddings[index] for index in outlier_ids]) elif images is not None: outlier_images = [images[index] for index in outlier_ids] - outlier_embeddings = self.embedding_model.embed_images( - outlier_images, verbose=self.verbose - ) + outlier_embeddings = self.embedding_model.embed_images(outlier_images, verbose=self.verbose) else: outlier_embeddings = self.embedding_model.embed_documents(outlier_docs) - similarity = cosine_similarity( - outlier_embeddings, self.topic_embeddings_[self._outliers :] - ) + similarity = cosine_similarity(outlier_embeddings, self.topic_embeddings_[self._outliers :]) # Update topics similarity[similarity < threshold] = 0 - outlier_topics = iter( - [np.argmax(sim) if sum(sim) > 0 else -1 for sim in similarity] - ) - new_topics = [ - topic if topic != -1 else next(outlier_topics) for topic in topics - ] + outlier_topics = iter([np.argmax(sim) if sum(sim) > 0 else -1 for sim in similarity]) + new_topics = [topic if topic != -1 else next(outlier_topics) for topic in topics] return new_topics @@ -3507,9 +3330,7 @@ def save( ) # Minimal - save_utils.save_hf( - model=self, save_directory=save_directory, serialization=serialization - ) + save_utils.save_hf(model=self, save_directory=save_directory, serialization=serialization) save_utils.save_topics(model=self, path=save_directory / "topics.json") save_utils.save_images(model=self, path=save_directory / "images") save_utils.save_config( @@ -3525,9 +3346,7 @@ def save( save_directory=save_directory, serialization=serialization, ) - save_utils.save_ctfidf_config( - model=self, path=save_directory / "ctfidf_config.json" - ) + save_utils.save_ctfidf_config(model=self, path=save_directory / "ctfidf_config.json") @classmethod def load(cls, path: str, embedding_model=None): @@ -3557,22 +3376,16 @@ def load(cls, path: str, embedding_model=None): with open(file_or_dir, "rb") as file: if embedding_model: topic_model = joblib.load(file) - topic_model.embedding_model = select_backend( - embedding_model, verbose=topic_model.verbose - ) + topic_model.embedding_model = select_backend(embedding_model, verbose=topic_model.verbose) else: topic_model = joblib.load(file) return topic_model # Load from directory or HF if file_or_dir.is_dir(): - topics, params, tensors, ctfidf_tensors, ctfidf_config, images = ( - save_utils.load_local_files(file_or_dir) - ) + topics, params, tensors, ctfidf_tensors, ctfidf_config, images = save_utils.load_local_files(file_or_dir) elif "/" in str(path): - topics, params, tensors, ctfidf_tensors, ctfidf_config, images = ( - save_utils.load_files_from_hf(path) - ) + topics, params, tensors, ctfidf_tensors, ctfidf_config, images = save_utils.load_files_from_hf(path) else: raise ValueError("Make sure to either pass a valid directory or HF model.") topic_model = _create_model_from_files( @@ -3587,9 +3400,7 @@ def load(cls, path: str, embedding_model=None): # Replace embedding model if one is specifically chosen if embedding_model is not None: - topic_model.embedding_model = select_backend( - embedding_model, verbose=topic_model.verbose - ) + topic_model.embedding_model = select_backend(embedding_model, verbose=topic_model.verbose) return topic_model @@ -3645,9 +3456,7 @@ def merge_models(cls, models, min_similarity: float = 0.7, embedding_model=None) all_topics, all_params, all_tensors = [], [], [] for index, model in enumerate(models): model.save(tmpdir, serialization="pytorch") - topics, params, tensors, _, _, _ = save_utils.load_local_files( - Path(tmpdir) - ) + topics, params, tensors, _, _, _ = save_utils.load_local_files(Path(tmpdir)) all_topics.append(topics) all_params.append(params) all_tensors.append(np.array(tensors["topic_embeddings"])) @@ -3666,11 +3475,7 @@ def merge_models(cls, models, min_similarity: float = 0.7, embedding_model=None) # Extract new topics new_topics = sorted( - [ - index - selected_topics["_outliers"] - for index, sim in enumerate(sims) - if sim < min_similarity - ] + [index - selected_topics["_outliers"] for index, sim in enumerate(sims) if sim < min_similarity] ) max_topic = max(set(merged_topics["topics"])) @@ -3680,12 +3485,10 @@ def merge_models(cls, models, min_similarity: float = 0.7, embedding_model=None) if new_topic != -1: max_topic += 1 new_topics_dict[new_topic] = max_topic - merged_topics["topic_representations"][str(max_topic)] = ( - selected_topics["topic_representations"][str(new_topic)] - ) - merged_topics["topic_labels"][str(max_topic)] = selected_topics[ - "topic_labels" - ][str(new_topic)] + merged_topics["topic_representations"][str(max_topic)] = selected_topics["topic_representations"][ + str(new_topic) + ] + merged_topics["topic_labels"][str(max_topic)] = selected_topics["topic_labels"][str(new_topic)] # Add new aspects if selected_topics["topic_aspects"]: @@ -3698,27 +3501,19 @@ def merge_models(cls, models, min_similarity: float = 0.7, embedding_model=None) # If the original model does not have topic aspects but the to be added model does if not merged_topics.get("topic_aspects"): - merged_topics["topic_aspects"] = selected_topics[ - "topic_aspects" - ] + merged_topics["topic_aspects"] = selected_topics["topic_aspects"] # If they both contain topic aspects, add to the existing set of aspects else: - for aspect, values in selected_topics[ - "topic_aspects" - ].items(): - merged_topics["topic_aspects"][aspect][ - str(max_topic) - ] = values[str(new_topic)] + for aspect, values in selected_topics["topic_aspects"].items(): + merged_topics["topic_aspects"][aspect][str(max_topic)] = values[str(new_topic)] # Add new embeddings new_tensors = tensors[new_topic + selected_topics["_outliers"]] merged_tensors = np.vstack([merged_tensors, new_tensors]) # Topic Mapper - merged_topics["topic_mapper"] = TopicMapper( - list(range(-1, max_topic + 1, 1)) - ).mappings_ + merged_topics["topic_mapper"] = TopicMapper(list(range(-1, max_topic + 1, 1))).mappings_ # Find similar topics and re-assign those from the new models sims_idx = np.argmax(sim_matrix, axis=1) @@ -3749,13 +3544,8 @@ def merge_models(cls, models, min_similarity: float = 0.7, embedding_model=None) # Replace embedding model if one is specifically chosen verbose = any([model.verbose for model in models]) - if ( - embedding_model is not None - and type(merged_model.embedding_model) == BaseEmbedder - ): - merged_model.embedding_model = select_backend( - embedding_model, verbose=verbose - ) + if embedding_model is not None and type(merged_model.embedding_model) == BaseEmbedder: + merged_model.embedding_model = select_backend(embedding_model, verbose=verbose) return merged_model def push_to_hf_hub( @@ -3874,17 +3664,11 @@ def _extract_embeddings( documents = [documents] if images is not None and hasattr(self.embedding_model, "embed_images"): - embeddings = self.embedding_model.embed( - documents=documents, images=images, verbose=verbose - ) + embeddings = self.embedding_model.embed(documents=documents, images=images, verbose=verbose) elif method == "word": - embeddings = self.embedding_model.embed_words( - words=documents, verbose=verbose - ) + embeddings = self.embedding_model.embed_words(words=documents, verbose=verbose) elif method == "document": - embeddings = self.embedding_model.embed_documents( - documents, verbose=verbose - ) + embeddings = self.embedding_model.embed_documents(documents, verbose=verbose) elif documents[0] is None and images is None: raise ValueError( "Make sure to use an embedding model that can either embed documents" @@ -3897,9 +3681,7 @@ def _extract_embeddings( ) return embeddings - def _images_to_text( - self, documents: pd.DataFrame, embeddings: np.ndarray - ) -> pd.DataFrame: + def _images_to_text(self, documents: pd.DataFrame, embeddings: np.ndarray) -> pd.DataFrame: """Convert images to text.""" logger.info("Images - Converting images to text. This might take a while.") if isinstance(self.representation_model, dict): @@ -3912,19 +3694,14 @@ def _images_to_text( documents = tuner.image_to_text(documents, embeddings) elif isinstance(self.representation_model, BaseRepresentation): if getattr(self.representation_model, "image_to_text_model", False): - documents = self.representation_model.image_to_text( - documents, embeddings - ) + documents = self.representation_model.image_to_text(documents, embeddings) logger.info("Images - Completed \u2713") return documents def _map_predictions(self, predictions: List[int]) -> List[int]: """Map predictions to the correct topics if topics were reduced.""" mappings = self.topic_mapper_.get_mappings(original_topics=True) - mapped_predictions = [ - mappings[prediction] if prediction in mappings else -1 - for prediction in predictions - ] + mapped_predictions = [mappings[prediction] if prediction in mappings else -1 for prediction in predictions] return mapped_predictions def _reduce_dimensionality( @@ -4008,12 +3785,8 @@ def _cluster_embeddings( if hasattr(self.hdbscan_model, "probabilities_"): probabilities = self.hdbscan_model.probabilities_ - if self.calculate_probabilities and is_supported_hdbscan( - self.hdbscan_model - ): - probabilities = hdbscan_delegator( - self.hdbscan_model, "all_points_membership_vectors" - ) + if self.calculate_probabilities and is_supported_hdbscan(self.hdbscan_model): + probabilities = hdbscan_delegator(self.hdbscan_model, "all_points_membership_vectors") if not partial_fit: self.topic_mapper_ = TopicMapper(self.topics_) @@ -4037,23 +3810,15 @@ def _zeroshot_topic_modeling( documents: The leftover documents that were not assigned to any topic embeddings: The leftover embeddings that were not assigned to any topic """ - logger.info( - "Zeroshot Step 1 - Finding documents that could be assigned to either one of the zero-shot topics" - ) + logger.info("Zeroshot Step 1 - Finding documents that could be assigned to either one of the zero-shot topics") # Similarity between document and zero-shot topic embeddings zeroshot_embeddings = self._extract_embeddings(self.zeroshot_topic_list) cosine_similarities = cosine_similarity(embeddings, zeroshot_embeddings) assignment = np.argmax(cosine_similarities, 1) assignment_vals = np.max(cosine_similarities, 1) - assigned_ids = [ - index - for index, value in enumerate(assignment_vals) - if value >= self.zeroshot_min_similarity - ] + assigned_ids = [index for index, value in enumerate(assignment_vals) if value >= self.zeroshot_min_similarity] non_assigned_ids = [ - index - for index, value in enumerate(assignment_vals) - if value < self.zeroshot_min_similarity + index for index, value in enumerate(assignment_vals) if value < self.zeroshot_min_similarity ] # Assign topics @@ -4117,32 +3882,22 @@ def _combine_zeroshot_topics( documents: DataFrame with all the original documents with their topic assignments embeddings: np.ndarray of embeddings aligned with the documents """ - logger.info( - "Zeroshot Step 2 - Combining topics from zero-shot topic modeling with topics from clustering..." - ) + logger.info("Zeroshot Step 2 - Combining topics from zero-shot topic modeling with topics from clustering...") # Combine Zero-shot topics with topics from clustering zeroshot_topic_idx_to_topic_id = { zeroshot_topic_id: new_topic_id - for new_topic_id, zeroshot_topic_id in enumerate( - set(assigned_documents.Topic) - ) + for new_topic_id, zeroshot_topic_id in enumerate(set(assigned_documents.Topic)) } self._topic_id_to_zeroshot_topic_idx = { new_topic_id: zeroshot_topic_id - for new_topic_id, zeroshot_topic_id in enumerate( - set(assigned_documents.Topic) - ) + for new_topic_id, zeroshot_topic_id in enumerate(set(assigned_documents.Topic)) } - assigned_documents.Topic = assigned_documents.Topic.map( - zeroshot_topic_idx_to_topic_id - ) + assigned_documents.Topic = assigned_documents.Topic.map(zeroshot_topic_idx_to_topic_id) num_zeroshot_topics = len(zeroshot_topic_idx_to_topic_id) # Insert zeroshot topics between outlier cluster and other clusters documents.Topic = documents.Topic.apply( - lambda topic_id: topic_id + num_zeroshot_topics - if topic_id != -1 - else topic_id + lambda topic_id: topic_id + num_zeroshot_topics if topic_id != -1 else topic_id ) # Combine the clustered documents/embeddings with assigned documents/embeddings in the original order @@ -4159,9 +3914,7 @@ def _combine_zeroshot_topics( logger.info("Zeroshot Step 2 - Completed \u2713") return documents, embeddings - def _guided_topic_modeling( - self, embeddings: np.ndarray - ) -> Tuple[List[int], np.array]: + def _guided_topic_modeling(self, embeddings: np.ndarray) -> Tuple[List[int], np.array]: """Apply Guided Topic Modeling. We transform the seeded topics to embeddings using the @@ -4185,12 +3938,8 @@ def _guided_topic_modeling( logger.info("Guided - Find embeddings highly related to seeded topics.") # Create embeddings from the seeded topics seed_topic_list = [" ".join(seed_topic) for seed_topic in self.seed_topic_list] - seed_topic_embeddings = self._extract_embeddings( - seed_topic_list, verbose=self.verbose - ) - seed_topic_embeddings = np.vstack( - [seed_topic_embeddings, embeddings.mean(axis=0)] - ) + seed_topic_embeddings = self._extract_embeddings(seed_topic_list, verbose=self.verbose) + seed_topic_embeddings = np.vstack([seed_topic_embeddings, embeddings.mean(axis=0)]) # Label documents that are most similar to one of the seeded topics sim_matrix = cosine_similarity(embeddings, seed_topic_embeddings) @@ -4201,9 +3950,7 @@ def _guided_topic_modeling( # embedding of the seeded topic to force the documents in a cluster for seed_topic in range(len(seed_topic_list)): indices = [index for index, topic in enumerate(y) if topic == seed_topic] - embeddings[indices] = np.average( - [embeddings[indices], seed_topic_embeddings[seed_topic]], weights=[3, 1] - ) + embeddings[indices] = np.average([embeddings[indices], seed_topic_embeddings[seed_topic]], weights=[3, 1]) logger.info("Guided - Completed \u2713") return y, embeddings @@ -4226,17 +3973,11 @@ def _extract_topics( c_tf_idf: The resulting matrix giving a value (importance score) for each word per topic """ if verbose: - logger.info( - "Representation - Extracting topics from clusters using representation models." - ) - documents_per_topic = documents.groupby(["Topic"], as_index=False).agg( - {"Document": " ".join} - ) + logger.info("Representation - Extracting topics from clusters using representation models.") + documents_per_topic = documents.groupby(["Topic"], as_index=False).agg({"Document": " ".join}) self.c_tf_idf_, words = self._c_tf_idf(documents_per_topic) self.topic_representations_ = self._extract_words_per_topic(words, documents) - self._create_topic_vectors( - documents=documents, embeddings=embeddings, mappings=mappings - ) + self._create_topic_vectors(documents=documents, embeddings=embeddings, mappings=mappings) if verbose: logger.info("Representation - Completed \u2713") @@ -4310,11 +4051,7 @@ def _extract_representative_docs( selected_docs_ids = selection.index.tolist() # Calculate similarity - nr_docs = ( - nr_repr_docs - if len(selected_docs) > nr_repr_docs - else len(selected_docs) - ) + nr_docs = nr_repr_docs if len(selected_docs) > nr_repr_docs else len(selected_docs) bow = self.vectorizer_model.transform(selected_docs) ctfidf = self.ctfidf_model.transform(bow) sim_matrix = cosine_similarity(ctfidf, c_tf_idf[index]) @@ -4331,28 +4068,14 @@ def _extract_representative_docs( # Extract top n most representative documents else: - indices = np.argpartition(sim_matrix.reshape(1, -1)[0], -nr_docs)[ - -nr_docs: - ] + indices = np.argpartition(sim_matrix.reshape(1, -1)[0], -nr_docs)[-nr_docs:] docs = [selected_docs[index] for index in indices] - doc_ids = [ - selected_docs_ids[index] - for index, doc in enumerate(selected_docs) - if doc in docs - ] + doc_ids = [selected_docs_ids[index] for index, doc in enumerate(selected_docs) if doc in docs] repr_docs_ids.append(doc_ids) repr_docs.extend(docs) - repr_docs_indices.append( - [ - repr_docs_indices[-1][-1] + i + 1 if index != 0 else i - for i in range(nr_docs) - ] - ) - repr_docs_mappings = { - topic: repr_docs[i[0] : i[-1] + 1] - for topic, i in zip(topics.keys(), repr_docs_indices) - } + repr_docs_indices.append([repr_docs_indices[-1][-1] + i + 1 if index != 0 else i for i in range(nr_docs)]) + repr_docs_mappings = {topic: repr_docs[i[0] : i[-1] + 1] for topic, i in zip(topics.keys(), repr_docs_indices)} return repr_docs_mappings, repr_docs, repr_docs_indices, repr_docs_ids @@ -4393,30 +4116,22 @@ def _create_topic_vectors( topic_ids = topics_from["topics_from"] topic_sizes = topics_from["topic_sizes"] if topic_ids: - embds = np.array(self.topic_embeddings_)[ - np.array(topic_ids) + self._outliers - ] + embds = np.array(self.topic_embeddings_)[np.array(topic_ids) + self._outliers] topic_embedding = np.average(embds, axis=0, weights=topic_sizes) topic_embeddings_dict[topic_to] = topic_embedding # Re-order topic embeddings topics_to_map = { - topic_mapping[0]: topic_mapping[1] - for topic_mapping in np.array(self.topic_mapper_.mappings_)[:, -2:] + topic_mapping[0]: topic_mapping[1] for topic_mapping in np.array(self.topic_mapper_.mappings_)[:, -2:] } topic_embeddings = {} for topic, embds in topic_embeddings_dict.items(): topic_embeddings[topics_to_map[topic]] = embds unique_topics = sorted(list(topic_embeddings.keys())) - self.topic_embeddings_ = np.array( - [topic_embeddings[topic] for topic in unique_topics] - ) + self.topic_embeddings_ = np.array([topic_embeddings[topic] for topic in unique_topics]) # Topic embeddings based on keyword representations - elif ( - self.embedding_model is not None - and type(self.embedding_model) is not BaseEmbedder - ): + elif self.embedding_model is not None and type(self.embedding_model) is not BaseEmbedder: topic_list = list(self.topic_representations_.keys()) topic_list.sort() @@ -4428,9 +4143,7 @@ def _create_topic_vectors( # Extract embeddings for all words in all topics topic_words = [self.get_topic(topic) for topic in topic_list] topic_words = [word[0] for topic in topic_words for word in topic] - word_embeddings = self._extract_embeddings( - topic_words, method="word", verbose=False - ) + word_embeddings = self._extract_embeddings(topic_words, method="word", verbose=False) # Take the weighted average of word embeddings in a topic based on their c-TF-IDF value # The embeddings var is a single numpy matrix and therefore slicing is necessary to @@ -4488,33 +4201,16 @@ def _c_tf_idf( if self.ctfidf_model.seed_words and self.seed_topic_list: seed_topic_list = [seed for seeds in self.seed_topic_list for seed in seeds] multiplier = np.array( - [ - self.ctfidf_model.seed_multiplier - if word in self.ctfidf_model.seed_words - else 1 - for word in words - ] - ) - multiplier = np.array( - [ - 1.2 if word in seed_topic_list else value - for value, word in zip(multiplier, words) - ] + [self.ctfidf_model.seed_multiplier if word in self.ctfidf_model.seed_words else 1 for word in words] ) + multiplier = np.array([1.2 if word in seed_topic_list else value for value, word in zip(multiplier, words)]) elif self.ctfidf_model.seed_words: multiplier = np.array( - [ - self.ctfidf_model.seed_multiplier - if word in self.ctfidf_model.seed_words - else 1 - for word in words - ] + [self.ctfidf_model.seed_multiplier if word in self.ctfidf_model.seed_words else 1 for word in words] ) elif self.seed_topic_list: seed_topic_list = [seed for seeds in self.seed_topic_list for seed in seeds] - multiplier = np.array( - [1.2 if word in seed_topic_list else 1 for word in words] - ) + multiplier = np.array([1.2 if word in seed_topic_list else 1 for word in words]) if fit: self.ctfidf_model = self.ctfidf_model.fit(X, multiplier=multiplier) @@ -4572,9 +4268,7 @@ def _extract_words_per_topic( # Get top 30 words per topic based on c-TF-IDF score base_topics = { label: [ - (words[word_index], score) - if word_index is not None and score > 0 - else ("", 0.00001) + (words[word_index], score) if word_index is not None and score > 0 else ("", 0.00001) for word_index, score in zip(indices[index][::-1], scores[index][::-1]) ] for index, label in enumerate(labels) @@ -4584,40 +4278,27 @@ def _extract_words_per_topic( topics = base_topics.copy() if not self.representation_model: # Default representation: c_tf_idf + top_n_words - topics = { - label: values[: self.top_n_words] for label, values in topics.items() - } + topics = {label: values[: self.top_n_words] for label, values in topics.items()} elif isinstance(self.representation_model, list): for tuner in self.representation_model: topics = tuner.extract_topics(self, documents, c_tf_idf, topics) elif isinstance(self.representation_model, BaseRepresentation): - topics = self.representation_model.extract_topics( - self, documents, c_tf_idf, topics - ) + topics = self.representation_model.extract_topics(self, documents, c_tf_idf, topics) elif isinstance(self.representation_model, dict): if self.representation_model.get("Main"): main_model = self.representation_model["Main"] if isinstance(main_model, BaseRepresentation): - topics = main_model.extract_topics( - self, documents, c_tf_idf, topics - ) + topics = main_model.extract_topics(self, documents, c_tf_idf, topics) elif isinstance(main_model, list): for tuner in main_model: topics = tuner.extract_topics(self, documents, c_tf_idf, topics) else: - raise TypeError( - f"unsupported type {type(main_model).__name__} for representation_model['Main']" - ) + raise TypeError(f"unsupported type {type(main_model).__name__} for representation_model['Main']") else: # Default representation: c_tf_idf + top_n_words - topics = { - label: values[: self.top_n_words] - for label, values in topics.items() - } + topics = {label: values[: self.top_n_words] for label, values in topics.items()} else: - raise TypeError( - f"unsupported type {type(self.representation_model).__name__} for representation_model" - ) + raise TypeError(f"unsupported type {type(self.representation_model).__name__} for representation_model") # Extract additional topic aspects if calculate_aspects and isinstance(self.representation_model, dict): @@ -4626,19 +4307,12 @@ def _extract_words_per_topic( aspects = base_topics.copy() if not aspect_model: # Default representation: c_tf_idf + top_n_words - aspects = { - label: values[: self.top_n_words] - for label, values in aspects.items() - } + aspects = {label: values[: self.top_n_words] for label, values in aspects.items()} if isinstance(aspect_model, list): for tuner in aspect_model: - aspects = tuner.extract_topics( - self, documents, c_tf_idf, aspects - ) + aspects = tuner.extract_topics(self, documents, c_tf_idf, aspects) elif isinstance(aspect_model, BaseRepresentation): - aspects = aspect_model.extract_topics( - self, documents, c_tf_idf, aspects - ) + aspects = aspect_model.extract_topics(self, documents, c_tf_idf, aspects) else: raise TypeError( f"unsupported type {type(aspect_model).__name__} for representation_model[{repr(aspect)}]" @@ -4647,9 +4321,7 @@ def _extract_words_per_topic( return topics - def _reduce_topics( - self, documents: pd.DataFrame, use_ctfidf: bool = False - ) -> pd.DataFrame: + def _reduce_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False) -> pd.DataFrame: """Reduce topics to self.nr_topics. Arguments: @@ -4676,9 +4348,7 @@ def _reduce_topics( ) return documents - def _reduce_to_n_topics( - self, documents: pd.DataFrame, use_ctfidf: bool = False - ) -> pd.DataFrame: + def _reduce_to_n_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False) -> pd.DataFrame: """Reduce topics to self.nr_topics. Arguments: @@ -4700,9 +4370,7 @@ def _reduce_to_n_topics( # Cluster the topic embeddings using AgglomerativeClustering if version.parse(sklearn_version) >= version.parse("1.4.0"): - cluster = AgglomerativeClustering( - self.nr_topics - self._outliers, metric="precomputed", linkage="average" - ) + cluster = AgglomerativeClustering(self.nr_topics - self._outliers, metric="precomputed", linkage="average") else: cluster = AgglomerativeClustering( self.nr_topics - self._outliers, @@ -4713,9 +4381,7 @@ def _reduce_to_n_topics( new_topics = [cluster.labels_[topic] if topic != -1 else -1 for topic in topics] # Track mappings and sizes of topics for merging topic embeddings - mapped_topics = { - from_topic: to_topic for from_topic, to_topic in zip(topics, new_topics) - } + mapped_topics = {from_topic: to_topic for from_topic, to_topic in zip(topics, new_topics)} basic_mappings = defaultdict(list) for key, val in sorted(mapped_topics.items()): basic_mappings[val].append(key) @@ -4742,8 +4408,7 @@ def _reduce_to_n_topics( if self._is_zeroshot(): new_topic_id_to_zeroshot_topic_idx = {} topics_to_map = { - topic_mapping[0]: topic_mapping[1] - for topic_mapping in np.array(self.topic_mapper_.mappings_)[:, -2:] + topic_mapping[0]: topic_mapping[1] for topic_mapping in np.array(self.topic_mapper_.mappings_)[:, -2:] } for topic_to, topics_from in basic_mappings.items(): @@ -4753,9 +4418,7 @@ def _reduce_to_n_topics( # which of the original topics are zero-shot zeroshot_topic_ids = [ - topic_id - for topic_id in topics_from - if topic_id in self._topic_id_to_zeroshot_topic_idx + topic_id for topic_id in topics_from if topic_id in self._topic_id_to_zeroshot_topic_idx ] if len(zeroshot_topic_ids) == 0: continue @@ -4763,9 +4426,7 @@ def _reduce_to_n_topics( # If any of the original topics are zero-shot, take the best fitting zero-shot label # if the cosine similarity with the new topic exceeds the zero-shot threshold zeroshot_labels = [ - self.zeroshot_topic_list[ - self._topic_id_to_zeroshot_topic_idx[topic_id] - ] + self.zeroshot_topic_list[self._topic_id_to_zeroshot_topic_idx[topic_id]] for topic_id in zeroshot_topic_ids ] zeroshot_embeddings = self._extract_embeddings(zeroshot_labels) @@ -4775,18 +4436,14 @@ def _reduce_to_n_topics( best_zeroshot_topic_idx = np.argmax(cosine_similarities) best_cosine_similarity = cosine_similarities[best_zeroshot_topic_idx] if best_cosine_similarity >= self.zeroshot_min_similarity: - new_topic_id_to_zeroshot_topic_idx[topic_to] = zeroshot_topic_ids[ - best_zeroshot_topic_idx - ] + new_topic_id_to_zeroshot_topic_idx[topic_to] = zeroshot_topic_ids[best_zeroshot_topic_idx] self._topic_id_to_zeroshot_topic_idx = new_topic_id_to_zeroshot_topic_idx self._update_topic_size(documents) return documents - def _auto_reduce_topics( - self, documents: pd.DataFrame, use_ctfidf: bool = False - ) -> pd.DataFrame: + def _auto_reduce_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False) -> pd.DataFrame: """Reduce the number of topics automatically using HDBSCAN. Arguments: @@ -4819,13 +4476,8 @@ def _auto_reduce_topics( for index, prediction in enumerate(predictions) if prediction != -1 } - documents.Topic = ( - documents.Topic.map(mapped_topics).fillna(documents.Topic).astype(int) - ) - mapped_topics = { - from_topic: to_topic - for from_topic, to_topic in zip(topics, documents.Topic.tolist()) - } + documents.Topic = documents.Topic.map(mapped_topics).fillna(documents.Topic).astype(int) + mapped_topics = {from_topic: to_topic for from_topic, to_topic in zip(topics, documents.Topic.tolist())} # Track mappings and sizes of topics for merging topic embeddings mappings = defaultdict(list) @@ -4873,17 +4525,13 @@ def _sort_mappings_by_frequency(self, documents: pd.DataFrame) -> pd.DataFrame: self._update_topic_size(documents) # Map topics based on frequency - df = pd.DataFrame( - self.topic_sizes_.items(), columns=["Old_Topic", "Size"] - ).sort_values("Size", ascending=False) + df = pd.DataFrame(self.topic_sizes_.items(), columns=["Old_Topic", "Size"]).sort_values("Size", ascending=False) df = df[df.Old_Topic != -1] sorted_topics = {**{-1: -1}, **dict(zip(df.Old_Topic, range(len(df))))} self.topic_mapper_.add_mappings(sorted_topics) # Map documents - documents.Topic = ( - documents.Topic.map(sorted_topics).fillna(documents.Topic).astype(int) - ) + documents.Topic = documents.Topic.map(sorted_topics).fillna(documents.Topic).astype(int) self._update_topic_size(documents) return documents @@ -4918,9 +4566,7 @@ def _map_probabilities( ) for from_topic, to_topic in mappings.items(): if to_topic != -1 and from_topic != -1: - mapped_probabilities[:, to_topic] += probabilities[ - :, from_topic - ] + mapped_probabilities[:, to_topic] += probabilities[:, from_topic] return mapped_probabilities @@ -4936,12 +4582,8 @@ def _preprocess_text(self, documents: np.ndarray) -> List[str]: cleaned_documents = [doc.replace("\n", " ") for doc in documents] cleaned_documents = [doc.replace("\t", " ") for doc in cleaned_documents] if self.language == "english": - cleaned_documents = [ - re.sub(r"[^A-Za-z0-9 ]+", "", doc) for doc in cleaned_documents - ] - cleaned_documents = [ - doc if doc != "" else "emptydoc" for doc in cleaned_documents - ] + cleaned_documents = [re.sub(r"[^A-Za-z0-9 ]+", "", doc) for doc in cleaned_documents] + cleaned_documents = [doc if doc != "" else "emptydoc" for doc in cleaned_documents] return cleaned_documents @staticmethod @@ -4961,13 +4603,8 @@ def _top_n_idx_sparse(matrix: csr_matrix, n: int) -> np.ndarray: indices = [] for le, ri in zip(matrix.indptr[:-1], matrix.indptr[1:]): n_row_pick = min(n, ri - le) - values = matrix.indices[ - le + np.argpartition(matrix.data[le:ri], -n_row_pick)[-n_row_pick:] - ] - values = [ - values[index] if len(values) >= index + 1 else None - for index in range(n) - ] + values = matrix.indices[le + np.argpartition(matrix.data[le:ri], -n_row_pick)[-n_row_pick:]] + values = [values[index] if len(values) >= index + 1 else None for index in range(n)] indices.append(values) return np.array(indices) @@ -4984,9 +4621,7 @@ def _top_n_values_sparse(matrix: csr_matrix, indices: np.ndarray) -> np.ndarray: """ top_values = [] for row, values in enumerate(indices): - scores = np.array( - [matrix[row, value] if value is not None else 0 for value in values] - ) + scores = np.array([matrix[row, value] if value is not None else 0 for value in values]) top_values.append(scores) return np.array(top_values) @@ -4999,11 +4634,7 @@ def _get_param_names(cls): """ init_signature = inspect.signature(cls.__init__) parameters = sorted( - [ - p.name - for p in init_signature.parameters.values() - if p.name != "self" and p.kind != p.VAR_KEYWORD - ] + [p.name for p in init_signature.parameters.values() if p.name != "self" and p.kind != p.VAR_KEYWORD] ) return parameters @@ -5173,22 +4804,16 @@ def _create_model_from_files( **params, ) topic_model.topic_embeddings_ = tensors["topic_embeddings"].numpy() - topic_model.topic_representations_ = { - int(key): val for key, val in topics["topic_representations"].items() - } + topic_model.topic_representations_ = {int(key): val for key, val in topics["topic_representations"].items()} topic_model.topics_ = topics["topics"] - topic_model.topic_sizes_ = { - int(key): val for key, val in topics["topic_sizes"].items() - } + topic_model.topic_sizes_ = {int(key): val for key, val in topics["topic_sizes"].items()} topic_model.custom_labels_ = topics["custom_labels"] if topics.get("topic_aspects"): topic_aspects = {} for aspect, values in topics["topic_aspects"].items(): if aspect != "Visual_Aspect": - topic_aspects[aspect] = { - int(topic): value for topic, value in values.items() - } + topic_aspects[aspect] = {int(topic): value for topic, value in values.items()} topic_model.topic_aspects_ = topic_aspects if images is not None: @@ -5209,20 +4834,12 @@ def _create_model_from_files( ) # CountVectorizer - topic_model.vectorizer_model = CountVectorizer( - **ctfidf_config["vectorizer_model"]["params"] - ) - topic_model.vectorizer_model.vocabulary_ = ctfidf_config["vectorizer_model"][ - "vocab" - ] + topic_model.vectorizer_model = CountVectorizer(**ctfidf_config["vectorizer_model"]["params"]) + topic_model.vectorizer_model.vocabulary_ = ctfidf_config["vectorizer_model"]["vocab"] # ClassTfidfTransformer - topic_model.ctfidf_model.reduce_frequent_words = ctfidf_config["ctfidf_model"][ - "reduce_frequent_words" - ] - topic_model.ctfidf_model.bm25_weighting = ctfidf_config["ctfidf_model"][ - "bm25_weighting" - ] + topic_model.ctfidf_model.reduce_frequent_words = ctfidf_config["ctfidf_model"]["reduce_frequent_words"] + topic_model.ctfidf_model.bm25_weighting = ctfidf_config["ctfidf_model"]["bm25_weighting"] idf = ctfidf_tensors["diag"].numpy() topic_model.ctfidf_model._idf_diag = sp.diags( idf, offsets=0, shape=(len(idf), len(idf)), format="csr", dtype=np.float64 diff --git a/bertopic/_save_utils.py b/bertopic/_save_utils.py index a01ba691..845e0f75 100644 --- a/bertopic/_save_utils.py +++ b/bertopic/_save_utils.py @@ -135,9 +135,7 @@ def push_to_hf_hub( save_ctfidf: Whether to save c-TF-IDF information """ if not _has_hf_hub: - raise ValueError( - "Make sure you have the huggingface hub installed via `pip install --upgrade huggingface_hub`" - ) + raise ValueError("Make sure you have the huggingface hub installed via `pip install --upgrade huggingface_hub`") # Create repo if it doesn't exist yet and infer complete repo_id repo_url = create_repo(repo_id, token=token, private=private, exist_ok=True) @@ -156,9 +154,7 @@ def push_to_hf_hub( # Add README if it does not exist try: - get_hf_file_metadata( - hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision) - ) + get_hf_file_metadata(hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision)) except: # noqa: E722 if model_card: readme_text = generate_readme(model, repo_id) @@ -241,13 +237,9 @@ def load_files_from_hf(path): # c-TF-IDF try: - ctfidf_config = load_cfg_from_json( - hf_hub_download(path, CTFIDF_CFG_NAME, revision=None) - ) + ctfidf_config = load_cfg_from_json(hf_hub_download(path, CTFIDF_CFG_NAME, revision=None)) try: - ctfidf_tensors = hf_hub_download( - path, CTFIDF_SAFE_WEIGHTS_NAME, revision=None - ) + ctfidf_tensors = hf_hub_download(path, CTFIDF_SAFE_WEIGHTS_NAME, revision=None) ctfidf_tensors = load_safetensors(ctfidf_tensors) except: # noqa: E722 ctfidf_tensors = hf_hub_download(path, CTFIDF_WEIGHTS_NAME, revision=None) @@ -268,9 +260,7 @@ def load_files_from_hf(path): topic_list = list(topics["topic_representations"].keys()) images = {} for topic in topic_list: - image = Image.open( - hf_hub_download(path, f"images/{topic}.jpg", revision=None) - ) + image = Image.open(hf_hub_download(path, f"images/{topic}.jpg", revision=None)) images[int(topic)] = image return topics, params, tensors, ctfidf_tensors, ctfidf_config, images @@ -283,11 +273,7 @@ def generate_readme(model, repo_id: str): # Get Statistics model_name = repo_id.split("/")[-1] - params = { - param: value - for param, value in model.get_params().items() - if "model" not in param - } + params = {param: value for param, value in model.get_params().items() if "model" not in param} params = "\n".join([f"* {param}: {value}" for param, value in params.items()]) topics = sorted(list(set(model.topics_))) nr_topics = str(len(set(model.topics_))) @@ -298,23 +284,15 @@ def generate_readme(model, repo_id: str): nr_documents = "" # Topic information - topic_keywords = [ - " - ".join(list(zip(*model.get_topic(topic)))[0][:5]) for topic in topics - ] + topic_keywords = [" - ".join(list(zip(*model.get_topic(topic)))[0][:5]) for topic in topics] topic_freq = [model.get_topic_freq(topic) for topic in topics] - topic_labels = ( - model.custom_labels_ - if model.custom_labels_ - else [model.topic_labels_[topic] for topic in topics] - ) + topic_labels = model.custom_labels_ if model.custom_labels_ else [model.topic_labels_[topic] for topic in topics] topics = [ f"| {topic} | {topic_keywords[index]} | {topic_freq[topic]} | {topic_labels[index]} | \n" for index, topic in enumerate(topics) ] topics = topic_table_head + "".join(topics) - frameworks = "\n".join( - [f"* {param}: {value}" for param, value in get_package_versions().items()] - ) + frameworks = "\n".join([f"* {param}: {value}" for param, value in get_package_versions().items()]) # Fill Statistics into model card model_card = model_card.replace("{MODEL_NAME}", model_name) @@ -330,9 +308,7 @@ def generate_readme(model, repo_id: str): if not has_visual_aspect: model_card = model_card.replace("{PIPELINE_TAG}", "text-classification") else: - model_card = model_card.replace( - "pipeline_tag: {PIPELINE_TAG}\n", "" - ) # TODO add proper tag for this instance + model_card = model_card.replace("pipeline_tag: {PIPELINE_TAG}\n", "") # TODO add proper tag for this instance return model_card diff --git a/bertopic/_utils.py b/bertopic/_utils.py index 0695b7cf..6c859041 100644 --- a/bertopic/_utils.py +++ b/bertopic/_utils.py @@ -45,20 +45,14 @@ def check_documents_type(documents): if not any([isinstance(doc, str) for doc in documents]): raise TypeError("Make sure that the iterable only contains strings.") else: - raise TypeError( - "Make sure that the documents variable is an iterable containing strings only." - ) + raise TypeError("Make sure that the documents variable is an iterable containing strings only.") def check_embeddings_shape(embeddings, docs): """Check if the embeddings have the correct shape.""" if embeddings is not None: - if not any( - [isinstance(embeddings, np.ndarray), isinstance(embeddings, csr_matrix)] - ): - raise ValueError( - "Make sure to input embeddings as a numpy array or scipy.sparse.csr.csr_matrix. " - ) + if not any([isinstance(embeddings, np.ndarray), isinstance(embeddings, csr_matrix)]): + raise ValueError("Make sure to input embeddings as a numpy array or scipy.sparse.csr.csr_matrix. ") else: if embeddings.shape[0] != len(docs): raise ValueError( @@ -137,16 +131,11 @@ def validate_distance_matrix(X, n_samples): # check it has correct size n = s[0] if n != (n_samples * (n_samples - 1) / 2): - raise ValueError( - "The condensed distance matrix must have " "shape (n*(n-1)/2,)." - ) + raise ValueError("The condensed distance matrix must have " "shape (n*(n-1)/2,).") elif len(s) == 2: # check it has correct size if (s[0] != n_samples) or (s[1] != n_samples): - raise ValueError( - "The distance matrix must be of shape " - "(n, n) where n is the number of samples." - ) + raise ValueError("The distance matrix must be of shape " "(n, n) where n is the number of samples.") # force zero diagonal and convert to condensed np.fill_diagonal(X, 0) X = squareform(X) @@ -182,15 +171,11 @@ def get_unique_distances(dists: np.array, noise_max=1e-7) -> np.array: for i in range(dists.shape[0] - 1): if dists[i] == dists[i + 1]: # returns the next unique distance or the current distance with the added noise - next_unique_dist = next( - (d for d in dists[i + 1 :] if d != dists[i]), dists[i] + noise_max - ) + next_unique_dist = next((d for d in dists[i + 1 :] if d != dists[i]), dists[i] + noise_max) # the noise can never be large then the difference between the next unique distance and the current one curr_max_noise = min(noise_max, next_unique_dist - dists_cp[i]) - dists_cp[i + 1] = np.random.uniform( - low=dists_cp[i] + curr_max_noise / 2, high=dists_cp[i] + curr_max_noise - ) + dists_cp[i + 1] = np.random.uniform(low=dists_cp[i] + curr_max_noise / 2, high=dists_cp[i] + curr_max_noise) return dists_cp diff --git a/bertopic/backend/_flair.py b/bertopic/backend/_flair.py index 2abeec49..f6e27fea 100644 --- a/bertopic/backend/_flair.py +++ b/bertopic/backend/_flair.py @@ -67,9 +67,7 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray: embeddings = [] for document in tqdm(documents, disable=not verbose): try: - sentence = ( - Sentence(document) if document else Sentence("an empty document") - ) + sentence = Sentence(document) if document else Sentence("an empty document") self.embedding_model.embed(sentence) except RuntimeError: sentence = Sentence("an empty document") diff --git a/bertopic/backend/_gensim.py b/bertopic/backend/_gensim.py index 3727e04d..d76fff17 100644 --- a/bertopic/backend/_gensim.py +++ b/bertopic/backend/_gensim.py @@ -48,9 +48,7 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray: Document/words embeddings with shape (n, m) with `n` documents/words that each have an embeddings size of `m` """ - vector_shape = self.embedding_model.get_vector( - list(self.embedding_model.index_to_key)[0] - ).shape[0] + vector_shape = self.embedding_model.get_vector(list(self.embedding_model.index_to_key)[0]).shape[0] empty_vector = np.zeros(vector_shape) # Extract word embeddings and pool to document-level diff --git a/bertopic/backend/_hftransformers.py b/bertopic/backend/_hftransformers.py index 8de9cc2a..344412e9 100644 --- a/bertopic/backend/_hftransformers.py +++ b/bertopic/backend/_hftransformers.py @@ -58,9 +58,7 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray: embeddings = [] for document, features in tqdm( - zip( - documents, self.embedding_model(dataset, truncation=True, padding=True) - ), + zip(documents, self.embedding_model(dataset, truncation=True, padding=True)), total=len(dataset), disable=not verbose, ): @@ -79,12 +77,10 @@ def _embed(self, document: str, features: np.ndarray) -> np.ndarray: https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2#usage-huggingface-transformers """ token_embeddings = np.array(features) - attention_mask = self.embedding_model.tokenizer( - document, truncation=True, padding=True, return_tensors="np" - )["attention_mask"] - input_mask_expanded = np.broadcast_to( - np.expand_dims(attention_mask, -1), token_embeddings.shape - ) + attention_mask = self.embedding_model.tokenizer(document, truncation=True, padding=True, return_tensors="np")[ + "attention_mask" + ] + input_mask_expanded = np.broadcast_to(np.expand_dims(attention_mask, -1), token_embeddings.shape) sum_embeddings = np.sum(token_embeddings * input_mask_expanded, 1) sum_mask = np.clip( input_mask_expanded.sum(1), diff --git a/bertopic/backend/_multimodal.py b/bertopic/backend/_multimodal.py index 846efc41..e1aac8d3 100644 --- a/bertopic/backend/_multimodal.py +++ b/bertopic/backend/_multimodal.py @@ -84,9 +84,7 @@ def __init__( except: # noqa: E722 self.tokenizer = None - def embed( - self, documents: List[str], images: List[str] = None, verbose: bool = False - ) -> np.ndarray: + def embed(self, documents: List[str], images: List[str] = None, verbose: bool = False) -> np.ndarray: """Embed a list of n documents/words or images into an n-dimensional matrix of embeddings. @@ -124,9 +122,7 @@ def embed( elif image_embeddings is not None: return image_embeddings - def embed_documents( - self, documents: List[str], verbose: bool = False - ) -> np.ndarray: + def embed_documents(self, documents: List[str], verbose: bool = False) -> np.ndarray: """Embed a list of n documents/words into an n-dimensional matrix of embeddings. @@ -139,9 +135,7 @@ def embed_documents( that each have an embeddings size of `m` """ truncated_docs = [self._truncate_document(doc) for doc in documents] - embeddings = self.embedding_model.encode( - truncated_docs, show_progress_bar=verbose - ) + embeddings = self.embedding_model.encode(truncated_docs, show_progress_bar=verbose) return embeddings def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray: @@ -170,15 +164,12 @@ def embed_images(self, images, verbose): end_index = (i * self.batch_size) + self.batch_size images_to_embed = [ - Image.open(image) if isinstance(image, str) else image - for image in images[start_index:end_index] + Image.open(image) if isinstance(image, str) else image for image in images[start_index:end_index] ] if self.image_model is not None: img_emb = self.image_model.encode(images_to_embed) else: - img_emb = self.embedding_model.encode( - images_to_embed, show_progress_bar=False - ) + img_emb = self.embedding_model.encode(images_to_embed, show_progress_bar=False) embeddings.extend(img_emb.tolist()) # Close images @@ -191,9 +182,7 @@ def embed_images(self, images, verbose): if self.image_model is not None: embeddings = self.image_model.encode(images_to_embed) else: - embeddings = self.embedding_model.encode( - images_to_embed, show_progress_bar=False - ) + embeddings = self.embedding_model.encode(images_to_embed, show_progress_bar=False) return embeddings def _truncate_document(self, document): diff --git a/bertopic/backend/_openai.py b/bertopic/backend/_openai.py index 19d18268..7a4cc6b3 100644 --- a/bertopic/backend/_openai.py +++ b/bertopic/backend/_openai.py @@ -70,9 +70,7 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray: if self.batch_size is not None: embeddings = [] for batch in tqdm(self._chunks(prepared_documents), disable=not verbose): - response = self.client.embeddings.create( - input=batch, **self.generator_kwargs - ) + response = self.client.embeddings.create(input=batch, **self.generator_kwargs) embeddings.extend([r.embedding for r in response.data]) # Delay subsequent calls @@ -81,9 +79,7 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray: # Extract embeddings all at once else: - response = self.client.embeddings.create( - input=prepared_documents, **self.generator_kwargs - ) + response = self.client.embeddings.create(input=prepared_documents, **self.generator_kwargs) embeddings = [r.embedding for r in response.data] return np.array(embeddings) diff --git a/bertopic/backend/_use.py b/bertopic/backend/_use.py index c33c76fc..a17a87d1 100644 --- a/bertopic/backend/_use.py +++ b/bertopic/backend/_use.py @@ -50,9 +50,6 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray: that each have an embeddings size of `m` """ embeddings = np.array( - [ - self.embedding_model([doc]).cpu().numpy()[0] - for doc in tqdm(documents, disable=not verbose) - ] + [self.embedding_model([doc]).cpu().numpy()[0] for doc in tqdm(documents, disable=not verbose)] ) return embeddings diff --git a/bertopic/backend/_utils.py b/bertopic/backend/_utils.py index 7c78d32e..4190bd4e 100644 --- a/bertopic/backend/_utils.py +++ b/bertopic/backend/_utils.py @@ -68,9 +68,7 @@ ] -def select_backend( - embedding_model, language: str = None, verbose: bool = False -) -> BaseEmbedder: +def select_backend(embedding_model, language: str = None, verbose: bool = False) -> BaseEmbedder: """Select an embedding model based on language or a specific provided model. When selecting a language, we choose all-MiniLM-L6-v2 for English and paraphrase-multilingual-MiniLM-L12-v2 for all other languages as it support 100+ languages. @@ -115,9 +113,7 @@ def select_backend( return USEBackend(embedding_model) # Sentence Transformer embeddings - if "sentence_transformers" in str(type(embedding_model)) or isinstance( - embedding_model, str - ): + if "sentence_transformers" in str(type(embedding_model)) or isinstance(embedding_model, str): from ._sentencetransformers import SentenceTransformerBackend return SentenceTransformerBackend(embedding_model) @@ -134,13 +130,9 @@ def select_backend( from ._sentencetransformers import SentenceTransformerBackend if language.lower() in ["English", "english", "en"]: - return SentenceTransformerBackend( - "sentence-transformers/all-MiniLM-L6-v2" - ) + return SentenceTransformerBackend("sentence-transformers/all-MiniLM-L6-v2") elif language.lower() in languages or language == "multilingual": - return SentenceTransformerBackend( - "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" - ) + return SentenceTransformerBackend("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2") else: raise ValueError( f"{language} is currently not supported. However, you can " diff --git a/bertopic/cluster/_utils.py b/bertopic/cluster/_utils.py index 82f243c6..375a15b3 100644 --- a/bertopic/cluster/_utils.py +++ b/bertopic/cluster/_utils.py @@ -25,9 +25,7 @@ def hdbscan_delegator(model, func: str, embeddings: np.ndarray = None): if "cuml" in str_type_model and "hdbscan" in str_type_model: from cuml.cluster import hdbscan as cuml_hdbscan - predictions, probabilities = cuml_hdbscan.approximate_predict( - model, embeddings - ) + predictions, probabilities = cuml_hdbscan.approximate_predict(model, embeddings) return predictions, probabilities predictions = model.predict(embeddings) diff --git a/bertopic/plotting/_approximate_distribution.py b/bertopic/plotting/_approximate_distribution.py index a6380273..d5c0bd60 100644 --- a/bertopic/plotting/_approximate_distribution.py +++ b/bertopic/plotting/_approximate_distribution.py @@ -86,9 +86,7 @@ def text_color(val): def highligh_color(data, color="white"): attr = "background-color: {}".format(color) - return pd.DataFrame( - np.where(data == 0, attr, ""), index=data.index, columns=data.columns - ) + return pd.DataFrame(np.where(data == 0, attr, ""), index=data.index, columns=data.columns) if len(df) == 0: return df diff --git a/bertopic/plotting/_barchart.py b/bertopic/plotting/_barchart.py index 417e2c0f..a6e614cb 100644 --- a/bertopic/plotting/_barchart.py +++ b/bertopic/plotting/_barchart.py @@ -52,9 +52,7 @@ def visualize_barchart( """ - colors = itertools.cycle( - ["#D55E00", "#0072B2", "#CC79A7", "#E69F00", "#56B4E9", "#009E73", "#F0E442"] - ) + colors = itertools.cycle(["#D55E00", "#0072B2", "#CC79A7", "#E69F00", "#56B4E9", "#009E73", "#F0E442"]) # Select topics based on top_n and topics args freq_df = topic_model.get_topic_freq() @@ -68,21 +66,11 @@ def visualize_barchart( # Initialize figure if isinstance(custom_labels, str): - subplot_titles = [ - [[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] - for topic in topics - ] - subplot_titles = [ - "_".join([label[0] for label in labels[:4]]) for labels in subplot_titles - ] - subplot_titles = [ - label if len(label) < 30 else label[:27] + "..." for label in subplot_titles - ] + subplot_titles = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in topics] + subplot_titles = ["_".join([label[0] for label in labels[:4]]) for labels in subplot_titles] + subplot_titles = [label if len(label) < 30 else label[:27] + "..." for label in subplot_titles] elif topic_model.custom_labels_ is not None and custom_labels: - subplot_titles = [ - topic_model.custom_labels_[topic + topic_model._outliers] - for topic in topics - ] + subplot_titles = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in topics] else: subplot_titles = [f"Topic {topic}" for topic in topics] columns = 4 @@ -100,9 +88,7 @@ def visualize_barchart( row = 1 column = 1 for topic in topics: - words = [word + " " for word, _ in topic_model.get_topic(topic)][:n_words][ - ::-1 - ] + words = [word + " " for word, _ in topic_model.get_topic(topic)][:n_words][::-1] scores = [score for _, score in topic_model.get_topic(topic)][:n_words][::-1] fig.add_trace( diff --git a/bertopic/plotting/_datamap.py b/bertopic/plotting/_datamap.py index a793e4fc..a0e02c18 100644 --- a/bertopic/plotting/_datamap.py +++ b/bertopic/plotting/_datamap.py @@ -106,17 +106,13 @@ def visualize_document_datamap( # Extract embeddings if not already done if embeddings is None and reduced_embeddings is None: - embeddings_to_reduce = topic_model._extract_embeddings( - df.doc.to_list(), method="document" - ) + embeddings_to_reduce = topic_model._extract_embeddings(df.doc.to_list(), method="document") else: embeddings_to_reduce = embeddings # Reduce input embeddings if reduced_embeddings is None: - umap_model = UMAP( - n_neighbors=15, n_components=2, min_dist=0.15, metric="cosine" - ).fit(embeddings_to_reduce) + umap_model = UMAP(n_neighbors=15, n_components=2, min_dist=0.15, metric="cosine").fit(embeddings_to_reduce) embeddings_2d = umap_model.embedding_ else: embeddings_2d = reduced_embeddings @@ -125,27 +121,18 @@ def visualize_document_datamap( # Prepare text and names if isinstance(custom_labels, str): - names = [ - [[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] - for topic in unique_topics - ] + names = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in unique_topics] names = [" ".join([label[0] for label in labels[:4]]) for labels in names] names = [label if len(label) < 30 else label[:27] + "..." for label in names] elif topic_model.custom_labels_ is not None and custom_labels: - names = [ - topic_model.custom_labels_[topic + topic_model._outliers] - for topic in unique_topics - ] + names = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in unique_topics] else: names = [ - f"Topic-{topic}: " - + " ".join([word for word, value in topic_model.get_topic(topic)][:3]) + f"Topic-{topic}: " + " ".join([word for word, value in topic_model.get_topic(topic)][:3]) for topic in unique_topics ] - topic_name_mapping = { - topic_num: topic_name for topic_num, topic_name in zip(unique_topics, names) - } + topic_name_mapping = {topic_num: topic_name for topic_num, topic_name in zip(unique_topics, names)} topic_name_mapping[-1] = "Unlabelled" # If a set of topics is chosen, set everything else to "Unlabelled" diff --git a/bertopic/plotting/_distribution.py b/bertopic/plotting/_distribution.py index d04d140b..c04a851b 100644 --- a/bertopic/plotting/_distribution.py +++ b/bertopic/plotting/_distribution.py @@ -60,17 +60,11 @@ def visualize_distribution( # Create labels if isinstance(custom_labels, str): - labels = [ - [[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] - for topic in labels_idx - ] + labels = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in labels_idx] labels = ["_".join([label[0] for label in l[:4]]) for l in labels] # noqa: E741 labels = [label if len(label) < 30 else label[:27] + "..." for label in labels] elif topic_model.custom_labels_ is not None and custom_labels: - labels = [ - topic_model.custom_labels_[idx + topic_model._outliers] - for idx in labels_idx - ] + labels = [topic_model.custom_labels_[idx + topic_model._outliers] for idx in labels_idx] else: labels = [] for idx in labels_idx: diff --git a/bertopic/plotting/_documents.py b/bertopic/plotting/_documents.py index 0c5287b4..e1a3f1d3 100644 --- a/bertopic/plotting/_documents.py +++ b/bertopic/plotting/_documents.py @@ -109,24 +109,18 @@ def visualize_documents( # Extract embeddings if not already done if sample is None: if embeddings is None and reduced_embeddings is None: - embeddings_to_reduce = topic_model._extract_embeddings( - df.doc.to_list(), method="document" - ) + embeddings_to_reduce = topic_model._extract_embeddings(df.doc.to_list(), method="document") else: embeddings_to_reduce = embeddings else: if embeddings is not None: embeddings_to_reduce = embeddings[indices] elif embeddings is None and reduced_embeddings is None: - embeddings_to_reduce = topic_model._extract_embeddings( - df.doc.to_list(), method="document" - ) + embeddings_to_reduce = topic_model._extract_embeddings(df.doc.to_list(), method="document") # Reduce input embeddings if reduced_embeddings is None: - umap_model = UMAP( - n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine" - ).fit(embeddings_to_reduce) + umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine").fit(embeddings_to_reduce) embeddings_2d = umap_model.embedding_ elif sample is not None and reduced_embeddings is not None: embeddings_2d = reduced_embeddings[indices] @@ -143,21 +137,14 @@ def visualize_documents( # Prepare text and names if isinstance(custom_labels, str): - names = [ - [[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] - for topic in unique_topics - ] + names = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in unique_topics] names = ["_".join([label[0] for label in labels[:4]]) for labels in names] names = [label if len(label) < 30 else label[:27] + "..." for label in names] elif topic_model.custom_labels_ is not None and custom_labels: - names = [ - topic_model.custom_labels_[topic + topic_model._outliers] - for topic in unique_topics - ] + names = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in unique_topics] else: names = [ - f"{topic}_" - + "_".join([word for word, value in topic_model.get_topic(topic)][:3]) + f"{topic}_" + "_".join([word for word, value in topic_model.get_topic(topic)][:3]) for topic in unique_topics ] @@ -248,12 +235,8 @@ def visualize_documents( y1=sum(y_range) / 2, line=dict(color="#9E9E9E", width=2), ) - fig.add_annotation( - x=x_range[0], y=sum(y_range) / 2, text="D1", showarrow=False, yshift=10 - ) - fig.add_annotation( - y=y_range[1], x=sum(x_range) / 2, text="D2", showarrow=False, xshift=10 - ) + fig.add_annotation(x=x_range[0], y=sum(y_range) / 2, text="D1", showarrow=False, yshift=10) + fig.add_annotation(y=y_range[1], x=sum(x_range) / 2, text="D2", showarrow=False, xshift=10) # Stylize layout fig.update_layout( diff --git a/bertopic/plotting/_heatmap.py b/bertopic/plotting/_heatmap.py index ad9f0664..9e51f13e 100644 --- a/bertopic/plotting/_heatmap.py +++ b/bertopic/plotting/_heatmap.py @@ -59,9 +59,9 @@ def visualize_heatmap( """ - embeddings = select_topic_representation( - topic_model.c_tf_idf_, topic_model.topic_embeddings_, use_ctfidf - )[0][topic_model._outliers :] + embeddings = select_topic_representation(topic_model.c_tf_idf_, topic_model.topic_embeddings_, use_ctfidf)[0][ + topic_model._outliers : + ] # Select topics based on top_n and topics args freq_df = topic_model.get_topic_freq() @@ -77,10 +77,7 @@ def visualize_heatmap( sorted_topics = topics if n_clusters: if n_clusters >= len(set(topics)): - raise ValueError( - "Make sure to set `n_clusters` lower than " - "the total number of unique topics." - ) + raise ValueError("Make sure to set `n_clusters` lower than " "the total number of unique topics.") distance_matrix = cosine_similarity(embeddings[topics]) Z = linkage(distance_matrix, "ward") @@ -101,31 +98,16 @@ def visualize_heatmap( # Create labels if isinstance(custom_labels, str): new_labels = [ - [[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] - for topic in sorted_topics - ] - new_labels = [ - "_".join([label[0] for label in labels[:4]]) for labels in new_labels - ] - new_labels = [ - label if len(label) < 30 else label[:27] + "..." for label in new_labels + [[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in sorted_topics ] + new_labels = ["_".join([label[0] for label in labels[:4]]) for labels in new_labels] + new_labels = [label if len(label) < 30 else label[:27] + "..." for label in new_labels] elif topic_model.custom_labels_ is not None and custom_labels: - new_labels = [ - topic_model.custom_labels_[topic + topic_model._outliers] - for topic in sorted_topics - ] + new_labels = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in sorted_topics] else: - new_labels = [ - [[str(topic), None]] + topic_model.get_topic(topic) - for topic in sorted_topics - ] - new_labels = [ - "_".join([label[0] for label in labels[:4]]) for labels in new_labels - ] - new_labels = [ - label if len(label) < 30 else label[:27] + "..." for label in new_labels - ] + new_labels = [[[str(topic), None]] + topic_model.get_topic(topic) for topic in sorted_topics] + new_labels = ["_".join([label[0] for label in labels[:4]]) for labels in new_labels] + new_labels = [label if len(label) < 30 else label[:27] + "..." for label in new_labels] fig = px.imshow( distance_matrix, diff --git a/bertopic/plotting/_hierarchical_documents.py b/bertopic/plotting/_hierarchical_documents.py index 5501c8b7..2da9c83b 100644 --- a/bertopic/plotting/_hierarchical_documents.py +++ b/bertopic/plotting/_hierarchical_documents.py @@ -133,24 +133,18 @@ def visualize_hierarchical_documents( # Extract embeddings if not already done if sample is None: if embeddings is None and reduced_embeddings is None: - embeddings_to_reduce = topic_model._extract_embeddings( - df.doc.to_list(), method="document" - ) + embeddings_to_reduce = topic_model._extract_embeddings(df.doc.to_list(), method="document") else: embeddings_to_reduce = embeddings else: if embeddings is not None: embeddings_to_reduce = embeddings[indices] elif embeddings is None and reduced_embeddings is None: - embeddings_to_reduce = topic_model._extract_embeddings( - df.doc.to_list(), method="document" - ) + embeddings_to_reduce = topic_model._extract_embeddings(df.doc.to_list(), method="document") # Reduce input embeddings if reduced_embeddings is None: - umap_model = UMAP( - n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine" - ).fit(embeddings_to_reduce) + umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine").fit(embeddings_to_reduce) embeddings_2d = umap_model.embedding_ elif sample is not None and reduced_embeddings is not None: embeddings_2d = reduced_embeddings[indices] @@ -179,8 +173,7 @@ def visualize_hierarchical_documents( max_distances = [distances[i] for i in log_indices] elif level_scale == "lin" or level_scale == "linear": max_distances = [ - distances[indices[-1]] - for indices in np.array_split(range(len(hierarchical_topics)), nr_levels) + distances[indices[-1]] for indices in np.array_split(range(len(hierarchical_topics)), nr_levels) ][::-1] else: raise ValueError("level_scale needs to be one of 'log' or 'linear'") @@ -188,9 +181,7 @@ def visualize_hierarchical_documents( for index, max_distance in enumerate(max_distances): # Get topics below `max_distance` mapping = {topic: topic for topic in df.topic.unique()} - selection = hierarchical_topics.loc[ - hierarchical_topics.Distance <= max_distance, : - ] + selection = hierarchical_topics.loc[hierarchical_topics.Distance <= max_distance, :] selection.Parent_ID = selection.Parent_ID.astype(int) selection = selection.sort_values("Parent_ID") @@ -219,18 +210,12 @@ def visualize_hierarchical_documents( if topic_model.get_topic(topic): if isinstance(custom_labels, str): trace_name = f"{topic}_" + "_".join( - list(zip(*topic_model.topic_aspects_[custom_labels][topic]))[0][ - :3 - ] + list(zip(*topic_model.topic_aspects_[custom_labels][topic]))[0][:3] ) elif topic_model.custom_labels_ is not None and custom_labels: - trace_name = topic_model.custom_labels_[ - topic + topic_model._outliers - ] + trace_name = topic_model.custom_labels_[topic + topic_model._outliers] else: - trace_name = f"{topic}_" + "_".join( - [word[:20] for word, _ in topic_model.get_topic(topic)][:3] - ) + trace_name = f"{topic}_" + "_".join([word[:20] for word, _ in topic_model.get_topic(topic)][:3]) topic_names[topic] = { "trace_name": trace_name[:40], "plot_text": trace_name[:40], @@ -239,9 +224,7 @@ def visualize_hierarchical_documents( else: trace_name = ( f"{topic}_" - + hierarchical_topics.loc[ - hierarchical_topics.Parent_ID == str(topic), "Parent_Name" - ].values[0] + + hierarchical_topics.loc[hierarchical_topics.Parent_ID == str(topic), "Parent_Name"].values[0] ) plot_text = "_".join([name[:20] for name in trace_name.split("_")[:3]]) topic_names[topic] = { @@ -264,9 +247,7 @@ def visualize_hierarchical_documents( mode="markers+text", name="other", hoverinfo="text", - hovertext=df.loc[(df[f"level_{level+1}"] == -1), "doc"] - if not hide_document_hover - else None, + hovertext=df.loc[(df[f"level_{level+1}"] == -1), "doc"] if not hide_document_hover else None, showlegend=False, marker=dict(color="#CFD8DC", size=5, opacity=0.5), ) @@ -275,20 +256,14 @@ def visualize_hierarchical_documents( # Selected topics if topics: selection = df.loc[(df.topic.isin(topics)), :] - unique_topics = sorted( - [int(topic) for topic in selection[f"level_{level+1}"].unique()] - ) + unique_topics = sorted([int(topic) for topic in selection[f"level_{level+1}"].unique()]) else: - unique_topics = sorted( - [int(topic) for topic in df[f"level_{level+1}"].unique()] - ) + unique_topics = sorted([int(topic) for topic in df[f"level_{level+1}"].unique()]) for topic in unique_topics: if topic != -1: if topics: - selection = df.loc[ - (df[f"level_{level+1}"] == topic) & (df.topic.isin(topics)), : - ] + selection = df.loc[(df[f"level_{level+1}"] == topic) & (df.topic.isin(topics)), :] else: selection = df.loc[df[f"level_{level+1}"] == topic, :] @@ -297,9 +272,7 @@ def visualize_hierarchical_documents( selection["text"] = "" selection.loc[len(selection) - 1, "x"] = selection.x.mean() selection.loc[len(selection) - 1, "y"] = selection.y.mean() - selection.loc[len(selection) - 1, "text"] = topic_names[int(topic)][ - "plot_text" - ] + selection.loc[len(selection) - 1, "text"] = topic_names[int(topic)]["plot_text"] traces.append( go.Scattergl( @@ -373,12 +346,8 @@ def visualize_hierarchical_documents( y1=sum(y_range) / 2, line=dict(color="#9E9E9E", width=2), ) - fig.add_annotation( - x=x_range[0], y=sum(y_range) / 2, text="D1", showarrow=False, yshift=10 - ) - fig.add_annotation( - y=y_range[1], x=sum(x_range) / 2, text="D2", showarrow=False, xshift=10 - ) + fig.add_annotation(x=x_range[0], y=sum(y_range) / 2, text="D1", showarrow=False, yshift=10) + fig.add_annotation(y=y_range[1], x=sum(x_range) / 2, text="D2", showarrow=False, xshift=10) # Stylize layout fig.update_layout( diff --git a/bertopic/plotting/_hierarchy.py b/bertopic/plotting/_hierarchy.py index 6faa1bc4..2e6e6b23 100644 --- a/bertopic/plotting/_hierarchy.py +++ b/bertopic/plotting/_hierarchy.py @@ -123,9 +123,9 @@ def visualize_hierarchy( indices = np.array([all_topics.index(topic) for topic in topics]) # Select topic embeddings - embeddings = select_topic_representation( - topic_model.c_tf_idf_, topic_model.topic_embeddings_, use_ctfidf - )[0][indices] + embeddings = select_topic_representation(topic_model.c_tf_idf_, topic_model.topic_embeddings_, use_ctfidf)[0][ + indices + ] # Annotations if hierarchical_topics is not None and len(topics) == len(freq_df.Topic.to_list()): @@ -142,9 +142,7 @@ def visualize_hierarchy( annotations = None # wrap distance function to validate input and return a condensed distance matrix - distance_function_viz = lambda x: validate_distance_matrix( - distance_function(x), embeddings.shape[0] - ) + distance_function_viz = lambda x: validate_distance_matrix(distance_function(x), embeddings.shape[0]) # Create dendogram fig = ff.create_dendrogram( embeddings, @@ -159,31 +157,20 @@ def visualize_hierarchy( axis = "yaxis" if orientation == "left" else "xaxis" if isinstance(custom_labels, str): new_labels = [ - [[str(x), None]] + topic_model.topic_aspects_[custom_labels][x] - for x in fig.layout[axis]["ticktext"] - ] - new_labels = [ - "_".join([label[0] for label in labels[:4]]) for labels in new_labels - ] - new_labels = [ - label if len(label) < 30 else label[:27] + "..." for label in new_labels + [[str(x), None]] + topic_model.topic_aspects_[custom_labels][x] for x in fig.layout[axis]["ticktext"] ] + new_labels = ["_".join([label[0] for label in labels[:4]]) for labels in new_labels] + new_labels = [label if len(label) < 30 else label[:27] + "..." for label in new_labels] elif topic_model.custom_labels_ is not None and custom_labels: new_labels = [ - topic_model.custom_labels_[topics[int(x)] + topic_model._outliers] - for x in fig.layout[axis]["ticktext"] + topic_model.custom_labels_[topics[int(x)] + topic_model._outliers] for x in fig.layout[axis]["ticktext"] ] else: new_labels = [ - [[str(topics[int(x)]), None]] + topic_model.get_topic(topics[int(x)]) - for x in fig.layout[axis]["ticktext"] - ] - new_labels = [ - "_".join([label[0] for label in labels[:4]]) for labels in new_labels - ] - new_labels = [ - label if len(label) < 30 else label[:27] + "..." for label in new_labels + [[str(topics[int(x)]), None]] + topic_model.get_topic(topics[int(x)]) for x in fig.layout[axis]["ticktext"] ] + new_labels = ["_".join([label[0] for label in labels[:4]]) for labels in new_labels] + new_labels = [label if len(label) < 30 else label[:27] + "..." for label in new_labels] # Stylize layout fig.update_layout( @@ -222,21 +209,9 @@ def visualize_hierarchy( if hierarchical_topics is not None: for index in [0, 3]: axis = "x" if orientation == "left" else "y" - xs = [ - data["x"][index] - for data in fig.data - if (data["text"] and data[axis][index] > 0) - ] - ys = [ - data["y"][index] - for data in fig.data - if (data["text"] and data[axis][index] > 0) - ] - hovertext = [ - data["text"][index] - for data in fig.data - if (data["text"] and data[axis][index] > 0) - ] + xs = [data["x"][index] for data in fig.data if (data["text"] and data[axis][index] > 0)] + ys = [data["y"][index] for data in fig.data if (data["text"] and data[axis][index] > 0)] + hovertext = [data["text"][index] for data in fig.data if (data["text"] and data[axis][index] > 0)] fig.add_trace( go.Scatter( @@ -322,18 +297,12 @@ def _get_annotations( if len(fst_topic) == 1: if isinstance(custom_labels, str): fst_name = f"{fst_topic[0]}_" + "_".join( - list(zip(*topic_model.topic_aspects_[custom_labels][fst_topic[0]]))[ - 0 - ][:3] + list(zip(*topic_model.topic_aspects_[custom_labels][fst_topic[0]]))[0][:3] ) elif topic_model.custom_labels_ is not None and custom_labels: - fst_name = topic_model.custom_labels_[ - fst_topic[0] + topic_model._outliers - ] + fst_name = topic_model.custom_labels_[fst_topic[0] + topic_model._outliers] else: - fst_name = "_".join( - [word for word, _ in topic_model.get_topic(fst_topic[0])][:5] - ) + fst_name = "_".join([word for word, _ in topic_model.get_topic(fst_topic[0])][:5]) else: for key, value in parent_topic.items(): if set(value) == set(fst_topic): @@ -342,18 +311,12 @@ def _get_annotations( if len(scnd_topic) == 1: if isinstance(custom_labels, str): scnd_name = f"{scnd_topic[0]}_" + "_".join( - list( - zip(*topic_model.topic_aspects_[custom_labels][scnd_topic[0]]) - )[0][:3] + list(zip(*topic_model.topic_aspects_[custom_labels][scnd_topic[0]]))[0][:3] ) elif topic_model.custom_labels_ is not None and custom_labels: - scnd_name = topic_model.custom_labels_[ - scnd_topic[0] + topic_model._outliers - ] + scnd_name = topic_model.custom_labels_[scnd_topic[0] + topic_model._outliers] else: - scnd_name = "_".join( - [word for word, _ in topic_model.get_topic(scnd_topic[0])][:5] - ) + scnd_name = "_".join([word for word, _ in topic_model.get_topic(scnd_topic[0])][:5]) else: for key, value in parent_topic.items(): if set(value) == set(scnd_topic): diff --git a/bertopic/plotting/_term_rank.py b/bertopic/plotting/_term_rank.py index 5dc98a23..4043692b 100644 --- a/bertopic/plotting/_term_rank.py +++ b/bertopic/plotting/_term_rank.py @@ -69,9 +69,7 @@ def visualize_term_rank( topic_words = [topic_model.get_topic(topic) for topic in topic_ids] values = np.array([[value[1] for value in values] for values in topic_words]) - indices = np.array( - [[value + 1 for value in range(len(values))] for values in topic_words] - ) + indices = np.array([[value + 1 for value in range(len(values))] for values in topic_words]) # Create figure lines = [] @@ -79,15 +77,11 @@ def visualize_term_rank( if not any(y > 1.5): # labels if isinstance(custom_labels, str): - label = f"{topic}_" + "_".join( - list(zip(*topic_model.topic_aspects_[custom_labels][topic]))[0][:3] - ) + label = f"{topic}_" + "_".join(list(zip(*topic_model.topic_aspects_[custom_labels][topic]))[0][:3]) elif topic_model.custom_labels_ is not None and custom_labels: label = topic_model.custom_labels_[topic + topic_model._outliers] else: - label = f"Topic {topic}:" + "_".join( - [word[0] for word in topic_model.get_topic(topic)] - ) + label = f"Topic {topic}:" + "_".join([word[0] for word in topic_model.get_topic(topic)]) label = label[:50] # line parameters diff --git a/bertopic/plotting/_topics.py b/bertopic/plotting/_topics.py index 8a14a34d..2e477d05 100644 --- a/bertopic/plotting/_topics.py +++ b/bertopic/plotting/_topics.py @@ -65,22 +65,13 @@ def visualize_topics( topic_list = sorted(topics) frequencies = [topic_model.topic_sizes_[topic] for topic in topic_list] if isinstance(custom_labels, str): - words = [ - [[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] - for topic in topic_list - ] + words = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in topic_list] words = ["_".join([label[0] for label in labels[:4]]) for labels in words] words = [label if len(label) < 30 else label[:27] + "..." for label in words] elif custom_labels and topic_model.custom_labels_ is not None: - words = [ - topic_model.custom_labels_[topic + topic_model._outliers] - for topic in topic_list - ] + words = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in topic_list] else: - words = [ - " | ".join([word[0] for word in topic_model.get_topic(topic)[:5]]) - for topic in topic_list - ] + words = [" | ".join([word[0] for word in topic_model.get_topic(topic)[:5]]) for topic in topic_list] # Embed c-TF-IDF into 2D all_topics = sorted(list(topic_model.get_topics().keys())) @@ -96,13 +87,9 @@ def visualize_topics( if c_tfidf_used: embeddings = MinMaxScaler().fit_transform(embeddings) - embeddings = UMAP( - n_neighbors=2, n_components=2, metric="hellinger", random_state=42 - ).fit_transform(embeddings) + embeddings = UMAP(n_neighbors=2, n_components=2, metric="hellinger", random_state=42).fit_transform(embeddings) else: - embeddings = UMAP( - n_neighbors=2, n_components=2, metric="cosine", random_state=42 - ).fit_transform(embeddings) + embeddings = UMAP(n_neighbors=2, n_components=2, metric="cosine", random_state=42).fit_transform(embeddings) # Visualize with plotly df = pd.DataFrame( @@ -117,18 +104,14 @@ def visualize_topics( return _plotly_topic_visualization(df, topic_list, title, width, height) -def _plotly_topic_visualization( - df: pd.DataFrame, topic_list: List[str], title: str, width: int, height: int -): +def _plotly_topic_visualization(df: pd.DataFrame, topic_list: List[str], title: str, width: int, height: int): """Create plotly-based visualization of topics with a slider for topic selection.""" def get_color(topic_selected): if topic_selected == -1: marker_color = ["#B0BEC5" for _ in topic_list] else: - marker_color = [ - "red" if topic == topic_selected else "#B0BEC5" for topic in topic_list - ] + marker_color = ["red" if topic == topic_selected else "#B0BEC5" for topic in topic_list] return [{"marker.color": [marker_color]}] # Prepare figure range @@ -152,9 +135,7 @@ def get_color(topic_selected): labels={"x": "", "y": ""}, hover_data={"Topic": True, "Words": True, "Size": True, "x": False, "y": False}, ) - fig.update_traces( - marker=dict(color="#B0BEC5", line=dict(width=2, color="DarkSlateGrey")) - ) + fig.update_traces(marker=dict(color="#B0BEC5", line=dict(width=2, color="DarkSlateGrey"))) # Update hover order fig.update_traces( @@ -168,10 +149,7 @@ def get_color(topic_selected): ) # Create a slider for topic selection - steps = [ - dict(label=f"Topic {topic}", method="update", args=get_color(topic)) - for topic in topic_list - ] + steps = [dict(label=f"Topic {topic}", method="update", args=get_color(topic)) for topic in topic_list] sliders = [dict(active=0, pad={"t": 50}, steps=steps)] # Stylize layout @@ -213,12 +191,8 @@ def get_color(topic_selected): y1=sum(y_range) / 2, line=dict(color="#9E9E9E", width=2), ) - fig.add_annotation( - x=x_range[0], y=sum(y_range) / 2, text="D1", showarrow=False, yshift=10 - ) - fig.add_annotation( - y=y_range[1], x=sum(x_range) / 2, text="D2", showarrow=False, xshift=10 - ) + fig.add_annotation(x=x_range[0], y=sum(y_range) / 2, text="D1", showarrow=False, yshift=10) + fig.add_annotation(y=y_range[1], x=sum(x_range) / 2, text="D2", showarrow=False, xshift=10) fig.data = fig.data[::-1] return fig diff --git a/bertopic/plotting/_topics_over_time.py b/bertopic/plotting/_topics_over_time.py index 625a8cce..b8254421 100644 --- a/bertopic/plotting/_topics_over_time.py +++ b/bertopic/plotting/_topics_over_time.py @@ -73,34 +73,20 @@ def visualize_topics_over_time( # Prepare data if isinstance(custom_labels, str): - topic_names = [ - [[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] - for topic in topics - ] - topic_names = [ - "_".join([label[0] for label in labels[:4]]) for labels in topic_names - ] - topic_names = [ - label if len(label) < 30 else label[:27] + "..." for label in topic_names - ] - topic_names = { - key: topic_names[index] - for index, key in enumerate(topic_model.topic_labels_.keys()) - } + topic_names = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in topics] + topic_names = ["_".join([label[0] for label in labels[:4]]) for labels in topic_names] + topic_names = [label if len(label) < 30 else label[:27] + "..." for label in topic_names] + topic_names = {key: topic_names[index] for index, key in enumerate(topic_model.topic_labels_.keys())} elif topic_model.custom_labels_ is not None and custom_labels: topic_names = { - key: topic_model.custom_labels_[key + topic_model._outliers] - for key, _ in topic_model.topic_labels_.items() + key: topic_model.custom_labels_[key + topic_model._outliers] for key, _ in topic_model.topic_labels_.items() } else: topic_names = { - key: value[:40] + "..." if len(value) > 40 else value - for key, value in topic_model.topic_labels_.items() + key: value[:40] + "..." if len(value) > 40 else value for key, value in topic_model.topic_labels_.items() } topics_over_time["Name"] = topics_over_time.Topic.map(topic_names) - data = topics_over_time.loc[ - topics_over_time.Topic.isin(selected_topics), : - ].sort_values(["Topic", "Timestamp"]) + data = topics_over_time.loc[topics_over_time.Topic.isin(selected_topics), :].sort_values(["Topic", "Timestamp"]) # Add traces fig = go.Figure() diff --git a/bertopic/plotting/_topics_per_class.py b/bertopic/plotting/_topics_per_class.py index 5bb8cef4..cdf02ebb 100644 --- a/bertopic/plotting/_topics_per_class.py +++ b/bertopic/plotting/_topics_per_class.py @@ -73,29 +73,17 @@ def visualize_topics_per_class( # Prepare data if isinstance(custom_labels, str): - topic_names = [ - [[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] - for topic in topics - ] - topic_names = [ - "_".join([label[0] for label in labels[:4]]) for labels in topic_names - ] - topic_names = [ - label if len(label) < 30 else label[:27] + "..." for label in topic_names - ] - topic_names = { - key: topic_names[index] - for index, key in enumerate(topic_model.topic_labels_.keys()) - } + topic_names = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in topics] + topic_names = ["_".join([label[0] for label in labels[:4]]) for labels in topic_names] + topic_names = [label if len(label) < 30 else label[:27] + "..." for label in topic_names] + topic_names = {key: topic_names[index] for index, key in enumerate(topic_model.topic_labels_.keys())} elif topic_model.custom_labels_ is not None and custom_labels: topic_names = { - key: topic_model.custom_labels_[key + topic_model._outliers] - for key, _ in topic_model.topic_labels_.items() + key: topic_model.custom_labels_[key + topic_model._outliers] for key, _ in topic_model.topic_labels_.items() } else: topic_names = { - key: value[:40] + "..." if len(value) > 40 else value - for key, value in topic_model.topic_labels_.items() + key: value[:40] + "..." if len(value) > 40 else value for key, value in topic_model.topic_labels_.items() } topics_per_class["Name"] = topics_per_class.Topic.map(topic_names) data = topics_per_class.loc[topics_per_class.Topic.isin(selected_topics), :] diff --git a/bertopic/representation/__init__.py b/bertopic/representation/__init__.py index 3c18305f..da0c6365 100644 --- a/bertopic/representation/__init__.py +++ b/bertopic/representation/__init__.py @@ -24,9 +24,7 @@ from bertopic.representation._zeroshot import ZeroShotClassification except ModuleNotFoundError: msg = "`pip install bertopic` without `--no-deps` \n\n" - ZeroShotClassification = NotInstalled( - "ZeroShotClassification", "transformers", custom_msg=msg - ) + ZeroShotClassification = NotInstalled("ZeroShotClassification", "transformers", custom_msg=msg) # OpenAI Generator try: diff --git a/bertopic/representation/_cohere.py b/bertopic/representation/_cohere.py index 64511daf..8ca31c8f 100644 --- a/bertopic/representation/_cohere.py +++ b/bertopic/representation/_cohere.py @@ -151,13 +151,8 @@ def extract_topics( # Generate using Cohere's Language Model updated_topics = {} - for topic, docs in tqdm( - repr_docs_mappings.items(), disable=not topic_model.verbose - ): - truncated_docs = [ - truncate_document(topic_model, self.doc_length, self.tokenizer, doc) - for doc in docs - ] + for topic, docs in tqdm(repr_docs_mappings.items(), disable=not topic_model.verbose): + truncated_docs = [truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs] prompt = self._create_prompt(truncated_docs, topic, topics) self.prompts_.append(prompt) diff --git a/bertopic/representation/_keybert.py b/bertopic/representation/_keybert.py index 7d9d19e2..f91c01cc 100644 --- a/bertopic/representation/_keybert.py +++ b/bertopic/representation/_keybert.py @@ -84,10 +84,8 @@ def extract_topics( updated_topics: Updated topic representations """ # We extract the top n representative documents per class - _, representative_docs, repr_doc_indices, _ = ( - topic_model._extract_representative_docs( - c_tf_idf, documents, topics, self.nr_samples, self.nr_repr_docs - ) + _, representative_docs, repr_doc_indices, _ = topic_model._extract_representative_docs( + c_tf_idf, documents, topics, self.nr_samples, self.nr_repr_docs ) # We extract the top n words per class @@ -95,9 +93,7 @@ def extract_topics( # We calculate the similarity between word and document embeddings and create # topic embeddings from the representative document embeddings - sim_matrix, words = self._extract_embeddings( - topic_model, topics, representative_docs, repr_doc_indices - ) + sim_matrix, words = self._extract_embeddings(topic_model, topics, representative_docs, repr_doc_indices) # Find the best matching words based on the similarity matrix for each topic updated_topics = self._extract_top_words(words, topics, sim_matrix) @@ -139,17 +135,12 @@ def _extract_candidate_words( # Get top 30 words per topic based on c-TF-IDF score topics = { label: [ - (words[word_index], score) - if word_index is not None and score > 0 - else ("", 0.00001) + (words[word_index], score) if word_index is not None and score > 0 else ("", 0.00001) for word_index, score in zip(indices[index][::-1], scores[index][::-1]) ] for index, label in enumerate(labels) } - topics = { - label: list(zip(*values[: self.nr_candidate_words]))[0] - for label, values in topics.items() - } + topics = {label: list(zip(*values[: self.nr_candidate_words]))[0] for label, values in topics.items()} return topics @@ -177,18 +168,12 @@ def _extract_embeddings( vocab: The complete vocabulary of input documents """ # Calculate representative docs embeddings and create topic embeddings - repr_embeddings = topic_model._extract_embeddings( - representative_docs, method="document", verbose=False - ) - topic_embeddings = [ - np.mean(repr_embeddings[i[0] : i[-1] + 1], axis=0) for i in repr_doc_indices - ] + repr_embeddings = topic_model._extract_embeddings(representative_docs, method="document", verbose=False) + topic_embeddings = [np.mean(repr_embeddings[i[0] : i[-1] + 1], axis=0) for i in repr_doc_indices] # Calculate word embeddings and extract best matching with updated topic_embeddings vocab = list(set([word for words in topics.values() for word in words])) - word_embeddings = topic_model._extract_embeddings( - vocab, method="document", verbose=False - ) + word_embeddings = topic_model._extract_embeddings(vocab, method="document", verbose=False) sim = cosine_similarity(topic_embeddings, word_embeddings) return sim, vocab @@ -216,14 +201,9 @@ def _extract_top_words( for i, topic in enumerate(labels): indices = [vocab.index(word) for word in topics[topic]] values = sim[:, indices][i] - word_indices = [ - indices[index] for index in np.argsort(values)[-self.top_n_words :] - ] + word_indices = [indices[index] for index in np.argsort(values)[-self.top_n_words :]] updated_topics[topic] = [ - (vocab[index], val) - for val, index in zip( - np.sort(values)[-self.top_n_words :], word_indices - ) + (vocab[index], val) for val, index in zip(np.sort(values)[-self.top_n_words :], word_indices) ][::-1] return updated_topics diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py index ad92aef1..df5c4839 100644 --- a/bertopic/representation/_langchain.py +++ b/bertopic/representation/_langchain.py @@ -180,11 +180,7 @@ def extract_topics( # Generate label using langchain's batch functionality chain_docs: List[List[Document]] = [ [ - Document( - page_content=truncate_document( - topic_model, self.doc_length, self.tokenizer, doc - ) - ) + Document(page_content=truncate_document(topic_model, self.doc_length, self.tokenizer, doc)) for doc in docs ] for docs in repr_docs_mappings.values() @@ -199,16 +195,10 @@ def extract_topics( prompt = self.prompt.replace("[KEYWORDS]", ", ".join(keywords)) prompts.append(prompt) - inputs = [ - {"input_documents": docs, "question": prompt} - for docs, prompt in zip(chain_docs, prompts) - ] + inputs = [{"input_documents": docs, "question": prompt} for docs, prompt in zip(chain_docs, prompts)] else: - inputs = [ - {"input_documents": docs, "question": self.prompt} - for docs in chain_docs - ] + inputs = [{"input_documents": docs, "question": self.prompt} for docs in chain_docs] # `self.chain` must return a dict with an `output_text` key # same output key as the `StuffDocumentsChain` returned by `load_qa_chain` @@ -216,8 +206,7 @@ def extract_topics( labels = [output["output_text"].strip() for output in outputs] updated_topics = { - topic: [(label, 1)] + [("", 0) for _ in range(9)] - for topic, label in zip(repr_docs_mappings.keys(), labels) + topic: [(label, 1)] + [("", 0) for _ in range(9)] for topic, label in zip(repr_docs_mappings.keys(), labels) } return updated_topics diff --git a/bertopic/representation/_llamacpp.py b/bertopic/representation/_llamacpp.py index fa573463..83b18952 100644 --- a/bertopic/representation/_llamacpp.py +++ b/bertopic/representation/_llamacpp.py @@ -143,28 +143,18 @@ def extract_topics( ) updated_topics = {} - for topic, docs in tqdm( - repr_docs_mappings.items(), disable=not topic_model.verbose - ): + for topic, docs in tqdm(repr_docs_mappings.items(), disable=not topic_model.verbose): # Prepare prompt - truncated_docs = [ - truncate_document(topic_model, self.doc_length, self.tokenizer, doc) - for doc in docs - ] + truncated_docs = [truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs] prompt = self._create_prompt(truncated_docs, topic, topics) self.prompts_.append(prompt) # Extract result from generator and use that as label topic_description = self.model(prompt, **self.pipeline_kwargs)["choices"] - topic_description = [ - (description["text"].replace(prompt, ""), 1) - for description in topic_description - ] + topic_description = [(description["text"].replace(prompt, ""), 1) for description in topic_description] if len(topic_description) < 10: - topic_description += [ - ("", 0) for _ in range(10 - len(topic_description)) - ] + topic_description += [("", 0) for _ in range(10 - len(topic_description))] updated_topics[topic] = topic_description diff --git a/bertopic/representation/_mmr.py b/bertopic/representation/_mmr.py index 07a8dd13..b3b1b232 100644 --- a/bertopic/representation/_mmr.py +++ b/bertopic/representation/_mmr.py @@ -68,12 +68,10 @@ def extract_topics( updated_topics = {} for topic, topic_words in topics.items(): words = [word[0] for word in topic_words] - word_embeddings = topic_model._extract_embeddings( - words, method="word", verbose=False + word_embeddings = topic_model._extract_embeddings(words, method="word", verbose=False) + topic_embedding = topic_model._extract_embeddings(" ".join(words), method="word", verbose=False).reshape( + 1, -1 ) - topic_embedding = topic_model._extract_embeddings( - " ".join(words), method="word", verbose=False - ).reshape(1, -1) topic_words = mmr( topic_embedding, word_embeddings, @@ -81,9 +79,7 @@ def extract_topics( self.diversity, self.top_n_words, ) - updated_topics[topic] = [ - (word, value) for word, value in topics[topic] if word in topic_words - ] + updated_topics[topic] = [(word, value) for word, value in topics[topic] if word in topic_words] return updated_topics @@ -119,14 +115,10 @@ def mmr( # Extract similarities within candidates and # between candidates and selected keywords/phrases candidate_similarities = word_doc_similarity[candidates_idx, :] - target_similarities = np.max( - word_similarity[candidates_idx][:, keywords_idx], axis=1 - ) + target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1) # Calculate MMR - mmr = ( - 1 - diversity - ) * candidate_similarities - diversity * target_similarities.reshape(-1, 1) + mmr = (1 - diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1) mmr_idx = candidates_idx[np.argmax(mmr)] # Update keywords & candidates diff --git a/bertopic/representation/_openai.py b/bertopic/representation/_openai.py index 35bdf1da..8fd25a1b 100644 --- a/bertopic/representation/_openai.py +++ b/bertopic/representation/_openai.py @@ -205,13 +205,8 @@ def extract_topics( # Generate using OpenAI's Language Model updated_topics = {} - for topic, docs in tqdm( - repr_docs_mappings.items(), disable=not topic_model.verbose - ): - truncated_docs = [ - truncate_document(topic_model, self.doc_length, self.tokenizer, doc) - for doc in docs - ] + for topic, docs in tqdm(repr_docs_mappings.items(), disable=not topic_model.verbose): + truncated_docs = [truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs] prompt = self._create_prompt(truncated_docs, topic, topics) self.prompts_.append(prompt) @@ -237,11 +232,7 @@ def extract_topics( # Check whether content was actually generated # Addresses #1570 for potential issues with OpenAI's content filter if hasattr(response.choices[0].message, "content"): - label = ( - response.choices[0] - .message.content.strip() - .replace("topic: ", "") - ) + label = response.choices[0].message.content.strip().replace("topic: ", "") else: label = "No label returned" else: @@ -253,9 +244,7 @@ def extract_topics( **self.generator_kwargs, ) else: - response = self.client.completions.create( - model=self.model, prompt=prompt, **self.generator_kwargs - ) + response = self.client.completions.create(model=self.model, prompt=prompt, **self.generator_kwargs) label = response.choices[0].text.strip() updated_topics[topic] = [(label, 1)] diff --git a/bertopic/representation/_pos.py b/bertopic/representation/_pos.py index 08139b53..3ac2815f 100644 --- a/bertopic/representation/_pos.py +++ b/bertopic/representation/_pos.py @@ -120,9 +120,7 @@ def extract_topics( candidate_documents = [] for keyword in keywords: selection = documents.loc[documents.Topic == topic, :] - selection = selection.loc[ - selection.Document.str.contains(keyword), "Document" - ] + selection = selection.loc[selection.Document.str.contains(keyword), "Document"] if len(selection) > 0: for document in selection[:2]: candidate_documents.append(document) @@ -150,27 +148,14 @@ def extract_topics( for topic, candidate_keywords in candidate_topics.items(): word_indices = np.sort( - [ - words_lookup.get(keyword) - for keyword in candidate_keywords - if keyword in words_lookup - ] + [words_lookup.get(keyword) for keyword in candidate_keywords if keyword in words_lookup] ) vals = topic_model.c_tf_idf_[:, word_indices][topic + topic_model._outliers] - indices = np.argsort(np.array(vals.todense().reshape(1, -1))[0])[ - -self.top_n_words : - ][::-1] - vals = np.sort(np.array(vals.todense().reshape(1, -1))[0])[ - -self.top_n_words : - ][::-1] - topic_words = [ - (words[word_indices[index]], val) for index, val in zip(indices, vals) - ] + indices = np.argsort(np.array(vals.todense().reshape(1, -1))[0])[-self.top_n_words :][::-1] + vals = np.sort(np.array(vals.todense().reshape(1, -1))[0])[-self.top_n_words :][::-1] + topic_words = [(words[word_indices[index]], val) for index, val in zip(indices, vals)] updated_topics[topic] = topic_words if len(updated_topics[topic]) < self.top_n_words: - updated_topics[topic] += [ - ("", 0) - for _ in range(self.top_n_words - len(updated_topics[topic])) - ] + updated_topics[topic] += [("", 0) for _ in range(self.top_n_words - len(updated_topics[topic]))] return updated_topics diff --git a/bertopic/representation/_textgeneration.py b/bertopic/representation/_textgeneration.py index 3bc3853a..b028e575 100644 --- a/bertopic/representation/_textgeneration.py +++ b/bertopic/representation/_textgeneration.py @@ -142,15 +142,10 @@ def extract_topics( repr_docs_mappings = {topic: None for topic in topics.keys()} updated_topics = {} - for topic, docs in tqdm( - repr_docs_mappings.items(), disable=not topic_model.verbose - ): + for topic, docs in tqdm(repr_docs_mappings.items(), disable=not topic_model.verbose): # Prepare prompt truncated_docs = ( - [ - truncate_document(topic_model, self.doc_length, self.tokenizer, doc) - for doc in docs - ] + [truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs] if docs is not None else docs ) @@ -160,14 +155,11 @@ def extract_topics( # Extract result from generator and use that as label topic_description = self.model(prompt, **self.pipeline_kwargs) topic_description = [ - (description["generated_text"].replace(prompt, ""), 1) - for description in topic_description + (description["generated_text"].replace(prompt, ""), 1) for description in topic_description ] if len(topic_description) < 10: - topic_description += [ - ("", 0) for _ in range(10 - len(topic_description)) - ] + topic_description += [("", 0) for _ in range(10 - len(topic_description))] updated_topics[topic] = topic_description diff --git a/bertopic/representation/_utils.py b/bertopic/representation/_utils.py index 00f157a5..2a99fd1f 100644 --- a/bertopic/representation/_utils.py +++ b/bertopic/representation/_utils.py @@ -85,9 +85,7 @@ def wrapper(*args, **kwargs): # Check if max retries has been reached if num_retries > max_retries: - raise Exception( - f"Maximum number of retries ({max_retries}) exceeded." - ) + raise Exception(f"Maximum number of retries ({max_retries}) exceeded.") # Increment the delay delay *= exponential_base * (1 + jitter * random.random()) diff --git a/bertopic/representation/_visual.py b/bertopic/representation/_visual.py index 897d7c9d..07968596 100644 --- a/bertopic/representation/_visual.py +++ b/bertopic/representation/_visual.py @@ -63,9 +63,7 @@ def __init__( if isinstance(image_to_text_model, Pipeline): self.image_to_text_model = image_to_text_model elif isinstance(image_to_text_model, str): - self.image_to_text_model = pipeline( - "image-to-text", model=image_to_text_model - ) + self.image_to_text_model = pipeline("image-to-text", model=image_to_text_model) elif image_to_text_model is None: self.image_to_text_model = None else: @@ -109,23 +107,17 @@ def extract_topics( for topic in tqdm(unique_topics): # Get and order represetnative images sliced_examplars = repr_docs_ids[topic + topic_model._outliers] - sliced_examplars = [ - sliced_examplars[i : i + 3] for i in range(0, len(sliced_examplars), 3) - ] + sliced_examplars = [sliced_examplars[i : i + 3] for i in range(0, len(sliced_examplars), 3)] images_to_combine = [ [ - Image.open(images[index]) - if isinstance(images[index], str) - else images[index] + Image.open(images[index]) if isinstance(images[index], str) else images[index] for index in sub_indices ] for sub_indices in sliced_examplars ] # Concatenate representative images - representative_image = get_concat_tile_resize( - images_to_combine, self.image_height, self.image_squares - ) + representative_image = get_concat_tile_resize(images_to_combine, self.image_height, self.image_squares) representative_images[topic] = representative_image # Make sure to properly close images @@ -136,9 +128,7 @@ def extract_topics( return representative_images - def _convert_image_to_text( - self, images: List[str], verbose: bool = False - ) -> List[str]: + def _convert_image_to_text(self, images: List[str], verbose: bool = False) -> List[str]: """Convert a list of images to captions. Arguments: @@ -163,9 +153,7 @@ def _convert_image_to_text( return documents - def image_to_text( - self, documents: pd.DataFrame, embeddings: np.ndarray - ) -> pd.DataFrame: + def image_to_text(self, documents: pd.DataFrame, embeddings: np.ndarray) -> pd.DataFrame: """Convert images to text.""" # Create image topic embeddings topics = documents.Topic.values.tolist() @@ -193,10 +181,7 @@ def image_to_text( current_id = 0 for topic, image_ids in tqdm(image_centroids.items()): selected_images = [ - Image.open(images[index]) - if isinstance(images[index], str) - else images[index] - for index in image_ids + Image.open(images[index]) if isinstance(images[index], str) else images[index] for index in image_ids ] text = self._convert_image_to_text(selected_images) @@ -243,10 +228,7 @@ def get_concat_v_multi_resize(im_list): """Code adapted from: https://note.nkmk.me/en/python-pillow-concat-images/.""" min_width = min(im.width for im in im_list) min_width = max(im.width for im in im_list) - im_list_resize = [ - im.resize((min_width, int(im.height * min_width / im.width)), resample=0) - for im in im_list - ] + im_list_resize = [im.resize((min_width, int(im.height * min_width / im.width)), resample=0) for im in im_list] total_height = sum(im.height for im in im_list_resize) dst = Image.new("RGB", (min_width, total_height), (255, 255, 255)) pos_y = 0 @@ -264,9 +246,7 @@ def get_concat_tile_resize(im_list_2d, image_height=600, image_squares=False): if image_squares: width = int(image_height / 3) height = int(image_height / 3) - images = [ - [image.resize((width, height)) for image in images] for images in im_list_2d - ] + images = [[image.resize((width, height)) for image in images] for images in im_list_2d] # Resize images based on minimum size else: @@ -280,9 +260,7 @@ def get_concat_tile_resize(im_list_2d, image_height=600, image_squares=False): resample=0, ) elif img.width > img.height: - images[i][j] = img.resize( - (min_width, int(img.height * min_width / img.width)), resample=0 - ) + images[i][j] = img.resize((min_width, int(img.height * min_width / img.width)), resample=0) else: images[i][j] = img.resize((min_width, min_width)) diff --git a/bertopic/representation/_zeroshot.py b/bertopic/representation/_zeroshot.py index 7dff499b..5f67de9a 100644 --- a/bertopic/representation/_zeroshot.py +++ b/bertopic/representation/_zeroshot.py @@ -75,12 +75,8 @@ def extract_topics( updated_topics: Updated topic representations """ # Classify topics - topic_descriptions = [ - " ".join(list(zip(*topics[topic]))[0]) for topic in topics.keys() - ] - classifications = self.model( - topic_descriptions, self.candidate_topics, **self.pipeline_kwargs - ) + topic_descriptions = [" ".join(list(zip(*topics[topic]))[0]) for topic in topics.keys()] + classifications = self.model(topic_descriptions, self.candidate_topics, **self.pipeline_kwargs) # Extract labels updated_topics = {} @@ -90,25 +86,19 @@ def extract_topics( # Multi-label assignment if self.pipeline_kwargs.get("multi_label"): topic_description = [] - for label, score in zip( - classification["labels"], classification["scores"] - ): + for label, score in zip(classification["labels"], classification["scores"]): if score > self.min_prob: topic_description.append((label, score)) # Single label assignment elif classification["scores"][0] > self.min_prob: - topic_description = [ - (classification["labels"][0], classification["scores"][0]) - ] + topic_description = [(classification["labels"][0], classification["scores"][0])] # Make sure that 10 items are returned if len(topic_description) == 0: topic_description = topics[topic] elif len(topic_description) < 10: - topic_description += [ - ("", 0) for _ in range(10 - len(topic_description)) - ] + topic_description += [("", 0) for _ in range(10 - len(topic_description))] updated_topics[topic] = topic_description return updated_topics diff --git a/bertopic/vectorizers/_online_cv.py b/bertopic/vectorizers/_online_cv.py index fedb363c..27387fa2 100644 --- a/bertopic/vectorizers/_online_cv.py +++ b/bertopic/vectorizers/_online_cv.py @@ -121,15 +121,11 @@ def update_bow(self, raw_documents: List[str]) -> csr_matrix: X = self.transform(raw_documents) # Add empty columns if new words are found - columns = csr_matrix( - (self.X_.shape[0], X.shape[1] - self.X_.shape[1]), dtype=int - ) + columns = csr_matrix((self.X_.shape[0], X.shape[1] - self.X_.shape[1]), dtype=int) self.X_ = sparse.hstack([self.X_, columns]) # Add empty rows if new topics are found - rows = csr_matrix( - (X.shape[0] - self.X_.shape[0], self.X_.shape[1]), dtype=int - ) + rows = csr_matrix((X.shape[0] - self.X_.shape[0], self.X_.shape[1]), dtype=int) self.X_ = sparse.vstack([self.X_, rows]) # Decay of BoW matrix diff --git a/pyproject.toml b/pyproject.toml index d0c1abfe..2dce9bc3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -98,7 +98,7 @@ include = ["bertopic*"] exclude = ["tests"] [tool.ruff] -target-version = "py38" +line-length = 120 [tool.ruff.lint] select = [ diff --git a/tests/conftest.py b/tests/conftest.py index 95bcf738..3d8d49db 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -27,17 +27,15 @@ def document_embeddings(documents, embedding_model): @pytest.fixture(scope="session") def reduced_embeddings(document_embeddings): - reduced_embeddings = UMAP( - n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine" - ).fit_transform(document_embeddings) + reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine").fit_transform( + document_embeddings + ) return reduced_embeddings @pytest.fixture(scope="session") def documents(): - newsgroup_docs = fetch_20newsgroups( - subset="all", remove=("headers", "footers", "quotes") - )["data"][:1000] + newsgroup_docs = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"))["data"][:1000] return newsgroup_docs @@ -74,9 +72,7 @@ def zeroshot_topic_model(documents, document_embeddings, embedding_model): @pytest.fixture(scope="session") def custom_topic_model(documents, document_embeddings, embedding_model): - umap_model = UMAP( - n_neighbors=15, n_components=6, min_dist=0.0, metric="cosine", random_state=42 - ) + umap_model = UMAP(n_neighbors=15, n_components=6, min_dist=0.0, metric="cosine", random_state=42) hdbscan_model = HDBSCAN( min_cluster_size=3, metric="euclidean", @@ -94,9 +90,7 @@ def custom_topic_model(documents, document_embeddings, embedding_model): @pytest.fixture(scope="session") def representation_topic_model(documents, document_embeddings, embedding_model): - umap_model = UMAP( - n_neighbors=15, n_components=6, min_dist=0.0, metric="cosine", random_state=42 - ) + umap_model = UMAP(n_neighbors=15, n_components=6, min_dist=0.0, metric="cosine", random_state=42) hdbscan_model = HDBSCAN( min_cluster_size=3, metric="euclidean", @@ -177,9 +171,7 @@ def online_topic_model(documents, document_embeddings, embedding_model): topics = [] for index in range(0, len(documents), 50): - model.partial_fit( - documents[index : index + 50], document_embeddings[index : index + 50] - ) + model.partial_fit(documents[index : index + 50], document_embeddings[index : index + 50]) topics.extend(model.topics_) model.topics_ = topics return model diff --git a/tests/test_bertopic.py b/tests/test_bertopic.py index 73614e1b..3bcc6cbb 100644 --- a/tests/test_bertopic.py +++ b/tests/test_bertopic.py @@ -75,13 +75,9 @@ def test_full_model(model, documents, request): # Test zero-shot topic modeling if topic_model._is_zeroshot(): if topic_model._outliers: - assert set(topic_model.topic_labels_.keys()) == set( - range(-1, len(topic_model.topic_labels_) - 1) - ) + assert set(topic_model.topic_labels_.keys()) == set(range(-1, len(topic_model.topic_labels_) - 1)) else: - assert set(topic_model.topic_labels_.keys()) == set( - range(len(topic_model.topic_labels_)) - ) + assert set(topic_model.topic_labels_.keys()) == set(range(len(topic_model.topic_labels_))) # Test topics over time timestamps = [i % 10 for i in range(len(documents))] @@ -130,9 +126,7 @@ def test_full_model(model, documents, request): assert topic != original_topic # Test updating topic labels - topic_labels = topic_model.generate_topic_labels( - nr_words=3, topic_prefix=False, word_length=10, separator=", " - ) + topic_labels = topic_model.generate_topic_labels(nr_words=3, topic_prefix=False, word_length=10, separator=", ") assert len(topic_labels) == len(set(topic_model.topics_)) # Test setting topic labels @@ -148,9 +142,7 @@ def test_full_model(model, documents, request): # Test reduction of outliers if -1 in topics: new_topics = topic_model.reduce_outliers(documents, topics, threshold=0.0) - nr_outliers_topic_model = sum( - [1 for topic in topic_model.topics_ if topic == -1] - ) + nr_outliers_topic_model = sum([1 for topic in topic_model.topics_ if topic == -1]) nr_outliers_new_topics = sum([1 for topic in new_topics if topic == -1]) if topic_model._outliers == 1: diff --git a/tests/test_plotting/test_approximate.py b/tests/test_plotting/test_approximate.py index 2de86848..1b0a78eb 100644 --- a/tests/test_plotting/test_approximate.py +++ b/tests/test_plotting/test_approximate.py @@ -18,28 +18,17 @@ def test_approximate_distribution(batch_size, padding, model, documents, request topic_model = copy.deepcopy(request.getfixturevalue(model)) # Calculate only on a document-level based on tokensets - topic_distr, _ = topic_model.approximate_distribution( - documents, padding=padding, batch_size=batch_size - ) - assert ( - topic_distr.shape[1] == len(topic_model.topic_labels_) - topic_model._outliers - ) + topic_distr, _ = topic_model.approximate_distribution(documents, padding=padding, batch_size=batch_size) + assert topic_distr.shape[1] == len(topic_model.topic_labels_) - topic_model._outliers # Use the distribution visualization for i in range(3): topic_model.visualize_distribution(topic_distr[i]) # Calculate distribution on a token-level - topic_distr, topic_token_distr = topic_model.approximate_distribution( - documents[:100], calculate_tokens=True - ) - assert ( - topic_distr.shape[1] == len(topic_model.topic_labels_) - topic_model._outliers - ) + topic_distr, topic_token_distr = topic_model.approximate_distribution(documents[:100], calculate_tokens=True) + assert topic_distr.shape[1] == len(topic_model.topic_labels_) - topic_model._outliers assert len(topic_token_distr) == len(documents[:100]) for token_distr in topic_token_distr: - assert ( - token_distr.shape[1] - == len(topic_model.topic_labels_) - topic_model._outliers - ) + assert token_distr.shape[1] == len(topic_model.topic_labels_) - topic_model._outliers diff --git a/tests/test_plotting/test_documents.py b/tests/test_plotting/test_documents.py index 81acbe4c..8d94767b 100644 --- a/tests/test_plotting/test_documents.py +++ b/tests/test_plotting/test_documents.py @@ -17,8 +17,6 @@ def test_documents(model, reduced_embeddings, documents, request): topics = set(topic_model.topics_) if -1 in topics: topics.remove(-1) - fig = topic_model.visualize_documents( - documents, embeddings=reduced_embeddings, hide_document_hover=True - ) + fig = topic_model.visualize_documents(documents, embeddings=reduced_embeddings, hide_document_hover=True) fig_topics = [int(data["name"].split("_")[0]) for data in fig.to_dict()["data"][1:]] assert set(fig_topics) == topics diff --git a/tests/test_plotting/test_dynamic.py b/tests/test_plotting/test_dynamic.py index 361702b1..6551da52 100644 --- a/tests/test_plotting/test_dynamic.py +++ b/tests/test_plotting/test_dynamic.py @@ -19,7 +19,4 @@ def test_dynamic(model, documents, request): topics_over_time = topic_model.topics_over_time(documents, timestamps) fig = topic_model.visualize_topics_over_time(topics_over_time) - assert ( - len(fig.to_dict()["data"]) - == len(set(topic_model.topics_)) - topic_model._outliers - ) + assert len(fig.to_dict()["data"]) == len(set(topic_model.topics_)) - topic_model._outliers diff --git a/tests/test_plotting/test_term_rank.py b/tests/test_plotting/test_term_rank.py index 318d7d3c..67015d05 100644 --- a/tests/test_plotting/test_term_rank.py +++ b/tests/test_plotting/test_term_rank.py @@ -2,9 +2,7 @@ import pytest -@pytest.mark.parametrize( - "model", [("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model")] -) +@pytest.mark.parametrize("model", [("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model")]) def test_term_rank(model, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) topic_model.visualize_term_rank() diff --git a/tests/test_reduction/test_merge.py b/tests/test_reduction/test_merge.py index b69ee3cd..67bf9934 100644 --- a/tests/test_reduction/test_merge.py +++ b/tests/test_reduction/test_merge.py @@ -19,9 +19,7 @@ def test_merge(model, documents, request): topics_to_merge = [1, 2] topic_model.merge_topics(documents, topics_to_merge) - mappings = topic_model.topic_mapper_.get_mappings( - list(topic_model.hdbscan_model.labels_) - ) + mappings = topic_model.topic_mapper_.get_mappings(list(topic_model.hdbscan_model.labels_)) mapped_labels = [mappings[label] for label in topic_model.hdbscan_model.labels_] assert nr_topics == len(set(topic_model.topics_)) + 1 @@ -33,9 +31,7 @@ def test_merge(model, documents, request): topics_to_merge = [1, 2] topic_model.merge_topics(documents, topics_to_merge) - mappings = topic_model.topic_mapper_.get_mappings( - list(topic_model.hdbscan_model.labels_) - ) + mappings = topic_model.topic_mapper_.get_mappings(list(topic_model.hdbscan_model.labels_)) mapped_labels = [mappings[label] for label in topic_model.hdbscan_model.labels_] assert nr_topics == len(set(topic_model.topics_)) + 2 diff --git a/tests/test_representation/test_representations.py b/tests/test_representation/test_representations.py index 98b8f4dd..7c819964 100644 --- a/tests/test_representation/test_representations.py +++ b/tests/test_representation/test_representations.py @@ -151,9 +151,7 @@ def test_topic_reduction_edge_cases(model, documents, request): topic_model.nr_topics = 100 nr_topics = 5 topics = np.random.randint(-1, nr_topics - 1, len(documents)) - old_documents = pd.DataFrame( - {"Document": documents, "ID": range(len(documents)), "Topic": topics} - ) + old_documents = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": topics}) topic_model._update_topic_size(old_documents) topic_model._extract_topics(old_documents) old_freq = topic_model.get_topic_freq() diff --git a/tests/test_sub_models/test_cluster.py b/tests/test_sub_models/test_cluster.py index 6115d08e..265f6f78 100644 --- a/tests/test_sub_models/test_cluster.py +++ b/tests/test_sub_models/test_cluster.py @@ -21,13 +21,9 @@ ], ) def test_hdbscan_cluster_embeddings(cluster_model, samples, features, centers): - embeddings, _ = make_blobs( - n_samples=samples, centers=centers, n_features=features, random_state=42 - ) + embeddings, _ = make_blobs(n_samples=samples, centers=centers, n_features=features, random_state=42) documents = [str(i + 1) for i in range(embeddings.shape[0])] - old_df = pd.DataFrame( - {"Document": documents, "ID": range(len(documents)), "Topic": None} - ) + old_df = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": None}) if cluster_model == "kmeans": cluster_model = KMeans(n_clusters=centers) @@ -44,9 +40,7 @@ def test_hdbscan_cluster_embeddings(cluster_model, samples, features, centers): assert len(new_df.Topic.unique()) == centers assert "Topic" in new_df.columns - pd.testing.assert_frame_equal( - old_df.drop("Topic", axis=1), new_df.drop("Topic", axis=1) - ) + pd.testing.assert_frame_equal(old_df.drop("Topic", axis=1), new_df.drop("Topic", axis=1)) @pytest.mark.parametrize("cluster_model", ["hdbscan", "kmeans"]) @@ -62,13 +56,9 @@ def test_hdbscan_cluster_embeddings(cluster_model, samples, features, centers): ], ) def test_custom_hdbscan_cluster_embeddings(cluster_model, samples, features, centers): - embeddings, _ = make_blobs( - n_samples=samples, centers=centers, n_features=features, random_state=42 - ) + embeddings, _ = make_blobs(n_samples=samples, centers=centers, n_features=features, random_state=42) documents = [str(i + 1) for i in range(embeddings.shape[0])] - old_df = pd.DataFrame( - {"Document": documents, "ID": range(len(documents)), "Topic": None} - ) + old_df = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": None}) if cluster_model == "kmeans": cluster_model = KMeans(n_clusters=centers) else: @@ -84,6 +74,4 @@ def test_custom_hdbscan_cluster_embeddings(cluster_model, samples, features, cen assert len(new_df.Topic.unique()) == centers assert "Topic" in new_df.columns - pd.testing.assert_frame_equal( - old_df.drop("Topic", axis=1), new_df.drop("Topic", axis=1) - ) + pd.testing.assert_frame_equal(old_df.drop("Topic", axis=1), new_df.drop("Topic", axis=1)) diff --git a/tests/test_sub_models/test_embeddings.py b/tests/test_sub_models/test_embeddings.py index 22f53539..75735607 100644 --- a/tests/test_sub_models/test_embeddings.py +++ b/tests/test_sub_models/test_embeddings.py @@ -19,9 +19,7 @@ def test_extract_embeddings(model, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) single_embedding = topic_model._extract_embeddings("a document") - multiple_embeddings = topic_model._extract_embeddings( - ["something different", "another document"] - ) + multiple_embeddings = topic_model._extract_embeddings(["something different", "another document"]) sim_matrix = cosine_similarity(single_embedding, multiple_embeddings)[0] assert single_embedding.shape[0] == 1 diff --git a/tests/test_utils.py b/tests/test_utils.py index 2974b1b6..90876e76 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -41,15 +41,9 @@ def test_check_embeddings_shape(): def test_make_unique_distances(): def check_dists(dists: List[float], noise_max: float): - unique_dists = get_unique_distances( - np.array(dists, dtype=float), noise_max=noise_max - ) - assert len(unique_dists) == len( - dists - ), "The number of elements must be the same" - assert len(dists) == len( - np.unique(unique_dists) - ), "The distances must be unique" + unique_dists = get_unique_distances(np.array(dists, dtype=float), noise_max=noise_max) + assert len(unique_dists) == len(dists), "The number of elements must be the same" + assert len(dists) == len(np.unique(unique_dists)), "The distances must be unique" check_dists([0, 0, 0.5, 0.75, 1, 1], noise_max=1e-7) @@ -69,44 +63,32 @@ def test_select_topic_representation(): topic_embeddings = np.array([[2, 2, 2]]) # Use topic embeddings - repr_, ctfidf_used = select_topic_representation( - ctfidf_embeddings, topic_embeddings, use_ctfidf=False - ) + repr_, ctfidf_used = select_topic_representation(ctfidf_embeddings, topic_embeddings, use_ctfidf=False) np.testing.assert_array_equal(topic_embeddings, repr_) assert not ctfidf_used # Fallback to c-TF-IDF - repr_, ctfidf_used = select_topic_representation( - ctfidf_embeddings, None, use_ctfidf=False - ) + repr_, ctfidf_used = select_topic_representation(ctfidf_embeddings, None, use_ctfidf=False) np.testing.assert_array_equal(ctfidf_embeddings, repr_) assert ctfidf_used # Use c-TF-IDF - repr_, ctfidf_used = select_topic_representation( - ctfidf_embeddings, topic_embeddings, use_ctfidf=True - ) + repr_, ctfidf_used = select_topic_representation(ctfidf_embeddings, topic_embeddings, use_ctfidf=True) np.testing.assert_array_equal(ctfidf_embeddings, repr_) assert ctfidf_used # Fallback to topic embeddings - repr_, ctfidf_used = select_topic_representation( - None, topic_embeddings, use_ctfidf=True - ) + repr_, ctfidf_used = select_topic_representation(None, topic_embeddings, use_ctfidf=True) np.testing.assert_array_equal(topic_embeddings, repr_) assert not ctfidf_used # `scipy.sparse.csr_matrix` can be used as c-TF-IDF embeddings np.testing.assert_array_equal( ctfidf_embeddings, - select_topic_representation( - ctfidf_embeddings_sparse, None, use_ctfidf=True, output_ndarray=True - )[0], + select_topic_representation(ctfidf_embeddings_sparse, None, use_ctfidf=True, output_ndarray=True)[0], ) # check that `csr_matrix` is not casted to `np.ndarray` when `ctfidf_as_ndarray` is False - repr_ = select_topic_representation( - ctfidf_embeddings_sparse, None, output_ndarray=False - )[0] + repr_ = select_topic_representation(ctfidf_embeddings_sparse, None, output_ndarray=False)[0] assert isinstance(repr_, csr_matrix) diff --git a/tests/test_variations/test_class.py b/tests/test_variations/test_class.py index a94c108d..5c969b51 100644 --- a/tests/test_variations/test_class.py +++ b/tests/test_variations/test_class.py @@ -18,12 +18,8 @@ ) def test_class(model, documents, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) - topics_per_class_global = topic_model.topics_per_class( - documents, classes=classes, global_tuning=True - ) - topics_per_class_local = topic_model.topics_per_class( - documents, classes=classes, global_tuning=False - ) + topics_per_class_global = topic_model.topics_per_class(documents, classes=classes, global_tuning=True) + topics_per_class_local = topic_model.topics_per_class(documents, classes=classes, global_tuning=False) assert topics_per_class_global.Frequency.sum() == len(documents) assert topics_per_class_local.Frequency.sum() == len(documents) diff --git a/tests/test_variations/test_hierarchy.py b/tests/test_variations/test_hierarchy.py index cdfdaf8d..1ac7091d 100644 --- a/tests/test_variations/test_hierarchy.py +++ b/tests/test_variations/test_hierarchy.py @@ -36,9 +36,7 @@ def test_hierarchy(model, documents, request): def test_linkage(model, documents, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) linkage_function = lambda x: sch.linkage(x, "single", optimal_ordering=True) - hierarchical_topics = topic_model.hierarchical_topics( - documents, linkage_function=linkage_function - ) + hierarchical_topics = topic_model.hierarchical_topics(documents, linkage_function=linkage_function) merged_topics = set([v for vals in hierarchical_topics.Topics.values for v in vals]) tree = topic_model.get_topic_tree(hierarchical_topics) @@ -61,9 +59,7 @@ def test_linkage(model, documents, request): def test_tree(model, documents, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) linkage_function = lambda x: sch.linkage(x, "single", optimal_ordering=True) - hierarchical_topics = topic_model.hierarchical_topics( - documents, linkage_function=linkage_function - ) + hierarchical_topics = topic_model.hierarchical_topics(documents, linkage_function=linkage_function) merged_topics = set([v for vals in hierarchical_topics.Topics.values for v in vals]) tree = topic_model.get_topic_tree(hierarchical_topics) diff --git a/tests/test_vectorizers/test_ctfidf.py b/tests/test_vectorizers/test_ctfidf.py index a6cedccd..5d2626b6 100644 --- a/tests/test_vectorizers/test_ctfidf.py +++ b/tests/test_vectorizers/test_ctfidf.py @@ -23,12 +23,8 @@ def test_ctfidf(model, documents, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) topics = topic_model.topics_ - documents = pd.DataFrame( - {"Document": documents, "ID": range(len(documents)), "Topic": topics} - ) - documents_per_topic = documents.groupby(["Topic"], as_index=False).agg( - {"Document": " ".join} - ) + documents = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": topics}) + documents_per_topic = documents.groupby(["Topic"], as_index=False).agg({"Document": " ".join}) documents = topic_model._preprocess_text(documents_per_topic.Document.values) count = topic_model.vectorizer_model.fit(documents) @@ -74,12 +70,8 @@ def test_ctfidf_custom_cv(model, documents, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) topic_model.vectorizer_model = cv topics = topic_model.topics_ - documents = pd.DataFrame( - {"Document": documents, "ID": range(len(documents)), "Topic": topics} - ) - documents_per_topic = documents.groupby(["Topic"], as_index=False).agg( - {"Document": " ".join} - ) + documents = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": topics}) + documents_per_topic = documents.groupby(["Topic"], as_index=False).agg({"Document": " ".join}) documents = topic_model._preprocess_text(documents_per_topic.Document.values) count = topic_model.vectorizer_model.fit(documents)