Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extend OpenAI finish_reason handling #1985

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
25 changes: 18 additions & 7 deletions bertopic/representation/_openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@
from scipy.sparse import csr_matrix
from typing import Mapping, List, Tuple, Any, Union, Callable
from bertopic.representation._base import BaseRepresentation
from bertopic.representation._utils import retry_with_exponential_backoff, truncate_document
from bertopic.representation._utils import retry_with_exponential_backoff, truncate_document, MyLogger

logger = MyLogger("WARNING")
steven-solomon marked this conversation as resolved.
Show resolved Hide resolved

DEFAULT_PROMPT = """
This is a list of texts where each collection of texts describe a topic. After each collection of texts, the name of the topic they represent is mentioned as a short-highly-descriptive title
Expand Down Expand Up @@ -37,7 +38,7 @@
Topic name:"""

DEFAULT_CHAT_PROMPT = """
I have a topic that contains the following documents:
I have a topic that contains the following documents:
steven-solomon marked this conversation as resolved.
Show resolved Hide resolved
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Expand Down Expand Up @@ -193,7 +194,7 @@ def extract_topics(self,
updated_topics: Updated topic representations
"""
# Extract the top n representative documents per topic
repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity)
repr_docs_mappings, _, _, repr_doc_ids = topic_model._extract_representative_docs(c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity)

# Generate using OpenAI's Language Model
updated_topics = {}
Expand All @@ -217,11 +218,21 @@ def extract_topics(self,
else:
response = self.client.chat.completions.create(**kwargs)

# Check whether content was actually generated
# Adresses #1570 for potential issues with OpenAI's content filter
if hasattr(response.choices[0].message, "content"):
label = response.choices[0].message.content.strip().replace("topic: ", "")
choice = response.choices[0]

if choice.finish_reason == "stop":
label = choice.message.content.strip().replace("topic: ", "")
elif choice.finish_reason == "length":
logger.warn(f"Extracting Topics - Length limit reached for doc_ids ({repr_doc_ids})")
if hasattr(response.choices[0].message, "content"):
steven-solomon marked this conversation as resolved.
Show resolved Hide resolved
label = choice.message.content.strip().replace("topic: ", "")
else:
label = "Incomplete output due to token limit being reached"
steven-solomon marked this conversation as resolved.
Show resolved Hide resolved
elif choice.finish_reason == "content_filter":
logger.warn(f"Extracting Topics - Content filtered for doc_ids ({repr_doc_ids})")
steven-solomon marked this conversation as resolved.
Show resolved Hide resolved
label = "Output content filtered by OpenAI"
else:
logger.warn(f"Extracting Topics - No label due to finish_reason {choice.finish_reason} for doc_ids ({repr_doc_ids})")
steven-solomon marked this conversation as resolved.
Show resolved Hide resolved
label = "No label returned"
else:
if self.exponential_backoff:
Expand Down