
Commit d3a1591

feat(api): api update
1 parent 6e1f882 commit d3a1591

8 files changed: +427 additions, -361 deletions

.stats.yml

Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 configured_endpoints: 54
-openapi_spec_hash: d01153406f196329a5d5d1efd8f3e50e
+openapi_spec_hash: 924a89b5f031d9215a5a701f834b132f
 config_hash: 930284cfa37f835d949c8a1b124f4807

src/codex/resources/projects/projects.py

Lines changed: 92 additions & 80 deletions
Large diffs are not rendered by default.

src/codex/resources/tlm.py

Lines changed: 184 additions & 160 deletions
Large diffs are not rendered by default.

src/codex/types/project_validate_params.py

Lines changed: 48 additions & 40 deletions
@@ -123,60 +123,66 @@ class ProjectValidateParams(TypedDict, total=False):
 `model`, and `max_tokens` is set to 512. You can set custom values for these
 arguments regardless of the quality preset specified.

-Args: model ({"gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", "o4-mini", "o3",
-"gpt-4.5-preview", "gpt-4o-mini", "gpt-4o", "o3-mini", "o1", "o1-mini", "gpt-4",
-"gpt-3.5-turbo-16k", "claude-opus-4-0", "claude-sonnet-4-0",
-"claude-3.7-sonnet", "claude-3.5-sonnet-v2", "claude-3.5-sonnet",
-"claude-3.5-haiku", "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"},
-default = "gpt-4.1-mini"): Underlying base LLM to use (better models yield
-better results, faster models yield faster results). - Models still in beta:
-"o3", "o1", "o4-mini", "o3-mini", "o1-mini", "gpt-4.5-preview",
-"claude-opus-4-0", "claude-sonnet-4-0", "claude-3.7-sonnet",
-"claude-3.5-haiku". - Recommended models for accuracy: "gpt-4.1", "o4-mini",
-"o3", "claude-opus-4-0", "claude-sonnet-4-0". - Recommended models for low
-latency/costs: "gpt-4.1-nano", "nova-micro".
-
-max_tokens (int, default = 512): the maximum number of tokens that can be generated in the TLM response (and in internal trustworthiness scoring).
-Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher runtimes/costs.
-If you experience token/rate limit errors while using TLM, try lowering this number.
+Args: model ({"gpt-5", "gpt-5-mini", "gpt-5-nano", "gpt-4.1", "gpt-4.1-mini",
+"gpt-4.1-nano", "o4-mini", "o3", "gpt-4.5-preview", "gpt-4o-mini", "gpt-4o",
+"o3-mini", "o1", "o1-mini", "gpt-4", "gpt-3.5-turbo-16k", "claude-opus-4-0",
+"claude-sonnet-4-0", "claude-3.7-sonnet", "claude-3.5-sonnet-v2",
+"claude-3.5-sonnet", "claude-3.5-haiku", "claude-3-haiku", "nova-micro",
+"nova-lite", "nova-pro"}, default = "gpt-4.1-mini"): Underlying base LLM to use
+(better models yield better results, faster models yield faster results). -
+Models still in beta: "gpt-5", "gpt-5-mini", "gpt-5-nano", "o3", "o1",
+"o4-mini", "o3-mini", "o1-mini", "gpt-4.5-preview", "claude-opus-4-0",
+"claude-sonnet-4-0", "claude-3.7-sonnet", "claude-3.5-haiku". - Recommended
+models for accuracy: "gpt-5", "gpt-4.1", "o4-mini", "o3", "claude-opus-4-0",
+"claude-sonnet-4-0". - Recommended models for low latency/costs: "gpt-4.1-nano",
+"nova-micro".
+
+log (list[str], default = []): optionally specify additional logs or metadata that TLM should return.
+For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness.
+
+custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evalution criteria beyond the built-in trustworthiness scoring.
+The expected input format is a list of dictionaries, where each dictionary has the following keys:
+- name: Name of the evaluation criteria.
+- criteria: Instructions specifying the evaluation criteria.
+
+max_tokens (int, default = 512): the maximum number of tokens that can be generated in the response from `TLM.prompt()` as well as during internal trustworthiness scoring.
+If you experience token/rate-limit errors, try lowering this number.
 For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512.

-num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`.
-`TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one.
-This parameter must be between 1 and 20. It has no effect on `TLM.score()`.
-Higher values here can produce more accurate responses from `TLM.prompt()`, but at higher runtimes/costs.
-When it is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it.
+reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much internal LLM calls are allowed to reason (number of thinking tokens)
+when generating alternative possible responses and reflecting on responses during trustworthiness scoring.
+Reduce this value to reduce runtimes. Higher values may improve trust scoring.
+
+num_self_reflections (int, default = 3): the number of different evaluations to perform where the LLM reflects on the response, a factor affecting trust scoring.
+The maximum number currently supported is 3. Lower values can reduce runtimes.
+Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis.
+This parameter has no effect when `disable_trustworthiness` is True.

-num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trustworthiness scoring.
-Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher runtimes/costs.
+num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trust scoring.
+Must be between 0 and 20. Lower values can reduce runtimes.
 Measuring consistency helps quantify the epistemic uncertainty associated with
 strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response.
 TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible.
-
-num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence.
-The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores.
-Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis.
+This parameter has no effect when `disable_trustworthiness` is True.

 similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the
 trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model.
 Supported similarity measures include - "semantic" (based on natural language inference),
 "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model),
 "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies),
-and "string" (based on character/word overlap). Set this to "string" for minimal runtimes/costs.
-
-reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much internal LLM calls are allowed to reason (number of thinking tokens)
-when generating alternative possible responses and reflecting on responses during trustworthiness scoring.
-Higher reasoning efforts may yield more reliable TLM trustworthiness scores. Reduce this value to reduce runtimes/costs.
-
-log (list[str], default = []): optionally specify additional logs or metadata that TLM should return.
-For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness.
+and "string" (based on character/word overlap). Set this to "string" for minimal runtimes.
+This parameter has no effect when `num_consistency_samples = 0`.

-custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evalution criteria beyond the built-in trustworthiness scoring.
-The expected input format is a list of dictionaries, where each dictionary has the following keys:
-- name: Name of the evaluation criteria.
-- criteria: Instructions specifying the evaluation criteria.
+num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`.
+`TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one.
+You can auto-improve responses by increasing this parameter, but at higher runtimes/costs.
+This parameter must be between 1 and 20. It has no effect on `TLM.score()`.
+When this parameter is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it.
+This parameter has no effect when `disable_trustworthiness` is True.

-use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead.
+disable_trustworthiness (bool, default = False): if True, trustworthiness scoring is disabled and TLM will not compute trust scores for responses.
+This is useful when you only want to use custom evaluation criteria or when you want to minimize computational overhead and only need the base LLM response.
+The following parameters will be ignored when `disable_trustworthiness` is True: `num_consistency_samples`, `num_self_reflections`, `num_candidate_responses`, `reasoning_effort`, `similarity_measure`.
 """

 prompt: Optional[str]

@@ -647,6 +653,8 @@ class MessageChatCompletionDeveloperMessageParam(TypedDict, total=False):
 class Options(TypedDict, total=False):
     custom_eval_criteria: Iterable[object]

+    disable_trustworthiness: bool
+
     log: List[str]

     max_tokens: int
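The substantive change in this file is the new `disable_trustworthiness` option; the rest of the diff reorders and rewords the surrounding docstring. Below is a minimal usage sketch, not taken from this commit: it assembles an `Options` payload using only keys documented in the docstring above, and the import path simply mirrors src/codex/types/project_validate_params.py, so treat both as assumptions rather than verified public API.

# Hypothetical sketch: validate options that rely on custom evaluation criteria
# only, using the new disable_trustworthiness flag introduced in this commit.
from codex.types.project_validate_params import Options  # import path assumed from this diff

options: Options = {
    "model": "gpt-4.1-mini",  # documented default base LLM
    "max_tokens": 512,        # 64-4096 for OpenAI models, 64-512 for Claude models
    "custom_eval_criteria": [
        {
            "name": "Conciseness",  # name of the evaluation criteria
            "criteria": "The response should be no longer than three sentences.",
        }
    ],
    # New in this commit: skip trustworthiness scoring entirely. Per the docstring,
    # num_consistency_samples, num_self_reflections, num_candidate_responses,
    # reasoning_effort, and similarity_measure are then ignored.
    "disable_trustworthiness": True,
}

This combination is the use case the new docstring calls out: custom evaluation criteria without the overhead of trust scoring.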

src/codex/types/tlm_prompt_params.py

Lines changed: 48 additions & 40 deletions
@@ -45,60 +45,66 @@ class TlmPromptParams(TypedDict, total=False):
 `model`, and `max_tokens` is set to 512. You can set custom values for these
 arguments regardless of the quality preset specified.

-Args: model ({"gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", "o4-mini", "o3",
-"gpt-4.5-preview", "gpt-4o-mini", "gpt-4o", "o3-mini", "o1", "o1-mini", "gpt-4",
-"gpt-3.5-turbo-16k", "claude-opus-4-0", "claude-sonnet-4-0",
-"claude-3.7-sonnet", "claude-3.5-sonnet-v2", "claude-3.5-sonnet",
-"claude-3.5-haiku", "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"},
-default = "gpt-4.1-mini"): Underlying base LLM to use (better models yield
-better results, faster models yield faster results). - Models still in beta:
-"o3", "o1", "o4-mini", "o3-mini", "o1-mini", "gpt-4.5-preview",
-"claude-opus-4-0", "claude-sonnet-4-0", "claude-3.7-sonnet",
-"claude-3.5-haiku". - Recommended models for accuracy: "gpt-4.1", "o4-mini",
-"o3", "claude-opus-4-0", "claude-sonnet-4-0". - Recommended models for low
-latency/costs: "gpt-4.1-nano", "nova-micro".
-
-max_tokens (int, default = 512): the maximum number of tokens that can be generated in the TLM response (and in internal trustworthiness scoring).
-Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher runtimes/costs.
-If you experience token/rate limit errors while using TLM, try lowering this number.
+Args: model ({"gpt-5", "gpt-5-mini", "gpt-5-nano", "gpt-4.1", "gpt-4.1-mini",
+"gpt-4.1-nano", "o4-mini", "o3", "gpt-4.5-preview", "gpt-4o-mini", "gpt-4o",
+"o3-mini", "o1", "o1-mini", "gpt-4", "gpt-3.5-turbo-16k", "claude-opus-4-0",
+"claude-sonnet-4-0", "claude-3.7-sonnet", "claude-3.5-sonnet-v2",
+"claude-3.5-sonnet", "claude-3.5-haiku", "claude-3-haiku", "nova-micro",
+"nova-lite", "nova-pro"}, default = "gpt-4.1-mini"): Underlying base LLM to use
+(better models yield better results, faster models yield faster results). -
+Models still in beta: "gpt-5", "gpt-5-mini", "gpt-5-nano", "o3", "o1",
+"o4-mini", "o3-mini", "o1-mini", "gpt-4.5-preview", "claude-opus-4-0",
+"claude-sonnet-4-0", "claude-3.7-sonnet", "claude-3.5-haiku". - Recommended
+models for accuracy: "gpt-5", "gpt-4.1", "o4-mini", "o3", "claude-opus-4-0",
+"claude-sonnet-4-0". - Recommended models for low latency/costs: "gpt-4.1-nano",
+"nova-micro".
+
+log (list[str], default = []): optionally specify additional logs or metadata that TLM should return.
+For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness.
+
+custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evalution criteria beyond the built-in trustworthiness scoring.
+The expected input format is a list of dictionaries, where each dictionary has the following keys:
+- name: Name of the evaluation criteria.
+- criteria: Instructions specifying the evaluation criteria.
+
+max_tokens (int, default = 512): the maximum number of tokens that can be generated in the response from `TLM.prompt()` as well as during internal trustworthiness scoring.
+If you experience token/rate-limit errors, try lowering this number.
 For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512.

-num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`.
-`TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one.
-This parameter must be between 1 and 20. It has no effect on `TLM.score()`.
-Higher values here can produce more accurate responses from `TLM.prompt()`, but at higher runtimes/costs.
-When it is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it.
+reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much internal LLM calls are allowed to reason (number of thinking tokens)
+when generating alternative possible responses and reflecting on responses during trustworthiness scoring.
+Reduce this value to reduce runtimes. Higher values may improve trust scoring.
+
+num_self_reflections (int, default = 3): the number of different evaluations to perform where the LLM reflects on the response, a factor affecting trust scoring.
+The maximum number currently supported is 3. Lower values can reduce runtimes.
+Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis.
+This parameter has no effect when `disable_trustworthiness` is True.

-num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trustworthiness scoring.
-Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher runtimes/costs.
+num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trust scoring.
+Must be between 0 and 20. Lower values can reduce runtimes.
 Measuring consistency helps quantify the epistemic uncertainty associated with
 strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response.
 TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible.
-
-num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence.
-The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores.
-Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis.
+This parameter has no effect when `disable_trustworthiness` is True.

 similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the
 trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model.
 Supported similarity measures include - "semantic" (based on natural language inference),
 "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model),
 "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies),
-and "string" (based on character/word overlap). Set this to "string" for minimal runtimes/costs.
-
-reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much internal LLM calls are allowed to reason (number of thinking tokens)
-when generating alternative possible responses and reflecting on responses during trustworthiness scoring.
-Higher reasoning efforts may yield more reliable TLM trustworthiness scores. Reduce this value to reduce runtimes/costs.
-
-log (list[str], default = []): optionally specify additional logs or metadata that TLM should return.
-For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness.
+and "string" (based on character/word overlap). Set this to "string" for minimal runtimes.
+This parameter has no effect when `num_consistency_samples = 0`.

-custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evalution criteria beyond the built-in trustworthiness scoring.
-The expected input format is a list of dictionaries, where each dictionary has the following keys:
-- name: Name of the evaluation criteria.
-- criteria: Instructions specifying the evaluation criteria.
+num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`.
+`TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one.
+You can auto-improve responses by increasing this parameter, but at higher runtimes/costs.
+This parameter must be between 1 and 20. It has no effect on `TLM.score()`.
+When this parameter is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it.
+This parameter has no effect when `disable_trustworthiness` is True.

-use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead.
+disable_trustworthiness (bool, default = False): if True, trustworthiness scoring is disabled and TLM will not compute trust scores for responses.
+This is useful when you only want to use custom evaluation criteria or when you want to minimize computational overhead and only need the base LLM response.
+The following parameters will be ignored when `disable_trustworthiness` is True: `num_consistency_samples`, `num_self_reflections`, `num_candidate_responses`, `reasoning_effort`, `similarity_measure`.
 """

 quality_preset: Literal["best", "high", "medium", "low", "base"]

@@ -110,6 +116,8 @@ class TlmPromptParams(TypedDict, total=False):
 class Options(TypedDict, total=False):
     custom_eval_criteria: Iterable[object]

+    disable_trustworthiness: bool
+
     log: List[str]

     max_tokens: int
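The same options docstring is updated here for TLM prompting, with the reordering putting the runtime-oriented knobs (`reasoning_effort`, `num_self_reflections`, `num_consistency_samples`, `similarity_measure`) ahead of the rest. A hedged sketch of a latency-oriented configuration follows; it is not part of this commit, the import path only mirrors src/codex/types/tlm_prompt_params.py, and the keys are taken from the docstring rather than from the truncated Options hunk above, so verify them against the generated SDK before relying on them.

# Hypothetical sketch: a low-latency Options payload for TLM prompting.
from codex.types.tlm_prompt_params import Options  # import path assumed from this diff

fast_options: Options = {
    "model": "gpt-4.1-nano",         # recommended above for low latency/costs
    "reasoning_effort": "none",      # fewest thinking tokens in internal LLM calls
    "num_self_reflections": 1,       # fewer reflections lower runtime (max is 3)
    "num_consistency_samples": 0,    # skip consistency sampling entirely
    "similarity_measure": "string",  # cheapest measure; ignored anyway when samples = 0
}

Per the docstring, explicit options like these apply regardless of the chosen `quality_preset`, so they override whatever that preset would otherwise set.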

0 commit comments
