@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 import typing_extensions
-from typing import Dict, List, Optional
+from typing import Dict, List, Iterable, Optional
 from typing_extensions import Literal
 
 import httpx
@@ -436,6 +436,7 @@ def validate(
         custom_eval_thresholds: Optional[Dict[str, float]] | NotGiven = NOT_GIVEN,
         custom_metadata: Optional[object] | NotGiven = NOT_GIVEN,
         eval_scores: Optional[Dict[str, float]] | NotGiven = NOT_GIVEN,
+        messages: Optional[Iterable[project_validate_params.Message]] | NotGiven = NOT_GIVEN,
         options: Optional[project_validate_params.Options] | NotGiven = NOT_GIVEN,
         quality_preset: Literal["best", "high", "medium", "low", "base"] | NotGiven = NOT_GIVEN,
         task: Optional[str] | NotGiven = NOT_GIVEN,
@@ -465,6 +466,10 @@ def validate(
           eval_scores: Scores assessing different aspects of the RAG system. If not provided, TLM will
               be used to generate scores.
 
+          messages: Optional message history providing conversation context for the query. If supplied,
+              it is used to rewrite the query into a self-contained version of itself; otherwise the
+              query is treated as already self-contained.
+
           options: Typed dict of advanced configuration options for the Trustworthy Language Model.
               Many of these configurations are determined by the quality preset selected
               (learn about quality presets in the TLM [initialization method](./#class-tlm)).
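A sketch of calling the updated sync method with the new `messages` argument. The client entry point and the `query`/`context`/`response` fields are assumptions (this diff only shows the parameters listed above), and messages are assumed to be OpenAI-style role/content dicts:

    from codex import Codex  # assumed package and client names

    client = Codex()

    result = client.projects.validate(
        project_id="YOUR_PROJECT_ID",
        # query/context/response are assumed required fields, not shown in this diff:
        query="And how much does it cost?",  # follow-up that needs prior turns
        context="Acme API pricing: ...",
        response="The Acme API costs ...",
        # New in this commit: prior turns let TLM rewrite the query to be self-contained.
        messages=[
            {"role": "user", "content": "What is the Acme API?"},
            {"role": "assistant", "content": "The Acme API is ..."},
        ],
    )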
@@ -490,27 +495,24 @@ def validate(
                 `use_self_reflection` = True.
               - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0,
                 `use_self_reflection` = False. When using `get_trustworthiness_score()` on
-                "base" preset, a cheaper self-reflection will be used to compute the
-                trustworthiness score.
-
-              By default, the TLM uses the "medium" quality preset. The default base LLM
-              `model` used is "gpt-4o-mini", and `max_tokens` is 512 for all quality presets.
-              You can set custom values for these arguments regardless of the quality preset
-              specified.
-
-              Args: model ({"gpt-4o-mini", "gpt-4o", "gpt-4.1", "gpt-4.1-mini",
-              "gpt-4.1-nano", "o4-mini", "o3", "o3-mini", "o1", "o1-mini", "gpt-4",
-              "gpt-4.5-preview", "gpt-3.5-turbo-16k", "claude-3.7-sonnet",
-              "claude-3.5-sonnet-v2", "claude-3.5-sonnet", "claude-3.5-haiku",
-              "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default =
-              "gpt-4o-mini"): Underlying base LLM to use (better models yield better results,
-              faster models yield faster/cheaper results). - Models still in beta: "o3", "o1",
-              "o4-mini", "o3-mini", "o1-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano",
-              "gpt-4.5-preview", "claude-3.7-sonnet", "claude-3.5-sonnet-v2",
-              "claude-3.5-haiku", "nova-micro", "nova-lite", "nova-pro". - Recommended models
-              for accuracy: "gpt-4.1", "o4-mini", "o3", "claude-3.7-sonnet",
-              "claude-3.5-sonnet-v2". - Recommended models for low latency/costs:
-              "gpt-4.1-nano", "nova-micro".
+                "base" preset, a faster self-reflection is employed.
+
+              By default, TLM uses the "medium" `quality_preset`, the "gpt-4.1-mini" base
+              `model`, and `max_tokens` = 512. You can set custom values for these
+              arguments regardless of the quality preset specified.
+
+              Args: model ({"gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", "o4-mini", "o3",
+              "gpt-4.5-preview", "gpt-4o-mini", "gpt-4o", "o3-mini", "o1", "o1-mini", "gpt-4",
+              "gpt-3.5-turbo-16k", "claude-opus-4-0", "claude-sonnet-4-0",
+              "claude-3.7-sonnet", "claude-3.5-sonnet-v2", "claude-3.5-sonnet",
+              "claude-3.5-haiku", "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"},
+              default = "gpt-4.1-mini"): Underlying base LLM to use (better models yield
+              better results, faster models yield faster results). - Models still in beta:
+              "o3", "o1", "o4-mini", "o3-mini", "o1-mini", "gpt-4.5-preview",
+              "claude-opus-4-0", "claude-sonnet-4-0", "claude-3.7-sonnet",
+              "claude-3.5-haiku". - Recommended models for accuracy: "gpt-4.1", "o4-mini",
+              "o3", "claude-opus-4-0", "claude-sonnet-4-0". - Recommended models for low
+              latency/costs: "gpt-4.1-nano", "nova-micro".
 
               max_tokens (int, default = 512): the maximum number of tokens that can be generated in the TLM response (and in internal trustworthiness scoring).
               Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher runtimes/costs.
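To make the defaults above concrete, here is a hedged sketch that overrides the `quality_preset`, base `model`, and `max_tokens` independently. Option keys follow the docstring above; the rest of the call shape is assumed, as in the earlier sketch:

    result = client.projects.validate(
        project_id="YOUR_PROJECT_ID",
        query="...",
        context="...",
        response="...",
        quality_preset="low",  # overrides the default "medium" preset
        options={
            "model": "gpt-4.1",  # listed above as recommended for accuracy
            "max_tokens": 256,   # tighter than the 512 default, capping runtime/cost
        },
    )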
@@ -536,7 +538,7 @@ def validate(
 
               similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the
               trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model.
-              Supported similarity measures include: "semantic" (based on natural language inference),
+              Supported similarity measures include - "semantic" (based on natural language inference),
               "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model),
               "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies),
               and "string" (based on character/word overlap). Set this to "string" for minimal runtimes/costs.
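For instance, picking the cheapest documented measure would look like this (same assumed call shape as the sketches above):

    result = client.projects.validate(
        project_id="YOUR_PROJECT_ID",
        query="...",
        context="...",
        response="...",
        options={"similarity_measure": "string"},  # character/word overlap: minimal runtimes/costs
    )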
@@ -588,6 +590,7 @@ def validate(
                     "custom_eval_thresholds": custom_eval_thresholds,
                     "custom_metadata": custom_metadata,
                     "eval_scores": eval_scores,
+                    "messages": messages,
                     "options": options,
                     "quality_preset": quality_preset,
                     "task": task,
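The body dict above is filtered before serialization: arguments left at the `NOT_GIVEN` sentinel are dropped, so `messages` only reaches the POST payload when explicitly passed. A conceptual sketch of that filtering (the generated SDK does this inside its own transform helpers; the names below are stand-ins):

    class NotGiven:  # stand-in for the SDK's sentinel type
        pass

    NOT_GIVEN = NotGiven()

    def drop_not_given(body: dict) -> dict:
        # Omitted arguments stay out of the JSON body instead of being sent as null.
        return {k: v for k, v in body.items() if not isinstance(v, NotGiven)}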
@@ -985,6 +988,7 @@ async def validate(
         custom_eval_thresholds: Optional[Dict[str, float]] | NotGiven = NOT_GIVEN,
         custom_metadata: Optional[object] | NotGiven = NOT_GIVEN,
         eval_scores: Optional[Dict[str, float]] | NotGiven = NOT_GIVEN,
+        messages: Optional[Iterable[project_validate_params.Message]] | NotGiven = NOT_GIVEN,
         options: Optional[project_validate_params.Options] | NotGiven = NOT_GIVEN,
         quality_preset: Literal["best", "high", "medium", "low", "base"] | NotGiven = NOT_GIVEN,
         task: Optional[str] | NotGiven = NOT_GIVEN,
@@ -1014,6 +1018,10 @@ async def validate(
           eval_scores: Scores assessing different aspects of the RAG system. If not provided, TLM will
               be used to generate scores.
 
+          messages: Optional message history providing conversation context for the query. If supplied,
+              it is used to rewrite the query into a self-contained version of itself; otherwise the
+              query is treated as already self-contained.
+
           options: Typed dict of advanced configuration options for the Trustworthy Language Model.
               Many of these configurations are determined by the quality preset selected
               (learn about quality presets in the TLM [initialization method](./#class-tlm)).
@@ -1039,27 +1047,24 @@ async def validate(
                 `use_self_reflection` = True.
               - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0,
                 `use_self_reflection` = False. When using `get_trustworthiness_score()` on
-                "base" preset, a cheaper self-reflection will be used to compute the
-                trustworthiness score.
-
-              By default, the TLM uses the "medium" quality preset. The default base LLM
-              `model` used is "gpt-4o-mini", and `max_tokens` is 512 for all quality presets.
-              You can set custom values for these arguments regardless of the quality preset
-              specified.
-
-              Args: model ({"gpt-4o-mini", "gpt-4o", "gpt-4.1", "gpt-4.1-mini",
-              "gpt-4.1-nano", "o4-mini", "o3", "o3-mini", "o1", "o1-mini", "gpt-4",
-              "gpt-4.5-preview", "gpt-3.5-turbo-16k", "claude-3.7-sonnet",
-              "claude-3.5-sonnet-v2", "claude-3.5-sonnet", "claude-3.5-haiku",
-              "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default =
-              "gpt-4o-mini"): Underlying base LLM to use (better models yield better results,
-              faster models yield faster/cheaper results). - Models still in beta: "o3", "o1",
-              "o4-mini", "o3-mini", "o1-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano",
-              "gpt-4.5-preview", "claude-3.7-sonnet", "claude-3.5-sonnet-v2",
-              "claude-3.5-haiku", "nova-micro", "nova-lite", "nova-pro". - Recommended models
-              for accuracy: "gpt-4.1", "o4-mini", "o3", "claude-3.7-sonnet",
-              "claude-3.5-sonnet-v2". - Recommended models for low latency/costs:
-              "gpt-4.1-nano", "nova-micro".
+                "base" preset, a faster self-reflection is employed.
+
+              By default, TLM uses the "medium" `quality_preset`, the "gpt-4.1-mini" base
+              `model`, and `max_tokens` = 512. You can set custom values for these
+              arguments regardless of the quality preset specified.
+
+              Args: model ({"gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", "o4-mini", "o3",
+              "gpt-4.5-preview", "gpt-4o-mini", "gpt-4o", "o3-mini", "o1", "o1-mini", "gpt-4",
+              "gpt-3.5-turbo-16k", "claude-opus-4-0", "claude-sonnet-4-0",
+              "claude-3.7-sonnet", "claude-3.5-sonnet-v2", "claude-3.5-sonnet",
+              "claude-3.5-haiku", "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"},
+              default = "gpt-4.1-mini"): Underlying base LLM to use (better models yield
+              better results, faster models yield faster results). - Models still in beta:
+              "o3", "o1", "o4-mini", "o3-mini", "o1-mini", "gpt-4.5-preview",
+              "claude-opus-4-0", "claude-sonnet-4-0", "claude-3.7-sonnet",
+              "claude-3.5-haiku". - Recommended models for accuracy: "gpt-4.1", "o4-mini",
+              "o3", "claude-opus-4-0", "claude-sonnet-4-0". - Recommended models for low
+              latency/costs: "gpt-4.1-nano", "nova-micro".
 
               max_tokens (int, default = 512): the maximum number of tokens that can be generated in the TLM response (and in internal trustworthiness scoring).
               Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher runtimes/costs.
@@ -1085,7 +1090,7 @@ async def validate(
 
               similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the
               trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model.
-              Supported similarity measures include: "semantic" (based on natural language inference),
+              Supported similarity measures include - "semantic" (based on natural language inference),
               "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model),
               "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies),
               and "string" (based on character/word overlap). Set this to "string" for minimal runtimes/costs.
@@ -1137,6 +1142,7 @@ async def validate(
                     "custom_eval_thresholds": custom_eval_thresholds,
                     "custom_metadata": custom_metadata,
                     "eval_scores": eval_scores,
+                    "messages": messages,
                     "options": options,
                     "quality_preset": quality_preset,
                     "task": task,
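The async method mirrors the sync signature one-for-one; a usage sketch (client class name and the non-diff fields are assumed, as before):

    import asyncio

    from codex import AsyncCodex  # assumed async client name

    async def main() -> None:
        client = AsyncCodex()
        result = await client.projects.validate(
            project_id="YOUR_PROJECT_ID",
            query="Does that plan include support?",
            context="Plan details: ...",
            response="Yes, the Pro plan includes ...",
            messages=[{"role": "user", "content": "Tell me about the Pro plan."}],
        )
        print(result)

    asyncio.run(main())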