diff --git a/src/llama_stack_client/_client.py b/src/llama_stack_client/_client.py
index 35df3a1e..760eaeee 100644
--- a/src/llama_stack_client/_client.py
+++ b/src/llama_stack_client/_client.py
@@ -135,7 +135,7 @@ def __init__(
         if base_url is None:
             base_url = os.environ.get("LLAMA_STACK_BASE_URL")
         if base_url is None:
-            base_url = f"http://any-hosted-llama-stack.com"
+            base_url = "http://any-hosted-llama-stack.com"

         custom_headers = default_headers or {}
         custom_headers["X-LlamaStack-Client-Version"] = __version__
@@ -351,7 +351,7 @@ def __init__(
         if base_url is None:
             base_url = os.environ.get("LLAMA_STACK_BASE_URL")
         if base_url is None:
-            base_url = f"http://any-hosted-llama-stack.com"
+            base_url = "http://any-hosted-llama-stack.com"

         custom_headers = default_headers or {}
         custom_headers["X-LlamaStack-Client-Version"] = __version__
diff --git a/src/llama_stack_client/_files.py b/src/llama_stack_client/_files.py
index 43d5ca1c..715cc207 100644
--- a/src/llama_stack_client/_files.py
+++ b/src/llama_stack_client/_files.py
@@ -71,7 +71,7 @@ def _transform_file(file: FileTypes) -> HttpxFileTypes:
     if is_tuple_t(file):
         return (file[0], _read_file_content(file[1]), *file[2:])

-    raise TypeError(f"Expected file types input to be a FileContent type or to be a tuple")
+    raise TypeError("Expected file types input to be a FileContent type or to be a tuple")


 def _read_file_content(file: FileContent) -> HttpxFileContent:
@@ -113,7 +113,7 @@ async def _async_transform_file(file: FileTypes) -> HttpxFileTypes:
     if is_tuple_t(file):
         return (file[0], await _async_read_file_content(file[1]), *file[2:])

-    raise TypeError(f"Expected file types input to be a FileContent type or to be a tuple")
+    raise TypeError("Expected file types input to be a FileContent type or to be a tuple")


 async def _async_read_file_content(file: FileContent) -> HttpxFileContent:
diff --git a/src/llama_stack_client/_response.py b/src/llama_stack_client/_response.py
index 31f945b7..ea35182f 100644
--- a/src/llama_stack_client/_response.py
+++ b/src/llama_stack_client/_response.py
@@ -229,7 +229,7 @@ def _parse(self, *, to: type[_T] | None = None) -> R | _T:
             # the response class ourselves but that is something that should be supported directly in httpx
             # as it would be easy to incorrectly construct the Response object due to the multitude of arguments.
             if cast_to != httpx.Response:
-                raise ValueError(f"Subclasses of httpx.Response cannot be passed to `cast_to`")
+                raise ValueError("Subclasses of httpx.Response cannot be passed to `cast_to`")
             return cast(R, response)

         if (
@@ -245,9 +245,9 @@ def _parse(self, *, to: type[_T] | None = None) -> R | _T:
         if (
             cast_to is not object
-            and not origin is list
-            and not origin is dict
-            and not origin is Union
+            and origin is not list
+            and origin is not dict
+            and origin is not Union
             and not issubclass(origin, BaseModel)
         ):
             raise RuntimeError(
diff --git a/src/llama_stack_client/resources/agents/turn.py b/src/llama_stack_client/resources/agents/turn.py
index 56e130f3..a17021ec 100644
--- a/src/llama_stack_client/resources/agents/turn.py
+++ b/src/llama_stack_client/resources/agents/turn.py
@@ -247,8 +247,18 @@ def resume(
         extra_body: Body | None = None,
         timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
     ) -> Turn:
-        """
+        """Resume an agent turn with executed tool call responses.
+
+        When a Turn has the
+        status `awaiting_input` due to pending input from client side tool calls, this
+        endpoint can be used to submit the outputs from the tool calls once they are
+        ready.
+
+ Args: + tool_responses: The tool call responses to resume the turn with. + + stream: Whether to stream the response. + extra_headers: Send extra headers extra_query: Add additional query parameters to the request @@ -275,8 +285,18 @@ def resume( extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, ) -> Stream[AgentTurnResponseStreamChunk]: - """ + """Resume an agent turn with executed tool call responses. + + When a Turn has the + status `awaiting_input` due to pending input from client side tool calls, this + endpoint can be used to submit the outputs from the tool calls once they are + ready. + Args: + stream: Whether to stream the response. + + tool_responses: The tool call responses to resume the turn with. + extra_headers: Send extra headers extra_query: Add additional query parameters to the request @@ -303,8 +323,18 @@ def resume( extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, ) -> Turn | Stream[AgentTurnResponseStreamChunk]: - """ + """Resume an agent turn with executed tool call responses. + + When a Turn has the + status `awaiting_input` due to pending input from client side tool calls, this + endpoint can be used to submit the outputs from the tool calls once they are + ready. + Args: + stream: Whether to stream the response. + + tool_responses: The tool call responses to resume the turn with. + extra_headers: Send extra headers extra_query: Add additional query parameters to the request @@ -571,8 +601,18 @@ async def resume( extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, ) -> Turn: - """ + """Resume an agent turn with executed tool call responses. + + When a Turn has the + status `awaiting_input` due to pending input from client side tool calls, this + endpoint can be used to submit the outputs from the tool calls once they are + ready. + Args: + tool_responses: The tool call responses to resume the turn with. + + stream: Whether to stream the response. + extra_headers: Send extra headers extra_query: Add additional query parameters to the request @@ -599,8 +639,18 @@ async def resume( extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, ) -> AsyncStream[AgentTurnResponseStreamChunk]: - """ + """Resume an agent turn with executed tool call responses. + + When a Turn has the + status `awaiting_input` due to pending input from client side tool calls, this + endpoint can be used to submit the outputs from the tool calls once they are + ready. + Args: + stream: Whether to stream the response. + + tool_responses: The tool call responses to resume the turn with. + extra_headers: Send extra headers extra_query: Add additional query parameters to the request @@ -627,8 +677,18 @@ async def resume( extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, ) -> Turn | AsyncStream[AgentTurnResponseStreamChunk]: - """ + """Resume an agent turn with executed tool call responses. + + When a Turn has the + status `awaiting_input` due to pending input from client side tool calls, this + endpoint can be used to submit the outputs from the tool calls once they are + ready. + Args: + stream: Whether to stream the response. + + tool_responses: The tool call responses to resume the turn with. 
+ extra_headers: Send extra headers extra_query: Add additional query parameters to the request diff --git a/src/llama_stack_client/resources/inference.py b/src/llama_stack_client/resources/inference.py index 0b37f1cf..c4d85852 100644 --- a/src/llama_stack_client/resources/inference.py +++ b/src/llama_stack_client/resources/inference.py @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import List, Iterable +from typing import List, Union, Iterable from typing_extensions import Literal, overload import httpx @@ -36,6 +36,7 @@ from ..types.shared.chat_completion_response import ChatCompletionResponse from ..types.shared_params.interleaved_content import InterleavedContent from ..types.chat_completion_response_stream_chunk import ChatCompletionResponseStreamChunk +from ..types.shared_params.interleaved_content_item import InterleavedContentItem __all__ = ["InferenceResource", "AsyncInferenceResource"] @@ -493,8 +494,11 @@ def completion( def embeddings( self, *, - contents: List[InterleavedContent], + contents: Union[List[str], Iterable[InterleavedContentItem]], model_id: str, + output_dimension: int | NotGiven = NOT_GIVEN, + task_type: Literal["query", "document"] | NotGiven = NOT_GIVEN, + text_truncation: Literal["none", "start", "end"] | NotGiven = NOT_GIVEN, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. extra_headers: Headers | None = None, @@ -506,13 +510,22 @@ def embeddings( Generate embeddings for content pieces using the specified model. Args: - contents: List of contents to generate embeddings for. Note that content can be - multimodal. The behavior depends on the model and provider. Some models may only - support text. + contents: List of contents to generate embeddings for. Each content can be a string or an + InterleavedContentItem (and hence can be multimodal). The behavior depends on + the model and provider. Some models may only support text. model_id: The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint. + output_dimension: (Optional) Output dimensionality for the embeddings. Only supported by + Matryoshka models. + + task_type: (Optional) How is the embedding being used? This is only supported by asymmetric + embedding models. + + text_truncation: (Optional) Config for how to truncate text for embedding when text is longer + than the model's max sequence length. + extra_headers: Send extra headers extra_query: Add additional query parameters to the request @@ -527,6 +540,9 @@ def embeddings( { "contents": contents, "model_id": model_id, + "output_dimension": output_dimension, + "task_type": task_type, + "text_truncation": text_truncation, }, inference_embeddings_params.InferenceEmbeddingsParams, ), @@ -990,8 +1006,11 @@ async def completion( async def embeddings( self, *, - contents: List[InterleavedContent], + contents: Union[List[str], Iterable[InterleavedContentItem]], model_id: str, + output_dimension: int | NotGiven = NOT_GIVEN, + task_type: Literal["query", "document"] | NotGiven = NOT_GIVEN, + text_truncation: Literal["none", "start", "end"] | NotGiven = NOT_GIVEN, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. 
extra_headers: Headers | None = None, @@ -1003,13 +1022,22 @@ async def embeddings( Generate embeddings for content pieces using the specified model. Args: - contents: List of contents to generate embeddings for. Note that content can be - multimodal. The behavior depends on the model and provider. Some models may only - support text. + contents: List of contents to generate embeddings for. Each content can be a string or an + InterleavedContentItem (and hence can be multimodal). The behavior depends on + the model and provider. Some models may only support text. model_id: The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint. + output_dimension: (Optional) Output dimensionality for the embeddings. Only supported by + Matryoshka models. + + task_type: (Optional) How is the embedding being used? This is only supported by asymmetric + embedding models. + + text_truncation: (Optional) Config for how to truncate text for embedding when text is longer + than the model's max sequence length. + extra_headers: Send extra headers extra_query: Add additional query parameters to the request @@ -1024,6 +1052,9 @@ async def embeddings( { "contents": contents, "model_id": model_id, + "output_dimension": output_dimension, + "task_type": task_type, + "text_truncation": text_truncation, }, inference_embeddings_params.InferenceEmbeddingsParams, ), diff --git a/src/llama_stack_client/types/agents/turn_resume_params.py b/src/llama_stack_client/types/agents/turn_resume_params.py index 0df97072..b79cdb27 100644 --- a/src/llama_stack_client/types/agents/turn_resume_params.py +++ b/src/llama_stack_client/types/agents/turn_resume_params.py @@ -16,14 +16,17 @@ class TurnResumeParamsBase(TypedDict, total=False): session_id: Required[str] tool_responses: Required[Iterable[ToolResponseMessage]] + """The tool call responses to resume the turn with.""" class TurnResumeParamsNonStreaming(TurnResumeParamsBase, total=False): stream: Literal[False] + """Whether to stream the response.""" class TurnResumeParamsStreaming(TurnResumeParamsBase): stream: Required[Literal[True]] + """Whether to stream the response.""" TurnResumeParams = Union[TurnResumeParamsNonStreaming, TurnResumeParamsStreaming] diff --git a/src/llama_stack_client/types/inference_embeddings_params.py b/src/llama_stack_client/types/inference_embeddings_params.py index 926b511b..7bf5339f 100644 --- a/src/llama_stack_client/types/inference_embeddings_params.py +++ b/src/llama_stack_client/types/inference_embeddings_params.py @@ -2,20 +2,21 @@ from __future__ import annotations -from typing import List -from typing_extensions import Required, TypedDict +from typing import List, Union, Iterable +from typing_extensions import Literal, Required, TypedDict -from .shared_params.interleaved_content import InterleavedContent +from .shared_params.interleaved_content_item import InterleavedContentItem __all__ = ["InferenceEmbeddingsParams"] class InferenceEmbeddingsParams(TypedDict, total=False): - contents: Required[List[InterleavedContent]] + contents: Required[Union[List[str], Iterable[InterleavedContentItem]]] """List of contents to generate embeddings for. - Note that content can be multimodal. The behavior depends on the model and - provider. Some models may only support text. + Each content can be a string or an InterleavedContentItem (and hence can be + multimodal). The behavior depends on the model and provider. Some models may + only support text. 
""" model_id: Required[str] @@ -24,3 +25,21 @@ class InferenceEmbeddingsParams(TypedDict, total=False): The model must be an embedding model registered with Llama Stack and available via the /models endpoint. """ + + output_dimension: int + """(Optional) Output dimensionality for the embeddings. + + Only supported by Matryoshka models. + """ + + task_type: Literal["query", "document"] + """ + (Optional) How is the embedding being used? This is only supported by asymmetric + embedding models. + """ + + text_truncation: Literal["none", "start", "end"] + """ + (Optional) Config for how to truncate text for embedding when text is longer + than the model's max sequence length. + """ diff --git a/src/llama_stack_client/types/shared/query_result.py b/src/llama_stack_client/types/shared/query_result.py index dc20becf..c0a1d44c 100644 --- a/src/llama_stack_client/types/shared/query_result.py +++ b/src/llama_stack_client/types/shared/query_result.py @@ -1,6 +1,6 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. -from typing import Optional +from typing import Dict, List, Union, Optional from ..._models import BaseModel from .interleaved_content import InterleavedContent @@ -9,5 +9,7 @@ class QueryResult(BaseModel): + metadata: Dict[str, Union[bool, float, str, List[object], object, None]] + content: Optional[InterleavedContent] = None """A image content item""" diff --git a/src/llama_stack_client/types/tool_invocation_result.py b/src/llama_stack_client/types/tool_invocation_result.py index 4ecc3d03..a28160bb 100644 --- a/src/llama_stack_client/types/tool_invocation_result.py +++ b/src/llama_stack_client/types/tool_invocation_result.py @@ -1,6 +1,6 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. -from typing import Optional +from typing import Dict, List, Union, Optional from .._models import BaseModel from .shared.interleaved_content import InterleavedContent @@ -15,3 +15,5 @@ class ToolInvocationResult(BaseModel): error_code: Optional[int] = None error_message: Optional[str] = None + + metadata: Optional[Dict[str, Union[bool, float, str, List[object], object, None]]] = None diff --git a/src/llama_stack_client/types/tool_response.py b/src/llama_stack_client/types/tool_response.py index 2617f6e3..f984f30a 100644 --- a/src/llama_stack_client/types/tool_response.py +++ b/src/llama_stack_client/types/tool_response.py @@ -1,6 +1,6 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
-from typing import Union +from typing import Dict, List, Union, Optional from typing_extensions import Literal from .._models import BaseModel @@ -16,3 +16,5 @@ class ToolResponse(BaseModel): """A image content item""" tool_name: Union[Literal["brave_search", "wolfram_alpha", "photogen", "code_interpreter"], str] + + metadata: Optional[Dict[str, Union[bool, float, str, List[object], object, None]]] = None diff --git a/tests/api_resources/test_inference.py b/tests/api_resources/test_inference.py index 6cf7c8ba..4d078587 100644 --- a/tests/api_resources/test_inference.py +++ b/tests/api_resources/test_inference.py @@ -329,6 +329,17 @@ def test_method_embeddings(self, client: LlamaStackClient) -> None: ) assert_matches_type(EmbeddingsResponse, inference, path=["response"]) + @parametrize + def test_method_embeddings_with_all_params(self, client: LlamaStackClient) -> None: + inference = client.inference.embeddings( + contents=["string"], + model_id="model_id", + output_dimension=0, + task_type="query", + text_truncation="none", + ) + assert_matches_type(EmbeddingsResponse, inference, path=["response"]) + @parametrize def test_raw_response_embeddings(self, client: LlamaStackClient) -> None: response = client.inference.with_raw_response.embeddings( @@ -667,6 +678,17 @@ async def test_method_embeddings(self, async_client: AsyncLlamaStackClient) -> N ) assert_matches_type(EmbeddingsResponse, inference, path=["response"]) + @parametrize + async def test_method_embeddings_with_all_params(self, async_client: AsyncLlamaStackClient) -> None: + inference = await async_client.inference.embeddings( + contents=["string"], + model_id="model_id", + output_dimension=0, + task_type="query", + text_truncation="none", + ) + assert_matches_type(EmbeddingsResponse, inference, path=["response"]) + @parametrize async def test_raw_response_embeddings(self, async_client: AsyncLlamaStackClient) -> None: response = await async_client.inference.with_raw_response.embeddings(
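
Usage sketch for the new `embeddings` parameters added in `resources/inference.py` and exercised by the new tests above. This is illustrative only and not part of the diff; the base URL, model id, and input text are assumed placeholders.

```python
from llama_stack_client import LlamaStackClient

# Assumed values: point base_url at your own Llama Stack server and use an
# embedding model id that is registered and visible via the /models endpoint.
client = LlamaStackClient(base_url="http://localhost:8321")

response = client.inference.embeddings(
    contents=["What is Llama Stack?"],  # plain strings, or InterleavedContentItem objects
    model_id="all-MiniLM-L6-v2",        # placeholder embedding model id
    output_dimension=256,               # only honored by Matryoshka-style models
    task_type="query",                  # "query" or "document"; for asymmetric embedding models
    text_truncation="end",              # truncate inputs longer than the model's max sequence length
)
print(len(response.embeddings), "embedding vectors returned")
```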
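
Sketch of the non-streaming `turn.resume` call documented in `resources/agents/turn.py`, reusing the `client` from the sketch above. The identifiers and the tool response dict are placeholders; check `ToolResponseMessage` in the installed version for the exact field names.

```python
# Resume a turn that is `awaiting_input` after running a client-side tool call.
# All ids below are placeholders, and the tool response dict mirrors the assumed
# ToolResponseMessage shape rather than values taken from this diff.
turn = client.agents.turn.resume(
    turn_id="turn-123",
    agent_id="agent-123",
    session_id="session-123",
    tool_responses=[
        {
            "call_id": "call-123",        # id of the pending client-side tool call
            "tool_name": "get_weather",   # placeholder tool name
            "content": '{"temp_c": 21}',  # output produced by the client-side tool
            "role": "tool",
        }
    ],
    stream=False,
)
print(turn.output_message)
```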
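
The new `metadata` fields on `QueryResult`, `ToolInvocationResult`, and `ToolResponse` are plain dictionaries of primitive values and may be `None` where optional. A small sketch of reading the field after a tool invocation, again reusing the `client` from above; the tool name and arguments are placeholders.

```python
# Inspect provider-supplied metadata on a tool invocation result.
# "knowledge_search" and the kwargs are placeholders.
result = client.tool_runtime.invoke_tool(
    tool_name="knowledge_search",
    kwargs={"query": "What does turn resume do?"},
)
if result.metadata is not None:
    for key, value in result.metadata.items():
        print(key, value)
```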