From a918b4323118c18f77c2abe7e1a3054c1eebeaac Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Sat, 27 Sep 2025 20:42:37 +0000 Subject: [PATCH 1/8] feat(api): removing openai/v1 --- .stats.yml | 8 +- README.md | 42 ++ api.md | 64 +- .../resources/chat/completions.py | 12 +- .../resources/completions.py | 4 +- .../resources/embeddings.py | 4 +- src/llama_stack_client/resources/files.py | 36 +- src/llama_stack_client/resources/inference.py | 351 +---------- .../resources/models/models.py | 4 +- .../resources/models/openai.py | 4 +- .../resources/moderations.py | 4 +- .../resources/responses/input_items.py | 4 +- .../resources/responses/responses.py | 16 +- .../resources/vector_stores/files.py | 24 +- .../resources/vector_stores/vector_stores.py | 24 +- src/llama_stack_client/types/__init__.py | 11 - .../types/completion_response.py | 24 - .../types/file_create_params.py | 5 + .../inference_batch_chat_completion_params.py | 85 --- ...nference_batch_chat_completion_response.py | 13 - .../inference_batch_completion_params.py | 41 -- .../types/inference_completion_params.py | 65 -- .../types/list_models_response.py | 10 - .../types/model_list_response.py | 19 +- .../types/shared/__init__.py | 1 - .../types/shared/batch_completion.py | 13 - .../types/shared/tool_param_definition.py | 4 + .../shared_params/tool_param_definition.py | 4 + src/llama_stack_client/types/tool.py | 6 + src/llama_stack_client/types/tool_def.py | 6 + .../types/tool_def_param.py | 6 + tests/api_resources/test_agents.py | 4 + tests/api_resources/test_files.py | 12 + tests/api_resources/test_inference.py | 564 +----------------- 34 files changed, 239 insertions(+), 1255 deletions(-) delete mode 100644 src/llama_stack_client/types/completion_response.py delete mode 100644 src/llama_stack_client/types/inference_batch_chat_completion_params.py delete mode 100644 src/llama_stack_client/types/inference_batch_chat_completion_response.py delete mode 100644 src/llama_stack_client/types/inference_batch_completion_params.py delete mode 100644 src/llama_stack_client/types/inference_completion_params.py delete mode 100644 src/llama_stack_client/types/list_models_response.py delete mode 100644 src/llama_stack_client/types/shared/batch_completion.py diff --git a/.stats.yml b/.stats.yml index fa9edfc7..e5bf0be0 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ -configured_endpoints: 111 -openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/llamastack%2Fllama-stack-client-f252873ea1e1f38fd207331ef2621c511154d5be3f4076e59cc15754fc58eee4.yml -openapi_spec_hash: 10cbb4337a06a9fdd7d08612dd6044c3 -config_hash: 0358112cc0f3d880b4d55debdbe1cfa3 +configured_endpoints: 107 +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/llamastack%2Fllama-stack-client-1eddf141208c131ee4a64ef996f8f419b444f60450de6807a9f6bc711ed8b661.yml +openapi_spec_hash: 94765c67ea99b1358169d41d810dd395 +config_hash: 7ec5a583f9c26b38993013bdfb0e7d46 diff --git a/README.md b/README.md index 928458d2..cbd6bf78 100644 --- a/README.md +++ b/README.md @@ -118,6 +118,48 @@ Nested request parameters are [TypedDicts](https://docs.python.org/3/library/typ Typed requests and responses provide autocomplete and documentation within your editor. If you would like to see type errors in VS Code to help catch bugs earlier, set `python.analysis.typeCheckingMode` to `basic`. 
+## Nested params + +Nested parameters are dictionaries, typed using `TypedDict`, for example: + +```python +from llama_stack_client import LlamaStackClient + +client = LlamaStackClient() + +chat_completion_response = client.inference.chat_completion( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model_id="model_id", + logprobs={}, +) +print(chat_completion_response.logprobs) +``` + +## File uploads + +Request parameters that correspond to file uploads can be passed as `bytes`, or a [`PathLike`](https://docs.python.org/3/library/os.html#os.PathLike) instance or a tuple of `(filename, contents, media type)`. + +```python +from pathlib import Path +from llama_stack_client import LlamaStackClient + +client = LlamaStackClient() + +client.files.create( + expires_after_anchor="expires_after_anchor", + expires_after_seconds=0, + file=Path("/path/to/file"), + purpose="assistants", +) +``` + +The async client uses the exact same interface. If you pass a [`PathLike`](https://docs.python.org/3/library/os.html#os.PathLike) instance, the file contents will be read asynchronously automatically. + ## Handling errors When the library is unable to connect to the API (for example, due to network connection problems or a timeout), a subclass of `llama_stack_client.APIConnectionError` is raised. diff --git a/api.md b/api.md index 22c2120f..97d40b31 100644 --- a/api.md +++ b/api.md @@ -3,7 +3,6 @@ ```python from llama_stack_client.types import ( AgentConfig, - BatchCompletion, ChatCompletionResponse, CompletionMessage, ContentDelta, @@ -91,10 +90,10 @@ from llama_stack_client.types import ( Methods: -- client.responses.create(\*\*params) -> ResponseObject -- client.responses.retrieve(response_id) -> ResponseObject -- client.responses.list(\*\*params) -> SyncOpenAICursorPage[ResponseListResponse] -- client.responses.delete(response_id) -> ResponseDeleteResponse +- client.responses.create(\*\*params) -> ResponseObject +- client.responses.retrieve(response_id) -> ResponseObject +- client.responses.list(\*\*params) -> SyncOpenAICursorPage[ResponseListResponse] +- client.responses.delete(response_id) -> ResponseDeleteResponse ## InputItems @@ -106,7 +105,7 @@ from llama_stack_client.types.responses import InputItemListResponse Methods: -- client.responses.input_items.list(response_id, \*\*params) -> InputItemListResponse +- client.responses.input_items.list(response_id, \*\*params) -> InputItemListResponse # Agents @@ -244,20 +243,15 @@ Types: ```python from llama_stack_client.types import ( ChatCompletionResponseStreamChunk, - CompletionResponse, EmbeddingsResponse, TokenLogProbs, - InferenceBatchChatCompletionResponse, InferenceRerankResponse, ) ``` Methods: -- client.inference.batch_chat_completion(\*\*params) -> InferenceBatchChatCompletionResponse -- client.inference.batch_completion(\*\*params) -> BatchCompletion - client.inference.chat_completion(\*\*params) -> ChatCompletionResponse -- client.inference.completion(\*\*params) -> CompletionResponse - client.inference.embeddings(\*\*params) -> EmbeddingsResponse - client.inference.rerank(\*\*params) -> InferenceRerankResponse @@ -271,7 +265,7 @@ from llama_stack_client.types import CreateEmbeddingsResponse Methods: -- client.embeddings.create(\*\*params) -> CreateEmbeddingsResponse +- client.embeddings.create(\*\*params) -> CreateEmbeddingsResponse # Chat @@ -295,9 +289,9 @@ from llama_stack_client.types.chat import ( Methods: -- client.chat.completions.create(\*\*params) -> CompletionCreateResponse -- 
client.chat.completions.retrieve(completion_id) -> CompletionRetrieveResponse -- client.chat.completions.list(\*\*params) -> SyncOpenAICursorPage[CompletionListResponse] +- client.chat.completions.create(\*\*params) -> CompletionCreateResponse +- client.chat.completions.retrieve(completion_id) -> CompletionRetrieveResponse +- client.chat.completions.list(\*\*params) -> SyncOpenAICursorPage[CompletionListResponse] # Completions @@ -309,7 +303,7 @@ from llama_stack_client.types import CompletionCreateResponse Methods: -- client.completions.create(\*\*params) -> CompletionCreateResponse +- client.completions.create(\*\*params) -> CompletionCreateResponse # VectorIo @@ -359,12 +353,12 @@ from llama_stack_client.types import ( Methods: -- client.vector_stores.create(\*\*params) -> VectorStore -- client.vector_stores.retrieve(vector_store_id) -> VectorStore -- client.vector_stores.update(vector_store_id, \*\*params) -> VectorStore -- client.vector_stores.list(\*\*params) -> SyncOpenAICursorPage[VectorStore] -- client.vector_stores.delete(vector_store_id) -> VectorStoreDeleteResponse -- client.vector_stores.search(vector_store_id, \*\*params) -> VectorStoreSearchResponse +- client.vector_stores.create(\*\*params) -> VectorStore +- client.vector_stores.retrieve(vector_store_id) -> VectorStore +- client.vector_stores.update(vector_store_id, \*\*params) -> VectorStore +- client.vector_stores.list(\*\*params) -> SyncOpenAICursorPage[VectorStore] +- client.vector_stores.delete(vector_store_id) -> VectorStoreDeleteResponse +- client.vector_stores.search(vector_store_id, \*\*params) -> VectorStoreSearchResponse ## Files @@ -380,12 +374,12 @@ from llama_stack_client.types.vector_stores import ( Methods: -- client.vector_stores.files.create(vector_store_id, \*\*params) -> VectorStoreFile -- client.vector_stores.files.retrieve(file_id, \*, vector_store_id) -> VectorStoreFile -- client.vector_stores.files.update(file_id, \*, vector_store_id, \*\*params) -> VectorStoreFile -- client.vector_stores.files.list(vector_store_id, \*\*params) -> SyncOpenAICursorPage[VectorStoreFile] -- client.vector_stores.files.delete(file_id, \*, vector_store_id) -> FileDeleteResponse -- client.vector_stores.files.content(file_id, \*, vector_store_id) -> FileContentResponse +- client.vector_stores.files.create(vector_store_id, \*\*params) -> VectorStoreFile +- client.vector_stores.files.retrieve(file_id, \*, vector_store_id) -> VectorStoreFile +- client.vector_stores.files.update(file_id, \*, vector_store_id, \*\*params) -> VectorStoreFile +- client.vector_stores.files.list(vector_store_id, \*\*params) -> SyncOpenAICursorPage[VectorStoreFile] +- client.vector_stores.files.delete(file_id, \*, vector_store_id) -> FileDeleteResponse +- client.vector_stores.files.content(file_id, \*, vector_store_id) -> FileContentResponse # Models @@ -412,7 +406,7 @@ from llama_stack_client.types.models import OpenAIListResponse Methods: -- client.models.openai.list() -> OpenAIListResponse +- client.models.openai.list() -> OpenAIListResponse # PostTraining @@ -481,7 +475,7 @@ from llama_stack_client.types import CreateResponse Methods: -- client.moderations.create(\*\*params) -> CreateResponse +- client.moderations.create(\*\*params) -> CreateResponse # Safety @@ -608,8 +602,8 @@ from llama_stack_client.types import DeleteFileResponse, File, ListFilesResponse Methods: -- client.files.create(\*\*params) -> File -- client.files.retrieve(file_id) -> File -- client.files.list(\*\*params) -> SyncOpenAICursorPage[File] -- client.files.delete(file_id) -> 
DeleteFileResponse -- client.files.content(file_id) -> object +- client.files.create(\*\*params) -> File +- client.files.retrieve(file_id) -> File +- client.files.list(\*\*params) -> SyncOpenAICursorPage[File] +- client.files.delete(file_id) -> DeleteFileResponse +- client.files.content(file_id) -> object diff --git a/src/llama_stack_client/resources/chat/completions.py b/src/llama_stack_client/resources/chat/completions.py index 5445a2d1..2fb19980 100644 --- a/src/llama_stack_client/resources/chat/completions.py +++ b/src/llama_stack_client/resources/chat/completions.py @@ -372,7 +372,7 @@ def create( timeout: float | httpx.Timeout | None | NotGiven = not_given, ) -> CompletionCreateResponse | Stream[ChatCompletionChunk]: return self._post( - "/v1/openai/v1/chat/completions", + "/v1/chat/completions", body=maybe_transform( { "messages": messages, @@ -439,7 +439,7 @@ def retrieve( if not completion_id: raise ValueError(f"Expected a non-empty value for `completion_id` but received {completion_id!r}") return self._get( - f"/v1/openai/v1/chat/completions/{completion_id}", + f"/v1/chat/completions/{completion_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -481,7 +481,7 @@ def list( timeout: Override the client-level default timeout for this request, in seconds """ return self._get_api_list( - "/v1/openai/v1/chat/completions", + "/v1/chat/completions", page=SyncOpenAICursorPage[CompletionListResponse], options=make_request_options( extra_headers=extra_headers, @@ -845,7 +845,7 @@ async def create( timeout: float | httpx.Timeout | None | NotGiven = not_given, ) -> CompletionCreateResponse | AsyncStream[ChatCompletionChunk]: return await self._post( - "/v1/openai/v1/chat/completions", + "/v1/chat/completions", body=await async_maybe_transform( { "messages": messages, @@ -912,7 +912,7 @@ async def retrieve( if not completion_id: raise ValueError(f"Expected a non-empty value for `completion_id` but received {completion_id!r}") return await self._get( - f"/v1/openai/v1/chat/completions/{completion_id}", + f"/v1/chat/completions/{completion_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -954,7 +954,7 @@ def list( timeout: Override the client-level default timeout for this request, in seconds """ return self._get_api_list( - "/v1/openai/v1/chat/completions", + "/v1/chat/completions", page=AsyncOpenAICursorPage[CompletionListResponse], options=make_request_options( extra_headers=extra_headers, diff --git a/src/llama_stack_client/resources/completions.py b/src/llama_stack_client/resources/completions.py index 2c1475de..caeab7a1 100644 --- a/src/llama_stack_client/resources/completions.py +++ b/src/llama_stack_client/resources/completions.py @@ -326,7 +326,7 @@ def create( timeout: float | httpx.Timeout | None | NotGiven = not_given, ) -> CompletionCreateResponse | Stream[CompletionCreateResponse]: return self._post( - "/v1/openai/v1/completions", + "/v1/completions", body=maybe_transform( { "model": model, @@ -664,7 +664,7 @@ async def create( timeout: float | httpx.Timeout | None | NotGiven = not_given, ) -> CompletionCreateResponse | AsyncStream[CompletionCreateResponse]: return await self._post( - "/v1/openai/v1/completions", + "/v1/completions", body=await async_maybe_transform( { "model": model, diff --git a/src/llama_stack_client/resources/embeddings.py b/src/llama_stack_client/resources/embeddings.py index 
60c38cb2..29cd69d8 100644 --- a/src/llama_stack_client/resources/embeddings.py +++ b/src/llama_stack_client/resources/embeddings.py @@ -87,7 +87,7 @@ def create( timeout: Override the client-level default timeout for this request, in seconds """ return self._post( - "/v1/openai/v1/embeddings", + "/v1/embeddings", body=maybe_transform( { "input": input, @@ -169,7 +169,7 @@ async def create( timeout: Override the client-level default timeout for this request, in seconds """ return await self._post( - "/v1/openai/v1/embeddings", + "/v1/embeddings", body=await async_maybe_transform( { "input": input, diff --git a/src/llama_stack_client/resources/files.py b/src/llama_stack_client/resources/files.py index 6b395e52..e8f20d35 100644 --- a/src/llama_stack_client/resources/files.py +++ b/src/llama_stack_client/resources/files.py @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import Mapping, cast +from typing import Mapping, Optional, cast from typing_extensions import Literal import httpx @@ -49,6 +49,8 @@ def with_streaming_response(self) -> FilesResourceWithStreamingResponse: def create( self, *, + expires_after_anchor: Optional[str], + expires_after_seconds: Optional[int], file: FileTypes, purpose: Literal["assistants", "batch"], # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. @@ -65,6 +67,9 @@ def create( - file: The File object (not file name) to be uploaded. - purpose: The intended purpose of the uploaded file. + - expires_after: Optional form values describing expiration for the file. + Expected expires_after[anchor] = "created_at", expires_after[seconds] = + {integer}. Seconds must be between 3600 and 2592000 (1 hour to 30 days). Args: purpose: Valid purpose values for OpenAI Files API. 
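
For illustration (not part of the diff itself), a minimal sketch of how the new `expires_after_anchor` and `expires_after_seconds` parameters added in the hunk above might be used with the synchronous client. The anchor value `"created_at"` and the 3600-second minimum come from the docstring above; the client construction mirrors the README example in this patch, and the rest is assumed:

```python
from pathlib import Path

from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

# Upload a file that expires one hour after creation. 3600 is the smallest
# value the docstring above allows; the maximum is 2592000 seconds (30 days).
uploaded = client.files.create(
    expires_after_anchor="created_at",
    expires_after_seconds=3600,
    file=Path("/path/to/file"),
    purpose="assistants",
)
print(uploaded)
```
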
@@ -79,6 +84,8 @@ def create( """ body = deepcopy_minimal( { + "expires_after_anchor": expires_after_anchor, + "expires_after_seconds": expires_after_seconds, "file": file, "purpose": purpose, } @@ -89,7 +96,7 @@ def create( # multipart/form-data; boundary=---abc-- extra_headers = {"Content-Type": "multipart/form-data", **(extra_headers or {})} return self._post( - "/v1/openai/v1/files", + "/v1/files", body=maybe_transform(body, file_create_params.FileCreateParams), files=files, options=make_request_options( @@ -124,7 +131,7 @@ def retrieve( if not file_id: raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}") return self._get( - f"/v1/openai/v1/files/{file_id}", + f"/v1/files/{file_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -171,7 +178,7 @@ def list( timeout: Override the client-level default timeout for this request, in seconds """ return self._get_api_list( - "/v1/openai/v1/files", + "/v1/files", page=SyncOpenAICursorPage[File], options=make_request_options( extra_headers=extra_headers, @@ -217,7 +224,7 @@ def delete( if not file_id: raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}") return self._delete( - f"/v1/openai/v1/files/{file_id}", + f"/v1/files/{file_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -250,7 +257,7 @@ def content( if not file_id: raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}") return self._get( - f"/v1/openai/v1/files/{file_id}/content", + f"/v1/files/{file_id}/content", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -281,6 +288,8 @@ def with_streaming_response(self) -> AsyncFilesResourceWithStreamingResponse: async def create( self, *, + expires_after_anchor: Optional[str], + expires_after_seconds: Optional[int], file: FileTypes, purpose: Literal["assistants", "batch"], # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. @@ -297,6 +306,9 @@ async def create( - file: The File object (not file name) to be uploaded. - purpose: The intended purpose of the uploaded file. + - expires_after: Optional form values describing expiration for the file. + Expected expires_after[anchor] = "created_at", expires_after[seconds] = + {integer}. Seconds must be between 3600 and 2592000 (1 hour to 30 days). Args: purpose: Valid purpose values for OpenAI Files API. 
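
The asynchronous resource gains the same parameters. A corresponding sketch, assuming the async client is exported as `AsyncLlamaStackClient` per the README note that "the async client uses the exact same interface":

```python
import asyncio
from pathlib import Path

from llama_stack_client import AsyncLlamaStackClient

client = AsyncLlamaStackClient()


async def main() -> None:
    # Same parameters as the sync sketch; a PathLike file argument is read
    # asynchronously by the async client.
    uploaded = await client.files.create(
        expires_after_anchor="created_at",
        expires_after_seconds=3600,
        file=Path("/path/to/file"),
        purpose="assistants",
    )
    print(uploaded)


asyncio.run(main())
```
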
@@ -311,6 +323,8 @@ async def create( """ body = deepcopy_minimal( { + "expires_after_anchor": expires_after_anchor, + "expires_after_seconds": expires_after_seconds, "file": file, "purpose": purpose, } @@ -321,7 +335,7 @@ async def create( # multipart/form-data; boundary=---abc-- extra_headers = {"Content-Type": "multipart/form-data", **(extra_headers or {})} return await self._post( - "/v1/openai/v1/files", + "/v1/files", body=await async_maybe_transform(body, file_create_params.FileCreateParams), files=files, options=make_request_options( @@ -356,7 +370,7 @@ async def retrieve( if not file_id: raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}") return await self._get( - f"/v1/openai/v1/files/{file_id}", + f"/v1/files/{file_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -403,7 +417,7 @@ def list( timeout: Override the client-level default timeout for this request, in seconds """ return self._get_api_list( - "/v1/openai/v1/files", + "/v1/files", page=AsyncOpenAICursorPage[File], options=make_request_options( extra_headers=extra_headers, @@ -449,7 +463,7 @@ async def delete( if not file_id: raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}") return await self._delete( - f"/v1/openai/v1/files/{file_id}", + f"/v1/files/{file_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -482,7 +496,7 @@ async def content( if not file_id: raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}") return await self._get( - f"/v1/openai/v1/files/{file_id}/content", + f"/v1/files/{file_id}/content", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), diff --git a/src/llama_stack_client/resources/inference.py b/src/llama_stack_client/resources/inference.py index 732025cc..9a2b6c50 100644 --- a/src/llama_stack_client/resources/inference.py +++ b/src/llama_stack_client/resources/inference.py @@ -8,14 +8,7 @@ import httpx -from ..types import ( - inference_rerank_params, - inference_completion_params, - inference_embeddings_params, - inference_chat_completion_params, - inference_batch_completion_params, - inference_batch_chat_completion_params, -) +from ..types import inference_rerank_params, inference_embeddings_params, inference_chat_completion_params from .._types import Body, Omit, Query, Headers, NotGiven, SequenceNotStr, omit, not_given from .._utils import required_args, maybe_transform, async_maybe_transform from .._compat import cached_property @@ -29,18 +22,14 @@ from .._wrappers import DataWrapper from .._streaming import Stream, AsyncStream from .._base_client import make_request_options -from ..types.completion_response import CompletionResponse from ..types.embeddings_response import EmbeddingsResponse from ..types.shared_params.message import Message -from ..types.shared.batch_completion import BatchCompletion from ..types.inference_rerank_response import InferenceRerankResponse from ..types.shared_params.response_format import ResponseFormat from ..types.shared_params.sampling_params import SamplingParams from ..types.shared.chat_completion_response import ChatCompletionResponse -from ..types.shared_params.interleaved_content import InterleavedContent from ..types.chat_completion_response_stream_chunk import ChatCompletionResponseStreamChunk from 
..types.shared_params.interleaved_content_item import InterleavedContentItem -from ..types.inference_batch_chat_completion_response import InferenceBatchChatCompletionResponse __all__ = ["InferenceResource", "AsyncInferenceResource"] @@ -65,131 +54,7 @@ def with_streaming_response(self) -> InferenceResourceWithStreamingResponse: """ return InferenceResourceWithStreamingResponse(self) - def batch_chat_completion( - self, - *, - messages_batch: Iterable[Iterable[Message]], - model_id: str, - logprobs: inference_batch_chat_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - tool_config: inference_batch_chat_completion_params.ToolConfig | Omit = omit, - tools: Iterable[inference_batch_chat_completion_params.Tool] | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> InferenceBatchChatCompletionResponse: - """ - Generate chat completions for a batch of messages using the specified model. - - Args: - messages_batch: The messages to generate completions for. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. - - sampling_params: (Optional) Parameters to control the sampling strategy. - - tool_config: (Optional) Configuration for tool use. - - tools: (Optional) List of tool definitions available to the model. - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - return self._post( - "/v1/inference/batch-chat-completion", - body=maybe_transform( - { - "messages_batch": messages_batch, - "model_id": model_id, - "logprobs": logprobs, - "response_format": response_format, - "sampling_params": sampling_params, - "tool_config": tool_config, - "tools": tools, - }, - inference_batch_chat_completion_params.InferenceBatchChatCompletionParams, - ), - options=make_request_options( - extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout - ), - cast_to=InferenceBatchChatCompletionResponse, - ) - - def batch_completion( - self, - *, - content_batch: SequenceNotStr[InterleavedContent], - model_id: str, - logprobs: inference_batch_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. 
- extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> BatchCompletion: - """ - Generate completions for a batch of content using the specified model. - - Args: - content_batch: The content to generate completions for. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. - - sampling_params: (Optional) Parameters to control the sampling strategy. - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - return self._post( - "/v1/inference/batch-completion", - body=maybe_transform( - { - "content_batch": content_batch, - "model_id": model_id, - "logprobs": logprobs, - "response_format": response_format, - "sampling_params": sampling_params, - }, - inference_batch_completion_params.InferenceBatchCompletionParams, - ), - options=make_request_options( - extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout - ), - cast_to=BatchCompletion, - ) - - @typing_extensions.deprecated( - "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions." - ) + @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.") @overload def chat_completion( self, @@ -258,9 +123,7 @@ def chat_completion( """ ... - @typing_extensions.deprecated( - "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions." - ) + @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.") @overload def chat_completion( self, @@ -329,9 +192,7 @@ def chat_completion( """ ... - @typing_extensions.deprecated( - "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions." - ) + @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.") @overload def chat_completion( self, @@ -400,9 +261,7 @@ def chat_completion( """ ... - @typing_extensions.deprecated( - "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions." - ) + @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.") @required_args(["messages", "model_id"], ["messages", "model_id", "stream"]) def chat_completion( self, @@ -453,7 +312,7 @@ def chat_completion( stream_cls=Stream[ChatCompletionResponseStreamChunk], ) - @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.") + @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.") @overload def completion( self, @@ -500,7 +359,7 @@ def completion( """ ... - @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.") + @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.") @overload def completion( self, @@ -547,7 +406,7 @@ def completion( """ ... 
- @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.") + @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.") @overload def completion( self, @@ -594,7 +453,7 @@ def completion( """ ... - @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.") + @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.") @required_args(["content", "model_id"], ["content", "model_id", "stream"]) def completion( self, @@ -637,7 +496,7 @@ def completion( stream_cls=Stream[CompletionResponse], ) - @typing_extensions.deprecated("/v1/inference/embeddings is deprecated. Please use /v1/openai/v1/embeddings.") + @typing_extensions.deprecated("/v1/inference/embeddings is deprecated. Please use /v1/embeddings.") def embeddings( self, *, @@ -778,131 +637,7 @@ def with_streaming_response(self) -> AsyncInferenceResourceWithStreamingResponse """ return AsyncInferenceResourceWithStreamingResponse(self) - async def batch_chat_completion( - self, - *, - messages_batch: Iterable[Iterable[Message]], - model_id: str, - logprobs: inference_batch_chat_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - tool_config: inference_batch_chat_completion_params.ToolConfig | Omit = omit, - tools: Iterable[inference_batch_chat_completion_params.Tool] | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> InferenceBatchChatCompletionResponse: - """ - Generate chat completions for a batch of messages using the specified model. - - Args: - messages_batch: The messages to generate completions for. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. - - sampling_params: (Optional) Parameters to control the sampling strategy. - - tool_config: (Optional) Configuration for tool use. - - tools: (Optional) List of tool definitions available to the model. 
- - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - return await self._post( - "/v1/inference/batch-chat-completion", - body=await async_maybe_transform( - { - "messages_batch": messages_batch, - "model_id": model_id, - "logprobs": logprobs, - "response_format": response_format, - "sampling_params": sampling_params, - "tool_config": tool_config, - "tools": tools, - }, - inference_batch_chat_completion_params.InferenceBatchChatCompletionParams, - ), - options=make_request_options( - extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout - ), - cast_to=InferenceBatchChatCompletionResponse, - ) - - async def batch_completion( - self, - *, - content_batch: SequenceNotStr[InterleavedContent], - model_id: str, - logprobs: inference_batch_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> BatchCompletion: - """ - Generate completions for a batch of content using the specified model. - - Args: - content_batch: The content to generate completions for. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. - - sampling_params: (Optional) Parameters to control the sampling strategy. - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - return await self._post( - "/v1/inference/batch-completion", - body=await async_maybe_transform( - { - "content_batch": content_batch, - "model_id": model_id, - "logprobs": logprobs, - "response_format": response_format, - "sampling_params": sampling_params, - }, - inference_batch_completion_params.InferenceBatchCompletionParams, - ), - options=make_request_options( - extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout - ), - cast_to=BatchCompletion, - ) - - @typing_extensions.deprecated( - "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions." - ) + @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.") @overload async def chat_completion( self, @@ -971,9 +706,7 @@ async def chat_completion( """ ... - @typing_extensions.deprecated( - "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions." - ) + @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.") @overload async def chat_completion( self, @@ -1042,9 +775,7 @@ async def chat_completion( """ ... 
- @typing_extensions.deprecated( - "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions." - ) + @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.") @overload async def chat_completion( self, @@ -1113,9 +844,7 @@ async def chat_completion( """ ... - @typing_extensions.deprecated( - "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions." - ) + @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.") @required_args(["messages", "model_id"], ["messages", "model_id", "stream"]) async def chat_completion( self, @@ -1166,7 +895,7 @@ async def chat_completion( stream_cls=AsyncStream[ChatCompletionResponseStreamChunk], ) - @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.") + @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.") @overload async def completion( self, @@ -1260,7 +989,7 @@ async def completion( """ ... - @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.") + @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.") @overload async def completion( self, @@ -1307,7 +1036,7 @@ async def completion( """ ... - @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.") + @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.") @required_args(["content", "model_id"], ["content", "model_id", "stream"]) async def completion( self, @@ -1350,7 +1079,7 @@ async def completion( stream_cls=AsyncStream[CompletionResponse], ) - @typing_extensions.deprecated("/v1/inference/embeddings is deprecated. Please use /v1/openai/v1/embeddings.") + @typing_extensions.deprecated("/v1/inference/embeddings is deprecated. 
Please use /v1/embeddings.") async def embeddings( self, *, @@ -1475,22 +1204,11 @@ class InferenceResourceWithRawResponse: def __init__(self, inference: InferenceResource) -> None: self._inference = inference - self.batch_chat_completion = to_raw_response_wrapper( - inference.batch_chat_completion, - ) - self.batch_completion = to_raw_response_wrapper( - inference.batch_completion, - ) self.chat_completion = ( # pyright: ignore[reportDeprecated] to_raw_response_wrapper( inference.chat_completion, # pyright: ignore[reportDeprecated], ) ) - self.completion = ( # pyright: ignore[reportDeprecated] - to_raw_response_wrapper( - inference.completion, # pyright: ignore[reportDeprecated], - ) - ) self.embeddings = ( # pyright: ignore[reportDeprecated] to_raw_response_wrapper( inference.embeddings, # pyright: ignore[reportDeprecated], @@ -1505,22 +1223,11 @@ class AsyncInferenceResourceWithRawResponse: def __init__(self, inference: AsyncInferenceResource) -> None: self._inference = inference - self.batch_chat_completion = async_to_raw_response_wrapper( - inference.batch_chat_completion, - ) - self.batch_completion = async_to_raw_response_wrapper( - inference.batch_completion, - ) self.chat_completion = ( # pyright: ignore[reportDeprecated] async_to_raw_response_wrapper( inference.chat_completion, # pyright: ignore[reportDeprecated], ) ) - self.completion = ( # pyright: ignore[reportDeprecated] - async_to_raw_response_wrapper( - inference.completion, # pyright: ignore[reportDeprecated], - ) - ) self.embeddings = ( # pyright: ignore[reportDeprecated] async_to_raw_response_wrapper( inference.embeddings, # pyright: ignore[reportDeprecated], @@ -1535,22 +1242,11 @@ class InferenceResourceWithStreamingResponse: def __init__(self, inference: InferenceResource) -> None: self._inference = inference - self.batch_chat_completion = to_streamed_response_wrapper( - inference.batch_chat_completion, - ) - self.batch_completion = to_streamed_response_wrapper( - inference.batch_completion, - ) self.chat_completion = ( # pyright: ignore[reportDeprecated] to_streamed_response_wrapper( inference.chat_completion, # pyright: ignore[reportDeprecated], ) ) - self.completion = ( # pyright: ignore[reportDeprecated] - to_streamed_response_wrapper( - inference.completion, # pyright: ignore[reportDeprecated], - ) - ) self.embeddings = ( # pyright: ignore[reportDeprecated] to_streamed_response_wrapper( inference.embeddings, # pyright: ignore[reportDeprecated], @@ -1565,22 +1261,11 @@ class AsyncInferenceResourceWithStreamingResponse: def __init__(self, inference: AsyncInferenceResource) -> None: self._inference = inference - self.batch_chat_completion = async_to_streamed_response_wrapper( - inference.batch_chat_completion, - ) - self.batch_completion = async_to_streamed_response_wrapper( - inference.batch_completion, - ) self.chat_completion = ( # pyright: ignore[reportDeprecated] async_to_streamed_response_wrapper( inference.chat_completion, # pyright: ignore[reportDeprecated], ) ) - self.completion = ( # pyright: ignore[reportDeprecated] - async_to_streamed_response_wrapper( - inference.completion, # pyright: ignore[reportDeprecated], - ) - ) self.embeddings = ( # pyright: ignore[reportDeprecated] async_to_streamed_response_wrapper( inference.embeddings, # pyright: ignore[reportDeprecated], diff --git a/src/llama_stack_client/resources/models/models.py b/src/llama_stack_client/resources/models/models.py index f044c50d..72d9f81e 100644 --- a/src/llama_stack_client/resources/models/models.py +++ 
b/src/llama_stack_client/resources/models/models.py @@ -101,7 +101,7 @@ def list( extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = not_given, ) -> ModelListResponse: - """List all models.""" + """List models using the OpenAI API.""" return self._get( "/v1/models", options=make_request_options( @@ -271,7 +271,7 @@ async def list( extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = not_given, ) -> ModelListResponse: - """List all models.""" + """List models using the OpenAI API.""" return await self._get( "/v1/models", options=make_request_options( diff --git a/src/llama_stack_client/resources/models/openai.py b/src/llama_stack_client/resources/models/openai.py index e4b2fbd8..57179ed8 100644 --- a/src/llama_stack_client/resources/models/openai.py +++ b/src/llama_stack_client/resources/models/openai.py @@ -54,7 +54,7 @@ def list( ) -> OpenAIListResponse: """List models using the OpenAI API.""" return self._get( - "/v1/openai/v1/models", + "/v1/models", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, @@ -98,7 +98,7 @@ async def list( ) -> OpenAIListResponse: """List models using the OpenAI API.""" return await self._get( - "/v1/openai/v1/models", + "/v1/models", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, diff --git a/src/llama_stack_client/resources/moderations.py b/src/llama_stack_client/resources/moderations.py index a016b5b0..a73dc85a 100644 --- a/src/llama_stack_client/resources/moderations.py +++ b/src/llama_stack_client/resources/moderations.py @@ -73,7 +73,7 @@ def create( timeout: Override the client-level default timeout for this request, in seconds """ return self._post( - "/v1/openai/v1/moderations", + "/v1/moderations", body=maybe_transform( { "input": input, @@ -138,7 +138,7 @@ async def create( timeout: Override the client-level default timeout for this request, in seconds """ return await self._post( - "/v1/openai/v1/moderations", + "/v1/moderations", body=await async_maybe_transform( { "input": input, diff --git a/src/llama_stack_client/resources/responses/input_items.py b/src/llama_stack_client/resources/responses/input_items.py index da06debd..a5836ba7 100644 --- a/src/llama_stack_client/resources/responses/input_items.py +++ b/src/llama_stack_client/resources/responses/input_items.py @@ -85,7 +85,7 @@ def list( if not response_id: raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}") return self._get( - f"/v1/openai/v1/responses/{response_id}/input_items", + f"/v1/responses/{response_id}/input_items", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, @@ -168,7 +168,7 @@ async def list( if not response_id: raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}") return await self._get( - f"/v1/openai/v1/responses/{response_id}/input_items", + f"/v1/responses/{response_id}/input_items", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, diff --git a/src/llama_stack_client/resources/responses/responses.py b/src/llama_stack_client/resources/responses/responses.py index 7f21f3ea..16e38fd0 100644 --- a/src/llama_stack_client/resources/responses/responses.py +++ b/src/llama_stack_client/resources/responses/responses.py @@ -228,7 +228,7 @@ def create( timeout: float | httpx.Timeout | None | NotGiven = not_given, ) -> ResponseObject | Stream[ResponseObjectStream]: return self._post( - 
"/v1/openai/v1/responses", + "/v1/responses", body=maybe_transform( { "input": input, @@ -281,7 +281,7 @@ def retrieve( if not response_id: raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}") return self._get( - f"/v1/openai/v1/responses/{response_id}", + f"/v1/responses/{response_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -323,7 +323,7 @@ def list( timeout: Override the client-level default timeout for this request, in seconds """ return self._get_api_list( - "/v1/openai/v1/responses", + "/v1/responses", page=SyncOpenAICursorPage[ResponseListResponse], options=make_request_options( extra_headers=extra_headers, @@ -369,7 +369,7 @@ def delete( if not response_id: raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}") return self._delete( - f"/v1/openai/v1/responses/{response_id}", + f"/v1/responses/{response_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -568,7 +568,7 @@ async def create( timeout: float | httpx.Timeout | None | NotGiven = not_given, ) -> ResponseObject | AsyncStream[ResponseObjectStream]: return await self._post( - "/v1/openai/v1/responses", + "/v1/responses", body=await async_maybe_transform( { "input": input, @@ -621,7 +621,7 @@ async def retrieve( if not response_id: raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}") return await self._get( - f"/v1/openai/v1/responses/{response_id}", + f"/v1/responses/{response_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -663,7 +663,7 @@ def list( timeout: Override the client-level default timeout for this request, in seconds """ return self._get_api_list( - "/v1/openai/v1/responses", + "/v1/responses", page=AsyncOpenAICursorPage[ResponseListResponse], options=make_request_options( extra_headers=extra_headers, @@ -709,7 +709,7 @@ async def delete( if not response_id: raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}") return await self._delete( - f"/v1/openai/v1/responses/{response_id}", + f"/v1/responses/{response_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), diff --git a/src/llama_stack_client/resources/vector_stores/files.py b/src/llama_stack_client/resources/vector_stores/files.py index 39f16a66..f9a1ef31 100644 --- a/src/llama_stack_client/resources/vector_stores/files.py +++ b/src/llama_stack_client/resources/vector_stores/files.py @@ -82,7 +82,7 @@ def create( if not vector_store_id: raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}") return self._post( - f"/v1/openai/v1/vector_stores/{vector_store_id}/files", + f"/v1/vector_stores/{vector_store_id}/files", body=maybe_transform( { "file_id": file_id, @@ -126,7 +126,7 @@ def retrieve( if not file_id: raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}") return self._get( - f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}", + f"/v1/vector_stores/{vector_store_id}/files/{file_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -165,7 +165,7 @@ def update( if not file_id: raise 
ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}") return self._post( - f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}", + f"/v1/vector_stores/{vector_store_id}/files/{file_id}", body=maybe_transform({"attributes": attributes}, file_update_params.FileUpdateParams), options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout @@ -218,7 +218,7 @@ def list( if not vector_store_id: raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}") return self._get_api_list( - f"/v1/openai/v1/vector_stores/{vector_store_id}/files", + f"/v1/vector_stores/{vector_store_id}/files", page=SyncOpenAICursorPage[VectorStoreFile], options=make_request_options( extra_headers=extra_headers, @@ -268,7 +268,7 @@ def delete( if not file_id: raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}") return self._delete( - f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}", + f"/v1/vector_stores/{vector_store_id}/files/{file_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -304,7 +304,7 @@ def content( if not file_id: raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}") return self._get( - f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}/content", + f"/v1/vector_stores/{vector_store_id}/files/{file_id}/content", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -367,7 +367,7 @@ async def create( if not vector_store_id: raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}") return await self._post( - f"/v1/openai/v1/vector_stores/{vector_store_id}/files", + f"/v1/vector_stores/{vector_store_id}/files", body=await async_maybe_transform( { "file_id": file_id, @@ -411,7 +411,7 @@ async def retrieve( if not file_id: raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}") return await self._get( - f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}", + f"/v1/vector_stores/{vector_store_id}/files/{file_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -450,7 +450,7 @@ async def update( if not file_id: raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}") return await self._post( - f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}", + f"/v1/vector_stores/{vector_store_id}/files/{file_id}", body=await async_maybe_transform({"attributes": attributes}, file_update_params.FileUpdateParams), options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout @@ -503,7 +503,7 @@ def list( if not vector_store_id: raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}") return self._get_api_list( - f"/v1/openai/v1/vector_stores/{vector_store_id}/files", + f"/v1/vector_stores/{vector_store_id}/files", page=AsyncOpenAICursorPage[VectorStoreFile], options=make_request_options( extra_headers=extra_headers, @@ -553,7 +553,7 @@ async def delete( if not file_id: raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}") return await self._delete( - 
f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}", + f"/v1/vector_stores/{vector_store_id}/files/{file_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -589,7 +589,7 @@ async def content( if not file_id: raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}") return await self._get( - f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}/content", + f"/v1/vector_stores/{vector_store_id}/files/{file_id}/content", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), diff --git a/src/llama_stack_client/resources/vector_stores/vector_stores.py b/src/llama_stack_client/resources/vector_stores/vector_stores.py index f3ab01f2..f858100b 100644 --- a/src/llama_stack_client/resources/vector_stores/vector_stores.py +++ b/src/llama_stack_client/resources/vector_stores/vector_stores.py @@ -112,7 +112,7 @@ def create( timeout: Override the client-level default timeout for this request, in seconds """ return self._post( - "/v1/openai/v1/vector_stores", + "/v1/vector_stores", body=maybe_transform( { "chunking_strategy": chunking_strategy, @@ -158,7 +158,7 @@ def retrieve( if not vector_store_id: raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}") return self._get( - f"/v1/openai/v1/vector_stores/{vector_store_id}", + f"/v1/vector_stores/{vector_store_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -200,7 +200,7 @@ def update( if not vector_store_id: raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}") return self._post( - f"/v1/openai/v1/vector_stores/{vector_store_id}", + f"/v1/vector_stores/{vector_store_id}", body=maybe_transform( { "expires_after": expires_after, @@ -255,7 +255,7 @@ def list( timeout: Override the client-level default timeout for this request, in seconds """ return self._get_api_list( - "/v1/openai/v1/vector_stores", + "/v1/vector_stores", page=SyncOpenAICursorPage[VectorStore], options=make_request_options( extra_headers=extra_headers, @@ -301,7 +301,7 @@ def delete( if not vector_store_id: raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}") return self._delete( - f"/v1/openai/v1/vector_stores/{vector_store_id}", + f"/v1/vector_stores/{vector_store_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -354,7 +354,7 @@ def search( if not vector_store_id: raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}") return self._post( - f"/v1/openai/v1/vector_stores/{vector_store_id}/search", + f"/v1/vector_stores/{vector_store_id}/search", body=maybe_transform( { "query": query, @@ -446,7 +446,7 @@ async def create( timeout: Override the client-level default timeout for this request, in seconds """ return await self._post( - "/v1/openai/v1/vector_stores", + "/v1/vector_stores", body=await async_maybe_transform( { "chunking_strategy": chunking_strategy, @@ -492,7 +492,7 @@ async def retrieve( if not vector_store_id: raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}") return await self._get( - f"/v1/openai/v1/vector_stores/{vector_store_id}", + 
f"/v1/vector_stores/{vector_store_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -534,7 +534,7 @@ async def update( if not vector_store_id: raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}") return await self._post( - f"/v1/openai/v1/vector_stores/{vector_store_id}", + f"/v1/vector_stores/{vector_store_id}", body=await async_maybe_transform( { "expires_after": expires_after, @@ -589,7 +589,7 @@ def list( timeout: Override the client-level default timeout for this request, in seconds """ return self._get_api_list( - "/v1/openai/v1/vector_stores", + "/v1/vector_stores", page=AsyncOpenAICursorPage[VectorStore], options=make_request_options( extra_headers=extra_headers, @@ -635,7 +635,7 @@ async def delete( if not vector_store_id: raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}") return await self._delete( - f"/v1/openai/v1/vector_stores/{vector_store_id}", + f"/v1/vector_stores/{vector_store_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -688,7 +688,7 @@ async def search( if not vector_store_id: raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}") return await self._post( - f"/v1/openai/v1/vector_stores/{vector_store_id}/search", + f"/v1/vector_stores/{vector_store_id}/search", body=await async_maybe_transform( { "query": query, diff --git a/src/llama_stack_client/types/__init__.py b/src/llama_stack_client/types/__init__.py index 56b7f887..e63970ee 100644 --- a/src/llama_stack_client/types/__init__.py +++ b/src/llama_stack_client/types/__init__.py @@ -22,7 +22,6 @@ SystemMessage as SystemMessage, ResponseFormat as ResponseFormat, SamplingParams as SamplingParams, - BatchCompletion as BatchCompletion, SafetyViolation as SafetyViolation, CompletionMessage as CompletionMessage, InterleavedContent as InterleavedContent, @@ -61,7 +60,6 @@ from .tool_list_response import ToolListResponse as ToolListResponse from .agent_create_params import AgentCreateParams as AgentCreateParams from .agent_list_response import AgentListResponse as AgentListResponse -from .completion_response import CompletionResponse as CompletionResponse from .embeddings_response import EmbeddingsResponse as EmbeddingsResponse from .list_files_response import ListFilesResponse as ListFilesResponse from .list_tools_response import ListToolsResponse as ListToolsResponse @@ -73,7 +71,6 @@ from .delete_file_response import DeleteFileResponse as DeleteFileResponse from .eval_candidate_param import EvalCandidateParam as EvalCandidateParam from .eval_run_eval_params import EvalRunEvalParams as EvalRunEvalParams -from .list_models_response import ListModelsResponse as ListModelsResponse from .list_routes_response import ListRoutesResponse as ListRoutesResponse from .query_spans_response import QuerySpansResponse as QuerySpansResponse from .response_list_params import ResponseListParams as ResponseListParams @@ -134,7 +131,6 @@ from .vector_store_create_params import VectorStoreCreateParams as VectorStoreCreateParams from .vector_store_search_params import VectorStoreSearchParams as VectorStoreSearchParams from .vector_store_update_params import VectorStoreUpdateParams as VectorStoreUpdateParams -from .inference_completion_params import InferenceCompletionParams as InferenceCompletionParams from 
.inference_embeddings_params import InferenceEmbeddingsParams as InferenceEmbeddingsParams from .list_vector_stores_response import ListVectorStoresResponse as ListVectorStoresResponse from .telemetry_get_span_response import TelemetryGetSpanResponse as TelemetryGetSpanResponse @@ -160,20 +156,13 @@ from .telemetry_get_span_tree_response import TelemetryGetSpanTreeResponse as TelemetryGetSpanTreeResponse from .telemetry_query_metrics_response import TelemetryQueryMetricsResponse as TelemetryQueryMetricsResponse from .tool_runtime_list_tools_response import ToolRuntimeListToolsResponse as ToolRuntimeListToolsResponse -from .inference_batch_completion_params import InferenceBatchCompletionParams as InferenceBatchCompletionParams from .synthetic_data_generation_response import SyntheticDataGenerationResponse as SyntheticDataGenerationResponse from .chat_completion_response_stream_chunk import ( ChatCompletionResponseStreamChunk as ChatCompletionResponseStreamChunk, ) -from .inference_batch_chat_completion_params import ( - InferenceBatchChatCompletionParams as InferenceBatchChatCompletionParams, -) from .telemetry_save_spans_to_dataset_params import ( TelemetrySaveSpansToDatasetParams as TelemetrySaveSpansToDatasetParams, ) -from .inference_batch_chat_completion_response import ( - InferenceBatchChatCompletionResponse as InferenceBatchChatCompletionResponse, -) from .post_training_preference_optimize_params import ( PostTrainingPreferenceOptimizeParams as PostTrainingPreferenceOptimizeParams, ) diff --git a/src/llama_stack_client/types/completion_response.py b/src/llama_stack_client/types/completion_response.py deleted file mode 100644 index 9718be8a..00000000 --- a/src/llama_stack_client/types/completion_response.py +++ /dev/null @@ -1,24 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
- -from typing import List, Optional -from typing_extensions import Literal - -from .._models import BaseModel -from .shared.metric import Metric -from .token_log_probs import TokenLogProbs - -__all__ = ["CompletionResponse"] - - -class CompletionResponse(BaseModel): - content: str - """The generated completion text""" - - stop_reason: Literal["end_of_turn", "end_of_message", "out_of_tokens"] - """Reason why generation stopped""" - - logprobs: Optional[List[TokenLogProbs]] = None - """Optional log probabilities for generated tokens""" - - metrics: Optional[List[Metric]] = None - """(Optional) List of metrics associated with the API response""" diff --git a/src/llama_stack_client/types/file_create_params.py b/src/llama_stack_client/types/file_create_params.py index 8322c0a9..a1197ff5 100644 --- a/src/llama_stack_client/types/file_create_params.py +++ b/src/llama_stack_client/types/file_create_params.py @@ -2,6 +2,7 @@ from __future__ import annotations +from typing import Optional from typing_extensions import Literal, Required, TypedDict from .._types import FileTypes @@ -10,6 +11,10 @@ class FileCreateParams(TypedDict, total=False): + expires_after_anchor: Required[Optional[str]] + + expires_after_seconds: Required[Optional[int]] + file: Required[FileTypes] purpose: Required[Literal["assistants", "batch"]] diff --git a/src/llama_stack_client/types/inference_batch_chat_completion_params.py b/src/llama_stack_client/types/inference_batch_chat_completion_params.py deleted file mode 100644 index b5da0f0e..00000000 --- a/src/llama_stack_client/types/inference_batch_chat_completion_params.py +++ /dev/null @@ -1,85 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from __future__ import annotations - -from typing import Dict, Union, Iterable -from typing_extensions import Literal, Required, TypedDict - -from .shared_params.message import Message -from .shared_params.response_format import ResponseFormat -from .shared_params.sampling_params import SamplingParams -from .shared_params.tool_param_definition import ToolParamDefinition - -__all__ = ["InferenceBatchChatCompletionParams", "Logprobs", "ToolConfig", "Tool"] - - -class InferenceBatchChatCompletionParams(TypedDict, total=False): - messages_batch: Required[Iterable[Iterable[Message]]] - """The messages to generate completions for.""" - - model_id: Required[str] - """The identifier of the model to use. - - The model must be registered with Llama Stack and available via the /models - endpoint. - """ - - logprobs: Logprobs - """ - (Optional) If specified, log probabilities for each token position will be - returned. - """ - - response_format: ResponseFormat - """(Optional) Grammar specification for guided (structured) decoding.""" - - sampling_params: SamplingParams - """(Optional) Parameters to control the sampling strategy.""" - - tool_config: ToolConfig - """(Optional) Configuration for tool use.""" - - tools: Iterable[Tool] - """(Optional) List of tool definitions available to the model.""" - - -class Logprobs(TypedDict, total=False): - top_k: int - """How many tokens (for each position) to return log probabilities for.""" - - -class ToolConfig(TypedDict, total=False): - system_message_behavior: Literal["append", "replace"] - """(Optional) Config for how to override the default system prompt. - - - `SystemMessageBehavior.append`: Appends the provided system message to the - default system prompt. 
- `SystemMessageBehavior.replace`: Replaces the default - system prompt with the provided system message. The system message can include - the string '{{function_definitions}}' to indicate where the function - definitions should be inserted. - """ - - tool_choice: Union[Literal["auto", "required", "none"], str] - """(Optional) Whether tool use is automatic, required, or none. - - Can also specify a tool name to use a specific tool. Defaults to - ToolChoice.auto. - """ - - tool_prompt_format: Literal["json", "function_tag", "python_list"] - """(Optional) Instructs the model how to format tool calls. - - By default, Llama Stack will attempt to use a format that is best adapted to the - model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON - object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a - <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls - are output as Python syntax -- a list of function calls. - """ - - -class Tool(TypedDict, total=False): - tool_name: Required[Union[Literal["brave_search", "wolfram_alpha", "photogen", "code_interpreter"], str]] - - description: str - - parameters: Dict[str, ToolParamDefinition] diff --git a/src/llama_stack_client/types/inference_batch_chat_completion_response.py b/src/llama_stack_client/types/inference_batch_chat_completion_response.py deleted file mode 100644 index ed24908d..00000000 --- a/src/llama_stack_client/types/inference_batch_chat_completion_response.py +++ /dev/null @@ -1,13 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from typing import List - -from .._models import BaseModel -from .shared.chat_completion_response import ChatCompletionResponse - -__all__ = ["InferenceBatchChatCompletionResponse"] - - -class InferenceBatchChatCompletionResponse(BaseModel): - batch: List[ChatCompletionResponse] - """List of chat completion responses, one for each conversation in the batch""" diff --git a/src/llama_stack_client/types/inference_batch_completion_params.py b/src/llama_stack_client/types/inference_batch_completion_params.py deleted file mode 100644 index b225b883..00000000 --- a/src/llama_stack_client/types/inference_batch_completion_params.py +++ /dev/null @@ -1,41 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from __future__ import annotations - -from typing_extensions import Required, TypedDict - -from .._types import SequenceNotStr -from .shared_params.response_format import ResponseFormat -from .shared_params.sampling_params import SamplingParams -from .shared_params.interleaved_content import InterleavedContent - -__all__ = ["InferenceBatchCompletionParams", "Logprobs"] - - -class InferenceBatchCompletionParams(TypedDict, total=False): - content_batch: Required[SequenceNotStr[InterleavedContent]] - """The content to generate completions for.""" - - model_id: Required[str] - """The identifier of the model to use. - - The model must be registered with Llama Stack and available via the /models - endpoint. - """ - - logprobs: Logprobs - """ - (Optional) If specified, log probabilities for each token position will be - returned.
- """ - - response_format: ResponseFormat - """(Optional) Grammar specification for guided (structured) decoding.""" - - sampling_params: SamplingParams - """(Optional) Parameters to control the sampling strategy.""" - - -class Logprobs(TypedDict, total=False): - top_k: int - """How many tokens (for each position) to return log probabilities for.""" diff --git a/src/llama_stack_client/types/inference_completion_params.py b/src/llama_stack_client/types/inference_completion_params.py deleted file mode 100644 index c122f017..00000000 --- a/src/llama_stack_client/types/inference_completion_params.py +++ /dev/null @@ -1,65 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from __future__ import annotations - -from typing import Union -from typing_extensions import Literal, Required, TypedDict - -from .shared_params.response_format import ResponseFormat -from .shared_params.sampling_params import SamplingParams -from .shared_params.interleaved_content import InterleavedContent - -__all__ = [ - "InferenceCompletionParamsBase", - "Logprobs", - "InferenceCompletionParamsNonStreaming", - "InferenceCompletionParamsStreaming", -] - - -class InferenceCompletionParamsBase(TypedDict, total=False): - content: Required[InterleavedContent] - """The content to generate a completion for.""" - - model_id: Required[str] - """The identifier of the model to use. - - The model must be registered with Llama Stack and available via the /models - endpoint. - """ - - logprobs: Logprobs - """ - (Optional) If specified, log probabilities for each token position will be - returned. - """ - - response_format: ResponseFormat - """(Optional) Grammar specification for guided (structured) decoding.""" - - sampling_params: SamplingParams - """(Optional) Parameters to control the sampling strategy.""" - - -class Logprobs(TypedDict, total=False): - top_k: int - """How many tokens (for each position) to return log probabilities for.""" - - -class InferenceCompletionParamsNonStreaming(InferenceCompletionParamsBase, total=False): - stream: Literal[False] - """(Optional) If True, generate an SSE event stream of the response. - - Defaults to False. - """ - - -class InferenceCompletionParamsStreaming(InferenceCompletionParamsBase): - stream: Required[Literal[True]] - """(Optional) If True, generate an SSE event stream of the response. - - Defaults to False. - """ - - -InferenceCompletionParams = Union[InferenceCompletionParamsNonStreaming, InferenceCompletionParamsStreaming] diff --git a/src/llama_stack_client/types/list_models_response.py b/src/llama_stack_client/types/list_models_response.py deleted file mode 100644 index a36896b8..00000000 --- a/src/llama_stack_client/types/list_models_response.py +++ /dev/null @@ -1,10 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from .._models import BaseModel -from .model_list_response import ModelListResponse - -__all__ = ["ListModelsResponse"] - - -class ListModelsResponse(BaseModel): - data: ModelListResponse diff --git a/src/llama_stack_client/types/model_list_response.py b/src/llama_stack_client/types/model_list_response.py index 905cdb0f..7631b69f 100644 --- a/src/llama_stack_client/types/model_list_response.py +++ b/src/llama_stack_client/types/model_list_response.py @@ -1,10 +1,21 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
from typing import List -from typing_extensions import TypeAlias +from typing_extensions import Literal, TypeAlias -from .model import Model +from .._models import BaseModel -__all__ = ["ModelListResponse"] +__all__ = ["ModelListResponse", "ModelListResponseItem"] -ModelListResponse: TypeAlias = List[Model] + +class ModelListResponseItem(BaseModel): + id: str + + created: int + + object: Literal["model"] + + owned_by: str + + +ModelListResponse: TypeAlias = List[ModelListResponseItem] diff --git a/src/llama_stack_client/types/shared/__init__.py b/src/llama_stack_client/types/shared/__init__.py index fb14d8a6..007d56ac 100644 --- a/src/llama_stack_client/types/shared/__init__.py +++ b/src/llama_stack_client/types/shared/__init__.py @@ -14,7 +14,6 @@ from .system_message import SystemMessage as SystemMessage from .response_format import ResponseFormat as ResponseFormat from .sampling_params import SamplingParams as SamplingParams -from .batch_completion import BatchCompletion as BatchCompletion from .safety_violation import SafetyViolation as SafetyViolation from .completion_message import CompletionMessage as CompletionMessage from .interleaved_content import InterleavedContent as InterleavedContent diff --git a/src/llama_stack_client/types/shared/batch_completion.py b/src/llama_stack_client/types/shared/batch_completion.py deleted file mode 100644 index 43a0a735..00000000 --- a/src/llama_stack_client/types/shared/batch_completion.py +++ /dev/null @@ -1,13 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from typing import List - -from ..._models import BaseModel -from ..completion_response import CompletionResponse - -__all__ = ["BatchCompletion"] - - -class BatchCompletion(BaseModel): - batch: List[CompletionResponse] - """List of completion responses, one for each input in the batch""" diff --git a/src/llama_stack_client/types/shared/tool_param_definition.py b/src/llama_stack_client/types/shared/tool_param_definition.py index 1466c1f9..316f1e01 100644 --- a/src/llama_stack_client/types/shared/tool_param_definition.py +++ b/src/llama_stack_client/types/shared/tool_param_definition.py @@ -14,4 +14,8 @@ class ToolParamDefinition(BaseModel): description: Optional[str] = None + items: Union[bool, float, str, List[object], object, None] = None + required: Optional[bool] = None + + title: Optional[str] = None diff --git a/src/llama_stack_client/types/shared_params/tool_param_definition.py b/src/llama_stack_client/types/shared_params/tool_param_definition.py index 2d7805fe..87563946 100644 --- a/src/llama_stack_client/types/shared_params/tool_param_definition.py +++ b/src/llama_stack_client/types/shared_params/tool_param_definition.py @@ -15,4 +15,8 @@ class ToolParamDefinition(TypedDict, total=False): description: str + items: Union[bool, float, str, Iterable[object], object, None] + required: bool + + title: str diff --git a/src/llama_stack_client/types/tool.py b/src/llama_stack_client/types/tool.py index c6994268..a7243b64 100644 --- a/src/llama_stack_client/types/tool.py +++ b/src/llama_stack_client/types/tool.py @@ -24,6 +24,12 @@ class Parameter(BaseModel): default: Union[bool, float, str, List[object], object, None] = None """(Optional) Default value for the parameter if not provided""" + items: Optional[object] = None + """Type of the elements when parameter_type is array""" + + title: Optional[str] = None + """(Optional) Title of the parameter""" + class Tool(BaseModel): description: str diff --git 
a/src/llama_stack_client/types/tool_def.py b/src/llama_stack_client/types/tool_def.py index c82a9b8a..21949b41 100644 --- a/src/llama_stack_client/types/tool_def.py +++ b/src/llama_stack_client/types/tool_def.py @@ -23,6 +23,12 @@ class Parameter(BaseModel): default: Union[bool, float, str, List[object], object, None] = None """(Optional) Default value for the parameter if not provided""" + items: Optional[object] = None + """Type of the elements when parameter_type is array""" + + title: Optional[str] = None + """(Optional) Title of the parameter""" + class ToolDef(BaseModel): name: str diff --git a/src/llama_stack_client/types/tool_def_param.py b/src/llama_stack_client/types/tool_def_param.py index 93ad8285..a50437b2 100644 --- a/src/llama_stack_client/types/tool_def_param.py +++ b/src/llama_stack_client/types/tool_def_param.py @@ -24,6 +24,12 @@ class Parameter(TypedDict, total=False): default: Union[bool, float, str, Iterable[object], object, None] """(Optional) Default value for the parameter if not provided""" + items: object + """Type of the elements when parameter_type is array""" + + title: str + """(Optional) Title of the parameter""" + class ToolDefParam(TypedDict, total=False): name: Required[str] diff --git a/tests/api_resources/test_agents.py b/tests/api_resources/test_agents.py index 18b34012..c19bc9bf 100644 --- a/tests/api_resources/test_agents.py +++ b/tests/api_resources/test_agents.py @@ -49,6 +49,8 @@ def test_method_create_with_all_params(self, client: LlamaStackClient) -> None: "parameter_type": "parameter_type", "required": True, "default": True, + "items": {}, + "title": "title", } ], } @@ -253,6 +255,8 @@ async def test_method_create_with_all_params(self, async_client: AsyncLlamaStack "parameter_type": "parameter_type", "required": True, "default": True, + "items": {}, + "title": "title", } ], } diff --git a/tests/api_resources/test_files.py b/tests/api_resources/test_files.py index d9b29ffc..f2bc1e0a 100644 --- a/tests/api_resources/test_files.py +++ b/tests/api_resources/test_files.py @@ -21,6 +21,8 @@ class TestFiles: @parametrize def test_method_create(self, client: LlamaStackClient) -> None: file = client.files.create( + expires_after_anchor="expires_after_anchor", + expires_after_seconds=0, file=b"raw file contents", purpose="assistants", ) @@ -29,6 +31,8 @@ def test_method_create(self, client: LlamaStackClient) -> None: @parametrize def test_raw_response_create(self, client: LlamaStackClient) -> None: response = client.files.with_raw_response.create( + expires_after_anchor="expires_after_anchor", + expires_after_seconds=0, file=b"raw file contents", purpose="assistants", ) @@ -41,6 +45,8 @@ def test_raw_response_create(self, client: LlamaStackClient) -> None: @parametrize def test_streaming_response_create(self, client: LlamaStackClient) -> None: with client.files.with_streaming_response.create( + expires_after_anchor="expires_after_anchor", + expires_after_seconds=0, file=b"raw file contents", purpose="assistants", ) as response: @@ -210,6 +216,8 @@ class TestAsyncFiles: @parametrize async def test_method_create(self, async_client: AsyncLlamaStackClient) -> None: file = await async_client.files.create( + expires_after_anchor="expires_after_anchor", + expires_after_seconds=0, file=b"raw file contents", purpose="assistants", ) @@ -218,6 +226,8 @@ async def test_method_create(self, async_client: AsyncLlamaStackClient) -> None: @parametrize async def test_raw_response_create(self, async_client: AsyncLlamaStackClient) -> None: response = await 
async_client.files.with_raw_response.create( + expires_after_anchor="expires_after_anchor", + expires_after_seconds=0, file=b"raw file contents", purpose="assistants", ) @@ -230,6 +240,8 @@ async def test_raw_response_create(self, async_client: AsyncLlamaStackClient) -> @parametrize async def test_streaming_response_create(self, async_client: AsyncLlamaStackClient) -> None: async with async_client.files.with_streaming_response.create( + expires_after_anchor="expires_after_anchor", + expires_after_seconds=0, file=b"raw file contents", purpose="assistants", ) as response: diff --git a/tests/api_resources/test_inference.py b/tests/api_resources/test_inference.py index 474ff7cf..6e952637 100644 --- a/tests/api_resources/test_inference.py +++ b/tests/api_resources/test_inference.py @@ -10,12 +10,10 @@ from tests.utils import assert_matches_type from llama_stack_client import LlamaStackClient, AsyncLlamaStackClient from llama_stack_client.types import ( - CompletionResponse, EmbeddingsResponse, InferenceRerankResponse, - InferenceBatchChatCompletionResponse, ) -from llama_stack_client.types.shared import BatchCompletion, ChatCompletionResponse +from llama_stack_client.types.shared import ChatCompletionResponse # pyright: reportDeprecated=false @@ -25,160 +23,6 @@ class TestInference: parametrize = pytest.mark.parametrize("client", [False, True], indirect=True, ids=["loose", "strict"]) - @parametrize - def test_method_batch_chat_completion(self, client: LlamaStackClient) -> None: - inference = client.inference.batch_chat_completion( - messages_batch=[ - [ - { - "content": "string", - "role": "user", - } - ] - ], - model_id="model_id", - ) - assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"]) - - @parametrize - def test_method_batch_chat_completion_with_all_params(self, client: LlamaStackClient) -> None: - inference = client.inference.batch_chat_completion( - messages_batch=[ - [ - { - "content": "string", - "role": "user", - "context": "string", - } - ] - ], - model_id="model_id", - logprobs={"top_k": 0}, - response_format={ - "json_schema": {"foo": True}, - "type": "json_schema", - }, - sampling_params={ - "strategy": {"type": "greedy"}, - "max_tokens": 0, - "repetition_penalty": 0, - "stop": ["string"], - }, - tool_config={ - "system_message_behavior": "append", - "tool_choice": "auto", - "tool_prompt_format": "json", - }, - tools=[ - { - "tool_name": "brave_search", - "description": "description", - "parameters": { - "foo": { - "param_type": "param_type", - "default": True, - "description": "description", - "required": True, - } - }, - } - ], - ) - assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"]) - - @parametrize - def test_raw_response_batch_chat_completion(self, client: LlamaStackClient) -> None: - response = client.inference.with_raw_response.batch_chat_completion( - messages_batch=[ - [ - { - "content": "string", - "role": "user", - } - ] - ], - model_id="model_id", - ) - - assert response.is_closed is True - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - inference = response.parse() - assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"]) - - @parametrize - def test_streaming_response_batch_chat_completion(self, client: LlamaStackClient) -> None: - with client.inference.with_streaming_response.batch_chat_completion( - messages_batch=[ - [ - { - "content": "string", - "role": "user", - } - ] - ], - model_id="model_id", - ) as response: - assert not 
response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - - inference = response.parse() - assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"]) - - assert cast(Any, response.is_closed) is True - - @parametrize - def test_method_batch_completion(self, client: LlamaStackClient) -> None: - inference = client.inference.batch_completion( - content_batch=["string"], - model_id="model_id", - ) - assert_matches_type(BatchCompletion, inference, path=["response"]) - - @parametrize - def test_method_batch_completion_with_all_params(self, client: LlamaStackClient) -> None: - inference = client.inference.batch_completion( - content_batch=["string"], - model_id="model_id", - logprobs={"top_k": 0}, - response_format={ - "json_schema": {"foo": True}, - "type": "json_schema", - }, - sampling_params={ - "strategy": {"type": "greedy"}, - "max_tokens": 0, - "repetition_penalty": 0, - "stop": ["string"], - }, - ) - assert_matches_type(BatchCompletion, inference, path=["response"]) - - @parametrize - def test_raw_response_batch_completion(self, client: LlamaStackClient) -> None: - response = client.inference.with_raw_response.batch_completion( - content_batch=["string"], - model_id="model_id", - ) - - assert response.is_closed is True - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - inference = response.parse() - assert_matches_type(BatchCompletion, inference, path=["response"]) - - @parametrize - def test_streaming_response_batch_completion(self, client: LlamaStackClient) -> None: - with client.inference.with_streaming_response.batch_completion( - content_batch=["string"], - model_id="model_id", - ) as response: - assert not response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - - inference = response.parse() - assert_matches_type(BatchCompletion, inference, path=["response"]) - - assert cast(Any, response.is_closed) is True - @parametrize def test_method_chat_completion_overload_1(self, client: LlamaStackClient) -> None: with pytest.warns(DeprecationWarning): @@ -234,7 +78,9 @@ def test_method_chat_completion_with_all_params_overload_1(self, client: LlamaSt "param_type": "param_type", "default": True, "description": "description", + "items": True, "required": True, + "title": "title", } }, } @@ -337,7 +183,9 @@ def test_method_chat_completion_with_all_params_overload_2(self, client: LlamaSt "param_type": "param_type", "default": True, "description": "description", + "items": True, "required": True, + "title": "title", } }, } @@ -385,128 +233,6 @@ def test_streaming_response_chat_completion_overload_2(self, client: LlamaStackC assert cast(Any, response.is_closed) is True - @parametrize - def test_method_completion_overload_1(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference = client.inference.completion( - content="string", - model_id="model_id", - ) - - assert_matches_type(CompletionResponse, inference, path=["response"]) - - @parametrize - def test_method_completion_with_all_params_overload_1(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference = client.inference.completion( - content="string", - model_id="model_id", - logprobs={"top_k": 0}, - response_format={ - "json_schema": {"foo": True}, - "type": "json_schema", - }, - sampling_params={ - "strategy": {"type": "greedy"}, - "max_tokens": 0, - "repetition_penalty": 0, - "stop": ["string"], - }, - stream=False, - ) - - 
assert_matches_type(CompletionResponse, inference, path=["response"]) - - @parametrize - def test_raw_response_completion_overload_1(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - response = client.inference.with_raw_response.completion( - content="string", - model_id="model_id", - ) - - assert response.is_closed is True - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - inference = response.parse() - assert_matches_type(CompletionResponse, inference, path=["response"]) - - @parametrize - def test_streaming_response_completion_overload_1(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - with client.inference.with_streaming_response.completion( - content="string", - model_id="model_id", - ) as response: - assert not response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - - inference = response.parse() - assert_matches_type(CompletionResponse, inference, path=["response"]) - - assert cast(Any, response.is_closed) is True - - @parametrize - def test_method_completion_overload_2(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference_stream = client.inference.completion( - content="string", - model_id="model_id", - stream=True, - ) - - inference_stream.response.close() - - @parametrize - def test_method_completion_with_all_params_overload_2(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference_stream = client.inference.completion( - content="string", - model_id="model_id", - stream=True, - logprobs={"top_k": 0}, - response_format={ - "json_schema": {"foo": True}, - "type": "json_schema", - }, - sampling_params={ - "strategy": {"type": "greedy"}, - "max_tokens": 0, - "repetition_penalty": 0, - "stop": ["string"], - }, - ) - - inference_stream.response.close() - - @parametrize - def test_raw_response_completion_overload_2(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - response = client.inference.with_raw_response.completion( - content="string", - model_id="model_id", - stream=True, - ) - - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - stream = response.parse() - stream.close() - - @parametrize - def test_streaming_response_completion_overload_2(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - with client.inference.with_streaming_response.completion( - content="string", - model_id="model_id", - stream=True, - ) as response: - assert not response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - - stream = response.parse() - stream.close() - - assert cast(Any, response.is_closed) is True - @parametrize def test_method_embeddings(self, client: LlamaStackClient) -> None: with pytest.warns(DeprecationWarning): @@ -611,160 +337,6 @@ class TestAsyncInference: "async_client", [False, True, {"http_client": "aiohttp"}], indirect=True, ids=["loose", "strict", "aiohttp"] ) - @parametrize - async def test_method_batch_chat_completion(self, async_client: AsyncLlamaStackClient) -> None: - inference = await async_client.inference.batch_chat_completion( - messages_batch=[ - [ - { - "content": "string", - "role": "user", - } - ] - ], - model_id="model_id", - ) - assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"]) - - @parametrize - async def test_method_batch_chat_completion_with_all_params(self, async_client: 
AsyncLlamaStackClient) -> None: - inference = await async_client.inference.batch_chat_completion( - messages_batch=[ - [ - { - "content": "string", - "role": "user", - "context": "string", - } - ] - ], - model_id="model_id", - logprobs={"top_k": 0}, - response_format={ - "json_schema": {"foo": True}, - "type": "json_schema", - }, - sampling_params={ - "strategy": {"type": "greedy"}, - "max_tokens": 0, - "repetition_penalty": 0, - "stop": ["string"], - }, - tool_config={ - "system_message_behavior": "append", - "tool_choice": "auto", - "tool_prompt_format": "json", - }, - tools=[ - { - "tool_name": "brave_search", - "description": "description", - "parameters": { - "foo": { - "param_type": "param_type", - "default": True, - "description": "description", - "required": True, - } - }, - } - ], - ) - assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"]) - - @parametrize - async def test_raw_response_batch_chat_completion(self, async_client: AsyncLlamaStackClient) -> None: - response = await async_client.inference.with_raw_response.batch_chat_completion( - messages_batch=[ - [ - { - "content": "string", - "role": "user", - } - ] - ], - model_id="model_id", - ) - - assert response.is_closed is True - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - inference = await response.parse() - assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"]) - - @parametrize - async def test_streaming_response_batch_chat_completion(self, async_client: AsyncLlamaStackClient) -> None: - async with async_client.inference.with_streaming_response.batch_chat_completion( - messages_batch=[ - [ - { - "content": "string", - "role": "user", - } - ] - ], - model_id="model_id", - ) as response: - assert not response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - - inference = await response.parse() - assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"]) - - assert cast(Any, response.is_closed) is True - - @parametrize - async def test_method_batch_completion(self, async_client: AsyncLlamaStackClient) -> None: - inference = await async_client.inference.batch_completion( - content_batch=["string"], - model_id="model_id", - ) - assert_matches_type(BatchCompletion, inference, path=["response"]) - - @parametrize - async def test_method_batch_completion_with_all_params(self, async_client: AsyncLlamaStackClient) -> None: - inference = await async_client.inference.batch_completion( - content_batch=["string"], - model_id="model_id", - logprobs={"top_k": 0}, - response_format={ - "json_schema": {"foo": True}, - "type": "json_schema", - }, - sampling_params={ - "strategy": {"type": "greedy"}, - "max_tokens": 0, - "repetition_penalty": 0, - "stop": ["string"], - }, - ) - assert_matches_type(BatchCompletion, inference, path=["response"]) - - @parametrize - async def test_raw_response_batch_completion(self, async_client: AsyncLlamaStackClient) -> None: - response = await async_client.inference.with_raw_response.batch_completion( - content_batch=["string"], - model_id="model_id", - ) - - assert response.is_closed is True - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - inference = await response.parse() - assert_matches_type(BatchCompletion, inference, path=["response"]) - - @parametrize - async def test_streaming_response_batch_completion(self, async_client: AsyncLlamaStackClient) -> None: - async with 
async_client.inference.with_streaming_response.batch_completion( - content_batch=["string"], - model_id="model_id", - ) as response: - assert not response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - - inference = await response.parse() - assert_matches_type(BatchCompletion, inference, path=["response"]) - - assert cast(Any, response.is_closed) is True - @parametrize async def test_method_chat_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None: with pytest.warns(DeprecationWarning): @@ -820,7 +392,9 @@ async def test_method_chat_completion_with_all_params_overload_1(self, async_cli "param_type": "param_type", "default": True, "description": "description", + "items": True, "required": True, + "title": "title", } }, } @@ -923,7 +497,9 @@ async def test_method_chat_completion_with_all_params_overload_2(self, async_cli "param_type": "param_type", "default": True, "description": "description", + "items": True, "required": True, + "title": "title", } }, } @@ -971,128 +547,6 @@ async def test_streaming_response_chat_completion_overload_2(self, async_client: assert cast(Any, response.is_closed) is True - @parametrize - async def test_method_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference = await async_client.inference.completion( - content="string", - model_id="model_id", - ) - - assert_matches_type(CompletionResponse, inference, path=["response"]) - - @parametrize - async def test_method_completion_with_all_params_overload_1(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference = await async_client.inference.completion( - content="string", - model_id="model_id", - logprobs={"top_k": 0}, - response_format={ - "json_schema": {"foo": True}, - "type": "json_schema", - }, - sampling_params={ - "strategy": {"type": "greedy"}, - "max_tokens": 0, - "repetition_penalty": 0, - "stop": ["string"], - }, - stream=False, - ) - - assert_matches_type(CompletionResponse, inference, path=["response"]) - - @parametrize - async def test_raw_response_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - response = await async_client.inference.with_raw_response.completion( - content="string", - model_id="model_id", - ) - - assert response.is_closed is True - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - inference = await response.parse() - assert_matches_type(CompletionResponse, inference, path=["response"]) - - @parametrize - async def test_streaming_response_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - async with async_client.inference.with_streaming_response.completion( - content="string", - model_id="model_id", - ) as response: - assert not response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - - inference = await response.parse() - assert_matches_type(CompletionResponse, inference, path=["response"]) - - assert cast(Any, response.is_closed) is True - - @parametrize - async def test_method_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference_stream = await async_client.inference.completion( - content="string", - model_id="model_id", - stream=True, - ) - - await inference_stream.response.aclose() - - @parametrize - async def 
test_method_completion_with_all_params_overload_2(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference_stream = await async_client.inference.completion( - content="string", - model_id="model_id", - stream=True, - logprobs={"top_k": 0}, - response_format={ - "json_schema": {"foo": True}, - "type": "json_schema", - }, - sampling_params={ - "strategy": {"type": "greedy"}, - "max_tokens": 0, - "repetition_penalty": 0, - "stop": ["string"], - }, - ) - - await inference_stream.response.aclose() - - @parametrize - async def test_raw_response_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - response = await async_client.inference.with_raw_response.completion( - content="string", - model_id="model_id", - stream=True, - ) - - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - stream = await response.parse() - await stream.close() - - @parametrize - async def test_streaming_response_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - async with async_client.inference.with_streaming_response.completion( - content="string", - model_id="model_id", - stream=True, - ) as response: - assert not response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - - stream = await response.parse() - await stream.close() - - assert cast(Any, response.is_closed) is True - @parametrize async def test_method_embeddings(self, async_client: AsyncLlamaStackClient) -> None: with pytest.warns(DeprecationWarning): From 7f24c432dc1859312710a4a1ff4a80f6f861bee8 Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Tue, 30 Sep 2025 01:55:37 +0000 Subject: [PATCH 2/8] feat(api): expires_after changes for /files --- .stats.yml | 6 +- README.md | 3 - api.md | 11 +- src/llama_stack_client/resources/files.py | 34 +-- src/llama_stack_client/resources/inference.py | 284 +----------------- .../resources/models/models.py | 4 +- .../resources/models/openai.py | 18 +- src/llama_stack_client/types/__init__.py | 5 +- .../types/embeddings_response.py | 16 - .../types/file_create_params.py | 10 +- .../types/inference_embeddings_params.py | 46 --- .../types/inference_rerank_params.py | 106 ------- .../types/inference_rerank_response.py | 23 -- .../types/list_models_response.py | 10 + .../types/model_list_response.py | 19 +- .../types/models/openai_list_response.py | 19 +- .../types/response_list_response.py | 3 - .../types/response_object.py | 3 - tests/api_resources/models/test_openai.py | 14 +- tests/api_resources/test_files.py | 18 -- tests/api_resources/test_inference.py | 200 ------------ 21 files changed, 51 insertions(+), 801 deletions(-) delete mode 100644 src/llama_stack_client/types/embeddings_response.py delete mode 100644 src/llama_stack_client/types/inference_embeddings_params.py delete mode 100644 src/llama_stack_client/types/inference_rerank_params.py delete mode 100644 src/llama_stack_client/types/inference_rerank_response.py create mode 100644 src/llama_stack_client/types/list_models_response.py diff --git a/.stats.yml b/.stats.yml index e5bf0be0..016bf7b6 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ -configured_endpoints: 107 -openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/llamastack%2Fllama-stack-client-1eddf141208c131ee4a64ef996f8f419b444f60450de6807a9f6bc711ed8b661.yml 
-openapi_spec_hash: 94765c67ea99b1358169d41d810dd395 +configured_endpoints: 105 +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/llamastack%2Fllama-stack-client-adcfaad1990d45e42b20e200a9ecc35ee32df5692bd9cd18ae898b0b7728c919.yml +openapi_spec_hash: 4f532287bafe5da0578a1c1a5e31c952 config_hash: 7ec5a583f9c26b38993013bdfb0e7d46 diff --git a/README.md b/README.md index cbd6bf78..d448f59d 100644 --- a/README.md +++ b/README.md @@ -151,10 +151,7 @@ from llama_stack_client import LlamaStackClient client = LlamaStackClient() client.files.create( - expires_after_anchor="expires_after_anchor", - expires_after_seconds=0, file=Path("/path/to/file"), - purpose="assistants", ) ``` diff --git a/api.md b/api.md index 97d40b31..ad4e635c 100644 --- a/api.md +++ b/api.md @@ -241,19 +241,12 @@ Methods: Types: ```python -from llama_stack_client.types import ( - ChatCompletionResponseStreamChunk, - EmbeddingsResponse, - TokenLogProbs, - InferenceRerankResponse, -) +from llama_stack_client.types import ChatCompletionResponseStreamChunk, TokenLogProbs ``` Methods: - client.inference.chat_completion(\*\*params) -> ChatCompletionResponse -- client.inference.embeddings(\*\*params) -> EmbeddingsResponse -- client.inference.rerank(\*\*params) -> InferenceRerankResponse # Embeddings @@ -406,7 +399,7 @@ from llama_stack_client.types.models import OpenAIListResponse Methods: -- client.models.openai.list() -> OpenAIListResponse +- client.models.openai.list() -> ModelListResponse # PostTraining diff --git a/src/llama_stack_client/resources/files.py b/src/llama_stack_client/resources/files.py index e8f20d35..04c37c56 100644 --- a/src/llama_stack_client/resources/files.py +++ b/src/llama_stack_client/resources/files.py @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import Mapping, Optional, cast +from typing import Mapping, cast from typing_extensions import Literal import httpx @@ -49,10 +49,7 @@ def with_streaming_response(self) -> FilesResourceWithStreamingResponse: def create( self, *, - expires_after_anchor: Optional[str], - expires_after_seconds: Optional[int], file: FileTypes, - purpose: Literal["assistants", "batch"], # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. extra_headers: Headers | None = None, @@ -68,12 +65,8 @@ def create( - file: The File object (not file name) to be uploaded. - purpose: The intended purpose of the uploaded file. - expires_after: Optional form values describing expiration for the file. - Expected expires_after[anchor] = "created_at", expires_after[seconds] = - {integer}. Seconds must be between 3600 and 2592000 (1 hour to 30 days). Args: - purpose: Valid purpose values for OpenAI Files API. - extra_headers: Send extra headers extra_query: Add additional query parameters to the request @@ -82,14 +75,7 @@ def create( timeout: Override the client-level default timeout for this request, in seconds """ - body = deepcopy_minimal( - { - "expires_after_anchor": expires_after_anchor, - "expires_after_seconds": expires_after_seconds, - "file": file, - "purpose": purpose, - } - ) + body = deepcopy_minimal({"file": file}) files = extract_files(cast(Mapping[str, object], body), paths=[["file"]]) # It should be noted that the actual Content-Type header that will be # sent to the server will contain a `boundary` parameter, e.g. 
@@ -288,10 +274,7 @@ def with_streaming_response(self) -> AsyncFilesResourceWithStreamingResponse: async def create( self, *, - expires_after_anchor: Optional[str], - expires_after_seconds: Optional[int], file: FileTypes, - purpose: Literal["assistants", "batch"], # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. extra_headers: Headers | None = None, @@ -307,12 +290,8 @@ async def create( - file: The File object (not file name) to be uploaded. - purpose: The intended purpose of the uploaded file. - expires_after: Optional form values describing expiration for the file. - Expected expires_after[anchor] = "created_at", expires_after[seconds] = - {integer}. Seconds must be between 3600 and 2592000 (1 hour to 30 days). Args: - purpose: Valid purpose values for OpenAI Files API. - extra_headers: Send extra headers extra_query: Add additional query parameters to the request @@ -321,14 +300,7 @@ async def create( timeout: Override the client-level default timeout for this request, in seconds """ - body = deepcopy_minimal( - { - "expires_after_anchor": expires_after_anchor, - "expires_after_seconds": expires_after_seconds, - "file": file, - "purpose": purpose, - } - ) + body = deepcopy_minimal({"file": file}) files = extract_files(cast(Mapping[str, object], body), paths=[["file"]]) # It should be noted that the actual Content-Type header that will be # sent to the server will contain a `boundary` parameter, e.g. diff --git a/src/llama_stack_client/resources/inference.py b/src/llama_stack_client/resources/inference.py index 9a2b6c50..bac5cb3e 100644 --- a/src/llama_stack_client/resources/inference.py +++ b/src/llama_stack_client/resources/inference.py @@ -3,13 +3,13 @@ from __future__ import annotations import typing_extensions -from typing import Type, Union, Iterable, cast +from typing import Iterable from typing_extensions import Literal, overload import httpx -from ..types import inference_rerank_params, inference_embeddings_params, inference_chat_completion_params -from .._types import Body, Omit, Query, Headers, NotGiven, SequenceNotStr, omit, not_given +from ..types import inference_chat_completion_params +from .._types import Body, Omit, Query, Headers, NotGiven, omit, not_given from .._utils import required_args, maybe_transform, async_maybe_transform from .._compat import cached_property from .._resource import SyncAPIResource, AsyncAPIResource @@ -19,17 +19,13 @@ async_to_raw_response_wrapper, async_to_streamed_response_wrapper, ) -from .._wrappers import DataWrapper from .._streaming import Stream, AsyncStream from .._base_client import make_request_options -from ..types.embeddings_response import EmbeddingsResponse from ..types.shared_params.message import Message -from ..types.inference_rerank_response import InferenceRerankResponse from ..types.shared_params.response_format import ResponseFormat from ..types.shared_params.sampling_params import SamplingParams from ..types.shared.chat_completion_response import ChatCompletionResponse from ..types.chat_completion_response_stream_chunk import ChatCompletionResponseStreamChunk -from ..types.shared_params.interleaved_content_item import InterleavedContentItem __all__ = ["InferenceResource", "AsyncInferenceResource"] @@ -495,126 +491,7 @@ def completion( stream=stream or False, stream_cls=Stream[CompletionResponse], ) - - 
@typing_extensions.deprecated("/v1/inference/embeddings is deprecated. Please use /v1/embeddings.") - def embeddings( - self, - *, - contents: Union[SequenceNotStr[str], Iterable[InterleavedContentItem]], - model_id: str, - output_dimension: int | Omit = omit, - task_type: Literal["query", "document"] | Omit = omit, - text_truncation: Literal["none", "start", "end"] | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> EmbeddingsResponse: - """ - Generate embeddings for content pieces using the specified model. - - Args: - contents: List of contents to generate embeddings for. Each content can be a string or an - InterleavedContentItem (and hence can be multimodal). The behavior depends on - the model and provider. Some models may only support text. - - model_id: The identifier of the model to use. The model must be an embedding model - registered with Llama Stack and available via the /models endpoint. - - output_dimension: (Optional) Output dimensionality for the embeddings. Only supported by - Matryoshka models. - - task_type: (Optional) How is the embedding being used? This is only supported by asymmetric - embedding models. - - text_truncation: (Optional) Config for how to truncate text for embedding when text is longer - than the model's max sequence length. - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - return self._post( - "/v1/inference/embeddings", - body=maybe_transform( - { - "contents": contents, - "model_id": model_id, - "output_dimension": output_dimension, - "task_type": task_type, - "text_truncation": text_truncation, - }, - inference_embeddings_params.InferenceEmbeddingsParams, - ), - options=make_request_options( - extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout - ), - cast_to=EmbeddingsResponse, - ) - - def rerank( - self, - *, - items: SequenceNotStr[inference_rerank_params.Item], - model: str, - query: inference_rerank_params.Query, - max_num_results: int | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> InferenceRerankResponse: - """ - Rerank a list of documents based on their relevance to a query. - - Args: - items: List of items to rerank. Each item can be a string, text content part, or image - content part. Each input must not exceed the model's max input token length. - - model: The identifier of the reranking model to use. - - query: The search query to rank items against. Can be a string, text content part, or - image content part. The input must not exceed the model's max input token - length. - - max_num_results: (Optional) Maximum number of results to return. Default: returns all. 
- - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - return self._post( - "/v1/inference/rerank", - body=maybe_transform( - { - "items": items, - "model": model, - "query": query, - "max_num_results": max_num_results, - }, - inference_rerank_params.InferenceRerankParams, - ), - options=make_request_options( - extra_headers=extra_headers, - extra_query=extra_query, - extra_body=extra_body, - timeout=timeout, - post_parser=DataWrapper[InferenceRerankResponse]._unwrapper, - ), - cast_to=cast(Type[InferenceRerankResponse], DataWrapper[InferenceRerankResponse]), - ) + class AsyncInferenceResource(AsyncAPIResource): @@ -1078,126 +955,7 @@ async def completion( stream=stream or False, stream_cls=AsyncStream[CompletionResponse], ) - - @typing_extensions.deprecated("/v1/inference/embeddings is deprecated. Please use /v1/embeddings.") - async def embeddings( - self, - *, - contents: Union[SequenceNotStr[str], Iterable[InterleavedContentItem]], - model_id: str, - output_dimension: int | Omit = omit, - task_type: Literal["query", "document"] | Omit = omit, - text_truncation: Literal["none", "start", "end"] | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> EmbeddingsResponse: - """ - Generate embeddings for content pieces using the specified model. - - Args: - contents: List of contents to generate embeddings for. Each content can be a string or an - InterleavedContentItem (and hence can be multimodal). The behavior depends on - the model and provider. Some models may only support text. - - model_id: The identifier of the model to use. The model must be an embedding model - registered with Llama Stack and available via the /models endpoint. - - output_dimension: (Optional) Output dimensionality for the embeddings. Only supported by - Matryoshka models. - - task_type: (Optional) How is the embedding being used? This is only supported by asymmetric - embedding models. - - text_truncation: (Optional) Config for how to truncate text for embedding when text is longer - than the model's max sequence length. 
- - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - return await self._post( - "/v1/inference/embeddings", - body=await async_maybe_transform( - { - "contents": contents, - "model_id": model_id, - "output_dimension": output_dimension, - "task_type": task_type, - "text_truncation": text_truncation, - }, - inference_embeddings_params.InferenceEmbeddingsParams, - ), - options=make_request_options( - extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout - ), - cast_to=EmbeddingsResponse, - ) - - async def rerank( - self, - *, - items: SequenceNotStr[inference_rerank_params.Item], - model: str, - query: inference_rerank_params.Query, - max_num_results: int | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> InferenceRerankResponse: - """ - Rerank a list of documents based on their relevance to a query. - - Args: - items: List of items to rerank. Each item can be a string, text content part, or image - content part. Each input must not exceed the model's max input token length. - - model: The identifier of the reranking model to use. - - query: The search query to rank items against. Can be a string, text content part, or - image content part. The input must not exceed the model's max input token - length. - - max_num_results: (Optional) Maximum number of results to return. Default: returns all. 
- - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - return await self._post( - "/v1/inference/rerank", - body=await async_maybe_transform( - { - "items": items, - "model": model, - "query": query, - "max_num_results": max_num_results, - }, - inference_rerank_params.InferenceRerankParams, - ), - options=make_request_options( - extra_headers=extra_headers, - extra_query=extra_query, - extra_body=extra_body, - timeout=timeout, - post_parser=DataWrapper[InferenceRerankResponse]._unwrapper, - ), - cast_to=cast(Type[InferenceRerankResponse], DataWrapper[InferenceRerankResponse]), - ) + class InferenceResourceWithRawResponse: @@ -1209,14 +967,6 @@ def __init__(self, inference: InferenceResource) -> None: inference.chat_completion, # pyright: ignore[reportDeprecated], ) ) - self.embeddings = ( # pyright: ignore[reportDeprecated] - to_raw_response_wrapper( - inference.embeddings, # pyright: ignore[reportDeprecated], - ) - ) - self.rerank = to_raw_response_wrapper( - inference.rerank, - ) class AsyncInferenceResourceWithRawResponse: @@ -1228,14 +978,6 @@ def __init__(self, inference: AsyncInferenceResource) -> None: inference.chat_completion, # pyright: ignore[reportDeprecated], ) ) - self.embeddings = ( # pyright: ignore[reportDeprecated] - async_to_raw_response_wrapper( - inference.embeddings, # pyright: ignore[reportDeprecated], - ) - ) - self.rerank = async_to_raw_response_wrapper( - inference.rerank, - ) class InferenceResourceWithStreamingResponse: @@ -1247,14 +989,6 @@ def __init__(self, inference: InferenceResource) -> None: inference.chat_completion, # pyright: ignore[reportDeprecated], ) ) - self.embeddings = ( # pyright: ignore[reportDeprecated] - to_streamed_response_wrapper( - inference.embeddings, # pyright: ignore[reportDeprecated], - ) - ) - self.rerank = to_streamed_response_wrapper( - inference.rerank, - ) class AsyncInferenceResourceWithStreamingResponse: @@ -1266,11 +1000,3 @@ def __init__(self, inference: AsyncInferenceResource) -> None: inference.chat_completion, # pyright: ignore[reportDeprecated], ) ) - self.embeddings = ( # pyright: ignore[reportDeprecated] - async_to_streamed_response_wrapper( - inference.embeddings, # pyright: ignore[reportDeprecated], - ) - ) - self.rerank = async_to_streamed_response_wrapper( - inference.rerank, - ) diff --git a/src/llama_stack_client/resources/models/models.py b/src/llama_stack_client/resources/models/models.py index 72d9f81e..f044c50d 100644 --- a/src/llama_stack_client/resources/models/models.py +++ b/src/llama_stack_client/resources/models/models.py @@ -101,7 +101,7 @@ def list( extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = not_given, ) -> ModelListResponse: - """List models using the OpenAI API.""" + """List all models.""" return self._get( "/v1/models", options=make_request_options( @@ -271,7 +271,7 @@ async def list( extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = not_given, ) -> ModelListResponse: - """List models using the OpenAI API.""" + """List all models.""" return await self._get( "/v1/models", options=make_request_options( diff --git a/src/llama_stack_client/resources/models/openai.py b/src/llama_stack_client/resources/models/openai.py index 57179ed8..ab4b4038 100644 --- a/src/llama_stack_client/resources/models/openai.py +++ 
b/src/llama_stack_client/resources/models/openai.py @@ -17,7 +17,7 @@ ) from ..._wrappers import DataWrapper from ..._base_client import make_request_options -from ...types.models.openai_list_response import OpenAIListResponse +from ...types.model_list_response import ModelListResponse __all__ = ["OpenAIResource", "AsyncOpenAIResource"] @@ -51,8 +51,8 @@ def list( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> OpenAIListResponse: - """List models using the OpenAI API.""" + ) -> ModelListResponse: + """List all models.""" return self._get( "/v1/models", options=make_request_options( @@ -60,9 +60,9 @@ def list( extra_query=extra_query, extra_body=extra_body, timeout=timeout, - post_parser=DataWrapper[OpenAIListResponse]._unwrapper, + post_parser=DataWrapper[ModelListResponse]._unwrapper, ), - cast_to=cast(Type[OpenAIListResponse], DataWrapper[OpenAIListResponse]), + cast_to=cast(Type[ModelListResponse], DataWrapper[ModelListResponse]), ) @@ -95,8 +95,8 @@ async def list( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> OpenAIListResponse: - """List models using the OpenAI API.""" + ) -> ModelListResponse: + """List all models.""" return await self._get( "/v1/models", options=make_request_options( @@ -104,9 +104,9 @@ async def list( extra_query=extra_query, extra_body=extra_body, timeout=timeout, - post_parser=DataWrapper[OpenAIListResponse]._unwrapper, + post_parser=DataWrapper[ModelListResponse]._unwrapper, ), - cast_to=cast(Type[OpenAIListResponse], DataWrapper[OpenAIListResponse]), + cast_to=cast(Type[ModelListResponse], DataWrapper[ModelListResponse]), ) diff --git a/src/llama_stack_client/types/__init__.py b/src/llama_stack_client/types/__init__.py index e63970ee..8a61ceec 100644 --- a/src/llama_stack_client/types/__init__.py +++ b/src/llama_stack_client/types/__init__.py @@ -60,7 +60,6 @@ from .tool_list_response import ToolListResponse as ToolListResponse from .agent_create_params import AgentCreateParams as AgentCreateParams from .agent_list_response import AgentListResponse as AgentListResponse -from .embeddings_response import EmbeddingsResponse as EmbeddingsResponse from .list_files_response import ListFilesResponse as ListFilesResponse from .list_tools_response import ListToolsResponse as ListToolsResponse from .model_list_response import ModelListResponse as ModelListResponse @@ -71,6 +70,7 @@ from .delete_file_response import DeleteFileResponse as DeleteFileResponse from .eval_candidate_param import EvalCandidateParam as EvalCandidateParam from .eval_run_eval_params import EvalRunEvalParams as EvalRunEvalParams +from .list_models_response import ListModelsResponse as ListModelsResponse from .list_routes_response import ListRoutesResponse as ListRoutesResponse from .query_spans_response import QuerySpansResponse as QuerySpansResponse from .response_list_params import ResponseListParams as ResponseListParams @@ -100,7 +100,6 @@ from .dataset_iterrows_params import DatasetIterrowsParams as DatasetIterrowsParams from .dataset_register_params import DatasetRegisterParams as DatasetRegisterParams from .embedding_create_params import EmbeddingCreateParams as EmbeddingCreateParams -from .inference_rerank_params import InferenceRerankParams as InferenceRerankParams from .list_providers_response import ListProvidersResponse as ListProvidersResponse from .scoring_fn_params_param import ScoringFnParamsParam as 
ScoringFnParamsParam from .toolgroup_list_response import ToolgroupListResponse as ToolgroupListResponse @@ -119,7 +118,6 @@ from .dataset_register_response import DatasetRegisterResponse as DatasetRegisterResponse from .dataset_retrieve_response import DatasetRetrieveResponse as DatasetRetrieveResponse from .eval_evaluate_rows_params import EvalEvaluateRowsParams as EvalEvaluateRowsParams -from .inference_rerank_response import InferenceRerankResponse as InferenceRerankResponse from .list_tool_groups_response import ListToolGroupsResponse as ListToolGroupsResponse from .toolgroup_register_params import ToolgroupRegisterParams as ToolgroupRegisterParams from .vector_db_register_params import VectorDBRegisterParams as VectorDBRegisterParams @@ -131,7 +129,6 @@ from .vector_store_create_params import VectorStoreCreateParams as VectorStoreCreateParams from .vector_store_search_params import VectorStoreSearchParams as VectorStoreSearchParams from .vector_store_update_params import VectorStoreUpdateParams as VectorStoreUpdateParams -from .inference_embeddings_params import InferenceEmbeddingsParams as InferenceEmbeddingsParams from .list_vector_stores_response import ListVectorStoresResponse as ListVectorStoresResponse from .telemetry_get_span_response import TelemetryGetSpanResponse as TelemetryGetSpanResponse from .vector_db_register_response import VectorDBRegisterResponse as VectorDBRegisterResponse diff --git a/src/llama_stack_client/types/embeddings_response.py b/src/llama_stack_client/types/embeddings_response.py deleted file mode 100644 index f36c6b97..00000000 --- a/src/llama_stack_client/types/embeddings_response.py +++ /dev/null @@ -1,16 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from typing import List - -from .._models import BaseModel - -__all__ = ["EmbeddingsResponse"] - - -class EmbeddingsResponse(BaseModel): - embeddings: List[List[float]] - """List of embedding vectors, one per input content. - - Each embedding is a list of floats. The dimensionality of the embedding is - model-specific; you can check model metadata using /models/{model_id} - """ diff --git a/src/llama_stack_client/types/file_create_params.py b/src/llama_stack_client/types/file_create_params.py index a1197ff5..6278e1a0 100644 --- a/src/llama_stack_client/types/file_create_params.py +++ b/src/llama_stack_client/types/file_create_params.py @@ -2,8 +2,7 @@ from __future__ import annotations -from typing import Optional -from typing_extensions import Literal, Required, TypedDict +from typing_extensions import Required, TypedDict from .._types import FileTypes @@ -11,11 +10,4 @@ class FileCreateParams(TypedDict, total=False): - expires_after_anchor: Required[Optional[str]] - - expires_after_seconds: Required[Optional[int]] - file: Required[FileTypes] - - purpose: Required[Literal["assistants", "batch"]] - """Valid purpose values for OpenAI Files API.""" diff --git a/src/llama_stack_client/types/inference_embeddings_params.py b/src/llama_stack_client/types/inference_embeddings_params.py deleted file mode 100644 index a1be545b..00000000 --- a/src/llama_stack_client/types/inference_embeddings_params.py +++ /dev/null @@ -1,46 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
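With `FileCreateParams` trimmed as above, only `file` remains required. A minimal sketch of the resulting call, mirroring the updated tests later in this patch (the bytes are placeholder content):

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

# Only `file` is required after this change; the other fields were dropped from the params.
uploaded = client.files.create(file=b"raw file contents")
print(uploaded)
```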
- -from __future__ import annotations - -from typing import Union, Iterable -from typing_extensions import Literal, Required, TypedDict - -from .._types import SequenceNotStr -from .shared_params.interleaved_content_item import InterleavedContentItem - -__all__ = ["InferenceEmbeddingsParams"] - - -class InferenceEmbeddingsParams(TypedDict, total=False): - contents: Required[Union[SequenceNotStr[str], Iterable[InterleavedContentItem]]] - """List of contents to generate embeddings for. - - Each content can be a string or an InterleavedContentItem (and hence can be - multimodal). The behavior depends on the model and provider. Some models may - only support text. - """ - - model_id: Required[str] - """The identifier of the model to use. - - The model must be an embedding model registered with Llama Stack and available - via the /models endpoint. - """ - - output_dimension: int - """(Optional) Output dimensionality for the embeddings. - - Only supported by Matryoshka models. - """ - - task_type: Literal["query", "document"] - """ - (Optional) How is the embedding being used? This is only supported by asymmetric - embedding models. - """ - - text_truncation: Literal["none", "start", "end"] - """ - (Optional) Config for how to truncate text for embedding when text is longer - than the model's max sequence length. - """ diff --git a/src/llama_stack_client/types/inference_rerank_params.py b/src/llama_stack_client/types/inference_rerank_params.py deleted file mode 100644 index 8f8c4d64..00000000 --- a/src/llama_stack_client/types/inference_rerank_params.py +++ /dev/null @@ -1,106 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from __future__ import annotations - -from typing import Union -from typing_extensions import Literal, Required, TypeAlias, TypedDict - -from .._types import SequenceNotStr - -__all__ = [ - "InferenceRerankParams", - "Item", - "ItemOpenAIChatCompletionContentPartTextParam", - "ItemOpenAIChatCompletionContentPartImageParam", - "ItemOpenAIChatCompletionContentPartImageParamImageURL", - "Query", - "QueryOpenAIChatCompletionContentPartTextParam", - "QueryOpenAIChatCompletionContentPartImageParam", - "QueryOpenAIChatCompletionContentPartImageParamImageURL", -] - - -class InferenceRerankParams(TypedDict, total=False): - items: Required[SequenceNotStr[Item]] - """List of items to rerank. - - Each item can be a string, text content part, or image content part. Each input - must not exceed the model's max input token length. - """ - - model: Required[str] - """The identifier of the reranking model to use.""" - - query: Required[Query] - """The search query to rank items against. - - Can be a string, text content part, or image content part. The input must not - exceed the model's max input token length. - """ - - max_num_results: int - """(Optional) Maximum number of results to return. Default: returns all.""" - - -class ItemOpenAIChatCompletionContentPartTextParam(TypedDict, total=False): - text: Required[str] - """The text content of the message""" - - type: Required[Literal["text"]] - """Must be "text" to identify this as text content""" - - -class ItemOpenAIChatCompletionContentPartImageParamImageURL(TypedDict, total=False): - url: Required[str] - """URL of the image to include in the message""" - - detail: str - """(Optional) Level of detail for image processing. 
- - Can be "low", "high", or "auto" - """ - - -class ItemOpenAIChatCompletionContentPartImageParam(TypedDict, total=False): - image_url: Required[ItemOpenAIChatCompletionContentPartImageParamImageURL] - """Image URL specification and processing details""" - - type: Required[Literal["image_url"]] - """Must be "image_url" to identify this as image content""" - - -Item: TypeAlias = Union[ - str, ItemOpenAIChatCompletionContentPartTextParam, ItemOpenAIChatCompletionContentPartImageParam -] - - -class QueryOpenAIChatCompletionContentPartTextParam(TypedDict, total=False): - text: Required[str] - """The text content of the message""" - - type: Required[Literal["text"]] - """Must be "text" to identify this as text content""" - - -class QueryOpenAIChatCompletionContentPartImageParamImageURL(TypedDict, total=False): - url: Required[str] - """URL of the image to include in the message""" - - detail: str - """(Optional) Level of detail for image processing. - - Can be "low", "high", or "auto" - """ - - -class QueryOpenAIChatCompletionContentPartImageParam(TypedDict, total=False): - image_url: Required[QueryOpenAIChatCompletionContentPartImageParamImageURL] - """Image URL specification and processing details""" - - type: Required[Literal["image_url"]] - """Must be "image_url" to identify this as image content""" - - -Query: TypeAlias = Union[ - str, QueryOpenAIChatCompletionContentPartTextParam, QueryOpenAIChatCompletionContentPartImageParam -] diff --git a/src/llama_stack_client/types/inference_rerank_response.py b/src/llama_stack_client/types/inference_rerank_response.py deleted file mode 100644 index e74fc7e6..00000000 --- a/src/llama_stack_client/types/inference_rerank_response.py +++ /dev/null @@ -1,23 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from typing import List -from typing_extensions import TypeAlias - -from .._models import BaseModel - -__all__ = ["InferenceRerankResponse", "InferenceRerankResponseItem"] - - -class InferenceRerankResponseItem(BaseModel): - index: int - """The original index of the document in the input list""" - - relevance_score: float - """The relevance score from the model output. - - Values are inverted when applicable so that higher scores indicate greater - relevance. - """ - - -InferenceRerankResponse: TypeAlias = List[InferenceRerankResponseItem] diff --git a/src/llama_stack_client/types/list_models_response.py b/src/llama_stack_client/types/list_models_response.py new file mode 100644 index 00000000..a36896b8 --- /dev/null +++ b/src/llama_stack_client/types/list_models_response.py @@ -0,0 +1,10 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from .._models import BaseModel +from .model_list_response import ModelListResponse + +__all__ = ["ListModelsResponse"] + + +class ListModelsResponse(BaseModel): + data: ModelListResponse diff --git a/src/llama_stack_client/types/model_list_response.py b/src/llama_stack_client/types/model_list_response.py index 7631b69f..905cdb0f 100644 --- a/src/llama_stack_client/types/model_list_response.py +++ b/src/llama_stack_client/types/model_list_response.py @@ -1,21 +1,10 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
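The new `ListModelsResponse` above is just the `{"data": [...]}` envelope; the list endpoints unwrap it with the `DataWrapper` post-parser seen earlier in the models resource hunks, so callers receive the bare list. A hedged usage sketch after this change:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

# The client strips the {"data": [...]} envelope, so list() returns the bare
# ModelListResponse, which after this change is simply a list of Model objects.
models = client.models.list()
print(len(models))
if models:
    print(models[0])
```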
from typing import List -from typing_extensions import Literal, TypeAlias +from typing_extensions import TypeAlias -from .._models import BaseModel +from .model import Model -__all__ = ["ModelListResponse", "ModelListResponseItem"] +__all__ = ["ModelListResponse"] - -class ModelListResponseItem(BaseModel): - id: str - - created: int - - object: Literal["model"] - - owned_by: str - - -ModelListResponse: TypeAlias = List[ModelListResponseItem] +ModelListResponse: TypeAlias = List[Model] diff --git a/src/llama_stack_client/types/models/openai_list_response.py b/src/llama_stack_client/types/models/openai_list_response.py index f14845d5..5b6c0358 100644 --- a/src/llama_stack_client/types/models/openai_list_response.py +++ b/src/llama_stack_client/types/models/openai_list_response.py @@ -1,21 +1,10 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. from typing import List -from typing_extensions import Literal, TypeAlias +from typing_extensions import TypeAlias -from ..._models import BaseModel +from ..model import Model -__all__ = ["OpenAIListResponse", "OpenAIListResponseItem"] +__all__ = ["OpenAIListResponse"] - -class OpenAIListResponseItem(BaseModel): - id: str - - created: int - - object: Literal["model"] - - owned_by: str - - -OpenAIListResponse: TypeAlias = List[OpenAIListResponseItem] +OpenAIListResponse: TypeAlias = List[Model] diff --git a/src/llama_stack_client/types/response_list_response.py b/src/llama_stack_client/types/response_list_response.py index ae50d44a..ac7ec1b1 100644 --- a/src/llama_stack_client/types/response_list_response.py +++ b/src/llama_stack_client/types/response_list_response.py @@ -570,6 +570,3 @@ class ResponseListResponse(BaseModel): truncation: Optional[str] = None """(Optional) Truncation strategy applied to the response""" - - user: Optional[str] = None - """(Optional) User identifier associated with the request""" diff --git a/src/llama_stack_client/types/response_object.py b/src/llama_stack_client/types/response_object.py index c0f348a9..b618ddf5 100644 --- a/src/llama_stack_client/types/response_object.py +++ b/src/llama_stack_client/types/response_object.py @@ -361,6 +361,3 @@ def output_text(self) -> str: truncation: Optional[str] = None """(Optional) Truncation strategy applied to the response""" - - user: Optional[str] = None - """(Optional) User identifier associated with the request""" diff --git a/tests/api_resources/models/test_openai.py b/tests/api_resources/models/test_openai.py index ea64cce2..f94d2bf6 100644 --- a/tests/api_resources/models/test_openai.py +++ b/tests/api_resources/models/test_openai.py @@ -9,7 +9,7 @@ from tests.utils import assert_matches_type from llama_stack_client import LlamaStackClient, AsyncLlamaStackClient -from llama_stack_client.types.models import OpenAIListResponse +from llama_stack_client.types import ModelListResponse base_url = os.environ.get("TEST_API_BASE_URL", "http://127.0.0.1:4010") @@ -20,7 +20,7 @@ class TestOpenAI: @parametrize def test_method_list(self, client: LlamaStackClient) -> None: openai = client.models.openai.list() - assert_matches_type(OpenAIListResponse, openai, path=["response"]) + assert_matches_type(ModelListResponse, openai, path=["response"]) @parametrize def test_raw_response_list(self, client: LlamaStackClient) -> None: @@ -29,7 +29,7 @@ def test_raw_response_list(self, client: LlamaStackClient) -> None: assert response.is_closed is True assert response.http_request.headers.get("X-Stainless-Lang") == "python" openai = response.parse() - 
assert_matches_type(OpenAIListResponse, openai, path=["response"]) + assert_matches_type(ModelListResponse, openai, path=["response"]) @parametrize def test_streaming_response_list(self, client: LlamaStackClient) -> None: @@ -38,7 +38,7 @@ def test_streaming_response_list(self, client: LlamaStackClient) -> None: assert response.http_request.headers.get("X-Stainless-Lang") == "python" openai = response.parse() - assert_matches_type(OpenAIListResponse, openai, path=["response"]) + assert_matches_type(ModelListResponse, openai, path=["response"]) assert cast(Any, response.is_closed) is True @@ -51,7 +51,7 @@ class TestAsyncOpenAI: @parametrize async def test_method_list(self, async_client: AsyncLlamaStackClient) -> None: openai = await async_client.models.openai.list() - assert_matches_type(OpenAIListResponse, openai, path=["response"]) + assert_matches_type(ModelListResponse, openai, path=["response"]) @parametrize async def test_raw_response_list(self, async_client: AsyncLlamaStackClient) -> None: @@ -60,7 +60,7 @@ async def test_raw_response_list(self, async_client: AsyncLlamaStackClient) -> N assert response.is_closed is True assert response.http_request.headers.get("X-Stainless-Lang") == "python" openai = await response.parse() - assert_matches_type(OpenAIListResponse, openai, path=["response"]) + assert_matches_type(ModelListResponse, openai, path=["response"]) @parametrize async def test_streaming_response_list(self, async_client: AsyncLlamaStackClient) -> None: @@ -69,6 +69,6 @@ async def test_streaming_response_list(self, async_client: AsyncLlamaStackClient assert response.http_request.headers.get("X-Stainless-Lang") == "python" openai = await response.parse() - assert_matches_type(OpenAIListResponse, openai, path=["response"]) + assert_matches_type(ModelListResponse, openai, path=["response"]) assert cast(Any, response.is_closed) is True diff --git a/tests/api_resources/test_files.py b/tests/api_resources/test_files.py index f2bc1e0a..bdf81d4f 100644 --- a/tests/api_resources/test_files.py +++ b/tests/api_resources/test_files.py @@ -21,20 +21,14 @@ class TestFiles: @parametrize def test_method_create(self, client: LlamaStackClient) -> None: file = client.files.create( - expires_after_anchor="expires_after_anchor", - expires_after_seconds=0, file=b"raw file contents", - purpose="assistants", ) assert_matches_type(File, file, path=["response"]) @parametrize def test_raw_response_create(self, client: LlamaStackClient) -> None: response = client.files.with_raw_response.create( - expires_after_anchor="expires_after_anchor", - expires_after_seconds=0, file=b"raw file contents", - purpose="assistants", ) assert response.is_closed is True @@ -45,10 +39,7 @@ def test_raw_response_create(self, client: LlamaStackClient) -> None: @parametrize def test_streaming_response_create(self, client: LlamaStackClient) -> None: with client.files.with_streaming_response.create( - expires_after_anchor="expires_after_anchor", - expires_after_seconds=0, file=b"raw file contents", - purpose="assistants", ) as response: assert not response.is_closed assert response.http_request.headers.get("X-Stainless-Lang") == "python" @@ -216,20 +207,14 @@ class TestAsyncFiles: @parametrize async def test_method_create(self, async_client: AsyncLlamaStackClient) -> None: file = await async_client.files.create( - expires_after_anchor="expires_after_anchor", - expires_after_seconds=0, file=b"raw file contents", - purpose="assistants", ) assert_matches_type(File, file, path=["response"]) @parametrize async def 
test_raw_response_create(self, async_client: AsyncLlamaStackClient) -> None: response = await async_client.files.with_raw_response.create( - expires_after_anchor="expires_after_anchor", - expires_after_seconds=0, file=b"raw file contents", - purpose="assistants", ) assert response.is_closed is True @@ -240,10 +225,7 @@ async def test_raw_response_create(self, async_client: AsyncLlamaStackClient) -> @parametrize async def test_streaming_response_create(self, async_client: AsyncLlamaStackClient) -> None: async with async_client.files.with_streaming_response.create( - expires_after_anchor="expires_after_anchor", - expires_after_seconds=0, file=b"raw file contents", - purpose="assistants", ) as response: assert not response.is_closed assert response.http_request.headers.get("X-Stainless-Lang") == "python" diff --git a/tests/api_resources/test_inference.py b/tests/api_resources/test_inference.py index 6e952637..6fc8040b 100644 --- a/tests/api_resources/test_inference.py +++ b/tests/api_resources/test_inference.py @@ -9,10 +9,6 @@ from tests.utils import assert_matches_type from llama_stack_client import LlamaStackClient, AsyncLlamaStackClient -from llama_stack_client.types import ( - EmbeddingsResponse, - InferenceRerankResponse, -) from llama_stack_client.types.shared import ChatCompletionResponse # pyright: reportDeprecated=false @@ -233,104 +229,6 @@ def test_streaming_response_chat_completion_overload_2(self, client: LlamaStackC assert cast(Any, response.is_closed) is True - @parametrize - def test_method_embeddings(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference = client.inference.embeddings( - contents=["string"], - model_id="model_id", - ) - - assert_matches_type(EmbeddingsResponse, inference, path=["response"]) - - @parametrize - def test_method_embeddings_with_all_params(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference = client.inference.embeddings( - contents=["string"], - model_id="model_id", - output_dimension=0, - task_type="query", - text_truncation="none", - ) - - assert_matches_type(EmbeddingsResponse, inference, path=["response"]) - - @parametrize - def test_raw_response_embeddings(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - response = client.inference.with_raw_response.embeddings( - contents=["string"], - model_id="model_id", - ) - - assert response.is_closed is True - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - inference = response.parse() - assert_matches_type(EmbeddingsResponse, inference, path=["response"]) - - @parametrize - def test_streaming_response_embeddings(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - with client.inference.with_streaming_response.embeddings( - contents=["string"], - model_id="model_id", - ) as response: - assert not response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - - inference = response.parse() - assert_matches_type(EmbeddingsResponse, inference, path=["response"]) - - assert cast(Any, response.is_closed) is True - - @parametrize - def test_method_rerank(self, client: LlamaStackClient) -> None: - inference = client.inference.rerank( - items=["string"], - model="model", - query="string", - ) - assert_matches_type(InferenceRerankResponse, inference, path=["response"]) - - @parametrize - def test_method_rerank_with_all_params(self, client: LlamaStackClient) -> None: - inference = client.inference.rerank( - 
items=["string"], - model="model", - query="string", - max_num_results=0, - ) - assert_matches_type(InferenceRerankResponse, inference, path=["response"]) - - @parametrize - def test_raw_response_rerank(self, client: LlamaStackClient) -> None: - response = client.inference.with_raw_response.rerank( - items=["string"], - model="model", - query="string", - ) - - assert response.is_closed is True - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - inference = response.parse() - assert_matches_type(InferenceRerankResponse, inference, path=["response"]) - - @parametrize - def test_streaming_response_rerank(self, client: LlamaStackClient) -> None: - with client.inference.with_streaming_response.rerank( - items=["string"], - model="model", - query="string", - ) as response: - assert not response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - - inference = response.parse() - assert_matches_type(InferenceRerankResponse, inference, path=["response"]) - - assert cast(Any, response.is_closed) is True - class TestAsyncInference: parametrize = pytest.mark.parametrize( @@ -546,101 +444,3 @@ async def test_streaming_response_chat_completion_overload_2(self, async_client: await stream.close() assert cast(Any, response.is_closed) is True - - @parametrize - async def test_method_embeddings(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference = await async_client.inference.embeddings( - contents=["string"], - model_id="model_id", - ) - - assert_matches_type(EmbeddingsResponse, inference, path=["response"]) - - @parametrize - async def test_method_embeddings_with_all_params(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference = await async_client.inference.embeddings( - contents=["string"], - model_id="model_id", - output_dimension=0, - task_type="query", - text_truncation="none", - ) - - assert_matches_type(EmbeddingsResponse, inference, path=["response"]) - - @parametrize - async def test_raw_response_embeddings(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - response = await async_client.inference.with_raw_response.embeddings( - contents=["string"], - model_id="model_id", - ) - - assert response.is_closed is True - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - inference = await response.parse() - assert_matches_type(EmbeddingsResponse, inference, path=["response"]) - - @parametrize - async def test_streaming_response_embeddings(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - async with async_client.inference.with_streaming_response.embeddings( - contents=["string"], - model_id="model_id", - ) as response: - assert not response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - - inference = await response.parse() - assert_matches_type(EmbeddingsResponse, inference, path=["response"]) - - assert cast(Any, response.is_closed) is True - - @parametrize - async def test_method_rerank(self, async_client: AsyncLlamaStackClient) -> None: - inference = await async_client.inference.rerank( - items=["string"], - model="model", - query="string", - ) - assert_matches_type(InferenceRerankResponse, inference, path=["response"]) - - @parametrize - async def test_method_rerank_with_all_params(self, async_client: AsyncLlamaStackClient) -> None: - inference = await async_client.inference.rerank( - 
items=["string"], - model="model", - query="string", - max_num_results=0, - ) - assert_matches_type(InferenceRerankResponse, inference, path=["response"]) - - @parametrize - async def test_raw_response_rerank(self, async_client: AsyncLlamaStackClient) -> None: - response = await async_client.inference.with_raw_response.rerank( - items=["string"], - model="model", - query="string", - ) - - assert response.is_closed is True - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - inference = await response.parse() - assert_matches_type(InferenceRerankResponse, inference, path=["response"]) - - @parametrize - async def test_streaming_response_rerank(self, async_client: AsyncLlamaStackClient) -> None: - async with async_client.inference.with_streaming_response.rerank( - items=["string"], - model="model", - query="string", - ) as response: - assert not response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - - inference = await response.parse() - assert_matches_type(InferenceRerankResponse, inference, path=["response"]) - - assert cast(Any, response.is_closed) is True From 04834d2189ae4e4b8cd2c9370d1d39857bc6e9ec Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Tue, 30 Sep 2025 02:00:14 +0000 Subject: [PATCH 3/8] feat(api)!: fixes to remove deprecated inference resources --- .stats.yml | 2 +- README.md | 43 +- api.md | 4 +- src/llama_stack_client/resources/inference.py | 546 +++--------------- src/llama_stack_client/types/__init__.py | 7 +- .../chat_completion_response_stream_chunk.py | 36 -- .../types/inference_chat_completion_params.py | 134 ----- .../types/inference_rerank_params.py | 106 ++++ .../types/inference_rerank_response.py | 23 + .../types/shared/chat_completion_response.py | 12 +- .../types/shared_params/__init__.py | 1 - .../shared_params/tool_param_definition.py | 22 - .../types/token_log_probs.py | 12 - tests/api_resources/test_inference.py | 458 +++------------ tests/test_client.py | 132 +---- 15 files changed, 308 insertions(+), 1230 deletions(-) delete mode 100644 src/llama_stack_client/types/chat_completion_response_stream_chunk.py delete mode 100644 src/llama_stack_client/types/inference_chat_completion_params.py create mode 100644 src/llama_stack_client/types/inference_rerank_params.py create mode 100644 src/llama_stack_client/types/inference_rerank_response.py delete mode 100644 src/llama_stack_client/types/shared_params/tool_param_definition.py delete mode 100644 src/llama_stack_client/types/token_log_probs.py diff --git a/.stats.yml b/.stats.yml index 016bf7b6..ed589610 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ configured_endpoints: 105 openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/llamastack%2Fllama-stack-client-adcfaad1990d45e42b20e200a9ecc35ee32df5692bd9cd18ae898b0b7728c919.yml openapi_spec_hash: 4f532287bafe5da0578a1c1a5e31c952 -config_hash: 7ec5a583f9c26b38993013bdfb0e7d46 +config_hash: 5b643c97c83a497d7d346253f1e175f3 diff --git a/README.md b/README.md index d448f59d..75857f1d 100644 --- a/README.md +++ b/README.md @@ -127,17 +127,11 @@ from llama_stack_client import LlamaStackClient client = LlamaStackClient() -chat_completion_response = client.inference.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - logprobs={}, +client.toolgroups.register( + provider_id="provider_id", + toolgroup_id="toolgroup_id", + mcp_endpoint={"uri": "uri"}, ) 
-print(chat_completion_response.logprobs) ``` ## File uploads @@ -173,10 +167,7 @@ from llama_stack_client import LlamaStackClient client = LlamaStackClient() try: - client.agents.sessions.create( - agent_id="agent_id", - session_name="session_name", - ) + client.agents.toolgroups.list() except llama_stack_client.APIConnectionError as e: print("The server could not be reached") print(e.__cause__) # an underlying Exception, likely raised within httpx. @@ -219,10 +210,7 @@ client = LlamaStackClient( ) # Or, configure per-request: -client.with_options(max_retries=5).agents.sessions.create( - agent_id="agent_id", - session_name="session_name", -) +client.with_options(max_retries=5).toolgroups.list() ``` ### Timeouts @@ -245,10 +233,7 @@ client = LlamaStackClient( ) # Override per-request: -client.with_options(timeout=5.0).agents.sessions.create( - agent_id="agent_id", - session_name="session_name", -) +client.with_options(timeout=5.0).toolgroups.list() ``` On timeout, an `APITimeoutError` is thrown. @@ -287,14 +272,11 @@ The "raw" Response object can be accessed by prefixing `.with_raw_response.` to from llama_stack_client import LlamaStackClient client = LlamaStackClient() -response = client.agents.sessions.with_raw_response.create( - agent_id="agent_id", - session_name="session_name", -) +response = client.toolgroups.with_raw_response.list() print(response.headers.get('X-My-Header')) -session = response.parse() # get the object that `agents.sessions.create()` would have returned -print(session.session_id) +toolgroup = response.parse() # get the object that `toolgroups.list()` would have returned +print(toolgroup) ``` These methods return an [`APIResponse`](https://github.com/meta-llama/llama-stack-python/tree/main/src/llama_stack_client/_response.py) object. @@ -308,10 +290,7 @@ The above interface eagerly reads the full response body when you make the reque To stream the response body, use `.with_streaming_response` instead, which requires a context manager and only reads the response body once you call `.read()`, `.text()`, `.json()`, `.iter_bytes()`, `.iter_text()`, `.iter_lines()` or `.parse()`. In the async client, these are async methods.
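For the async client, a minimal sketch of the same streaming pattern (assuming the `toolgroups` resource used in the raw-response example above; `parse()` becomes awaitable, as noted):

```python
import asyncio

from llama_stack_client import AsyncLlamaStackClient


async def main() -> None:
    client = AsyncLlamaStackClient()
    async with client.toolgroups.with_streaming_response.list() as response:
        print(response.headers.get("X-My-Header"))
        toolgroups = await response.parse()  # async counterpart of parse()
        print(toolgroups)


asyncio.run(main())
```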
```python -with client.agents.sessions.with_streaming_response.create( - agent_id="agent_id", - session_name="session_name", -) as response: +with client.agents.toolgroups.with_streaming_response.list() as response: print(response.headers.get("X-My-Header")) for line in response.iter_lines(): diff --git a/api.md b/api.md index ad4e635c..85e5a178 100644 --- a/api.md +++ b/api.md @@ -241,12 +241,12 @@ Methods: Types: ```python -from llama_stack_client.types import ChatCompletionResponseStreamChunk, TokenLogProbs +from llama_stack_client.types import InferenceRerankResponse ``` Methods: -- client.inference.chat_completion(\*\*params) -> ChatCompletionResponse +- client.inference.rerank(\*\*params) -> InferenceRerankResponse # Embeddings diff --git a/src/llama_stack_client/resources/inference.py b/src/llama_stack_client/resources/inference.py index bac5cb3e..5c022b0f 100644 --- a/src/llama_stack_client/resources/inference.py +++ b/src/llama_stack_client/resources/inference.py @@ -2,15 +2,13 @@ from __future__ import annotations -import typing_extensions -from typing import Iterable -from typing_extensions import Literal, overload +from typing import Type, cast import httpx -from ..types import inference_chat_completion_params -from .._types import Body, Omit, Query, Headers, NotGiven, omit, not_given -from .._utils import required_args, maybe_transform, async_maybe_transform +from ..types import inference_rerank_params +from .._types import Body, Omit, Query, Headers, NotGiven, SequenceNotStr, omit, not_given +from .._utils import maybe_transform, async_maybe_transform from .._compat import cached_property from .._resource import SyncAPIResource, AsyncAPIResource from .._response import ( @@ -19,13 +17,9 @@ async_to_raw_response_wrapper, async_to_streamed_response_wrapper, ) -from .._streaming import Stream, AsyncStream +from .._wrappers import DataWrapper from .._base_client import make_request_options -from ..types.shared_params.message import Message -from ..types.shared_params.response_format import ResponseFormat -from ..types.shared_params.sampling_params import SamplingParams -from ..types.shared.chat_completion_response import ChatCompletionResponse -from ..types.chat_completion_response_stream_chunk import ChatCompletionResponseStreamChunk +from ..types.inference_rerank_response import InferenceRerankResponse __all__ = ["InferenceResource", "AsyncInferenceResource"] @@ -50,64 +44,34 @@ def with_streaming_response(self) -> InferenceResourceWithStreamingResponse: """ return InferenceResourceWithStreamingResponse(self) - @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.") - @overload - def chat_completion( + def rerank( self, *, - messages: Iterable[Message], - model_id: str, - logprobs: inference_chat_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - stream: Literal[False] | Omit = omit, - tool_choice: Literal["auto", "required", "none"] | Omit = omit, - tool_config: inference_chat_completion_params.ToolConfig | Omit = omit, - tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit, - tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit, + items: SequenceNotStr[inference_rerank_params.Item], + model: str, + query: inference_rerank_params.Query, + max_num_results: int | Omit = omit, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. 
# The extra values given here take precedence over values defined on the client or passed to this method. extra_headers: Headers | None = None, extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> ChatCompletionResponse: + ) -> InferenceRerankResponse: """ - Generate a chat completion for the given messages using the specified model. + Rerank a list of documents based on their relevance to a query. Args: - messages: List of messages in the conversation. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. There are two - options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most - providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF - grammar. This format is more flexible, but not all providers support it. - - sampling_params: Parameters to control the sampling strategy. - - stream: (Optional) If True, generate an SSE event stream of the response. Defaults to - False. + items: List of items to rerank. Each item can be a string, text content part, or image + content part. Each input must not exceed the model's max input token length. - tool_choice: (Optional) Whether tool use is required or automatic. Defaults to - ToolChoice.auto. .. deprecated:: Use tool_config instead. + model: The identifier of the reranking model to use. - tool_config: (Optional) Configuration for tool use. + query: The search query to rank items against. Can be a string, text content part, or + image content part. The input must not exceed the model's max input token + length. - tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack - will attempt to use a format that is best adapted to the model. - - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a - tag. - `ToolPromptFormat.python_list`: The tool calls - are output as Python syntax -- a list of function calls. .. deprecated:: Use - tool_config instead. - - tools: (Optional) List of tool definitions available to the model. + max_num_results: (Optional) Maximum number of results to return. Default: returns all. extra_headers: Send extra headers @@ -117,195 +81,25 @@ def chat_completion( timeout: Override the client-level default timeout for this request, in seconds """ - ... - - @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.") - @overload - def chat_completion( - self, - *, - messages: Iterable[Message], - model_id: str, - stream: Literal[True], - logprobs: inference_chat_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - tool_choice: Literal["auto", "required", "none"] | Omit = omit, - tool_config: inference_chat_completion_params.ToolConfig | Omit = omit, - tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit, - tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. 
- # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> Stream[ChatCompletionResponseStreamChunk]: - """ - Generate a chat completion for the given messages using the specified model. - - Args: - messages: List of messages in the conversation. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - stream: (Optional) If True, generate an SSE event stream of the response. Defaults to - False. - - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. There are two - options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most - providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF - grammar. This format is more flexible, but not all providers support it. - - sampling_params: Parameters to control the sampling strategy. - - tool_choice: (Optional) Whether tool use is required or automatic. Defaults to - ToolChoice.auto. .. deprecated:: Use tool_config instead. - - tool_config: (Optional) Configuration for tool use. - - tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack - will attempt to use a format that is best adapted to the model. - - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a - tag. - `ToolPromptFormat.python_list`: The tool calls - are output as Python syntax -- a list of function calls. .. deprecated:: Use - tool_config instead. - - tools: (Optional) List of tool definitions available to the model. - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - ... - - @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.") - @overload - def chat_completion( - self, - *, - messages: Iterable[Message], - model_id: str, - stream: bool, - logprobs: inference_chat_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - tool_choice: Literal["auto", "required", "none"] | Omit = omit, - tool_config: inference_chat_completion_params.ToolConfig | Omit = omit, - tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit, - tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> ChatCompletionResponse | Stream[ChatCompletionResponseStreamChunk]: - """ - Generate a chat completion for the given messages using the specified model. - - Args: - messages: List of messages in the conversation. 
- - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - stream: (Optional) If True, generate an SSE event stream of the response. Defaults to - False. - - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. There are two - options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most - providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF - grammar. This format is more flexible, but not all providers support it. - - sampling_params: Parameters to control the sampling strategy. - - tool_choice: (Optional) Whether tool use is required or automatic. Defaults to - ToolChoice.auto. .. deprecated:: Use tool_config instead. - - tool_config: (Optional) Configuration for tool use. - - tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack - will attempt to use a format that is best adapted to the model. - - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a - tag. - `ToolPromptFormat.python_list`: The tool calls - are output as Python syntax -- a list of function calls. .. deprecated:: Use - tool_config instead. - - tools: (Optional) List of tool definitions available to the model. - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - ... - - @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.") - @required_args(["messages", "model_id"], ["messages", "model_id", "stream"]) - def chat_completion( - self, - *, - messages: Iterable[Message], - model_id: str, - logprobs: inference_chat_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - stream: Literal[False] | Literal[True] | Omit = omit, - tool_choice: Literal["auto", "required", "none"] | Omit = omit, - tool_config: inference_chat_completion_params.ToolConfig | Omit = omit, - tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit, - tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. 
- extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> ChatCompletionResponse | Stream[ChatCompletionResponseStreamChunk]: - if stream: - extra_headers = {"Accept": "text/event-stream", **(extra_headers or {})} return self._post( - "/v1/inference/chat-completion", + "/v1alpha/inference/rerank", body=maybe_transform( { - "messages": messages, - "model_id": model_id, - "logprobs": logprobs, - "response_format": response_format, - "sampling_params": sampling_params, - "stream": stream, - "tool_choice": tool_choice, - "tool_config": tool_config, - "tool_prompt_format": tool_prompt_format, - "tools": tools, + "items": items, + "model": model, + "query": query, + "max_num_results": max_num_results, }, - inference_chat_completion_params.InferenceChatCompletionParamsStreaming - if stream - else inference_chat_completion_params.InferenceChatCompletionParamsNonStreaming, + inference_rerank_params.InferenceRerankParams, ), options=make_request_options( - extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout + extra_headers=extra_headers, + extra_query=extra_query, + extra_body=extra_body, + timeout=timeout, + post_parser=DataWrapper[InferenceRerankResponse]._unwrapper, ), - cast_to=ChatCompletionResponse, - stream=stream or False, - stream_cls=Stream[ChatCompletionResponseStreamChunk], + cast_to=cast(Type[InferenceRerankResponse], DataWrapper[InferenceRerankResponse]), ) @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.") @@ -514,64 +308,34 @@ def with_streaming_response(self) -> AsyncInferenceResourceWithStreamingResponse """ return AsyncInferenceResourceWithStreamingResponse(self) - @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.") - @overload - async def chat_completion( + async def rerank( self, *, - messages: Iterable[Message], - model_id: str, - logprobs: inference_chat_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - stream: Literal[False] | Omit = omit, - tool_choice: Literal["auto", "required", "none"] | Omit = omit, - tool_config: inference_chat_completion_params.ToolConfig | Omit = omit, - tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit, - tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit, + items: SequenceNotStr[inference_rerank_params.Item], + model: str, + query: inference_rerank_params.Query, + max_num_results: int | Omit = omit, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. extra_headers: Headers | None = None, extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> ChatCompletionResponse: + ) -> InferenceRerankResponse: """ - Generate a chat completion for the given messages using the specified model. + Rerank a list of documents based on their relevance to a query. Args: - messages: List of messages in the conversation. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. 
- - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. There are two - options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most - providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF - grammar. This format is more flexible, but not all providers support it. - - sampling_params: Parameters to control the sampling strategy. - - stream: (Optional) If True, generate an SSE event stream of the response. Defaults to - False. + items: List of items to rerank. Each item can be a string, text content part, or image + content part. Each input must not exceed the model's max input token length. - tool_choice: (Optional) Whether tool use is required or automatic. Defaults to - ToolChoice.auto. .. deprecated:: Use tool_config instead. + model: The identifier of the reranking model to use. - tool_config: (Optional) Configuration for tool use. + query: The search query to rank items against. Can be a string, text content part, or + image content part. The input must not exceed the model's max input token + length. - tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack - will attempt to use a format that is best adapted to the model. - - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a - tag. - `ToolPromptFormat.python_list`: The tool calls - are output as Python syntax -- a list of function calls. .. deprecated:: Use - tool_config instead. - - tools: (Optional) List of tool definitions available to the model. + max_num_results: (Optional) Maximum number of results to return. Default: returns all. extra_headers: Send extra headers @@ -581,195 +345,25 @@ async def chat_completion( timeout: Override the client-level default timeout for this request, in seconds """ - ... - - @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.") - @overload - async def chat_completion( - self, - *, - messages: Iterable[Message], - model_id: str, - stream: Literal[True], - logprobs: inference_chat_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - tool_choice: Literal["auto", "required", "none"] | Omit = omit, - tool_config: inference_chat_completion_params.ToolConfig | Omit = omit, - tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit, - tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> AsyncStream[ChatCompletionResponseStreamChunk]: - """ - Generate a chat completion for the given messages using the specified model. - - Args: - messages: List of messages in the conversation. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - stream: (Optional) If True, generate an SSE event stream of the response. Defaults to - False. 
- - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. There are two - options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most - providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF - grammar. This format is more flexible, but not all providers support it. - - sampling_params: Parameters to control the sampling strategy. - - tool_choice: (Optional) Whether tool use is required or automatic. Defaults to - ToolChoice.auto. .. deprecated:: Use tool_config instead. - - tool_config: (Optional) Configuration for tool use. - - tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack - will attempt to use a format that is best adapted to the model. - - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a - tag. - `ToolPromptFormat.python_list`: The tool calls - are output as Python syntax -- a list of function calls. .. deprecated:: Use - tool_config instead. - - tools: (Optional) List of tool definitions available to the model. - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - ... - - @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.") - @overload - async def chat_completion( - self, - *, - messages: Iterable[Message], - model_id: str, - stream: bool, - logprobs: inference_chat_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - tool_choice: Literal["auto", "required", "none"] | Omit = omit, - tool_config: inference_chat_completion_params.ToolConfig | Omit = omit, - tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit, - tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> ChatCompletionResponse | AsyncStream[ChatCompletionResponseStreamChunk]: - """ - Generate a chat completion for the given messages using the specified model. - - Args: - messages: List of messages in the conversation. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - stream: (Optional) If True, generate an SSE event stream of the response. Defaults to - False. - - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. There are two - options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most - providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF - grammar. This format is more flexible, but not all providers support it. 
- - sampling_params: Parameters to control the sampling strategy. - - tool_choice: (Optional) Whether tool use is required or automatic. Defaults to - ToolChoice.auto. .. deprecated:: Use tool_config instead. - - tool_config: (Optional) Configuration for tool use. - - tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack - will attempt to use a format that is best adapted to the model. - - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a - tag. - `ToolPromptFormat.python_list`: The tool calls - are output as Python syntax -- a list of function calls. .. deprecated:: Use - tool_config instead. - - tools: (Optional) List of tool definitions available to the model. - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - ... - - @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.") - @required_args(["messages", "model_id"], ["messages", "model_id", "stream"]) - async def chat_completion( - self, - *, - messages: Iterable[Message], - model_id: str, - logprobs: inference_chat_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - stream: Literal[False] | Literal[True] | Omit = omit, - tool_choice: Literal["auto", "required", "none"] | Omit = omit, - tool_config: inference_chat_completion_params.ToolConfig | Omit = omit, - tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit, - tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. 
- extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> ChatCompletionResponse | AsyncStream[ChatCompletionResponseStreamChunk]: - if stream: - extra_headers = {"Accept": "text/event-stream", **(extra_headers or {})} return await self._post( - "/v1/inference/chat-completion", + "/v1alpha/inference/rerank", body=await async_maybe_transform( { - "messages": messages, - "model_id": model_id, - "logprobs": logprobs, - "response_format": response_format, - "sampling_params": sampling_params, - "stream": stream, - "tool_choice": tool_choice, - "tool_config": tool_config, - "tool_prompt_format": tool_prompt_format, - "tools": tools, + "items": items, + "model": model, + "query": query, + "max_num_results": max_num_results, }, - inference_chat_completion_params.InferenceChatCompletionParamsStreaming - if stream - else inference_chat_completion_params.InferenceChatCompletionParamsNonStreaming, + inference_rerank_params.InferenceRerankParams, ), options=make_request_options( - extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout + extra_headers=extra_headers, + extra_query=extra_query, + extra_body=extra_body, + timeout=timeout, + post_parser=DataWrapper[InferenceRerankResponse]._unwrapper, ), - cast_to=ChatCompletionResponse, - stream=stream or False, - stream_cls=AsyncStream[ChatCompletionResponseStreamChunk], + cast_to=cast(Type[InferenceRerankResponse], DataWrapper[InferenceRerankResponse]), ) @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.") @@ -962,10 +556,8 @@ class InferenceResourceWithRawResponse: def __init__(self, inference: InferenceResource) -> None: self._inference = inference - self.chat_completion = ( # pyright: ignore[reportDeprecated] - to_raw_response_wrapper( - inference.chat_completion, # pyright: ignore[reportDeprecated], - ) + self.rerank = to_raw_response_wrapper( + inference.rerank, ) @@ -973,10 +565,8 @@ class AsyncInferenceResourceWithRawResponse: def __init__(self, inference: AsyncInferenceResource) -> None: self._inference = inference - self.chat_completion = ( # pyright: ignore[reportDeprecated] - async_to_raw_response_wrapper( - inference.chat_completion, # pyright: ignore[reportDeprecated], - ) + self.rerank = async_to_raw_response_wrapper( + inference.rerank, ) @@ -984,10 +574,8 @@ class InferenceResourceWithStreamingResponse: def __init__(self, inference: InferenceResource) -> None: self._inference = inference - self.chat_completion = ( # pyright: ignore[reportDeprecated] - to_streamed_response_wrapper( - inference.chat_completion, # pyright: ignore[reportDeprecated], - ) + self.rerank = to_streamed_response_wrapper( + inference.rerank, ) @@ -995,8 +583,6 @@ class AsyncInferenceResourceWithStreamingResponse: def __init__(self, inference: AsyncInferenceResource) -> None: self._inference = inference - self.chat_completion = ( # pyright: ignore[reportDeprecated] - async_to_streamed_response_wrapper( - inference.chat_completion, # pyright: ignore[reportDeprecated], - ) + self.rerank = async_to_streamed_response_wrapper( + inference.rerank, ) diff --git a/src/llama_stack_client/types/__init__.py b/src/llama_stack_client/types/__init__.py index 8a61ceec..78cfbe2d 100644 --- a/src/llama_stack_client/types/__init__.py +++ b/src/llama_stack_client/types/__init__.py @@ -47,7 +47,6 @@ from .tool_def_param import ToolDefParam as ToolDefParam from .create_response 
import CreateResponse as CreateResponse from .response_object import ResponseObject as ResponseObject -from .token_log_probs import TokenLogProbs as TokenLogProbs from .file_list_params import FileListParams as FileListParams from .shield_call_step import ShieldCallStep as ShieldCallStep from .span_with_status import SpanWithStatus as SpanWithStatus @@ -100,6 +99,7 @@ from .dataset_iterrows_params import DatasetIterrowsParams as DatasetIterrowsParams from .dataset_register_params import DatasetRegisterParams as DatasetRegisterParams from .embedding_create_params import EmbeddingCreateParams as EmbeddingCreateParams +from .inference_rerank_params import InferenceRerankParams as InferenceRerankParams from .list_providers_response import ListProvidersResponse as ListProvidersResponse from .scoring_fn_params_param import ScoringFnParamsParam as ScoringFnParamsParam from .toolgroup_list_response import ToolgroupListResponse as ToolgroupListResponse @@ -118,6 +118,7 @@ from .dataset_register_response import DatasetRegisterResponse as DatasetRegisterResponse from .dataset_retrieve_response import DatasetRetrieveResponse as DatasetRetrieveResponse from .eval_evaluate_rows_params import EvalEvaluateRowsParams as EvalEvaluateRowsParams +from .inference_rerank_response import InferenceRerankResponse as InferenceRerankResponse from .list_tool_groups_response import ListToolGroupsResponse as ListToolGroupsResponse from .toolgroup_register_params import ToolgroupRegisterParams as ToolgroupRegisterParams from .vector_db_register_params import VectorDBRegisterParams as VectorDBRegisterParams @@ -147,16 +148,12 @@ from .list_scoring_functions_response import ListScoringFunctionsResponse as ListScoringFunctionsResponse from .telemetry_query_traces_response import TelemetryQueryTracesResponse as TelemetryQueryTracesResponse from .tool_runtime_invoke_tool_params import ToolRuntimeInvokeToolParams as ToolRuntimeInvokeToolParams -from .inference_chat_completion_params import InferenceChatCompletionParams as InferenceChatCompletionParams from .list_post_training_jobs_response import ListPostTrainingJobsResponse as ListPostTrainingJobsResponse from .scoring_function_register_params import ScoringFunctionRegisterParams as ScoringFunctionRegisterParams from .telemetry_get_span_tree_response import TelemetryGetSpanTreeResponse as TelemetryGetSpanTreeResponse from .telemetry_query_metrics_response import TelemetryQueryMetricsResponse as TelemetryQueryMetricsResponse from .tool_runtime_list_tools_response import ToolRuntimeListToolsResponse as ToolRuntimeListToolsResponse from .synthetic_data_generation_response import SyntheticDataGenerationResponse as SyntheticDataGenerationResponse -from .chat_completion_response_stream_chunk import ( - ChatCompletionResponseStreamChunk as ChatCompletionResponseStreamChunk, -) from .telemetry_save_spans_to_dataset_params import ( TelemetrySaveSpansToDatasetParams as TelemetrySaveSpansToDatasetParams, ) diff --git a/src/llama_stack_client/types/chat_completion_response_stream_chunk.py b/src/llama_stack_client/types/chat_completion_response_stream_chunk.py deleted file mode 100644 index 1a55f3d1..00000000 --- a/src/llama_stack_client/types/chat_completion_response_stream_chunk.py +++ /dev/null @@ -1,36 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
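For orientation, here is a minimal sketch of exercising the new `rerank` method wired up above. The base URL, model identifier, and documents are placeholders; which reranking models are actually available depends on the running Llama Stack distribution.

```python
from llama_stack_client import LlamaStackClient

# Assumes a Llama Stack server is reachable at the client's default base URL
# and that a reranking model with this placeholder identifier is registered.
client = LlamaStackClient()

reranked = client.inference.rerank(
    model="example/reranker-model",  # placeholder identifier
    query="What is the capital of France?",
    items=[
        "Paris is the capital of France.",
        "Berlin is the capital of Germany.",
        "The Eiffel Tower is in Paris.",
    ],
    max_num_results=2,
)

# The endpoint wraps its payload in a data envelope, which the SDK unwraps via
# the DataWrapper post_parser shown above, so `reranked` is already the list
# of scored items.
for item in reranked:
    print(item.index, item.relevance_score)
```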
- -from typing import List, Optional -from typing_extensions import Literal - -from .._models import BaseModel -from .shared.metric import Metric -from .token_log_probs import TokenLogProbs -from .shared.content_delta import ContentDelta - -__all__ = ["ChatCompletionResponseStreamChunk", "Event"] - - -class Event(BaseModel): - delta: ContentDelta - """Content generated since last event. - - This can be one or more tokens, or a tool call. - """ - - event_type: Literal["start", "complete", "progress"] - """Type of the event""" - - logprobs: Optional[List[TokenLogProbs]] = None - """Optional log probabilities for generated tokens""" - - stop_reason: Optional[Literal["end_of_turn", "end_of_message", "out_of_tokens"]] = None - """Optional reason why generation stopped, if complete""" - - -class ChatCompletionResponseStreamChunk(BaseModel): - event: Event - """The event containing the new content""" - - metrics: Optional[List[Metric]] = None - """(Optional) List of metrics associated with the API response""" diff --git a/src/llama_stack_client/types/inference_chat_completion_params.py b/src/llama_stack_client/types/inference_chat_completion_params.py deleted file mode 100644 index 746d3dee..00000000 --- a/src/llama_stack_client/types/inference_chat_completion_params.py +++ /dev/null @@ -1,134 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from __future__ import annotations - -from typing import Dict, Union, Iterable -from typing_extensions import Literal, Required, TypedDict - -from .shared_params.message import Message -from .shared_params.response_format import ResponseFormat -from .shared_params.sampling_params import SamplingParams -from .shared_params.tool_param_definition import ToolParamDefinition - -__all__ = [ - "InferenceChatCompletionParamsBase", - "Logprobs", - "ToolConfig", - "Tool", - "InferenceChatCompletionParamsNonStreaming", - "InferenceChatCompletionParamsStreaming", -] - - -class InferenceChatCompletionParamsBase(TypedDict, total=False): - messages: Required[Iterable[Message]] - """List of messages in the conversation.""" - - model_id: Required[str] - """The identifier of the model to use. - - The model must be registered with Llama Stack and available via the /models - endpoint. - """ - - logprobs: Logprobs - """ - (Optional) If specified, log probabilities for each token position will be - returned. - """ - - response_format: ResponseFormat - """(Optional) Grammar specification for guided (structured) decoding. - - There are two options: - `ResponseFormat.json_schema`: The grammar is a JSON - schema. Most providers support this format. - `ResponseFormat.grammar`: The - grammar is a BNF grammar. This format is more flexible, but not all providers - support it. - """ - - sampling_params: SamplingParams - """Parameters to control the sampling strategy.""" - - tool_choice: Literal["auto", "required", "none"] - """(Optional) Whether tool use is required or automatic. - - Defaults to ToolChoice.auto. .. deprecated:: Use tool_config instead. - """ - - tool_config: ToolConfig - """(Optional) Configuration for tool use.""" - - tool_prompt_format: Literal["json", "function_tag", "python_list"] - """(Optional) Instructs the model how to format tool calls. - - By default, Llama Stack will attempt to use a format that is best adapted to the - model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON - object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a - tag. 
- `ToolPromptFormat.python_list`: The tool calls - are output as Python syntax -- a list of function calls. .. deprecated:: Use - tool_config instead. - """ - - tools: Iterable[Tool] - """(Optional) List of tool definitions available to the model.""" - - -class Logprobs(TypedDict, total=False): - top_k: int - """How many tokens (for each position) to return log probabilities for.""" - - -class ToolConfig(TypedDict, total=False): - system_message_behavior: Literal["append", "replace"] - """(Optional) Config for how to override the default system prompt. - - - `SystemMessageBehavior.append`: Appends the provided system message to the - default system prompt. - `SystemMessageBehavior.replace`: Replaces the default - system prompt with the provided system message. The system message can include - the string '{{function_definitions}}' to indicate where the function - definitions should be inserted. - """ - - tool_choice: Union[Literal["auto", "required", "none"], str] - """(Optional) Whether tool use is automatic, required, or none. - - Can also specify a tool name to use a specific tool. Defaults to - ToolChoice.auto. - """ - - tool_prompt_format: Literal["json", "function_tag", "python_list"] - """(Optional) Instructs the model how to format tool calls. - - By default, Llama Stack will attempt to use a format that is best adapted to the - model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON - object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a - tag. - `ToolPromptFormat.python_list`: The tool calls - are output as Python syntax -- a list of function calls. - """ - - -class Tool(TypedDict, total=False): - tool_name: Required[Union[Literal["brave_search", "wolfram_alpha", "photogen", "code_interpreter"], str]] - - description: str - - parameters: Dict[str, ToolParamDefinition] - - -class InferenceChatCompletionParamsNonStreaming(InferenceChatCompletionParamsBase, total=False): - stream: Literal[False] - """(Optional) If True, generate an SSE event stream of the response. - - Defaults to False. - """ - - -class InferenceChatCompletionParamsStreaming(InferenceChatCompletionParamsBase): - stream: Required[Literal[True]] - """(Optional) If True, generate an SSE event stream of the response. - - Defaults to False. - """ - - -InferenceChatCompletionParams = Union[InferenceChatCompletionParamsNonStreaming, InferenceChatCompletionParamsStreaming] diff --git a/src/llama_stack_client/types/inference_rerank_params.py b/src/llama_stack_client/types/inference_rerank_params.py new file mode 100644 index 00000000..8f8c4d64 --- /dev/null +++ b/src/llama_stack_client/types/inference_rerank_params.py @@ -0,0 +1,106 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing import Union +from typing_extensions import Literal, Required, TypeAlias, TypedDict + +from .._types import SequenceNotStr + +__all__ = [ + "InferenceRerankParams", + "Item", + "ItemOpenAIChatCompletionContentPartTextParam", + "ItemOpenAIChatCompletionContentPartImageParam", + "ItemOpenAIChatCompletionContentPartImageParamImageURL", + "Query", + "QueryOpenAIChatCompletionContentPartTextParam", + "QueryOpenAIChatCompletionContentPartImageParam", + "QueryOpenAIChatCompletionContentPartImageParamImageURL", +] + + +class InferenceRerankParams(TypedDict, total=False): + items: Required[SequenceNotStr[Item]] + """List of items to rerank. + + Each item can be a string, text content part, or image content part. 
Each input + must not exceed the model's max input token length. + """ + + model: Required[str] + """The identifier of the reranking model to use.""" + + query: Required[Query] + """The search query to rank items against. + + Can be a string, text content part, or image content part. The input must not + exceed the model's max input token length. + """ + + max_num_results: int + """(Optional) Maximum number of results to return. Default: returns all.""" + + +class ItemOpenAIChatCompletionContentPartTextParam(TypedDict, total=False): + text: Required[str] + """The text content of the message""" + + type: Required[Literal["text"]] + """Must be "text" to identify this as text content""" + + +class ItemOpenAIChatCompletionContentPartImageParamImageURL(TypedDict, total=False): + url: Required[str] + """URL of the image to include in the message""" + + detail: str + """(Optional) Level of detail for image processing. + + Can be "low", "high", or "auto" + """ + + +class ItemOpenAIChatCompletionContentPartImageParam(TypedDict, total=False): + image_url: Required[ItemOpenAIChatCompletionContentPartImageParamImageURL] + """Image URL specification and processing details""" + + type: Required[Literal["image_url"]] + """Must be "image_url" to identify this as image content""" + + +Item: TypeAlias = Union[ + str, ItemOpenAIChatCompletionContentPartTextParam, ItemOpenAIChatCompletionContentPartImageParam +] + + +class QueryOpenAIChatCompletionContentPartTextParam(TypedDict, total=False): + text: Required[str] + """The text content of the message""" + + type: Required[Literal["text"]] + """Must be "text" to identify this as text content""" + + +class QueryOpenAIChatCompletionContentPartImageParamImageURL(TypedDict, total=False): + url: Required[str] + """URL of the image to include in the message""" + + detail: str + """(Optional) Level of detail for image processing. + + Can be "low", "high", or "auto" + """ + + +class QueryOpenAIChatCompletionContentPartImageParam(TypedDict, total=False): + image_url: Required[QueryOpenAIChatCompletionContentPartImageParamImageURL] + """Image URL specification and processing details""" + + type: Required[Literal["image_url"]] + """Must be "image_url" to identify this as image content""" + + +Query: TypeAlias = Union[ + str, QueryOpenAIChatCompletionContentPartTextParam, QueryOpenAIChatCompletionContentPartImageParam +] diff --git a/src/llama_stack_client/types/inference_rerank_response.py b/src/llama_stack_client/types/inference_rerank_response.py new file mode 100644 index 00000000..e74fc7e6 --- /dev/null +++ b/src/llama_stack_client/types/inference_rerank_response.py @@ -0,0 +1,23 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import List +from typing_extensions import TypeAlias + +from .._models import BaseModel + +__all__ = ["InferenceRerankResponse", "InferenceRerankResponseItem"] + + +class InferenceRerankResponseItem(BaseModel): + index: int + """The original index of the document in the input list""" + + relevance_score: float + """The relevance score from the model output. + + Values are inverted when applicable so that higher scores indicate greater + relevance. 
+ """ + + +InferenceRerankResponse: TypeAlias = List[InferenceRerankResponseItem] diff --git a/src/llama_stack_client/types/shared/chat_completion_response.py b/src/llama_stack_client/types/shared/chat_completion_response.py index 30191439..eb78a109 100644 --- a/src/llama_stack_client/types/shared/chat_completion_response.py +++ b/src/llama_stack_client/types/shared/chat_completion_response.py @@ -1,20 +1,24 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. -from typing import List, Optional +from typing import Dict, List, Optional from .metric import Metric from ..._models import BaseModel -from ..token_log_probs import TokenLogProbs from .completion_message import CompletionMessage -__all__ = ["ChatCompletionResponse"] +__all__ = ["ChatCompletionResponse", "Logprob"] + + +class Logprob(BaseModel): + logprobs_by_token: Dict[str, float] + """Dictionary mapping tokens to their log probabilities""" class ChatCompletionResponse(BaseModel): completion_message: CompletionMessage """The complete response message""" - logprobs: Optional[List[TokenLogProbs]] = None + logprobs: Optional[List[Logprob]] = None """Optional log probabilities for generated tokens""" metrics: Optional[List[Metric]] = None diff --git a/src/llama_stack_client/types/shared_params/__init__.py b/src/llama_stack_client/types/shared_params/__init__.py index 3a0842e8..2ba8b592 100644 --- a/src/llama_stack_client/types/shared_params/__init__.py +++ b/src/llama_stack_client/types/shared_params/__init__.py @@ -11,7 +11,6 @@ from .sampling_params import SamplingParams as SamplingParams from .completion_message import CompletionMessage as CompletionMessage from .interleaved_content import InterleavedContent as InterleavedContent -from .tool_param_definition import ToolParamDefinition as ToolParamDefinition from .tool_response_message import ToolResponseMessage as ToolResponseMessage from .query_generator_config import QueryGeneratorConfig as QueryGeneratorConfig from .interleaved_content_item import InterleavedContentItem as InterleavedContentItem diff --git a/src/llama_stack_client/types/shared_params/tool_param_definition.py b/src/llama_stack_client/types/shared_params/tool_param_definition.py deleted file mode 100644 index 87563946..00000000 --- a/src/llama_stack_client/types/shared_params/tool_param_definition.py +++ /dev/null @@ -1,22 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from __future__ import annotations - -from typing import Union, Iterable -from typing_extensions import Required, TypedDict - -__all__ = ["ToolParamDefinition"] - - -class ToolParamDefinition(TypedDict, total=False): - param_type: Required[str] - - default: Union[bool, float, str, Iterable[object], object, None] - - description: str - - items: Union[bool, float, str, Iterable[object], object, None] - - required: bool - - title: str diff --git a/src/llama_stack_client/types/token_log_probs.py b/src/llama_stack_client/types/token_log_probs.py deleted file mode 100644 index b1a0a2b4..00000000 --- a/src/llama_stack_client/types/token_log_probs.py +++ /dev/null @@ -1,12 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
- -from typing import Dict - -from .._models import BaseModel - -__all__ = ["TokenLogProbs"] - - -class TokenLogProbs(BaseModel): - logprobs_by_token: Dict[str, float] - """Dictionary mapping tokens to their log probabilities""" diff --git a/tests/api_resources/test_inference.py b/tests/api_resources/test_inference.py index 6fc8040b..f26802c2 100644 --- a/tests/api_resources/test_inference.py +++ b/tests/api_resources/test_inference.py @@ -9,9 +9,7 @@ from tests.utils import assert_matches_type from llama_stack_client import LlamaStackClient, AsyncLlamaStackClient -from llama_stack_client.types.shared import ChatCompletionResponse - -# pyright: reportDeprecated=false +from llama_stack_client.types import InferenceRerankResponse base_url = os.environ.get("TEST_API_BASE_URL", "http://127.0.0.1:4010") @@ -20,212 +18,49 @@ class TestInference: parametrize = pytest.mark.parametrize("client", [False, True], indirect=True, ids=["loose", "strict"]) @parametrize - def test_method_chat_completion_overload_1(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference = client.inference.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - ) - - assert_matches_type(ChatCompletionResponse, inference, path=["response"]) + def test_method_rerank(self, client: LlamaStackClient) -> None: + inference = client.inference.rerank( + items=["string"], + model="model", + query="string", + ) + assert_matches_type(InferenceRerankResponse, inference, path=["response"]) @parametrize - def test_method_chat_completion_with_all_params_overload_1(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference = client.inference.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - "context": "string", - } - ], - model_id="model_id", - logprobs={"top_k": 0}, - response_format={ - "json_schema": {"foo": True}, - "type": "json_schema", - }, - sampling_params={ - "strategy": {"type": "greedy"}, - "max_tokens": 0, - "repetition_penalty": 0, - "stop": ["string"], - }, - stream=False, - tool_choice="auto", - tool_config={ - "system_message_behavior": "append", - "tool_choice": "auto", - "tool_prompt_format": "json", - }, - tool_prompt_format="json", - tools=[ - { - "tool_name": "brave_search", - "description": "description", - "parameters": { - "foo": { - "param_type": "param_type", - "default": True, - "description": "description", - "items": True, - "required": True, - "title": "title", - } - }, - } - ], - ) - - assert_matches_type(ChatCompletionResponse, inference, path=["response"]) + def test_method_rerank_with_all_params(self, client: LlamaStackClient) -> None: + inference = client.inference.rerank( + items=["string"], + model="model", + query="string", + max_num_results=0, + ) + assert_matches_type(InferenceRerankResponse, inference, path=["response"]) @parametrize - def test_raw_response_chat_completion_overload_1(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - response = client.inference.with_raw_response.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - ) + def test_raw_response_rerank(self, client: LlamaStackClient) -> None: + response = client.inference.with_raw_response.rerank( + items=["string"], + model="model", + query="string", + ) assert response.is_closed is True assert response.http_request.headers.get("X-Stainless-Lang") == "python" inference = response.parse() - 
assert_matches_type(ChatCompletionResponse, inference, path=["response"]) - - @parametrize - def test_streaming_response_chat_completion_overload_1(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - with client.inference.with_streaming_response.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - ) as response: - assert not response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - - inference = response.parse() - assert_matches_type(ChatCompletionResponse, inference, path=["response"]) - - assert cast(Any, response.is_closed) is True - - @parametrize - def test_method_chat_completion_overload_2(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference_stream = client.inference.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - stream=True, - ) - - inference_stream.response.close() - - @parametrize - def test_method_chat_completion_with_all_params_overload_2(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference_stream = client.inference.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - "context": "string", - } - ], - model_id="model_id", - stream=True, - logprobs={"top_k": 0}, - response_format={ - "json_schema": {"foo": True}, - "type": "json_schema", - }, - sampling_params={ - "strategy": {"type": "greedy"}, - "max_tokens": 0, - "repetition_penalty": 0, - "stop": ["string"], - }, - tool_choice="auto", - tool_config={ - "system_message_behavior": "append", - "tool_choice": "auto", - "tool_prompt_format": "json", - }, - tool_prompt_format="json", - tools=[ - { - "tool_name": "brave_search", - "description": "description", - "parameters": { - "foo": { - "param_type": "param_type", - "default": True, - "description": "description", - "items": True, - "required": True, - "title": "title", - } - }, - } - ], - ) - - inference_stream.response.close() - - @parametrize - def test_raw_response_chat_completion_overload_2(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - response = client.inference.with_raw_response.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - stream=True, - ) - - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - stream = response.parse() - stream.close() + assert_matches_type(InferenceRerankResponse, inference, path=["response"]) @parametrize - def test_streaming_response_chat_completion_overload_2(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - with client.inference.with_streaming_response.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - stream=True, - ) as response: - assert not response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" + def test_streaming_response_rerank(self, client: LlamaStackClient) -> None: + with client.inference.with_streaming_response.rerank( + items=["string"], + model="model", + query="string", + ) as response: + assert not response.is_closed + assert response.http_request.headers.get("X-Stainless-Lang") == "python" - stream = response.parse() - stream.close() + inference = response.parse() + assert_matches_type(InferenceRerankResponse, inference, path=["response"]) assert cast(Any, response.is_closed) is True @@ -236,211 
+71,48 @@ class TestAsyncInference: ) @parametrize - async def test_method_chat_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference = await async_client.inference.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - ) - - assert_matches_type(ChatCompletionResponse, inference, path=["response"]) + async def test_method_rerank(self, async_client: AsyncLlamaStackClient) -> None: + inference = await async_client.inference.rerank( + items=["string"], + model="model", + query="string", + ) + assert_matches_type(InferenceRerankResponse, inference, path=["response"]) @parametrize - async def test_method_chat_completion_with_all_params_overload_1(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference = await async_client.inference.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - "context": "string", - } - ], - model_id="model_id", - logprobs={"top_k": 0}, - response_format={ - "json_schema": {"foo": True}, - "type": "json_schema", - }, - sampling_params={ - "strategy": {"type": "greedy"}, - "max_tokens": 0, - "repetition_penalty": 0, - "stop": ["string"], - }, - stream=False, - tool_choice="auto", - tool_config={ - "system_message_behavior": "append", - "tool_choice": "auto", - "tool_prompt_format": "json", - }, - tool_prompt_format="json", - tools=[ - { - "tool_name": "brave_search", - "description": "description", - "parameters": { - "foo": { - "param_type": "param_type", - "default": True, - "description": "description", - "items": True, - "required": True, - "title": "title", - } - }, - } - ], - ) - - assert_matches_type(ChatCompletionResponse, inference, path=["response"]) + async def test_method_rerank_with_all_params(self, async_client: AsyncLlamaStackClient) -> None: + inference = await async_client.inference.rerank( + items=["string"], + model="model", + query="string", + max_num_results=0, + ) + assert_matches_type(InferenceRerankResponse, inference, path=["response"]) @parametrize - async def test_raw_response_chat_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - response = await async_client.inference.with_raw_response.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - ) + async def test_raw_response_rerank(self, async_client: AsyncLlamaStackClient) -> None: + response = await async_client.inference.with_raw_response.rerank( + items=["string"], + model="model", + query="string", + ) assert response.is_closed is True assert response.http_request.headers.get("X-Stainless-Lang") == "python" inference = await response.parse() - assert_matches_type(ChatCompletionResponse, inference, path=["response"]) - - @parametrize - async def test_streaming_response_chat_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - async with async_client.inference.with_streaming_response.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - ) as response: - assert not response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - - inference = await response.parse() - assert_matches_type(ChatCompletionResponse, inference, path=["response"]) - - assert cast(Any, response.is_closed) is True - - @parametrize - async def 
test_method_chat_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference_stream = await async_client.inference.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - stream=True, - ) - - await inference_stream.response.aclose() - - @parametrize - async def test_method_chat_completion_with_all_params_overload_2(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference_stream = await async_client.inference.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - "context": "string", - } - ], - model_id="model_id", - stream=True, - logprobs={"top_k": 0}, - response_format={ - "json_schema": {"foo": True}, - "type": "json_schema", - }, - sampling_params={ - "strategy": {"type": "greedy"}, - "max_tokens": 0, - "repetition_penalty": 0, - "stop": ["string"], - }, - tool_choice="auto", - tool_config={ - "system_message_behavior": "append", - "tool_choice": "auto", - "tool_prompt_format": "json", - }, - tool_prompt_format="json", - tools=[ - { - "tool_name": "brave_search", - "description": "description", - "parameters": { - "foo": { - "param_type": "param_type", - "default": True, - "description": "description", - "items": True, - "required": True, - "title": "title", - } - }, - } - ], - ) - - await inference_stream.response.aclose() - - @parametrize - async def test_raw_response_chat_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - response = await async_client.inference.with_raw_response.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - stream=True, - ) - - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - stream = await response.parse() - await stream.close() + assert_matches_type(InferenceRerankResponse, inference, path=["response"]) @parametrize - async def test_streaming_response_chat_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - async with async_client.inference.with_streaming_response.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - stream=True, - ) as response: - assert not response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" + async def test_streaming_response_rerank(self, async_client: AsyncLlamaStackClient) -> None: + async with async_client.inference.with_streaming_response.rerank( + items=["string"], + model="model", + query="string", + ) as response: + assert not response.is_closed + assert response.http_request.headers.get("X-Stainless-Lang") == "python" - stream = await response.parse() - await stream.close() + inference = await response.parse() + assert_matches_type(InferenceRerankResponse, inference, path=["response"]) assert cast(Any, response.is_closed) is True diff --git a/tests/test_client.py b/tests/test_client.py index a5bce12c..c5606d5d 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -678,36 +678,20 @@ def test_parse_retry_after_header(self, remaining_retries: int, retry_after: str @mock.patch("llama_stack_client._base_client.BaseClient._calculate_retry_timeout", _low_retry_timeout) @pytest.mark.respx(base_url=base_url) def test_retrying_timeout_errors_doesnt_leak(self, respx_mock: MockRouter, client: LlamaStackClient) -> None: - 
respx_mock.post("/v1/inference/chat-completion").mock(side_effect=httpx.TimeoutException("Test timeout error")) + respx_mock.get("/v1/toolgroups").mock(side_effect=httpx.TimeoutException("Test timeout error")) with pytest.raises(APITimeoutError): - client.inference.with_streaming_response.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - ).__enter__() + client.toolgroups.with_streaming_response.list().__enter__() assert _get_open_connections(self.client) == 0 @mock.patch("llama_stack_client._base_client.BaseClient._calculate_retry_timeout", _low_retry_timeout) @pytest.mark.respx(base_url=base_url) def test_retrying_status_errors_doesnt_leak(self, respx_mock: MockRouter, client: LlamaStackClient) -> None: - respx_mock.post("/v1/inference/chat-completion").mock(return_value=httpx.Response(500)) + respx_mock.get("/v1/toolgroups").mock(return_value=httpx.Response(500)) with pytest.raises(APIStatusError): - client.inference.with_streaming_response.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - ).__enter__() + client.toolgroups.with_streaming_response.list().__enter__() assert _get_open_connections(self.client) == 0 @pytest.mark.parametrize("failures_before_success", [0, 2, 4]) @@ -734,17 +718,9 @@ def retry_handler(_request: httpx.Request) -> httpx.Response: return httpx.Response(500) return httpx.Response(200) - respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler) + respx_mock.get("/v1/toolgroups").mock(side_effect=retry_handler) - response = client.inference.with_raw_response.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - ) + response = client.toolgroups.with_raw_response.list() assert response.retries_taken == failures_before_success assert int(response.http_request.headers.get("x-stainless-retry-count")) == failures_before_success @@ -766,18 +742,9 @@ def retry_handler(_request: httpx.Request) -> httpx.Response: return httpx.Response(500) return httpx.Response(200) - respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler) - - response = client.inference.with_raw_response.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - extra_headers={"x-stainless-retry-count": Omit()}, - ) + respx_mock.get("/v1/toolgroups").mock(side_effect=retry_handler) + + response = client.toolgroups.with_raw_response.list(extra_headers={"x-stainless-retry-count": Omit()}) assert len(response.http_request.headers.get_list("x-stainless-retry-count")) == 0 @@ -798,18 +765,9 @@ def retry_handler(_request: httpx.Request) -> httpx.Response: return httpx.Response(500) return httpx.Response(200) - respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler) - - response = client.inference.with_raw_response.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - extra_headers={"x-stainless-retry-count": "42"}, - ) + respx_mock.get("/v1/toolgroups").mock(side_effect=retry_handler) + + response = client.toolgroups.with_raw_response.list(extra_headers={"x-stainless-retry-count": "42"}) assert response.http_request.headers.get("x-stainless-retry-count") == "42" @@ -1498,18 +1456,10 @@ async def test_parse_retry_after_header(self, remaining_retries: int, retry_afte async def test_retrying_timeout_errors_doesnt_leak( self, respx_mock: MockRouter, async_client: 
AsyncLlamaStackClient ) -> None: - respx_mock.post("/v1/inference/chat-completion").mock(side_effect=httpx.TimeoutException("Test timeout error")) + respx_mock.get("/v1/toolgroups").mock(side_effect=httpx.TimeoutException("Test timeout error")) with pytest.raises(APITimeoutError): - await async_client.inference.with_streaming_response.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - ).__aenter__() + await async_client.toolgroups.with_streaming_response.list().__aenter__() assert _get_open_connections(self.client) == 0 @@ -1518,18 +1468,10 @@ async def test_retrying_timeout_errors_doesnt_leak( async def test_retrying_status_errors_doesnt_leak( self, respx_mock: MockRouter, async_client: AsyncLlamaStackClient ) -> None: - respx_mock.post("/v1/inference/chat-completion").mock(return_value=httpx.Response(500)) + respx_mock.get("/v1/toolgroups").mock(return_value=httpx.Response(500)) with pytest.raises(APIStatusError): - await async_client.inference.with_streaming_response.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - ).__aenter__() + await async_client.toolgroups.with_streaming_response.list().__aenter__() assert _get_open_connections(self.client) == 0 @pytest.mark.parametrize("failures_before_success", [0, 2, 4]) @@ -1557,17 +1499,9 @@ def retry_handler(_request: httpx.Request) -> httpx.Response: return httpx.Response(500) return httpx.Response(200) - respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler) + respx_mock.get("/v1/toolgroups").mock(side_effect=retry_handler) - response = await client.inference.with_raw_response.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - ) + response = await client.toolgroups.with_raw_response.list() assert response.retries_taken == failures_before_success assert int(response.http_request.headers.get("x-stainless-retry-count")) == failures_before_success @@ -1590,18 +1524,9 @@ def retry_handler(_request: httpx.Request) -> httpx.Response: return httpx.Response(500) return httpx.Response(200) - respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler) - - response = await client.inference.with_raw_response.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - extra_headers={"x-stainless-retry-count": Omit()}, - ) + respx_mock.get("/v1/toolgroups").mock(side_effect=retry_handler) + + response = await client.toolgroups.with_raw_response.list(extra_headers={"x-stainless-retry-count": Omit()}) assert len(response.http_request.headers.get_list("x-stainless-retry-count")) == 0 @@ -1623,18 +1548,9 @@ def retry_handler(_request: httpx.Request) -> httpx.Response: return httpx.Response(500) return httpx.Response(200) - respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler) - - response = await client.inference.with_raw_response.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - extra_headers={"x-stainless-retry-count": "42"}, - ) + respx_mock.get("/v1/toolgroups").mock(side_effect=retry_handler) + + response = await client.toolgroups.with_raw_response.list(extra_headers={"x-stainless-retry-count": "42"}) assert response.http_request.headers.get("x-stainless-retry-count") == "42" From f10ead00522b7ca803cd7dc3617da0d451efa7da Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Mon, 29 Sep 2025 19:40:32 
-0700 Subject: [PATCH 4/8] fix: clean up deprecated code --- .../lib/inference/event_logger.py | 30 +- src/llama_stack_client/resources/inference.py | 370 ------------------ 2 files changed, 3 insertions(+), 397 deletions(-) diff --git a/src/llama_stack_client/lib/inference/event_logger.py b/src/llama_stack_client/lib/inference/event_logger.py index 14b46372..cbf5f680 100644 --- a/src/llama_stack_client/lib/inference/event_logger.py +++ b/src/llama_stack_client/lib/inference/event_logger.py @@ -5,7 +5,7 @@ # the root directory of this source tree. from typing import Generator from termcolor import cprint -from llama_stack_client.types import ChatCompletionResponseStreamChunk, ChatCompletionChunk +from llama_stack_client.types import ChatCompletionChunk class InferenceStreamPrintableEvent: @@ -28,35 +28,11 @@ def __init__(self): self.is_thinking = False def yield_printable_events( - self, chunk: ChatCompletionResponseStreamChunk | ChatCompletionChunk + self, chunk: ChatCompletionChunk ) -> Generator[InferenceStreamPrintableEvent, None, None]: - # Check if the chunk has event attribute (ChatCompletionResponseStreamChunk) - if hasattr(chunk, "event"): - yield from self._handle_inference_stream_chunk(chunk) - # Check if the chunk has choices attribute (ChatCompletionChunk) - elif hasattr(chunk, "choices") and len(chunk.choices) > 0: + if hasattr(chunk, "choices") and len(chunk.choices) > 0: yield from self._handle_chat_completion_chunk(chunk) - def _handle_inference_stream_chunk( - self, chunk: ChatCompletionResponseStreamChunk - ) -> Generator[InferenceStreamPrintableEvent, None, None]: - event = chunk.event - if event.event_type == "start": - yield InferenceStreamPrintableEvent("Assistant> ", color="cyan", end="") - elif event.event_type == "progress": - if event.delta.type == "reasoning": - if not self.is_thinking: - yield InferenceStreamPrintableEvent(" ", color="magenta", end="") - self.is_thinking = True - yield InferenceStreamPrintableEvent(event.delta.reasoning, color="magenta", end="") - else: - if self.is_thinking: - yield InferenceStreamPrintableEvent("", color="magenta", end="") - self.is_thinking = False - yield InferenceStreamPrintableEvent(event.delta.text, color="yellow", end="") - elif event.event_type == "complete": - yield InferenceStreamPrintableEvent("") - def _handle_chat_completion_chunk( self, chunk: ChatCompletionChunk ) -> Generator[InferenceStreamPrintableEvent, None, None]: diff --git a/src/llama_stack_client/resources/inference.py b/src/llama_stack_client/resources/inference.py index 5c022b0f..e5cf7b6b 100644 --- a/src/llama_stack_client/resources/inference.py +++ b/src/llama_stack_client/resources/inference.py @@ -102,191 +102,6 @@ def rerank( cast_to=cast(Type[InferenceRerankResponse], DataWrapper[InferenceRerankResponse]), ) - @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.") - @overload - def completion( - self, - *, - content: InterleavedContent, - model_id: str, - logprobs: inference_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - stream: Literal[False] | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. 
- extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> CompletionResponse: - """ - Generate a completion for the given content using the specified model. - - Args: - content: The content to generate a completion for. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. - - sampling_params: (Optional) Parameters to control the sampling strategy. - - stream: (Optional) If True, generate an SSE event stream of the response. Defaults to - False. - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - ... - - @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.") - @overload - def completion( - self, - *, - content: InterleavedContent, - model_id: str, - stream: Literal[True], - logprobs: inference_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> Stream[CompletionResponse]: - """ - Generate a completion for the given content using the specified model. - - Args: - content: The content to generate a completion for. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - stream: (Optional) If True, generate an SSE event stream of the response. Defaults to - False. - - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. - - sampling_params: (Optional) Parameters to control the sampling strategy. - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - ... - - @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.") - @overload - def completion( - self, - *, - content: InterleavedContent, - model_id: str, - stream: bool, - logprobs: inference_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. 
- extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> CompletionResponse | Stream[CompletionResponse]: - """ - Generate a completion for the given content using the specified model. - - Args: - content: The content to generate a completion for. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - stream: (Optional) If True, generate an SSE event stream of the response. Defaults to - False. - - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. - - sampling_params: (Optional) Parameters to control the sampling strategy. - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - ... - - @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.") - @required_args(["content", "model_id"], ["content", "model_id", "stream"]) - def completion( - self, - *, - content: InterleavedContent, - model_id: str, - logprobs: inference_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - stream: Literal[False] | Literal[True] | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> CompletionResponse | Stream[CompletionResponse]: - if stream: - extra_headers = {"Accept": "text/event-stream", **(extra_headers or {})} - return self._post( - "/v1/inference/completion", - body=maybe_transform( - { - "content": content, - "model_id": model_id, - "logprobs": logprobs, - "response_format": response_format, - "sampling_params": sampling_params, - "stream": stream, - }, - inference_completion_params.InferenceCompletionParamsStreaming - if stream - else inference_completion_params.InferenceCompletionParamsNonStreaming, - ), - options=make_request_options( - extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout - ), - cast_to=CompletionResponse, - stream=stream or False, - stream_cls=Stream[CompletionResponse], - ) - - class AsyncInferenceResource(AsyncAPIResource): @cached_property @@ -366,191 +181,6 @@ async def rerank( cast_to=cast(Type[InferenceRerankResponse], DataWrapper[InferenceRerankResponse]), ) - @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.") - @overload - async def completion( - self, - *, - content: InterleavedContent, - model_id: str, - logprobs: inference_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - stream: Literal[False] | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. 
- # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> CompletionResponse: - """ - Generate a completion for the given content using the specified model. - - Args: - content: The content to generate a completion for. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. - - sampling_params: (Optional) Parameters to control the sampling strategy. - - stream: (Optional) If True, generate an SSE event stream of the response. Defaults to - False. - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - ... - - @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.") - @overload - async def completion( - self, - *, - content: InterleavedContent, - model_id: str, - stream: Literal[True], - logprobs: inference_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> AsyncStream[CompletionResponse]: - """ - Generate a completion for the given content using the specified model. - - Args: - content: The content to generate a completion for. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - stream: (Optional) If True, generate an SSE event stream of the response. Defaults to - False. - - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. - - sampling_params: (Optional) Parameters to control the sampling strategy. - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - ... - - @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.") - @overload - async def completion( - self, - *, - content: InterleavedContent, - model_id: str, - stream: bool, - logprobs: inference_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. 
- # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> CompletionResponse | AsyncStream[CompletionResponse]: - """ - Generate a completion for the given content using the specified model. - - Args: - content: The content to generate a completion for. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - stream: (Optional) If True, generate an SSE event stream of the response. Defaults to - False. - - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. - - sampling_params: (Optional) Parameters to control the sampling strategy. - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - ... - - @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.") - @required_args(["content", "model_id"], ["content", "model_id", "stream"]) - async def completion( - self, - *, - content: InterleavedContent, - model_id: str, - logprobs: inference_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - stream: Literal[False] | Literal[True] | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. 
- extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> CompletionResponse | AsyncStream[CompletionResponse]: - if stream: - extra_headers = {"Accept": "text/event-stream", **(extra_headers or {})} - return await self._post( - "/v1/inference/completion", - body=await async_maybe_transform( - { - "content": content, - "model_id": model_id, - "logprobs": logprobs, - "response_format": response_format, - "sampling_params": sampling_params, - "stream": stream, - }, - inference_completion_params.InferenceCompletionParamsStreaming - if stream - else inference_completion_params.InferenceCompletionParamsNonStreaming, - ), - options=make_request_options( - extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout - ), - cast_to=CompletionResponse, - stream=stream or False, - stream_cls=AsyncStream[CompletionResponse], - ) - - class InferenceResourceWithRawResponse: def __init__(self, inference: InferenceResource) -> None: From 433a996527bcca131ada4730376d8993f34ad6f5 Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Tue, 30 Sep 2025 03:37:07 +0000 Subject: [PATCH 5/8] feat(api): updating post /v1/files to have correct multipart/form-data --- .stats.yml | 4 +- README.md | 1 + api.md | 11 +- src/llama_stack_client/resources/files.py | 36 +++- src/llama_stack_client/types/__init__.py | 3 - .../types/agents/__init__.py | 1 - .../types/agents/turn_response_event.py | 155 +++++++++++++++++- .../agents/turn_response_event_payload.py | 109 ------------ .../types/benchmark_config_param.py | 35 +++- .../types/eval_candidate_param.py | 35 ---- .../types/file_create_params.py | 20 ++- .../types/shared/__init__.py | 2 - .../types/shared/content_delta.py | 43 ----- .../types/shared/query_config.py | 36 +++- .../types/shared/query_generator_config.py | 33 ---- .../types/shared_params/__init__.py | 1 - .../types/shared_params/query_config.py | 34 +++- .../shared_params/query_generator_config.py | 30 ---- tests/api_resources/test_files.py | 30 ++++ 19 files changed, 335 insertions(+), 284 deletions(-) delete mode 100644 src/llama_stack_client/types/agents/turn_response_event_payload.py delete mode 100644 src/llama_stack_client/types/eval_candidate_param.py delete mode 100644 src/llama_stack_client/types/shared/content_delta.py delete mode 100644 src/llama_stack_client/types/shared/query_generator_config.py delete mode 100644 src/llama_stack_client/types/shared_params/query_generator_config.py diff --git a/.stats.yml b/.stats.yml index ed589610..20dba32e 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ configured_endpoints: 105 -openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/llamastack%2Fllama-stack-client-adcfaad1990d45e42b20e200a9ecc35ee32df5692bd9cd18ae898b0b7728c919.yml -openapi_spec_hash: 4f532287bafe5da0578a1c1a5e31c952 +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/llamastack%2Fllama-stack-client-d7bea816190382a93511491e33d1f37f707620926ab133ae8ce0883d763df741.yml +openapi_spec_hash: f73b3af77108625edae3f25972b9e665 config_hash: 5b643c97c83a497d7d346253f1e175f3 diff --git a/README.md b/README.md index 75857f1d..76c9b9ae 100644 --- a/README.md +++ b/README.md @@ -146,6 +146,7 @@ client = LlamaStackClient() client.files.create( file=Path("/path/to/file"), + purpose="assistants", ) ``` diff --git a/api.md b/api.md index 
85e5a178..c246f4c1 100644 --- a/api.md +++ b/api.md @@ -5,7 +5,6 @@ from llama_stack_client.types import ( AgentConfig, ChatCompletionResponse, CompletionMessage, - ContentDelta, Document, InterleavedContent, InterleavedContentItem, @@ -13,7 +12,6 @@ from llama_stack_client.types import ( Metric, ParamType, QueryConfig, - QueryGeneratorConfig, QueryResult, ResponseFormat, SafetyViolation, @@ -163,12 +161,7 @@ Methods: Types: ```python -from llama_stack_client.types.agents import ( - AgentTurnResponseStreamChunk, - Turn, - TurnResponseEvent, - TurnResponseEventPayload, -) +from llama_stack_client.types.agents import AgentTurnResponseStreamChunk, Turn, TurnResponseEvent ``` Methods: @@ -205,7 +198,7 @@ Methods: Types: ```python -from llama_stack_client.types import BenchmarkConfig, EvalCandidate, EvaluateResponse, Job +from llama_stack_client.types import BenchmarkConfig, EvaluateResponse, Job ``` Methods: diff --git a/src/llama_stack_client/resources/files.py b/src/llama_stack_client/resources/files.py index 04c37c56..39add811 100644 --- a/src/llama_stack_client/resources/files.py +++ b/src/llama_stack_client/resources/files.py @@ -50,6 +50,8 @@ def create( self, *, file: FileTypes, + purpose: Literal["assistants", "batch"], + expires_after: file_create_params.ExpiresAfter | Omit = omit, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. extra_headers: Headers | None = None, @@ -67,6 +69,14 @@ def create( - expires_after: Optional form values describing expiration for the file. Args: + purpose: Valid purpose values for OpenAI Files API. + + expires_after: + Control expiration of uploaded files. Params: + + - anchor, must be "created_at" + - seconds, must be int between 3600 and 2592000 (1 hour to 30 days) + extra_headers: Send extra headers extra_query: Add additional query parameters to the request @@ -75,7 +85,13 @@ def create( timeout: Override the client-level default timeout for this request, in seconds """ - body = deepcopy_minimal({"file": file}) + body = deepcopy_minimal( + { + "file": file, + "purpose": purpose, + "expires_after": expires_after, + } + ) files = extract_files(cast(Mapping[str, object], body), paths=[["file"]]) # It should be noted that the actual Content-Type header that will be # sent to the server will contain a `boundary` parameter, e.g. @@ -275,6 +291,8 @@ async def create( self, *, file: FileTypes, + purpose: Literal["assistants", "batch"], + expires_after: file_create_params.ExpiresAfter | Omit = omit, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. extra_headers: Headers | None = None, @@ -292,6 +310,14 @@ async def create( - expires_after: Optional form values describing expiration for the file. Args: + purpose: Valid purpose values for OpenAI Files API. + + expires_after: + Control expiration of uploaded files. 
Params: + + - anchor, must be "created_at" + - seconds, must be int between 3600 and 2592000 (1 hour to 30 days) + extra_headers: Send extra headers extra_query: Add additional query parameters to the request @@ -300,7 +326,13 @@ async def create( timeout: Override the client-level default timeout for this request, in seconds """ - body = deepcopy_minimal({"file": file}) + body = deepcopy_minimal( + { + "file": file, + "purpose": purpose, + "expires_after": expires_after, + } + ) files = extract_files(cast(Mapping[str, object], body), paths=[["file"]]) # It should be noted that the actual Content-Type header that will be # sent to the server will contain a `boundary` parameter, e.g. diff --git a/src/llama_stack_client/types/__init__.py b/src/llama_stack_client/types/__init__.py index 78cfbe2d..f81ada61 100644 --- a/src/llama_stack_client/types/__init__.py +++ b/src/llama_stack_client/types/__init__.py @@ -17,7 +17,6 @@ QueryConfig as QueryConfig, QueryResult as QueryResult, UserMessage as UserMessage, - ContentDelta as ContentDelta, ScoringResult as ScoringResult, SystemMessage as SystemMessage, ResponseFormat as ResponseFormat, @@ -27,7 +26,6 @@ InterleavedContent as InterleavedContent, ToolParamDefinition as ToolParamDefinition, ToolResponseMessage as ToolResponseMessage, - QueryGeneratorConfig as QueryGeneratorConfig, ChatCompletionResponse as ChatCompletionResponse, InterleavedContentItem as InterleavedContentItem, ) @@ -67,7 +65,6 @@ from .tool_execution_step import ToolExecutionStep as ToolExecutionStep from .tool_response_param import ToolResponseParam as ToolResponseParam from .delete_file_response import DeleteFileResponse as DeleteFileResponse -from .eval_candidate_param import EvalCandidateParam as EvalCandidateParam from .eval_run_eval_params import EvalRunEvalParams as EvalRunEvalParams from .list_models_response import ListModelsResponse as ListModelsResponse from .list_routes_response import ListRoutesResponse as ListRoutesResponse diff --git a/src/llama_stack_client/types/agents/__init__.py b/src/llama_stack_client/types/agents/__init__.py index f4f48353..3a144840 100644 --- a/src/llama_stack_client/types/agents/__init__.py +++ b/src/llama_stack_client/types/agents/__init__.py @@ -13,5 +13,4 @@ from .step_retrieve_response import StepRetrieveResponse as StepRetrieveResponse from .session_create_response import SessionCreateResponse as SessionCreateResponse from .session_retrieve_params import SessionRetrieveParams as SessionRetrieveParams -from .turn_response_event_payload import TurnResponseEventPayload as TurnResponseEventPayload from .agent_turn_response_stream_chunk import AgentTurnResponseStreamChunk as AgentTurnResponseStreamChunk diff --git a/src/llama_stack_client/types/agents/turn_response_event.py b/src/llama_stack_client/types/agents/turn_response_event.py index df213246..c52121ab 100644 --- a/src/llama_stack_client/types/agents/turn_response_event.py +++ b/src/llama_stack_client/types/agents/turn_response_event.py @@ -1,11 +1,160 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
+from typing import Dict, List, Union, Optional +from typing_extensions import Literal, Annotated, TypeAlias + +from .turn import Turn +from ..._utils import PropertyInfo from ..._models import BaseModel -from .turn_response_event_payload import TurnResponseEventPayload +from ..inference_step import InferenceStep +from ..shared.tool_call import ToolCall +from ..shield_call_step import ShieldCallStep +from ..tool_execution_step import ToolExecutionStep +from ..memory_retrieval_step import MemoryRetrievalStep + +__all__ = [ + "TurnResponseEvent", + "Payload", + "PayloadAgentTurnResponseStepStartPayload", + "PayloadAgentTurnResponseStepProgressPayload", + "PayloadAgentTurnResponseStepProgressPayloadDelta", + "PayloadAgentTurnResponseStepProgressPayloadDeltaTextDelta", + "PayloadAgentTurnResponseStepProgressPayloadDeltaImageDelta", + "PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDelta", + "PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDeltaToolCall", + "PayloadAgentTurnResponseStepCompletePayload", + "PayloadAgentTurnResponseStepCompletePayloadStepDetails", + "PayloadAgentTurnResponseTurnStartPayload", + "PayloadAgentTurnResponseTurnCompletePayload", + "PayloadAgentTurnResponseTurnAwaitingInputPayload", +] + + +class PayloadAgentTurnResponseStepStartPayload(BaseModel): + event_type: Literal["step_start"] + """Type of event being reported""" + + step_id: str + """Unique identifier for the step within a turn""" + + step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"] + """Type of step being executed""" + + metadata: Optional[Dict[str, Union[bool, float, str, List[object], object, None]]] = None + """(Optional) Additional metadata for the step""" + + +class PayloadAgentTurnResponseStepProgressPayloadDeltaTextDelta(BaseModel): + text: str + """The incremental text content""" + + type: Literal["text"] + """Discriminator type of the delta. Always "text" """ + + +class PayloadAgentTurnResponseStepProgressPayloadDeltaImageDelta(BaseModel): + image: str + """The incremental image data as bytes""" + + type: Literal["image"] + """Discriminator type of the delta. Always "image" """ + + +PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDeltaToolCall: TypeAlias = Union[str, ToolCall] + + +class PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDelta(BaseModel): + parse_status: Literal["started", "in_progress", "failed", "succeeded"] + """Current parsing status of the tool call""" + + tool_call: PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDeltaToolCall + """Either an in-progress tool call string or the final parsed tool call""" + + type: Literal["tool_call"] + """Discriminator type of the delta. 
Always "tool_call" """ + + +PayloadAgentTurnResponseStepProgressPayloadDelta: TypeAlias = Annotated[ + Union[ + PayloadAgentTurnResponseStepProgressPayloadDeltaTextDelta, + PayloadAgentTurnResponseStepProgressPayloadDeltaImageDelta, + PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDelta, + ], + PropertyInfo(discriminator="type"), +] + + +class PayloadAgentTurnResponseStepProgressPayload(BaseModel): + delta: PayloadAgentTurnResponseStepProgressPayloadDelta + """Incremental content changes during step execution""" + + event_type: Literal["step_progress"] + """Type of event being reported""" + + step_id: str + """Unique identifier for the step within a turn""" + + step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"] + """Type of step being executed""" + + +PayloadAgentTurnResponseStepCompletePayloadStepDetails: TypeAlias = Annotated[ + Union[InferenceStep, ToolExecutionStep, ShieldCallStep, MemoryRetrievalStep], + PropertyInfo(discriminator="step_type"), +] + + +class PayloadAgentTurnResponseStepCompletePayload(BaseModel): + event_type: Literal["step_complete"] + """Type of event being reported""" + + step_details: PayloadAgentTurnResponseStepCompletePayloadStepDetails + """Complete details of the executed step""" + + step_id: str + """Unique identifier for the step within a turn""" + + step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"] + """Type of step being executed""" + + +class PayloadAgentTurnResponseTurnStartPayload(BaseModel): + event_type: Literal["turn_start"] + """Type of event being reported""" + + turn_id: str + """Unique identifier for the turn within a session""" + + +class PayloadAgentTurnResponseTurnCompletePayload(BaseModel): + event_type: Literal["turn_complete"] + """Type of event being reported""" + + turn: Turn + """Complete turn data including all steps and results""" + + +class PayloadAgentTurnResponseTurnAwaitingInputPayload(BaseModel): + event_type: Literal["turn_awaiting_input"] + """Type of event being reported""" + + turn: Turn + """Turn data when waiting for external tool responses""" + -__all__ = ["TurnResponseEvent"] +Payload: TypeAlias = Annotated[ + Union[ + PayloadAgentTurnResponseStepStartPayload, + PayloadAgentTurnResponseStepProgressPayload, + PayloadAgentTurnResponseStepCompletePayload, + PayloadAgentTurnResponseTurnStartPayload, + PayloadAgentTurnResponseTurnCompletePayload, + PayloadAgentTurnResponseTurnAwaitingInputPayload, + ], + PropertyInfo(discriminator="event_type"), +] class TurnResponseEvent(BaseModel): - payload: TurnResponseEventPayload + payload: Payload """Event-specific payload containing event data""" diff --git a/src/llama_stack_client/types/agents/turn_response_event_payload.py b/src/llama_stack_client/types/agents/turn_response_event_payload.py deleted file mode 100644 index 1844c61e..00000000 --- a/src/llama_stack_client/types/agents/turn_response_event_payload.py +++ /dev/null @@ -1,109 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
- -from typing import Dict, List, Union, Optional -from typing_extensions import Literal, Annotated, TypeAlias - -from .turn import Turn -from ..._utils import PropertyInfo -from ..._models import BaseModel -from ..inference_step import InferenceStep -from ..shield_call_step import ShieldCallStep -from ..tool_execution_step import ToolExecutionStep -from ..shared.content_delta import ContentDelta -from ..memory_retrieval_step import MemoryRetrievalStep - -__all__ = [ - "TurnResponseEventPayload", - "AgentTurnResponseStepStartPayload", - "AgentTurnResponseStepProgressPayload", - "AgentTurnResponseStepCompletePayload", - "AgentTurnResponseStepCompletePayloadStepDetails", - "AgentTurnResponseTurnStartPayload", - "AgentTurnResponseTurnCompletePayload", - "AgentTurnResponseTurnAwaitingInputPayload", -] - - -class AgentTurnResponseStepStartPayload(BaseModel): - event_type: Literal["step_start"] - """Type of event being reported""" - - step_id: str - """Unique identifier for the step within a turn""" - - step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"] - """Type of step being executed""" - - metadata: Optional[Dict[str, Union[bool, float, str, List[object], object, None]]] = None - """(Optional) Additional metadata for the step""" - - -class AgentTurnResponseStepProgressPayload(BaseModel): - delta: ContentDelta - """Incremental content changes during step execution""" - - event_type: Literal["step_progress"] - """Type of event being reported""" - - step_id: str - """Unique identifier for the step within a turn""" - - step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"] - """Type of step being executed""" - - -AgentTurnResponseStepCompletePayloadStepDetails: TypeAlias = Annotated[ - Union[InferenceStep, ToolExecutionStep, ShieldCallStep, MemoryRetrievalStep], - PropertyInfo(discriminator="step_type"), -] - - -class AgentTurnResponseStepCompletePayload(BaseModel): - event_type: Literal["step_complete"] - """Type of event being reported""" - - step_details: AgentTurnResponseStepCompletePayloadStepDetails - """Complete details of the executed step""" - - step_id: str - """Unique identifier for the step within a turn""" - - step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"] - """Type of step being executed""" - - -class AgentTurnResponseTurnStartPayload(BaseModel): - event_type: Literal["turn_start"] - """Type of event being reported""" - - turn_id: str - """Unique identifier for the turn within a session""" - - -class AgentTurnResponseTurnCompletePayload(BaseModel): - event_type: Literal["turn_complete"] - """Type of event being reported""" - - turn: Turn - """Complete turn data including all steps and results""" - - -class AgentTurnResponseTurnAwaitingInputPayload(BaseModel): - event_type: Literal["turn_awaiting_input"] - """Type of event being reported""" - - turn: Turn - """Turn data when waiting for external tool responses""" - - -TurnResponseEventPayload: TypeAlias = Annotated[ - Union[ - AgentTurnResponseStepStartPayload, - AgentTurnResponseStepProgressPayload, - AgentTurnResponseStepCompletePayload, - AgentTurnResponseTurnStartPayload, - AgentTurnResponseTurnCompletePayload, - AgentTurnResponseTurnAwaitingInputPayload, - ], - PropertyInfo(discriminator="event_type"), -] diff --git a/src/llama_stack_client/types/benchmark_config_param.py b/src/llama_stack_client/types/benchmark_config_param.py index 740bf99b..dc968521 100644 --- a/src/llama_stack_client/types/benchmark_config_param.py +++ 
b/src/llama_stack_client/types/benchmark_config_param.py @@ -2,17 +2,42 @@ from __future__ import annotations -from typing import Dict -from typing_extensions import Required, TypedDict +from typing import Dict, Union +from typing_extensions import Literal, Required, TypeAlias, TypedDict -from .eval_candidate_param import EvalCandidateParam from .scoring_fn_params_param import ScoringFnParamsParam +from .shared_params.agent_config import AgentConfig +from .shared_params.system_message import SystemMessage +from .shared_params.sampling_params import SamplingParams -__all__ = ["BenchmarkConfigParam"] +__all__ = ["BenchmarkConfigParam", "EvalCandidate", "EvalCandidateModelCandidate", "EvalCandidateAgentCandidate"] + + +class EvalCandidateModelCandidate(TypedDict, total=False): + model: Required[str] + """The model ID to evaluate.""" + + sampling_params: Required[SamplingParams] + """The sampling parameters for the model.""" + + type: Required[Literal["model"]] + + system_message: SystemMessage + """(Optional) The system message providing instructions or context to the model.""" + + +class EvalCandidateAgentCandidate(TypedDict, total=False): + config: Required[AgentConfig] + """The configuration for the agent candidate.""" + + type: Required[Literal["agent"]] + + +EvalCandidate: TypeAlias = Union[EvalCandidateModelCandidate, EvalCandidateAgentCandidate] class BenchmarkConfigParam(TypedDict, total=False): - eval_candidate: Required[EvalCandidateParam] + eval_candidate: Required[EvalCandidate] """The candidate to evaluate.""" scoring_params: Required[Dict[str, ScoringFnParamsParam]] diff --git a/src/llama_stack_client/types/eval_candidate_param.py b/src/llama_stack_client/types/eval_candidate_param.py deleted file mode 100644 index be1b21c8..00000000 --- a/src/llama_stack_client/types/eval_candidate_param.py +++ /dev/null @@ -1,35 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
- -from __future__ import annotations - -from typing import Union -from typing_extensions import Literal, Required, TypeAlias, TypedDict - -from .shared_params.agent_config import AgentConfig -from .shared_params.system_message import SystemMessage -from .shared_params.sampling_params import SamplingParams - -__all__ = ["EvalCandidateParam", "ModelCandidate", "AgentCandidate"] - - -class ModelCandidate(TypedDict, total=False): - model: Required[str] - """The model ID to evaluate.""" - - sampling_params: Required[SamplingParams] - """The sampling parameters for the model.""" - - type: Required[Literal["model"]] - - system_message: SystemMessage - """(Optional) The system message providing instructions or context to the model.""" - - -class AgentCandidate(TypedDict, total=False): - config: Required[AgentConfig] - """The configuration for the agent candidate.""" - - type: Required[Literal["agent"]] - - -EvalCandidateParam: TypeAlias = Union[ModelCandidate, AgentCandidate] diff --git a/src/llama_stack_client/types/file_create_params.py b/src/llama_stack_client/types/file_create_params.py index 6278e1a0..2be39a7a 100644 --- a/src/llama_stack_client/types/file_create_params.py +++ b/src/llama_stack_client/types/file_create_params.py @@ -2,12 +2,28 @@ from __future__ import annotations -from typing_extensions import Required, TypedDict +from typing_extensions import Literal, Required, TypedDict from .._types import FileTypes -__all__ = ["FileCreateParams"] +__all__ = ["FileCreateParams", "ExpiresAfter"] class FileCreateParams(TypedDict, total=False): file: Required[FileTypes] + + purpose: Required[Literal["assistants", "batch"]] + """Valid purpose values for OpenAI Files API.""" + + expires_after: ExpiresAfter + """Control expiration of uploaded files. Params: + + - anchor, must be "created_at" + - seconds, must be int between 3600 and 2592000 (1 hour to 30 days) + """ + + +class ExpiresAfter(TypedDict, total=False): + anchor: Required[Literal["created_at"]] + + seconds: Required[int] diff --git a/src/llama_stack_client/types/shared/__init__.py b/src/llama_stack_client/types/shared/__init__.py index 007d56ac..f346cda7 100644 --- a/src/llama_stack_client/types/shared/__init__.py +++ b/src/llama_stack_client/types/shared/__init__.py @@ -9,7 +9,6 @@ from .query_config import QueryConfig as QueryConfig from .query_result import QueryResult as QueryResult from .user_message import UserMessage as UserMessage -from .content_delta import ContentDelta as ContentDelta from .scoring_result import ScoringResult as ScoringResult from .system_message import SystemMessage as SystemMessage from .response_format import ResponseFormat as ResponseFormat @@ -19,6 +18,5 @@ from .interleaved_content import InterleavedContent as InterleavedContent from .tool_param_definition import ToolParamDefinition as ToolParamDefinition from .tool_response_message import ToolResponseMessage as ToolResponseMessage -from .query_generator_config import QueryGeneratorConfig as QueryGeneratorConfig from .chat_completion_response import ChatCompletionResponse as ChatCompletionResponse from .interleaved_content_item import InterleavedContentItem as InterleavedContentItem diff --git a/src/llama_stack_client/types/shared/content_delta.py b/src/llama_stack_client/types/shared/content_delta.py deleted file mode 100644 index 7ed58d13..00000000 --- a/src/llama_stack_client/types/shared/content_delta.py +++ /dev/null @@ -1,43 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
- -from typing import Union -from typing_extensions import Literal, Annotated, TypeAlias - -from ..._utils import PropertyInfo -from ..._models import BaseModel -from .tool_call import ToolCall - -__all__ = ["ContentDelta", "TextDelta", "ImageDelta", "ToolCallDelta", "ToolCallDeltaToolCall"] - - -class TextDelta(BaseModel): - text: str - """The incremental text content""" - - type: Literal["text"] - """Discriminator type of the delta. Always "text" """ - - -class ImageDelta(BaseModel): - image: str - """The incremental image data as bytes""" - - type: Literal["image"] - """Discriminator type of the delta. Always "image" """ - - -ToolCallDeltaToolCall: TypeAlias = Union[str, ToolCall] - - -class ToolCallDelta(BaseModel): - parse_status: Literal["started", "in_progress", "failed", "succeeded"] - """Current parsing status of the tool call""" - - tool_call: ToolCallDeltaToolCall - """Either an in-progress tool call string or the final parsed tool call""" - - type: Literal["tool_call"] - """Discriminator type of the delta. Always "tool_call" """ - - -ContentDelta: TypeAlias = Annotated[Union[TextDelta, ImageDelta, ToolCallDelta], PropertyInfo(discriminator="type")] diff --git a/src/llama_stack_client/types/shared/query_config.py b/src/llama_stack_client/types/shared/query_config.py index 389514c7..a4a1f741 100644 --- a/src/llama_stack_client/types/shared/query_config.py +++ b/src/llama_stack_client/types/shared/query_config.py @@ -5,9 +5,41 @@ from ..._utils import PropertyInfo from ..._models import BaseModel -from .query_generator_config import QueryGeneratorConfig -__all__ = ["QueryConfig", "Ranker", "RankerRrfRanker", "RankerWeightedRanker"] +__all__ = [ + "QueryConfig", + "QueryGeneratorConfig", + "QueryGeneratorConfigDefaultRagQueryGeneratorConfig", + "QueryGeneratorConfigLlmragQueryGeneratorConfig", + "Ranker", + "RankerRrfRanker", + "RankerWeightedRanker", +] + + +class QueryGeneratorConfigDefaultRagQueryGeneratorConfig(BaseModel): + separator: str + """String separator used to join query terms""" + + type: Literal["default"] + """Type of query generator, always 'default'""" + + +class QueryGeneratorConfigLlmragQueryGeneratorConfig(BaseModel): + model: str + """Name of the language model to use for query generation""" + + template: str + """Template string for formatting the query generation prompt""" + + type: Literal["llm"] + """Type of query generator, always 'llm'""" + + +QueryGeneratorConfig: TypeAlias = Annotated[ + Union[QueryGeneratorConfigDefaultRagQueryGeneratorConfig, QueryGeneratorConfigLlmragQueryGeneratorConfig], + PropertyInfo(discriminator="type"), +] class RankerRrfRanker(BaseModel): diff --git a/src/llama_stack_client/types/shared/query_generator_config.py b/src/llama_stack_client/types/shared/query_generator_config.py deleted file mode 100644 index 624fc190..00000000 --- a/src/llama_stack_client/types/shared/query_generator_config.py +++ /dev/null @@ -1,33 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
- -from typing import Union -from typing_extensions import Literal, Annotated, TypeAlias - -from ..._utils import PropertyInfo -from ..._models import BaseModel - -__all__ = ["QueryGeneratorConfig", "DefaultRagQueryGeneratorConfig", "LlmragQueryGeneratorConfig"] - - -class DefaultRagQueryGeneratorConfig(BaseModel): - separator: str - """String separator used to join query terms""" - - type: Literal["default"] - """Type of query generator, always 'default'""" - - -class LlmragQueryGeneratorConfig(BaseModel): - model: str - """Name of the language model to use for query generation""" - - template: str - """Template string for formatting the query generation prompt""" - - type: Literal["llm"] - """Type of query generator, always 'llm'""" - - -QueryGeneratorConfig: TypeAlias = Annotated[ - Union[DefaultRagQueryGeneratorConfig, LlmragQueryGeneratorConfig], PropertyInfo(discriminator="type") -] diff --git a/src/llama_stack_client/types/shared_params/__init__.py b/src/llama_stack_client/types/shared_params/__init__.py index 2ba8b592..894d8a8d 100644 --- a/src/llama_stack_client/types/shared_params/__init__.py +++ b/src/llama_stack_client/types/shared_params/__init__.py @@ -12,5 +12,4 @@ from .completion_message import CompletionMessage as CompletionMessage from .interleaved_content import InterleavedContent as InterleavedContent from .tool_response_message import ToolResponseMessage as ToolResponseMessage -from .query_generator_config import QueryGeneratorConfig as QueryGeneratorConfig from .interleaved_content_item import InterleavedContentItem as InterleavedContentItem diff --git a/src/llama_stack_client/types/shared_params/query_config.py b/src/llama_stack_client/types/shared_params/query_config.py index d008c48c..91a5b596 100644 --- a/src/llama_stack_client/types/shared_params/query_config.py +++ b/src/llama_stack_client/types/shared_params/query_config.py @@ -5,9 +5,39 @@ from typing import Union from typing_extensions import Literal, Required, TypeAlias, TypedDict -from .query_generator_config import QueryGeneratorConfig +__all__ = [ + "QueryConfig", + "QueryGeneratorConfig", + "QueryGeneratorConfigDefaultRagQueryGeneratorConfig", + "QueryGeneratorConfigLlmragQueryGeneratorConfig", + "Ranker", + "RankerRrfRanker", + "RankerWeightedRanker", +] -__all__ = ["QueryConfig", "Ranker", "RankerRrfRanker", "RankerWeightedRanker"] + +class QueryGeneratorConfigDefaultRagQueryGeneratorConfig(TypedDict, total=False): + separator: Required[str] + """String separator used to join query terms""" + + type: Required[Literal["default"]] + """Type of query generator, always 'default'""" + + +class QueryGeneratorConfigLlmragQueryGeneratorConfig(TypedDict, total=False): + model: Required[str] + """Name of the language model to use for query generation""" + + template: Required[str] + """Template string for formatting the query generation prompt""" + + type: Required[Literal["llm"]] + """Type of query generator, always 'llm'""" + + +QueryGeneratorConfig: TypeAlias = Union[ + QueryGeneratorConfigDefaultRagQueryGeneratorConfig, QueryGeneratorConfigLlmragQueryGeneratorConfig +] class RankerRrfRanker(TypedDict, total=False): diff --git a/src/llama_stack_client/types/shared_params/query_generator_config.py b/src/llama_stack_client/types/shared_params/query_generator_config.py deleted file mode 100644 index 8c589bf9..00000000 --- a/src/llama_stack_client/types/shared_params/query_generator_config.py +++ /dev/null @@ -1,30 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
- -from __future__ import annotations - -from typing import Union -from typing_extensions import Literal, Required, TypeAlias, TypedDict - -__all__ = ["QueryGeneratorConfig", "DefaultRagQueryGeneratorConfig", "LlmragQueryGeneratorConfig"] - - -class DefaultRagQueryGeneratorConfig(TypedDict, total=False): - separator: Required[str] - """String separator used to join query terms""" - - type: Required[Literal["default"]] - """Type of query generator, always 'default'""" - - -class LlmragQueryGeneratorConfig(TypedDict, total=False): - model: Required[str] - """Name of the language model to use for query generation""" - - template: Required[str] - """Template string for formatting the query generation prompt""" - - type: Required[Literal["llm"]] - """Type of query generator, always 'llm'""" - - -QueryGeneratorConfig: TypeAlias = Union[DefaultRagQueryGeneratorConfig, LlmragQueryGeneratorConfig] diff --git a/tests/api_resources/test_files.py b/tests/api_resources/test_files.py index bdf81d4f..83b763ab 100644 --- a/tests/api_resources/test_files.py +++ b/tests/api_resources/test_files.py @@ -22,6 +22,19 @@ class TestFiles: def test_method_create(self, client: LlamaStackClient) -> None: file = client.files.create( file=b"raw file contents", + purpose="assistants", + ) + assert_matches_type(File, file, path=["response"]) + + @parametrize + def test_method_create_with_all_params(self, client: LlamaStackClient) -> None: + file = client.files.create( + file=b"raw file contents", + purpose="assistants", + expires_after={ + "anchor": "created_at", + "seconds": 0, + }, ) assert_matches_type(File, file, path=["response"]) @@ -29,6 +42,7 @@ def test_method_create(self, client: LlamaStackClient) -> None: def test_raw_response_create(self, client: LlamaStackClient) -> None: response = client.files.with_raw_response.create( file=b"raw file contents", + purpose="assistants", ) assert response.is_closed is True @@ -40,6 +54,7 @@ def test_raw_response_create(self, client: LlamaStackClient) -> None: def test_streaming_response_create(self, client: LlamaStackClient) -> None: with client.files.with_streaming_response.create( file=b"raw file contents", + purpose="assistants", ) as response: assert not response.is_closed assert response.http_request.headers.get("X-Stainless-Lang") == "python" @@ -208,6 +223,19 @@ class TestAsyncFiles: async def test_method_create(self, async_client: AsyncLlamaStackClient) -> None: file = await async_client.files.create( file=b"raw file contents", + purpose="assistants", + ) + assert_matches_type(File, file, path=["response"]) + + @parametrize + async def test_method_create_with_all_params(self, async_client: AsyncLlamaStackClient) -> None: + file = await async_client.files.create( + file=b"raw file contents", + purpose="assistants", + expires_after={ + "anchor": "created_at", + "seconds": 0, + }, ) assert_matches_type(File, file, path=["response"]) @@ -215,6 +243,7 @@ async def test_method_create(self, async_client: AsyncLlamaStackClient) -> None: async def test_raw_response_create(self, async_client: AsyncLlamaStackClient) -> None: response = await async_client.files.with_raw_response.create( file=b"raw file contents", + purpose="assistants", ) assert response.is_closed is True @@ -226,6 +255,7 @@ async def test_raw_response_create(self, async_client: AsyncLlamaStackClient) -> async def test_streaming_response_create(self, async_client: AsyncLlamaStackClient) -> None: async with async_client.files.with_streaming_response.create( file=b"raw file contents", + purpose="assistants", ) as 
response: assert not response.is_closed assert response.http_request.headers.get("X-Stainless-Lang") == "python" From f89674726f55915a8cda0e2b4284be3c92978121 Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Tue, 30 Sep 2025 03:38:32 +0000 Subject: [PATCH 6/8] docs: update examples --- .stats.yml | 2 +- README.md | 96 ++++++++++++++++++++++++++++--- tests/test_client.py | 132 +++++++++++++++++++++++++++++++++++-------- 3 files changed, 198 insertions(+), 32 deletions(-) diff --git a/.stats.yml b/.stats.yml index 20dba32e..36fa92d0 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ configured_endpoints: 105 openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/llamastack%2Fllama-stack-client-d7bea816190382a93511491e33d1f37f707620926ab133ae8ce0883d763df741.yml openapi_spec_hash: f73b3af77108625edae3f25972b9e665 -config_hash: 5b643c97c83a497d7d346253f1e175f3 +config_hash: 06f95bf1b7786cfe2470af8f238fc36d diff --git a/README.md b/README.md index 76c9b9ae..c8cebcc3 100644 --- a/README.md +++ b/README.md @@ -109,6 +109,50 @@ asyncio.run(main()) Functionality between the synchronous and asynchronous clients is otherwise identical. +## Streaming responses + +We provide support for streaming responses using Server Side Events (SSE). + +```python +from llama_stack_client import LlamaStackClient + +client = LlamaStackClient() + +stream = client.chat.completions.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", + stream=True, +) +for completion in stream: + print(completion) +``` + +The async client uses the exact same interface. + +```python +from llama_stack_client import AsyncLlamaStackClient + +client = AsyncLlamaStackClient() + +stream = await client.chat.completions.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", + stream=True, +) +async for completion in stream: + print(completion) +``` + ## Using types Nested request parameters are [TypedDicts](https://docs.python.org/3/library/typing.html#typing.TypedDict). Responses are [Pydantic models](https://docs.pydantic.dev) which also provide helper methods for things like: @@ -168,7 +212,15 @@ from llama_stack_client import LlamaStackClient client = LlamaStackClient() try: - client.agents.toolgroups.list() + client.chat.completions.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", + ) except llama_stack_client.APIConnectionError as e: print("The server could not be reached") print(e.__cause__) # an underlying Exception, likely raised within httpx. @@ -211,7 +263,15 @@ client = LlamaStackClient( ) # Or, configure per-request: -client.with_options(max_retries=5).toolgroups.list.create() +client.with_options(max_retries=5).chat.completions.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", +) ``` ### Timeouts @@ -234,7 +294,15 @@ client = LlamaStackClient( ) # Override per-request: -client.with_options(timeout=5.0).toolgroups.list.create() +client.with_options(timeout=5.0).chat.completions.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", +) ``` On timeout, an `APITimeoutError` is thrown. 
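The timeout described above surfaces as `APITimeoutError`. A minimal sketch of catching it, assuming the class is exported from the package root alongside the other error types shown in the error-handling section (the test suite's `pytest.raises(APITimeoutError)` checks below rely on the same export):

```python
# Sketch only: assumes APITimeoutError is importable from the package root,
# alongside APIConnectionError and APIStatusError.
import llama_stack_client
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(timeout=5.0)  # 5-second default for every request

try:
    client.chat.completions.create(
        messages=[{"content": "string", "role": "user"}],
        model="model",
    )
except llama_stack_client.APITimeoutError:
    print("The request timed out; retry or raise the client-level timeout.")
```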
@@ -273,11 +341,17 @@ The "raw" Response object can be accessed by prefixing `.with_raw_response.` to from llama_stack_client import LlamaStackClient client = LlamaStackClient() -response = client.toolgroups.with_raw_response.list() +response = client.chat.completions.with_raw_response.create( + messages=[{ + "content": "string", + "role": "user", + }], + model="model", +) print(response.headers.get('X-My-Header')) -toolgroup = response.parse() # get the object that `toolgroups.list()` would have returned -print(toolgroup) +completion = response.parse() # get the object that `chat.completions.create()` would have returned +print(completion) ``` These methods return an [`APIResponse`](https://github.com/meta-llama/llama-stack-python/tree/main/src/llama_stack_client/_response.py) object. @@ -291,7 +365,15 @@ The above interface eagerly reads the full response body when you make the reque To stream the response body, use `.with_streaming_response` instead, which requires a context manager and only reads the response body once you call `.read()`, `.text()`, `.json()`, `.iter_bytes()`, `.iter_text()`, `.iter_lines()` or `.parse()`. In the async client, these are async methods. ```python -with client.agents.toolgroups.with_streaming_response.list() as response: +with client.chat.completions.with_streaming_response.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", +) as response: print(response.headers.get("X-My-Header")) for line in response.iter_lines(): diff --git a/tests/test_client.py b/tests/test_client.py index c5606d5d..708c7420 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -678,20 +678,36 @@ def test_parse_retry_after_header(self, remaining_retries: int, retry_after: str @mock.patch("llama_stack_client._base_client.BaseClient._calculate_retry_timeout", _low_retry_timeout) @pytest.mark.respx(base_url=base_url) def test_retrying_timeout_errors_doesnt_leak(self, respx_mock: MockRouter, client: LlamaStackClient) -> None: - respx_mock.get("/v1/toolgroups").mock(side_effect=httpx.TimeoutException("Test timeout error")) + respx_mock.post("/v1/chat/completions").mock(side_effect=httpx.TimeoutException("Test timeout error")) with pytest.raises(APITimeoutError): - client.toolgroups.with_streaming_response.list().__enter__() + client.chat.completions.with_streaming_response.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", + ).__enter__() assert _get_open_connections(self.client) == 0 @mock.patch("llama_stack_client._base_client.BaseClient._calculate_retry_timeout", _low_retry_timeout) @pytest.mark.respx(base_url=base_url) def test_retrying_status_errors_doesnt_leak(self, respx_mock: MockRouter, client: LlamaStackClient) -> None: - respx_mock.get("/v1/toolgroups").mock(return_value=httpx.Response(500)) + respx_mock.post("/v1/chat/completions").mock(return_value=httpx.Response(500)) with pytest.raises(APIStatusError): - client.toolgroups.with_streaming_response.list().__enter__() + client.chat.completions.with_streaming_response.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", + ).__enter__() assert _get_open_connections(self.client) == 0 @pytest.mark.parametrize("failures_before_success", [0, 2, 4]) @@ -718,9 +734,17 @@ def retry_handler(_request: httpx.Request) -> httpx.Response: return httpx.Response(500) return httpx.Response(200) - respx_mock.get("/v1/toolgroups").mock(side_effect=retry_handler) + 
respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler) - response = client.toolgroups.with_raw_response.list() + response = client.chat.completions.with_raw_response.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", + ) assert response.retries_taken == failures_before_success assert int(response.http_request.headers.get("x-stainless-retry-count")) == failures_before_success @@ -742,9 +766,18 @@ def retry_handler(_request: httpx.Request) -> httpx.Response: return httpx.Response(500) return httpx.Response(200) - respx_mock.get("/v1/toolgroups").mock(side_effect=retry_handler) - - response = client.toolgroups.with_raw_response.list(extra_headers={"x-stainless-retry-count": Omit()}) + respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler) + + response = client.chat.completions.with_raw_response.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", + extra_headers={"x-stainless-retry-count": Omit()}, + ) assert len(response.http_request.headers.get_list("x-stainless-retry-count")) == 0 @@ -765,9 +798,18 @@ def retry_handler(_request: httpx.Request) -> httpx.Response: return httpx.Response(500) return httpx.Response(200) - respx_mock.get("/v1/toolgroups").mock(side_effect=retry_handler) - - response = client.toolgroups.with_raw_response.list(extra_headers={"x-stainless-retry-count": "42"}) + respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler) + + response = client.chat.completions.with_raw_response.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", + extra_headers={"x-stainless-retry-count": "42"}, + ) assert response.http_request.headers.get("x-stainless-retry-count") == "42" @@ -1456,10 +1498,18 @@ async def test_parse_retry_after_header(self, remaining_retries: int, retry_afte async def test_retrying_timeout_errors_doesnt_leak( self, respx_mock: MockRouter, async_client: AsyncLlamaStackClient ) -> None: - respx_mock.get("/v1/toolgroups").mock(side_effect=httpx.TimeoutException("Test timeout error")) + respx_mock.post("/v1/chat/completions").mock(side_effect=httpx.TimeoutException("Test timeout error")) with pytest.raises(APITimeoutError): - await async_client.toolgroups.with_streaming_response.list().__aenter__() + await async_client.chat.completions.with_streaming_response.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", + ).__aenter__() assert _get_open_connections(self.client) == 0 @@ -1468,10 +1518,18 @@ async def test_retrying_timeout_errors_doesnt_leak( async def test_retrying_status_errors_doesnt_leak( self, respx_mock: MockRouter, async_client: AsyncLlamaStackClient ) -> None: - respx_mock.get("/v1/toolgroups").mock(return_value=httpx.Response(500)) + respx_mock.post("/v1/chat/completions").mock(return_value=httpx.Response(500)) with pytest.raises(APIStatusError): - await async_client.toolgroups.with_streaming_response.list().__aenter__() + await async_client.chat.completions.with_streaming_response.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", + ).__aenter__() assert _get_open_connections(self.client) == 0 @pytest.mark.parametrize("failures_before_success", [0, 2, 4]) @@ -1499,9 +1557,17 @@ def retry_handler(_request: httpx.Request) -> httpx.Response: return httpx.Response(500) return httpx.Response(200) - respx_mock.get("/v1/toolgroups").mock(side_effect=retry_handler) + 
respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler) - response = await client.toolgroups.with_raw_response.list() + response = await client.chat.completions.with_raw_response.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", + ) assert response.retries_taken == failures_before_success assert int(response.http_request.headers.get("x-stainless-retry-count")) == failures_before_success @@ -1524,9 +1590,18 @@ def retry_handler(_request: httpx.Request) -> httpx.Response: return httpx.Response(500) return httpx.Response(200) - respx_mock.get("/v1/toolgroups").mock(side_effect=retry_handler) - - response = await client.toolgroups.with_raw_response.list(extra_headers={"x-stainless-retry-count": Omit()}) + respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler) + + response = await client.chat.completions.with_raw_response.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", + extra_headers={"x-stainless-retry-count": Omit()}, + ) assert len(response.http_request.headers.get_list("x-stainless-retry-count")) == 0 @@ -1548,9 +1623,18 @@ def retry_handler(_request: httpx.Request) -> httpx.Response: return httpx.Response(500) return httpx.Response(200) - respx_mock.get("/v1/toolgroups").mock(side_effect=retry_handler) - - response = await client.toolgroups.with_raw_response.list(extra_headers={"x-stainless-retry-count": "42"}) + respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler) + + response = await client.chat.completions.with_raw_response.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", + extra_headers={"x-stainless-retry-count": "42"}, + ) assert response.http_request.headers.get("x-stainless-retry-count") == "42" From 4c75724250abc5a8424f35ff25956132d317c00f Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Tue, 30 Sep 2025 03:39:36 +0000 Subject: [PATCH 7/8] codegen metadata --- .stats.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.stats.yml b/.stats.yml index 36fa92d0..755df453 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ configured_endpoints: 105 openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/llamastack%2Fllama-stack-client-d7bea816190382a93511491e33d1f37f707620926ab133ae8ce0883d763df741.yml openapi_spec_hash: f73b3af77108625edae3f25972b9e665 -config_hash: 06f95bf1b7786cfe2470af8f238fc36d +config_hash: 548f336ac1b68ab1dfe385b79df764dd From 1231814b1aeb959bb43ae2eecb3d6a118b7582be Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Tue, 30 Sep 2025 03:41:14 +0000 Subject: [PATCH 8/8] release: 0.3.0-alpha.1 --- .release-please-manifest.json | 2 +- CHANGELOG.md | 31 +++++++++++++++++++++++++++++++ pyproject.toml | 2 +- 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/.release-please-manifest.json b/.release-please-manifest.json index ed9acd29..1ae25264 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,3 +1,3 @@ { - ".": "0.2.23-alpha.1" + ".": "0.3.0-alpha.1" } diff --git a/CHANGELOG.md b/CHANGELOG.md index 0011c19f..93d68692 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,36 @@ # Changelog +## 0.3.0-alpha.1 (2025-09-30) + +Full Changelog: [v0.2.23-alpha.1...v0.3.0-alpha.1](https://github.com/llamastack/llama-stack-client-python/compare/v0.2.23-alpha.1...v0.3.0-alpha.1) + +### 
⚠ BREAKING CHANGES + +* **api:** fixes to remove deprecated inference resources + +### Features + +* **api:** expires_after changes for /files ([7f24c43](https://github.com/llamastack/llama-stack-client-python/commit/7f24c432dc1859312710a4a1ff4a80f6f861bee8)) +* **api:** fixes to remove deprecated inference resources ([04834d2](https://github.com/llamastack/llama-stack-client-python/commit/04834d2189ae4e4b8cd2c9370d1d39857bc6e9ec)) +* **api:** removing openai/v1 ([a918b43](https://github.com/llamastack/llama-stack-client-python/commit/a918b4323118c18f77c2abe7e1a3054c1eebeaac)) +* **api:** updating post /v1/files to have correct multipart/form-data ([433a996](https://github.com/llamastack/llama-stack-client-python/commit/433a996527bcca131ada4730376d8993f34ad6f5)) + + +### Bug Fixes + +* clean up deprecated code ([f10ead0](https://github.com/llamastack/llama-stack-client-python/commit/f10ead00522b7ca803cd7dc3617da0d451efa7da)) +* Don't retry for non-recoverable server http errors ([#212](https://github.com/llamastack/llama-stack-client-python/issues/212)) ([6782e8f](https://github.com/llamastack/llama-stack-client-python/commit/6782e8fc5931369223ed4446f8e7732f62712eff)) + + +### Documentation + +* update examples ([f896747](https://github.com/llamastack/llama-stack-client-python/commit/f89674726f55915a8cda0e2b4284be3c92978121)) + + +### Build System + +* Bump version to 0.2.23 ([0d4dc64](https://github.com/llamastack/llama-stack-client-python/commit/0d4dc6449224fa2a0f6d20f6229dd9d1a5427861)) + ## 0.2.23-alpha.1 (2025-09-26) Full Changelog: [v0.2.19-alpha.1...v0.2.23-alpha.1](https://github.com/llamastack/llama-stack-client-python/compare/v0.2.19-alpha.1...v0.2.23-alpha.1) diff --git a/pyproject.toml b/pyproject.toml index 843dd9b7..3b50518e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "llama_stack_client" -version = "0.2.23" +version = "0.3.0-alpha.1" description = "The official Python library for the llama-stack-client API" dynamic = ["readme"] license = "MIT"
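The 0.3.0-alpha.1 changelog above calls out the `expires_after` changes for `/v1/files`. A minimal sketch of the updated upload call, based on the `FileCreateParams` definition in this release; the one-hour expiry is an illustrative value inside the documented 3600 to 2592000 second range:

```python
# Sketch of the updated files.create call: `purpose` is now required and
# `expires_after` is optional. Values follow file_create_params.ExpiresAfter.
from pathlib import Path

from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

uploaded_file = client.files.create(
    file=Path("/path/to/file"),
    purpose="assistants",        # "assistants" or "batch"
    expires_after={
        "anchor": "created_at",  # only "created_at" is accepted
        "seconds": 3600,         # 1 hour; maximum is 2592000 (30 days)
    },
)
print(uploaded_file)
```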