From a918b4323118c18f77c2abe7e1a3054c1eebeaac Mon Sep 17 00:00:00 2001
From: "stainless-app[bot]"
 <142633134+stainless-app[bot]@users.noreply.github.com>
Date: Sat, 27 Sep 2025 20:42:37 +0000
Subject: [PATCH 1/8] feat(api): removing openai/v1
---
 .stats.yml                                    |   8 +-
 README.md                                     |  42 ++
 api.md                                        |  64 +-
 .../resources/chat/completions.py             |  12 +-
 .../resources/completions.py                  |   4 +-
 .../resources/embeddings.py                   |   4 +-
 src/llama_stack_client/resources/files.py     |  36 +-
 src/llama_stack_client/resources/inference.py | 351 +----------
 .../resources/models/models.py                |   4 +-
 .../resources/models/openai.py                |   4 +-
 .../resources/moderations.py                  |   4 +-
 .../resources/responses/input_items.py        |   4 +-
 .../resources/responses/responses.py          |  16 +-
 .../resources/vector_stores/files.py          |  24 +-
 .../resources/vector_stores/vector_stores.py  |  24 +-
 src/llama_stack_client/types/__init__.py      |  11 -
 .../types/completion_response.py              |  24 -
 .../types/file_create_params.py               |   5 +
 .../inference_batch_chat_completion_params.py |  85 ---
 ...nference_batch_chat_completion_response.py |  13 -
 .../inference_batch_completion_params.py      |  41 --
 .../types/inference_completion_params.py      |  65 --
 .../types/list_models_response.py             |  10 -
 .../types/model_list_response.py              |  19 +-
 .../types/shared/__init__.py                  |   1 -
 .../types/shared/batch_completion.py          |  13 -
 .../types/shared/tool_param_definition.py     |   4 +
 .../shared_params/tool_param_definition.py    |   4 +
 src/llama_stack_client/types/tool.py          |   6 +
 src/llama_stack_client/types/tool_def.py      |   6 +
 .../types/tool_def_param.py                   |   6 +
 tests/api_resources/test_agents.py            |   4 +
 tests/api_resources/test_files.py             |  12 +
 tests/api_resources/test_inference.py         | 564 +-----------------
 34 files changed, 239 insertions(+), 1255 deletions(-)
 delete mode 100644 src/llama_stack_client/types/completion_response.py
 delete mode 100644 src/llama_stack_client/types/inference_batch_chat_completion_params.py
 delete mode 100644 src/llama_stack_client/types/inference_batch_chat_completion_response.py
 delete mode 100644 src/llama_stack_client/types/inference_batch_completion_params.py
 delete mode 100644 src/llama_stack_client/types/inference_completion_params.py
 delete mode 100644 src/llama_stack_client/types/list_models_response.py
 delete mode 100644 src/llama_stack_client/types/shared/batch_completion.py
diff --git a/.stats.yml b/.stats.yml
index fa9edfc7..e5bf0be0 100644
--- a/.stats.yml
+++ b/.stats.yml
@@ -1,4 +1,4 @@
-configured_endpoints: 111
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/llamastack%2Fllama-stack-client-f252873ea1e1f38fd207331ef2621c511154d5be3f4076e59cc15754fc58eee4.yml
-openapi_spec_hash: 10cbb4337a06a9fdd7d08612dd6044c3
-config_hash: 0358112cc0f3d880b4d55debdbe1cfa3
+configured_endpoints: 107
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/llamastack%2Fllama-stack-client-1eddf141208c131ee4a64ef996f8f419b444f60450de6807a9f6bc711ed8b661.yml
+openapi_spec_hash: 94765c67ea99b1358169d41d810dd395
+config_hash: 7ec5a583f9c26b38993013bdfb0e7d46
diff --git a/README.md b/README.md
index 928458d2..cbd6bf78 100644
--- a/README.md
+++ b/README.md
@@ -118,6 +118,48 @@ Nested request parameters are [TypedDicts](https://docs.python.org/3/library/typ
 
 Typed requests and responses provide autocomplete and documentation within your editor. If you would like to see type errors in VS Code to help catch bugs earlier, set `python.analysis.typeCheckingMode` to `basic`.
 
+## Nested params
+
+Nested parameters are dictionaries, typed using `TypedDict`, for example:
+
+```python
+from llama_stack_client import LlamaStackClient
+
+client = LlamaStackClient()
+
+chat_completion_response = client.inference.chat_completion(
+    messages=[
+        {
+            "content": "string",
+            "role": "user",
+        }
+    ],
+    model_id="model_id",
+    logprobs={},
+)
+print(chat_completion_response.logprobs)
+```
+
+## File uploads
+
+Request parameters that correspond to file uploads can be passed as `bytes`, a [`PathLike`](https://docs.python.org/3/library/os.html#os.PathLike) instance, or a tuple of `(filename, contents, media type)`.
+
+```python
+from pathlib import Path
+from llama_stack_client import LlamaStackClient
+
+client = LlamaStackClient()
+
+client.files.create(
+    expires_after_anchor="expires_after_anchor",
+    expires_after_seconds=0,
+    file=Path("/path/to/file"),
+    purpose="assistants",
+)
+```
+
+The async client uses the exact same interface. If you pass a [`PathLike`](https://docs.python.org/3/library/os.html#os.PathLike) instance, the file contents will automatically be read asynchronously.
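+
+For example, here is a minimal sketch of an async upload (it assumes the package's `AsyncLlamaStackClient`, which mirrors the synchronous client shown above):
+
+```python
+import asyncio
+from pathlib import Path
+
+from llama_stack_client import AsyncLlamaStackClient
+
+client = AsyncLlamaStackClient()
+
+
+async def main() -> None:
+    # The contents of the PathLike are read asynchronously before the request is sent.
+    await client.files.create(
+        expires_after_anchor="expires_after_anchor",
+        expires_after_seconds=0,
+        file=Path("/path/to/file"),
+        purpose="assistants",
+    )
+
+
+asyncio.run(main())
+```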
+
 ## Handling errors
 
 When the library is unable to connect to the API (for example, due to network connection problems or a timeout), a subclass of `llama_stack_client.APIConnectionError` is raised.
diff --git a/api.md b/api.md
index 22c2120f..97d40b31 100644
--- a/api.md
+++ b/api.md
@@ -3,7 +3,6 @@
 ```python
 from llama_stack_client.types import (
     AgentConfig,
-    BatchCompletion,
     ChatCompletionResponse,
     CompletionMessage,
     ContentDelta,
@@ -91,10 +90,10 @@ from llama_stack_client.types import (
 
 Methods:
 
-- client.responses.create(\*\*params) -> ResponseObject
-- client.responses.retrieve(response_id) -> ResponseObject
-- client.responses.list(\*\*params) -> SyncOpenAICursorPage[ResponseListResponse]
-- client.responses.delete(response_id) -> ResponseDeleteResponse
+- client.responses.create(\*\*params) -> ResponseObject
+- client.responses.retrieve(response_id) -> ResponseObject
+- client.responses.list(\*\*params) -> SyncOpenAICursorPage[ResponseListResponse]
+- client.responses.delete(response_id) -> ResponseDeleteResponse
 
 ## InputItems
 
@@ -106,7 +105,7 @@ from llama_stack_client.types.responses import InputItemListResponse
 
 Methods:
 
-- client.responses.input_items.list(response_id, \*\*params) -> InputItemListResponse
+- client.responses.input_items.list(response_id, \*\*params) -> InputItemListResponse
 
 # Agents
 
@@ -244,20 +243,15 @@ Types:
 ```python
 from llama_stack_client.types import (
     ChatCompletionResponseStreamChunk,
-    CompletionResponse,
     EmbeddingsResponse,
     TokenLogProbs,
-    InferenceBatchChatCompletionResponse,
     InferenceRerankResponse,
 )
 ```
 
 Methods:
 
-- client.inference.batch_chat_completion(\*\*params) -> InferenceBatchChatCompletionResponse
-- client.inference.batch_completion(\*\*params) -> BatchCompletion
 - client.inference.chat_completion(\*\*params) -> ChatCompletionResponse
-- client.inference.completion(\*\*params) -> CompletionResponse
 - client.inference.embeddings(\*\*params) -> EmbeddingsResponse
 - client.inference.rerank(\*\*params) -> InferenceRerankResponse
 
@@ -271,7 +265,7 @@ from llama_stack_client.types import CreateEmbeddingsResponse
 
 Methods:
 
-- client.embeddings.create(\*\*params) -> CreateEmbeddingsResponse
+- client.embeddings.create(\*\*params) -> CreateEmbeddingsResponse
 
 # Chat
 
@@ -295,9 +289,9 @@ from llama_stack_client.types.chat import (
 
 Methods:
 
-- client.chat.completions.create(\*\*params) -> CompletionCreateResponse
-- client.chat.completions.retrieve(completion_id) -> CompletionRetrieveResponse
-- client.chat.completions.list(\*\*params) -> SyncOpenAICursorPage[CompletionListResponse]
+- client.chat.completions.create(\*\*params) -> CompletionCreateResponse
+- client.chat.completions.retrieve(completion_id) -> CompletionRetrieveResponse
+- client.chat.completions.list(\*\*params) -> SyncOpenAICursorPage[CompletionListResponse]
 
 # Completions
 
@@ -309,7 +303,7 @@ from llama_stack_client.types import CompletionCreateResponse
 
 Methods:
 
-- client.completions.create(\*\*params) -> CompletionCreateResponse
+- client.completions.create(\*\*params) -> CompletionCreateResponse
 
 # VectorIo
 
@@ -359,12 +353,12 @@ from llama_stack_client.types import (
 
 Methods:
 
-- client.vector_stores.create(\*\*params) -> VectorStore
-- client.vector_stores.retrieve(vector_store_id) -> VectorStore
-- client.vector_stores.update(vector_store_id, \*\*params) -> VectorStore
-- client.vector_stores.list(\*\*params) -> SyncOpenAICursorPage[VectorStore]
-- client.vector_stores.delete(vector_store_id) -> VectorStoreDeleteResponse
-- client.vector_stores.search(vector_store_id, \*\*params) -> VectorStoreSearchResponse
+- client.vector_stores.create(\*\*params) -> VectorStore
+- client.vector_stores.retrieve(vector_store_id) -> VectorStore
+- client.vector_stores.update(vector_store_id, \*\*params) -> VectorStore
+- client.vector_stores.list(\*\*params) -> SyncOpenAICursorPage[VectorStore]
+- client.vector_stores.delete(vector_store_id) -> VectorStoreDeleteResponse
+- client.vector_stores.search(vector_store_id, \*\*params) -> VectorStoreSearchResponse
 
 ## Files
 
@@ -380,12 +374,12 @@ from llama_stack_client.types.vector_stores import (
 
 Methods:
 
-- client.vector_stores.files.create(vector_store_id, \*\*params) -> VectorStoreFile
-- client.vector_stores.files.retrieve(file_id, \*, vector_store_id) -> VectorStoreFile
-- client.vector_stores.files.update(file_id, \*, vector_store_id, \*\*params) -> VectorStoreFile
-- client.vector_stores.files.list(vector_store_id, \*\*params) -> SyncOpenAICursorPage[VectorStoreFile]
-- client.vector_stores.files.delete(file_id, \*, vector_store_id) -> FileDeleteResponse
-- client.vector_stores.files.content(file_id, \*, vector_store_id) -> FileContentResponse
+- client.vector_stores.files.create(vector_store_id, \*\*params) -> VectorStoreFile
+- client.vector_stores.files.retrieve(file_id, \*, vector_store_id) -> VectorStoreFile
+- client.vector_stores.files.update(file_id, \*, vector_store_id, \*\*params) -> VectorStoreFile
+- client.vector_stores.files.list(vector_store_id, \*\*params) -> SyncOpenAICursorPage[VectorStoreFile]
+- client.vector_stores.files.delete(file_id, \*, vector_store_id) -> FileDeleteResponse
+- client.vector_stores.files.content(file_id, \*, vector_store_id) -> FileContentResponse
 
 # Models
 
@@ -412,7 +406,7 @@ from llama_stack_client.types.models import OpenAIListResponse
 
 Methods:
 
-- client.models.openai.list() -> OpenAIListResponse
+- client.models.openai.list() -> OpenAIListResponse
 
 # PostTraining
 
@@ -481,7 +475,7 @@ from llama_stack_client.types import CreateResponse
 
 Methods:
 
-- client.moderations.create(\*\*params) -> CreateResponse
+- client.moderations.create(\*\*params) -> CreateResponse
 
 # Safety
 
@@ -608,8 +602,8 @@ from llama_stack_client.types import DeleteFileResponse, File, ListFilesResponse
 
 Methods:
 
-- client.files.create(\*\*params) -> File
-- client.files.retrieve(file_id) -> File
-- client.files.list(\*\*params) -> SyncOpenAICursorPage[File]
-- client.files.delete(file_id) -> DeleteFileResponse
-- client.files.content(file_id) -> object
+- client.files.create(\*\*params) -> File
+- client.files.retrieve(file_id) -> File
+- client.files.list(\*\*params) -> SyncOpenAICursorPage[File]
+- client.files.delete(file_id) -> DeleteFileResponse
+- client.files.content(file_id) -> object
diff --git a/src/llama_stack_client/resources/chat/completions.py b/src/llama_stack_client/resources/chat/completions.py
index 5445a2d1..2fb19980 100644
--- a/src/llama_stack_client/resources/chat/completions.py
+++ b/src/llama_stack_client/resources/chat/completions.py
@@ -372,7 +372,7 @@ def create(
         timeout: float | httpx.Timeout | None | NotGiven = not_given,
     ) -> CompletionCreateResponse | Stream[ChatCompletionChunk]:
         return self._post(
-            "/v1/openai/v1/chat/completions",
+            "/v1/chat/completions",
             body=maybe_transform(
                 {
                     "messages": messages,
@@ -439,7 +439,7 @@ def retrieve(
         if not completion_id:
             raise ValueError(f"Expected a non-empty value for `completion_id` but received {completion_id!r}")
         return self._get(
-            f"/v1/openai/v1/chat/completions/{completion_id}",
+            f"/v1/chat/completions/{completion_id}",
             options=make_request_options(
                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
             ),
@@ -481,7 +481,7 @@ def list(
           timeout: Override the client-level default timeout for this request, in seconds
         """
         return self._get_api_list(
-            "/v1/openai/v1/chat/completions",
+            "/v1/chat/completions",
             page=SyncOpenAICursorPage[CompletionListResponse],
             options=make_request_options(
                 extra_headers=extra_headers,
@@ -845,7 +845,7 @@ async def create(
         timeout: float | httpx.Timeout | None | NotGiven = not_given,
     ) -> CompletionCreateResponse | AsyncStream[ChatCompletionChunk]:
         return await self._post(
-            "/v1/openai/v1/chat/completions",
+            "/v1/chat/completions",
             body=await async_maybe_transform(
                 {
                     "messages": messages,
@@ -912,7 +912,7 @@ async def retrieve(
         if not completion_id:
             raise ValueError(f"Expected a non-empty value for `completion_id` but received {completion_id!r}")
         return await self._get(
-            f"/v1/openai/v1/chat/completions/{completion_id}",
+            f"/v1/chat/completions/{completion_id}",
             options=make_request_options(
                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
             ),
@@ -954,7 +954,7 @@ def list(
           timeout: Override the client-level default timeout for this request, in seconds
         """
         return self._get_api_list(
-            "/v1/openai/v1/chat/completions",
+            "/v1/chat/completions",
             page=AsyncOpenAICursorPage[CompletionListResponse],
             options=make_request_options(
                 extra_headers=extra_headers,
diff --git a/src/llama_stack_client/resources/completions.py b/src/llama_stack_client/resources/completions.py
index 2c1475de..caeab7a1 100644
--- a/src/llama_stack_client/resources/completions.py
+++ b/src/llama_stack_client/resources/completions.py
@@ -326,7 +326,7 @@ def create(
         timeout: float | httpx.Timeout | None | NotGiven = not_given,
     ) -> CompletionCreateResponse | Stream[CompletionCreateResponse]:
         return self._post(
-            "/v1/openai/v1/completions",
+            "/v1/completions",
             body=maybe_transform(
                 {
                     "model": model,
@@ -664,7 +664,7 @@ async def create(
         timeout: float | httpx.Timeout | None | NotGiven = not_given,
     ) -> CompletionCreateResponse | AsyncStream[CompletionCreateResponse]:
         return await self._post(
-            "/v1/openai/v1/completions",
+            "/v1/completions",
             body=await async_maybe_transform(
                 {
                     "model": model,
diff --git a/src/llama_stack_client/resources/embeddings.py b/src/llama_stack_client/resources/embeddings.py
index 60c38cb2..29cd69d8 100644
--- a/src/llama_stack_client/resources/embeddings.py
+++ b/src/llama_stack_client/resources/embeddings.py
@@ -87,7 +87,7 @@ def create(
           timeout: Override the client-level default timeout for this request, in seconds
         """
         return self._post(
-            "/v1/openai/v1/embeddings",
+            "/v1/embeddings",
             body=maybe_transform(
                 {
                     "input": input,
@@ -169,7 +169,7 @@ async def create(
           timeout: Override the client-level default timeout for this request, in seconds
         """
         return await self._post(
-            "/v1/openai/v1/embeddings",
+            "/v1/embeddings",
             body=await async_maybe_transform(
                 {
                     "input": input,
diff --git a/src/llama_stack_client/resources/files.py b/src/llama_stack_client/resources/files.py
index 6b395e52..e8f20d35 100644
--- a/src/llama_stack_client/resources/files.py
+++ b/src/llama_stack_client/resources/files.py
@@ -2,7 +2,7 @@
 
 from __future__ import annotations
 
-from typing import Mapping, cast
+from typing import Mapping, Optional, cast
 from typing_extensions import Literal
 
 import httpx
@@ -49,6 +49,8 @@ def with_streaming_response(self) -> FilesResourceWithStreamingResponse:
     def create(
         self,
         *,
+        expires_after_anchor: Optional[str],
+        expires_after_seconds: Optional[int],
         file: FileTypes,
         purpose: Literal["assistants", "batch"],
         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
@@ -65,6 +67,9 @@ def create(
 
         - file: The File object (not file name) to be uploaded.
         - purpose: The intended purpose of the uploaded file.
+        - expires_after: Optional form values describing expiration for the file.
+          Expected expires_after[anchor] = "created_at", expires_after[seconds] =
+          {integer}. Seconds must be between 3600 and 2592000 (1 hour to 30 days).
 
         Args:
           purpose: Valid purpose values for OpenAI Files API.
@@ -79,6 +84,8 @@ def create(
         """
         body = deepcopy_minimal(
             {
+                "expires_after_anchor": expires_after_anchor,
+                "expires_after_seconds": expires_after_seconds,
                 "file": file,
                 "purpose": purpose,
             }
@@ -89,7 +96,7 @@ def create(
         # multipart/form-data; boundary=---abc--
         extra_headers = {"Content-Type": "multipart/form-data", **(extra_headers or {})}
         return self._post(
-            "/v1/openai/v1/files",
+            "/v1/files",
             body=maybe_transform(body, file_create_params.FileCreateParams),
             files=files,
             options=make_request_options(
@@ -124,7 +131,7 @@ def retrieve(
         if not file_id:
             raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
         return self._get(
-            f"/v1/openai/v1/files/{file_id}",
+            f"/v1/files/{file_id}",
             options=make_request_options(
                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
             ),
@@ -171,7 +178,7 @@ def list(
           timeout: Override the client-level default timeout for this request, in seconds
         """
         return self._get_api_list(
-            "/v1/openai/v1/files",
+            "/v1/files",
             page=SyncOpenAICursorPage[File],
             options=make_request_options(
                 extra_headers=extra_headers,
@@ -217,7 +224,7 @@ def delete(
         if not file_id:
             raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
         return self._delete(
-            f"/v1/openai/v1/files/{file_id}",
+            f"/v1/files/{file_id}",
             options=make_request_options(
                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
             ),
@@ -250,7 +257,7 @@ def content(
         if not file_id:
             raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
         return self._get(
-            f"/v1/openai/v1/files/{file_id}/content",
+            f"/v1/files/{file_id}/content",
             options=make_request_options(
                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
             ),
@@ -281,6 +288,8 @@ def with_streaming_response(self) -> AsyncFilesResourceWithStreamingResponse:
     async def create(
         self,
         *,
+        expires_after_anchor: Optional[str],
+        expires_after_seconds: Optional[int],
         file: FileTypes,
         purpose: Literal["assistants", "batch"],
         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
@@ -297,6 +306,9 @@ async def create(
 
         - file: The File object (not file name) to be uploaded.
         - purpose: The intended purpose of the uploaded file.
+        - expires_after: Optional form values describing expiration for the file.
+          Expected expires_after[anchor] = "created_at", expires_after[seconds] =
+          {integer}. Seconds must be between 3600 and 2592000 (1 hour to 30 days).
 
         Args:
           purpose: Valid purpose values for OpenAI Files API.
@@ -311,6 +323,8 @@ async def create(
         """
         body = deepcopy_minimal(
             {
+                "expires_after_anchor": expires_after_anchor,
+                "expires_after_seconds": expires_after_seconds,
                 "file": file,
                 "purpose": purpose,
             }
@@ -321,7 +335,7 @@ async def create(
         # multipart/form-data; boundary=---abc--
         extra_headers = {"Content-Type": "multipart/form-data", **(extra_headers or {})}
         return await self._post(
-            "/v1/openai/v1/files",
+            "/v1/files",
             body=await async_maybe_transform(body, file_create_params.FileCreateParams),
             files=files,
             options=make_request_options(
@@ -356,7 +370,7 @@ async def retrieve(
         if not file_id:
             raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
         return await self._get(
-            f"/v1/openai/v1/files/{file_id}",
+            f"/v1/files/{file_id}",
             options=make_request_options(
                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
             ),
@@ -403,7 +417,7 @@ def list(
           timeout: Override the client-level default timeout for this request, in seconds
         """
         return self._get_api_list(
-            "/v1/openai/v1/files",
+            "/v1/files",
             page=AsyncOpenAICursorPage[File],
             options=make_request_options(
                 extra_headers=extra_headers,
@@ -449,7 +463,7 @@ async def delete(
         if not file_id:
             raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
         return await self._delete(
-            f"/v1/openai/v1/files/{file_id}",
+            f"/v1/files/{file_id}",
             options=make_request_options(
                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
             ),
@@ -482,7 +496,7 @@ async def content(
         if not file_id:
             raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
         return await self._get(
-            f"/v1/openai/v1/files/{file_id}/content",
+            f"/v1/files/{file_id}/content",
             options=make_request_options(
                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
             ),
diff --git a/src/llama_stack_client/resources/inference.py b/src/llama_stack_client/resources/inference.py
index 732025cc..9a2b6c50 100644
--- a/src/llama_stack_client/resources/inference.py
+++ b/src/llama_stack_client/resources/inference.py
@@ -8,14 +8,7 @@
 
 import httpx
 
-from ..types import (
-    inference_rerank_params,
-    inference_completion_params,
-    inference_embeddings_params,
-    inference_chat_completion_params,
-    inference_batch_completion_params,
-    inference_batch_chat_completion_params,
-)
+from ..types import inference_rerank_params, inference_embeddings_params, inference_chat_completion_params
 from .._types import Body, Omit, Query, Headers, NotGiven, SequenceNotStr, omit, not_given
 from .._utils import required_args, maybe_transform, async_maybe_transform
 from .._compat import cached_property
@@ -29,18 +22,14 @@
 from .._wrappers import DataWrapper
 from .._streaming import Stream, AsyncStream
 from .._base_client import make_request_options
-from ..types.completion_response import CompletionResponse
 from ..types.embeddings_response import EmbeddingsResponse
 from ..types.shared_params.message import Message
-from ..types.shared.batch_completion import BatchCompletion
 from ..types.inference_rerank_response import InferenceRerankResponse
 from ..types.shared_params.response_format import ResponseFormat
 from ..types.shared_params.sampling_params import SamplingParams
 from ..types.shared.chat_completion_response import ChatCompletionResponse
-from ..types.shared_params.interleaved_content import InterleavedContent
 from ..types.chat_completion_response_stream_chunk import ChatCompletionResponseStreamChunk
 from ..types.shared_params.interleaved_content_item import InterleavedContentItem
-from ..types.inference_batch_chat_completion_response import InferenceBatchChatCompletionResponse
 
 __all__ = ["InferenceResource", "AsyncInferenceResource"]
 
@@ -65,131 +54,7 @@ def with_streaming_response(self) -> InferenceResourceWithStreamingResponse:
         """
         return InferenceResourceWithStreamingResponse(self)
 
-    def batch_chat_completion(
-        self,
-        *,
-        messages_batch: Iterable[Iterable[Message]],
-        model_id: str,
-        logprobs: inference_batch_chat_completion_params.Logprobs | Omit = omit,
-        response_format: ResponseFormat | Omit = omit,
-        sampling_params: SamplingParams | Omit = omit,
-        tool_config: inference_batch_chat_completion_params.ToolConfig | Omit = omit,
-        tools: Iterable[inference_batch_chat_completion_params.Tool] | Omit = omit,
-        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
-        # The extra values given here take precedence over values defined on the client or passed to this method.
-        extra_headers: Headers | None = None,
-        extra_query: Query | None = None,
-        extra_body: Body | None = None,
-        timeout: float | httpx.Timeout | None | NotGiven = not_given,
-    ) -> InferenceBatchChatCompletionResponse:
-        """
-        Generate chat completions for a batch of messages using the specified model.
-
-        Args:
-          messages_batch: The messages to generate completions for.
-
-          model_id: The identifier of the model to use. The model must be registered with Llama
-              Stack and available via the /models endpoint.
-
-          logprobs: (Optional) If specified, log probabilities for each token position will be
-              returned.
-
-          response_format: (Optional) Grammar specification for guided (structured) decoding.
-
-          sampling_params: (Optional) Parameters to control the sampling strategy.
-
-          tool_config: (Optional) Configuration for tool use.
-
-          tools: (Optional) List of tool definitions available to the model.
-
-          extra_headers: Send extra headers
-
-          extra_query: Add additional query parameters to the request
-
-          extra_body: Add additional JSON properties to the request
-
-          timeout: Override the client-level default timeout for this request, in seconds
-        """
-        return self._post(
-            "/v1/inference/batch-chat-completion",
-            body=maybe_transform(
-                {
-                    "messages_batch": messages_batch,
-                    "model_id": model_id,
-                    "logprobs": logprobs,
-                    "response_format": response_format,
-                    "sampling_params": sampling_params,
-                    "tool_config": tool_config,
-                    "tools": tools,
-                },
-                inference_batch_chat_completion_params.InferenceBatchChatCompletionParams,
-            ),
-            options=make_request_options(
-                extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
-            ),
-            cast_to=InferenceBatchChatCompletionResponse,
-        )
-
-    def batch_completion(
-        self,
-        *,
-        content_batch: SequenceNotStr[InterleavedContent],
-        model_id: str,
-        logprobs: inference_batch_completion_params.Logprobs | Omit = omit,
-        response_format: ResponseFormat | Omit = omit,
-        sampling_params: SamplingParams | Omit = omit,
-        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
-        # The extra values given here take precedence over values defined on the client or passed to this method.
-        extra_headers: Headers | None = None,
-        extra_query: Query | None = None,
-        extra_body: Body | None = None,
-        timeout: float | httpx.Timeout | None | NotGiven = not_given,
-    ) -> BatchCompletion:
-        """
-        Generate completions for a batch of content using the specified model.
-
-        Args:
-          content_batch: The content to generate completions for.
-
-          model_id: The identifier of the model to use. The model must be registered with Llama
-              Stack and available via the /models endpoint.
-
-          logprobs: (Optional) If specified, log probabilities for each token position will be
-              returned.
-
-          response_format: (Optional) Grammar specification for guided (structured) decoding.
-
-          sampling_params: (Optional) Parameters to control the sampling strategy.
-
-          extra_headers: Send extra headers
-
-          extra_query: Add additional query parameters to the request
-
-          extra_body: Add additional JSON properties to the request
-
-          timeout: Override the client-level default timeout for this request, in seconds
-        """
-        return self._post(
-            "/v1/inference/batch-completion",
-            body=maybe_transform(
-                {
-                    "content_batch": content_batch,
-                    "model_id": model_id,
-                    "logprobs": logprobs,
-                    "response_format": response_format,
-                    "sampling_params": sampling_params,
-                },
-                inference_batch_completion_params.InferenceBatchCompletionParams,
-            ),
-            options=make_request_options(
-                extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
-            ),
-            cast_to=BatchCompletion,
-        )
-
-    @typing_extensions.deprecated(
-        "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions."
-    )
+    @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.")
     @overload
     def chat_completion(
         self,
@@ -258,9 +123,7 @@ def chat_completion(
         """
         ...
 
-    @typing_extensions.deprecated(
-        "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions."
-    )
+    @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.")
     @overload
     def chat_completion(
         self,
@@ -329,9 +192,7 @@ def chat_completion(
         """
         ...
 
-    @typing_extensions.deprecated(
-        "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions."
-    )
+    @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.")
     @overload
     def chat_completion(
         self,
@@ -400,9 +261,7 @@ def chat_completion(
         """
         ...
 
-    @typing_extensions.deprecated(
-        "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions."
-    )
+    @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.")
     @required_args(["messages", "model_id"], ["messages", "model_id", "stream"])
     def chat_completion(
         self,
@@ -453,7 +312,7 @@ def chat_completion(
             stream_cls=Stream[ChatCompletionResponseStreamChunk],
         )
 
-    @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.")
+    @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.")
     @overload
     def completion(
         self,
@@ -500,7 +359,7 @@ def completion(
         """
         ...
 
-    @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.")
+    @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.")
     @overload
     def completion(
         self,
@@ -547,7 +406,7 @@ def completion(
         """
         ...
 
-    @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.")
+    @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.")
     @overload
     def completion(
         self,
@@ -594,7 +453,7 @@ def completion(
         """
         ...
 
-    @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.")
+    @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.")
     @required_args(["content", "model_id"], ["content", "model_id", "stream"])
     def completion(
         self,
@@ -637,7 +496,7 @@ def completion(
             stream_cls=Stream[CompletionResponse],
         )
 
-    @typing_extensions.deprecated("/v1/inference/embeddings is deprecated. Please use /v1/openai/v1/embeddings.")
+    @typing_extensions.deprecated("/v1/inference/embeddings is deprecated. Please use /v1/embeddings.")
     def embeddings(
         self,
         *,
@@ -778,131 +637,7 @@ def with_streaming_response(self) -> AsyncInferenceResourceWithStreamingResponse
         """
         return AsyncInferenceResourceWithStreamingResponse(self)
 
-    async def batch_chat_completion(
-        self,
-        *,
-        messages_batch: Iterable[Iterable[Message]],
-        model_id: str,
-        logprobs: inference_batch_chat_completion_params.Logprobs | Omit = omit,
-        response_format: ResponseFormat | Omit = omit,
-        sampling_params: SamplingParams | Omit = omit,
-        tool_config: inference_batch_chat_completion_params.ToolConfig | Omit = omit,
-        tools: Iterable[inference_batch_chat_completion_params.Tool] | Omit = omit,
-        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
-        # The extra values given here take precedence over values defined on the client or passed to this method.
-        extra_headers: Headers | None = None,
-        extra_query: Query | None = None,
-        extra_body: Body | None = None,
-        timeout: float | httpx.Timeout | None | NotGiven = not_given,
-    ) -> InferenceBatchChatCompletionResponse:
-        """
-        Generate chat completions for a batch of messages using the specified model.
-
-        Args:
-          messages_batch: The messages to generate completions for.
-
-          model_id: The identifier of the model to use. The model must be registered with Llama
-              Stack and available via the /models endpoint.
-
-          logprobs: (Optional) If specified, log probabilities for each token position will be
-              returned.
-
-          response_format: (Optional) Grammar specification for guided (structured) decoding.
-
-          sampling_params: (Optional) Parameters to control the sampling strategy.
-
-          tool_config: (Optional) Configuration for tool use.
-
-          tools: (Optional) List of tool definitions available to the model.
-
-          extra_headers: Send extra headers
-
-          extra_query: Add additional query parameters to the request
-
-          extra_body: Add additional JSON properties to the request
-
-          timeout: Override the client-level default timeout for this request, in seconds
-        """
-        return await self._post(
-            "/v1/inference/batch-chat-completion",
-            body=await async_maybe_transform(
-                {
-                    "messages_batch": messages_batch,
-                    "model_id": model_id,
-                    "logprobs": logprobs,
-                    "response_format": response_format,
-                    "sampling_params": sampling_params,
-                    "tool_config": tool_config,
-                    "tools": tools,
-                },
-                inference_batch_chat_completion_params.InferenceBatchChatCompletionParams,
-            ),
-            options=make_request_options(
-                extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
-            ),
-            cast_to=InferenceBatchChatCompletionResponse,
-        )
-
-    async def batch_completion(
-        self,
-        *,
-        content_batch: SequenceNotStr[InterleavedContent],
-        model_id: str,
-        logprobs: inference_batch_completion_params.Logprobs | Omit = omit,
-        response_format: ResponseFormat | Omit = omit,
-        sampling_params: SamplingParams | Omit = omit,
-        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
-        # The extra values given here take precedence over values defined on the client or passed to this method.
-        extra_headers: Headers | None = None,
-        extra_query: Query | None = None,
-        extra_body: Body | None = None,
-        timeout: float | httpx.Timeout | None | NotGiven = not_given,
-    ) -> BatchCompletion:
-        """
-        Generate completions for a batch of content using the specified model.
-
-        Args:
-          content_batch: The content to generate completions for.
-
-          model_id: The identifier of the model to use. The model must be registered with Llama
-              Stack and available via the /models endpoint.
-
-          logprobs: (Optional) If specified, log probabilities for each token position will be
-              returned.
-
-          response_format: (Optional) Grammar specification for guided (structured) decoding.
-
-          sampling_params: (Optional) Parameters to control the sampling strategy.
-
-          extra_headers: Send extra headers
-
-          extra_query: Add additional query parameters to the request
-
-          extra_body: Add additional JSON properties to the request
-
-          timeout: Override the client-level default timeout for this request, in seconds
-        """
-        return await self._post(
-            "/v1/inference/batch-completion",
-            body=await async_maybe_transform(
-                {
-                    "content_batch": content_batch,
-                    "model_id": model_id,
-                    "logprobs": logprobs,
-                    "response_format": response_format,
-                    "sampling_params": sampling_params,
-                },
-                inference_batch_completion_params.InferenceBatchCompletionParams,
-            ),
-            options=make_request_options(
-                extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
-            ),
-            cast_to=BatchCompletion,
-        )
-
-    @typing_extensions.deprecated(
-        "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions."
-    )
+    @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.")
     @overload
     async def chat_completion(
         self,
@@ -971,9 +706,7 @@ async def chat_completion(
         """
         ...
 
-    @typing_extensions.deprecated(
-        "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions."
-    )
+    @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.")
     @overload
     async def chat_completion(
         self,
@@ -1042,9 +775,7 @@ async def chat_completion(
         """
         ...
 
-    @typing_extensions.deprecated(
-        "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions."
-    )
+    @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.")
     @overload
     async def chat_completion(
         self,
@@ -1113,9 +844,7 @@ async def chat_completion(
         """
         ...
 
-    @typing_extensions.deprecated(
-        "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions."
-    )
+    @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.")
     @required_args(["messages", "model_id"], ["messages", "model_id", "stream"])
     async def chat_completion(
         self,
@@ -1166,7 +895,7 @@ async def chat_completion(
             stream_cls=AsyncStream[ChatCompletionResponseStreamChunk],
         )
 
-    @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.")
+    @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.")
     @overload
     async def completion(
         self,
@@ -1260,7 +989,7 @@ async def completion(
         """
         ...
 
-    @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.")
+    @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.")
     @overload
     async def completion(
         self,
@@ -1307,7 +1036,7 @@ async def completion(
         """
         ...
 
-    @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.")
+    @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.")
     @required_args(["content", "model_id"], ["content", "model_id", "stream"])
     async def completion(
         self,
@@ -1350,7 +1079,7 @@ async def completion(
             stream_cls=AsyncStream[CompletionResponse],
         )
 
-    @typing_extensions.deprecated("/v1/inference/embeddings is deprecated. Please use /v1/openai/v1/embeddings.")
+    @typing_extensions.deprecated("/v1/inference/embeddings is deprecated. Please use /v1/embeddings.")
     async def embeddings(
         self,
         *,
@@ -1475,22 +1204,11 @@ class InferenceResourceWithRawResponse:
     def __init__(self, inference: InferenceResource) -> None:
         self._inference = inference
 
-        self.batch_chat_completion = to_raw_response_wrapper(
-            inference.batch_chat_completion,
-        )
-        self.batch_completion = to_raw_response_wrapper(
-            inference.batch_completion,
-        )
         self.chat_completion = (  # pyright: ignore[reportDeprecated]
             to_raw_response_wrapper(
                 inference.chat_completion,  # pyright: ignore[reportDeprecated],
             )
         )
-        self.completion = (  # pyright: ignore[reportDeprecated]
-            to_raw_response_wrapper(
-                inference.completion,  # pyright: ignore[reportDeprecated],
-            )
-        )
         self.embeddings = (  # pyright: ignore[reportDeprecated]
             to_raw_response_wrapper(
                 inference.embeddings,  # pyright: ignore[reportDeprecated],
@@ -1505,22 +1223,11 @@ class AsyncInferenceResourceWithRawResponse:
     def __init__(self, inference: AsyncInferenceResource) -> None:
         self._inference = inference
 
-        self.batch_chat_completion = async_to_raw_response_wrapper(
-            inference.batch_chat_completion,
-        )
-        self.batch_completion = async_to_raw_response_wrapper(
-            inference.batch_completion,
-        )
         self.chat_completion = (  # pyright: ignore[reportDeprecated]
             async_to_raw_response_wrapper(
                 inference.chat_completion,  # pyright: ignore[reportDeprecated],
             )
         )
-        self.completion = (  # pyright: ignore[reportDeprecated]
-            async_to_raw_response_wrapper(
-                inference.completion,  # pyright: ignore[reportDeprecated],
-            )
-        )
         self.embeddings = (  # pyright: ignore[reportDeprecated]
             async_to_raw_response_wrapper(
                 inference.embeddings,  # pyright: ignore[reportDeprecated],
@@ -1535,22 +1242,11 @@ class InferenceResourceWithStreamingResponse:
     def __init__(self, inference: InferenceResource) -> None:
         self._inference = inference
 
-        self.batch_chat_completion = to_streamed_response_wrapper(
-            inference.batch_chat_completion,
-        )
-        self.batch_completion = to_streamed_response_wrapper(
-            inference.batch_completion,
-        )
         self.chat_completion = (  # pyright: ignore[reportDeprecated]
             to_streamed_response_wrapper(
                 inference.chat_completion,  # pyright: ignore[reportDeprecated],
             )
         )
-        self.completion = (  # pyright: ignore[reportDeprecated]
-            to_streamed_response_wrapper(
-                inference.completion,  # pyright: ignore[reportDeprecated],
-            )
-        )
         self.embeddings = (  # pyright: ignore[reportDeprecated]
             to_streamed_response_wrapper(
                 inference.embeddings,  # pyright: ignore[reportDeprecated],
@@ -1565,22 +1261,11 @@ class AsyncInferenceResourceWithStreamingResponse:
     def __init__(self, inference: AsyncInferenceResource) -> None:
         self._inference = inference
 
-        self.batch_chat_completion = async_to_streamed_response_wrapper(
-            inference.batch_chat_completion,
-        )
-        self.batch_completion = async_to_streamed_response_wrapper(
-            inference.batch_completion,
-        )
         self.chat_completion = (  # pyright: ignore[reportDeprecated]
             async_to_streamed_response_wrapper(
                 inference.chat_completion,  # pyright: ignore[reportDeprecated],
             )
         )
-        self.completion = (  # pyright: ignore[reportDeprecated]
-            async_to_streamed_response_wrapper(
-                inference.completion,  # pyright: ignore[reportDeprecated],
-            )
-        )
         self.embeddings = (  # pyright: ignore[reportDeprecated]
             async_to_streamed_response_wrapper(
                 inference.embeddings,  # pyright: ignore[reportDeprecated],
diff --git a/src/llama_stack_client/resources/models/models.py b/src/llama_stack_client/resources/models/models.py
index f044c50d..72d9f81e 100644
--- a/src/llama_stack_client/resources/models/models.py
+++ b/src/llama_stack_client/resources/models/models.py
@@ -101,7 +101,7 @@ def list(
         extra_body: Body | None = None,
         timeout: float | httpx.Timeout | None | NotGiven = not_given,
     ) -> ModelListResponse:
-        """List all models."""
+        """List models using the OpenAI API."""
         return self._get(
             "/v1/models",
             options=make_request_options(
@@ -271,7 +271,7 @@ async def list(
         extra_body: Body | None = None,
         timeout: float | httpx.Timeout | None | NotGiven = not_given,
     ) -> ModelListResponse:
-        """List all models."""
+        """List models using the OpenAI API."""
         return await self._get(
             "/v1/models",
             options=make_request_options(
diff --git a/src/llama_stack_client/resources/models/openai.py b/src/llama_stack_client/resources/models/openai.py
index e4b2fbd8..57179ed8 100644
--- a/src/llama_stack_client/resources/models/openai.py
+++ b/src/llama_stack_client/resources/models/openai.py
@@ -54,7 +54,7 @@ def list(
     ) -> OpenAIListResponse:
         """List models using the OpenAI API."""
         return self._get(
-            "/v1/openai/v1/models",
+            "/v1/models",
             options=make_request_options(
                 extra_headers=extra_headers,
                 extra_query=extra_query,
@@ -98,7 +98,7 @@ async def list(
     ) -> OpenAIListResponse:
         """List models using the OpenAI API."""
         return await self._get(
-            "/v1/openai/v1/models",
+            "/v1/models",
             options=make_request_options(
                 extra_headers=extra_headers,
                 extra_query=extra_query,
diff --git a/src/llama_stack_client/resources/moderations.py b/src/llama_stack_client/resources/moderations.py
index a016b5b0..a73dc85a 100644
--- a/src/llama_stack_client/resources/moderations.py
+++ b/src/llama_stack_client/resources/moderations.py
@@ -73,7 +73,7 @@ def create(
           timeout: Override the client-level default timeout for this request, in seconds
         """
         return self._post(
-            "/v1/openai/v1/moderations",
+            "/v1/moderations",
             body=maybe_transform(
                 {
                     "input": input,
@@ -138,7 +138,7 @@ async def create(
           timeout: Override the client-level default timeout for this request, in seconds
         """
         return await self._post(
-            "/v1/openai/v1/moderations",
+            "/v1/moderations",
             body=await async_maybe_transform(
                 {
                     "input": input,
diff --git a/src/llama_stack_client/resources/responses/input_items.py b/src/llama_stack_client/resources/responses/input_items.py
index da06debd..a5836ba7 100644
--- a/src/llama_stack_client/resources/responses/input_items.py
+++ b/src/llama_stack_client/resources/responses/input_items.py
@@ -85,7 +85,7 @@ def list(
         if not response_id:
             raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}")
         return self._get(
-            f"/v1/openai/v1/responses/{response_id}/input_items",
+            f"/v1/responses/{response_id}/input_items",
             options=make_request_options(
                 extra_headers=extra_headers,
                 extra_query=extra_query,
@@ -168,7 +168,7 @@ async def list(
         if not response_id:
             raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}")
         return await self._get(
-            f"/v1/openai/v1/responses/{response_id}/input_items",
+            f"/v1/responses/{response_id}/input_items",
             options=make_request_options(
                 extra_headers=extra_headers,
                 extra_query=extra_query,
diff --git a/src/llama_stack_client/resources/responses/responses.py b/src/llama_stack_client/resources/responses/responses.py
index 7f21f3ea..16e38fd0 100644
--- a/src/llama_stack_client/resources/responses/responses.py
+++ b/src/llama_stack_client/resources/responses/responses.py
@@ -228,7 +228,7 @@ def create(
         timeout: float | httpx.Timeout | None | NotGiven = not_given,
     ) -> ResponseObject | Stream[ResponseObjectStream]:
         return self._post(
-            "/v1/openai/v1/responses",
+            "/v1/responses",
             body=maybe_transform(
                 {
                     "input": input,
@@ -281,7 +281,7 @@ def retrieve(
         if not response_id:
             raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}")
         return self._get(
-            f"/v1/openai/v1/responses/{response_id}",
+            f"/v1/responses/{response_id}",
             options=make_request_options(
                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
             ),
@@ -323,7 +323,7 @@ def list(
           timeout: Override the client-level default timeout for this request, in seconds
         """
         return self._get_api_list(
-            "/v1/openai/v1/responses",
+            "/v1/responses",
             page=SyncOpenAICursorPage[ResponseListResponse],
             options=make_request_options(
                 extra_headers=extra_headers,
@@ -369,7 +369,7 @@ def delete(
         if not response_id:
             raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}")
         return self._delete(
-            f"/v1/openai/v1/responses/{response_id}",
+            f"/v1/responses/{response_id}",
             options=make_request_options(
                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
             ),
@@ -568,7 +568,7 @@ async def create(
         timeout: float | httpx.Timeout | None | NotGiven = not_given,
     ) -> ResponseObject | AsyncStream[ResponseObjectStream]:
         return await self._post(
-            "/v1/openai/v1/responses",
+            "/v1/responses",
             body=await async_maybe_transform(
                 {
                     "input": input,
@@ -621,7 +621,7 @@ async def retrieve(
         if not response_id:
             raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}")
         return await self._get(
-            f"/v1/openai/v1/responses/{response_id}",
+            f"/v1/responses/{response_id}",
             options=make_request_options(
                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
             ),
@@ -663,7 +663,7 @@ def list(
           timeout: Override the client-level default timeout for this request, in seconds
         """
         return self._get_api_list(
-            "/v1/openai/v1/responses",
+            "/v1/responses",
             page=AsyncOpenAICursorPage[ResponseListResponse],
             options=make_request_options(
                 extra_headers=extra_headers,
@@ -709,7 +709,7 @@ async def delete(
         if not response_id:
             raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}")
         return await self._delete(
-            f"/v1/openai/v1/responses/{response_id}",
+            f"/v1/responses/{response_id}",
             options=make_request_options(
                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
             ),
diff --git a/src/llama_stack_client/resources/vector_stores/files.py b/src/llama_stack_client/resources/vector_stores/files.py
index 39f16a66..f9a1ef31 100644
--- a/src/llama_stack_client/resources/vector_stores/files.py
+++ b/src/llama_stack_client/resources/vector_stores/files.py
@@ -82,7 +82,7 @@ def create(
         if not vector_store_id:
             raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
         return self._post(
-            f"/v1/openai/v1/vector_stores/{vector_store_id}/files",
+            f"/v1/vector_stores/{vector_store_id}/files",
             body=maybe_transform(
                 {
                     "file_id": file_id,
@@ -126,7 +126,7 @@ def retrieve(
         if not file_id:
             raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
         return self._get(
-            f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}",
+            f"/v1/vector_stores/{vector_store_id}/files/{file_id}",
             options=make_request_options(
                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
             ),
@@ -165,7 +165,7 @@ def update(
         if not file_id:
             raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
         return self._post(
-            f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}",
+            f"/v1/vector_stores/{vector_store_id}/files/{file_id}",
             body=maybe_transform({"attributes": attributes}, file_update_params.FileUpdateParams),
             options=make_request_options(
                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
@@ -218,7 +218,7 @@ def list(
         if not vector_store_id:
             raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
         return self._get_api_list(
-            f"/v1/openai/v1/vector_stores/{vector_store_id}/files",
+            f"/v1/vector_stores/{vector_store_id}/files",
             page=SyncOpenAICursorPage[VectorStoreFile],
             options=make_request_options(
                 extra_headers=extra_headers,
@@ -268,7 +268,7 @@ def delete(
         if not file_id:
             raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
         return self._delete(
-            f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}",
+            f"/v1/vector_stores/{vector_store_id}/files/{file_id}",
             options=make_request_options(
                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
             ),
@@ -304,7 +304,7 @@ def content(
         if not file_id:
             raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
         return self._get(
-            f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}/content",
+            f"/v1/vector_stores/{vector_store_id}/files/{file_id}/content",
             options=make_request_options(
                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
             ),
@@ -367,7 +367,7 @@ async def create(
         if not vector_store_id:
             raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
         return await self._post(
-            f"/v1/openai/v1/vector_stores/{vector_store_id}/files",
+            f"/v1/vector_stores/{vector_store_id}/files",
             body=await async_maybe_transform(
                 {
                     "file_id": file_id,
@@ -411,7 +411,7 @@ async def retrieve(
         if not file_id:
             raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
         return await self._get(
-            f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}",
+            f"/v1/vector_stores/{vector_store_id}/files/{file_id}",
             options=make_request_options(
                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
             ),
@@ -450,7 +450,7 @@ async def update(
         if not file_id:
             raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
         return await self._post(
-            f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}",
+            f"/v1/vector_stores/{vector_store_id}/files/{file_id}",
             body=await async_maybe_transform({"attributes": attributes}, file_update_params.FileUpdateParams),
             options=make_request_options(
                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
@@ -503,7 +503,7 @@ def list(
         if not vector_store_id:
             raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
         return self._get_api_list(
-            f"/v1/openai/v1/vector_stores/{vector_store_id}/files",
+            f"/v1/vector_stores/{vector_store_id}/files",
             page=AsyncOpenAICursorPage[VectorStoreFile],
             options=make_request_options(
                 extra_headers=extra_headers,
@@ -553,7 +553,7 @@ async def delete(
         if not file_id:
             raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
         return await self._delete(
-            f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}",
+            f"/v1/vector_stores/{vector_store_id}/files/{file_id}",
             options=make_request_options(
                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
             ),
@@ -589,7 +589,7 @@ async def content(
         if not file_id:
             raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
         return await self._get(
-            f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}/content",
+            f"/v1/vector_stores/{vector_store_id}/files/{file_id}/content",
             options=make_request_options(
                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
             ),
diff --git a/src/llama_stack_client/resources/vector_stores/vector_stores.py b/src/llama_stack_client/resources/vector_stores/vector_stores.py
index f3ab01f2..f858100b 100644
--- a/src/llama_stack_client/resources/vector_stores/vector_stores.py
+++ b/src/llama_stack_client/resources/vector_stores/vector_stores.py
@@ -112,7 +112,7 @@ def create(
           timeout: Override the client-level default timeout for this request, in seconds
         """
         return self._post(
-            "/v1/openai/v1/vector_stores",
+            "/v1/vector_stores",
             body=maybe_transform(
                 {
                     "chunking_strategy": chunking_strategy,
@@ -158,7 +158,7 @@ def retrieve(
         if not vector_store_id:
             raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
         return self._get(
-            f"/v1/openai/v1/vector_stores/{vector_store_id}",
+            f"/v1/vector_stores/{vector_store_id}",
             options=make_request_options(
                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
             ),
@@ -200,7 +200,7 @@ def update(
         if not vector_store_id:
             raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
         return self._post(
-            f"/v1/openai/v1/vector_stores/{vector_store_id}",
+            f"/v1/vector_stores/{vector_store_id}",
             body=maybe_transform(
                 {
                     "expires_after": expires_after,
@@ -255,7 +255,7 @@ def list(
           timeout: Override the client-level default timeout for this request, in seconds
         """
         return self._get_api_list(
-            "/v1/openai/v1/vector_stores",
+            "/v1/vector_stores",
             page=SyncOpenAICursorPage[VectorStore],
             options=make_request_options(
                 extra_headers=extra_headers,
@@ -301,7 +301,7 @@ def delete(
         if not vector_store_id:
             raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
         return self._delete(
-            f"/v1/openai/v1/vector_stores/{vector_store_id}",
+            f"/v1/vector_stores/{vector_store_id}",
             options=make_request_options(
                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
             ),
@@ -354,7 +354,7 @@ def search(
         if not vector_store_id:
             raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
         return self._post(
-            f"/v1/openai/v1/vector_stores/{vector_store_id}/search",
+            f"/v1/vector_stores/{vector_store_id}/search",
             body=maybe_transform(
                 {
                     "query": query,
@@ -446,7 +446,7 @@ async def create(
           timeout: Override the client-level default timeout for this request, in seconds
         """
         return await self._post(
-            "/v1/openai/v1/vector_stores",
+            "/v1/vector_stores",
             body=await async_maybe_transform(
                 {
                     "chunking_strategy": chunking_strategy,
@@ -492,7 +492,7 @@ async def retrieve(
         if not vector_store_id:
             raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
         return await self._get(
-            f"/v1/openai/v1/vector_stores/{vector_store_id}",
+            f"/v1/vector_stores/{vector_store_id}",
             options=make_request_options(
                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
             ),
@@ -534,7 +534,7 @@ async def update(
         if not vector_store_id:
             raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
         return await self._post(
-            f"/v1/openai/v1/vector_stores/{vector_store_id}",
+            f"/v1/vector_stores/{vector_store_id}",
             body=await async_maybe_transform(
                 {
                     "expires_after": expires_after,
@@ -589,7 +589,7 @@ def list(
           timeout: Override the client-level default timeout for this request, in seconds
         """
         return self._get_api_list(
-            "/v1/openai/v1/vector_stores",
+            "/v1/vector_stores",
             page=AsyncOpenAICursorPage[VectorStore],
             options=make_request_options(
                 extra_headers=extra_headers,
@@ -635,7 +635,7 @@ async def delete(
         if not vector_store_id:
             raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
         return await self._delete(
-            f"/v1/openai/v1/vector_stores/{vector_store_id}",
+            f"/v1/vector_stores/{vector_store_id}",
             options=make_request_options(
                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
             ),
@@ -688,7 +688,7 @@ async def search(
         if not vector_store_id:
             raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
         return await self._post(
-            f"/v1/openai/v1/vector_stores/{vector_store_id}/search",
+            f"/v1/vector_stores/{vector_store_id}/search",
             body=await async_maybe_transform(
                 {
                     "query": query,
diff --git a/src/llama_stack_client/types/__init__.py b/src/llama_stack_client/types/__init__.py
index 56b7f887..e63970ee 100644
--- a/src/llama_stack_client/types/__init__.py
+++ b/src/llama_stack_client/types/__init__.py
@@ -22,7 +22,6 @@
     SystemMessage as SystemMessage,
     ResponseFormat as ResponseFormat,
     SamplingParams as SamplingParams,
-    BatchCompletion as BatchCompletion,
     SafetyViolation as SafetyViolation,
     CompletionMessage as CompletionMessage,
     InterleavedContent as InterleavedContent,
@@ -61,7 +60,6 @@
 from .tool_list_response import ToolListResponse as ToolListResponse
 from .agent_create_params import AgentCreateParams as AgentCreateParams
 from .agent_list_response import AgentListResponse as AgentListResponse
-from .completion_response import CompletionResponse as CompletionResponse
 from .embeddings_response import EmbeddingsResponse as EmbeddingsResponse
 from .list_files_response import ListFilesResponse as ListFilesResponse
 from .list_tools_response import ListToolsResponse as ListToolsResponse
@@ -73,7 +71,6 @@
 from .delete_file_response import DeleteFileResponse as DeleteFileResponse
 from .eval_candidate_param import EvalCandidateParam as EvalCandidateParam
 from .eval_run_eval_params import EvalRunEvalParams as EvalRunEvalParams
-from .list_models_response import ListModelsResponse as ListModelsResponse
 from .list_routes_response import ListRoutesResponse as ListRoutesResponse
 from .query_spans_response import QuerySpansResponse as QuerySpansResponse
 from .response_list_params import ResponseListParams as ResponseListParams
@@ -134,7 +131,6 @@
 from .vector_store_create_params import VectorStoreCreateParams as VectorStoreCreateParams
 from .vector_store_search_params import VectorStoreSearchParams as VectorStoreSearchParams
 from .vector_store_update_params import VectorStoreUpdateParams as VectorStoreUpdateParams
-from .inference_completion_params import InferenceCompletionParams as InferenceCompletionParams
 from .inference_embeddings_params import InferenceEmbeddingsParams as InferenceEmbeddingsParams
 from .list_vector_stores_response import ListVectorStoresResponse as ListVectorStoresResponse
 from .telemetry_get_span_response import TelemetryGetSpanResponse as TelemetryGetSpanResponse
@@ -160,20 +156,13 @@
 from .telemetry_get_span_tree_response import TelemetryGetSpanTreeResponse as TelemetryGetSpanTreeResponse
 from .telemetry_query_metrics_response import TelemetryQueryMetricsResponse as TelemetryQueryMetricsResponse
 from .tool_runtime_list_tools_response import ToolRuntimeListToolsResponse as ToolRuntimeListToolsResponse
-from .inference_batch_completion_params import InferenceBatchCompletionParams as InferenceBatchCompletionParams
 from .synthetic_data_generation_response import SyntheticDataGenerationResponse as SyntheticDataGenerationResponse
 from .chat_completion_response_stream_chunk import (
     ChatCompletionResponseStreamChunk as ChatCompletionResponseStreamChunk,
 )
-from .inference_batch_chat_completion_params import (
-    InferenceBatchChatCompletionParams as InferenceBatchChatCompletionParams,
-)
 from .telemetry_save_spans_to_dataset_params import (
     TelemetrySaveSpansToDatasetParams as TelemetrySaveSpansToDatasetParams,
 )
-from .inference_batch_chat_completion_response import (
-    InferenceBatchChatCompletionResponse as InferenceBatchChatCompletionResponse,
-)
 from .post_training_preference_optimize_params import (
     PostTrainingPreferenceOptimizeParams as PostTrainingPreferenceOptimizeParams,
 )
diff --git a/src/llama_stack_client/types/completion_response.py b/src/llama_stack_client/types/completion_response.py
deleted file mode 100644
index 9718be8a..00000000
--- a/src/llama_stack_client/types/completion_response.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import List, Optional
-from typing_extensions import Literal
-
-from .._models import BaseModel
-from .shared.metric import Metric
-from .token_log_probs import TokenLogProbs
-
-__all__ = ["CompletionResponse"]
-
-
-class CompletionResponse(BaseModel):
-    content: str
-    """The generated completion text"""
-
-    stop_reason: Literal["end_of_turn", "end_of_message", "out_of_tokens"]
-    """Reason why generation stopped"""
-
-    logprobs: Optional[List[TokenLogProbs]] = None
-    """Optional log probabilities for generated tokens"""
-
-    metrics: Optional[List[Metric]] = None
-    """(Optional) List of metrics associated with the API response"""
diff --git a/src/llama_stack_client/types/file_create_params.py b/src/llama_stack_client/types/file_create_params.py
index 8322c0a9..a1197ff5 100644
--- a/src/llama_stack_client/types/file_create_params.py
+++ b/src/llama_stack_client/types/file_create_params.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+from typing import Optional
 from typing_extensions import Literal, Required, TypedDict
 
 from .._types import FileTypes
@@ -10,6 +11,10 @@
 
 
 class FileCreateParams(TypedDict, total=False):
+    expires_after_anchor: Required[Optional[str]]
+
+    expires_after_seconds: Required[Optional[int]]
+
     file: Required[FileTypes]
 
     purpose: Required[Literal["assistants", "batch"]]
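
With this change, `expires_after_anchor` and `expires_after_seconds` become required (but nullable) keys on `FileCreateParams`, which is why every `files.create(...)` call in the test updates below gains both arguments. A minimal sketch of an upload under the new shape:

    from llama_stack_client import LlamaStackClient

    client = LlamaStackClient(base_url="http://localhost:8321")  # placeholder server URL

    file = client.files.create(
        file=b"raw file contents",
        purpose="assistants",
        # Required in this revision but typed Optional, so explicit None opts out
        # of expiration; the updated tests pass placeholder values instead.
        expires_after_anchor=None,
        expires_after_seconds=None,
    )
    print(file.id)
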
diff --git a/src/llama_stack_client/types/inference_batch_chat_completion_params.py b/src/llama_stack_client/types/inference_batch_chat_completion_params.py
deleted file mode 100644
index b5da0f0e..00000000
--- a/src/llama_stack_client/types/inference_batch_chat_completion_params.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from __future__ import annotations
-
-from typing import Dict, Union, Iterable
-from typing_extensions import Literal, Required, TypedDict
-
-from .shared_params.message import Message
-from .shared_params.response_format import ResponseFormat
-from .shared_params.sampling_params import SamplingParams
-from .shared_params.tool_param_definition import ToolParamDefinition
-
-__all__ = ["InferenceBatchChatCompletionParams", "Logprobs", "ToolConfig", "Tool"]
-
-
-class InferenceBatchChatCompletionParams(TypedDict, total=False):
-    messages_batch: Required[Iterable[Iterable[Message]]]
-    """The messages to generate completions for."""
-
-    model_id: Required[str]
-    """The identifier of the model to use.
-
-    The model must be registered with Llama Stack and available via the /models
-    endpoint.
-    """
-
-    logprobs: Logprobs
-    """
-    (Optional) If specified, log probabilities for each token position will be
-    returned.
-    """
-
-    response_format: ResponseFormat
-    """(Optional) Grammar specification for guided (structured) decoding."""
-
-    sampling_params: SamplingParams
-    """(Optional) Parameters to control the sampling strategy."""
-
-    tool_config: ToolConfig
-    """(Optional) Configuration for tool use."""
-
-    tools: Iterable[Tool]
-    """(Optional) List of tool definitions available to the model."""
-
-
-class Logprobs(TypedDict, total=False):
-    top_k: int
-    """How many tokens (for each position) to return log probabilities for."""
-
-
-class ToolConfig(TypedDict, total=False):
-    system_message_behavior: Literal["append", "replace"]
-    """(Optional) Config for how to override the default system prompt.
-
-    - `SystemMessageBehavior.append`: Appends the provided system message to the
-      default system prompt. - `SystemMessageBehavior.replace`: Replaces the default
-      system prompt with the provided system message. The system message can include
-      the string '{{function_definitions}}' to indicate where the function
-      definitions should be inserted.
-    """
-
-    tool_choice: Union[Literal["auto", "required", "none"], str]
-    """(Optional) Whether tool use is automatic, required, or none.
-
-    Can also specify a tool name to use a specific tool. Defaults to
-    ToolChoice.auto.
-    """
-
-    tool_prompt_format: Literal["json", "function_tag", "python_list"]
-    """(Optional) Instructs the model how to format tool calls.
-
-    By default, Llama Stack will attempt to use a format that is best adapted to the
-    model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON
-    object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
-     tag. - `ToolPromptFormat.python_list`: The tool calls
-    are output as Python syntax -- a list of function calls.
-    """
-
-
-class Tool(TypedDict, total=False):
-    tool_name: Required[Union[Literal["brave_search", "wolfram_alpha", "photogen", "code_interpreter"], str]]
-
-    description: str
-
-    parameters: Dict[str, ToolParamDefinition]
diff --git a/src/llama_stack_client/types/inference_batch_chat_completion_response.py b/src/llama_stack_client/types/inference_batch_chat_completion_response.py
deleted file mode 100644
index ed24908d..00000000
--- a/src/llama_stack_client/types/inference_batch_chat_completion_response.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import List
-
-from .._models import BaseModel
-from .shared.chat_completion_response import ChatCompletionResponse
-
-__all__ = ["InferenceBatchChatCompletionResponse"]
-
-
-class InferenceBatchChatCompletionResponse(BaseModel):
-    batch: List[ChatCompletionResponse]
-    """List of chat completion responses, one for each conversation in the batch"""
diff --git a/src/llama_stack_client/types/inference_batch_completion_params.py b/src/llama_stack_client/types/inference_batch_completion_params.py
deleted file mode 100644
index b225b883..00000000
--- a/src/llama_stack_client/types/inference_batch_completion_params.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from __future__ import annotations
-
-from typing_extensions import Required, TypedDict
-
-from .._types import SequenceNotStr
-from .shared_params.response_format import ResponseFormat
-from .shared_params.sampling_params import SamplingParams
-from .shared_params.interleaved_content import InterleavedContent
-
-__all__ = ["InferenceBatchCompletionParams", "Logprobs"]
-
-
-class InferenceBatchCompletionParams(TypedDict, total=False):
-    content_batch: Required[SequenceNotStr[InterleavedContent]]
-    """The content to generate completions for."""
-
-    model_id: Required[str]
-    """The identifier of the model to use.
-
-    The model must be registered with Llama Stack and available via the /models
-    endpoint.
-    """
-
-    logprobs: Logprobs
-    """
-    (Optional) If specified, log probabilities for each token position will be
-    returned.
-    """
-
-    response_format: ResponseFormat
-    """(Optional) Grammar specification for guided (structured) decoding."""
-
-    sampling_params: SamplingParams
-    """(Optional) Parameters to control the sampling strategy."""
-
-
-class Logprobs(TypedDict, total=False):
-    top_k: int
-    """How many tokens (for each position) to return log probabilities for."""
diff --git a/src/llama_stack_client/types/inference_completion_params.py b/src/llama_stack_client/types/inference_completion_params.py
deleted file mode 100644
index c122f017..00000000
--- a/src/llama_stack_client/types/inference_completion_params.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from __future__ import annotations
-
-from typing import Union
-from typing_extensions import Literal, Required, TypedDict
-
-from .shared_params.response_format import ResponseFormat
-from .shared_params.sampling_params import SamplingParams
-from .shared_params.interleaved_content import InterleavedContent
-
-__all__ = [
-    "InferenceCompletionParamsBase",
-    "Logprobs",
-    "InferenceCompletionParamsNonStreaming",
-    "InferenceCompletionParamsStreaming",
-]
-
-
-class InferenceCompletionParamsBase(TypedDict, total=False):
-    content: Required[InterleavedContent]
-    """The content to generate a completion for."""
-
-    model_id: Required[str]
-    """The identifier of the model to use.
-
-    The model must be registered with Llama Stack and available via the /models
-    endpoint.
-    """
-
-    logprobs: Logprobs
-    """
-    (Optional) If specified, log probabilities for each token position will be
-    returned.
-    """
-
-    response_format: ResponseFormat
-    """(Optional) Grammar specification for guided (structured) decoding."""
-
-    sampling_params: SamplingParams
-    """(Optional) Parameters to control the sampling strategy."""
-
-
-class Logprobs(TypedDict, total=False):
-    top_k: int
-    """How many tokens (for each position) to return log probabilities for."""
-
-
-class InferenceCompletionParamsNonStreaming(InferenceCompletionParamsBase, total=False):
-    stream: Literal[False]
-    """(Optional) If True, generate an SSE event stream of the response.
-
-    Defaults to False.
-    """
-
-
-class InferenceCompletionParamsStreaming(InferenceCompletionParamsBase):
-    stream: Required[Literal[True]]
-    """(Optional) If True, generate an SSE event stream of the response.
-
-    Defaults to False.
-    """
-
-
-InferenceCompletionParams = Union[InferenceCompletionParamsNonStreaming, InferenceCompletionParamsStreaming]
diff --git a/src/llama_stack_client/types/list_models_response.py b/src/llama_stack_client/types/list_models_response.py
deleted file mode 100644
index a36896b8..00000000
--- a/src/llama_stack_client/types/list_models_response.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from .._models import BaseModel
-from .model_list_response import ModelListResponse
-
-__all__ = ["ListModelsResponse"]
-
-
-class ListModelsResponse(BaseModel):
-    data: ModelListResponse
diff --git a/src/llama_stack_client/types/model_list_response.py b/src/llama_stack_client/types/model_list_response.py
index 905cdb0f..7631b69f 100644
--- a/src/llama_stack_client/types/model_list_response.py
+++ b/src/llama_stack_client/types/model_list_response.py
@@ -1,10 +1,21 @@
 # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
 
 from typing import List
-from typing_extensions import TypeAlias
+from typing_extensions import Literal, TypeAlias
 
-from .model import Model
+from .._models import BaseModel
 
-__all__ = ["ModelListResponse"]
+__all__ = ["ModelListResponse", "ModelListResponseItem"]
 
-ModelListResponse: TypeAlias = List[Model]
+
+class ModelListResponseItem(BaseModel):
+    id: str
+
+    created: int
+
+    object: Literal["model"]
+
+    owned_by: str
+
+
+ModelListResponse: TypeAlias = List[ModelListResponseItem]
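
`ModelListResponse` is now a list of OpenAI-style items (`id`, `created`, `object`, `owned_by`) rather than a list of the richer `Model` type. A hedged sketch of consuming the new shape, assuming `client.models.list()` returns this alias:

    from llama_stack_client import LlamaStackClient

    client = LlamaStackClient(base_url="http://localhost:8321")  # placeholder server URL

    # Each item is a ModelListResponseItem; the richer Model fields are gone here.
    for m in client.models.list():
        print(m.id, m.owned_by, m.created)
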
diff --git a/src/llama_stack_client/types/shared/__init__.py b/src/llama_stack_client/types/shared/__init__.py
index fb14d8a6..007d56ac 100644
--- a/src/llama_stack_client/types/shared/__init__.py
+++ b/src/llama_stack_client/types/shared/__init__.py
@@ -14,7 +14,6 @@
 from .system_message import SystemMessage as SystemMessage
 from .response_format import ResponseFormat as ResponseFormat
 from .sampling_params import SamplingParams as SamplingParams
-from .batch_completion import BatchCompletion as BatchCompletion
 from .safety_violation import SafetyViolation as SafetyViolation
 from .completion_message import CompletionMessage as CompletionMessage
 from .interleaved_content import InterleavedContent as InterleavedContent
diff --git a/src/llama_stack_client/types/shared/batch_completion.py b/src/llama_stack_client/types/shared/batch_completion.py
deleted file mode 100644
index 43a0a735..00000000
--- a/src/llama_stack_client/types/shared/batch_completion.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import List
-
-from ..._models import BaseModel
-from ..completion_response import CompletionResponse
-
-__all__ = ["BatchCompletion"]
-
-
-class BatchCompletion(BaseModel):
-    batch: List[CompletionResponse]
-    """List of completion responses, one for each input in the batch"""
diff --git a/src/llama_stack_client/types/shared/tool_param_definition.py b/src/llama_stack_client/types/shared/tool_param_definition.py
index 1466c1f9..316f1e01 100644
--- a/src/llama_stack_client/types/shared/tool_param_definition.py
+++ b/src/llama_stack_client/types/shared/tool_param_definition.py
@@ -14,4 +14,8 @@ class ToolParamDefinition(BaseModel):
 
     description: Optional[str] = None
 
+    items: Union[bool, float, str, List[object], object, None] = None
+
     required: Optional[bool] = None
+
+    title: Optional[str] = None
diff --git a/src/llama_stack_client/types/shared_params/tool_param_definition.py b/src/llama_stack_client/types/shared_params/tool_param_definition.py
index 2d7805fe..87563946 100644
--- a/src/llama_stack_client/types/shared_params/tool_param_definition.py
+++ b/src/llama_stack_client/types/shared_params/tool_param_definition.py
@@ -15,4 +15,8 @@ class ToolParamDefinition(TypedDict, total=False):
 
     description: str
 
+    items: Union[bool, float, str, Iterable[object], object, None]
+
     required: bool
+
+    title: str
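
`ToolParamDefinition` (both the response model and the request-params variant) gains optional `items` and `title` fields. A hedged sketch of a `tools` entry for `chat_completion` that uses them; the tool and parameter names are illustrative, and the key set mirrors the updated inference tests further below:

    # Illustrative tool definition; keys follow ToolParamDefinition plus the
    # surrounding Tool shape exercised by the inference.chat_completion tests.
    weather_tool = {
        "tool_name": "get_weather",            # custom names are allowed alongside the built-ins
        "description": "Look up the current weather",
        "parameters": {
            "cities": {
                "param_type": "array",
                "description": "City names to query",
                "required": True,
                "items": {"type": "string"},   # new: element type when the parameter is an array
                "title": "Cities",             # new: optional human-readable title
            }
        },
    }
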
diff --git a/src/llama_stack_client/types/tool.py b/src/llama_stack_client/types/tool.py
index c6994268..a7243b64 100644
--- a/src/llama_stack_client/types/tool.py
+++ b/src/llama_stack_client/types/tool.py
@@ -24,6 +24,12 @@ class Parameter(BaseModel):
     default: Union[bool, float, str, List[object], object, None] = None
     """(Optional) Default value for the parameter if not provided"""
 
+    items: Optional[object] = None
+    """Type of the elements when parameter_type is array"""
+
+    title: Optional[str] = None
+    """(Optional) Title of the parameter"""
+
 
 class Tool(BaseModel):
     description: str
diff --git a/src/llama_stack_client/types/tool_def.py b/src/llama_stack_client/types/tool_def.py
index c82a9b8a..21949b41 100644
--- a/src/llama_stack_client/types/tool_def.py
+++ b/src/llama_stack_client/types/tool_def.py
@@ -23,6 +23,12 @@ class Parameter(BaseModel):
     default: Union[bool, float, str, List[object], object, None] = None
     """(Optional) Default value for the parameter if not provided"""
 
+    items: Optional[object] = None
+    """Type of the elements when parameter_type is array"""
+
+    title: Optional[str] = None
+    """(Optional) Title of the parameter"""
+
 
 class ToolDef(BaseModel):
     name: str
diff --git a/src/llama_stack_client/types/tool_def_param.py b/src/llama_stack_client/types/tool_def_param.py
index 93ad8285..a50437b2 100644
--- a/src/llama_stack_client/types/tool_def_param.py
+++ b/src/llama_stack_client/types/tool_def_param.py
@@ -24,6 +24,12 @@ class Parameter(TypedDict, total=False):
     default: Union[bool, float, str, Iterable[object], object, None]
     """(Optional) Default value for the parameter if not provided"""
 
+    items: object
+    """Type of the elements when parameter_type is array"""
+
+    title: str
+    """(Optional) Title of the parameter"""
+
 
 class ToolDefParam(TypedDict, total=False):
     name: Required[str]
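
The agent-side parameter models (`tool.py`, `tool_def.py`, `tool_def_param.py`) pick up the same two fields, and the agent tests below add them to client-tool parameters. A hedged `ToolDefParam`-style sketch with placeholder names:

    # Illustrative ToolDefParam for an agent client tool; field names mirror
    # tool_def_param.py, values are placeholders.
    search_tool_def = {
        "name": "search_notes",
        "description": "Search the user's notes",
        "parameters": [
            {
                "name": "tags",
                "parameter_type": "array",
                "description": "Tags to filter by",
                "required": False,
                "items": {"type": "string"},  # new: type of the array elements
                "title": "Tags",              # new: optional display title
            }
        ],
    }
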
diff --git a/tests/api_resources/test_agents.py b/tests/api_resources/test_agents.py
index 18b34012..c19bc9bf 100644
--- a/tests/api_resources/test_agents.py
+++ b/tests/api_resources/test_agents.py
@@ -49,6 +49,8 @@ def test_method_create_with_all_params(self, client: LlamaStackClient) -> None:
                                 "parameter_type": "parameter_type",
                                 "required": True,
                                 "default": True,
+                                "items": {},
+                                "title": "title",
                             }
                         ],
                     }
@@ -253,6 +255,8 @@ async def test_method_create_with_all_params(self, async_client: AsyncLlamaStack
                                 "parameter_type": "parameter_type",
                                 "required": True,
                                 "default": True,
+                                "items": {},
+                                "title": "title",
                             }
                         ],
                     }
diff --git a/tests/api_resources/test_files.py b/tests/api_resources/test_files.py
index d9b29ffc..f2bc1e0a 100644
--- a/tests/api_resources/test_files.py
+++ b/tests/api_resources/test_files.py
@@ -21,6 +21,8 @@ class TestFiles:
     @parametrize
     def test_method_create(self, client: LlamaStackClient) -> None:
         file = client.files.create(
+            expires_after_anchor="expires_after_anchor",
+            expires_after_seconds=0,
             file=b"raw file contents",
             purpose="assistants",
         )
@@ -29,6 +31,8 @@ def test_method_create(self, client: LlamaStackClient) -> None:
     @parametrize
     def test_raw_response_create(self, client: LlamaStackClient) -> None:
         response = client.files.with_raw_response.create(
+            expires_after_anchor="expires_after_anchor",
+            expires_after_seconds=0,
             file=b"raw file contents",
             purpose="assistants",
         )
@@ -41,6 +45,8 @@ def test_raw_response_create(self, client: LlamaStackClient) -> None:
     @parametrize
     def test_streaming_response_create(self, client: LlamaStackClient) -> None:
         with client.files.with_streaming_response.create(
+            expires_after_anchor="expires_after_anchor",
+            expires_after_seconds=0,
             file=b"raw file contents",
             purpose="assistants",
         ) as response:
@@ -210,6 +216,8 @@ class TestAsyncFiles:
     @parametrize
     async def test_method_create(self, async_client: AsyncLlamaStackClient) -> None:
         file = await async_client.files.create(
+            expires_after_anchor="expires_after_anchor",
+            expires_after_seconds=0,
             file=b"raw file contents",
             purpose="assistants",
         )
@@ -218,6 +226,8 @@ async def test_method_create(self, async_client: AsyncLlamaStackClient) -> None:
     @parametrize
     async def test_raw_response_create(self, async_client: AsyncLlamaStackClient) -> None:
         response = await async_client.files.with_raw_response.create(
+            expires_after_anchor="expires_after_anchor",
+            expires_after_seconds=0,
             file=b"raw file contents",
             purpose="assistants",
         )
@@ -230,6 +240,8 @@ async def test_raw_response_create(self, async_client: AsyncLlamaStackClient) ->
     @parametrize
     async def test_streaming_response_create(self, async_client: AsyncLlamaStackClient) -> None:
         async with async_client.files.with_streaming_response.create(
+            expires_after_anchor="expires_after_anchor",
+            expires_after_seconds=0,
             file=b"raw file contents",
             purpose="assistants",
         ) as response:
diff --git a/tests/api_resources/test_inference.py b/tests/api_resources/test_inference.py
index 474ff7cf..6e952637 100644
--- a/tests/api_resources/test_inference.py
+++ b/tests/api_resources/test_inference.py
@@ -10,12 +10,10 @@
 from tests.utils import assert_matches_type
 from llama_stack_client import LlamaStackClient, AsyncLlamaStackClient
 from llama_stack_client.types import (
-    CompletionResponse,
     EmbeddingsResponse,
     InferenceRerankResponse,
-    InferenceBatchChatCompletionResponse,
 )
-from llama_stack_client.types.shared import BatchCompletion, ChatCompletionResponse
+from llama_stack_client.types.shared import ChatCompletionResponse
 
 # pyright: reportDeprecated=false
 
@@ -25,160 +23,6 @@
 class TestInference:
     parametrize = pytest.mark.parametrize("client", [False, True], indirect=True, ids=["loose", "strict"])
 
-    @parametrize
-    def test_method_batch_chat_completion(self, client: LlamaStackClient) -> None:
-        inference = client.inference.batch_chat_completion(
-            messages_batch=[
-                [
-                    {
-                        "content": "string",
-                        "role": "user",
-                    }
-                ]
-            ],
-            model_id="model_id",
-        )
-        assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"])
-
-    @parametrize
-    def test_method_batch_chat_completion_with_all_params(self, client: LlamaStackClient) -> None:
-        inference = client.inference.batch_chat_completion(
-            messages_batch=[
-                [
-                    {
-                        "content": "string",
-                        "role": "user",
-                        "context": "string",
-                    }
-                ]
-            ],
-            model_id="model_id",
-            logprobs={"top_k": 0},
-            response_format={
-                "json_schema": {"foo": True},
-                "type": "json_schema",
-            },
-            sampling_params={
-                "strategy": {"type": "greedy"},
-                "max_tokens": 0,
-                "repetition_penalty": 0,
-                "stop": ["string"],
-            },
-            tool_config={
-                "system_message_behavior": "append",
-                "tool_choice": "auto",
-                "tool_prompt_format": "json",
-            },
-            tools=[
-                {
-                    "tool_name": "brave_search",
-                    "description": "description",
-                    "parameters": {
-                        "foo": {
-                            "param_type": "param_type",
-                            "default": True,
-                            "description": "description",
-                            "required": True,
-                        }
-                    },
-                }
-            ],
-        )
-        assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"])
-
-    @parametrize
-    def test_raw_response_batch_chat_completion(self, client: LlamaStackClient) -> None:
-        response = client.inference.with_raw_response.batch_chat_completion(
-            messages_batch=[
-                [
-                    {
-                        "content": "string",
-                        "role": "user",
-                    }
-                ]
-            ],
-            model_id="model_id",
-        )
-
-        assert response.is_closed is True
-        assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-        inference = response.parse()
-        assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"])
-
-    @parametrize
-    def test_streaming_response_batch_chat_completion(self, client: LlamaStackClient) -> None:
-        with client.inference.with_streaming_response.batch_chat_completion(
-            messages_batch=[
-                [
-                    {
-                        "content": "string",
-                        "role": "user",
-                    }
-                ]
-            ],
-            model_id="model_id",
-        ) as response:
-            assert not response.is_closed
-            assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
-            inference = response.parse()
-            assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"])
-
-        assert cast(Any, response.is_closed) is True
-
-    @parametrize
-    def test_method_batch_completion(self, client: LlamaStackClient) -> None:
-        inference = client.inference.batch_completion(
-            content_batch=["string"],
-            model_id="model_id",
-        )
-        assert_matches_type(BatchCompletion, inference, path=["response"])
-
-    @parametrize
-    def test_method_batch_completion_with_all_params(self, client: LlamaStackClient) -> None:
-        inference = client.inference.batch_completion(
-            content_batch=["string"],
-            model_id="model_id",
-            logprobs={"top_k": 0},
-            response_format={
-                "json_schema": {"foo": True},
-                "type": "json_schema",
-            },
-            sampling_params={
-                "strategy": {"type": "greedy"},
-                "max_tokens": 0,
-                "repetition_penalty": 0,
-                "stop": ["string"],
-            },
-        )
-        assert_matches_type(BatchCompletion, inference, path=["response"])
-
-    @parametrize
-    def test_raw_response_batch_completion(self, client: LlamaStackClient) -> None:
-        response = client.inference.with_raw_response.batch_completion(
-            content_batch=["string"],
-            model_id="model_id",
-        )
-
-        assert response.is_closed is True
-        assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-        inference = response.parse()
-        assert_matches_type(BatchCompletion, inference, path=["response"])
-
-    @parametrize
-    def test_streaming_response_batch_completion(self, client: LlamaStackClient) -> None:
-        with client.inference.with_streaming_response.batch_completion(
-            content_batch=["string"],
-            model_id="model_id",
-        ) as response:
-            assert not response.is_closed
-            assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
-            inference = response.parse()
-            assert_matches_type(BatchCompletion, inference, path=["response"])
-
-        assert cast(Any, response.is_closed) is True
-
     @parametrize
     def test_method_chat_completion_overload_1(self, client: LlamaStackClient) -> None:
         with pytest.warns(DeprecationWarning):
@@ -234,7 +78,9 @@ def test_method_chat_completion_with_all_params_overload_1(self, client: LlamaSt
                                 "param_type": "param_type",
                                 "default": True,
                                 "description": "description",
+                                "items": True,
                                 "required": True,
+                                "title": "title",
                             }
                         },
                     }
@@ -337,7 +183,9 @@ def test_method_chat_completion_with_all_params_overload_2(self, client: LlamaSt
                                 "param_type": "param_type",
                                 "default": True,
                                 "description": "description",
+                                "items": True,
                                 "required": True,
+                                "title": "title",
                             }
                         },
                     }
@@ -385,128 +233,6 @@ def test_streaming_response_chat_completion_overload_2(self, client: LlamaStackC
 
         assert cast(Any, response.is_closed) is True
 
-    @parametrize
-    def test_method_completion_overload_1(self, client: LlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            inference = client.inference.completion(
-                content="string",
-                model_id="model_id",
-            )
-
-        assert_matches_type(CompletionResponse, inference, path=["response"])
-
-    @parametrize
-    def test_method_completion_with_all_params_overload_1(self, client: LlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            inference = client.inference.completion(
-                content="string",
-                model_id="model_id",
-                logprobs={"top_k": 0},
-                response_format={
-                    "json_schema": {"foo": True},
-                    "type": "json_schema",
-                },
-                sampling_params={
-                    "strategy": {"type": "greedy"},
-                    "max_tokens": 0,
-                    "repetition_penalty": 0,
-                    "stop": ["string"],
-                },
-                stream=False,
-            )
-
-        assert_matches_type(CompletionResponse, inference, path=["response"])
-
-    @parametrize
-    def test_raw_response_completion_overload_1(self, client: LlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            response = client.inference.with_raw_response.completion(
-                content="string",
-                model_id="model_id",
-            )
-
-        assert response.is_closed is True
-        assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-        inference = response.parse()
-        assert_matches_type(CompletionResponse, inference, path=["response"])
-
-    @parametrize
-    def test_streaming_response_completion_overload_1(self, client: LlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            with client.inference.with_streaming_response.completion(
-                content="string",
-                model_id="model_id",
-            ) as response:
-                assert not response.is_closed
-                assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
-                inference = response.parse()
-                assert_matches_type(CompletionResponse, inference, path=["response"])
-
-        assert cast(Any, response.is_closed) is True
-
-    @parametrize
-    def test_method_completion_overload_2(self, client: LlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            inference_stream = client.inference.completion(
-                content="string",
-                model_id="model_id",
-                stream=True,
-            )
-
-        inference_stream.response.close()
-
-    @parametrize
-    def test_method_completion_with_all_params_overload_2(self, client: LlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            inference_stream = client.inference.completion(
-                content="string",
-                model_id="model_id",
-                stream=True,
-                logprobs={"top_k": 0},
-                response_format={
-                    "json_schema": {"foo": True},
-                    "type": "json_schema",
-                },
-                sampling_params={
-                    "strategy": {"type": "greedy"},
-                    "max_tokens": 0,
-                    "repetition_penalty": 0,
-                    "stop": ["string"],
-                },
-            )
-
-        inference_stream.response.close()
-
-    @parametrize
-    def test_raw_response_completion_overload_2(self, client: LlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            response = client.inference.with_raw_response.completion(
-                content="string",
-                model_id="model_id",
-                stream=True,
-            )
-
-        assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-        stream = response.parse()
-        stream.close()
-
-    @parametrize
-    def test_streaming_response_completion_overload_2(self, client: LlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            with client.inference.with_streaming_response.completion(
-                content="string",
-                model_id="model_id",
-                stream=True,
-            ) as response:
-                assert not response.is_closed
-                assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
-                stream = response.parse()
-                stream.close()
-
-        assert cast(Any, response.is_closed) is True
-
     @parametrize
     def test_method_embeddings(self, client: LlamaStackClient) -> None:
         with pytest.warns(DeprecationWarning):
@@ -611,160 +337,6 @@ class TestAsyncInference:
         "async_client", [False, True, {"http_client": "aiohttp"}], indirect=True, ids=["loose", "strict", "aiohttp"]
     )
 
-    @parametrize
-    async def test_method_batch_chat_completion(self, async_client: AsyncLlamaStackClient) -> None:
-        inference = await async_client.inference.batch_chat_completion(
-            messages_batch=[
-                [
-                    {
-                        "content": "string",
-                        "role": "user",
-                    }
-                ]
-            ],
-            model_id="model_id",
-        )
-        assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"])
-
-    @parametrize
-    async def test_method_batch_chat_completion_with_all_params(self, async_client: AsyncLlamaStackClient) -> None:
-        inference = await async_client.inference.batch_chat_completion(
-            messages_batch=[
-                [
-                    {
-                        "content": "string",
-                        "role": "user",
-                        "context": "string",
-                    }
-                ]
-            ],
-            model_id="model_id",
-            logprobs={"top_k": 0},
-            response_format={
-                "json_schema": {"foo": True},
-                "type": "json_schema",
-            },
-            sampling_params={
-                "strategy": {"type": "greedy"},
-                "max_tokens": 0,
-                "repetition_penalty": 0,
-                "stop": ["string"],
-            },
-            tool_config={
-                "system_message_behavior": "append",
-                "tool_choice": "auto",
-                "tool_prompt_format": "json",
-            },
-            tools=[
-                {
-                    "tool_name": "brave_search",
-                    "description": "description",
-                    "parameters": {
-                        "foo": {
-                            "param_type": "param_type",
-                            "default": True,
-                            "description": "description",
-                            "required": True,
-                        }
-                    },
-                }
-            ],
-        )
-        assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"])
-
-    @parametrize
-    async def test_raw_response_batch_chat_completion(self, async_client: AsyncLlamaStackClient) -> None:
-        response = await async_client.inference.with_raw_response.batch_chat_completion(
-            messages_batch=[
-                [
-                    {
-                        "content": "string",
-                        "role": "user",
-                    }
-                ]
-            ],
-            model_id="model_id",
-        )
-
-        assert response.is_closed is True
-        assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-        inference = await response.parse()
-        assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"])
-
-    @parametrize
-    async def test_streaming_response_batch_chat_completion(self, async_client: AsyncLlamaStackClient) -> None:
-        async with async_client.inference.with_streaming_response.batch_chat_completion(
-            messages_batch=[
-                [
-                    {
-                        "content": "string",
-                        "role": "user",
-                    }
-                ]
-            ],
-            model_id="model_id",
-        ) as response:
-            assert not response.is_closed
-            assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
-            inference = await response.parse()
-            assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"])
-
-        assert cast(Any, response.is_closed) is True
-
-    @parametrize
-    async def test_method_batch_completion(self, async_client: AsyncLlamaStackClient) -> None:
-        inference = await async_client.inference.batch_completion(
-            content_batch=["string"],
-            model_id="model_id",
-        )
-        assert_matches_type(BatchCompletion, inference, path=["response"])
-
-    @parametrize
-    async def test_method_batch_completion_with_all_params(self, async_client: AsyncLlamaStackClient) -> None:
-        inference = await async_client.inference.batch_completion(
-            content_batch=["string"],
-            model_id="model_id",
-            logprobs={"top_k": 0},
-            response_format={
-                "json_schema": {"foo": True},
-                "type": "json_schema",
-            },
-            sampling_params={
-                "strategy": {"type": "greedy"},
-                "max_tokens": 0,
-                "repetition_penalty": 0,
-                "stop": ["string"],
-            },
-        )
-        assert_matches_type(BatchCompletion, inference, path=["response"])
-
-    @parametrize
-    async def test_raw_response_batch_completion(self, async_client: AsyncLlamaStackClient) -> None:
-        response = await async_client.inference.with_raw_response.batch_completion(
-            content_batch=["string"],
-            model_id="model_id",
-        )
-
-        assert response.is_closed is True
-        assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-        inference = await response.parse()
-        assert_matches_type(BatchCompletion, inference, path=["response"])
-
-    @parametrize
-    async def test_streaming_response_batch_completion(self, async_client: AsyncLlamaStackClient) -> None:
-        async with async_client.inference.with_streaming_response.batch_completion(
-            content_batch=["string"],
-            model_id="model_id",
-        ) as response:
-            assert not response.is_closed
-            assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
-            inference = await response.parse()
-            assert_matches_type(BatchCompletion, inference, path=["response"])
-
-        assert cast(Any, response.is_closed) is True
-
     @parametrize
     async def test_method_chat_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None:
         with pytest.warns(DeprecationWarning):
@@ -820,7 +392,9 @@ async def test_method_chat_completion_with_all_params_overload_1(self, async_cli
                                 "param_type": "param_type",
                                 "default": True,
                                 "description": "description",
+                                "items": True,
                                 "required": True,
+                                "title": "title",
                             }
                         },
                     }
@@ -923,7 +497,9 @@ async def test_method_chat_completion_with_all_params_overload_2(self, async_cli
                                 "param_type": "param_type",
                                 "default": True,
                                 "description": "description",
+                                "items": True,
                                 "required": True,
+                                "title": "title",
                             }
                         },
                     }
@@ -971,128 +547,6 @@ async def test_streaming_response_chat_completion_overload_2(self, async_client:
 
         assert cast(Any, response.is_closed) is True
 
-    @parametrize
-    async def test_method_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            inference = await async_client.inference.completion(
-                content="string",
-                model_id="model_id",
-            )
-
-        assert_matches_type(CompletionResponse, inference, path=["response"])
-
-    @parametrize
-    async def test_method_completion_with_all_params_overload_1(self, async_client: AsyncLlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            inference = await async_client.inference.completion(
-                content="string",
-                model_id="model_id",
-                logprobs={"top_k": 0},
-                response_format={
-                    "json_schema": {"foo": True},
-                    "type": "json_schema",
-                },
-                sampling_params={
-                    "strategy": {"type": "greedy"},
-                    "max_tokens": 0,
-                    "repetition_penalty": 0,
-                    "stop": ["string"],
-                },
-                stream=False,
-            )
-
-        assert_matches_type(CompletionResponse, inference, path=["response"])
-
-    @parametrize
-    async def test_raw_response_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            response = await async_client.inference.with_raw_response.completion(
-                content="string",
-                model_id="model_id",
-            )
-
-        assert response.is_closed is True
-        assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-        inference = await response.parse()
-        assert_matches_type(CompletionResponse, inference, path=["response"])
-
-    @parametrize
-    async def test_streaming_response_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            async with async_client.inference.with_streaming_response.completion(
-                content="string",
-                model_id="model_id",
-            ) as response:
-                assert not response.is_closed
-                assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
-                inference = await response.parse()
-                assert_matches_type(CompletionResponse, inference, path=["response"])
-
-        assert cast(Any, response.is_closed) is True
-
-    @parametrize
-    async def test_method_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            inference_stream = await async_client.inference.completion(
-                content="string",
-                model_id="model_id",
-                stream=True,
-            )
-
-        await inference_stream.response.aclose()
-
-    @parametrize
-    async def test_method_completion_with_all_params_overload_2(self, async_client: AsyncLlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            inference_stream = await async_client.inference.completion(
-                content="string",
-                model_id="model_id",
-                stream=True,
-                logprobs={"top_k": 0},
-                response_format={
-                    "json_schema": {"foo": True},
-                    "type": "json_schema",
-                },
-                sampling_params={
-                    "strategy": {"type": "greedy"},
-                    "max_tokens": 0,
-                    "repetition_penalty": 0,
-                    "stop": ["string"],
-                },
-            )
-
-        await inference_stream.response.aclose()
-
-    @parametrize
-    async def test_raw_response_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            response = await async_client.inference.with_raw_response.completion(
-                content="string",
-                model_id="model_id",
-                stream=True,
-            )
-
-        assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-        stream = await response.parse()
-        await stream.close()
-
-    @parametrize
-    async def test_streaming_response_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            async with async_client.inference.with_streaming_response.completion(
-                content="string",
-                model_id="model_id",
-                stream=True,
-            ) as response:
-                assert not response.is_closed
-                assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
-                stream = await response.parse()
-                await stream.close()
-
-        assert cast(Any, response.is_closed) is True
-
     @parametrize
     async def test_method_embeddings(self, async_client: AsyncLlamaStackClient) -> None:
         with pytest.warns(DeprecationWarning):
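
The hunk above completes the removal of test coverage for the deprecated `inference.completion` overloads (streaming and non-streaming, sync and async). For downstream code that still calls that surface, a minimal migration sketch follows; it assumes the OpenAI-compatible `client.completions.create(...)` resource remains available after this patch and uses placeholder model/prompt values, neither of which is confirmed by the diff itself.

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

# Old (removed here; previously emitted a DeprecationWarning):
#   client.inference.completion(content="string", model_id="model_id")

# Assumed replacement via the OpenAI-compatible completions resource:
completion = client.completions.create(
    model="model_id",  # placeholder model identifier
    prompt="string",   # plain-text prompt replaces `content`
)
print(completion)
```
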
From 7f24c432dc1859312710a4a1ff4a80f6f861bee8 Mon Sep 17 00:00:00 2001
From: "stainless-app[bot]"
 <142633134+stainless-app[bot]@users.noreply.github.com>
Date: Tue, 30 Sep 2025 01:55:37 +0000
Subject: [PATCH 2/8] feat(api): expires_after changes for /files
---
 .stats.yml                                    |   6 +-
 README.md                                     |   3 -
 api.md                                        |  11 +-
 src/llama_stack_client/resources/files.py     |  34 +--
 src/llama_stack_client/resources/inference.py | 284 +-----------------
 .../resources/models/models.py                |   4 +-
 .../resources/models/openai.py                |  18 +-
 src/llama_stack_client/types/__init__.py      |   5 +-
 .../types/embeddings_response.py              |  16 -
 .../types/file_create_params.py               |  10 +-
 .../types/inference_embeddings_params.py      |  46 ---
 .../types/inference_rerank_params.py          | 106 -------
 .../types/inference_rerank_response.py        |  23 --
 .../types/list_models_response.py             |  10 +
 .../types/model_list_response.py              |  19 +-
 .../types/models/openai_list_response.py      |  19 +-
 .../types/response_list_response.py           |   3 -
 .../types/response_object.py                  |   3 -
 tests/api_resources/models/test_openai.py     |  14 +-
 tests/api_resources/test_files.py             |  18 --
 tests/api_resources/test_inference.py         | 200 ------------
 21 files changed, 51 insertions(+), 801 deletions(-)
 delete mode 100644 src/llama_stack_client/types/embeddings_response.py
 delete mode 100644 src/llama_stack_client/types/inference_embeddings_params.py
 delete mode 100644 src/llama_stack_client/types/inference_rerank_params.py
 delete mode 100644 src/llama_stack_client/types/inference_rerank_response.py
 create mode 100644 src/llama_stack_client/types/list_models_response.py
diff --git a/.stats.yml b/.stats.yml
index e5bf0be0..016bf7b6 100644
--- a/.stats.yml
+++ b/.stats.yml
@@ -1,4 +1,4 @@
-configured_endpoints: 107
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/llamastack%2Fllama-stack-client-1eddf141208c131ee4a64ef996f8f419b444f60450de6807a9f6bc711ed8b661.yml
-openapi_spec_hash: 94765c67ea99b1358169d41d810dd395
+configured_endpoints: 105
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/llamastack%2Fllama-stack-client-adcfaad1990d45e42b20e200a9ecc35ee32df5692bd9cd18ae898b0b7728c919.yml
+openapi_spec_hash: 4f532287bafe5da0578a1c1a5e31c952
 config_hash: 7ec5a583f9c26b38993013bdfb0e7d46
diff --git a/README.md b/README.md
index cbd6bf78..d448f59d 100644
--- a/README.md
+++ b/README.md
@@ -151,10 +151,7 @@ from llama_stack_client import LlamaStackClient
 client = LlamaStackClient()
 
 client.files.create(
-    expires_after_anchor="expires_after_anchor",
-    expires_after_seconds=0,
     file=Path("/path/to/file"),
-    purpose="assistants",
 )
 ```
 
diff --git a/api.md b/api.md
index 97d40b31..ad4e635c 100644
--- a/api.md
+++ b/api.md
@@ -241,19 +241,12 @@ Methods:
 Types:
 
 ```python
-from llama_stack_client.types import (
-    ChatCompletionResponseStreamChunk,
-    EmbeddingsResponse,
-    TokenLogProbs,
-    InferenceRerankResponse,
-)
+from llama_stack_client.types import ChatCompletionResponseStreamChunk, TokenLogProbs
 ```
 
 Methods:
 
 - client.inference.chat_completion(\*\*params) -> ChatCompletionResponse
-- client.inference.embeddings(\*\*params) -> EmbeddingsResponse
-- client.inference.rerank(\*\*params) -> InferenceRerankResponse
 
 # Embeddings
 
@@ -406,7 +399,7 @@ from llama_stack_client.types.models import OpenAIListResponse
 
 Methods:
 
-- client.models.openai.list() -> OpenAIListResponse
+- client.models.openai.list() -> ModelListResponse
 
 # PostTraining
 
diff --git a/src/llama_stack_client/resources/files.py b/src/llama_stack_client/resources/files.py
index e8f20d35..04c37c56 100644
--- a/src/llama_stack_client/resources/files.py
+++ b/src/llama_stack_client/resources/files.py
@@ -2,7 +2,7 @@
 
 from __future__ import annotations
 
-from typing import Mapping, Optional, cast
+from typing import Mapping, cast
 from typing_extensions import Literal
 
 import httpx
@@ -49,10 +49,7 @@ def with_streaming_response(self) -> FilesResourceWithStreamingResponse:
     def create(
         self,
         *,
-        expires_after_anchor: Optional[str],
-        expires_after_seconds: Optional[int],
         file: FileTypes,
-        purpose: Literal["assistants", "batch"],
         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
         # The extra values given here take precedence over values defined on the client or passed to this method.
         extra_headers: Headers | None = None,
@@ -68,12 +65,8 @@ def create(
         - file: The File object (not file name) to be uploaded.
         - purpose: The intended purpose of the uploaded file.
         - expires_after: Optional form values describing expiration for the file.
-          Expected expires_after[anchor] = "created_at", expires_after[seconds] =
-          {integer}. Seconds must be between 3600 and 2592000 (1 hour to 30 days).
 
         Args:
-          purpose: Valid purpose values for OpenAI Files API.
-
           extra_headers: Send extra headers
 
           extra_query: Add additional query parameters to the request
@@ -82,14 +75,7 @@ def create(
 
           timeout: Override the client-level default timeout for this request, in seconds
         """
-        body = deepcopy_minimal(
-            {
-                "expires_after_anchor": expires_after_anchor,
-                "expires_after_seconds": expires_after_seconds,
-                "file": file,
-                "purpose": purpose,
-            }
-        )
+        body = deepcopy_minimal({"file": file})
         files = extract_files(cast(Mapping[str, object], body), paths=[["file"]])
         # It should be noted that the actual Content-Type header that will be
         # sent to the server will contain a `boundary` parameter, e.g.
@@ -288,10 +274,7 @@ def with_streaming_response(self) -> AsyncFilesResourceWithStreamingResponse:
     async def create(
         self,
         *,
-        expires_after_anchor: Optional[str],
-        expires_after_seconds: Optional[int],
         file: FileTypes,
-        purpose: Literal["assistants", "batch"],
         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
         # The extra values given here take precedence over values defined on the client or passed to this method.
         extra_headers: Headers | None = None,
@@ -307,12 +290,8 @@ async def create(
         - file: The File object (not file name) to be uploaded.
         - purpose: The intended purpose of the uploaded file.
         - expires_after: Optional form values describing expiration for the file.
-          Expected expires_after[anchor] = "created_at", expires_after[seconds] =
-          {integer}. Seconds must be between 3600 and 2592000 (1 hour to 30 days).
 
         Args:
-          purpose: Valid purpose values for OpenAI Files API.
-
           extra_headers: Send extra headers
 
           extra_query: Add additional query parameters to the request
@@ -321,14 +300,7 @@ async def create(
 
           timeout: Override the client-level default timeout for this request, in seconds
         """
-        body = deepcopy_minimal(
-            {
-                "expires_after_anchor": expires_after_anchor,
-                "expires_after_seconds": expires_after_seconds,
-                "file": file,
-                "purpose": purpose,
-            }
-        )
+        body = deepcopy_minimal({"file": file})
         files = extract_files(cast(Mapping[str, object], body), paths=[["file"]])
         # It should be noted that the actual Content-Type header that will be
         # sent to the server will contain a `boundary` parameter, e.g.
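
After this change `create` takes only `file` as a typed argument, while the docstring above still describes `purpose` and `expires_after` as form values the endpoint understands. Below is a minimal sketch of the new call, plus how those extra fields could still be supplied through `extra_body` ("Add additional JSON properties to the request"); the field names and the multipart merge behaviour are assumptions based on the docstring, not on anything this diff adds back.

```python
from pathlib import Path

from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

# New minimal signature: only the file itself is a typed keyword argument.
uploaded = client.files.create(file=Path("/path/to/file"))

# Sketch: pass the form fields the docstring still mentions via extra_body.
# Both the field names and whether the server accepts them are assumptions.
uploaded = client.files.create(
    file=Path("/path/to/file"),
    extra_body={"purpose": "assistants"},
)
```
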
diff --git a/src/llama_stack_client/resources/inference.py b/src/llama_stack_client/resources/inference.py
index 9a2b6c50..bac5cb3e 100644
--- a/src/llama_stack_client/resources/inference.py
+++ b/src/llama_stack_client/resources/inference.py
@@ -3,13 +3,13 @@
 from __future__ import annotations
 
 import typing_extensions
-from typing import Type, Union, Iterable, cast
+from typing import Iterable
 from typing_extensions import Literal, overload
 
 import httpx
 
-from ..types import inference_rerank_params, inference_embeddings_params, inference_chat_completion_params
-from .._types import Body, Omit, Query, Headers, NotGiven, SequenceNotStr, omit, not_given
+from ..types import inference_chat_completion_params
+from .._types import Body, Omit, Query, Headers, NotGiven, omit, not_given
 from .._utils import required_args, maybe_transform, async_maybe_transform
 from .._compat import cached_property
 from .._resource import SyncAPIResource, AsyncAPIResource
@@ -19,17 +19,13 @@
     async_to_raw_response_wrapper,
     async_to_streamed_response_wrapper,
 )
-from .._wrappers import DataWrapper
 from .._streaming import Stream, AsyncStream
 from .._base_client import make_request_options
-from ..types.embeddings_response import EmbeddingsResponse
 from ..types.shared_params.message import Message
-from ..types.inference_rerank_response import InferenceRerankResponse
 from ..types.shared_params.response_format import ResponseFormat
 from ..types.shared_params.sampling_params import SamplingParams
 from ..types.shared.chat_completion_response import ChatCompletionResponse
 from ..types.chat_completion_response_stream_chunk import ChatCompletionResponseStreamChunk
-from ..types.shared_params.interleaved_content_item import InterleavedContentItem
 
 __all__ = ["InferenceResource", "AsyncInferenceResource"]
 
@@ -495,126 +491,7 @@ def completion(
             stream=stream or False,
             stream_cls=Stream[CompletionResponse],
         )
-
-    @typing_extensions.deprecated("/v1/inference/embeddings is deprecated. Please use /v1/embeddings.")
-    def embeddings(
-        self,
-        *,
-        contents: Union[SequenceNotStr[str], Iterable[InterleavedContentItem]],
-        model_id: str,
-        output_dimension: int | Omit = omit,
-        task_type: Literal["query", "document"] | Omit = omit,
-        text_truncation: Literal["none", "start", "end"] | Omit = omit,
-        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
-        # The extra values given here take precedence over values defined on the client or passed to this method.
-        extra_headers: Headers | None = None,
-        extra_query: Query | None = None,
-        extra_body: Body | None = None,
-        timeout: float | httpx.Timeout | None | NotGiven = not_given,
-    ) -> EmbeddingsResponse:
-        """
-        Generate embeddings for content pieces using the specified model.
-
-        Args:
-          contents: List of contents to generate embeddings for. Each content can be a string or an
-              InterleavedContentItem (and hence can be multimodal). The behavior depends on
-              the model and provider. Some models may only support text.
-
-          model_id: The identifier of the model to use. The model must be an embedding model
-              registered with Llama Stack and available via the /models endpoint.
-
-          output_dimension: (Optional) Output dimensionality for the embeddings. Only supported by
-              Matryoshka models.
-
-          task_type: (Optional) How is the embedding being used? This is only supported by asymmetric
-              embedding models.
-
-          text_truncation: (Optional) Config for how to truncate text for embedding when text is longer
-              than the model's max sequence length.
-
-          extra_headers: Send extra headers
-
-          extra_query: Add additional query parameters to the request
-
-          extra_body: Add additional JSON properties to the request
-
-          timeout: Override the client-level default timeout for this request, in seconds
-        """
-        return self._post(
-            "/v1/inference/embeddings",
-            body=maybe_transform(
-                {
-                    "contents": contents,
-                    "model_id": model_id,
-                    "output_dimension": output_dimension,
-                    "task_type": task_type,
-                    "text_truncation": text_truncation,
-                },
-                inference_embeddings_params.InferenceEmbeddingsParams,
-            ),
-            options=make_request_options(
-                extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
-            ),
-            cast_to=EmbeddingsResponse,
-        )
-
-    def rerank(
-        self,
-        *,
-        items: SequenceNotStr[inference_rerank_params.Item],
-        model: str,
-        query: inference_rerank_params.Query,
-        max_num_results: int | Omit = omit,
-        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
-        # The extra values given here take precedence over values defined on the client or passed to this method.
-        extra_headers: Headers | None = None,
-        extra_query: Query | None = None,
-        extra_body: Body | None = None,
-        timeout: float | httpx.Timeout | None | NotGiven = not_given,
-    ) -> InferenceRerankResponse:
-        """
-        Rerank a list of documents based on their relevance to a query.
-
-        Args:
-          items: List of items to rerank. Each item can be a string, text content part, or image
-              content part. Each input must not exceed the model's max input token length.
-
-          model: The identifier of the reranking model to use.
-
-          query: The search query to rank items against. Can be a string, text content part, or
-              image content part. The input must not exceed the model's max input token
-              length.
-
-          max_num_results: (Optional) Maximum number of results to return. Default: returns all.
-
-          extra_headers: Send extra headers
-
-          extra_query: Add additional query parameters to the request
-
-          extra_body: Add additional JSON properties to the request
-
-          timeout: Override the client-level default timeout for this request, in seconds
-        """
-        return self._post(
-            "/v1/inference/rerank",
-            body=maybe_transform(
-                {
-                    "items": items,
-                    "model": model,
-                    "query": query,
-                    "max_num_results": max_num_results,
-                },
-                inference_rerank_params.InferenceRerankParams,
-            ),
-            options=make_request_options(
-                extra_headers=extra_headers,
-                extra_query=extra_query,
-                extra_body=extra_body,
-                timeout=timeout,
-                post_parser=DataWrapper[InferenceRerankResponse]._unwrapper,
-            ),
-            cast_to=cast(Type[InferenceRerankResponse], DataWrapper[InferenceRerankResponse]),
-        )
+
 
 
 class AsyncInferenceResource(AsyncAPIResource):
@@ -1078,126 +955,7 @@ async def completion(
             stream=stream or False,
             stream_cls=AsyncStream[CompletionResponse],
         )
-
-    @typing_extensions.deprecated("/v1/inference/embeddings is deprecated. Please use /v1/embeddings.")
-    async def embeddings(
-        self,
-        *,
-        contents: Union[SequenceNotStr[str], Iterable[InterleavedContentItem]],
-        model_id: str,
-        output_dimension: int | Omit = omit,
-        task_type: Literal["query", "document"] | Omit = omit,
-        text_truncation: Literal["none", "start", "end"] | Omit = omit,
-        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
-        # The extra values given here take precedence over values defined on the client or passed to this method.
-        extra_headers: Headers | None = None,
-        extra_query: Query | None = None,
-        extra_body: Body | None = None,
-        timeout: float | httpx.Timeout | None | NotGiven = not_given,
-    ) -> EmbeddingsResponse:
-        """
-        Generate embeddings for content pieces using the specified model.
-
-        Args:
-          contents: List of contents to generate embeddings for. Each content can be a string or an
-              InterleavedContentItem (and hence can be multimodal). The behavior depends on
-              the model and provider. Some models may only support text.
-
-          model_id: The identifier of the model to use. The model must be an embedding model
-              registered with Llama Stack and available via the /models endpoint.
-
-          output_dimension: (Optional) Output dimensionality for the embeddings. Only supported by
-              Matryoshka models.
-
-          task_type: (Optional) How is the embedding being used? This is only supported by asymmetric
-              embedding models.
-
-          text_truncation: (Optional) Config for how to truncate text for embedding when text is longer
-              than the model's max sequence length.
-
-          extra_headers: Send extra headers
-
-          extra_query: Add additional query parameters to the request
-
-          extra_body: Add additional JSON properties to the request
-
-          timeout: Override the client-level default timeout for this request, in seconds
-        """
-        return await self._post(
-            "/v1/inference/embeddings",
-            body=await async_maybe_transform(
-                {
-                    "contents": contents,
-                    "model_id": model_id,
-                    "output_dimension": output_dimension,
-                    "task_type": task_type,
-                    "text_truncation": text_truncation,
-                },
-                inference_embeddings_params.InferenceEmbeddingsParams,
-            ),
-            options=make_request_options(
-                extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
-            ),
-            cast_to=EmbeddingsResponse,
-        )
-
-    async def rerank(
-        self,
-        *,
-        items: SequenceNotStr[inference_rerank_params.Item],
-        model: str,
-        query: inference_rerank_params.Query,
-        max_num_results: int | Omit = omit,
-        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
-        # The extra values given here take precedence over values defined on the client or passed to this method.
-        extra_headers: Headers | None = None,
-        extra_query: Query | None = None,
-        extra_body: Body | None = None,
-        timeout: float | httpx.Timeout | None | NotGiven = not_given,
-    ) -> InferenceRerankResponse:
-        """
-        Rerank a list of documents based on their relevance to a query.
-
-        Args:
-          items: List of items to rerank. Each item can be a string, text content part, or image
-              content part. Each input must not exceed the model's max input token length.
-
-          model: The identifier of the reranking model to use.
-
-          query: The search query to rank items against. Can be a string, text content part, or
-              image content part. The input must not exceed the model's max input token
-              length.
-
-          max_num_results: (Optional) Maximum number of results to return. Default: returns all.
-
-          extra_headers: Send extra headers
-
-          extra_query: Add additional query parameters to the request
-
-          extra_body: Add additional JSON properties to the request
-
-          timeout: Override the client-level default timeout for this request, in seconds
-        """
-        return await self._post(
-            "/v1/inference/rerank",
-            body=await async_maybe_transform(
-                {
-                    "items": items,
-                    "model": model,
-                    "query": query,
-                    "max_num_results": max_num_results,
-                },
-                inference_rerank_params.InferenceRerankParams,
-            ),
-            options=make_request_options(
-                extra_headers=extra_headers,
-                extra_query=extra_query,
-                extra_body=extra_body,
-                timeout=timeout,
-                post_parser=DataWrapper[InferenceRerankResponse]._unwrapper,
-            ),
-            cast_to=cast(Type[InferenceRerankResponse], DataWrapper[InferenceRerankResponse]),
-        )
+
 
 
 class InferenceResourceWithRawResponse:
@@ -1209,14 +967,6 @@ def __init__(self, inference: InferenceResource) -> None:
                 inference.chat_completion,  # pyright: ignore[reportDeprecated],
             )
         )
-        self.embeddings = (  # pyright: ignore[reportDeprecated]
-            to_raw_response_wrapper(
-                inference.embeddings,  # pyright: ignore[reportDeprecated],
-            )
-        )
-        self.rerank = to_raw_response_wrapper(
-            inference.rerank,
-        )
 
 
 class AsyncInferenceResourceWithRawResponse:
@@ -1228,14 +978,6 @@ def __init__(self, inference: AsyncInferenceResource) -> None:
                 inference.chat_completion,  # pyright: ignore[reportDeprecated],
             )
         )
-        self.embeddings = (  # pyright: ignore[reportDeprecated]
-            async_to_raw_response_wrapper(
-                inference.embeddings,  # pyright: ignore[reportDeprecated],
-            )
-        )
-        self.rerank = async_to_raw_response_wrapper(
-            inference.rerank,
-        )
 
 
 class InferenceResourceWithStreamingResponse:
@@ -1247,14 +989,6 @@ def __init__(self, inference: InferenceResource) -> None:
                 inference.chat_completion,  # pyright: ignore[reportDeprecated],
             )
         )
-        self.embeddings = (  # pyright: ignore[reportDeprecated]
-            to_streamed_response_wrapper(
-                inference.embeddings,  # pyright: ignore[reportDeprecated],
-            )
-        )
-        self.rerank = to_streamed_response_wrapper(
-            inference.rerank,
-        )
 
 
 class AsyncInferenceResourceWithStreamingResponse:
@@ -1266,11 +1000,3 @@ def __init__(self, inference: AsyncInferenceResource) -> None:
                 inference.chat_completion,  # pyright: ignore[reportDeprecated],
             )
         )
-        self.embeddings = (  # pyright: ignore[reportDeprecated]
-            async_to_streamed_response_wrapper(
-                inference.embeddings,  # pyright: ignore[reportDeprecated],
-            )
-        )
-        self.rerank = async_to_streamed_response_wrapper(
-            inference.rerank,
-        )
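
With this hunk both `embeddings` and `rerank` are gone from the inference resource and from its raw/streaming wrappers; `rerank` loses its only exposure in this patch, while the removed deprecation notice for embeddings pointed callers at `/v1/embeddings`. A minimal sketch of that path follows, assuming the SDK's OpenAI-compatible `client.embeddings.create(...)` surface (the Embeddings resource survives in api.md above, but the exact method and response shape are not shown in this diff).

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

# Previously: client.inference.embeddings(contents=["string"], model_id="model_id")
# The removed deprecation notice pointed at /v1/embeddings instead:
response = client.embeddings.create(
    model="model_id",   # placeholder embedding model identifier
    input=["string"],   # list of strings to embed
)
print(len(response.data))  # assumes an OpenAI-style response carrying a `data` list
```
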
diff --git a/src/llama_stack_client/resources/models/models.py b/src/llama_stack_client/resources/models/models.py
index 72d9f81e..f044c50d 100644
--- a/src/llama_stack_client/resources/models/models.py
+++ b/src/llama_stack_client/resources/models/models.py
@@ -101,7 +101,7 @@ def list(
         extra_body: Body | None = None,
         timeout: float | httpx.Timeout | None | NotGiven = not_given,
     ) -> ModelListResponse:
-        """List models using the OpenAI API."""
+        """List all models."""
         return self._get(
             "/v1/models",
             options=make_request_options(
@@ -271,7 +271,7 @@ async def list(
         extra_body: Body | None = None,
         timeout: float | httpx.Timeout | None | NotGiven = not_given,
     ) -> ModelListResponse:
-        """List models using the OpenAI API."""
+        """List all models."""
         return await self._get(
             "/v1/models",
             options=make_request_options(
diff --git a/src/llama_stack_client/resources/models/openai.py b/src/llama_stack_client/resources/models/openai.py
index 57179ed8..ab4b4038 100644
--- a/src/llama_stack_client/resources/models/openai.py
+++ b/src/llama_stack_client/resources/models/openai.py
@@ -17,7 +17,7 @@
 )
 from ..._wrappers import DataWrapper
 from ..._base_client import make_request_options
-from ...types.models.openai_list_response import OpenAIListResponse
+from ...types.model_list_response import ModelListResponse
 
 __all__ = ["OpenAIResource", "AsyncOpenAIResource"]
 
@@ -51,8 +51,8 @@ def list(
         extra_query: Query | None = None,
         extra_body: Body | None = None,
         timeout: float | httpx.Timeout | None | NotGiven = not_given,
-    ) -> OpenAIListResponse:
-        """List models using the OpenAI API."""
+    ) -> ModelListResponse:
+        """List all models."""
         return self._get(
             "/v1/models",
             options=make_request_options(
@@ -60,9 +60,9 @@ def list(
                 extra_query=extra_query,
                 extra_body=extra_body,
                 timeout=timeout,
-                post_parser=DataWrapper[OpenAIListResponse]._unwrapper,
+                post_parser=DataWrapper[ModelListResponse]._unwrapper,
             ),
-            cast_to=cast(Type[OpenAIListResponse], DataWrapper[OpenAIListResponse]),
+            cast_to=cast(Type[ModelListResponse], DataWrapper[ModelListResponse]),
         )
 
 
@@ -95,8 +95,8 @@ async def list(
         extra_query: Query | None = None,
         extra_body: Body | None = None,
         timeout: float | httpx.Timeout | None | NotGiven = not_given,
-    ) -> OpenAIListResponse:
-        """List models using the OpenAI API."""
+    ) -> ModelListResponse:
+        """List all models."""
         return await self._get(
             "/v1/models",
             options=make_request_options(
@@ -104,9 +104,9 @@ async def list(
                 extra_query=extra_query,
                 extra_body=extra_body,
                 timeout=timeout,
-                post_parser=DataWrapper[OpenAIListResponse]._unwrapper,
+                post_parser=DataWrapper[ModelListResponse]._unwrapper,
             ),
-            cast_to=cast(Type[OpenAIListResponse], DataWrapper[OpenAIListResponse]),
+            cast_to=cast(Type[ModelListResponse], DataWrapper[ModelListResponse]),
         )
 
 
diff --git a/src/llama_stack_client/types/__init__.py b/src/llama_stack_client/types/__init__.py
index e63970ee..8a61ceec 100644
--- a/src/llama_stack_client/types/__init__.py
+++ b/src/llama_stack_client/types/__init__.py
@@ -60,7 +60,6 @@
 from .tool_list_response import ToolListResponse as ToolListResponse
 from .agent_create_params import AgentCreateParams as AgentCreateParams
 from .agent_list_response import AgentListResponse as AgentListResponse
-from .embeddings_response import EmbeddingsResponse as EmbeddingsResponse
 from .list_files_response import ListFilesResponse as ListFilesResponse
 from .list_tools_response import ListToolsResponse as ListToolsResponse
 from .model_list_response import ModelListResponse as ModelListResponse
@@ -71,6 +70,7 @@
 from .delete_file_response import DeleteFileResponse as DeleteFileResponse
 from .eval_candidate_param import EvalCandidateParam as EvalCandidateParam
 from .eval_run_eval_params import EvalRunEvalParams as EvalRunEvalParams
+from .list_models_response import ListModelsResponse as ListModelsResponse
 from .list_routes_response import ListRoutesResponse as ListRoutesResponse
 from .query_spans_response import QuerySpansResponse as QuerySpansResponse
 from .response_list_params import ResponseListParams as ResponseListParams
@@ -100,7 +100,6 @@
 from .dataset_iterrows_params import DatasetIterrowsParams as DatasetIterrowsParams
 from .dataset_register_params import DatasetRegisterParams as DatasetRegisterParams
 from .embedding_create_params import EmbeddingCreateParams as EmbeddingCreateParams
-from .inference_rerank_params import InferenceRerankParams as InferenceRerankParams
 from .list_providers_response import ListProvidersResponse as ListProvidersResponse
 from .scoring_fn_params_param import ScoringFnParamsParam as ScoringFnParamsParam
 from .toolgroup_list_response import ToolgroupListResponse as ToolgroupListResponse
@@ -119,7 +118,6 @@
 from .dataset_register_response import DatasetRegisterResponse as DatasetRegisterResponse
 from .dataset_retrieve_response import DatasetRetrieveResponse as DatasetRetrieveResponse
 from .eval_evaluate_rows_params import EvalEvaluateRowsParams as EvalEvaluateRowsParams
-from .inference_rerank_response import InferenceRerankResponse as InferenceRerankResponse
 from .list_tool_groups_response import ListToolGroupsResponse as ListToolGroupsResponse
 from .toolgroup_register_params import ToolgroupRegisterParams as ToolgroupRegisterParams
 from .vector_db_register_params import VectorDBRegisterParams as VectorDBRegisterParams
@@ -131,7 +129,6 @@
 from .vector_store_create_params import VectorStoreCreateParams as VectorStoreCreateParams
 from .vector_store_search_params import VectorStoreSearchParams as VectorStoreSearchParams
 from .vector_store_update_params import VectorStoreUpdateParams as VectorStoreUpdateParams
-from .inference_embeddings_params import InferenceEmbeddingsParams as InferenceEmbeddingsParams
 from .list_vector_stores_response import ListVectorStoresResponse as ListVectorStoresResponse
 from .telemetry_get_span_response import TelemetryGetSpanResponse as TelemetryGetSpanResponse
 from .vector_db_register_response import VectorDBRegisterResponse as VectorDBRegisterResponse
diff --git a/src/llama_stack_client/types/embeddings_response.py b/src/llama_stack_client/types/embeddings_response.py
deleted file mode 100644
index f36c6b97..00000000
--- a/src/llama_stack_client/types/embeddings_response.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import List
-
-from .._models import BaseModel
-
-__all__ = ["EmbeddingsResponse"]
-
-
-class EmbeddingsResponse(BaseModel):
-    embeddings: List[List[float]]
-    """List of embedding vectors, one per input content.
-
-    Each embedding is a list of floats. The dimensionality of the embedding is
-    model-specific; you can check model metadata using /models/{model_id}
-    """
diff --git a/src/llama_stack_client/types/file_create_params.py b/src/llama_stack_client/types/file_create_params.py
index a1197ff5..6278e1a0 100644
--- a/src/llama_stack_client/types/file_create_params.py
+++ b/src/llama_stack_client/types/file_create_params.py
@@ -2,8 +2,7 @@
 
 from __future__ import annotations
 
-from typing import Optional
-from typing_extensions import Literal, Required, TypedDict
+from typing_extensions import Required, TypedDict
 
 from .._types import FileTypes
 
@@ -11,11 +10,4 @@
 
 
 class FileCreateParams(TypedDict, total=False):
-    expires_after_anchor: Required[Optional[str]]
-
-    expires_after_seconds: Required[Optional[int]]
-
     file: Required[FileTypes]
-
-    purpose: Required[Literal["assistants", "batch"]]
-    """Valid purpose values for OpenAI Files API."""
diff --git a/src/llama_stack_client/types/inference_embeddings_params.py b/src/llama_stack_client/types/inference_embeddings_params.py
deleted file mode 100644
index a1be545b..00000000
--- a/src/llama_stack_client/types/inference_embeddings_params.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from __future__ import annotations
-
-from typing import Union, Iterable
-from typing_extensions import Literal, Required, TypedDict
-
-from .._types import SequenceNotStr
-from .shared_params.interleaved_content_item import InterleavedContentItem
-
-__all__ = ["InferenceEmbeddingsParams"]
-
-
-class InferenceEmbeddingsParams(TypedDict, total=False):
-    contents: Required[Union[SequenceNotStr[str], Iterable[InterleavedContentItem]]]
-    """List of contents to generate embeddings for.
-
-    Each content can be a string or an InterleavedContentItem (and hence can be
-    multimodal). The behavior depends on the model and provider. Some models may
-    only support text.
-    """
-
-    model_id: Required[str]
-    """The identifier of the model to use.
-
-    The model must be an embedding model registered with Llama Stack and available
-    via the /models endpoint.
-    """
-
-    output_dimension: int
-    """(Optional) Output dimensionality for the embeddings.
-
-    Only supported by Matryoshka models.
-    """
-
-    task_type: Literal["query", "document"]
-    """
-    (Optional) How is the embedding being used? This is only supported by asymmetric
-    embedding models.
-    """
-
-    text_truncation: Literal["none", "start", "end"]
-    """
-    (Optional) Config for how to truncate text for embedding when text is longer
-    than the model's max sequence length.
-    """
diff --git a/src/llama_stack_client/types/inference_rerank_params.py b/src/llama_stack_client/types/inference_rerank_params.py
deleted file mode 100644
index 8f8c4d64..00000000
--- a/src/llama_stack_client/types/inference_rerank_params.py
+++ /dev/null
@@ -1,106 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from __future__ import annotations
-
-from typing import Union
-from typing_extensions import Literal, Required, TypeAlias, TypedDict
-
-from .._types import SequenceNotStr
-
-__all__ = [
-    "InferenceRerankParams",
-    "Item",
-    "ItemOpenAIChatCompletionContentPartTextParam",
-    "ItemOpenAIChatCompletionContentPartImageParam",
-    "ItemOpenAIChatCompletionContentPartImageParamImageURL",
-    "Query",
-    "QueryOpenAIChatCompletionContentPartTextParam",
-    "QueryOpenAIChatCompletionContentPartImageParam",
-    "QueryOpenAIChatCompletionContentPartImageParamImageURL",
-]
-
-
-class InferenceRerankParams(TypedDict, total=False):
-    items: Required[SequenceNotStr[Item]]
-    """List of items to rerank.
-
-    Each item can be a string, text content part, or image content part. Each input
-    must not exceed the model's max input token length.
-    """
-
-    model: Required[str]
-    """The identifier of the reranking model to use."""
-
-    query: Required[Query]
-    """The search query to rank items against.
-
-    Can be a string, text content part, or image content part. The input must not
-    exceed the model's max input token length.
-    """
-
-    max_num_results: int
-    """(Optional) Maximum number of results to return. Default: returns all."""
-
-
-class ItemOpenAIChatCompletionContentPartTextParam(TypedDict, total=False):
-    text: Required[str]
-    """The text content of the message"""
-
-    type: Required[Literal["text"]]
-    """Must be "text" to identify this as text content"""
-
-
-class ItemOpenAIChatCompletionContentPartImageParamImageURL(TypedDict, total=False):
-    url: Required[str]
-    """URL of the image to include in the message"""
-
-    detail: str
-    """(Optional) Level of detail for image processing.
-
-    Can be "low", "high", or "auto"
-    """
-
-
-class ItemOpenAIChatCompletionContentPartImageParam(TypedDict, total=False):
-    image_url: Required[ItemOpenAIChatCompletionContentPartImageParamImageURL]
-    """Image URL specification and processing details"""
-
-    type: Required[Literal["image_url"]]
-    """Must be "image_url" to identify this as image content"""
-
-
-Item: TypeAlias = Union[
-    str, ItemOpenAIChatCompletionContentPartTextParam, ItemOpenAIChatCompletionContentPartImageParam
-]
-
-
-class QueryOpenAIChatCompletionContentPartTextParam(TypedDict, total=False):
-    text: Required[str]
-    """The text content of the message"""
-
-    type: Required[Literal["text"]]
-    """Must be "text" to identify this as text content"""
-
-
-class QueryOpenAIChatCompletionContentPartImageParamImageURL(TypedDict, total=False):
-    url: Required[str]
-    """URL of the image to include in the message"""
-
-    detail: str
-    """(Optional) Level of detail for image processing.
-
-    Can be "low", "high", or "auto"
-    """
-
-
-class QueryOpenAIChatCompletionContentPartImageParam(TypedDict, total=False):
-    image_url: Required[QueryOpenAIChatCompletionContentPartImageParamImageURL]
-    """Image URL specification and processing details"""
-
-    type: Required[Literal["image_url"]]
-    """Must be "image_url" to identify this as image content"""
-
-
-Query: TypeAlias = Union[
-    str, QueryOpenAIChatCompletionContentPartTextParam, QueryOpenAIChatCompletionContentPartImageParam
-]
diff --git a/src/llama_stack_client/types/inference_rerank_response.py b/src/llama_stack_client/types/inference_rerank_response.py
deleted file mode 100644
index e74fc7e6..00000000
--- a/src/llama_stack_client/types/inference_rerank_response.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import List
-from typing_extensions import TypeAlias
-
-from .._models import BaseModel
-
-__all__ = ["InferenceRerankResponse", "InferenceRerankResponseItem"]
-
-
-class InferenceRerankResponseItem(BaseModel):
-    index: int
-    """The original index of the document in the input list"""
-
-    relevance_score: float
-    """The relevance score from the model output.
-
-    Values are inverted when applicable so that higher scores indicate greater
-    relevance.
-    """
-
-
-InferenceRerankResponse: TypeAlias = List[InferenceRerankResponseItem]
diff --git a/src/llama_stack_client/types/list_models_response.py b/src/llama_stack_client/types/list_models_response.py
new file mode 100644
index 00000000..a36896b8
--- /dev/null
+++ b/src/llama_stack_client/types/list_models_response.py
@@ -0,0 +1,10 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from .._models import BaseModel
+from .model_list_response import ModelListResponse
+
+__all__ = ["ListModelsResponse"]
+
+
+class ListModelsResponse(BaseModel):
+    data: ModelListResponse
diff --git a/src/llama_stack_client/types/model_list_response.py b/src/llama_stack_client/types/model_list_response.py
index 7631b69f..905cdb0f 100644
--- a/src/llama_stack_client/types/model_list_response.py
+++ b/src/llama_stack_client/types/model_list_response.py
@@ -1,21 +1,10 @@
 # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
 
 from typing import List
-from typing_extensions import Literal, TypeAlias
+from typing_extensions import TypeAlias
 
-from .._models import BaseModel
+from .model import Model
 
-__all__ = ["ModelListResponse", "ModelListResponseItem"]
+__all__ = ["ModelListResponse"]
 
-
-class ModelListResponseItem(BaseModel):
-    id: str
-
-    created: int
-
-    object: Literal["model"]
-
-    owned_by: str
-
-
-ModelListResponse: TypeAlias = List[ModelListResponseItem]
+ModelListResponse: TypeAlias = List[Model]
diff --git a/src/llama_stack_client/types/models/openai_list_response.py b/src/llama_stack_client/types/models/openai_list_response.py
index f14845d5..5b6c0358 100644
--- a/src/llama_stack_client/types/models/openai_list_response.py
+++ b/src/llama_stack_client/types/models/openai_list_response.py
@@ -1,21 +1,10 @@
 # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
 
 from typing import List
-from typing_extensions import Literal, TypeAlias
+from typing_extensions import TypeAlias
 
-from ..._models import BaseModel
+from ..model import Model
 
-__all__ = ["OpenAIListResponse", "OpenAIListResponseItem"]
+__all__ = ["OpenAIListResponse"]
 
-
-class OpenAIListResponseItem(BaseModel):
-    id: str
-
-    created: int
-
-    object: Literal["model"]
-
-    owned_by: str
-
-
-OpenAIListResponse: TypeAlias = List[OpenAIListResponseItem]
+OpenAIListResponse: TypeAlias = List[Model]
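
Taken together with the `model_list_response.py` change above, both `client.models.list()` and `client.models.openai.list()` now return `ModelListResponse`, a plain `List[Model]`, rather than items carrying the OpenAI-style `id`/`created`/`object`/`owned_by` fields. A sketch of the kind of caller that needs updating; the `identifier` attribute on `Model` is an assumption, since that type is not shown in this diff.

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

models = client.models.list()  # ModelListResponse == List[Model] after this patch

# Code keyed on the old OpenAI-style fields (id, created, object, owned_by) must move
# to the Llama Stack `Model` type; `identifier` here is an assumed field name.
for model in models:
    print(model.identifier)
```
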
diff --git a/src/llama_stack_client/types/response_list_response.py b/src/llama_stack_client/types/response_list_response.py
index ae50d44a..ac7ec1b1 100644
--- a/src/llama_stack_client/types/response_list_response.py
+++ b/src/llama_stack_client/types/response_list_response.py
@@ -570,6 +570,3 @@ class ResponseListResponse(BaseModel):
 
     truncation: Optional[str] = None
     """(Optional) Truncation strategy applied to the response"""
-
-    user: Optional[str] = None
-    """(Optional) User identifier associated with the request"""
diff --git a/src/llama_stack_client/types/response_object.py b/src/llama_stack_client/types/response_object.py
index c0f348a9..b618ddf5 100644
--- a/src/llama_stack_client/types/response_object.py
+++ b/src/llama_stack_client/types/response_object.py
@@ -361,6 +361,3 @@ def output_text(self) -> str:
 
     truncation: Optional[str] = None
     """(Optional) Truncation strategy applied to the response"""
-
-    user: Optional[str] = None
-    """(Optional) User identifier associated with the request"""
diff --git a/tests/api_resources/models/test_openai.py b/tests/api_resources/models/test_openai.py
index ea64cce2..f94d2bf6 100644
--- a/tests/api_resources/models/test_openai.py
+++ b/tests/api_resources/models/test_openai.py
@@ -9,7 +9,7 @@
 
 from tests.utils import assert_matches_type
 from llama_stack_client import LlamaStackClient, AsyncLlamaStackClient
-from llama_stack_client.types.models import OpenAIListResponse
+from llama_stack_client.types import ModelListResponse
 
 base_url = os.environ.get("TEST_API_BASE_URL", "http://127.0.0.1:4010")
 
@@ -20,7 +20,7 @@ class TestOpenAI:
     @parametrize
     def test_method_list(self, client: LlamaStackClient) -> None:
         openai = client.models.openai.list()
-        assert_matches_type(OpenAIListResponse, openai, path=["response"])
+        assert_matches_type(ModelListResponse, openai, path=["response"])
 
     @parametrize
     def test_raw_response_list(self, client: LlamaStackClient) -> None:
@@ -29,7 +29,7 @@ def test_raw_response_list(self, client: LlamaStackClient) -> None:
         assert response.is_closed is True
         assert response.http_request.headers.get("X-Stainless-Lang") == "python"
         openai = response.parse()
-        assert_matches_type(OpenAIListResponse, openai, path=["response"])
+        assert_matches_type(ModelListResponse, openai, path=["response"])
 
     @parametrize
     def test_streaming_response_list(self, client: LlamaStackClient) -> None:
@@ -38,7 +38,7 @@ def test_streaming_response_list(self, client: LlamaStackClient) -> None:
             assert response.http_request.headers.get("X-Stainless-Lang") == "python"
 
             openai = response.parse()
-            assert_matches_type(OpenAIListResponse, openai, path=["response"])
+            assert_matches_type(ModelListResponse, openai, path=["response"])
 
         assert cast(Any, response.is_closed) is True
 
@@ -51,7 +51,7 @@ class TestAsyncOpenAI:
     @parametrize
     async def test_method_list(self, async_client: AsyncLlamaStackClient) -> None:
         openai = await async_client.models.openai.list()
-        assert_matches_type(OpenAIListResponse, openai, path=["response"])
+        assert_matches_type(ModelListResponse, openai, path=["response"])
 
     @parametrize
     async def test_raw_response_list(self, async_client: AsyncLlamaStackClient) -> None:
@@ -60,7 +60,7 @@ async def test_raw_response_list(self, async_client: AsyncLlamaStackClient) -> N
         assert response.is_closed is True
         assert response.http_request.headers.get("X-Stainless-Lang") == "python"
         openai = await response.parse()
-        assert_matches_type(OpenAIListResponse, openai, path=["response"])
+        assert_matches_type(ModelListResponse, openai, path=["response"])
 
     @parametrize
     async def test_streaming_response_list(self, async_client: AsyncLlamaStackClient) -> None:
@@ -69,6 +69,6 @@ async def test_streaming_response_list(self, async_client: AsyncLlamaStackClient
             assert response.http_request.headers.get("X-Stainless-Lang") == "python"
 
             openai = await response.parse()
-            assert_matches_type(OpenAIListResponse, openai, path=["response"])
+            assert_matches_type(ModelListResponse, openai, path=["response"])
 
         assert cast(Any, response.is_closed) is True
diff --git a/tests/api_resources/test_files.py b/tests/api_resources/test_files.py
index f2bc1e0a..bdf81d4f 100644
--- a/tests/api_resources/test_files.py
+++ b/tests/api_resources/test_files.py
@@ -21,20 +21,14 @@ class TestFiles:
     @parametrize
     def test_method_create(self, client: LlamaStackClient) -> None:
         file = client.files.create(
-            expires_after_anchor="expires_after_anchor",
-            expires_after_seconds=0,
             file=b"raw file contents",
-            purpose="assistants",
         )
         assert_matches_type(File, file, path=["response"])
 
     @parametrize
     def test_raw_response_create(self, client: LlamaStackClient) -> None:
         response = client.files.with_raw_response.create(
-            expires_after_anchor="expires_after_anchor",
-            expires_after_seconds=0,
             file=b"raw file contents",
-            purpose="assistants",
         )
 
         assert response.is_closed is True
@@ -45,10 +39,7 @@ def test_raw_response_create(self, client: LlamaStackClient) -> None:
     @parametrize
     def test_streaming_response_create(self, client: LlamaStackClient) -> None:
         with client.files.with_streaming_response.create(
-            expires_after_anchor="expires_after_anchor",
-            expires_after_seconds=0,
             file=b"raw file contents",
-            purpose="assistants",
         ) as response:
             assert not response.is_closed
             assert response.http_request.headers.get("X-Stainless-Lang") == "python"
@@ -216,20 +207,14 @@ class TestAsyncFiles:
     @parametrize
     async def test_method_create(self, async_client: AsyncLlamaStackClient) -> None:
         file = await async_client.files.create(
-            expires_after_anchor="expires_after_anchor",
-            expires_after_seconds=0,
             file=b"raw file contents",
-            purpose="assistants",
         )
         assert_matches_type(File, file, path=["response"])
 
     @parametrize
     async def test_raw_response_create(self, async_client: AsyncLlamaStackClient) -> None:
         response = await async_client.files.with_raw_response.create(
-            expires_after_anchor="expires_after_anchor",
-            expires_after_seconds=0,
             file=b"raw file contents",
-            purpose="assistants",
         )
 
         assert response.is_closed is True
@@ -240,10 +225,7 @@ async def test_raw_response_create(self, async_client: AsyncLlamaStackClient) ->
     @parametrize
     async def test_streaming_response_create(self, async_client: AsyncLlamaStackClient) -> None:
         async with async_client.files.with_streaming_response.create(
-            expires_after_anchor="expires_after_anchor",
-            expires_after_seconds=0,
             file=b"raw file contents",
-            purpose="assistants",
         ) as response:
             assert not response.is_closed
             assert response.http_request.headers.get("X-Stainless-Lang") == "python"
diff --git a/tests/api_resources/test_inference.py b/tests/api_resources/test_inference.py
index 6e952637..6fc8040b 100644
--- a/tests/api_resources/test_inference.py
+++ b/tests/api_resources/test_inference.py
@@ -9,10 +9,6 @@
 
 from tests.utils import assert_matches_type
 from llama_stack_client import LlamaStackClient, AsyncLlamaStackClient
-from llama_stack_client.types import (
-    EmbeddingsResponse,
-    InferenceRerankResponse,
-)
 from llama_stack_client.types.shared import ChatCompletionResponse
 
 # pyright: reportDeprecated=false
@@ -233,104 +229,6 @@ def test_streaming_response_chat_completion_overload_2(self, client: LlamaStackC
 
         assert cast(Any, response.is_closed) is True
 
-    @parametrize
-    def test_method_embeddings(self, client: LlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            inference = client.inference.embeddings(
-                contents=["string"],
-                model_id="model_id",
-            )
-
-        assert_matches_type(EmbeddingsResponse, inference, path=["response"])
-
-    @parametrize
-    def test_method_embeddings_with_all_params(self, client: LlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            inference = client.inference.embeddings(
-                contents=["string"],
-                model_id="model_id",
-                output_dimension=0,
-                task_type="query",
-                text_truncation="none",
-            )
-
-        assert_matches_type(EmbeddingsResponse, inference, path=["response"])
-
-    @parametrize
-    def test_raw_response_embeddings(self, client: LlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            response = client.inference.with_raw_response.embeddings(
-                contents=["string"],
-                model_id="model_id",
-            )
-
-        assert response.is_closed is True
-        assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-        inference = response.parse()
-        assert_matches_type(EmbeddingsResponse, inference, path=["response"])
-
-    @parametrize
-    def test_streaming_response_embeddings(self, client: LlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            with client.inference.with_streaming_response.embeddings(
-                contents=["string"],
-                model_id="model_id",
-            ) as response:
-                assert not response.is_closed
-                assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
-                inference = response.parse()
-                assert_matches_type(EmbeddingsResponse, inference, path=["response"])
-
-        assert cast(Any, response.is_closed) is True
-
-    @parametrize
-    def test_method_rerank(self, client: LlamaStackClient) -> None:
-        inference = client.inference.rerank(
-            items=["string"],
-            model="model",
-            query="string",
-        )
-        assert_matches_type(InferenceRerankResponse, inference, path=["response"])
-
-    @parametrize
-    def test_method_rerank_with_all_params(self, client: LlamaStackClient) -> None:
-        inference = client.inference.rerank(
-            items=["string"],
-            model="model",
-            query="string",
-            max_num_results=0,
-        )
-        assert_matches_type(InferenceRerankResponse, inference, path=["response"])
-
-    @parametrize
-    def test_raw_response_rerank(self, client: LlamaStackClient) -> None:
-        response = client.inference.with_raw_response.rerank(
-            items=["string"],
-            model="model",
-            query="string",
-        )
-
-        assert response.is_closed is True
-        assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-        inference = response.parse()
-        assert_matches_type(InferenceRerankResponse, inference, path=["response"])
-
-    @parametrize
-    def test_streaming_response_rerank(self, client: LlamaStackClient) -> None:
-        with client.inference.with_streaming_response.rerank(
-            items=["string"],
-            model="model",
-            query="string",
-        ) as response:
-            assert not response.is_closed
-            assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
-            inference = response.parse()
-            assert_matches_type(InferenceRerankResponse, inference, path=["response"])
-
-        assert cast(Any, response.is_closed) is True
-
 
 class TestAsyncInference:
     parametrize = pytest.mark.parametrize(
@@ -546,101 +444,3 @@ async def test_streaming_response_chat_completion_overload_2(self, async_client:
                 await stream.close()
 
         assert cast(Any, response.is_closed) is True
-
-    @parametrize
-    async def test_method_embeddings(self, async_client: AsyncLlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            inference = await async_client.inference.embeddings(
-                contents=["string"],
-                model_id="model_id",
-            )
-
-        assert_matches_type(EmbeddingsResponse, inference, path=["response"])
-
-    @parametrize
-    async def test_method_embeddings_with_all_params(self, async_client: AsyncLlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            inference = await async_client.inference.embeddings(
-                contents=["string"],
-                model_id="model_id",
-                output_dimension=0,
-                task_type="query",
-                text_truncation="none",
-            )
-
-        assert_matches_type(EmbeddingsResponse, inference, path=["response"])
-
-    @parametrize
-    async def test_raw_response_embeddings(self, async_client: AsyncLlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            response = await async_client.inference.with_raw_response.embeddings(
-                contents=["string"],
-                model_id="model_id",
-            )
-
-        assert response.is_closed is True
-        assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-        inference = await response.parse()
-        assert_matches_type(EmbeddingsResponse, inference, path=["response"])
-
-    @parametrize
-    async def test_streaming_response_embeddings(self, async_client: AsyncLlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            async with async_client.inference.with_streaming_response.embeddings(
-                contents=["string"],
-                model_id="model_id",
-            ) as response:
-                assert not response.is_closed
-                assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
-                inference = await response.parse()
-                assert_matches_type(EmbeddingsResponse, inference, path=["response"])
-
-        assert cast(Any, response.is_closed) is True
-
-    @parametrize
-    async def test_method_rerank(self, async_client: AsyncLlamaStackClient) -> None:
-        inference = await async_client.inference.rerank(
-            items=["string"],
-            model="model",
-            query="string",
-        )
-        assert_matches_type(InferenceRerankResponse, inference, path=["response"])
-
-    @parametrize
-    async def test_method_rerank_with_all_params(self, async_client: AsyncLlamaStackClient) -> None:
-        inference = await async_client.inference.rerank(
-            items=["string"],
-            model="model",
-            query="string",
-            max_num_results=0,
-        )
-        assert_matches_type(InferenceRerankResponse, inference, path=["response"])
-
-    @parametrize
-    async def test_raw_response_rerank(self, async_client: AsyncLlamaStackClient) -> None:
-        response = await async_client.inference.with_raw_response.rerank(
-            items=["string"],
-            model="model",
-            query="string",
-        )
-
-        assert response.is_closed is True
-        assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-        inference = await response.parse()
-        assert_matches_type(InferenceRerankResponse, inference, path=["response"])
-
-    @parametrize
-    async def test_streaming_response_rerank(self, async_client: AsyncLlamaStackClient) -> None:
-        async with async_client.inference.with_streaming_response.rerank(
-            items=["string"],
-            model="model",
-            query="string",
-        ) as response:
-            assert not response.is_closed
-            assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
-            inference = await response.parse()
-            assert_matches_type(InferenceRerankResponse, inference, path=["response"])
-
-        assert cast(Any, response.is_closed) is True
From 04834d2189ae4e4b8cd2c9370d1d39857bc6e9ec Mon Sep 17 00:00:00 2001
From: "stainless-app[bot]"
 <142633134+stainless-app[bot]@users.noreply.github.com>
Date: Tue, 30 Sep 2025 02:00:14 +0000
Subject: [PATCH 3/8] feat(api)!: fixes to remove deprecated inference
 resources
---
 .stats.yml                                    |   2 +-
 README.md                                     |  43 +-
 api.md                                        |   4 +-
 src/llama_stack_client/resources/inference.py | 546 +++---------------
 src/llama_stack_client/types/__init__.py      |   7 +-
 .../chat_completion_response_stream_chunk.py  |  36 --
 .../types/inference_chat_completion_params.py | 134 -----
 .../types/inference_rerank_params.py          | 106 ++++
 .../types/inference_rerank_response.py        |  23 +
 .../types/shared/chat_completion_response.py  |  12 +-
 .../types/shared_params/__init__.py           |   1 -
 .../shared_params/tool_param_definition.py    |  22 -
 .../types/token_log_probs.py                  |  12 -
 tests/api_resources/test_inference.py         | 458 +++------------
 tests/test_client.py                          | 132 +----
 15 files changed, 308 insertions(+), 1230 deletions(-)
 delete mode 100644 src/llama_stack_client/types/chat_completion_response_stream_chunk.py
 delete mode 100644 src/llama_stack_client/types/inference_chat_completion_params.py
 create mode 100644 src/llama_stack_client/types/inference_rerank_params.py
 create mode 100644 src/llama_stack_client/types/inference_rerank_response.py
 delete mode 100644 src/llama_stack_client/types/shared_params/tool_param_definition.py
 delete mode 100644 src/llama_stack_client/types/token_log_probs.py
diff --git a/.stats.yml b/.stats.yml
index 016bf7b6..ed589610 100644
--- a/.stats.yml
+++ b/.stats.yml
@@ -1,4 +1,4 @@
 configured_endpoints: 105
 openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/llamastack%2Fllama-stack-client-adcfaad1990d45e42b20e200a9ecc35ee32df5692bd9cd18ae898b0b7728c919.yml
 openapi_spec_hash: 4f532287bafe5da0578a1c1a5e31c952
-config_hash: 7ec5a583f9c26b38993013bdfb0e7d46
+config_hash: 5b643c97c83a497d7d346253f1e175f3
diff --git a/README.md b/README.md
index d448f59d..75857f1d 100644
--- a/README.md
+++ b/README.md
@@ -127,17 +127,11 @@ from llama_stack_client import LlamaStackClient
 
 client = LlamaStackClient()
 
-chat_completion_response = client.inference.chat_completion(
-    messages=[
-        {
-            "content": "string",
-            "role": "user",
-        }
-    ],
-    model_id="model_id",
-    logprobs={},
+client.toolgroups.register(
+    provider_id="provider_id",
+    toolgroup_id="toolgroup_id",
+    mcp_endpoint={"uri": "uri"},
 )
-print(chat_completion_response.logprobs)
 ```
 
 ## File uploads
@@ -173,10 +167,7 @@ from llama_stack_client import LlamaStackClient
 client = LlamaStackClient()
 
 try:
-    client.agents.sessions.create(
-        agent_id="agent_id",
-        session_name="session_name",
-    )
+    client.toolgroups.list()
 except llama_stack_client.APIConnectionError as e:
     print("The server could not be reached")
     print(e.__cause__)  # an underlying Exception, likely raised within httpx.
@@ -219,10 +210,7 @@ client = LlamaStackClient(
 )
 
 # Or, configure per-request:
-client.with_options(max_retries=5).agents.sessions.create(
-    agent_id="agent_id",
-    session_name="session_name",
-)
+client.with_options(max_retries=5).toolgroups.list()
 ```
 
 ### Timeouts
@@ -245,10 +233,7 @@ client = LlamaStackClient(
 )
 
 # Override per-request:
-client.with_options(timeout=5.0).agents.sessions.create(
-    agent_id="agent_id",
-    session_name="session_name",
-)
+client.with_options(timeout=5.0).toolgroups.list()
 ```
 
 On timeout, an `APITimeoutError` is thrown.
@@ -287,14 +272,11 @@ The "raw" Response object can be accessed by prefixing `.with_raw_response.` to
 from llama_stack_client import LlamaStackClient
 
 client = LlamaStackClient()
-response = client.agents.sessions.with_raw_response.create(
-    agent_id="agent_id",
-    session_name="session_name",
-)
+response = client.toolgroups.with_raw_response.list()
 print(response.headers.get('X-My-Header'))
 
-session = response.parse()  # get the object that `agents.sessions.create()` would have returned
-print(session.session_id)
+toolgroup = response.parse()  # get the object that `toolgroups.list()` would have returned
+print(toolgroup)
 ```
 
 These methods return an [`APIResponse`](https://github.com/meta-llama/llama-stack-python/tree/main/src/llama_stack_client/_response.py) object.
@@ -308,10 +290,7 @@ The above interface eagerly reads the full response body when you make the reque
 To stream the response body, use `.with_streaming_response` instead, which requires a context manager and only reads the response body once you call `.read()`, `.text()`, `.json()`, `.iter_bytes()`, `.iter_text()`, `.iter_lines()` or `.parse()`. In the async client, these are async methods.
 
 ```python
-with client.agents.sessions.with_streaming_response.create(
-    agent_id="agent_id",
-    session_name="session_name",
-) as response:
+with client.toolgroups.with_streaming_response.list() as response:
     print(response.headers.get("X-My-Header"))
 
     for line in response.iter_lines():
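
Note on the removed README snippet above: the deprecation notices later in this patch point callers at `/v1/chat/completions`. A minimal migration sketch, assuming the SDK exposes an OpenAI-compatible `client.chat.completions.create(...)` (that method path is an assumption, not taken from this patch):

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

# Assumed OpenAI-compatible replacement for the removed
# client.inference.chat_completion(...) example; "model_id" is a placeholder.
completion = client.chat.completions.create(
    model="model_id",
    messages=[{"role": "user", "content": "string"}],
)
print(completion)
```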
diff --git a/api.md b/api.md
index ad4e635c..85e5a178 100644
--- a/api.md
+++ b/api.md
@@ -241,12 +241,12 @@ Methods:
 Types:
 
 ```python
-from llama_stack_client.types import ChatCompletionResponseStreamChunk, TokenLogProbs
+from llama_stack_client.types import InferenceRerankResponse
 ```
 
 Methods:
 
-- client.inference.chat_completion(\*\*params) -> ChatCompletionResponse
+- client.inference.rerank(\*\*params) -> InferenceRerankResponse
 
 # Embeddings
 
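A short usage sketch of the `client.inference.rerank(**params)` method documented above; the model identifier and inputs are placeholders:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

# "my-reranker" is a placeholder; use a reranking model available on your stack.
results = client.inference.rerank(
    model="my-reranker",
    query="What is the capital of France?",
    items=["Paris is the capital of France.", "Berlin is the capital of Germany."],
    max_num_results=1,
)
for item in results:
    print(item.index, item.relevance_score)
```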
diff --git a/src/llama_stack_client/resources/inference.py b/src/llama_stack_client/resources/inference.py
index bac5cb3e..5c022b0f 100644
--- a/src/llama_stack_client/resources/inference.py
+++ b/src/llama_stack_client/resources/inference.py
@@ -2,15 +2,13 @@
 
 from __future__ import annotations
 
-import typing_extensions
-from typing import Iterable
-from typing_extensions import Literal, overload
+from typing import Type, cast
 
 import httpx
 
-from ..types import inference_chat_completion_params
-from .._types import Body, Omit, Query, Headers, NotGiven, omit, not_given
-from .._utils import required_args, maybe_transform, async_maybe_transform
+from ..types import inference_rerank_params
+from .._types import Body, Omit, Query, Headers, NotGiven, SequenceNotStr, omit, not_given
+from .._utils import maybe_transform, async_maybe_transform
 from .._compat import cached_property
 from .._resource import SyncAPIResource, AsyncAPIResource
 from .._response import (
@@ -19,13 +17,9 @@
     async_to_raw_response_wrapper,
     async_to_streamed_response_wrapper,
 )
-from .._streaming import Stream, AsyncStream
+from .._wrappers import DataWrapper
 from .._base_client import make_request_options
-from ..types.shared_params.message import Message
-from ..types.shared_params.response_format import ResponseFormat
-from ..types.shared_params.sampling_params import SamplingParams
-from ..types.shared.chat_completion_response import ChatCompletionResponse
-from ..types.chat_completion_response_stream_chunk import ChatCompletionResponseStreamChunk
+from ..types.inference_rerank_response import InferenceRerankResponse
 
 __all__ = ["InferenceResource", "AsyncInferenceResource"]
 
@@ -50,64 +44,34 @@ def with_streaming_response(self) -> InferenceResourceWithStreamingResponse:
         """
         return InferenceResourceWithStreamingResponse(self)
 
-    @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.")
-    @overload
-    def chat_completion(
+    def rerank(
         self,
         *,
-        messages: Iterable[Message],
-        model_id: str,
-        logprobs: inference_chat_completion_params.Logprobs | Omit = omit,
-        response_format: ResponseFormat | Omit = omit,
-        sampling_params: SamplingParams | Omit = omit,
-        stream: Literal[False] | Omit = omit,
-        tool_choice: Literal["auto", "required", "none"] | Omit = omit,
-        tool_config: inference_chat_completion_params.ToolConfig | Omit = omit,
-        tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit,
-        tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit,
+        items: SequenceNotStr[inference_rerank_params.Item],
+        model: str,
+        query: inference_rerank_params.Query,
+        max_num_results: int | Omit = omit,
         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
         # The extra values given here take precedence over values defined on the client or passed to this method.
         extra_headers: Headers | None = None,
         extra_query: Query | None = None,
         extra_body: Body | None = None,
         timeout: float | httpx.Timeout | None | NotGiven = not_given,
-    ) -> ChatCompletionResponse:
+    ) -> InferenceRerankResponse:
         """
-        Generate a chat completion for the given messages using the specified model.
+        Rerank a list of documents based on their relevance to a query.
 
         Args:
-          messages: List of messages in the conversation.
-
-          model_id: The identifier of the model to use. The model must be registered with Llama
-              Stack and available via the /models endpoint.
-
-          logprobs: (Optional) If specified, log probabilities for each token position will be
-              returned.
-
-          response_format: (Optional) Grammar specification for guided (structured) decoding. There are two
-              options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most
-              providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF
-              grammar. This format is more flexible, but not all providers support it.
-
-          sampling_params: Parameters to control the sampling strategy.
-
-          stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
-              False.
+          items: List of items to rerank. Each item can be a string, text content part, or image
+              content part. Each input must not exceed the model's max input token length.
 
-          tool_choice: (Optional) Whether tool use is required or automatic. Defaults to
-              ToolChoice.auto. .. deprecated:: Use tool_config instead.
+          model: The identifier of the reranking model to use.
 
-          tool_config: (Optional) Configuration for tool use.
+          query: The search query to rank items against. Can be a string, text content part, or
+              image content part. The input must not exceed the model's max input token
+              length.
 
-          tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack
-              will attempt to use a format that is best adapted to the model. -
-              `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. -
-              `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
-               tag. - `ToolPromptFormat.python_list`: The tool calls
-              are output as Python syntax -- a list of function calls. .. deprecated:: Use
-              tool_config instead.
-
-          tools: (Optional) List of tool definitions available to the model.
+          max_num_results: (Optional) Maximum number of results to return. Default: returns all.
 
           extra_headers: Send extra headers
 
@@ -117,195 +81,25 @@ def chat_completion(
 
           timeout: Override the client-level default timeout for this request, in seconds
         """
-        ...
-
-    @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.")
-    @overload
-    def chat_completion(
-        self,
-        *,
-        messages: Iterable[Message],
-        model_id: str,
-        stream: Literal[True],
-        logprobs: inference_chat_completion_params.Logprobs | Omit = omit,
-        response_format: ResponseFormat | Omit = omit,
-        sampling_params: SamplingParams | Omit = omit,
-        tool_choice: Literal["auto", "required", "none"] | Omit = omit,
-        tool_config: inference_chat_completion_params.ToolConfig | Omit = omit,
-        tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit,
-        tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit,
-        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
-        # The extra values given here take precedence over values defined on the client or passed to this method.
-        extra_headers: Headers | None = None,
-        extra_query: Query | None = None,
-        extra_body: Body | None = None,
-        timeout: float | httpx.Timeout | None | NotGiven = not_given,
-    ) -> Stream[ChatCompletionResponseStreamChunk]:
-        """
-        Generate a chat completion for the given messages using the specified model.
-
-        Args:
-          messages: List of messages in the conversation.
-
-          model_id: The identifier of the model to use. The model must be registered with Llama
-              Stack and available via the /models endpoint.
-
-          stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
-              False.
-
-          logprobs: (Optional) If specified, log probabilities for each token position will be
-              returned.
-
-          response_format: (Optional) Grammar specification for guided (structured) decoding. There are two
-              options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most
-              providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF
-              grammar. This format is more flexible, but not all providers support it.
-
-          sampling_params: Parameters to control the sampling strategy.
-
-          tool_choice: (Optional) Whether tool use is required or automatic. Defaults to
-              ToolChoice.auto. .. deprecated:: Use tool_config instead.
-
-          tool_config: (Optional) Configuration for tool use.
-
-          tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack
-              will attempt to use a format that is best adapted to the model. -
-              `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. -
-              `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
-               tag. - `ToolPromptFormat.python_list`: The tool calls
-              are output as Python syntax -- a list of function calls. .. deprecated:: Use
-              tool_config instead.
-
-          tools: (Optional) List of tool definitions available to the model.
-
-          extra_headers: Send extra headers
-
-          extra_query: Add additional query parameters to the request
-
-          extra_body: Add additional JSON properties to the request
-
-          timeout: Override the client-level default timeout for this request, in seconds
-        """
-        ...
-
-    @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.")
-    @overload
-    def chat_completion(
-        self,
-        *,
-        messages: Iterable[Message],
-        model_id: str,
-        stream: bool,
-        logprobs: inference_chat_completion_params.Logprobs | Omit = omit,
-        response_format: ResponseFormat | Omit = omit,
-        sampling_params: SamplingParams | Omit = omit,
-        tool_choice: Literal["auto", "required", "none"] | Omit = omit,
-        tool_config: inference_chat_completion_params.ToolConfig | Omit = omit,
-        tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit,
-        tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit,
-        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
-        # The extra values given here take precedence over values defined on the client or passed to this method.
-        extra_headers: Headers | None = None,
-        extra_query: Query | None = None,
-        extra_body: Body | None = None,
-        timeout: float | httpx.Timeout | None | NotGiven = not_given,
-    ) -> ChatCompletionResponse | Stream[ChatCompletionResponseStreamChunk]:
-        """
-        Generate a chat completion for the given messages using the specified model.
-
-        Args:
-          messages: List of messages in the conversation.
-
-          model_id: The identifier of the model to use. The model must be registered with Llama
-              Stack and available via the /models endpoint.
-
-          stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
-              False.
-
-          logprobs: (Optional) If specified, log probabilities for each token position will be
-              returned.
-
-          response_format: (Optional) Grammar specification for guided (structured) decoding. There are two
-              options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most
-              providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF
-              grammar. This format is more flexible, but not all providers support it.
-
-          sampling_params: Parameters to control the sampling strategy.
-
-          tool_choice: (Optional) Whether tool use is required or automatic. Defaults to
-              ToolChoice.auto. .. deprecated:: Use tool_config instead.
-
-          tool_config: (Optional) Configuration for tool use.
-
-          tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack
-              will attempt to use a format that is best adapted to the model. -
-              `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. -
-              `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
-               tag. - `ToolPromptFormat.python_list`: The tool calls
-              are output as Python syntax -- a list of function calls. .. deprecated:: Use
-              tool_config instead.
-
-          tools: (Optional) List of tool definitions available to the model.
-
-          extra_headers: Send extra headers
-
-          extra_query: Add additional query parameters to the request
-
-          extra_body: Add additional JSON properties to the request
-
-          timeout: Override the client-level default timeout for this request, in seconds
-        """
-        ...
-
-    @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.")
-    @required_args(["messages", "model_id"], ["messages", "model_id", "stream"])
-    def chat_completion(
-        self,
-        *,
-        messages: Iterable[Message],
-        model_id: str,
-        logprobs: inference_chat_completion_params.Logprobs | Omit = omit,
-        response_format: ResponseFormat | Omit = omit,
-        sampling_params: SamplingParams | Omit = omit,
-        stream: Literal[False] | Literal[True] | Omit = omit,
-        tool_choice: Literal["auto", "required", "none"] | Omit = omit,
-        tool_config: inference_chat_completion_params.ToolConfig | Omit = omit,
-        tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit,
-        tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit,
-        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
-        # The extra values given here take precedence over values defined on the client or passed to this method.
-        extra_headers: Headers | None = None,
-        extra_query: Query | None = None,
-        extra_body: Body | None = None,
-        timeout: float | httpx.Timeout | None | NotGiven = not_given,
-    ) -> ChatCompletionResponse | Stream[ChatCompletionResponseStreamChunk]:
-        if stream:
-            extra_headers = {"Accept": "text/event-stream", **(extra_headers or {})}
         return self._post(
-            "/v1/inference/chat-completion",
+            "/v1alpha/inference/rerank",
             body=maybe_transform(
                 {
-                    "messages": messages,
-                    "model_id": model_id,
-                    "logprobs": logprobs,
-                    "response_format": response_format,
-                    "sampling_params": sampling_params,
-                    "stream": stream,
-                    "tool_choice": tool_choice,
-                    "tool_config": tool_config,
-                    "tool_prompt_format": tool_prompt_format,
-                    "tools": tools,
+                    "items": items,
+                    "model": model,
+                    "query": query,
+                    "max_num_results": max_num_results,
                 },
-                inference_chat_completion_params.InferenceChatCompletionParamsStreaming
-                if stream
-                else inference_chat_completion_params.InferenceChatCompletionParamsNonStreaming,
+                inference_rerank_params.InferenceRerankParams,
             ),
             options=make_request_options(
-                extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
+                extra_headers=extra_headers,
+                extra_query=extra_query,
+                extra_body=extra_body,
+                timeout=timeout,
+                post_parser=DataWrapper[InferenceRerankResponse]._unwrapper,
             ),
-            cast_to=ChatCompletionResponse,
-            stream=stream or False,
-            stream_cls=Stream[ChatCompletionResponseStreamChunk],
+            cast_to=cast(Type[InferenceRerankResponse], DataWrapper[InferenceRerankResponse]),
         )
 
     @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.")
@@ -514,64 +308,34 @@ def with_streaming_response(self) -> AsyncInferenceResourceWithStreamingResponse
         """
         return AsyncInferenceResourceWithStreamingResponse(self)
 
-    @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.")
-    @overload
-    async def chat_completion(
+    async def rerank(
         self,
         *,
-        messages: Iterable[Message],
-        model_id: str,
-        logprobs: inference_chat_completion_params.Logprobs | Omit = omit,
-        response_format: ResponseFormat | Omit = omit,
-        sampling_params: SamplingParams | Omit = omit,
-        stream: Literal[False] | Omit = omit,
-        tool_choice: Literal["auto", "required", "none"] | Omit = omit,
-        tool_config: inference_chat_completion_params.ToolConfig | Omit = omit,
-        tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit,
-        tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit,
+        items: SequenceNotStr[inference_rerank_params.Item],
+        model: str,
+        query: inference_rerank_params.Query,
+        max_num_results: int | Omit = omit,
         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
         # The extra values given here take precedence over values defined on the client or passed to this method.
         extra_headers: Headers | None = None,
         extra_query: Query | None = None,
         extra_body: Body | None = None,
         timeout: float | httpx.Timeout | None | NotGiven = not_given,
-    ) -> ChatCompletionResponse:
+    ) -> InferenceRerankResponse:
         """
-        Generate a chat completion for the given messages using the specified model.
+        Rerank a list of documents based on their relevance to a query.
 
         Args:
-          messages: List of messages in the conversation.
-
-          model_id: The identifier of the model to use. The model must be registered with Llama
-              Stack and available via the /models endpoint.
-
-          logprobs: (Optional) If specified, log probabilities for each token position will be
-              returned.
-
-          response_format: (Optional) Grammar specification for guided (structured) decoding. There are two
-              options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most
-              providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF
-              grammar. This format is more flexible, but not all providers support it.
-
-          sampling_params: Parameters to control the sampling strategy.
-
-          stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
-              False.
+          items: List of items to rerank. Each item can be a string, text content part, or image
+              content part. Each input must not exceed the model's max input token length.
 
-          tool_choice: (Optional) Whether tool use is required or automatic. Defaults to
-              ToolChoice.auto. .. deprecated:: Use tool_config instead.
+          model: The identifier of the reranking model to use.
 
-          tool_config: (Optional) Configuration for tool use.
+          query: The search query to rank items against. Can be a string, text content part, or
+              image content part. The input must not exceed the model's max input token
+              length.
 
-          tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack
-              will attempt to use a format that is best adapted to the model. -
-              `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. -
-              `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
-               tag. - `ToolPromptFormat.python_list`: The tool calls
-              are output as Python syntax -- a list of function calls. .. deprecated:: Use
-              tool_config instead.
-
-          tools: (Optional) List of tool definitions available to the model.
+          max_num_results: (Optional) Maximum number of results to return. Default: returns all.
 
           extra_headers: Send extra headers
 
@@ -581,195 +345,25 @@ async def chat_completion(
 
           timeout: Override the client-level default timeout for this request, in seconds
         """
-        ...
-
-    @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.")
-    @overload
-    async def chat_completion(
-        self,
-        *,
-        messages: Iterable[Message],
-        model_id: str,
-        stream: Literal[True],
-        logprobs: inference_chat_completion_params.Logprobs | Omit = omit,
-        response_format: ResponseFormat | Omit = omit,
-        sampling_params: SamplingParams | Omit = omit,
-        tool_choice: Literal["auto", "required", "none"] | Omit = omit,
-        tool_config: inference_chat_completion_params.ToolConfig | Omit = omit,
-        tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit,
-        tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit,
-        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
-        # The extra values given here take precedence over values defined on the client or passed to this method.
-        extra_headers: Headers | None = None,
-        extra_query: Query | None = None,
-        extra_body: Body | None = None,
-        timeout: float | httpx.Timeout | None | NotGiven = not_given,
-    ) -> AsyncStream[ChatCompletionResponseStreamChunk]:
-        """
-        Generate a chat completion for the given messages using the specified model.
-
-        Args:
-          messages: List of messages in the conversation.
-
-          model_id: The identifier of the model to use. The model must be registered with Llama
-              Stack and available via the /models endpoint.
-
-          stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
-              False.
-
-          logprobs: (Optional) If specified, log probabilities for each token position will be
-              returned.
-
-          response_format: (Optional) Grammar specification for guided (structured) decoding. There are two
-              options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most
-              providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF
-              grammar. This format is more flexible, but not all providers support it.
-
-          sampling_params: Parameters to control the sampling strategy.
-
-          tool_choice: (Optional) Whether tool use is required or automatic. Defaults to
-              ToolChoice.auto. .. deprecated:: Use tool_config instead.
-
-          tool_config: (Optional) Configuration for tool use.
-
-          tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack
-              will attempt to use a format that is best adapted to the model. -
-              `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. -
-              `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
-               tag. - `ToolPromptFormat.python_list`: The tool calls
-              are output as Python syntax -- a list of function calls. .. deprecated:: Use
-              tool_config instead.
-
-          tools: (Optional) List of tool definitions available to the model.
-
-          extra_headers: Send extra headers
-
-          extra_query: Add additional query parameters to the request
-
-          extra_body: Add additional JSON properties to the request
-
-          timeout: Override the client-level default timeout for this request, in seconds
-        """
-        ...
-
-    @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.")
-    @overload
-    async def chat_completion(
-        self,
-        *,
-        messages: Iterable[Message],
-        model_id: str,
-        stream: bool,
-        logprobs: inference_chat_completion_params.Logprobs | Omit = omit,
-        response_format: ResponseFormat | Omit = omit,
-        sampling_params: SamplingParams | Omit = omit,
-        tool_choice: Literal["auto", "required", "none"] | Omit = omit,
-        tool_config: inference_chat_completion_params.ToolConfig | Omit = omit,
-        tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit,
-        tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit,
-        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
-        # The extra values given here take precedence over values defined on the client or passed to this method.
-        extra_headers: Headers | None = None,
-        extra_query: Query | None = None,
-        extra_body: Body | None = None,
-        timeout: float | httpx.Timeout | None | NotGiven = not_given,
-    ) -> ChatCompletionResponse | AsyncStream[ChatCompletionResponseStreamChunk]:
-        """
-        Generate a chat completion for the given messages using the specified model.
-
-        Args:
-          messages: List of messages in the conversation.
-
-          model_id: The identifier of the model to use. The model must be registered with Llama
-              Stack and available via the /models endpoint.
-
-          stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
-              False.
-
-          logprobs: (Optional) If specified, log probabilities for each token position will be
-              returned.
-
-          response_format: (Optional) Grammar specification for guided (structured) decoding. There are two
-              options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most
-              providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF
-              grammar. This format is more flexible, but not all providers support it.
-
-          sampling_params: Parameters to control the sampling strategy.
-
-          tool_choice: (Optional) Whether tool use is required or automatic. Defaults to
-              ToolChoice.auto. .. deprecated:: Use tool_config instead.
-
-          tool_config: (Optional) Configuration for tool use.
-
-          tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack
-              will attempt to use a format that is best adapted to the model. -
-              `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. -
-              `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
-               tag. - `ToolPromptFormat.python_list`: The tool calls
-              are output as Python syntax -- a list of function calls. .. deprecated:: Use
-              tool_config instead.
-
-          tools: (Optional) List of tool definitions available to the model.
-
-          extra_headers: Send extra headers
-
-          extra_query: Add additional query parameters to the request
-
-          extra_body: Add additional JSON properties to the request
-
-          timeout: Override the client-level default timeout for this request, in seconds
-        """
-        ...
-
-    @typing_extensions.deprecated("/v1/inference/chat-completion is deprecated. Please use /v1/chat/completions.")
-    @required_args(["messages", "model_id"], ["messages", "model_id", "stream"])
-    async def chat_completion(
-        self,
-        *,
-        messages: Iterable[Message],
-        model_id: str,
-        logprobs: inference_chat_completion_params.Logprobs | Omit = omit,
-        response_format: ResponseFormat | Omit = omit,
-        sampling_params: SamplingParams | Omit = omit,
-        stream: Literal[False] | Literal[True] | Omit = omit,
-        tool_choice: Literal["auto", "required", "none"] | Omit = omit,
-        tool_config: inference_chat_completion_params.ToolConfig | Omit = omit,
-        tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit,
-        tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit,
-        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
-        # The extra values given here take precedence over values defined on the client or passed to this method.
-        extra_headers: Headers | None = None,
-        extra_query: Query | None = None,
-        extra_body: Body | None = None,
-        timeout: float | httpx.Timeout | None | NotGiven = not_given,
-    ) -> ChatCompletionResponse | AsyncStream[ChatCompletionResponseStreamChunk]:
-        if stream:
-            extra_headers = {"Accept": "text/event-stream", **(extra_headers or {})}
         return await self._post(
-            "/v1/inference/chat-completion",
+            "/v1alpha/inference/rerank",
             body=await async_maybe_transform(
                 {
-                    "messages": messages,
-                    "model_id": model_id,
-                    "logprobs": logprobs,
-                    "response_format": response_format,
-                    "sampling_params": sampling_params,
-                    "stream": stream,
-                    "tool_choice": tool_choice,
-                    "tool_config": tool_config,
-                    "tool_prompt_format": tool_prompt_format,
-                    "tools": tools,
+                    "items": items,
+                    "model": model,
+                    "query": query,
+                    "max_num_results": max_num_results,
                 },
-                inference_chat_completion_params.InferenceChatCompletionParamsStreaming
-                if stream
-                else inference_chat_completion_params.InferenceChatCompletionParamsNonStreaming,
+                inference_rerank_params.InferenceRerankParams,
             ),
             options=make_request_options(
-                extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
+                extra_headers=extra_headers,
+                extra_query=extra_query,
+                extra_body=extra_body,
+                timeout=timeout,
+                post_parser=DataWrapper[InferenceRerankResponse]._unwrapper,
             ),
-            cast_to=ChatCompletionResponse,
-            stream=stream or False,
-            stream_cls=AsyncStream[ChatCompletionResponseStreamChunk],
+            cast_to=cast(Type[InferenceRerankResponse], DataWrapper[InferenceRerankResponse]),
         )
 
     @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.")
@@ -962,10 +556,8 @@ class InferenceResourceWithRawResponse:
     def __init__(self, inference: InferenceResource) -> None:
         self._inference = inference
 
-        self.chat_completion = (  # pyright: ignore[reportDeprecated]
-            to_raw_response_wrapper(
-                inference.chat_completion,  # pyright: ignore[reportDeprecated],
-            )
+        self.rerank = to_raw_response_wrapper(
+            inference.rerank,
         )
 
 
@@ -973,10 +565,8 @@ class AsyncInferenceResourceWithRawResponse:
     def __init__(self, inference: AsyncInferenceResource) -> None:
         self._inference = inference
 
-        self.chat_completion = (  # pyright: ignore[reportDeprecated]
-            async_to_raw_response_wrapper(
-                inference.chat_completion,  # pyright: ignore[reportDeprecated],
-            )
+        self.rerank = async_to_raw_response_wrapper(
+            inference.rerank,
         )
 
 
@@ -984,10 +574,8 @@ class InferenceResourceWithStreamingResponse:
     def __init__(self, inference: InferenceResource) -> None:
         self._inference = inference
 
-        self.chat_completion = (  # pyright: ignore[reportDeprecated]
-            to_streamed_response_wrapper(
-                inference.chat_completion,  # pyright: ignore[reportDeprecated],
-            )
+        self.rerank = to_streamed_response_wrapper(
+            inference.rerank,
         )
 
 
@@ -995,8 +583,6 @@ class AsyncInferenceResourceWithStreamingResponse:
     def __init__(self, inference: AsyncInferenceResource) -> None:
         self._inference = inference
 
-        self.chat_completion = (  # pyright: ignore[reportDeprecated]
-            async_to_streamed_response_wrapper(
-                inference.chat_completion,  # pyright: ignore[reportDeprecated],
-            )
+        self.rerank = async_to_streamed_response_wrapper(
+            inference.rerank,
         )
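
The new `rerank` implementation returns the item list through a `DataWrapper` post-parser instead of the old streaming cast. A rough illustration of what that unwrapping step accomplishes, assuming the endpoint wraps the results in a `data` envelope (the helper below is illustrative, not the library's internals):

```python
from typing import Any, Dict, List


def unwrap_data_envelope(payload: Dict[str, Any]) -> List[Any]:
    # Illustrative only: the DataWrapper post-parser effectively returns the
    # contents of the "data" key as the InferenceRerankResponse list.
    return payload["data"]


example_body = {"data": [{"index": 0, "relevance_score": 0.92}]}
print(unwrap_data_envelope(example_body))
```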
diff --git a/src/llama_stack_client/types/__init__.py b/src/llama_stack_client/types/__init__.py
index 8a61ceec..78cfbe2d 100644
--- a/src/llama_stack_client/types/__init__.py
+++ b/src/llama_stack_client/types/__init__.py
@@ -47,7 +47,6 @@
 from .tool_def_param import ToolDefParam as ToolDefParam
 from .create_response import CreateResponse as CreateResponse
 from .response_object import ResponseObject as ResponseObject
-from .token_log_probs import TokenLogProbs as TokenLogProbs
 from .file_list_params import FileListParams as FileListParams
 from .shield_call_step import ShieldCallStep as ShieldCallStep
 from .span_with_status import SpanWithStatus as SpanWithStatus
@@ -100,6 +99,7 @@
 from .dataset_iterrows_params import DatasetIterrowsParams as DatasetIterrowsParams
 from .dataset_register_params import DatasetRegisterParams as DatasetRegisterParams
 from .embedding_create_params import EmbeddingCreateParams as EmbeddingCreateParams
+from .inference_rerank_params import InferenceRerankParams as InferenceRerankParams
 from .list_providers_response import ListProvidersResponse as ListProvidersResponse
 from .scoring_fn_params_param import ScoringFnParamsParam as ScoringFnParamsParam
 from .toolgroup_list_response import ToolgroupListResponse as ToolgroupListResponse
@@ -118,6 +118,7 @@
 from .dataset_register_response import DatasetRegisterResponse as DatasetRegisterResponse
 from .dataset_retrieve_response import DatasetRetrieveResponse as DatasetRetrieveResponse
 from .eval_evaluate_rows_params import EvalEvaluateRowsParams as EvalEvaluateRowsParams
+from .inference_rerank_response import InferenceRerankResponse as InferenceRerankResponse
 from .list_tool_groups_response import ListToolGroupsResponse as ListToolGroupsResponse
 from .toolgroup_register_params import ToolgroupRegisterParams as ToolgroupRegisterParams
 from .vector_db_register_params import VectorDBRegisterParams as VectorDBRegisterParams
@@ -147,16 +148,12 @@
 from .list_scoring_functions_response import ListScoringFunctionsResponse as ListScoringFunctionsResponse
 from .telemetry_query_traces_response import TelemetryQueryTracesResponse as TelemetryQueryTracesResponse
 from .tool_runtime_invoke_tool_params import ToolRuntimeInvokeToolParams as ToolRuntimeInvokeToolParams
-from .inference_chat_completion_params import InferenceChatCompletionParams as InferenceChatCompletionParams
 from .list_post_training_jobs_response import ListPostTrainingJobsResponse as ListPostTrainingJobsResponse
 from .scoring_function_register_params import ScoringFunctionRegisterParams as ScoringFunctionRegisterParams
 from .telemetry_get_span_tree_response import TelemetryGetSpanTreeResponse as TelemetryGetSpanTreeResponse
 from .telemetry_query_metrics_response import TelemetryQueryMetricsResponse as TelemetryQueryMetricsResponse
 from .tool_runtime_list_tools_response import ToolRuntimeListToolsResponse as ToolRuntimeListToolsResponse
 from .synthetic_data_generation_response import SyntheticDataGenerationResponse as SyntheticDataGenerationResponse
-from .chat_completion_response_stream_chunk import (
-    ChatCompletionResponseStreamChunk as ChatCompletionResponseStreamChunk,
-)
 from .telemetry_save_spans_to_dataset_params import (
     TelemetrySaveSpansToDatasetParams as TelemetrySaveSpansToDatasetParams,
 )
diff --git a/src/llama_stack_client/types/chat_completion_response_stream_chunk.py b/src/llama_stack_client/types/chat_completion_response_stream_chunk.py
deleted file mode 100644
index 1a55f3d1..00000000
--- a/src/llama_stack_client/types/chat_completion_response_stream_chunk.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import List, Optional
-from typing_extensions import Literal
-
-from .._models import BaseModel
-from .shared.metric import Metric
-from .token_log_probs import TokenLogProbs
-from .shared.content_delta import ContentDelta
-
-__all__ = ["ChatCompletionResponseStreamChunk", "Event"]
-
-
-class Event(BaseModel):
-    delta: ContentDelta
-    """Content generated since last event.
-
-    This can be one or more tokens, or a tool call.
-    """
-
-    event_type: Literal["start", "complete", "progress"]
-    """Type of the event"""
-
-    logprobs: Optional[List[TokenLogProbs]] = None
-    """Optional log probabilities for generated tokens"""
-
-    stop_reason: Optional[Literal["end_of_turn", "end_of_message", "out_of_tokens"]] = None
-    """Optional reason why generation stopped, if complete"""
-
-
-class ChatCompletionResponseStreamChunk(BaseModel):
-    event: Event
-    """The event containing the new content"""
-
-    metrics: Optional[List[Metric]] = None
-    """(Optional) List of metrics associated with the API response"""
diff --git a/src/llama_stack_client/types/inference_chat_completion_params.py b/src/llama_stack_client/types/inference_chat_completion_params.py
deleted file mode 100644
index 746d3dee..00000000
--- a/src/llama_stack_client/types/inference_chat_completion_params.py
+++ /dev/null
@@ -1,134 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from __future__ import annotations
-
-from typing import Dict, Union, Iterable
-from typing_extensions import Literal, Required, TypedDict
-
-from .shared_params.message import Message
-from .shared_params.response_format import ResponseFormat
-from .shared_params.sampling_params import SamplingParams
-from .shared_params.tool_param_definition import ToolParamDefinition
-
-__all__ = [
-    "InferenceChatCompletionParamsBase",
-    "Logprobs",
-    "ToolConfig",
-    "Tool",
-    "InferenceChatCompletionParamsNonStreaming",
-    "InferenceChatCompletionParamsStreaming",
-]
-
-
-class InferenceChatCompletionParamsBase(TypedDict, total=False):
-    messages: Required[Iterable[Message]]
-    """List of messages in the conversation."""
-
-    model_id: Required[str]
-    """The identifier of the model to use.
-
-    The model must be registered with Llama Stack and available via the /models
-    endpoint.
-    """
-
-    logprobs: Logprobs
-    """
-    (Optional) If specified, log probabilities for each token position will be
-    returned.
-    """
-
-    response_format: ResponseFormat
-    """(Optional) Grammar specification for guided (structured) decoding.
-
-    There are two options: - `ResponseFormat.json_schema`: The grammar is a JSON
-    schema. Most providers support this format. - `ResponseFormat.grammar`: The
-    grammar is a BNF grammar. This format is more flexible, but not all providers
-    support it.
-    """
-
-    sampling_params: SamplingParams
-    """Parameters to control the sampling strategy."""
-
-    tool_choice: Literal["auto", "required", "none"]
-    """(Optional) Whether tool use is required or automatic.
-
-    Defaults to ToolChoice.auto. .. deprecated:: Use tool_config instead.
-    """
-
-    tool_config: ToolConfig
-    """(Optional) Configuration for tool use."""
-
-    tool_prompt_format: Literal["json", "function_tag", "python_list"]
-    """(Optional) Instructs the model how to format tool calls.
-
-    By default, Llama Stack will attempt to use a format that is best adapted to the
-    model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON
-    object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
-     tag. - `ToolPromptFormat.python_list`: The tool calls
-    are output as Python syntax -- a list of function calls. .. deprecated:: Use
-    tool_config instead.
-    """
-
-    tools: Iterable[Tool]
-    """(Optional) List of tool definitions available to the model."""
-
-
-class Logprobs(TypedDict, total=False):
-    top_k: int
-    """How many tokens (for each position) to return log probabilities for."""
-
-
-class ToolConfig(TypedDict, total=False):
-    system_message_behavior: Literal["append", "replace"]
-    """(Optional) Config for how to override the default system prompt.
-
-    - `SystemMessageBehavior.append`: Appends the provided system message to the
-      default system prompt. - `SystemMessageBehavior.replace`: Replaces the default
-      system prompt with the provided system message. The system message can include
-      the string '{{function_definitions}}' to indicate where the function
-      definitions should be inserted.
-    """
-
-    tool_choice: Union[Literal["auto", "required", "none"], str]
-    """(Optional) Whether tool use is automatic, required, or none.
-
-    Can also specify a tool name to use a specific tool. Defaults to
-    ToolChoice.auto.
-    """
-
-    tool_prompt_format: Literal["json", "function_tag", "python_list"]
-    """(Optional) Instructs the model how to format tool calls.
-
-    By default, Llama Stack will attempt to use a format that is best adapted to the
-    model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON
-    object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
-     tag. - `ToolPromptFormat.python_list`: The tool calls
-    are output as Python syntax -- a list of function calls.
-    """
-
-
-class Tool(TypedDict, total=False):
-    tool_name: Required[Union[Literal["brave_search", "wolfram_alpha", "photogen", "code_interpreter"], str]]
-
-    description: str
-
-    parameters: Dict[str, ToolParamDefinition]
-
-
-class InferenceChatCompletionParamsNonStreaming(InferenceChatCompletionParamsBase, total=False):
-    stream: Literal[False]
-    """(Optional) If True, generate an SSE event stream of the response.
-
-    Defaults to False.
-    """
-
-
-class InferenceChatCompletionParamsStreaming(InferenceChatCompletionParamsBase):
-    stream: Required[Literal[True]]
-    """(Optional) If True, generate an SSE event stream of the response.
-
-    Defaults to False.
-    """
-
-
-InferenceChatCompletionParams = Union[InferenceChatCompletionParamsNonStreaming, InferenceChatCompletionParamsStreaming]
diff --git a/src/llama_stack_client/types/inference_rerank_params.py b/src/llama_stack_client/types/inference_rerank_params.py
new file mode 100644
index 00000000..8f8c4d64
--- /dev/null
+++ b/src/llama_stack_client/types/inference_rerank_params.py
@@ -0,0 +1,106 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from __future__ import annotations
+
+from typing import Union
+from typing_extensions import Literal, Required, TypeAlias, TypedDict
+
+from .._types import SequenceNotStr
+
+__all__ = [
+    "InferenceRerankParams",
+    "Item",
+    "ItemOpenAIChatCompletionContentPartTextParam",
+    "ItemOpenAIChatCompletionContentPartImageParam",
+    "ItemOpenAIChatCompletionContentPartImageParamImageURL",
+    "Query",
+    "QueryOpenAIChatCompletionContentPartTextParam",
+    "QueryOpenAIChatCompletionContentPartImageParam",
+    "QueryOpenAIChatCompletionContentPartImageParamImageURL",
+]
+
+
+class InferenceRerankParams(TypedDict, total=False):
+    items: Required[SequenceNotStr[Item]]
+    """List of items to rerank.
+
+    Each item can be a string, text content part, or image content part. Each input
+    must not exceed the model's max input token length.
+    """
+
+    model: Required[str]
+    """The identifier of the reranking model to use."""
+
+    query: Required[Query]
+    """The search query to rank items against.
+
+    Can be a string, text content part, or image content part. The input must not
+    exceed the model's max input token length.
+    """
+
+    max_num_results: int
+    """(Optional) Maximum number of results to return. Default: returns all."""
+
+
+class ItemOpenAIChatCompletionContentPartTextParam(TypedDict, total=False):
+    text: Required[str]
+    """The text content of the message"""
+
+    type: Required[Literal["text"]]
+    """Must be "text" to identify this as text content"""
+
+
+class ItemOpenAIChatCompletionContentPartImageParamImageURL(TypedDict, total=False):
+    url: Required[str]
+    """URL of the image to include in the message"""
+
+    detail: str
+    """(Optional) Level of detail for image processing.
+
+    Can be "low", "high", or "auto"
+    """
+
+
+class ItemOpenAIChatCompletionContentPartImageParam(TypedDict, total=False):
+    image_url: Required[ItemOpenAIChatCompletionContentPartImageParamImageURL]
+    """Image URL specification and processing details"""
+
+    type: Required[Literal["image_url"]]
+    """Must be "image_url" to identify this as image content"""
+
+
+Item: TypeAlias = Union[
+    str, ItemOpenAIChatCompletionContentPartTextParam, ItemOpenAIChatCompletionContentPartImageParam
+]
+
+
+class QueryOpenAIChatCompletionContentPartTextParam(TypedDict, total=False):
+    text: Required[str]
+    """The text content of the message"""
+
+    type: Required[Literal["text"]]
+    """Must be "text" to identify this as text content"""
+
+
+class QueryOpenAIChatCompletionContentPartImageParamImageURL(TypedDict, total=False):
+    url: Required[str]
+    """URL of the image to include in the message"""
+
+    detail: str
+    """(Optional) Level of detail for image processing.
+
+    Can be "low", "high", or "auto"
+    """
+
+
+class QueryOpenAIChatCompletionContentPartImageParam(TypedDict, total=False):
+    image_url: Required[QueryOpenAIChatCompletionContentPartImageParamImageURL]
+    """Image URL specification and processing details"""
+
+    type: Required[Literal["image_url"]]
+    """Must be "image_url" to identify this as image content"""
+
+
+Query: TypeAlias = Union[
+    str, QueryOpenAIChatCompletionContentPartTextParam, QueryOpenAIChatCompletionContentPartImageParam
+]
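
The `Item` and `Query` aliases above accept plain strings or OpenAI-style content parts. A small sketch of building both shapes; the URL is a placeholder:

```python
from llama_stack_client.types import inference_rerank_params

# A plain-text item and an image item in the shapes defined above.
text_item: inference_rerank_params.Item = {"type": "text", "text": "A short passage to rank."}
image_item: inference_rerank_params.Item = {
    "type": "image_url",
    "image_url": {"url": "https://example.com/chart.png", "detail": "low"},
}
query: inference_rerank_params.Query = "Which item mentions quarterly revenue?"
```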
diff --git a/src/llama_stack_client/types/inference_rerank_response.py b/src/llama_stack_client/types/inference_rerank_response.py
new file mode 100644
index 00000000..e74fc7e6
--- /dev/null
+++ b/src/llama_stack_client/types/inference_rerank_response.py
@@ -0,0 +1,23 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import List
+from typing_extensions import TypeAlias
+
+from .._models import BaseModel
+
+__all__ = ["InferenceRerankResponse", "InferenceRerankResponseItem"]
+
+
+class InferenceRerankResponseItem(BaseModel):
+    index: int
+    """The original index of the document in the input list"""
+
+    relevance_score: float
+    """The relevance score from the model output.
+
+    Values are inverted when applicable so that higher scores indicate greater
+    relevance.
+    """
+
+
+InferenceRerankResponse: TypeAlias = List[InferenceRerankResponseItem]
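
Since the response type is a plain list of scored items, ordering by relevance is straightforward. A brief sketch, assuming `results` came from a rerank call:

```python
from typing import List

from llama_stack_client.types import InferenceRerankResponse


def top_indices(results: InferenceRerankResponse, k: int = 3) -> List[int]:
    # Each item carries the original input index and a relevance score;
    # higher scores indicate greater relevance.
    ranked = sorted(results, key=lambda item: item.relevance_score, reverse=True)
    return [item.index for item in ranked[:k]]
```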
diff --git a/src/llama_stack_client/types/shared/chat_completion_response.py b/src/llama_stack_client/types/shared/chat_completion_response.py
index 30191439..eb78a109 100644
--- a/src/llama_stack_client/types/shared/chat_completion_response.py
+++ b/src/llama_stack_client/types/shared/chat_completion_response.py
@@ -1,20 +1,24 @@
 # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
 
-from typing import List, Optional
+from typing import Dict, List, Optional
 
 from .metric import Metric
 from ..._models import BaseModel
-from ..token_log_probs import TokenLogProbs
 from .completion_message import CompletionMessage
 
-__all__ = ["ChatCompletionResponse"]
+__all__ = ["ChatCompletionResponse", "Logprob"]
+
+
+class Logprob(BaseModel):
+    logprobs_by_token: Dict[str, float]
+    """Dictionary mapping tokens to their log probabilities"""
 
 
 class ChatCompletionResponse(BaseModel):
     completion_message: CompletionMessage
     """The complete response message"""
 
-    logprobs: Optional[List[TokenLogProbs]] = None
+    logprobs: Optional[List[Logprob]] = None
     """Optional log probabilities for generated tokens"""
 
     metrics: Optional[List[Metric]] = None
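
With `TokenLogProbs` removed, the log-probability entries are defined inline on `ChatCompletionResponse`. A minimal sketch of reading them from a response obtained via the (deprecated) chat-completion path:

```python
from llama_stack_client.types.shared import ChatCompletionResponse


def print_token_logprobs(response: ChatCompletionResponse) -> None:
    # `logprobs` is optional; each entry maps tokens to their log probabilities.
    for entry in response.logprobs or []:
        for token, logprob in entry.logprobs_by_token.items():
            print(token, logprob)
```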
diff --git a/src/llama_stack_client/types/shared_params/__init__.py b/src/llama_stack_client/types/shared_params/__init__.py
index 3a0842e8..2ba8b592 100644
--- a/src/llama_stack_client/types/shared_params/__init__.py
+++ b/src/llama_stack_client/types/shared_params/__init__.py
@@ -11,7 +11,6 @@
 from .sampling_params import SamplingParams as SamplingParams
 from .completion_message import CompletionMessage as CompletionMessage
 from .interleaved_content import InterleavedContent as InterleavedContent
-from .tool_param_definition import ToolParamDefinition as ToolParamDefinition
 from .tool_response_message import ToolResponseMessage as ToolResponseMessage
 from .query_generator_config import QueryGeneratorConfig as QueryGeneratorConfig
 from .interleaved_content_item import InterleavedContentItem as InterleavedContentItem
diff --git a/src/llama_stack_client/types/shared_params/tool_param_definition.py b/src/llama_stack_client/types/shared_params/tool_param_definition.py
deleted file mode 100644
index 87563946..00000000
--- a/src/llama_stack_client/types/shared_params/tool_param_definition.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from __future__ import annotations
-
-from typing import Union, Iterable
-from typing_extensions import Required, TypedDict
-
-__all__ = ["ToolParamDefinition"]
-
-
-class ToolParamDefinition(TypedDict, total=False):
-    param_type: Required[str]
-
-    default: Union[bool, float, str, Iterable[object], object, None]
-
-    description: str
-
-    items: Union[bool, float, str, Iterable[object], object, None]
-
-    required: bool
-
-    title: str
diff --git a/src/llama_stack_client/types/token_log_probs.py b/src/llama_stack_client/types/token_log_probs.py
deleted file mode 100644
index b1a0a2b4..00000000
--- a/src/llama_stack_client/types/token_log_probs.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import Dict
-
-from .._models import BaseModel
-
-__all__ = ["TokenLogProbs"]
-
-
-class TokenLogProbs(BaseModel):
-    logprobs_by_token: Dict[str, float]
-    """Dictionary mapping tokens to their log probabilities"""
diff --git a/tests/api_resources/test_inference.py b/tests/api_resources/test_inference.py
index 6fc8040b..f26802c2 100644
--- a/tests/api_resources/test_inference.py
+++ b/tests/api_resources/test_inference.py
@@ -9,9 +9,7 @@
 
 from tests.utils import assert_matches_type
 from llama_stack_client import LlamaStackClient, AsyncLlamaStackClient
-from llama_stack_client.types.shared import ChatCompletionResponse
-
-# pyright: reportDeprecated=false
+from llama_stack_client.types import InferenceRerankResponse
 
 base_url = os.environ.get("TEST_API_BASE_URL", "http://127.0.0.1:4010")
 
@@ -20,212 +18,49 @@ class TestInference:
     parametrize = pytest.mark.parametrize("client", [False, True], indirect=True, ids=["loose", "strict"])
 
     @parametrize
-    def test_method_chat_completion_overload_1(self, client: LlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            inference = client.inference.chat_completion(
-                messages=[
-                    {
-                        "content": "string",
-                        "role": "user",
-                    }
-                ],
-                model_id="model_id",
-            )
-
-        assert_matches_type(ChatCompletionResponse, inference, path=["response"])
+    def test_method_rerank(self, client: LlamaStackClient) -> None:
+        inference = client.inference.rerank(
+            items=["string"],
+            model="model",
+            query="string",
+        )
+        assert_matches_type(InferenceRerankResponse, inference, path=["response"])
 
     @parametrize
-    def test_method_chat_completion_with_all_params_overload_1(self, client: LlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            inference = client.inference.chat_completion(
-                messages=[
-                    {
-                        "content": "string",
-                        "role": "user",
-                        "context": "string",
-                    }
-                ],
-                model_id="model_id",
-                logprobs={"top_k": 0},
-                response_format={
-                    "json_schema": {"foo": True},
-                    "type": "json_schema",
-                },
-                sampling_params={
-                    "strategy": {"type": "greedy"},
-                    "max_tokens": 0,
-                    "repetition_penalty": 0,
-                    "stop": ["string"],
-                },
-                stream=False,
-                tool_choice="auto",
-                tool_config={
-                    "system_message_behavior": "append",
-                    "tool_choice": "auto",
-                    "tool_prompt_format": "json",
-                },
-                tool_prompt_format="json",
-                tools=[
-                    {
-                        "tool_name": "brave_search",
-                        "description": "description",
-                        "parameters": {
-                            "foo": {
-                                "param_type": "param_type",
-                                "default": True,
-                                "description": "description",
-                                "items": True,
-                                "required": True,
-                                "title": "title",
-                            }
-                        },
-                    }
-                ],
-            )
-
-        assert_matches_type(ChatCompletionResponse, inference, path=["response"])
+    def test_method_rerank_with_all_params(self, client: LlamaStackClient) -> None:
+        inference = client.inference.rerank(
+            items=["string"],
+            model="model",
+            query="string",
+            max_num_results=0,
+        )
+        assert_matches_type(InferenceRerankResponse, inference, path=["response"])
 
     @parametrize
-    def test_raw_response_chat_completion_overload_1(self, client: LlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            response = client.inference.with_raw_response.chat_completion(
-                messages=[
-                    {
-                        "content": "string",
-                        "role": "user",
-                    }
-                ],
-                model_id="model_id",
-            )
+    def test_raw_response_rerank(self, client: LlamaStackClient) -> None:
+        response = client.inference.with_raw_response.rerank(
+            items=["string"],
+            model="model",
+            query="string",
+        )
 
         assert response.is_closed is True
         assert response.http_request.headers.get("X-Stainless-Lang") == "python"
         inference = response.parse()
-        assert_matches_type(ChatCompletionResponse, inference, path=["response"])
-
-    @parametrize
-    def test_streaming_response_chat_completion_overload_1(self, client: LlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            with client.inference.with_streaming_response.chat_completion(
-                messages=[
-                    {
-                        "content": "string",
-                        "role": "user",
-                    }
-                ],
-                model_id="model_id",
-            ) as response:
-                assert not response.is_closed
-                assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
-                inference = response.parse()
-                assert_matches_type(ChatCompletionResponse, inference, path=["response"])
-
-        assert cast(Any, response.is_closed) is True
-
-    @parametrize
-    def test_method_chat_completion_overload_2(self, client: LlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            inference_stream = client.inference.chat_completion(
-                messages=[
-                    {
-                        "content": "string",
-                        "role": "user",
-                    }
-                ],
-                model_id="model_id",
-                stream=True,
-            )
-
-        inference_stream.response.close()
-
-    @parametrize
-    def test_method_chat_completion_with_all_params_overload_2(self, client: LlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            inference_stream = client.inference.chat_completion(
-                messages=[
-                    {
-                        "content": "string",
-                        "role": "user",
-                        "context": "string",
-                    }
-                ],
-                model_id="model_id",
-                stream=True,
-                logprobs={"top_k": 0},
-                response_format={
-                    "json_schema": {"foo": True},
-                    "type": "json_schema",
-                },
-                sampling_params={
-                    "strategy": {"type": "greedy"},
-                    "max_tokens": 0,
-                    "repetition_penalty": 0,
-                    "stop": ["string"],
-                },
-                tool_choice="auto",
-                tool_config={
-                    "system_message_behavior": "append",
-                    "tool_choice": "auto",
-                    "tool_prompt_format": "json",
-                },
-                tool_prompt_format="json",
-                tools=[
-                    {
-                        "tool_name": "brave_search",
-                        "description": "description",
-                        "parameters": {
-                            "foo": {
-                                "param_type": "param_type",
-                                "default": True,
-                                "description": "description",
-                                "items": True,
-                                "required": True,
-                                "title": "title",
-                            }
-                        },
-                    }
-                ],
-            )
-
-        inference_stream.response.close()
-
-    @parametrize
-    def test_raw_response_chat_completion_overload_2(self, client: LlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            response = client.inference.with_raw_response.chat_completion(
-                messages=[
-                    {
-                        "content": "string",
-                        "role": "user",
-                    }
-                ],
-                model_id="model_id",
-                stream=True,
-            )
-
-        assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-        stream = response.parse()
-        stream.close()
+        assert_matches_type(InferenceRerankResponse, inference, path=["response"])
 
     @parametrize
-    def test_streaming_response_chat_completion_overload_2(self, client: LlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            with client.inference.with_streaming_response.chat_completion(
-                messages=[
-                    {
-                        "content": "string",
-                        "role": "user",
-                    }
-                ],
-                model_id="model_id",
-                stream=True,
-            ) as response:
-                assert not response.is_closed
-                assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+    def test_streaming_response_rerank(self, client: LlamaStackClient) -> None:
+        with client.inference.with_streaming_response.rerank(
+            items=["string"],
+            model="model",
+            query="string",
+        ) as response:
+            assert not response.is_closed
+            assert response.http_request.headers.get("X-Stainless-Lang") == "python"
 
-                stream = response.parse()
-                stream.close()
+            inference = response.parse()
+            assert_matches_type(InferenceRerankResponse, inference, path=["response"])
 
         assert cast(Any, response.is_closed) is True
 
@@ -236,211 +71,48 @@ class TestAsyncInference:
     )
 
     @parametrize
-    async def test_method_chat_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            inference = await async_client.inference.chat_completion(
-                messages=[
-                    {
-                        "content": "string",
-                        "role": "user",
-                    }
-                ],
-                model_id="model_id",
-            )
-
-        assert_matches_type(ChatCompletionResponse, inference, path=["response"])
+    async def test_method_rerank(self, async_client: AsyncLlamaStackClient) -> None:
+        inference = await async_client.inference.rerank(
+            items=["string"],
+            model="model",
+            query="string",
+        )
+        assert_matches_type(InferenceRerankResponse, inference, path=["response"])
 
     @parametrize
-    async def test_method_chat_completion_with_all_params_overload_1(self, async_client: AsyncLlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            inference = await async_client.inference.chat_completion(
-                messages=[
-                    {
-                        "content": "string",
-                        "role": "user",
-                        "context": "string",
-                    }
-                ],
-                model_id="model_id",
-                logprobs={"top_k": 0},
-                response_format={
-                    "json_schema": {"foo": True},
-                    "type": "json_schema",
-                },
-                sampling_params={
-                    "strategy": {"type": "greedy"},
-                    "max_tokens": 0,
-                    "repetition_penalty": 0,
-                    "stop": ["string"],
-                },
-                stream=False,
-                tool_choice="auto",
-                tool_config={
-                    "system_message_behavior": "append",
-                    "tool_choice": "auto",
-                    "tool_prompt_format": "json",
-                },
-                tool_prompt_format="json",
-                tools=[
-                    {
-                        "tool_name": "brave_search",
-                        "description": "description",
-                        "parameters": {
-                            "foo": {
-                                "param_type": "param_type",
-                                "default": True,
-                                "description": "description",
-                                "items": True,
-                                "required": True,
-                                "title": "title",
-                            }
-                        },
-                    }
-                ],
-            )
-
-        assert_matches_type(ChatCompletionResponse, inference, path=["response"])
+    async def test_method_rerank_with_all_params(self, async_client: AsyncLlamaStackClient) -> None:
+        inference = await async_client.inference.rerank(
+            items=["string"],
+            model="model",
+            query="string",
+            max_num_results=0,
+        )
+        assert_matches_type(InferenceRerankResponse, inference, path=["response"])
 
     @parametrize
-    async def test_raw_response_chat_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            response = await async_client.inference.with_raw_response.chat_completion(
-                messages=[
-                    {
-                        "content": "string",
-                        "role": "user",
-                    }
-                ],
-                model_id="model_id",
-            )
+    async def test_raw_response_rerank(self, async_client: AsyncLlamaStackClient) -> None:
+        response = await async_client.inference.with_raw_response.rerank(
+            items=["string"],
+            model="model",
+            query="string",
+        )
 
         assert response.is_closed is True
         assert response.http_request.headers.get("X-Stainless-Lang") == "python"
         inference = await response.parse()
-        assert_matches_type(ChatCompletionResponse, inference, path=["response"])
-
-    @parametrize
-    async def test_streaming_response_chat_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            async with async_client.inference.with_streaming_response.chat_completion(
-                messages=[
-                    {
-                        "content": "string",
-                        "role": "user",
-                    }
-                ],
-                model_id="model_id",
-            ) as response:
-                assert not response.is_closed
-                assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
-                inference = await response.parse()
-                assert_matches_type(ChatCompletionResponse, inference, path=["response"])
-
-        assert cast(Any, response.is_closed) is True
-
-    @parametrize
-    async def test_method_chat_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            inference_stream = await async_client.inference.chat_completion(
-                messages=[
-                    {
-                        "content": "string",
-                        "role": "user",
-                    }
-                ],
-                model_id="model_id",
-                stream=True,
-            )
-
-        await inference_stream.response.aclose()
-
-    @parametrize
-    async def test_method_chat_completion_with_all_params_overload_2(self, async_client: AsyncLlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            inference_stream = await async_client.inference.chat_completion(
-                messages=[
-                    {
-                        "content": "string",
-                        "role": "user",
-                        "context": "string",
-                    }
-                ],
-                model_id="model_id",
-                stream=True,
-                logprobs={"top_k": 0},
-                response_format={
-                    "json_schema": {"foo": True},
-                    "type": "json_schema",
-                },
-                sampling_params={
-                    "strategy": {"type": "greedy"},
-                    "max_tokens": 0,
-                    "repetition_penalty": 0,
-                    "stop": ["string"],
-                },
-                tool_choice="auto",
-                tool_config={
-                    "system_message_behavior": "append",
-                    "tool_choice": "auto",
-                    "tool_prompt_format": "json",
-                },
-                tool_prompt_format="json",
-                tools=[
-                    {
-                        "tool_name": "brave_search",
-                        "description": "description",
-                        "parameters": {
-                            "foo": {
-                                "param_type": "param_type",
-                                "default": True,
-                                "description": "description",
-                                "items": True,
-                                "required": True,
-                                "title": "title",
-                            }
-                        },
-                    }
-                ],
-            )
-
-        await inference_stream.response.aclose()
-
-    @parametrize
-    async def test_raw_response_chat_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            response = await async_client.inference.with_raw_response.chat_completion(
-                messages=[
-                    {
-                        "content": "string",
-                        "role": "user",
-                    }
-                ],
-                model_id="model_id",
-                stream=True,
-            )
-
-        assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-        stream = await response.parse()
-        await stream.close()
+        assert_matches_type(InferenceRerankResponse, inference, path=["response"])
 
     @parametrize
-    async def test_streaming_response_chat_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None:
-        with pytest.warns(DeprecationWarning):
-            async with async_client.inference.with_streaming_response.chat_completion(
-                messages=[
-                    {
-                        "content": "string",
-                        "role": "user",
-                    }
-                ],
-                model_id="model_id",
-                stream=True,
-            ) as response:
-                assert not response.is_closed
-                assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+    async def test_streaming_response_rerank(self, async_client: AsyncLlamaStackClient) -> None:
+        async with async_client.inference.with_streaming_response.rerank(
+            items=["string"],
+            model="model",
+            query="string",
+        ) as response:
+            assert not response.is_closed
+            assert response.http_request.headers.get("X-Stainless-Lang") == "python"
 
-                stream = await response.parse()
-                await stream.close()
+            inference = await response.parse()
+            assert_matches_type(InferenceRerankResponse, inference, path=["response"])
 
         assert cast(Any, response.is_closed) is True
diff --git a/tests/test_client.py b/tests/test_client.py
index a5bce12c..c5606d5d 100644
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -678,36 +678,20 @@ def test_parse_retry_after_header(self, remaining_retries: int, retry_after: str
     @mock.patch("llama_stack_client._base_client.BaseClient._calculate_retry_timeout", _low_retry_timeout)
     @pytest.mark.respx(base_url=base_url)
     def test_retrying_timeout_errors_doesnt_leak(self, respx_mock: MockRouter, client: LlamaStackClient) -> None:
-        respx_mock.post("/v1/inference/chat-completion").mock(side_effect=httpx.TimeoutException("Test timeout error"))
+        respx_mock.get("/v1/toolgroups").mock(side_effect=httpx.TimeoutException("Test timeout error"))
 
         with pytest.raises(APITimeoutError):
-            client.inference.with_streaming_response.chat_completion(
-                messages=[
-                    {
-                        "content": "string",
-                        "role": "user",
-                    }
-                ],
-                model_id="model_id",
-            ).__enter__()
+            client.toolgroups.with_streaming_response.list().__enter__()
 
         assert _get_open_connections(self.client) == 0
 
     @mock.patch("llama_stack_client._base_client.BaseClient._calculate_retry_timeout", _low_retry_timeout)
     @pytest.mark.respx(base_url=base_url)
     def test_retrying_status_errors_doesnt_leak(self, respx_mock: MockRouter, client: LlamaStackClient) -> None:
-        respx_mock.post("/v1/inference/chat-completion").mock(return_value=httpx.Response(500))
+        respx_mock.get("/v1/toolgroups").mock(return_value=httpx.Response(500))
 
         with pytest.raises(APIStatusError):
-            client.inference.with_streaming_response.chat_completion(
-                messages=[
-                    {
-                        "content": "string",
-                        "role": "user",
-                    }
-                ],
-                model_id="model_id",
-            ).__enter__()
+            client.toolgroups.with_streaming_response.list().__enter__()
         assert _get_open_connections(self.client) == 0
 
     @pytest.mark.parametrize("failures_before_success", [0, 2, 4])
@@ -734,17 +718,9 @@ def retry_handler(_request: httpx.Request) -> httpx.Response:
                 return httpx.Response(500)
             return httpx.Response(200)
 
-        respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler)
+        respx_mock.get("/v1/toolgroups").mock(side_effect=retry_handler)
 
-        response = client.inference.with_raw_response.chat_completion(
-            messages=[
-                {
-                    "content": "string",
-                    "role": "user",
-                }
-            ],
-            model_id="model_id",
-        )
+        response = client.toolgroups.with_raw_response.list()
 
         assert response.retries_taken == failures_before_success
         assert int(response.http_request.headers.get("x-stainless-retry-count")) == failures_before_success
@@ -766,18 +742,9 @@ def retry_handler(_request: httpx.Request) -> httpx.Response:
                 return httpx.Response(500)
             return httpx.Response(200)
 
-        respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler)
-
-        response = client.inference.with_raw_response.chat_completion(
-            messages=[
-                {
-                    "content": "string",
-                    "role": "user",
-                }
-            ],
-            model_id="model_id",
-            extra_headers={"x-stainless-retry-count": Omit()},
-        )
+        respx_mock.get("/v1/toolgroups").mock(side_effect=retry_handler)
+
+        response = client.toolgroups.with_raw_response.list(extra_headers={"x-stainless-retry-count": Omit()})
 
         assert len(response.http_request.headers.get_list("x-stainless-retry-count")) == 0
 
@@ -798,18 +765,9 @@ def retry_handler(_request: httpx.Request) -> httpx.Response:
                 return httpx.Response(500)
             return httpx.Response(200)
 
-        respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler)
-
-        response = client.inference.with_raw_response.chat_completion(
-            messages=[
-                {
-                    "content": "string",
-                    "role": "user",
-                }
-            ],
-            model_id="model_id",
-            extra_headers={"x-stainless-retry-count": "42"},
-        )
+        respx_mock.get("/v1/toolgroups").mock(side_effect=retry_handler)
+
+        response = client.toolgroups.with_raw_response.list(extra_headers={"x-stainless-retry-count": "42"})
 
         assert response.http_request.headers.get("x-stainless-retry-count") == "42"
 
@@ -1498,18 +1456,10 @@ async def test_parse_retry_after_header(self, remaining_retries: int, retry_afte
     async def test_retrying_timeout_errors_doesnt_leak(
         self, respx_mock: MockRouter, async_client: AsyncLlamaStackClient
     ) -> None:
-        respx_mock.post("/v1/inference/chat-completion").mock(side_effect=httpx.TimeoutException("Test timeout error"))
+        respx_mock.get("/v1/toolgroups").mock(side_effect=httpx.TimeoutException("Test timeout error"))
 
         with pytest.raises(APITimeoutError):
-            await async_client.inference.with_streaming_response.chat_completion(
-                messages=[
-                    {
-                        "content": "string",
-                        "role": "user",
-                    }
-                ],
-                model_id="model_id",
-            ).__aenter__()
+            await async_client.toolgroups.with_streaming_response.list().__aenter__()
 
         assert _get_open_connections(self.client) == 0
 
@@ -1518,18 +1468,10 @@ async def test_retrying_timeout_errors_doesnt_leak(
     async def test_retrying_status_errors_doesnt_leak(
         self, respx_mock: MockRouter, async_client: AsyncLlamaStackClient
     ) -> None:
-        respx_mock.post("/v1/inference/chat-completion").mock(return_value=httpx.Response(500))
+        respx_mock.get("/v1/toolgroups").mock(return_value=httpx.Response(500))
 
         with pytest.raises(APIStatusError):
-            await async_client.inference.with_streaming_response.chat_completion(
-                messages=[
-                    {
-                        "content": "string",
-                        "role": "user",
-                    }
-                ],
-                model_id="model_id",
-            ).__aenter__()
+            await async_client.toolgroups.with_streaming_response.list().__aenter__()
         assert _get_open_connections(self.client) == 0
 
     @pytest.mark.parametrize("failures_before_success", [0, 2, 4])
@@ -1557,17 +1499,9 @@ def retry_handler(_request: httpx.Request) -> httpx.Response:
                 return httpx.Response(500)
             return httpx.Response(200)
 
-        respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler)
+        respx_mock.get("/v1/toolgroups").mock(side_effect=retry_handler)
 
-        response = await client.inference.with_raw_response.chat_completion(
-            messages=[
-                {
-                    "content": "string",
-                    "role": "user",
-                }
-            ],
-            model_id="model_id",
-        )
+        response = await client.toolgroups.with_raw_response.list()
 
         assert response.retries_taken == failures_before_success
         assert int(response.http_request.headers.get("x-stainless-retry-count")) == failures_before_success
@@ -1590,18 +1524,9 @@ def retry_handler(_request: httpx.Request) -> httpx.Response:
                 return httpx.Response(500)
             return httpx.Response(200)
 
-        respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler)
-
-        response = await client.inference.with_raw_response.chat_completion(
-            messages=[
-                {
-                    "content": "string",
-                    "role": "user",
-                }
-            ],
-            model_id="model_id",
-            extra_headers={"x-stainless-retry-count": Omit()},
-        )
+        respx_mock.get("/v1/toolgroups").mock(side_effect=retry_handler)
+
+        response = await client.toolgroups.with_raw_response.list(extra_headers={"x-stainless-retry-count": Omit()})
 
         assert len(response.http_request.headers.get_list("x-stainless-retry-count")) == 0
 
@@ -1623,18 +1548,9 @@ def retry_handler(_request: httpx.Request) -> httpx.Response:
                 return httpx.Response(500)
             return httpx.Response(200)
 
-        respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler)
-
-        response = await client.inference.with_raw_response.chat_completion(
-            messages=[
-                {
-                    "content": "string",
-                    "role": "user",
-                }
-            ],
-            model_id="model_id",
-            extra_headers={"x-stainless-retry-count": "42"},
-        )
+        respx_mock.get("/v1/toolgroups").mock(side_effect=retry_handler)
+
+        response = await client.toolgroups.with_raw_response.list(extra_headers={"x-stainless-retry-count": "42"})
 
         assert response.http_request.headers.get("x-stainless-retry-count") == "42"
 
From f10ead00522b7ca803cd7dc3617da0d451efa7da Mon Sep 17 00:00:00 2001
From: Ashwin Bharambe 
Date: Mon, 29 Sep 2025 19:40:32 -0700
Subject: [PATCH 4/8] fix: clean up deprecated code
---
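Note for reviewers: this patch removes the deprecated `inference.completion()` overloads (the `/v1/inference/completion` route) and the legacy stream-chunk branch in `event_logger`. A minimal migration sketch follows; it assumes callers move to the OpenAI-compatible completions resource as the deprecation notice suggests, and the exact method name `client.completions.create`, its parameters, and the model id shown are assumptions for illustration, not taken from this patch:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Previously: client.inference.completion(content="...", model_id="...")
# That route is removed in this patch; the /v1/completions-backed resource
# (name assumed here) is the suggested replacement.
completion = client.completions.create(
    model="meta-llama/Llama-3.3-70B-Instruct",  # hypothetical model id
    prompt="Write a haiku about observability.",
)
print(completion)
```
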
 .../lib/inference/event_logger.py             |  30 +-
 src/llama_stack_client/resources/inference.py | 370 ------------------
 2 files changed, 3 insertions(+), 397 deletions(-)
diff --git a/src/llama_stack_client/lib/inference/event_logger.py b/src/llama_stack_client/lib/inference/event_logger.py
index 14b46372..cbf5f680 100644
--- a/src/llama_stack_client/lib/inference/event_logger.py
+++ b/src/llama_stack_client/lib/inference/event_logger.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 from typing import Generator
 from termcolor import cprint
-from llama_stack_client.types import ChatCompletionResponseStreamChunk, ChatCompletionChunk
+from llama_stack_client.types import ChatCompletionChunk
 
 
 class InferenceStreamPrintableEvent:
@@ -28,35 +28,11 @@ def __init__(self):
         self.is_thinking = False
 
     def yield_printable_events(
-        self, chunk: ChatCompletionResponseStreamChunk | ChatCompletionChunk
+        self, chunk: ChatCompletionChunk
     ) -> Generator[InferenceStreamPrintableEvent, None, None]:
-        # Check if the chunk has event attribute (ChatCompletionResponseStreamChunk)
-        if hasattr(chunk, "event"):
-            yield from self._handle_inference_stream_chunk(chunk)
-        # Check if the chunk has choices attribute (ChatCompletionChunk)
-        elif hasattr(chunk, "choices") and len(chunk.choices) > 0:
+        if hasattr(chunk, "choices") and len(chunk.choices) > 0:
             yield from self._handle_chat_completion_chunk(chunk)
 
-    def _handle_inference_stream_chunk(
-        self, chunk: ChatCompletionResponseStreamChunk
-    ) -> Generator[InferenceStreamPrintableEvent, None, None]:
-        event = chunk.event
-        if event.event_type == "start":
-            yield InferenceStreamPrintableEvent("Assistant> ", color="cyan", end="")
-        elif event.event_type == "progress":
-            if event.delta.type == "reasoning":
-                if not self.is_thinking:
-                    yield InferenceStreamPrintableEvent(" ", color="magenta", end="")
-                    self.is_thinking = True
-                yield InferenceStreamPrintableEvent(event.delta.reasoning, color="magenta", end="")
-            else:
-                if self.is_thinking:
-                    yield InferenceStreamPrintableEvent("", color="magenta", end="")
-                    self.is_thinking = False
-                yield InferenceStreamPrintableEvent(event.delta.text, color="yellow", end="")
-        elif event.event_type == "complete":
-            yield InferenceStreamPrintableEvent("")
-
     def _handle_chat_completion_chunk(
         self, chunk: ChatCompletionChunk
     ) -> Generator[InferenceStreamPrintableEvent, None, None]:
diff --git a/src/llama_stack_client/resources/inference.py b/src/llama_stack_client/resources/inference.py
index 5c022b0f..e5cf7b6b 100644
--- a/src/llama_stack_client/resources/inference.py
+++ b/src/llama_stack_client/resources/inference.py
@@ -102,191 +102,6 @@ def rerank(
             cast_to=cast(Type[InferenceRerankResponse], DataWrapper[InferenceRerankResponse]),
         )
 
-    @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.")
-    @overload
-    def completion(
-        self,
-        *,
-        content: InterleavedContent,
-        model_id: str,
-        logprobs: inference_completion_params.Logprobs | Omit = omit,
-        response_format: ResponseFormat | Omit = omit,
-        sampling_params: SamplingParams | Omit = omit,
-        stream: Literal[False] | Omit = omit,
-        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
-        # The extra values given here take precedence over values defined on the client or passed to this method.
-        extra_headers: Headers | None = None,
-        extra_query: Query | None = None,
-        extra_body: Body | None = None,
-        timeout: float | httpx.Timeout | None | NotGiven = not_given,
-    ) -> CompletionResponse:
-        """
-        Generate a completion for the given content using the specified model.
-
-        Args:
-          content: The content to generate a completion for.
-
-          model_id: The identifier of the model to use. The model must be registered with Llama
-              Stack and available via the /models endpoint.
-
-          logprobs: (Optional) If specified, log probabilities for each token position will be
-              returned.
-
-          response_format: (Optional) Grammar specification for guided (structured) decoding.
-
-          sampling_params: (Optional) Parameters to control the sampling strategy.
-
-          stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
-              False.
-
-          extra_headers: Send extra headers
-
-          extra_query: Add additional query parameters to the request
-
-          extra_body: Add additional JSON properties to the request
-
-          timeout: Override the client-level default timeout for this request, in seconds
-        """
-        ...
-
-    @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.")
-    @overload
-    def completion(
-        self,
-        *,
-        content: InterleavedContent,
-        model_id: str,
-        stream: Literal[True],
-        logprobs: inference_completion_params.Logprobs | Omit = omit,
-        response_format: ResponseFormat | Omit = omit,
-        sampling_params: SamplingParams | Omit = omit,
-        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
-        # The extra values given here take precedence over values defined on the client or passed to this method.
-        extra_headers: Headers | None = None,
-        extra_query: Query | None = None,
-        extra_body: Body | None = None,
-        timeout: float | httpx.Timeout | None | NotGiven = not_given,
-    ) -> Stream[CompletionResponse]:
-        """
-        Generate a completion for the given content using the specified model.
-
-        Args:
-          content: The content to generate a completion for.
-
-          model_id: The identifier of the model to use. The model must be registered with Llama
-              Stack and available via the /models endpoint.
-
-          stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
-              False.
-
-          logprobs: (Optional) If specified, log probabilities for each token position will be
-              returned.
-
-          response_format: (Optional) Grammar specification for guided (structured) decoding.
-
-          sampling_params: (Optional) Parameters to control the sampling strategy.
-
-          extra_headers: Send extra headers
-
-          extra_query: Add additional query parameters to the request
-
-          extra_body: Add additional JSON properties to the request
-
-          timeout: Override the client-level default timeout for this request, in seconds
-        """
-        ...
-
-    @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.")
-    @overload
-    def completion(
-        self,
-        *,
-        content: InterleavedContent,
-        model_id: str,
-        stream: bool,
-        logprobs: inference_completion_params.Logprobs | Omit = omit,
-        response_format: ResponseFormat | Omit = omit,
-        sampling_params: SamplingParams | Omit = omit,
-        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
-        # The extra values given here take precedence over values defined on the client or passed to this method.
-        extra_headers: Headers | None = None,
-        extra_query: Query | None = None,
-        extra_body: Body | None = None,
-        timeout: float | httpx.Timeout | None | NotGiven = not_given,
-    ) -> CompletionResponse | Stream[CompletionResponse]:
-        """
-        Generate a completion for the given content using the specified model.
-
-        Args:
-          content: The content to generate a completion for.
-
-          model_id: The identifier of the model to use. The model must be registered with Llama
-              Stack and available via the /models endpoint.
-
-          stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
-              False.
-
-          logprobs: (Optional) If specified, log probabilities for each token position will be
-              returned.
-
-          response_format: (Optional) Grammar specification for guided (structured) decoding.
-
-          sampling_params: (Optional) Parameters to control the sampling strategy.
-
-          extra_headers: Send extra headers
-
-          extra_query: Add additional query parameters to the request
-
-          extra_body: Add additional JSON properties to the request
-
-          timeout: Override the client-level default timeout for this request, in seconds
-        """
-        ...
-
-    @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.")
-    @required_args(["content", "model_id"], ["content", "model_id", "stream"])
-    def completion(
-        self,
-        *,
-        content: InterleavedContent,
-        model_id: str,
-        logprobs: inference_completion_params.Logprobs | Omit = omit,
-        response_format: ResponseFormat | Omit = omit,
-        sampling_params: SamplingParams | Omit = omit,
-        stream: Literal[False] | Literal[True] | Omit = omit,
-        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
-        # The extra values given here take precedence over values defined on the client or passed to this method.
-        extra_headers: Headers | None = None,
-        extra_query: Query | None = None,
-        extra_body: Body | None = None,
-        timeout: float | httpx.Timeout | None | NotGiven = not_given,
-    ) -> CompletionResponse | Stream[CompletionResponse]:
-        if stream:
-            extra_headers = {"Accept": "text/event-stream", **(extra_headers or {})}
-        return self._post(
-            "/v1/inference/completion",
-            body=maybe_transform(
-                {
-                    "content": content,
-                    "model_id": model_id,
-                    "logprobs": logprobs,
-                    "response_format": response_format,
-                    "sampling_params": sampling_params,
-                    "stream": stream,
-                },
-                inference_completion_params.InferenceCompletionParamsStreaming
-                if stream
-                else inference_completion_params.InferenceCompletionParamsNonStreaming,
-            ),
-            options=make_request_options(
-                extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
-            ),
-            cast_to=CompletionResponse,
-            stream=stream or False,
-            stream_cls=Stream[CompletionResponse],
-        )
-    
-
 
 class AsyncInferenceResource(AsyncAPIResource):
     @cached_property
@@ -366,191 +181,6 @@ async def rerank(
             cast_to=cast(Type[InferenceRerankResponse], DataWrapper[InferenceRerankResponse]),
         )
 
-    @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.")
-    @overload
-    async def completion(
-        self,
-        *,
-        content: InterleavedContent,
-        model_id: str,
-        logprobs: inference_completion_params.Logprobs | Omit = omit,
-        response_format: ResponseFormat | Omit = omit,
-        sampling_params: SamplingParams | Omit = omit,
-        stream: Literal[False] | Omit = omit,
-        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
-        # The extra values given here take precedence over values defined on the client or passed to this method.
-        extra_headers: Headers | None = None,
-        extra_query: Query | None = None,
-        extra_body: Body | None = None,
-        timeout: float | httpx.Timeout | None | NotGiven = not_given,
-    ) -> CompletionResponse:
-        """
-        Generate a completion for the given content using the specified model.
-
-        Args:
-          content: The content to generate a completion for.
-
-          model_id: The identifier of the model to use. The model must be registered with Llama
-              Stack and available via the /models endpoint.
-
-          logprobs: (Optional) If specified, log probabilities for each token position will be
-              returned.
-
-          response_format: (Optional) Grammar specification for guided (structured) decoding.
-
-          sampling_params: (Optional) Parameters to control the sampling strategy.
-
-          stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
-              False.
-
-          extra_headers: Send extra headers
-
-          extra_query: Add additional query parameters to the request
-
-          extra_body: Add additional JSON properties to the request
-
-          timeout: Override the client-level default timeout for this request, in seconds
-        """
-        ...
-
-    @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.")
-    @overload
-    async def completion(
-        self,
-        *,
-        content: InterleavedContent,
-        model_id: str,
-        stream: Literal[True],
-        logprobs: inference_completion_params.Logprobs | Omit = omit,
-        response_format: ResponseFormat | Omit = omit,
-        sampling_params: SamplingParams | Omit = omit,
-        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
-        # The extra values given here take precedence over values defined on the client or passed to this method.
-        extra_headers: Headers | None = None,
-        extra_query: Query | None = None,
-        extra_body: Body | None = None,
-        timeout: float | httpx.Timeout | None | NotGiven = not_given,
-    ) -> AsyncStream[CompletionResponse]:
-        """
-        Generate a completion for the given content using the specified model.
-
-        Args:
-          content: The content to generate a completion for.
-
-          model_id: The identifier of the model to use. The model must be registered with Llama
-              Stack and available via the /models endpoint.
-
-          stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
-              False.
-
-          logprobs: (Optional) If specified, log probabilities for each token position will be
-              returned.
-
-          response_format: (Optional) Grammar specification for guided (structured) decoding.
-
-          sampling_params: (Optional) Parameters to control the sampling strategy.
-
-          extra_headers: Send extra headers
-
-          extra_query: Add additional query parameters to the request
-
-          extra_body: Add additional JSON properties to the request
-
-          timeout: Override the client-level default timeout for this request, in seconds
-        """
-        ...
-
-    @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.")
-    @overload
-    async def completion(
-        self,
-        *,
-        content: InterleavedContent,
-        model_id: str,
-        stream: bool,
-        logprobs: inference_completion_params.Logprobs | Omit = omit,
-        response_format: ResponseFormat | Omit = omit,
-        sampling_params: SamplingParams | Omit = omit,
-        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
-        # The extra values given here take precedence over values defined on the client or passed to this method.
-        extra_headers: Headers | None = None,
-        extra_query: Query | None = None,
-        extra_body: Body | None = None,
-        timeout: float | httpx.Timeout | None | NotGiven = not_given,
-    ) -> CompletionResponse | AsyncStream[CompletionResponse]:
-        """
-        Generate a completion for the given content using the specified model.
-
-        Args:
-          content: The content to generate a completion for.
-
-          model_id: The identifier of the model to use. The model must be registered with Llama
-              Stack and available via the /models endpoint.
-
-          stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
-              False.
-
-          logprobs: (Optional) If specified, log probabilities for each token position will be
-              returned.
-
-          response_format: (Optional) Grammar specification for guided (structured) decoding.
-
-          sampling_params: (Optional) Parameters to control the sampling strategy.
-
-          extra_headers: Send extra headers
-
-          extra_query: Add additional query parameters to the request
-
-          extra_body: Add additional JSON properties to the request
-
-          timeout: Override the client-level default timeout for this request, in seconds
-        """
-        ...
-
-    @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/completions.")
-    @required_args(["content", "model_id"], ["content", "model_id", "stream"])
-    async def completion(
-        self,
-        *,
-        content: InterleavedContent,
-        model_id: str,
-        logprobs: inference_completion_params.Logprobs | Omit = omit,
-        response_format: ResponseFormat | Omit = omit,
-        sampling_params: SamplingParams | Omit = omit,
-        stream: Literal[False] | Literal[True] | Omit = omit,
-        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
-        # The extra values given here take precedence over values defined on the client or passed to this method.
-        extra_headers: Headers | None = None,
-        extra_query: Query | None = None,
-        extra_body: Body | None = None,
-        timeout: float | httpx.Timeout | None | NotGiven = not_given,
-    ) -> CompletionResponse | AsyncStream[CompletionResponse]:
-        if stream:
-            extra_headers = {"Accept": "text/event-stream", **(extra_headers or {})}
-        return await self._post(
-            "/v1/inference/completion",
-            body=await async_maybe_transform(
-                {
-                    "content": content,
-                    "model_id": model_id,
-                    "logprobs": logprobs,
-                    "response_format": response_format,
-                    "sampling_params": sampling_params,
-                    "stream": stream,
-                },
-                inference_completion_params.InferenceCompletionParamsStreaming
-                if stream
-                else inference_completion_params.InferenceCompletionParamsNonStreaming,
-            ),
-            options=make_request_options(
-                extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
-            ),
-            cast_to=CompletionResponse,
-            stream=stream or False,
-            stream_cls=AsyncStream[CompletionResponse],
-        )
-    
-
 
 class InferenceResourceWithRawResponse:
     def __init__(self, inference: InferenceResource) -> None:
From 433a996527bcca131ada4730376d8993f34ad6f5 Mon Sep 17 00:00:00 2001
From: "stainless-app[bot]"
 <142633134+stainless-app[bot]@users.noreply.github.com>
Date: Tue, 30 Sep 2025 03:37:07 +0000
Subject: [PATCH 5/8] feat(api): updating post /v1/files to have correct
 multipart/form-data
---
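Note for reviewers: `files.create` now requires a `purpose` form field and accepts an optional `expires_after` field, per the docstrings added below. A short usage sketch under those constraints; the 86400-second window is an arbitrary illustrative value inside the documented 3600-2592000 range:

```python
from pathlib import Path
from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

# `purpose` is required: "assistants" or "batch".
# `expires_after` is optional: anchor must be "created_at", seconds must be an
# int between 3600 and 2592000 (1 hour to 30 days).
uploaded = client.files.create(
    file=Path("/path/to/file"),
    purpose="assistants",
    expires_after={"anchor": "created_at", "seconds": 86400},
)
print(uploaded)
```
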
 .stats.yml                                    |   4 +-
 README.md                                     |   1 +
 api.md                                        |  11 +-
 src/llama_stack_client/resources/files.py     |  36 +++-
 src/llama_stack_client/types/__init__.py      |   3 -
 .../types/agents/__init__.py                  |   1 -
 .../types/agents/turn_response_event.py       | 155 +++++++++++++++++-
 .../agents/turn_response_event_payload.py     | 109 ------------
 .../types/benchmark_config_param.py           |  35 +++-
 .../types/eval_candidate_param.py             |  35 ----
 .../types/file_create_params.py               |  20 ++-
 .../types/shared/__init__.py                  |   2 -
 .../types/shared/content_delta.py             |  43 -----
 .../types/shared/query_config.py              |  36 +++-
 .../types/shared/query_generator_config.py    |  33 ----
 .../types/shared_params/__init__.py           |   1 -
 .../types/shared_params/query_config.py       |  34 +++-
 .../shared_params/query_generator_config.py   |  30 ----
 tests/api_resources/test_files.py             |  30 ++++
 19 files changed, 335 insertions(+), 284 deletions(-)
 delete mode 100644 src/llama_stack_client/types/agents/turn_response_event_payload.py
 delete mode 100644 src/llama_stack_client/types/eval_candidate_param.py
 delete mode 100644 src/llama_stack_client/types/shared/content_delta.py
 delete mode 100644 src/llama_stack_client/types/shared/query_generator_config.py
 delete mode 100644 src/llama_stack_client/types/shared_params/query_generator_config.py
diff --git a/.stats.yml b/.stats.yml
index ed589610..20dba32e 100644
--- a/.stats.yml
+++ b/.stats.yml
@@ -1,4 +1,4 @@
 configured_endpoints: 105
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/llamastack%2Fllama-stack-client-adcfaad1990d45e42b20e200a9ecc35ee32df5692bd9cd18ae898b0b7728c919.yml
-openapi_spec_hash: 4f532287bafe5da0578a1c1a5e31c952
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/llamastack%2Fllama-stack-client-d7bea816190382a93511491e33d1f37f707620926ab133ae8ce0883d763df741.yml
+openapi_spec_hash: f73b3af77108625edae3f25972b9e665
 config_hash: 5b643c97c83a497d7d346253f1e175f3
diff --git a/README.md b/README.md
index 75857f1d..76c9b9ae 100644
--- a/README.md
+++ b/README.md
@@ -146,6 +146,7 @@ client = LlamaStackClient()
 
 client.files.create(
     file=Path("/path/to/file"),
+    purpose="assistants",
 )
 ```
 
diff --git a/api.md b/api.md
index 85e5a178..c246f4c1 100644
--- a/api.md
+++ b/api.md
@@ -5,7 +5,6 @@ from llama_stack_client.types import (
     AgentConfig,
     ChatCompletionResponse,
     CompletionMessage,
-    ContentDelta,
     Document,
     InterleavedContent,
     InterleavedContentItem,
@@ -13,7 +12,6 @@ from llama_stack_client.types import (
     Metric,
     ParamType,
     QueryConfig,
-    QueryGeneratorConfig,
     QueryResult,
     ResponseFormat,
     SafetyViolation,
@@ -163,12 +161,7 @@ Methods:
 Types:
 
 ```python
-from llama_stack_client.types.agents import (
-    AgentTurnResponseStreamChunk,
-    Turn,
-    TurnResponseEvent,
-    TurnResponseEventPayload,
-)
+from llama_stack_client.types.agents import AgentTurnResponseStreamChunk, Turn, TurnResponseEvent
 ```
 
 Methods:
@@ -205,7 +198,7 @@ Methods:
 Types:
 
 ```python
-from llama_stack_client.types import BenchmarkConfig, EvalCandidate, EvaluateResponse, Job
+from llama_stack_client.types import BenchmarkConfig, EvaluateResponse, Job
 ```
 
 Methods:
diff --git a/src/llama_stack_client/resources/files.py b/src/llama_stack_client/resources/files.py
index 04c37c56..39add811 100644
--- a/src/llama_stack_client/resources/files.py
+++ b/src/llama_stack_client/resources/files.py
@@ -50,6 +50,8 @@ def create(
         self,
         *,
         file: FileTypes,
+        purpose: Literal["assistants", "batch"],
+        expires_after: file_create_params.ExpiresAfter | Omit = omit,
         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
         # The extra values given here take precedence over values defined on the client or passed to this method.
         extra_headers: Headers | None = None,
@@ -67,6 +69,14 @@ def create(
         - expires_after: Optional form values describing expiration for the file.
 
         Args:
+          purpose: Valid purpose values for OpenAI Files API.
+
+          expires_after:
+              Control expiration of uploaded files. Params:
+
+              - anchor, must be "created_at"
+              - seconds, must be int between 3600 and 2592000 (1 hour to 30 days)
+
           extra_headers: Send extra headers
 
           extra_query: Add additional query parameters to the request
@@ -75,7 +85,13 @@ def create(
 
           timeout: Override the client-level default timeout for this request, in seconds
         """
-        body = deepcopy_minimal({"file": file})
+        body = deepcopy_minimal(
+            {
+                "file": file,
+                "purpose": purpose,
+                "expires_after": expires_after,
+            }
+        )
         files = extract_files(cast(Mapping[str, object], body), paths=[["file"]])
         # It should be noted that the actual Content-Type header that will be
         # sent to the server will contain a `boundary` parameter, e.g.
@@ -275,6 +291,8 @@ async def create(
         self,
         *,
         file: FileTypes,
+        purpose: Literal["assistants", "batch"],
+        expires_after: file_create_params.ExpiresAfter | Omit = omit,
         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
         # The extra values given here take precedence over values defined on the client or passed to this method.
         extra_headers: Headers | None = None,
@@ -292,6 +310,14 @@ async def create(
         - expires_after: Optional form values describing expiration for the file.
 
         Args:
+          purpose: Valid purpose values for OpenAI Files API.
+
+          expires_after:
+              Control expiration of uploaded files. Params:
+
+              - anchor, must be "created_at"
+              - seconds, must be int between 3600 and 2592000 (1 hour to 30 days)
+
           extra_headers: Send extra headers
 
           extra_query: Add additional query parameters to the request
@@ -300,7 +326,13 @@ async def create(
 
           timeout: Override the client-level default timeout for this request, in seconds
         """
-        body = deepcopy_minimal({"file": file})
+        body = deepcopy_minimal(
+            {
+                "file": file,
+                "purpose": purpose,
+                "expires_after": expires_after,
+            }
+        )
         files = extract_files(cast(Mapping[str, object], body), paths=[["file"]])
         # It should be noted that the actual Content-Type header that will be
         # sent to the server will contain a `boundary` parameter, e.g.
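With this change, `purpose` becomes a required argument to `files.create()` while `expires_after` stays optional. A minimal usage sketch under those assumptions (file path and TTL are placeholders, not values from the patch):

```python
from pathlib import Path

from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

# Upload a file for the assistants API; expiration is optional and, when given,
# must anchor on "created_at" with a TTL between 3600 and 2592000 seconds.
uploaded = client.files.create(
    file=Path("/path/to/file"),
    purpose="assistants",
    expires_after={"anchor": "created_at", "seconds": 86400},  # assumed 24h TTL
)
print(uploaded)
```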
diff --git a/src/llama_stack_client/types/__init__.py b/src/llama_stack_client/types/__init__.py
index 78cfbe2d..f81ada61 100644
--- a/src/llama_stack_client/types/__init__.py
+++ b/src/llama_stack_client/types/__init__.py
@@ -17,7 +17,6 @@
     QueryConfig as QueryConfig,
     QueryResult as QueryResult,
     UserMessage as UserMessage,
-    ContentDelta as ContentDelta,
     ScoringResult as ScoringResult,
     SystemMessage as SystemMessage,
     ResponseFormat as ResponseFormat,
@@ -27,7 +26,6 @@
     InterleavedContent as InterleavedContent,
     ToolParamDefinition as ToolParamDefinition,
     ToolResponseMessage as ToolResponseMessage,
-    QueryGeneratorConfig as QueryGeneratorConfig,
     ChatCompletionResponse as ChatCompletionResponse,
     InterleavedContentItem as InterleavedContentItem,
 )
@@ -67,7 +65,6 @@
 from .tool_execution_step import ToolExecutionStep as ToolExecutionStep
 from .tool_response_param import ToolResponseParam as ToolResponseParam
 from .delete_file_response import DeleteFileResponse as DeleteFileResponse
-from .eval_candidate_param import EvalCandidateParam as EvalCandidateParam
 from .eval_run_eval_params import EvalRunEvalParams as EvalRunEvalParams
 from .list_models_response import ListModelsResponse as ListModelsResponse
 from .list_routes_response import ListRoutesResponse as ListRoutesResponse
diff --git a/src/llama_stack_client/types/agents/__init__.py b/src/llama_stack_client/types/agents/__init__.py
index f4f48353..3a144840 100644
--- a/src/llama_stack_client/types/agents/__init__.py
+++ b/src/llama_stack_client/types/agents/__init__.py
@@ -13,5 +13,4 @@
 from .step_retrieve_response import StepRetrieveResponse as StepRetrieveResponse
 from .session_create_response import SessionCreateResponse as SessionCreateResponse
 from .session_retrieve_params import SessionRetrieveParams as SessionRetrieveParams
-from .turn_response_event_payload import TurnResponseEventPayload as TurnResponseEventPayload
 from .agent_turn_response_stream_chunk import AgentTurnResponseStreamChunk as AgentTurnResponseStreamChunk
diff --git a/src/llama_stack_client/types/agents/turn_response_event.py b/src/llama_stack_client/types/agents/turn_response_event.py
index df213246..c52121ab 100644
--- a/src/llama_stack_client/types/agents/turn_response_event.py
+++ b/src/llama_stack_client/types/agents/turn_response_event.py
@@ -1,11 +1,160 @@
 # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
 
+from typing import Dict, List, Union, Optional
+from typing_extensions import Literal, Annotated, TypeAlias
+
+from .turn import Turn
+from ..._utils import PropertyInfo
 from ..._models import BaseModel
-from .turn_response_event_payload import TurnResponseEventPayload
+from ..inference_step import InferenceStep
+from ..shared.tool_call import ToolCall
+from ..shield_call_step import ShieldCallStep
+from ..tool_execution_step import ToolExecutionStep
+from ..memory_retrieval_step import MemoryRetrievalStep
+
+__all__ = [
+    "TurnResponseEvent",
+    "Payload",
+    "PayloadAgentTurnResponseStepStartPayload",
+    "PayloadAgentTurnResponseStepProgressPayload",
+    "PayloadAgentTurnResponseStepProgressPayloadDelta",
+    "PayloadAgentTurnResponseStepProgressPayloadDeltaTextDelta",
+    "PayloadAgentTurnResponseStepProgressPayloadDeltaImageDelta",
+    "PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDelta",
+    "PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDeltaToolCall",
+    "PayloadAgentTurnResponseStepCompletePayload",
+    "PayloadAgentTurnResponseStepCompletePayloadStepDetails",
+    "PayloadAgentTurnResponseTurnStartPayload",
+    "PayloadAgentTurnResponseTurnCompletePayload",
+    "PayloadAgentTurnResponseTurnAwaitingInputPayload",
+]
+
+
+class PayloadAgentTurnResponseStepStartPayload(BaseModel):
+    event_type: Literal["step_start"]
+    """Type of event being reported"""
+
+    step_id: str
+    """Unique identifier for the step within a turn"""
+
+    step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"]
+    """Type of step being executed"""
+
+    metadata: Optional[Dict[str, Union[bool, float, str, List[object], object, None]]] = None
+    """(Optional) Additional metadata for the step"""
+
+
+class PayloadAgentTurnResponseStepProgressPayloadDeltaTextDelta(BaseModel):
+    text: str
+    """The incremental text content"""
+
+    type: Literal["text"]
+    """Discriminator type of the delta. Always "text" """
+
+
+class PayloadAgentTurnResponseStepProgressPayloadDeltaImageDelta(BaseModel):
+    image: str
+    """The incremental image data as bytes"""
+
+    type: Literal["image"]
+    """Discriminator type of the delta. Always "image" """
+
+
+PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDeltaToolCall: TypeAlias = Union[str, ToolCall]
+
+
+class PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDelta(BaseModel):
+    parse_status: Literal["started", "in_progress", "failed", "succeeded"]
+    """Current parsing status of the tool call"""
+
+    tool_call: PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDeltaToolCall
+    """Either an in-progress tool call string or the final parsed tool call"""
+
+    type: Literal["tool_call"]
+    """Discriminator type of the delta. Always "tool_call" """
+
+
+PayloadAgentTurnResponseStepProgressPayloadDelta: TypeAlias = Annotated[
+    Union[
+        PayloadAgentTurnResponseStepProgressPayloadDeltaTextDelta,
+        PayloadAgentTurnResponseStepProgressPayloadDeltaImageDelta,
+        PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDelta,
+    ],
+    PropertyInfo(discriminator="type"),
+]
+
+
+class PayloadAgentTurnResponseStepProgressPayload(BaseModel):
+    delta: PayloadAgentTurnResponseStepProgressPayloadDelta
+    """Incremental content changes during step execution"""
+
+    event_type: Literal["step_progress"]
+    """Type of event being reported"""
+
+    step_id: str
+    """Unique identifier for the step within a turn"""
+
+    step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"]
+    """Type of step being executed"""
+
+
+PayloadAgentTurnResponseStepCompletePayloadStepDetails: TypeAlias = Annotated[
+    Union[InferenceStep, ToolExecutionStep, ShieldCallStep, MemoryRetrievalStep],
+    PropertyInfo(discriminator="step_type"),
+]
+
+
+class PayloadAgentTurnResponseStepCompletePayload(BaseModel):
+    event_type: Literal["step_complete"]
+    """Type of event being reported"""
+
+    step_details: PayloadAgentTurnResponseStepCompletePayloadStepDetails
+    """Complete details of the executed step"""
+
+    step_id: str
+    """Unique identifier for the step within a turn"""
+
+    step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"]
+    """Type of step being executed"""
+
+
+class PayloadAgentTurnResponseTurnStartPayload(BaseModel):
+    event_type: Literal["turn_start"]
+    """Type of event being reported"""
+
+    turn_id: str
+    """Unique identifier for the turn within a session"""
+
+
+class PayloadAgentTurnResponseTurnCompletePayload(BaseModel):
+    event_type: Literal["turn_complete"]
+    """Type of event being reported"""
+
+    turn: Turn
+    """Complete turn data including all steps and results"""
+
+
+class PayloadAgentTurnResponseTurnAwaitingInputPayload(BaseModel):
+    event_type: Literal["turn_awaiting_input"]
+    """Type of event being reported"""
+
+    turn: Turn
+    """Turn data when waiting for external tool responses"""
+
 
-__all__ = ["TurnResponseEvent"]
+Payload: TypeAlias = Annotated[
+    Union[
+        PayloadAgentTurnResponseStepStartPayload,
+        PayloadAgentTurnResponseStepProgressPayload,
+        PayloadAgentTurnResponseStepCompletePayload,
+        PayloadAgentTurnResponseTurnStartPayload,
+        PayloadAgentTurnResponseTurnCompletePayload,
+        PayloadAgentTurnResponseTurnAwaitingInputPayload,
+    ],
+    PropertyInfo(discriminator="event_type"),
+]
 
 
 class TurnResponseEvent(BaseModel):
-    payload: TurnResponseEventPayload
+    payload: Payload
     """Event-specific payload containing event data"""
diff --git a/src/llama_stack_client/types/agents/turn_response_event_payload.py b/src/llama_stack_client/types/agents/turn_response_event_payload.py
deleted file mode 100644
index 1844c61e..00000000
--- a/src/llama_stack_client/types/agents/turn_response_event_payload.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import Dict, List, Union, Optional
-from typing_extensions import Literal, Annotated, TypeAlias
-
-from .turn import Turn
-from ..._utils import PropertyInfo
-from ..._models import BaseModel
-from ..inference_step import InferenceStep
-from ..shield_call_step import ShieldCallStep
-from ..tool_execution_step import ToolExecutionStep
-from ..shared.content_delta import ContentDelta
-from ..memory_retrieval_step import MemoryRetrievalStep
-
-__all__ = [
-    "TurnResponseEventPayload",
-    "AgentTurnResponseStepStartPayload",
-    "AgentTurnResponseStepProgressPayload",
-    "AgentTurnResponseStepCompletePayload",
-    "AgentTurnResponseStepCompletePayloadStepDetails",
-    "AgentTurnResponseTurnStartPayload",
-    "AgentTurnResponseTurnCompletePayload",
-    "AgentTurnResponseTurnAwaitingInputPayload",
-]
-
-
-class AgentTurnResponseStepStartPayload(BaseModel):
-    event_type: Literal["step_start"]
-    """Type of event being reported"""
-
-    step_id: str
-    """Unique identifier for the step within a turn"""
-
-    step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"]
-    """Type of step being executed"""
-
-    metadata: Optional[Dict[str, Union[bool, float, str, List[object], object, None]]] = None
-    """(Optional) Additional metadata for the step"""
-
-
-class AgentTurnResponseStepProgressPayload(BaseModel):
-    delta: ContentDelta
-    """Incremental content changes during step execution"""
-
-    event_type: Literal["step_progress"]
-    """Type of event being reported"""
-
-    step_id: str
-    """Unique identifier for the step within a turn"""
-
-    step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"]
-    """Type of step being executed"""
-
-
-AgentTurnResponseStepCompletePayloadStepDetails: TypeAlias = Annotated[
-    Union[InferenceStep, ToolExecutionStep, ShieldCallStep, MemoryRetrievalStep],
-    PropertyInfo(discriminator="step_type"),
-]
-
-
-class AgentTurnResponseStepCompletePayload(BaseModel):
-    event_type: Literal["step_complete"]
-    """Type of event being reported"""
-
-    step_details: AgentTurnResponseStepCompletePayloadStepDetails
-    """Complete details of the executed step"""
-
-    step_id: str
-    """Unique identifier for the step within a turn"""
-
-    step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"]
-    """Type of step being executed"""
-
-
-class AgentTurnResponseTurnStartPayload(BaseModel):
-    event_type: Literal["turn_start"]
-    """Type of event being reported"""
-
-    turn_id: str
-    """Unique identifier for the turn within a session"""
-
-
-class AgentTurnResponseTurnCompletePayload(BaseModel):
-    event_type: Literal["turn_complete"]
-    """Type of event being reported"""
-
-    turn: Turn
-    """Complete turn data including all steps and results"""
-
-
-class AgentTurnResponseTurnAwaitingInputPayload(BaseModel):
-    event_type: Literal["turn_awaiting_input"]
-    """Type of event being reported"""
-
-    turn: Turn
-    """Turn data when waiting for external tool responses"""
-
-
-TurnResponseEventPayload: TypeAlias = Annotated[
-    Union[
-        AgentTurnResponseStepStartPayload,
-        AgentTurnResponseStepProgressPayload,
-        AgentTurnResponseStepCompletePayload,
-        AgentTurnResponseTurnStartPayload,
-        AgentTurnResponseTurnCompletePayload,
-        AgentTurnResponseTurnAwaitingInputPayload,
-    ],
-    PropertyInfo(discriminator="event_type"),
-]
diff --git a/src/llama_stack_client/types/benchmark_config_param.py b/src/llama_stack_client/types/benchmark_config_param.py
index 740bf99b..dc968521 100644
--- a/src/llama_stack_client/types/benchmark_config_param.py
+++ b/src/llama_stack_client/types/benchmark_config_param.py
@@ -2,17 +2,42 @@
 
 from __future__ import annotations
 
-from typing import Dict
-from typing_extensions import Required, TypedDict
+from typing import Dict, Union
+from typing_extensions import Literal, Required, TypeAlias, TypedDict
 
-from .eval_candidate_param import EvalCandidateParam
 from .scoring_fn_params_param import ScoringFnParamsParam
+from .shared_params.agent_config import AgentConfig
+from .shared_params.system_message import SystemMessage
+from .shared_params.sampling_params import SamplingParams
 
-__all__ = ["BenchmarkConfigParam"]
+__all__ = ["BenchmarkConfigParam", "EvalCandidate", "EvalCandidateModelCandidate", "EvalCandidateAgentCandidate"]
+
+
+class EvalCandidateModelCandidate(TypedDict, total=False):
+    model: Required[str]
+    """The model ID to evaluate."""
+
+    sampling_params: Required[SamplingParams]
+    """The sampling parameters for the model."""
+
+    type: Required[Literal["model"]]
+
+    system_message: SystemMessage
+    """(Optional) The system message providing instructions or context to the model."""
+
+
+class EvalCandidateAgentCandidate(TypedDict, total=False):
+    config: Required[AgentConfig]
+    """The configuration for the agent candidate."""
+
+    type: Required[Literal["agent"]]
+
+
+EvalCandidate: TypeAlias = Union[EvalCandidateModelCandidate, EvalCandidateAgentCandidate]
 
 
 class BenchmarkConfigParam(TypedDict, total=False):
-    eval_candidate: Required[EvalCandidateParam]
+    eval_candidate: Required[EvalCandidate]
     """The candidate to evaluate."""
 
     scoring_params: Required[Dict[str, ScoringFnParamsParam]]
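Since `EvalCandidateParam` is gone, callers build the candidate inline; because these are TypedDicts, plain dict literals continue to type-check. A hedged sketch (the model id and sampling shape are placeholders/assumptions, not taken from the patch):

```python
from llama_stack_client.types.benchmark_config_param import BenchmarkConfigParam

# Inline model candidate replacing the removed EvalCandidateParam import.
config: BenchmarkConfigParam = {
    "eval_candidate": {
        "type": "model",
        "model": "my-model",  # placeholder model id
        "sampling_params": {"strategy": {"type": "greedy"}},  # assumed sampling shape
    },
    "scoring_params": {},
}
```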
diff --git a/src/llama_stack_client/types/eval_candidate_param.py b/src/llama_stack_client/types/eval_candidate_param.py
deleted file mode 100644
index be1b21c8..00000000
--- a/src/llama_stack_client/types/eval_candidate_param.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from __future__ import annotations
-
-from typing import Union
-from typing_extensions import Literal, Required, TypeAlias, TypedDict
-
-from .shared_params.agent_config import AgentConfig
-from .shared_params.system_message import SystemMessage
-from .shared_params.sampling_params import SamplingParams
-
-__all__ = ["EvalCandidateParam", "ModelCandidate", "AgentCandidate"]
-
-
-class ModelCandidate(TypedDict, total=False):
-    model: Required[str]
-    """The model ID to evaluate."""
-
-    sampling_params: Required[SamplingParams]
-    """The sampling parameters for the model."""
-
-    type: Required[Literal["model"]]
-
-    system_message: SystemMessage
-    """(Optional) The system message providing instructions or context to the model."""
-
-
-class AgentCandidate(TypedDict, total=False):
-    config: Required[AgentConfig]
-    """The configuration for the agent candidate."""
-
-    type: Required[Literal["agent"]]
-
-
-EvalCandidateParam: TypeAlias = Union[ModelCandidate, AgentCandidate]
diff --git a/src/llama_stack_client/types/file_create_params.py b/src/llama_stack_client/types/file_create_params.py
index 6278e1a0..2be39a7a 100644
--- a/src/llama_stack_client/types/file_create_params.py
+++ b/src/llama_stack_client/types/file_create_params.py
@@ -2,12 +2,28 @@
 
 from __future__ import annotations
 
-from typing_extensions import Required, TypedDict
+from typing_extensions import Literal, Required, TypedDict
 
 from .._types import FileTypes
 
-__all__ = ["FileCreateParams"]
+__all__ = ["FileCreateParams", "ExpiresAfter"]
 
 
 class FileCreateParams(TypedDict, total=False):
     file: Required[FileTypes]
+
+    purpose: Required[Literal["assistants", "batch"]]
+    """Valid purpose values for OpenAI Files API."""
+
+    expires_after: ExpiresAfter
+    """Control expiration of uploaded files. Params:
+
+    - anchor, must be "created_at"
+    - seconds, must be int between 3600 and 2592000 (1 hour to 30 days)
+    """
+
+
+class ExpiresAfter(TypedDict, total=False):
+    anchor: Required[Literal["created_at"]]
+
+    seconds: Required[int]
diff --git a/src/llama_stack_client/types/shared/__init__.py b/src/llama_stack_client/types/shared/__init__.py
index 007d56ac..f346cda7 100644
--- a/src/llama_stack_client/types/shared/__init__.py
+++ b/src/llama_stack_client/types/shared/__init__.py
@@ -9,7 +9,6 @@
 from .query_config import QueryConfig as QueryConfig
 from .query_result import QueryResult as QueryResult
 from .user_message import UserMessage as UserMessage
-from .content_delta import ContentDelta as ContentDelta
 from .scoring_result import ScoringResult as ScoringResult
 from .system_message import SystemMessage as SystemMessage
 from .response_format import ResponseFormat as ResponseFormat
@@ -19,6 +18,5 @@
 from .interleaved_content import InterleavedContent as InterleavedContent
 from .tool_param_definition import ToolParamDefinition as ToolParamDefinition
 from .tool_response_message import ToolResponseMessage as ToolResponseMessage
-from .query_generator_config import QueryGeneratorConfig as QueryGeneratorConfig
 from .chat_completion_response import ChatCompletionResponse as ChatCompletionResponse
 from .interleaved_content_item import InterleavedContentItem as InterleavedContentItem
diff --git a/src/llama_stack_client/types/shared/content_delta.py b/src/llama_stack_client/types/shared/content_delta.py
deleted file mode 100644
index 7ed58d13..00000000
--- a/src/llama_stack_client/types/shared/content_delta.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import Union
-from typing_extensions import Literal, Annotated, TypeAlias
-
-from ..._utils import PropertyInfo
-from ..._models import BaseModel
-from .tool_call import ToolCall
-
-__all__ = ["ContentDelta", "TextDelta", "ImageDelta", "ToolCallDelta", "ToolCallDeltaToolCall"]
-
-
-class TextDelta(BaseModel):
-    text: str
-    """The incremental text content"""
-
-    type: Literal["text"]
-    """Discriminator type of the delta. Always "text" """
-
-
-class ImageDelta(BaseModel):
-    image: str
-    """The incremental image data as bytes"""
-
-    type: Literal["image"]
-    """Discriminator type of the delta. Always "image" """
-
-
-ToolCallDeltaToolCall: TypeAlias = Union[str, ToolCall]
-
-
-class ToolCallDelta(BaseModel):
-    parse_status: Literal["started", "in_progress", "failed", "succeeded"]
-    """Current parsing status of the tool call"""
-
-    tool_call: ToolCallDeltaToolCall
-    """Either an in-progress tool call string or the final parsed tool call"""
-
-    type: Literal["tool_call"]
-    """Discriminator type of the delta. Always "tool_call" """
-
-
-ContentDelta: TypeAlias = Annotated[Union[TextDelta, ImageDelta, ToolCallDelta], PropertyInfo(discriminator="type")]
diff --git a/src/llama_stack_client/types/shared/query_config.py b/src/llama_stack_client/types/shared/query_config.py
index 389514c7..a4a1f741 100644
--- a/src/llama_stack_client/types/shared/query_config.py
+++ b/src/llama_stack_client/types/shared/query_config.py
@@ -5,9 +5,41 @@
 
 from ..._utils import PropertyInfo
 from ..._models import BaseModel
-from .query_generator_config import QueryGeneratorConfig
 
-__all__ = ["QueryConfig", "Ranker", "RankerRrfRanker", "RankerWeightedRanker"]
+__all__ = [
+    "QueryConfig",
+    "QueryGeneratorConfig",
+    "QueryGeneratorConfigDefaultRagQueryGeneratorConfig",
+    "QueryGeneratorConfigLlmragQueryGeneratorConfig",
+    "Ranker",
+    "RankerRrfRanker",
+    "RankerWeightedRanker",
+]
+
+
+class QueryGeneratorConfigDefaultRagQueryGeneratorConfig(BaseModel):
+    separator: str
+    """String separator used to join query terms"""
+
+    type: Literal["default"]
+    """Type of query generator, always 'default'"""
+
+
+class QueryGeneratorConfigLlmragQueryGeneratorConfig(BaseModel):
+    model: str
+    """Name of the language model to use for query generation"""
+
+    template: str
+    """Template string for formatting the query generation prompt"""
+
+    type: Literal["llm"]
+    """Type of query generator, always 'llm'"""
+
+
+QueryGeneratorConfig: TypeAlias = Annotated[
+    Union[QueryGeneratorConfigDefaultRagQueryGeneratorConfig, QueryGeneratorConfigLlmragQueryGeneratorConfig],
+    PropertyInfo(discriminator="type"),
+]
 
 
 class RankerRrfRanker(BaseModel):
diff --git a/src/llama_stack_client/types/shared/query_generator_config.py b/src/llama_stack_client/types/shared/query_generator_config.py
deleted file mode 100644
index 624fc190..00000000
--- a/src/llama_stack_client/types/shared/query_generator_config.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import Union
-from typing_extensions import Literal, Annotated, TypeAlias
-
-from ..._utils import PropertyInfo
-from ..._models import BaseModel
-
-__all__ = ["QueryGeneratorConfig", "DefaultRagQueryGeneratorConfig", "LlmragQueryGeneratorConfig"]
-
-
-class DefaultRagQueryGeneratorConfig(BaseModel):
-    separator: str
-    """String separator used to join query terms"""
-
-    type: Literal["default"]
-    """Type of query generator, always 'default'"""
-
-
-class LlmragQueryGeneratorConfig(BaseModel):
-    model: str
-    """Name of the language model to use for query generation"""
-
-    template: str
-    """Template string for formatting the query generation prompt"""
-
-    type: Literal["llm"]
-    """Type of query generator, always 'llm'"""
-
-
-QueryGeneratorConfig: TypeAlias = Annotated[
-    Union[DefaultRagQueryGeneratorConfig, LlmragQueryGeneratorConfig], PropertyInfo(discriminator="type")
-]
diff --git a/src/llama_stack_client/types/shared_params/__init__.py b/src/llama_stack_client/types/shared_params/__init__.py
index 2ba8b592..894d8a8d 100644
--- a/src/llama_stack_client/types/shared_params/__init__.py
+++ b/src/llama_stack_client/types/shared_params/__init__.py
@@ -12,5 +12,4 @@
 from .completion_message import CompletionMessage as CompletionMessage
 from .interleaved_content import InterleavedContent as InterleavedContent
 from .tool_response_message import ToolResponseMessage as ToolResponseMessage
-from .query_generator_config import QueryGeneratorConfig as QueryGeneratorConfig
 from .interleaved_content_item import InterleavedContentItem as InterleavedContentItem
diff --git a/src/llama_stack_client/types/shared_params/query_config.py b/src/llama_stack_client/types/shared_params/query_config.py
index d008c48c..91a5b596 100644
--- a/src/llama_stack_client/types/shared_params/query_config.py
+++ b/src/llama_stack_client/types/shared_params/query_config.py
@@ -5,9 +5,39 @@
 from typing import Union
 from typing_extensions import Literal, Required, TypeAlias, TypedDict
 
-from .query_generator_config import QueryGeneratorConfig
+__all__ = [
+    "QueryConfig",
+    "QueryGeneratorConfig",
+    "QueryGeneratorConfigDefaultRagQueryGeneratorConfig",
+    "QueryGeneratorConfigLlmragQueryGeneratorConfig",
+    "Ranker",
+    "RankerRrfRanker",
+    "RankerWeightedRanker",
+]
 
-__all__ = ["QueryConfig", "Ranker", "RankerRrfRanker", "RankerWeightedRanker"]
+
+class QueryGeneratorConfigDefaultRagQueryGeneratorConfig(TypedDict, total=False):
+    separator: Required[str]
+    """String separator used to join query terms"""
+
+    type: Required[Literal["default"]]
+    """Type of query generator, always 'default'"""
+
+
+class QueryGeneratorConfigLlmragQueryGeneratorConfig(TypedDict, total=False):
+    model: Required[str]
+    """Name of the language model to use for query generation"""
+
+    template: Required[str]
+    """Template string for formatting the query generation prompt"""
+
+    type: Required[Literal["llm"]]
+    """Type of query generator, always 'llm'"""
+
+
+QueryGeneratorConfig: TypeAlias = Union[
+    QueryGeneratorConfigDefaultRagQueryGeneratorConfig, QueryGeneratorConfigLlmragQueryGeneratorConfig
+]
 
 
 class RankerRrfRanker(TypedDict, total=False):
diff --git a/src/llama_stack_client/types/shared_params/query_generator_config.py b/src/llama_stack_client/types/shared_params/query_generator_config.py
deleted file mode 100644
index 8c589bf9..00000000
--- a/src/llama_stack_client/types/shared_params/query_generator_config.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from __future__ import annotations
-
-from typing import Union
-from typing_extensions import Literal, Required, TypeAlias, TypedDict
-
-__all__ = ["QueryGeneratorConfig", "DefaultRagQueryGeneratorConfig", "LlmragQueryGeneratorConfig"]
-
-
-class DefaultRagQueryGeneratorConfig(TypedDict, total=False):
-    separator: Required[str]
-    """String separator used to join query terms"""
-
-    type: Required[Literal["default"]]
-    """Type of query generator, always 'default'"""
-
-
-class LlmragQueryGeneratorConfig(TypedDict, total=False):
-    model: Required[str]
-    """Name of the language model to use for query generation"""
-
-    template: Required[str]
-    """Template string for formatting the query generation prompt"""
-
-    type: Required[Literal["llm"]]
-    """Type of query generator, always 'llm'"""
-
-
-QueryGeneratorConfig: TypeAlias = Union[DefaultRagQueryGeneratorConfig, LlmragQueryGeneratorConfig]
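With `QueryGeneratorConfig` folded into `query_config`, the wire shape is unchanged; only imports of the standalone shared param need updating. A small sketch of the two union variants as dict literals (model name and template are placeholders):

```python
from llama_stack_client.types.shared_params.query_config import QueryGeneratorConfig

# Both variants of the union are plain TypedDicts, so dict literals still work.
default_cfg: QueryGeneratorConfig = {"type": "default", "separator": " "}
llm_cfg: QueryGeneratorConfig = {
    "type": "llm",
    "model": "my-query-model",  # placeholder model name
    "template": "Rewrite the question: {query}",  # placeholder template
}
```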
diff --git a/tests/api_resources/test_files.py b/tests/api_resources/test_files.py
index bdf81d4f..83b763ab 100644
--- a/tests/api_resources/test_files.py
+++ b/tests/api_resources/test_files.py
@@ -22,6 +22,19 @@ class TestFiles:
     def test_method_create(self, client: LlamaStackClient) -> None:
         file = client.files.create(
             file=b"raw file contents",
+            purpose="assistants",
+        )
+        assert_matches_type(File, file, path=["response"])
+
+    @parametrize
+    def test_method_create_with_all_params(self, client: LlamaStackClient) -> None:
+        file = client.files.create(
+            file=b"raw file contents",
+            purpose="assistants",
+            expires_after={
+                "anchor": "created_at",
+                "seconds": 0,
+            },
         )
         assert_matches_type(File, file, path=["response"])
 
@@ -29,6 +42,7 @@ def test_method_create(self, client: LlamaStackClient) -> None:
     def test_raw_response_create(self, client: LlamaStackClient) -> None:
         response = client.files.with_raw_response.create(
             file=b"raw file contents",
+            purpose="assistants",
         )
 
         assert response.is_closed is True
@@ -40,6 +54,7 @@ def test_raw_response_create(self, client: LlamaStackClient) -> None:
     def test_streaming_response_create(self, client: LlamaStackClient) -> None:
         with client.files.with_streaming_response.create(
             file=b"raw file contents",
+            purpose="assistants",
         ) as response:
             assert not response.is_closed
             assert response.http_request.headers.get("X-Stainless-Lang") == "python"
@@ -208,6 +223,19 @@ class TestAsyncFiles:
     async def test_method_create(self, async_client: AsyncLlamaStackClient) -> None:
         file = await async_client.files.create(
             file=b"raw file contents",
+            purpose="assistants",
+        )
+        assert_matches_type(File, file, path=["response"])
+
+    @parametrize
+    async def test_method_create_with_all_params(self, async_client: AsyncLlamaStackClient) -> None:
+        file = await async_client.files.create(
+            file=b"raw file contents",
+            purpose="assistants",
+            expires_after={
+                "anchor": "created_at",
+                "seconds": 0,
+            },
         )
         assert_matches_type(File, file, path=["response"])
 
@@ -215,6 +243,7 @@ async def test_method_create(self, async_client: AsyncLlamaStackClient) -> None:
     async def test_raw_response_create(self, async_client: AsyncLlamaStackClient) -> None:
         response = await async_client.files.with_raw_response.create(
             file=b"raw file contents",
+            purpose="assistants",
         )
 
         assert response.is_closed is True
@@ -226,6 +255,7 @@ async def test_raw_response_create(self, async_client: AsyncLlamaStackClient) ->
     async def test_streaming_response_create(self, async_client: AsyncLlamaStackClient) -> None:
         async with async_client.files.with_streaming_response.create(
             file=b"raw file contents",
+            purpose="assistants",
         ) as response:
             assert not response.is_closed
             assert response.http_request.headers.get("X-Stainless-Lang") == "python"
From f89674726f55915a8cda0e2b4284be3c92978121 Mon Sep 17 00:00:00 2001
From: "stainless-app[bot]"
 <142633134+stainless-app[bot]@users.noreply.github.com>
Date: Tue, 30 Sep 2025 03:38:32 +0000
Subject: [PATCH 6/8] docs: update examples
---
 .stats.yml           |   2 +-
 README.md            |  96 ++++++++++++++++++++++++++++---
 tests/test_client.py | 132 +++++++++++++++++++++++++++++++++++--------
 3 files changed, 198 insertions(+), 32 deletions(-)
diff --git a/.stats.yml b/.stats.yml
index 20dba32e..36fa92d0 100644
--- a/.stats.yml
+++ b/.stats.yml
@@ -1,4 +1,4 @@
 configured_endpoints: 105
 openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/llamastack%2Fllama-stack-client-d7bea816190382a93511491e33d1f37f707620926ab133ae8ce0883d763df741.yml
 openapi_spec_hash: f73b3af77108625edae3f25972b9e665
-config_hash: 5b643c97c83a497d7d346253f1e175f3
+config_hash: 06f95bf1b7786cfe2470af8f238fc36d
diff --git a/README.md b/README.md
index 76c9b9ae..c8cebcc3 100644
--- a/README.md
+++ b/README.md
@@ -109,6 +109,50 @@ asyncio.run(main())
 
 Functionality between the synchronous and asynchronous clients is otherwise identical.
 
+## Streaming responses
+
+We provide support for streaming responses using Server-Sent Events (SSE).
+
+```python
+from llama_stack_client import LlamaStackClient
+
+client = LlamaStackClient()
+
+stream = client.chat.completions.create(
+    messages=[
+        {
+            "content": "string",
+            "role": "user",
+        }
+    ],
+    model="model",
+    stream=True,
+)
+for completion in stream:
+    print(completion)
+```
+
+The async client uses the exact same interface.
+
+```python
+from llama_stack_client import AsyncLlamaStackClient
+
+client = AsyncLlamaStackClient()
+
+stream = await client.chat.completions.create(
+    messages=[
+        {
+            "content": "string",
+            "role": "user",
+        }
+    ],
+    model="model",
+    stream=True,
+)
+async for completion in stream:
+    print(completion)
+```
+
 ## Using types
 
 Nested request parameters are [TypedDicts](https://docs.python.org/3/library/typing.html#typing.TypedDict). Responses are [Pydantic models](https://docs.pydantic.dev) which also provide helper methods for things like:
@@ -168,7 +212,15 @@ from llama_stack_client import LlamaStackClient
 client = LlamaStackClient()
 
 try:
-    client.agents.toolgroups.list()
+    client.chat.completions.create(
+        messages=[
+            {
+                "content": "string",
+                "role": "user",
+            }
+        ],
+        model="model",
+    )
 except llama_stack_client.APIConnectionError as e:
     print("The server could not be reached")
     print(e.__cause__)  # an underlying Exception, likely raised within httpx.
@@ -211,7 +263,15 @@ client = LlamaStackClient(
 )
 
 # Or, configure per-request:
-client.with_options(max_retries=5).toolgroups.list.create()
+client.with_options(max_retries=5).chat.completions.create(
+    messages=[
+        {
+            "content": "string",
+            "role": "user",
+        }
+    ],
+    model="model",
+)
 ```
 
 ### Timeouts
@@ -234,7 +294,15 @@ client = LlamaStackClient(
 )
 
 # Override per-request:
-client.with_options(timeout=5.0).toolgroups.list.create()
+client.with_options(timeout=5.0).chat.completions.create(
+    messages=[
+        {
+            "content": "string",
+            "role": "user",
+        }
+    ],
+    model="model",
+)
 ```
 
 On timeout, an `APITimeoutError` is thrown.
@@ -273,11 +341,17 @@ The "raw" Response object can be accessed by prefixing `.with_raw_response.` to
 from llama_stack_client import LlamaStackClient
 
 client = LlamaStackClient()
-response = client.toolgroups.with_raw_response.list()
+response = client.chat.completions.with_raw_response.create(
+    messages=[{
+        "content": "string",
+        "role": "user",
+    }],
+    model="model",
+)
 print(response.headers.get('X-My-Header'))
 
-toolgroup = response.parse()  # get the object that `toolgroups.list()` would have returned
-print(toolgroup)
+completion = response.parse()  # get the object that `chat.completions.create()` would have returned
+print(completion)
 ```
 
 These methods return an [`APIResponse`](https://github.com/meta-llama/llama-stack-python/tree/main/src/llama_stack_client/_response.py) object.
@@ -291,7 +365,15 @@ The above interface eagerly reads the full response body when you make the reque
 To stream the response body, use `.with_streaming_response` instead, which requires a context manager and only reads the response body once you call `.read()`, `.text()`, `.json()`, `.iter_bytes()`, `.iter_text()`, `.iter_lines()` or `.parse()`. In the async client, these are async methods.
 
 ```python
-with client.agents.toolgroups.with_streaming_response.list() as response:
+with client.chat.completions.with_streaming_response.create(
+    messages=[
+        {
+            "content": "string",
+            "role": "user",
+        }
+    ],
+    model="model",
+) as response:
     print(response.headers.get("X-My-Header"))
 
     for line in response.iter_lines():
diff --git a/tests/test_client.py b/tests/test_client.py
index c5606d5d..708c7420 100644
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -678,20 +678,36 @@ def test_parse_retry_after_header(self, remaining_retries: int, retry_after: str
     @mock.patch("llama_stack_client._base_client.BaseClient._calculate_retry_timeout", _low_retry_timeout)
     @pytest.mark.respx(base_url=base_url)
     def test_retrying_timeout_errors_doesnt_leak(self, respx_mock: MockRouter, client: LlamaStackClient) -> None:
-        respx_mock.get("/v1/toolgroups").mock(side_effect=httpx.TimeoutException("Test timeout error"))
+        respx_mock.post("/v1/chat/completions").mock(side_effect=httpx.TimeoutException("Test timeout error"))
 
         with pytest.raises(APITimeoutError):
-            client.toolgroups.with_streaming_response.list().__enter__()
+            client.chat.completions.with_streaming_response.create(
+                messages=[
+                    {
+                        "content": "string",
+                        "role": "user",
+                    }
+                ],
+                model="model",
+            ).__enter__()
 
         assert _get_open_connections(self.client) == 0
 
     @mock.patch("llama_stack_client._base_client.BaseClient._calculate_retry_timeout", _low_retry_timeout)
     @pytest.mark.respx(base_url=base_url)
     def test_retrying_status_errors_doesnt_leak(self, respx_mock: MockRouter, client: LlamaStackClient) -> None:
-        respx_mock.get("/v1/toolgroups").mock(return_value=httpx.Response(500))
+        respx_mock.post("/v1/chat/completions").mock(return_value=httpx.Response(500))
 
         with pytest.raises(APIStatusError):
-            client.toolgroups.with_streaming_response.list().__enter__()
+            client.chat.completions.with_streaming_response.create(
+                messages=[
+                    {
+                        "content": "string",
+                        "role": "user",
+                    }
+                ],
+                model="model",
+            ).__enter__()
         assert _get_open_connections(self.client) == 0
 
     @pytest.mark.parametrize("failures_before_success", [0, 2, 4])
@@ -718,9 +734,17 @@ def retry_handler(_request: httpx.Request) -> httpx.Response:
                 return httpx.Response(500)
             return httpx.Response(200)
 
-        respx_mock.get("/v1/toolgroups").mock(side_effect=retry_handler)
+        respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler)
 
-        response = client.toolgroups.with_raw_response.list()
+        response = client.chat.completions.with_raw_response.create(
+            messages=[
+                {
+                    "content": "string",
+                    "role": "user",
+                }
+            ],
+            model="model",
+        )
 
         assert response.retries_taken == failures_before_success
         assert int(response.http_request.headers.get("x-stainless-retry-count")) == failures_before_success
@@ -742,9 +766,18 @@ def retry_handler(_request: httpx.Request) -> httpx.Response:
                 return httpx.Response(500)
             return httpx.Response(200)
 
-        respx_mock.get("/v1/toolgroups").mock(side_effect=retry_handler)
-
-        response = client.toolgroups.with_raw_response.list(extra_headers={"x-stainless-retry-count": Omit()})
+        respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler)
+
+        response = client.chat.completions.with_raw_response.create(
+            messages=[
+                {
+                    "content": "string",
+                    "role": "user",
+                }
+            ],
+            model="model",
+            extra_headers={"x-stainless-retry-count": Omit()},
+        )
 
         assert len(response.http_request.headers.get_list("x-stainless-retry-count")) == 0
 
@@ -765,9 +798,18 @@ def retry_handler(_request: httpx.Request) -> httpx.Response:
                 return httpx.Response(500)
             return httpx.Response(200)
 
-        respx_mock.get("/v1/toolgroups").mock(side_effect=retry_handler)
-
-        response = client.toolgroups.with_raw_response.list(extra_headers={"x-stainless-retry-count": "42"})
+        respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler)
+
+        response = client.chat.completions.with_raw_response.create(
+            messages=[
+                {
+                    "content": "string",
+                    "role": "user",
+                }
+            ],
+            model="model",
+            extra_headers={"x-stainless-retry-count": "42"},
+        )
 
         assert response.http_request.headers.get("x-stainless-retry-count") == "42"
 
@@ -1456,10 +1498,18 @@ async def test_parse_retry_after_header(self, remaining_retries: int, retry_afte
     async def test_retrying_timeout_errors_doesnt_leak(
         self, respx_mock: MockRouter, async_client: AsyncLlamaStackClient
     ) -> None:
-        respx_mock.get("/v1/toolgroups").mock(side_effect=httpx.TimeoutException("Test timeout error"))
+        respx_mock.post("/v1/chat/completions").mock(side_effect=httpx.TimeoutException("Test timeout error"))
 
         with pytest.raises(APITimeoutError):
-            await async_client.toolgroups.with_streaming_response.list().__aenter__()
+            await async_client.chat.completions.with_streaming_response.create(
+                messages=[
+                    {
+                        "content": "string",
+                        "role": "user",
+                    }
+                ],
+                model="model",
+            ).__aenter__()
 
         assert _get_open_connections(self.client) == 0
 
@@ -1468,10 +1518,18 @@ async def test_retrying_timeout_errors_doesnt_leak(
     async def test_retrying_status_errors_doesnt_leak(
         self, respx_mock: MockRouter, async_client: AsyncLlamaStackClient
     ) -> None:
-        respx_mock.get("/v1/toolgroups").mock(return_value=httpx.Response(500))
+        respx_mock.post("/v1/chat/completions").mock(return_value=httpx.Response(500))
 
         with pytest.raises(APIStatusError):
-            await async_client.toolgroups.with_streaming_response.list().__aenter__()
+            await async_client.chat.completions.with_streaming_response.create(
+                messages=[
+                    {
+                        "content": "string",
+                        "role": "user",
+                    }
+                ],
+                model="model",
+            ).__aenter__()
         assert _get_open_connections(self.client) == 0
 
     @pytest.mark.parametrize("failures_before_success", [0, 2, 4])
@@ -1499,9 +1557,17 @@ def retry_handler(_request: httpx.Request) -> httpx.Response:
                 return httpx.Response(500)
             return httpx.Response(200)
 
-        respx_mock.get("/v1/toolgroups").mock(side_effect=retry_handler)
+        respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler)
 
-        response = await client.toolgroups.with_raw_response.list()
+        response = await client.chat.completions.with_raw_response.create(
+            messages=[
+                {
+                    "content": "string",
+                    "role": "user",
+                }
+            ],
+            model="model",
+        )
 
         assert response.retries_taken == failures_before_success
         assert int(response.http_request.headers.get("x-stainless-retry-count")) == failures_before_success
@@ -1524,9 +1590,18 @@ def retry_handler(_request: httpx.Request) -> httpx.Response:
                 return httpx.Response(500)
             return httpx.Response(200)
 
-        respx_mock.get("/v1/toolgroups").mock(side_effect=retry_handler)
-
-        response = await client.toolgroups.with_raw_response.list(extra_headers={"x-stainless-retry-count": Omit()})
+        respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler)
+
+        response = await client.chat.completions.with_raw_response.create(
+            messages=[
+                {
+                    "content": "string",
+                    "role": "user",
+                }
+            ],
+            model="model",
+            extra_headers={"x-stainless-retry-count": Omit()},
+        )
 
         assert len(response.http_request.headers.get_list("x-stainless-retry-count")) == 0
 
@@ -1548,9 +1623,18 @@ def retry_handler(_request: httpx.Request) -> httpx.Response:
                 return httpx.Response(500)
             return httpx.Response(200)
 
-        respx_mock.get("/v1/toolgroups").mock(side_effect=retry_handler)
-
-        response = await client.toolgroups.with_raw_response.list(extra_headers={"x-stainless-retry-count": "42"})
+        respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler)
+
+        response = await client.chat.completions.with_raw_response.create(
+            messages=[
+                {
+                    "content": "string",
+                    "role": "user",
+                }
+            ],
+            model="model",
+            extra_headers={"x-stainless-retry-count": "42"},
+        )
 
         assert response.http_request.headers.get("x-stainless-retry-count") == "42"
 
From 4c75724250abc5a8424f35ff25956132d317c00f Mon Sep 17 00:00:00 2001
From: "stainless-app[bot]"
 <142633134+stainless-app[bot]@users.noreply.github.com>
Date: Tue, 30 Sep 2025 03:39:36 +0000
Subject: [PATCH 7/8] codegen metadata
---
 .stats.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.stats.yml b/.stats.yml
index 36fa92d0..755df453 100644
--- a/.stats.yml
+++ b/.stats.yml
@@ -1,4 +1,4 @@
 configured_endpoints: 105
 openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/llamastack%2Fllama-stack-client-d7bea816190382a93511491e33d1f37f707620926ab133ae8ce0883d763df741.yml
 openapi_spec_hash: f73b3af77108625edae3f25972b9e665
-config_hash: 06f95bf1b7786cfe2470af8f238fc36d
+config_hash: 548f336ac1b68ab1dfe385b79df764dd
From 1231814b1aeb959bb43ae2eecb3d6a118b7582be Mon Sep 17 00:00:00 2001
From: "stainless-app[bot]"
 <142633134+stainless-app[bot]@users.noreply.github.com>
Date: Tue, 30 Sep 2025 03:41:14 +0000
Subject: [PATCH 8/8] release: 0.3.0-alpha.1
---
 .release-please-manifest.json |  2 +-
 CHANGELOG.md                  | 31 +++++++++++++++++++++++++++++++
 pyproject.toml                |  2 +-
 3 files changed, 33 insertions(+), 2 deletions(-)
diff --git a/.release-please-manifest.json b/.release-please-manifest.json
index ed9acd29..1ae25264 100644
--- a/.release-please-manifest.json
+++ b/.release-please-manifest.json
@@ -1,3 +1,3 @@
 {
-  ".": "0.2.23-alpha.1"
+  ".": "0.3.0-alpha.1"
 }
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0011c19f..93d68692 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,36 @@
 # Changelog
 
+## 0.3.0-alpha.1 (2025-09-30)
+
+Full Changelog: [v0.2.23-alpha.1...v0.3.0-alpha.1](https://github.com/llamastack/llama-stack-client-python/compare/v0.2.23-alpha.1...v0.3.0-alpha.1)
+
+### ⚠ BREAKING CHANGES
+
+* **api:** fixes to remove deprecated inference resources
+
+### Features
+
+* **api:** expires_after changes for /files ([7f24c43](https://github.com/llamastack/llama-stack-client-python/commit/7f24c432dc1859312710a4a1ff4a80f6f861bee8))
+* **api:** fixes to remove deprecated inference resources ([04834d2](https://github.com/llamastack/llama-stack-client-python/commit/04834d2189ae4e4b8cd2c9370d1d39857bc6e9ec))
+* **api:** removing openai/v1 ([a918b43](https://github.com/llamastack/llama-stack-client-python/commit/a918b4323118c18f77c2abe7e1a3054c1eebeaac))
+* **api:** updating post /v1/files to have correct multipart/form-data ([433a996](https://github.com/llamastack/llama-stack-client-python/commit/433a996527bcca131ada4730376d8993f34ad6f5))
+
+
+### Bug Fixes
+
+* clean up deprecated code ([f10ead0](https://github.com/llamastack/llama-stack-client-python/commit/f10ead00522b7ca803cd7dc3617da0d451efa7da))
+* Don't retry for non-recoverable server http errors ([#212](https://github.com/llamastack/llama-stack-client-python/issues/212)) ([6782e8f](https://github.com/llamastack/llama-stack-client-python/commit/6782e8fc5931369223ed4446f8e7732f62712eff))
+
+
+### Documentation
+
+* update examples ([f896747](https://github.com/llamastack/llama-stack-client-python/commit/f89674726f55915a8cda0e2b4284be3c92978121))
+
+
+### Build System
+
+* Bump version to 0.2.23 ([0d4dc64](https://github.com/llamastack/llama-stack-client-python/commit/0d4dc6449224fa2a0f6d20f6229dd9d1a5427861))
+
 ## 0.2.23-alpha.1 (2025-09-26)
 
 Full Changelog: [v0.2.19-alpha.1...v0.2.23-alpha.1](https://github.com/llamastack/llama-stack-client-python/compare/v0.2.19-alpha.1...v0.2.23-alpha.1)
diff --git a/pyproject.toml b/pyproject.toml
index 843dd9b7..3b50518e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "llama_stack_client"
-version = "0.2.23"
+version = "0.3.0-alpha.1"
 description = "The official Python library for the llama-stack-client API"
 dynamic = ["readme"]
 license = "MIT"