diff --git a/src/llama_stack_client/_base_client.py b/src/llama_stack_client/_base_client.py index 5a0376e6..a0c4ea5b 100644 --- a/src/llama_stack_client/_base_client.py +++ b/src/llama_stack_client/_base_client.py @@ -98,7 +98,11 @@ _AsyncStreamT = TypeVar("_AsyncStreamT", bound=AsyncStream[Any]) if TYPE_CHECKING: - from httpx._config import DEFAULT_TIMEOUT_CONFIG as HTTPX_DEFAULT_TIMEOUT + from httpx._config import ( + DEFAULT_TIMEOUT_CONFIG, # pyright: ignore[reportPrivateImportUsage] + ) + + HTTPX_DEFAULT_TIMEOUT = DEFAULT_TIMEOUT_CONFIG else: try: from httpx._config import DEFAULT_TIMEOUT_CONFIG as HTTPX_DEFAULT_TIMEOUT @@ -115,6 +119,7 @@ class PageInfo: url: URL | NotGiven params: Query | NotGiven + json: Body | NotGiven @overload def __init__( @@ -130,19 +135,30 @@ def __init__( params: Query, ) -> None: ... + @overload + def __init__( + self, + *, + json: Body, + ) -> None: ... + def __init__( self, *, url: URL | NotGiven = NOT_GIVEN, + json: Body | NotGiven = NOT_GIVEN, params: Query | NotGiven = NOT_GIVEN, ) -> None: self.url = url + self.json = json self.params = params @override def __repr__(self) -> str: if self.url: return f"{self.__class__.__name__}(url={self.url})" + if self.json: + return f"{self.__class__.__name__}(json={self.json})" return f"{self.__class__.__name__}(params={self.params})" @@ -191,6 +207,19 @@ def _info_to_options(self, info: PageInfo) -> FinalRequestOptions: options.url = str(url) return options + if not isinstance(info.json, NotGiven): + if not is_mapping(info.json): + raise TypeError("Pagination is only supported with mappings") + + if not options.json_data: + options.json_data = {**info.json} + else: + if not is_mapping(options.json_data): + raise TypeError("Pagination is only supported with mappings") + + options.json_data = {**options.json_data, **info.json} + return options + raise ValueError("Unexpected PageInfo state") @@ -408,8 +437,8 @@ def _build_headers(self, options: FinalRequestOptions, *, retries_taken: int = 0 headers = httpx.Headers(headers_dict) idempotency_header = self._idempotency_header - if idempotency_header and options.method.lower() != "get" and idempotency_header not in headers: - headers[idempotency_header] = options.idempotency_key or self._idempotency_key() + if idempotency_header and options.idempotency_key and idempotency_header not in headers: + headers[idempotency_header] = options.idempotency_key # Don't set these headers if they were already set or removed by the caller. We check # `custom_headers`, which can contain `Omit()`, instead of `headers` to account for the removal case. @@ -873,7 +902,6 @@ def request( self, cast_to: Type[ResponseT], options: FinalRequestOptions, - remaining_retries: Optional[int] = None, *, stream: Literal[True], stream_cls: Type[_StreamT], @@ -884,7 +912,6 @@ def request( self, cast_to: Type[ResponseT], options: FinalRequestOptions, - remaining_retries: Optional[int] = None, *, stream: Literal[False] = False, ) -> ResponseT: ... 
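# Aside (not part of the diff): a minimal sketch of what the new `json` field on
# PageInfo enables. Query-based pagination keeps working as before; a pager can
# now also return PageInfo(json=...) and _info_to_options() will merge that
# mapping into the existing JSON request body. The "cursor" key below is a
# hypothetical example, not an actual Llama Stack parameter.
from llama_stack_client._base_client import PageInfo

next_by_params = PageInfo(params={"page": 2})       # existing query-param form
next_by_body = PageInfo(json={"cursor": "abc123"})  # new form, merged into options.json_data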
@@ -894,7 +921,6 @@ def request( self, cast_to: Type[ResponseT], options: FinalRequestOptions, - remaining_retries: Optional[int] = None, *, stream: bool = False, stream_cls: Type[_StreamT] | None = None, @@ -904,121 +930,109 @@ def request( self, cast_to: Type[ResponseT], options: FinalRequestOptions, - remaining_retries: Optional[int] = None, *, stream: bool = False, stream_cls: type[_StreamT] | None = None, ) -> ResponseT | _StreamT: - if remaining_retries is not None: - retries_taken = options.get_max_retries(self.max_retries) - remaining_retries - else: - retries_taken = 0 - - return self._request( - cast_to=cast_to, - options=options, - stream=stream, - stream_cls=stream_cls, - retries_taken=retries_taken, - ) + cast_to = self._maybe_override_cast_to(cast_to, options) - def _request( - self, - *, - cast_to: Type[ResponseT], - options: FinalRequestOptions, - retries_taken: int, - stream: bool, - stream_cls: type[_StreamT] | None, - ) -> ResponseT | _StreamT: # create a copy of the options we were given so that if the # options are mutated later & we then retry, the retries are # given the original options input_options = model_copy(options) + if input_options.idempotency_key is None and input_options.method.lower() != "get": + # ensure the idempotency key is reused between requests + input_options.idempotency_key = self._idempotency_key() - cast_to = self._maybe_override_cast_to(cast_to, options) - options = self._prepare_options(options) - - remaining_retries = options.get_max_retries(self.max_retries) - retries_taken - request = self._build_request(options, retries_taken=retries_taken) - self._prepare_request(request) - - kwargs: HttpxSendArgs = {} - if self.custom_auth is not None: - kwargs["auth"] = self.custom_auth + response: httpx.Response | None = None + max_retries = input_options.get_max_retries(self.max_retries) - log.debug("Sending HTTP Request: %s %s", request.method, request.url) + retries_taken = 0 + for retries_taken in range(max_retries + 1): + options = model_copy(input_options) + options = self._prepare_options(options) - try: - response = self._client.send( - request, - stream=stream or self._should_stream_response_body(request=request), - **kwargs, - ) - except httpx.TimeoutException as err: - log.debug("Encountered httpx.TimeoutException", exc_info=True) + remaining_retries = max_retries - retries_taken + request = self._build_request(options, retries_taken=retries_taken) + self._prepare_request(request) - if remaining_retries > 0: - return self._retry_request( - input_options, - cast_to, - retries_taken=retries_taken, - stream=stream, - stream_cls=stream_cls, - response_headers=None, - ) + kwargs: HttpxSendArgs = {} + if self.custom_auth is not None: + kwargs["auth"] = self.custom_auth - log.debug("Raising timeout error") - raise APITimeoutError(request=request) from err - except Exception as err: - log.debug("Encountered Exception", exc_info=True) + log.debug("Sending HTTP Request: %s %s", request.method, request.url) - if remaining_retries > 0: - return self._retry_request( - input_options, - cast_to, - retries_taken=retries_taken, - stream=stream, - stream_cls=stream_cls, - response_headers=None, + response = None + try: + response = self._client.send( + request, + stream=stream or self._should_stream_response_body(request=request), + **kwargs, ) + except httpx.TimeoutException as err: + log.debug("Encountered httpx.TimeoutException", exc_info=True) + + if remaining_retries > 0: + self._sleep_for_retry( + retries_taken=retries_taken, + 
max_retries=max_retries, + options=input_options, + response=None, + ) + continue + + log.debug("Raising timeout error") + raise APITimeoutError(request=request) from err + except Exception as err: + log.debug("Encountered Exception", exc_info=True) + + if remaining_retries > 0: + self._sleep_for_retry( + retries_taken=retries_taken, + max_retries=max_retries, + options=input_options, + response=None, + ) + continue + + log.debug("Raising connection error") + raise APIConnectionError(request=request) from err + + log.debug( + 'HTTP Response: %s %s "%i %s" %s', + request.method, + request.url, + response.status_code, + response.reason_phrase, + response.headers, + ) - log.debug("Raising connection error") - raise APIConnectionError(request=request) from err - - log.debug( - 'HTTP Response: %s %s "%i %s" %s', - request.method, - request.url, - response.status_code, - response.reason_phrase, - response.headers, - ) + try: + response.raise_for_status() + except httpx.HTTPStatusError as err: # thrown on 4xx and 5xx status code + log.debug("Encountered httpx.HTTPStatusError", exc_info=True) + + if remaining_retries > 0 and self._should_retry(err.response): + err.response.close() + self._sleep_for_retry( + retries_taken=retries_taken, + max_retries=max_retries, + options=input_options, + response=response, + ) + continue - try: - response.raise_for_status() - except httpx.HTTPStatusError as err: # thrown on 4xx and 5xx status code - log.debug("Encountered httpx.HTTPStatusError", exc_info=True) - - if remaining_retries > 0 and self._should_retry(err.response): - err.response.close() - return self._retry_request( - input_options, - cast_to, - retries_taken=retries_taken, - response_headers=err.response.headers, - stream=stream, - stream_cls=stream_cls, - ) + # If the response is streamed then we need to explicitly read the response + # to completion before attempting to access the response text. + if not err.response.is_closed: + err.response.read() - # If the response is streamed then we need to explicitly read the response - # to completion before attempting to access the response text. 
- if not err.response.is_closed: - err.response.read() + log.debug("Re-raising status error") + raise self._make_status_error_from_response(err.response) from None - log.debug("Re-raising status error") - raise self._make_status_error_from_response(err.response) from None + break + assert response is not None, "could not resolve response (should never happen)" return self._process_response( cast_to=cast_to, options=options, @@ -1028,37 +1042,20 @@ def _request( retries_taken=retries_taken, ) - def _retry_request( - self, - options: FinalRequestOptions, - cast_to: Type[ResponseT], - *, - retries_taken: int, - response_headers: httpx.Headers | None, - stream: bool, - stream_cls: type[_StreamT] | None, - ) -> ResponseT | _StreamT: - remaining_retries = options.get_max_retries(self.max_retries) - retries_taken + def _sleep_for_retry( + self, *, retries_taken: int, max_retries: int, options: FinalRequestOptions, response: httpx.Response | None + ) -> None: + remaining_retries = max_retries - retries_taken if remaining_retries == 1: log.debug("1 retry left") else: log.debug("%i retries left", remaining_retries) - timeout = self._calculate_retry_timeout(remaining_retries, options, response_headers) + timeout = self._calculate_retry_timeout(remaining_retries, options, response.headers if response else None) log.info("Retrying request to %s in %f seconds", options.url, timeout) - # In a synchronous context we are blocking the entire thread. Up to the library user to run the client in a - # different thread if necessary. time.sleep(timeout) - return self._request( - options=options, - cast_to=cast_to, - retries_taken=retries_taken + 1, - stream=stream, - stream_cls=stream_cls, - ) - def _process_response( self, *, @@ -1402,7 +1399,6 @@ async def request( options: FinalRequestOptions, *, stream: Literal[False] = False, - remaining_retries: Optional[int] = None, ) -> ResponseT: ... @overload @@ -1413,7 +1409,6 @@ async def request( *, stream: Literal[True], stream_cls: type[_AsyncStreamT], - remaining_retries: Optional[int] = None, ) -> _AsyncStreamT: ... @overload @@ -1424,7 +1419,6 @@ async def request( *, stream: bool, stream_cls: type[_AsyncStreamT] | None = None, - remaining_retries: Optional[int] = None, ) -> ResponseT | _AsyncStreamT: ... 
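# Aside (not part of the diff): a rough sketch, under simplified assumptions, of
# the shape the retry logic takes after this change -- a bounded loop around a
# single send path (instead of recursive _retry_request calls), sleeping between
# attempts. The real code also reuses one idempotency key across attempts and
# derives the backoff from response headers via _calculate_retry_timeout.
import time
from typing import Callable, Optional, TypeVar

T = TypeVar("T")

def send_with_retries(send: Callable[[], T], max_retries: int, backoff: float = 0.5) -> T:
    last_exc: Optional[Exception] = None
    for retries_taken in range(max_retries + 1):
        remaining_retries = max_retries - retries_taken
        try:
            return send()
        except Exception as exc:  # the real code treats timeouts and status errors separately
            last_exc = exc
            if remaining_retries > 0:
                time.sleep(backoff * (retries_taken + 1))  # placeholder backoff, not the SDK's
                continue
            raise
    raise AssertionError("unreachable") from last_exc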
async def request( @@ -1434,116 +1428,111 @@ async def request( *, stream: bool = False, stream_cls: type[_AsyncStreamT] | None = None, - remaining_retries: Optional[int] = None, - ) -> ResponseT | _AsyncStreamT: - if remaining_retries is not None: - retries_taken = options.get_max_retries(self.max_retries) - remaining_retries - else: - retries_taken = 0 - - return await self._request( - cast_to=cast_to, - options=options, - stream=stream, - stream_cls=stream_cls, - retries_taken=retries_taken, - ) - - async def _request( - self, - cast_to: Type[ResponseT], - options: FinalRequestOptions, - *, - stream: bool, - stream_cls: type[_AsyncStreamT] | None, - retries_taken: int, ) -> ResponseT | _AsyncStreamT: if self._platform is None: # `get_platform` can make blocking IO calls so we # execute it earlier while we are in an async context self._platform = await asyncify(get_platform)() + cast_to = self._maybe_override_cast_to(cast_to, options) + # create a copy of the options we were given so that if the # options are mutated later & we then retry, the retries are # given the original options input_options = model_copy(options) + if input_options.idempotency_key is None and input_options.method.lower() != "get": + # ensure the idempotency key is reused between requests + input_options.idempotency_key = self._idempotency_key() - cast_to = self._maybe_override_cast_to(cast_to, options) - options = await self._prepare_options(options) + response: httpx.Response | None = None + max_retries = input_options.get_max_retries(self.max_retries) - remaining_retries = options.get_max_retries(self.max_retries) - retries_taken - request = self._build_request(options, retries_taken=retries_taken) - await self._prepare_request(request) + retries_taken = 0 + for retries_taken in range(max_retries + 1): + options = model_copy(input_options) + options = await self._prepare_options(options) - kwargs: HttpxSendArgs = {} - if self.custom_auth is not None: - kwargs["auth"] = self.custom_auth + remaining_retries = max_retries - retries_taken + request = self._build_request(options, retries_taken=retries_taken) + await self._prepare_request(request) - try: - response = await self._client.send( - request, - stream=stream or self._should_stream_response_body(request=request), - **kwargs, - ) - except httpx.TimeoutException as err: - log.debug("Encountered httpx.TimeoutException", exc_info=True) - - if remaining_retries > 0: - return await self._retry_request( - input_options, - cast_to, - retries_taken=retries_taken, - stream=stream, - stream_cls=stream_cls, - response_headers=None, - ) + kwargs: HttpxSendArgs = {} + if self.custom_auth is not None: + kwargs["auth"] = self.custom_auth - log.debug("Raising timeout error") - raise APITimeoutError(request=request) from err - except Exception as err: - log.debug("Encountered Exception", exc_info=True) + log.debug("Sending HTTP Request: %s %s", request.method, request.url) - if remaining_retries > 0: - return await self._retry_request( - input_options, - cast_to, - retries_taken=retries_taken, - stream=stream, - stream_cls=stream_cls, - response_headers=None, + response = None + try: + response = await self._client.send( + request, + stream=stream or self._should_stream_response_body(request=request), + **kwargs, ) + except httpx.TimeoutException as err: + log.debug("Encountered httpx.TimeoutException", exc_info=True) + + if remaining_retries > 0: + await self._sleep_for_retry( + retries_taken=retries_taken, + max_retries=max_retries, + options=input_options, + response=None, + 
) + continue + + log.debug("Raising timeout error") + raise APITimeoutError(request=request) from err + except Exception as err: + log.debug("Encountered Exception", exc_info=True) + + if remaining_retries > 0: + await self._sleep_for_retry( + retries_taken=retries_taken, + max_retries=max_retries, + options=input_options, + response=None, + ) + continue + + log.debug("Raising connection error") + raise APIConnectionError(request=request) from err + + log.debug( + 'HTTP Response: %s %s "%i %s" %s', + request.method, + request.url, + response.status_code, + response.reason_phrase, + response.headers, + ) - log.debug("Raising connection error") - raise APIConnectionError(request=request) from err + try: + response.raise_for_status() + except httpx.HTTPStatusError as err: # thrown on 4xx and 5xx status code + log.debug("Encountered httpx.HTTPStatusError", exc_info=True) + + if remaining_retries > 0 and self._should_retry(err.response): + await err.response.aclose() + await self._sleep_for_retry( + retries_taken=retries_taken, + max_retries=max_retries, + options=input_options, + response=response, + ) + continue - log.debug( - 'HTTP Request: %s %s "%i %s"', request.method, request.url, response.status_code, response.reason_phrase - ) + # If the response is streamed then we need to explicitly read the response + # to completion before attempting to access the response text. + if not err.response.is_closed: + await err.response.aread() - try: - response.raise_for_status() - except httpx.HTTPStatusError as err: # thrown on 4xx and 5xx status code - log.debug("Encountered httpx.HTTPStatusError", exc_info=True) - - if remaining_retries > 0 and self._should_retry(err.response): - await err.response.aclose() - return await self._retry_request( - input_options, - cast_to, - retries_taken=retries_taken, - response_headers=err.response.headers, - stream=stream, - stream_cls=stream_cls, - ) + log.debug("Re-raising status error") + raise self._make_status_error_from_response(err.response) from None - # If the response is streamed then we need to explicitly read the response - # to completion before attempting to access the response text. 
- if not err.response.is_closed: - await err.response.aread() - - log.debug("Re-raising status error") - raise self._make_status_error_from_response(err.response) from None + break + assert response is not None, "could not resolve response (should never happen)" return await self._process_response( cast_to=cast_to, options=options, @@ -1553,35 +1542,20 @@ async def _request( retries_taken=retries_taken, ) - async def _retry_request( - self, - options: FinalRequestOptions, - cast_to: Type[ResponseT], - *, - retries_taken: int, - response_headers: httpx.Headers | None, - stream: bool, - stream_cls: type[_AsyncStreamT] | None, - ) -> ResponseT | _AsyncStreamT: - remaining_retries = options.get_max_retries(self.max_retries) - retries_taken + async def _sleep_for_retry( + self, *, retries_taken: int, max_retries: int, options: FinalRequestOptions, response: httpx.Response | None + ) -> None: + remaining_retries = max_retries - retries_taken if remaining_retries == 1: log.debug("1 retry left") else: log.debug("%i retries left", remaining_retries) - timeout = self._calculate_retry_timeout(remaining_retries, options, response_headers) + timeout = self._calculate_retry_timeout(remaining_retries, options, response.headers if response else None) log.info("Retrying request to %s in %f seconds", options.url, timeout) await anyio.sleep(timeout) - return await self._request( - options=options, - cast_to=cast_to, - retries_taken=retries_taken + 1, - stream=stream, - stream_cls=stream_cls, - ) - async def _process_response( self, *, diff --git a/src/llama_stack_client/_client.py b/src/llama_stack_client/_client.py index 7066ae2a..f9f22967 100644 --- a/src/llama_stack_client/_client.py +++ b/src/llama_stack_client/_client.py @@ -20,10 +20,7 @@ ProxiesTypes, RequestOptions, ) -from ._utils import ( - is_given, - get_async_library, -) +from ._utils import is_given, get_async_library from ._version import __version__ from .resources import ( tools, @@ -41,6 +38,7 @@ benchmarks, toolgroups, vector_dbs, + completions, scoring_functions, synthetic_data_generation, ) @@ -51,6 +49,7 @@ SyncAPIClient, AsyncAPIClient, ) +from .resources.chat import chat from .resources.eval import eval from .resources.agents import agents from .resources.tool_runtime import tool_runtime @@ -77,6 +76,8 @@ class LlamaStackClient(SyncAPIClient): eval: eval.EvalResource inspect: inspect.InspectResource inference: inference.InferenceResource + chat: chat.ChatResource + completions: completions.CompletionsResource vector_io: vector_io.VectorIoResource vector_dbs: vector_dbs.VectorDBsResource models: models.ModelsResource @@ -157,6 +158,8 @@ def __init__( self.eval = eval.EvalResource(self) self.inspect = inspect.InspectResource(self) self.inference = inference.InferenceResource(self) + self.chat = chat.ChatResource(self) + self.completions = completions.CompletionsResource(self) self.vector_io = vector_io.VectorIoResource(self) self.vector_dbs = vector_dbs.VectorDBsResource(self) self.models = models.ModelsResource(self) @@ -289,6 +292,8 @@ class AsyncLlamaStackClient(AsyncAPIClient): eval: eval.AsyncEvalResource inspect: inspect.AsyncInspectResource inference: inference.AsyncInferenceResource + chat: chat.AsyncChatResource + completions: completions.AsyncCompletionsResource vector_io: vector_io.AsyncVectorIoResource vector_dbs: vector_dbs.AsyncVectorDBsResource models: models.AsyncModelsResource @@ -369,6 +374,8 @@ def __init__( self.eval = eval.AsyncEvalResource(self) self.inspect = inspect.AsyncInspectResource(self) self.inference = 
inference.AsyncInferenceResource(self) + self.chat = chat.AsyncChatResource(self) + self.completions = completions.AsyncCompletionsResource(self) self.vector_io = vector_io.AsyncVectorIoResource(self) self.vector_dbs = vector_dbs.AsyncVectorDBsResource(self) self.models = models.AsyncModelsResource(self) @@ -502,6 +509,8 @@ def __init__(self, client: LlamaStackClient) -> None: self.eval = eval.EvalResourceWithRawResponse(client.eval) self.inspect = inspect.InspectResourceWithRawResponse(client.inspect) self.inference = inference.InferenceResourceWithRawResponse(client.inference) + self.chat = chat.ChatResourceWithRawResponse(client.chat) + self.completions = completions.CompletionsResourceWithRawResponse(client.completions) self.vector_io = vector_io.VectorIoResourceWithRawResponse(client.vector_io) self.vector_dbs = vector_dbs.VectorDBsResourceWithRawResponse(client.vector_dbs) self.models = models.ModelsResourceWithRawResponse(client.models) @@ -529,6 +538,8 @@ def __init__(self, client: AsyncLlamaStackClient) -> None: self.eval = eval.AsyncEvalResourceWithRawResponse(client.eval) self.inspect = inspect.AsyncInspectResourceWithRawResponse(client.inspect) self.inference = inference.AsyncInferenceResourceWithRawResponse(client.inference) + self.chat = chat.AsyncChatResourceWithRawResponse(client.chat) + self.completions = completions.AsyncCompletionsResourceWithRawResponse(client.completions) self.vector_io = vector_io.AsyncVectorIoResourceWithRawResponse(client.vector_io) self.vector_dbs = vector_dbs.AsyncVectorDBsResourceWithRawResponse(client.vector_dbs) self.models = models.AsyncModelsResourceWithRawResponse(client.models) @@ -558,6 +569,8 @@ def __init__(self, client: LlamaStackClient) -> None: self.eval = eval.EvalResourceWithStreamingResponse(client.eval) self.inspect = inspect.InspectResourceWithStreamingResponse(client.inspect) self.inference = inference.InferenceResourceWithStreamingResponse(client.inference) + self.chat = chat.ChatResourceWithStreamingResponse(client.chat) + self.completions = completions.CompletionsResourceWithStreamingResponse(client.completions) self.vector_io = vector_io.VectorIoResourceWithStreamingResponse(client.vector_io) self.vector_dbs = vector_dbs.VectorDBsResourceWithStreamingResponse(client.vector_dbs) self.models = models.ModelsResourceWithStreamingResponse(client.models) @@ -587,6 +600,8 @@ def __init__(self, client: AsyncLlamaStackClient) -> None: self.eval = eval.AsyncEvalResourceWithStreamingResponse(client.eval) self.inspect = inspect.AsyncInspectResourceWithStreamingResponse(client.inspect) self.inference = inference.AsyncInferenceResourceWithStreamingResponse(client.inference) + self.chat = chat.AsyncChatResourceWithStreamingResponse(client.chat) + self.completions = completions.AsyncCompletionsResourceWithStreamingResponse(client.completions) self.vector_io = vector_io.AsyncVectorIoResourceWithStreamingResponse(client.vector_io) self.vector_dbs = vector_dbs.AsyncVectorDBsResourceWithStreamingResponse(client.vector_dbs) self.models = models.AsyncModelsResourceWithStreamingResponse(client.models) diff --git a/src/llama_stack_client/_models.py b/src/llama_stack_client/_models.py index 34935716..798956f1 100644 --- a/src/llama_stack_client/_models.py +++ b/src/llama_stack_client/_models.py @@ -19,7 +19,6 @@ ) import pydantic -import pydantic.generics from pydantic.fields import FieldInfo from ._types import ( @@ -627,8 +626,8 @@ def _build_discriminated_union_meta(*, union: type, meta_annotations: tuple[Any, # Note: if one variant defines an 
alias then they all should discriminator_alias = field_info.alias - if field_info.annotation and is_literal_type(field_info.annotation): - for entry in get_args(field_info.annotation): + if (annotation := getattr(field_info, "annotation", None)) and is_literal_type(annotation): + for entry in get_args(annotation): if isinstance(entry, str): mapping[entry] = variant diff --git a/src/llama_stack_client/_response.py b/src/llama_stack_client/_response.py index 1938ae74..8486ab8e 100644 --- a/src/llama_stack_client/_response.py +++ b/src/llama_stack_client/_response.py @@ -235,7 +235,7 @@ def _parse(self, *, to: type[_T] | None = None) -> R | _T: # split is required to handle cases where additional information is included # in the response, e.g. application/json; charset=utf-8 content_type, *_ = response.headers.get("content-type", "*").split(";") - if content_type != "application/json": + if not content_type.endswith("json"): if is_basemodel(cast_to): try: data = response.json() diff --git a/src/llama_stack_client/_utils/_typing.py b/src/llama_stack_client/_utils/_typing.py index 1958820f..1bac9542 100644 --- a/src/llama_stack_client/_utils/_typing.py +++ b/src/llama_stack_client/_utils/_typing.py @@ -110,7 +110,7 @@ class MyResponse(Foo[_T]): ``` """ cls = cast(object, get_origin(typ) or typ) - if cls in generic_bases: + if cls in generic_bases: # pyright: ignore[reportUnnecessaryContains] # we're given the class directly return extract_type_arg(typ, index) diff --git a/src/llama_stack_client/_utils/_utils.py b/src/llama_stack_client/_utils/_utils.py index e5811bba..ea3cf3f2 100644 --- a/src/llama_stack_client/_utils/_utils.py +++ b/src/llama_stack_client/_utils/_utils.py @@ -72,8 +72,16 @@ def _extract_items( from .._files import assert_is_file_content # We have exhausted the path, return the entry we found. - assert_is_file_content(obj, key=flattened_key) assert flattened_key is not None + + if is_list(obj): + files: list[tuple[str, FileTypes]] = [] + for entry in obj: + assert_is_file_content(entry, key=flattened_key + "[]" if flattened_key else "") + files.append((flattened_key + "[]", cast(FileTypes, entry))) + return files + + assert_is_file_content(obj, key=flattened_key) return [(flattened_key, cast(FileTypes, obj))] index += 1 diff --git a/src/llama_stack_client/resources/__init__.py b/src/llama_stack_client/resources/__init__.py index 0e3373dc..ff5b3260 100644 --- a/src/llama_stack_client/resources/__init__.py +++ b/src/llama_stack_client/resources/__init__.py @@ -1,5 +1,13 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
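# Aside (not part of the diff): the relaxed content-type check in _response.py
# above means any media type ending in "json" (e.g. "application/vnd.api+json")
# is now parsed as JSON, not only the exact string "application/json". Same
# split-then-check logic as the changed line:
for value in ("application/json", "application/vnd.api+json; charset=utf-8", "text/plain"):
    content_type, *_ = value.split(";")
    print(value, "->", "json" if content_type.endswith("json") else "not json")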
+from .chat import ( + ChatResource, + AsyncChatResource, + ChatResourceWithRawResponse, + AsyncChatResourceWithRawResponse, + ChatResourceWithStreamingResponse, + AsyncChatResourceWithStreamingResponse, +) from .eval import ( EvalResource, AsyncEvalResource, @@ -136,6 +144,14 @@ VectorDBsResourceWithStreamingResponse, AsyncVectorDBsResourceWithStreamingResponse, ) +from .completions import ( + CompletionsResource, + AsyncCompletionsResource, + CompletionsResourceWithRawResponse, + AsyncCompletionsResourceWithRawResponse, + CompletionsResourceWithStreamingResponse, + AsyncCompletionsResourceWithStreamingResponse, +) from .tool_runtime import ( ToolRuntimeResource, AsyncToolRuntimeResource, @@ -218,6 +234,18 @@ "AsyncInferenceResourceWithRawResponse", "InferenceResourceWithStreamingResponse", "AsyncInferenceResourceWithStreamingResponse", + "ChatResource", + "AsyncChatResource", + "ChatResourceWithRawResponse", + "AsyncChatResourceWithRawResponse", + "ChatResourceWithStreamingResponse", + "AsyncChatResourceWithStreamingResponse", + "CompletionsResource", + "AsyncCompletionsResource", + "CompletionsResourceWithRawResponse", + "AsyncCompletionsResourceWithRawResponse", + "CompletionsResourceWithStreamingResponse", + "AsyncCompletionsResourceWithStreamingResponse", "VectorIoResource", "AsyncVectorIoResource", "VectorIoResourceWithRawResponse", diff --git a/src/llama_stack_client/resources/agents/agents.py b/src/llama_stack_client/resources/agents/agents.py index 314edd7e..ed03dde5 100644 --- a/src/llama_stack_client/resources/agents/agents.py +++ b/src/llama_stack_client/resources/agents/agents.py @@ -30,10 +30,7 @@ AsyncSessionResourceWithStreamingResponse, ) from ..._types import NOT_GIVEN, Body, Query, Headers, NoneType, NotGiven -from ..._utils import ( - maybe_transform, - async_maybe_transform, -) +from ..._utils import maybe_transform, async_maybe_transform from ..._compat import cached_property from ..._resource import SyncAPIResource, AsyncAPIResource from ..._response import ( diff --git a/src/llama_stack_client/resources/agents/session.py b/src/llama_stack_client/resources/agents/session.py index 592a94ba..0aec7449 100644 --- a/src/llama_stack_client/resources/agents/session.py +++ b/src/llama_stack_client/resources/agents/session.py @@ -7,10 +7,7 @@ import httpx from ..._types import NOT_GIVEN, Body, Query, Headers, NoneType, NotGiven -from ..._utils import ( - maybe_transform, - async_maybe_transform, -) +from ..._utils import maybe_transform, async_maybe_transform from ..._compat import cached_property from ..._resource import SyncAPIResource, AsyncAPIResource from ..._response import ( diff --git a/src/llama_stack_client/resources/agents/turn.py b/src/llama_stack_client/resources/agents/turn.py index 6b1b4ae2..8c48869e 100644 --- a/src/llama_stack_client/resources/agents/turn.py +++ b/src/llama_stack_client/resources/agents/turn.py @@ -8,11 +8,7 @@ import httpx from ..._types import NOT_GIVEN, Body, Query, Headers, NotGiven -from ..._utils import ( - required_args, - maybe_transform, - async_maybe_transform, -) +from ..._utils import required_args, maybe_transform, async_maybe_transform from ..._compat import cached_property from ..._resource import SyncAPIResource, AsyncAPIResource from ..._response import ( diff --git a/src/llama_stack_client/resources/benchmarks.py b/src/llama_stack_client/resources/benchmarks.py index f541a6ba..ff6af994 100644 --- a/src/llama_stack_client/resources/benchmarks.py +++ b/src/llama_stack_client/resources/benchmarks.py @@ -8,10 +8,7 @@ from 
..types import benchmark_register_params from .._types import NOT_GIVEN, Body, Query, Headers, NoneType, NotGiven -from .._utils import ( - maybe_transform, - async_maybe_transform, -) +from .._utils import maybe_transform, async_maybe_transform from .._compat import cached_property from .._resource import SyncAPIResource, AsyncAPIResource from .._response import ( diff --git a/src/llama_stack_client/resources/chat/__init__.py b/src/llama_stack_client/resources/chat/__init__.py new file mode 100644 index 00000000..ec960eb4 --- /dev/null +++ b/src/llama_stack_client/resources/chat/__init__.py @@ -0,0 +1,33 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from .chat import ( + ChatResource, + AsyncChatResource, + ChatResourceWithRawResponse, + AsyncChatResourceWithRawResponse, + ChatResourceWithStreamingResponse, + AsyncChatResourceWithStreamingResponse, +) +from .completions import ( + CompletionsResource, + AsyncCompletionsResource, + CompletionsResourceWithRawResponse, + AsyncCompletionsResourceWithRawResponse, + CompletionsResourceWithStreamingResponse, + AsyncCompletionsResourceWithStreamingResponse, +) + +__all__ = [ + "CompletionsResource", + "AsyncCompletionsResource", + "CompletionsResourceWithRawResponse", + "AsyncCompletionsResourceWithRawResponse", + "CompletionsResourceWithStreamingResponse", + "AsyncCompletionsResourceWithStreamingResponse", + "ChatResource", + "AsyncChatResource", + "ChatResourceWithRawResponse", + "AsyncChatResourceWithRawResponse", + "ChatResourceWithStreamingResponse", + "AsyncChatResourceWithStreamingResponse", +] diff --git a/src/llama_stack_client/resources/chat/chat.py b/src/llama_stack_client/resources/chat/chat.py new file mode 100644 index 00000000..681051f3 --- /dev/null +++ b/src/llama_stack_client/resources/chat/chat.py @@ -0,0 +1,102 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from ..._compat import cached_property +from ..._resource import SyncAPIResource, AsyncAPIResource +from .completions import ( + CompletionsResource, + AsyncCompletionsResource, + CompletionsResourceWithRawResponse, + AsyncCompletionsResourceWithRawResponse, + CompletionsResourceWithStreamingResponse, + AsyncCompletionsResourceWithStreamingResponse, +) + +__all__ = ["ChatResource", "AsyncChatResource"] + + +class ChatResource(SyncAPIResource): + @cached_property + def completions(self) -> CompletionsResource: + return CompletionsResource(self._client) + + @cached_property + def with_raw_response(self) -> ChatResourceWithRawResponse: + """ + This property can be used as a prefix for any HTTP method call to return + the raw response object instead of the parsed content. + + For more information, see https://www.github.com/stainless-sdks/llama-stack-python#accessing-raw-response-data-eg-headers + """ + return ChatResourceWithRawResponse(self) + + @cached_property + def with_streaming_response(self) -> ChatResourceWithStreamingResponse: + """ + An alternative to `.with_raw_response` that doesn't eagerly read the response body. 
+ + For more information, see https://www.github.com/stainless-sdks/llama-stack-python#with_streaming_response + """ + return ChatResourceWithStreamingResponse(self) + + +class AsyncChatResource(AsyncAPIResource): + @cached_property + def completions(self) -> AsyncCompletionsResource: + return AsyncCompletionsResource(self._client) + + @cached_property + def with_raw_response(self) -> AsyncChatResourceWithRawResponse: + """ + This property can be used as a prefix for any HTTP method call to return + the raw response object instead of the parsed content. + + For more information, see https://www.github.com/stainless-sdks/llama-stack-python#accessing-raw-response-data-eg-headers + """ + return AsyncChatResourceWithRawResponse(self) + + @cached_property + def with_streaming_response(self) -> AsyncChatResourceWithStreamingResponse: + """ + An alternative to `.with_raw_response` that doesn't eagerly read the response body. + + For more information, see https://www.github.com/stainless-sdks/llama-stack-python#with_streaming_response + """ + return AsyncChatResourceWithStreamingResponse(self) + + +class ChatResourceWithRawResponse: + def __init__(self, chat: ChatResource) -> None: + self._chat = chat + + @cached_property + def completions(self) -> CompletionsResourceWithRawResponse: + return CompletionsResourceWithRawResponse(self._chat.completions) + + +class AsyncChatResourceWithRawResponse: + def __init__(self, chat: AsyncChatResource) -> None: + self._chat = chat + + @cached_property + def completions(self) -> AsyncCompletionsResourceWithRawResponse: + return AsyncCompletionsResourceWithRawResponse(self._chat.completions) + + +class ChatResourceWithStreamingResponse: + def __init__(self, chat: ChatResource) -> None: + self._chat = chat + + @cached_property + def completions(self) -> CompletionsResourceWithStreamingResponse: + return CompletionsResourceWithStreamingResponse(self._chat.completions) + + +class AsyncChatResourceWithStreamingResponse: + def __init__(self, chat: AsyncChatResource) -> None: + self._chat = chat + + @cached_property + def completions(self) -> AsyncCompletionsResourceWithStreamingResponse: + return AsyncCompletionsResourceWithStreamingResponse(self._chat.completions) diff --git a/src/llama_stack_client/resources/chat/completions.py b/src/llama_stack_client/resources/chat/completions.py new file mode 100644 index 00000000..7c449d41 --- /dev/null +++ b/src/llama_stack_client/resources/chat/completions.py @@ -0,0 +1,848 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
+ +from __future__ import annotations + +from typing import Any, Dict, List, Union, Iterable, cast +from typing_extensions import Literal, overload + +import httpx + +from ..._types import NOT_GIVEN, Body, Query, Headers, NotGiven +from ..._utils import required_args, maybe_transform, async_maybe_transform +from ..._compat import cached_property +from ..._resource import SyncAPIResource, AsyncAPIResource +from ..._response import ( + to_raw_response_wrapper, + to_streamed_response_wrapper, + async_to_raw_response_wrapper, + async_to_streamed_response_wrapper, +) +from ..._streaming import Stream, AsyncStream +from ...types.chat import completion_create_params +from ..._base_client import make_request_options +from ...types.chat_completion_chunk import ChatCompletionChunk +from ...types.chat.completion_create_response import CompletionCreateResponse + +__all__ = ["CompletionsResource", "AsyncCompletionsResource"] + + +class CompletionsResource(SyncAPIResource): + @cached_property + def with_raw_response(self) -> CompletionsResourceWithRawResponse: + """ + This property can be used as a prefix for any HTTP method call to return + the raw response object instead of the parsed content. + + For more information, see https://www.github.com/stainless-sdks/llama-stack-python#accessing-raw-response-data-eg-headers + """ + return CompletionsResourceWithRawResponse(self) + + @cached_property + def with_streaming_response(self) -> CompletionsResourceWithStreamingResponse: + """ + An alternative to `.with_raw_response` that doesn't eagerly read the response body. + + For more information, see https://www.github.com/stainless-sdks/llama-stack-python#with_streaming_response + """ + return CompletionsResourceWithStreamingResponse(self) + + @overload + def create( + self, + *, + messages: Iterable[completion_create_params.Message], + model: str, + frequency_penalty: float | NotGiven = NOT_GIVEN, + function_call: Union[str, Dict[str, Union[bool, float, str, Iterable[object], object, None]]] + | NotGiven = NOT_GIVEN, + functions: Iterable[Dict[str, Union[bool, float, str, Iterable[object], object, None]]] | NotGiven = NOT_GIVEN, + logit_bias: Dict[str, float] | NotGiven = NOT_GIVEN, + logprobs: bool | NotGiven = NOT_GIVEN, + max_completion_tokens: int | NotGiven = NOT_GIVEN, + max_tokens: int | NotGiven = NOT_GIVEN, + n: int | NotGiven = NOT_GIVEN, + parallel_tool_calls: bool | NotGiven = NOT_GIVEN, + presence_penalty: float | NotGiven = NOT_GIVEN, + response_format: completion_create_params.ResponseFormat | NotGiven = NOT_GIVEN, + seed: int | NotGiven = NOT_GIVEN, + stop: Union[str, List[str]] | NotGiven = NOT_GIVEN, + stream: Literal[False] | NotGiven = NOT_GIVEN, + stream_options: Dict[str, Union[bool, float, str, Iterable[object], object, None]] | NotGiven = NOT_GIVEN, + temperature: float | NotGiven = NOT_GIVEN, + tool_choice: Union[str, Dict[str, Union[bool, float, str, Iterable[object], object, None]]] + | NotGiven = NOT_GIVEN, + tools: Iterable[Dict[str, Union[bool, float, str, Iterable[object], object, None]]] | NotGiven = NOT_GIVEN, + top_logprobs: int | NotGiven = NOT_GIVEN, + top_p: float | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. 
+ extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> CompletionCreateResponse: + """ + Generate an OpenAI-compatible chat completion for the given messages using the + specified model. + + Args: + messages: List of messages in the conversation + + model: The identifier of the model to use. The model must be registered with Llama + Stack and available via the /models endpoint. + + frequency_penalty: (Optional) The penalty for repeated tokens + + function_call: (Optional) The function call to use + + functions: (Optional) List of functions to use + + logit_bias: (Optional) The logit bias to use + + logprobs: (Optional) The log probabilities to use + + max_completion_tokens: (Optional) The maximum number of tokens to generate + + max_tokens: (Optional) The maximum number of tokens to generate + + n: (Optional) The number of completions to generate + + parallel_tool_calls: (Optional) Whether to parallelize tool calls + + presence_penalty: (Optional) The penalty for repeated tokens + + response_format: (Optional) The response format to use + + seed: (Optional) The seed to use + + stop: (Optional) The stop tokens to use + + stream: (Optional) Whether to stream the response + + stream_options: (Optional) The stream options to use + + temperature: (Optional) The temperature to use + + tool_choice: (Optional) The tool choice to use + + tools: (Optional) The tools to use + + top_logprobs: (Optional) The top log probabilities to use + + top_p: (Optional) The top p to use + + user: (Optional) The user to use + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @overload + def create( + self, + *, + messages: Iterable[completion_create_params.Message], + model: str, + stream: Literal[True], + frequency_penalty: float | NotGiven = NOT_GIVEN, + function_call: Union[str, Dict[str, Union[bool, float, str, Iterable[object], object, None]]] + | NotGiven = NOT_GIVEN, + functions: Iterable[Dict[str, Union[bool, float, str, Iterable[object], object, None]]] | NotGiven = NOT_GIVEN, + logit_bias: Dict[str, float] | NotGiven = NOT_GIVEN, + logprobs: bool | NotGiven = NOT_GIVEN, + max_completion_tokens: int | NotGiven = NOT_GIVEN, + max_tokens: int | NotGiven = NOT_GIVEN, + n: int | NotGiven = NOT_GIVEN, + parallel_tool_calls: bool | NotGiven = NOT_GIVEN, + presence_penalty: float | NotGiven = NOT_GIVEN, + response_format: completion_create_params.ResponseFormat | NotGiven = NOT_GIVEN, + seed: int | NotGiven = NOT_GIVEN, + stop: Union[str, List[str]] | NotGiven = NOT_GIVEN, + stream_options: Dict[str, Union[bool, float, str, Iterable[object], object, None]] | NotGiven = NOT_GIVEN, + temperature: float | NotGiven = NOT_GIVEN, + tool_choice: Union[str, Dict[str, Union[bool, float, str, Iterable[object], object, None]]] + | NotGiven = NOT_GIVEN, + tools: Iterable[Dict[str, Union[bool, float, str, Iterable[object], object, None]]] | NotGiven = NOT_GIVEN, + top_logprobs: int | NotGiven = NOT_GIVEN, + top_p: float | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. 
+ # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> Stream[ChatCompletionChunk]: + """ + Generate an OpenAI-compatible chat completion for the given messages using the + specified model. + + Args: + messages: List of messages in the conversation + + model: The identifier of the model to use. The model must be registered with Llama + Stack and available via the /models endpoint. + + stream: (Optional) Whether to stream the response + + frequency_penalty: (Optional) The penalty for repeated tokens + + function_call: (Optional) The function call to use + + functions: (Optional) List of functions to use + + logit_bias: (Optional) The logit bias to use + + logprobs: (Optional) The log probabilities to use + + max_completion_tokens: (Optional) The maximum number of tokens to generate + + max_tokens: (Optional) The maximum number of tokens to generate + + n: (Optional) The number of completions to generate + + parallel_tool_calls: (Optional) Whether to parallelize tool calls + + presence_penalty: (Optional) The penalty for repeated tokens + + response_format: (Optional) The response format to use + + seed: (Optional) The seed to use + + stop: (Optional) The stop tokens to use + + stream_options: (Optional) The stream options to use + + temperature: (Optional) The temperature to use + + tool_choice: (Optional) The tool choice to use + + tools: (Optional) The tools to use + + top_logprobs: (Optional) The top log probabilities to use + + top_p: (Optional) The top p to use + + user: (Optional) The user to use + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... 
+ + @overload + def create( + self, + *, + messages: Iterable[completion_create_params.Message], + model: str, + stream: bool, + frequency_penalty: float | NotGiven = NOT_GIVEN, + function_call: Union[str, Dict[str, Union[bool, float, str, Iterable[object], object, None]]] + | NotGiven = NOT_GIVEN, + functions: Iterable[Dict[str, Union[bool, float, str, Iterable[object], object, None]]] | NotGiven = NOT_GIVEN, + logit_bias: Dict[str, float] | NotGiven = NOT_GIVEN, + logprobs: bool | NotGiven = NOT_GIVEN, + max_completion_tokens: int | NotGiven = NOT_GIVEN, + max_tokens: int | NotGiven = NOT_GIVEN, + n: int | NotGiven = NOT_GIVEN, + parallel_tool_calls: bool | NotGiven = NOT_GIVEN, + presence_penalty: float | NotGiven = NOT_GIVEN, + response_format: completion_create_params.ResponseFormat | NotGiven = NOT_GIVEN, + seed: int | NotGiven = NOT_GIVEN, + stop: Union[str, List[str]] | NotGiven = NOT_GIVEN, + stream_options: Dict[str, Union[bool, float, str, Iterable[object], object, None]] | NotGiven = NOT_GIVEN, + temperature: float | NotGiven = NOT_GIVEN, + tool_choice: Union[str, Dict[str, Union[bool, float, str, Iterable[object], object, None]]] + | NotGiven = NOT_GIVEN, + tools: Iterable[Dict[str, Union[bool, float, str, Iterable[object], object, None]]] | NotGiven = NOT_GIVEN, + top_logprobs: int | NotGiven = NOT_GIVEN, + top_p: float | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> CompletionCreateResponse | Stream[ChatCompletionChunk]: + """ + Generate an OpenAI-compatible chat completion for the given messages using the + specified model. + + Args: + messages: List of messages in the conversation + + model: The identifier of the model to use. The model must be registered with Llama + Stack and available via the /models endpoint. 
+ + stream: (Optional) Whether to stream the response + + frequency_penalty: (Optional) The penalty for repeated tokens + + function_call: (Optional) The function call to use + + functions: (Optional) List of functions to use + + logit_bias: (Optional) The logit bias to use + + logprobs: (Optional) The log probabilities to use + + max_completion_tokens: (Optional) The maximum number of tokens to generate + + max_tokens: (Optional) The maximum number of tokens to generate + + n: (Optional) The number of completions to generate + + parallel_tool_calls: (Optional) Whether to parallelize tool calls + + presence_penalty: (Optional) The penalty for repeated tokens + + response_format: (Optional) The response format to use + + seed: (Optional) The seed to use + + stop: (Optional) The stop tokens to use + + stream_options: (Optional) The stream options to use + + temperature: (Optional) The temperature to use + + tool_choice: (Optional) The tool choice to use + + tools: (Optional) The tools to use + + top_logprobs: (Optional) The top log probabilities to use + + top_p: (Optional) The top p to use + + user: (Optional) The user to use + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @required_args(["messages", "model"], ["messages", "model", "stream"]) + def create( + self, + *, + messages: Iterable[completion_create_params.Message], + model: str, + frequency_penalty: float | NotGiven = NOT_GIVEN, + function_call: Union[str, Dict[str, Union[bool, float, str, Iterable[object], object, None]]] + | NotGiven = NOT_GIVEN, + functions: Iterable[Dict[str, Union[bool, float, str, Iterable[object], object, None]]] | NotGiven = NOT_GIVEN, + logit_bias: Dict[str, float] | NotGiven = NOT_GIVEN, + logprobs: bool | NotGiven = NOT_GIVEN, + max_completion_tokens: int | NotGiven = NOT_GIVEN, + max_tokens: int | NotGiven = NOT_GIVEN, + n: int | NotGiven = NOT_GIVEN, + parallel_tool_calls: bool | NotGiven = NOT_GIVEN, + presence_penalty: float | NotGiven = NOT_GIVEN, + response_format: completion_create_params.ResponseFormat | NotGiven = NOT_GIVEN, + seed: int | NotGiven = NOT_GIVEN, + stop: Union[str, List[str]] | NotGiven = NOT_GIVEN, + stream: Literal[False] | Literal[True] | NotGiven = NOT_GIVEN, + stream_options: Dict[str, Union[bool, float, str, Iterable[object], object, None]] | NotGiven = NOT_GIVEN, + temperature: float | NotGiven = NOT_GIVEN, + tool_choice: Union[str, Dict[str, Union[bool, float, str, Iterable[object], object, None]]] + | NotGiven = NOT_GIVEN, + tools: Iterable[Dict[str, Union[bool, float, str, Iterable[object], object, None]]] | NotGiven = NOT_GIVEN, + top_logprobs: int | NotGiven = NOT_GIVEN, + top_p: float | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. 
+ extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> CompletionCreateResponse | Stream[ChatCompletionChunk]: + return self._post( + "/v1/openai/v1/chat/completions", + body=maybe_transform( + { + "messages": messages, + "model": model, + "frequency_penalty": frequency_penalty, + "function_call": function_call, + "functions": functions, + "logit_bias": logit_bias, + "logprobs": logprobs, + "max_completion_tokens": max_completion_tokens, + "max_tokens": max_tokens, + "n": n, + "parallel_tool_calls": parallel_tool_calls, + "presence_penalty": presence_penalty, + "response_format": response_format, + "seed": seed, + "stop": stop, + "stream": stream, + "stream_options": stream_options, + "temperature": temperature, + "tool_choice": tool_choice, + "tools": tools, + "top_logprobs": top_logprobs, + "top_p": top_p, + "user": user, + }, + completion_create_params.CompletionCreateParamsStreaming + if stream + else completion_create_params.CompletionCreateParamsNonStreaming, + ), + options=make_request_options( + extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout + ), + cast_to=cast( + Any, CompletionCreateResponse + ), # Union types cannot be passed in as arguments in the type system + stream=stream or False, + stream_cls=Stream[ChatCompletionChunk], + ) + + +class AsyncCompletionsResource(AsyncAPIResource): + @cached_property + def with_raw_response(self) -> AsyncCompletionsResourceWithRawResponse: + """ + This property can be used as a prefix for any HTTP method call to return + the raw response object instead of the parsed content. + + For more information, see https://www.github.com/stainless-sdks/llama-stack-python#accessing-raw-response-data-eg-headers + """ + return AsyncCompletionsResourceWithRawResponse(self) + + @cached_property + def with_streaming_response(self) -> AsyncCompletionsResourceWithStreamingResponse: + """ + An alternative to `.with_raw_response` that doesn't eagerly read the response body. 
+ + For more information, see https://www.github.com/stainless-sdks/llama-stack-python#with_streaming_response + """ + return AsyncCompletionsResourceWithStreamingResponse(self) + + @overload + async def create( + self, + *, + messages: Iterable[completion_create_params.Message], + model: str, + frequency_penalty: float | NotGiven = NOT_GIVEN, + function_call: Union[str, Dict[str, Union[bool, float, str, Iterable[object], object, None]]] + | NotGiven = NOT_GIVEN, + functions: Iterable[Dict[str, Union[bool, float, str, Iterable[object], object, None]]] | NotGiven = NOT_GIVEN, + logit_bias: Dict[str, float] | NotGiven = NOT_GIVEN, + logprobs: bool | NotGiven = NOT_GIVEN, + max_completion_tokens: int | NotGiven = NOT_GIVEN, + max_tokens: int | NotGiven = NOT_GIVEN, + n: int | NotGiven = NOT_GIVEN, + parallel_tool_calls: bool | NotGiven = NOT_GIVEN, + presence_penalty: float | NotGiven = NOT_GIVEN, + response_format: completion_create_params.ResponseFormat | NotGiven = NOT_GIVEN, + seed: int | NotGiven = NOT_GIVEN, + stop: Union[str, List[str]] | NotGiven = NOT_GIVEN, + stream: Literal[False] | NotGiven = NOT_GIVEN, + stream_options: Dict[str, Union[bool, float, str, Iterable[object], object, None]] | NotGiven = NOT_GIVEN, + temperature: float | NotGiven = NOT_GIVEN, + tool_choice: Union[str, Dict[str, Union[bool, float, str, Iterable[object], object, None]]] + | NotGiven = NOT_GIVEN, + tools: Iterable[Dict[str, Union[bool, float, str, Iterable[object], object, None]]] | NotGiven = NOT_GIVEN, + top_logprobs: int | NotGiven = NOT_GIVEN, + top_p: float | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> CompletionCreateResponse: + """ + Generate an OpenAI-compatible chat completion for the given messages using the + specified model. + + Args: + messages: List of messages in the conversation + + model: The identifier of the model to use. The model must be registered with Llama + Stack and available via the /models endpoint. 
+ + frequency_penalty: (Optional) The penalty for repeated tokens + + function_call: (Optional) The function call to use + + functions: (Optional) List of functions to use + + logit_bias: (Optional) The logit bias to use + + logprobs: (Optional) The log probabilities to use + + max_completion_tokens: (Optional) The maximum number of tokens to generate + + max_tokens: (Optional) The maximum number of tokens to generate + + n: (Optional) The number of completions to generate + + parallel_tool_calls: (Optional) Whether to parallelize tool calls + + presence_penalty: (Optional) The penalty for repeated tokens + + response_format: (Optional) The response format to use + + seed: (Optional) The seed to use + + stop: (Optional) The stop tokens to use + + stream: (Optional) Whether to stream the response + + stream_options: (Optional) The stream options to use + + temperature: (Optional) The temperature to use + + tool_choice: (Optional) The tool choice to use + + tools: (Optional) The tools to use + + top_logprobs: (Optional) The top log probabilities to use + + top_p: (Optional) The top p to use + + user: (Optional) The user to use + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @overload + async def create( + self, + *, + messages: Iterable[completion_create_params.Message], + model: str, + stream: Literal[True], + frequency_penalty: float | NotGiven = NOT_GIVEN, + function_call: Union[str, Dict[str, Union[bool, float, str, Iterable[object], object, None]]] + | NotGiven = NOT_GIVEN, + functions: Iterable[Dict[str, Union[bool, float, str, Iterable[object], object, None]]] | NotGiven = NOT_GIVEN, + logit_bias: Dict[str, float] | NotGiven = NOT_GIVEN, + logprobs: bool | NotGiven = NOT_GIVEN, + max_completion_tokens: int | NotGiven = NOT_GIVEN, + max_tokens: int | NotGiven = NOT_GIVEN, + n: int | NotGiven = NOT_GIVEN, + parallel_tool_calls: bool | NotGiven = NOT_GIVEN, + presence_penalty: float | NotGiven = NOT_GIVEN, + response_format: completion_create_params.ResponseFormat | NotGiven = NOT_GIVEN, + seed: int | NotGiven = NOT_GIVEN, + stop: Union[str, List[str]] | NotGiven = NOT_GIVEN, + stream_options: Dict[str, Union[bool, float, str, Iterable[object], object, None]] | NotGiven = NOT_GIVEN, + temperature: float | NotGiven = NOT_GIVEN, + tool_choice: Union[str, Dict[str, Union[bool, float, str, Iterable[object], object, None]]] + | NotGiven = NOT_GIVEN, + tools: Iterable[Dict[str, Union[bool, float, str, Iterable[object], object, None]]] | NotGiven = NOT_GIVEN, + top_logprobs: int | NotGiven = NOT_GIVEN, + top_p: float | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> AsyncStream[ChatCompletionChunk]: + """ + Generate an OpenAI-compatible chat completion for the given messages using the + specified model. + + Args: + messages: List of messages in the conversation + + model: The identifier of the model to use. 
The model must be registered with Llama + Stack and available via the /models endpoint. + + stream: (Optional) Whether to stream the response + + frequency_penalty: (Optional) The penalty for repeated tokens + + function_call: (Optional) The function call to use + + functions: (Optional) List of functions to use + + logit_bias: (Optional) The logit bias to use + + logprobs: (Optional) The log probabilities to use + + max_completion_tokens: (Optional) The maximum number of tokens to generate + + max_tokens: (Optional) The maximum number of tokens to generate + + n: (Optional) The number of completions to generate + + parallel_tool_calls: (Optional) Whether to parallelize tool calls + + presence_penalty: (Optional) The penalty for repeated tokens + + response_format: (Optional) The response format to use + + seed: (Optional) The seed to use + + stop: (Optional) The stop tokens to use + + stream_options: (Optional) The stream options to use + + temperature: (Optional) The temperature to use + + tool_choice: (Optional) The tool choice to use + + tools: (Optional) The tools to use + + top_logprobs: (Optional) The top log probabilities to use + + top_p: (Optional) The top p to use + + user: (Optional) The user to use + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @overload + async def create( + self, + *, + messages: Iterable[completion_create_params.Message], + model: str, + stream: bool, + frequency_penalty: float | NotGiven = NOT_GIVEN, + function_call: Union[str, Dict[str, Union[bool, float, str, Iterable[object], object, None]]] + | NotGiven = NOT_GIVEN, + functions: Iterable[Dict[str, Union[bool, float, str, Iterable[object], object, None]]] | NotGiven = NOT_GIVEN, + logit_bias: Dict[str, float] | NotGiven = NOT_GIVEN, + logprobs: bool | NotGiven = NOT_GIVEN, + max_completion_tokens: int | NotGiven = NOT_GIVEN, + max_tokens: int | NotGiven = NOT_GIVEN, + n: int | NotGiven = NOT_GIVEN, + parallel_tool_calls: bool | NotGiven = NOT_GIVEN, + presence_penalty: float | NotGiven = NOT_GIVEN, + response_format: completion_create_params.ResponseFormat | NotGiven = NOT_GIVEN, + seed: int | NotGiven = NOT_GIVEN, + stop: Union[str, List[str]] | NotGiven = NOT_GIVEN, + stream_options: Dict[str, Union[bool, float, str, Iterable[object], object, None]] | NotGiven = NOT_GIVEN, + temperature: float | NotGiven = NOT_GIVEN, + tool_choice: Union[str, Dict[str, Union[bool, float, str, Iterable[object], object, None]]] + | NotGiven = NOT_GIVEN, + tools: Iterable[Dict[str, Union[bool, float, str, Iterable[object], object, None]]] | NotGiven = NOT_GIVEN, + top_logprobs: int | NotGiven = NOT_GIVEN, + top_p: float | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> CompletionCreateResponse | AsyncStream[ChatCompletionChunk]: + """ + Generate an OpenAI-compatible chat completion for the given messages using the + specified model. 
+ + Args: + messages: List of messages in the conversation + + model: The identifier of the model to use. The model must be registered with Llama + Stack and available via the /models endpoint. + + stream: (Optional) Whether to stream the response + + frequency_penalty: (Optional) The penalty for repeated tokens + + function_call: (Optional) The function call to use + + functions: (Optional) List of functions to use + + logit_bias: (Optional) The logit bias to use + + logprobs: (Optional) The log probabilities to use + + max_completion_tokens: (Optional) The maximum number of tokens to generate + + max_tokens: (Optional) The maximum number of tokens to generate + + n: (Optional) The number of completions to generate + + parallel_tool_calls: (Optional) Whether to parallelize tool calls + + presence_penalty: (Optional) The penalty for repeated tokens + + response_format: (Optional) The response format to use + + seed: (Optional) The seed to use + + stop: (Optional) The stop tokens to use + + stream_options: (Optional) The stream options to use + + temperature: (Optional) The temperature to use + + tool_choice: (Optional) The tool choice to use + + tools: (Optional) The tools to use + + top_logprobs: (Optional) The top log probabilities to use + + top_p: (Optional) The top p to use + + user: (Optional) The user to use + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @required_args(["messages", "model"], ["messages", "model", "stream"]) + async def create( + self, + *, + messages: Iterable[completion_create_params.Message], + model: str, + frequency_penalty: float | NotGiven = NOT_GIVEN, + function_call: Union[str, Dict[str, Union[bool, float, str, Iterable[object], object, None]]] + | NotGiven = NOT_GIVEN, + functions: Iterable[Dict[str, Union[bool, float, str, Iterable[object], object, None]]] | NotGiven = NOT_GIVEN, + logit_bias: Dict[str, float] | NotGiven = NOT_GIVEN, + logprobs: bool | NotGiven = NOT_GIVEN, + max_completion_tokens: int | NotGiven = NOT_GIVEN, + max_tokens: int | NotGiven = NOT_GIVEN, + n: int | NotGiven = NOT_GIVEN, + parallel_tool_calls: bool | NotGiven = NOT_GIVEN, + presence_penalty: float | NotGiven = NOT_GIVEN, + response_format: completion_create_params.ResponseFormat | NotGiven = NOT_GIVEN, + seed: int | NotGiven = NOT_GIVEN, + stop: Union[str, List[str]] | NotGiven = NOT_GIVEN, + stream: Literal[False] | Literal[True] | NotGiven = NOT_GIVEN, + stream_options: Dict[str, Union[bool, float, str, Iterable[object], object, None]] | NotGiven = NOT_GIVEN, + temperature: float | NotGiven = NOT_GIVEN, + tool_choice: Union[str, Dict[str, Union[bool, float, str, Iterable[object], object, None]]] + | NotGiven = NOT_GIVEN, + tools: Iterable[Dict[str, Union[bool, float, str, Iterable[object], object, None]]] | NotGiven = NOT_GIVEN, + top_logprobs: int | NotGiven = NOT_GIVEN, + top_p: float | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. 
+ extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> CompletionCreateResponse | AsyncStream[ChatCompletionChunk]: + return await self._post( + "/v1/openai/v1/chat/completions", + body=await async_maybe_transform( + { + "messages": messages, + "model": model, + "frequency_penalty": frequency_penalty, + "function_call": function_call, + "functions": functions, + "logit_bias": logit_bias, + "logprobs": logprobs, + "max_completion_tokens": max_completion_tokens, + "max_tokens": max_tokens, + "n": n, + "parallel_tool_calls": parallel_tool_calls, + "presence_penalty": presence_penalty, + "response_format": response_format, + "seed": seed, + "stop": stop, + "stream": stream, + "stream_options": stream_options, + "temperature": temperature, + "tool_choice": tool_choice, + "tools": tools, + "top_logprobs": top_logprobs, + "top_p": top_p, + "user": user, + }, + completion_create_params.CompletionCreateParamsStreaming + if stream + else completion_create_params.CompletionCreateParamsNonStreaming, + ), + options=make_request_options( + extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout + ), + cast_to=cast( + Any, CompletionCreateResponse + ), # Union types cannot be passed in as arguments in the type system + stream=stream or False, + stream_cls=AsyncStream[ChatCompletionChunk], + ) + + +class CompletionsResourceWithRawResponse: + def __init__(self, completions: CompletionsResource) -> None: + self._completions = completions + + self.create = to_raw_response_wrapper( + completions.create, + ) + + +class AsyncCompletionsResourceWithRawResponse: + def __init__(self, completions: AsyncCompletionsResource) -> None: + self._completions = completions + + self.create = async_to_raw_response_wrapper( + completions.create, + ) + + +class CompletionsResourceWithStreamingResponse: + def __init__(self, completions: CompletionsResource) -> None: + self._completions = completions + + self.create = to_streamed_response_wrapper( + completions.create, + ) + + +class AsyncCompletionsResourceWithStreamingResponse: + def __init__(self, completions: AsyncCompletionsResource) -> None: + self._completions = completions + + self.create = async_to_streamed_response_wrapper( + completions.create, + ) diff --git a/src/llama_stack_client/resources/completions.py b/src/llama_stack_client/resources/completions.py new file mode 100644 index 00000000..8f57aeb4 --- /dev/null +++ b/src/llama_stack_client/resources/completions.py @@ -0,0 +1,715 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
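For orientation, a minimal usage sketch for the async chat-completions resource defined above. It assumes the AsyncLlamaStackClient entry point, that the resource is exposed as client.chat.completions, and a placeholder endpoint and model id; the returned objects are printed as-is rather than drilling into specific response fields.

import asyncio

from llama_stack_client import AsyncLlamaStackClient  # assumed async client entry point


async def main() -> None:
    # Hypothetical local endpoint; any model registered via /models works in place of the placeholder id.
    client = AsyncLlamaStackClient(base_url="http://localhost:8321")

    # Non-streaming call: resolves to a CompletionCreateResponse.
    response = await client.chat.completions.create(
        model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model id
        messages=[{"role": "user", "content": "Say hello in one sentence."}],
        temperature=0.2,
    )
    print(response)

    # Streaming call: stream=True selects the overload returning AsyncStream[ChatCompletionChunk].
    stream = await client.chat.completions.create(
        model="meta-llama/Llama-3.1-8B-Instruct",
        messages=[{"role": "user", "content": "Count to three."}],
        stream=True,
    )
    async for chunk in stream:
        print(chunk)  # each item is a ChatCompletionChunk


asyncio.run(main())

The with_raw_response and with_streaming_response wrappers defined above expose the same create method for cases where response headers or lazy body reading are needed.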
+ +from __future__ import annotations + +from typing import Dict, List, Union, Iterable +from typing_extensions import Literal, overload + +import httpx + +from ..types import completion_create_params +from .._types import NOT_GIVEN, Body, Query, Headers, NotGiven +from .._utils import required_args, maybe_transform, async_maybe_transform +from .._compat import cached_property +from .._resource import SyncAPIResource, AsyncAPIResource +from .._response import ( + to_raw_response_wrapper, + to_streamed_response_wrapper, + async_to_raw_response_wrapper, + async_to_streamed_response_wrapper, +) +from .._streaming import Stream, AsyncStream +from .._base_client import make_request_options +from ..types.completion_create_response import CompletionCreateResponse + +__all__ = ["CompletionsResource", "AsyncCompletionsResource"] + + +class CompletionsResource(SyncAPIResource): + @cached_property + def with_raw_response(self) -> CompletionsResourceWithRawResponse: + """ + This property can be used as a prefix for any HTTP method call to return + the raw response object instead of the parsed content. + + For more information, see https://www.github.com/stainless-sdks/llama-stack-python#accessing-raw-response-data-eg-headers + """ + return CompletionsResourceWithRawResponse(self) + + @cached_property + def with_streaming_response(self) -> CompletionsResourceWithStreamingResponse: + """ + An alternative to `.with_raw_response` that doesn't eagerly read the response body. + + For more information, see https://www.github.com/stainless-sdks/llama-stack-python#with_streaming_response + """ + return CompletionsResourceWithStreamingResponse(self) + + @overload + def create( + self, + *, + model: str, + prompt: Union[str, List[str], Iterable[int], Iterable[Iterable[int]]], + best_of: int | NotGiven = NOT_GIVEN, + echo: bool | NotGiven = NOT_GIVEN, + frequency_penalty: float | NotGiven = NOT_GIVEN, + guided_choice: List[str] | NotGiven = NOT_GIVEN, + logit_bias: Dict[str, float] | NotGiven = NOT_GIVEN, + logprobs: bool | NotGiven = NOT_GIVEN, + max_tokens: int | NotGiven = NOT_GIVEN, + n: int | NotGiven = NOT_GIVEN, + presence_penalty: float | NotGiven = NOT_GIVEN, + prompt_logprobs: int | NotGiven = NOT_GIVEN, + seed: int | NotGiven = NOT_GIVEN, + stop: Union[str, List[str]] | NotGiven = NOT_GIVEN, + stream: Literal[False] | NotGiven = NOT_GIVEN, + stream_options: Dict[str, Union[bool, float, str, Iterable[object], object, None]] | NotGiven = NOT_GIVEN, + temperature: float | NotGiven = NOT_GIVEN, + top_p: float | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> CompletionCreateResponse: + """ + Generate an OpenAI-compatible completion for the given prompt using the + specified model. + + Args: + model: The identifier of the model to use. The model must be registered with Llama + Stack and available via the /models endpoint. 
+ + prompt: The prompt to generate a completion for + + best_of: (Optional) The number of completions to generate + + echo: (Optional) Whether to echo the prompt + + frequency_penalty: (Optional) The penalty for repeated tokens + + logit_bias: (Optional) The logit bias to use + + logprobs: (Optional) The log probabilities to use + + max_tokens: (Optional) The maximum number of tokens to generate + + n: (Optional) The number of completions to generate + + presence_penalty: (Optional) The penalty for repeated tokens + + seed: (Optional) The seed to use + + stop: (Optional) The stop tokens to use + + stream: (Optional) Whether to stream the response + + stream_options: (Optional) The stream options to use + + temperature: (Optional) The temperature to use + + top_p: (Optional) The top p to use + + user: (Optional) The user to use + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @overload + def create( + self, + *, + model: str, + prompt: Union[str, List[str], Iterable[int], Iterable[Iterable[int]]], + stream: Literal[True], + best_of: int | NotGiven = NOT_GIVEN, + echo: bool | NotGiven = NOT_GIVEN, + frequency_penalty: float | NotGiven = NOT_GIVEN, + guided_choice: List[str] | NotGiven = NOT_GIVEN, + logit_bias: Dict[str, float] | NotGiven = NOT_GIVEN, + logprobs: bool | NotGiven = NOT_GIVEN, + max_tokens: int | NotGiven = NOT_GIVEN, + n: int | NotGiven = NOT_GIVEN, + presence_penalty: float | NotGiven = NOT_GIVEN, + prompt_logprobs: int | NotGiven = NOT_GIVEN, + seed: int | NotGiven = NOT_GIVEN, + stop: Union[str, List[str]] | NotGiven = NOT_GIVEN, + stream_options: Dict[str, Union[bool, float, str, Iterable[object], object, None]] | NotGiven = NOT_GIVEN, + temperature: float | NotGiven = NOT_GIVEN, + top_p: float | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> Stream[CompletionCreateResponse]: + """ + Generate an OpenAI-compatible completion for the given prompt using the + specified model. + + Args: + model: The identifier of the model to use. The model must be registered with Llama + Stack and available via the /models endpoint. 
+ + prompt: The prompt to generate a completion for + + stream: (Optional) Whether to stream the response + + best_of: (Optional) The number of completions to generate + + echo: (Optional) Whether to echo the prompt + + frequency_penalty: (Optional) The penalty for repeated tokens + + logit_bias: (Optional) The logit bias to use + + logprobs: (Optional) The log probabilities to use + + max_tokens: (Optional) The maximum number of tokens to generate + + n: (Optional) The number of completions to generate + + presence_penalty: (Optional) The penalty for repeated tokens + + seed: (Optional) The seed to use + + stop: (Optional) The stop tokens to use + + stream_options: (Optional) The stream options to use + + temperature: (Optional) The temperature to use + + top_p: (Optional) The top p to use + + user: (Optional) The user to use + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @overload + def create( + self, + *, + model: str, + prompt: Union[str, List[str], Iterable[int], Iterable[Iterable[int]]], + stream: bool, + best_of: int | NotGiven = NOT_GIVEN, + echo: bool | NotGiven = NOT_GIVEN, + frequency_penalty: float | NotGiven = NOT_GIVEN, + guided_choice: List[str] | NotGiven = NOT_GIVEN, + logit_bias: Dict[str, float] | NotGiven = NOT_GIVEN, + logprobs: bool | NotGiven = NOT_GIVEN, + max_tokens: int | NotGiven = NOT_GIVEN, + n: int | NotGiven = NOT_GIVEN, + presence_penalty: float | NotGiven = NOT_GIVEN, + prompt_logprobs: int | NotGiven = NOT_GIVEN, + seed: int | NotGiven = NOT_GIVEN, + stop: Union[str, List[str]] | NotGiven = NOT_GIVEN, + stream_options: Dict[str, Union[bool, float, str, Iterable[object], object, None]] | NotGiven = NOT_GIVEN, + temperature: float | NotGiven = NOT_GIVEN, + top_p: float | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> CompletionCreateResponse | Stream[CompletionCreateResponse]: + """ + Generate an OpenAI-compatible completion for the given prompt using the + specified model. + + Args: + model: The identifier of the model to use. The model must be registered with Llama + Stack and available via the /models endpoint. 
+ + prompt: The prompt to generate a completion for + + stream: (Optional) Whether to stream the response + + best_of: (Optional) The number of completions to generate + + echo: (Optional) Whether to echo the prompt + + frequency_penalty: (Optional) The penalty for repeated tokens + + logit_bias: (Optional) The logit bias to use + + logprobs: (Optional) The log probabilities to use + + max_tokens: (Optional) The maximum number of tokens to generate + + n: (Optional) The number of completions to generate + + presence_penalty: (Optional) The penalty for repeated tokens + + seed: (Optional) The seed to use + + stop: (Optional) The stop tokens to use + + stream_options: (Optional) The stream options to use + + temperature: (Optional) The temperature to use + + top_p: (Optional) The top p to use + + user: (Optional) The user to use + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @required_args(["model", "prompt"], ["model", "prompt", "stream"]) + def create( + self, + *, + model: str, + prompt: Union[str, List[str], Iterable[int], Iterable[Iterable[int]]], + best_of: int | NotGiven = NOT_GIVEN, + echo: bool | NotGiven = NOT_GIVEN, + frequency_penalty: float | NotGiven = NOT_GIVEN, + guided_choice: List[str] | NotGiven = NOT_GIVEN, + logit_bias: Dict[str, float] | NotGiven = NOT_GIVEN, + logprobs: bool | NotGiven = NOT_GIVEN, + max_tokens: int | NotGiven = NOT_GIVEN, + n: int | NotGiven = NOT_GIVEN, + presence_penalty: float | NotGiven = NOT_GIVEN, + prompt_logprobs: int | NotGiven = NOT_GIVEN, + seed: int | NotGiven = NOT_GIVEN, + stop: Union[str, List[str]] | NotGiven = NOT_GIVEN, + stream: Literal[False] | Literal[True] | NotGiven = NOT_GIVEN, + stream_options: Dict[str, Union[bool, float, str, Iterable[object], object, None]] | NotGiven = NOT_GIVEN, + temperature: float | NotGiven = NOT_GIVEN, + top_p: float | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. 
+ extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> CompletionCreateResponse | Stream[CompletionCreateResponse]: + return self._post( + "/v1/openai/v1/completions", + body=maybe_transform( + { + "model": model, + "prompt": prompt, + "best_of": best_of, + "echo": echo, + "frequency_penalty": frequency_penalty, + "guided_choice": guided_choice, + "logit_bias": logit_bias, + "logprobs": logprobs, + "max_tokens": max_tokens, + "n": n, + "presence_penalty": presence_penalty, + "prompt_logprobs": prompt_logprobs, + "seed": seed, + "stop": stop, + "stream": stream, + "stream_options": stream_options, + "temperature": temperature, + "top_p": top_p, + "user": user, + }, + completion_create_params.CompletionCreateParamsStreaming + if stream + else completion_create_params.CompletionCreateParamsNonStreaming, + ), + options=make_request_options( + extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout + ), + cast_to=CompletionCreateResponse, + stream=stream or False, + stream_cls=Stream[CompletionCreateResponse], + ) + + +class AsyncCompletionsResource(AsyncAPIResource): + @cached_property + def with_raw_response(self) -> AsyncCompletionsResourceWithRawResponse: + """ + This property can be used as a prefix for any HTTP method call to return + the raw response object instead of the parsed content. + + For more information, see https://www.github.com/stainless-sdks/llama-stack-python#accessing-raw-response-data-eg-headers + """ + return AsyncCompletionsResourceWithRawResponse(self) + + @cached_property + def with_streaming_response(self) -> AsyncCompletionsResourceWithStreamingResponse: + """ + An alternative to `.with_raw_response` that doesn't eagerly read the response body. + + For more information, see https://www.github.com/stainless-sdks/llama-stack-python#with_streaming_response + """ + return AsyncCompletionsResourceWithStreamingResponse(self) + + @overload + async def create( + self, + *, + model: str, + prompt: Union[str, List[str], Iterable[int], Iterable[Iterable[int]]], + best_of: int | NotGiven = NOT_GIVEN, + echo: bool | NotGiven = NOT_GIVEN, + frequency_penalty: float | NotGiven = NOT_GIVEN, + guided_choice: List[str] | NotGiven = NOT_GIVEN, + logit_bias: Dict[str, float] | NotGiven = NOT_GIVEN, + logprobs: bool | NotGiven = NOT_GIVEN, + max_tokens: int | NotGiven = NOT_GIVEN, + n: int | NotGiven = NOT_GIVEN, + presence_penalty: float | NotGiven = NOT_GIVEN, + prompt_logprobs: int | NotGiven = NOT_GIVEN, + seed: int | NotGiven = NOT_GIVEN, + stop: Union[str, List[str]] | NotGiven = NOT_GIVEN, + stream: Literal[False] | NotGiven = NOT_GIVEN, + stream_options: Dict[str, Union[bool, float, str, Iterable[object], object, None]] | NotGiven = NOT_GIVEN, + temperature: float | NotGiven = NOT_GIVEN, + top_p: float | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> CompletionCreateResponse: + """ + Generate an OpenAI-compatible completion for the given prompt using the + specified model. 
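A minimal sketch of the synchronous CompletionsResource.create implemented above, covering the non-streaming and streaming forms; the LlamaStackClient entry point, the client.completions accessor, the endpoint, and the model id are assumptions, and the async resource that follows mirrors the same call shape.

from llama_stack_client import LlamaStackClient  # assumed sync client entry point

client = LlamaStackClient(base_url="http://localhost:8321")  # hypothetical endpoint

# Non-streaming prompt completion against POST /v1/openai/v1/completions.
completion = client.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder; must be registered via /models
    prompt="Write a haiku about the sea.",
    max_tokens=64,
    temperature=0.7,
)
print(completion)

# Streaming form: stream=True returns a Stream[CompletionCreateResponse] to iterate over.
for chunk in client.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    prompt="Write a haiku about the sea.",
    stream=True,
):
    print(chunk)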
+ + Args: + model: The identifier of the model to use. The model must be registered with Llama + Stack and available via the /models endpoint. + + prompt: The prompt to generate a completion for + + best_of: (Optional) The number of completions to generate + + echo: (Optional) Whether to echo the prompt + + frequency_penalty: (Optional) The penalty for repeated tokens + + logit_bias: (Optional) The logit bias to use + + logprobs: (Optional) The log probabilities to use + + max_tokens: (Optional) The maximum number of tokens to generate + + n: (Optional) The number of completions to generate + + presence_penalty: (Optional) The penalty for repeated tokens + + seed: (Optional) The seed to use + + stop: (Optional) The stop tokens to use + + stream: (Optional) Whether to stream the response + + stream_options: (Optional) The stream options to use + + temperature: (Optional) The temperature to use + + top_p: (Optional) The top p to use + + user: (Optional) The user to use + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @overload + async def create( + self, + *, + model: str, + prompt: Union[str, List[str], Iterable[int], Iterable[Iterable[int]]], + stream: Literal[True], + best_of: int | NotGiven = NOT_GIVEN, + echo: bool | NotGiven = NOT_GIVEN, + frequency_penalty: float | NotGiven = NOT_GIVEN, + guided_choice: List[str] | NotGiven = NOT_GIVEN, + logit_bias: Dict[str, float] | NotGiven = NOT_GIVEN, + logprobs: bool | NotGiven = NOT_GIVEN, + max_tokens: int | NotGiven = NOT_GIVEN, + n: int | NotGiven = NOT_GIVEN, + presence_penalty: float | NotGiven = NOT_GIVEN, + prompt_logprobs: int | NotGiven = NOT_GIVEN, + seed: int | NotGiven = NOT_GIVEN, + stop: Union[str, List[str]] | NotGiven = NOT_GIVEN, + stream_options: Dict[str, Union[bool, float, str, Iterable[object], object, None]] | NotGiven = NOT_GIVEN, + temperature: float | NotGiven = NOT_GIVEN, + top_p: float | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> AsyncStream[CompletionCreateResponse]: + """ + Generate an OpenAI-compatible completion for the given prompt using the + specified model. + + Args: + model: The identifier of the model to use. The model must be registered with Llama + Stack and available via the /models endpoint. 
+ + prompt: The prompt to generate a completion for + + stream: (Optional) Whether to stream the response + + best_of: (Optional) The number of completions to generate + + echo: (Optional) Whether to echo the prompt + + frequency_penalty: (Optional) The penalty for repeated tokens + + logit_bias: (Optional) The logit bias to use + + logprobs: (Optional) The log probabilities to use + + max_tokens: (Optional) The maximum number of tokens to generate + + n: (Optional) The number of completions to generate + + presence_penalty: (Optional) The penalty for repeated tokens + + seed: (Optional) The seed to use + + stop: (Optional) The stop tokens to use + + stream_options: (Optional) The stream options to use + + temperature: (Optional) The temperature to use + + top_p: (Optional) The top p to use + + user: (Optional) The user to use + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @overload + async def create( + self, + *, + model: str, + prompt: Union[str, List[str], Iterable[int], Iterable[Iterable[int]]], + stream: bool, + best_of: int | NotGiven = NOT_GIVEN, + echo: bool | NotGiven = NOT_GIVEN, + frequency_penalty: float | NotGiven = NOT_GIVEN, + guided_choice: List[str] | NotGiven = NOT_GIVEN, + logit_bias: Dict[str, float] | NotGiven = NOT_GIVEN, + logprobs: bool | NotGiven = NOT_GIVEN, + max_tokens: int | NotGiven = NOT_GIVEN, + n: int | NotGiven = NOT_GIVEN, + presence_penalty: float | NotGiven = NOT_GIVEN, + prompt_logprobs: int | NotGiven = NOT_GIVEN, + seed: int | NotGiven = NOT_GIVEN, + stop: Union[str, List[str]] | NotGiven = NOT_GIVEN, + stream_options: Dict[str, Union[bool, float, str, Iterable[object], object, None]] | NotGiven = NOT_GIVEN, + temperature: float | NotGiven = NOT_GIVEN, + top_p: float | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> CompletionCreateResponse | AsyncStream[CompletionCreateResponse]: + """ + Generate an OpenAI-compatible completion for the given prompt using the + specified model. + + Args: + model: The identifier of the model to use. The model must be registered with Llama + Stack and available via the /models endpoint. 
+ + prompt: The prompt to generate a completion for + + stream: (Optional) Whether to stream the response + + best_of: (Optional) The number of completions to generate + + echo: (Optional) Whether to echo the prompt + + frequency_penalty: (Optional) The penalty for repeated tokens + + logit_bias: (Optional) The logit bias to use + + logprobs: (Optional) The log probabilities to use + + max_tokens: (Optional) The maximum number of tokens to generate + + n: (Optional) The number of completions to generate + + presence_penalty: (Optional) The penalty for repeated tokens + + seed: (Optional) The seed to use + + stop: (Optional) The stop tokens to use + + stream_options: (Optional) The stream options to use + + temperature: (Optional) The temperature to use + + top_p: (Optional) The top p to use + + user: (Optional) The user to use + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @required_args(["model", "prompt"], ["model", "prompt", "stream"]) + async def create( + self, + *, + model: str, + prompt: Union[str, List[str], Iterable[int], Iterable[Iterable[int]]], + best_of: int | NotGiven = NOT_GIVEN, + echo: bool | NotGiven = NOT_GIVEN, + frequency_penalty: float | NotGiven = NOT_GIVEN, + guided_choice: List[str] | NotGiven = NOT_GIVEN, + logit_bias: Dict[str, float] | NotGiven = NOT_GIVEN, + logprobs: bool | NotGiven = NOT_GIVEN, + max_tokens: int | NotGiven = NOT_GIVEN, + n: int | NotGiven = NOT_GIVEN, + presence_penalty: float | NotGiven = NOT_GIVEN, + prompt_logprobs: int | NotGiven = NOT_GIVEN, + seed: int | NotGiven = NOT_GIVEN, + stop: Union[str, List[str]] | NotGiven = NOT_GIVEN, + stream: Literal[False] | Literal[True] | NotGiven = NOT_GIVEN, + stream_options: Dict[str, Union[bool, float, str, Iterable[object], object, None]] | NotGiven = NOT_GIVEN, + temperature: float | NotGiven = NOT_GIVEN, + top_p: float | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. 
+ extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> CompletionCreateResponse | AsyncStream[CompletionCreateResponse]: + return await self._post( + "/v1/openai/v1/completions", + body=await async_maybe_transform( + { + "model": model, + "prompt": prompt, + "best_of": best_of, + "echo": echo, + "frequency_penalty": frequency_penalty, + "guided_choice": guided_choice, + "logit_bias": logit_bias, + "logprobs": logprobs, + "max_tokens": max_tokens, + "n": n, + "presence_penalty": presence_penalty, + "prompt_logprobs": prompt_logprobs, + "seed": seed, + "stop": stop, + "stream": stream, + "stream_options": stream_options, + "temperature": temperature, + "top_p": top_p, + "user": user, + }, + completion_create_params.CompletionCreateParamsStreaming + if stream + else completion_create_params.CompletionCreateParamsNonStreaming, + ), + options=make_request_options( + extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout + ), + cast_to=CompletionCreateResponse, + stream=stream or False, + stream_cls=AsyncStream[CompletionCreateResponse], + ) + + +class CompletionsResourceWithRawResponse: + def __init__(self, completions: CompletionsResource) -> None: + self._completions = completions + + self.create = to_raw_response_wrapper( + completions.create, + ) + + +class AsyncCompletionsResourceWithRawResponse: + def __init__(self, completions: AsyncCompletionsResource) -> None: + self._completions = completions + + self.create = async_to_raw_response_wrapper( + completions.create, + ) + + +class CompletionsResourceWithStreamingResponse: + def __init__(self, completions: CompletionsResource) -> None: + self._completions = completions + + self.create = to_streamed_response_wrapper( + completions.create, + ) + + +class AsyncCompletionsResourceWithStreamingResponse: + def __init__(self, completions: AsyncCompletionsResource) -> None: + self._completions = completions + + self.create = async_to_streamed_response_wrapper( + completions.create, + ) diff --git a/src/llama_stack_client/resources/datasets.py b/src/llama_stack_client/resources/datasets.py index aaa27a5e..845f182b 100644 --- a/src/llama_stack_client/resources/datasets.py +++ b/src/llama_stack_client/resources/datasets.py @@ -9,10 +9,7 @@ from ..types import dataset_iterrows_params, dataset_register_params from .._types import NOT_GIVEN, Body, Query, Headers, NoneType, NotGiven -from .._utils import ( - maybe_transform, - async_maybe_transform, -) +from .._utils import maybe_transform, async_maybe_transform from .._compat import cached_property from .._resource import SyncAPIResource, AsyncAPIResource from .._response import ( diff --git a/src/llama_stack_client/resources/eval/eval.py b/src/llama_stack_client/resources/eval/eval.py index e73f7df1..23d1500c 100644 --- a/src/llama_stack_client/resources/eval/eval.py +++ b/src/llama_stack_client/resources/eval/eval.py @@ -21,10 +21,7 @@ eval_evaluate_rows_alpha_params, ) from ..._types import NOT_GIVEN, Body, Query, Headers, NotGiven -from ..._utils import ( - maybe_transform, - async_maybe_transform, -) +from ..._utils import maybe_transform, async_maybe_transform from ..._compat import cached_property from ..._resource import SyncAPIResource, AsyncAPIResource from ..._response import ( diff --git a/src/llama_stack_client/resources/inference.py b/src/llama_stack_client/resources/inference.py index 428956ce..be87eda4 100644 --- 
a/src/llama_stack_client/resources/inference.py +++ b/src/llama_stack_client/resources/inference.py @@ -15,11 +15,7 @@ inference_batch_chat_completion_params, ) from .._types import NOT_GIVEN, Body, Query, Headers, NotGiven -from .._utils import ( - required_args, - maybe_transform, - async_maybe_transform, -) +from .._utils import required_args, maybe_transform, async_maybe_transform from .._compat import cached_property from .._resource import SyncAPIResource, AsyncAPIResource from .._response import ( diff --git a/src/llama_stack_client/resources/models.py b/src/llama_stack_client/resources/models.py index db08a9d5..02458d3b 100644 --- a/src/llama_stack_client/resources/models.py +++ b/src/llama_stack_client/resources/models.py @@ -9,10 +9,7 @@ from ..types import model_register_params from .._types import NOT_GIVEN, Body, Query, Headers, NoneType, NotGiven -from .._utils import ( - maybe_transform, - async_maybe_transform, -) +from .._utils import maybe_transform, async_maybe_transform from .._compat import cached_property from .._resource import SyncAPIResource, AsyncAPIResource from .._response import ( diff --git a/src/llama_stack_client/resources/post_training/job.py b/src/llama_stack_client/resources/post_training/job.py index bcd31952..a55ba7fa 100644 --- a/src/llama_stack_client/resources/post_training/job.py +++ b/src/llama_stack_client/resources/post_training/job.py @@ -7,10 +7,7 @@ import httpx from ..._types import NOT_GIVEN, Body, Query, Headers, NoneType, NotGiven -from ..._utils import ( - maybe_transform, - async_maybe_transform, -) +from ..._utils import maybe_transform, async_maybe_transform from ..._compat import cached_property from ..._resource import SyncAPIResource, AsyncAPIResource from ..._response import ( diff --git a/src/llama_stack_client/resources/post_training/post_training.py b/src/llama_stack_client/resources/post_training/post_training.py index a93a1ebb..fe0d2b7b 100644 --- a/src/llama_stack_client/resources/post_training/post_training.py +++ b/src/llama_stack_client/resources/post_training/post_training.py @@ -19,10 +19,7 @@ post_training_supervised_fine_tune_params, ) from ..._types import NOT_GIVEN, Body, Query, Headers, NotGiven -from ..._utils import ( - maybe_transform, - async_maybe_transform, -) +from ..._utils import maybe_transform, async_maybe_transform from ..._compat import cached_property from ..._resource import SyncAPIResource, AsyncAPIResource from ..._response import ( @@ -113,10 +110,10 @@ def supervised_fine_tune( hyperparam_search_config: Dict[str, Union[bool, float, str, Iterable[object], object, None]], job_uuid: str, logger_config: Dict[str, Union[bool, float, str, Iterable[object], object, None]], - model: str, training_config: post_training_supervised_fine_tune_params.TrainingConfig, algorithm_config: AlgorithmConfigParam | NotGiven = NOT_GIVEN, checkpoint_dir: str | NotGiven = NOT_GIVEN, + model: str | NotGiven = NOT_GIVEN, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. 
extra_headers: Headers | None = None, @@ -141,10 +138,10 @@ def supervised_fine_tune( "hyperparam_search_config": hyperparam_search_config, "job_uuid": job_uuid, "logger_config": logger_config, - "model": model, "training_config": training_config, "algorithm_config": algorithm_config, "checkpoint_dir": checkpoint_dir, + "model": model, }, post_training_supervised_fine_tune_params.PostTrainingSupervisedFineTuneParams, ), @@ -230,10 +227,10 @@ async def supervised_fine_tune( hyperparam_search_config: Dict[str, Union[bool, float, str, Iterable[object], object, None]], job_uuid: str, logger_config: Dict[str, Union[bool, float, str, Iterable[object], object, None]], - model: str, training_config: post_training_supervised_fine_tune_params.TrainingConfig, algorithm_config: AlgorithmConfigParam | NotGiven = NOT_GIVEN, checkpoint_dir: str | NotGiven = NOT_GIVEN, + model: str | NotGiven = NOT_GIVEN, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. extra_headers: Headers | None = None, @@ -258,10 +255,10 @@ async def supervised_fine_tune( "hyperparam_search_config": hyperparam_search_config, "job_uuid": job_uuid, "logger_config": logger_config, - "model": model, "training_config": training_config, "algorithm_config": algorithm_config, "checkpoint_dir": checkpoint_dir, + "model": model, }, post_training_supervised_fine_tune_params.PostTrainingSupervisedFineTuneParams, ), diff --git a/src/llama_stack_client/resources/safety.py b/src/llama_stack_client/resources/safety.py index 7382c81c..66646102 100644 --- a/src/llama_stack_client/resources/safety.py +++ b/src/llama_stack_client/resources/safety.py @@ -8,10 +8,7 @@ from ..types import safety_run_shield_params from .._types import NOT_GIVEN, Body, Query, Headers, NotGiven -from .._utils import ( - maybe_transform, - async_maybe_transform, -) +from .._utils import maybe_transform, async_maybe_transform from .._compat import cached_property from .._resource import SyncAPIResource, AsyncAPIResource from .._response import ( diff --git a/src/llama_stack_client/resources/scoring.py b/src/llama_stack_client/resources/scoring.py index ebe42934..33ee8969 100644 --- a/src/llama_stack_client/resources/scoring.py +++ b/src/llama_stack_client/resources/scoring.py @@ -8,10 +8,7 @@ from ..types import scoring_score_params, scoring_score_batch_params from .._types import NOT_GIVEN, Body, Query, Headers, NotGiven -from .._utils import ( - maybe_transform, - async_maybe_transform, -) +from .._utils import maybe_transform, async_maybe_transform from .._compat import cached_property from .._resource import SyncAPIResource, AsyncAPIResource from .._response import ( diff --git a/src/llama_stack_client/resources/scoring_functions.py b/src/llama_stack_client/resources/scoring_functions.py index c152c805..f01ff17b 100644 --- a/src/llama_stack_client/resources/scoring_functions.py +++ b/src/llama_stack_client/resources/scoring_functions.py @@ -8,10 +8,7 @@ from ..types import scoring_function_register_params from .._types import NOT_GIVEN, Body, Query, Headers, NoneType, NotGiven -from .._utils import ( - maybe_transform, - async_maybe_transform, -) +from .._utils import maybe_transform, async_maybe_transform from .._compat import cached_property from .._resource import SyncAPIResource, AsyncAPIResource from .._response import ( diff --git a/src/llama_stack_client/resources/shields.py 
b/src/llama_stack_client/resources/shields.py index 150455c3..4ef88ac7 100644 --- a/src/llama_stack_client/resources/shields.py +++ b/src/llama_stack_client/resources/shields.py @@ -8,10 +8,7 @@ from ..types import shield_register_params from .._types import NOT_GIVEN, Body, Query, Headers, NotGiven -from .._utils import ( - maybe_transform, - async_maybe_transform, -) +from .._utils import maybe_transform, async_maybe_transform from .._compat import cached_property from .._resource import SyncAPIResource, AsyncAPIResource from .._response import ( diff --git a/src/llama_stack_client/resources/synthetic_data_generation.py b/src/llama_stack_client/resources/synthetic_data_generation.py index 3c848575..59df1b39 100644 --- a/src/llama_stack_client/resources/synthetic_data_generation.py +++ b/src/llama_stack_client/resources/synthetic_data_generation.py @@ -9,10 +9,7 @@ from ..types import synthetic_data_generation_generate_params from .._types import NOT_GIVEN, Body, Query, Headers, NotGiven -from .._utils import ( - maybe_transform, - async_maybe_transform, -) +from .._utils import maybe_transform, async_maybe_transform from .._compat import cached_property from .._resource import SyncAPIResource, AsyncAPIResource from .._response import ( diff --git a/src/llama_stack_client/resources/telemetry.py b/src/llama_stack_client/resources/telemetry.py index cd93e775..12261eee 100644 --- a/src/llama_stack_client/resources/telemetry.py +++ b/src/llama_stack_client/resources/telemetry.py @@ -14,10 +14,7 @@ telemetry_save_spans_to_dataset_params, ) from .._types import NOT_GIVEN, Body, Query, Headers, NoneType, NotGiven -from .._utils import ( - maybe_transform, - async_maybe_transform, -) +from .._utils import maybe_transform, async_maybe_transform from .._compat import cached_property from .._resource import SyncAPIResource, AsyncAPIResource from .._response import ( diff --git a/src/llama_stack_client/resources/tool_runtime/rag_tool.py b/src/llama_stack_client/resources/tool_runtime/rag_tool.py index 14ea8454..048ea980 100644 --- a/src/llama_stack_client/resources/tool_runtime/rag_tool.py +++ b/src/llama_stack_client/resources/tool_runtime/rag_tool.py @@ -7,10 +7,7 @@ import httpx from ..._types import NOT_GIVEN, Body, Query, Headers, NoneType, NotGiven -from ..._utils import ( - maybe_transform, - async_maybe_transform, -) +from ..._utils import maybe_transform, async_maybe_transform from ..._compat import cached_property from ..._resource import SyncAPIResource, AsyncAPIResource from ..._response import ( diff --git a/src/llama_stack_client/resources/tool_runtime/tool_runtime.py b/src/llama_stack_client/resources/tool_runtime/tool_runtime.py index aa380f79..dda3f661 100644 --- a/src/llama_stack_client/resources/tool_runtime/tool_runtime.py +++ b/src/llama_stack_client/resources/tool_runtime/tool_runtime.py @@ -8,10 +8,7 @@ from ...types import tool_runtime_list_tools_params, tool_runtime_invoke_tool_params from ..._types import NOT_GIVEN, Body, Query, Headers, NotGiven -from ..._utils import ( - maybe_transform, - async_maybe_transform, -) +from ..._utils import maybe_transform, async_maybe_transform from .rag_tool import ( RagToolResource, AsyncRagToolResource, diff --git a/src/llama_stack_client/resources/toolgroups.py b/src/llama_stack_client/resources/toolgroups.py index 6a9b79d0..d882a6eb 100644 --- a/src/llama_stack_client/resources/toolgroups.py +++ b/src/llama_stack_client/resources/toolgroups.py @@ -8,10 +8,7 @@ from ..types import toolgroup_register_params from .._types import NOT_GIVEN, 
Body, Query, Headers, NoneType, NotGiven -from .._utils import ( - maybe_transform, - async_maybe_transform, -) +from .._utils import maybe_transform, async_maybe_transform from .._compat import cached_property from .._resource import SyncAPIResource, AsyncAPIResource from .._response import ( diff --git a/src/llama_stack_client/resources/tools.py b/src/llama_stack_client/resources/tools.py index 206389f3..8a9b91e8 100644 --- a/src/llama_stack_client/resources/tools.py +++ b/src/llama_stack_client/resources/tools.py @@ -8,10 +8,7 @@ from ..types import tool_list_params from .._types import NOT_GIVEN, Body, Query, Headers, NotGiven -from .._utils import ( - maybe_transform, - async_maybe_transform, -) +from .._utils import maybe_transform, async_maybe_transform from .._compat import cached_property from .._resource import SyncAPIResource, AsyncAPIResource from .._response import ( diff --git a/src/llama_stack_client/resources/vector_dbs.py b/src/llama_stack_client/resources/vector_dbs.py index 79d7939d..c75d261d 100644 --- a/src/llama_stack_client/resources/vector_dbs.py +++ b/src/llama_stack_client/resources/vector_dbs.py @@ -8,10 +8,7 @@ from ..types import vector_db_register_params from .._types import NOT_GIVEN, Body, Query, Headers, NoneType, NotGiven -from .._utils import ( - maybe_transform, - async_maybe_transform, -) +from .._utils import maybe_transform, async_maybe_transform from .._compat import cached_property from .._resource import SyncAPIResource, AsyncAPIResource from .._response import ( diff --git a/src/llama_stack_client/resources/vector_io.py b/src/llama_stack_client/resources/vector_io.py index a432ea40..9b1e8822 100644 --- a/src/llama_stack_client/resources/vector_io.py +++ b/src/llama_stack_client/resources/vector_io.py @@ -8,10 +8,7 @@ from ..types import vector_io_query_params, vector_io_insert_params from .._types import NOT_GIVEN, Body, Query, Headers, NoneType, NotGiven -from .._utils import ( - maybe_transform, - async_maybe_transform, -) +from .._utils import maybe_transform, async_maybe_transform from .._compat import cached_property from .._resource import SyncAPIResource, AsyncAPIResource from .._response import ( diff --git a/src/llama_stack_client/types/__init__.py b/src/llama_stack_client/types/__init__.py index a78eae03..3db3080c 100644 --- a/src/llama_stack_client/types/__init__.py +++ b/src/llama_stack_client/types/__init__.py @@ -70,6 +70,7 @@ from .scoring_score_params import ScoringScoreParams as ScoringScoreParams from .shield_list_response import ShieldListResponse as ShieldListResponse from .agent_create_response import AgentCreateResponse as AgentCreateResponse +from .chat_completion_chunk import ChatCompletionChunk as ChatCompletionChunk from .dataset_list_response import DatasetListResponse as DatasetListResponse from .list_shields_response import ListShieldsResponse as ListShieldsResponse from .memory_retrieval_step import MemoryRetrievalStep as MemoryRetrievalStep @@ -92,6 +93,7 @@ from .toolgroup_list_response import ToolgroupListResponse as ToolgroupListResponse from .vector_db_list_response import VectorDBListResponse as VectorDBListResponse from .vector_io_insert_params import VectorIoInsertParams as VectorIoInsertParams +from .completion_create_params import CompletionCreateParams as CompletionCreateParams from .list_benchmarks_response import ListBenchmarksResponse as ListBenchmarksResponse from .list_vector_dbs_response import ListVectorDBsResponse as ListVectorDBsResponse from .safety_run_shield_params import SafetyRunShieldParams 
as SafetyRunShieldParams @@ -103,6 +105,7 @@ from .list_tool_groups_response import ListToolGroupsResponse as ListToolGroupsResponse from .toolgroup_register_params import ToolgroupRegisterParams as ToolgroupRegisterParams from .vector_db_register_params import VectorDBRegisterParams as VectorDBRegisterParams +from .completion_create_response import CompletionCreateResponse as CompletionCreateResponse from .eval_run_eval_alpha_params import EvalRunEvalAlphaParams as EvalRunEvalAlphaParams from .scoring_score_batch_params import ScoringScoreBatchParams as ScoringScoreBatchParams from .telemetry_log_event_params import TelemetryLogEventParams as TelemetryLogEventParams diff --git a/src/llama_stack_client/types/agent_create_response.py b/src/llama_stack_client/types/agent_create_response.py index 65d2275f..93651cb6 100644 --- a/src/llama_stack_client/types/agent_create_response.py +++ b/src/llama_stack_client/types/agent_create_response.py @@ -1,6 +1,5 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - from .._models import BaseModel __all__ = ["AgentCreateResponse"] diff --git a/src/llama_stack_client/types/agents/agent_turn_response_stream_chunk.py b/src/llama_stack_client/types/agents/agent_turn_response_stream_chunk.py index bda45d88..c488ba81 100644 --- a/src/llama_stack_client/types/agents/agent_turn_response_stream_chunk.py +++ b/src/llama_stack_client/types/agents/agent_turn_response_stream_chunk.py @@ -1,6 +1,5 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - from ..._models import BaseModel from .turn_response_event import TurnResponseEvent diff --git a/src/llama_stack_client/types/agents/session_create_response.py b/src/llama_stack_client/types/agents/session_create_response.py index 6adcf0b2..abf18665 100644 --- a/src/llama_stack_client/types/agents/session_create_response.py +++ b/src/llama_stack_client/types/agents/session_create_response.py @@ -1,6 +1,5 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - from ..._models import BaseModel __all__ = ["SessionCreateResponse"] diff --git a/src/llama_stack_client/types/agents/turn_response_event.py b/src/llama_stack_client/types/agents/turn_response_event.py index 1b9ad5a6..c6a42d75 100644 --- a/src/llama_stack_client/types/agents/turn_response_event.py +++ b/src/llama_stack_client/types/agents/turn_response_event.py @@ -1,6 +1,5 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - from ..._models import BaseModel from .turn_response_event_payload import TurnResponseEventPayload diff --git a/src/llama_stack_client/types/chat/__init__.py b/src/llama_stack_client/types/chat/__init__.py new file mode 100644 index 00000000..9384ac14 --- /dev/null +++ b/src/llama_stack_client/types/chat/__init__.py @@ -0,0 +1,6 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from .completion_create_params import CompletionCreateParams as CompletionCreateParams +from .completion_create_response import CompletionCreateResponse as CompletionCreateResponse diff --git a/src/llama_stack_client/types/chat/completion_create_params.py b/src/llama_stack_client/types/chat/completion_create_params.py new file mode 100644 index 00000000..0281420b --- /dev/null +++ b/src/llama_stack_client/types/chat/completion_create_params.py @@ -0,0 +1,401 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
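As a sketch of how the typed params defined in this module fit together, the following builds a user message with text and image parts plus a json_schema response format and passes them to chat.completions.create. The client construction, endpoint, and model id are assumptions; because these classes are TypedDicts, plain dicts of the same shape work equally well.

from llama_stack_client import LlamaStackClient  # assumed client entry point
from llama_stack_client.types.chat import completion_create_params as params

client = LlamaStackClient(base_url="http://localhost:8321")  # hypothetical endpoint

# TypedDict constructors return plain dicts, so literal dicts with the same keys are equivalent.
user_message = params.MessageOpenAIUserMessageParam(
    role="user",
    content=[
        params.MessageOpenAIUserMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam(
            type="text",
            text="Describe this image in one sentence.",
        ),
        params.MessageOpenAIUserMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam(
            type="image_url",
            image_url={"url": "https://example.com/cat.png"},  # placeholder URL
        ),
    ],
)

response_format = params.ResponseFormatOpenAIResponseFormatJsonSchema(
    type="json_schema",
    json_schema={"name": "caption", "schema": {"type": "object"}, "strict": True},
)

response = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder; must be registered via /models
    messages=[user_message],
    response_format=response_format,
)
print(response)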
+ +from __future__ import annotations + +from typing import Dict, List, Union, Iterable +from typing_extensions import Literal, Required, TypeAlias, TypedDict + +__all__ = [ + "CompletionCreateParamsBase", + "Message", + "MessageOpenAIUserMessageParam", + "MessageOpenAIUserMessageParamContentUnionMember1", + "MessageOpenAIUserMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam", + "MessageOpenAIUserMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam", + "MessageOpenAIUserMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL", + "MessageOpenAISystemMessageParam", + "MessageOpenAISystemMessageParamContentUnionMember1", + "MessageOpenAISystemMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam", + "MessageOpenAISystemMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam", + "MessageOpenAISystemMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL", + "MessageOpenAIAssistantMessageParam", + "MessageOpenAIAssistantMessageParamContentUnionMember1", + "MessageOpenAIAssistantMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam", + "MessageOpenAIAssistantMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam", + "MessageOpenAIAssistantMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL", + "MessageOpenAIAssistantMessageParamToolCall", + "MessageOpenAIAssistantMessageParamToolCallFunction", + "MessageOpenAIToolMessageParam", + "MessageOpenAIToolMessageParamContentUnionMember1", + "MessageOpenAIToolMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam", + "MessageOpenAIToolMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam", + "MessageOpenAIToolMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL", + "MessageOpenAIDeveloperMessageParam", + "MessageOpenAIDeveloperMessageParamContentUnionMember1", + "MessageOpenAIDeveloperMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam", + "MessageOpenAIDeveloperMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam", + "MessageOpenAIDeveloperMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL", + "ResponseFormat", + "ResponseFormatOpenAIResponseFormatText", + "ResponseFormatOpenAIResponseFormatJsonSchema", + "ResponseFormatOpenAIResponseFormatJsonSchemaJsonSchema", + "ResponseFormatOpenAIResponseFormatJsonObject", + "CompletionCreateParamsNonStreaming", + "CompletionCreateParamsStreaming", +] + + +class CompletionCreateParamsBase(TypedDict, total=False): + messages: Required[Iterable[Message]] + """List of messages in the conversation""" + + model: Required[str] + """The identifier of the model to use. + + The model must be registered with Llama Stack and available via the /models + endpoint. 
+ """ + + frequency_penalty: float + """(Optional) The penalty for repeated tokens""" + + function_call: Union[str, Dict[str, Union[bool, float, str, Iterable[object], object, None]]] + """(Optional) The function call to use""" + + functions: Iterable[Dict[str, Union[bool, float, str, Iterable[object], object, None]]] + """(Optional) List of functions to use""" + + logit_bias: Dict[str, float] + """(Optional) The logit bias to use""" + + logprobs: bool + """(Optional) The log probabilities to use""" + + max_completion_tokens: int + """(Optional) The maximum number of tokens to generate""" + + max_tokens: int + """(Optional) The maximum number of tokens to generate""" + + n: int + """(Optional) The number of completions to generate""" + + parallel_tool_calls: bool + """(Optional) Whether to parallelize tool calls""" + + presence_penalty: float + """(Optional) The penalty for repeated tokens""" + + response_format: ResponseFormat + """(Optional) The response format to use""" + + seed: int + """(Optional) The seed to use""" + + stop: Union[str, List[str]] + """(Optional) The stop tokens to use""" + + stream_options: Dict[str, Union[bool, float, str, Iterable[object], object, None]] + """(Optional) The stream options to use""" + + temperature: float + """(Optional) The temperature to use""" + + tool_choice: Union[str, Dict[str, Union[bool, float, str, Iterable[object], object, None]]] + """(Optional) The tool choice to use""" + + tools: Iterable[Dict[str, Union[bool, float, str, Iterable[object], object, None]]] + """(Optional) The tools to use""" + + top_logprobs: int + """(Optional) The top log probabilities to use""" + + top_p: float + """(Optional) The top p to use""" + + user: str + """(Optional) The user to use""" + + +class MessageOpenAIUserMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam(TypedDict, total=False): + text: Required[str] + + type: Required[Literal["text"]] + + +class MessageOpenAIUserMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL( + TypedDict, total=False +): + url: Required[str] + + detail: str + + +class MessageOpenAIUserMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam(TypedDict, total=False): + image_url: Required[ + MessageOpenAIUserMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL + ] + + type: Required[Literal["image_url"]] + + +MessageOpenAIUserMessageParamContentUnionMember1: TypeAlias = Union[ + MessageOpenAIUserMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam, + MessageOpenAIUserMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam, +] + + +class MessageOpenAIUserMessageParam(TypedDict, total=False): + content: Required[Union[str, Iterable[MessageOpenAIUserMessageParamContentUnionMember1]]] + """The content of the message, which can include text and other media""" + + role: Required[Literal["user"]] + """Must be "user" to identify this as a user message""" + + name: str + """(Optional) The name of the user message participant.""" + + +class MessageOpenAISystemMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam( + TypedDict, total=False +): + text: Required[str] + + type: Required[Literal["text"]] + + +class MessageOpenAISystemMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL( + TypedDict, total=False +): + url: Required[str] + + detail: str + + +class MessageOpenAISystemMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam( + TypedDict, 
total=False +): + image_url: Required[ + MessageOpenAISystemMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL + ] + + type: Required[Literal["image_url"]] + + +MessageOpenAISystemMessageParamContentUnionMember1: TypeAlias = Union[ + MessageOpenAISystemMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam, + MessageOpenAISystemMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam, +] + + +class MessageOpenAISystemMessageParam(TypedDict, total=False): + content: Required[Union[str, Iterable[MessageOpenAISystemMessageParamContentUnionMember1]]] + """The content of the "system prompt". + + If multiple system messages are provided, they are concatenated. The underlying + Llama Stack code may also add other system messages (for example, for formatting + tool definitions). + """ + + role: Required[Literal["system"]] + """Must be "system" to identify this as a system message""" + + name: str + """(Optional) The name of the system message participant.""" + + +class MessageOpenAIAssistantMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam( + TypedDict, total=False +): + text: Required[str] + + type: Required[Literal["text"]] + + +class MessageOpenAIAssistantMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL( + TypedDict, total=False +): + url: Required[str] + + detail: str + + +class MessageOpenAIAssistantMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam( + TypedDict, total=False +): + image_url: Required[ + MessageOpenAIAssistantMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL + ] + + type: Required[Literal["image_url"]] + + +MessageOpenAIAssistantMessageParamContentUnionMember1: TypeAlias = Union[ + MessageOpenAIAssistantMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam, + MessageOpenAIAssistantMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam, +] + + +class MessageOpenAIAssistantMessageParamToolCallFunction(TypedDict, total=False): + arguments: str + + name: str + + +class MessageOpenAIAssistantMessageParamToolCall(TypedDict, total=False): + type: Required[Literal["function"]] + + id: str + + function: MessageOpenAIAssistantMessageParamToolCallFunction + + index: int + + +class MessageOpenAIAssistantMessageParam(TypedDict, total=False): + role: Required[Literal["assistant"]] + """Must be "assistant" to identify this as the model's response""" + + content: Union[str, Iterable[MessageOpenAIAssistantMessageParamContentUnionMember1]] + """The content of the model's response""" + + name: str + """(Optional) The name of the assistant message participant.""" + + tool_calls: Iterable[MessageOpenAIAssistantMessageParamToolCall] + """List of tool calls. 
Each tool call is an OpenAIChatCompletionToolCall object.""" + + +class MessageOpenAIToolMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam(TypedDict, total=False): + text: Required[str] + + type: Required[Literal["text"]] + + +class MessageOpenAIToolMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL( + TypedDict, total=False +): + url: Required[str] + + detail: str + + +class MessageOpenAIToolMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam(TypedDict, total=False): + image_url: Required[ + MessageOpenAIToolMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL + ] + + type: Required[Literal["image_url"]] + + +MessageOpenAIToolMessageParamContentUnionMember1: TypeAlias = Union[ + MessageOpenAIToolMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam, + MessageOpenAIToolMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam, +] + + +class MessageOpenAIToolMessageParam(TypedDict, total=False): + content: Required[Union[str, Iterable[MessageOpenAIToolMessageParamContentUnionMember1]]] + """The response content from the tool""" + + role: Required[Literal["tool"]] + """Must be "tool" to identify this as a tool response""" + + tool_call_id: Required[str] + """Unique identifier for the tool call this response is for""" + + +class MessageOpenAIDeveloperMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam( + TypedDict, total=False +): + text: Required[str] + + type: Required[Literal["text"]] + + +class MessageOpenAIDeveloperMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL( + TypedDict, total=False +): + url: Required[str] + + detail: str + + +class MessageOpenAIDeveloperMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam( + TypedDict, total=False +): + image_url: Required[ + MessageOpenAIDeveloperMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL + ] + + type: Required[Literal["image_url"]] + + +MessageOpenAIDeveloperMessageParamContentUnionMember1: TypeAlias = Union[ + MessageOpenAIDeveloperMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam, + MessageOpenAIDeveloperMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam, +] + + +class MessageOpenAIDeveloperMessageParam(TypedDict, total=False): + content: Required[Union[str, Iterable[MessageOpenAIDeveloperMessageParamContentUnionMember1]]] + """The content of the developer message""" + + role: Required[Literal["developer"]] + """Must be "developer" to identify this as a developer message""" + + name: str + """(Optional) The name of the developer message participant.""" + + +Message: TypeAlias = Union[ + MessageOpenAIUserMessageParam, + MessageOpenAISystemMessageParam, + MessageOpenAIAssistantMessageParam, + MessageOpenAIToolMessageParam, + MessageOpenAIDeveloperMessageParam, +] + + +class ResponseFormatOpenAIResponseFormatText(TypedDict, total=False): + type: Required[Literal["text"]] + + +class ResponseFormatOpenAIResponseFormatJsonSchemaJsonSchema(TypedDict, total=False): + name: Required[str] + + description: str + + schema: Dict[str, Union[bool, float, str, Iterable[object], object, None]] + + strict: bool + + +class ResponseFormatOpenAIResponseFormatJsonSchema(TypedDict, total=False): + json_schema: Required[ResponseFormatOpenAIResponseFormatJsonSchemaJsonSchema] + + type: Required[Literal["json_schema"]] + + +class ResponseFormatOpenAIResponseFormatJsonObject(TypedDict, 
total=False): + type: Required[Literal["json_object"]] + + +ResponseFormat: TypeAlias = Union[ + ResponseFormatOpenAIResponseFormatText, + ResponseFormatOpenAIResponseFormatJsonSchema, + ResponseFormatOpenAIResponseFormatJsonObject, +] + + +class CompletionCreateParamsNonStreaming(CompletionCreateParamsBase, total=False): + stream: Literal[False] + """(Optional) Whether to stream the response""" + + +class CompletionCreateParamsStreaming(CompletionCreateParamsBase): + stream: Required[Literal[True]] + """(Optional) Whether to stream the response""" + + +CompletionCreateParams = Union[CompletionCreateParamsNonStreaming, CompletionCreateParamsStreaming] diff --git a/src/llama_stack_client/types/chat/completion_create_response.py b/src/llama_stack_client/types/chat/completion_create_response.py new file mode 100644 index 00000000..5c8eb51c --- /dev/null +++ b/src/llama_stack_client/types/chat/completion_create_response.py @@ -0,0 +1,383 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import List, Union, Optional +from typing_extensions import Literal, Annotated, TypeAlias + +from ..._utils import PropertyInfo +from ..._models import BaseModel +from ..chat_completion_chunk import ChatCompletionChunk + +__all__ = [ + "CompletionCreateResponse", + "OpenAIChatCompletion", + "OpenAIChatCompletionChoice", + "OpenAIChatCompletionChoiceMessage", + "OpenAIChatCompletionChoiceMessageOpenAIUserMessageParam", + "OpenAIChatCompletionChoiceMessageOpenAIUserMessageParamContentUnionMember1", + "OpenAIChatCompletionChoiceMessageOpenAIUserMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam", + "OpenAIChatCompletionChoiceMessageOpenAIUserMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam", + "OpenAIChatCompletionChoiceMessageOpenAIUserMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL", + "OpenAIChatCompletionChoiceMessageOpenAISystemMessageParam", + "OpenAIChatCompletionChoiceMessageOpenAISystemMessageParamContentUnionMember1", + "OpenAIChatCompletionChoiceMessageOpenAISystemMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam", + "OpenAIChatCompletionChoiceMessageOpenAISystemMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam", + "OpenAIChatCompletionChoiceMessageOpenAISystemMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL", + "OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam", + "OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParamContentUnionMember1", + "OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam", + "OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam", + "OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL", + "OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParamToolCall", + "OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParamToolCallFunction", + "OpenAIChatCompletionChoiceMessageOpenAIToolMessageParam", + "OpenAIChatCompletionChoiceMessageOpenAIToolMessageParamContentUnionMember1", + "OpenAIChatCompletionChoiceMessageOpenAIToolMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam", + "OpenAIChatCompletionChoiceMessageOpenAIToolMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam", + 
"OpenAIChatCompletionChoiceMessageOpenAIToolMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL", + "OpenAIChatCompletionChoiceMessageOpenAIDeveloperMessageParam", + "OpenAIChatCompletionChoiceMessageOpenAIDeveloperMessageParamContentUnionMember1", + "OpenAIChatCompletionChoiceMessageOpenAIDeveloperMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam", + "OpenAIChatCompletionChoiceMessageOpenAIDeveloperMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam", + "OpenAIChatCompletionChoiceMessageOpenAIDeveloperMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL", + "OpenAIChatCompletionChoiceLogprobs", + "OpenAIChatCompletionChoiceLogprobsContent", + "OpenAIChatCompletionChoiceLogprobsContentTopLogprob", + "OpenAIChatCompletionChoiceLogprobsRefusal", + "OpenAIChatCompletionChoiceLogprobsRefusalTopLogprob", +] + + +class OpenAIChatCompletionChoiceMessageOpenAIUserMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam( + BaseModel +): + text: str + + type: Literal["text"] + + +class OpenAIChatCompletionChoiceMessageOpenAIUserMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL( + BaseModel +): + url: str + + detail: Optional[str] = None + + +class OpenAIChatCompletionChoiceMessageOpenAIUserMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam( + BaseModel +): + image_url: OpenAIChatCompletionChoiceMessageOpenAIUserMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL + + type: Literal["image_url"] + + +OpenAIChatCompletionChoiceMessageOpenAIUserMessageParamContentUnionMember1: TypeAlias = Annotated[ + Union[ + OpenAIChatCompletionChoiceMessageOpenAIUserMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam, + OpenAIChatCompletionChoiceMessageOpenAIUserMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam, + ], + PropertyInfo(discriminator="type"), +] + + +class OpenAIChatCompletionChoiceMessageOpenAIUserMessageParam(BaseModel): + content: Union[str, List[OpenAIChatCompletionChoiceMessageOpenAIUserMessageParamContentUnionMember1]] + """The content of the message, which can include text and other media""" + + role: Literal["user"] + """Must be "user" to identify this as a user message""" + + name: Optional[str] = None + """(Optional) The name of the user message participant.""" + + +class OpenAIChatCompletionChoiceMessageOpenAISystemMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam( + BaseModel +): + text: str + + type: Literal["text"] + + +class OpenAIChatCompletionChoiceMessageOpenAISystemMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL( + BaseModel +): + url: str + + detail: Optional[str] = None + + +class OpenAIChatCompletionChoiceMessageOpenAISystemMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam( + BaseModel +): + image_url: OpenAIChatCompletionChoiceMessageOpenAISystemMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL + + type: Literal["image_url"] + + +OpenAIChatCompletionChoiceMessageOpenAISystemMessageParamContentUnionMember1: TypeAlias = Annotated[ + Union[ + OpenAIChatCompletionChoiceMessageOpenAISystemMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam, + OpenAIChatCompletionChoiceMessageOpenAISystemMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam, + ], + PropertyInfo(discriminator="type"), +] + + +class 
OpenAIChatCompletionChoiceMessageOpenAISystemMessageParam(BaseModel): + content: Union[str, List[OpenAIChatCompletionChoiceMessageOpenAISystemMessageParamContentUnionMember1]] + """The content of the "system prompt". + + If multiple system messages are provided, they are concatenated. The underlying + Llama Stack code may also add other system messages (for example, for formatting + tool definitions). + """ + + role: Literal["system"] + """Must be "system" to identify this as a system message""" + + name: Optional[str] = None + """(Optional) The name of the system message participant.""" + + +class OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam( + BaseModel +): + text: str + + type: Literal["text"] + + +class OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL( + BaseModel +): + url: str + + detail: Optional[str] = None + + +class OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam( + BaseModel +): + image_url: OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL + + type: Literal["image_url"] + + +OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParamContentUnionMember1: TypeAlias = Annotated[ + Union[ + OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam, + OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam, + ], + PropertyInfo(discriminator="type"), +] + + +class OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParamToolCallFunction(BaseModel): + arguments: Optional[str] = None + + name: Optional[str] = None + + +class OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParamToolCall(BaseModel): + type: Literal["function"] + + id: Optional[str] = None + + function: Optional[OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParamToolCallFunction] = None + + index: Optional[int] = None + + +class OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(BaseModel): + role: Literal["assistant"] + """Must be "assistant" to identify this as the model's response""" + + content: Union[str, List[OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParamContentUnionMember1], None] = ( + None + ) + """The content of the model's response""" + + name: Optional[str] = None + """(Optional) The name of the assistant message participant.""" + + tool_calls: Optional[List[OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParamToolCall]] = None + """List of tool calls. 
Each tool call is an OpenAIChatCompletionToolCall object.""" + + +class OpenAIChatCompletionChoiceMessageOpenAIToolMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam( + BaseModel +): + text: str + + type: Literal["text"] + + +class OpenAIChatCompletionChoiceMessageOpenAIToolMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL( + BaseModel +): + url: str + + detail: Optional[str] = None + + +class OpenAIChatCompletionChoiceMessageOpenAIToolMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam( + BaseModel +): + image_url: OpenAIChatCompletionChoiceMessageOpenAIToolMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL + + type: Literal["image_url"] + + +OpenAIChatCompletionChoiceMessageOpenAIToolMessageParamContentUnionMember1: TypeAlias = Annotated[ + Union[ + OpenAIChatCompletionChoiceMessageOpenAIToolMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam, + OpenAIChatCompletionChoiceMessageOpenAIToolMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam, + ], + PropertyInfo(discriminator="type"), +] + + +class OpenAIChatCompletionChoiceMessageOpenAIToolMessageParam(BaseModel): + content: Union[str, List[OpenAIChatCompletionChoiceMessageOpenAIToolMessageParamContentUnionMember1]] + """The response content from the tool""" + + role: Literal["tool"] + """Must be "tool" to identify this as a tool response""" + + tool_call_id: str + """Unique identifier for the tool call this response is for""" + + +class OpenAIChatCompletionChoiceMessageOpenAIDeveloperMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam( + BaseModel +): + text: str + + type: Literal["text"] + + +class OpenAIChatCompletionChoiceMessageOpenAIDeveloperMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL( + BaseModel +): + url: str + + detail: Optional[str] = None + + +class OpenAIChatCompletionChoiceMessageOpenAIDeveloperMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam( + BaseModel +): + image_url: OpenAIChatCompletionChoiceMessageOpenAIDeveloperMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParamImageURL + + type: Literal["image_url"] + + +OpenAIChatCompletionChoiceMessageOpenAIDeveloperMessageParamContentUnionMember1: TypeAlias = Annotated[ + Union[ + OpenAIChatCompletionChoiceMessageOpenAIDeveloperMessageParamContentUnionMember1OpenAIChatCompletionContentPartTextParam, + OpenAIChatCompletionChoiceMessageOpenAIDeveloperMessageParamContentUnionMember1OpenAIChatCompletionContentPartImageParam, + ], + PropertyInfo(discriminator="type"), +] + + +class OpenAIChatCompletionChoiceMessageOpenAIDeveloperMessageParam(BaseModel): + content: Union[str, List[OpenAIChatCompletionChoiceMessageOpenAIDeveloperMessageParamContentUnionMember1]] + """The content of the developer message""" + + role: Literal["developer"] + """Must be "developer" to identify this as a developer message""" + + name: Optional[str] = None + """(Optional) The name of the developer message participant.""" + + +OpenAIChatCompletionChoiceMessage: TypeAlias = Annotated[ + Union[ + OpenAIChatCompletionChoiceMessageOpenAIUserMessageParam, + OpenAIChatCompletionChoiceMessageOpenAISystemMessageParam, + OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam, + OpenAIChatCompletionChoiceMessageOpenAIToolMessageParam, + OpenAIChatCompletionChoiceMessageOpenAIDeveloperMessageParam, + ], + PropertyInfo(discriminator="role"), +] + + +class 
OpenAIChatCompletionChoiceLogprobsContentTopLogprob(BaseModel): + token: str + + logprob: float + + bytes: Optional[List[int]] = None + + +class OpenAIChatCompletionChoiceLogprobsContent(BaseModel): + token: str + + logprob: float + + top_logprobs: List[OpenAIChatCompletionChoiceLogprobsContentTopLogprob] + + bytes: Optional[List[int]] = None + + +class OpenAIChatCompletionChoiceLogprobsRefusalTopLogprob(BaseModel): + token: str + + logprob: float + + bytes: Optional[List[int]] = None + + +class OpenAIChatCompletionChoiceLogprobsRefusal(BaseModel): + token: str + + logprob: float + + top_logprobs: List[OpenAIChatCompletionChoiceLogprobsRefusalTopLogprob] + + bytes: Optional[List[int]] = None + + +class OpenAIChatCompletionChoiceLogprobs(BaseModel): + content: Optional[List[OpenAIChatCompletionChoiceLogprobsContent]] = None + """(Optional) The log probabilities for the tokens in the message""" + + refusal: Optional[List[OpenAIChatCompletionChoiceLogprobsRefusal]] = None + """(Optional) The log probabilities for the tokens in the message""" + + +class OpenAIChatCompletionChoice(BaseModel): + finish_reason: str + """The reason the model stopped generating""" + + index: int + """The index of the choice""" + + message: OpenAIChatCompletionChoiceMessage + """The message from the model""" + + logprobs: Optional[OpenAIChatCompletionChoiceLogprobs] = None + """(Optional) The log probabilities for the tokens in the message""" + + +class OpenAIChatCompletion(BaseModel): + id: str + """The ID of the chat completion""" + + choices: List[OpenAIChatCompletionChoice] + """List of choices""" + + created: int + """The Unix timestamp in seconds when the chat completion was created""" + + model: str + """The model that was used to generate the chat completion""" + + object: Literal["chat.completion"] + """The object type, which will be "chat.completion" """ + + +CompletionCreateResponse: TypeAlias = Union[OpenAIChatCompletion, ChatCompletionChunk] diff --git a/src/llama_stack_client/types/chat_completion_chunk.py b/src/llama_stack_client/types/chat_completion_chunk.py new file mode 100644 index 00000000..7d74663a --- /dev/null +++ b/src/llama_stack_client/types/chat_completion_chunk.py @@ -0,0 +1,124 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
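The streaming overload of chat.completions.create pairs with the ChatCompletionChunk model defined in the new chat_completion_chunk.py below. As a hedged sketch (not part of the generated code), this is roughly how a caller might accumulate the per-choice deltas; it assumes the stream=True overload returns an iterable of ChatCompletionChunk objects and that the client constructor accepts a base_url argument, neither of which is spelled out in this diff.

from llama_stack_client import LlamaStackClient

# Placeholder endpoint; the real base_url depends on your deployment.
client = LlamaStackClient(base_url="http://127.0.0.1:8321")

stream = client.chat.completions.create(
    model="model",
    messages=[{"role": "user", "content": "Write a haiku about GPUs."}],
    stream=True,
)

parts: list[str] = []
for chunk in stream:  # assumed: each item is a ChatCompletionChunk
    delta = chunk.choices[0].delta
    if delta.content is not None:  # ChoiceDelta.content is Optional[str]
        parts.append(delta.content)
print("".join(parts))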
+ +from typing import List, Optional +from typing_extensions import Literal + +from .._models import BaseModel + +__all__ = [ + "ChatCompletionChunk", + "Choice", + "ChoiceDelta", + "ChoiceDeltaToolCall", + "ChoiceDeltaToolCallFunction", + "ChoiceLogprobs", + "ChoiceLogprobsContent", + "ChoiceLogprobsContentTopLogprob", + "ChoiceLogprobsRefusal", + "ChoiceLogprobsRefusalTopLogprob", +] + + +class ChoiceDeltaToolCallFunction(BaseModel): + arguments: Optional[str] = None + + name: Optional[str] = None + + +class ChoiceDeltaToolCall(BaseModel): + type: Literal["function"] + + id: Optional[str] = None + + function: Optional[ChoiceDeltaToolCallFunction] = None + + index: Optional[int] = None + + +class ChoiceDelta(BaseModel): + content: Optional[str] = None + """(Optional) The content of the delta""" + + refusal: Optional[str] = None + """(Optional) The refusal of the delta""" + + role: Optional[str] = None + """(Optional) The role of the delta""" + + tool_calls: Optional[List[ChoiceDeltaToolCall]] = None + """(Optional) The tool calls of the delta""" + + +class ChoiceLogprobsContentTopLogprob(BaseModel): + token: str + + logprob: float + + bytes: Optional[List[int]] = None + + +class ChoiceLogprobsContent(BaseModel): + token: str + + logprob: float + + top_logprobs: List[ChoiceLogprobsContentTopLogprob] + + bytes: Optional[List[int]] = None + + +class ChoiceLogprobsRefusalTopLogprob(BaseModel): + token: str + + logprob: float + + bytes: Optional[List[int]] = None + + +class ChoiceLogprobsRefusal(BaseModel): + token: str + + logprob: float + + top_logprobs: List[ChoiceLogprobsRefusalTopLogprob] + + bytes: Optional[List[int]] = None + + +class ChoiceLogprobs(BaseModel): + content: Optional[List[ChoiceLogprobsContent]] = None + """(Optional) The log probabilities for the tokens in the message""" + + refusal: Optional[List[ChoiceLogprobsRefusal]] = None + """(Optional) The log probabilities for the tokens in the message""" + + +class Choice(BaseModel): + delta: ChoiceDelta + """The delta from the chunk""" + + finish_reason: str + """The reason the model stopped generating""" + + index: int + """The index of the choice""" + + logprobs: Optional[ChoiceLogprobs] = None + """(Optional) The log probabilities for the tokens in the message""" + + +class ChatCompletionChunk(BaseModel): + id: str + """The ID of the chat completion""" + + choices: List[Choice] + """List of choices""" + + created: int + """The Unix timestamp in seconds when the chat completion was created""" + + model: str + """The model that was used to generate the chat completion""" + + object: Literal["chat.completion.chunk"] + """The object type, which will be "chat.completion.chunk" """ diff --git a/src/llama_stack_client/types/completion_create_params.py b/src/llama_stack_client/types/completion_create_params.py new file mode 100644 index 00000000..a92b733e --- /dev/null +++ b/src/llama_stack_client/types/completion_create_params.py @@ -0,0 +1,79 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing import Dict, List, Union, Iterable +from typing_extensions import Literal, Required, TypedDict + +__all__ = ["CompletionCreateParamsBase", "CompletionCreateParamsNonStreaming", "CompletionCreateParamsStreaming"] + + +class CompletionCreateParamsBase(TypedDict, total=False): + model: Required[str] + """The identifier of the model to use. + + The model must be registered with Llama Stack and available via the /models + endpoint. 
+ """ + + prompt: Required[Union[str, List[str], Iterable[int], Iterable[Iterable[int]]]] + """The prompt to generate a completion for""" + + best_of: int + """(Optional) The number of completions to generate""" + + echo: bool + """(Optional) Whether to echo the prompt""" + + frequency_penalty: float + """(Optional) The penalty for repeated tokens""" + + guided_choice: List[str] + + logit_bias: Dict[str, float] + """(Optional) The logit bias to use""" + + logprobs: bool + """(Optional) The log probabilities to use""" + + max_tokens: int + """(Optional) The maximum number of tokens to generate""" + + n: int + """(Optional) The number of completions to generate""" + + presence_penalty: float + """(Optional) The penalty for repeated tokens""" + + prompt_logprobs: int + + seed: int + """(Optional) The seed to use""" + + stop: Union[str, List[str]] + """(Optional) The stop tokens to use""" + + stream_options: Dict[str, Union[bool, float, str, Iterable[object], object, None]] + """(Optional) The stream options to use""" + + temperature: float + """(Optional) The temperature to use""" + + top_p: float + """(Optional) The top p to use""" + + user: str + """(Optional) The user to use""" + + +class CompletionCreateParamsNonStreaming(CompletionCreateParamsBase, total=False): + stream: Literal[False] + """(Optional) Whether to stream the response""" + + +class CompletionCreateParamsStreaming(CompletionCreateParamsBase): + stream: Required[Literal[True]] + """(Optional) Whether to stream the response""" + + +CompletionCreateParams = Union[CompletionCreateParamsNonStreaming, CompletionCreateParamsStreaming] diff --git a/src/llama_stack_client/types/completion_create_response.py b/src/llama_stack_client/types/completion_create_response.py new file mode 100644 index 00000000..0c43e68a --- /dev/null +++ b/src/llama_stack_client/types/completion_create_response.py @@ -0,0 +1,86 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import List, Optional +from typing_extensions import Literal + +from .._models import BaseModel + +__all__ = [ + "CompletionCreateResponse", + "Choice", + "ChoiceLogprobs", + "ChoiceLogprobsContent", + "ChoiceLogprobsContentTopLogprob", + "ChoiceLogprobsRefusal", + "ChoiceLogprobsRefusalTopLogprob", +] + + +class ChoiceLogprobsContentTopLogprob(BaseModel): + token: str + + logprob: float + + bytes: Optional[List[int]] = None + + +class ChoiceLogprobsContent(BaseModel): + token: str + + logprob: float + + top_logprobs: List[ChoiceLogprobsContentTopLogprob] + + bytes: Optional[List[int]] = None + + +class ChoiceLogprobsRefusalTopLogprob(BaseModel): + token: str + + logprob: float + + bytes: Optional[List[int]] = None + + +class ChoiceLogprobsRefusal(BaseModel): + token: str + + logprob: float + + top_logprobs: List[ChoiceLogprobsRefusalTopLogprob] + + bytes: Optional[List[int]] = None + + +class ChoiceLogprobs(BaseModel): + content: Optional[List[ChoiceLogprobsContent]] = None + """(Optional) The log probabilities for the tokens in the message""" + + refusal: Optional[List[ChoiceLogprobsRefusal]] = None + """(Optional) The log probabilities for the tokens in the message""" + + +class Choice(BaseModel): + finish_reason: str + + index: int + + text: str + + logprobs: Optional[ChoiceLogprobs] = None + """ + The log probabilities for the tokens in the message from an OpenAI-compatible + chat completion response. 
+ """ + + +class CompletionCreateResponse(BaseModel): + id: str + + choices: List[Choice] + + created: int + + model: str + + object: Literal["text_completion"] diff --git a/src/llama_stack_client/types/health_info.py b/src/llama_stack_client/types/health_info.py index f410c8d2..3441ddd1 100644 --- a/src/llama_stack_client/types/health_info.py +++ b/src/llama_stack_client/types/health_info.py @@ -1,5 +1,6 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. +from typing_extensions import Literal from .._models import BaseModel @@ -7,4 +8,4 @@ class HealthInfo(BaseModel): - status: str + status: Literal["OK", "Error", "Not Implemented"] diff --git a/src/llama_stack_client/types/list_benchmarks_response.py b/src/llama_stack_client/types/list_benchmarks_response.py index 4185f3d1..f265f130 100644 --- a/src/llama_stack_client/types/list_benchmarks_response.py +++ b/src/llama_stack_client/types/list_benchmarks_response.py @@ -1,6 +1,5 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - from .._models import BaseModel from .benchmark_list_response import BenchmarkListResponse diff --git a/src/llama_stack_client/types/list_datasets_response.py b/src/llama_stack_client/types/list_datasets_response.py index 635c9c88..5a897f78 100644 --- a/src/llama_stack_client/types/list_datasets_response.py +++ b/src/llama_stack_client/types/list_datasets_response.py @@ -1,6 +1,5 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - from .._models import BaseModel from .dataset_list_response import DatasetListResponse diff --git a/src/llama_stack_client/types/list_models_response.py b/src/llama_stack_client/types/list_models_response.py index 32dcc9d9..a36896b8 100644 --- a/src/llama_stack_client/types/list_models_response.py +++ b/src/llama_stack_client/types/list_models_response.py @@ -1,6 +1,5 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - from .._models import BaseModel from .model_list_response import ModelListResponse diff --git a/src/llama_stack_client/types/list_providers_response.py b/src/llama_stack_client/types/list_providers_response.py index cbe69e3b..4904c0b1 100644 --- a/src/llama_stack_client/types/list_providers_response.py +++ b/src/llama_stack_client/types/list_providers_response.py @@ -1,6 +1,5 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - from .._models import BaseModel from .provider_list_response import ProviderListResponse diff --git a/src/llama_stack_client/types/list_routes_response.py b/src/llama_stack_client/types/list_routes_response.py index 02cbd1e3..59e8392b 100644 --- a/src/llama_stack_client/types/list_routes_response.py +++ b/src/llama_stack_client/types/list_routes_response.py @@ -1,6 +1,5 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - from .._models import BaseModel from .route_list_response import RouteListResponse diff --git a/src/llama_stack_client/types/list_scoring_functions_response.py b/src/llama_stack_client/types/list_scoring_functions_response.py index 845c37be..2c044ba1 100644 --- a/src/llama_stack_client/types/list_scoring_functions_response.py +++ b/src/llama_stack_client/types/list_scoring_functions_response.py @@ -1,6 +1,5 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
- from .._models import BaseModel from .scoring_function_list_response import ScoringFunctionListResponse diff --git a/src/llama_stack_client/types/list_shields_response.py b/src/llama_stack_client/types/list_shields_response.py index 35d1650d..fabbc9da 100644 --- a/src/llama_stack_client/types/list_shields_response.py +++ b/src/llama_stack_client/types/list_shields_response.py @@ -1,6 +1,5 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - from .._models import BaseModel from .shield_list_response import ShieldListResponse diff --git a/src/llama_stack_client/types/list_tool_groups_response.py b/src/llama_stack_client/types/list_tool_groups_response.py index fec39d2f..6433b164 100644 --- a/src/llama_stack_client/types/list_tool_groups_response.py +++ b/src/llama_stack_client/types/list_tool_groups_response.py @@ -1,6 +1,5 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - from .._models import BaseModel from .toolgroup_list_response import ToolgroupListResponse diff --git a/src/llama_stack_client/types/list_tools_response.py b/src/llama_stack_client/types/list_tools_response.py index 02013c4f..c9b4ec6b 100644 --- a/src/llama_stack_client/types/list_tools_response.py +++ b/src/llama_stack_client/types/list_tools_response.py @@ -1,6 +1,5 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - from .._models import BaseModel from .tool_list_response import ToolListResponse diff --git a/src/llama_stack_client/types/list_vector_dbs_response.py b/src/llama_stack_client/types/list_vector_dbs_response.py index 7d64c3d6..fede6c42 100644 --- a/src/llama_stack_client/types/list_vector_dbs_response.py +++ b/src/llama_stack_client/types/list_vector_dbs_response.py @@ -1,6 +1,5 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - from .._models import BaseModel from .vector_db_list_response import VectorDBListResponse diff --git a/src/llama_stack_client/types/post_training_job.py b/src/llama_stack_client/types/post_training_job.py index 8cd98126..d0ba5fce 100644 --- a/src/llama_stack_client/types/post_training_job.py +++ b/src/llama_stack_client/types/post_training_job.py @@ -1,6 +1,5 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
- from .._models import BaseModel __all__ = ["PostTrainingJob"] diff --git a/src/llama_stack_client/types/post_training_preference_optimize_params.py b/src/llama_stack_client/types/post_training_preference_optimize_params.py index 0d79173a..11392907 100644 --- a/src/llama_stack_client/types/post_training_preference_optimize_params.py +++ b/src/llama_stack_client/types/post_training_preference_optimize_params.py @@ -10,8 +10,8 @@ "AlgorithmConfig", "TrainingConfig", "TrainingConfigDataConfig", - "TrainingConfigOptimizerConfig", "TrainingConfigEfficiencyConfig", + "TrainingConfigOptimizerConfig", ] @@ -55,16 +55,6 @@ class TrainingConfigDataConfig(TypedDict, total=False): validation_dataset_id: str -class TrainingConfigOptimizerConfig(TypedDict, total=False): - lr: Required[float] - - num_warmup_steps: Required[int] - - optimizer_type: Required[Literal["adam", "adamw", "sgd"]] - - weight_decay: Required[float] - - class TrainingConfigEfficiencyConfig(TypedDict, total=False): enable_activation_checkpointing: bool @@ -75,19 +65,29 @@ class TrainingConfigEfficiencyConfig(TypedDict, total=False): memory_efficient_fsdp_wrap: bool -class TrainingConfig(TypedDict, total=False): - data_config: Required[TrainingConfigDataConfig] +class TrainingConfigOptimizerConfig(TypedDict, total=False): + lr: Required[float] + + num_warmup_steps: Required[int] + optimizer_type: Required[Literal["adam", "adamw", "sgd"]] + + weight_decay: Required[float] + + +class TrainingConfig(TypedDict, total=False): gradient_accumulation_steps: Required[int] max_steps_per_epoch: Required[int] - max_validation_steps: Required[int] - n_epochs: Required[int] - optimizer_config: Required[TrainingConfigOptimizerConfig] + data_config: TrainingConfigDataConfig dtype: str efficiency_config: TrainingConfigEfficiencyConfig + + max_validation_steps: int + + optimizer_config: TrainingConfigOptimizerConfig diff --git a/src/llama_stack_client/types/post_training_supervised_fine_tune_params.py b/src/llama_stack_client/types/post_training_supervised_fine_tune_params.py index fa18742a..ad298817 100644 --- a/src/llama_stack_client/types/post_training_supervised_fine_tune_params.py +++ b/src/llama_stack_client/types/post_training_supervised_fine_tune_params.py @@ -11,8 +11,8 @@ "PostTrainingSupervisedFineTuneParams", "TrainingConfig", "TrainingConfigDataConfig", - "TrainingConfigOptimizerConfig", "TrainingConfigEfficiencyConfig", + "TrainingConfigOptimizerConfig", ] @@ -23,14 +23,14 @@ class PostTrainingSupervisedFineTuneParams(TypedDict, total=False): logger_config: Required[Dict[str, Union[bool, float, str, Iterable[object], object, None]]] - model: Required[str] - training_config: Required[TrainingConfig] algorithm_config: AlgorithmConfigParam checkpoint_dir: str + model: str + class TrainingConfigDataConfig(TypedDict, total=False): batch_size: Required[int] @@ -48,16 +48,6 @@ class TrainingConfigDataConfig(TypedDict, total=False): validation_dataset_id: str -class TrainingConfigOptimizerConfig(TypedDict, total=False): - lr: Required[float] - - num_warmup_steps: Required[int] - - optimizer_type: Required[Literal["adam", "adamw", "sgd"]] - - weight_decay: Required[float] - - class TrainingConfigEfficiencyConfig(TypedDict, total=False): enable_activation_checkpointing: bool @@ -68,19 +58,29 @@ class TrainingConfigEfficiencyConfig(TypedDict, total=False): memory_efficient_fsdp_wrap: bool -class TrainingConfig(TypedDict, total=False): - data_config: Required[TrainingConfigDataConfig] +class TrainingConfigOptimizerConfig(TypedDict, total=False): + 
lr: Required[float] + + num_warmup_steps: Required[int] + + optimizer_type: Required[Literal["adam", "adamw", "sgd"]] + weight_decay: Required[float] + + +class TrainingConfig(TypedDict, total=False): gradient_accumulation_steps: Required[int] max_steps_per_epoch: Required[int] - max_validation_steps: Required[int] - n_epochs: Required[int] - optimizer_config: Required[TrainingConfigOptimizerConfig] + data_config: TrainingConfigDataConfig dtype: str efficiency_config: TrainingConfigEfficiencyConfig + + max_validation_steps: int + + optimizer_config: TrainingConfigOptimizerConfig diff --git a/src/llama_stack_client/types/provider_info.py b/src/llama_stack_client/types/provider_info.py index 3e0d0d85..c9c748cc 100644 --- a/src/llama_stack_client/types/provider_info.py +++ b/src/llama_stack_client/types/provider_info.py @@ -12,6 +12,8 @@ class ProviderInfo(BaseModel): config: Dict[str, Union[bool, float, str, List[object], object, None]] + health: Dict[str, Union[bool, float, str, List[object], object, None]] + provider_id: str provider_type: str diff --git a/src/llama_stack_client/types/query_spans_response.py b/src/llama_stack_client/types/query_spans_response.py index 5c54e623..488a4331 100644 --- a/src/llama_stack_client/types/query_spans_response.py +++ b/src/llama_stack_client/types/query_spans_response.py @@ -1,6 +1,5 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - from .._models import BaseModel from .telemetry_query_spans_response import TelemetryQuerySpansResponse diff --git a/src/llama_stack_client/types/shared/agent_config.py b/src/llama_stack_client/types/shared/agent_config.py index 04997ac4..eb116159 100644 --- a/src/llama_stack_client/types/shared/agent_config.py +++ b/src/llama_stack_client/types/shared/agent_config.py @@ -51,21 +51,27 @@ class ToolgroupAgentToolGroupWithArgs(BaseModel): class AgentConfig(BaseModel): instructions: str + """The system instructions for the agent""" model: str + """The model identifier to use for the agent""" client_tools: Optional[List[ToolDef]] = None enable_session_persistence: Optional[bool] = None + """Optional flag indicating whether session data has to be persisted""" input_shields: Optional[List[str]] = None max_infer_iters: Optional[int] = None + name: Optional[str] = None + """Optional name for the agent, used in telemetry and identification""" + output_shields: Optional[List[str]] = None response_format: Optional[ResponseFormat] = None - """Configuration for JSON schema-guided response generation.""" + """Optional response format configuration""" sampling_params: Optional[SamplingParams] = None """Sampling parameters.""" diff --git a/src/llama_stack_client/types/shared/query_config.py b/src/llama_stack_client/types/shared/query_config.py index 1bfd872a..679f7dcb 100644 --- a/src/llama_stack_client/types/shared/query_config.py +++ b/src/llama_stack_client/types/shared/query_config.py @@ -1,6 +1,5 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
- from ..._models import BaseModel from .query_generator_config import QueryGeneratorConfig diff --git a/src/llama_stack_client/types/shared_params/agent_config.py b/src/llama_stack_client/types/shared_params/agent_config.py index f07efa39..5cebec3f 100644 --- a/src/llama_stack_client/types/shared_params/agent_config.py +++ b/src/llama_stack_client/types/shared_params/agent_config.py @@ -52,21 +52,27 @@ class ToolgroupAgentToolGroupWithArgs(TypedDict, total=False): class AgentConfig(TypedDict, total=False): instructions: Required[str] + """The system instructions for the agent""" model: Required[str] + """The model identifier to use for the agent""" client_tools: Iterable[ToolDefParam] enable_session_persistence: bool + """Optional flag indicating whether session data has to be persisted""" input_shields: List[str] max_infer_iters: int + name: str + """Optional name for the agent, used in telemetry and identification""" + output_shields: List[str] response_format: ResponseFormat - """Configuration for JSON schema-guided response generation.""" + """Optional response format configuration""" sampling_params: SamplingParams """Sampling parameters.""" diff --git a/src/llama_stack_client/types/version_info.py b/src/llama_stack_client/types/version_info.py index 3e877545..5fc5bbb4 100644 --- a/src/llama_stack_client/types/version_info.py +++ b/src/llama_stack_client/types/version_info.py @@ -1,6 +1,5 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - from .._models import BaseModel __all__ = ["VersionInfo"] diff --git a/tests/api_resources/chat/__init__.py b/tests/api_resources/chat/__init__.py new file mode 100644 index 00000000..fd8019a9 --- /dev/null +++ b/tests/api_resources/chat/__init__.py @@ -0,0 +1 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. diff --git a/tests/api_resources/chat/test_completions.py b/tests/api_resources/chat/test_completions.py new file mode 100644 index 00000000..5c3d96c3 --- /dev/null +++ b/tests/api_resources/chat/test_completions.py @@ -0,0 +1,362 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
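The generated tests below drive chat.completions.create with plain dicts. For orientation, here is a hedged sketch of a typed, non-streaming call built from the TypedDicts in completion_create_params.py and the json_schema response format added earlier in this diff; the import paths simply mirror the file locations shown above, the base_url is a placeholder, and the schema is a made-up example.

from llama_stack_client import LlamaStackClient
from llama_stack_client.types.chat.completion_create_params import (
    MessageOpenAISystemMessageParam,
    MessageOpenAIUserMessageParam,
)
from llama_stack_client.types.chat.completion_create_response import OpenAIChatCompletion

client = LlamaStackClient(base_url="http://127.0.0.1:8321")  # placeholder endpoint

system: MessageOpenAISystemMessageParam = {
    "role": "system",
    "content": "You extract structured data from text.",
}
user: MessageOpenAIUserMessageParam = {
    "role": "user",
    "content": [{"type": "text", "text": "Alice is 30 years old."}],
}

completion = client.chat.completions.create(
    model="model",
    messages=[system, user],
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "person",  # example schema, not defined anywhere in this diff
            "schema": {
                "type": "object",
                "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
            },
        },
    },
)

# The non-streaming overload is expected to resolve to OpenAIChatCompletion.
if isinstance(completion, OpenAIChatCompletion):
    print(completion.choices[0].message)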
+ +from __future__ import annotations + +import os +from typing import Any, cast + +import pytest + +from tests.utils import assert_matches_type +from llama_stack_client import LlamaStackClient, AsyncLlamaStackClient +from llama_stack_client.types.chat import CompletionCreateResponse + +base_url = os.environ.get("TEST_API_BASE_URL", "http://127.0.0.1:4010") + + +class TestCompletions: + parametrize = pytest.mark.parametrize("client", [False, True], indirect=True, ids=["loose", "strict"]) + + @parametrize + def test_method_create_overload_1(self, client: LlamaStackClient) -> None: + completion = client.chat.completions.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", + ) + assert_matches_type(CompletionCreateResponse, completion, path=["response"]) + + @parametrize + def test_method_create_with_all_params_overload_1(self, client: LlamaStackClient) -> None: + completion = client.chat.completions.create( + messages=[ + { + "content": "string", + "role": "user", + "name": "name", + } + ], + model="model", + frequency_penalty=0, + function_call="string", + functions=[{"foo": True}], + logit_bias={"foo": 0}, + logprobs=True, + max_completion_tokens=0, + max_tokens=0, + n=0, + parallel_tool_calls=True, + presence_penalty=0, + response_format={"type": "text"}, + seed=0, + stop="string", + stream=False, + stream_options={"foo": True}, + temperature=0, + tool_choice="string", + tools=[{"foo": True}], + top_logprobs=0, + top_p=0, + user="user", + ) + assert_matches_type(CompletionCreateResponse, completion, path=["response"]) + + @parametrize + def test_raw_response_create_overload_1(self, client: LlamaStackClient) -> None: + response = client.chat.completions.with_raw_response.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", + ) + + assert response.is_closed is True + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + completion = response.parse() + assert_matches_type(CompletionCreateResponse, completion, path=["response"]) + + @parametrize + def test_streaming_response_create_overload_1(self, client: LlamaStackClient) -> None: + with client.chat.completions.with_streaming_response.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", + ) as response: + assert not response.is_closed + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + + completion = response.parse() + assert_matches_type(CompletionCreateResponse, completion, path=["response"]) + + assert cast(Any, response.is_closed) is True + + @parametrize + def test_method_create_overload_2(self, client: LlamaStackClient) -> None: + completion_stream = client.chat.completions.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", + stream=True, + ) + completion_stream.response.close() + + @parametrize + def test_method_create_with_all_params_overload_2(self, client: LlamaStackClient) -> None: + completion_stream = client.chat.completions.create( + messages=[ + { + "content": "string", + "role": "user", + "name": "name", + } + ], + model="model", + stream=True, + frequency_penalty=0, + function_call="string", + functions=[{"foo": True}], + logit_bias={"foo": 0}, + logprobs=True, + max_completion_tokens=0, + max_tokens=0, + n=0, + parallel_tool_calls=True, + presence_penalty=0, + response_format={"type": "text"}, + seed=0, + stop="string", + stream_options={"foo": True}, + temperature=0, + tool_choice="string", + tools=[{"foo": True}], + 
top_logprobs=0, + top_p=0, + user="user", + ) + completion_stream.response.close() + + @parametrize + def test_raw_response_create_overload_2(self, client: LlamaStackClient) -> None: + response = client.chat.completions.with_raw_response.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", + stream=True, + ) + + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + stream = response.parse() + stream.close() + + @parametrize + def test_streaming_response_create_overload_2(self, client: LlamaStackClient) -> None: + with client.chat.completions.with_streaming_response.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", + stream=True, + ) as response: + assert not response.is_closed + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + + stream = response.parse() + stream.close() + + assert cast(Any, response.is_closed) is True + + +class TestAsyncCompletions: + parametrize = pytest.mark.parametrize("async_client", [False, True], indirect=True, ids=["loose", "strict"]) + + @parametrize + async def test_method_create_overload_1(self, async_client: AsyncLlamaStackClient) -> None: + completion = await async_client.chat.completions.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", + ) + assert_matches_type(CompletionCreateResponse, completion, path=["response"]) + + @parametrize + async def test_method_create_with_all_params_overload_1(self, async_client: AsyncLlamaStackClient) -> None: + completion = await async_client.chat.completions.create( + messages=[ + { + "content": "string", + "role": "user", + "name": "name", + } + ], + model="model", + frequency_penalty=0, + function_call="string", + functions=[{"foo": True}], + logit_bias={"foo": 0}, + logprobs=True, + max_completion_tokens=0, + max_tokens=0, + n=0, + parallel_tool_calls=True, + presence_penalty=0, + response_format={"type": "text"}, + seed=0, + stop="string", + stream=False, + stream_options={"foo": True}, + temperature=0, + tool_choice="string", + tools=[{"foo": True}], + top_logprobs=0, + top_p=0, + user="user", + ) + assert_matches_type(CompletionCreateResponse, completion, path=["response"]) + + @parametrize + async def test_raw_response_create_overload_1(self, async_client: AsyncLlamaStackClient) -> None: + response = await async_client.chat.completions.with_raw_response.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", + ) + + assert response.is_closed is True + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + completion = await response.parse() + assert_matches_type(CompletionCreateResponse, completion, path=["response"]) + + @parametrize + async def test_streaming_response_create_overload_1(self, async_client: AsyncLlamaStackClient) -> None: + async with async_client.chat.completions.with_streaming_response.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", + ) as response: + assert not response.is_closed + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + + completion = await response.parse() + assert_matches_type(CompletionCreateResponse, completion, path=["response"]) + + assert cast(Any, response.is_closed) is True + + @parametrize + async def test_method_create_overload_2(self, async_client: AsyncLlamaStackClient) -> None: + completion_stream = await async_client.chat.completions.create( + messages=[ + { + "content": "string", + 
"role": "user", + } + ], + model="model", + stream=True, + ) + await completion_stream.response.aclose() + + @parametrize + async def test_method_create_with_all_params_overload_2(self, async_client: AsyncLlamaStackClient) -> None: + completion_stream = await async_client.chat.completions.create( + messages=[ + { + "content": "string", + "role": "user", + "name": "name", + } + ], + model="model", + stream=True, + frequency_penalty=0, + function_call="string", + functions=[{"foo": True}], + logit_bias={"foo": 0}, + logprobs=True, + max_completion_tokens=0, + max_tokens=0, + n=0, + parallel_tool_calls=True, + presence_penalty=0, + response_format={"type": "text"}, + seed=0, + stop="string", + stream_options={"foo": True}, + temperature=0, + tool_choice="string", + tools=[{"foo": True}], + top_logprobs=0, + top_p=0, + user="user", + ) + await completion_stream.response.aclose() + + @parametrize + async def test_raw_response_create_overload_2(self, async_client: AsyncLlamaStackClient) -> None: + response = await async_client.chat.completions.with_raw_response.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", + stream=True, + ) + + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + stream = await response.parse() + await stream.close() + + @parametrize + async def test_streaming_response_create_overload_2(self, async_client: AsyncLlamaStackClient) -> None: + async with async_client.chat.completions.with_streaming_response.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", + stream=True, + ) as response: + assert not response.is_closed + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + + stream = await response.parse() + await stream.close() + + assert cast(Any, response.is_closed) is True diff --git a/tests/api_resources/test_agents.py b/tests/api_resources/test_agents.py index 235d6258..1c0478a6 100644 --- a/tests/api_resources/test_agents.py +++ b/tests/api_resources/test_agents.py @@ -52,6 +52,7 @@ def test_method_create_with_all_params(self, client: LlamaStackClient) -> None: "enable_session_persistence": True, "input_shields": ["string"], "max_infer_iters": 0, + "name": "name", "output_shields": ["string"], "response_format": { "json_schema": {"foo": True}, @@ -182,6 +183,7 @@ async def test_method_create_with_all_params(self, async_client: AsyncLlamaStack "enable_session_persistence": True, "input_shields": ["string"], "max_infer_iters": 0, + "name": "name", "output_shields": ["string"], "response_format": { "json_schema": {"foo": True}, diff --git a/tests/api_resources/test_completions.py b/tests/api_resources/test_completions.py new file mode 100644 index 00000000..30e15b7b --- /dev/null +++ b/tests/api_resources/test_completions.py @@ -0,0 +1,262 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
+
+from __future__ import annotations
+
+import os
+from typing import Any, cast
+
+import pytest
+
+from tests.utils import assert_matches_type
+from llama_stack_client import LlamaStackClient, AsyncLlamaStackClient
+from llama_stack_client.types import CompletionCreateResponse
+
+base_url = os.environ.get("TEST_API_BASE_URL", "http://127.0.0.1:4010")
+
+
+class TestCompletions:
+    parametrize = pytest.mark.parametrize("client", [False, True], indirect=True, ids=["loose", "strict"])
+
+    @parametrize
+    def test_method_create_overload_1(self, client: LlamaStackClient) -> None:
+        completion = client.completions.create(
+            model="model",
+            prompt="string",
+        )
+        assert_matches_type(CompletionCreateResponse, completion, path=["response"])
+
+    @parametrize
+    def test_method_create_with_all_params_overload_1(self, client: LlamaStackClient) -> None:
+        completion = client.completions.create(
+            model="model",
+            prompt="string",
+            best_of=0,
+            echo=True,
+            frequency_penalty=0,
+            guided_choice=["string"],
+            logit_bias={"foo": 0},
+            logprobs=True,
+            max_tokens=0,
+            n=0,
+            presence_penalty=0,
+            prompt_logprobs=0,
+            seed=0,
+            stop="string",
+            stream=False,
+            stream_options={"foo": True},
+            temperature=0,
+            top_p=0,
+            user="user",
+        )
+        assert_matches_type(CompletionCreateResponse, completion, path=["response"])
+
+    @parametrize
+    def test_raw_response_create_overload_1(self, client: LlamaStackClient) -> None:
+        response = client.completions.with_raw_response.create(
+            model="model",
+            prompt="string",
+        )
+
+        assert response.is_closed is True
+        assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+        completion = response.parse()
+        assert_matches_type(CompletionCreateResponse, completion, path=["response"])
+
+    @parametrize
+    def test_streaming_response_create_overload_1(self, client: LlamaStackClient) -> None:
+        with client.completions.with_streaming_response.create(
+            model="model",
+            prompt="string",
+        ) as response:
+            assert not response.is_closed
+            assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+
+            completion = response.parse()
+            assert_matches_type(CompletionCreateResponse, completion, path=["response"])
+
+        assert cast(Any, response.is_closed) is True
+
+    @parametrize
+    def test_method_create_overload_2(self, client: LlamaStackClient) -> None:
+        completion_stream = client.completions.create(
+            model="model",
+            prompt="string",
+            stream=True,
+        )
+        completion_stream.response.close()
+
+    @parametrize
+    def test_method_create_with_all_params_overload_2(self, client: LlamaStackClient) -> None:
+        completion_stream = client.completions.create(
+            model="model",
+            prompt="string",
+            stream=True,
+            best_of=0,
+            echo=True,
+            frequency_penalty=0,
+            guided_choice=["string"],
+            logit_bias={"foo": 0},
+            logprobs=True,
+            max_tokens=0,
+            n=0,
+            presence_penalty=0,
+            prompt_logprobs=0,
+            seed=0,
+            stop="string",
+            stream_options={"foo": True},
+            temperature=0,
+            top_p=0,
+            user="user",
+        )
+        completion_stream.response.close()
+
+    @parametrize
+    def test_raw_response_create_overload_2(self, client: LlamaStackClient) -> None:
+        response = client.completions.with_raw_response.create(
+            model="model",
+            prompt="string",
+            stream=True,
+        )
+
+        assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+        stream = response.parse()
+        stream.close()
+
+    @parametrize
+    def test_streaming_response_create_overload_2(self, client: LlamaStackClient) -> None:
+        with client.completions.with_streaming_response.create(
+            model="model",
+            prompt="string",
+            stream=True,
+        ) as response:
+            assert not response.is_closed
+            assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+
+            stream = response.parse()
+            stream.close()
+
+        assert cast(Any, response.is_closed) is True
+
+
+class TestAsyncCompletions:
+    parametrize = pytest.mark.parametrize("async_client", [False, True], indirect=True, ids=["loose", "strict"])
+
+    @parametrize
+    async def test_method_create_overload_1(self, async_client: AsyncLlamaStackClient) -> None:
+        completion = await async_client.completions.create(
+            model="model",
+            prompt="string",
+        )
+        assert_matches_type(CompletionCreateResponse, completion, path=["response"])
+
+    @parametrize
+    async def test_method_create_with_all_params_overload_1(self, async_client: AsyncLlamaStackClient) -> None:
+        completion = await async_client.completions.create(
+            model="model",
+            prompt="string",
+            best_of=0,
+            echo=True,
+            frequency_penalty=0,
+            guided_choice=["string"],
+            logit_bias={"foo": 0},
+            logprobs=True,
+            max_tokens=0,
+            n=0,
+            presence_penalty=0,
+            prompt_logprobs=0,
+            seed=0,
+            stop="string",
+            stream=False,
+            stream_options={"foo": True},
+            temperature=0,
+            top_p=0,
+            user="user",
+        )
+        assert_matches_type(CompletionCreateResponse, completion, path=["response"])
+
+    @parametrize
+    async def test_raw_response_create_overload_1(self, async_client: AsyncLlamaStackClient) -> None:
+        response = await async_client.completions.with_raw_response.create(
+            model="model",
+            prompt="string",
+        )
+
+        assert response.is_closed is True
+        assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+        completion = await response.parse()
+        assert_matches_type(CompletionCreateResponse, completion, path=["response"])
+
+    @parametrize
+    async def test_streaming_response_create_overload_1(self, async_client: AsyncLlamaStackClient) -> None:
+        async with async_client.completions.with_streaming_response.create(
+            model="model",
+            prompt="string",
+        ) as response:
+            assert not response.is_closed
+            assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+
+            completion = await response.parse()
+            assert_matches_type(CompletionCreateResponse, completion, path=["response"])
+
+        assert cast(Any, response.is_closed) is True
+
+    @parametrize
+    async def test_method_create_overload_2(self, async_client: AsyncLlamaStackClient) -> None:
+        completion_stream = await async_client.completions.create(
+            model="model",
+            prompt="string",
+            stream=True,
+        )
+        await completion_stream.response.aclose()
+
+    @parametrize
+    async def test_method_create_with_all_params_overload_2(self, async_client: AsyncLlamaStackClient) -> None:
+        completion_stream = await async_client.completions.create(
+            model="model",
+            prompt="string",
+            stream=True,
+            best_of=0,
+            echo=True,
+            frequency_penalty=0,
+            guided_choice=["string"],
+            logit_bias={"foo": 0},
+            logprobs=True,
+            max_tokens=0,
+            n=0,
+            presence_penalty=0,
+            prompt_logprobs=0,
+            seed=0,
+            stop="string",
+            stream_options={"foo": True},
+            temperature=0,
+            top_p=0,
+            user="user",
+        )
+        await completion_stream.response.aclose()
+
+    @parametrize
+    async def test_raw_response_create_overload_2(self, async_client: AsyncLlamaStackClient) -> None:
+        response = await async_client.completions.with_raw_response.create(
+            model="model",
+            prompt="string",
+            stream=True,
+        )
+
+        assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+        stream = await response.parse()
+        await stream.close()
+
+    @parametrize
+    async def test_streaming_response_create_overload_2(self, async_client: AsyncLlamaStackClient) -> None:
+        async with async_client.completions.with_streaming_response.create(
+            model="model",
+            prompt="string",
+            stream=True,
+        ) as response:
+            assert not response.is_closed
+            assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+
+            stream = await response.parse()
+            await stream.close()
+
+        assert cast(Any, response.is_closed) is True
diff --git a/tests/api_resources/test_post_training.py b/tests/api_resources/test_post_training.py
index 98047e4c..1d0613da 100644
--- a/tests/api_resources/test_post_training.py
+++ b/tests/api_resources/test_post_training.py
@@ -33,22 +33,9 @@ def test_method_preference_optimize(self, client: LlamaStackClient) -> None:
             job_uuid="job_uuid",
             logger_config={"foo": True},
             training_config={
-                "data_config": {
-                    "batch_size": 0,
-                    "data_format": "instruct",
-                    "dataset_id": "dataset_id",
-                    "shuffle": True,
-                },
                 "gradient_accumulation_steps": 0,
                 "max_steps_per_epoch": 0,
-                "max_validation_steps": 0,
                 "n_epochs": 0,
-                "optimizer_config": {
-                    "lr": 0,
-                    "num_warmup_steps": 0,
-                    "optimizer_type": "adam",
-                    "weight_decay": 0,
-                },
             },
         )
         assert_matches_type(PostTrainingJob, post_training, path=["response"])
@@ -67,6 +54,9 @@ def test_method_preference_optimize_with_all_params(self, client: LlamaStackClie
             job_uuid="job_uuid",
             logger_config={"foo": True},
             training_config={
+                "gradient_accumulation_steps": 0,
+                "max_steps_per_epoch": 0,
+                "n_epochs": 0,
                 "data_config": {
                     "batch_size": 0,
                     "data_format": "instruct",
@@ -76,16 +66,6 @@ def test_method_preference_optimize_with_all_params(self, client: LlamaStackClie
                     "train_on_input": True,
                     "validation_dataset_id": "validation_dataset_id",
                 },
-                "gradient_accumulation_steps": 0,
-                "max_steps_per_epoch": 0,
-                "max_validation_steps": 0,
-                "n_epochs": 0,
-                "optimizer_config": {
-                    "lr": 0,
-                    "num_warmup_steps": 0,
-                    "optimizer_type": "adam",
-                    "weight_decay": 0,
-                },
                 "dtype": "dtype",
                 "efficiency_config": {
                     "enable_activation_checkpointing": True,
@@ -93,6 +73,13 @@ def test_method_preference_optimize_with_all_params(self, client: LlamaStackClie
                     "fsdp_cpu_offload": True,
                     "memory_efficient_fsdp_wrap": True,
                 },
+                "max_validation_steps": 0,
+                "optimizer_config": {
+                    "lr": 0,
+                    "num_warmup_steps": 0,
+                    "optimizer_type": "adam",
+                    "weight_decay": 0,
+                },
             },
         )
         assert_matches_type(PostTrainingJob, post_training, path=["response"])
@@ -111,22 +98,9 @@ def test_raw_response_preference_optimize(self, client: LlamaStackClient) -> Non
             job_uuid="job_uuid",
             logger_config={"foo": True},
             training_config={
-                "data_config": {
-                    "batch_size": 0,
-                    "data_format": "instruct",
-                    "dataset_id": "dataset_id",
-                    "shuffle": True,
-                },
                 "gradient_accumulation_steps": 0,
                 "max_steps_per_epoch": 0,
-                "max_validation_steps": 0,
                 "n_epochs": 0,
-                "optimizer_config": {
-                    "lr": 0,
-                    "num_warmup_steps": 0,
-                    "optimizer_type": "adam",
-                    "weight_decay": 0,
-                },
             },
         )

@@ -149,22 +123,9 @@ def test_streaming_response_preference_optimize(self, client: LlamaStackClient)
             job_uuid="job_uuid",
             logger_config={"foo": True},
             training_config={
-                "data_config": {
-                    "batch_size": 0,
-                    "data_format": "instruct",
-                    "dataset_id": "dataset_id",
-                    "shuffle": True,
-                },
                 "gradient_accumulation_steps": 0,
                 "max_steps_per_epoch": 0,
-                "max_validation_steps": 0,
                 "n_epochs": 0,
-                "optimizer_config": {
-                    "lr": 0,
-                    "num_warmup_steps": 0,
-                    "optimizer_type": "adam",
-                    "weight_decay": 0,
-                },
             },
         ) as response:
             assert not response.is_closed
@@ -181,24 +142,10 @@ def test_method_supervised_fine_tune(self, client: LlamaStackClient) -> None:
             hyperparam_search_config={"foo": True},
             job_uuid="job_uuid",
             logger_config={"foo": True},
-            model="model",
             training_config={
-                "data_config": {
-                    "batch_size": 0,
-                    "data_format": "instruct",
-                    "dataset_id": "dataset_id",
-                    "shuffle": True,
-                },
                 "gradient_accumulation_steps": 0,
                 "max_steps_per_epoch": 0,
-                "max_validation_steps": 0,
                 "n_epochs": 0,
-                "optimizer_config": {
-                    "lr": 0,
-                    "num_warmup_steps": 0,
-                    "optimizer_type": "adam",
-                    "weight_decay": 0,
-                },
             },
         )
         assert_matches_type(PostTrainingJob, post_training, path=["response"])
@@ -209,8 +156,10 @@ def test_method_supervised_fine_tune_with_all_params(self, client: LlamaStackCli
             hyperparam_search_config={"foo": True},
             job_uuid="job_uuid",
             logger_config={"foo": True},
-            model="model",
             training_config={
+                "gradient_accumulation_steps": 0,
+                "max_steps_per_epoch": 0,
+                "n_epochs": 0,
                 "data_config": {
                     "batch_size": 0,
                     "data_format": "instruct",
@@ -220,16 +169,6 @@ def test_method_supervised_fine_tune_with_all_params(self, client: LlamaStackCli
                     "train_on_input": True,
                     "validation_dataset_id": "validation_dataset_id",
                 },
-                "gradient_accumulation_steps": 0,
-                "max_steps_per_epoch": 0,
-                "max_validation_steps": 0,
-                "n_epochs": 0,
-                "optimizer_config": {
-                    "lr": 0,
-                    "num_warmup_steps": 0,
-                    "optimizer_type": "adam",
-                    "weight_decay": 0,
-                },
                 "dtype": "dtype",
                 "efficiency_config": {
                     "enable_activation_checkpointing": True,
@@ -237,6 +176,13 @@ def test_method_supervised_fine_tune_with_all_params(self, client: LlamaStackCli
                     "fsdp_cpu_offload": True,
                     "memory_efficient_fsdp_wrap": True,
                 },
+                "max_validation_steps": 0,
+                "optimizer_config": {
+                    "lr": 0,
+                    "num_warmup_steps": 0,
+                    "optimizer_type": "adam",
+                    "weight_decay": 0,
+                },
             },
             algorithm_config={
                 "alpha": 0,
@@ -249,6 +195,7 @@ def test_method_supervised_fine_tune_with_all_params(self, client: LlamaStackCli
                 "use_dora": True,
             },
             checkpoint_dir="checkpoint_dir",
+            model="model",
         )
         assert_matches_type(PostTrainingJob, post_training, path=["response"])

@@ -258,24 +205,10 @@ def test_raw_response_supervised_fine_tune(self, client: LlamaStackClient) -> No
             hyperparam_search_config={"foo": True},
             job_uuid="job_uuid",
             logger_config={"foo": True},
-            model="model",
             training_config={
-                "data_config": {
-                    "batch_size": 0,
-                    "data_format": "instruct",
-                    "dataset_id": "dataset_id",
-                    "shuffle": True,
-                },
                 "gradient_accumulation_steps": 0,
                 "max_steps_per_epoch": 0,
-                "max_validation_steps": 0,
                 "n_epochs": 0,
-                "optimizer_config": {
-                    "lr": 0,
-                    "num_warmup_steps": 0,
-                    "optimizer_type": "adam",
-                    "weight_decay": 0,
-                },
             },
         )

@@ -290,24 +223,10 @@ def test_streaming_response_supervised_fine_tune(self, client: LlamaStackClient)
             hyperparam_search_config={"foo": True},
             job_uuid="job_uuid",
             logger_config={"foo": True},
-            model="model",
             training_config={
-                "data_config": {
-                    "batch_size": 0,
-                    "data_format": "instruct",
-                    "dataset_id": "dataset_id",
-                    "shuffle": True,
-                },
                 "gradient_accumulation_steps": 0,
                 "max_steps_per_epoch": 0,
-                "max_validation_steps": 0,
                 "n_epochs": 0,
-                "optimizer_config": {
-                    "lr": 0,
-                    "num_warmup_steps": 0,
-                    "optimizer_type": "adam",
-                    "weight_decay": 0,
-                },
             },
         ) as response:
             assert not response.is_closed
@@ -336,22 +255,9 @@ async def test_method_preference_optimize(self, async_client: AsyncLlamaStackCli
             job_uuid="job_uuid",
             logger_config={"foo": True},
             training_config={
-                "data_config": {
-                    "batch_size": 0,
-                    "data_format": "instruct",
-                    "dataset_id": "dataset_id",
-                    "shuffle": True,
-                },
                 "gradient_accumulation_steps": 0,
                 "max_steps_per_epoch": 0,
-                "max_validation_steps": 0,
                 "n_epochs": 0,
-                "optimizer_config": {
-                    "lr": 0,
-                    "num_warmup_steps": 0,
-                    "optimizer_type": "adam",
-                    "weight_decay": 0,
-                },
             },
         )
         assert_matches_type(PostTrainingJob, post_training, path=["response"])
@@ -370,6 +276,9 @@ async def test_method_preference_optimize_with_all_params(self, async_client: As
             job_uuid="job_uuid",
             logger_config={"foo": True},
             training_config={
+                "gradient_accumulation_steps": 0,
+                "max_steps_per_epoch": 0,
+                "n_epochs": 0,
                 "data_config": {
                     "batch_size": 0,
                     "data_format": "instruct",
@@ -379,16 +288,6 @@ async def test_method_preference_optimize_with_all_params(self, async_client: As
                     "train_on_input": True,
                     "validation_dataset_id": "validation_dataset_id",
                 },
-                "gradient_accumulation_steps": 0,
-                "max_steps_per_epoch": 0,
-                "max_validation_steps": 0,
-                "n_epochs": 0,
-                "optimizer_config": {
-                    "lr": 0,
-                    "num_warmup_steps": 0,
-                    "optimizer_type": "adam",
-                    "weight_decay": 0,
-                },
                 "dtype": "dtype",
                 "efficiency_config": {
                     "enable_activation_checkpointing": True,
@@ -396,6 +295,13 @@ async def test_method_preference_optimize_with_all_params(self, async_client: As
                     "fsdp_cpu_offload": True,
                     "memory_efficient_fsdp_wrap": True,
                 },
+                "max_validation_steps": 0,
+                "optimizer_config": {
+                    "lr": 0,
+                    "num_warmup_steps": 0,
+                    "optimizer_type": "adam",
+                    "weight_decay": 0,
+                },
             },
         )
         assert_matches_type(PostTrainingJob, post_training, path=["response"])
@@ -414,22 +320,9 @@ async def test_raw_response_preference_optimize(self, async_client: AsyncLlamaSt
             job_uuid="job_uuid",
             logger_config={"foo": True},
             training_config={
-                "data_config": {
-                    "batch_size": 0,
-                    "data_format": "instruct",
-                    "dataset_id": "dataset_id",
-                    "shuffle": True,
-                },
                 "gradient_accumulation_steps": 0,
                 "max_steps_per_epoch": 0,
-                "max_validation_steps": 0,
                 "n_epochs": 0,
-                "optimizer_config": {
-                    "lr": 0,
-                    "num_warmup_steps": 0,
-                    "optimizer_type": "adam",
-                    "weight_decay": 0,
-                },
             },
         )

@@ -452,22 +345,9 @@ async def test_streaming_response_preference_optimize(self, async_client: AsyncL
             job_uuid="job_uuid",
             logger_config={"foo": True},
             training_config={
-                "data_config": {
-                    "batch_size": 0,
-                    "data_format": "instruct",
-                    "dataset_id": "dataset_id",
-                    "shuffle": True,
-                },
                 "gradient_accumulation_steps": 0,
                 "max_steps_per_epoch": 0,
-                "max_validation_steps": 0,
                 "n_epochs": 0,
-                "optimizer_config": {
-                    "lr": 0,
-                    "num_warmup_steps": 0,
-                    "optimizer_type": "adam",
-                    "weight_decay": 0,
-                },
             },
         ) as response:
             assert not response.is_closed
@@ -484,24 +364,10 @@ async def test_method_supervised_fine_tune(self, async_client: AsyncLlamaStackCl
             hyperparam_search_config={"foo": True},
             job_uuid="job_uuid",
             logger_config={"foo": True},
-            model="model",
             training_config={
-                "data_config": {
-                    "batch_size": 0,
-                    "data_format": "instruct",
-                    "dataset_id": "dataset_id",
-                    "shuffle": True,
-                },
                 "gradient_accumulation_steps": 0,
                 "max_steps_per_epoch": 0,
-                "max_validation_steps": 0,
                 "n_epochs": 0,
-                "optimizer_config": {
-                    "lr": 0,
-                    "num_warmup_steps": 0,
-                    "optimizer_type": "adam",
-                    "weight_decay": 0,
-                },
             },
         )
         assert_matches_type(PostTrainingJob, post_training, path=["response"])
@@ -512,8 +378,10 @@ async def test_method_supervised_fine_tune_with_all_params(self, async_client: A
             hyperparam_search_config={"foo": True},
             job_uuid="job_uuid",
             logger_config={"foo": True},
-            model="model",
             training_config={
+                "gradient_accumulation_steps": 0,
+                "max_steps_per_epoch": 0,
+                "n_epochs": 0,
                 "data_config": {
                     "batch_size": 0,
                     "data_format": "instruct",
@@ -523,16 +391,6 @@ async def test_method_supervised_fine_tune_with_all_params(self, async_client: A
                     "train_on_input": True,
                     "validation_dataset_id": "validation_dataset_id",
                 },
-                "gradient_accumulation_steps": 0,
-                "max_steps_per_epoch": 0,
-                "max_validation_steps": 0,
-                "n_epochs": 0,
-                "optimizer_config": {
-                    "lr": 0,
-                    "num_warmup_steps": 0,
-                    "optimizer_type": "adam",
-                    "weight_decay": 0,
-                },
                 "dtype": "dtype",
                 "efficiency_config": {
                     "enable_activation_checkpointing": True,
@@ -540,6 +398,13 @@ async def test_method_supervised_fine_tune_with_all_params(self, async_client: A
                     "fsdp_cpu_offload": True,
                     "memory_efficient_fsdp_wrap": True,
                 },
+                "max_validation_steps": 0,
+                "optimizer_config": {
+                    "lr": 0,
+                    "num_warmup_steps": 0,
+                    "optimizer_type": "adam",
+                    "weight_decay": 0,
+                },
             },
             algorithm_config={
                 "alpha": 0,
@@ -552,6 +417,7 @@ async def test_method_supervised_fine_tune_with_all_params(self, async_client: A
                 "use_dora": True,
             },
             checkpoint_dir="checkpoint_dir",
+            model="model",
        )
         assert_matches_type(PostTrainingJob, post_training, path=["response"])

@@ -561,24 +427,10 @@ async def test_raw_response_supervised_fine_tune(self, async_client: AsyncLlamaS
             hyperparam_search_config={"foo": True},
             job_uuid="job_uuid",
             logger_config={"foo": True},
-            model="model",
             training_config={
-                "data_config": {
-                    "batch_size": 0,
-                    "data_format": "instruct",
-                    "dataset_id": "dataset_id",
-                    "shuffle": True,
-                },
                 "gradient_accumulation_steps": 0,
                 "max_steps_per_epoch": 0,
-                "max_validation_steps": 0,
                 "n_epochs": 0,
-                "optimizer_config": {
-                    "lr": 0,
-                    "num_warmup_steps": 0,
-                    "optimizer_type": "adam",
-                    "weight_decay": 0,
-                },
             },
         )

@@ -593,24 +445,10 @@ async def test_streaming_response_supervised_fine_tune(self, async_client: Async
             hyperparam_search_config={"foo": True},
             job_uuid="job_uuid",
             logger_config={"foo": True},
-            model="model",
             training_config={
-                "data_config": {
-                    "batch_size": 0,
-                    "data_format": "instruct",
-                    "dataset_id": "dataset_id",
-                    "shuffle": True,
-                },
                 "gradient_accumulation_steps": 0,
                 "max_steps_per_epoch": 0,
-                "max_validation_steps": 0,
                 "n_epochs": 0,
-                "optimizer_config": {
-                    "lr": 0,
-                    "num_warmup_steps": 0,
-                    "optimizer_type": "adam",
-                    "weight_decay": 0,
-                },
             },
         ) as response:
             assert not response.is_closed
diff --git a/tests/conftest.py b/tests/conftest.py
index 645cbf63..dd04ad98 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -10,7 +10,7 @@
 from llama_stack_client import LlamaStackClient, AsyncLlamaStackClient

 if TYPE_CHECKING:
-    from _pytest.fixtures import FixtureRequest
+    from _pytest.fixtures import FixtureRequest  # pyright: ignore[reportPrivateImportUsage]

 pytest.register_assert_rewrite("tests.utils")

diff --git a/tests/test_models.py b/tests/test_models.py
index 8b4c0bc9..a27dfa46 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -492,12 +492,15 @@ class Model(BaseModel):
         resource_id: Optional[str] = None

     m = Model.construct()
+    assert m.resource_id is None
     assert "resource_id" not in m.model_fields_set

     m = Model.construct(resource_id=None)
+    assert m.resource_id is None
     assert "resource_id" in m.model_fields_set

     m = Model.construct(resource_id="foo")
+    assert m.resource_id == "foo"
     assert "resource_id" in m.model_fields_set


@@ -832,7 +835,7 @@ class B(BaseModel):

 @pytest.mark.skipif(not PYDANTIC_V2, reason="TypeAliasType is not supported in Pydantic v1")
 def test_type_alias_type() -> None:
-    Alias = TypeAliasType("Alias", str)
+    Alias = TypeAliasType("Alias", str)  # pyright: ignore

     class Model(BaseModel):
         alias: Alias