11 changes: 9 additions & 2 deletions src/llama_stack_client/_base_client.py
@@ -418,10 +418,17 @@ def _build_headers(self, options: FinalRequestOptions, *, retries_taken: int = 0
if idempotency_header and options.method.lower() != "get" and idempotency_header not in headers:
headers[idempotency_header] = options.idempotency_key or self._idempotency_key()

# Don't set the retry count header if it was already set or removed by the caller. We check
# Don't set these headers if they were already set or removed by the caller. We check
# `custom_headers`, which can contain `Omit()`, instead of `headers` to account for the removal case.
if "x-stainless-retry-count" not in (header.lower() for header in custom_headers):
lower_custom_headers = [header.lower() for header in custom_headers]
if "x-stainless-retry-count" not in lower_custom_headers:
headers["x-stainless-retry-count"] = str(retries_taken)
if "x-stainless-read-timeout" not in lower_custom_headers:
timeout = self.timeout if isinstance(options.timeout, NotGiven) else options.timeout
if isinstance(timeout, Timeout):
timeout = timeout.read
if timeout is not None:
headers["x-stainless-read-timeout"] = str(timeout)

return headers

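For context on the `x-stainless-read-timeout` hunk above: the read timeout is resolved from the per-request options when given, otherwise from the client default, and the header is only emitted when a concrete value exists. A minimal standalone sketch of that resolution, assuming `httpx` and a simplified `NotGiven` sentinel in place of the SDK's internal types:

```python
from __future__ import annotations

import httpx


class NotGiven:
    """Stand-in for the SDK's sentinel meaning 'no per-request timeout was passed'."""


def resolve_read_timeout(
    client_timeout: float | httpx.Timeout | None,
    request_timeout: float | httpx.Timeout | None | NotGiven,
) -> str | None:
    # Prefer the per-request timeout; otherwise fall back to the client default.
    timeout = client_timeout if isinstance(request_timeout, NotGiven) else request_timeout
    # httpx.Timeout splits connect/read/write/pool budgets; only the read budget is advertised.
    if isinstance(timeout, httpx.Timeout):
        timeout = timeout.read
    # The header is only set when a concrete value could be resolved.
    return None if timeout is None else str(timeout)


print(resolve_read_timeout(httpx.Timeout(timeout=60, connect=5.0), NotGiven()))  # -> "60"
print(resolve_read_timeout(60.0, 10))                                            # -> "10"
```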
60 changes: 40 additions & 20 deletions src/llama_stack_client/_client.py
@@ -98,12 +98,13 @@ class LlamaStackClient(SyncAPIClient):
with_streaming_response: LlamaStackClientWithStreamedResponse

# client options
api_key: str | None

def __init__(
self,
*,
base_url: str | httpx.URL | None = None,
api_key: str | None = None,
base_url: str | httpx.URL | None = None,
timeout: Union[float, Timeout, None, NotGiven] = NOT_GIVEN,
max_retries: int = DEFAULT_MAX_RETRIES,
default_headers: Mapping[str, str] | None = None,
@@ -123,19 +124,20 @@ def __init__(
_strict_response_validation: bool = False,
provider_data: Mapping[str, Any] | None = None,
) -> None:
"""Construct a new synchronous llama-stack-client client instance."""
if base_url is None:
base_url = os.environ.get("LLAMA_STACK_CLIENT_BASE_URL")
if base_url is None:
base_url = f"http://any-hosted-llama-stack.com"
"""Construct a new synchronous llama-stack-client client instance.

This automatically infers the `api_key` argument from the `LLAMA_STACK_CLIENT_API_KEY` environment variable if it is not provided.
"""
if api_key is None:
api_key = os.environ.get("LLAMA_STACK_CLIENT_API_KEY")
self.api_key = api_key

if base_url is None:
base_url = os.environ.get("LLAMA_STACK_CLIENT_BASE_URL")
if base_url is None:
base_url = f"http://any-hosted-llama-stack.com"

custom_headers = default_headers or {}
if api_key is not None:
custom_headers["Authorization"] = f"Bearer {api_key}"
custom_headers["X-LlamaStack-Client-Version"] = __version__
if provider_data is not None:
custom_headers["X-LlamaStack-Provider-Data"] = json.dumps(provider_data)
@@ -182,6 +184,14 @@ def __init__(
def qs(self) -> Querystring:
return Querystring(array_format="comma")

@property
@override
def auth_headers(self) -> dict[str, str]:
api_key = self.api_key
if api_key is None:
return {}
return {"Authorization": f"Bearer {api_key}"}

@property
@override
def default_headers(self) -> dict[str, str | Omit]:
@@ -194,8 +204,8 @@ def default_headers(self) -> dict[str, str | Omit]:
def copy(
self,
*,
base_url: str | httpx.URL | None = None,
api_key: str | None = None,
base_url: str | httpx.URL | None = None,
timeout: float | Timeout | None | NotGiven = NOT_GIVEN,
http_client: httpx.Client | None = None,
max_retries: int | NotGiven = NOT_GIVEN,
@@ -228,8 +238,8 @@ def copy(

http_client = http_client or self._client
return self.__class__(
base_url=base_url or self.base_url,
api_key=api_key or self.api_key,
base_url=base_url or self.base_url,
timeout=self.timeout if isinstance(timeout, NotGiven) else timeout,
http_client=http_client,
max_retries=max_retries if is_given(max_retries) else self.max_retries,
@@ -304,12 +314,13 @@ class AsyncLlamaStackClient(AsyncAPIClient):
with_streaming_response: AsyncLlamaStackClientWithStreamedResponse

# client options
api_key: str | None

def __init__(
self,
*,
base_url: str | httpx.URL | None = None,
api_key: str | None = None,
base_url: str | httpx.URL | None = None,
timeout: Union[float, Timeout, None, NotGiven] = NOT_GIVEN,
max_retries: int = DEFAULT_MAX_RETRIES,
default_headers: Mapping[str, str] | None = None,
@@ -329,19 +340,20 @@ def __init__(
_strict_response_validation: bool = False,
provider_data: Mapping[str, Any] | None = None,
) -> None:
"""Construct a new async llama-stack-client client instance."""
if base_url is None:
base_url = os.environ.get("LLAMA_STACK_CLIENT_BASE_URL")
if base_url is None:
base_url = f"http://any-hosted-llama-stack.com"
"""Construct a new async llama-stack-client client instance.

This automatically infers the `api_key` argument from the `LLAMA_STACK_CLIENT_API_KEY` environment variable if it is not provided.
"""
if api_key is None:
api_key = os.environ.get("LLAMA_STACK_CLIENT_API_KEY")
self.api_key = api_key

if base_url is None:
base_url = os.environ.get("LLAMA_STACK_CLIENT_BASE_URL")
if base_url is None:
base_url = f"http://any-hosted-llama-stack.com"

custom_headers = default_headers or {}
if api_key is not None:
custom_headers["Authorization"] = f"Bearer {api_key}"
custom_headers["X-LlamaStack-Client-Version"] = __version__
if provider_data is not None:
custom_headers["X-LlamaStack-Provider-Data"] = json.dumps(provider_data)
@@ -388,6 +400,14 @@ def __init__(
def qs(self) -> Querystring:
return Querystring(array_format="comma")

@property
@override
def auth_headers(self) -> dict[str, str]:
api_key = self.api_key
if api_key is None:
return {}
return {"Authorization": f"Bearer {api_key}"}

@property
@override
def default_headers(self) -> dict[str, str | Omit]:
@@ -400,8 +420,8 @@ def default_headers(self) -> dict[str, str | Omit]:
def copy(
self,
*,
base_url: str | httpx.URL | None = None,
api_key: str | None = None,
base_url: str | httpx.URL | None = None,
timeout: float | Timeout | None | NotGiven = NOT_GIVEN,
http_client: httpx.AsyncClient | None = None,
max_retries: int | NotGiven = NOT_GIVEN,
@@ -434,8 +454,8 @@ def copy(

http_client = http_client or self._client
return self.__class__(
base_url=base_url or self.base_url,
api_key=api_key or self.api_key,
base_url=base_url or self.base_url,
timeout=self.timeout if isinstance(timeout, NotGiven) else timeout,
http_client=http_client,
max_retries=max_retries if is_given(max_retries) else self.max_retries,
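Taken together, the `_client.py` changes let the API key come from the environment instead of the constructor and surface it through the new `auth_headers` override. A brief usage sketch; the base URL and key values are placeholders, not part of this diff:

```python
import os

from llama_stack_client import LlamaStackClient

# With no explicit api_key argument, the constructor falls back to this env var.
os.environ.setdefault("LLAMA_STACK_CLIENT_API_KEY", "sk-placeholder")

client = LlamaStackClient(base_url="http://localhost:8321")  # placeholder URL
assert client.auth_headers == {
    "Authorization": f"Bearer {os.environ['LLAMA_STACK_CLIENT_API_KEY']}"
}

# copy() can swap the key while keeping the rest of the configuration.
scoped = client.copy(api_key="sk-another-placeholder")
```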
2 changes: 1 addition & 1 deletion src/llama_stack_client/_constants.py
@@ -6,7 +6,7 @@
OVERRIDE_CAST_TO_HEADER = "____stainless_override_cast_to"

# default timeout is 1 minute
DEFAULT_TIMEOUT = httpx.Timeout(timeout=60.0, connect=5.0)
DEFAULT_TIMEOUT = httpx.Timeout(timeout=60, connect=5.0)
DEFAULT_MAX_RETRIES = 2
DEFAULT_CONNECTION_LIMITS = httpx.Limits(max_connections=100, max_keepalive_connections=20)

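The default is behaviorally unchanged (`httpx` treats `60` and `60.0` the same); callers who need a different budget still override it when constructing the client. A short sketch; the URL is a placeholder:

```python
import httpx

from llama_stack_client import LlamaStackClient

# Two-minute read budget with a tighter connect phase, overriding DEFAULT_TIMEOUT.
client = LlamaStackClient(
    base_url="http://localhost:8321",  # placeholder URL
    timeout=httpx.Timeout(timeout=120.0, connect=5.0),
)
```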
22 changes: 22 additions & 0 deletions src/llama_stack_client/resources/agents/turn.py
@@ -59,6 +59,7 @@ def create(
messages: Iterable[turn_create_params.Message],
documents: Iterable[turn_create_params.Document] | NotGiven = NOT_GIVEN,
stream: Literal[False] | NotGiven = NOT_GIVEN,
tool_config: turn_create_params.ToolConfig | NotGiven = NOT_GIVEN,
toolgroups: List[turn_create_params.Toolgroup] | NotGiven = NOT_GIVEN,
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
# The extra values given here take precedence over values defined on the client or passed to this method.
@@ -69,6 +70,8 @@
) -> Turn:
"""
Args:
tool_config: Configuration for tool use.

extra_headers: Send extra headers

extra_query: Add additional query parameters to the request
@@ -88,6 +91,7 @@ def create(
messages: Iterable[turn_create_params.Message],
stream: Literal[True],
documents: Iterable[turn_create_params.Document] | NotGiven = NOT_GIVEN,
tool_config: turn_create_params.ToolConfig | NotGiven = NOT_GIVEN,
toolgroups: List[turn_create_params.Toolgroup] | NotGiven = NOT_GIVEN,
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
# The extra values given here take precedence over values defined on the client or passed to this method.
@@ -98,6 +102,8 @@
) -> Stream[AgentTurnResponseStreamChunk]:
"""
Args:
tool_config: Configuration for tool use.

extra_headers: Send extra headers

extra_query: Add additional query parameters to the request
@@ -117,6 +123,7 @@ def create(
messages: Iterable[turn_create_params.Message],
stream: bool,
documents: Iterable[turn_create_params.Document] | NotGiven = NOT_GIVEN,
tool_config: turn_create_params.ToolConfig | NotGiven = NOT_GIVEN,
toolgroups: List[turn_create_params.Toolgroup] | NotGiven = NOT_GIVEN,
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
# The extra values given here take precedence over values defined on the client or passed to this method.
@@ -127,6 +134,8 @@
) -> Turn | Stream[AgentTurnResponseStreamChunk]:
"""
Args:
tool_config: Configuration for tool use.

extra_headers: Send extra headers

extra_query: Add additional query parameters to the request
@@ -146,6 +155,7 @@ def create(
messages: Iterable[turn_create_params.Message],
documents: Iterable[turn_create_params.Document] | NotGiven = NOT_GIVEN,
stream: Literal[False] | Literal[True] | NotGiven = NOT_GIVEN,
tool_config: turn_create_params.ToolConfig | NotGiven = NOT_GIVEN,
toolgroups: List[turn_create_params.Toolgroup] | NotGiven = NOT_GIVEN,
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
# The extra values given here take precedence over values defined on the client or passed to this method.
@@ -165,6 +175,7 @@ def create(
"messages": messages,
"documents": documents,
"stream": stream,
"tool_config": tool_config,
"toolgroups": toolgroups,
},
turn_create_params.TurnCreateParams,
@@ -244,6 +255,7 @@ async def create(
messages: Iterable[turn_create_params.Message],
documents: Iterable[turn_create_params.Document] | NotGiven = NOT_GIVEN,
stream: Literal[False] | NotGiven = NOT_GIVEN,
tool_config: turn_create_params.ToolConfig | NotGiven = NOT_GIVEN,
toolgroups: List[turn_create_params.Toolgroup] | NotGiven = NOT_GIVEN,
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
# The extra values given here take precedence over values defined on the client or passed to this method.
@@ -254,6 +266,8 @@
) -> Turn:
"""
Args:
tool_config: Configuration for tool use.

extra_headers: Send extra headers

extra_query: Add additional query parameters to the request
@@ -273,6 +287,7 @@ def create(
messages: Iterable[turn_create_params.Message],
stream: Literal[True],
documents: Iterable[turn_create_params.Document] | NotGiven = NOT_GIVEN,
tool_config: turn_create_params.ToolConfig | NotGiven = NOT_GIVEN,
toolgroups: List[turn_create_params.Toolgroup] | NotGiven = NOT_GIVEN,
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
# The extra values given here take precedence over values defined on the client or passed to this method.
@@ -283,6 +298,8 @@
) -> AsyncStream[AgentTurnResponseStreamChunk]:
"""
Args:
tool_config: Configuration for tool use.

extra_headers: Send extra headers

extra_query: Add additional query parameters to the request
@@ -302,6 +319,7 @@ def create(
messages: Iterable[turn_create_params.Message],
stream: bool,
documents: Iterable[turn_create_params.Document] | NotGiven = NOT_GIVEN,
tool_config: turn_create_params.ToolConfig | NotGiven = NOT_GIVEN,
toolgroups: List[turn_create_params.Toolgroup] | NotGiven = NOT_GIVEN,
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
# The extra values given here take precedence over values defined on the client or passed to this method.
@@ -312,6 +330,8 @@
) -> Turn | AsyncStream[AgentTurnResponseStreamChunk]:
"""
Args:
tool_config: Configuration for tool use.

extra_headers: Send extra headers

extra_query: Add additional query parameters to the request
@@ -331,6 +351,7 @@ def create(
messages: Iterable[turn_create_params.Message],
documents: Iterable[turn_create_params.Document] | NotGiven = NOT_GIVEN,
stream: Literal[False] | Literal[True] | NotGiven = NOT_GIVEN,
tool_config: turn_create_params.ToolConfig | NotGiven = NOT_GIVEN,
toolgroups: List[turn_create_params.Toolgroup] | NotGiven = NOT_GIVEN,
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
# The extra values given here take precedence over values defined on the client or passed to this method.
@@ -350,6 +371,7 @@ async def create(
"messages": messages,
"documents": documents,
"stream": stream,
"tool_config": tool_config,
"toolgroups": toolgroups,
},
turn_create_params.TurnCreateParams,
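The new `tool_config` argument is threaded through each `create` overload and into the request body. A hedged call sketch; the resource is assumed to be exposed as `client.agents.turn` (following the module path), the agent/session IDs are hypothetical, and the `tool_choice` key is an assumed field of `turn_create_params.ToolConfig`, not something this diff spells out:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # placeholder URL

turn = client.agents.turn.create(
    agent_id="agent-123",      # hypothetical agent id
    session_id="session-456",  # hypothetical session id
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    tool_config={"tool_choice": "auto"},  # assumed ToolConfig shape
)
print(turn)
```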
20 changes: 20 additions & 0 deletions src/llama_stack_client/resources/batch_inference.py
@@ -72,6 +72,14 @@ def chat_completion(
) -> BatchInferenceChatCompletionResponse:
"""
Args:
response_format: Configuration for JSON schema-guided response generation.

tool_choice: Whether tool use is required or automatic. This is a hint to the model which may
not be followed. It depends on the Instruction Following capabilities of the
model.

tool_prompt_format: Prompt format for calling custom / zero shot tools.

extra_headers: Send extra headers

extra_query: Add additional query parameters to the request
@@ -118,6 +126,8 @@ def completion(
) -> BatchCompletion:
"""
Args:
response_format: Configuration for JSON schema-guided response generation.

extra_headers: Send extra headers

extra_query: Add additional query parameters to the request
@@ -185,6 +195,14 @@ async def chat_completion(
) -> BatchInferenceChatCompletionResponse:
"""
Args:
response_format: Configuration for JSON schema-guided response generation.

tool_choice: Whether tool use is required or automatic. This is a hint to the model which may
not be followed. It depends on the Instruction Following capabilities of the
model.

tool_prompt_format: Prompt format for calling custom / zero shot tools.

extra_headers: Send extra headers

extra_query: Add additional query parameters to the request
@@ -231,6 +249,8 @@ async def completion(
) -> BatchCompletion:
"""
Args:
response_format: Configuration for JSON schema-guided response generation.

extra_headers: Send extra headers

extra_query: Add additional query parameters to the request
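These docstring additions describe existing parameters of `chat_completion` and `completion`. A hedged call sketch; the model id, the `messages_batch` parameter name, and the `response_format` payload shape are assumptions for illustration, not confirmed by this diff:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # placeholder URL

response = client.batch_inference.chat_completion(
    model="meta-llama/Llama-3.1-8B-Instruct",  # hypothetical model id
    messages_batch=[                           # assumed parameter name
        [{"role": "user", "content": "Summarize this PR in one sentence."}],
    ],
    response_format={                          # JSON schema-guided generation (shape assumed)
        "type": "json_schema",
        "json_schema": {
            "type": "object",
            "properties": {"summary": {"type": "string"}},
        },
    },
    tool_choice="auto",  # hint only; the model may not follow it
)
```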