[Feature] Support for Azure AI Studio #779
@@ -0,0 +1,162 @@
import hashlib
import pathlib

import diskcache as dc
import platformdirs
import requests

from ._model import Chat
from ._grammarless import GrammarlessEngine, Grammarless


class AzureAIStudioChatEngine(GrammarlessEngine):
    def __init__(
        self,
        *,
        tokenizer,
        max_streaming_tokens: int,
        timeout: float,
        compute_log_probs: bool,
        azureai_studio_endpoint: str,
        azureai_model_deployment: str,
        azureai_studio_key: str,
    ):
        self._endpoint = azureai_studio_endpoint
        self._deployment = azureai_model_deployment
        self._api_key = azureai_studio_key

        # There is a cache... better make sure it's specific
        # to the endpoint and deployment
        deployment_id = self._hash_prompt(self._endpoint + self._deployment)

        path = (
            pathlib.Path(platformdirs.user_cache_dir("guidance"))
            / f"azureaistudio.tokens.{deployment_id}"
        )
        self.cache = dc.Cache(path)

        super().__init__(tokenizer, max_streaming_tokens, timeout, compute_log_probs)

    def _hash_prompt(self, prompt):
        # Copied from OpenAIChatEngine
        return hashlib.sha256(f"{prompt}".encode()).hexdigest()

    def _generator(self, prompt, temperature: float):
        # Initial parts of this straight up copied from OpenAIChatEngine

        # The next loop (or one like it) appears in several places,
        # and quite possibly belongs in a library function or superclass
        # That said, I'm not _completely sure that there aren't subtle
        # differences between the various versions

Review comment: Thoughts on this? This is a straight-up copy of what is in `OpenAIChatEngine`.

        # find the role tags
        pos = 0
        role_end = b"<|im_end|>"
        messages = []
        found = True
        while found:

            # find the role text blocks
            found = False
            for role_name, start_bytes in (
                ("system", b"<|im_start|>system\n"),
                ("user", b"<|im_start|>user\n"),
                ("assistant", b"<|im_start|>assistant\n"),
            ):
                if prompt[pos:].startswith(start_bytes):
                    pos += len(start_bytes)
                    end_pos = prompt[pos:].find(role_end)
                    if end_pos < 0:
                        assert (
                            role_name == "assistant"
                        ), "Bad chat format! Last role before gen needs to be assistant!"
                        break
                    btext = prompt[pos : pos + end_pos]
                    pos += end_pos + len(role_end)
                    messages.append(
                        {"role": role_name, "content": btext.decode("utf8")}
                    )
                    found = True
                    break

Comment on lines +84 to +86: Do AzureAI models uniformly use the same role tags across their models? I don't think we can hard code a check for these start_bytes in this class.

Reply: These aren't coming from the model, surely? They're coming from
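
As a sketch of the refactoring discussed in the comments above, the parsing loop could move into a shared, module-level helper with the role tags passed in rather than hard-coded. The helper name and layout here are hypothetical, not part of this PR:

```python
ROLE_STARTS = {
    "system": b"<|im_start|>system\n",
    "user": b"<|im_start|>user\n",
    "assistant": b"<|im_start|>assistant\n",
}
ROLE_END = b"<|im_end|>"


def parse_chat_prompt(prompt: bytes, role_starts=ROLE_STARTS, role_end=ROLE_END):
    """Split a role-tagged prompt into (messages, pos), mirroring the loop above."""
    pos = 0
    messages = []
    found = True
    while found:
        found = False
        for role_name, start_bytes in role_starts.items():
            if prompt[pos:].startswith(start_bytes):
                pos += len(start_bytes)
                end_pos = prompt[pos:].find(role_end)
                if end_pos < 0:
                    # The last (open) block must be the assistant turn being generated
                    assert (
                        role_name == "assistant"
                    ), "Bad chat format! Last role before gen needs to be assistant!"
                    break
                btext = prompt[pos : pos + end_pos]
                pos += end_pos + len(role_end)
                messages.append({"role": role_name, "content": btext.decode("utf8")})
                found = True
                break
    return messages, pos
```

Each engine could then call this with its own role-tag table, which would also address the question above about hard-coding the ChatML-style tags in this class.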

        # Add nice exception if no role tags were used in the prompt.
        # TODO: Move this somewhere more general for all chat models?
        if messages == []:
            raise ValueError(
                f"The model is a Chat-based model and requires role tags in the prompt! \
                Make sure you are using guidance context managers like `with system():`, `with user():` and `with assistant():` \
                to appropriately format your guidance program for this type of model."
            )

        # Update shared data state
        self._reset_shared_data(prompt[:pos], temperature)

        # Use cache only when temperature is 0
        if temperature == 0:
            cache_key = self._hash_prompt(prompt)

            # Check if the result is already in the cache
            if cache_key in self.cache:
                for chunk in self.cache[cache_key]:
                    yield chunk
                return

Review comment: This caching logic is a bit of a concern - at least for models where T=0 doesn't actually get determinism. And in general, it means that a bunch of our tests might not quite be doing what we think they're doing, because they may just be hitting the cache.

Reply: The more I think about it, the less I like the idea of a disk-based cache. In some ways it's worse on the OpenAI side, where both AzureOpenAI and OpenAI will wind up sharing the same cache. How much speed up does it really give, compared to the Heisenbug potential it represents?

Reply: I think most models have more reliable temp=0 determinism now, but agree that perhaps sharing between AzureOpenAI and OpenAI is problematic (though there shouldn't be differences between the two APIs in theory?). I do think caching is a nice feature to have in general, as production workflows often have shared inputs coming in that save time and money to reuse.

Reply: I have added a
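
One possible response to the caching concerns above, without touching the temperature == 0 logic in `_generator`, would be to make the disk cache opt-in. The `OptionalCache` wrapper and its `enabled` flag below are hypothetical, not part of this PR:

```python
import diskcache as dc


class OptionalCache:
    """A cache that can be switched off, so callers worried about stale
    temperature==0 results (or shared caches) can disable reuse entirely."""

    def __init__(self, path, enabled: bool = False):
        self._enabled = enabled
        self._cache = dc.Cache(path) if enabled else None

    def __contains__(self, key) -> bool:
        return self._enabled and key in self._cache

    def __getitem__(self, key):
        return self._cache[key]

    def __setitem__(self, key, value):
        if self._enabled:
            self._cache[key] = value
```

The engine could then build `self.cache = OptionalCache(path, enabled=enable_cache)` behind a constructor flag, and the lookup and store code above would work unchanged.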

        # Prepare for the API call (this might be model specific....)
        parameters = dict(temperature=temperature)
        payload = dict(input_data=dict(input_string=messages, parameters=parameters))

        headers = {
            "Content-Type": "application/json",
            "Authorization": ("Bearer " + self._api_key),
            "azureml-model-deployment": self._deployment,
        }

        response = requests.post(
            self._endpoint,
            json=payload,
            headers=headers,
        )

        result = response.json()

        # Now back to OpenAIChatEngine, with slight modifications since
        # this isn't a streaming API
        if temperature == 0:
            cached_results = []

        encoded_chunk = result["output"].encode("utf8")

        yield encoded_chunk

        if temperature == 0:
            cached_results.append(encoded_chunk)

        # Cache the results after the generator is exhausted
        if temperature == 0:
            self.cache[cache_key] = cached_results
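
One more note on the call to the endpoint: the status of the `requests.post` response is never checked, so an HTTP error would only surface later, when the body fails to parse or lacks the `"output"` key. A minimal, hypothetical helper (not part of this PR) that fails fast might look like this:

```python
import requests


def post_scoring_request(endpoint: str, payload: dict, headers: dict) -> dict:
    """Send the scoring request and raise immediately on failure."""
    response = requests.post(endpoint, json=payload, headers=headers)
    response.raise_for_status()  # raises requests.HTTPError on 4xx/5xx
    result = response.json()
    if "output" not in result:
        raise RuntimeError(f"Unexpected response from {endpoint}: {result}")
    return result
```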


class AzureAIStudioChat(Grammarless, Chat):
    def __init__(
        self,
        azureai_studio_endpoint: str,
        azureai_studio_deployment: str,
        azureai_studio_key: str,
        tokenizer=None,
        echo: bool = True,
        max_streaming_tokens: int = 1000,
        timeout: float = 0.5,
        compute_log_probs: bool = False,
    ):
        super().__init__(
            AzureAIStudioChatEngine(
                azureai_studio_endpoint=azureai_studio_endpoint,
                azureai_model_deployment=azureai_studio_deployment,
                azureai_studio_key=azureai_studio_key,
                tokenizer=tokenizer,
                max_streaming_tokens=max_streaming_tokens,
                timeout=timeout,
                compute_log_probs=compute_log_probs,
            ),
            echo=echo,
        )
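
For orientation, the intended call pattern for `AzureAIStudioChat` mirrors the smoke tests below; the endpoint URL, deployment name, and key here are placeholders, not real values:

```python
from guidance import assistant, gen, models, system, user

# Placeholder credentials: substitute your own AI Studio scoring endpoint,
# deployment name, and key.
lm = models.AzureAIStudioChat(
    azureai_studio_endpoint="<scoring-endpoint-url>",
    azureai_studio_deployment="<deployment-name>",
    azureai_studio_key="<api-key>",
)

with system():
    lm += "You are a helpful assistant."

with user():
    lm += "What is 1 + 1?"

with assistant():
    lm += gen(max_tokens=10, name="answer")

print(lm["answer"])
```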
@@ -0,0 +1,65 @@
import pytest

from guidance import assistant, gen, models, system, user

from ..utils import env_or_fail

# Everything in here needs credentials to work
# Mark is configured in pyproject.toml
pytestmark = pytest.mark.needs_credentials

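For context, `env_or_fail` comes from the shared test utilities; a hypothetical reconstruction (not the actual implementation) of what it presumably does:

```python
import os

import pytest


def env_or_fail(var_name: str) -> str:
    """Fetch a required environment variable, failing the test if it is unset."""
    value = os.getenv(var_name)
    if value is None:
        pytest.fail(f"Environment variable {var_name} is not set")
    return value
```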

def test_azureai_phi3_chat_smoke(rate_limiter):
    azureai_studio_endpoint = env_or_fail("AZURE_AI_STUDIO_PHI3_ENDPOINT")
    azureai_studio_deployment = env_or_fail("AZURE_AI_STUDIO_PHI3_DEPLOYMENT")
    azureai_studio_key = env_or_fail("AZURE_AI_STUDIO_PHI3_KEY")

    lm = models.AzureAIStudioChat(
        azureai_studio_endpoint=azureai_studio_endpoint,
        azureai_studio_deployment=azureai_studio_deployment,
        azureai_studio_key=azureai_studio_key,
    )
    assert isinstance(lm, models.AzureAIStudioChat)

    with system():
        lm += "You are a math wiz."

    with user():
        lm += "What is 1 + 1?"

    with assistant():
        lm += gen(max_tokens=10, name="text", temperature=0.5)
        lm += "Pick a number: "

    print(str(lm))
    assert len(lm["text"]) > 0


def test_azureai_mistral_chat_smoke(rate_limiter):
    azureai_studio_endpoint = env_or_fail("AZURE_AI_STUDIO_MISTRAL_CHAT_ENDPOINT")
    azureai_studio_deployment = env_or_fail("AZURE_AI_STUDIO_MISTRAL_CHAT_DEPLOYMENT")
    azureai_studio_key = env_or_fail("AZURE_AI_STUDIO_MISTRAL_CHAT_KEY")

    lm = models.AzureAIStudioChat(
        azureai_studio_endpoint=azureai_studio_endpoint,
        azureai_studio_deployment=azureai_studio_deployment,
        azureai_studio_key=azureai_studio_key,
    )
    assert isinstance(lm, models.AzureAIStudioChat)
    lm.engine.cache.clear()

    # No "system" role for Mistral?
    # with system():
    #     lm += "You are a math wiz."

Review comment: This makes me unhappy.

    with user():
        lm += "What is 1 + 1?"

    with assistant():
        lm += gen(max_tokens=15, name="text", temperature=0.5)
        lm += "\nPick a number: "

    print(str(lm))
    assert len(lm["text"]) > 0
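
Regarding the commented-out `system()` block in the Mistral test above: one common workaround (hypothetical here, not part of this PR) is to fold a leading system message into the first user turn before the payload is built, for deployments that reject the "system" role:

```python
from typing import Dict, List


def merge_system_into_first_user(messages: List[Dict[str, str]]) -> List[Dict[str, str]]:
    """If the first message is a system message, prepend it to the first user turn."""
    if messages and messages[0]["role"] == "system":
        system_msg, rest = messages[0], messages[1:]
        if rest and rest[0]["role"] == "user":
            merged = dict(rest[0])
            merged["content"] = system_msg["content"] + "\n\n" + merged["content"]
            return [merged] + rest[1:]
    return messages
```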

Review comment: I never try setting the tokeniser, and it appears that it eventually defaults to GPT2. I don't quite see why a remote model like this would even need it.

Reply: OK, so theoretically for token healing. However, I have a feeling that trying to figure out what tokeniser to use will be an exercise in fragility.
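
If a matching tokenizer did turn out to matter (for token healing, say), it could presumably be supplied explicitly rather than relying on the GPT-2 fallback. The sketch below assumes the engine accepts a Hugging Face tokenizer object, which this PR does not actually document, and the model name and credentials are only placeholders:

```python
from transformers import AutoTokenizer

from guidance import models

# Assumption: the underlying GrammarlessEngine can wrap a Hugging Face tokenizer.
phi3_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

lm = models.AzureAIStudioChat(
    azureai_studio_endpoint="<scoring-endpoint-url>",
    azureai_studio_deployment="<deployment-name>",
    azureai_studio_key="<api-key>",
    tokenizer=phi3_tokenizer,
)
```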