From 2704056d40ec502d57564a8334b53ace89a921a2 Mon Sep 17 00:00:00 2001 From: Firoj Alam Date: Sun, 21 Jul 2024 14:18:53 +0300 Subject: [PATCH 01/11] anthropic model and asset file added --- .../ThatiARSubjectivity_Sonet_ZeroShot_en.py | 63 ++++++++++ llmebench/models/Anthropic.py | 111 ++++++++++++++++++ llmebench/models/__init__.py | 1 + 3 files changed, 175 insertions(+) create mode 100644 assets/ar/factuality_disinformation_harmful_content/subjectivity/ThatiARSubjectivity_Sonet_ZeroShot_en.py create mode 100644 llmebench/models/Anthropic.py diff --git a/assets/ar/factuality_disinformation_harmful_content/subjectivity/ThatiARSubjectivity_Sonet_ZeroShot_en.py b/assets/ar/factuality_disinformation_harmful_content/subjectivity/ThatiARSubjectivity_Sonet_ZeroShot_en.py new file mode 100644 index 00000000..282f5cbc --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/subjectivity/ThatiARSubjectivity_Sonet_ZeroShot_en.py @@ -0,0 +1,63 @@ +import json + +from llmebench.datasets import ThatiARDataset +from llmebench.models import AnthropicModel +from llmebench.tasks import SubjectivityTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "claude-3-5-sonnet-20240620", + "description": "Anthropic model", + "scores": {}, + } + + +def config(): + system_msg = "AI assistant specialized in classifying news article sentences into subjective or objective. A subjective sentence expresses personal opinions, feelings, or beliefs, while an objective sentence presents facts, data, or unbiased information." + return { + "dataset": ThatiARDataset, + "task": SubjectivityTask, + "model": AnthropicModel, + "model_args": { + "class_labels": ["SUBJ", "OBJ"], + "max_tries": 30, + "system": system_msg, + }, + } + + +def prompt(input_sample): + + prompt = f""" + Classify the following Arabic 'sentence' as subjective or objective. Provide only the label. + Provide your response in the following JSON format: {{"label": "your label"}}. + Please provide JSON output only. No additional text. + + sentence: {input_sample} + """ + return [ + { + "role": "user", + "content": prompt, + }, + ] + + +def post_process(response): + data = response["content"][0]["text"].lower() + data = json.loads(data) + label = data["label"] + if "label: objective" in label: + label_fixed = "OBJ" + elif "label: subjective" in label: + label_fixed = "SUBJ" + elif label == "objective" or label == "objective.": + label_fixed = "OBJ" + elif label == "subjective" or label == "subjective.": + label_fixed = "SUBJ" + else: + label_fixed = None + + return label_fixed diff --git a/llmebench/models/Anthropic.py b/llmebench/models/Anthropic.py new file mode 100644 index 00000000..b1de5cef --- /dev/null +++ b/llmebench/models/Anthropic.py @@ -0,0 +1,111 @@ +import json +import logging +import os + +import anthropic + +from llmebench.models.model_base import ModelBase + + +class AnthropicModel(ModelBase): + """ + Anthropic Model interface. + + Arguments + --------- + api_url : EMPTY + timeout : int + Number of seconds before the request to the server is timed out + temperature : float + Temperature value to use for the model. Defaults to zero for reproducibility. + top_p : float + Top P value to use for the model. Defaults to 0.95 + max_tokens : int + Maximum number of tokens to pass to the model. 
Defaults to 2000
+    """
+
+    def __init__(
+        self,
+        api_base=None,
+        api_key=None,
+        model_name=None,
+        timeout=20,
+        temperature=0,
+        top_p=0.95,
+        max_tokens=2000,
+        **kwargs,
+    ):
+        # API parameters
+        self.api_base = api_base or os.getenv("ANTHROPIC_API_URL")
+        self.api_key = api_key or os.getenv("ANTHROPIC_API_KEY")
+        self.model_name = model_name or os.getenv("ANTHROPIC_MODEL")
+
+        # Parameters
+        self.api_timeout = timeout
+        tolerance = 1e-7
+        self.temperature = temperature
+        if self.temperature < tolerance:
+            # Currently, the model inference fails if temperature
+            # is exactly 0, so we nudge it slightly to work around
+            # the issue
+            self.temperature += tolerance
+        self.top_p = top_p
+        self.max_tokens = max_tokens
+
+        if self.api_key is None:
+            raise Exception(
+                "API key must be provided as model config or environment variable (`ANTHROPIC_API_KEY`)"
+            )
+        if self.model_name is None:
+            raise Exception(
+                "Model name must be provided as model config or environment variable (`ANTHROPIC_MODEL`)"
+            )
+        self.model = self.model_name
+        # Model parameters
+        self.model_params = {}
+        self.model_params["system"] = (
+            kwargs.get("system_msg")
+            if "system_msg" in kwargs and kwargs["system_msg"]
+            else "You are an expert AI assistant"
+        )
+        self.model_params["temperature"] = temperature
+        self.model_params["top_p"] = top_p
+        self.model_params["max_tokens"] = max_tokens
+        self.client = anthropic.Anthropic(api_key=self.api_key)
+
+
+
+    def summarize_response(self, response):
+        """Returns the first reply from the "assistant", if available"""
+        if (
+            "choices" in response
+            and isinstance(response["choices"], list)
+            and len(response["choices"]) > 0
+            and "message" in response["choices"][0]
+            and "content" in response["choices"][0]["message"]
+            and response["choices"][0]["message"]["role"] == "assistant"
+        ):
+            return response["choices"][0]["message"]["content"]
+
+        return response
+
+    def prompt(self, processed_input):
+        """
+        AnthropicModel API Implementation
+
+        Arguments
+        ---------
+        processed_input : dictionary
+            Must be a dictionary with one key "prompt", the value of which
+            must be a string.
+ + Returns + ------- + response : AnthropicModel API response + """ + + response = self.client.messages.create( + model=self.model, messages=processed_input, **self.model_params + ) + response = json.loads(response.json()) + return response diff --git a/llmebench/models/__init__.py b/llmebench/models/__init__.py index 92e44a30..7e7d8c99 100644 --- a/llmebench/models/__init__.py +++ b/llmebench/models/__init__.py @@ -5,3 +5,4 @@ from .Petals import PetalsModel from .Random import RandomModel from .VLLM import VLLMModel +from .Anthropic import AnthropicModel From 78946d7fce1cd8d5bc3aec12606d879c52d13494 Mon Sep 17 00:00:00 2001 From: Firoj Alam Date: Sun, 21 Jul 2024 14:23:31 +0300 Subject: [PATCH 02/11] fixed formatting issue --- llmebench/models/Anthropic.py | 2 -- llmebench/models/__init__.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/llmebench/models/Anthropic.py b/llmebench/models/Anthropic.py index b1de5cef..32029463 100644 --- a/llmebench/models/Anthropic.py +++ b/llmebench/models/Anthropic.py @@ -73,8 +73,6 @@ def __init__( self.model_params["max_tokens"] = max_tokens self.client = anthropic.Anthropic(api_key=self.api_key) - - def summarize_response(self, response): """Returns the first reply from the "assistant", if available""" if ( diff --git a/llmebench/models/__init__.py b/llmebench/models/__init__.py index 7e7d8c99..16b0524a 100644 --- a/llmebench/models/__init__.py +++ b/llmebench/models/__init__.py @@ -1,3 +1,4 @@ +from .Anthropic import AnthropicModel from .AzureModel import AzureModel from .FastChat import FastChatModel from .HuggingFaceInferenceAPI import HuggingFaceInferenceAPIModel, HuggingFaceTaskTypes @@ -5,4 +6,3 @@ from .Petals import PetalsModel from .Random import RandomModel from .VLLM import VLLMModel -from .Anthropic import AnthropicModel From d00370536fc3146e834a00740eff9db672164158 Mon Sep 17 00:00:00 2001 From: Firoj Alam Date: Sun, 21 Jul 2024 14:47:57 +0300 Subject: [PATCH 03/11] added anthropic package --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index eb8df2cc..97e1dc82 100644 --- a/setup.cfg +++ b/setup.cfg @@ -23,6 +23,7 @@ install_requires = datasets==2.14.6 nltk==3.8.1 openai==1.35.10 + anthropic==0.31.2 pandas==2.0.2 pooch==1.7.0 python-dotenv==1.0.0 From 6602c299c35b99bb456ed177e9f498acbf1f7516 Mon Sep 17 00:00:00 2001 From: Firoj Alam Date: Sun, 21 Jul 2024 17:26:20 +0300 Subject: [PATCH 04/11] fixed calling ModelBase --- llmebench/models/Anthropic.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/llmebench/models/Anthropic.py b/llmebench/models/Anthropic.py index 32029463..a5436de8 100644 --- a/llmebench/models/Anthropic.py +++ b/llmebench/models/Anthropic.py @@ -7,6 +7,23 @@ from llmebench.models.model_base import ModelBase +class AnthropicFailure(Exception): + """Exception class to map various failure types from the AzureModel server""" + + def __init__(self, failure_type, failure_message): + self.type_mapping = { + "processing": "Model Inference failure", + "connection": "Failed to connect to the API endpoint", + } + self.type = failure_type + self.failure_message = failure_message + + def __str__(self): + return ( + f"{self.type_mapping.get(self.type, self.type)}: \n {self.failure_message}" + ) + + class AnthropicModel(ModelBase): """ Anthropic Model interface. 
@@ -73,6 +90,10 @@ def __init__( self.model_params["max_tokens"] = max_tokens self.client = anthropic.Anthropic(api_key=self.api_key) + super(AnthropicModel, self).__init__( + retry_exceptions=(TimeoutError, AnthropicFailure), **kwargs + ) + def summarize_response(self, response): """Returns the first reply from the "assistant", if available""" if ( From e9f33b0ae200039ccd261083b9d3be1fa811afc4 Mon Sep 17 00:00:00 2001 From: Firoj Alam Date: Sun, 21 Jul 2024 19:05:44 +0300 Subject: [PATCH 05/11] updated name and info --- ...ZeroShot_en.py => ThatiARSubjectivity_Sonnet_ZeroShot_en.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename assets/ar/factuality_disinformation_harmful_content/subjectivity/{ThatiARSubjectivity_Sonet_ZeroShot_en.py => ThatiARSubjectivity_Sonnet_ZeroShot_en.py} (93%) diff --git a/assets/ar/factuality_disinformation_harmful_content/subjectivity/ThatiARSubjectivity_Sonet_ZeroShot_en.py b/assets/ar/factuality_disinformation_harmful_content/subjectivity/ThatiARSubjectivity_Sonnet_ZeroShot_en.py similarity index 93% rename from assets/ar/factuality_disinformation_harmful_content/subjectivity/ThatiARSubjectivity_Sonet_ZeroShot_en.py rename to assets/ar/factuality_disinformation_harmful_content/subjectivity/ThatiARSubjectivity_Sonnet_ZeroShot_en.py index 282f5cbc..71662c9b 100644 --- a/assets/ar/factuality_disinformation_harmful_content/subjectivity/ThatiARSubjectivity_Sonet_ZeroShot_en.py +++ b/assets/ar/factuality_disinformation_harmful_content/subjectivity/ThatiARSubjectivity_Sonnet_ZeroShot_en.py @@ -9,7 +9,7 @@ def metadata(): return { "author": "Arabic Language Technologies, QCRI, HBKU", "model": "claude-3-5-sonnet-20240620", - "description": "Anthropic model", + "description": "Anthropic model - claude-3-5-sonnet. Find more https://www.anthropic.com/news/claude-3-5-sonnet", "scores": {}, } From a358dcfbadc8235a3518fba261071cacfd5da9fb Mon Sep 17 00:00:00 2001 From: Firoj Alam Date: Wed, 24 Jul 2024 13:07:51 +0300 Subject: [PATCH 06/11] Updated cases for exception and test --- llmebench/models/Anthropic.py | 24 ++----- tests/models/test_AnthropicModel.py | 98 +++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+), 18 deletions(-) create mode 100644 tests/models/test_AnthropicModel.py diff --git a/llmebench/models/Anthropic.py b/llmebench/models/Anthropic.py index a5436de8..1b673d52 100644 --- a/llmebench/models/Anthropic.py +++ b/llmebench/models/Anthropic.py @@ -7,23 +7,6 @@ from llmebench.models.model_base import ModelBase -class AnthropicFailure(Exception): - """Exception class to map various failure types from the AzureModel server""" - - def __init__(self, failure_type, failure_message): - self.type_mapping = { - "processing": "Model Inference failure", - "connection": "Failed to connect to the API endpoint", - } - self.type = failure_type - self.failure_message = failure_message - - def __str__(self): - return ( - f"{self.type_mapping.get(self.type, self.type)}: \n {self.failure_message}" - ) - - class AnthropicModel(ModelBase): """ Anthropic Model interface. 
@@ -91,7 +74,12 @@ def __init__( self.client = anthropic.Anthropic(api_key=self.api_key) super(AnthropicModel, self).__init__( - retry_exceptions=(TimeoutError, AnthropicFailure), **kwargs + retry_exceptions=( + anthropic.RateLimitError, + anthropic.APITimeoutError, + anthropic.APIConnectionError, + ), + **kwargs, ) def summarize_response(self, response): diff --git a/tests/models/test_AnthropicModel.py b/tests/models/test_AnthropicModel.py new file mode 100644 index 00000000..17f428d4 --- /dev/null +++ b/tests/models/test_AnthropicModel.py @@ -0,0 +1,98 @@ +import unittest +from unittest.mock import patch + +from llmebench import Benchmark +from llmebench.models import AnthropicModel + +from llmebench.utils import is_fewshot_asset + + +class TestAssetsForAnthropicPrompts(unittest.TestCase): + @classmethod + def setUpClass(cls): + # Load the benchmark assets + benchmark = Benchmark(benchmark_dir="assets") + all_assets = benchmark.find_assets() + + # Filter out assets not using the Petals model + cls.assets = [ + asset + for asset in all_assets + if asset["config"]["model"] in [AnthropicModel] + ] + + def test_anthropic_prompts(self): + "Test if all assets using this model return data in an appropriate format for prompting" + # self.test_openai_prompts() + n_shots = 3 # Sample for few shot prompts + + for asset in self.assets: + with self.subTest(msg=asset["name"]): + config = asset["config"] + dataset_args = config.get("dataset_args", {}) + dataset_args["data_dir"] = "" + dataset = config["dataset"](**dataset_args) + data_sample = dataset.get_data_sample() + if is_fewshot_asset(config, asset["module"].prompt): + prompt = asset["module"].prompt( + data_sample["input"], + [data_sample for _ in range(n_shots)], + ) + else: + prompt = asset["module"].prompt(data_sample["input"]) + + self.assertIsInstance(prompt, list) + + for message in prompt: + self.assertIsInstance(message, dict) + self.assertIn("role", message) + self.assertIsInstance(message["role"], str) + self.assertIn("content", message) + self.assertIsInstance(message["content"], (str, list)) + + # Multi-modal input + if isinstance(message["content"], list): + for elem in message["content"]: + self.assertIsInstance(elem, dict) + self.assertIn("type", elem) + + if elem["type"] == "text": + self.assertIn("text", elem) + self.assertIsInstance(elem["text"], str) + elif elem["type"] == "image_url": + self.assertIn("image_url", elem) + self.assertIsInstance(elem["image_url"], dict) + self.assertIn("url", elem["image_url"]) + self.assertIsInstance(elem["image_url"]["url"], str) + + +class TestAnthropicConfig(unittest.TestCase): + def test_anthropic_config(self): + "Test if model config parameters passed as arguments are used" + model = AnthropicModel(api_key="secret-key", model_name="private-model") + self.assertEqual(model.api_key, "secret-key") + + @patch.dict( + "os.environ", + { + "ANTHROPIC_API_KEY": "secret-key", + "ANTHROPIC_MODEL": "model", + }, + ) + def test_anthropic_config_env_var(self): + "Test if model config parameters passed as environment variables are used" + model = AnthropicModel(api_key="secret-key", model_name="private-model") + self.assertEqual(model.api_key, "secret-key") + + @patch.dict( + "os.environ", + { + "ANTHROPIC_API_KEY": "secret-key", + "ANTHROPIC_MODEL": "model", + }, + ) + def test_anthropic_config_priority(self): + "Test if model config parameters passed as environment variables are used" + model = AnthropicModel(api_key="secret-key", model_name="private-model") + + self.assertEqual(model.api_key, 
"secret-key") From c6b4143986746187b1428837888bb8e5077fedaa Mon Sep 17 00:00:00 2001 From: Fahim Imaduddin Dalvi Date: Tue, 30 Jul 2024 10:31:53 +0300 Subject: [PATCH 07/11] Clean up Anthropic error handling by using built-in exceptions --- llmebench/models/Anthropic.py | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/llmebench/models/Anthropic.py b/llmebench/models/Anthropic.py index a5436de8..755af541 100644 --- a/llmebench/models/Anthropic.py +++ b/llmebench/models/Anthropic.py @@ -7,23 +7,6 @@ from llmebench.models.model_base import ModelBase -class AnthropicFailure(Exception): - """Exception class to map various failure types from the AzureModel server""" - - def __init__(self, failure_type, failure_message): - self.type_mapping = { - "processing": "Model Inference failure", - "connection": "Failed to connect to the API endpoint", - } - self.type = failure_type - self.failure_message = failure_message - - def __str__(self): - return ( - f"{self.type_mapping.get(self.type, self.type)}: \n {self.failure_message}" - ) - - class AnthropicModel(ModelBase): """ Anthropic Model interface. @@ -91,7 +74,14 @@ def __init__( self.client = anthropic.Anthropic(api_key=self.api_key) super(AnthropicModel, self).__init__( - retry_exceptions=(TimeoutError, AnthropicFailure), **kwargs + retry_exceptions=( + TimeoutError, + anthropic.APIStatusError, + anthropic.RateLimitError, + anthropic.APITimeoutError, + anthropic.APIConnectionError, + ), + **kwargs, ) def summarize_response(self, response): From fbe253bfc9c43c8992099f1fcd3bfc6d473113fd Mon Sep 17 00:00:00 2001 From: Fahim Imaduddin Dalvi Date: Tue, 30 Jul 2024 10:34:22 +0300 Subject: [PATCH 08/11] Remove unused `api_base` in AnthropicModel --- llmebench/models/Anthropic.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/llmebench/models/Anthropic.py b/llmebench/models/Anthropic.py index 755af541..030b5ef7 100644 --- a/llmebench/models/Anthropic.py +++ b/llmebench/models/Anthropic.py @@ -26,7 +26,6 @@ class AnthropicModel(ModelBase): def __init__( self, - api_base=None, api_key=None, model_name=None, timeout=20, @@ -36,7 +35,6 @@ def __init__( **kwargs, ): # API parameters - self.api_base = api_base or os.getenv("ANTHROPIC_API_URL") self.api_key = api_key or os.getenv("ANTHROPIC_API_KEY") self.model_name = model_name or os.getenv("ANTHROPIC_MODEL") From af12b30ddb52f02e2a37d07218e4a710bb0f1b90 Mon Sep 17 00:00:00 2001 From: Fahim Imaduddin Dalvi Date: Tue, 30 Jul 2024 10:35:03 +0300 Subject: [PATCH 09/11] Expand config tests --- tests/models/test_AnthropicModel.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tests/models/test_AnthropicModel.py b/tests/models/test_AnthropicModel.py index 17f428d4..84a91261 100644 --- a/tests/models/test_AnthropicModel.py +++ b/tests/models/test_AnthropicModel.py @@ -71,24 +71,27 @@ def test_anthropic_config(self): "Test if model config parameters passed as arguments are used" model = AnthropicModel(api_key="secret-key", model_name="private-model") self.assertEqual(model.api_key, "secret-key") + self.assertEqual(model.model, "private-model") @patch.dict( "os.environ", { - "ANTHROPIC_API_KEY": "secret-key", - "ANTHROPIC_MODEL": "model", + "ANTHROPIC_API_KEY": "secret-env-key", + "ANTHROPIC_MODEL": "private-env-model", }, ) def test_anthropic_config_env_var(self): "Test if model config parameters passed as environment variables are used" - model = AnthropicModel(api_key="secret-key", model_name="private-model") - 
self.assertEqual(model.api_key, "secret-key") + model = AnthropicModel() + + self.assertEqual(model.api_key, "secret-env-key") + self.assertEqual(model.model, "private-env-model") @patch.dict( "os.environ", { - "ANTHROPIC_API_KEY": "secret-key", - "ANTHROPIC_MODEL": "model", + "ANTHROPIC_API_KEY": "secret-env-key", + "ANTHROPIC_MODEL": "private-env-model", }, ) def test_anthropic_config_priority(self): @@ -96,3 +99,4 @@ def test_anthropic_config_priority(self): model = AnthropicModel(api_key="secret-key", model_name="private-model") self.assertEqual(model.api_key, "secret-key") + self.assertEqual(model.model, "private-model") From 2d7b5bc36e14285a13a5ef1cc27bcc0df154df25 Mon Sep 17 00:00:00 2001 From: Firoj Alam Date: Sun, 4 Aug 2024 22:29:02 +0300 Subject: [PATCH 10/11] updated with doc string --- llmebench/models/Anthropic.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/llmebench/models/Anthropic.py b/llmebench/models/Anthropic.py index 030b5ef7..8388733a 100644 --- a/llmebench/models/Anthropic.py +++ b/llmebench/models/Anthropic.py @@ -83,16 +83,7 @@ def __init__( ) def summarize_response(self, response): - """Returns the first reply from the "assistant", if available""" - if ( - "choices" in response - and isinstance(response["choices"], list) - and len(response["choices"]) > 0 - and "message" in response["choices"][0] - and "content" in response["choices"][0]["message"] - and response["choices"][0]["message"]["role"] == "assistant" - ): - return response["choices"][0]["message"]["content"] + """Returns the response""" return response @@ -102,13 +93,16 @@ def prompt(self, processed_input): Arguments --------- - processed_input : dictionary - Must be a dictionary with one key "prompt", the value of which - must be a string. + processed_input : list + Must be list of dictionaries, where each dictionary has two keys; + "role" defines a role in the chat (e.g. "user") and + "content" can be a list or message for that turn. The list can have object with {"type": text,"text": "text"} for text input/prompt or {"type":"image","source":{"type":"base64","media_type":"image/jpeg","data":"media_file"}},{"type":"text","text":"What is in this image?"} for multimodal (image + text) Returns ------- response : AnthropicModel API response + Response from the anthropic python library + """ response = self.client.messages.create( From 96925d2866f261e8eb231d55c13e0cd07cfc9457 Mon Sep 17 00:00:00 2001 From: Fahim Imaduddin Dalvi Date: Mon, 5 Aug 2024 14:40:14 +0300 Subject: [PATCH 11/11] Update image input tests --- llmebench/models/Anthropic.py | 5 ++++- tests/models/test_AnthropicModel.py | 17 ++++++++++++----- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/llmebench/models/Anthropic.py b/llmebench/models/Anthropic.py index 8388733a..fc3bd0f4 100644 --- a/llmebench/models/Anthropic.py +++ b/llmebench/models/Anthropic.py @@ -96,7 +96,10 @@ def prompt(self, processed_input): processed_input : list Must be list of dictionaries, where each dictionary has two keys; "role" defines a role in the chat (e.g. "user") and - "content" can be a list or message for that turn. The list can have object with {"type": text,"text": "text"} for text input/prompt or {"type":"image","source":{"type":"base64","media_type":"image/jpeg","data":"media_file"}},{"type":"text","text":"What is in this image?"} for multimodal (image + text) + "content" can be a list or message for that turn. 
If it is a list, it must contain objects matching one of the following: + - {"type": "text", "text": "....."} for text input/prompt + - {"type": "image", "source": {"type": "base64", "media_type": "image/jpeg", "data": "media_file"}} for image input + - the list can contain mix of the above formats for multimodal input (image + text) Returns ------- diff --git a/tests/models/test_AnthropicModel.py b/tests/models/test_AnthropicModel.py index 84a91261..82afad60 100644 --- a/tests/models/test_AnthropicModel.py +++ b/tests/models/test_AnthropicModel.py @@ -59,11 +59,18 @@ def test_anthropic_prompts(self): if elem["type"] == "text": self.assertIn("text", elem) self.assertIsInstance(elem["text"], str) - elif elem["type"] == "image_url": - self.assertIn("image_url", elem) - self.assertIsInstance(elem["image_url"], dict) - self.assertIn("url", elem["image_url"]) - self.assertIsInstance(elem["image_url"]["url"], str) + elif elem["type"] == "image": + self.assertIn("source", elem) + self.assertIsInstance(elem["source"], dict) + + # Current support is for base64 + self.assertIn("type", elem["source"]) + self.assertIsInstance(elem["source"]["type"], str) + self.assertIn("data", elem["source"]) + self.assertIsInstance(elem["source"]["data"], str) + + self.assertIn("media_type", elem["source"]) + self.assertIsInstance(elem["source"]["media_type"], str) class TestAnthropicConfig(unittest.TestCase):
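The snippet below is a minimal usage sketch of the AnthropicModel interface introduced by this patch series. It is not part of the patches themselves: it assumes llmebench (with these changes) and anthropic==0.31.2 are installed, that ANTHROPIC_API_KEY and ANTHROPIC_MODEL (for example claude-3-5-sonnet-20240620) are set in the environment, and that network access to the live Anthropic API is available. The message format and the response["content"][0]["text"] access follow the prompt() docstring above and the asset's post_process().

# Minimal usage sketch (assumption: valid ANTHROPIC_API_KEY / ANTHROPIC_MODEL are
# exported; the call below hits the live Anthropic API and may incur cost).
from llmebench.models import AnthropicModel

# api_key and model_name fall back to the environment variables when not passed
# explicitly; missing values raise the exceptions defined in __init__.
model = AnthropicModel(temperature=0, top_p=0.95, max_tokens=2000)

# prompt() expects a list of {"role": ..., "content": ...} turns, the same shape
# returned by the ThatiARSubjectivity_Sonnet_ZeroShot_en.py asset's prompt().
messages = [
    {
        "role": "user",
        "content": (
            "Classify the following Arabic 'sentence' as subjective or objective. "
            'Provide your response in the following JSON format: {"label": "your label"}. '
            "Please provide JSON output only.\n\nsentence: ..."
        ),
    }
]

# prompt() calls client.messages.create(...) and returns the Message object
# serialized to a dict via json.loads(response.json()).
response = model.prompt(messages)

# The generated text lives under content[0]["text"]; this is the field the
# asset's post_process() parses into the final SUBJ/OBJ label.
print(response["content"][0]["text"])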