Azure · w-javed · Oct 28, 2024 · Oct 3, 2024 · Oct 4, 2024 · Oct 3, 2024
@@ -1,9 +1,9 @@
 # Release History
 
-
 ## 1.0.0b5 (Unreleased)
 
 ### Features Added
+- Added evaluators for multimodal use cases.
 
 ### Breaking Changes
 - Renamed environment variable `PF_EVALS_BATCH_USE_ASYNC` to `AI_EVALS_BATCH_USE_ASYNC`.

@@ -2,5 +2,5 @@
   "AssetsRepo": "Azure/azure-sdk-assets",
   "AssetsRepoPrefixPath": "python",
   "TagPrefix": "python/evaluation/azure-ai-evaluation",
-  "Tag": "python/evaluation/azure-ai-evaluation_1390701e9d"
+  "Tag": "python/evaluation/azure-ai-evaluation_3eeaa3bdee"
 }
@@ -12,6 +12,14 @@
     SexualEvaluator,
     ViolenceEvaluator,
 )
+from ._evaluators._multimodal._content_safety_multimodal import (
+    ContentSafetyMultimodalEvaluator,
+    HateUnfairnessMultimodalEvaluator,
+    SelfHarmMultimodalEvaluator,
+    SexualMultimodalEvaluator,
+    ViolenceMultimodalEvaluator,
+)
+from ._evaluators._multimodal._protected_material import ProtectedMaterialMultimodalEvaluator
 from ._evaluators._f1_score import F1ScoreEvaluator
 from ._evaluators._fluency import FluencyEvaluator
 from ._evaluators._gleu import GleuScoreEvaluator
@@ -63,4 +71,10 @@
     "Conversation",
     "Message",
     "EvaluationResult",
+    "ContentSafetyMultimodalEvaluator",
+    "HateUnfairnessMultimodalEvaluator",
+    "SelfHarmMultimodalEvaluator",
+    "SexualMultimodalEvaluator",
+    "ViolenceMultimodalEvaluator",
+    "ProtectedMaterialMultimodalEvaluator"
 ]
@@ -11,10 +11,12 @@
 from urllib.parse import urlparse
 
 import jwt
+import json
 
+from promptflow.core._errors import MissingRequiredPackage
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client
-from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.ai.evaluation._model_configurations import AzureAIProject, Message
 from azure.core.credentials import TokenCredential
 from azure.core.pipeline.policies import AsyncRetryPolicy
 
@@ -26,7 +28,7 @@
     _InternalAnnotationTasks,
     _InternalEvaluationMetrics,
 )
-from .utils import get_harm_severity_level
+from .utils import get_harm_severity_level, retrieve_content_type
 
 try:
     version = importlib.metadata.version("azure-ai-evaluation")
@@ -169,7 +171,6 @@ async def submit_request(query: str, response: str, metric: str, rai_svc_url: st
     operation_id = result["location"].split("/")[-1]
     return operation_id
 
-
 async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCredential, token: str) -> Dict:
     """Fetch the annotation result from Responsible AI service
 
@@ -265,7 +266,11 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -
     if key == EvaluationMetrics.HATE_FAIRNESS:
         key = EvaluationMetrics.HATE_UNFAIRNESS
 
-    result: Dict[str, Union[str, float]] = {key: math.nan, key + "_score": math.nan, key + "_reason": ""}
+    result: Dict[str, Union[str, float]] = {
+    (key.value if hasattr(key, 'value') else key): math.nan,
+    f"{(key.value if hasattr(key, 'value') else key)}_score": math.nan,
+    f"{(key.value if hasattr(key, 'value') else key)}_reason": math.nan
+}
 
     response = batch_response[0]
     if metric_name not in response:
@@ -314,6 +319,8 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -
         reason = ""
 
     harm_score = metric_value
+    if metric_value == 'n/a':
+        return result
     if not math.isnan(metric_value):
         # int(math.nan) causes a value error, and math.nan is already handled
         # by get_harm_severity_level
@@ -442,3 +449,106 @@ async def evaluate_with_rai_service(
     result = parse_response(annotation_response, metric_name)
 
     return result
+
+def generate_payload_multimodal(content_type: str, messages, metric: str) -> Dict:
+    """Build the annotation request payload for a multimodal conversation.
+
+    :param content_type: The value placed in the payload's "ContentType" field.
+    :type content_type: str
+    :param messages: The normalized list of messages, wrapped as the single entry of "Contents".
+    :type messages: list
+    :param metric: The evaluation metric to use. It selects the annotation task and decides whether
+        a "MetricList" entry is included in the payload.
+    :type metric: str
+    :return: The payload for the annotation request.
+    :rtype: Dict
+    """
+    is_protected_material = metric == EvaluationMetrics.PROTECTED_MATERIAL
+    payload = {
+        "ContentType": content_type,
+        "Contents": [{"messages": messages}],
+        "AnnotationTask": Tasks.PROTECTED_MATERIAL if is_protected_material else Tasks.CONTENT_HARM,
+    }
+    # Protected-material requests carry no metric list; any other metric is sent as a one-item list.
+    if not is_protected_material:
+        payload["MetricList"] = [metric]
+    return payload
+
+async def submit_multimodal_request(messages, metric: str, rai_svc_url: str, token: str) -> str:
+    """Submit a multimodal annotation request to the Responsible AI service and return the operation ID.
+
+    :param messages: The list of messages to annotate. Items are either dictionaries or
+        ``azure.ai.inference.models.ChatRequestMessage`` objects; the latter are converted to dictionaries.
+    :type messages: list
+    :param metric: The evaluation metric to use.
+    :type metric: str
+    :param rai_svc_url: The Responsible AI service URL.
+    :type rai_svc_url: str
+    :param token: The Azure authentication token.
+    :type token: str
+    :return: The operation ID.
+    :rtype: str
+    :raises MissingRequiredPackage: If the first message is not a dictionary and the
+        'azure-ai-inference' package cannot be imported.
+    """
+    # Messages may be plain dictionaries (JSON payload) or strongly typed objects from the
+    # inference SDK. The SDK is imported lazily so it stays an optional dependency.
+    if len(messages) > 0 and not isinstance(messages[0], Dict):
+        try:
+            from azure.ai.inference.models import ChatRequestMessage
+        except ImportError as ex:
+            error_message = "Please install 'azure-ai-inference' package to use SystemMessage, UserMessage, AssistantMessage"
+            # Chain the original ImportError so the root cause stays visible in the traceback.
+            raise MissingRequiredPackage(message=error_message) from ex
+        if isinstance(messages[0], ChatRequestMessage):
+            messages = [message.as_dict() for message in messages]
+
+    # System messages are not sent for annotation; the content type is derived from assistant messages only.
+    filtered_messages = [message for message in messages if message["role"] != "system"]
+    assistant_messages = [message for message in messages if message["role"] == "assistant"]
+    content_type = retrieve_content_type(assistant_messages, metric)
+    payload = generate_payload_multimodal(content_type, filtered_messages, metric)
+
+    # Call the RAI service to submit the annotation request.
+    url = rai_svc_url + "/submitannotation"
+    headers = get_common_headers(token)
+    async with get_async_http_client() as client:
+        response = await client.post(  # pylint: disable=too-many-function-args,unexpected-keyword-arg
+            url, json=payload, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT
+        )
+    if response.status_code != 202:
+        print("Fail evaluating '%s' with error message: %s" % (payload["Contents"], response.text))
+        response.raise_for_status()
+    result = response.json()
+    # The operation ID is the last path segment of the returned "location" value.
+    operation_id = result["location"].split("/")[-1]
+    return operation_id
+
+async def evaluate_with_rai_service_multimodal(
+    messages, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential
+):
+    """Evaluate a multimodal conversation with the Responsible AI service.
+
+    :param messages: The list of messages to annotate.
+    :type messages: list
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param project_scope: The Azure AI project scope details.
+    :type project_scope: AzureAIProject
+    :param credential: The Azure authentication credential.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :return: The annotation result as parsed by ``parse_response``.
+    """
+    # Resolve the service endpoint and confirm the service is available before submitting anything.
+    access_token = await fetch_or_reuse_token(credential)
+    service_url = await get_rai_svc_url(project_scope, access_token)
+    # NOTE(review): availability is always checked against Tasks.CONTENT_HARM, whatever metric_name
+    # is - confirm this is intended for the protected material metric.
+    await ensure_service_availability(service_url, access_token, Tasks.CONTENT_HARM)
+
+    # Submit the annotation request, fetch its result, then parse it for the requested metric.
+    submitted_operation_id = await submit_multimodal_request(messages, metric_name, service_url, access_token)
+    raw_annotation = await fetch_result(submitted_operation_id, service_url, credential, access_token)
+    return parse_response(cast(List[Dict], raw_annotation), metric_name)
@@ -272,3 +272,33 @@ def validate_annotation(v: object, annotation: Union[str, type, object]) -> bool
         validate_annotation(v, annotations[k])
 
     return cast(T_TypedDict, o)
+
+def retrieve_content_type(assistant_messages: list, metric: str) -> str:
+    """Get the content type for the service payload.
+
+    :param assistant_messages: The assistant messages to be annotated by the evaluation service.
+        A message's "content" may be a plain string or a list of content parts.
+    :type assistant_messages: list
+    :param metric: A string representing the metric type
+    :type metric: str
+    :return: The content type: 'image' for the "protected_material" metric or when any message
+        contains an "image_url" content part, otherwise 'text'.
+    :rtype: str
+    """
+    # The protected material metric is always annotated as image content.
+    if metric == "protected_material":
+        return "image"
+
+    for message in assistant_messages or []:
+        parts = message["content"] if "content" in message else None
+        # Plain-string (or missing) content holds no image parts. Iterating a string would
+        # yield single characters, which have no ``get`` method.
+        if not isinstance(parts, (list, tuple)):
+            continue
+        for part in parts:
+            if hasattr(part, "get") and part.get("type") == "image_url":
+                return "image"
+
+    # No image part found, or no messages at all.
+    return "text"
@@ -8,6 +8,8 @@
 import tempfile
 from pathlib import Path
 from typing import Any, Dict, NamedTuple, Optional, Tuple, Union
+import uuid
+import base64
 
 import pandas as pd
 from promptflow.client import PFClient
@@ -80,6 +82,32 @@ def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWork
 
     return azure_pf_client, ws_triad
 
+def _store_multimodal_content(messages, tmpdir: str):
+    """Write inline base64 images found in the messages to files under ``<tmpdir>/images``.
+
+    Every "image_url" content part whose URL is a base64 ``data:image/<subtype>`` URI is decoded
+    and saved to a uniquely named file. The URL inside the message is then replaced, in place,
+    with the relative path ``images/<file name>``. Any other content is left untouched.
+
+    :param messages: The conversation messages to scan. They are modified in place.
+    :type messages: list
+    :param tmpdir: The directory under which the "images" folder is created.
+    :type tmpdir: str
+    """
+    # Make sure the images folder exists.
+    images_folder_path = os.path.join(tmpdir, "images")
+    os.makedirs(images_folder_path, exist_ok=True)
+
+    data_uri_prefix = "data:image/"
+    base64_marker = ";base64,"
+
+    for message in messages:
+        parts = message["content"] if "content" in message else None
+        # Plain-string content cannot hold image parts; iterating it would yield characters.
+        if not isinstance(parts, (list, tuple)):
+            continue
+        for part in parts:
+            if not isinstance(part, dict) or part.get("type") != "image_url":
+                continue
+            image_url = part.get("image_url")
+            url = image_url.get("url") if isinstance(image_url, dict) else None
+            if not isinstance(url, str) or not url.startswith(data_uri_prefix):
+                continue
+            header, marker, encoded = url.partition(base64_marker)
+            if not marker:
+                # Not base64-encoded, so there is nothing to decode.
+                continue
+
+            # Derive the file extension from the media subtype ("jpg", "jpeg", "png", ...).
+            subtype = header[len(data_uri_prefix):].split(";", 1)[0].lower()
+            if not subtype.isalnum():
+                # e.g. "svg+xml": no safe extension can be derived, leave the data inline.
+                continue
+            extension = "jpg" if subtype == "jpeg" else subtype
+
+            # Decode and write before touching the message, so a malformed payload never
+            # leaves the message pointing at a file that was not written.
+            image_data_binary = base64.b64decode(encoded)
+            image_file_name = f"{uuid.uuid4()}.{extension}"
+            image_file_path = os.path.join(images_folder_path, image_file_name)
+            with open(image_file_path, "wb") as f:
+                f.write(image_data_binary)
+
+            # Replace the base64 URL with the relative file path.
+            image_url["url"] = f"images/{image_file_name}"
 
 def _log_metrics_and_instance_results(
     metrics: Dict[str, Any],
@@ -110,6 +138,14 @@ def _log_metrics_and_instance_results(
         artifact_name = EvalRun.EVALUATION_ARTIFACT if run else EvalRun.EVALUATION_ARTIFACT_DUMMY_RUN
 
         with tempfile.TemporaryDirectory() as tmpdir:
+            # storing multi_modal images if exists
+            col_name = "inputs.conversation"
+            if col_name in instance_results.columns:
+                for key, item in instance_results[col_name].items():
+                    if "messages" in item:
+                        _store_multimodal_content(item["messages"], tmpdir)
+
+            # storing artifact result
             tmp_path = os.path.join(tmpdir, artifact_name)
 
             with open(tmp_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:

@@ -99,10 +99,10 @@ def __init__(
         self._eval_last_turn = eval_last_turn
         self._parallel = parallel
         self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
-            ViolenceEvaluator(azure_ai_project, credential),
-            SexualEvaluator(azure_ai_project, credential),
-            SelfHarmEvaluator(azure_ai_project, credential),
-            HateUnfairnessEvaluator(azure_ai_project, credential),
+            ViolenceEvaluator(credential, azure_ai_project),
+            SexualEvaluator(credential, azure_ai_project),
+            SelfHarmEvaluator(credential, azure_ai_project),
+            HateUnfairnessEvaluator(credential, azure_ai_project),
         ]
 
     def __call__(self, *, conversation: list, **kwargs):

@@ -0,0 +1,20 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+from ._content_safety_multimodal import ContentSafetyMultimodalEvaluator
+from ._content_safety_multimodal_base import ContentSafetyMultimodalEvaluatorBase
+from ._hate_unfairness import HateUnfairnessMultimodalEvaluator
+from ._self_harm import SelfHarmMultimodalEvaluator
+from ._sexual import SexualMultimodalEvaluator
+from ._violence import ViolenceMultimodalEvaluator
+from ._protected_material import ProtectedMaterialMultimodalEvaluator
+
+# Names re-exported by this package; each one is imported from a private submodule above.
+__all__ = [
+    "ContentSafetyMultimodalEvaluator",
+    "ContentSafetyMultimodalEvaluatorBase",
+    "ViolenceMultimodalEvaluator",
+    "SexualMultimodalEvaluator",
+    "SelfHarmMultimodalEvaluator",
+    "HateUnfairnessMultimodalEvaluator",
+    "ProtectedMaterialMultimodalEvaluator",
+]