diff --git a/.gitignore b/.gitignore
index 9dc0e4dfa..32a152c53 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,8 @@
 ### dotenv template
 python/.env
 
+test_results
+
 ### Go template
 # If you prefer the allow list template instead of the deny list, see community template:
 # https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore
diff --git a/python/prompts/__init__.py b/python/prompts/__init__.py
index a8c473958..918fee363 100644
--- a/python/prompts/__init__.py
+++ b/python/prompts/__init__.py
@@ -1,3 +1,5 @@
 from ._istio_crd import get_istio_crd_prompt
+from .models import IstioCrdType
 
-__all__ = ["get_istio_crd_prompt"]
+# Names re-exported as the public API of the prompts package.
+__all__ = ["get_istio_crd_prompt", "IstioCrdType"]
diff --git a/python/prompts/_istio_crd.py b/python/prompts/_istio_crd.py
index 038c1797e..4021f42e1 100644
--- a/python/prompts/_istio_crd.py
+++ b/python/prompts/_istio_crd.py
@@ -551,6 +551,7 @@ def __init__(self, crd_type: "IstioCrdType"):
         super().__init__(
             name=f"Istio {crd_type.value} Template",
             description=f"Template for generating {crd_type.value} configurations",
+            version="0.0.1",
             sections=[
                 PromptSection(
                     name="Role Definition",
diff --git a/python/prompts/base.py b/python/prompts/base.py
index 529a98b1d..ff4a7ee8b 100644
--- a/python/prompts/base.py
+++ b/python/prompts/base.py
@@ -49,11 +49,13 @@ def __init__(
         self,
         name: str,
         description: str,
+        version: str,
         sections: list[PromptSection] | None,
         variables: list[TemplateVariable] | None = None,
     ):
         self.name = name
         self.description = description
+        self.version = version
         self.sections = sections
         self.variables = variables or []
         self._validate_variables()
diff --git a/python/tests/README.md b/python/tests/README.md
new file mode 100644
index 000000000..d34bbd9c4
--- /dev/null
+++ b/python/tests/README.md
@@ -0,0 +1,115 @@
+## Artifacts
+
+The test data is split into two groups: test cases and agent configurations.
+
+### Test cases
+
+The test cases file consists of one or more test cases, each having an input prompt (this is the query that's sent directly to the agent) and the expected output, which is the response that the agent should return.
+
+```yaml
+version: "1.0"
+metadata:
+  description: "Authorization Policy Test Cases"
+
+test_cases:
+  - name: deny_post_8080
+    input: "Deny results with POST method on port 8080 on all workloads in the foo namespace"
+    category: AuthorizationPolicy
+    expected_output:
+      apiVersion: security.istio.io/v1
+      kind: AuthorizationPolicy
+      metadata:
+        name: httpbin
+        namespace: foo
+      spec:
+        action: DENY
+        rules:
+          - to:
+              - operation:
+                  methods:
+                    - POST
+                  ports:
+                    - "8080"
+```
+
+### Agent configuration
+
+The agent configuration specifies the system prompt for the agent. This is the context that the agent uses to generate the response. 
+
+```yaml
+version: "1.0"
+name: "istio_authpolicy_crd_agent"
+metadata:
+  description: "Agent for generating Istio Authorization Policy CRDs"
+  version: "0.0.1"
+
+system_messages:
+  - |
+    You're an Istio CRD agent. You modify or create a new JSON based on the UQ. The JSON must conform to the PROTO SPEC. The response must only include one or more AuthorizationPolicy resource type.
+
+    PROTO...
+```
+## Running tests
+
+To run the tests, pass in the test cases file and the agent file, and specify the model you want to use:
+
+```bash
+uv run main.py run test_cases/authpolicy_test_cases.yaml agents/istio_crd_agent_0.yaml --model gpt-4o-mini
+```
+
+Each run is saved to the `test_results` directory as `results_<timestamp>_<model>.json`. Once you've created the baseline results (or first test results), you can modify the prompt or model and run the tests again to see if the results change.
+
+To check for changes in the results, you can use the `compare` command:
+
+```bash
+uv run main.py compare test_results/results1.json test_results/results2.json
+```
+
+The `compare` command will output the differences between the two test results files and provide a summary of the changes between the two runs:
+
+```console
+Comparing results_20250122_153628.json with results_20250122_153642.json
+Model changed: True
+Prompt changed: False
+
+Analyzing 2 test cases:
+
+=== Test 1 ===
+Input: Deny results with POST method on port 8080 on all workloads in the foo namespace
+Duration delta: 6569.08ms
+Similarity: 95.46% → 95.46% (Δ: +0.00%)
+No differences in output
+
+=== Test 2 ===
+Input: Allow GET requests on port 3000 for service-a in the bar namespace
+Duration delta: 3134.77ms
+Similarity: 87.52% → 83.91% (Δ: -3.62%)
+Output differences:
+--- results_20250122_153628.json (Test 2)
++++ results_20250122_153642.json (Test 2)
+@@ -2,16 +2,16 @@
+   "apiVersion": "security.istio.io/v1",
+   "kind": "AuthorizationPolicy",
+   "metadata": {
+-    "name": "allow-get-service-a",
++    "name": "allow-get-port-3000",
+     "namespace": "bar"
+   },
+   "spec": {
++    "action": "ALLOW",
+     "selector": {
+-      "matchLabels": {
++      "match_labels": {
+         "app": "service-a"
+       }
+     },
+-    "action": "ALLOW",
+     "rules": [
+       {
+         "to": [
+
+=== Summary Statistics ===
+Tests with differences: 1 of 2
+Average similarity delta: -1.81%
+Average duration delta: +4851.93ms
+```
\ No newline at end of file
diff --git a/python/tests/__init__.py b/python/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/tests/agent_tester.py b/python/tests/agent_tester.py
new file mode 100644
index 000000000..ce5695dd7
--- /dev/null
+++ b/python/tests/agent_tester.py
@@ -0,0 +1,115 @@
+from autogen_agentchat.agents import AssistantAgent
+from schema import TestCase, TestRunResult, TestResult
+from datetime import datetime
+from pathlib import Path
+from dataclasses import asdict
+import json
+import hashlib
+import difflib
+import re
+import time
+
+class AgentTester:
+    """Run test cases against an agent, score the outputs and save the run as JSON."""
+
+    def __init__(self, agent: "AssistantAgent", test_cases: list[TestCase], results_dir: str = "test_results"):
+        """Store the agent and test cases and create the results directory.
+
+        Args:
+            agent: Agent whose ``run`` coroutine is awaited once per test case.
+            test_cases: Cases supplying ``input``, ``category`` and ``expected_output``.
+            results_dir: Directory the results file is written to.
+        """
+        self.agent = agent
+        self.test_cases = test_cases
+        self.results_dir = Path(results_dir)
+        # parents=True so a nested results_dir (e.g. "out/test_results") also works
+        self.results_dir.mkdir(parents=True, exist_ok=True)
+
+    async def run_tests(self) -> TestRunResult:
+        """Run every test case in order, save the run to disk and return it.
+
+        Raises:
+            json.JSONDecodeError: If the agent's final message is not valid JSON.
+        """
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+
+        # NOTE(review): reads private AssistantAgent attributes (_model_client,
+        # _system_messages, _tools) - confirm they exist in the pinned autogen version.
+        model = self.agent._model_client._to_config().model_dump().get('model')
+
+        # Get all system messages once for the entire test run
+        system_msg = "\n".join([msg.content for msg in self.agent._system_messages])
+
+        # Create config once for the entire test run
+        config = {
+            "agent_name": self.agent.name,
+            "tools": "\n".join([tool.name for tool in self.agent._tools]),
+            "model": model,
+            "prompt": system_msg,
+            "prompt_hash": hashlib.sha256(system_msg.encode('utf-8')).hexdigest(),
+        }
+
+        results = []
+        for test_case in self.test_cases:
+            print(".", end="", flush=True)
+            # perf_counter is monotonic, so the duration is immune to clock adjustments
+            start = time.perf_counter()
+            response = await self.agent.run(task=test_case.input)
+            duration_ms = (time.perf_counter() - start) * 1000
+
+            # The last message holds the agent's answer; parse it once and reuse the
+            # parsed value for both the similarity score and the stored result.
+            actual_output = json.loads(response.messages[-1].content)
+
+            results.append(
+                TestResult(
+                    category=test_case.category,
+                    input=test_case.input,
+                    expected_output=test_case.expected_output,
+                    actual_output=actual_output,
+                    duration_ms=duration_ms,
+                    similarity=self._calculate_similarity(test_case.expected_output, actual_output),
+                )
+            )
+
+        print()
+        # Create the test run result that combines config and individual results
+        test_run_result = TestRunResult(
+            timestamp=timestamp,
+            config=config,
+            results=results
+        )
+
+        # Save results
+        self._save_results(test_run_result)
+
+        return test_run_result
+
+    def _calculate_similarity(self, dict1: dict, dict2: dict) -> float:
+        """Return the similarity (0-100) of two JSON-serializable values.
+
+        Both values are dumped with sorted keys, so key order does not affect the score.
+        """
+        str1 = json.dumps(dict1, sort_keys=True)
+        str2 = json.dumps(dict2, sort_keys=True)
+
+        return difflib.SequenceMatcher(None, str1, str2).ratio() * 100
+
+    def _save_results(self, test_run_result: TestRunResult):
+        """Write the run to ``results_<timestamp>_<model>.json`` inside the results directory."""
+        # Convert results to JSON-serializable format
+        results_dict = {
+            "timestamp": test_run_result.timestamp,
+            "config": test_run_result.config,
+            "results": [asdict(result) for result in test_run_result.results]
+        }
+
+        # Model names may contain path separators or colons (e.g. "org/model:tag"),
+        # which are not valid in a file name, so replace anything unusual with "_".
+        model_slug = re.sub(r"[^A-Za-z0-9._-]+", "_", str(test_run_result.config.get('model')))
+        results_file = self.results_dir / f"results_{test_run_result.timestamp}_{model_slug}.json"
+        with open(results_file, "w", encoding="utf-8") as f:
+            json.dump(results_dict, f, indent=2)
+
+        print(f"Results saved to: {results_file}")
diff --git a/python/tests/agents/istio_crd_agent_0.yaml b/python/tests/agents/istio_crd_agent_0.yaml
new file mode 100644
index 000000000..e8b31662f
--- /dev/null
+++ b/python/tests/agents/istio_crd_agent_0.yaml
@@ -0,0 +1,35 @@
+version: "1.0"
+name: "istio_authpolicy_crd_agent"
+metadata:
+  description: "Agent for generating Istio Authorization Policy CRDs (no protos)"
+  version: "0.0.1"
+
+system_messages:
+  - |
+    You're an Istio CRD agent. You modify or create a new JSON based on the UQ. The response must only include one or more AuthorizationPolicy resource type.
+
+    EXAMPLES:
+    UQ: Deny requests from dev namespace to POST method on all workloads in the foo namespace
+    JSON: {"apiVersion": "security.istio.io/v1", "kind": "AuthorizationPolicy", "metadata": {"name": "deny-dev-post", "namespace": "foo"}, "spec": {"action": "DENY", "rules": [{"from": [{"source": {"namespaces": ["dev"]}}], "to": [{"operation": {"methods": ["POST"]}}]}]}}
+
+    UQ: Create a deny policy to deny all requests with POST method on port 8080 on all workloads in the foo namespace
+    JSON: {"apiVersion": "security.istio.io/v1", "kind": "AuthorizationPolicy", "metadata": {"name": "deny-post-8080", "namespace": "foo"}, "spec": {"action": "DENY", "rules": [{"to": [{"operation": {"methods": ["POST"], "ports": ["8080"]}}]}]}}
+
+    UQ: Audit any GET requests to the path with the prefix /user/profile
+    JSON: {"apiVersion": "security.istio.io/v1", "kind": "AuthorizationPolicy", "metadata": {"name": "audit-user-profile", "namespace": "ns1"}, "spec": {"selector": {"matchLabels": {"app": "myapi"}}, "action": "AUDIT", "rules": [{"to": [{"operation": {"methods": ["GET"], "paths": ["/user/profile/*"]}}]}]}}
+
+    UQ: Deny all requests to workloads in namespace foo
+    JSON: {"apiVersion": "security.istio.io/v1", "kind": "AuthorizationPolicy", "metadata": {"name": "deny-all", "namespace": "foo"}, "spec": {}}
+
+    UQ: Allow all requests to workloads in namespace foo
+    JSON: {"apiVersion": "security.istio.io/v1", "kind": "AuthorizationPolicy", "metadata": {"name": "allow-all", "namespace": "foo"}, "spec": {"rules": [{}]}}
+
+    UQ: Allow requests to workloads labeled with app=customers in the customers namespace if the request is from the service account cluster.local/ns/orders/orders or from the payments namespace, and the request header "foo" has the value "bar" or the request header "user" has the value "peterj".
+    JSON: {"apiVersion": "security.istio.io/v1", "kind": "AuthorizationPolicy", "metadata": {"name": "allow-customers", "namespace": "customers"}, "spec": {"action": "ALLOW", "selector": {"matchLabels": {"app": "customers"}}, "rules": [{"from": [{"source": {"principals": ["cluster.local/ns/orders/sa/orders"]}}, {"source": {"namespaces": ["payments"]}}], "when": [{"key": "request.headers[foo]", "values": ["bar"]}]}, {"from": [{"source": {"principals": ["cluster.local/ns/orders/sa/orders"]}}, {"source": {"namespaces": ["payments"]}}], "when": [{"key": "request.headers[user]", "values": ["peterj"]}]}]}}
+
+    UQ: Allow IP address 1.2.3.4 and IPs from block 5.6.7.0/24 to access the apps labeled with app=payments.
+    JSON: {"apiVersion": "security.istio.io/v1", "kind": "AuthorizationPolicy", "metadata": {"name": "ingress-policy", "namespace": "foo"}, "spec": {"selector": {"matchLabels": {"app": "payments"}}, "action": "ALLOW", "rules": [{"from": [{"source": {"ipBlocks": ["1.2.3.4", "5.6.7.0/24"]}}]}]}}
+
+    UQ: Apply the policy to all workloads in the foo namespace and allows GET requests to prefix /info or POST requests to /data for workloads using cluster.local/ns/default/sleep service account or workloads in test namespace when the issuer claim is set to https://accounts.google.com
+    JSON: {"apiVersion": "security.istio.io/v1", "kind": "AuthorizationPolicy", "metadata": {"name": "allow-info-data", "namespace": "foo"}, "spec": {"action": "ALLOW", "rules": [{"from": [{"source": {"principals": ["cluster.local/ns/default/sa/sleep"]}}, {"source": {"namespaces": ["test"]}}], "to": [{"operation": {"methods": ["GET"], "paths": ["/info*"]}}, {"operation": {"methods": ["POST"], "paths": ["/data"]}}], "when": [{"key": "request.auth.claims[iss]", "values": ["https://accounts.google.com"]}]}]}}
+tools: []
\ No newline at end of file
diff --git a/python/tests/agents/istio_crd_agent_1.yaml b/python/tests/agents/istio_crd_agent_1.yaml
new file mode 100644
index 000000000..c0facbd9e
--- /dev/null
+++ b/python/tests/agents/istio_crd_agent_1.yaml
@@ -0,0 +1,376 @@
+version: "1.0"
+name: "istio_authpolicy_crd_agent"
+metadata:
+  description: "Agent for generating Istio Authorization Policy CRDs"
+  version: "0.0.1"
+
+system_messages:
+  - |
+    You're an Istio CRD agent. You modify or create a new JSON based on the UQ. The JSON must conform to the PROTO SPEC. The response must only include one or more AuthorizationPolicy resource type.
+
+    PROTO SPEC:
+
+    // WorkloadSelector specifies the criteria used to determine if a policy can be applied
+    // to a proxy. The matching criteria includes the metadata associated with a proxy,
+    // workload instance info such as labels attached to the pod/VM, or any other info
+    // that the proxy provides to Istio during the initial handshake. If multiple conditions are
+    // specified, all conditions need to match in order for the workload instance to be
+    // selected. Currently, only label based selection mechanism is supported.
+    message WorkloadSelector {
+      // One or more labels that indicate a specific set of pods/VMs
+      // on which a policy should be applied. The scope of label search is restricted to
+      // the configuration namespace in which the resource is present.
+      map<string, string> match_labels = 1;
+    }
+
+    // PortSelector is the criteria for specifying if a policy can be applied to 
+    // a listener having a specific port.
+    message PortSelector {
+      // Port number
+      uint32 number = 1  [(google.api.field_behavior) = REQUIRED];
+    }
+
+    // WorkloadMode allows selection of the role of the underlying workload in
+    // network traffic. A workload is considered as acting as a SERVER if it is
+    // the destination of the traffic (that is, traffic direction, from the
+    // perspective of the workload is *inbound*). If the workload is the source of
+    // the network traffic, it is considered to be in CLIENT mode (traffic is
+    // *outbound* from the workload).
+    enum WorkloadMode {
+      // Default value, which will be interpreted by its own usage.
+      UNDEFINED = 0;
+
+      // Selects for scenarios when the workload is the
+      // source of the network traffic. In addition, 
+      // if the workload is a gateway, selects this.
+      CLIENT = 1;
+
+      // Selects for scenarios when the workload is the
+      // destination of the network traffic.
+      SERVER = 2;
+
+      // Selects for scenarios when the workload is either the
+      // source or destination of the network traffic.
+      CLIENT_AND_SERVER = 3;
+    }
+
+    // PolicyTargetReference format as defined by [GEP-2648](https://gateway-api.sigs.k8s.io/geps/gep-2648/#direct-policy-design-rules).
+    //
+    // PolicyTargetReference specifies the targeted resource which the policy
+    // should be applied to. It must only target a single resource at a time, but it
+    // can be used to target larger resources such as Gateways that may apply to
+    // multiple child resources. The PolicyTargetReference will be used instead of
+    // a WorkloadSelector in the RequestAuthentication, AuthorizationPolicy,
+    // Telemetry, and WasmPlugin CRDs to target a Kubernetes Gateway.
+
+    message PolicyTargetReference {
+      // group is the group of the target resource.
+      string group = 1;
+
+      // kind is kind of the target resource.
+      string kind = 2 [(google.api.field_behavior) = REQUIRED];
+
+      // name is the name of the target resource.
+      // +kubebuilder:validation:MinLength=1
+      // +kubebuilder:validation:MaxLength=253
+      string name = 3 [(google.api.field_behavior) = REQUIRED];
+
+      // namespace is the namespace of the referent. When unspecified, the local
+      // namespace is inferred.
+      // +kubebuilder:validation:XValidation:message="cross namespace referencing is not currently supported",rule="self.size() == 0"
+      string namespace = 4;
+    }
+
+
+    // Istio Authorization Policy enables access control on workloads in the mesh.
+    //
+    // Authorization policy supports CUSTOM, DENY and ALLOW actions for access control. When CUSTOM, DENY and ALLOW actions
+    // are used for a workload at the same time, the CUSTOM action is evaluated first, then the DENY action, and finally the ALLOW action.
+    // The evaluation is determined by the following rules:
+    //
+    // 1. If there are any CUSTOM policies that match the request, evaluate and deny the request if the evaluation result is deny.
+    // 2. If there are any DENY policies that match the request, deny the request.
+    // 3. If there are no ALLOW policies for the workload, allow the request.
+    // 4. If any of the ALLOW policies match the request, allow the request.
+    // 5. Deny the request.
+    //
+    // Istio Authorization Policy also supports the AUDIT action to decide whether to log requests.
+    // AUDIT policies do not affect whether requests are allowed or denied to the workload.
+    // Requests will be allowed or denied based solely on CUSTOM, DENY and ALLOW actions.
+
+    package istio.security.v1beta1;
+
+    option go_package="istio.io/api/security/v1beta1";
+
+    // AuthorizationPolicy enables access control on workloads.
+    message AuthorizationPolicy {
+      // Optional. The selector decides where to apply the authorization policy. The selector will match with workloads
+      // in the same namespace as the authorization policy. If the authorization policy is in the root namespace, the selector
+      // will additionally match with workloads in all namespaces.
+      //
+      // If the selector and the targetRef are not set, the selector will match all workloads.
+      //
+      // At most one of selector or targetRefs can be set for a given policy.
+      istio.type.v1beta1.WorkloadSelector selector = 1;
+
+      // Optional. The targetRefs specifies a list of resources the policy should be
+      // applied to. The targeted resources specified will determine which workloads
+      // the policy applies to.
+      //
+      // Currently, the following resource attachment types are supported:
+      // * kind: Gateway with group: gateway.networking.k8s.io in the same namespace.
+      // * kind: Service with "" in the same namespace. This type is only supported for waypoints.
+      //
+      // If not set, the policy is applied as defined by the selector.
+      // At most one of the selector and targetRefs can be set.
+      //
+      // NOTE: If you are using the targetRefs field in a multi-revision environment with Istio versions prior to 1.22,
+      // it is highly recommended that you pin the policy to a revision running 1.22+ via the istio.io/rev label.
+      // This is to prevent proxies connected to older control planes (that don't know about the targetRefs field)
+      // from misinterpreting the policy as namespace-wide during the upgrade process.
+      //
+      // NOTE: Waypoint proxies are required to use this field for policies to apply; selector policies will be ignored.
+      repeated istio.type.v1beta1.PolicyTargetReference targetRefs = 6;
+
+      // Optional. A list of rules to match the request. A match occurs when at least one rule matches the request.
+      //
+      // If not set, the match will never occur. This is equivalent to setting a default of deny for the target workloads if
+      // the action is ALLOW.
+      repeated Rule rules = 2;
+
+      // Action specifies the operation to take.
+      enum Action {
+        // Allow a request only if it matches the rules. This is the default type.
+        ALLOW = 0;
+
+        // Deny a request if it matches any of the rules.
+        DENY = 1;
+
+        // Audit a request if it matches any of the rules.
+        AUDIT = 2;
+
+        CUSTOM = 3;
+      }
+
+      // Optional. The action to take if the request is matched with the rules. Default is ALLOW if not specified.
+      Action action = 3;
+
+      message ExtensionProvider {
+        // Specifies the name of the extension provider. The list of available providers is defined in the MeshConfig.
+        // Note, currently at most 1 extension provider is allowed per workload. Different workloads can use different extension provider.
+        string name = 1;
+      }
+
+      oneof action_detail {
+        // Specifies detailed configuration of the CUSTOM action. Must be used only with CUSTOM action.
+        ExtensionProvider provider = 4;
+      }
+    }
+
+    // Rule matches requests from a list of sources that perform a list of operations subject to a
+    // list of conditions. A match occurs when at least one source, one operation and all conditions
+    // matches the request. An empty rule is always matched.
+    //
+    // Any string field in the rule supports Exact, Prefix, Suffix and Presence match:
+    //
+    // - Exact match: abc will match on value abc.
+    // - Prefix match: abc* will match on value abc and abcd.
+    // - Suffix match: *abc will match on value abc and xabc.
+    // - Presence match: * will match when value is not empty.
+    message Rule {
+      // From includes a list of sources.
+      message From {
+        // Source specifies the source of a request.
+        Source source = 1;
+      }
+
+      // Optional. from specifies the source of a request.
+      //
+      // If not set, any source is allowed.
+      repeated From from = 1;
+
+      // To includes a list of operations.
+      message To {
+        // Operation specifies the operation of a request.
+        Operation operation = 1;
+      }
+
+      // Optional. to specifies the operation of a request.
+      //
+      // If not set, any operation is allowed.
+      repeated To to = 2;
+
+      // Optional. when specifies a list of additional conditions of a request.
+      //
+      // If not set, any condition is allowed.
+      repeated Condition when = 3;
+    }
+
+    // Source specifies the source identities of a request. Fields in the source are
+    // ANDed together.
+    //
+    // For example, the following source matches if the principal is admin or dev
+    // and the namespace is prod or test and the ip is not 203.0.113.4.
+    //
+    // yaml
+    // principals: ["admin", "dev"]
+    // namespaces: ["prod", "test"]
+    // notIpBlocks: ["203.0.113.4"]
+    // 
+    message Source {
+      // Optional. A list of peer identities derived from the peer certificate. The peer identity is in the format of
+      // "<TRUST_DOMAIN>/ns/<NAMESPACE>/sa/<SERVICE_ACCOUNT>", for example, "cluster.local/ns/default/sa/productpage".
+      // This field requires mTLS enabled and is the same as the source.principal attribute.
+      //
+      // If not set, any principal is allowed.
+      repeated string principals = 1;
+
+      // Optional. A list of negative match of peer identities.
+      repeated string not_principals = 5;
+
+      // Optional. A list of request identities derived from the JWT. The request identity is in the format of
+      // "<ISS>/<SUB>", for example, "example.com/sub-1". This field requires request authentication enabled and is the
+      // same as the request.auth.principal attribute.
+      //
+      // If not set, any request principal is allowed.
+      repeated string request_principals = 2;
+
+      // Optional. A list of negative match of request identities.
+      repeated string not_request_principals = 6;
+
+      // Optional. A list of namespaces derived from the peer certificate.
+      // This field requires mTLS enabled and is the same as the source.namespace attribute.
+      //
+      // If not set, any namespace is allowed.
+      repeated string namespaces = 3;
+
+      // Optional. A list of negative match of namespaces.
+      repeated string not_namespaces = 7;
+
+      // Optional. A list of IP blocks, populated from the source address of the IP packet. Single IP (e.g. 203.0.113.4) and
+      // CIDR (e.g. 203.0.113.0/24) are supported. This is the same as the source.ip attribute.
+      //
+      // If not set, any IP is allowed.
+      repeated string ip_blocks = 4;
+
+      // Optional. A list of negative match of IP blocks.
+      repeated string not_ip_blocks = 8;
+
+      // Optional. A list of IP blocks, populated from X-Forwarded-For header or proxy protocol.
+      // To make use of this field, you must configure the numTrustedProxies field of the gatewayTopology under the meshConfig
+      // when you install Istio or using an annotation on the ingress gateway.  See the documentation here:
+      // [Configuring Gateway Network Topology](https://istio.io/latest/docs/ops/configuration/traffic-management/network-topologies/).
+      // Single IP (e.g. 203.0.113.4) and CIDR (e.g. 203.0.113.0/24) are supported.
+      // This is the same as the remote.ip attribute.
+      //
+      // If not set, any IP is allowed.
+      repeated string remote_ip_blocks = 9;
+
+      // Optional. A list of negative match of remote IP blocks.
+      repeated string not_remote_ip_blocks = 10;
+    }
+
+    // Operation specifies the operations of a request. Fields in the operation are
+    // ANDed together.
+    //
+    // For example, the following operation matches if the host has suffix .example.com
+    // and the method is GET or HEAD and the path doesn't have prefix /admin.
+    //
+    // yaml
+    // hosts: ["*.example.com"]
+    // methods: ["GET", "HEAD"]
+    // notPaths: ["/admin*"]
+    // 
+    message Operation {
+      // Optional. A list of hosts as specified in the HTTP request. The match is case-insensitive.
+      // See the [security best practices](https://istio.io/latest/docs/ops/best-practices/security/#writing-host-match-policies) for
+      // recommended usage of this field.
+      //
+      // If not set, any host is allowed. Must be used only with HTTP.
+      repeated string hosts = 1;
+
+      // Optional. A list of negative match of hosts as specified in the HTTP request. The match is case-insensitive.
+      repeated string not_hosts = 5;
+
+      // Optional. A list of ports as specified in the connection.
+      //
+      // If not set, any port is allowed.
+      repeated string ports = 2;
+
+      // Optional. A list of negative match of ports as specified in the connection.
+      repeated string not_ports = 6;
+
+      // Optional. A list of methods as specified in the HTTP request.
+      // For gRPC service, this will always be POST.
+      //
+      // If not set, any method is allowed. Must be used only with HTTP.
+      repeated string methods = 3;
+
+      // Optional. A list of negative match of methods as specified in the HTTP request.
+      repeated string not_methods = 7;
+
+      // Optional. A list of paths as specified in the HTTP request. See the [Authorization Policy Normalization](https://istio.io/latest/docs/reference/config/security/normalization/)
+      // for details of the path normalization.
+      // For gRPC service, this will be the fully-qualified name in the form of /package.service/method.
+      //
+      // If a path in the list contains the {*} or {**} path template operator, it will be interpreted as an [Envoy Uri Template](https://www.envoyproxy.io/docs/envoy/latest/api-v3/extensions/path/match/uri_template/v3/uri_template_match.proto).
+      // To be a valid path template, the path must not contain *, {, or } outside of a supported operator. No other characters are allowed in the path segment with the path template operator.
+      // - {*} matches a single glob that cannot extend beyond a path segment.
+      // - {**} matches zero or more globs. If a path contains {**}, it must be the last operator.
+      //
+      // Examples:
+      // - /foo/{*} matches /foo/bar but not /foo/bar/baz
+      // - /foo/{**}/ matches /foo/bar/, /foo/bar/baz.txt, and /foo// but not /foo/bar
+      // - /foo/{*}/bar/{**} matches /foo/buzz/bar/ and /foo/buzz/bar/baz
+      // - /*/baz/{*} is not a valid path template since it includes * outside of a supported operator
+      // - /**/baz/{*} is not a valid path template since it includes ** outside of a supported operator
+      // - /{**}/foo/{*} is not a valid path template since {**} is not the last operator
+      // - /foo/{*}.txt is invalid since there are characters other than {*} in the path segment
+      //
+      // If not set, any path is allowed. Must be used only with HTTP.
+      repeated string paths = 4;
+
+      // Optional. A list of negative match of paths.
+      repeated string not_paths = 8;
+    }
+
+    // Condition specifies additional required attributes.
+    message Condition {
+      // The name of an Istio attribute.
+      // See the [full list of supported attributes](https://istio.io/docs/reference/config/security/conditions/).
+      string key = 1 [(google.api.field_behavior) = REQUIRED];
+
+      // Optional. A list of allowed values for the attribute.
+      // Note: at least one of values or notValues must be set.
+      repeated string values = 2;
+
+      // Optional. A list of negative match of values for the attribute.
+      // Note: at least one of values or notValues must be set.
+      repeated string not_values = 3;
+    };
+
+    EXAMPLES:
+    UQ: Deny requests from dev namespace to POST method on all workloads in the foo namespace
+    JSON: {"apiVersion": "security.istio.io/v1", "kind": "AuthorizationPolicy", "metadata": {"name": "deny-dev-post", "namespace": "foo"}, "spec": {"action": "DENY", "rules": [{"from": [{"source": {"namespaces": ["dev"]}}], "to": [{"operation": {"methods": ["POST"]}}]}]}}
+
+    UQ: Create a deny policy to deny all requests with POST method on port 8080 on all workloads in the foo namespace
+    JSON: {"apiVersion": "security.istio.io/v1", "kind": "AuthorizationPolicy", "metadata": {"name": "deny-post-8080", "namespace": "foo"}, "spec": {"action": "DENY", "rules": [{"to": [{"operation": {"methods": ["POST"], "ports": ["8080"]}}]}]}}
+
+    UQ: Audit any GET requests to the path with the prefix /user/profile
+    JSON: {"apiVersion": "security.istio.io/v1", "kind": "AuthorizationPolicy", "metadata": {"name": "audit-user-profile", "namespace": "ns1"}, "spec": {"selector": {"matchLabels": {"app": "myapi"}}, "action": "AUDIT", "rules": [{"to": [{"operation": {"methods": ["GET"], "paths": ["/user/profile/*"]}}]}]}}
+
+    UQ: Deny all requests to workloads in namespace foo
+    JSON: {"apiVersion": "security.istio.io/v1", "kind": "AuthorizationPolicy", "metadata": {"name": "deny-all", "namespace": "foo"}, "spec": {}}
+
+    UQ: Allow all requests to workloads in namespace foo
+    JSON: {"apiVersion": "security.istio.io/v1", "kind": "AuthorizationPolicy", "metadata": {"name": "allow-all", "namespace": "foo"}, "spec": {"rules": [{}]}}
+
+    UQ: Allow requests to workloads labeled with app=customers in the customers namespace if the request is from the service account cluster.local/ns/orders/orders or from the payments namespace, and the request header "foo" has the value "bar" or the request header "user" has the value "peterj".
+    JSON: {"apiVersion": "security.istio.io/v1", "kind": "AuthorizationPolicy", "metadata": {"name": "allow-customers", "namespace": "customers"}, "spec": {"action": "ALLOW", "selector": {"matchLabels": {"app": "customers"}}, "rules": [{"from": [{"source": {"principals": ["cluster.local/ns/orders/sa/orders"]}}, {"source": {"namespaces": ["payments"]}}], "when": [{"key": "request.headers[foo]", "values": ["bar"]}]}, {"from": [{"source": {"principals": ["cluster.local/ns/orders/sa/orders"]}}, {"source": {"namespaces": ["payments"]}}], "when": [{"key": "request.headers[user]", "values": ["peterj"]}]}]}}
+
+    UQ: Allow IP address 1.2.3.4 and IPs from block 5.6.7.0/24 to access the apps labeled with app=payments.
+    JSON: {"apiVersion": "security.istio.io/v1", "kind": "AuthorizationPolicy", "metadata": {"name": "ingress-policy", "namespace": "foo"}, "spec": {"selector": {"matchLabels": {"app": "payments"}}, "action": "ALLOW", "rules": [{"from": [{"source": {"ipBlocks": ["1.2.3.4", "5.6.7.0/24"]}}]}]}}
+
+    UQ: Apply the policy to all workloads in the foo namespace and allows GET requests to prefix /info or POST requests to /data for workloads using cluster.local/ns/default/sleep service account or workloads in test namespace when the issuer claim is set to https://accounts.google.com
+    JSON: {"apiVersion": "security.istio.io/v1", "kind": "AuthorizationPolicy", "metadata": {"name": "allow-info-data", "namespace": "foo"}, "spec": {"action": "ALLOW", "rules": [{"from": [{"source": {"principals": ["cluster.local/ns/default/sa/sleep"]}}, {"source": {"namespaces": ["test"]}}], "to": [{"operation": {"methods": ["GET"], "paths": ["/info*"]}}, {"operation": {"methods": ["POST"], "paths": ["/data"]}}], "when": [{"key": "request.auth.claims[iss]", "values": ["https://accounts.google.com"]}]}]}}
+tools: []
\ No newline at end of file
diff --git a/python/tests/comparator.py b/python/tests/comparator.py
new file mode 100644
index 000000000..7b3b7a0f6
--- /dev/null
+++ b/python/tests/comparator.py
@@ -0,0 +1,273 @@
+import difflib
+import json
+from pathlib import Path
+from typing import Any
+
+
+def analyze_results_command(results_file: Path) -> dict[str, Any]:
+    """Score one results file against its expected outputs, print a report, return the stats.
+
+    Args:
+        results_file: JSON run file providing ``config.model`` and a ``results`` list.
+    Returns:
+        Dict with ``file``, ``model``, ``total_tests``, ``test_analyses`` and ``summary``.
+    Raises:
+        FileNotFoundError: If ``results_file`` does not exist.
+    """
+    if not results_file.exists():
+        raise FileNotFoundError(f"Results file not found: {results_file}")
+
+    # Explicit UTF-8 so reading the JSON does not depend on the platform default encoding.
+    with open(results_file, encoding="utf-8") as f:
+        run = json.load(f)
+
+    test_analyses = []
+    total_similarity = 0
+    total_duration = 0
+    similarity_ranges = {"excellent": 0, "good": 0, "fair": 0, "poor": 0}  # >=90, >=75, >=50, <50
+
+    for test in run["results"]:
+        # Similarity is the SequenceMatcher ratio over key-sorted JSON, as a percentage.
+        expected_str = json.dumps(test["expected_output"], sort_keys=True)
+        actual_str = json.dumps(test["actual_output"], sort_keys=True)
+        similarity = difflib.SequenceMatcher(None, expected_str, actual_str).ratio() * 100
+
+        # Track similarity distribution
+        if similarity >= 90:
+            similarity_ranges["excellent"] += 1
+        elif similarity >= 75:
+            similarity_ranges["good"] += 1
+        elif similarity >= 50:
+            similarity_ranges["fair"] += 1
+        else:
+            similarity_ranges["poor"] += 1
+
+        total_similarity += similarity
+        total_duration += test["duration_ms"]
+
+        # Diff key-sorted dumps (like the score) so key order alone never shows as a change.
+        differences = []
+        if test["actual_output"] != test["expected_output"]:
+            diff = difflib.unified_diff(
+                json.dumps(test["expected_output"], indent=2, sort_keys=True).splitlines(),
+                json.dumps(test["actual_output"], indent=2, sort_keys=True).splitlines(),
+                fromfile="expected",
+                tofile="actual",
+                lineterm=""
+            )
+            differences = list(diff)
+
+        test_analyses.append({
+            "category": test["category"],
+            "input": test["input"],
+            "similarity": similarity,
+            "duration_ms": test["duration_ms"],
+            "differences": differences
+        })
+
+    num_tests = len(test_analyses)
+    analysis_results = {
+        "file": results_file.name,
+        "model": run["config"]["model"],
+        "total_tests": num_tests,
+        "test_analyses": test_analyses,
+        "summary": {
+            "avg_similarity": total_similarity / num_tests if num_tests > 0 else 0,
+            "avg_duration": total_duration / num_tests if num_tests > 0 else 0,
+            "similarity_distribution": similarity_ranges,
+            "similarity_by_category": {}
+        }
+    }
+
+    # Calculate average similarity by category
+    category_totals = {}
+    category_counts = {}
+    for analysis in test_analyses:
+        cat = analysis["category"]
+        category_totals[cat] = category_totals.get(cat, 0) + analysis["similarity"]
+        category_counts[cat] = category_counts.get(cat, 0) + 1
+
+    for cat in category_totals:
+        analysis_results["summary"]["similarity_by_category"][cat] = {
+            "avg_similarity": category_totals[cat] / category_counts[cat],
+            "test_count": category_counts[cat]
+        }
+
+    # Print analysis results
+    print(f"\nAnalyzing {results_file.name}")
+    print(f"Model: {run['config']['model']}")
+    print(f"\nAnalyzing {num_tests} test cases:")
+
+    # Print summary statistics
+    print("\n=== Summary Statistics ===")
+    print(f"Average similarity to expected output: {analysis_results['summary']['avg_similarity']:.2f}%")
+    print(f"Average duration: {analysis_results['summary']['avg_duration']:.2f}ms")
+
+    print("\nSimilarity Distribution:")
+    print(f"Excellent (90-100%): {similarity_ranges['excellent']} tests")
+    print(f"Good (75-90%): {similarity_ranges['good']} tests")
+    print(f"Fair (50-75%): {similarity_ranges['fair']} tests")
+    print(f"Poor (<50%): {similarity_ranges['poor']} tests")
+
+    print("\nPerformance by Category:")
+    for cat, stats in analysis_results["summary"]["similarity_by_category"].items():
+        print(f"{cat}:")
+        print(f"  - Average similarity: {stats['avg_similarity']:.2f}%")
+        print(f"  - Number of tests: {stats['test_count']}")
+
+    print("\nDetailed Test Analysis:")
+    for analysis in test_analyses:
+        print(f"\n=== Test Category: {analysis['category']} ===")
+        print(f"Input: {analysis['input']}")
+        print(f"Similarity to expected: {analysis['similarity']:.2f}%")
+        print(f"Duration: {analysis['duration_ms']:.2f}ms")
+
+        if analysis["differences"]:
+            print("Differences from expected output:")
+            for line in analysis["differences"]:
+                print(line)
+
+    return analysis_results
+
+
+def compare_results_command(results_file1: Path, results_file2: Path) -> dict[str, Any]:
+    """Compare two results files test by test, print a report and return the comparison.
+
+    Tests are paired by list position; a missing stored ``similarity`` counts as 0.
+
+    Args:
+        results_file1: Baseline run file ("Run 1", ``similarity_old``).
+        results_file2: Run file compared against the baseline ("Run 2", ``similarity_new``).
+    Returns:
+        Dict with per-test ``test_comparisons``, change flags and per-run statistics.
+    Raises:
+        FileNotFoundError: If either results file does not exist.
+    """
+    for path in (results_file1, results_file2):
+        if not path.exists():
+            raise FileNotFoundError(f"Results file not found: {path}")
+
+    # Explicit UTF-8 so reading the JSON does not depend on the platform default encoding.
+    with open(results_file1, encoding="utf-8") as f:
+        run1 = json.load(f)
+    with open(results_file2, encoding="utf-8") as f:
+        run2 = json.load(f)
+
+    # zip() below stops at the shorter list; report that instead of dropping tests silently.
+    if len(run1["results"]) != len(run2["results"]):
+        print(f"Warning: result counts differ ({len(run1['results'])} vs "
+              f"{len(run2['results'])}); unpaired trailing tests are ignored")
+
+    # Pair tests by position and record score, duration and output differences for each.
+    test_comparisons = []
+    for idx, (test1, test2) in enumerate(zip(run1["results"], run2["results"]), 1):
+        similarity1 = test1.get("similarity", 0)
+        similarity2 = test2.get("similarity", 0)
+        comparison = {
+            "test_number": idx,
+            "input": test1["input"],
+            "differences": [],
+            "duration_delta": test2.get("duration_ms", 0) - test1.get("duration_ms", 0),
+            "similarity_old": similarity1,
+            "similarity_new": similarity2,
+            "similarity_delta": similarity2 - similarity1
+        }
+        # Diff key-sorted dumps so key order alone never shows up as an output difference.
+        if test1["actual_output"] != test2["actual_output"]:
+            diff = difflib.unified_diff(
+                json.dumps(test1["actual_output"], indent=2, sort_keys=True).splitlines(),
+                json.dumps(test2["actual_output"], indent=2, sort_keys=True).splitlines(),
+                fromfile=f"{results_file1.name} (Test {idx})",
+                tofile=f"{results_file2.name} (Test {idx})",
+                lineterm=""
+            )
+            comparison["differences"] = list(diff)
+
+        test_comparisons.append(comparison)
+
+    num_tests = len(test_comparisons)
+    scores1 = [t["similarity_old"] for t in test_comparisons]
+    scores2 = [t["similarity_new"] for t in test_comparisons]
+    # Count the tests each run won outright; everything else is a tie.
+    better_similarity_count_run1 = sum(a > b for a, b in zip(scores1, scores2))
+    better_similarity_count_run2 = sum(b > a for a, b in zip(scores1, scores2))
+    equal_similarity_count = num_tests - better_similarity_count_run1 - better_similarity_count_run2
+    avg_similarity_run1 = sum(scores1) / num_tests if num_tests > 0 else 0
+    avg_similarity_run2 = sum(scores2) / num_tests if num_tests > 0 else 0
+
+    comparison_results = {
+        "file1": results_file1.name,
+        "file2": results_file2.name,
+        "test_comparisons": test_comparisons,
+        "total_tests": num_tests,
+        "tests_with_differences": sum(1 for t in test_comparisons if t["differences"]),
+        "prompt_changed": run1["config"]["prompt"] != run2["config"]["prompt"],
+        "model_changed": run1["config"]["model"] != run2["config"]["model"],
+        "config_changed": run1["config"] != run2["config"],
+        "run1_stats": {
+            "model": run1["config"]["model"],
+            "avg_similarity": avg_similarity_run1,
+            "better_tests_count": better_similarity_count_run1
+        },
+        "run2_stats": {
+            "model": run2["config"]["model"],
+            "avg_similarity": avg_similarity_run2,
+            "better_tests_count": better_similarity_count_run2
+        },
+        "equal_tests_count": equal_similarity_count
+    }
+
+    # Print detailed comparison results
+    print(f"\nComparing {results_file1.name} with {results_file2.name}")
+    print(f"Model changed: {comparison_results['model_changed']}")
+    print(f"Prompt changed: {comparison_results['prompt_changed']}")
+    print(f"\nAnalyzing {comparison_results['total_tests']} test cases:")
+
+    for test in test_comparisons:
+        print(f"\n=== Test {test['test_number']} ===")
+        print(f"Input: {test['input']}")
+        print(f"Duration delta: {test['duration_delta']:.2f}ms")
+        print(f"Similarity: {test['similarity_old']:.2f}% → {test['similarity_new']:.2f}% "
+              f"(Δ: {test['similarity_delta']:+.2f}%)")
+
+        if test["differences"]:
+            print("Output differences:")
+            for line in test["differences"]:
+                print(line)
+        else:
+            print("No differences in output")
+
+    # Print enhanced summary statistics
+    if test_comparisons:
+        avg_similarity_delta = sum(t["similarity_delta"] for t in test_comparisons) / num_tests
+        avg_duration_delta = sum(t["duration_delta"] for t in test_comparisons) / num_tests
+
+        print("\n=== Summary Statistics ===")
+        print(f"Tests with differences: {comparison_results['tests_with_differences']} of {comparison_results['total_tests']}")
+
+        # Print overall performance comparison
+        print("\nOverall Performance Comparison:")
+        print(f"Run 1 ({run1['config']['model']}):")
+        print(f"  - Average similarity: {avg_similarity_run1:.2f}%")
+        print(f"  - Better performance in: {better_similarity_count_run1} tests")
+
+        print(f"\nRun 2 ({run2['config']['model']}):")
+        print(f"  - Average similarity: {avg_similarity_run2:.2f}%")
+        print(f"  - Better performance in: {better_similarity_count_run2} tests")
+
+        print(f"\nEqual performance in: {equal_similarity_count} tests")
+        print(f"Overall similarity delta: {avg_similarity_delta:+.2f}%")
+        print(f"Overall duration delta: {avg_duration_delta:+.2f}ms")
+
+        # Determine overall winner; a tie prints no winner line.
+        winner = None
+        if avg_similarity_run1 > avg_similarity_run2:
+            winner = f"Run 1 ({run1['config']['model']})"
+        elif avg_similarity_run2 > avg_similarity_run1:
+            winner = f"Run 2 ({run2['config']['model']})"
+        if winner is not None:
+            margin = abs(avg_similarity_run1 - avg_similarity_run2)
+            print(f"\nOverall Winner: {winner}")
+            print(f"Winning margin: {margin:.2f}% higher average similarity")
+
+    return comparison_results
\ No newline at end of file
diff --git a/python/tests/loader.py b/python/tests/loader.py
new file mode 100644
index 000000000..c3096c19b
--- /dev/null
+++ b/python/tests/loader.py
@@ -0,0 +1,86 @@
+from pathlib import Path
+from typing import Any
+from autogen_agentchat.agents import AssistantAgent
+import yaml
+
+from schema import AgentDefinition, AgentMetadata, TestMetadata, TestCase, TestSuite
+
+def load_agent_definition(filepath: Path) -> AgentDefinition:
+    """
+    Parse an agent-definition YAML file into an AgentDefinition.
+
+    Args:
+        filepath: Location of the YAML file describing the agent
+
+    Returns:
+        AgentDefinition built from the file's name, system messages, metadata and tools
+    """
+    with open(filepath, 'r') as stream:
+        raw = yaml.safe_load(stream)
+
+    meta_section = raw['metadata']
+    agent_meta = AgentMetadata(
+        description=meta_section['description'],
+        version=meta_section['version'],
+    )
+
+    # "tools" is optional in the YAML; a missing key becomes an empty list.
+    return AgentDefinition(
+        name=raw['name'],
+        system_messages=raw['system_messages'],
+        metadata=agent_meta,
+        tools=raw.get('tools', []),
+    )
+
+def create_agent(agent_def: AgentDefinition, model_client: Any) -> AssistantAgent:
+    """
+    Build an AssistantAgent configured from an AgentDefinition.
+
+    Args:
+        agent_def: Definition supplying the agent's name, system messages and tools
+        model_client: Model client handed to the AssistantAgent as-is
+
+    Returns:
+        AssistantAgent whose system message is the definition's messages joined by newlines
+    """
+    # A missing (None) or empty tool list is passed on as [].
+    tool_list = agent_def.tools if agent_def.tools else []
+    prompt = "\n".join(agent_def.system_messages)
+    return AssistantAgent(
+        name=agent_def.name, model_client=model_client, system_message=prompt, tools=tool_list
+    )
+
+def load_test_cases(filepath: Path) -> TestSuite:
+    """
+    Parse a test-case YAML file into a TestSuite.
+
+    Args:
+        filepath: Location of the YAML file listing the test cases
+
+    Returns:
+        TestSuite holding the file's version, metadata description and test cases
+    """
+    with open(filepath, 'r') as stream:
+        raw = yaml.safe_load(stream)
+
+    suite_meta = TestMetadata(
+        description=raw['metadata']['description'],
+    )
+
+    # Convert every YAML entry, keeping the order used in the file.
+    cases = []
+    for entry in raw['test_cases']:
+        cases.append(
+            TestCase(
+                name=entry['name'],
+                category=entry['category'],
+                input=entry['input'],
+                expected_output=entry['expected_output'],
+            )
+        )
+
+    return TestSuite(
+        version=raw['version'],
+        metadata=suite_meta,
+        test_cases=cases,
+    )
diff --git a/python/tests/main.py b/python/tests/main.py
new file mode 100644
index 000000000..e6bb04f53
--- /dev/null
+++ b/python/tests/main.py
@@ -0,0 +1,64 @@
+import argparse
+import asyncio
+from dotenv import load_dotenv
+from pathlib import Path
+from autogen_ext.models.openai import OpenAIChatCompletionClient
+from autogen_ext.models.openai.config import ResponseFormat 
+
+from comparator import analyze_results_command, compare_results_command
+from loader import load_test_cases, load_agent_definition, create_agent
+from agent_tester import AgentTester
+
+load_dotenv()
+
+async def run_test_command(test_cases_file: Path, agent_def_file: Path, model: str, results_dir: str = "test_results"):
+    """Run every test case in ``test_cases_file`` against the agent from ``agent_def_file``.
+
+    The agent talks to an OpenAI chat client for ``model`` configured for JSON-object
+    responses; the return value is whatever ``AgentTester.run_tests()`` yields.
+    """
+    suite = load_test_cases(test_cases_file)
+    definition = load_agent_definition(agent_def_file)
+    json_format = ResponseFormat(type="json_object")
+    client = OpenAIChatCompletionClient(model=model, response_format=json_format)
+    runner = AgentTester(
+        create_agent(definition, client), test_cases=suite.test_cases, results_dir=results_dir
+    )
+
+    print(f"Running {len(suite.test_cases)} test cases for agent: {definition.name}...")
+    return await runner.run_tests()
+
+def main():
+    """Parse the command line and dispatch to the run, compare or analyze sub-command."""
+    cli = argparse.ArgumentParser(description="Test runner for agents")
+    commands = cli.add_subparsers(dest="command", help="Command to execute")
+
+    # "run": execute the test cases in a YAML file against an agent definition
+    run_cmd = commands.add_parser("run", help="Run tests from a YAML file")
+    run_cmd.add_argument("test_file", type=Path, help="Path to the YAML file containing test cases")
+    run_cmd.add_argument("agent_file", type=Path, help="Path to the agent definition YAML file")
+    run_cmd.add_argument("--results-dir", type=str, default="test_results",
+                         help="Directory to store test results (default: test_results)")
+    run_cmd.add_argument("--model", type=str, help="OpenAI model to use for testing", required=True)
+
+    # "compare": compare two stored result files
+    compare_cmd = commands.add_parser("compare", help="Compare two test result files")
+    compare_cmd.add_argument("file1", type=Path, help="First test results file")
+    compare_cmd.add_argument("file2", type=Path, help="Second test results file")
+
+    # "analyze": analyze a single stored result file
+    analyze_cmd = commands.add_parser("analyze", help="Analyze a single test results file")
+    analyze_cmd.add_argument("file", type=Path, help="Test results file to analyze")
+
+    opts = cli.parse_args()
+    if opts.command == "run":
+        asyncio.run(run_test_command(opts.test_file, opts.agent_file, opts.model, opts.results_dir))
+    elif opts.command == "compare":
+        compare_results_command(opts.file1, opts.file2)
+    elif opts.command == "analyze":
+        analyze_results_command(opts.file)
+    else:
+        cli.print_help()
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/python/tests/schema.py b/python/tests/schema.py
new file mode 100644
index 000000000..6caa131a1
--- /dev/null
+++ b/python/tests/schema.py
@@ -0,0 +1,47 @@
+from dataclasses import dataclass
+from typing import Any, Optional
+
+
+@dataclass
+class AgentMetadata:  # Descriptive metadata carried by an AgentDefinition.
+    description: str  # description text of the agent
+    version: str  # version label, kept as a string
+
+@dataclass
+class AgentDefinition:  # Agent configuration: name, system messages, metadata, optional tools.
+    name: str
+    system_messages: list[str]  # one entry per system-prompt fragment
+    metadata: AgentMetadata
+    tools: Optional[list[str]] = None  # defaults to None when not supplied
+    
+@dataclass
+class TestMetadata:  # Descriptive metadata carried by a TestSuite.
+    description: str  # description text of the suite
+
+@dataclass
+class TestCase:  # One test: an input paired with the output expected for it.
+    name: str
+    category: str  # grouping label for the test
+    input: str
+    expected_output: dict[str, Any]  # expected result as a string-keyed mapping
+
+@dataclass
+class TestSuite:  # A versioned collection of TestCase objects plus suite metadata.
+    version: str  # suite version, kept as a string
+    metadata: TestMetadata
+    test_cases: list[TestCase]
+
+@dataclass
+class TestResult:  # Outcome of one test: input, expected vs. actual output, timing and score.
+    input: str
+    expected_output: Optional[str]  # NOTE(review): TestCase.expected_output is a dict; confirm str
+    actual_output: str  # NOTE(review): confirm whether producers store text or parsed JSON here
+    category: str
+    duration_ms: float  # duration in milliseconds, per the field name
+    similarity: float  # presumably a 0-100 percentage — TODO confirm against the producer
+
+@dataclass
+class TestRunResult:  # One complete test run: when it ran, its config, and every TestResult.
+    timestamp: str  # kept as a string; format not fixed here — TODO confirm
+    config: dict[str, Any]  # presumably carries "model" and "prompt" keys — TODO confirm
+    results: list[TestResult]
\ No newline at end of file
diff --git a/python/tests/test_cases/authpolicy_test_cases.yaml b/python/tests/test_cases/authpolicy_test_cases.yaml
new file mode 100644
index 000000000..06a03eb6f
--- /dev/null
+++ b/python/tests/test_cases/authpolicy_test_cases.yaml
@@ -0,0 +1,77 @@
+version: "1.0"
+metadata:
+  description: "Authorization Policy Test Cases"
+
+test_cases:
+  - name: deny_post_8080
+    input: "Deny requests with POST method on port 8080 on all workloads in the foo namespace"
+    category: AuthorizationPolicy
+    expected_output:
+      apiVersion: security.istio.io/v1
+      kind: AuthorizationPolicy
+      metadata:
+        name: httpbin
+        namespace: foo
+      spec:
+        action: DENY
+        rules:
+          - to:
+              - operation:
+                  methods:
+                    - POST
+                  ports:
+                    - "8080"
+
+  - name: allow_get_3000
+    input: "Allow GET requests on port 3000 for service-a in the bar namespace"
+    category: AuthorizationPolicy
+    expected_output:
+      apiVersion: security.istio.io/v1
+      kind: AuthorizationPolicy
+      metadata:
+        name: service-a
+        namespace: bar
+      spec:
+        action: ALLOW
+        rules:
+          - to:
+              - operation:
+                  methods:
+                    - GET
+                  ports:
+                    - "3000"
+  - name: allow_nothing
+    input: "Create an allow nothing policy in the foo namespace"
+    category: AuthorizationPolicy
+    expected_output:
+      apiVersion: security.istio.io/v1
+      kind: AuthorizationPolicy
+      metadata:
+        name: allow-nothing
+        namespace: foo
+      spec:
+        {}
+  - name: allow_nothing_1
+    input: "Deny all requests between the workloads in the foo namespace"
+    category: AuthorizationPolicy
+    expected_output:
+      apiVersion: security.istio.io/v1
+      kind: AuthorizationPolicy
+      metadata:
+        name: allow-nothing
+        namespace: foo
+      spec:
+        {}
+
+  - name: allow_all
+    input: "Allow all requests in the default namespace"
+    category: AuthorizationPolicy
+    expected_output:
+      apiVersion: security.istio.io/v1
+      kind: AuthorizationPolicy
+      metadata:
+        name: allow-all
+        namespace: default
+      spec:
+        rules:
+          - {}
\ No newline at end of file