llm judges (#1237)

evidentlyai · Aug 9, 2024 · 172029a · 172029a
1 parent 90918cf
commit 172029a
Show file tree

Hide file tree

Showing 35 changed files with 1,009 additions and 59 deletions.
diff --git a/examples/how_to_questions/how_to_use_llm_judge_template.ipynb b/examples/how_to_questions/how_to_use_llm_judge_template.ipynb
@@ -0,0 +1,345 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "fed31aaf-c264-4d5b-a49c-e7228290f876",
+   "metadata": {},
+   "source": [
+    "# How to use the LLM judge template?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3a647a31-2765-4004-94ea-1217671976c0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from evidently.descriptors import LLMJudgeDescriptor, NegativityLLMJudge, PIILLMJudge, DeclineLLMJudge"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "07bfbab2-17ec-439d-b5ca-15bb54505fc9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "from datetime import datetime\n",
+    "from datetime import time\n",
+    "from datetime import timedelta\n",
+    "\n",
+    "import requests\n",
+    "from io import BytesIO\n",
+    "\n",
+    "from sklearn import datasets, ensemble, model_selection"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c3e21967-2614-428d-8f69-93dc90b280bc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from evidently.ui.workspace.cloud import CloudWorkspace\n",
+    "\n",
+    "from evidently import ColumnMapping\n",
+    "from evidently.report import Report\n",
+    "\n",
+    "from evidently.metrics import ColumnSummaryMetric\n",
+    "\n",
+    "from evidently.metric_preset import DataQualityPreset, TextOverviewPreset, TextEvals"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e6bb0349-b436-484f-963d-64f7e33d8c2b",
+   "metadata": {},
+   "source": [
+    "## Load Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2d58a568-0e1c-42ec-97ab-9943048c3882",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = requests.get(\"https://raw.githubusercontent.com/evidentlyai/evidently/main/examples/how_to_questions/chat_df.csv\")\n",
+    "csv_content = BytesIO(response.content)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "27c71c6d-5230-4c3e-9839-d04ac88b81d0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "assistant_logs = pd.read_csv(csv_content, index_col=0, parse_dates=['start_time', 'end_time'])\n",
+    "assistant_logs.index = assistant_logs.start_time\n",
+    "assistant_logs.index.rename('index', inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fe638c07-777e-44a2-a853-3aad67412187",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.set_option('display.max_colwidth', None)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "130cb841-23f7-4fad-b4f1-fcb6349a57ec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "assistant_logs[[\"question\", \"response\"]].head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "96ecf6d7-0e5c-48ae-9389-5d914b34692e",
+   "metadata": {},
+   "source": [
+    "## One-off reports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e2762d6d-65b0-412c-a0f5-339594168ad5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "column_mapping = ColumnMapping(\n",
+    "    datetime='start_time',\n",
+    "    datetime_features=['end_time'],\n",
+    "    text_features=['question', 'response'],\n",
+    "    categorical_features=['organization', 'model_ID', 'region', 'environment', 'feedback'],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "01ccd583-2788-411e-b25c-3ec594ced7c9",
+   "metadata": {},
+   "source": [
+    "### Text evals with default descriptors (no LLM judges)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d675f9ef-6502-40b3-b805-06a8eb751567",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "report = Report(metrics=[\n",
+    "    TextEvals(column_name=\"question\"),\n",
+    "    TextEvals(column_name=\"response\")\n",
+    "])\n",
+    "\n",
+    "report.run(reference_data=assistant_logs[datetime(2024, 4, 8) : datetime(2024, 4, 9)][:10], \n",
+    "           current_data=assistant_logs[datetime(2024, 4, 9) : datetime(2024, 4, 10)][:10], \n",
+    "           column_mapping=column_mapping)\n",
+    "report "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b9ff1096-c60f-4c70-a9b1-7b6623cb77cf",
+   "metadata": {},
+   "source": [
+    "### LLM-based descriptors without parameters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "810b6b54-b395-41c1-bfe3-d97c01a9cce4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "report = Report(metrics=[\n",
+    "    TextEvals(column_name=\"question\", descriptors=[\n",
+    "        NegativityLLMJudge()   \n",
+    "    ]),\n",
+    "    TextEvals(column_name=\"response\", descriptors=[\n",
+    "        PIILLMJudge(), \n",
+    "        DeclineLLMJudge()\n",
+    "    ])\n",
+    "])\n",
+    "\n",
+    "report.run(reference_data=assistant_logs[datetime(2024, 4, 8) : datetime(2024, 4, 9)][:10], \n",
+    "           current_data=assistant_logs[datetime(2024, 4, 9) : datetime(2024, 4, 10)][:10], \n",
+    "           column_mapping=column_mapping)\n",
+    "report "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0bdfd280-b477-4484-b0c0-6720c5b9a226",
+   "metadata": {},
+   "source": [
+    "### LLM-based descriptors with parameters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e8b1f7c7-eb47-4dc1-99e6-f0c71d118373",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "report = Report(metrics=[\n",
+    "    TextEvals(column_name=\"question\", descriptors=[\n",
+    "        NegativityLLMJudge(include_category=True)   \n",
+    "    ]),\n",
+    "    TextEvals(column_name=\"response\", descriptors=[\n",
+    "        PIILLMJudge(include_reasoning=False), \n",
+    "        DeclineLLMJudge(include_score=True)\n",
+    "    ])\n",
+    "])\n",
+    "\n",
+    "report.run(reference_data=assistant_logs[datetime(2024, 4, 8) : datetime(2024, 4, 9)][:10], \n",
+    "           current_data=assistant_logs[datetime(2024, 4, 9) : datetime(2024, 4, 10)][:10], \n",
+    "           column_mapping=column_mapping)\n",
+    "report "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3806d7d8-5acf-45cb-b16b-3b4336dea6e0",
+   "metadata": {},
+   "source": [
+    "### Custom LLM-based descriptor"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2965eb66-b27e-4101-8893-8d7c9389b61e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from evidently.features.llm_judge import BinaryClassificationPromptTemplate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "55226466-786c-4ed0-9085-d9bffc4e266e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_judge = LLMJudgeDescriptor(\n",
+    "    subcolumn=\"category\",\n",
+    "    template = BinaryClassificationPromptTemplate(      \n",
+    "        criteria = \"\"\"Conciseness refers to the quality of being brief and to the point, while still providing all necessary information.\n",
+    "            A concise response should:\n",
+    "            - Provide the necessary information without unnecessary details or repetition.\n",
+    "            - Be brief yet comprehensive enough to address the query.\n",
+    "            - Use simple and direct language to convey the message effectively.\n",
+    "        \"\"\",\n",
+    "        target_category=\"Conciseness\",\n",
+    "        non_target_category=\"Ok\",\n",
+    "        uncertainty=\"unknown\",\n",
+    "        include_reasoning=True,\n",
+    "        pre_messages=[(\"system\", \"You are a judge which evaluates text.\")],\n",
+    "        ),\n",
+    "    provider = \"openai\",\n",
+    "    model = \"gpt-4o-mini\"\n",
+    ")\n",
+    "\n",
+    "report = Report(metrics=[\n",
+    "    TextEvals(column_name=\"response\", descriptors=[\n",
+    "        custom_judge\n",
+    "    ])\n",
+    "])\n",
+    "\n",
+    "report.run(reference_data=assistant_logs[datetime(2024, 4, 8) : datetime(2024, 4, 9)][:10], \n",
+    "           current_data=assistant_logs[datetime(2024, 4, 9) : datetime(2024, 4, 10)][:10], \n",
+    "           column_mapping=column_mapping)\n",
+    "report "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aa7824f1-f293-4462-b377-21c798338bca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_judge = LLMJudgeDescriptor(\n",
+    "    subcolumn=\"score\",\n",
+    "    template = BinaryClassificationPromptTemplate(      \n",
+    "        criteria = \"\"\"Conciseness refers to the quality of being brief and to the point, while still providing all necessary information.\n",
+    "            A concise response should:\n",
+    "            - Provide the necessary information without unnecessary details or repetition.\n",
+    "            - Be brief yet comprehensive enough to address the query.\n",
+    "            - Use simple and direct language to convey the message effectively.\n",
+    "        \"\"\",\n",
+    "        target_category=\"Conciseness\",\n",
+    "        non_target_category=\"Ok\",\n",
+    "        uncertainty=\"unknown\",\n",
+    "        include_reasoning=True,\n",
+    "        include_score=True,\n",
+    "        pre_messages=[(\"system\", \"You are a judge which evaluates text.\")],\n",
+    "        ),\n",
+    "    provider = \"openai\",\n",
+    "    model = \"gpt-4o-mini\"\n",
+    ")\n",
+    "\n",
+    "report = Report(metrics=[\n",
+    "    TextEvals(column_name=\"response\", descriptors=[\n",
+    "        custom_judge\n",
+    "    ])\n",
+    "])\n",
+    "\n",
+    "report.run(reference_data=assistant_logs[datetime(2024, 4, 8) : datetime(2024, 4, 9)][:10], \n",
+    "           current_data=assistant_logs[datetime(2024, 4, 9) : datetime(2024, 4, 10)][:10], \n",
+    "           column_mapping=column_mapping)\n",
+    "report "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "26352e01-7342-4c5e-b3e1-cf9a56fb3f2e",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/setup.cfg b/setup.cfg
@@ -103,6 +103,9 @@ ignore_missing_imports = True
 [mypy-deprecation.*]
 ignore_missing_imports = True
 
+[mypy-litellm.*]
+ignore_missing_imports = True
+
 [tool:pytest]
 testpaths=tests
 python_classes=*Test

diff --git a/src/evidently/base_metric.py b/src/evidently/base_metric.py
@@ -162,8 +162,8 @@ def get_data(self, column: Union[str, ColumnName]) -> Tuple[ColumnType, pd.Serie
         return self._determine_type(column), self.get_current_column(column), ref_data
 
     def _determine_type(self, column: Union[str, ColumnName]) -> ColumnType:
-        if isinstance(column, ColumnName) and column._feature_class is not None:
-            column_type = column._feature_class.feature_type
+        if isinstance(column, ColumnName) and column.feature_class is not None:
+            column_type = column.feature_class.get_type(column.name)
         else:
             if isinstance(column, ColumnName):
                 column_name = column.name

diff --git a/src/evidently/descriptors/__init__.py b/src/evidently/descriptors/__init__.py
@@ -2,6 +2,10 @@
 from .custom_descriptor import CustomPairColumnEval
 from .hf_descriptor import HuggingFaceModel
 from .hf_descriptor import HuggingFaceToxicityModel
+from .llm_judges import DeclineLLMJudge
+from .llm_judges import LLMJudgeDescriptor
+from .llm_judges import NegativityLLMJudge
+from .llm_judges import PIILLMJudge
 from .non_letter_character_percentage_descriptor import NonLetterCharacterPercentage
 from .oov_words_percentage_descriptor import OOV
 from .openai_descriptor import OpenAIPrompting
@@ -24,6 +28,10 @@
     "CustomPairColumnEval",
     "HuggingFaceModel",
     "HuggingFaceToxicityModel",
+    "LLMJudgeDescriptor",
+    "NegativityLLMJudge",
+    "PIILLMJudge",
+    "DeclineLLMJudge",
     "OpenAIPrompting",
     "NonLetterCharacterPercentage",
     "OOV",