Skip to content

Commit

Permalink
llm judges (#1237)
Browse files Browse the repository at this point in the history
  • Loading branch information
mike0sv authored Aug 9, 2024
1 parent 90918cf commit 172029a
Show file tree
Hide file tree
Showing 35 changed files with 1,009 additions and 59 deletions.
345 changes: 345 additions & 0 deletions examples/how_to_questions/how_to_use_llm_judge_template.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,345 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "fed31aaf-c264-4d5b-a49c-e7228290f876",
"metadata": {},
"source": [
"# How to use the LLM judge template"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3a647a31-2765-4004-94ea-1217671976c0",
"metadata": {},
"outputs": [],
"source": [
"from evidently.descriptors import LLMJudgeDescriptor, NegativityLLMJudge, PIILLMJudge, DeclineLLMJudge"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "07bfbab2-17ec-439d-b5ca-15bb54505fc9",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"from datetime import datetime\n",
"from datetime import time\n",
"from datetime import timedelta\n",
"\n",
"import requests\n",
"from io import BytesIO\n",
"\n",
"from sklearn import datasets, ensemble, model_selection"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c3e21967-2614-428d-8f69-93dc90b280bc",
"metadata": {},
"outputs": [],
"source": [
"from evidently.ui.workspace.cloud import CloudWorkspace\n",
"\n",
"from evidently import ColumnMapping\n",
"from evidently.report import Report\n",
"\n",
"from evidently.metrics import ColumnSummaryMetric\n",
"\n",
"from evidently.metric_preset import DataQualityPreset, TextOverviewPreset, TextEvals"
]
},
{
"cell_type": "markdown",
"id": "e6bb0349-b436-484f-963d-64f7e33d8c2b",
"metadata": {},
"source": [
"## Load Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2d58a568-0e1c-42ec-97ab-9943048c3882",
"metadata": {},
"outputs": [],
"source": [
"response = requests.get(\"https://raw.githubusercontent.com/evidentlyai/evidently/main/examples/how_to_questions/chat_df.csv\")\n",
"csv_content = BytesIO(response.content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "27c71c6d-5230-4c3e-9839-d04ac88b81d0",
"metadata": {},
"outputs": [],
"source": [
"assistant_logs = pd.read_csv(csv_content, index_col=0, parse_dates=['start_time', 'end_time'])\n",
"assistant_logs.index = assistant_logs.start_time\n",
"assistant_logs.index.rename('index', inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fe638c07-777e-44a2-a853-3aad67412187",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_colwidth', None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "130cb841-23f7-4fad-b4f1-fcb6349a57ec",
"metadata": {},
"outputs": [],
"source": [
"assistant_logs[[\"question\", \"response\"]].head()"
]
},
{
"cell_type": "markdown",
"id": "96ecf6d7-0e5c-48ae-9389-5d914b34692e",
"metadata": {},
"source": [
"## One-off reports"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e2762d6d-65b0-412c-a0f5-339594168ad5",
"metadata": {},
"outputs": [],
"source": [
"column_mapping = ColumnMapping(\n",
" datetime='start_time',\n",
" datetime_features=['end_time'],\n",
" text_features=['question', 'response'],\n",
" categorical_features=['organization', 'model_ID', 'region', 'environment', 'feedback'],\n",
")"
]
},
{
"cell_type": "markdown",
"id": "01ccd583-2788-411e-b25c-3ec594ced7c9",
"metadata": {},
"source": [
"### Default text descriptors (no LLM judges)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d675f9ef-6502-40b3-b805-06a8eb751567",
"metadata": {},
"outputs": [],
"source": [
"report = Report(metrics=[\n",
" TextEvals(column_name=\"question\"),\n",
" TextEvals(column_name=\"response\")\n",
"])\n",
"\n",
"report.run(reference_data=assistant_logs[datetime(2024, 4, 8) : datetime(2024, 4, 9)][:10], \n",
" current_data=assistant_logs[datetime(2024, 4, 9) : datetime(2024, 4, 10)][:10], \n",
" column_mapping=column_mapping)\n",
"report "
]
},
{
"cell_type": "markdown",
"id": "b9ff1096-c60f-4c70-a9b1-7b6623cb77cf",
"metadata": {},
"source": [
"### LLM-based descriptors without parameters"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "810b6b54-b395-41c1-bfe3-d97c01a9cce4",
"metadata": {},
"outputs": [],
"source": [
"report = Report(metrics=[\n",
" TextEvals(column_name=\"question\", descriptors=[\n",
" NegativityLLMJudge() \n",
" ]),\n",
" TextEvals(column_name=\"response\", descriptors=[\n",
" PIILLMJudge(), \n",
" DeclineLLMJudge()\n",
" ])\n",
"])\n",
"\n",
"report.run(reference_data=assistant_logs[datetime(2024, 4, 8) : datetime(2024, 4, 9)][:10], \n",
" current_data=assistant_logs[datetime(2024, 4, 9) : datetime(2024, 4, 10)][:10], \n",
" column_mapping=column_mapping)\n",
"report "
]
},
{
"cell_type": "markdown",
"id": "0bdfd280-b477-4484-b0c0-6720c5b9a226",
"metadata": {},
"source": [
"### LLM-based descriptors with parameters"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e8b1f7c7-eb47-4dc1-99e6-f0c71d118373",
"metadata": {},
"outputs": [],
"source": [
"report = Report(metrics=[\n",
" TextEvals(column_name=\"question\", descriptors=[\n",
" NegativityLLMJudge(include_category=True) \n",
" ]),\n",
" TextEvals(column_name=\"response\", descriptors=[\n",
" PIILLMJudge(include_reasoning=False), \n",
" DeclineLLMJudge(include_score=True)\n",
" ])\n",
"])\n",
"\n",
"report.run(reference_data=assistant_logs[datetime(2024, 4, 8) : datetime(2024, 4, 9)][:10], \n",
" current_data=assistant_logs[datetime(2024, 4, 9) : datetime(2024, 4, 10)][:10], \n",
" column_mapping=column_mapping)\n",
"report "
]
},
{
"cell_type": "markdown",
"id": "3806d7d8-5acf-45cb-b16b-3b4336dea6e0",
"metadata": {},
"source": [
"### Custom LLM-based descriptor"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2965eb66-b27e-4101-8893-8d7c9389b61e",
"metadata": {},
"outputs": [],
"source": [
"from evidently.features.llm_judge import BinaryClassificationPromptTemplate"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "55226466-786c-4ed0-9085-d9bffc4e266e",
"metadata": {},
"outputs": [],
"source": [
"custom_judge = LLMJudgeDescriptor(\n",
" subcolumn=\"category\",\n",
" template = BinaryClassificationPromptTemplate( \n",
" criteria = \"\"\"Conciseness refers to the quality of being brief and to the point, while still providing all necessary information.\n",
" A concise response should:\n",
" - Provide the necessary information without unnecessary details or repetition.\n",
" - Be brief yet comprehensive enough to address the query.\n",
" - Use simple and direct language to convey the message effectively.\n",
" \"\"\",\n",
" target_category=\"Conciseness\",\n",
" non_target_category=\"Ok\",\n",
" uncertainty=\"unknown\",\n",
" include_reasoning=True,\n",
" pre_messages=[(\"system\", \"You are a judge which evaluates text.\")],\n",
" ),\n",
" provider = \"openai\",\n",
" model = \"gpt-4o-mini\"\n",
")\n",
"\n",
"report = Report(metrics=[\n",
" TextEvals(column_name=\"response\", descriptors=[\n",
" custom_judge\n",
" ])\n",
"])\n",
"\n",
"report.run(reference_data=assistant_logs[datetime(2024, 4, 8) : datetime(2024, 4, 9)][:10], \n",
" current_data=assistant_logs[datetime(2024, 4, 9) : datetime(2024, 4, 10)][:10], \n",
" column_mapping=column_mapping)\n",
"report "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa7824f1-f293-4462-b377-21c798338bca",
"metadata": {},
"outputs": [],
"source": [
"custom_judge = LLMJudgeDescriptor(\n",
" subcolumn=\"score\",\n",
" template = BinaryClassificationPromptTemplate( \n",
" criteria = \"\"\"Conciseness refers to the quality of being brief and to the point, while still providing all necessary information.\n",
" A concise response should:\n",
" - Provide the necessary information without unnecessary details or repetition.\n",
" - Be brief yet comprehensive enough to address the query.\n",
" - Use simple and direct language to convey the message effectively.\n",
" \"\"\",\n",
" target_category=\"Conciseness\",\n",
" non_target_category=\"Ok\",\n",
" uncertainty=\"unknown\",\n",
" include_reasoning=True,\n",
" include_score=True,\n",
" pre_messages=[(\"system\", \"You are a judge which evaluates text.\")],\n",
" ),\n",
" provider = \"openai\",\n",
" model = \"gpt-4o-mini\"\n",
")\n",
"\n",
"report = Report(metrics=[\n",
" TextEvals(column_name=\"response\", descriptors=[\n",
" custom_judge\n",
" ])\n",
"])\n",
"\n",
"report.run(reference_data=assistant_logs[datetime(2024, 4, 8) : datetime(2024, 4, 9)][:10], \n",
" current_data=assistant_logs[datetime(2024, 4, 9) : datetime(2024, 4, 10)][:10], \n",
" column_mapping=column_mapping)\n",
"report "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "26352e01-7342-4c5e-b3e1-cf9a56fb3f2e",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
3 changes: 3 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,9 @@ ignore_missing_imports = True
[mypy-deprecation.*]
ignore_missing_imports = True

[mypy-litellm.*]
ignore_missing_imports = True

[tool:pytest]
testpaths=tests
python_classes=*Test
Expand Down
4 changes: 2 additions & 2 deletions src/evidently/base_metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,8 +162,8 @@ def get_data(self, column: Union[str, ColumnName]) -> Tuple[ColumnType, pd.Serie
return self._determine_type(column), self.get_current_column(column), ref_data

def _determine_type(self, column: Union[str, ColumnName]) -> ColumnType:
if isinstance(column, ColumnName) and column._feature_class is not None:
column_type = column._feature_class.feature_type
if isinstance(column, ColumnName) and column.feature_class is not None:
column_type = column.feature_class.get_type(column.name)
else:
if isinstance(column, ColumnName):
column_name = column.name
Expand Down
8 changes: 8 additions & 0 deletions src/evidently/descriptors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
from .custom_descriptor import CustomPairColumnEval
from .hf_descriptor import HuggingFaceModel
from .hf_descriptor import HuggingFaceToxicityModel
from .llm_judges import DeclineLLMJudge
from .llm_judges import LLMJudgeDescriptor
from .llm_judges import NegativityLLMJudge
from .llm_judges import PIILLMJudge
from .non_letter_character_percentage_descriptor import NonLetterCharacterPercentage
from .oov_words_percentage_descriptor import OOV
from .openai_descriptor import OpenAIPrompting
Expand All @@ -24,6 +28,10 @@
"CustomPairColumnEval",
"HuggingFaceModel",
"HuggingFaceToxicityModel",
"LLMJudgeDescriptor",
"NegativityLLMJudge",
"PIILLMJudge",
"DeclineLLMJudge",
"OpenAIPrompting",
"NonLetterCharacterPercentage",
"OOV",
Expand Down
Loading

0 comments on commit 172029a

Please sign in to comment.