-
Notifications
You must be signed in to change notification settings - Fork 625
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
35 changed files
with
1,009 additions
and
59 deletions.
There are no files selected for viewing
345 changes: 345 additions & 0 deletions
345
examples/how_to_questions/how_to_use_llm_judge_template.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,345 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "fed31aaf-c264-4d5b-a49c-e7228290f876", | ||
"metadata": {}, | ||
"source": [ | ||
"# How to use the LLM judge template" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "3a647a31-2765-4004-94ea-1217671976c0", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from evidently.descriptors import LLMJudgeDescriptor, NegativityLLMJudge, PIILLMJudge, DeclineLLMJudge" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "07bfbab2-17ec-439d-b5ca-15bb54505fc9", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"import numpy as np\n", | ||
"\n", | ||
"from datetime import datetime\n", | ||
"from datetime import time\n", | ||
"from datetime import timedelta\n", | ||
"\n", | ||
"import requests\n", | ||
"from io import BytesIO\n", | ||
"\n", | ||
"from sklearn import datasets, ensemble, model_selection" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "c3e21967-2614-428d-8f69-93dc90b280bc", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from evidently.ui.workspace.cloud import CloudWorkspace\n", | ||
"\n", | ||
"from evidently import ColumnMapping\n", | ||
"from evidently.report import Report\n", | ||
"\n", | ||
"from evidently.metrics import ColumnSummaryMetric\n", | ||
"\n", | ||
"from evidently.metric_preset import DataQualityPreset, TextOverviewPreset, TextEvals" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "e6bb0349-b436-484f-963d-64f7e33d8c2b", | ||
"metadata": {}, | ||
"source": [ | ||
"## Load Data" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "2d58a568-0e1c-42ec-97ab-9943048c3882", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Download the sample chat logs. Fail fast on a slow or failed request so an\n", | ||
"# HTTP error page is never handed to pd.read_csv as if it were CSV data.\n", | ||
"response = requests.get(\n", | ||
"    \"https://raw.githubusercontent.com/evidentlyai/evidently/main/examples/how_to_questions/chat_df.csv\",\n", | ||
"    timeout=30,\n", | ||
")\n", | ||
"response.raise_for_status()\n", | ||
"csv_content = BytesIO(response.content)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "27c71c6d-5230-4c3e-9839-d04ac88b81d0", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"assistant_logs = pd.read_csv(csv_content, index_col=0, parse_dates=['start_time', 'end_time'])\n", | ||
"assistant_logs.index = assistant_logs.start_time\n", | ||
"assistant_logs.index.rename('index', inplace=True)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "fe638c07-777e-44a2-a853-3aad67412187", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"pd.set_option('display.max_colwidth', None)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "130cb841-23f7-4fad-b4f1-fcb6349a57ec", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"assistant_logs[[\"question\", \"response\"]].head()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "96ecf6d7-0e5c-48ae-9389-5d914b34692e", | ||
"metadata": {}, | ||
"source": [ | ||
"## One-off reports" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "e2762d6d-65b0-412c-a0f5-339594168ad5", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"column_mapping = ColumnMapping(\n", | ||
" datetime='start_time',\n", | ||
" datetime_features=['end_time'],\n", | ||
" text_features=['question', 'response'],\n", | ||
" categorical_features=['organization', 'model_ID', 'region', 'environment', 'feedback'],\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "01ccd583-2788-411e-b25c-3ec594ced7c9", | ||
"metadata": {}, | ||
"source": [ | ||
"### Text evals with default descriptors" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "d675f9ef-6502-40b3-b805-06a8eb751567", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"report = Report(metrics=[\n", | ||
" TextEvals(column_name=\"question\"),\n", | ||
" TextEvals(column_name=\"response\")\n", | ||
"])\n", | ||
"\n", | ||
"report.run(reference_data=assistant_logs[datetime(2024, 4, 8) : datetime(2024, 4, 9)][:10], \n", | ||
" current_data=assistant_logs[datetime(2024, 4, 9) : datetime(2024, 4, 10)][:10], \n", | ||
" column_mapping=column_mapping)\n", | ||
"report " | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "b9ff1096-c60f-4c70-a9b1-7b6623cb77cf", | ||
"metadata": {}, | ||
"source": [ | ||
"### LLM-based descriptors without parameters" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "810b6b54-b395-41c1-bfe3-d97c01a9cce4", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"report = Report(metrics=[\n", | ||
" TextEvals(column_name=\"question\", descriptors=[\n", | ||
" NegativityLLMJudge() \n", | ||
" ]),\n", | ||
" TextEvals(column_name=\"response\", descriptors=[\n", | ||
" PIILLMJudge(), \n", | ||
" DeclineLLMJudge()\n", | ||
" ])\n", | ||
"])\n", | ||
"\n", | ||
"report.run(reference_data=assistant_logs[datetime(2024, 4, 8) : datetime(2024, 4, 9)][:10], \n", | ||
" current_data=assistant_logs[datetime(2024, 4, 9) : datetime(2024, 4, 10)][:10], \n", | ||
" column_mapping=column_mapping)\n", | ||
"report " | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "0bdfd280-b477-4484-b0c0-6720c5b9a226", | ||
"metadata": {}, | ||
"source": [ | ||
"### LLM-based descriptors with parameters" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "e8b1f7c7-eb47-4dc1-99e6-f0c71d118373", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Same built-in judges as above, now configured through keyword options\n", | ||
"# (include_category / include_reasoning / include_score).\n", | ||
"report = Report(metrics=[\n", | ||
"    TextEvals(column_name=\"question\", descriptors=[\n", | ||
"        NegativityLLMJudge(include_category=True)\n", | ||
"    ]),\n", | ||
"    TextEvals(column_name=\"response\", descriptors=[\n", | ||
"        PIILLMJudge(include_reasoning=False),\n", | ||
"        DeclineLLMJudge(include_score=True)\n", | ||
"    ])\n", | ||
"])\n", | ||
"\n", | ||
"report.run(reference_data=assistant_logs[datetime(2024, 4, 8) : datetime(2024, 4, 9)][:10],\n", | ||
"           current_data=assistant_logs[datetime(2024, 4, 9) : datetime(2024, 4, 10)][:10],\n", | ||
"           column_mapping=column_mapping)\n", | ||
"report" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "3806d7d8-5acf-45cb-b16b-3b4336dea6e0", | ||
"metadata": {}, | ||
"source": [ | ||
"### Custom LLM-based descriptor" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "2965eb66-b27e-4101-8893-8d7c9389b61e", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from evidently.features.llm_judge import BinaryClassificationPromptTemplate" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "55226466-786c-4ed0-9085-d9bffc4e266e", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"custom_judge = LLMJudgeDescriptor(\n", | ||
" subcolumn=\"category\",\n", | ||
" template = BinaryClassificationPromptTemplate( \n", | ||
" criteria = \"\"\"Conciseness refers to the quality of being brief and to the point, while still providing all necessary information.\n", | ||
" A concise response should:\n", | ||
" - Provide the necessary information without unnecessary details or repetition.\n", | ||
" - Be brief yet comprehensive enough to address the query.\n", | ||
" - Use simple and direct language to convey the message effectively.\n", | ||
" \"\"\",\n", | ||
" target_category=\"Conciseness\",\n", | ||
" non_target_category=\"Ok\",\n", | ||
" uncertainty=\"unknown\",\n", | ||
" include_reasoning=True,\n", | ||
" pre_messages=[(\"system\", \"You are a judge which evaluates text.\")],\n", | ||
" ),\n", | ||
" provider = \"openai\",\n", | ||
" model = \"gpt-4o-mini\"\n", | ||
")\n", | ||
"\n", | ||
"report = Report(metrics=[\n", | ||
" TextEvals(column_name=\"response\", descriptors=[\n", | ||
" custom_judge\n", | ||
" ])\n", | ||
"])\n", | ||
"\n", | ||
"report.run(reference_data=assistant_logs[datetime(2024, 4, 8) : datetime(2024, 4, 9)][:10], \n", | ||
" current_data=assistant_logs[datetime(2024, 4, 9) : datetime(2024, 4, 10)][:10], \n", | ||
" column_mapping=column_mapping)\n", | ||
"report " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "aa7824f1-f293-4462-b377-21c798338bca", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"custom_judge = LLMJudgeDescriptor(\n", | ||
" subcolumn=\"score\",\n", | ||
" template = BinaryClassificationPromptTemplate( \n", | ||
" criteria = \"\"\"Conciseness refers to the quality of being brief and to the point, while still providing all necessary information.\n", | ||
" A concise response should:\n", | ||
" - Provide the necessary information without unnecessary details or repetition.\n", | ||
" - Be brief yet comprehensive enough to address the query.\n", | ||
" - Use simple and direct language to convey the message effectively.\n", | ||
" \"\"\",\n", | ||
" target_category=\"Conciseness\",\n", | ||
" non_target_category=\"Ok\",\n", | ||
" uncertainty=\"unknown\",\n", | ||
" include_reasoning=True,\n", | ||
" include_score=True,\n", | ||
" pre_messages=[(\"system\", \"You are a judge which evaluates text.\")],\n", | ||
" ),\n", | ||
" provider = \"openai\",\n", | ||
" model = \"gpt-4o-mini\"\n", | ||
")\n", | ||
"\n", | ||
"report = Report(metrics=[\n", | ||
" TextEvals(column_name=\"response\", descriptors=[\n", | ||
" custom_judge\n", | ||
" ])\n", | ||
"])\n", | ||
"\n", | ||
"report.run(reference_data=assistant_logs[datetime(2024, 4, 8) : datetime(2024, 4, 9)][:10], \n", | ||
" current_data=assistant_logs[datetime(2024, 4, 9) : datetime(2024, 4, 10)][:10], \n", | ||
" column_mapping=column_mapping)\n", | ||
"report " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "26352e01-7342-4c5e-b3e1-cf9a56fb3f2e", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.4" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.