From 0c3b1053f95b88e234ed3ccfffa50b27f48dc359 Mon Sep 17 00:00:00 2001
From: Xander Song
Date: Tue, 7 Nov 2023 15:59:32 -0800
Subject: [PATCH] feat: add long-context evaluators, including map reduce and refine patterns (#1710)

---
 src/phoenix/experimental/evals/evaluators.py  | 139 ++++++++
 .../internal/long_context_summary_evals.ipynb | 328 ++++++++++++++++++
 2 files changed, 467 insertions(+)
 create mode 100644 src/phoenix/experimental/evals/evaluators.py
 create mode 100644 tutorials/internal/long_context_summary_evals.ipynb

diff --git a/src/phoenix/experimental/evals/evaluators.py b/src/phoenix/experimental/evals/evaluators.py
new file mode 100644
index 0000000000..397f564da5
--- /dev/null
+++ b/src/phoenix/experimental/evals/evaluators.py
@@ -0,0 +1,139 @@
+from typing import List, Optional
+
+from phoenix.experimental.evals import PromptTemplate
+from phoenix.experimental.evals.models import BaseEvalModel
+
+
+class MapReducer:
+    """
+    Evaluates data that is too large to fit into a single context window using a
+    map-reduce strategy. The data must first be divided into "chunks" that
+    individually fit into an LLM's context window. Each chunk of data is
+    individually evaluated (the "map" step), producing intermediate outputs that
+    are combined into a single result (the "reduce" step).
+
+    This is the simplest strategy for evaluating long-context data.
+    """
+
+    def __init__(
+        self,
+        model: BaseEvalModel,
+        map_prompt_template: PromptTemplate,
+        reduce_prompt_template: PromptTemplate,
+    ) -> None:
+        """Initializes an instance.
+
+        Args:
+            model (BaseEvalModel): The LLM model to use for evaluation.
+
+            map_prompt_template (PromptTemplate): The template that is mapped
+            over each chunk to produce intermediate outputs. Must contain the
+            {chunk} placeholder.
+
+            reduce_prompt_template (PromptTemplate): The template that combines
+            the intermediate outputs into a single result. Must contain the
+            {mapped} placeholder, which will be formatted as a list of the
+            intermediate outputs produced by the map step.
+        """
+        self._model = model
+        self._map_prompt_template = map_prompt_template
+        self._reduce_prompt_template = reduce_prompt_template
+
+    def evaluate(self, chunks: List[str]) -> str:
+        """Evaluates a list of two or more chunks.
+
+        Args:
+            chunks (List[str]): A list of chunks to be evaluated. Each chunk is
+            inserted into the map_prompt_template and must therefore fit within
+            the LLM's context window and still leave room for the rest of the
+            prompt.
+
+        Returns:
+            str: The output of the map-reduce process.
+        """
+        if len(chunks) < 2:
+            raise ValueError(
+                "The map-reduce strategy is not needed to evaluate data "
+                "that fits within a single context window. "
+                "Consider using llm_classify instead."
+            )
+        model = self._model
+        mapped_records = []
+        for chunk in chunks:
+            map_prompt = self._map_prompt_template.format({"chunk": chunk})
+            intermediate_output = model(map_prompt)
+            mapped_records.append(intermediate_output)
+        reduce_prompt = self._reduce_prompt_template.format({"mapped": repr(mapped_records)})
+        return model(reduce_prompt)
+
+
+class Refiner:
+    """
+    Evaluates data that is too large to fit into a single context window using a
+    refine strategy. The data must first be divided into "chunks" that
+    individually fit into an LLM's context window. An initial "accumulator" is
+    generated from the first chunk of data. The accumulator is subsequently
+    refined by iteratively updating and incorporating new information from each
+    subsequent chunk. An optional synthesis step can be used to synthesize the
+    final accumulator into a desired format.
+    """
+
+    def __init__(
+        self,
+        model: BaseEvalModel,
+        initial_prompt_template: PromptTemplate,
+        refine_prompt_template: PromptTemplate,
+        synthesize_prompt_template: Optional[PromptTemplate] = None,
+    ) -> None:
+        """Initializes an instance.
+
+        Args:
+            model (BaseEvalModel): The LLM model to use for evaluation.
+
+            initial_prompt_template (PromptTemplate): The template for the
+            initial invocation of the model that will generate the initial
+            accumulator. Should contain the {chunk} placeholder.
+
+            refine_prompt_template (PromptTemplate): The template for refining
+            the accumulator across all subsequent chunks. Must contain the
+            {chunk} and {accumulator} placeholders.
+
+            synthesize_prompt_template (Optional[PromptTemplate]): An optional
+            template to synthesize the final version of the accumulator. Must
+            contain the {accumulator} placeholder.
+        """
+        self._model = model
+        self._initial_prompt_template = initial_prompt_template
+        self._refine_prompt_template = refine_prompt_template
+        self._synthesize_prompt_template = synthesize_prompt_template
+
+    def evaluate(self, chunks: List[str]) -> str:
+        """Evaluates a list of two or more chunks.
+
+        Args:
+            chunks (List[str]): A list of chunks to be evaluated. Each chunk is
+            inserted into the initial_prompt_template and refine_prompt_template
+            and must therefore fit within the LLM's context window and still
+            leave room for the rest of the prompt.
+
+        Returns:
+            str: The output of the refine process.
+        """
+        if len(chunks) < 2:
+            raise ValueError(
+                "The refine strategy is not needed to evaluate data "
+                "that fits within a single context window. "
+                "Consider using llm_classify instead."
+            )
+        model = self._model
+        initial_prompt = self._initial_prompt_template.format({"chunk": chunks[0]})
+        accumulator = model(initial_prompt)
+        for chunk in chunks[1:]:
+            refine_prompt = self._refine_prompt_template.format(
+                {"accumulator": accumulator, "chunk": chunk}
+            )
+            accumulator = model(refine_prompt)
+        if not self._synthesize_prompt_template:
+            return accumulator
+        synthesize_prompt = self._synthesize_prompt_template.format({"accumulator": accumulator})
+        return model(synthesize_prompt)
diff --git a/tutorials/internal/long_context_summary_evals.ipynb b/tutorials/internal/long_context_summary_evals.ipynb
new file mode 100644
index 0000000000..3a70e9018e
--- /dev/null
+++ b/tutorials/internal/long_context_summary_evals.ipynb
@@ -0,0 +1,328 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Summary Evaluations\n",
+    "\n",
+    "Download Amazon product reviews and parse the raw data into a pandas dataframe."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import gzip\n",
+    "from typing import List\n",
+    "from urllib.request import urlopen\n",
+    "\n",
+    "import pandas as pd\n",
+    "import tiktoken\n",
+    "from langchain.chains.summarize import load_summarize_chain\n",
+    "from langchain.chat_models import ChatOpenAI\n",
+    "from langchain.schema import Document\n",
+    "from phoenix.experimental.evals import PromptTemplate\n",
+    "from phoenix.experimental.evals.evaluators import MapReducer, Refiner\n",
+    "from phoenix.experimental.evals.models import OpenAIModel"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "url = \"https://snap.stanford.edu/data/amazon/Cell_Phones_&_Accessories.txt.gz\"\n",
+    "data = []\n",
+    "review_data = {}\n",
+    "with urlopen(url) as response:\n",
+    "    with gzip.open(response, \"rt\", encoding=\"utf-8\") as unzipped:\n",
+    "        for line in unzipped:\n",
+    "            line = line.strip()\n",
+    "            if line:\n",
+    "                parts = line.split(\": \", 1)\n",
+    "                key = parts[0]\n",
+    "                value = parts[1] if len(parts) > 1 else None\n",
+    "                review_data[key] = value\n",
+    "            else:\n",
+    "                if review_data:\n",
+    "                    data.append(review_data)\n",
+    "                    review_data = {}\n",
+    "    if review_data:\n",
+    "        data.append(review_data)\n",
+    "\n",
+    "df = pd.DataFrame(data)\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df[\"product/productId\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "target_product_id = \"B0009B0IX4\"\n",
+    "product_df = df[df[\"product/productId\"] == target_product_id]\n",
+    "product_df[\"review/summary\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Gather documents into chunks."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "encoding = tiktoken.get_encoding(\"cl100k_base\")\n",
+    "\n",
+    "\n",
+    "def gather_documents_into_chunks(\n",
+    "    documents: List[str],\n",
+    "    max_tokens_per_chunk: int,\n",
+    "    separator=\"\\n\\n======\\n\\n\",\n",
+    ") -> List[str]:\n",
+    "    chunks = []\n",
+    "    current_chunk_documents = []\n",
+    "    current_chunk_tokens = 0\n",
+    "    num_tokens_in_separator = len(encoding.encode(separator))\n",
+    "    for document in documents:\n",
+    "        document_tokens = len(encoding.encode(document))\n",
+    "        tokens_to_add = document_tokens + (\n",
+    "            num_tokens_in_separator if current_chunk_documents else 0\n",
+    "        )\n",
+    "        if current_chunk_tokens + tokens_to_add <= max_tokens_per_chunk:\n",
+    "            current_chunk_documents.append(document)\n",
+    "            current_chunk_tokens += tokens_to_add\n",
+    "        else:\n",
+    "            if current_chunk_documents:\n",
+    "                chunks.append(separator.join(current_chunk_documents))\n",
+    "            current_chunk_documents = [document]\n",
+    "            current_chunk_tokens = document_tokens\n",
+    "    if current_chunk_documents:\n",
+    "        chunks.append(separator.join(current_chunk_documents))\n",
+    "    return chunks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "documents = product_df[\"review/text\"].sample(frac=1, random_state=0).to_list()\n",
+    "gpt4_context_window_in_tokens = 8192\n",
+    "chunks = gather_documents_into_chunks(\n",
+    "    documents=documents,\n",
+    "    max_tokens_per_chunk=(gpt4_context_window_in_tokens - 1000),  # leave a buffer for the prompt\n",
+    ")[:3]\n",
+    "chunks"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Summarize with a LangChain \"refine\" chain."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "llm = ChatOpenAI(model=\"gpt-4\")\n",
+    "chain = load_summarize_chain(llm, chain_type=\"refine\")\n",
+    "documents = [Document(page_content=chunk) for chunk in chunks]\n",
+    "summary = chain.run(documents)\n",
+    "print(summary)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Evaluate the summary using `MapReducer`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = OpenAIModel(\n",
+    "    model_name=\"gpt-4\",\n",
+    ")\n",
+    "map_prompt_template = PromptTemplate(\n",
+    "    \"You will be given a CONTEXT that contains multiple documents. \"\n",
+    "    \"You will also be given a SUMMARY that summarizes the documents in the CONTEXT in addition to other (unseen) documents. \"\n",
+    "    \"You must provide an EVALUATION of the quality of the SUMMARY relative to the provided CONTEXT. \"\n",
+    "    \"Your EVALUATION should judge the quality of the SUMMARY and should concisely explain your reasoning. \"\n",
+    "    \"Bear in mind that the SUMMARY may include information from unseen documents. \"\n",
+    "    \"Focus on important points, not trivial details.\"\n",
+    "    \"\\n\\n\"\n",
+    "    \"=======\"\n",
+    "    f\"SUMMARY: {summary}\"\n",
+    "    \"\\n\\n\"\n",
+    "    \"=======\"\n",
+    "    \"\\n\\n\"\n",
+    "    \"CONTEXT: {chunk}\"\n",
+    "    \"\\n\\n\"\n",
+    "    \"=======\"\n",
+    "    \"\\n\\n\"\n",
+    "    \"EVALUATION: \"\n",
+    ")\n",
+    "reduce_prompt_template = PromptTemplate(\n",
+    "    \"You will be given a SUMMARY that summarizes a large number of documents. \"\n",
\"\n", + " \"Each evaluation judges the SUMMARY relative to a different subset of the documents it summarizes. \"\n", + " \"Given this list, you must provide a single, OVERALL EVALUATION of the quality of the SUMMARY that should take into account the individual EVALUATIONS. \"\n", + " 'Your OVERALL EVALUATION should judge the quality of the SUMMARY as either \"good\" or \"bad\" and should only contain one of those two words with no additional explanation.'\n", + " \"\\n\\n\"\n", + " \"=======\"\n", + " \"\\n\\n\"\n", + " f\"SUMMARY: {summary}\"\n", + " \"\\n\\n\"\n", + " \"=======\"\n", + " \"\\n\\n\"\n", + " \"EVALUATIONS: {mapped}\"\n", + " \"\\n\\n\"\n", + " \"=======\"\n", + " \"\\n\\n\"\n", + " \"OVERALL EVALUATION: \"\n", + ")\n", + "evaluator = MapReducer(\n", + " model=model,\n", + " map_prompt_template=map_prompt_template,\n", + " reduce_prompt_template=reduce_prompt_template,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "summary_evaluation = evaluator.evaluate(chunks)\n", + "print(summary_evaluation)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Evaluate summary using `Refiner`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = OpenAIModel(model_name=\"gpt-4\")\n", + "initial_prompt_template = PromptTemplate(\n", + " \"You will be given a CONTEXT that contains multiple documents. \"\n", + " \"You will also be given a SUMMARY that summarizes the documents in the CONTEXT in addition to other (unseen) documents. \"\n", + " \"You must provide an EVALUATION of the quality of the SUMMARY relative to the provided CONTEXT. \"\n", + " \"Your EVALUATION should judge the quality of the SUMMARY and should concisely explain your reasoning. \"\n", + " \"Bear in mind that the SUMMARY may include information from unseen documents. \"\n", + " \"Focus on important points, not trivial details.\"\n", + " \"\\n\\n\"\n", + " \"=======\"\n", + " f\"SUMMARY: {summary}\"\n", + " \"\\n\\n\"\n", + " \"=======\"\n", + " \"CONTEXT: {chunk}\"\n", + " \"=======\"\n", + " \"EVALUATION: \"\n", + ")\n", + "refine_prompt_template = PromptTemplate(\n", + " \"You will be given: \\n\"\n", + " \" - a CONTEXT that contains multiple documents\\n\"\n", + " \" - a SUMMARY that summarizes the documents in the CONTEXT in addition to other (unseen) documents\\n\"\n", + " \" - an ACCUMULATED EVALUATION of the quality of the SUMMARY relative to other subsets of the summarized documents\\n\"\n", + " \"You must provide a REFINED EVALUATION of the quality of the SUMMARY that considers the current CONTEXT. \"\n", + " \"Bear in mind that the SUMMARY may include information from unseen documents, although you don't need to mention explicitly mention that. \"\n", + " \"Focus on important points, not trivial details.\"\n", + " \"\\n\\n\"\n", + " \"=======\"\n", + " f\"SUMMARY: {summary}\"\n", + " \"\\n\\n\"\n", + " \"=======\"\n", + " \"\\n\\n\"\n", + " \"CONTEXT: {chunk}\"\n", + " \"\\n\\n\"\n", + " \"=======\"\n", + " \"\\n\\n\"\n", + " \"ACCUMULATED EVALUATION: {accumulator}\"\n", + " \"\\n\\n\"\n", + " \"=======\"\n", + " \"\\n\\n\"\n", + " \"REFINED EVALUATION: \"\n", + ")\n", + "synthesize_prompt_template = PromptTemplate(\n", + " \"You will be given a SUMMARY that summarizes a large number of documents. \"\n", + " \"You will also be given a VERBOSE EVALUATION of the quality of that SUMMARY. 
\"\n", + " \"Given this VERBOSE EVALUATION, you must provide a single, CONCISE EVALUATION of the quality of the SUMMARY. \"\n", + " 'Your CONCISE EVALUATION should judge the quality of the SUMMARY as either \"good\" or \"bad\" and should only contain one of those two words with no additional explanation.'\n", + " \"\\n\\n\"\n", + " \"=======\"\n", + " \"\\n\\n\"\n", + " f\"SUMMARY: {summary}\"\n", + " \"\\n\\n\"\n", + " \"=======\"\n", + " \"\\n\\n\"\n", + " \"VERBOSE EVALUATION: {accumulator}\"\n", + " \"\\n\\n\"\n", + " \"=======\"\n", + " \"\\n\\n\"\n", + " \"CONCISE EVALUATION: \"\n", + ")\n", + "evaluator = Refiner(\n", + " model=model,\n", + " initial_prompt_template=initial_prompt_template,\n", + " refine_prompt_template=refine_prompt_template,\n", + " synthesize_prompt_template=synthesize_prompt_template,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "summary_evaluation = evaluator.evaluate(chunks)\n", + "print(summary_evaluation)" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}