From 0c3b1053f95b88e234ed3ccfffa50b27f48dc359 Mon Sep 17 00:00:00 2001
From: Xander Song
Date: Tue, 7 Nov 2023 15:59:32 -0800
Subject: [PATCH] feat: add long-context evaluators, including map reduce and refine patterns (#1710)

---
 src/phoenix/experimental/evals/evaluators.py  | 139 ++++++++
 .../internal/long_context_summary_evals.ipynb | 328 ++++++++++++++++++
 2 files changed, 467 insertions(+)
 create mode 100644 src/phoenix/experimental/evals/evaluators.py
 create mode 100644 tutorials/internal/long_context_summary_evals.ipynb

diff --git a/src/phoenix/experimental/evals/evaluators.py b/src/phoenix/experimental/evals/evaluators.py
new file mode 100644
index 0000000000..397f564da5
--- /dev/null
+++ b/src/phoenix/experimental/evals/evaluators.py
@@ -0,0 +1,139 @@
+from typing import List, Optional
+
+from phoenix.experimental.evals import PromptTemplate
+from phoenix.experimental.evals.models import BaseEvalModel
+
+
+class MapReducer:
+    """
+    Evaluates data that is too large to fit into a single context window using a
+    map-reduce strategy. The data must first be divided into "chunks" that
+    individually fit into an LLM's context window. Each chunk of data is
+    individually evaluated (the "map" step), producing intermediate outputs that
+    are combined into a single result (the "reduce" step).
+
+    This is the simplest strategy for evaluating long-context data.
+    """
+
+    def __init__(
+        self,
+        model: BaseEvalModel,
+        map_prompt_template: PromptTemplate,
+        reduce_prompt_template: PromptTemplate,
+    ) -> None:
+        """Initializes an instance.
+
+        Args:
+            model (BaseEvalModel): The LLM model to use for evaluation.
+
+            map_prompt_template (PromptTemplate): The template that is mapped
+            over each chunk to produce intermediate outputs. Must contain the
+            {chunk} placeholder.
+
+            reduce_prompt_template (PromptTemplate): The template that combines
+            the intermediate outputs into a single result. Must contain the
+            {mapped} placeholder, which will be formatted as a list of the
+            intermediate outputs produced by the map step.
+        """
+        self._model = model
+        self._map_prompt_template = map_prompt_template
+        self._reduce_prompt_template = reduce_prompt_template
+
+    def evaluate(self, chunks: List[str]) -> str:
+        """Evaluates a list of two or more chunks.
+
+        Args:
+            chunks (List[str]): A list of chunks to be evaluated. Each chunk is
+            inserted into the map_prompt_template and must therefore fit within
+            the LLM's context window and still leave room for the rest of the
+            prompt.
+
+        Returns:
+            str: The output of the map-reduce process.
+        """
+        if len(chunks) < 2:
+            raise ValueError(
+                "The map-reduce strategy is not needed to evaluate data "
+                "that fits within a single context window. "
+                "Consider using llm_classify instead."
+            )
+        model = self._model
+        mapped_records = []
+        for chunk in chunks:
+            map_prompt = self._map_prompt_template.format({"chunk": chunk})
+            intermediate_output = model(map_prompt)
+            mapped_records.append(intermediate_output)
+        reduce_prompt = self._reduce_prompt_template.format({"mapped": repr(mapped_records)})
+        return model(reduce_prompt)
+
+
+class Refiner:
+    """
+    Evaluates data that is too large to fit into a single context window using a
+    refine strategy. The data must first be divided into "chunks" that
+    individually fit into an LLM's context window. An initial "accumulator" is
+    generated from the first chunk of data. The accumulator is subsequently
+    refined by iteratively updating and incorporating new information from each
+    subsequent chunk. An optional synthesis step can be used to synthesize the
+    final accumulator into a desired format.
+    """
+
+    def __init__(
+        self,
+        model: BaseEvalModel,
+        initial_prompt_template: PromptTemplate,
+        refine_prompt_template: PromptTemplate,
+        synthesize_prompt_template: Optional[PromptTemplate] = None,
+    ) -> None:
+        """Initializes an instance.
+
+        Args:
+            model (BaseEvalModel): The LLM model to use for evaluation.
+
+            initial_prompt_template (PromptTemplate): The template for the
+            initial invocation of the model that will generate the initial
+            accumulator. Should contain the {chunk} placeholder.
+
+            refine_prompt_template (PromptTemplate): The template for refining
+            the accumulator across all subsequent chunks. Must contain the
+            {chunk} and {accumulator} placeholders.
+
+            synthesize_prompt_template (Optional[PromptTemplate]): An optional
+            template to synthesize the final version of the accumulator. Must
+            contain the {accumulator} placeholder.
+        """
+        self._model = model
+        self._initial_prompt_template = initial_prompt_template
+        self._refine_prompt_template = refine_prompt_template
+        self._synthesize_prompt_template = synthesize_prompt_template
+
+    def evaluate(self, chunks: List[str]) -> str:
+        """Evaluates a list of two or more chunks.
+
+        Args:
+            chunks (List[str]): A list of chunks to be evaluated. Each chunk is
+            inserted into the initial_prompt_template and refine_prompt_template
+            and must therefore fit within the LLM's context window and still
+            leave room for the rest of the prompt.
+
+        Returns:
+            str: The output of the refine process.
+        """
+        if len(chunks) < 2:
+            raise ValueError(
+                "The refine strategy is not needed to evaluate data "
+                "that fits within a single context window. "
+                "Consider using llm_classify instead."
+            )
+        model = self._model
+        initial_prompt = self._initial_prompt_template.format({"chunk": chunks[0]})
+        accumulator = model(initial_prompt)
+        for chunk in chunks[1:]:
+            refine_prompt = self._refine_prompt_template.format(
+                {"accumulator": accumulator, "chunk": chunk}
+            )
+            accumulator = model(refine_prompt)
+        if not self._synthesize_prompt_template:
+            return accumulator
+        synthesize_prompt = self._synthesize_prompt_template.format({"accumulator": accumulator})
+        return model(synthesize_prompt)
diff --git a/tutorials/internal/long_context_summary_evals.ipynb b/tutorials/internal/long_context_summary_evals.ipynb
new file mode 100644
index 0000000000..3a70e9018e
--- /dev/null
+++ b/tutorials/internal/long_context_summary_evals.ipynb
@@ -0,0 +1,328 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Summary Evaluations\n",
+    "\n",
+    "Download Amazon product reviews and parse the raw data into a pandas dataframe."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import gzip\n",
+    "from typing import List\n",
+    "from urllib.request import urlopen\n",
+    "\n",
+    "import pandas as pd\n",
+    "import tiktoken\n",
+    "from langchain.chains.summarize import load_summarize_chain\n",
+    "from langchain.chat_models import ChatOpenAI\n",
+    "from langchain.schema import Document\n",
+    "from phoenix.experimental.evals import PromptTemplate\n",
+    "from phoenix.experimental.evals.evaluators import MapReducer, Refiner\n",
+    "from phoenix.experimental.evals.models import OpenAIModel"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "url = \"https://snap.stanford.edu/data/amazon/Cell_Phones_&_Accessories.txt.gz\"\n",
+    "data = []\n",
+    "review_data = {}\n",
+    "with urlopen(url) as response:\n",
+    "    with gzip.open(response, \"rt\", encoding=\"utf-8\") as unzipped:\n",
+    "        for line in unzipped:\n",
+    "            line = line.strip()\n",
+    "            if line:\n",
+    "                parts = line.split(\": \", 1)\n",
+    "                key = parts[0]\n",
+    "                value = parts[1] if len(parts) > 1 else None\n",
+    "                review_data[key] = value\n",
+    "            else:\n",
+    "                if review_data:\n",
+    "                    data.append(review_data)\n",
+    "                    review_data = {}\n",
+    "    if review_data:\n",
+    "        data.append(review_data)\n",
+    "\n",
+    "df = pd.DataFrame(data)\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df[\"product/productId\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "target_product_id = \"B0009B0IX4\"\n",
+    "product_df = df[df[\"product/productId\"] == target_product_id]\n",
+    "product_df[\"review/summary\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Gather documents into chunks."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "encoding = tiktoken.get_encoding(\"cl100k_base\")\n",
+    "\n",
+    "\n",
+    "def gather_documents_into_chunks(\n",
+    "    documents: List[str],\n",
+    "    max_tokens_per_chunk: int,\n",
+    "    separator=\"\\n\\n======\\n\\n\",\n",
+    ") -> List[str]:\n",
+    "    chunks = []\n",
+    "    current_chunk_documents = []\n",
+    "    current_chunk_tokens = 0\n",
+    "    num_tokens_in_separator = len(encoding.encode(separator))\n",
+    "    for document in documents:\n",
+    "        document_tokens = len(encoding.encode(document))\n",
+    "        tokens_to_add = document_tokens + (\n",
+    "            num_tokens_in_separator if current_chunk_documents else 0\n",
+    "        )\n",
+    "        if current_chunk_tokens + tokens_to_add <= max_tokens_per_chunk:\n",
+    "            current_chunk_documents.append(document)\n",
+    "            current_chunk_tokens += tokens_to_add\n",
+    "        else:\n",
+    "            if current_chunk_documents:\n",
+    "                chunks.append(separator.join(current_chunk_documents))\n",
+    "            current_chunk_documents = [document]\n",
+    "            current_chunk_tokens = document_tokens\n",
+    "    if current_chunk_documents:\n",
+    "        chunks.append(separator.join(current_chunk_documents))\n",
+    "    return chunks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "documents = product_df[\"review/text\"].sample(frac=1, random_state=0).to_list()\n",
+    "gpt4_context_window_in_tokens = 8192\n",
+    "chunks = gather_documents_into_chunks(\n",
+    "    documents=documents,\n",
+    "    max_tokens_per_chunk=(gpt4_context_window_in_tokens - 1000),  # leave a buffer for the prompt\n",
+    ")[:3]\n",
+    "chunks"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Summarize with a LangChain \"refine\" chain."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "llm = ChatOpenAI(model=\"gpt-4\")\n",
+    "chain = load_summarize_chain(llm, chain_type=\"refine\")\n",
+    "documents = [Document(page_content=chunk) for chunk in chunks]\n",
+    "summary = chain.run(documents)\n",
+    "print(summary)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Evaluate the summary using `MapReducer`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = OpenAIModel(\n",
+    "    model_name=\"gpt-4\",\n",
+    ")\n",
+    "map_prompt_template = PromptTemplate(\n",
+    "    \"You will be given a CONTEXT that contains multiple documents. \"\n",
+    "    \"You will also be given a SUMMARY that summarizes the documents in the CONTEXT in addition to other (unseen) documents. \"\n",
+    "    \"You must provide an EVALUATION of the quality of the SUMMARY relative to the provided CONTEXT. \"\n",
+    "    \"Your EVALUATION should judge the quality of the SUMMARY and should concisely explain your reasoning. \"\n",
+    "    \"Bear in mind that the SUMMARY may include information from unseen documents. \"\n",
+    "    \"Focus on important points, not trivial details.\"\n",
+    "    \"\\n\\n\"\n",
+    "    \"=======\"\n",
+    "    f\"SUMMARY: {summary}\"\n",
+    "    \"\\n\\n\"\n",
+    "    \"=======\"\n",
+    "    \"\\n\\n\"\n",
+    "    \"CONTEXT: {chunk}\"\n",
+    "    \"\\n\\n\"\n",
+    "    \"=======\"\n",
+    "    \"\\n\\n\"\n",
+    "    \"EVALUATION: \"\n",
+    ")\n",
+    "reduce_prompt_template = PromptTemplate(\n",
+    "    \"You will be given a SUMMARY that summarizes a large number of documents. \"\n",
\"\n", + " \"Each evaluation judges the SUMMARY relative to a different subset of the documents it summarizes. \"\n", + " \"Given this list, you must provide a single, OVERALL EVALUATION of the quality of the SUMMARY that should take into account the individual EVALUATIONS. \"\n", + " 'Your OVERALL EVALUATION should judge the quality of the SUMMARY as either \"good\" or \"bad\" and should only contain one of those two words with no additional explanation.'\n", + " \"\\n\\n\"\n", + " \"=======\"\n", + " \"\\n\\n\"\n", + " f\"SUMMARY: {summary}\"\n", + " \"\\n\\n\"\n", + " \"=======\"\n", + " \"\\n\\n\"\n", + " \"EVALUATIONS: {mapped}\"\n", + " \"\\n\\n\"\n", + " \"=======\"\n", + " \"\\n\\n\"\n", + " \"OVERALL EVALUATION: \"\n", + ")\n", + "evaluator = MapReducer(\n", + " model=model,\n", + " map_prompt_template=map_prompt_template,\n", + " reduce_prompt_template=reduce_prompt_template,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "summary_evaluation = evaluator.evaluate(chunks)\n", + "print(summary_evaluation)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Evaluate summary using `Refiner`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = OpenAIModel(model_name=\"gpt-4\")\n", + "initial_prompt_template = PromptTemplate(\n", + " \"You will be given a CONTEXT that contains multiple documents. \"\n", + " \"You will also be given a SUMMARY that summarizes the documents in the CONTEXT in addition to other (unseen) documents. \"\n", + " \"You must provide an EVALUATION of the quality of the SUMMARY relative to the provided CONTEXT. \"\n", + " \"Your EVALUATION should judge the quality of the SUMMARY and should concisely explain your reasoning. \"\n", + " \"Bear in mind that the SUMMARY may include information from unseen documents. \"\n", + " \"Focus on important points, not trivial details.\"\n", + " \"\\n\\n\"\n", + " \"=======\"\n", + " f\"SUMMARY: {summary}\"\n", + " \"\\n\\n\"\n", + " \"=======\"\n", + " \"CONTEXT: {chunk}\"\n", + " \"=======\"\n", + " \"EVALUATION: \"\n", + ")\n", + "refine_prompt_template = PromptTemplate(\n", + " \"You will be given: \\n\"\n", + " \" - a CONTEXT that contains multiple documents\\n\"\n", + " \" - a SUMMARY that summarizes the documents in the CONTEXT in addition to other (unseen) documents\\n\"\n", + " \" - an ACCUMULATED EVALUATION of the quality of the SUMMARY relative to other subsets of the summarized documents\\n\"\n", + " \"You must provide a REFINED EVALUATION of the quality of the SUMMARY that considers the current CONTEXT. \"\n", + " \"Bear in mind that the SUMMARY may include information from unseen documents, although you don't need to mention explicitly mention that. \"\n", + " \"Focus on important points, not trivial details.\"\n", + " \"\\n\\n\"\n", + " \"=======\"\n", + " f\"SUMMARY: {summary}\"\n", + " \"\\n\\n\"\n", + " \"=======\"\n", + " \"\\n\\n\"\n", + " \"CONTEXT: {chunk}\"\n", + " \"\\n\\n\"\n", + " \"=======\"\n", + " \"\\n\\n\"\n", + " \"ACCUMULATED EVALUATION: {accumulator}\"\n", + " \"\\n\\n\"\n", + " \"=======\"\n", + " \"\\n\\n\"\n", + " \"REFINED EVALUATION: \"\n", + ")\n", + "synthesize_prompt_template = PromptTemplate(\n", + " \"You will be given a SUMMARY that summarizes a large number of documents. \"\n", + " \"You will also be given a VERBOSE EVALUATION of the quality of that SUMMARY. 
\"\n", + " \"Given this VERBOSE EVALUATION, you must provide a single, CONCISE EVALUATION of the quality of the SUMMARY. \"\n", + " 'Your CONCISE EVALUATION should judge the quality of the SUMMARY as either \"good\" or \"bad\" and should only contain one of those two words with no additional explanation.'\n", + " \"\\n\\n\"\n", + " \"=======\"\n", + " \"\\n\\n\"\n", + " f\"SUMMARY: {summary}\"\n", + " \"\\n\\n\"\n", + " \"=======\"\n", + " \"\\n\\n\"\n", + " \"VERBOSE EVALUATION: {accumulator}\"\n", + " \"\\n\\n\"\n", + " \"=======\"\n", + " \"\\n\\n\"\n", + " \"CONCISE EVALUATION: \"\n", + ")\n", + "evaluator = Refiner(\n", + " model=model,\n", + " initial_prompt_template=initial_prompt_template,\n", + " refine_prompt_template=refine_prompt_template,\n", + " synthesize_prompt_template=synthesize_prompt_template,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "summary_evaluation = evaluator.evaluate(chunks)\n", + "print(summary_evaluation)" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}