diff --git a/app/schema.graphql b/app/schema.graphql index edf634e517..7c1e164987 100644 --- a/app/schema.graphql +++ b/app/schema.graphql @@ -179,6 +179,29 @@ type DimensionWithValue { value: String } +type DocumentEvaluation implements Evaluation { + """Name of the evaluation, e.g. 'helpfulness' or 'relevance'.""" + name: String! + + """Result of the evaluation in the form of a numeric score.""" + score: Float + + """ + Result of the evaluation in the form of a string, e.g. 'helpful' or 'not helpful'. Note that the label is not necessarily binary. + """ + label: String + + """ + The evaluator's explanation for the evaluation result (i.e. score or label, or both) given to the subject. + """ + explanation: String + + """ + The zero-based index among retrieved documents, which are collected as a list (even when ordering is not inherently meaningful). + """ + documentPosition: Int! +} + type DriftTimeSeries implements TimeSeries { data: [TimeSeriesDataPoint!]! } @@ -261,6 +284,24 @@ type EmbeddingMetadata { linkToData: String } +interface Evaluation { + """Name of the evaluation, e.g. 'helpfulness' or 'relevance'.""" + name: String! + + """Result of the evaluation in the form of a numeric score.""" + score: Float + + """ + Result of the evaluation in the form of a string, e.g. 'helpful' or 'not helpful'. Note that the label is not necessarily binary. + """ + label: String + + """ + The evaluator's explanation for the evaluation result (i.e. score or label, or both) given to the subject. + """ + explanation: String +} + type Event { id: ID! eventMetadata: EventMetadata! @@ -460,6 +501,11 @@ type Query { clusterSelectionEpsilon: Float! = 0 ): [Cluster!]! spans(timeRange: TimeRange, traceIds: [ID!], first: Int = 50, last: Int, after: String, before: String, sort: SpanSort, rootSpansOnly: Boolean = false, filterCondition: String = null): SpanConnection! + + """ + Names of all available evaluations for spans. (The list contains no duplicates.) + """ + spanEvaluationNames: [String!]! traceDatasetInfo: TraceDatasetInfo validateSpanFilterCondition(condition: String!): ValidationResult! } @@ -527,6 +573,16 @@ type Span { """ cumulativeTokenCountCompletion: Int + """ + Evaluations associated with the span, e.g. if the span is an LLM, an evaluation may assess the helpfulness of its response with respect to its input. + """ + spanEvaluations: [SpanEvaluation!]! + + """ + Evaluations of the documents associated with the span, e.g. if the span is a RETRIEVER with a list of documents in its RETRIEVAL_DOCUMENTS attribute, an evaluation for each document may assess its relevance with respect to the input query of the span. Note that RETRIEVAL_DOCUMENTS is a list, and each evaluation is identified by its document's (zero-based) index in that list. + """ + documentEvaluations: [DocumentEvaluation!]! + """All descendant spans (children, grandchildren, etc.)""" descendants: [Span!]! } @@ -558,6 +614,24 @@ type SpanEdge { cursor: String! } +type SpanEvaluation implements Evaluation { + """Name of the evaluation, e.g. 'helpfulness' or 'relevance'.""" + name: String! + + """Result of the evaluation in the form of a numeric score.""" + score: Float + + """ + Result of the evaluation in the form of a string, e.g. 'helpful' or 'not helpful'. Note that the label is not necessarily binary. + """ + label: String + + """ + The evaluator's explanation for the evaluation result (i.e. score or label, or both) given to the subject. + """ + explanation: String +} + type SpanEvent { name: String! message: String!
diff --git a/pyproject.toml b/pyproject.toml index 0f3524a6bd..092f57c7b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -234,7 +234,10 @@ dependencies = [ ] [tool.hatch.envs.proto.scripts] -recompile = "python -m grpc_tools.protoc -I src/phoenix/proto --python_out=src/phoenix --mypy_out=src/phoenix src/phoenix/proto/trace/v1/trace.proto" +recompile = """ +python -m grpc_tools.protoc -I src/phoenix/proto --python_out=src/phoenix --mypy_out=src/phoenix src/phoenix/proto/trace/v1/trace.proto && +python -m grpc_tools.protoc -I src/phoenix/proto --python_out=src/phoenix --mypy_out=src/phoenix src/phoenix/proto/trace/v1/evaluation.proto +""" [tool.interrogate] fail-under = 0 diff --git a/src/phoenix/core/evals.py b/src/phoenix/core/evals.py new file mode 100644 index 0000000000..4972a38fd1 --- /dev/null +++ b/src/phoenix/core/evals.py @@ -0,0 +1,81 @@ +import weakref +from collections import defaultdict +from queue import SimpleQueue +from threading import RLock, Thread +from types import MethodType +from typing import DefaultDict, Dict, List, Optional + +from typing_extensions import TypeAlias + +import phoenix.trace.v1 as pb +from phoenix.trace.schemas import SpanID + +END_OF_QUEUE = None # sentinel value for queue termination + +EvaluationName: TypeAlias = str +DocumentPosition: TypeAlias = int + + +class Evals: + def __init__(self) -> None: + self._queue: "SimpleQueue[Optional[pb.Evaluation]]" = SimpleQueue() + weakref.finalize(self, self._queue.put, END_OF_QUEUE) + self._lock = RLock() + self._start_consumer() + self._span_evaluations_by_name: DefaultDict[ + EvaluationName, Dict[SpanID, pb.Evaluation] + ] = defaultdict(dict) + self._evaluations_by_span_id: DefaultDict[ + SpanID, Dict[EvaluationName, pb.Evaluation] + ] = defaultdict(dict) + self._document_evaluations_by_span_id: DefaultDict[ + SpanID, DefaultDict[EvaluationName, Dict[DocumentPosition, pb.Evaluation]] + ] = defaultdict(lambda: defaultdict(dict)) + + def put(self, evaluation: pb.Evaluation) -> None: + self._queue.put(evaluation) + + def _start_consumer(self) -> None: + Thread( + target=MethodType( + self.__class__._consume_evaluations, + weakref.proxy(self), + ), + daemon=True, + ).start() + + def _consume_evaluations(self) -> None: + while (item := self._queue.get()) is not END_OF_QUEUE: + with self._lock: + self._process_evaluation(item) + + def _process_evaluation(self, evaluation: pb.Evaluation) -> None: + subject_id = evaluation.subject_id + name = evaluation.name + subject_id_kind = subject_id.WhichOneof("kind") + if subject_id_kind == "document_retrieval_id": + document_retrieval_id = subject_id.document_retrieval_id + span_id = SpanID(document_retrieval_id.span_id) + document_position = document_retrieval_id.document_position + self._document_evaluations_by_span_id[span_id][name][document_position] = evaluation + elif subject_id_kind == "span_id": + span_id = SpanID(subject_id.span_id) + self._evaluations_by_span_id[span_id][name] = evaluation + self._span_evaluations_by_name[name][span_id] = evaluation + else: + raise ValueError(f"unrecognized subject_id type: {type(subject_id_kind)}") + + def get_span_evaluation_names(self) -> List[EvaluationName]: + with self._lock: + return list(self._span_evaluations_by_name.keys()) + + def get_evaluations_by_span_id(self, span_id: SpanID) -> List[pb.Evaluation]: + with self._lock: + return list(self._evaluations_by_span_id[span_id].values()) + + def get_document_evaluations_by_span_id(self, span_id: SpanID) -> List[pb.Evaluation]: + all_evaluations: List[pb.Evaluation] = [] 
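+        # Flatten the nested {evaluation_name: {document_position: evaluation}} mapping for this span into a single list.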
+ with self._lock: + for evaluations in self._document_evaluations_by_span_id[span_id].values(): + all_evaluations.extend(evaluations.values()) + return all_evaluations diff --git a/src/phoenix/proto/trace/v1/evaluation.proto b/src/phoenix/proto/trace/v1/evaluation.proto new file mode 100644 index 0000000000..93441161a6 --- /dev/null +++ b/src/phoenix/proto/trace/v1/evaluation.proto @@ -0,0 +1,26 @@ +syntax = "proto3"; +package phoenix.proto.evaluation.v1; + +import "google/protobuf/wrappers.proto"; + +message Evaluation { + string name = 1; + message SubjectId { + message DocumentRetrievalId { + string span_id = 1; + int32 document_position = 2; // zero-based index + } + oneof kind { + string trace_id = 1; + string span_id = 2; + DocumentRetrievalId document_retrieval_id = 3; + } + } + SubjectId subject_id = 2; + message Result { + google.protobuf.DoubleValue score = 1; + google.protobuf.StringValue label = 2; + google.protobuf.StringValue explanation = 3; + } + Result result = 3; +} diff --git a/src/phoenix/server/api/context.py b/src/phoenix/server/api/context.py index 2d6a95be7e..b0480bb960 100644 --- a/src/phoenix/server/api/context.py +++ b/src/phoenix/server/api/context.py @@ -6,6 +6,7 @@ from starlette.responses import Response from starlette.websockets import WebSocket +from phoenix.core.evals import Evals from phoenix.core.model_schema import Model from phoenix.core.traces import Traces @@ -18,3 +19,4 @@ class Context: export_path: Path corpus: Optional[Model] = None traces: Optional[Traces] = None + evals: Optional[Evals] = None diff --git a/src/phoenix/server/api/schema.py b/src/phoenix/server/api/schema.py index cfd9fad317..844cdb32a8 100644 --- a/src/phoenix/server/api/schema.py +++ b/src/phoenix/server/api/schema.py @@ -239,6 +239,18 @@ def spans( data = list(map(to_gql_span, spans)) return connection_from_list(data=data, args=args) + @strawberry.field( + description="Names of all available evaluations for spans. " + "(The list contains no duplicates.)" + ) # type: ignore + def span_evaluation_names( + self, + info: Info[Context, None], + ) -> List[str]: + if (evals := info.context.evals) is None: + return [] + return evals.get_span_evaluation_names() + @strawberry.field def trace_dataset_info( self, diff --git a/src/phoenix/server/api/types/Evaluation.py b/src/phoenix/server/api/types/Evaluation.py new file mode 100644 index 0000000000..5b4ccd0915 --- /dev/null +++ b/src/phoenix/server/api/types/Evaluation.py @@ -0,0 +1,71 @@ +from typing import Optional + +import strawberry + +import phoenix.trace.v1 as pb +from phoenix.trace.schemas import SpanID + + +@strawberry.interface +class Evaluation: + name: str = strawberry.field( + description="Name of the evaluation, e.g. 'helpfulness' or 'relevance'." + ) + score: Optional[float] = strawberry.field( + description="Result of the evaluation in the form of a numeric score." + ) + label: Optional[str] = strawberry.field( + description="Result of the evaluation in the form of a string, e.g. " + "'helpful' or 'not helpful'. Note that the label is not necessarily binary." + ) + explanation: Optional[str] = strawberry.field( + description="The evaluator's explanation for the evaluation result (i.e. " + "score or label, or both) given to the subject." 
+ ) + + +@strawberry.type +class SpanEvaluation(Evaluation): + span_id: strawberry.Private[SpanID] + + @staticmethod + def from_pb_evaluation(evaluation: pb.Evaluation) -> "SpanEvaluation": + result = evaluation.result + score = result.score.value if result.HasField("score") else None + label = result.label.value if result.HasField("label") else None + explanation = result.explanation.value if result.HasField("explanation") else None + span_id = SpanID(evaluation.subject_id.span_id) + return SpanEvaluation( + name=evaluation.name, + score=score, + label=label, + explanation=explanation, + span_id=span_id, + ) + + +@strawberry.type +class DocumentEvaluation(Evaluation): + span_id: strawberry.Private[SpanID] + document_position: int = strawberry.field( + description="The zero-based index among retrieved documents, which " + "are collected as a list (even when ordering is not inherently meaningful)." + ) + + @staticmethod + def from_pb_evaluation(evaluation: pb.Evaluation) -> "DocumentEvaluation": + result = evaluation.result + score = result.score.value if result.HasField("score") else None + label = result.label.value if result.HasField("label") else None + explanation = result.explanation.value if result.HasField("explanation") else None + document_retrieval_id = evaluation.subject_id.document_retrieval_id + document_position = document_retrieval_id.document_position + span_id = SpanID(document_retrieval_id.span_id) + return DocumentEvaluation( + name=evaluation.name, + score=score, + label=label, + explanation=explanation, + document_position=document_position, + span_id=span_id, + ) diff --git a/src/phoenix/server/api/types/Span.py b/src/phoenix/server/api/types/Span.py index 02559445b4..4de44d14a6 100644 --- a/src/phoenix/server/api/types/Span.py +++ b/src/phoenix/server/api/types/Span.py @@ -11,6 +11,7 @@ import phoenix.trace.schemas as trace_schema from phoenix.core.traces import ComputedAttributes from phoenix.server.api.context import Context +from phoenix.server.api.types.Evaluation import DocumentEvaluation, SpanEvaluation from phoenix.server.api.types.MimeType import MimeType from phoenix.trace.schemas import SpanID from phoenix.trace.semantic_conventions import ( @@ -123,6 +124,43 @@ class Span: "descendant spans (children, grandchildren, etc.)", ) + @strawberry.field( + description="Evaluations associated with the span, e.g. if the span is " + "an LLM, an evaluation may assess the helpfulness of its response with " + "respect to its input." + ) # type: ignore + def span_evaluations( + self, + info: Info[Context, None], + ) -> List[SpanEvaluation]: + if not (evals := info.context.evals): + return [] + span_id = SpanID(str(self.context.span_id)) + return [ + SpanEvaluation.from_pb_evaluation(evaluation) + for evaluation in evals.get_evaluations_by_span_id(span_id) + ] + + @strawberry.field( + description="Evaluations of the documents associated with the span, e.g. " + "if the span is a RETRIEVER with a list of documents in its RETRIEVAL_DOCUMENTS " + "attribute, an evaluation for each document may assess its relevance " + "with respect to the input query of the span. Note that RETRIEVAL_DOCUMENTS is " + "a list, and each evaluation is identified by its document's (zero-based) " + "index in that list."
+ ) # type: ignore + def document_evaluations( + self, + info: Info[Context, None], + ) -> List[DocumentEvaluation]: + if not (evals := info.context.evals): + return [] + span_id = SpanID(str(self.context.span_id)) + return [ + DocumentEvaluation.from_pb_evaluation(evaluation) + for evaluation in evals.get_document_evaluations_by_span_id(span_id) + ] + @strawberry.field( description="All descendant spans (children, grandchildren, etc.)", ) # type: ignore diff --git a/src/phoenix/server/app.py b/src/phoenix/server/app.py index f1efd195e3..4f16f4769e 100644 --- a/src/phoenix/server/app.py +++ b/src/phoenix/server/app.py @@ -20,11 +20,13 @@ import phoenix from phoenix.config import SERVER_DIR +from phoenix.core.evals import Evals from phoenix.core.model_schema import Model from phoenix.core.traces import Traces from phoenix.pointcloud.umap_parameters import UMAPParameters from phoenix.server.api.context import Context from phoenix.server.api.schema import schema +from phoenix.server.evaluation_handler import EvaluationHandler from phoenix.server.span_handler import SpanHandler logger = logging.getLogger(__name__) @@ -109,10 +111,12 @@ def __init__( graphiql: bool = False, corpus: Optional[Model] = None, traces: Optional[Traces] = None, + evals: Optional[Evals] = None, ) -> None: self.model = model self.corpus = corpus self.traces = traces + self.evals = evals self.export_path = export_path super().__init__(schema, graphiql=graphiql) @@ -127,6 +131,7 @@ async def get_context( model=self.model, corpus=self.corpus, traces=self.traces, + evals=self.evals, export_path=self.export_path, ) @@ -156,6 +161,7 @@ def create_app( umap_params: UMAPParameters, corpus: Optional[Model] = None, traces: Optional[Traces] = None, + evals: Optional[Evals] = None, debug: bool = False, ) -> Starlette: graphql = GraphQLWithContext( @@ -163,6 +169,7 @@ def create_app( model=model, corpus=corpus, traces=traces, + evals=evals, export_path=export_path, graphiql=True, ) @@ -185,6 +192,16 @@ def create_app( ), ] ) + + ( + [] + if evals is None + else [ + Route( + "/v1/evaluations", + type("SpanEndpoint", (EvaluationHandler,), {"queue": evals}), + ) + ] + ) + [ Route("/arize_phoenix_version", version), Route( diff --git a/src/phoenix/server/evaluation_handler.py b/src/phoenix/server/evaluation_handler.py new file mode 100644 index 0000000000..238476272f --- /dev/null +++ b/src/phoenix/server/evaluation_handler.py @@ -0,0 +1,46 @@ +import gzip +from typing import Protocol + +from google.protobuf.message import DecodeError +from starlette.endpoints import HTTPEndpoint +from starlette.requests import Request +from starlette.responses import Response +from starlette.status import HTTP_415_UNSUPPORTED_MEDIA_TYPE, HTTP_422_UNPROCESSABLE_ENTITY + +import phoenix.trace.v1 as pb + + +class SupportsPutEvaluation(Protocol): + def put(self, evaluation: pb.Evaluation) -> None: + ... 
+ + +class EvaluationHandler(HTTPEndpoint): + queue: SupportsPutEvaluation + + async def post(self, request: Request) -> Response: + content_type = request.headers.get("content-type") + if content_type != "application/x-protobuf": + return Response( + content="Unsupported content type", + status_code=HTTP_415_UNSUPPORTED_MEDIA_TYPE, + ) + body = await request.body() + content_encoding = request.headers.get("content-encoding") + if content_encoding == "gzip": + body = gzip.decompress(body) + elif content_encoding: + return Response( + content="Unsupported content encoding", + status_code=HTTP_415_UNSUPPORTED_MEDIA_TYPE, + ) + evaluation = pb.Evaluation() + try: + evaluation.ParseFromString(body) + except DecodeError: + return Response( + content="Request body is invalid", + status_code=HTTP_422_UNPROCESSABLE_ENTITY, + ) + self.queue.put(evaluation) + return Response() diff --git a/src/phoenix/server/main.py b/src/phoenix/server/main.py index ce5e996662..eda3382145 100644 --- a/src/phoenix/server/main.py +++ b/src/phoenix/server/main.py @@ -11,6 +11,7 @@ from uvicorn import Config, Server from phoenix.config import EXPORT_DIR, get_env_host, get_env_port, get_pids_path +from phoenix.core.evals import Evals from phoenix.core.model_schema_adapter import create_model_from_datasets from phoenix.core.traces import Traces from phoenix.datasets.dataset import EMPTY_DATASET, Dataset @@ -26,6 +27,7 @@ TRACES_FIXTURES, _download_traces_fixture, _get_trace_fixture_by_name, + get_evals_from_fixture, ) from phoenix.trace.span_json_decoder import json_string_to_span @@ -143,6 +145,7 @@ def _load_items( reference_dataset, ) traces = Traces() + evals = Evals() if trace_dataset_name is not None: fixture_spans = map( json_string_to_span, @@ -157,6 +160,12 @@ def _load_items( args=(traces, fixture_spans, simulate_streaming), daemon=True, ).start() + fixture_evals = get_evals_from_fixture(trace_dataset_name) + Thread( + target=_load_items, + args=(evals, fixture_evals, simulate_streaming), + daemon=True, + ).start() umap_params_list = args.umap_params.split(",") umap_params = UMAPParameters( min_dist=float(umap_params_list[0]), @@ -169,6 +178,7 @@ def _load_items( model=model, umap_params=umap_params, traces=traces, + evals=evals, corpus=None if corpus_dataset is None else create_model_from_datasets(corpus_dataset), debug=args.debug, ) diff --git a/src/phoenix/session/evaluation.py b/src/phoenix/session/evaluation.py new file mode 100644 index 0000000000..a4de5e1029 --- /dev/null +++ b/src/phoenix/session/evaluation.py @@ -0,0 +1,158 @@ +""" +A set of **highly experimental** helper functions to + - extract spans from Phoenix for evaluation + - explode retrieved documents from (horizontal) lists to a (vertical) series + indexed by `context.span_id` and `document_position` + - ingest evaluation results into Phoenix via HttpExporter +""" +import collections +import math +from typing import ( + Any, + Iterable, + List, + Mapping, + Optional, + Sequence, + Tuple, + Union, + cast, +) + +import pandas as pd +from google.protobuf.wrappers_pb2 import DoubleValue, StringValue + +import phoenix.trace.v1 as pb +from phoenix.core.traces import TRACE_ID +from phoenix.session.session import Session +from phoenix.trace.exporter import HttpExporter +from phoenix.trace.schemas import ATTRIBUTE_PREFIX +from phoenix.trace.semantic_conventions import ( + DOCUMENT_CONTENT, + DOCUMENT_SCORE, + INPUT_VALUE, + RETRIEVAL_DOCUMENTS, +) + + +def get_retrieved_documents(session: Session) -> pd.DataFrame: + data: List[Mapping[str, Any]] = [] 
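+    # Collect all RETRIEVER spans and explode each span's RETRIEVAL_DOCUMENTS list into one row per (span_id, document_position).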
+ if (df := session.get_spans_dataframe("span_kind == 'RETRIEVER'")) is not None: + for span_id, query, documents, trace_id in df.loc[ + :, + [ + ATTRIBUTE_PREFIX + INPUT_VALUE, + ATTRIBUTE_PREFIX + RETRIEVAL_DOCUMENTS, + TRACE_ID, + ], + ].itertuples(): + if not isinstance(documents, Iterable): + continue + for position, document in enumerate(documents): + if not hasattr(document, "get"): + continue + data.append( + { + "context.trace_id": trace_id, + "context.span_id": span_id, + "input": query, + "document_position": position, + "reference": document.get(DOCUMENT_CONTENT), + "document_score": document.get(DOCUMENT_SCORE), + } + ) + index = ["context.span_id", "document_position"] + columns = [ + "context.span_id", + "document_position", + "input", + "reference", + "document_score", + "context.trace_id", + ] + return pd.DataFrame(data=data, columns=columns).set_index(index) + + +def add_evaluations( + exporter: HttpExporter, + evaluations: pd.DataFrame, + evaluation_name: str, +) -> None: + index_names = evaluations.index.names + for index, row in evaluations.iterrows(): + subject_id = _extract_subject_id_from_index( + index_names, + cast(Union[str, Tuple[Any]], index), + ) + if (result := _extract_result(row)) is None: + continue + evaluation = pb.Evaluation( + name=evaluation_name, + result=result, + subject_id=subject_id, + ) + exporter.export(evaluation) + + +def _extract_subject_id_from_index( + names: Sequence[str], + value: Union[str, Sequence[Any]], +) -> pb.Evaluation.SubjectId: + """ + (**Highly Experimental**) + Returns `SubjectId` given the format of `index_names`. Allowed formats are: + - DocumentRetrievalId + - index_names=["context.span_id", "document_position"] + - index_names=["span_id", "document_position"] + - index_names=["document_position", "context.span_id"] + - index_names=["document_position", "span_id"] + - SpanId + - index_names=["span_id"] + - index_names=["context.span_id"] + - TraceId + - index_names=["context.trace_id"] + - index_names=["trace_id"] + """ + assert isinstance(names, collections.abc.Sequence) + if len(names) == 2: + assert isinstance(value, collections.abc.Sequence) and len(value) == 2 + if "document_position" in names: + document_position = value[names.index("document_position")] + assert isinstance(document_position, int) + if "context.span_id" in names: + span_id = value[names.index("context.span_id")] + elif "span_id" in names: + span_id = value[names.index("span_id")] + else: + raise ValueError(f"Unexpected index names: {names}") + assert isinstance(span_id, str) + return pb.Evaluation.SubjectId( + document_retrieval_id=pb.Evaluation.SubjectId.DocumentRetrievalId( + document_position=document_position, + span_id=span_id, + ), + ) + elif len(names) == 1: + assert isinstance(value, str) + if names[0] in ("context.span_id", "span_id"): + return pb.Evaluation.SubjectId(span_id=value) + if names[0] in ("context.trace_id", "trace_id"): + return pb.Evaluation.SubjectId(trace_id=value) + raise ValueError(f"Unexpected index names: {names}") + + +def _extract_result(row: "pd.Series[Any]") -> Optional[pb.Evaluation.Result]: + score = cast(Optional[float], row.get("score")) + label = cast(Optional[str], row.get("label")) + explanation = cast(Optional[str], row.get("explanation")) + if ( + (score is None or isinstance(score, float) and math.isnan(score)) + and not label + and not explanation + ): + return None + return pb.Evaluation.Result( + score=DoubleValue(value=score) if score is not None else None, + label=StringValue(value=label) if label else None, +
explanation=StringValue(value=explanation) if explanation else None, + ) diff --git a/src/phoenix/session/session.py b/src/phoenix/session/session.py index ba5422bde9..3ff682eb17 100644 --- a/src/phoenix/session/session.py +++ b/src/phoenix/session/session.py @@ -21,6 +21,7 @@ import pandas as pd from phoenix.config import ENV_NOTEBOOK_ENV, get_env_host, get_env_port, get_exported_files +from phoenix.core.evals import Evals from phoenix.core.model_schema_adapter import create_model_from_datasets from phoenix.core.traces import Traces from phoenix.datasets.dataset import EMPTY_DATASET, Dataset @@ -117,6 +118,8 @@ def __init__( for span in trace_dataset.to_spans(): self.traces.put(span) + self.evals: Evals = Evals() + self.host = host or get_env_host() self.port = port or get_env_port() self.temp_dir = TemporaryDirectory() @@ -279,6 +282,7 @@ def __init__( model=self.model, corpus=self.corpus, traces=self.traces, + evals=self.evals, umap_params=self.umap_parameters, ) self.server = ThreadServer( diff --git a/src/phoenix/trace/exporter.py b/src/phoenix/trace/exporter.py index a1f4eb66e5..f7b71fad4e 100644 --- a/src/phoenix/trace/exporter.py +++ b/src/phoenix/trace/exporter.py @@ -4,10 +4,11 @@ from queue import SimpleQueue from threading import Thread from types import MethodType -from typing import Any, Optional +from typing import Any, Optional, Union import requests from requests import Session +from typing_extensions import TypeAlias import phoenix.trace.v1 as pb from phoenix.config import get_env_host, get_env_port @@ -19,6 +20,8 @@ END_OF_QUEUE = None # sentinel value for queue termination +Message: TypeAlias = Union[pb.Span, pb.Evaluation] + class NoOpExporter: def export(self, _: Any) -> None: @@ -32,7 +35,7 @@ def __init__( port: Optional[int] = None, ) -> None: """ - Span Exporter using HTTP. + Span/Evaluation Exporter using HTTP. Parameters ---------- @@ -55,13 +58,16 @@ def __init__( "content-encoding": "gzip", } ) - self._queue: "SimpleQueue[Optional[pb.Span]]" = SimpleQueue() + self._queue: "SimpleQueue[Optional[Message]]" = SimpleQueue() # Putting `None` as the sentinel value for queue termination. 
weakref.finalize(self, self._queue.put, END_OF_QUEUE) self._start_consumer() - def export(self, span: Span) -> None: - self._queue.put(encode(span)) + def export(self, item: Union[Span, pb.Evaluation]) -> None: + if isinstance(item, Span): + self._queue.put(encode(item)) + elif isinstance(item, pb.Evaluation): + self._queue.put(item) def _start_consumer(self) -> None: Thread( @@ -76,16 +82,20 @@ def _consume_items(self) -> None: while (item := self._queue.get()) is not END_OF_QUEUE: self._send(item) - def _send(self, item: pb.Span) -> None: - serialized = item.SerializeToString() + def _send(self, message: Message) -> None: + serialized = message.SerializeToString() data = gzip.compress(serialized) try: - self._session.post(self._url(item), data=data) + self._session.post(self._url(message), data=data).raise_for_status() except Exception as e: logger.exception(e) - def _url(self, _: pb.Span) -> str: - return f"{self._base_url}/v1/spans" + def _url(self, message: Message) -> str: + if isinstance(message, pb.Span): + return f"{self._base_url}/v1/spans" + if isinstance(message, pb.Evaluation): + return f"{self._base_url}/v1/evaluations" + raise ValueError(f"Unknown message type: {type(message)}") def _warn_if_phoenix_is_not_running(self) -> None: try: diff --git a/src/phoenix/trace/fixtures.py b/src/phoenix/trace/fixtures.py index 6341bd9acd..cd5a474986 100644 --- a/src/phoenix/trace/fixtures.py +++ b/src/phoenix/trace/fixtures.py @@ -1,22 +1,67 @@ -from dataclasses import dataclass -from typing import List, Optional, cast +from dataclasses import dataclass, field +from typing import Iterable, Iterator, List, NamedTuple, Optional, Tuple, cast from urllib import request +import pandas as pd +from google.protobuf.wrappers_pb2 import DoubleValue, StringValue + +import phoenix.trace.v1 as pb from phoenix.trace.trace_dataset import TraceDataset from phoenix.trace.utils import json_lines_to_df +class EvaluationResultSchema(NamedTuple): + label: Optional[str] = "label" + score: Optional[str] = "score" + explanation: Optional[str] = "explanation" + + +@dataclass(frozen=True) +class EvaluationFixture: + evaluation_name: str + file_name: str + evaluation_result_schema: EvaluationResultSchema = field(default_factory=EvaluationResultSchema) + + +@dataclass(frozen=True) +class DocumentEvaluationFixture(EvaluationFixture): + document_position: str = "document_position" + + @dataclass(frozen=True) class TracesFixture: name: str description: str file_name: str + evaluation_fixtures: Iterable[EvaluationFixture] = () llama_index_rag_fixture = TracesFixture( name="llama_index_rag", description="Traces from running the llama_index on a RAG use case.", file_name="llama_index_rag_with_rerank.jsonl", + evaluation_fixtures=( + EvaluationFixture( + evaluation_name="Q&A Correctness", + file_name="llama_index_rag_with_rerank.qa_correctness_eval.parquet", + ), + EvaluationFixture( + evaluation_name="Hallucination", + file_name="llama_index_rag_with_rerank.hallucination_eval.parquet", + ), + EvaluationFixture( + evaluation_name="NDCG@2", + file_name="llama_index_rag_with_rerank.ndcg_at_2.parquet", + ), + EvaluationFixture( + evaluation_name="Precision@3", + file_name="llama_index_rag_with_rerank.precision_at_3.parquet", + ), + DocumentEvaluationFixture( + evaluation_name="Relevance", + file_name="llama_index_rag_with_rerank.documents_eval.parquet", + ), + ), ) llama_index_calculator_agent_fixture = TracesFixture( @@ -105,3 +150,48 @@ def load_example_traces(use_case: str) -> TraceDataset: """ fixture = 
_get_trace_fixture_by_name(use_case) return TraceDataset(json_lines_to_df(_download_traces_fixture(fixture))) + + +def get_evals_from_fixture(use_case: str) -> Iterator[pb.Evaluation]: + fixture = _get_trace_fixture_by_name(use_case) + for eval_fixture in fixture.evaluation_fixtures: + yield from _read_eval_fixture(eval_fixture) + + +def _read_eval_fixture(eval_fixture: EvaluationFixture) -> Iterator[pb.Evaluation]: + df = pd.read_parquet(_url(eval_fixture.file_name)) + for index, row in df.iterrows(): + schema = eval_fixture.evaluation_result_schema + label = row.get(schema.label) + score = row.get(schema.score) + explanation = row.get(schema.explanation) + result = pb.Evaluation.Result( + score=DoubleValue(value=cast(float, score)) if score is not None else None, + label=StringValue(value=cast(str, label)) if label else None, + explanation=StringValue(value=cast(str, explanation)) if explanation else None, + ) + if isinstance(eval_fixture, DocumentEvaluationFixture): + span_id, document_position = cast(Tuple[str, int], index) + subject_id = pb.Evaluation.SubjectId( + document_retrieval_id=pb.Evaluation.SubjectId.DocumentRetrievalId( + document_position=document_position, + span_id=span_id, + ), + ) + else: + span_id = cast(str, index) + subject_id = pb.Evaluation.SubjectId(span_id=span_id) + yield pb.Evaluation( + name=eval_fixture.evaluation_name, + result=result, + subject_id=subject_id, + ) + + +def _url( + file_name: str, + host: Optional[str] = "https://storage.googleapis.com/", + bucket: Optional[str] = "arize-assets", + prefix: Optional[str] = "phoenix/traces/", +) -> str: + return f"{host}{bucket}/{prefix}{file_name}" diff --git a/src/phoenix/trace/v1/__init__.py b/src/phoenix/trace/v1/__init__.py index 00d6a00abc..39315c8a60 100644 --- a/src/phoenix/trace/v1/__init__.py +++ b/src/phoenix/trace/v1/__init__.py @@ -1,7 +1,9 @@ +from phoenix.trace.v1.evaluation_pb2 import Evaluation from phoenix.trace.v1.trace_pb2 import Embedding, Retrieval, Span __all__ = [ "Span", "Retrieval", "Embedding", + "Evaluation", ] diff --git a/src/phoenix/trace/v1/evaluation_pb2.py b/src/phoenix/trace/v1/evaluation_pb2.py new file mode 100644 index 0000000000..a7a2a284cd --- /dev/null +++ b/src/phoenix/trace/v1/evaluation_pb2.py @@ -0,0 +1,32 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# source: trace/v1/evaluation.proto +"""Generated protocol buffer code.""" +from google.protobuf.internal import builder as _builder +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import symbol_database as _symbol_database +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from google.protobuf import wrappers_pb2 as google_dot_protobuf_dot_wrappers__pb2 + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x19trace/v1/evaluation.proto\x12\x1bphoenix.proto.evaluation.v1\x1a\x1egoogle/protobuf/wrappers.proto\"\xa1\x04\n\nEvaluation\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x45\n\nsubject_id\x18\x02 \x01(\x0b\x32\x31.phoenix.proto.evaluation.v1.Evaluation.SubjectId\x12>\n\x06result\x18\x03 \x01(\x0b\x32..phoenix.proto.evaluation.v1.Evaluation.Result\x1a\xe5\x01\n\tSubjectId\x12\x12\n\x08trace_id\x18\x01 \x01(\tH\x00\x12\x11\n\x07span_id\x18\x02 \x01(\tH\x00\x12\x66\n\x15\x64ocument_retrieval_id\x18\x03 \x01(\x0b\x32\x45.phoenix.proto.evaluation.v1.Evaluation.SubjectId.DocumentRetrievalIdH\x00\x1a\x41\n\x13\x44ocumentRetrievalId\x12\x0f\n\x07span_id\x18\x01 \x01(\t\x12\x19\n\x11\x64ocument_position\x18\x02 \x01(\x05\x42\x06\n\x04kind\x1a\x95\x01\n\x06Result\x12+\n\x05score\x18\x01 \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x12+\n\x05label\x18\x02 \x01(\x0b\x32\x1c.google.protobuf.StringValue\x12\x31\n\x0b\x65xplanation\x18\x03 \x01(\x0b\x32\x1c.google.protobuf.StringValueb\x06proto3') + +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'trace.v1.evaluation_pb2', globals()) +if _descriptor._USE_C_DESCRIPTORS == False: + + DESCRIPTOR._options = None + _EVALUATION._serialized_start=91 + _EVALUATION._serialized_end=636 + _EVALUATION_SUBJECTID._serialized_start=255 + _EVALUATION_SUBJECTID._serialized_end=484 + _EVALUATION_SUBJECTID_DOCUMENTRETRIEVALID._serialized_start=411 + _EVALUATION_SUBJECTID_DOCUMENTRETRIEVALID._serialized_end=476 + _EVALUATION_RESULT._serialized_start=487 + _EVALUATION_RESULT._serialized_end=636 +# @@protoc_insertion_point(module_scope) diff --git a/src/phoenix/trace/v1/evaluation_pb2.pyi b/src/phoenix/trace/v1/evaluation_pb2.pyi new file mode 100644 index 0000000000..ab34b84f5c --- /dev/null +++ b/src/phoenix/trace/v1/evaluation_pb2.pyi @@ -0,0 +1,102 @@ +""" +@generated by mypy-protobuf. Do not edit manually! +isort:skip_file +""" +import builtins +import google.protobuf.descriptor +import google.protobuf.message +import google.protobuf.wrappers_pb2 +import sys + +if sys.version_info >= (3, 8): + import typing as typing_extensions +else: + import typing_extensions + +DESCRIPTOR: google.protobuf.descriptor.FileDescriptor + +@typing_extensions.final +class Evaluation(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + @typing_extensions.final + class SubjectId(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + @typing_extensions.final + class DocumentRetrievalId(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + SPAN_ID_FIELD_NUMBER: builtins.int + DOCUMENT_POSITION_FIELD_NUMBER: builtins.int + span_id: builtins.str + document_position: builtins.int + """zero-based index""" + def __init__( + self, + *, + span_id: builtins.str = ..., + document_position: builtins.int = ..., + ) -> None: ... 
+ def ClearField(self, field_name: typing_extensions.Literal["document_position", b"document_position", "span_id", b"span_id"]) -> None: ... + + TRACE_ID_FIELD_NUMBER: builtins.int + SPAN_ID_FIELD_NUMBER: builtins.int + DOCUMENT_RETRIEVAL_ID_FIELD_NUMBER: builtins.int + trace_id: builtins.str + span_id: builtins.str + @property + def document_retrieval_id(self) -> global___Evaluation.SubjectId.DocumentRetrievalId: ... + def __init__( + self, + *, + trace_id: builtins.str = ..., + span_id: builtins.str = ..., + document_retrieval_id: global___Evaluation.SubjectId.DocumentRetrievalId | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["document_retrieval_id", b"document_retrieval_id", "kind", b"kind", "span_id", b"span_id", "trace_id", b"trace_id"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["document_retrieval_id", b"document_retrieval_id", "kind", b"kind", "span_id", b"span_id", "trace_id", b"trace_id"]) -> None: ... + def WhichOneof(self, oneof_group: typing_extensions.Literal["kind", b"kind"]) -> typing_extensions.Literal["trace_id", "span_id", "document_retrieval_id"] | None: ... + + @typing_extensions.final + class Result(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + SCORE_FIELD_NUMBER: builtins.int + LABEL_FIELD_NUMBER: builtins.int + EXPLANATION_FIELD_NUMBER: builtins.int + @property + def score(self) -> google.protobuf.wrappers_pb2.DoubleValue: ... + @property + def label(self) -> google.protobuf.wrappers_pb2.StringValue: ... + @property + def explanation(self) -> google.protobuf.wrappers_pb2.StringValue: ... + def __init__( + self, + *, + score: google.protobuf.wrappers_pb2.DoubleValue | None = ..., + label: google.protobuf.wrappers_pb2.StringValue | None = ..., + explanation: google.protobuf.wrappers_pb2.StringValue | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["explanation", b"explanation", "label", b"label", "score", b"score"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["explanation", b"explanation", "label", b"label", "score", b"score"]) -> None: ... + + NAME_FIELD_NUMBER: builtins.int + SUBJECT_ID_FIELD_NUMBER: builtins.int + RESULT_FIELD_NUMBER: builtins.int + name: builtins.str + @property + def subject_id(self) -> global___Evaluation.SubjectId: ... + @property + def result(self) -> global___Evaluation.Result: ... + def __init__( + self, + *, + name: builtins.str = ..., + subject_id: global___Evaluation.SubjectId | None = ..., + result: global___Evaluation.Result | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["result", b"result", "subject_id", b"subject_id"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["name", b"name", "result", b"result", "subject_id", b"subject_id"]) -> None: ... 
+ +global___Evaluation = Evaluation diff --git a/tutorials/internal/trace_eval_ingestion_testing.ipynb b/tutorials/internal/trace_eval_ingestion_testing.ipynb new file mode 100644 index 0000000000..bc07fcedbf --- /dev/null +++ b/tutorials/internal/trace_eval_ingestion_testing.ipynb @@ -0,0 +1,468 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "d58c5245e6d7811f", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import phoenix as px\n", + "from phoenix.experimental.evals.functions import llm_classify\n", + "from phoenix.experimental.evals.models import OpenAIModel\n", + "from phoenix.experimental.evals.templates.default_templates import (\n", + " HALLUCINATION_PROMPT_RAILS_MAP,\n", + " HALLUCINATION_PROMPT_TEMPLATE,\n", + " QA_PROMPT_RAILS_MAP,\n", + " QA_PROMPT_TEMPLATE,\n", + " RAG_RELEVANCY_PROMPT_RAILS_MAP,\n", + " RAG_RELEVANCY_PROMPT_TEMPLATE,\n", + ")\n", + "from phoenix.session.evaluation import add_evaluations, get_retrieved_documents\n", + "from phoenix.trace.exporter import HttpExporter\n", + "from sklearn.metrics import ndcg_score" + ] + }, + { + "cell_type": "markdown", + "id": "4b43166d02c26e8d", + "metadata": { + "collapsed": false + }, + "source": [ + "# Start Phoenix" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "initial_id", + "metadata": {}, + "outputs": [], + "source": [ + "ds = px.load_example_traces(\"llama_index_rag\")\n", + "px.launch_app(trace=ds)" + ] + }, + { + "cell_type": "markdown", + "id": "1362576ff0fe4e2c", + "metadata": { + "collapsed": false + }, + "source": [ + "# Extract Retrieved Documents" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2c45b85c6644735", + "metadata": {}, + "outputs": [], + "source": [ + "retrieved_documents = get_retrieved_documents(px.active_session())\n", + "retrieved_documents" + ] + }, + { + "cell_type": "markdown", + "id": "9ac938a5c199dc82", + "metadata": { + "collapsed": false + }, + "source": [ + "# Set Up OpenAI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e14465175520ce42", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from getpass import getpass\n", + "\n", + "import openai\n", + "\n", + "if not (openai_api_key := os.getenv(\"OPENAI_API_KEY\")):\n", + " openai_api_key = getpass(\"🔑 Enter your OpenAI API key: \")\n", + "openai.api_key = openai_api_key\n", + "os.environ[\"OPENAI_API_KEY\"] = openai_api_key" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de9664171d3e33b8", + "metadata": {}, + "outputs": [], + "source": [ + "model = OpenAIModel(model_name=\"gpt-4-1106-preview\")\n", + "model(\"hi\")" + ] + }, + { + "cell_type": "markdown", + "id": "d694213dcf35676f", + "metadata": { + "collapsed": false + }, + "source": [ + "# Evaluate Document Relevance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "516dc273735ad00c", + "metadata": {}, + "outputs": [], + "source": [ + "retrieved_documents_eval = llm_classify(\n", + " retrieved_documents,\n", + " model,\n", + " RAG_RELEVANCY_PROMPT_TEMPLATE,\n", + " list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values()),\n", + " provide_explanation=True,\n", + ")\n", + "retrieved_documents_eval[\"score\"] = (\n", + " retrieved_documents_eval.label[~retrieved_documents_eval.label.isna()] == \"relevant\"\n", + ").astype(int)\n", + "retrieved_documents_eval.to_parquet(\"llama_index_rag_with_rerank.documents_eval.parquet\")" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "f341795ae24ca024", + "metadata": {}, + "outputs": [], + "source": [ + "retrieved_documents_eval = pd.read_parquet(\"llama_index_rag_with_rerank.documents_eval.parquet\")\n", + "retrieved_documents_eval" + ] + }, + { + "cell_type": "markdown", + "id": "357fe94b02b22a6b", + "metadata": { + "collapsed": false + }, + "source": [ + "# Merge Data to Compute Ranking Metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3bd04b678c9d18c", + "metadata": {}, + "outputs": [], + "source": [ + "combined = pd.concat([retrieved_documents, retrieved_documents_eval.add_prefix(\"eval_\")], axis=1)\n", + "combined" + ] + }, + { + "cell_type": "markdown", + "id": "b162eccd6c69aa7f", + "metadata": { + "collapsed": false + }, + "source": [ + "# Compute NDCG@2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77d9fdebd46d268b", + "metadata": {}, + "outputs": [], + "source": [ + "def _compute_ndcg(df: pd.DataFrame, k: int):\n", + " \"\"\"Compute NDCG@k in the presence of missing values (e.g. as a result of keyboard interrupt).\"\"\"\n", + " eval_scores = [np.nan] * k\n", + " pred_scores = [np.nan] * k\n", + " for i in range(k):\n", + " if i >= len(df.eval_score):\n", + " break\n", + " eval_scores[i] = df.eval_score[i]\n", + " pred_scores[i] = df.document_score[i]\n", + " try:\n", + " return ndcg_score([eval_scores], [pred_scores])\n", + " except ValueError:\n", + " return np.nan\n", + "\n", + "\n", + "ndcg_at_2 = pd.DataFrame({\"score\": combined.groupby(\"context.span_id\").apply(_compute_ndcg, k=2)})\n", + "ndcg_at_2.to_parquet(\"llama_index_rag_with_rerank.ndcg_at_2.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8032851d13b63d55", + "metadata": {}, + "outputs": [], + "source": [ + "ndcg_at_2 = pd.read_parquet(\"llama_index_rag_with_rerank.ndcg_at_2.parquet\")\n", + "ndcg_at_2" + ] + }, + { + "cell_type": "markdown", + "id": "e8d5816954fbaa4d", + "metadata": { + "collapsed": false + }, + "source": [ + "# Compute Precision@3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3167f4675c7313", + "metadata": {}, + "outputs": [], + "source": [ + "precision_at_3 = pd.DataFrame(\n", + " {\n", + " \"score\": combined.groupby(\"context.span_id\").apply(\n", + " lambda x: x.eval_score[:3].sum(skipna=False) / 3\n", + " )\n", + " }\n", + ")\n", + "precision_at_3.to_parquet(\"llama_index_rag_with_rerank.precision_at_3.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c1d31d1d1c95429", + "metadata": {}, + "outputs": [], + "source": [ + "precision_at_3 = pd.read_parquet(\"llama_index_rag_with_rerank.precision_at_3.parquet\")\n", + "precision_at_3" + ] + }, + { + "cell_type": "markdown", + "id": "1819b377e7602361", + "metadata": { + "collapsed": false + }, + "source": [ + "# Merge Documents from Retrieval Spans to Q&A Spans (to Compute Q&A Correctness)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb27fd4724e0e27e", + "metadata": {}, + "outputs": [], + "source": [ + "qa_df = (\n", + " px.active_session()\n", + " .get_spans_dataframe(\"output.value is not None\", root_spans_only=True)\n", + " .set_index(\"context.trace_id\")[\n", + " [\"attributes.input.value\", \"attributes.output.value\", \"context.span_id\"]\n", + " ]\n", + " .rename({\"attributes.input.value\": \"input\", \"attributes.output.value\": \"output\"}, axis=1)\n", + ")\n", + "qa_df[\"reference\"] = retrieved_documents.groupby(\"context.trace_id\").apply(\n", + " 
lambda x: \"\\n\\n\".join(x.reference)\n", + "qa_df.set_index(\"context.span_id\", inplace=True)\n", + "qa_df" + ] + }, + { + "cell_type": "markdown", + "id": "f4084449c986aed8", + "metadata": { + "collapsed": false + }, + "source": [ + "# Evaluate Q&A Correctness" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae507af54ce886a", + "metadata": {}, + "outputs": [], + "source": [ + "qa_correctness_eval = llm_classify(\n", + " qa_df,\n", + " model,\n", + " QA_PROMPT_TEMPLATE,\n", + " list(QA_PROMPT_RAILS_MAP.values()),\n", + " provide_explanation=True,\n", + ")\n", + "qa_correctness_eval[\"score\"] = (\n", + " qa_correctness_eval.label[~qa_correctness_eval.label.isna()] == \"correct\"\n", + ").astype(int)\n", + "qa_correctness_eval.to_parquet(\"llama_index_rag_with_rerank.qa_correctness_eval.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2041d2dcc7d02322", + "metadata": {}, + "outputs": [], + "source": [ + "qa_correctness_eval = pd.read_parquet(\"llama_index_rag_with_rerank.qa_correctness_eval.parquet\")\n", + "qa_correctness_eval" + ] + }, + { + "cell_type": "markdown", + "id": "a88f90ea9c24832b", + "metadata": { + "collapsed": false + }, + "source": [ + "# Evaluate Hallucination" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "946b5aad5d72c1f5", + "metadata": {}, + "outputs": [], + "source": [ + "hallucination_eval = llm_classify(\n", + " qa_df,\n", + " model,\n", + " HALLUCINATION_PROMPT_TEMPLATE,\n", + " list(HALLUCINATION_PROMPT_RAILS_MAP.values()),\n", + " provide_explanation=True,\n", + ")\n", + "hallucination_eval[\"score\"] = (\n", + " hallucination_eval.label[~hallucination_eval.label.isna()] == \"factual\"\n", + ").astype(int)\n", + "hallucination_eval.to_parquet(\"llama_index_rag_with_rerank.hallucination_eval.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8b1a6d7143c986e", + "metadata": {}, + "outputs": [], + "source": [ + "hallucination_eval = pd.read_parquet(\"llama_index_rag_with_rerank.hallucination_eval.parquet\")\n", + "hallucination_eval" + ] + }, + { + "cell_type": "markdown", + "id": "bdd4d1c641fb5e15", + "metadata": { + "collapsed": false + }, + "source": [ + "# Ingest Evaluations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7eed5bc68320bb18", + "metadata": {}, + "outputs": [], + "source": [ + "exporter = HttpExporter()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "495a5e74b469a660", + "metadata": {}, + "outputs": [], + "source": [ + "add_evaluations(exporter, retrieved_documents_eval, \"Relevance\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20cc931d1529f84c", + "metadata": {}, + "outputs": [], + "source": [ + "add_evaluations(exporter, ndcg_at_2, \"NDCG@2\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb48a5daae9d5bcb", + "metadata": {}, + "outputs": [], + "source": [ + "add_evaluations(exporter, precision_at_3, \"Precision@3\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "848420ee90e10f62", + "metadata": {}, + "outputs": [], + "source": [ + "add_evaluations(exporter, qa_correctness_eval, \"Q&A Correctness\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c03dde5802ed98a3", + "metadata": {}, + "outputs": [], + "source": [ + "add_evaluations(exporter, hallucination_eval, \"Hallucination\")" + ] + }, + { + "cell_type": "markdown", + "id": "83dd4cd21c966504", + "metadata": { +
"collapsed": false + }, + "source": [ + "# End Session" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5842da4238a93554", + "metadata": {}, + "outputs": [], + "source": [ + "# px.active_session().end()" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}