feat: evaluation ingestion (no user-facing feature is added) #1764

Merged
merged 19 commits on Nov 21, 2023
74 changes: 74 additions & 0 deletions app/schema.graphql
@@ -179,6 +179,29 @@ type DimensionWithValue {
value: String
}

type DocumentEvaluation implements Evaluation {
"""Name of the evaluation, e.g. 'helpfulness' or 'relevance'."""
name: String!

"""Result of the evaluation in the form of a numeric score."""
score: Float

"""
Result of the evaluation in the form of a string, e.g. 'helpful' or 'not helpful'. Note that the label is not necessarily binary.
"""
label: String

"""
The evaluator's explanation for the evaluation result (i.e. score or label, or both) given to the subject.
"""
explanation: String

"""
The zero-based index of the document among the retrieved documents, which are collected as a list (even when their ordering is not inherently meaningful).
"""
documentPosition: Int!
[Contributor] docs: Adding descriptions for the fields would be useful

[Contributor Author] will do

}

type DriftTimeSeries implements TimeSeries {
data: [TimeSeriesDataPoint!]!
}
@@ -261,6 +284,24 @@ type EmbeddingMetadata {
linkToData: String
}

interface Evaluation {
"""Name of the evaluation, e.g. 'helpfulness' or 'relevance'."""
name: String!

"""Result of the evaluation in the form of a numeric score."""
score: Float

"""
Result of the evaluation in the form of a string, e.g. 'helpful' or 'not helpful'. Note that the label is not necessarily binary.
"""
label: String

"""
The evaluator's explanation for the evaluation result (i.e. score or label, or both) given to the subject.
"""
explanation: String
}

type Event {
id: ID!
eventMetadata: EventMetadata!
@@ -460,6 +501,11 @@ type Query {
clusterSelectionEpsilon: Float! = 0
): [Cluster!]!
spans(timeRange: TimeRange, traceIds: [ID!], first: Int = 50, last: Int, after: String, before: String, sort: SpanSort, rootSpansOnly: Boolean = false, filterCondition: String = null): SpanConnection!

"""
Names of all available evaluations for spans. (The list contains no duplicates.)
"""
spanEvaluationNames: [String!]!
traceDatasetInfo: TraceDatasetInfo
validateSpanFilterCondition(condition: String!): ValidationResult!
}
@@ -527,6 +573,16 @@ type Span {
"""
cumulativeTokenCountCompletion: Int

"""
Evaluations associated with the span, e.g. if the span is an LLM, an evaluation may assess the helpfulness of its response with respect to its input.
"""
spanEvaluations: [SpanEvaluation!]!

"""
Evaluations of the documents associated with the span, e.g. if the span is a RETRIEVER with a list of documents in its RETRIEVAL_DOCUMENTS attribute, an evaluation for each document may assess its relevance with respect to the input query of the span. Note that RETRIEVAL_DOCUMENTS is a list, and each evaluation is identified by its document's (zero-based) index in that list.
"""
documentEvaluations: [DocumentEvaluation!]!

"""All descendant spans (children, grandchildren, etc.)"""
descendants: [Span!]!
}
@@ -558,6 +614,24 @@ type SpanEdge {
cursor: String!
}

type SpanEvaluation implements Evaluation {
"""Name of the evaluation, e.g. 'helpfulness' or 'relevance'."""
name: String!

"""Result of the evaluation in the form of a numeric score."""
score: Float

"""
Result of the evaluation in the form of a string, e.g. 'helpful' or 'not helpful'. Note that the label is not necessarily binary.
"""
label: String

"""
The evaluator's explanation for the evaluation result (i.e. score or label, or both) given to the subject.
"""
explanation: String
}

type SpanEvent {
name: String!
message: String!
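
Taken together, these schema additions can be exercised with an ordinary GraphQL query. Below is a minimal client-side sketch, assuming a locally running Phoenix server whose GraphQL endpoint is at http://localhost:6006/graphql (the host, port, and path are assumptions, not part of this diff):

import json
from urllib.request import Request, urlopen

QUERY = """
{
  spanEvaluationNames
  spans(first: 5) {
    edges {
      node {
        spanEvaluations { name score label explanation }
        documentEvaluations { name score label documentPosition }
      }
    }
  }
}
"""

request = Request(
    "http://localhost:6006/graphql",  # assumed endpoint
    data=json.dumps({"query": QUERY}).encode(),
    headers={"Content-Type": "application/json"},
)
with urlopen(request) as response:
    print(json.load(response))
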
5 changes: 4 additions & 1 deletion pyproject.toml
@@ -234,7 +234,10 @@ dependencies = [
]

[tool.hatch.envs.proto.scripts]
recompile = "python -m grpc_tools.protoc -I src/phoenix/proto --python_out=src/phoenix --mypy_out=src/phoenix src/phoenix/proto/trace/v1/trace.proto"
recompile = """
python -m grpc_tools.protoc -I src/phoenix/proto --python_out=src/phoenix --mypy_out=src/phoenix src/phoenix/proto/trace/v1/trace.proto &&
python -m grpc_tools.protoc -I src/phoenix/proto --python_out=src/phoenix --mypy_out=src/phoenix src/phoenix/proto/trace/v1/evaluation.proto
"""

[tool.interrogate]
fail-under = 0
81 changes: 81 additions & 0 deletions src/phoenix/core/evals.py
@@ -0,0 +1,81 @@
import weakref
from collections import defaultdict
from queue import SimpleQueue
from threading import RLock, Thread
from types import MethodType
from typing import DefaultDict, Dict, List, Optional

from typing_extensions import TypeAlias

import phoenix.trace.v1 as pb
from phoenix.trace.schemas import SpanID

END_OF_QUEUE = None # sentinel value for queue termination

EvaluationName: TypeAlias = str
DocumentPosition: TypeAlias = int


class Evals:
def __init__(self) -> None:
self._queue: "SimpleQueue[Optional[pb.Evaluation]]" = SimpleQueue()
weakref.finalize(self, self._queue.put, END_OF_QUEUE)
self._lock = RLock()
self._start_consumer()
self._span_evaluations_by_name: DefaultDict[
EvaluationName, Dict[SpanID, pb.Evaluation]
] = defaultdict(dict)
self._evaluations_by_span_id: DefaultDict[
SpanID, Dict[EvaluationName, pb.Evaluation]
] = defaultdict(dict)
self._document_evaluations_by_span_id: DefaultDict[
SpanID, DefaultDict[EvaluationName, Dict[DocumentPosition, pb.Evaluation]]
] = defaultdict(lambda: defaultdict(dict))

def put(self, evaluation: pb.Evaluation) -> None:
self._queue.put(evaluation)

def _start_consumer(self) -> None:
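        # Bind _consume_evaluations to a weak proxy so the daemon thread does not
        # keep this Evals instance alive; once the instance is garbage-collected,
        # the finalizer in __init__ enqueues END_OF_QUEUE and the loop exits.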
Thread(
target=MethodType(
self.__class__._consume_evaluations,
weakref.proxy(self),
),
daemon=True,
).start()

def _consume_evaluations(self) -> None:
while (item := self._queue.get()) is not END_OF_QUEUE:
with self._lock:
self._process_evaluation(item)

def _process_evaluation(self, evaluation: pb.Evaluation) -> None:
subject_id = evaluation.subject_id
name = evaluation.name
subject_id_kind = subject_id.WhichOneof("kind")
if subject_id_kind == "document_retrieval_id":
document_retrieval_id = subject_id.document_retrieval_id
span_id = SpanID(document_retrieval_id.span_id)
document_position = document_retrieval_id.document_position
self._document_evaluations_by_span_id[span_id][name][document_position] = evaluation
elif subject_id_kind == "span_id":
span_id = SpanID(subject_id.span_id)
self._evaluations_by_span_id[span_id][name] = evaluation
self._span_evaluations_by_name[name][span_id] = evaluation
else:
            raise ValueError(f"unrecognized subject_id kind: {subject_id_kind}")

def get_span_evaluation_names(self) -> List[EvaluationName]:
with self._lock:
return list(self._span_evaluations_by_name.keys())

def get_evaluations_by_span_id(self, span_id: SpanID) -> List[pb.Evaluation]:
with self._lock:
return list(self._evaluations_by_span_id[span_id].values())

def get_document_evaluations_by_span_id(self, span_id: SpanID) -> List[pb.Evaluation]:
all_evaluations: List[pb.Evaluation] = []
with self._lock:
for evaluations in self._document_evaluations_by_span_id[span_id].values():
all_evaluations.extend(evaluations.values())
return all_evaluations
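
The class above decouples ingestion from indexing: producers enqueue protobuf messages with put(), and the daemon consumer thread indexes them under the lock. A minimal usage sketch, assuming the compiled protobuf classes are re-exported as phoenix.trace.v1 (as this module's own import suggests) and using a made-up span ID:

import time

from google.protobuf.wrappers_pb2 import DoubleValue, StringValue

import phoenix.trace.v1 as pb
from phoenix.core.evals import Evals

evals = Evals()
evaluation = pb.Evaluation(
    name="helpfulness",
    # hypothetical span ID, for illustration only
    subject_id=pb.Evaluation.SubjectId(span_id="0123456789abcdef"),
    result=pb.Evaluation.Result(
        score=DoubleValue(value=0.9),
        label=StringValue(value="helpful"),
    ),
)
evals.put(evaluation)
time.sleep(0.1)  # ingestion is asynchronous; the daemon thread drains the queue
print(evals.get_span_evaluation_names())  # ["helpfulness"]
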
26 changes: 26 additions & 0 deletions src/phoenix/proto/trace/v1/evaluation.proto
@@ -0,0 +1,26 @@
syntax = "proto3";
package phoenix.proto.evaluation.v1;

import "google/protobuf/wrappers.proto";

message Evaluation {
string name = 1;
message SubjectId {
message DocumentRetrievalId {
string span_id = 1;
int32 document_position = 2; // zero-based index
}
oneof kind {
string trace_id = 1;
string span_id = 2;
DocumentRetrievalId document_retrieval_id = 3;
}
}
SubjectId subject_id = 2;
message Result {
google.protobuf.DoubleValue score = 1;
google.protobuf.StringValue label = 2;
google.protobuf.StringValue explanation = 3;
}
Result result = 3;
}
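
The google.protobuf wrapper types make score, label, and explanation optional on the wire: a plain double cannot distinguish "unset" from 0.0, but a DoubleValue submessage can simply be absent. This is why the GraphQL layer below guards every read with HasField. A brief illustration, reusing the pb module and imports from the sketch above:

result = pb.Evaluation.Result(score=DoubleValue(value=0.0))
assert result.HasField("score")      # True: 0.0 was explicitly set
assert not result.HasField("label")  # False: label was never assigned
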
2 changes: 2 additions & 0 deletions src/phoenix/server/api/context.py
@@ -6,6 +6,7 @@
from starlette.responses import Response
from starlette.websockets import WebSocket

from phoenix.core.evals import Evals
from phoenix.core.model_schema import Model
from phoenix.core.traces import Traces

@@ -18,3 +19,4 @@ class Context:
export_path: Path
corpus: Optional[Model] = None
traces: Optional[Traces] = None
evals: Optional[Evals] = None
12 changes: 12 additions & 0 deletions src/phoenix/server/api/schema.py
@@ -239,6 +239,18 @@ def spans(
data = list(map(to_gql_span, spans))
return connection_from_list(data=data, args=args)

@strawberry.field(
description="Names of all available evaluations for spans. "
"(The list contains no duplicates.)"
) # type: ignore
def span_evaluation_names(
self,
info: Info[Context, None],
) -> List[str]:
if (evals := info.context.evals) is None:
return []
return evals.get_span_evaluation_names()

@strawberry.field
def trace_dataset_info(
self,
71 changes: 71 additions & 0 deletions src/phoenix/server/api/types/Evaluation.py
@@ -0,0 +1,71 @@
from typing import Optional

import strawberry

import phoenix.trace.v1 as pb
from phoenix.trace.schemas import SpanID


@strawberry.interface
class Evaluation:
name: str = strawberry.field(
description="Name of the evaluation, e.g. 'helpfulness' or 'relevance'."
)
score: Optional[float] = strawberry.field(
description="Result of the evaluation in the form of a numeric score."
)
label: Optional[str] = strawberry.field(
description="Result of the evaluation in the form of a string, e.g. "
"'helpful' or 'not helpful'. Note that the label is not necessarily binary."
)
explanation: Optional[str] = strawberry.field(
description="The evaluator's explanation for the evaluation result (i.e. "
"score or label, or both) given to the subject."
)


@strawberry.type
class SpanEvaluation(Evaluation):
span_id: strawberry.Private[SpanID]

@staticmethod
def from_pb_evaluation(evaluation: pb.Evaluation) -> "SpanEvaluation":
result = evaluation.result
score = result.score.value if result.HasField("score") else None
label = result.label.value if result.HasField("label") else None
explanation = result.explanation.value if result.HasField("explanation") else None
span_id = SpanID(evaluation.subject_id.span_id)
return SpanEvaluation(
name=evaluation.name,
score=score,
label=label,
explanation=explanation,
span_id=span_id,
)


@strawberry.type
class DocumentEvaluation(Evaluation):
span_id: strawberry.Private[SpanID]
document_position: int = strawberry.field(
description="The zero-based index among retrieved documents, which "
"is collected as a list (even when ordering is not inherently meaningful)."
)

@staticmethod
def from_pb_evaluation(evaluation: pb.Evaluation) -> "DocumentEvaluation":
result = evaluation.result
score = result.score.value if result.HasField("score") else None
label = result.label.value if result.HasField("label") else None
explanation = result.explanation.value if result.HasField("explanation") else None
document_retrieval_id = evaluation.subject_id.document_retrieval_id
document_position = document_retrieval_id.document_position
span_id = SpanID(document_retrieval_id.span_id)
return DocumentEvaluation(
name=evaluation.name,
score=score,
label=label,
explanation=explanation,
document_position=document_position,
span_id=span_id,
)
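
These converters bridge the protobuf wire format and the GraphQL types: wrapper fields that were never set become None, so nullability in the schema mirrors field presence on the wire. A short sketch, reusing the evaluation message built in the earlier Evals example:

from phoenix.server.api.types.Evaluation import SpanEvaluation

span_evaluation = SpanEvaluation.from_pb_evaluation(evaluation)
print(span_evaluation.score)        # 0.9
print(span_evaluation.explanation)  # None: the explanation wrapper was never set
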
38 changes: 38 additions & 0 deletions src/phoenix/server/api/types/Span.py
@@ -11,6 +11,7 @@
import phoenix.trace.schemas as trace_schema
from phoenix.core.traces import ComputedAttributes
from phoenix.server.api.context import Context
from phoenix.server.api.types.Evaluation import DocumentEvaluation, SpanEvaluation
from phoenix.server.api.types.MimeType import MimeType
from phoenix.trace.schemas import SpanID
from phoenix.trace.semantic_conventions import (
@@ -123,6 +124,43 @@ class Span:
"descendant spans (children, grandchildren, etc.)",
)

@strawberry.field(
description="Evaluations associated with the span, e.g. if the span is "
"an LLM, an evaluation may assess the helpfulness of its response with "
"respect to its input."
) # type: ignore
def span_evaluations(
self,
info: Info[Context, None],
) -> List[SpanEvaluation]:
if not (evals := info.context.evals):
return []
span_id = SpanID(str(self.context.span_id))
return [
SpanEvaluation.from_pb_evaluation(evaluation)
for evaluation in evals.get_evaluations_by_span_id(span_id)
]

@strawberry.field(
description="Evaluations of the documents associated with the span, e.g. "
"if the span is a RETRIEVER with a list of documents in its RETRIEVAL_DOCUMENTS "
"attribute, an evaluation for each document may assess its relevance "
"respect to the input query of the span. Note that RETRIEVAL_DOCUMENTS is "
"a list, and each evaluation is identified by its document's (zero-based) "
"index in that list."
) # type: ignore
def document_evaluations(
self,
info: Info[Context, None],
) -> List[DocumentEvaluation]:
if not (evals := info.context.evals):
return []
span_id = SpanID(str(self.context.span_id))
return [
DocumentEvaluation.from_pb_evaluation(evaluation)
for evaluation in evals.get_document_evaluations_by_span_id(span_id)
]

@strawberry.field(
description="All descendant spans (children, grandchildren, etc.)",
) # type: ignore