feat: evaluation ingestion (no user-facing feature is added) #1764

Merged
merged 19 commits on Nov 21, 2023
74 changes: 74 additions & 0 deletions app/schema.graphql
@@ -179,6 +179,29 @@ type DimensionWithValue {
value: String
}

type DocumentEvaluation implements Evaluation {
"""Name of the evaluation, e.g. 'helpfulness' or 'relevance'."""
name: String!

"""Result of the evaluation in the form of a numeric score."""
score: Float

"""
Result of the evaluation in the form of a string, e.g. 'helpful' or 'not helpful'. Note that the label is not necessarily binary.
"""
label: String

"""
The evaluator's explanation for the evaluation result (i.e. score or label, or both) given to the subject.
"""
explanation: String

"""
The zero-based index of the document among the retrieved documents, which are collected as a list (even when their ordering is not inherently meaningful).
"""
documentPosition: Int!
[Contributor] docs: Adding descriptions for the fields would be useful

[Contributor Author] will do

}

type DriftTimeSeries implements TimeSeries {
data: [TimeSeriesDataPoint!]!
}
@@ -261,6 +284,24 @@ type EmbeddingMetadata {
linkToData: String
}

interface Evaluation {
"""Name of the evaluation, e.g. 'helpfulness' or 'relevance'."""
name: String!

"""Result of the evaluation in the form of a numeric score."""
score: Float

"""
Result of the evaluation in the form of a string, e.g. 'helpful' or 'not helpful'. Note that the label is not necessarily binary.
"""
label: String

"""
The evaluator's explanation for the evaluation result (i.e. score or label, or both) given to the subject.
"""
explanation: String
}

type Event {
id: ID!
eventMetadata: EventMetadata!
@@ -460,6 +501,11 @@ type Query {
clusterSelectionEpsilon: Float! = 0
): [Cluster!]!
spans(timeRange: TimeRange, traceIds: [ID!], first: Int = 50, last: Int, after: String, before: String, sort: SpanSort, rootSpansOnly: Boolean = false, filterCondition: String = null): SpanConnection!

"""
Names of all available evaluations for spans. (The list contains no duplicates.)
"""
spanEvaluationNames: [String!]!
traceDatasetInfo: TraceDatasetInfo
validateSpanFilterCondition(condition: String!): ValidationResult!
}
@@ -527,6 +573,16 @@ type Span {
"""
cumulativeTokenCountCompletion: Int

"""
Evaluations associated with the span, e.g. if the span is an LLM, an evaluation may assess the helpfulness of its response with respect to its input.
"""
spanEvaluations: [SpanEvaluation!]!

"""
Evaluations of the documents associated with the span, e.g. if the span is a RETRIEVER with a list of documents in its RETRIEVAL_DOCUMENTS attribute, an evaluation for each document may assess its relevance with respect to the input query of the span. Note that RETRIEVAL_DOCUMENTS is a list, and each evaluation is identified by its document's (zero-based) index in that list.
"""
documentEvaluations: [DocumentEvaluation!]!

"""All descendant spans (children, grandchildren, etc.)"""
descendants: [Span!]!
}
@@ -558,6 +614,24 @@ type SpanEdge {
cursor: String!
}

type SpanEvaluation implements Evaluation {
"""Name of the evaluation, e.g. 'helpfulness' or 'relevance'."""
name: String!

"""Result of the evaluation in the form of a numeric score."""
score: Float

"""
Result of the evaluation in the form of a string, e.g. 'helpful' or 'not helpful'. Note that the label is not necessarily binary.
"""
label: String

"""
The evaluator's explanation for the evaluation result (i.e. score or label, or both) given to the subject.
"""
explanation: String
}

type SpanEvent {
name: String!
message: String!
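
Taken together, these schema additions can be exercised with an ordinary GraphQL query. Below is a minimal client-side sketch, assuming a locally running Phoenix server whose GraphQL endpoint is at http://localhost:6006/graphql (the host, port, and path are assumptions, not part of this diff):

import json
from urllib.request import Request, urlopen

QUERY = """
{
  spanEvaluationNames
  spans(first: 5) {
    edges {
      node {
        spanEvaluations { name score label explanation }
        documentEvaluations { name score label documentPosition }
      }
    }
  }
}
"""

request = Request(
    "http://localhost:6006/graphql",  # assumed endpoint
    data=json.dumps({"query": QUERY}).encode(),
    headers={"Content-Type": "application/json"},
)
with urlopen(request) as response:
    print(json.load(response))
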
5 changes: 4 additions & 1 deletion pyproject.toml
@@ -234,7 +234,10 @@ dependencies = [
]

[tool.hatch.envs.proto.scripts]
recompile = "python -m grpc_tools.protoc -I src/phoenix/proto --python_out=src/phoenix --mypy_out=src/phoenix src/phoenix/proto/trace/v1/trace.proto"
recompile = """
python -m grpc_tools.protoc -I src/phoenix/proto --python_out=src/phoenix --mypy_out=src/phoenix src/phoenix/proto/trace/v1/trace.proto &&
python -m grpc_tools.protoc -I src/phoenix/proto --python_out=src/phoenix --mypy_out=src/phoenix src/phoenix/proto/trace/v1/evaluation.proto
"""

[tool.interrogate]
fail-under = 0
81 changes: 81 additions & 0 deletions src/phoenix/core/evals.py
@@ -0,0 +1,81 @@
import weakref
from collections import defaultdict
from queue import SimpleQueue
from threading import RLock, Thread
from types import MethodType
from typing import DefaultDict, Dict, List, Optional

from typing_extensions import TypeAlias

import phoenix.trace.v1 as pb
from phoenix.trace.schemas import SpanID

END_OF_QUEUE = None # sentinel value for queue termination

EvaluationName: TypeAlias = str
DocumentPosition: TypeAlias = int


class Evals:
def __init__(self) -> None:
self._queue: "SimpleQueue[Optional[pb.Evaluation]]" = SimpleQueue()
weakref.finalize(self, self._queue.put, END_OF_QUEUE)
self._lock = RLock()
self._start_consumer()
self._span_evaluations_by_name: DefaultDict[
EvaluationName, Dict[SpanID, pb.Evaluation]
] = defaultdict(dict)
self._evaluations_by_span_id: DefaultDict[
SpanID, Dict[EvaluationName, pb.Evaluation]
] = defaultdict(dict)
self._document_evaluations_by_span_id: DefaultDict[
SpanID, DefaultDict[EvaluationName, Dict[DocumentPosition, pb.Evaluation]]
] = defaultdict(lambda: defaultdict(dict))

def put(self, evaluation: pb.Evaluation) -> None:
self._queue.put(evaluation)

def _start_consumer(self) -> None:
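        # Bind _consume_evaluations to a weak proxy so the daemon thread does not
        # keep this Evals instance alive; once the instance is garbage-collected,
        # the finalizer in __init__ enqueues END_OF_QUEUE and the loop exits.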
Thread(
target=MethodType(
self.__class__._consume_evaluations,
weakref.proxy(self),
),
daemon=True,
).start()

def _consume_evaluations(self) -> None:
while (item := self._queue.get()) is not END_OF_QUEUE:
with self._lock:
self._process_evaluation(item)

def _process_evaluation(self, evaluation: pb.Evaluation) -> None:
subject_id = evaluation.subject_id
name = evaluation.name
subject_id_kind = subject_id.WhichOneof("kind")
if subject_id_kind == "document_retrieval_id":
document_retrieval_id = subject_id.document_retrieval_id
span_id = SpanID(document_retrieval_id.span_id)
document_position = document_retrieval_id.document_position
self._document_evaluations_by_span_id[span_id][name][document_position] = evaluation
elif subject_id_kind == "span_id":
span_id = SpanID(subject_id.span_id)
self._evaluations_by_span_id[span_id][name] = evaluation
self._span_evaluations_by_name[name][span_id] = evaluation
else:
            raise ValueError(f"unrecognized subject_id kind: {subject_id_kind}")

def get_span_evaluation_names(self) -> List[EvaluationName]:
with self._lock:
return list(self._span_evaluations_by_name.keys())

def get_evaluations_by_span_id(self, span_id: SpanID) -> List[pb.Evaluation]:
with self._lock:
return list(self._evaluations_by_span_id[span_id].values())

def get_document_evaluations_by_span_id(self, span_id: SpanID) -> List[pb.Evaluation]:
all_evaluations: List[pb.Evaluation] = []
with self._lock:
for evaluations in self._document_evaluations_by_span_id[span_id].values():
all_evaluations.extend(evaluations.values())
return all_evaluations
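
The class above decouples ingestion from indexing: producers enqueue protobuf messages with put(), and the daemon consumer thread indexes them under the lock. A minimal usage sketch, assuming the compiled protobuf classes are re-exported as phoenix.trace.v1 (as this module's own import suggests) and using a made-up span ID:

import time

from google.protobuf.wrappers_pb2 import DoubleValue, StringValue

import phoenix.trace.v1 as pb
from phoenix.core.evals import Evals

evals = Evals()
evaluation = pb.Evaluation(
    name="helpfulness",
    # hypothetical span ID, for illustration only
    subject_id=pb.Evaluation.SubjectId(span_id="0123456789abcdef"),
    result=pb.Evaluation.Result(
        score=DoubleValue(value=0.9),
        label=StringValue(value="helpful"),
    ),
)
evals.put(evaluation)
time.sleep(0.1)  # ingestion is asynchronous; the daemon thread drains the queue
print(evals.get_span_evaluation_names())  # ["helpfulness"]
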
26 changes: 26 additions & 0 deletions src/phoenix/proto/trace/v1/evaluation.proto
@@ -0,0 +1,26 @@
syntax = "proto3";
package phoenix.proto.evaluation.v1;

import "google/protobuf/wrappers.proto";

message Evaluation {
string name = 1;
message SubjectId {
message DocumentRetrievalId {
string span_id = 1;
int32 document_position = 2; // zero-based index
}
oneof kind {
string trace_id = 1;
string span_id = 2;
DocumentRetrievalId document_retrieval_id = 3;
}
}
SubjectId subject_id = 2;
message Result {
google.protobuf.DoubleValue score = 1;
google.protobuf.StringValue label = 2;
google.protobuf.StringValue explanation = 3;
}
Result result = 3;
}
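
The google.protobuf wrapper types make score, label, and explanation optional on the wire: a plain double cannot distinguish "unset" from 0.0, but a DoubleValue submessage can simply be absent. This is why the GraphQL layer below guards every read with HasField. A brief illustration, reusing the pb module and imports from the sketch above:

result = pb.Evaluation.Result(score=DoubleValue(value=0.0))
assert result.HasField("score")      # True: 0.0 was explicitly set
assert not result.HasField("label")  # False: label was never assigned
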
2 changes: 2 additions & 0 deletions src/phoenix/server/api/context.py
@@ -6,6 +6,7 @@
from starlette.responses import Response
from starlette.websockets import WebSocket

from phoenix.core.evals import Evals
from phoenix.core.model_schema import Model
from phoenix.core.traces import Traces

@@ -18,3 +19,4 @@ class Context:
export_path: Path
corpus: Optional[Model] = None
traces: Optional[Traces] = None
evals: Optional[Evals] = None
12 changes: 12 additions & 0 deletions src/phoenix/server/api/schema.py
@@ -239,6 +239,18 @@ def spans(
data = list(map(to_gql_span, spans))
return connection_from_list(data=data, args=args)

@strawberry.field(
description="Names of all available evaluations for spans. "
"(The list contains no duplicates.)"
) # type: ignore
def span_evaluation_names(
self,
info: Info[Context, None],
) -> List[str]:
if (evals := info.context.evals) is None:
return []
return evals.get_span_evaluation_names()

@strawberry.field
def trace_dataset_info(
self,
71 changes: 71 additions & 0 deletions src/phoenix/server/api/types/Evaluation.py
@@ -0,0 +1,71 @@
from typing import Optional

import strawberry

import phoenix.trace.v1 as pb
from phoenix.trace.schemas import SpanID


@strawberry.interface
class Evaluation:
name: str = strawberry.field(
description="Name of the evaluation, e.g. 'helpfulness' or 'relevance'."
)
score: Optional[float] = strawberry.field(
description="Result of the evaluation in the form of a numeric score."
)
label: Optional[str] = strawberry.field(
description="Result of the evaluation in the form of a string, e.g. "
"'helpful' or 'not helpful'. Note that the label is not necessarily binary."
)
explanation: Optional[str] = strawberry.field(
description="The evaluator's explanation for the evaluation result (i.e. "
"score or label, or both) given to the subject."
)


@strawberry.type
class SpanEvaluation(Evaluation):
span_id: strawberry.Private[SpanID]

@staticmethod
def from_pb_evaluation(evaluation: pb.Evaluation) -> "SpanEvaluation":
result = evaluation.result
score = result.score.value if result.HasField("score") else None
label = result.label.value if result.HasField("label") else None
explanation = result.explanation.value if result.HasField("explanation") else None
span_id = SpanID(evaluation.subject_id.span_id)
return SpanEvaluation(
name=evaluation.name,
score=score,
label=label,
explanation=explanation,
span_id=span_id,
)


@strawberry.type
class DocumentEvaluation(Evaluation):
span_id: strawberry.Private[SpanID]
document_position: int = strawberry.field(
description="The zero-based index among retrieved documents, which "
"is collected as a list (even when ordering is not inherently meaningful)."
)

@staticmethod
def from_pb_evaluation(evaluation: pb.Evaluation) -> "DocumentEvaluation":
result = evaluation.result
score = result.score.value if result.HasField("score") else None
label = result.label.value if result.HasField("label") else None
explanation = result.explanation.value if result.HasField("explanation") else None
document_retrieval_id = evaluation.subject_id.document_retrieval_id
document_position = document_retrieval_id.document_position
span_id = SpanID(document_retrieval_id.span_id)
return DocumentEvaluation(
name=evaluation.name,
score=score,
label=label,
explanation=explanation,
document_position=document_position,
span_id=span_id,
)
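
These converters bridge the protobuf wire format and the GraphQL types: wrapper fields that were never set become None, so nullability in the schema mirrors field presence on the wire. A short sketch, reusing the evaluation message built in the earlier Evals example:

from phoenix.server.api.types.Evaluation import SpanEvaluation

span_evaluation = SpanEvaluation.from_pb_evaluation(evaluation)
print(span_evaluation.score)        # 0.9
print(span_evaluation.explanation)  # None: the explanation wrapper was never set
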
38 changes: 38 additions & 0 deletions src/phoenix/server/api/types/Span.py
@@ -11,6 +11,7 @@
import phoenix.trace.schemas as trace_schema
from phoenix.core.traces import ComputedAttributes
from phoenix.server.api.context import Context
from phoenix.server.api.types.Evaluation import DocumentEvaluation, SpanEvaluation
from phoenix.server.api.types.MimeType import MimeType
from phoenix.trace.schemas import SpanID
from phoenix.trace.semantic_conventions import (
@@ -123,6 +124,43 @@ class Span:
"descendant spans (children, grandchildren, etc.)",
)

@strawberry.field(
description="Evaluations associated with the span, e.g. if the span is "
"an LLM, an evaluation may assess the helpfulness of its response with "
"respect to its input."
) # type: ignore
def span_evaluations(
self,
info: Info[Context, None],
) -> List[SpanEvaluation]:
if not (evals := info.context.evals):
return []
span_id = SpanID(str(self.context.span_id))
return [
SpanEvaluation.from_pb_evaluation(evaluation)
for evaluation in evals.get_evaluations_by_span_id(span_id)
]

@strawberry.field(
description="Evaluations of the documents associated with the span, e.g. "
"if the span is a RETRIEVER with a list of documents in its RETRIEVAL_DOCUMENTS "
"attribute, an evaluation for each document may assess its relevance "
"respect to the input query of the span. Note that RETRIEVAL_DOCUMENTS is "
"a list, and each evaluation is identified by its document's (zero-based) "
"index in that list."
) # type: ignore
def document_evaluations(
self,
info: Info[Context, None],
) -> List[DocumentEvaluation]:
if not (evals := info.context.evals):
return []
span_id = SpanID(str(self.context.span_id))
return [
DocumentEvaluation.from_pb_evaluation(evaluation)
for evaluation in evals.get_document_evaluations_by_span_id(span_id)
]

@strawberry.field(
description="All descendant spans (children, grandchildren, etc.)",
) # type: ignore