From d53b5b6ea3eaa77125fc77de4713fcaa831d02bd Mon Sep 17 00:00:00 2001 From: Roger Yang Date: Thu, 16 Nov 2023 09:27:48 -0800 Subject: [PATCH 01/15] wip --- app/schema.graphql | 31 ++ pyproject.toml | 5 +- src/phoenix/core/evals.py | 116 +++++ src/phoenix/proto/trace/v1/evaluation.proto | 30 ++ src/phoenix/server/api/context.py | 2 + src/phoenix/server/api/schema.py | 11 +- src/phoenix/server/api/types/Evaluation.py | 53 ++ src/phoenix/server/api/types/Span.py | 25 +- src/phoenix/server/app.py | 17 + src/phoenix/server/evaluation_handler.py | 39 ++ src/phoenix/server/main.py | 7 + src/phoenix/session/evaluaton.py | 102 ++++ src/phoenix/session/session.py | 4 + src/phoenix/trace/exporter.py | 49 +- src/phoenix/trace/fixtures.py | 95 +++- src/phoenix/trace/v1/__init__.py | 2 + src/phoenix/trace/v1/evaluation_pb2.py | 34 ++ src/phoenix/trace/v1/evaluation_pb2.pyi | 112 +++++ .../trace_eval_ingestion_testing.ipynb | 466 ++++++++++++++++++ 19 files changed, 1176 insertions(+), 24 deletions(-) create mode 100644 src/phoenix/core/evals.py create mode 100644 src/phoenix/proto/trace/v1/evaluation.proto create mode 100644 src/phoenix/server/api/types/Evaluation.py create mode 100644 src/phoenix/server/evaluation_handler.py create mode 100644 src/phoenix/session/evaluaton.py create mode 100644 src/phoenix/trace/v1/evaluation_pb2.py create mode 100644 src/phoenix/trace/v1/evaluation_pb2.pyi create mode 100644 tutorials/internal/trace_eval_ingestion_testing.ipynb diff --git a/app/schema.graphql b/app/schema.graphql index edf634e517..427ce3a729 100644 --- a/app/schema.graphql +++ b/app/schema.graphql @@ -179,6 +179,15 @@ type DimensionWithValue { value: String } +type DocumentEvaluation implements Evaluation { + name: String! + score: Float + label: String + explanation: String + spanId: String! + documentPosition: Int! +} + type DriftTimeSeries implements TimeSeries { data: [TimeSeriesDataPoint!]! } @@ -261,6 +270,13 @@ type EmbeddingMetadata { linkToData: String } +interface Evaluation { + name: String! + score: Float + label: String + explanation: String +} + type Event { id: ID! eventMetadata: EventMetadata! @@ -460,6 +476,7 @@ type Query { clusterSelectionEpsilon: Float! = 0 ): [Cluster!]! spans(timeRange: TimeRange, traceIds: [ID!], first: Int = 50, last: Int, after: String, before: String, sort: SpanSort, rootSpansOnly: Boolean = false, filterCondition: String = null): SpanConnection! + spanEvaluationNames: [String!]! traceDatasetInfo: TraceDatasetInfo validateSpanFilterCondition(condition: String!): ValidationResult! } @@ -527,6 +544,12 @@ type Span { """ cumulativeTokenCountCompletion: Int + """Span evaluations""" + spanEvaluations: [SpanEvaluation!]! + + """Document evaluations""" + documentEvaluations: [DocumentEvaluation!]! + """All descendant spans (children, grandchildren, etc.)""" descendants: [Span!]! } @@ -558,6 +581,14 @@ type SpanEdge { cursor: String! } +type SpanEvaluation implements Evaluation { + name: String! + score: Float + label: String + explanation: String + spanId: String! +} + type SpanEvent { name: String! message: String! 
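A minimal end-to-end sketch of how the new schema fields above get populated — not part of the patch itself, and assuming a Phoenix server is running locally with a placeholder span ID: an Evaluation protobuf (defined later in this patch) is exported over HTTP to the new /v1/evaluations endpoint, and then surfaces on the corresponding span through the spanEvaluations and documentEvaluations fields added to the Span type above.

    import phoenix.trace.v1 as pb
    from google.protobuf.wrappers_pb2 import DoubleValue, StringValue
    from phoenix.trace.exporter import HttpExporter

    # Build a span-level evaluation result; the span ID below is a made-up placeholder.
    evaluation = pb.Evaluation(
        name="Hallucination",
        subject_id=pb.Evaluation.SubjectID(span_id="00000000-0000-0000-0000-000000000000"),
        result=pb.Evaluation.Result(
            score=DoubleValue(value=1.0),
            label=StringValue(value="factual"),
        ),
    )

    # HttpExporter routes pb.Evaluation payloads to POST /v1/evaluations as gzipped protobuf,
    # where the server-side Evals store picks them up for the GraphQL fields above.
    HttpExporter().export(evaluation)
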
diff --git a/pyproject.toml b/pyproject.toml index 22f9e4b0d4..8c9e32c20d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -231,7 +231,10 @@ dependencies = [ ] [tool.hatch.envs.proto.scripts] -recompile = "python -m grpc_tools.protoc -I src/phoenix/proto --python_out=src/phoenix --mypy_out=src/phoenix src/phoenix/proto/trace/v1/trace.proto" +recompile = """ +python -m grpc_tools.protoc -I src/phoenix/proto --python_out=src/phoenix --mypy_out=src/phoenix src/phoenix/proto/trace/v1/trace.proto && +python -m grpc_tools.protoc -I src/phoenix/proto --python_out=src/phoenix --mypy_out=src/phoenix src/phoenix/proto/trace/v1/evaluation.proto +""" [tool.interrogate] fail-under = 0 diff --git a/src/phoenix/core/evals.py b/src/phoenix/core/evals.py new file mode 100644 index 0000000000..32f8aff1e1 --- /dev/null +++ b/src/phoenix/core/evals.py @@ -0,0 +1,116 @@ +import weakref +from collections import defaultdict +from queue import SimpleQueue +from threading import RLock, Thread +from types import MethodType +from typing import DefaultDict, Dict, List, Optional +from uuid import UUID + +import numpy as np +from typing_extensions import TypeAlias + +import phoenix.trace.v1 as pb +from phoenix.trace.schemas import SpanID, TraceID + +END_OF_QUEUE = None # sentinel value for queue termination + +EvaluationName: TypeAlias = str +DocumentPosition: TypeAlias = int + + +class Evals: + def __init__(self) -> None: + self._queue: "SimpleQueue[Optional[pb.Evaluation]]" = SimpleQueue() + weakref.finalize(self, self._queue.put, END_OF_QUEUE) + self._lock = RLock() + self._start_consumer() + self._trace_evaluations_by_name: DefaultDict[ + EvaluationName, Dict[TraceID, pb.Evaluation] + ] = defaultdict(dict) + self._evaluations_by_trace_id: DefaultDict[ + TraceID, Dict[EvaluationName, pb.Evaluation] + ] = defaultdict(dict) + self._span_evaluations_by_name: DefaultDict[ + EvaluationName, Dict[SpanID, pb.Evaluation] + ] = defaultdict(dict) + self._evaluations_by_span_id: DefaultDict[ + SpanID, Dict[EvaluationName, pb.Evaluation] + ] = defaultdict(dict) + self._document_evaluations_by_span_id: DefaultDict[ + SpanID, DefaultDict[EvaluationName, Dict[DocumentPosition, pb.Evaluation]] + ] = defaultdict(lambda: defaultdict(dict)) + + def put(self, evaluation: pb.Evaluation) -> None: + self._queue.put(evaluation) + + def _start_consumer(self) -> None: + Thread( + target=MethodType( + self.__class__._consume_evaluations, + weakref.proxy(self), + ), + daemon=True, + ).start() + + def _consume_evaluations(self) -> None: + while (item := self._queue.get()) is not END_OF_QUEUE: + with self._lock: + self._process_evaluation(item) + + def _process_evaluation(self, evaluation: pb.Evaluation) -> None: + subject_id = evaluation.subject_id + name = evaluation.name + subject_id_kind = subject_id.WhichOneof("kind") + if subject_id_kind == "document_retrieval_id": + document_retrieval_id = subject_id.document_retrieval_id + span_id = UUID(document_retrieval_id.span_id) + document_position = document_retrieval_id.document_position + self._document_evaluations_by_span_id[span_id][name][document_position] = evaluation + elif subject_id_kind == "span_id": + span_id = UUID(subject_id.span_id) + self._evaluations_by_span_id[span_id][name] = evaluation + self._span_evaluations_by_name[name][span_id] = evaluation + elif subject_id_kind == "trace_id": + trace_id = UUID(subject_id.trace_id) + self._evaluations_by_span_id[trace_id][name] = evaluation + self._trace_evaluations_by_name[name][trace_id] = evaluation + else: + raise 
ValueError(f"unrecognized subject_id type: {type(subject_id_kind)}") + + def get_span_evaluation_names(self) -> List[EvaluationName]: + with self._lock: + return list(self._span_evaluations_by_name.keys()) + + def get_evaluations_by_span_id(self, span_id: SpanID) -> List[pb.Evaluation]: + with self._lock: + return list(self._evaluations_by_span_id[span_id].values()) + + def get_document_evaluations_by_span_id(self, span_id: SpanID) -> List[pb.Evaluation]: + all_evaluations: List[pb.Evaluation] = [] + with self._lock: + for evaluations in self._document_evaluations_by_span_id[span_id].values(): + all_evaluations.extend(evaluations.values()) + return all_evaluations + + def get_document_evaluations( + self, span_id: SpanID, evaluation_name: str, num_documents: int + ) -> List[Optional[pb.Evaluation]]: + relevance_evaluations: List[Optional[pb.Evaluation]] = [None] * num_documents + with self._lock: + evaluations = self._document_evaluations_by_span_id[span_id][evaluation_name] + for document_position, document_relevance in evaluations.items(): + if document_position < len(relevance_evaluations): + relevance_evaluations[document_position] = document_relevance + return relevance_evaluations + + def get_document_evaluation_scores( + self, span_id: SpanID, evaluation_name: str, num_documents: int + ) -> List[Optional[float]]: + scores: List[Optional[float]] = [np.nan] * num_documents + with self._lock: + evaluations = self._document_evaluations_by_span_id[span_id][evaluation_name] + for document_position, document_relevance in evaluations.items(): + result = document_relevance.result + if result.HasField("score") and document_position < len(scores): + scores[document_position] = document_relevance.result.score.value + return scores diff --git a/src/phoenix/proto/trace/v1/evaluation.proto b/src/phoenix/proto/trace/v1/evaluation.proto new file mode 100644 index 0000000000..a7d073477b --- /dev/null +++ b/src/phoenix/proto/trace/v1/evaluation.proto @@ -0,0 +1,30 @@ +syntax = "proto3"; +package phoenix.proto.evaluation.v1; + +import "google/protobuf/timestamp.proto"; +import "google/protobuf/struct.proto"; +import "google/protobuf/wrappers.proto"; + +message Evaluation { + message SubjectID { + message DocumentRetrievalID { + string span_id = 1; + int32 document_position = 2; // zero-based-index + } + oneof kind { + string trace_id = 1; + string span_id = 2; + DocumentRetrievalID document_retrieval_id = 3; + } + } + SubjectID subject_id = 1; + message Result { + google.protobuf.DoubleValue score = 1; + google.protobuf.StringValue label = 2; + google.protobuf.StringValue explanation = 3; + } + Result result = 2; + string name = 3; + google.protobuf.Timestamp timestamp = 4; + google.protobuf.Struct attributes = 5; +} diff --git a/src/phoenix/server/api/context.py b/src/phoenix/server/api/context.py index 2d6a95be7e..b0480bb960 100644 --- a/src/phoenix/server/api/context.py +++ b/src/phoenix/server/api/context.py @@ -6,6 +6,7 @@ from starlette.responses import Response from starlette.websockets import WebSocket +from phoenix.core.evals import Evals from phoenix.core.model_schema import Model from phoenix.core.traces import Traces @@ -18,3 +19,4 @@ class Context: export_path: Path corpus: Optional[Model] = None traces: Optional[Traces] = None + evals: Optional[Evals] = None diff --git a/src/phoenix/server/api/schema.py b/src/phoenix/server/api/schema.py index cfd9fad317..37d4b2abb2 100644 --- a/src/phoenix/server/api/schema.py +++ b/src/phoenix/server/api/schema.py @@ -236,9 +236,18 @@ def spans( spans = 
filter(predicate, spans) if sort: spans = sort(spans) - data = list(map(to_gql_span, spans)) + data = [to_gql_span(span, info.context.evals) for span in spans] return connection_from_list(data=data, args=args) + @strawberry.field + def span_evaluation_names( + self, + info: Info[Context, None], + ) -> List[str]: + if (evals := info.context.evals) is None: + return [] + return evals.get_span_evaluation_names() + @strawberry.field def trace_dataset_info( self, diff --git a/src/phoenix/server/api/types/Evaluation.py b/src/phoenix/server/api/types/Evaluation.py new file mode 100644 index 0000000000..c3762ddd93 --- /dev/null +++ b/src/phoenix/server/api/types/Evaluation.py @@ -0,0 +1,53 @@ +from typing import Optional + +import strawberry + +import phoenix.trace.v1 as pb + + +@strawberry.interface +class Evaluation: + name: str + score: Optional[float] + label: Optional[str] + explanation: Optional[str] + + +@strawberry.type +class SpanEvaluation(Evaluation): + span_id: str + + @staticmethod + def from_pb_evaluation(evaluation: pb.Evaluation) -> "SpanEvaluation": + result = evaluation.result + score = result.score.value if result.HasField("score") else None + label = result.label.value if result.HasField("label") else None + explanation = result.explanation.value if result.HasField("explanation") else None + return SpanEvaluation( + name=evaluation.name, + score=score, + label=label, + explanation=explanation, + span_id=evaluation.subject_id.span_id, + ) + + +@strawberry.type +class DocumentEvaluation(Evaluation): + span_id: str + document_position: int + + @staticmethod + def from_pb_evaluation(evaluation: pb.Evaluation) -> "DocumentEvaluation": + result = evaluation.result + score = result.score.value if result.HasField("score") else None + label = result.label.value if result.HasField("label") else None + explanation = result.explanation.value if result.HasField("explanation") else None + return DocumentEvaluation( + name=evaluation.name, + score=score, + label=label, + explanation=explanation, + span_id=evaluation.subject_id.document_retrieval_id.span_id, + document_position=evaluation.subject_id.document_retrieval_id.document_position, + ) diff --git a/src/phoenix/server/api/types/Span.py b/src/phoenix/server/api/types/Span.py index 02559445b4..c53c5366aa 100644 --- a/src/phoenix/server/api/types/Span.py +++ b/src/phoenix/server/api/types/Span.py @@ -9,8 +9,10 @@ from strawberry.types import Info import phoenix.trace.schemas as trace_schema +from phoenix.core.evals import Evals from phoenix.core.traces import ComputedAttributes from phoenix.server.api.context import Context +from phoenix.server.api.types.Evaluation import DocumentEvaluation, SpanEvaluation from phoenix.server.api.types.MimeType import MimeType from phoenix.trace.schemas import SpanID from phoenix.trace.semantic_conventions import ( @@ -122,6 +124,14 @@ class Span: description="Cumulative (completion) token count from self and all " "descendant spans (children, grandchildren, etc.)", ) + span_evaluations: List[SpanEvaluation] = strawberry.field( + description="Span evaluations", + default_factory=list, + ) + document_evaluations: List[DocumentEvaluation] = strawberry.field( + description="Document evaluations", + default_factory=list, + ) @strawberry.field( description="All descendant spans (children, grandchildren, etc.)", @@ -133,17 +143,24 @@ def descendants( if (traces := info.context.traces) is None: return [] return [ - to_gql_span(cast(trace_schema.Span, traces[span_id])) + to_gql_span(cast(trace_schema.Span, 
traces[span_id]), info.context.evals) for span_id in traces.get_descendant_span_ids( cast(SpanID, self.context.span_id), ) ] -def to_gql_span(span: trace_schema.Span) -> "Span": +def to_gql_span(span: trace_schema.Span, evals: Optional[Evals] = None) -> "Span": events: List[SpanEvent] = list(map(SpanEvent.from_event, span.events)) input_value = cast(Optional[str], span.attributes.get(INPUT_VALUE)) output_value = cast(Optional[str], span.attributes.get(OUTPUT_VALUE)) + span_evaluations: List[SpanEvaluation] = [] + document_evaluations: List[DocumentEvaluation] = [] + span_id = span.context.span_id + for evaluation in evals.get_evaluations_by_span_id(span_id) if evals else (): + span_evaluations.append(SpanEvaluation.from_pb_evaluation(evaluation)) + for evaluation in evals.get_document_evaluations_by_span_id(span_id) if evals else (): + document_evaluations.append(DocumentEvaluation.from_pb_evaluation(evaluation)) return Span( name=span.name, status_code=SpanStatusCode(span.status_code), @@ -154,7 +171,7 @@ def to_gql_span(span: trace_schema.Span) -> "Span": latency_ms=cast(Optional[float], span.attributes.get(ComputedAttributes.LATENCY_MS.value)), context=SpanContext( trace_id=cast(ID, span.context.trace_id), - span_id=cast(ID, span.context.span_id), + span_id=cast(ID, span_id), ), attributes=json.dumps( _nested_attributes(_hide_embedding_vectors(span.attributes)), @@ -201,6 +218,8 @@ def to_gql_span(span: trace_schema.Span) -> "Span": if output_value is not None else None ), + span_evaluations=span_evaluations, + document_evaluations=document_evaluations, ) diff --git a/src/phoenix/server/app.py b/src/phoenix/server/app.py index db75dcfbbc..40a4c83995 100644 --- a/src/phoenix/server/app.py +++ b/src/phoenix/server/app.py @@ -20,11 +20,13 @@ import phoenix from phoenix.config import SERVER_DIR +from phoenix.core.evals import Evals from phoenix.core.model_schema import Model from phoenix.core.traces import Traces from phoenix.pointcloud.umap_parameters import UMAPParameters from phoenix.server.api.context import Context from phoenix.server.api.schema import schema +from phoenix.server.evaluation_handler import EvaluationHandler from phoenix.server.span_handler import SpanHandler logger = logging.getLogger(__name__) @@ -93,10 +95,12 @@ def __init__( graphiql: bool = False, corpus: Optional[Model] = None, traces: Optional[Traces] = None, + evals: Optional[Evals] = None, ) -> None: self.model = model self.corpus = corpus self.traces = traces + self.evals = evals self.export_path = export_path super().__init__(schema, graphiql=graphiql) @@ -111,6 +115,7 @@ async def get_context( model=self.model, corpus=self.corpus, traces=self.traces, + evals=self.evals, export_path=self.export_path, ) @@ -140,6 +145,7 @@ def create_app( umap_params: UMAPParameters, corpus: Optional[Model] = None, traces: Optional[Traces] = None, + evals: Optional[Evals] = None, debug: bool = False, ) -> Starlette: graphql = GraphQLWithContext( @@ -147,6 +153,7 @@ def create_app( model=model, corpus=corpus, traces=traces, + evals=evals, export_path=export_path, graphiql=True, ) @@ -169,6 +176,16 @@ def create_app( ), ] ) + + ( + [] + if evals is None + else [ + Route( + "/v1/evaluations", + type("SpanEndpoint", (EvaluationHandler,), {"queue": evals}), + ) + ] + ) + [ Route("/arize_phoenix_version", version), Route( diff --git a/src/phoenix/server/evaluation_handler.py b/src/phoenix/server/evaluation_handler.py new file mode 100644 index 0000000000..55460bf322 --- /dev/null +++ b/src/phoenix/server/evaluation_handler.py @@ -0,0 
+1,39 @@ +import gzip +from typing import Protocol + +from starlette.endpoints import HTTPEndpoint +from starlette.requests import Request +from starlette.responses import Response +from starlette.status import HTTP_422_UNPROCESSABLE_ENTITY + +import phoenix.trace.v1 as pb + + +class SupportsPutEvaluation(Protocol): + def put(self, evaluation: pb.Evaluation) -> None: + ... + + +class EvaluationHandler(HTTPEndpoint): + queue: SupportsPutEvaluation + + async def post(self, request: Request) -> Response: + try: + content_type = request.headers.get("content-type") + if content_type == "application/x-protobuf": + body = await request.body() + content_encoding = request.headers.get("content-encoding") + if content_encoding == "gzip": + body = gzip.decompress(body) + elif content_encoding is not None: + raise NotImplementedError(f"Unsupported content-encoding: {content_encoding}") + evaluation = pb.Evaluation() + evaluation.ParseFromString(body) + else: + raise NotImplementedError(f"Unsupported content-type: {content_type}") + except NotImplementedError as e: + return Response(str(e), status_code=HTTP_422_UNPROCESSABLE_ENTITY) + except Exception: + return Response(status_code=HTTP_422_UNPROCESSABLE_ENTITY) + self.queue.put(evaluation) + return Response() diff --git a/src/phoenix/server/main.py b/src/phoenix/server/main.py index 489a336268..fc38f84af2 100644 --- a/src/phoenix/server/main.py +++ b/src/phoenix/server/main.py @@ -11,6 +11,7 @@ from uvicorn import Config, Server from phoenix.config import EXPORT_DIR, get_env_host, get_env_port, get_pids_path +from phoenix.core.evals import Evals from phoenix.core.model_schema_adapter import create_model_from_datasets from phoenix.core.traces import Traces from phoenix.datasets.dataset import EMPTY_DATASET, Dataset @@ -26,6 +27,7 @@ TRACES_FIXTURES, _download_traces_fixture, _get_trace_fixture_by_name, + get_evals_from_fixture, ) from phoenix.trace.schemas import Span from phoenix.trace.span_json_decoder import json_string_to_span @@ -135,6 +137,7 @@ def _load_spans( reference_dataset, ) traces = Traces() + evals = Evals() if trace_dataset_name is not None: fixture_spans = map( json_string_to_span, @@ -149,6 +152,9 @@ def _load_spans( args=(traces, fixture_spans, simulate_streaming), daemon=True, ).start() + fixture_evals = get_evals_from_fixture(trace_dataset_name) + for evaluation in fixture_evals: + evals.put(evaluation) umap_params_list = args.umap_params.split(",") umap_params = UMAPParameters( min_dist=float(umap_params_list[0]), @@ -161,6 +167,7 @@ def _load_spans( model=model, umap_params=umap_params, traces=traces, + evals=evals, corpus=None if corpus_dataset is None else create_model_from_datasets(corpus_dataset), debug=args.debug, ) diff --git a/src/phoenix/session/evaluaton.py b/src/phoenix/session/evaluaton.py new file mode 100644 index 0000000000..f5169bd9af --- /dev/null +++ b/src/phoenix/session/evaluaton.py @@ -0,0 +1,102 @@ +from typing import Any, Iterable, List, Mapping, Optional, Tuple, Union, cast + +import pandas as pd +from google.protobuf.wrappers_pb2 import DoubleValue, StringValue + +import phoenix.trace.v1 as pb +from phoenix.core.traces import TRACE_ID +from phoenix.session.session import Session +from phoenix.trace.exporter import HttpExporter +from phoenix.trace.schemas import ATTRIBUTE_PREFIX +from phoenix.trace.semantic_conventions import ( + DOCUMENT_CONTENT, + DOCUMENT_SCORE, + INPUT_VALUE, + RETRIEVAL_DOCUMENTS, +) + + +def get_retrieved_documents(session: Session) -> pd.DataFrame: + data: List[Mapping[str, Any]] = 
[] + if (df := session.get_spans_dataframe("span_kind == 'RETRIEVER'")) is not None: + for span_id, query, documents, trace_id in df.loc[ + :, + [ATTRIBUTE_PREFIX + INPUT_VALUE, ATTRIBUTE_PREFIX + RETRIEVAL_DOCUMENTS, TRACE_ID], + ].itertuples(): + if not isinstance(documents, Iterable): + continue + for position, document in enumerate(documents): + if not hasattr(document, "get"): + continue + data.append( + { + "context.trace_id": trace_id, + "context.span_id": span_id, + "query": query, + "document_position": position, + "document_content": document.get(DOCUMENT_CONTENT), + "document_score": document.get(DOCUMENT_SCORE), + } + ) + index = ["context.span_id", "document_position"] + columns = [ + "context.span_id", + "document_position", + "query", + "document_content", + "document_score", + "context.trace_id", + ] + return pd.DataFrame(data=data, columns=columns).set_index(index) + + +def add_evaluations( + exporter: HttpExporter, + evaluations: pd.DataFrame, + evaluation_name: str, +) -> None: + index_names = evaluations.index.names + for index, row in evaluations.iterrows(): + subject_id = _extract_subject_id(cast(Union[str, Tuple[str]], index), index_names) + result = _extract_result(row) + evaluation = pb.Evaluation( + name=evaluation_name, + result=result, + subject_id=subject_id, + ) + exporter.export(evaluation) + + +def _extract_subject_id( + index: Union[str, Tuple[str]], index_names: List[str] +) -> pb.Evaluation.SubjectID: + if index_names and index_names[0].endswith("span_id"): + if len(index_names) == 2 and index_names[1].endswith("document_position"): + span_id, document_position = cast(Tuple[str, int], index) + assert isinstance(span_id, str) + assert isinstance(document_position, int) + return pb.Evaluation.SubjectID( + document_retrieval_id=pb.Evaluation.SubjectID.DocumentRetrievalID( + document_position=document_position, + span_id=span_id, + ), + ) + span_id = cast(str, index) + assert isinstance(span_id, str) + return pb.Evaluation.SubjectID(span_id=span_id) + elif index_names and index_names[0].endswith("trace_id"): + trace_id = cast(str, index) + assert isinstance(trace_id, str) + return pb.Evaluation.SubjectID(trace_id=trace_id) + raise ValueError(f"Unexpected index names: {index_names}") + + +def _extract_result(row: "pd.Series[Any]") -> pb.Evaluation.Result: + score = cast(Optional[float], row.get("score")) + label = cast(Optional[str], row.get("label")) + explanation = cast(Optional[str], row.get("explanation")) + return pb.Evaluation.Result( + score=DoubleValue(value=score) if score is not None else None, + label=StringValue(value=label) if label else None, + explanation=StringValue(value=explanation) if explanation else None, + ) diff --git a/src/phoenix/session/session.py b/src/phoenix/session/session.py index a7718bc492..b6a8294610 100644 --- a/src/phoenix/session/session.py +++ b/src/phoenix/session/session.py @@ -18,6 +18,7 @@ import pandas as pd from phoenix.config import get_env_host, get_env_port, get_exported_files +from phoenix.core.evals import Evals from phoenix.core.model_schema_adapter import create_model_from_datasets from phoenix.core.traces import Traces from phoenix.datasets.dataset import EMPTY_DATASET, Dataset @@ -105,6 +106,8 @@ def __init__( for span in trace_dataset.to_spans(): self.traces.put(span) + self.evals: Evals = Evals() + self.host = host or get_env_host() self.port = port or get_env_port() self.temp_dir = TemporaryDirectory() @@ -263,6 +266,7 @@ def __init__( model=self.model, corpus=self.corpus, traces=self.traces, + 
evals=self.evals, umap_params=self.umap_parameters, ) self.server = ThreadServer( diff --git a/src/phoenix/trace/exporter.py b/src/phoenix/trace/exporter.py index 5bc58fed61..5da59cfb93 100644 --- a/src/phoenix/trace/exporter.py +++ b/src/phoenix/trace/exporter.py @@ -1,24 +1,29 @@ import gzip import logging import weakref +from functools import singledispatchmethod from queue import SimpleQueue from threading import Thread from types import MethodType -from typing import Optional +from typing import Any, Optional, Union import requests +from google.protobuf.message import Message from requests import Session +import phoenix.trace.v1 as pb from phoenix.config import get_env_host, get_env_port from phoenix.trace.schemas import Span from phoenix.trace.v1.utils import encode +END_OF_QUEUE = None # sentinel value for queue termination + logger = logging.getLogger(__name__) logger.addHandler(logging.NullHandler()) class NoOpExporter: - def export(self, span: Span) -> None: + def export(self, _: Any) -> None: pass @@ -29,7 +34,7 @@ def __init__( port: Optional[int] = None, ) -> None: """ - Span Exporter using HTTP. + Span/Evaluation Exporter using HTTP. Parameters ---------- @@ -44,7 +49,6 @@ def __init__( self._port = port or get_env_port() self._base_url = f"http://{self._host}:{self._port}" self._warn_if_phoenix_is_not_running() - self._url = f"{self._base_url}/v1/spans" self._session = Session() weakref.finalize(self, self._session.close) self._session.headers.update( @@ -53,13 +57,13 @@ def __init__( "content-encoding": "gzip", } ) - self._queue: "SimpleQueue[Optional[Span]]" = SimpleQueue() + self._queue: "SimpleQueue[Optional[Union[Span, pb.Evaluation]]]" = SimpleQueue() # Putting `None` as the sentinel value for queue termination. - weakref.finalize(self, self._queue.put, None) + weakref.finalize(self, self._queue.put, END_OF_QUEUE) self._start_consumer() - def export(self, span: Span) -> None: - self._queue.put(span) + def export(self, item: Union[Span, pb.Evaluation]) -> None: + self._queue.put(item) def _start_consumer(self) -> None: Thread( @@ -71,20 +75,31 @@ def _start_consumer(self) -> None: ).start() def _consume_spans(self) -> None: - while True: - if not (span := self._queue.get()): - return - self._send(span) - - def _send(self, span: Span) -> None: - pb_span = encode(span) - serialized = pb_span.SerializeToString() + while (item := self._queue.get()) is not END_OF_QUEUE: + self._send(item) + + def _send(self, item: Union[Span, pb.Evaluation]) -> None: + if isinstance(item, Span): + message: Message = encode(item) + elif isinstance(item, pb.Evaluation): + message = item + else: + return + serialized = message.SerializeToString() data = gzip.compress(serialized) try: - self._session.post(self._url, data=data) + self._session.post(self._url(item), data=data) except Exception as e: logger.exception(e) + @singledispatchmethod + def _url(self, _: Span) -> str: + return f"{self._base_url}/v1/spans" + + @_url.register + def _(self, _: pb.Evaluation) -> str: + return f"{self._base_url}/v1/evaluations" + def _warn_if_phoenix_is_not_running(self) -> None: try: requests.get(f"{self._base_url}/arize_phoenix_version").raise_for_status() diff --git a/src/phoenix/trace/fixtures.py b/src/phoenix/trace/fixtures.py index 6341bd9acd..fe88f21cb5 100644 --- a/src/phoenix/trace/fixtures.py +++ b/src/phoenix/trace/fixtures.py @@ -1,22 +1,68 @@ -from dataclasses import dataclass -from typing import List, Optional, cast +from dataclasses import dataclass, field +from typing import Iterable, Iterator, 
List, NamedTuple, Optional, Tuple, cast from urllib import request +import pandas as pd +from google.protobuf.wrappers_pb2 import DoubleValue, StringValue + +import phoenix.trace.v1 as pb +from phoenix.trace.schemas import SpanID from phoenix.trace.trace_dataset import TraceDataset from phoenix.trace.utils import json_lines_to_df +class EvaluationResultSchema(NamedTuple): + label: Optional[str] = "label" + score: Optional[str] = "score" + explanation: Optional[str] = "explanation" + + +@dataclass(frozen=True) +class EvaluationFixture: + evaluation_name: str + file_name: str + evaluation_result_schema: EvaluationResultSchema = field(default_factory=EvaluationResultSchema) + + +@dataclass(frozen=True) +class DocumentRelevanceEvaluationFixture(EvaluationFixture): + document_position: str = "document_position" + + @dataclass(frozen=True) class TracesFixture: name: str description: str file_name: str + evaluation_fixtures: Iterable[EvaluationFixture] = () llama_index_rag_fixture = TracesFixture( name="llama_index_rag", description="Traces from running the llama_index on a RAG use case.", file_name="llama_index_rag_with_rerank.jsonl", + evaluation_fixtures=( + EvaluationFixture( + evaluation_name="Q&A Correctness", + file_name="llama_index_rag_with_rerank.qa_correctness_eval.parquet", + ), + EvaluationFixture( + evaluation_name="Hallucination", + file_name="llama_index_rag_with_rerank.hallucination_eval.parquet", + ), + EvaluationFixture( + evaluation_name="NDCG@2", + file_name="llama_index_rag_with_rerank.ndcg_at_2.parquet", + ), + EvaluationFixture( + evaluation_name="Precision@3", + file_name="llama_index_rag_with_rerank.precision_at_3.parquet", + ), + DocumentRelevanceEvaluationFixture( + evaluation_name="Relevance", + file_name="llama_index_rag_with_rerank.documents_eval.parquet", + ), + ), ) llama_index_calculator_agent_fixture = TracesFixture( @@ -105,3 +151,48 @@ def load_example_traces(use_case: str) -> TraceDataset: """ fixture = _get_trace_fixture_by_name(use_case) return TraceDataset(json_lines_to_df(_download_traces_fixture(fixture))) + + +def get_evals_from_fixture(use_case: str) -> Iterator[pb.Evaluation]: + fixture = _get_trace_fixture_by_name(use_case) + for eval_fixture in fixture.evaluation_fixtures: + yield from _read_eval_fixture(eval_fixture) + + +def _read_eval_fixture(eval_fixture: EvaluationFixture) -> Iterator[pb.Evaluation]: + df = pd.read_parquet(_url(eval_fixture.file_name)) + for index, row in df.iterrows(): + schema = eval_fixture.evaluation_result_schema + label = row.get(schema.label) + score = row.get(schema.score) + explanation = row.get(schema.explanation) + result = pb.Evaluation.Result( + score=DoubleValue(value=cast(float, score)) if score is not None else None, + label=StringValue(value=cast(str, label)) if label else None, + explanation=StringValue(value=cast(str, explanation)) if explanation else None, + ) + if isinstance(eval_fixture, DocumentRelevanceEvaluationFixture): + span_id, document_position = cast(Tuple[SpanID, int], index) + subject_id = pb.Evaluation.SubjectID( + document_retrieval_id=pb.Evaluation.SubjectID.DocumentRetrievalID( + document_position=document_position, + span_id=str(span_id), + ), + ) + else: + span_id = cast(SpanID, index) + subject_id = pb.Evaluation.SubjectID(span_id=str(span_id)) + yield pb.Evaluation( + name=eval_fixture.evaluation_name, + result=result, + subject_id=subject_id, + ) + + +def _url( + file_name: str, + host: Optional[str] = "https://storage.googleapis.com/", + bucket: Optional[str] = "arize-assets", + 
prefix: Optional[str] = "phoenix/traces/", +) -> str: + return f"{host}{bucket}/{prefix}{file_name}" diff --git a/src/phoenix/trace/v1/__init__.py b/src/phoenix/trace/v1/__init__.py index 00d6a00abc..39315c8a60 100644 --- a/src/phoenix/trace/v1/__init__.py +++ b/src/phoenix/trace/v1/__init__.py @@ -1,7 +1,9 @@ +from phoenix.trace.v1.evaluation_pb2 import Evaluation from phoenix.trace.v1.trace_pb2 import Embedding, Retrieval, Span __all__ = [ "Span", "Retrieval", "Embedding", + "Evaluation", ] diff --git a/src/phoenix/trace/v1/evaluation_pb2.py b/src/phoenix/trace/v1/evaluation_pb2.py new file mode 100644 index 0000000000..3e4315304b --- /dev/null +++ b/src/phoenix/trace/v1/evaluation_pb2.py @@ -0,0 +1,34 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: trace/v1/evaluation.proto +"""Generated protocol buffer code.""" +from google.protobuf.internal import builder as _builder +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import symbol_database as _symbol_database +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2 +from google.protobuf import struct_pb2 as google_dot_protobuf_dot_struct__pb2 +from google.protobuf import wrappers_pb2 as google_dot_protobuf_dot_wrappers__pb2 + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x19trace/v1/evaluation.proto\x12\x1bphoenix.proto.evaluation.v1\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x1cgoogle/protobuf/struct.proto\x1a\x1egoogle/protobuf/wrappers.proto\"\xfd\x04\n\nEvaluation\x12\x45\n\nsubject_id\x18\n \x01(\x0b\x32\x31.phoenix.proto.evaluation.v1.Evaluation.SubjectID\x12>\n\x06result\x18\x02 \x01(\x0b\x32..phoenix.proto.evaluation.v1.Evaluation.Result\x12\x0c\n\x04name\x18\x03 \x01(\t\x12-\n\ttimestamp\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12+\n\nattributes\x18\x05 \x01(\x0b\x32\x17.google.protobuf.Struct\x1a\xe5\x01\n\tSubjectID\x12\x12\n\x08trace_id\x18\x01 \x01(\tH\x00\x12\x11\n\x07span_id\x18\x02 \x01(\tH\x00\x12\x66\n\x15\x64ocument_retrieval_id\x18\x03 \x01(\x0b\x32\x45.phoenix.proto.evaluation.v1.Evaluation.SubjectID.DocumentRetrievalIDH\x00\x1a\x41\n\x13\x44ocumentRetrievalID\x12\x0f\n\x07span_id\x18\x01 \x01(\t\x12\x19\n\x11\x64ocument_position\x18\x02 \x01(\x05\x42\x06\n\x04kind\x1a\x95\x01\n\x06Result\x12+\n\x05score\x18\x01 \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x12+\n\x05label\x18\x02 \x01(\x0b\x32\x1c.google.protobuf.StringValue\x12\x31\n\x0b\x65xplanation\x18\x03 \x01(\x0b\x32\x1c.google.protobuf.StringValueb\x06proto3') + +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'trace.v1.evaluation_pb2', globals()) +if _descriptor._USE_C_DESCRIPTORS == False: + + DESCRIPTOR._options = None + _EVALUATION._serialized_start=154 + _EVALUATION._serialized_end=791 + _EVALUATION_SUBJECTID._serialized_start=410 + _EVALUATION_SUBJECTID._serialized_end=639 + _EVALUATION_SUBJECTID_DOCUMENTRETRIEVALID._serialized_start=566 + _EVALUATION_SUBJECTID_DOCUMENTRETRIEVALID._serialized_end=631 + _EVALUATION_RESULT._serialized_start=642 + _EVALUATION_RESULT._serialized_end=791 +# @@protoc_insertion_point(module_scope) diff --git a/src/phoenix/trace/v1/evaluation_pb2.pyi b/src/phoenix/trace/v1/evaluation_pb2.pyi new file mode 100644 index 0000000000..ed848ef322 --- /dev/null +++ 
b/src/phoenix/trace/v1/evaluation_pb2.pyi @@ -0,0 +1,112 @@ +""" +@generated by mypy-protobuf. Do not edit manually! +isort:skip_file +""" +import builtins +import google.protobuf.descriptor +import google.protobuf.message +import google.protobuf.struct_pb2 +import google.protobuf.timestamp_pb2 +import google.protobuf.wrappers_pb2 +import sys + +if sys.version_info >= (3, 8): + import typing as typing_extensions +else: + import typing_extensions + +DESCRIPTOR: google.protobuf.descriptor.FileDescriptor + +@typing_extensions.final +class Evaluation(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + @typing_extensions.final + class SubjectID(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + @typing_extensions.final + class DocumentRetrievalID(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + SPAN_ID_FIELD_NUMBER: builtins.int + DOCUMENT_POSITION_FIELD_NUMBER: builtins.int + span_id: builtins.str + document_position: builtins.int + """zero-based-index""" + def __init__( + self, + *, + span_id: builtins.str = ..., + document_position: builtins.int = ..., + ) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["document_position", b"document_position", "span_id", b"span_id"]) -> None: ... + + TRACE_ID_FIELD_NUMBER: builtins.int + SPAN_ID_FIELD_NUMBER: builtins.int + DOCUMENT_RETRIEVAL_ID_FIELD_NUMBER: builtins.int + trace_id: builtins.str + span_id: builtins.str + @property + def document_retrieval_id(self) -> global___Evaluation.SubjectID.DocumentRetrievalID: ... + def __init__( + self, + *, + trace_id: builtins.str = ..., + span_id: builtins.str = ..., + document_retrieval_id: global___Evaluation.SubjectID.DocumentRetrievalID | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["document_retrieval_id", b"document_retrieval_id", "kind", b"kind", "span_id", b"span_id", "trace_id", b"trace_id"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["document_retrieval_id", b"document_retrieval_id", "kind", b"kind", "span_id", b"span_id", "trace_id", b"trace_id"]) -> None: ... + def WhichOneof(self, oneof_group: typing_extensions.Literal["kind", b"kind"]) -> typing_extensions.Literal["trace_id", "span_id", "document_retrieval_id"] | None: ... + + @typing_extensions.final + class Result(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + SCORE_FIELD_NUMBER: builtins.int + LABEL_FIELD_NUMBER: builtins.int + EXPLANATION_FIELD_NUMBER: builtins.int + @property + def score(self) -> google.protobuf.wrappers_pb2.DoubleValue: ... + @property + def label(self) -> google.protobuf.wrappers_pb2.StringValue: ... + @property + def explanation(self) -> google.protobuf.wrappers_pb2.StringValue: ... + def __init__( + self, + *, + score: google.protobuf.wrappers_pb2.DoubleValue | None = ..., + label: google.protobuf.wrappers_pb2.StringValue | None = ..., + explanation: google.protobuf.wrappers_pb2.StringValue | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["explanation", b"explanation", "label", b"label", "score", b"score"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["explanation", b"explanation", "label", b"label", "score", b"score"]) -> None: ... 
+ + SUBJECT_ID_FIELD_NUMBER: builtins.int + RESULT_FIELD_NUMBER: builtins.int + NAME_FIELD_NUMBER: builtins.int + TIMESTAMP_FIELD_NUMBER: builtins.int + ATTRIBUTES_FIELD_NUMBER: builtins.int + @property + def subject_id(self) -> global___Evaluation.SubjectID: ... + @property + def result(self) -> global___Evaluation.Result: ... + name: builtins.str + @property + def timestamp(self) -> google.protobuf.timestamp_pb2.Timestamp: ... + @property + def attributes(self) -> google.protobuf.struct_pb2.Struct: ... + def __init__( + self, + *, + subject_id: global___Evaluation.SubjectID | None = ..., + result: global___Evaluation.Result | None = ..., + name: builtins.str = ..., + timestamp: google.protobuf.timestamp_pb2.Timestamp | None = ..., + attributes: google.protobuf.struct_pb2.Struct | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["attributes", b"attributes", "result", b"result", "subject_id", b"subject_id", "timestamp", b"timestamp"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["attributes", b"attributes", "name", b"name", "result", b"result", "subject_id", b"subject_id", "timestamp", b"timestamp"]) -> None: ... + +global___Evaluation = Evaluation diff --git a/tutorials/internal/trace_eval_ingestion_testing.ipynb b/tutorials/internal/trace_eval_ingestion_testing.ipynb new file mode 100644 index 0000000000..d523923dbb --- /dev/null +++ b/tutorials/internal/trace_eval_ingestion_testing.ipynb @@ -0,0 +1,466 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "d58c5245e6d7811f", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import phoenix as px\n", + "from phoenix.experimental.evals.functions import llm_classify\n", + "from phoenix.experimental.evals.models import OpenAIModel\n", + "from phoenix.experimental.evals.templates.default_templates import (\n", + " HALLUCINATION_PROMPT_RAILS_MAP,\n", + " HALLUCINATION_PROMPT_TEMPLATE,\n", + " QA_PROMPT_RAILS_MAP,\n", + " QA_PROMPT_TEMPLATE,\n", + " RAG_RELEVANCY_PROMPT_RAILS_MAP,\n", + " RAG_RELEVANCY_PROMPT_TEMPLATE,\n", + ")\n", + "from phoenix.session.evaluaton import add_evaluations, get_retrieved_documents\n", + "from phoenix.trace.exporter import HttpExporter\n", + "from sklearn.metrics import ndcg_score" + ] + }, + { + "cell_type": "markdown", + "id": "4b43166d02c26e8d", + "metadata": { + "collapsed": false + }, + "source": [ + "# Start Phoenix" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "initial_id", + "metadata": {}, + "outputs": [], + "source": [ + "ds = px.load_example_traces(\"llama_index_rag\")\n", + "px.launch_app(trace=ds)" + ] + }, + { + "cell_type": "markdown", + "id": "1362576ff0fe4e2c", + "metadata": { + "collapsed": false + }, + "source": [ + "# Extract Retrieved Documents" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2c45b85c6644735", + "metadata": {}, + "outputs": [], + "source": [ + "retrieved_documents = get_retrieved_documents(px.active_session())\n", + "retrieved_documents" + ] + }, + { + "cell_type": "markdown", + "id": "9ac938a5c199dc82", + "metadata": { + "collapsed": false + }, + "source": [ + "# Set Up OpenAI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e14465175520ce42", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from getpass import getpass\n", + "\n", + "import openai\n", + "\n", + "if not (openai_api_key := os.getenv(\"OPENAI_API_KEY\")):\n", + " openai_api_key = 
getpass(\"🔑 Enter your OpenAI API key: \")\n", + "openai.api_key = openai_api_key\n", + "os.environ[\"OPENAI_API_KEY\"] = openai_api_key" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de9664171d3e33b8", + "metadata": {}, + "outputs": [], + "source": [ + "model = OpenAIModel(model_name=\"gpt-4-1106-preview\")\n", + "model(\"hi\")" + ] + }, + { + "cell_type": "markdown", + "id": "d694213dcf35676f", + "metadata": { + "collapsed": false + }, + "source": [ + "# Evaluate Document Relevance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff130bce795ea35f", + "metadata": {}, + "outputs": [], + "source": [ + "retrieved_documents" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "516dc273735ad00c", + "metadata": {}, + "outputs": [], + "source": [ + "retrieved_documents_eval = llm_classify(\n", + " retrieved_documents.rename({\"query\": \"input\", \"document_content\": \"reference\"}, axis=1),\n", + " model,\n", + " RAG_RELEVANCY_PROMPT_TEMPLATE,\n", + " list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values()),\n", + " provide_explanation=True,\n", + ")\n", + "retrieved_documents_eval[\"score\"] = (retrieved_documents_eval[\"label\"] == \"relevant\").astype(int)\n", + "retrieved_documents_eval.to_parquet(\"llama_index_rag_with_rerank.documents_eval.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f341795ae24ca024", + "metadata": {}, + "outputs": [], + "source": [ + "retrieved_documents_eval = pd.read_parquet(\"llama_index_rag_with_rerank.documents_eval.parquet\")\n", + "retrieved_documents_eval" + ] + }, + { + "cell_type": "markdown", + "id": "357fe94b02b22a6b", + "metadata": { + "collapsed": false + }, + "source": [ + "# Merge Data to Compute Ranking Metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3bd04b678c9d18c", + "metadata": {}, + "outputs": [], + "source": [ + "combined = pd.concat([retrieved_documents, retrieved_documents_eval.add_prefix(\"eval_\")], axis=1)\n", + "combined" + ] + }, + { + "cell_type": "markdown", + "id": "b162eccd6c69aa7f", + "metadata": { + "collapsed": false + }, + "source": [ + "# Compute NDCG@2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77d9fdebd46d268b", + "metadata": {}, + "outputs": [], + "source": [ + "ndcg_at_2 = pd.DataFrame(\n", + " {\n", + " \"score\": combined.groupby(\"context.span_id\").apply(\n", + " lambda x: ndcg_score([x.eval_score[:2]], [x.document_score[:2]])\n", + " )\n", + " }\n", + ")\n", + "ndcg_at_2.to_parquet(\"llama_index_rag_with_rerank.ndcg_at_2.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8032851d13b63d55", + "metadata": {}, + "outputs": [], + "source": [ + "ndcg_at_2 = pd.read_parquet(\"llama_index_rag_with_rerank.ndcg_at_2.parquet\")\n", + "ndcg_at_2" + ] + }, + { + "cell_type": "markdown", + "id": "e8d5816954fbaa4d", + "metadata": { + "collapsed": false + }, + "source": [ + "# Compute Precision@3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3167f4675c7313", + "metadata": {}, + "outputs": [], + "source": [ + "precision_at_3 = pd.DataFrame(\n", + " {\"score\": combined.groupby(\"context.span_id\").apply(lambda x: x.eval_score[:3].sum() / 3)}\n", + ")\n", + "precision_at_3.to_parquet(\"llama_index_rag_with_rerank.precision_at_3.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c1d31d1d1c95429", + "metadata": {}, + "outputs": [], + "source": [ + "precision_at_3 = 
pd.read_parquet(\"llama_index_rag_with_rerank.precision_at_3.parquet\")\n", + "precision_at_3" + ] + }, + { + "cell_type": "markdown", + "id": "1819b377e7602361", + "metadata": { + "collapsed": false + }, + "source": [ + "# Merge Documents from Retrieval Spans to Q&A Spans (to Compute Q&A Correctness)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb27fd4724e0e27e", + "metadata": {}, + "outputs": [], + "source": [ + "qa_df = (\n", + " px.active_session()\n", + " .get_spans_dataframe(\"output.value is not None\", root_spans_only=True)\n", + " .set_index(\"context.trace_id\")[\n", + " [\"attributes.input.value\", \"attributes.output.value\", \"context.span_id\"]\n", + " ]\n", + " .rename({\"attributes.input.value\": \"input\", \"attributes.output.value\": \"output\"}, axis=1)\n", + ")\n", + "qa_df[\"reference\"] = retrieved_documents.groupby(\"context.trace_id\").apply(\n", + " lambda x: \"\\n\\n\".join(x.document_content)\n", + ")\n", + "qa_df.set_index(\"context.span_id\", inplace=True)\n", + "qa_df" + ] + }, + { + "cell_type": "markdown", + "id": "f4084449c986aed8", + "metadata": { + "collapsed": false + }, + "source": [ + "# Evaluate Q&A Correctness" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae507af54ce886a", + "metadata": {}, + "outputs": [], + "source": [ + "qa_correctness_eval = llm_classify(\n", + " qa_df,\n", + " model,\n", + " QA_PROMPT_TEMPLATE,\n", + " list(QA_PROMPT_RAILS_MAP.values()),\n", + " provide_explanation=True,\n", + ")\n", + "qa_correctness_eval[\"score\"] = (qa_correctness_eval[\"label\"] == \"correct\").astype(int)\n", + "qa_correctness_eval.to_parquet(\"llama_index_rag_with_rerank.qa_correctness_eval.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2041d2dcc7d02322", + "metadata": {}, + "outputs": [], + "source": [ + "qa_correctness_eval = pd.read_parquet(\"llama_index_rag_with_rerank.qa_correctness_eval.parquet\")\n", + "qa_correctness_eval" + ] + }, + { + "cell_type": "markdown", + "id": "a88f90ea9c24832b", + "metadata": { + "collapsed": false + }, + "source": [ + "# Evaluate Hallucination" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "946b5aad5d72c1f5", + "metadata": {}, + "outputs": [], + "source": [ + "hallucination_eval = llm_classify(\n", + " qa_df,\n", + " model,\n", + " HALLUCINATION_PROMPT_TEMPLATE,\n", + " list(HALLUCINATION_PROMPT_RAILS_MAP.values()),\n", + " provide_explanation=True,\n", + ")\n", + "hallucination_eval[\"score\"] = (hallucination_eval[\"label\"] == \"factual\").astype(int)\n", + "hallucination_eval.to_parquet(\"llama_index_rag_with_rerank.hallucination_eval.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8b1a6d7143c986e", + "metadata": {}, + "outputs": [], + "source": [ + "hallucination_eval = pd.read_parquet(\"llama_index_rag_with_rerank.hallucination_eval.parquet\")\n", + "hallucination_eval" + ] + }, + { + "cell_type": "markdown", + "id": "bdd4d1c641fb5e15", + "metadata": { + "collapsed": false + }, + "source": [ + "# Ingest Evaluations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7eed5bc68320bb18", + "metadata": {}, + "outputs": [], + "source": [ + "exporter = HttpExporter()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "495a5e74b469a660", + "metadata": {}, + "outputs": [], + "source": [ + "add_evaluations(exporter, retrieved_documents_eval, \"Relevance\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"20cc931d1529f84c", + "metadata": {}, + "outputs": [], + "source": [ + "add_evaluations(exporter, ndcg_at_2, \"NDCG@2\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb48a5daae9d5bcb", + "metadata": {}, + "outputs": [], + "source": [ + "add_evaluations(exporter, precision_at_3, \"Precision@2\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "848420ee90e10f62", + "metadata": {}, + "outputs": [], + "source": [ + "add_evaluations(exporter, qa_correctness_eval, \"Q&A Correctness\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c03dde5802ed98a3", + "metadata": {}, + "outputs": [], + "source": [ + "add_evaluations(exporter, hallucination_eval, \"Hallucination\")" + ] + }, + { + "cell_type": "markdown", + "id": "83dd4cd21c966504", + "metadata": { + "collapsed": false + }, + "source": [ + "# End Session" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5842da4238a93554", + "metadata": {}, + "outputs": [], + "source": [ + "# px.active_session().end()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de4947cc9a9b6e23", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 55a4eb33ff7551a0aa32226fae125efde045fcad Mon Sep 17 00:00:00 2001 From: Roger Yang Date: Thu, 16 Nov 2023 11:11:05 -0800 Subject: [PATCH 02/15] recompile proto --- src/phoenix/trace/v1/evaluation_pb2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/phoenix/trace/v1/evaluation_pb2.py b/src/phoenix/trace/v1/evaluation_pb2.py index 3e4315304b..3b00660660 100644 --- a/src/phoenix/trace/v1/evaluation_pb2.py +++ b/src/phoenix/trace/v1/evaluation_pb2.py @@ -16,7 +16,7 @@ from google.protobuf import wrappers_pb2 as google_dot_protobuf_dot_wrappers__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x19trace/v1/evaluation.proto\x12\x1bphoenix.proto.evaluation.v1\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x1cgoogle/protobuf/struct.proto\x1a\x1egoogle/protobuf/wrappers.proto\"\xfd\x04\n\nEvaluation\x12\x45\n\nsubject_id\x18\n \x01(\x0b\x32\x31.phoenix.proto.evaluation.v1.Evaluation.SubjectID\x12>\n\x06result\x18\x02 \x01(\x0b\x32..phoenix.proto.evaluation.v1.Evaluation.Result\x12\x0c\n\x04name\x18\x03 \x01(\t\x12-\n\ttimestamp\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12+\n\nattributes\x18\x05 \x01(\x0b\x32\x17.google.protobuf.Struct\x1a\xe5\x01\n\tSubjectID\x12\x12\n\x08trace_id\x18\x01 \x01(\tH\x00\x12\x11\n\x07span_id\x18\x02 \x01(\tH\x00\x12\x66\n\x15\x64ocument_retrieval_id\x18\x03 \x01(\x0b\x32\x45.phoenix.proto.evaluation.v1.Evaluation.SubjectID.DocumentRetrievalIDH\x00\x1a\x41\n\x13\x44ocumentRetrievalID\x12\x0f\n\x07span_id\x18\x01 \x01(\t\x12\x19\n\x11\x64ocument_position\x18\x02 \x01(\x05\x42\x06\n\x04kind\x1a\x95\x01\n\x06Result\x12+\n\x05score\x18\x01 \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x12+\n\x05label\x18\x02 \x01(\x0b\x32\x1c.google.protobuf.StringValue\x12\x31\n\x0b\x65xplanation\x18\x03 \x01(\x0b\x32\x1c.google.protobuf.StringValueb\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x19trace/v1/evaluation.proto\x12\x1bphoenix.proto.evaluation.v1\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x1cgoogle/protobuf/struct.proto\x1a\x1egoogle/protobuf/wrappers.proto\"\xfd\x04\n\nEvaluation\x12\x45\n\nsubject_id\x18\x01 \x01(\x0b\x32\x31.phoenix.proto.evaluation.v1.Evaluation.SubjectID\x12>\n\x06result\x18\x02 
\x01(\x0b\x32..phoenix.proto.evaluation.v1.Evaluation.Result\x12\x0c\n\x04name\x18\x03 \x01(\t\x12-\n\ttimestamp\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12+\n\nattributes\x18\x05 \x01(\x0b\x32\x17.google.protobuf.Struct\x1a\xe5\x01\n\tSubjectID\x12\x12\n\x08trace_id\x18\x01 \x01(\tH\x00\x12\x11\n\x07span_id\x18\x02 \x01(\tH\x00\x12\x66\n\x15\x64ocument_retrieval_id\x18\x03 \x01(\x0b\x32\x45.phoenix.proto.evaluation.v1.Evaluation.SubjectID.DocumentRetrievalIDH\x00\x1a\x41\n\x13\x44ocumentRetrievalID\x12\x0f\n\x07span_id\x18\x01 \x01(\t\x12\x19\n\x11\x64ocument_position\x18\x02 \x01(\x05\x42\x06\n\x04kind\x1a\x95\x01\n\x06Result\x12+\n\x05score\x18\x01 \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x12+\n\x05label\x18\x02 \x01(\x0b\x32\x1c.google.protobuf.StringValue\x12\x31\n\x0b\x65xplanation\x18\x03 \x01(\x0b\x32\x1c.google.protobuf.StringValueb\x06proto3') _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'trace.v1.evaluation_pb2', globals()) From d8ba77399db8a4aa7a19d9d11aab33445f0e7762 Mon Sep 17 00:00:00 2001 From: Roger Yang Date: Thu, 16 Nov 2023 13:58:30 -0800 Subject: [PATCH 03/15] simulate streaming --- src/phoenix/server/main.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/phoenix/server/main.py b/src/phoenix/server/main.py index 6e659ee4a3..eda3382145 100644 --- a/src/phoenix/server/main.py +++ b/src/phoenix/server/main.py @@ -161,8 +161,11 @@ def _load_items( daemon=True, ).start() fixture_evals = get_evals_from_fixture(trace_dataset_name) - for evaluation in fixture_evals: - evals.put(evaluation) + Thread( + target=_load_items, + args=(evals, fixture_evals, simulate_streaming), + daemon=True, + ).start() umap_params_list = args.umap_params.split(",") umap_params = UMAPParameters( min_dist=float(umap_params_list[0]), From ea44b3106229a92f632d74eb9cb9926c6e6db2b4 Mon Sep 17 00:00:00 2001 From: Roger Yang Date: Thu, 16 Nov 2023 14:05:00 -0800 Subject: [PATCH 04/15] remove unused functions --- src/phoenix/core/evals.py | 36 +----------------------------------- 1 file changed, 1 insertion(+), 35 deletions(-) diff --git a/src/phoenix/core/evals.py b/src/phoenix/core/evals.py index 32f8aff1e1..f33d2e702e 100644 --- a/src/phoenix/core/evals.py +++ b/src/phoenix/core/evals.py @@ -6,11 +6,10 @@ from typing import DefaultDict, Dict, List, Optional from uuid import UUID -import numpy as np from typing_extensions import TypeAlias import phoenix.trace.v1 as pb -from phoenix.trace.schemas import SpanID, TraceID +from phoenix.trace.schemas import SpanID END_OF_QUEUE = None # sentinel value for queue termination @@ -24,12 +23,6 @@ def __init__(self) -> None: weakref.finalize(self, self._queue.put, END_OF_QUEUE) self._lock = RLock() self._start_consumer() - self._trace_evaluations_by_name: DefaultDict[ - EvaluationName, Dict[TraceID, pb.Evaluation] - ] = defaultdict(dict) - self._evaluations_by_trace_id: DefaultDict[ - TraceID, Dict[EvaluationName, pb.Evaluation] - ] = defaultdict(dict) self._span_evaluations_by_name: DefaultDict[ EvaluationName, Dict[SpanID, pb.Evaluation] ] = defaultdict(dict) @@ -70,10 +63,6 @@ def _process_evaluation(self, evaluation: pb.Evaluation) -> None: span_id = UUID(subject_id.span_id) self._evaluations_by_span_id[span_id][name] = evaluation self._span_evaluations_by_name[name][span_id] = evaluation - elif subject_id_kind == "trace_id": - trace_id = UUID(subject_id.trace_id) - self._evaluations_by_span_id[trace_id][name] = evaluation - 
self._trace_evaluations_by_name[name][trace_id] = evaluation else: raise ValueError(f"unrecognized subject_id type: {type(subject_id_kind)}") @@ -91,26 +80,3 @@ def get_document_evaluations_by_span_id(self, span_id: SpanID) -> List[pb.Evalua for evaluations in self._document_evaluations_by_span_id[span_id].values(): all_evaluations.extend(evaluations.values()) return all_evaluations - - def get_document_evaluations( - self, span_id: SpanID, evaluation_name: str, num_documents: int - ) -> List[Optional[pb.Evaluation]]: - relevance_evaluations: List[Optional[pb.Evaluation]] = [None] * num_documents - with self._lock: - evaluations = self._document_evaluations_by_span_id[span_id][evaluation_name] - for document_position, document_relevance in evaluations.items(): - if document_position < len(relevance_evaluations): - relevance_evaluations[document_position] = document_relevance - return relevance_evaluations - - def get_document_evaluation_scores( - self, span_id: SpanID, evaluation_name: str, num_documents: int - ) -> List[Optional[float]]: - scores: List[Optional[float]] = [np.nan] * num_documents - with self._lock: - evaluations = self._document_evaluations_by_span_id[span_id][evaluation_name] - for document_position, document_relevance in evaluations.items(): - result = document_relevance.result - if result.HasField("score") and document_position < len(scores): - scores[document_position] = document_relevance.result.score.value - return scores From 059e3ad97602ab4ab313444cd495fd2e7fdec50a Mon Sep 17 00:00:00 2001 From: Roger Yang Date: Mon, 20 Nov 2023 07:43:01 -0800 Subject: [PATCH 05/15] fix typo --- tutorials/internal/trace_eval_ingestion_testing.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/internal/trace_eval_ingestion_testing.ipynb b/tutorials/internal/trace_eval_ingestion_testing.ipynb index d523923dbb..9e4f6ff717 100644 --- a/tutorials/internal/trace_eval_ingestion_testing.ipynb +++ b/tutorials/internal/trace_eval_ingestion_testing.ipynb @@ -19,7 +19,7 @@ " RAG_RELEVANCY_PROMPT_RAILS_MAP,\n", " RAG_RELEVANCY_PROMPT_TEMPLATE,\n", ")\n", - "from phoenix.session.evaluaton import add_evaluations, get_retrieved_documents\n", + "from phoenix.session.evaluation import add_evaluations, get_retrieved_documents\n", "from phoenix.trace.exporter import HttpExporter\n", "from sklearn.metrics import ndcg_score" ] From 0514c6f7e1467c0026d2b1009ac0097746ca904b Mon Sep 17 00:00:00 2001 From: Roger Yang Date: Mon, 20 Nov 2023 08:58:58 -0800 Subject: [PATCH 06/15] clean up gql --- app/schema.graphql | 10 ++--- src/phoenix/core/evals.py | 5 +-- src/phoenix/server/api/types/Evaluation.py | 20 ++++++--- src/phoenix/server/api/types/Span.py | 45 +++++++++++-------- .../trace_eval_ingestion_testing.ipynb | 2 +- 5 files changed, 48 insertions(+), 34 deletions(-) diff --git a/app/schema.graphql b/app/schema.graphql index 427ce3a729..fef570f076 100644 --- a/app/schema.graphql +++ b/app/schema.graphql @@ -184,7 +184,10 @@ type DocumentEvaluation implements Evaluation { score: Float label: String explanation: String - spanId: String! + + """ + The zero-based index among retrieved documents, which is collected as a list (even when ordering is not inherently meaningful). + """ documentPosition: Int! } @@ -543,11 +546,7 @@ type Span { Cumulative (completion) token count from self and all descendant spans (children, grandchildren, etc.) """ cumulativeTokenCountCompletion: Int - - """Span evaluations""" spanEvaluations: [SpanEvaluation!]! 
- - """Document evaluations""" documentEvaluations: [DocumentEvaluation!]! """All descendant spans (children, grandchildren, etc.)""" @@ -586,7 +585,6 @@ type SpanEvaluation implements Evaluation { score: Float label: String explanation: String - spanId: String! } type SpanEvent { diff --git a/src/phoenix/core/evals.py b/src/phoenix/core/evals.py index f33d2e702e..4972a38fd1 100644 --- a/src/phoenix/core/evals.py +++ b/src/phoenix/core/evals.py @@ -4,7 +4,6 @@ from threading import RLock, Thread from types import MethodType from typing import DefaultDict, Dict, List, Optional -from uuid import UUID from typing_extensions import TypeAlias @@ -56,11 +55,11 @@ def _process_evaluation(self, evaluation: pb.Evaluation) -> None: subject_id_kind = subject_id.WhichOneof("kind") if subject_id_kind == "document_retrieval_id": document_retrieval_id = subject_id.document_retrieval_id - span_id = UUID(document_retrieval_id.span_id) + span_id = SpanID(document_retrieval_id.span_id) document_position = document_retrieval_id.document_position self._document_evaluations_by_span_id[span_id][name][document_position] = evaluation elif subject_id_kind == "span_id": - span_id = UUID(subject_id.span_id) + span_id = SpanID(subject_id.span_id) self._evaluations_by_span_id[span_id][name] = evaluation self._span_evaluations_by_name[name][span_id] = evaluation else: diff --git a/src/phoenix/server/api/types/Evaluation.py b/src/phoenix/server/api/types/Evaluation.py index c3762ddd93..da47f0b728 100644 --- a/src/phoenix/server/api/types/Evaluation.py +++ b/src/phoenix/server/api/types/Evaluation.py @@ -3,6 +3,7 @@ import strawberry import phoenix.trace.v1 as pb +from phoenix.trace.schemas import SpanID @strawberry.interface @@ -15,7 +16,7 @@ class Evaluation: @strawberry.type class SpanEvaluation(Evaluation): - span_id: str + span_id: strawberry.Private[SpanID] @staticmethod def from_pb_evaluation(evaluation: pb.Evaluation) -> "SpanEvaluation": @@ -23,19 +24,23 @@ def from_pb_evaluation(evaluation: pb.Evaluation) -> "SpanEvaluation": score = result.score.value if result.HasField("score") else None label = result.label.value if result.HasField("label") else None explanation = result.explanation.value if result.HasField("explanation") else None + span_id = SpanID(evaluation.subject_id.span_id) return SpanEvaluation( name=evaluation.name, score=score, label=label, explanation=explanation, - span_id=evaluation.subject_id.span_id, + span_id=span_id, ) @strawberry.type class DocumentEvaluation(Evaluation): - span_id: str - document_position: int + span_id: strawberry.Private[SpanID] + document_position: int = strawberry.field( + description="The zero-based index among retrieved documents, which " + "is collected as a list (even when ordering is not inherently meaningful)." 
+ ) @staticmethod def from_pb_evaluation(evaluation: pb.Evaluation) -> "DocumentEvaluation": @@ -43,11 +48,14 @@ def from_pb_evaluation(evaluation: pb.Evaluation) -> "DocumentEvaluation": score = result.score.value if result.HasField("score") else None label = result.label.value if result.HasField("label") else None explanation = result.explanation.value if result.HasField("explanation") else None + document_retrieval_id = evaluation.subject_id.document_retrieval_id + document_position = document_retrieval_id.document_position + span_id = SpanID(document_retrieval_id.span_id) return DocumentEvaluation( name=evaluation.name, score=score, label=label, explanation=explanation, - span_id=evaluation.subject_id.document_retrieval_id.span_id, - document_position=evaluation.subject_id.document_retrieval_id.document_position, + document_position=document_position, + span_id=span_id, ) diff --git a/src/phoenix/server/api/types/Span.py b/src/phoenix/server/api/types/Span.py index c53c5366aa..869ba58252 100644 --- a/src/phoenix/server/api/types/Span.py +++ b/src/phoenix/server/api/types/Span.py @@ -124,14 +124,32 @@ class Span: description="Cumulative (completion) token count from self and all " "descendant spans (children, grandchildren, etc.)", ) - span_evaluations: List[SpanEvaluation] = strawberry.field( - description="Span evaluations", - default_factory=list, - ) - document_evaluations: List[DocumentEvaluation] = strawberry.field( - description="Document evaluations", - default_factory=list, - ) + + @strawberry.field + def span_evaluations( + self, + info: Info[Context, None], + ) -> List[SpanEvaluation]: + if not (evals := info.context.evals): + return [] + span_id = SpanID(str(self.context.span_id)) + return [ + SpanEvaluation.from_pb_evaluation(evaluation) + for evaluation in evals.get_evaluations_by_span_id(span_id) + ] + + @strawberry.field + def document_evaluations( + self, + info: Info[Context, None], + ) -> List[DocumentEvaluation]: + if not (evals := info.context.evals): + return [] + span_id = SpanID(str(self.context.span_id)) + return [ + DocumentEvaluation.from_pb_evaluation(evaluation) + for evaluation in evals.get_document_evaluations_by_span_id(span_id) + ] @strawberry.field( description="All descendant spans (children, grandchildren, etc.)", @@ -154,13 +172,6 @@ def to_gql_span(span: trace_schema.Span, evals: Optional[Evals] = None) -> "Span events: List[SpanEvent] = list(map(SpanEvent.from_event, span.events)) input_value = cast(Optional[str], span.attributes.get(INPUT_VALUE)) output_value = cast(Optional[str], span.attributes.get(OUTPUT_VALUE)) - span_evaluations: List[SpanEvaluation] = [] - document_evaluations: List[DocumentEvaluation] = [] - span_id = span.context.span_id - for evaluation in evals.get_evaluations_by_span_id(span_id) if evals else (): - span_evaluations.append(SpanEvaluation.from_pb_evaluation(evaluation)) - for evaluation in evals.get_document_evaluations_by_span_id(span_id) if evals else (): - document_evaluations.append(DocumentEvaluation.from_pb_evaluation(evaluation)) return Span( name=span.name, status_code=SpanStatusCode(span.status_code), @@ -171,7 +182,7 @@ def to_gql_span(span: trace_schema.Span, evals: Optional[Evals] = None) -> "Span latency_ms=cast(Optional[float], span.attributes.get(ComputedAttributes.LATENCY_MS.value)), context=SpanContext( trace_id=cast(ID, span.context.trace_id), - span_id=cast(ID, span_id), + span_id=cast(ID, span.context.span_id), ), attributes=json.dumps( _nested_attributes(_hide_embedding_vectors(span.attributes)), @@ 
-218,8 +229,6 @@ def to_gql_span(span: trace_schema.Span, evals: Optional[Evals] = None) -> "Span if output_value is not None else None ), - span_evaluations=span_evaluations, - document_evaluations=document_evaluations, ) diff --git a/tutorials/internal/trace_eval_ingestion_testing.ipynb b/tutorials/internal/trace_eval_ingestion_testing.ipynb index 9e4f6ff717..a08bac96c9 100644 --- a/tutorials/internal/trace_eval_ingestion_testing.ipynb +++ b/tutorials/internal/trace_eval_ingestion_testing.ipynb @@ -444,7 +444,7 @@ "metadata": {}, "outputs": [], "source": [ - "# px.active_session().end()" + "px.active_session().end()" ] }, { From 196963e6df4adbb418dbdf49f5dbf3db6b391116 Mon Sep 17 00:00:00 2001 From: Roger Yang Date: Mon, 20 Nov 2023 13:22:13 -0800 Subject: [PATCH 07/15] fix receiver --- src/phoenix/server/evaluation_handler.py | 41 ++++++++++++++---------- src/phoenix/trace/exporter.py | 2 +- 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/src/phoenix/server/evaluation_handler.py b/src/phoenix/server/evaluation_handler.py index 55460bf322..238476272f 100644 --- a/src/phoenix/server/evaluation_handler.py +++ b/src/phoenix/server/evaluation_handler.py @@ -1,10 +1,11 @@ import gzip from typing import Protocol +from google.protobuf.message import DecodeError from starlette.endpoints import HTTPEndpoint from starlette.requests import Request from starlette.responses import Response -from starlette.status import HTTP_422_UNPROCESSABLE_ENTITY +from starlette.status import HTTP_415_UNSUPPORTED_MEDIA_TYPE, HTTP_422_UNPROCESSABLE_ENTITY import phoenix.trace.v1 as pb @@ -18,22 +19,28 @@ class EvaluationHandler(HTTPEndpoint): queue: SupportsPutEvaluation async def post(self, request: Request) -> Response: + content_type = request.headers.get("content-type") + if content_type != "application/x-protobuf": + return Response( + content="Unsupported content type", + status_code=HTTP_415_UNSUPPORTED_MEDIA_TYPE, + ) + body = await request.body() + content_encoding = request.headers.get("content-encoding") + if content_encoding == "gzip": + body = gzip.decompress(body) + elif content_encoding: + return Response( + content="Unsupported content encoding", + status_code=HTTP_415_UNSUPPORTED_MEDIA_TYPE, + ) + evaluation = pb.Evaluation() try: - content_type = request.headers.get("content-type") - if content_type == "application/x-protobuf": - body = await request.body() - content_encoding = request.headers.get("content-encoding") - if content_encoding == "gzip": - body = gzip.decompress(body) - elif content_encoding is not None: - raise NotImplementedError(f"Unsupported content-encoding: {content_encoding}") - evaluation = pb.Evaluation() - evaluation.ParseFromString(body) - else: - raise NotImplementedError(f"Unsupported content-type: {content_type}") - except NotImplementedError as e: - return Response(str(e), status_code=HTTP_422_UNPROCESSABLE_ENTITY) - except Exception: - return Response(status_code=HTTP_422_UNPROCESSABLE_ENTITY) + evaluation.ParseFromString(body) + except DecodeError: + return Response( + content="Request body is invalid", + status_code=HTTP_422_UNPROCESSABLE_ENTITY, + ) self.queue.put(evaluation) return Response() diff --git a/src/phoenix/trace/exporter.py b/src/phoenix/trace/exporter.py index a077d21a85..f7b71fad4e 100644 --- a/src/phoenix/trace/exporter.py +++ b/src/phoenix/trace/exporter.py @@ -86,7 +86,7 @@ def _send(self, message: Message) -> None: serialized = message.SerializeToString() data = gzip.compress(serialized) try: - self._session.post(self._url(message), 
data=data) + self._session.post(self._url(message), data=data).raise_for_status() except Exception as e: logger.exception(e) From ae753d97af1cdc81ed885c2f01fcb9310ea26f62 Mon Sep 17 00:00:00 2001 From: Roger Yang Date: Mon, 20 Nov 2023 13:47:58 -0800 Subject: [PATCH 08/15] add back dropped param --- src/phoenix/session/session.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/phoenix/session/session.py b/src/phoenix/session/session.py index dfdc587e04..3ff682eb17 100644 --- a/src/phoenix/session/session.py +++ b/src/phoenix/session/session.py @@ -282,6 +282,7 @@ def __init__( model=self.model, corpus=self.corpus, traces=self.traces, + evals=self.evals, umap_params=self.umap_parameters, ) self.server = ThreadServer( From fc48438fef778c8709c4417cc702a402be9c2cdb Mon Sep 17 00:00:00 2001 From: Roger Yang Date: Mon, 20 Nov 2023 13:48:52 -0800 Subject: [PATCH 09/15] clean up notebook --- .../trace_eval_ingestion_testing.ipynb | 20 +------------------ 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/tutorials/internal/trace_eval_ingestion_testing.ipynb b/tutorials/internal/trace_eval_ingestion_testing.ipynb index a08bac96c9..0585cf0c3b 100644 --- a/tutorials/internal/trace_eval_ingestion_testing.ipynb +++ b/tutorials/internal/trace_eval_ingestion_testing.ipynb @@ -115,16 +115,6 @@ "# Evaluate Document Relevance" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "ff130bce795ea35f", - "metadata": {}, - "outputs": [], - "source": [ - "retrieved_documents" - ] - }, { "cell_type": "code", "execution_count": null, @@ -444,16 +434,8 @@ "metadata": {}, "outputs": [], "source": [ - "px.active_session().end()" + "# px.active_session().end()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "de4947cc9a9b6e23", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From 7408f38a68de09c1df8567a5d405151f5a1854d5 Mon Sep 17 00:00:00 2001 From: Roger Yang Date: Mon, 20 Nov 2023 14:14:22 -0800 Subject: [PATCH 10/15] improve gql descriptions --- app/schema.graphql | 8 ++++++++ src/phoenix/server/api/types/Span.py | 15 +++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/app/schema.graphql b/app/schema.graphql index fef570f076..2642e13f10 100644 --- a/app/schema.graphql +++ b/app/schema.graphql @@ -546,7 +546,15 @@ type Span { Cumulative (completion) token count from self and all descendant spans (children, grandchildren, etc.) """ cumulativeTokenCountCompletion: Int + + """ + Evaluations associated with the span, e.g. if the span is an LLM, an evaluation may access the helpfulness of its response with respect to its input. + """ spanEvaluations: [SpanEvaluation!]! + + """ + Evaluations of the documents associated with the span, e.g. if the span is a RETRIEVER with a list of documents in its RETRIEVAL_DOCUMENTS attribute, an evaluation for each document may assess its relevance with respect to the input query of the span. Note that RETRIEVAL_DOCUMENTS is a list, and each evaluation is identified by its document's (zero-based) index in that list. + """ documentEvaluations: [DocumentEvaluation!]! 
"""All descendant spans (children, grandchildren, etc.)""" diff --git a/src/phoenix/server/api/types/Span.py b/src/phoenix/server/api/types/Span.py index 869ba58252..8b25466a6f 100644 --- a/src/phoenix/server/api/types/Span.py +++ b/src/phoenix/server/api/types/Span.py @@ -125,7 +125,11 @@ class Span: "descendant spans (children, grandchildren, etc.)", ) - @strawberry.field + @strawberry.field( + description="Evaluations associated with the span, e.g. if the span is " + "an LLM, an evaluation may access the helpfulness of its response with " + "respect to its input." + ) # type: ignore def span_evaluations( self, info: Info[Context, None], @@ -138,7 +142,14 @@ def span_evaluations( for evaluation in evals.get_evaluations_by_span_id(span_id) ] - @strawberry.field + @strawberry.field( + description="Evaluations of the documents associated with the span, e.g. " + "if the span is a RETRIEVER with a list of documents in its RETRIEVAL_DOCUMENTS " + "attribute, an evaluation for each document may assess its relevance " + "respect to the input query of the span. Note that RETRIEVAL_DOCUMENTS is " + "a list, and each evaluation is identified by its document's (zero-based) " + "index in that list." + ) # type: ignore def document_evaluations( self, info: Info[Context, None], From 1ab6acc696b5fab683daf7c5dac124f9b1e016d1 Mon Sep 17 00:00:00 2001 From: Roger Yang Date: Mon, 20 Nov 2023 14:38:26 -0800 Subject: [PATCH 11/15] clean up gql --- app/schema.graphql | 37 ++++++++++++++++++++++ src/phoenix/server/api/schema.py | 7 ++-- src/phoenix/server/api/types/Evaluation.py | 18 ++++++++--- src/phoenix/server/api/types/Span.py | 5 ++- 4 files changed, 58 insertions(+), 9 deletions(-) diff --git a/app/schema.graphql b/app/schema.graphql index 2642e13f10..9f267d8ef1 100644 --- a/app/schema.graphql +++ b/app/schema.graphql @@ -180,9 +180,20 @@ type DimensionWithValue { } type DocumentEvaluation implements Evaluation { + """Name of the evaluation, e.g. 'helpfulness' or 'relevance'.""" name: String! + + """Result of the evaluation in the form of a numeric score.""" score: Float + + """ + Result of the evaluation in the form of a string, e.g. 'helpful' or 'not helpful'. Note that the label is not necessarily binary. + """ label: String + + """ + The evaluator's explanation for the evaluation result (i.e. score or label, or both) given to the subject. + """ explanation: String """ @@ -274,9 +285,20 @@ type EmbeddingMetadata { } interface Evaluation { + """Name of the evaluation, e.g. 'helpfulness' or 'relevance'.""" name: String! + + """Result of the evaluation in the form of a numeric score.""" score: Float + + """ + Result of the evaluation in the form of a string, e.g. 'helpful' or 'not helpful'. Note that the label is not necessarily binary. + """ label: String + + """ + The evaluator's explanation for the evaluation result (i.e. score or label, or both) given to the subject. + """ explanation: String } @@ -479,6 +501,10 @@ type Query { clusterSelectionEpsilon: Float! = 0 ): [Cluster!]! spans(timeRange: TimeRange, traceIds: [ID!], first: Int = 50, last: Int, after: String, before: String, sort: SpanSort, rootSpansOnly: Boolean = false, filterCondition: String = null): SpanConnection! + + """ + Names of all available evaluations for spans. (The list contains no duplicates.) + """ spanEvaluationNames: [String!]! traceDatasetInfo: TraceDatasetInfo validateSpanFilterCondition(condition: String!): ValidationResult! 
@@ -589,9 +615,20 @@ type SpanEdge { } type SpanEvaluation implements Evaluation { + """Name of the evaluation, e.g. 'helpfulness' or 'relevance'.""" name: String! + + """Result of the evaluation in the form of a numeric score.""" score: Float + + """ + Result of the evaluation in the form of a string, e.g. 'helpful' or 'not helpful'. Note that the label is not necessarily binary. + """ label: String + + """ + The evaluator's explanation for the evaluation result (i.e. score or label, or both) given to the subject. + """ explanation: String } diff --git a/src/phoenix/server/api/schema.py b/src/phoenix/server/api/schema.py index 37d4b2abb2..844cdb32a8 100644 --- a/src/phoenix/server/api/schema.py +++ b/src/phoenix/server/api/schema.py @@ -236,10 +236,13 @@ def spans( spans = filter(predicate, spans) if sort: spans = sort(spans) - data = [to_gql_span(span, info.context.evals) for span in spans] + data = list(map(to_gql_span, spans)) return connection_from_list(data=data, args=args) - @strawberry.field + @strawberry.field( + description="Names of all available evaluations for spans. " + "(The list contains no duplicates.)" + ) # type: ignore def span_evaluation_names( self, info: Info[Context, None], diff --git a/src/phoenix/server/api/types/Evaluation.py b/src/phoenix/server/api/types/Evaluation.py index da47f0b728..5b4ccd0915 100644 --- a/src/phoenix/server/api/types/Evaluation.py +++ b/src/phoenix/server/api/types/Evaluation.py @@ -8,10 +8,20 @@ @strawberry.interface class Evaluation: - name: str - score: Optional[float] - label: Optional[str] - explanation: Optional[str] + name: str = strawberry.field( + description="Name of the evaluation, e.g. 'helpfulness' or 'relevance'." + ) + score: Optional[float] = strawberry.field( + description="Result of the evaluation in the form of a numeric score." + ) + label: Optional[str] = strawberry.field( + description="Result of the evaluation in the form of a string, e.g. " + "'helpful' or 'not helpful'. Note that the label is not necessarily binary." + ) + explanation: Optional[str] = strawberry.field( + description="The evaluator's explanation for the evaluation result (i.e. " + "score or label, or both) given to the subject." 
+ ) @strawberry.type diff --git a/src/phoenix/server/api/types/Span.py b/src/phoenix/server/api/types/Span.py index 8b25466a6f..88edcf3717 100644 --- a/src/phoenix/server/api/types/Span.py +++ b/src/phoenix/server/api/types/Span.py @@ -9,7 +9,6 @@ from strawberry.types import Info import phoenix.trace.schemas as trace_schema -from phoenix.core.evals import Evals from phoenix.core.traces import ComputedAttributes from phoenix.server.api.context import Context from phoenix.server.api.types.Evaluation import DocumentEvaluation, SpanEvaluation @@ -172,14 +171,14 @@ def descendants( if (traces := info.context.traces) is None: return [] return [ - to_gql_span(cast(trace_schema.Span, traces[span_id]), info.context.evals) + to_gql_span(cast(trace_schema.Span, traces[span_id])) for span_id in traces.get_descendant_span_ids( cast(SpanID, self.context.span_id), ) ] -def to_gql_span(span: trace_schema.Span, evals: Optional[Evals] = None) -> "Span": +def to_gql_span(span: trace_schema.Span) -> "Span": events: List[SpanEvent] = list(map(SpanEvent.from_event, span.events)) input_value = cast(Optional[str], span.attributes.get(INPUT_VALUE)) output_value = cast(Optional[str], span.attributes.get(OUTPUT_VALUE)) From 0b7fded7b2edf66f82540729e7616a88827bc6f8 Mon Sep 17 00:00:00 2001 From: Roger Yang Date: Mon, 20 Nov 2023 14:47:14 -0800 Subject: [PATCH 12/15] fix typo --- src/phoenix/proto/trace/v1/evaluation.proto | 2 +- src/phoenix/trace/v1/evaluation_pb2.pyi | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/phoenix/proto/trace/v1/evaluation.proto b/src/phoenix/proto/trace/v1/evaluation.proto index 0358f0b9bd..93441161a6 100644 --- a/src/phoenix/proto/trace/v1/evaluation.proto +++ b/src/phoenix/proto/trace/v1/evaluation.proto @@ -8,7 +8,7 @@ message Evaluation { message SubjectId { message DocumentRetrievalId { string span_id = 1; - int32 document_position = 2; // zero-based-index + int32 document_position = 2; // zero-based index } oneof kind { string trace_id = 1; diff --git a/src/phoenix/trace/v1/evaluation_pb2.pyi b/src/phoenix/trace/v1/evaluation_pb2.pyi index f6f18fc976..ab34b84f5c 100644 --- a/src/phoenix/trace/v1/evaluation_pb2.pyi +++ b/src/phoenix/trace/v1/evaluation_pb2.pyi @@ -31,7 +31,7 @@ class Evaluation(google.protobuf.message.Message): DOCUMENT_POSITION_FIELD_NUMBER: builtins.int span_id: builtins.str document_position: builtins.int - """zero-based-index""" + """zero-based index""" def __init__( self, *, From 15f71f5ecf1c88a826a6ad37062c155ecb308a91 Mon Sep 17 00:00:00 2001 From: Roger Yang Date: Mon, 20 Nov 2023 14:54:15 -0800 Subject: [PATCH 13/15] fix typo --- app/schema.graphql | 2 +- src/phoenix/server/api/types/Span.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/app/schema.graphql b/app/schema.graphql index 9f267d8ef1..7c1e164987 100644 --- a/app/schema.graphql +++ b/app/schema.graphql @@ -574,7 +574,7 @@ type Span { cumulativeTokenCountCompletion: Int """ - Evaluations associated with the span, e.g. if the span is an LLM, an evaluation may access the helpfulness of its response with respect to its input. + Evaluations associated with the span, e.g. if the span is an LLM, an evaluation may assess the helpfulness of its response with respect to its input. """ spanEvaluations: [SpanEvaluation!]! 
diff --git a/src/phoenix/server/api/types/Span.py b/src/phoenix/server/api/types/Span.py index 88edcf3717..4de44d14a6 100644 --- a/src/phoenix/server/api/types/Span.py +++ b/src/phoenix/server/api/types/Span.py @@ -126,7 +126,7 @@ class Span: @strawberry.field( description="Evaluations associated with the span, e.g. if the span is " - "an LLM, an evaluation may access the helpfulness of its response with " + "an LLM, an evaluation may assess the helpfulness of its response with " "respect to its input." ) # type: ignore def span_evaluations( From ada921e127b841b50d332840601ec1c373f1bc26 Mon Sep 17 00:00:00 2001 From: Roger Yang Date: Mon, 20 Nov 2023 16:01:37 -0800 Subject: [PATCH 14/15] handle missing values as result of interrupts --- src/phoenix/session/evaluation.py | 22 ++++++---- .../trace_eval_ingestion_testing.ipynb | 40 ++++++++++++++----- 2 files changed, 46 insertions(+), 16 deletions(-) diff --git a/src/phoenix/session/evaluation.py b/src/phoenix/session/evaluation.py index f26d628624..a4de5e1029 100644 --- a/src/phoenix/session/evaluation.py +++ b/src/phoenix/session/evaluation.py @@ -6,6 +6,7 @@ - ingest evaluation results into Phoenix via HttpExporter """ import collections +import math from typing import ( Any, Iterable, @@ -54,9 +55,9 @@ def get_retrieved_documents(session: Session) -> pd.DataFrame: { "context.trace_id": trace_id, "context.span_id": span_id, - "query": query, + "input": query, "document_position": position, - "document_content": document.get(DOCUMENT_CONTENT), + "reference": document.get(DOCUMENT_CONTENT), "document_score": document.get(DOCUMENT_SCORE), } ) @@ -64,8 +65,8 @@ def get_retrieved_documents(session: Session) -> pd.DataFrame: columns = [ "context.span_id", "document_position", - "query", - "document_content", + "input", + "reference", "document_score", "context.trace_id", ] @@ -83,7 +84,8 @@ def add_evaluations( index_names, cast(Union[str, Tuple[Any]], index), ) - result = _extract_result(row) + if (result := _extract_result(row)) is None: + continue evaluation = pb.Evaluation( name=evaluation_name, result=result, @@ -134,15 +136,21 @@ def _extract_subject_id_from_index( assert isinstance(value, str) if names[0] in ("context.span_id", "span_id"): return pb.Evaluation.SubjectId(span_id=value) - elif names[0] in ("context.trace_id", "trace_id"): + if names[0] in ("context.trace_id", "trace_id"): return pb.Evaluation.SubjectId(trace_id=value) raise ValueError(f"Unexpected index names: {names}") -def _extract_result(row: "pd.Series[Any]") -> pb.Evaluation.Result: +def _extract_result(row: "pd.Series[Any]") -> Optional[pb.Evaluation.Result]: score = cast(Optional[float], row.get("score")) label = cast(Optional[str], row.get("label")) explanation = cast(Optional[str], row.get("explanation")) + if ( + (score is None or isinstance(score, float) and math.isnan(score)) + and not label + and not explanation + ): + return None return pb.Evaluation.Result( score=DoubleValue(value=score) if score is not None else None, label=StringValue(value=label) if label else None, diff --git a/tutorials/internal/trace_eval_ingestion_testing.ipynb b/tutorials/internal/trace_eval_ingestion_testing.ipynb index 0585cf0c3b..2925afa704 100644 --- a/tutorials/internal/trace_eval_ingestion_testing.ipynb +++ b/tutorials/internal/trace_eval_ingestion_testing.ipynb @@ -7,6 +7,7 @@ "metadata": {}, "outputs": [], "source": [ + "import numpy as np\n", "import pandas as pd\n", "import phoenix as px\n", "from phoenix.experimental.evals.functions import llm_classify\n", @@ -123,13 
+124,15 @@ "outputs": [], "source": [ "retrieved_documents_eval = llm_classify(\n", - " retrieved_documents.rename({\"query\": \"input\", \"document_content\": \"reference\"}, axis=1),\n", + " retrieved_documents,\n", " model,\n", " RAG_RELEVANCY_PROMPT_TEMPLATE,\n", " list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values()),\n", " provide_explanation=True,\n", ")\n", - "retrieved_documents_eval[\"score\"] = (retrieved_documents_eval[\"label\"] == \"relevant\").astype(int)\n", + "retrieved_documents_eval[\"score\"] = (\n", + " retrieved_documents_eval.label[~retrieved_documents_eval.label.isna()] == \"relevant\"\n", + ").astype(int)\n", "retrieved_documents_eval.to_parquet(\"llama_index_rag_with_rerank.documents_eval.parquet\")" ] }, @@ -182,11 +185,26 @@ "metadata": {}, "outputs": [], "source": [ + "def _compute_ndcg(df: pd.DataFrame, k: int):\n", + " \"\"\"Compute NDCG@k in the presence of missing values (e.g. as a result of keyboard interrupt).\"\"\"\n", + " eval_scores = [np.nan] * k\n", + " pred_scores = [np.nan] * k\n", + " for i in range(k):\n", + " if i >= len(df.eval_score):\n", + " break\n", + " eval_scores[i] = df.eval_score[i]\n", + " pred_scores[i] = df.document_score[i]\n", + " try:\n", + " return ndcg_score([eval_scores], [pred_scores])\n", + " except ValueError:\n", + " return np.nan\n", + "\n", + "\n", "ndcg_at_2 = pd.DataFrame(\n", " {\n", - " \"score\": combined.groupby(\"context.span_id\").apply(\n", - " lambda x: ndcg_score([x.eval_score[:2]], [x.document_score[:2]])\n", - " )\n", + " \"score\": combined\n", + " .groupby(\"context.span_id\")\n", + " .apply(_compute_ndcg, k=2)\n", " }\n", ")\n", "ndcg_at_2.to_parquet(\"llama_index_rag_with_rerank.ndcg_at_2.parquet\")" @@ -221,7 +239,7 @@ "outputs": [], "source": [ "precision_at_3 = pd.DataFrame(\n", - " {\"score\": combined.groupby(\"context.span_id\").apply(lambda x: x.eval_score[:3].sum() / 3)}\n", + " {\"score\": combined.groupby(\"context.span_id\").apply(lambda x: x.eval_score[:3].sum(skipna=False) / 3)}\n", ")\n", "precision_at_3.to_parquet(\"llama_index_rag_with_rerank.precision_at_3.parquet\")" ] @@ -263,7 +281,7 @@ " .rename({\"attributes.input.value\": \"input\", \"attributes.output.value\": \"output\"}, axis=1)\n", ")\n", "qa_df[\"reference\"] = retrieved_documents.groupby(\"context.trace_id\").apply(\n", - " lambda x: \"\\n\\n\".join(x.document_content)\n", + " lambda x: \"\\n\\n\".join(x.reference)\n", ")\n", "qa_df.set_index(\"context.span_id\", inplace=True)\n", "qa_df" @@ -293,7 +311,9 @@ " list(QA_PROMPT_RAILS_MAP.values()),\n", " provide_explanation=True,\n", ")\n", - "qa_correctness_eval[\"score\"] = (qa_correctness_eval[\"label\"] == \"correct\").astype(int)\n", + "qa_correctness_eval[\"score\"] = (\n", + " qa_correctness_eval.label[~qa_correctness_eval.label.isna()] == \"correct\"\n", + ").astype(int)\n", "qa_correctness_eval.to_parquet(\"llama_index_rag_with_rerank.qa_correctness_eval.parquet\")" ] }, @@ -332,7 +352,9 @@ " list(HALLUCINATION_PROMPT_RAILS_MAP.values()),\n", " provide_explanation=True,\n", ")\n", - "hallucination_eval[\"score\"] = (hallucination_eval[\"label\"] == \"factual\").astype(int)\n", + "hallucination_eval[\"score\"] = (\n", + " hallucination_eval.label[~hallucination_eval.label.isna()] == \"factual\"\n", + ").astype(int)\n", "hallucination_eval.to_parquet(\"llama_index_rag_with_rerank.hallucination_eval.parquet\")" ] }, From 7357c6b412d903539c45a4de70cd80bd0b28e22b Mon Sep 17 00:00:00 2001 From: Roger Yang Date: Mon, 20 Nov 2023 16:12:09 -0800 Subject: [PATCH 15/15] fix format --- 
.../internal/trace_eval_ingestion_testing.ipynb | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/tutorials/internal/trace_eval_ingestion_testing.ipynb b/tutorials/internal/trace_eval_ingestion_testing.ipynb index 2925afa704..bc07fcedbf 100644 --- a/tutorials/internal/trace_eval_ingestion_testing.ipynb +++ b/tutorials/internal/trace_eval_ingestion_testing.ipynb @@ -200,13 +200,7 @@ " return np.nan\n", "\n", "\n", - "ndcg_at_2 = pd.DataFrame(\n", - " {\n", - " \"score\": combined\n", - " .groupby(\"context.span_id\")\n", - " .apply(_compute_ndcg, k=2)\n", - " }\n", - ")\n", + "ndcg_at_2 = pd.DataFrame({\"score\": combined.groupby(\"context.span_id\").apply(_compute_ndcg, k=2)})\n", "ndcg_at_2.to_parquet(\"llama_index_rag_with_rerank.ndcg_at_2.parquet\")" ] }, @@ -239,7 +233,11 @@ "outputs": [], "source": [ "precision_at_3 = pd.DataFrame(\n", - " {\"score\": combined.groupby(\"context.span_id\").apply(lambda x: x.eval_score[:3].sum(skipna=False) / 3)}\n", + " {\n", + " \"score\": combined.groupby(\"context.span_id\").apply(\n", + " lambda x: x.eval_score[:3].sum(skipna=False) / 3\n", + " )\n", + " }\n", ")\n", "precision_at_3.to_parquet(\"llama_index_rag_with_rerank.precision_at_3.parquet\")" ]