diff --git a/app/schema.graphql b/app/schema.graphql index edf634e517..7c1e164987 100644 --- a/app/schema.graphql +++ b/app/schema.graphql @@ -179,6 +179,29 @@ type DimensionWithValue { value: String } +type DocumentEvaluation implements Evaluation { + """Name of the evaluation, e.g. 'helpfulness' or 'relevance'.""" + name: String! + + """Result of the evaluation in the form of a numeric score.""" + score: Float + + """ + Result of the evaluation in the form of a string, e.g. 'helpful' or 'not helpful'. Note that the label is not necessarily binary. + """ + label: String + + """ + The evaluator's explanation for the evaluation result (i.e. score or label, or both) given to the subject. + """ + explanation: String + + """ + The zero-based index among retrieved documents, which are collected as a list (even when ordering is not inherently meaningful). + """ + documentPosition: Int! +} + type DriftTimeSeries implements TimeSeries { data: [TimeSeriesDataPoint!]! } @@ -261,6 +284,24 @@ type EmbeddingMetadata { linkToData: String } +interface Evaluation { + """Name of the evaluation, e.g. 'helpfulness' or 'relevance'.""" + name: String! + + """Result of the evaluation in the form of a numeric score.""" + score: Float + + """ + Result of the evaluation in the form of a string, e.g. 'helpful' or 'not helpful'. Note that the label is not necessarily binary. + """ + label: String + + """ + The evaluator's explanation for the evaluation result (i.e. score or label, or both) given to the subject. + """ + explanation: String +} + type Event { id: ID! eventMetadata: EventMetadata! @@ -460,6 +501,11 @@ type Query { clusterSelectionEpsilon: Float! = 0 ): [Cluster!]! spans(timeRange: TimeRange, traceIds: [ID!], first: Int = 50, last: Int, after: String, before: String, sort: SpanSort, rootSpansOnly: Boolean = false, filterCondition: String = null): SpanConnection! + + """ + Names of all available evaluations for spans. (The list contains no duplicates.) + """ + spanEvaluationNames: [String!]! traceDatasetInfo: TraceDatasetInfo validateSpanFilterCondition(condition: String!): ValidationResult! } @@ -527,6 +573,16 @@ type Span { """ cumulativeTokenCountCompletion: Int + """ + Evaluations associated with the span, e.g. if the span is an LLM, an evaluation may assess the helpfulness of its response with respect to its input. + """ + spanEvaluations: [SpanEvaluation!]! + + """ + Evaluations of the documents associated with the span, e.g. if the span is a RETRIEVER with a list of documents in its RETRIEVAL_DOCUMENTS attribute, an evaluation for each document may assess its relevance with respect to the input query of the span. Note that RETRIEVAL_DOCUMENTS is a list, and each evaluation is identified by its document's (zero-based) index in that list. + """ + documentEvaluations: [DocumentEvaluation!]! + """All descendant spans (children, grandchildren, etc.)""" descendants: [Span!]! } @@ -558,6 +614,24 @@ type SpanEdge { cursor: String! } +type SpanEvaluation implements Evaluation { + """Name of the evaluation, e.g. 'helpfulness' or 'relevance'.""" + name: String! + + """Result of the evaluation in the form of a numeric score.""" + score: Float + + """ + Result of the evaluation in the form of a string, e.g. 'helpful' or 'not helpful'. Note that the label is not necessarily binary. + """ + label: String + + """ + The evaluator's explanation for the evaluation result (i.e. score or label, or both) given to the subject. + """ + explanation: String +} + type SpanEvent { name: String! message: String!
diff --git a/pyproject.toml b/pyproject.toml index 0f3524a6bd..092f57c7b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -234,7 +234,10 @@ dependencies = [ ] [tool.hatch.envs.proto.scripts] -recompile = "python -m grpc_tools.protoc -I src/phoenix/proto --python_out=src/phoenix --mypy_out=src/phoenix src/phoenix/proto/trace/v1/trace.proto" +recompile = """ +python -m grpc_tools.protoc -I src/phoenix/proto --python_out=src/phoenix --mypy_out=src/phoenix src/phoenix/proto/trace/v1/trace.proto && +python -m grpc_tools.protoc -I src/phoenix/proto --python_out=src/phoenix --mypy_out=src/phoenix src/phoenix/proto/trace/v1/evaluation.proto +""" [tool.interrogate] fail-under = 0 diff --git a/src/phoenix/core/evals.py b/src/phoenix/core/evals.py new file mode 100644 index 0000000000..4972a38fd1 --- /dev/null +++ b/src/phoenix/core/evals.py @@ -0,0 +1,81 @@ +import weakref +from collections import defaultdict +from queue import SimpleQueue +from threading import RLock, Thread +from types import MethodType +from typing import DefaultDict, Dict, List, Optional + +from typing_extensions import TypeAlias + +import phoenix.trace.v1 as pb +from phoenix.trace.schemas import SpanID + +END_OF_QUEUE = None # sentinel value for queue termination + +EvaluationName: TypeAlias = str +DocumentPosition: TypeAlias = int + + +class Evals: + def __init__(self) -> None: + self._queue: "SimpleQueue[Optional[pb.Evaluation]]" = SimpleQueue() + weakref.finalize(self, self._queue.put, END_OF_QUEUE) + self._lock = RLock() + self._start_consumer() + self._span_evaluations_by_name: DefaultDict[ + EvaluationName, Dict[SpanID, pb.Evaluation] + ] = defaultdict(dict) + self._evaluations_by_span_id: DefaultDict[ + SpanID, Dict[EvaluationName, pb.Evaluation] + ] = defaultdict(dict) + self._document_evaluations_by_span_id: DefaultDict[ + SpanID, DefaultDict[EvaluationName, Dict[DocumentPosition, pb.Evaluation]] + ] = defaultdict(lambda: defaultdict(dict)) + + def put(self, evaluation: pb.Evaluation) -> None: + self._queue.put(evaluation) + + def _start_consumer(self) -> None: + Thread( + target=MethodType( + self.__class__._consume_evaluations, + weakref.proxy(self), + ), + daemon=True, + ).start() + + def _consume_evaluations(self) -> None: + while (item := self._queue.get()) is not END_OF_QUEUE: + with self._lock: + self._process_evaluation(item) + + def _process_evaluation(self, evaluation: pb.Evaluation) -> None: + subject_id = evaluation.subject_id + name = evaluation.name + subject_id_kind = subject_id.WhichOneof("kind") + if subject_id_kind == "document_retrieval_id": + document_retrieval_id = subject_id.document_retrieval_id + span_id = SpanID(document_retrieval_id.span_id) + document_position = document_retrieval_id.document_position + self._document_evaluations_by_span_id[span_id][name][document_position] = evaluation + elif subject_id_kind == "span_id": + span_id = SpanID(subject_id.span_id) + self._evaluations_by_span_id[span_id][name] = evaluation + self._span_evaluations_by_name[name][span_id] = evaluation + else: + raise ValueError(f"unrecognized subject_id type: {type(subject_id_kind)}") + + def get_span_evaluation_names(self) -> List[EvaluationName]: + with self._lock: + return list(self._span_evaluations_by_name.keys()) + + def get_evaluations_by_span_id(self, span_id: SpanID) -> List[pb.Evaluation]: + with self._lock: + return list(self._evaluations_by_span_id[span_id].values()) + + def get_document_evaluations_by_span_id(self, span_id: SpanID) -> List[pb.Evaluation]: + all_evaluations: List[pb.Evaluation] = [] 
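+        # Flatten the nested {evaluation_name: {document_position: evaluation}} mapping for this span into a single list.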
+ with self._lock: + for evaluations in self._document_evaluations_by_span_id[span_id].values(): + all_evaluations.extend(evaluations.values()) + return all_evaluations diff --git a/src/phoenix/proto/trace/v1/evaluation.proto b/src/phoenix/proto/trace/v1/evaluation.proto new file mode 100644 index 0000000000..93441161a6 --- /dev/null +++ b/src/phoenix/proto/trace/v1/evaluation.proto @@ -0,0 +1,26 @@ +syntax = "proto3"; +package phoenix.proto.evaluation.v1; + +import "google/protobuf/wrappers.proto"; + +message Evaluation { + string name = 1; + message SubjectId { + message DocumentRetrievalId { + string span_id = 1; + int32 document_position = 2; // zero-based index + } + oneof kind { + string trace_id = 1; + string span_id = 2; + DocumentRetrievalId document_retrieval_id = 3; + } + } + SubjectId subject_id = 2; + message Result { + google.protobuf.DoubleValue score = 1; + google.protobuf.StringValue label = 2; + google.protobuf.StringValue explanation = 3; + } + Result result = 3; +} diff --git a/src/phoenix/server/api/context.py b/src/phoenix/server/api/context.py index 2d6a95be7e..b0480bb960 100644 --- a/src/phoenix/server/api/context.py +++ b/src/phoenix/server/api/context.py @@ -6,6 +6,7 @@ from starlette.responses import Response from starlette.websockets import WebSocket +from phoenix.core.evals import Evals from phoenix.core.model_schema import Model from phoenix.core.traces import Traces @@ -18,3 +19,4 @@ class Context: export_path: Path corpus: Optional[Model] = None traces: Optional[Traces] = None + evals: Optional[Evals] = None diff --git a/src/phoenix/server/api/schema.py b/src/phoenix/server/api/schema.py index cfd9fad317..844cdb32a8 100644 --- a/src/phoenix/server/api/schema.py +++ b/src/phoenix/server/api/schema.py @@ -239,6 +239,18 @@ def spans( data = list(map(to_gql_span, spans)) return connection_from_list(data=data, args=args) + @strawberry.field( + description="Names of all available evaluations for spans. " + "(The list contains no duplicates.)" + ) # type: ignore + def span_evaluation_names( + self, + info: Info[Context, None], + ) -> List[str]: + if (evals := info.context.evals) is None: + return [] + return evals.get_span_evaluation_names() + @strawberry.field def trace_dataset_info( self, diff --git a/src/phoenix/server/api/types/Evaluation.py b/src/phoenix/server/api/types/Evaluation.py new file mode 100644 index 0000000000..5b4ccd0915 --- /dev/null +++ b/src/phoenix/server/api/types/Evaluation.py @@ -0,0 +1,71 @@ +from typing import Optional + +import strawberry + +import phoenix.trace.v1 as pb +from phoenix.trace.schemas import SpanID + + +@strawberry.interface +class Evaluation: + name: str = strawberry.field( + description="Name of the evaluation, e.g. 'helpfulness' or 'relevance'." + ) + score: Optional[float] = strawberry.field( + description="Result of the evaluation in the form of a numeric score." + ) + label: Optional[str] = strawberry.field( + description="Result of the evaluation in the form of a string, e.g. " + "'helpful' or 'not helpful'. Note that the label is not necessarily binary." + ) + explanation: Optional[str] = strawberry.field( + description="The evaluator's explanation for the evaluation result (i.e. " + "score or label, or both) given to the subject." 
+ ) + + +@strawberry.type +class SpanEvaluation(Evaluation): + span_id: strawberry.Private[SpanID] + + @staticmethod + def from_pb_evaluation(evaluation: pb.Evaluation) -> "SpanEvaluation": + result = evaluation.result + score = result.score.value if result.HasField("score") else None + label = result.label.value if result.HasField("label") else None + explanation = result.explanation.value if result.HasField("explanation") else None + span_id = SpanID(evaluation.subject_id.span_id) + return SpanEvaluation( + name=evaluation.name, + score=score, + label=label, + explanation=explanation, + span_id=span_id, + ) + + +@strawberry.type +class DocumentEvaluation(Evaluation): + span_id: strawberry.Private[SpanID] + document_position: int = strawberry.field( + description="The zero-based index among retrieved documents, which " + "are collected as a list (even when ordering is not inherently meaningful)." + ) + + @staticmethod + def from_pb_evaluation(evaluation: pb.Evaluation) -> "DocumentEvaluation": + result = evaluation.result + score = result.score.value if result.HasField("score") else None + label = result.label.value if result.HasField("label") else None + explanation = result.explanation.value if result.HasField("explanation") else None + document_retrieval_id = evaluation.subject_id.document_retrieval_id + document_position = document_retrieval_id.document_position + span_id = SpanID(document_retrieval_id.span_id) + return DocumentEvaluation( + name=evaluation.name, + score=score, + label=label, + explanation=explanation, + document_position=document_position, + span_id=span_id, + ) diff --git a/src/phoenix/server/api/types/Span.py b/src/phoenix/server/api/types/Span.py index 02559445b4..4de44d14a6 100644 --- a/src/phoenix/server/api/types/Span.py +++ b/src/phoenix/server/api/types/Span.py @@ -11,6 +11,7 @@ import phoenix.trace.schemas as trace_schema from phoenix.core.traces import ComputedAttributes from phoenix.server.api.context import Context +from phoenix.server.api.types.Evaluation import DocumentEvaluation, SpanEvaluation from phoenix.server.api.types.MimeType import MimeType from phoenix.trace.schemas import SpanID from phoenix.trace.semantic_conventions import ( @@ -123,6 +124,43 @@ class Span: "descendant spans (children, grandchildren, etc.)", ) + @strawberry.field( + description="Evaluations associated with the span, e.g. if the span is " + "an LLM, an evaluation may assess the helpfulness of its response with " + "respect to its input." + ) # type: ignore + def span_evaluations( + self, + info: Info[Context, None], + ) -> List[SpanEvaluation]: + if not (evals := info.context.evals): + return [] + span_id = SpanID(str(self.context.span_id)) + return [ + SpanEvaluation.from_pb_evaluation(evaluation) + for evaluation in evals.get_evaluations_by_span_id(span_id) + ] + + @strawberry.field( + description="Evaluations of the documents associated with the span, e.g. " + "if the span is a RETRIEVER with a list of documents in its RETRIEVAL_DOCUMENTS " + "attribute, an evaluation for each document may assess its relevance " + "with respect to the input query of the span. Note that RETRIEVAL_DOCUMENTS is " + "a list, and each evaluation is identified by its document's (zero-based) " + "index in that list."
+ ) # type: ignore + def document_evaluations( + self, + info: Info[Context, None], + ) -> List[DocumentEvaluation]: + if not (evals := info.context.evals): + return [] + span_id = SpanID(str(self.context.span_id)) + return [ + DocumentEvaluation.from_pb_evaluation(evaluation) + for evaluation in evals.get_document_evaluations_by_span_id(span_id) + ] + @strawberry.field( description="All descendant spans (children, grandchildren, etc.)", ) # type: ignore diff --git a/src/phoenix/server/app.py b/src/phoenix/server/app.py index f1efd195e3..4f16f4769e 100644 --- a/src/phoenix/server/app.py +++ b/src/phoenix/server/app.py @@ -20,11 +20,13 @@ import phoenix from phoenix.config import SERVER_DIR +from phoenix.core.evals import Evals from phoenix.core.model_schema import Model from phoenix.core.traces import Traces from phoenix.pointcloud.umap_parameters import UMAPParameters from phoenix.server.api.context import Context from phoenix.server.api.schema import schema +from phoenix.server.evaluation_handler import EvaluationHandler from phoenix.server.span_handler import SpanHandler logger = logging.getLogger(__name__) @@ -109,10 +111,12 @@ def __init__( graphiql: bool = False, corpus: Optional[Model] = None, traces: Optional[Traces] = None, + evals: Optional[Evals] = None, ) -> None: self.model = model self.corpus = corpus self.traces = traces + self.evals = evals self.export_path = export_path super().__init__(schema, graphiql=graphiql) @@ -127,6 +131,7 @@ async def get_context( model=self.model, corpus=self.corpus, traces=self.traces, + evals=self.evals, export_path=self.export_path, ) @@ -156,6 +161,7 @@ def create_app( umap_params: UMAPParameters, corpus: Optional[Model] = None, traces: Optional[Traces] = None, + evals: Optional[Evals] = None, debug: bool = False, ) -> Starlette: graphql = GraphQLWithContext( @@ -163,6 +169,7 @@ def create_app( model=model, corpus=corpus, traces=traces, + evals=evals, export_path=export_path, graphiql=True, ) @@ -185,6 +192,16 @@ def create_app( ), ] ) + + ( + [] + if evals is None + else [ + Route( + "/v1/evaluations", + type("SpanEndpoint", (EvaluationHandler,), {"queue": evals}), + ) + ] + ) + [ Route("/arize_phoenix_version", version), Route( diff --git a/src/phoenix/server/evaluation_handler.py b/src/phoenix/server/evaluation_handler.py new file mode 100644 index 0000000000..238476272f --- /dev/null +++ b/src/phoenix/server/evaluation_handler.py @@ -0,0 +1,46 @@ +import gzip +from typing import Protocol + +from google.protobuf.message import DecodeError +from starlette.endpoints import HTTPEndpoint +from starlette.requests import Request +from starlette.responses import Response +from starlette.status import HTTP_415_UNSUPPORTED_MEDIA_TYPE, HTTP_422_UNPROCESSABLE_ENTITY + +import phoenix.trace.v1 as pb + + +class SupportsPutEvaluation(Protocol): + def put(self, evaluation: pb.Evaluation) -> None: + ... 
+ + +class EvaluationHandler(HTTPEndpoint): + queue: SupportsPutEvaluation + + async def post(self, request: Request) -> Response: + content_type = request.headers.get("content-type") + if content_type != "application/x-protobuf": + return Response( + content="Unsupported content type", + status_code=HTTP_415_UNSUPPORTED_MEDIA_TYPE, + ) + body = await request.body() + content_encoding = request.headers.get("content-encoding") + if content_encoding == "gzip": + body = gzip.decompress(body) + elif content_encoding: + return Response( + content="Unsupported content encoding", + status_code=HTTP_415_UNSUPPORTED_MEDIA_TYPE, + ) + evaluation = pb.Evaluation() + try: + evaluation.ParseFromString(body) + except DecodeError: + return Response( + content="Request body is invalid", + status_code=HTTP_422_UNPROCESSABLE_ENTITY, + ) + self.queue.put(evaluation) + return Response() diff --git a/src/phoenix/server/main.py b/src/phoenix/server/main.py index ce5e996662..eda3382145 100644 --- a/src/phoenix/server/main.py +++ b/src/phoenix/server/main.py @@ -11,6 +11,7 @@ from uvicorn import Config, Server from phoenix.config import EXPORT_DIR, get_env_host, get_env_port, get_pids_path +from phoenix.core.evals import Evals from phoenix.core.model_schema_adapter import create_model_from_datasets from phoenix.core.traces import Traces from phoenix.datasets.dataset import EMPTY_DATASET, Dataset @@ -26,6 +27,7 @@ TRACES_FIXTURES, _download_traces_fixture, _get_trace_fixture_by_name, + get_evals_from_fixture, ) from phoenix.trace.span_json_decoder import json_string_to_span @@ -143,6 +145,7 @@ def _load_items( reference_dataset, ) traces = Traces() + evals = Evals() if trace_dataset_name is not None: fixture_spans = map( json_string_to_span, @@ -157,6 +160,12 @@ def _load_items( args=(traces, fixture_spans, simulate_streaming), daemon=True, ).start() + fixture_evals = get_evals_from_fixture(trace_dataset_name) + Thread( + target=_load_items, + args=(evals, fixture_evals, simulate_streaming), + daemon=True, + ).start() umap_params_list = args.umap_params.split(",") umap_params = UMAPParameters( min_dist=float(umap_params_list[0]), @@ -169,6 +178,7 @@ def _load_items( model=model, umap_params=umap_params, traces=traces, + evals=evals, corpus=None if corpus_dataset is None else create_model_from_datasets(corpus_dataset), debug=args.debug, ) diff --git a/src/phoenix/session/evaluation.py b/src/phoenix/session/evaluation.py new file mode 100644 index 0000000000..a4de5e1029 --- /dev/null +++ b/src/phoenix/session/evaluation.py @@ -0,0 +1,158 @@ +""" +A set of **highly experimental** helper functions to + - extract spans from Phoenix for evaluation + - explode retrieved documents from (horizontal) lists to a (vertical) series + indexed by `context.span_id` and `document_position` + - ingest evaluation results into Phoenix via HttpExporter +""" +import collections +import math +from typing import ( + Any, + Iterable, + List, + Mapping, + Optional, + Sequence, + Tuple, + Union, + cast, +) + +import pandas as pd +from google.protobuf.wrappers_pb2 import DoubleValue, StringValue + +import phoenix.trace.v1 as pb +from phoenix.core.traces import TRACE_ID +from phoenix.session.session import Session +from phoenix.trace.exporter import HttpExporter +from phoenix.trace.schemas import ATTRIBUTE_PREFIX +from phoenix.trace.semantic_conventions import ( + DOCUMENT_CONTENT, + DOCUMENT_SCORE, + INPUT_VALUE, + RETRIEVAL_DOCUMENTS, +) + + +def get_retrieved_documents(session: Session) -> pd.DataFrame: + data: List[Mapping[str, Any]] = [] 
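+    # Collect all RETRIEVER spans and explode each span's RETRIEVAL_DOCUMENTS list into one row per (span_id, document_position).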
+ if (df := session.get_spans_dataframe("span_kind == 'RETRIEVER'")) is not None: + for span_id, query, documents, trace_id in df.loc[ + :, + [ + ATTRIBUTE_PREFIX + INPUT_VALUE, + ATTRIBUTE_PREFIX + RETRIEVAL_DOCUMENTS, + TRACE_ID, + ], + ].itertuples(): + if not isinstance(documents, Iterable): + continue + for position, document in enumerate(documents): + if not hasattr(document, "get"): + continue + data.append( + { + "context.trace_id": trace_id, + "context.span_id": span_id, + "input": query, + "document_position": position, + "reference": document.get(DOCUMENT_CONTENT), + "document_score": document.get(DOCUMENT_SCORE), + } + ) + index = ["context.span_id", "document_position"] + columns = [ + "context.span_id", + "document_position", + "input", + "reference", + "document_score", + "context.trace_id", + ] + return pd.DataFrame(data=data, columns=columns).set_index(index) + + +def add_evaluations( + exporter: HttpExporter, + evaluations: pd.DataFrame, + evaluation_name: str, +) -> None: + index_names = evaluations.index.names + for index, row in evaluations.iterrows(): + subject_id = _extract_subject_id_from_index( + index_names, + cast(Union[str, Tuple[Any]], index), + ) + if (result := _extract_result(row)) is None: + continue + evaluation = pb.Evaluation( + name=evaluation_name, + result=result, + subject_id=subject_id, + ) + exporter.export(evaluation) + + +def _extract_subject_id_from_index( + names: Sequence[str], + value: Union[str, Sequence[Any]], +) -> pb.Evaluation.SubjectId: + """ + (**Highly Experimental**) + Returns `SubjectId` given the format of `index_names`. Allowed formats are: + - DocumentRetrievalId + - index_names=["context.span_id", "document_position"] + - index_names=["span_id", "document_position"] + - index_names=["document_position", "context.span_id"] + - index_names=["document_position", "span_id"] + - SpanId + - index_names=["span_id"] + - index_names=["context.span_id"] + - TraceId + - index_names=["context.trace_id"] + - index_names=["trace_id"] + """ + assert isinstance(names, collections.abc.Sequence) + if len(names) == 2: + assert isinstance(value, collections.abc.Sequence) and len(value) == 2 + if "document_position" in names: + document_position = value[names.index("document_position")] + assert isinstance(document_position, int) + if "context.span_id" in names: + span_id = value[names.index("context.span_id")] + elif "span_id" in names: + span_id = value[names.index("span_id")] + else: + raise ValueError(f"Unexpected index names: {names}") + assert isinstance(span_id, str) + return pb.Evaluation.SubjectId( + document_retrieval_id=pb.Evaluation.SubjectId.DocumentRetrievalId( + document_position=document_position, + span_id=span_id, + ), + ) + elif len(names) == 1: + assert isinstance(value, str) + if names[0] in ("context.span_id", "span_id"): + return pb.Evaluation.SubjectId(span_id=value) + if names[0] in ("context.trace_id", "trace_id"): + return pb.Evaluation.SubjectId(trace_id=value) + raise ValueError(f"Unexpected index names: {names}") + + +def _extract_result(row: "pd.Series[Any]") -> Optional[pb.Evaluation.Result]: + score = cast(Optional[float], row.get("score")) + label = cast(Optional[str], row.get("label")) + explanation = cast(Optional[str], row.get("explanation")) + if ( + (score is None or isinstance(score, float) and math.isnan(score)) + and not label + and not explanation + ): + return None + return pb.Evaluation.Result( + score=DoubleValue(value=score) if score is not None else None, + label=StringValue(value=label) if label else None, +
explanation=StringValue(value=explanation) if explanation else None, + ) diff --git a/src/phoenix/session/session.py b/src/phoenix/session/session.py index ba5422bde9..3ff682eb17 100644 --- a/src/phoenix/session/session.py +++ b/src/phoenix/session/session.py @@ -21,6 +21,7 @@ import pandas as pd from phoenix.config import ENV_NOTEBOOK_ENV, get_env_host, get_env_port, get_exported_files +from phoenix.core.evals import Evals from phoenix.core.model_schema_adapter import create_model_from_datasets from phoenix.core.traces import Traces from phoenix.datasets.dataset import EMPTY_DATASET, Dataset @@ -117,6 +118,8 @@ def __init__( for span in trace_dataset.to_spans(): self.traces.put(span) + self.evals: Evals = Evals() + self.host = host or get_env_host() self.port = port or get_env_port() self.temp_dir = TemporaryDirectory() @@ -279,6 +282,7 @@ def __init__( model=self.model, corpus=self.corpus, traces=self.traces, + evals=self.evals, umap_params=self.umap_parameters, ) self.server = ThreadServer( diff --git a/src/phoenix/trace/exporter.py b/src/phoenix/trace/exporter.py index a1f4eb66e5..f7b71fad4e 100644 --- a/src/phoenix/trace/exporter.py +++ b/src/phoenix/trace/exporter.py @@ -4,10 +4,11 @@ from queue import SimpleQueue from threading import Thread from types import MethodType -from typing import Any, Optional +from typing import Any, Optional, Union import requests from requests import Session +from typing_extensions import TypeAlias import phoenix.trace.v1 as pb from phoenix.config import get_env_host, get_env_port @@ -19,6 +20,8 @@ END_OF_QUEUE = None # sentinel value for queue termination +Message: TypeAlias = Union[pb.Span, pb.Evaluation] + class NoOpExporter: def export(self, _: Any) -> None: @@ -32,7 +35,7 @@ def __init__( port: Optional[int] = None, ) -> None: """ - Span Exporter using HTTP. + Span/Evaluation Exporter using HTTP. Parameters ---------- @@ -55,13 +58,16 @@ def __init__( "content-encoding": "gzip", } ) - self._queue: "SimpleQueue[Optional[pb.Span]]" = SimpleQueue() + self._queue: "SimpleQueue[Optional[Message]]" = SimpleQueue() # Putting `None` as the sentinel value for queue termination. 
weakref.finalize(self, self._queue.put, END_OF_QUEUE) self._start_consumer() - def export(self, span: Span) -> None: - self._queue.put(encode(span)) + def export(self, item: Union[Span, pb.Evaluation]) -> None: + if isinstance(item, Span): + self._queue.put(encode(item)) + elif isinstance(item, pb.Evaluation): + self._queue.put(item) def _start_consumer(self) -> None: Thread( @@ -76,16 +82,20 @@ def _consume_items(self) -> None: while (item := self._queue.get()) is not END_OF_QUEUE: self._send(item) - def _send(self, item: pb.Span) -> None: - serialized = item.SerializeToString() + def _send(self, message: Message) -> None: + serialized = message.SerializeToString() data = gzip.compress(serialized) try: - self._session.post(self._url(item), data=data) + self._session.post(self._url(message), data=data).raise_for_status() except Exception as e: logger.exception(e) - def _url(self, _: pb.Span) -> str: - return f"{self._base_url}/v1/spans" + def _url(self, message: Message) -> str: + if isinstance(message, pb.Span): + return f"{self._base_url}/v1/spans" + if isinstance(message, pb.Evaluation): + return f"{self._base_url}/v1/evaluations" + raise ValueError(f"Unknown message type: {type(message)}") def _warn_if_phoenix_is_not_running(self) -> None: try: diff --git a/src/phoenix/trace/fixtures.py b/src/phoenix/trace/fixtures.py index 6341bd9acd..cd5a474986 100644 --- a/src/phoenix/trace/fixtures.py +++ b/src/phoenix/trace/fixtures.py @@ -1,22 +1,67 @@ -from dataclasses import dataclass -from typing import List, Optional, cast +from dataclasses import dataclass, field +from typing import Iterable, Iterator, List, NamedTuple, Optional, Tuple, cast from urllib import request +import pandas as pd +from google.protobuf.wrappers_pb2 import DoubleValue, StringValue + +import phoenix.trace.v1 as pb from phoenix.trace.trace_dataset import TraceDataset from phoenix.trace.utils import json_lines_to_df +class EvaluationResultSchema(NamedTuple): + label: Optional[str] = "label" + score: Optional[str] = "score" + explanation: Optional[str] = "explanation" + + +@dataclass(frozen=True) +class EvaluationFixture: + evaluation_name: str + file_name: str + evaluation_result_schema: EvaluationResultSchema = field(default_factory=EvaluationResultSchema) + + +@dataclass(frozen=True) +class DocumentEvaluationFixture(EvaluationFixture): + document_position: str = "document_position" + + @dataclass(frozen=True) class TracesFixture: name: str description: str file_name: str + evaluation_fixtures: Iterable[EvaluationFixture] = () llama_index_rag_fixture = TracesFixture( name="llama_index_rag", description="Traces from running the llama_index on a RAG use case.", file_name="llama_index_rag_with_rerank.jsonl", + evaluation_fixtures=( + EvaluationFixture( + evaluation_name="Q&A Correctness", + file_name="llama_index_rag_with_rerank.qa_correctness_eval.parquet", + ), + EvaluationFixture( + evaluation_name="Hallucination", + file_name="llama_index_rag_with_rerank.hallucination_eval.parquet", + ), + EvaluationFixture( + evaluation_name="NDCG@2", + file_name="llama_index_rag_with_rerank.ndcg_at_2.parquet", + ), + EvaluationFixture( + evaluation_name="Precision@3", + file_name="llama_index_rag_with_rerank.precision_at_3.parquet", + ), + DocumentEvaluationFixture( + evaluation_name="Relevance", + file_name="llama_index_rag_with_rerank.documents_eval.parquet", + ), + ), ) llama_index_calculator_agent_fixture = TracesFixture( @@ -105,3 +150,48 @@ def load_example_traces(use_case: str) -> TraceDataset: """ fixture = 
_get_trace_fixture_by_name(use_case) return TraceDataset(json_lines_to_df(_download_traces_fixture(fixture))) + + +def get_evals_from_fixture(use_case: str) -> Iterator[pb.Evaluation]: + fixture = _get_trace_fixture_by_name(use_case) + for eval_fixture in fixture.evaluation_fixtures: + yield from _read_eval_fixture(eval_fixture) + + +def _read_eval_fixture(eval_fixture: EvaluationFixture) -> Iterator[pb.Evaluation]: + df = pd.read_parquet(_url(eval_fixture.file_name)) + for index, row in df.iterrows(): + schema = eval_fixture.evaluation_result_schema + label = row.get(schema.label) + score = row.get(schema.score) + explanation = row.get(schema.explanation) + result = pb.Evaluation.Result( + score=DoubleValue(value=cast(float, score)) if score is not None else None, + label=StringValue(value=cast(str, label)) if label else None, + explanation=StringValue(value=cast(str, explanation)) if explanation else None, + ) + if isinstance(eval_fixture, DocumentEvaluationFixture): + span_id, document_position = cast(Tuple[str, int], index) + subject_id = pb.Evaluation.SubjectId( + document_retrieval_id=pb.Evaluation.SubjectId.DocumentRetrievalId( + document_position=document_position, + span_id=span_id, + ), + ) + else: + span_id = cast(str, index) + subject_id = pb.Evaluation.SubjectId(span_id=span_id) + yield pb.Evaluation( + name=eval_fixture.evaluation_name, + result=result, + subject_id=subject_id, + ) + + +def _url( + file_name: str, + host: Optional[str] = "https://storage.googleapis.com/", + bucket: Optional[str] = "arize-assets", + prefix: Optional[str] = "phoenix/traces/", +) -> str: + return f"{host}{bucket}/{prefix}{file_name}" diff --git a/src/phoenix/trace/v1/__init__.py b/src/phoenix/trace/v1/__init__.py index 00d6a00abc..39315c8a60 100644 --- a/src/phoenix/trace/v1/__init__.py +++ b/src/phoenix/trace/v1/__init__.py @@ -1,7 +1,9 @@ +from phoenix.trace.v1.evaluation_pb2 import Evaluation from phoenix.trace.v1.trace_pb2 import Embedding, Retrieval, Span __all__ = [ "Span", "Retrieval", "Embedding", + "Evaluation", ] diff --git a/src/phoenix/trace/v1/evaluation_pb2.py b/src/phoenix/trace/v1/evaluation_pb2.py new file mode 100644 index 0000000000..a7a2a284cd --- /dev/null +++ b/src/phoenix/trace/v1/evaluation_pb2.py @@ -0,0 +1,32 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# source: trace/v1/evaluation.proto +"""Generated protocol buffer code.""" +from google.protobuf.internal import builder as _builder +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import symbol_database as _symbol_database +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from google.protobuf import wrappers_pb2 as google_dot_protobuf_dot_wrappers__pb2 + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x19trace/v1/evaluation.proto\x12\x1bphoenix.proto.evaluation.v1\x1a\x1egoogle/protobuf/wrappers.proto\"\xa1\x04\n\nEvaluation\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x45\n\nsubject_id\x18\x02 \x01(\x0b\x32\x31.phoenix.proto.evaluation.v1.Evaluation.SubjectId\x12>\n\x06result\x18\x03 \x01(\x0b\x32..phoenix.proto.evaluation.v1.Evaluation.Result\x1a\xe5\x01\n\tSubjectId\x12\x12\n\x08trace_id\x18\x01 \x01(\tH\x00\x12\x11\n\x07span_id\x18\x02 \x01(\tH\x00\x12\x66\n\x15\x64ocument_retrieval_id\x18\x03 \x01(\x0b\x32\x45.phoenix.proto.evaluation.v1.Evaluation.SubjectId.DocumentRetrievalIdH\x00\x1a\x41\n\x13\x44ocumentRetrievalId\x12\x0f\n\x07span_id\x18\x01 \x01(\t\x12\x19\n\x11\x64ocument_position\x18\x02 \x01(\x05\x42\x06\n\x04kind\x1a\x95\x01\n\x06Result\x12+\n\x05score\x18\x01 \x01(\x0b\x32\x1c.google.protobuf.DoubleValue\x12+\n\x05label\x18\x02 \x01(\x0b\x32\x1c.google.protobuf.StringValue\x12\x31\n\x0b\x65xplanation\x18\x03 \x01(\x0b\x32\x1c.google.protobuf.StringValueb\x06proto3') + +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'trace.v1.evaluation_pb2', globals()) +if _descriptor._USE_C_DESCRIPTORS == False: + + DESCRIPTOR._options = None + _EVALUATION._serialized_start=91 + _EVALUATION._serialized_end=636 + _EVALUATION_SUBJECTID._serialized_start=255 + _EVALUATION_SUBJECTID._serialized_end=484 + _EVALUATION_SUBJECTID_DOCUMENTRETRIEVALID._serialized_start=411 + _EVALUATION_SUBJECTID_DOCUMENTRETRIEVALID._serialized_end=476 + _EVALUATION_RESULT._serialized_start=487 + _EVALUATION_RESULT._serialized_end=636 +# @@protoc_insertion_point(module_scope) diff --git a/src/phoenix/trace/v1/evaluation_pb2.pyi b/src/phoenix/trace/v1/evaluation_pb2.pyi new file mode 100644 index 0000000000..ab34b84f5c --- /dev/null +++ b/src/phoenix/trace/v1/evaluation_pb2.pyi @@ -0,0 +1,102 @@ +""" +@generated by mypy-protobuf. Do not edit manually! +isort:skip_file +""" +import builtins +import google.protobuf.descriptor +import google.protobuf.message +import google.protobuf.wrappers_pb2 +import sys + +if sys.version_info >= (3, 8): + import typing as typing_extensions +else: + import typing_extensions + +DESCRIPTOR: google.protobuf.descriptor.FileDescriptor + +@typing_extensions.final +class Evaluation(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + @typing_extensions.final + class SubjectId(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + @typing_extensions.final + class DocumentRetrievalId(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + SPAN_ID_FIELD_NUMBER: builtins.int + DOCUMENT_POSITION_FIELD_NUMBER: builtins.int + span_id: builtins.str + document_position: builtins.int + """zero-based index""" + def __init__( + self, + *, + span_id: builtins.str = ..., + document_position: builtins.int = ..., + ) -> None: ... 
+ def ClearField(self, field_name: typing_extensions.Literal["document_position", b"document_position", "span_id", b"span_id"]) -> None: ... + + TRACE_ID_FIELD_NUMBER: builtins.int + SPAN_ID_FIELD_NUMBER: builtins.int + DOCUMENT_RETRIEVAL_ID_FIELD_NUMBER: builtins.int + trace_id: builtins.str + span_id: builtins.str + @property + def document_retrieval_id(self) -> global___Evaluation.SubjectId.DocumentRetrievalId: ... + def __init__( + self, + *, + trace_id: builtins.str = ..., + span_id: builtins.str = ..., + document_retrieval_id: global___Evaluation.SubjectId.DocumentRetrievalId | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["document_retrieval_id", b"document_retrieval_id", "kind", b"kind", "span_id", b"span_id", "trace_id", b"trace_id"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["document_retrieval_id", b"document_retrieval_id", "kind", b"kind", "span_id", b"span_id", "trace_id", b"trace_id"]) -> None: ... + def WhichOneof(self, oneof_group: typing_extensions.Literal["kind", b"kind"]) -> typing_extensions.Literal["trace_id", "span_id", "document_retrieval_id"] | None: ... + + @typing_extensions.final + class Result(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + SCORE_FIELD_NUMBER: builtins.int + LABEL_FIELD_NUMBER: builtins.int + EXPLANATION_FIELD_NUMBER: builtins.int + @property + def score(self) -> google.protobuf.wrappers_pb2.DoubleValue: ... + @property + def label(self) -> google.protobuf.wrappers_pb2.StringValue: ... + @property + def explanation(self) -> google.protobuf.wrappers_pb2.StringValue: ... + def __init__( + self, + *, + score: google.protobuf.wrappers_pb2.DoubleValue | None = ..., + label: google.protobuf.wrappers_pb2.StringValue | None = ..., + explanation: google.protobuf.wrappers_pb2.StringValue | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["explanation", b"explanation", "label", b"label", "score", b"score"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["explanation", b"explanation", "label", b"label", "score", b"score"]) -> None: ... + + NAME_FIELD_NUMBER: builtins.int + SUBJECT_ID_FIELD_NUMBER: builtins.int + RESULT_FIELD_NUMBER: builtins.int + name: builtins.str + @property + def subject_id(self) -> global___Evaluation.SubjectId: ... + @property + def result(self) -> global___Evaluation.Result: ... + def __init__( + self, + *, + name: builtins.str = ..., + subject_id: global___Evaluation.SubjectId | None = ..., + result: global___Evaluation.Result | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["result", b"result", "subject_id", b"subject_id"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["name", b"name", "result", b"result", "subject_id", b"subject_id"]) -> None: ... 
+ +global___Evaluation = Evaluation diff --git a/tutorials/internal/trace_eval_ingestion_testing.ipynb b/tutorials/internal/trace_eval_ingestion_testing.ipynb new file mode 100644 index 0000000000..bc07fcedbf --- /dev/null +++ b/tutorials/internal/trace_eval_ingestion_testing.ipynb @@ -0,0 +1,468 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "d58c5245e6d7811f", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import phoenix as px\n", + "from phoenix.experimental.evals.functions import llm_classify\n", + "from phoenix.experimental.evals.models import OpenAIModel\n", + "from phoenix.experimental.evals.templates.default_templates import (\n", + " HALLUCINATION_PROMPT_RAILS_MAP,\n", + " HALLUCINATION_PROMPT_TEMPLATE,\n", + " QA_PROMPT_RAILS_MAP,\n", + " QA_PROMPT_TEMPLATE,\n", + " RAG_RELEVANCY_PROMPT_RAILS_MAP,\n", + " RAG_RELEVANCY_PROMPT_TEMPLATE,\n", + ")\n", + "from phoenix.session.evaluation import add_evaluations, get_retrieved_documents\n", + "from phoenix.trace.exporter import HttpExporter\n", + "from sklearn.metrics import ndcg_score" + ] + }, + { + "cell_type": "markdown", + "id": "4b43166d02c26e8d", + "metadata": { + "collapsed": false + }, + "source": [ + "# Start Phoenix" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "initial_id", + "metadata": {}, + "outputs": [], + "source": [ + "ds = px.load_example_traces(\"llama_index_rag\")\n", + "px.launch_app(trace=ds)" + ] + }, + { + "cell_type": "markdown", + "id": "1362576ff0fe4e2c", + "metadata": { + "collapsed": false + }, + "source": [ + "# Extract Retrieved Documents" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2c45b85c6644735", + "metadata": {}, + "outputs": [], + "source": [ + "retrieved_documents = get_retrieved_documents(px.active_session())\n", + "retrieved_documents" + ] + }, + { + "cell_type": "markdown", + "id": "9ac938a5c199dc82", + "metadata": { + "collapsed": false + }, + "source": [ + "# Set Up OpenAI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e14465175520ce42", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from getpass import getpass\n", + "\n", + "import openai\n", + "\n", + "if not (openai_api_key := os.getenv(\"OPENAI_API_KEY\")):\n", + " openai_api_key = getpass(\"🔑 Enter your OpenAI API key: \")\n", + "openai.api_key = openai_api_key\n", + "os.environ[\"OPENAI_API_KEY\"] = openai_api_key" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de9664171d3e33b8", + "metadata": {}, + "outputs": [], + "source": [ + "model = OpenAIModel(model_name=\"gpt-4-1106-preview\")\n", + "model(\"hi\")" + ] + }, + { + "cell_type": "markdown", + "id": "d694213dcf35676f", + "metadata": { + "collapsed": false + }, + "source": [ + "# Evaluate Document Relevance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "516dc273735ad00c", + "metadata": {}, + "outputs": [], + "source": [ + "retrieved_documents_eval = llm_classify(\n", + " retrieved_documents,\n", + " model,\n", + " RAG_RELEVANCY_PROMPT_TEMPLATE,\n", + " list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values()),\n", + " provide_explanation=True,\n", + ")\n", + "retrieved_documents_eval[\"score\"] = (\n", + " retrieved_documents_eval.label[~retrieved_documents_eval.label.isna()] == \"relevant\"\n", + ").astype(int)\n", + "retrieved_documents_eval.to_parquet(\"llama_index_rag_with_rerank.documents_eval.parquet\")" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "f341795ae24ca024", + "metadata": {}, + "outputs": [], + "source": [ + "retrieved_documents_eval = pd.read_parquet(\"llama_index_rag_with_rerank.documents_eval.parquet\")\n", + "retrieved_documents_eval" + ] + }, + { + "cell_type": "markdown", + "id": "357fe94b02b22a6b", + "metadata": { + "collapsed": false + }, + "source": [ + "# Merge Data to Compute Ranking Metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3bd04b678c9d18c", + "metadata": {}, + "outputs": [], + "source": [ + "combined = pd.concat([retrieved_documents, retrieved_documents_eval.add_prefix(\"eval_\")], axis=1)\n", + "combined" + ] + }, + { + "cell_type": "markdown", + "id": "b162eccd6c69aa7f", + "metadata": { + "collapsed": false + }, + "source": [ + "# Compute NDCG@2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77d9fdebd46d268b", + "metadata": {}, + "outputs": [], + "source": [ + "def _compute_ndcg(df: pd.DataFrame, k: int):\n", + " \"\"\"Compute NDCG@k in the presence of missing values (e.g. as a result of keyboard interrupt).\"\"\"\n", + " eval_scores = [np.nan] * k\n", + " pred_scores = [np.nan] * k\n", + " for i in range(k):\n", + " if i >= len(df.eval_score):\n", + " break\n", + " eval_scores[i] = df.eval_score[i]\n", + " pred_scores[i] = df.document_score[i]\n", + " try:\n", + " return ndcg_score([eval_scores], [pred_scores])\n", + " except ValueError:\n", + " return np.nan\n", + "\n", + "\n", + "ndcg_at_2 = pd.DataFrame({\"score\": combined.groupby(\"context.span_id\").apply(_compute_ndcg, k=2)})\n", + "ndcg_at_2.to_parquet(\"llama_index_rag_with_rerank.ndcg_at_2.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8032851d13b63d55", + "metadata": {}, + "outputs": [], + "source": [ + "ndcg_at_2 = pd.read_parquet(\"llama_index_rag_with_rerank.ndcg_at_2.parquet\")\n", + "ndcg_at_2" + ] + }, + { + "cell_type": "markdown", + "id": "e8d5816954fbaa4d", + "metadata": { + "collapsed": false + }, + "source": [ + "# Compute Precision@3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3167f4675c7313", + "metadata": {}, + "outputs": [], + "source": [ + "precision_at_3 = pd.DataFrame(\n", + " {\n", + " \"score\": combined.groupby(\"context.span_id\").apply(\n", + " lambda x: x.eval_score[:3].sum(skipna=False) / 3\n", + " )\n", + " }\n", + ")\n", + "precision_at_3.to_parquet(\"llama_index_rag_with_rerank.precision_at_3.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c1d31d1d1c95429", + "metadata": {}, + "outputs": [], + "source": [ + "precision_at_3 = pd.read_parquet(\"llama_index_rag_with_rerank.precision_at_3.parquet\")\n", + "precision_at_3" + ] + }, + { + "cell_type": "markdown", + "id": "1819b377e7602361", + "metadata": { + "collapsed": false + }, + "source": [ + "# Merge Documents from Retrieval Spans to Q&A Spans (to Compute Q&A Correctness)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb27fd4724e0e27e", + "metadata": {}, + "outputs": [], + "source": [ + "qa_df = (\n", + " px.active_session()\n", + " .get_spans_dataframe(\"output.value is not None\", root_spans_only=True)\n", + " .set_index(\"context.trace_id\")[\n", + " [\"attributes.input.value\", \"attributes.output.value\", \"context.span_id\"]\n", + " ]\n", + " .rename({\"attributes.input.value\": \"input\", \"attributes.output.value\": \"output\"}, axis=1)\n", + ")\n", + "qa_df[\"reference\"] = retrieved_documents.groupby(\"context.trace_id\").apply(\n", + " 
lambda x: \"\\n\\n\".join(x.reference)\n", + "qa_df.set_index(\"context.span_id\", inplace=True)\n", + "qa_df" + ] + }, + { + "cell_type": "markdown", + "id": "f4084449c986aed8", + "metadata": { + "collapsed": false + }, + "source": [ + "# Evaluate Q&A Correctness" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae507af54ce886a", + "metadata": {}, + "outputs": [], + "source": [ + "qa_correctness_eval = llm_classify(\n", + " qa_df,\n", + " model,\n", + " QA_PROMPT_TEMPLATE,\n", + " list(QA_PROMPT_RAILS_MAP.values()),\n", + " provide_explanation=True,\n", + ")\n", + "qa_correctness_eval[\"score\"] = (\n", + " qa_correctness_eval.label[~qa_correctness_eval.label.isna()] == \"correct\"\n", + ").astype(int)\n", + "qa_correctness_eval.to_parquet(\"llama_index_rag_with_rerank.qa_correctness_eval.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2041d2dcc7d02322", + "metadata": {}, + "outputs": [], + "source": [ + "qa_correctness_eval = pd.read_parquet(\"llama_index_rag_with_rerank.qa_correctness_eval.parquet\")\n", + "qa_correctness_eval" + ] + }, + { + "cell_type": "markdown", + "id": "a88f90ea9c24832b", + "metadata": { + "collapsed": false + }, + "source": [ + "# Evaluate Hallucination" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "946b5aad5d72c1f5", + "metadata": {}, + "outputs": [], + "source": [ + "hallucination_eval = llm_classify(\n", + " qa_df,\n", + " model,\n", + " HALLUCINATION_PROMPT_TEMPLATE,\n", + " list(HALLUCINATION_PROMPT_RAILS_MAP.values()),\n", + " provide_explanation=True,\n", + ")\n", + "hallucination_eval[\"score\"] = (\n", + " hallucination_eval.label[~hallucination_eval.label.isna()] == \"factual\"\n", + ").astype(int)\n", + "hallucination_eval.to_parquet(\"llama_index_rag_with_rerank.hallucination_eval.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8b1a6d7143c986e", + "metadata": {}, + "outputs": [], + "source": [ + "hallucination_eval = pd.read_parquet(\"llama_index_rag_with_rerank.hallucination_eval.parquet\")\n", + "hallucination_eval" + ] + }, + { + "cell_type": "markdown", + "id": "bdd4d1c641fb5e15", + "metadata": { + "collapsed": false + }, + "source": [ + "# Ingest Evaluations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7eed5bc68320bb18", + "metadata": {}, + "outputs": [], + "source": [ + "exporter = HttpExporter()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "495a5e74b469a660", + "metadata": {}, + "outputs": [], + "source": [ + "add_evaluations(exporter, retrieved_documents_eval, \"Relevance\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20cc931d1529f84c", + "metadata": {}, + "outputs": [], + "source": [ + "add_evaluations(exporter, ndcg_at_2, \"NDCG@2\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb48a5daae9d5bcb", + "metadata": {}, + "outputs": [], + "source": [ + "add_evaluations(exporter, precision_at_3, \"Precision@3\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "848420ee90e10f62", + "metadata": {}, + "outputs": [], + "source": [ + "add_evaluations(exporter, qa_correctness_eval, \"Q&A Correctness\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c03dde5802ed98a3", + "metadata": {}, + "outputs": [], + "source": [ + "add_evaluations(exporter, hallucination_eval, \"Hallucination\")" + ] + }, + { + "cell_type": "markdown", + "id": "83dd4cd21c966504", + "metadata": { +
"collapsed": false + }, + "source": [ + "# End Session" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5842da4238a93554", + "metadata": {}, + "outputs": [], + "source": [ + "# px.active_session().end()" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}