Skip to content

Commit

Permalink
refactor: remove dataset_examples submodule (#3979)
Browse files Browse the repository at this point in the history
  • Loading branch information
axiomofjoy authored Jul 24, 2024
1 parent 02f264c commit 5872e54
Show file tree
Hide file tree
Showing 5 changed files with 350 additions and 387 deletions.
1 change: 0 additions & 1 deletion schemas/openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -475,7 +475,6 @@
"/v1/datasets/{id}/examples": {
"get": {
"tags": [
"datasets",
"datasets"
],
"summary": "Get examples from a dataset",
Expand Down
157 changes: 0 additions & 157 deletions src/phoenix/server/api/routers/v1/dataset_examples.py

This file was deleted.

140 changes: 131 additions & 9 deletions src/phoenix/server/api/routers/v1/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,11 @@
add_dataset_examples,
)
from phoenix.server.api.types.Dataset import Dataset as DatasetNodeType
from phoenix.server.api.types.DatasetExample import DatasetExample
from phoenix.server.api.types.DatasetExample import DatasetExample as DatasetExampleNodeType
from phoenix.server.api.types.DatasetVersion import DatasetVersion as DatasetVersionNodeType
from phoenix.server.api.types.node import from_global_id_with_expected_type
from phoenix.server.api.utils import delete_projects, delete_traces

from .dataset_examples import router as dataset_examples_router
from .pydantic_compat import V1RoutesBaseModel
from .utils import (
PaginatedResponseBody,
Expand Down Expand Up @@ -669,12 +668,135 @@ async def _parse_form_data(
)


# including the dataset examples router here ensures the dataset example routes
# are included in a natural order in the openapi schema and the swagger ui
#
# todo: move the dataset examples routes here and remove the dataset_examples
# sub-module
router.include_router(dataset_examples_router)
# REST representation of a single dataset example, as returned by the
# "get dataset examples" route in this module.
#
# NOTE(review): documented with `#` comments rather than a class docstring,
# since pydantic publishes a model docstring as the schema "description" and
# that would alter the generated OpenAPI schema — confirm before converting.
class DatasetExample(V1RoutesBaseModel):
    # Global ID of the example, serialized as a string.
    id: str
    # Input payload taken from the selected revision of the example.
    input: Dict[str, Any]
    # Output payload taken from the selected revision of the example.
    output: Dict[str, Any]
    # Metadata taken from the selected revision (`metadata_` on the ORM row).
    metadata: Dict[str, Any]
    # Creation time of the selected revision, i.e. when the example last changed.
    updated_at: datetime


# Payload of the "get dataset examples" response: the examples together with
# the dataset and the dataset version they were read from.
class ListDatasetExamplesData(V1RoutesBaseModel):
    # Global ID of the dataset, serialized as a string.
    dataset_id: str
    # Global ID of the dataset version the examples reflect, serialized as a
    # string (the requested version, or the latest one when none was requested).
    version_id: str
    # Non-deleted examples of the dataset as of that version.
    examples: List[DatasetExample]


# Response envelope for the "get dataset examples" route: the generic
# `ResponseBody` specialized to carry `ListDatasetExamplesData` in `data`.
class ListDatasetExamplesResponseBody(ResponseBody[ListDatasetExamplesData]):
    pass


# Returns the non-deleted examples of a dataset as of a given dataset version
# (or as of the latest version when `version_id` is omitted).
#
# Parameters:
#   request    -- incoming request; the DB session factory is read from
#                 `request.app.state.db`.
#   id         -- global ID of the dataset (path parameter).
#   version_id -- optional global ID of a dataset version (query parameter).
#
# Returns a `ListDatasetExamplesResponseBody` holding the dataset's global ID,
# the global ID of the version that was used, and the examples ordered by
# ascending example row ID.
#
# Raises `HTTPException` with status 404 when an ID is malformed, refers to the
# wrong node type, or does not match an existing dataset / dataset version, and
# when the dataset has no versions at all.
#
# NOTE: documented with comments instead of a docstring on purpose — FastAPI
# copies a path operation's docstring into the OpenAPI "description", which
# would change the published schema.
@router.get(
    "/datasets/{id}/examples",
    operation_id="getDatasetExamples",
    summary="Get examples from a dataset",
    responses=add_errors_to_responses([HTTP_404_NOT_FOUND]),
)
async def get_dataset_examples(
    request: Request,
    id: str = Path(description="The ID of the dataset"),
    version_id: Optional[str] = Query(
        default=None,
        description=(
            "The ID of the dataset version (if omitted, returns data from the latest version)"
        ),
    ),
) -> ListDatasetExamplesResponseBody:
    # Both IDs come straight from the URL. A value that is not a decodable
    # global ID makes `GlobalID.from_id` raise a ValueError subclass; report it
    # as "not found" rather than letting it surface as an internal server error.
    try:
        dataset_gid = GlobalID.from_id(id)
    except ValueError as error:
        raise HTTPException(
            detail=f"No dataset with id {id} can be found.",
            status_code=HTTP_404_NOT_FOUND,
        ) from error
    try:
        version_gid = GlobalID.from_id(version_id) if version_id else None
    except ValueError as error:
        raise HTTPException(
            detail=f"No dataset version with id {version_id} can be found.",
            status_code=HTTP_404_NOT_FOUND,
        ) from error

    if (dataset_type := dataset_gid.type_name) != "Dataset":
        raise HTTPException(
            detail=f"ID {dataset_gid} refers to a {dataset_type}", status_code=HTTP_404_NOT_FOUND
        )

    if version_gid and (version_type := version_gid.type_name) != "DatasetVersion":
        raise HTTPException(
            detail=f"ID {version_gid} refers to a {version_type}", status_code=HTTP_404_NOT_FOUND
        )

    # A well-formed global ID can still carry a node ID that is not an integer;
    # such an ID cannot match any row, so it is also answered with a 404.
    try:
        dataset_rowid = int(dataset_gid.node_id)
    except ValueError as error:
        raise HTTPException(
            detail=f"No dataset with id {dataset_gid} can be found.",
            status_code=HTTP_404_NOT_FOUND,
        ) from error
    try:
        version_rowid = int(version_gid.node_id) if version_gid else None
    except ValueError as error:
        raise HTTPException(
            detail=f"No dataset version with id {version_id} can be found.",
            status_code=HTTP_404_NOT_FOUND,
        ) from error

    async with request.app.state.db() as session:
        resolved_dataset_id = await session.scalar(
            select(models.Dataset.id).where(models.Dataset.id == dataset_rowid)
        )
        if resolved_dataset_id is None:
            raise HTTPException(
                detail=f"No dataset with id {dataset_gid} can be found.",
                status_code=HTTP_404_NOT_FOUND,
            )

        # Subquery yielding, for each dataset_example_id, the largest revision
        # id. The largest id is treated as the most recent revision; created_at
        # is not compared.
        partial_subquery = select(
            func.max(models.DatasetExampleRevision.id).label("max_id"),
        ).group_by(models.DatasetExampleRevision.dataset_example_id)

        if version_rowid is not None:
            # The version must exist and belong to the requested dataset.
            resolved_version_id = await session.scalar(
                select(models.DatasetVersion.id).where(
                    and_(
                        models.DatasetVersion.dataset_id == resolved_dataset_id,
                        models.DatasetVersion.id == version_rowid,
                    )
                )
            )
            if resolved_version_id is None:
                raise HTTPException(
                    detail=f"No dataset version with id {version_id} can be found.",
                    status_code=HTTP_404_NOT_FOUND,
                )
            # Only consider revisions whose version id is at or below the
            # requested version, so later changes are ignored.
            partial_subquery = partial_subquery.filter(
                models.DatasetExampleRevision.dataset_version_id <= resolved_version_id
            )
        else:
            # No version requested: report the dataset's highest version id and
            # leave the subquery unfiltered.
            resolved_version_id = await session.scalar(
                select(func.max(models.DatasetVersion.id)).where(
                    models.DatasetVersion.dataset_id == resolved_dataset_id
                )
            )
            if resolved_version_id is None:
                raise HTTPException(
                    detail="Dataset has no versions.",
                    status_code=HTTP_404_NOT_FOUND,
                )

        subquery = partial_subquery.subquery()
        # Pair every example of the dataset with its selected revision and drop
        # the examples whose selected revision is a deletion.
        query = (
            select(models.DatasetExample, models.DatasetExampleRevision)
            .join(
                models.DatasetExampleRevision,
                models.DatasetExample.id == models.DatasetExampleRevision.dataset_example_id,
            )
            .join(
                subquery,
                (subquery.c.max_id == models.DatasetExampleRevision.id),
            )
            .filter(models.DatasetExample.dataset_id == resolved_dataset_id)
            .filter(models.DatasetExampleRevision.revision_kind != "DELETE")
            .order_by(models.DatasetExample.id.asc())
        )
        examples = [
            DatasetExample(
                id=str(GlobalID("DatasetExample", str(example.id))),
                input=revision.input,
                output=revision.output,
                metadata=revision.metadata_,
                updated_at=revision.created_at,
            )
            async for example, revision in await session.stream(query)
        ]

    return ListDatasetExamplesResponseBody(
        data=ListDatasetExamplesData(
            dataset_id=str(GlobalID("Dataset", str(resolved_dataset_id))),
            version_id=str(GlobalID("DatasetVersion", str(resolved_version_id))),
            examples=examples,
        )
    )


@router.get(
Expand Down Expand Up @@ -794,7 +916,7 @@ def _get_content_csv(examples: List[models.DatasetExampleRevision]) -> bytes:
records = [
{
"example_id": GlobalID(
type_name=DatasetExample.__name__,
type_name=DatasetExampleNodeType.__name__,
node_id=str(ex.dataset_example_id),
),
**{f"input_{k}": v for k, v in ex.input.items()},
Expand Down
Loading

0 comments on commit 5872e54

Please sign in to comment.