feat: plumb Dataset start/end times through graphQL #194

Merged: 16 commits, Jan 27, 2023
1 change: 1 addition & 0 deletions app/relay.config.js
@@ -7,5 +7,6 @@ module.exports = {
noFutureProofEnums: true,
customScalars: {
GlobalID: "String",
"DateTime": "string",
Contributor: It's mapping to another base primitive, so it should be uppercase "String".

},
};
5 changes: 5 additions & 0 deletions app/schema.graphql
@@ -1,5 +1,10 @@
"""Date with time (isoformat)"""
scalar DateTime

type Dataset {
name: String!
startTime: DateTime!
endTime: DateTime!
}

type Dimension implements Node {
4 changes: 4 additions & 0 deletions app/src/App.tsx
@@ -18,9 +18,13 @@ const RootQuery = graphql`
query AppRootQuery {
primaryDataset {
name
startTime
endTime
}
referenceDataset {
name
startTime
endTime
}
}
`;
26 changes: 22 additions & 4 deletions app/src/__generated__/AppRootQuery.graphql.ts

(Generated file; diff not rendered by default.)

17 changes: 16 additions & 1 deletion src/phoenix/datasets/dataset.py
@@ -4,7 +4,8 @@
import uuid
from copy import deepcopy
from dataclasses import fields, replace
from typing import Any, Dict, List, Optional, Set, Tuple, Union
from datetime import datetime
from typing import Any, Dict, List, Optional, Set, Tuple, Union, cast

from pandas import DataFrame, Series, Timestamp, read_parquet, to_datetime
from pandas.api.types import is_numeric_dtype
@@ -72,6 +73,20 @@ def __init__(
self.to_disc()
logger.info(f"""Dataset: {self.__name} initialized""")

@property
def start_time(self) -> datetime:
"""Returns the datetime of the earliest inference in the dataset"""
ts_col_name: str = cast(str, self.schema.timestamp_column_name)
dt: datetime = self.__dataframe[ts_col_name].min()
Contributor: Since this never changes, you can use dynamic programming to compute it once.

Contributor (author): I'm assuming you're referring to the caching aspect of DP? How about just using the @cached_property decorator? Or do we have a preexisting convention for how we'd like to do this? (A sketch of the cached_property approach follows this file's diff.)

Contributor: Oh, that's great. Very cool.

return dt

@property
def end_time(self) -> datetime:
"""Returns the datetime of the latest inference in the dataset"""
ts_col_name: str = cast(str, self.schema.timestamp_column_name)
Contributor: Prefer dynamic programming to compute this once.

dt: datetime = self.__dataframe[ts_col_name].max()
return dt
Contributor: I've been advocating for non-abbreviated variable names: https://youtu.be/-J3wNP6u5YU

Contributor (author): 👍 -- done


@property
def dataframe(self) -> DataFrame:
return self.__dataframe
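A minimal sketch of the @cached_property approach discussed in the review thread above, assuming the timestamp column already holds datetime values. The class and attribute names here are illustrative stand-ins, not the PR's actual wiring:

from datetime import datetime
from functools import cached_property

from pandas import DataFrame


class TimestampedDataset:
    """Illustrative stand-in for phoenix.datasets.Dataset."""

    def __init__(self, dataframe: DataFrame, timestamp_column_name: str) -> None:
        self._dataframe = dataframe
        self._timestamp_column_name = timestamp_column_name

    @cached_property
    def start_time(self) -> datetime:
        # Computed on first access, then memoized on the instance --
        # the "compute once" behavior the reviewer asked for.
        return self._dataframe[self._timestamp_column_name].min()

    @cached_property
    def end_time(self) -> datetime:
        return self._dataframe[self._timestamp_column_name].max()

Unlike a plain @property, each value is computed at most once per instance; this assumes the underlying dataframe is not mutated after first access.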
6 changes: 3 additions & 3 deletions src/phoenix/server/api/schema.py
@@ -2,7 +2,7 @@
from strawberry.types import Info

from .context import Context
from .types.Dataset import Dataset
from .types.Dataset import Dataset, to_gql_dataset
from .types.Dimension import to_gql_dimension
from .types.EmbeddingDimension import to_gql_embedding_dimension
from .types.Model import Model
@@ -13,11 +13,11 @@
class Query:
@strawberry.field
def primary_dataset(self, info: Info[Context, None]) -> Dataset:
return Dataset(name=info.context.model.primary_dataset.name)
return to_gql_dataset(info.context.model.primary_dataset)

@strawberry.field
def reference_dataset(self, info: Info[Context, None]) -> Dataset:
return Dataset(name=info.context.model.reference_dataset.name)
return to_gql_dataset(info.context.model.reference_dataset)

@strawberry.field
def model(self) -> Model:
17 changes: 17 additions & 0 deletions src/phoenix/server/api/types/Dataset.py
@@ -1,6 +1,23 @@
from datetime import datetime

import strawberry

from phoenix.datasets import Dataset as InternalDataset


@strawberry.type
class Dataset:
name: str
start_time: datetime
end_time: datetime


def to_gql_dataset(dataset: InternalDataset) -> Dataset:
"""
Converts a phoenix.datasets.Dataset to a phoenix.server.api.types.Dataset
"""
return Dataset(
name=dataset.name,
start_time=dataset.start_time,
end_time=dataset.end_time,
)
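For context, a minimal, self-contained sketch of how strawberry surfaces Python datetime fields as the DateTime scalar declared in app/schema.graphql. The resolver body and values here are illustrative, not the server's actual wiring:

from datetime import datetime

import strawberry


@strawberry.type
class Dataset:
    name: str
    start_time: datetime  # exposed as startTime: DateTime! via auto camel-casing
    end_time: datetime


@strawberry.type
class Query:
    @strawberry.field
    def primary_dataset(self) -> Dataset:
        # Hardcoded values stand in for the real dataset lookup.
        return Dataset(
            name="primary",
            start_time=datetime(2023, 1, 1, 2, 0, 30),
            end_time=datetime(2023, 1, 10, 6, 0, 20),
        )


schema = strawberry.Schema(query=Query)
result = schema.execute_sync("{ primaryDataset { name startTime endTime } }")
print(result.data)  # DateTime values serialize as ISO-8601 strings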
23 changes: 23 additions & 0 deletions tests/datasets/test_dataset.py
@@ -801,6 +801,29 @@ def test_dataset_validate_invalid_schema_excludes_prediction_id(self) -> None:
with raises(DatasetError):
Dataset(dataframe=input_df, schema=input_schema)

def test_dataset_bookends(self) -> None:
expected_start_time = pd.Timestamp(year=2023, month=1, day=1, hour=2, second=30)
expected_end_time = pd.Timestamp(year=2023, month=1, day=10, hour=6, second=20)
input_df = DataFrame(
{
"prediction_label": ["apple", "orange", "grape"],
"timestamp": [
expected_end_time,
expected_start_time,
pd.Timestamp(year=2023, month=1, day=5, hour=4, second=25),
],
}
)

input_schema = Schema(
prediction_label_column_name="prediction_label",
timestamp_column_name="timestamp",
)
output_dataset = Dataset(dataframe=input_df, schema=input_schema)

assert output_dataset.start_time == expected_start_time
assert output_dataset.end_time == expected_end_time

@property
def num_records(self):
return self._NUM_RECORDS
34 changes: 34 additions & 0 deletions tests/server/api/types/test_dataset.py
@@ -0,0 +1,34 @@
import pytest
from pandas import DataFrame, Timestamp

from phoenix.datasets import Dataset as InputDataset
from phoenix.datasets import Schema
from phoenix.server.api.types import Dataset


@pytest.fixture
def input_dataset():
input_df = DataFrame(
{
"prediction_label": ["apple", "orange", "grape"],
"timestamp": [
Timestamp(year=2023, month=1, day=1, hour=2, second=30),
Timestamp(year=2023, month=1, day=5, hour=4, second=25),
Timestamp(year=2023, month=1, day=10, hour=6, second=20),
],
}
)

input_schema = Schema(
prediction_label_column_name="prediction_label",
timestamp_column_name="timestamp",
)
return InputDataset(dataframe=input_df, schema=input_schema)


def test_dataset_serialization(input_dataset):
converted_gql_dataset = Dataset.to_gql_dataset(input_dataset)

expected_dataset = input_dataset
assert converted_gql_dataset.start_time == expected_dataset.start_time
assert converted_gql_dataset.end_time == expected_dataset.end_time