Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: plumb Dataset start/end times through graphQL #194

Merged
merged 16 commits into from
Jan 27, 2023
7 changes: 7 additions & 0 deletions app/schema.graphql
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
"""
A date-time string at UTC, such as 2007-12-03T10:15:30Z, compliant with the `date-time` format outlined in section 5.6 of the RFC 3339 profile of the ISO 8601 standard for representation of dates and times using the Gregorian calendar.
"""
scalar DateTime

type Dataset {
name: String!
startTime: DateTime
endTime: DateTime
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
startTime: DateTime
endTime: DateTime
startTime: DateTime!
endTime: DateTime!

Let's make these non-nullable (i.e. not "maybe" types).

}

type Dimension implements Node {
Expand Down
4 changes: 4 additions & 0 deletions app/src/App.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,13 @@ const RootQuery = graphql`
query AppRootQuery {
primaryDataset {
name
startTime
endTime
}
referenceDataset {
name
startTime
endTime
}
}
`;
Expand Down
2 changes: 1 addition & 1 deletion src/phoenix/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def get_pids_path() -> str:

PHOENIX_DIR = os.path.dirname(os.path.abspath(__file__))
ROOT_DIR = os.path.join("~", ".phoenix")
dataset_dir = normalize_path(os.path.join(ROOT_DIR, "datasets"))
dataset_dir = normalize_path(os.path.join(ROOT_DIR, "core/datasets"))
mikeldking marked this conversation as resolved.
Show resolved Hide resolved

# Server config
server_dir = os.path.join(PHOENIX_DIR, "server")
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import uuid
from copy import deepcopy
from dataclasses import fields, replace
from datetime import datetime
from typing import Any, Dict, List, Optional, Set, Tuple, Union

from pandas import DataFrame, Series, Timestamp, read_parquet, to_datetime
Expand Down Expand Up @@ -72,6 +73,16 @@ def __init__(
self.to_disc()
logger.info(f"""Dataset: {self.__name} initialized""")

@property
def start_time(self) -> datetime:
    """Return the timestamp of the earliest inference in the dataset.

    Computed as the minimum of the schema-designated timestamp column.
    The value is a pandas Timestamp (a ``datetime`` subclass).
    NOTE(review): on an empty dataframe ``.min()`` yields NaT — confirm
    datasets are guaranteed non-empty by construction.
    """
    return self.__dataframe[self.schema.timestamp_column_name].min()

@property
def end_time(self) -> datetime:
    """Return the timestamp of the latest inference in the dataset.

    Computed as the maximum of the schema-designated timestamp column.
    The value is a pandas Timestamp (a ``datetime`` subclass).
    NOTE(review): on an empty dataframe ``.max()`` yields NaT — confirm
    datasets are guaranteed non-empty by construction.
    """
    return self.__dataframe[self.schema.timestamp_column_name].max()

@property
def dataframe(self) -> DataFrame:
    """Return the dataset's underlying pandas DataFrame.

    Read-only accessor for the private ``__dataframe`` attribute; callers
    receive the live frame, not a copy.
    """
    return self.__dataframe
Expand Down
File renamed without changes.
File renamed without changes.
4 changes: 2 additions & 2 deletions src/phoenix/core/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

from pandas.api.types import is_numeric_dtype, is_object_dtype

from phoenix.datasets import Dataset
from phoenix.datasets.schema import EmbeddingFeatures
from phoenix.core.datasets import Dataset
from phoenix.core.datasets.schema import EmbeddingFeatures
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I sorta wanted the code that the user interacts with to be unnested and not part of core, with core being internal to the application. I think voxel has a similar organization. It makes for easier discovery in the notebook, and we would dissuade people from importing core in the notebook.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah ok gotcha -- so datasets is the API interface so to speak, and users should never need to interact with anything in core.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah that was the idea. That the user interacts with datasets and metrics, phoenix serves core via an api.


from .dimension import Dimension
from .dimension_data_type import DimensionDataType
Expand Down
2 changes: 1 addition & 1 deletion src/phoenix/metrics/embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import numpy as np
import pandas as pd

from phoenix.datasets import Dataset
from phoenix.core.datasets import Dataset


def euclidean_distance(
Expand Down
2 changes: 1 addition & 1 deletion src/phoenix/pointcloud/projectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from numpy.typing import ArrayLike
from umap import UMAP

from phoenix.datasets import Dataset
from phoenix.core.datasets import Dataset

from .pointcloud import (
Cluster,
Expand Down
6 changes: 3 additions & 3 deletions src/phoenix/server/api/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from strawberry.types import Info

from .context import Context
from .types.Dataset import Dataset
from .types.Dataset import Dataset, to_gql_dataset
from .types.Dimension import to_gql_dimension
from .types.EmbeddingDimension import to_gql_embedding_dimension
from .types.Model import Model
Expand All @@ -13,11 +13,11 @@
class Query:
@strawberry.field
def primary_dataset(self, info: Info[Context, None]) -> Dataset:
return Dataset(name=info.context.model.primary_dataset.name)
return to_gql_dataset(info.context.model.primary_dataset)

@strawberry.field
def reference_dataset(self, info: Info[Context, None]) -> Dataset:
return Dataset(name=info.context.model.reference_dataset.name)
return to_gql_dataset(info.context.model.reference_dataset)

@strawberry.field
def model(self) -> Model:
Expand Down
17 changes: 17 additions & 0 deletions src/phoenix/server/api/types/Dataset.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,23 @@
from datetime import datetime

import strawberry

from phoenix.core.datasets import Dataset as InternalDataset


# GraphQL representation of a dataset, exposing its name and the time
# range spanned by its inferences. Kept as comments (not a docstring)
# so the published GraphQL schema description is unchanged.
@strawberry.type
class Dataset:
    name: str
    # Earliest inference timestamp in the dataset (maps to startTime).
    start_time: datetime
    # Latest inference timestamp in the dataset (maps to endTime).
    end_time: datetime


def to_gql_dataset(dataset: InternalDataset) -> Dataset:
    """
    Converts a phoenix.core.datasets.Dataset to a
    phoenix.server.api.types.Dataset, copying over the name and the
    start/end inference timestamps.
    """
    return Dataset(
        name=dataset.name,
        start_time=dataset.start_time,
        end_time=dataset.end_time,
    )
2 changes: 1 addition & 1 deletion src/phoenix/session/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import Optional

import phoenix.config as config
from phoenix.datasets import Dataset
from phoenix.core.datasets import Dataset
from phoenix.services import AppService

try:
Expand Down
4 changes: 2 additions & 2 deletions tests/datasets/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,13 @@
from pandas import DataFrame, to_datetime
from pytest import LogCaptureFixture, raises

from phoenix.datasets.dataset import (
from phoenix.core.datasets.dataset import (
Dataset,
EmbeddingColumnNames,
Schema,
_parse_dataframe_and_schema,
)
from phoenix.datasets.errors import DatasetError
from phoenix.core.datasets.errors import DatasetError


class TestParseDataFrameAndSchema:
Expand Down
2 changes: 1 addition & 1 deletion tests/datasets/test_schema.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from phoenix.datasets import EmbeddingColumnNames, Schema
from phoenix.core.datasets import EmbeddingColumnNames, Schema


def test_json_serialization():
Expand Down
2 changes: 1 addition & 1 deletion tests/metrics/embeddings/test_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import pandas as pd
import pytest

from phoenix.datasets import Dataset, EmbeddingColumnNames, Schema
from phoenix.core.datasets import Dataset, EmbeddingColumnNames, Schema
from phoenix.metrics.embeddings import euclidean_distance


Expand Down
34 changes: 34 additions & 0 deletions tests/server/api/types/test_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import pytest
from pandas import DataFrame, Timestamp

from phoenix.core.datasets import Dataset as CoreDataset
from phoenix.core.datasets import Schema
from phoenix.server.api.types import Dataset


@pytest.fixture
def core_dataset():
    """A minimal CoreDataset of three labeled, timestamped predictions."""
    timestamps = [
        Timestamp(year=2023, month=1, day=1, hour=2, second=30),
        Timestamp(year=2023, month=1, day=5, hour=4, second=25),
        Timestamp(year=2023, month=1, day=10, hour=6, second=20),
    ]
    dataframe = DataFrame(
        {
            "prediction_label": ["apple", "orange", "grape"],
            "timestamp": timestamps,
        }
    )
    schema = Schema(
        prediction_label_column_name="prediction_label",
        timestamp_column_name="timestamp",
    )
    return CoreDataset(dataframe=dataframe, schema=schema)


def test_dataset_serialization(core_dataset):
    """The GraphQL conversion preserves the dataset's start and end times."""
    gql_dataset = Dataset.to_gql_dataset(core_dataset)

    assert gql_dataset.start_time == core_dataset.start_time
    assert gql_dataset.end_time == core_dataset.end_time