From e50e4324f357a65f1fc3484b5acb507d930a99f6 Mon Sep 17 00:00:00 2001
From: nate-mar <67926244+nate-mar@users.noreply.github.com>
Date: Wed, 25 Jan 2023 20:42:38 -0800
Subject: [PATCH 01/16] wip

---
 app/schema.graphql                      |  7 +++++++
 app/src/App.tsx                         |  4 ++++
 src/phoenix/datasets/dataset.py         | 11 ++++++++++-
 src/phoenix/server/api/schema.py        |  6 +++---
 src/phoenix/server/api/types/Dataset.py | 15 +++++++++++++++
 5 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/app/schema.graphql b/app/schema.graphql
index d1111aa200..ed4291b0fb 100644
--- a/app/schema.graphql
+++ b/app/schema.graphql
@@ -1,5 +1,12 @@
+"""
+A date-time string at UTC, such as 2007-12-03T10:15:30Z, compliant with the `date-time` format outlined in section 5.6 of the RFC 3339 profile of the ISO 8601 standard for representation of dates and times using the Gregorian calendar.
+"""
+scalar DateTime
+
 type Dataset {
   name: String!
+  startTime: DateTime
+  endTime: DateTime
 }
 
 type Dimension implements Node {
diff --git a/app/src/App.tsx b/app/src/App.tsx
index 46c9539cb7..db2c6a07e9 100644
--- a/app/src/App.tsx
+++ b/app/src/App.tsx
@@ -18,9 +18,13 @@ const RootQuery = graphql`
   query AppRootQuery {
     primaryDataset {
       name
+      startTime
+      endTime
     }
     referenceDataset {
       name
+      startTime
+      endTime
     }
   }
 `;
diff --git a/src/phoenix/datasets/dataset.py b/src/phoenix/datasets/dataset.py
index ac6649546a..4ba03d2d17 100644
--- a/src/phoenix/datasets/dataset.py
+++ b/src/phoenix/datasets/dataset.py
@@ -1,3 +1,4 @@
+import datetime
 import logging
 import os
 import sys
@@ -32,7 +33,7 @@
 logger.setLevel(logging.INFO)
 
 
-class Dataset:
+class dataset:
     """
     A dataset represents data for a set of inferences. It is represented as a dataframe + schema
     """
@@ -72,6 +73,14 @@ def __init__(
         self.to_disc()
         logger.info(f"""Dataset: {self.__name} initialized""")
 
+    @property
+    def start_time(self) -> datetime.datetime:
+        return self.__dataframe[self.schema.timestamp_column_name].min()
+
+    @property
+    def end_time(self) -> datetime.datetime:
+        return self.__dataframe[self.schema.timestamp_column_name].max()
+
     @property
     def dataframe(self) -> DataFrame:
         return self.__dataframe
diff --git a/src/phoenix/server/api/schema.py b/src/phoenix/server/api/schema.py
index 4e15727602..56ca70a597 100644
--- a/src/phoenix/server/api/schema.py
+++ b/src/phoenix/server/api/schema.py
@@ -2,7 +2,7 @@
 from strawberry.types import Info
 
 from .context import Context
-from .types.Dataset import Dataset
+from .types.Dataset import Dataset, to_gql_dataset
 from .types.Dimension import to_gql_dimension
 from .types.EmbeddingDimension import to_gql_embedding_dimension
 from .types.Model import Model
@@ -13,11 +13,11 @@ class Query:
     @strawberry.field
     def primary_dataset(self, info: Info[Context, None]) -> Dataset:
-        return Dataset(name=info.context.model.primary_dataset.name)
+        return to_gql_dataset(info.context.model.primary_dataset)
 
     @strawberry.field
     def reference_dataset(self, info: Info[Context, None]) -> Dataset:
-        return Dataset(name=info.context.model.reference_dataset.name)
+        return to_gql_dataset(info.context.model.reference_dataset)
 
     @strawberry.field
     def model(self) -> Model:
diff --git a/src/phoenix/server/api/types/Dataset.py b/src/phoenix/server/api/types/Dataset.py
index 55dd406d00..21862952a7 100644
--- a/src/phoenix/server/api/types/Dataset.py
+++ b/src/phoenix/server/api/types/Dataset.py
@@ -1,6 +1,21 @@
+from datetime import datetime
+from src.phoenix.datasets import Dataset as CoreDataset
 import strawberry
 
 
 @strawberry.type
 class Dataset:
     name: str
+    start_time: datetime
+    end_time: datetime
+
+
+def to_gql_dataset(dataset: CoreDataset) -> Dataset:
+    """
+    Converts a phoenix.core.Dimension to a phoenix.server.api.types.Dimension
+    """
+    return Dataset(
+        name=dataset.name,
+        start_time=dataset.start_time,
+        end_time=dataset.end_time,
+    )

From b9014b51b2c6bb757fb9c911288c5f6c05b1b4d2 Mon Sep 17 00:00:00 2001
From: nate-mar <67926244+nate-mar@users.noreply.github.com>
Date: Wed, 25 Jan 2023 20:48:06 -0800
Subject: [PATCH 02/16] comments and variable name change

---
 src/phoenix/datasets/dataset.py         | 10 ++++++----
 src/phoenix/server/api/types/Dataset.py |  6 +++---
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/phoenix/datasets/dataset.py b/src/phoenix/datasets/dataset.py
index 4ba03d2d17..38f86aa3b5 100644
--- a/src/phoenix/datasets/dataset.py
+++ b/src/phoenix/datasets/dataset.py
@@ -1,4 +1,4 @@
-import datetime
+from datetime import datetime
 import logging
 import os
 import sys
@@ -33,7 +33,7 @@
 logger.setLevel(logging.INFO)
 
 
-class dataset:
+class Dataset:
     """
     A dataset represents data for a set of inferences. It is represented as a dataframe + schema
     """
@@ -74,11 +74,13 @@ def __init__(
         logger.info(f"""Dataset: {self.__name} initialized""")
 
     @property
-    def start_time(self) -> datetime.datetime:
+    def start_time(self) -> datetime:
+        """Returns the datetime of the earliest inference in the dataset"""
         return self.__dataframe[self.schema.timestamp_column_name].min()
 
     @property
-    def end_time(self) -> datetime.datetime:
+    def end_time(self) -> datetime:
+        """Returns the datetime of the latest inference in the dataset"""
         return self.__dataframe[self.schema.timestamp_column_name].max()
 
     @property
diff --git a/src/phoenix/server/api/types/Dataset.py b/src/phoenix/server/api/types/Dataset.py
index 21862952a7..77f4063e6a 100644
--- a/src/phoenix/server/api/types/Dataset.py
+++ b/src/phoenix/server/api/types/Dataset.py
@@ -1,5 +1,5 @@
 from datetime import datetime
-from src.phoenix.datasets import Dataset as CoreDataset
+from src.phoenix.datasets import Dataset as PhoenixDataset
 import strawberry
 
 
@@ -10,9 +10,9 @@ class Dataset:
     end_time: datetime
 
 
-def to_gql_dataset(dataset: CoreDataset) -> Dataset:
+def to_gql_dataset(dataset: PhoenixDataset) -> Dataset:
     """
-    Converts a phoenix.core.Dimension to a phoenix.server.api.types.Dimension
+    Converts a phoenix.datasets.Dataset to a phoenix.server.api.types.Dataset
     """
     return Dataset(
         name=dataset.name,

From 962bdd9814fc31364ee2a1d4b8dfdd6faa20da82 Mon Sep 17 00:00:00 2001
From: nate-mar <67926244+nate-mar@users.noreply.github.com>
Date: Wed, 25 Jan 2023 21:30:13 -0800
Subject: [PATCH 03/16] proposed refactor

---
 src/phoenix/config.py                         |  2 +-
 src/phoenix/{ => core}/datasets/__init__.py   |  0
 src/phoenix/{ => core}/datasets/dataset.py    |  0
 src/phoenix/{ => core}/datasets/errors.py     |  0
 src/phoenix/{ => core}/datasets/schema.py     |  0
 src/phoenix/{ => core}/datasets/validation.py |  0
 src/phoenix/core/model.py                     |  4 +--
 src/phoenix/metrics/embeddings.py             |  2 +-
 src/phoenix/pointcloud/projectors.py          |  2 +-
 src/phoenix/server/api/types/Dataset.py       |  4 +--
 src/phoenix/session/session.py                |  2 +-
 tests/datasets/test_dataset.py                |  4 +--
 tests/datasets/test_schema.py                 |  2 +-
 tests/metrics/embeddings/test_embeddings.py   |  2 +-
 tests/server/api/types/test_dataset.py        | 34 +++++++++++++++++++
 15 files changed, 46 insertions(+), 12 deletions(-)
 rename src/phoenix/{ => core}/datasets/__init__.py (100%)
 rename src/phoenix/{ => core}/datasets/dataset.py (100%)
 rename src/phoenix/{ => core}/datasets/errors.py (100%)
 rename src/phoenix/{ => core}/datasets/schema.py (100%)
 rename src/phoenix/{ => core}/datasets/validation.py (100%)
 create mode 100644 tests/server/api/types/test_dataset.py

diff --git a/src/phoenix/config.py b/src/phoenix/config.py
index ba9c578dbb..0e7d074202 100644
--- a/src/phoenix/config.py
+++ b/src/phoenix/config.py
@@ -39,7 +39,7 @@ def get_pids_path() -> str:
 PHOENIX_DIR = os.path.dirname(os.path.abspath(__file__))
 ROOT_DIR = os.path.join("~", ".phoenix")
-dataset_dir = normalize_path(os.path.join(ROOT_DIR, "datasets"))
+dataset_dir = normalize_path(os.path.join(ROOT_DIR, "core/datasets"))
 
 # Server config
 server_dir = os.path.join(PHOENIX_DIR, "server")
diff --git a/src/phoenix/datasets/__init__.py b/src/phoenix/core/datasets/__init__.py
similarity index 100%
rename from src/phoenix/datasets/__init__.py
rename to src/phoenix/core/datasets/__init__.py
diff --git a/src/phoenix/datasets/dataset.py b/src/phoenix/core/datasets/dataset.py
similarity index 100%
rename from src/phoenix/datasets/dataset.py
rename to src/phoenix/core/datasets/dataset.py
diff --git a/src/phoenix/datasets/errors.py b/src/phoenix/core/datasets/errors.py
similarity index 100%
rename from src/phoenix/datasets/errors.py
rename to src/phoenix/core/datasets/errors.py
diff --git a/src/phoenix/datasets/schema.py b/src/phoenix/core/datasets/schema.py
similarity index 100%
rename from src/phoenix/datasets/schema.py
rename to src/phoenix/core/datasets/schema.py
diff --git a/src/phoenix/datasets/validation.py b/src/phoenix/core/datasets/validation.py
similarity index 100%
rename from src/phoenix/datasets/validation.py
rename to src/phoenix/core/datasets/validation.py
diff --git a/src/phoenix/core/model.py b/src/phoenix/core/model.py
index ae85cb4d83..9333c07654 100644
--- a/src/phoenix/core/model.py
+++ b/src/phoenix/core/model.py
@@ -2,8 +2,8 @@
 from pandas.api.types import is_numeric_dtype, is_object_dtype
 
-from phoenix.datasets import Dataset
-from phoenix.datasets.schema import EmbeddingFeatures
+from phoenix.core.datasets import Dataset
+from phoenix.core.datasets.schema import EmbeddingFeatures
 
 from .dimension import Dimension
 from .dimension_data_type import DimensionDataType
diff --git a/src/phoenix/metrics/embeddings.py b/src/phoenix/metrics/embeddings.py
index 2eb2fbe1ab..97b6a2dd3b 100644
--- a/src/phoenix/metrics/embeddings.py
+++ b/src/phoenix/metrics/embeddings.py
@@ -3,7 +3,7 @@
 import numpy as np
 import pandas as pd
 
-from phoenix.datasets import Dataset
+from phoenix.core.datasets import Dataset
 
 
 def euclidean_distance(
diff --git a/src/phoenix/pointcloud/projectors.py b/src/phoenix/pointcloud/projectors.py
index 81515561b7..7ad173d427 100644
--- a/src/phoenix/pointcloud/projectors.py
+++ b/src/phoenix/pointcloud/projectors.py
@@ -6,7 +6,7 @@
 from numpy.typing import ArrayLike
 from umap import UMAP
 
-from phoenix.datasets import Dataset
+from phoenix.core.datasets import Dataset
 
 from .pointcloud import (
     Cluster,
diff --git a/src/phoenix/server/api/types/Dataset.py b/src/phoenix/server/api/types/Dataset.py
index 77f4063e6a..88879b726a 100644
--- a/src/phoenix/server/api/types/Dataset.py
+++ b/src/phoenix/server/api/types/Dataset.py
@@ -1,5 +1,5 @@
 from datetime import datetime
-from src.phoenix.datasets import Dataset as PhoenixDataset
+from phoenix.core.datasets import Dataset as InternalDataset
 import strawberry
 
 
@@ -10,7 +10,7 @@ class Dataset:
     end_time: datetime
 
 
-def to_gql_dataset(dataset: PhoenixDataset) -> Dataset:
+def to_gql_dataset(dataset: InternalDataset) -> Dataset:
     """
     Converts a phoenix.datasets.Dataset to a phoenix.server.api.types.Dataset
     """
diff --git a/src/phoenix/session/session.py b/src/phoenix/session/session.py
index a456ae8a99..5493cd2820 100644
--- a/src/phoenix/session/session.py
+++ b/src/phoenix/session/session.py
@@ -2,7 +2,7 @@
 from typing import Optional
 
 import phoenix.config as config
-from phoenix.datasets import Dataset
+from phoenix.core.datasets import Dataset
 from phoenix.services import AppService
 
 try:
diff --git a/tests/datasets/test_dataset.py b/tests/datasets/test_dataset.py
index 066c4bc360..bff54bc494 100644
--- a/tests/datasets/test_dataset.py
+++ b/tests/datasets/test_dataset.py
@@ -11,13 +11,13 @@
 from pandas import DataFrame, to_datetime
 from pytest import LogCaptureFixture, raises
 
-from phoenix.datasets.dataset import (
+from phoenix.core.datasets.dataset import (
     Dataset,
     EmbeddingColumnNames,
     Schema,
     _parse_dataframe_and_schema,
 )
-from phoenix.datasets.errors import DatasetError
+from phoenix.core.datasets.errors import DatasetError
 
 
 class TestParseDataFrameAndSchema:
diff --git a/tests/datasets/test_schema.py b/tests/datasets/test_schema.py
index d1d2b0bd0e..1e37f387df 100644
--- a/tests/datasets/test_schema.py
+++ b/tests/datasets/test_schema.py
@@ -1,4 +1,4 @@
-from phoenix.datasets import EmbeddingColumnNames, Schema
+from phoenix.core.datasets import EmbeddingColumnNames, Schema
 
 
 def test_json_serialization():
diff --git a/tests/metrics/embeddings/test_embeddings.py b/tests/metrics/embeddings/test_embeddings.py
index 9c25fac2f2..27c7e32022 100644
--- a/tests/metrics/embeddings/test_embeddings.py
+++ b/tests/metrics/embeddings/test_embeddings.py
@@ -6,7 +6,7 @@
 import pandas as pd
 import pytest
 
-from phoenix.datasets import Dataset, EmbeddingColumnNames, Schema
+from phoenix.core.datasets import Dataset, EmbeddingColumnNames, Schema
 
 from phoenix.metrics.embeddings import euclidean_distance
 
diff --git a/tests/server/api/types/test_dataset.py b/tests/server/api/types/test_dataset.py
new file mode 100644
index 0000000000..ed281265b1
--- /dev/null
+++ b/tests/server/api/types/test_dataset.py
@@ -0,0 +1,34 @@
+import pytest
+
+from phoenix.server.api.types import Dataset
+from phoenix.core.datasets import Schema, Dataset as CoreDataset
+
+from pandas import DataFrame, Timestamp
+
+
+@pytest.fixture
+def core_dataset():
+    input_df = DataFrame(
+        {
+            "prediction_label": ["apple", "orange", "grape"],
+            "timestamp": [
+                Timestamp(year=2023, month=1, day=1, hour=2, second=30),
+                Timestamp(year=2023, month=1, day=5, hour=4, second=25),
+                Timestamp(year=2023, month=1, day=10, hour=6, second=20),
+            ]
+        }
+    )
+
+    input_schema = Schema(
+        prediction_label_column_name="prediction_label",
+        timestamp_column_name="timestamp",
+    )
+    return CoreDataset(dataframe=input_df, schema=input_schema)
+
+
+def test_dataset_serialization(core_dataset):
+    converted_gql_dataset = Dataset.to_gql_dataset(core_dataset)
+
+    expected_dataset = core_dataset
+    assert converted_gql_dataset.start_time == expected_dataset.start_time
+    assert converted_gql_dataset.end_time == expected_dataset.end_time

From 13f3b9542245a31a24ccbbc78b7a8d43021fb2e0 Mon Sep 17 00:00:00 2001
From: nate-mar <67926244+nate-mar@users.noreply.github.com>
Date: Wed, 25 Jan 2023 21:39:36 -0800
Subject: [PATCH 04/16] formatting

---
 src/phoenix/core/datasets/dataset.py    | 2 +-
 src/phoenix/server/api/types/Dataset.py | 4 +++-
 tests/server/api/types/test_dataset.py  | 8 ++++----
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/phoenix/core/datasets/dataset.py b/src/phoenix/core/datasets/dataset.py
index 38f86aa3b5..520cea6cf9 100644
--- a/src/phoenix/core/datasets/dataset.py
+++ b/src/phoenix/core/datasets/dataset.py
@@ -1,10 +1,10 @@
-from datetime import datetime
 import logging
 import os
 import sys
 import uuid
 from copy import deepcopy
 from dataclasses import fields, replace
+from datetime import datetime
 from typing import Any, Dict, List, Optional, Set, Tuple, Union
 
 from pandas import DataFrame, Series, Timestamp, read_parquet, to_datetime
diff --git a/src/phoenix/server/api/types/Dataset.py b/src/phoenix/server/api/types/Dataset.py
index 88879b726a..f501856df7 100644
--- a/src/phoenix/server/api/types/Dataset.py
+++ b/src/phoenix/server/api/types/Dataset.py
@@ -1,7 +1,9 @@
 from datetime import datetime
-from phoenix.core.datasets import Dataset as InternalDataset
+
 import strawberry
 
+from phoenix.core.datasets import Dataset as InternalDataset
+
 
 @strawberry.type
 class Dataset:
diff --git a/tests/server/api/types/test_dataset.py b/tests/server/api/types/test_dataset.py
index ed281265b1..9dfedea67f 100644
--- a/tests/server/api/types/test_dataset.py
+++ b/tests/server/api/types/test_dataset.py
@@ -1,9 +1,9 @@
 import pytest
+from pandas import DataFrame, Timestamp
+
+from phoenix.core.datasets import Dataset as CoreDataset
+from phoenix.core.datasets import Schema
 from phoenix.server.api.types import Dataset
-from phoenix.core.datasets import Schema, Dataset as CoreDataset
-
-from pandas import DataFrame, Timestamp
 
 
 @pytest.fixture

From 3f6a198e261d650454b7c91925d921aea79838a6 Mon Sep 17 00:00:00 2001
From: nate-mar <67926244+nate-mar@users.noreply.github.com>
Date: Wed, 25 Jan 2023 21:56:22 -0800
Subject: [PATCH 05/16] revert folder move

---
 src/phoenix/config.py                         |  2 +-
 src/phoenix/core/model.py                     |  4 +--
 src/phoenix/{core => }/datasets/__init__.py   |  0
 src/phoenix/{core => }/datasets/dataset.py    |  0
 src/phoenix/{core => }/datasets/errors.py     |  0
 src/phoenix/{core => }/datasets/schema.py     |  0
 src/phoenix/{core => }/datasets/validation.py |  0
 src/phoenix/metrics/embeddings.py             |  2 +-
 src/phoenix/pointcloud/projectors.py          |  2 +-
 src/phoenix/server/api/types/Dataset.py       |  2 +-
 src/phoenix/session/session.py                |  2 +-
 tests/datasets/test_dataset.py                | 27 +++++++++++++++++--
 tests/datasets/test_schema.py                 |  2 +-
 tests/metrics/embeddings/test_embeddings.py   |  2 +-
 tests/server/api/types/test_dataset.py        |  4 +--
 15 files changed, 36 insertions(+), 13 deletions(-)
 rename src/phoenix/{core => }/datasets/__init__.py (100%)
 rename src/phoenix/{core => }/datasets/dataset.py (100%)
 rename src/phoenix/{core => }/datasets/errors.py (100%)
 rename src/phoenix/{core => }/datasets/schema.py (100%)
 rename src/phoenix/{core => }/datasets/validation.py (100%)

diff --git a/src/phoenix/config.py b/src/phoenix/config.py
index 0e7d074202..ba9c578dbb 100644
--- a/src/phoenix/config.py
+++ b/src/phoenix/config.py
@@ -39,7 +39,7 @@ def get_pids_path() -> str:
 PHOENIX_DIR = os.path.dirname(os.path.abspath(__file__))
 ROOT_DIR = os.path.join("~", ".phoenix")
-dataset_dir = normalize_path(os.path.join(ROOT_DIR, "core/datasets"))
+dataset_dir = normalize_path(os.path.join(ROOT_DIR, "datasets"))
 
 # Server config
 server_dir = os.path.join(PHOENIX_DIR, "server")
diff --git a/src/phoenix/core/model.py b/src/phoenix/core/model.py
index 9333c07654..ae85cb4d83 100644
--- a/src/phoenix/core/model.py
+++ b/src/phoenix/core/model.py
@@ -2,8 +2,8 @@
 from pandas.api.types import is_numeric_dtype, is_object_dtype
 
-from phoenix.core.datasets import Dataset
-from phoenix.core.datasets.schema import EmbeddingFeatures
+from phoenix.datasets import Dataset
+from phoenix.datasets.schema import EmbeddingFeatures
 
 from .dimension import Dimension
 from .dimension_data_type import DimensionDataType
diff --git a/src/phoenix/core/datasets/__init__.py b/src/phoenix/datasets/__init__.py
similarity index 100%
rename from src/phoenix/core/datasets/__init__.py
rename to src/phoenix/datasets/__init__.py
diff --git a/src/phoenix/core/datasets/dataset.py b/src/phoenix/datasets/dataset.py
similarity index 100%
rename from src/phoenix/core/datasets/dataset.py
rename to src/phoenix/datasets/dataset.py
diff --git a/src/phoenix/core/datasets/errors.py b/src/phoenix/datasets/errors.py
similarity index 100%
rename from src/phoenix/core/datasets/errors.py
rename to src/phoenix/datasets/errors.py
diff --git a/src/phoenix/core/datasets/schema.py b/src/phoenix/datasets/schema.py
similarity index 100%
rename from src/phoenix/core/datasets/schema.py
rename to src/phoenix/datasets/schema.py
diff --git a/src/phoenix/core/datasets/validation.py b/src/phoenix/datasets/validation.py
similarity index 100%
rename from src/phoenix/core/datasets/validation.py
rename to src/phoenix/datasets/validation.py
diff --git a/src/phoenix/metrics/embeddings.py b/src/phoenix/metrics/embeddings.py
index 97b6a2dd3b..2eb2fbe1ab 100644
--- a/src/phoenix/metrics/embeddings.py
+++ b/src/phoenix/metrics/embeddings.py
@@ -3,7 +3,7 @@
 import numpy as np
 import pandas as pd
 
-from phoenix.core.datasets import Dataset
+from phoenix.datasets import Dataset
 
 
 def euclidean_distance(
diff --git a/src/phoenix/pointcloud/projectors.py b/src/phoenix/pointcloud/projectors.py
index 7ad173d427..81515561b7 100644
--- a/src/phoenix/pointcloud/projectors.py
+++ b/src/phoenix/pointcloud/projectors.py
@@ -6,7 +6,7 @@
 from numpy.typing import ArrayLike
 from umap import UMAP
 
-from phoenix.core.datasets import Dataset
+from phoenix.datasets import Dataset
 
 from .pointcloud import (
     Cluster,
diff --git a/src/phoenix/server/api/types/Dataset.py b/src/phoenix/server/api/types/Dataset.py
index f501856df7..cf05c0f9dc 100644
--- a/src/phoenix/server/api/types/Dataset.py
+++ b/src/phoenix/server/api/types/Dataset.py
@@ -2,7 +2,7 @@
 
 import strawberry
 
-from phoenix.core.datasets import Dataset as InternalDataset
+from phoenix.datasets import Dataset as InternalDataset
 
 
 @strawberry.type
diff --git a/src/phoenix/session/session.py b/src/phoenix/session/session.py
index 5493cd2820..a456ae8a99 100644
--- a/src/phoenix/session/session.py
+++ b/src/phoenix/session/session.py
@@ -2,7 +2,7 @@
 from typing import Optional
 
 import phoenix.config as config
-from phoenix.core.datasets import Dataset
+from phoenix.datasets import Dataset
 from phoenix.services import AppService
 
 try:
diff --git a/tests/datasets/test_dataset.py b/tests/datasets/test_dataset.py
index bff54bc494..0e6292b445 100644
--- a/tests/datasets/test_dataset.py
+++ b/tests/datasets/test_dataset.py
@@ -11,13 +11,13 @@
 from pandas import DataFrame, to_datetime
 from pytest import LogCaptureFixture, raises
 
-from phoenix.core.datasets.dataset import (
+from phoenix.datasets.dataset import (
     Dataset,
     EmbeddingColumnNames,
     Schema,
     _parse_dataframe_and_schema,
 )
-from phoenix.core.datasets.errors import DatasetError
+from phoenix.datasets.errors import DatasetError
 
 
 class TestParseDataFrameAndSchema:
@@ -801,6 +801,29 @@ def test_dataset_validate_invalid_schema_excludes_prediction_id(self) -> None:
         with raises(DatasetError):
             Dataset(dataframe=input_df, schema=input_schema)
 
+    def test_dataset_bookends(self) -> None:
+        expected_start_time = pd.Timestamp(year=2023, month=1, day=1, hour=2, second=30)
+        expected_end_time = pd.Timestamp(year=2023, month=1, day=10, hour=6, second=20)
+        input_df = DataFrame(
+            {
+                "prediction_label": ["apple", "orange", "grape"],
+                "timestamp": [
+                    expected_end_time,
+                    expected_start_time,
+                    pd.Timestamp(year=2023, month=1, day=5, hour=4, second=25),
+                ],
+            }
+        )
+
+        input_schema = Schema(
+            prediction_label_column_name="prediction_label",
+            timestamp_column_name="timestamp",
+        )
+        output_dataset = Dataset(dataframe=input_df, schema=input_schema)
+
+        assert output_dataset.start_time == expected_start_time
+        assert output_dataset.end_time == expected_end_time
+
     @property
     def num_records(self):
         return self._NUM_RECORDS
diff --git a/tests/datasets/test_schema.py b/tests/datasets/test_schema.py
index 1e37f387df..d1d2b0bd0e 100644
--- a/tests/datasets/test_schema.py
+++ b/tests/datasets/test_schema.py
@@ -1,4 +1,4 @@
-from phoenix.core.datasets import EmbeddingColumnNames, Schema
+from phoenix.datasets import EmbeddingColumnNames, Schema
 
 
 def test_json_serialization():
diff --git a/tests/metrics/embeddings/test_embeddings.py b/tests/metrics/embeddings/test_embeddings.py
index 27c7e32022..9c25fac2f2 100644
--- a/tests/metrics/embeddings/test_embeddings.py
+++ b/tests/metrics/embeddings/test_embeddings.py
@@ -6,7 +6,7 @@
 import pandas as pd
 import pytest
 
-from phoenix.core.datasets import Dataset, EmbeddingColumnNames, Schema
+from phoenix.datasets import Dataset, EmbeddingColumnNames, Schema
 
 from phoenix.metrics.embeddings import euclidean_distance
 
diff --git a/tests/server/api/types/test_dataset.py b/tests/server/api/types/test_dataset.py
index 9dfedea67f..aaa27fcef8 100644
--- a/tests/server/api/types/test_dataset.py
+++ b/tests/server/api/types/test_dataset.py
@@ -1,8 +1,8 @@
 import pytest
 from pandas import DataFrame, Timestamp
 
-from phoenix.core.datasets import Dataset as CoreDataset
-from phoenix.core.datasets import Schema
+from phoenix.datasets import Dataset as CoreDataset
+from phoenix.datasets import Schema
 from phoenix.server.api.types import Dataset

From 77d9df4277f463db33d29118077f8b1d3413b06d Mon Sep 17 00:00:00 2001
From: nate-mar <67926244+nate-mar@users.noreply.github.com>
Date: Wed, 25 Jan 2023 21:59:12 -0800
Subject: [PATCH 06/16] rename test variables

---
 tests/server/api/types/test_dataset.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/server/api/types/test_dataset.py b/tests/server/api/types/test_dataset.py
index aaa27fcef8..9f612d4c7e 100644
--- a/tests/server/api/types/test_dataset.py
+++ b/tests/server/api/types/test_dataset.py
@@ -1,13 +1,13 @@
 import pytest
 from pandas import DataFrame, Timestamp
 
-from phoenix.datasets import Dataset as CoreDataset
+from phoenix.datasets import Dataset as InputDataset
 from phoenix.datasets import Schema
 from phoenix.server.api.types import Dataset
 
 
 @pytest.fixture
-def core_dataset():
+def input_dataset():
     input_df = DataFrame(
         {
             "prediction_label": ["apple", "orange", "grape"],
@@ -23,12 +23,12 @@ def input_dataset():
         prediction_label_column_name="prediction_label",
         timestamp_column_name="timestamp",
     )
-    return CoreDataset(dataframe=input_df, schema=input_schema)
+    return InputDataset(dataframe=input_df, schema=input_schema)
 
 
-def test_dataset_serialization(core_dataset):
-    converted_gql_dataset = Dataset.to_gql_dataset(core_dataset)
+def test_dataset_serialization(input_dataset):
+    converted_gql_dataset = Dataset.to_gql_dataset(input_dataset)
 
-    expected_dataset = core_dataset
+    expected_dataset = input_dataset
     assert converted_gql_dataset.start_time == expected_dataset.start_time
     assert converted_gql_dataset.end_time == expected_dataset.end_time

From bfe7ed25274ae9b7ca715a5883c53514b155358a Mon Sep 17 00:00:00 2001
From: nate-mar <67926244+nate-mar@users.noreply.github.com>
Date: Wed, 25 Jan 2023 22:16:56 -0800
Subject: [PATCH 07/16] Update AppRootQuery.graphql.ts

---
 app/src/__generated__/AppRootQuery.graphql.ts | 26 ++++++++++++++++---
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/app/src/__generated__/AppRootQuery.graphql.ts b/app/src/__generated__/AppRootQuery.graphql.ts
index 31578f4e8c..a7ca24588d 100644
--- a/app/src/__generated__/AppRootQuery.graphql.ts
+++ b/app/src/__generated__/AppRootQuery.graphql.ts
@@ -1,5 +1,5 @@
 /**
- * @generated SignedSource<<9c85aa6b1076be42ab12314c83978e59>>
+ * @generated SignedSource<<3319c722f670a28782c8e3f59c8947d3>>
  * @lightSyntaxTransform
  * @nogrep
  */
@@ -12,10 +12,14 @@ import { ConcreteRequest, Query } from 'relay-runtime';
 export type AppRootQuery$variables = {};
 export type AppRootQuery$data = {
   readonly primaryDataset: {
+    readonly endTime: any | null;
     readonly name: string;
+    readonly startTime: any | null;
   };
   readonly referenceDataset: {
+    readonly endTime: any | null;
     readonly name: string;
+    readonly startTime: any | null;
   };
 };
 export type AppRootQuery = {
@@ -31,6 +35,20 @@ var v0 = [
     "kind": "ScalarField",
     "name": "name",
     "storageKey": null
+  },
+  {
+    "alias": null,
+    "args": null,
+    "kind": "ScalarField",
+    "name": "startTime",
+    "storageKey": null
+  },
+  {
+    "alias": null,
+    "args": null,
+    "kind": "ScalarField",
+    "name": "endTime",
+    "storageKey": null
   }
 ],
 v1 = [
@@ -73,16 +91,16 @@ return {
     "selections": (v1/*: any*/)
   },
   "params": {
-    "cacheID": "a29bf65580844ece1a908f8ede2cf3a0",
+    "cacheID": "d4307243a1ca8536ced464cf7c3359ce",
     "id": null,
     "metadata": {},
     "name": "AppRootQuery",
     "operationKind": "query",
-    "text": "query AppRootQuery {\n  primaryDataset {\n    name\n  }\n  referenceDataset {\n    name\n  }\n}\n"
+    "text": "query AppRootQuery {\n  primaryDataset {\n    name\n    startTime\n    endTime\n  }\n  referenceDataset {\n    name\n    startTime\n    endTime\n  }\n}\n"
   }
 };
 })();
 
-(node as any).hash = "f77baf0de9c7173d430a629e89533a72";
+(node as any).hash = "2f1366a7b6eae055b970061fb3e965b7";
 
 export default node;

From cee67966d1f10090d33a74f24fc49fb0f8218e5c Mon Sep 17 00:00:00 2001
From: nate-mar <67926244+nate-mar@users.noreply.github.com>
Date: Wed, 25 Jan 2023 22:26:04 -0800
Subject: [PATCH 08/16] implicit typecast

---
 src/phoenix/datasets/dataset.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/phoenix/datasets/dataset.py b/src/phoenix/datasets/dataset.py
index 520cea6cf9..4523bdf695 100644
--- a/src/phoenix/datasets/dataset.py
+++ b/src/phoenix/datasets/dataset.py
@@ -76,12 +76,14 @@ def __init__(
     @property
     def start_time(self) -> datetime:
         """Returns the datetime of the earliest inference in the dataset"""
-        return self.__dataframe[self.schema.timestamp_column_name].min()
+        dt: datetime = self.__dataframe[self.schema.timestamp_column_name].min()
+        return dt
 
     @property
     def end_time(self) -> datetime:
         """Returns the datetime of the latest inference in the dataset"""
-        return self.__dataframe[self.schema.timestamp_column_name].max()
+        dt: datetime = self.__dataframe[self.schema.timestamp_column_name].max()
+        return dt
 
     @property
     def dataframe(self) -> DataFrame:

From bfe257c939a1766c7393e6607824c04ff5456602 Mon Sep 17 00:00:00 2001
From: nate-mar <67926244+nate-mar@users.noreply.github.com>
Date: Wed, 25 Jan 2023 22:34:28 -0800
Subject: [PATCH 09/16] fix types

---
 app/schema.graphql                            |  4 ++--
 app/src/__generated__/AppRootQuery.graphql.ts | 10 +++++-----
 src/phoenix/datasets/dataset.py               |  6 ++++--
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/app/schema.graphql b/app/schema.graphql
index ed4291b0fb..d4e24f524b 100644
--- a/app/schema.graphql
+++ b/app/schema.graphql
@@ -5,8 +5,8 @@ scalar DateTime
 
 type Dataset {
   name: String!
-  startTime: DateTime
-  endTime: DateTime
+  startTime: DateTime!
+  endTime: DateTime!
 }
 
 type Dimension implements Node {
diff --git a/app/src/__generated__/AppRootQuery.graphql.ts b/app/src/__generated__/AppRootQuery.graphql.ts
index a7ca24588d..22357723c7 100644
--- a/app/src/__generated__/AppRootQuery.graphql.ts
+++ b/app/src/__generated__/AppRootQuery.graphql.ts
@@ -1,5 +1,5 @@
 /**
- * @generated SignedSource<<3319c722f670a28782c8e3f59c8947d3>>
+ * @generated SignedSource<<89597fa86d05c567e72b09660ea9fa73>>
  * @lightSyntaxTransform
  * @nogrep
  */
@@ -12,14 +12,14 @@ import { ConcreteRequest, Query } from 'relay-runtime';
 export type AppRootQuery$variables = {};
 export type AppRootQuery$data = {
   readonly primaryDataset: {
-    readonly endTime: any | null;
+    readonly endTime: any;
     readonly name: string;
-    readonly startTime: any | null;
+    readonly startTime: any;
   };
   readonly referenceDataset: {
-    readonly endTime: any | null;
+    readonly endTime: any;
     readonly name: string;
-    readonly startTime: any | null;
+    readonly startTime: any;
   };
 };
 export type AppRootQuery = {
diff --git a/src/phoenix/datasets/dataset.py b/src/phoenix/datasets/dataset.py
index 4523bdf695..c4810e0ce5 100644
--- a/src/phoenix/datasets/dataset.py
+++ b/src/phoenix/datasets/dataset.py
@@ -76,13 +76,15 @@ def __init__(
     @property
     def start_time(self) -> datetime:
         """Returns the datetime of the earliest inference in the dataset"""
-        dt: datetime = self.__dataframe[self.schema.timestamp_column_name].min()
+        ts_col_name: str = self.schema.timestamp_column_name
+        dt: datetime = self.__dataframe[ts_col_name].min()
         return dt
 
     @property
     def end_time(self) -> datetime:
         """Returns the datetime of the latest inference in the dataset"""
-        dt: datetime = self.__dataframe[self.schema.timestamp_column_name].max()
+        ts_col_name: str = self.schema.timestamp_column_name
+        dt: datetime = self.__dataframe[ts_col_name].max()
         return dt
 
     @property

From 3859376106e0bc1c6df41b845b920821d5af4dd2 Mon Sep 17 00:00:00 2001
From: nate-mar <67926244+nate-mar@users.noreply.github.com>
Date: Wed, 25 Jan 2023 22:45:21 -0800
Subject: [PATCH 10/16] cast for now

---
 src/phoenix/datasets/dataset.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/phoenix/datasets/dataset.py b/src/phoenix/datasets/dataset.py
index c4810e0ce5..c76492a31b 100644
--- a/src/phoenix/datasets/dataset.py
+++ b/src/phoenix/datasets/dataset.py
@@ -5,7 +5,7 @@
 from copy import deepcopy
 from dataclasses import fields, replace
 from datetime import datetime
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, Union, cast
 
 from pandas import DataFrame, Series, Timestamp, read_parquet, to_datetime
 from pandas.api.types import is_numeric_dtype
@@ -76,14 +76,14 @@ def __init__(
     @property
     def start_time(self) -> datetime:
         """Returns the datetime of the earliest inference in the dataset"""
-        ts_col_name: str = self.schema.timestamp_column_name
+        ts_col_name: str = cast(str, self.schema.timestamp_column_name)
         dt: datetime = self.__dataframe[ts_col_name].min()
         return dt
 
     @property
     def end_time(self) -> datetime:
         """Returns the datetime of the latest inference in the dataset"""
-        ts_col_name: str = self.schema.timestamp_column_name
+        ts_col_name: str = cast(str, self.schema.timestamp_column_name)
         dt: datetime = self.__dataframe[ts_col_name].max()
         return dt
 
     @property

From 1c46785b9e9655ec25f06ab7071186e5042d2448 Mon Sep 17 00:00:00 2001
From: nate-mar <67926244+nate-mar@users.noreply.github.com>
Date: Wed, 25 Jan 2023 22:50:33 -0800
Subject: [PATCH 11/16] updte datetime type

---
 app/relay.config.js                           |  1 +
 app/src/__generated__/AppRootQuery.graphql.ts | 10 +++++-----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/app/relay.config.js b/app/relay.config.js
index 8e7283b121..4be5c9283f 100644
--- a/app/relay.config.js
+++ b/app/relay.config.js
@@ -7,5 +7,6 @@ module.exports = {
   noFutureProofEnums: true,
   customScalars: {
     GlobalID: "String",
+    "DateTime": "string",
   },
 };
diff --git a/app/src/__generated__/AppRootQuery.graphql.ts b/app/src/__generated__/AppRootQuery.graphql.ts
index 22357723c7..227e399e67 100644
--- a/app/src/__generated__/AppRootQuery.graphql.ts
+++ b/app/src/__generated__/AppRootQuery.graphql.ts
@@ -1,5 +1,5 @@
 /**
- * @generated SignedSource<<89597fa86d05c567e72b09660ea9fa73>>
+ * @generated SignedSource<<265bf9796d295f06d4d16444d3d3b686>>
  * @lightSyntaxTransform
  * @nogrep
  */
@@ -12,14 +12,14 @@ import { ConcreteRequest, Query } from 'relay-runtime';
 export type AppRootQuery$variables = {};
 export type AppRootQuery$data = {
   readonly primaryDataset: {
-    readonly endTime: any;
+    readonly endTime: string;
     readonly name: string;
-    readonly startTime: any;
+    readonly startTime: string;
   };
   readonly referenceDataset: {
-    readonly endTime: any;
+    readonly endTime: string;
     readonly name: string;
-    readonly startTime: any;
+    readonly startTime: string;
   };
 };
 export type AppRootQuery = {

From 6b133c58fd707e917b6e25215dc538579a45e84d Mon Sep 17 00:00:00 2001
From: nate-mar <67926244+nate-mar@users.noreply.github.com>
Date: Wed, 25 Jan 2023 22:55:48 -0800
Subject: [PATCH 12/16] Update schema.graphql

---
 app/schema.graphql | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/app/schema.graphql b/app/schema.graphql
index d4e24f524b..19f52a3466 100644
--- a/app/schema.graphql
+++ b/app/schema.graphql
@@ -1,6 +1,4 @@
-"""
-A date-time string at UTC, such as 2007-12-03T10:15:30Z, compliant with the `date-time` format outlined in section 5.6 of the RFC 3339 profile of the ISO 8601 standard for representation of dates and times using the Gregorian calendar.
-"""
+"""Date with time (isoformat)"""
 scalar DateTime
 
 type Dataset {

From 3db71b690479b1367e9c13c948e6752ed0b65915 Mon Sep 17 00:00:00 2001
From: nate-mar <67926244+nate-mar@users.noreply.github.com>
Date: Wed, 25 Jan 2023 23:13:57 -0800
Subject: [PATCH 13/16] a few minor updates

---
 app/relay.config.js                           |  2 +-
 app/schema.graphql                            |  6 +++---
 app/src/__generated__/AppRootQuery.graphql.ts | 10 +++++-----
 src/phoenix/datasets/dataset.py               | 14 +++++++-------
 4 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/app/relay.config.js b/app/relay.config.js
index 4be5c9283f..a0cab7b5cd 100644
--- a/app/relay.config.js
+++ b/app/relay.config.js
@@ -7,6 +7,6 @@ module.exports = {
   noFutureProofEnums: true,
   customScalars: {
     GlobalID: "String",
-    "DateTime": "string",
+    "DateTime": "String",
   },
 };
diff --git a/app/schema.graphql b/app/schema.graphql
index 19f52a3466..d2c3a1a0cd 100644
--- a/app/schema.graphql
+++ b/app/schema.graphql
@@ -1,12 +1,12 @@
-"""Date with time (isoformat)"""
-scalar DateTime
-
 type Dataset {
   name: String!
   startTime: DateTime!
   endTime: DateTime!
 }
 
+"""Date with time (isoformat)"""
+scalar DateTime
+
 type Dimension implements Node {
   id: GlobalID!
   name: String!
diff --git a/app/src/__generated__/AppRootQuery.graphql.ts b/app/src/__generated__/AppRootQuery.graphql.ts
index 227e399e67..f47f4910f8 100644
--- a/app/src/__generated__/AppRootQuery.graphql.ts
+++ b/app/src/__generated__/AppRootQuery.graphql.ts
@@ -1,5 +1,5 @@
 /**
- * @generated SignedSource<<265bf9796d295f06d4d16444d3d3b686>>
+ * @generated SignedSource<<9f4785e848cb71e3425a6411f521e229>>
  * @lightSyntaxTransform
  * @nogrep
  */
@@ -12,14 +12,14 @@ import { ConcreteRequest, Query } from 'relay-runtime';
 export type AppRootQuery$variables = {};
 export type AppRootQuery$data = {
   readonly primaryDataset: {
-    readonly endTime: string;
+    readonly endTime: String;
     readonly name: string;
-    readonly startTime: string;
+    readonly startTime: String;
   };
   readonly referenceDataset: {
-    readonly endTime: string;
+    readonly endTime: String;
     readonly name: string;
-    readonly startTime: string;
+    readonly startTime: String;
   };
 };
 export type AppRootQuery = {
diff --git a/src/phoenix/datasets/dataset.py b/src/phoenix/datasets/dataset.py
index c76492a31b..16002bc3f5 100644
--- a/src/phoenix/datasets/dataset.py
+++ b/src/phoenix/datasets/dataset.py
@@ -73,19 +73,19 @@ def __init__(
         self.to_disc()
         logger.info(f"""Dataset: {self.__name} initialized""")
 
-    @property
+    @cached_property
     def start_time(self) -> datetime:
         """Returns the datetime of the earliest inference in the dataset"""
-        ts_col_name: str = cast(str, self.schema.timestamp_column_name)
-        dt: datetime = self.__dataframe[ts_col_name].min()
-        return dt
+        timestamp_col_name: str = cast(str, self.schema.timestamp_column_name)
+        start_datetime: datetime = self.__dataframe[timestamp_col_name].min()
+        return start_datetime
 
     @property
     def end_time(self) -> datetime:
         """Returns the datetime of the latest inference in the dataset"""
-        ts_col_name: str = cast(str, self.schema.timestamp_column_name)
-        dt: datetime = self.__dataframe[ts_col_name].max()
-        return dt
+        timestamp_col_name: str = cast(str, self.schema.timestamp_column_name)
+        end_datetime: datetime = self.__dataframe[timestamp_col_name].max()
+        return end_datetime
 
     @property
     def dataframe(self) -> DataFrame:

From a47bb207712c78a19e0697e38e148dd8fb50397d Mon Sep 17 00:00:00 2001
From: nate-mar <67926244+nate-mar@users.noreply.github.com>
Date: Wed, 25 Jan 2023 23:17:51 -0800
Subject: [PATCH 14/16] Update dataset.py

---
 src/phoenix/datasets/dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/phoenix/datasets/dataset.py b/src/phoenix/datasets/dataset.py
index 16002bc3f5..f69bf0f6a0 100644
--- a/src/phoenix/datasets/dataset.py
+++ b/src/phoenix/datasets/dataset.py
@@ -73,7 +73,7 @@ def __init__(
         self.to_disc()
         logger.info(f"""Dataset: {self.__name} initialized""")
 
-    @cached_property
+    @property
     def start_time(self) -> datetime:
         """Returns the datetime of the earliest inference in the dataset"""
         timestamp_col_name: str = cast(str, self.schema.timestamp_column_name)

From da3e96bd524962b734a4259907e4f7a936212b25 Mon Sep 17 00:00:00 2001
From: nate-mar <67926244+nate-mar@users.noreply.github.com>
Date: Thu, 26 Jan 2023 14:38:06 -0800
Subject: [PATCH 15/16] Update dataset.py

---
 src/phoenix/datasets/dataset.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/phoenix/datasets/dataset.py b/src/phoenix/datasets/dataset.py
index f69bf0f6a0..bc29a4aadd 100644
--- a/src/phoenix/datasets/dataset.py
+++ b/src/phoenix/datasets/dataset.py
@@ -11,6 +11,7 @@
 from pandas.api.types import is_numeric_dtype
 
 from phoenix.config import dataset_dir
+from functools import cached_property
 
 from . import errors as err
 from .schema import (
@@ -73,14 +74,14 @@ def __init__(
         self.to_disc()
         logger.info(f"""Dataset: {self.__name} initialized""")
 
-    @property
+    @cached_property
     def start_time(self) -> datetime:
         """Returns the datetime of the earliest inference in the dataset"""
         timestamp_col_name: str = cast(str, self.schema.timestamp_column_name)
         start_datetime: datetime = self.__dataframe[timestamp_col_name].min()
         return start_datetime
 
-    @property
+    @cached_property
     def end_time(self) -> datetime:
         """Returns the datetime of the latest inference in the dataset"""
         timestamp_col_name: str = cast(str, self.schema.timestamp_column_name)

From 158135a6a8425fd42f5a1e4df6629b40b0ddb2b0 Mon Sep 17 00:00:00 2001
From: nate-mar <67926244+nate-mar@users.noreply.github.com>
Date: Thu, 26 Jan 2023 14:48:50 -0800
Subject: [PATCH 16/16] Update dataset.py

---
 src/phoenix/datasets/dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/phoenix/datasets/dataset.py b/src/phoenix/datasets/dataset.py
index bc29a4aadd..ae00626678 100644
--- a/src/phoenix/datasets/dataset.py
+++ b/src/phoenix/datasets/dataset.py
@@ -5,13 +5,13 @@
 from copy import deepcopy
 from dataclasses import fields, replace
 from datetime import datetime
+from functools import cached_property
 from typing import Any, Dict, List, Optional, Set, Tuple, Union, cast
 
 from pandas import DataFrame, Series, Timestamp, read_parquet, to_datetime
 from pandas.api.types import is_numeric_dtype
 
 from phoenix.config import dataset_dir
-from functools import cached_property
 
 from . import errors as err
 from .schema import (
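
For reference, a minimal usage sketch of the behavior this series adds, mirroring the test_dataset_bookends test introduced in PATCH 05/16. It is not part of any patch above and assumes the series' final state, where Dataset and Schema are importable from phoenix.datasets and start_time/end_time are cached properties derived from the schema's timestamp column:

    from pandas import DataFrame, Timestamp

    from phoenix.datasets import Dataset, Schema

    # Three inferences with out-of-order timestamps; start_time/end_time
    # resolve to the earliest and latest values in the timestamp column.
    df = DataFrame(
        {
            "prediction_label": ["apple", "orange", "grape"],
            "timestamp": [
                Timestamp(year=2023, month=1, day=10, hour=6, second=20),
                Timestamp(year=2023, month=1, day=1, hour=2, second=30),
                Timestamp(year=2023, month=1, day=5, hour=4, second=25),
            ],
        }
    )
    schema = Schema(
        prediction_label_column_name="prediction_label",
        timestamp_column_name="timestamp",
    )
    dataset = Dataset(dataframe=df, schema=schema)

    assert dataset.start_time == Timestamp(year=2023, month=1, day=1, hour=2, second=30)
    assert dataset.end_time == Timestamp(year=2023, month=1, day=10, hour=6, second=20)

On the GraphQL side, the same values are exposed through the non-null startTime and endTime DateTime fields on the Dataset type queried by AppRootQuery.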