From 7519f66fe7c31b327c790d861fe3d35f00392cf8 Mon Sep 17 00:00:00 2001 From: axiomofjoy Date: Wed, 8 Feb 2023 00:42:27 -0800 Subject: [PATCH 1/5] sort by time and index --- src/phoenix/datasets/dataset.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/phoenix/datasets/dataset.py b/src/phoenix/datasets/dataset.py index ae00626678..deb81215e6 100644 --- a/src/phoenix/datasets/dataset.py +++ b/src/phoenix/datasets/dataset.py @@ -50,6 +50,7 @@ def __init__( name: Optional[str] = None, persist_to_disc: bool = True, ): + dataframe = dataframe.reset_index() errors = validate_dataset_inputs( dataframe=dataframe, schema=schema, @@ -58,9 +59,10 @@ def __init__( for e in errors: logger.error(e) raise err.DatasetError(errors) - parsed_dataframe, parsed_schema = _parse_dataframe_and_schema(dataframe, schema) - self.__dataframe: DataFrame = parsed_dataframe - self.__schema: Schema = parsed_schema + dataframe, schema = _parse_dataframe_and_schema(dataframe, schema) + dataframe = _add_timestamp_index_and_sort(dataframe, schema) + self.__dataframe: DataFrame = dataframe + self.__schema: Schema = schema self.__name: str = name if name is not None else f"""dataset_{str(uuid.uuid4())}""" self.__directory: str = os.path.join(dataset_dir, self.name) @@ -454,3 +456,15 @@ def _create_and_normalize_dataframe_and_schema( parsed_dataframe[pred_col_name] = parsed_dataframe[pred_col_name].astype(str) return parsed_dataframe, parsed_schema + + +def _add_timestamp_index_and_sort(dataframe: DataFrame, schema: Schema) -> DataFrame: + """ + Adds timestamp index and sorts dataframe by timestamp. + """ + timestamp_column_name = schema.timestamp_column_name + if timestamp_column_name is None: + raise ValueError("Schema must specify a timestamp column name.") + dataframe = dataframe.set_index(timestamp_column_name) + dataframe = dataframe.sort_index() + return dataframe From 9ee1d3bcf380c47a4e0e376ee5b8e8b2fcbf6b27 Mon Sep 17 00:00:00 2001 From: axiomofjoy Date: Wed, 8 Feb 2023 15:54:46 -0800 Subject: [PATCH 2/5] sort rows of dataframe by timestamp --- src/phoenix/datasets/dataset.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/phoenix/datasets/dataset.py b/src/phoenix/datasets/dataset.py index deb81215e6..3d1dde5cfd 100644 --- a/src/phoenix/datasets/dataset.py +++ b/src/phoenix/datasets/dataset.py @@ -50,7 +50,6 @@ def __init__( name: Optional[str] = None, persist_to_disc: bool = True, ): - dataframe = dataframe.reset_index() errors = validate_dataset_inputs( dataframe=dataframe, schema=schema, @@ -60,7 +59,7 @@ def __init__( logger.error(e) raise err.DatasetError(errors) dataframe, schema = _parse_dataframe_and_schema(dataframe, schema) - dataframe = _add_timestamp_index_and_sort(dataframe, schema) + dataframe = _sort_dataframe_rows_by_timestamp(dataframe, schema) self.__dataframe: DataFrame = dataframe self.__schema: Schema = schema self.__name: str = name if name is not None else f"""dataset_{str(uuid.uuid4())}""" @@ -458,13 +457,12 @@ def _create_and_normalize_dataframe_and_schema( return parsed_dataframe, parsed_schema -def _add_timestamp_index_and_sort(dataframe: DataFrame, schema: Schema) -> DataFrame: +def _sort_dataframe_rows_by_timestamp(dataframe: DataFrame, schema: Schema) -> DataFrame: """ - Adds timestamp index and sorts dataframe by timestamp. + Sorts dataframe rows by timestamp. """ timestamp_column_name = schema.timestamp_column_name if timestamp_column_name is None: raise ValueError("Schema must specify a timestamp column name.") - dataframe = dataframe.set_index(timestamp_column_name) - dataframe = dataframe.sort_index() + dataframe = dataframe.sort_values(by=[timestamp_column_name]) return dataframe From 9f63b0fd21db489c3daa5e8b1a98b6573a4957b7 Mon Sep 17 00:00:00 2001 From: axiomofjoy Date: Wed, 8 Feb 2023 18:21:09 -0800 Subject: [PATCH 3/5] fix tests --- src/phoenix/datasets/dataset.py | 18 +++++++++--------- tests/datasets/test_dataset.py | 3 ++- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/phoenix/datasets/dataset.py b/src/phoenix/datasets/dataset.py index 3d1dde5cfd..8681ac676e 100644 --- a/src/phoenix/datasets/dataset.py +++ b/src/phoenix/datasets/dataset.py @@ -6,7 +6,7 @@ from dataclasses import fields, replace from datetime import datetime from functools import cached_property -from typing import Any, Dict, List, Optional, Set, Tuple, Union, cast +from typing import Any, Dict, List, Optional, Set, Tuple, Union from pandas import DataFrame, Series, Timestamp, read_parquet, to_datetime from pandas.api.types import is_numeric_dtype @@ -50,6 +50,7 @@ def __init__( name: Optional[str] = None, persist_to_disc: bool = True, ): + dataframe = dataframe.reset_index() errors = validate_dataset_inputs( dataframe=dataframe, schema=schema, @@ -59,7 +60,7 @@ def __init__( logger.error(e) raise err.DatasetError(errors) dataframe, schema = _parse_dataframe_and_schema(dataframe, schema) - dataframe = _sort_dataframe_rows_by_timestamp(dataframe, schema) + dataframe = _add_timestamp_index_and_sort_by_time(dataframe, schema) self.__dataframe: DataFrame = dataframe self.__schema: Schema = schema self.__name: str = name if name is not None else f"""dataset_{str(uuid.uuid4())}""" @@ -78,15 +79,13 @@ def __init__( @cached_property def start_time(self) -> datetime: """Returns the datetime of the earliest inference in the dataset""" - timestamp_col_name: str = cast(str, self.schema.timestamp_column_name) - start_datetime: datetime = self.__dataframe[timestamp_col_name].min() + start_datetime: datetime = self.dataframe.index.min() return start_datetime @cached_property def end_time(self) -> datetime: """Returns the datetime of the latest inference in the dataset""" - timestamp_col_name: str = cast(str, self.schema.timestamp_column_name) - end_datetime: datetime = self.__dataframe[timestamp_col_name].max() + end_datetime: datetime = self.dataframe.index.max() return end_datetime @property @@ -457,12 +456,13 @@ def _create_and_normalize_dataframe_and_schema( return parsed_dataframe, parsed_schema -def _sort_dataframe_rows_by_timestamp(dataframe: DataFrame, schema: Schema) -> DataFrame: +def _add_timestamp_index_and_sort_by_time(dataframe: DataFrame, schema: Schema) -> DataFrame: """ - Sorts dataframe rows by timestamp. + Adds timestamp index and sorts dataframe rows by timestamp. """ timestamp_column_name = schema.timestamp_column_name if timestamp_column_name is None: raise ValueError("Schema must specify a timestamp column name.") - dataframe = dataframe.sort_values(by=[timestamp_column_name]) + dataframe = dataframe.set_index(keys=[timestamp_column_name]) + dataframe = dataframe.sort_index() return dataframe diff --git a/tests/datasets/test_dataset.py b/tests/datasets/test_dataset.py index 0e6292b445..4b2edab17d 100644 --- a/tests/datasets/test_dataset.py +++ b/tests/datasets/test_dataset.py @@ -715,12 +715,13 @@ def test_dataset_normalization_columns_already_normalized(self): feature_column_names=["feature0"], prediction_label_column_name="prediction_label", ) + expected_dataframe = input_dataframe.set_index(keys=["timestamp"]) dataset = Dataset(dataframe=input_dataframe, schema=input_schema) output_dataframe = dataset.dataframe output_schema = dataset.schema - assert output_dataframe.equals(input_dataframe) + assert output_dataframe.equals(expected_dataframe) assert output_schema == input_schema # TODO: Move validation tests to validation module; keep one validation integration test From cefde5ddbf315ee41535a58387a59907bc70af72 Mon Sep 17 00:00:00 2001 From: axiomofjoy Date: Wed, 8 Feb 2023 18:37:24 -0800 Subject: [PATCH 4/5] remove index --- src/phoenix/datasets/dataset.py | 11 ++++++----- tests/datasets/test_dataset.py | 3 +-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/phoenix/datasets/dataset.py b/src/phoenix/datasets/dataset.py index c4a894756c..1c334b3908 100644 --- a/src/phoenix/datasets/dataset.py +++ b/src/phoenix/datasets/dataset.py @@ -7,7 +7,7 @@ from datetime import datetime from enum import Enum from functools import cached_property -from typing import Any, Dict, List, Optional, Set, Tuple, Union +from typing import Any, Dict, List, Optional, Set, Tuple, Union, cast from pandas import DataFrame, Series, Timestamp, read_parquet, to_datetime from pandas.api.types import is_numeric_dtype @@ -80,13 +80,15 @@ def __init__( @cached_property def start_time(self) -> datetime: """Returns the datetime of the earliest inference in the dataset""" - start_datetime: datetime = self.dataframe.index.min() + timestamp_col_name: str = cast(str, self.schema.timestamp_column_name) + start_datetime: datetime = self.__dataframe[timestamp_col_name].min() return start_datetime @cached_property def end_time(self) -> datetime: """Returns the datetime of the latest inference in the dataset""" - end_datetime: datetime = self.dataframe.index.max() + timestamp_col_name: str = cast(str, self.schema.timestamp_column_name) + end_datetime: datetime = self.__dataframe[timestamp_col_name].max() return end_datetime @property @@ -469,6 +471,5 @@ def _add_timestamp_index_and_sort_by_time(dataframe: DataFrame, schema: Schema) timestamp_column_name = schema.timestamp_column_name if timestamp_column_name is None: raise ValueError("Schema must specify a timestamp column name.") - dataframe = dataframe.set_index(keys=[timestamp_column_name]) - dataframe = dataframe.sort_index() + dataframe = dataframe.sort_values(by=[timestamp_column_name]) return dataframe diff --git a/tests/datasets/test_dataset.py b/tests/datasets/test_dataset.py index 4b2edab17d..0e6292b445 100644 --- a/tests/datasets/test_dataset.py +++ b/tests/datasets/test_dataset.py @@ -715,13 +715,12 @@ def test_dataset_normalization_columns_already_normalized(self): feature_column_names=["feature0"], prediction_label_column_name="prediction_label", ) - expected_dataframe = input_dataframe.set_index(keys=["timestamp"]) dataset = Dataset(dataframe=input_dataframe, schema=input_schema) output_dataframe = dataset.dataframe output_schema = dataset.schema - assert output_dataframe.equals(expected_dataframe) + assert output_dataframe.equals(input_dataframe) assert output_schema == input_schema # TODO: Move validation tests to validation module; keep one validation integration test From c842fdc5910390d13e7e83b000119f1fe7f1852d Mon Sep 17 00:00:00 2001 From: axiomofjoy Date: Wed, 8 Feb 2023 18:40:28 -0800 Subject: [PATCH 5/5] correct function name and docstring --- src/phoenix/datasets/dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/phoenix/datasets/dataset.py b/src/phoenix/datasets/dataset.py index 1c334b3908..f2dd72c449 100644 --- a/src/phoenix/datasets/dataset.py +++ b/src/phoenix/datasets/dataset.py @@ -61,7 +61,7 @@ def __init__( logger.error(e) raise err.DatasetError(errors) dataframe, schema = _parse_dataframe_and_schema(dataframe, schema) - dataframe = _add_timestamp_index_and_sort_by_time(dataframe, schema) + dataframe = _sort_dataframe_rows_by_timestamp(dataframe, schema) self.__dataframe: DataFrame = dataframe self.__schema: Schema = schema self.__name: str = name if name is not None else f"""dataset_{str(uuid.uuid4())}""" @@ -464,9 +464,9 @@ class DatasetType(Enum): REFERENCE = 1 -def _add_timestamp_index_and_sort_by_time(dataframe: DataFrame, schema: Schema) -> DataFrame: +def _sort_dataframe_rows_by_timestamp(dataframe: DataFrame, schema: Schema) -> DataFrame: """ - Adds timestamp index and sorts dataframe rows by timestamp. + Sorts dataframe rows by timestamp. """ timestamp_column_name = schema.timestamp_column_name if timestamp_column_name is None: