Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: sort dataframe rows by time #231

Merged
merged 6 commits into from
Feb 9, 2023
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 17 additions & 3 deletions src/phoenix/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def __init__(
name: Optional[str] = None,
persist_to_disc: bool = True,
):
dataframe = dataframe.reset_index()
Copy link
Contributor Author

@axiomofjoy axiomofjoy Feb 8, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If an input dataframe's index contains one of the columns defined in the schema, our current validation code fails with a missing-column error, because that column lives in `dataframe.index` rather than in `dataframe.columns`. Calling `reset_index()` first moves any index levels back into regular columns so validation can see them.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This makes sense. We don't need to keep any index that we don't use.

errors = validate_dataset_inputs(
dataframe=dataframe,
schema=schema,
Expand All @@ -58,9 +59,10 @@ def __init__(
for e in errors:
logger.error(e)
raise err.DatasetError(errors)
parsed_dataframe, parsed_schema = _parse_dataframe_and_schema(dataframe, schema)
self.__dataframe: DataFrame = parsed_dataframe
self.__schema: Schema = parsed_schema
dataframe, schema = _parse_dataframe_and_schema(dataframe, schema)
dataframe = _add_timestamp_index_and_sort(dataframe, schema)
self.__dataframe: DataFrame = dataframe
self.__schema: Schema = schema
self.__name: str = name if name is not None else f"""dataset_{str(uuid.uuid4())}"""
self.__directory: str = os.path.join(dataset_dir, self.name)

Expand Down Expand Up @@ -454,3 +456,15 @@ def _create_and_normalize_dataframe_and_schema(
parsed_dataframe[pred_col_name] = parsed_dataframe[pred_col_name].astype(str)

return parsed_dataframe, parsed_schema


def _add_timestamp_index_and_sort(dataframe: DataFrame, schema: Schema) -> DataFrame:
    """
    Re-indexes the dataframe on the schema's timestamp column and orders the
    rows by that index.

    The timestamp column is moved from the columns into the index (pandas'
    default ``set_index`` behavior), and a new dataframe is returned rather
    than modifying the input in place.

    Raises:
        ValueError: if the schema does not specify a timestamp column name.
    """
    ts_col = schema.timestamp_column_name
    if ts_col is not None:
        # Both calls return new frames, so the caller's dataframe is untouched.
        return dataframe.set_index(ts_col).sort_index()
    raise ValueError("Schema must specify a timestamp column name.")