Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: circular imports and name input #92

Merged
merged 2 commits into from
Dec 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 15 additions & 7 deletions src/phoenix/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,13 @@
from numpy import fromstring
from pandas import DataFrame, Series, read_csv, read_hdf, read_parquet

import phoenix.datasets.errors as err
from phoenix.config import dataset_dir
from phoenix.datasets import EmbeddingColumnNames, Schema
from phoenix.datasets.validation import validate_dataset_inputs
from phoenix.utils import is_url, parse_file_format, parse_filename

from . import errors as err
from .schema import EmbeddingColumnNames, Schema
from .validation import validate_dataset_inputs

SUPPORTED_URL_FORMATS = sorted(["hdf", "csv"])

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -53,6 +54,7 @@ def __init__(self, dataframe: DataFrame, schema: Schema, name: Optional[str] = N
self.__dataframe: DataFrame = parsed_dataframe
self.__schema: Schema = schema
self.__name: str = name if name is not None else f"""dataset_{str(uuid.uuid4())}"""
self.__directory: str = os.path.join(dataset_dir, self.name)
logger.info(f"""Dataset: {self.__name} initialized""")

@property
Expand All @@ -70,7 +72,7 @@ def name(self) -> str:
@property
def directory(self) -> str:
"""The directory under which the dataset metadata is stored"""
return os.path.join(dataset_dir, self.name)
return self.__directory
Comment on lines -73 to +75
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice.


def head(self, num_rows: Optional[int] = 5) -> DataFrame:
num_rows = 5 if num_rows is None else num_rows
Expand Down Expand Up @@ -182,20 +184,26 @@ def from_hdf(
return cls(df, schema, name)

@classmethod
def from_url(cls, url_path: str, schema: Schema, hdf_key: Optional[str] = None) -> "Dataset":
def from_url(
cls,
url_path: str,
schema: Schema,
name: Optional[str] = None,
hdf_key: Optional[str] = None,
) -> "Dataset":
if not is_url(url_path):
raise ValueError("Invalid url")
file_format = parse_file_format(url_path)
if file_format == ".csv":
return cls.from_csv(url_path, schema)
return cls.from_csv(url_path, schema, name)
elif file_format == ".hdf5" or file_format == ".hdf":
filename = parse_filename(url_path)
with tempfile.TemporaryDirectory() as temp_dir:
local_file_path = os.path.join(temp_dir, filename)
print(f"Downloading file: {filename}")
request.urlretrieve(url_path, local_file_path, show_progress)
print("\n")
return cls.from_hdf(local_file_path, schema, hdf_key)
return cls.from_hdf(local_file_path, schema, name, hdf_key)
raise ValueError(
f"File format {file_format} not supported. Currently supported "
f"formats are: {', '.join(SUPPORTED_URL_FORMATS)}."
Expand Down
4 changes: 2 additions & 2 deletions src/phoenix/datasets/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@

from pandas import DataFrame

import phoenix.datasets.errors as err
from phoenix.datasets import Schema
from . import errors as err
from .schema import Schema


def validate_dataset_inputs(dataframe: DataFrame, schema: Schema) -> List[err.ValidationError]:
Expand Down
3 changes: 2 additions & 1 deletion src/phoenix/pointcloud/projectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
from umap import UMAP # type: ignore

from phoenix.datasets import Dataset
from phoenix.pointcloud.pointcloud import (

from .pointcloud import (
Cluster,
Coordinates,
Coordinates2D,
Expand Down
3 changes: 1 addition & 2 deletions tests/datasets/test_schema.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from phoenix.datasets.schema import EmbeddingColumnNames, Schema
from phoenix.datasets import EmbeddingColumnNames, Schema


def test_json_serialization():
Expand All @@ -11,7 +11,6 @@ def test_json_serialization():

# serialize and deserialize.
p = s.to_json()
print(p)
schema_from_json = Schema.from_json(p)

assert schema_from_json.embedding_feature_column_names is not None
Expand Down
3 changes: 1 addition & 2 deletions tests/metrics/embeddings/test_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@
import pandas as pd
import pytest

from phoenix.datasets import Dataset
from phoenix.datasets.schema import EmbeddingColumnNames, Schema
from phoenix.datasets import Dataset, EmbeddingColumnNames, Schema
from phoenix.metrics.embeddings import euclidean_distance


Expand Down