Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a new shape field to ColumnSchema #195

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
724bebc
Improve robustness of test for converting schemas to dataframes
karlhigley Jan 6, 2023
7d03b5f
Add a new `shape` field to `ColumnSchema`
karlhigley Jan 6, 2023
6a8e772
Move `Shape` to `merlin.dtypes` and add tests
karlhigley Jan 18, 2023
9123161
Compute `is_list` and `is_ragged` from `ColumnSchema.shape`
karlhigley Feb 3, 2023
97d0463
Remove shape from dtype when translating across frameworks
karlhigley Jan 19, 2023
f45d2c4
Make the default `Shape()` represent unknown shapes
karlhigley Jan 19, 2023
ffa9d19
Ignore shapes when validating operator output dtypes
karlhigley Jan 19, 2023
9399742
Fall back to the existing shape if there is one
karlhigley Feb 3, 2023
c491a63
Remove `Shape.fixed` property
karlhigley Feb 3, 2023
74f0436
Insert missing f-string
karlhigley Feb 3, 2023
327c62a
Use `DType.without_shape`
karlhigley Feb 3, 2023
2df78c0
Merge branch 'main' into feature/comprehensive-shapes
karlhigley Feb 3, 2023
423a9b9
Make `None` shorthand for a dimension with unknown or unbounded min/max
karlhigley Feb 6, 2023
da2f677
Use whatever shape info is provided to fill in the rest
karlhigley Feb 6, 2023
eae9882
Remove the value count min/max test
karlhigley Feb 6, 2023
00c7be2
Fix stray linter error
karlhigley Feb 6, 2023
967a43e
Merge branch 'main' into feature/comprehensive-shapes
karlhigley Feb 6, 2023
6e8160f
Minor test fix
karlhigley Feb 6, 2023
c05f8ba
Disable validation that shape info is provided when `is_ragged=False`
karlhigley Feb 6, 2023
5f8ebc5
Add few convenience methods to `Shape`
karlhigley Feb 6, 2023
8f3b4f6
Update `ColumnSchema.with_*` methods to clear existing shape info
karlhigley Feb 6, 2023
44faf95
Drop shapes from dtypes in `ColumnSchema` constructor
karlhigley Feb 6, 2023
b4291c1
Fix `with_dtype` so dtypes don't overwrite the shape
karlhigley Feb 6, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion merlin/core/dispatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,7 +415,10 @@ def parquet_writer_dispatch(df: DataFrameLike, path=None, **kwargs):
elif cudf is not None:
_cls = cudf.io.parquet.ParquetWriter
else:
ValueError("Unable to load cudf. Please check your environment GPU and cudf available.")
raise ValueError(
"Unable to load cudf. "
"Please check that your environment has GPU(s) and cudf available."
)

if not path:
return _cls
Expand Down
5 changes: 3 additions & 2 deletions merlin/dag/executors.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,8 +196,9 @@ def _transform_data(self, node, input_data, capture_dtypes=False):
if (
output_col_schema.dtype
and output_data_schema.dtype
and output_col_schema.dtype != md.string
and output_col_schema.dtype != output_data_schema.dtype
and output_col_schema.dtype.without_shape != md.string
and output_col_schema.dtype.without_shape
!= output_data_schema.dtype.without_shape
):
raise TypeError(
f"Dtype discrepancy detected for column {col_name}: "
Expand Down
54 changes: 51 additions & 3 deletions merlin/dtypes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,12 @@
# limitations under the License.
#

from dataclasses import dataclass
from dataclasses import dataclass, replace
from enum import Enum
from typing import Optional
from typing import Optional, Tuple, Union

from merlin.dtypes.registry import _dtype_registry
from merlin.dtypes.shape import Shape


class ElementType(Enum):
Expand Down Expand Up @@ -69,6 +70,11 @@ class DType:
element_size: Optional[int] = None
element_unit: Optional[ElementUnit] = None
signed: Optional[bool] = None
shape: Optional[Shape] = None

def __post_init__(self):
if not self.shape:
object.__setattr__(self, "shape", Shape())

def to(self, mapping_name: str):
"""
Expand Down Expand Up @@ -103,7 +109,7 @@ def to(self, mapping_name: str):
) from exc

try:
return mapping.from_merlin(self)
return mapping.from_merlin(self.without_shape)
except KeyError as exc:
raise ValueError(
f"The registered dtype mapping for {mapping_name} doesn't contain type {self.name}."
Expand All @@ -125,3 +131,45 @@ def is_integer(self):
@property
def is_float(self):
return self.element_type.value == "float"

def with_shape(self, shape: Union[Tuple, Shape]):
"""
Create a copy of this dtype with a new shape

Parameters
----------
shape : Union[Tuple, Shape]
Object to set as shape of dtype, must be either a tuple or Shape.

Returns
-------
DType
A copy of this dtype containing the provided shape value

Raises
------
TypeError
If value is not either a tuple or a Shape
"""
if isinstance(shape, tuple):
shape = Shape(shape)

if not isinstance(shape, Shape):
raise TypeError(
f"Provided value {shape} (of type {type(shape)}) for DType.shape property "
"is not of type Shape."
)

return replace(self, shape=shape)

@property
def without_shape(self):
"""
Create a copy of this object without the shape

Returns
-------
DType
A copy of this object with the shape removed
"""
return self.with_shape(Shape())
143 changes: 143 additions & 0 deletions merlin/dtypes/shape.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
#
# Copyright (c) 2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from dataclasses import dataclass
from typing import Optional, Tuple, Union


@dataclass(frozen=True)
class Dimension:
"""
The range of potential sizes for a single dimension of a field or column
"""

min: int = 0
max: Optional[int] = None

def __post_init__(self):
if self.min is None:
raise ValueError("The minimum size of a dimension cannot be None. ")

if self.min < 0:
raise ValueError(
"The minimum size of a dimension must be non-negative. " f"Provided min: {self.min}"
)

if self.max and self.max < 0:
raise ValueError(
"The maximum size of a dimension must be at least one. " f"Provided max: {self.max}"
)

if self.max and self.max < self.min:
raise ValueError(
"The maximum size of a dimension must be at least as large as the minimum size. "
f"Provided min: {self.min} max: {self.max}"
)

@property
def is_bounded(self):
return self.max is not None

@property
def is_fixed(self):
return self.is_bounded and self.min == self.max

@property
def is_variable(self):
return not self.is_fixed


@dataclass(frozen=True)
class Shape:
"""
The range of potential sizes for all the dimensions of a field or column
"""

dims: Optional[Union[Tuple, "Shape"]] = None

def __post_init__(self):
if isinstance(self.dims, Shape):
object.__setattr__(self, "dims", self.dims.dims)

if self.dims is not None:
new_dims = []
for i, dim in enumerate(self.dims):
if isinstance(dim, Dimension):
new_dim = dim
elif isinstance(dim, tuple) and len(dim) == 2:
new_dim = Dimension(dim[0], dim[1])
elif isinstance(dim, int):
new_dim = Dimension(dim, dim)
elif dim is None:
new_dim = Dimension()
else:
raise ValueError(
f"Invalid shape tuple format: {self.dims}. Each dimension is expected "
" to be None, a single integer, or a tuple with length 2."
)
new_dims.append(new_dim)

object.__setattr__(self, "dims", tuple(new_dims))

def __eq__(self, other):
"""
Make `dims is None` a wildcard when determining equality

This definition of equality allows an unknown shape with `dims is None` to be
considered equal or compatible with a known shape with `dims is not None`.
"""
if not isinstance(other, Shape):
return False

if self.dims is None or other.dims is None:
return True

return self.dims == other.dims

def __iter__(self):
return self.dims

@property
def min(self) -> Tuple:
return tuple(dim.min for dim in self.dims)

@property
def max(self) -> Tuple:
return tuple(dim.max for dim in self.dims)

@property
def is_bounded(self):
return all(dim.is_bounded for dim in self.dims)

@property
def is_fixed(self):
return all(dim.is_fixed for dim in self.dims)

@property
def is_variable(self):
return not self.is_fixed

@property
def is_list(self):
return self.dims is not None and len(self.dims) > 1

@property
def is_ragged(self):
return self.is_list and any(dim.min != dim.max for dim in self.dims[1:])

@property
def as_tuple(self):
return ((dim.min, dim.max) for dim in self.dims) if self.dims else None
8 changes: 4 additions & 4 deletions merlin/schema/io/tensorflow_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,8 +270,8 @@ def _pb_feature(column_schema):

value_count = column_schema.properties.get("value_count", {})
if value_count:
min_length = value_count.get("min", 0)
max_length = value_count.get("max", 0)
min_length = value_count.get("min", 0) or 0
max_length = value_count.get("max", 0) or 0
feature.value_count = ValueCount(min=min_length, max=max_length)

feature.annotation.tag = _pb_tag(column_schema)
Expand Down Expand Up @@ -323,9 +323,9 @@ def _merlin_value_count(feature):
if proto_utils.has_field(feature, "value_count"):
value_count = feature.value_count
value_count_dict = {}
if value_count.min > 0:
if value_count.min and value_count.min > 0:
value_count_dict["min"] = value_count.min
if value_count.max > 0:
if value_count.max and value_count.max > 0:
value_count_dict["max"] = value_count.max
return value_count_dict

Expand Down
Loading