Skip to content

Commit

Permalink
Add a new shape field to ColumnSchema (#195)
Browse files Browse the repository at this point in the history
* Improve robustness of test for converting schemas to dataframes

This test was brittle to the addition of new fields in the `ColumnSchema` dataclass, but minor rework avoids that issue.

* Add a new `shape` field to `ColumnSchema`

This creates a place to store shape information for all dimensions of the data across both array/tensor and dataframe formats. In contrast to the existing "value_count" property (which only records the value counts of the lists in list field, this attribute is intended to capture the size of _all_ dimensions of the data (the batch dimension, the list lengths, embedding sizes, etc.)

* Move `Shape` to `merlin.dtypes` and add tests

* Compute `is_list` and `is_ragged` from `ColumnSchema.shape`

* Remove shape from dtype when translating across frameworks

For now, since all existing dtype translations rely on exact matching, we can drop the shape. In the future, when we add translations that need to know whether to use a list dtype or not, we'll have the information available here in the translation code.

* Make the default `Shape()` represent unknown shapes

* Ignore shapes when validating operator output dtypes

* Fall back to the existing shape if there is one

* Remove `Shape.fixed` property

* Insert missing f-string

* Use `DType.without_shape`

* Make `None` shorthand for a dimension with unknown or unbounded min/max

* Use whatever shape info is provided to fill in the rest

This changes the way validation is done so that only the new shape info that's provided gets validated for consistency, and the rest gets inferred and filled in based on what was provided (assuming it's valid.)

* Remove the value count min/max test

This is now handled by the shape validation

* Fix stray linter error

* Minor test fix

* Disable validation that shape info is provided when `is_ragged=False`

* Add few convenience methods to `Shape`

* Update `ColumnSchema.with_*` methods to clear existing shape info

* Drop shapes from dtypes in `ColumnSchema` constructor

* Fix `with_dtype` so dtypes don't overwrite the shape
  • Loading branch information
karlhigley authored Feb 7, 2023
1 parent 6b9019f commit c41f23d
Show file tree
Hide file tree
Showing 9 changed files with 650 additions and 91 deletions.
5 changes: 4 additions & 1 deletion merlin/core/dispatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,7 +415,10 @@ def parquet_writer_dispatch(df: DataFrameLike, path=None, **kwargs):
elif cudf is not None:
_cls = cudf.io.parquet.ParquetWriter
else:
ValueError("Unable to load cudf. Please check your environment GPU and cudf available.")
raise ValueError(
"Unable to load cudf. "
"Please check that your environment has GPU(s) and cudf available."
)

if not path:
return _cls
Expand Down
5 changes: 3 additions & 2 deletions merlin/dag/executors.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,8 +196,9 @@ def _transform_data(self, node, input_data, capture_dtypes=False):
if (
output_col_schema.dtype
and output_data_schema.dtype
and output_col_schema.dtype != md.string
and output_col_schema.dtype != output_data_schema.dtype
and output_col_schema.dtype.without_shape != md.string
and output_col_schema.dtype.without_shape
!= output_data_schema.dtype.without_shape
):
raise TypeError(
f"Dtype discrepancy detected for column {col_name}: "
Expand Down
54 changes: 51 additions & 3 deletions merlin/dtypes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,12 @@
# limitations under the License.
#

from dataclasses import dataclass
from dataclasses import dataclass, replace
from enum import Enum
from typing import Optional
from typing import Optional, Tuple, Union

from merlin.dtypes.registry import _dtype_registry
from merlin.dtypes.shape import Shape


class ElementType(Enum):
Expand Down Expand Up @@ -69,6 +70,11 @@ class DType:
element_size: Optional[int] = None
element_unit: Optional[ElementUnit] = None
signed: Optional[bool] = None
shape: Optional[Shape] = None

def __post_init__(self):
if not self.shape:
object.__setattr__(self, "shape", Shape())

def to(self, mapping_name: str):
"""
Expand Down Expand Up @@ -103,7 +109,7 @@ def to(self, mapping_name: str):
) from exc

try:
return mapping.from_merlin(self)
return mapping.from_merlin(self.without_shape)
except KeyError as exc:
raise ValueError(
f"The registered dtype mapping for {mapping_name} doesn't contain type {self.name}."
Expand All @@ -125,3 +131,45 @@ def is_integer(self):
@property
def is_float(self):
return self.element_type.value == "float"

def with_shape(self, shape: Union[Tuple, Shape]):
"""
Create a copy of this dtype with a new shape
Parameters
----------
shape : Union[Tuple, Shape]
Object to set as shape of dtype, must be either a tuple or Shape.
Returns
-------
DType
A copy of this dtype containing the provided shape value
Raises
------
TypeError
If value is not either a tuple or a Shape
"""
if isinstance(shape, tuple):
shape = Shape(shape)

if not isinstance(shape, Shape):
raise TypeError(
f"Provided value {shape} (of type {type(shape)}) for DType.shape property "
"is not of type Shape."
)

return replace(self, shape=shape)

@property
def without_shape(self):
"""
Create a copy of this object without the shape
Returns
-------
DType
A copy of this object with the shape removed
"""
return self.with_shape(Shape())
143 changes: 143 additions & 0 deletions merlin/dtypes/shape.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
#
# Copyright (c) 2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from dataclasses import dataclass
from typing import Optional, Tuple, Union


@dataclass(frozen=True)
class Dimension:
"""
The range of potential sizes for a single dimension of a field or column
"""

min: int = 0
max: Optional[int] = None

def __post_init__(self):
if self.min is None:
raise ValueError("The minimum size of a dimension cannot be None. ")

if self.min < 0:
raise ValueError(
"The minimum size of a dimension must be non-negative. " f"Provided min: {self.min}"
)

if self.max and self.max < 0:
raise ValueError(
"The maximum size of a dimension must be at least one. " f"Provided max: {self.max}"
)

if self.max and self.max < self.min:
raise ValueError(
"The maximum size of a dimension must be at least as large as the minimum size. "
f"Provided min: {self.min} max: {self.max}"
)

@property
def is_bounded(self):
return self.max is not None

@property
def is_fixed(self):
return self.is_bounded and self.min == self.max

@property
def is_variable(self):
return not self.is_fixed


@dataclass(frozen=True)
class Shape:
"""
The range of potential sizes for all the dimensions of a field or column
"""

dims: Optional[Union[Tuple, "Shape"]] = None

def __post_init__(self):
if isinstance(self.dims, Shape):
object.__setattr__(self, "dims", self.dims.dims)

if self.dims is not None:
new_dims = []
for i, dim in enumerate(self.dims):
if isinstance(dim, Dimension):
new_dim = dim
elif isinstance(dim, tuple) and len(dim) == 2:
new_dim = Dimension(dim[0], dim[1])
elif isinstance(dim, int):
new_dim = Dimension(dim, dim)
elif dim is None:
new_dim = Dimension()
else:
raise ValueError(
f"Invalid shape tuple format: {self.dims}. Each dimension is expected "
" to be None, a single integer, or a tuple with length 2."
)
new_dims.append(new_dim)

object.__setattr__(self, "dims", tuple(new_dims))

def __eq__(self, other):
"""
Make `dims is None` a wildcard when determining equality
This definition of equality allows an unknown shape with `dims is None` to be
considered equal or compatible with a known shape with `dims is not None`.
"""
if not isinstance(other, Shape):
return False

if self.dims is None or other.dims is None:
return True

return self.dims == other.dims

def __iter__(self):
return self.dims

@property
def min(self) -> Tuple:
return tuple(dim.min for dim in self.dims)

@property
def max(self) -> Tuple:
return tuple(dim.max for dim in self.dims)

@property
def is_bounded(self):
return all(dim.is_bounded for dim in self.dims)

@property
def is_fixed(self):
return all(dim.is_fixed for dim in self.dims)

@property
def is_variable(self):
return not self.is_fixed

@property
def is_list(self):
return self.dims is not None and len(self.dims) > 1

@property
def is_ragged(self):
return self.is_list and any(dim.min != dim.max for dim in self.dims[1:])

@property
def as_tuple(self):
return ((dim.min, dim.max) for dim in self.dims) if self.dims else None
8 changes: 4 additions & 4 deletions merlin/schema/io/tensorflow_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,8 +270,8 @@ def _pb_feature(column_schema):

value_count = column_schema.properties.get("value_count", {})
if value_count:
min_length = value_count.get("min", 0)
max_length = value_count.get("max", 0)
min_length = value_count.get("min", 0) or 0
max_length = value_count.get("max", 0) or 0
feature.value_count = ValueCount(min=min_length, max=max_length)

feature.annotation.tag = _pb_tag(column_schema)
Expand Down Expand Up @@ -323,9 +323,9 @@ def _merlin_value_count(feature):
if proto_utils.has_field(feature, "value_count"):
value_count = feature.value_count
value_count_dict = {}
if value_count.min > 0:
if value_count.min and value_count.min > 0:
value_count_dict["min"] = value_count.min
if value_count.max > 0:
if value_count.max and value_count.max > 0:
value_count_dict["max"] = value_count.max
return value_count_dict

Expand Down
Loading

0 comments on commit c41f23d

Please sign in to comment.