-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Improve robustness of test for converting schemas to dataframes This test was brittle to the addition of new fields in the `ColumnSchema` dataclass, but minor rework avoids that issue. * Add a new `shape` field to `ColumnSchema` This creates a place to store shape information for all dimensions of the data across both array/tensor and dataframe formats. In contrast to the existing "value_count" property (which only records the value counts of the lists in list fields), this attribute is intended to capture the size of _all_ dimensions of the data (the batch dimension, the list lengths, embedding sizes, etc.) * Move `Shape` to `merlin.dtypes` and add tests * Compute `is_list` and `is_ragged` from `ColumnSchema.shape` * Remove shape from dtype when translating across frameworks For now, since all existing dtype translations rely on exact matching, we can drop the shape. In the future, when we add translations that need to know whether to use a list dtype or not, we'll have the information available here in the translation code. * Make the default `Shape()` represent unknown shapes * Ignore shapes when validating operator output dtypes * Fall back to the existing shape if there is one * Remove `Shape.fixed` property * Insert missing f-string * Use `DType.without_shape` * Make `None` shorthand for a dimension with unknown or unbounded min/max * Use whatever shape info is provided to fill in the rest This changes the way validation is done so that only the new shape info that's provided gets validated for consistency, and the rest gets inferred and filled in based on what was provided (assuming it's valid.) 
* Remove the value count min/max test This is now handled by the shape validation * Fix stray linter error * Minor test fix * Disable validation that shape info is provided when `is_ragged=False` * Add a few convenience methods to `Shape` * Update `ColumnSchema.with_*` methods to clear existing shape info * Drop shapes from dtypes in `ColumnSchema` constructor * Fix `with_dtype` so dtypes don't overwrite the shape
- Loading branch information
1 parent
6b9019f
commit c41f23d
Showing
9 changed files
with
650 additions
and
91 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
# | ||
# Copyright (c) 2022, NVIDIA CORPORATION. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
|
||
from dataclasses import dataclass | ||
from typing import Optional, Tuple, Union | ||
|
||
|
||
@dataclass(frozen=True)
class Dimension:
    """
    The range of potential sizes for a single dimension of a field or column

    A dimension is described by an inclusive ``[min, max]`` size range.
    ``max=None`` means the dimension has no upper bound.

    Raises
    ------
    ValueError
        If ``min`` is None or negative, if ``max`` is negative, or if
        ``max`` is smaller than ``min``.
    """

    # Smallest size this dimension may take; must be a non-negative integer.
    min: int = 0
    # Largest size this dimension may take; None means unbounded.
    max: Optional[int] = None

    def __post_init__(self):
        if self.min is None:
            raise ValueError("The minimum size of a dimension cannot be None. ")

        if self.min < 0:
            raise ValueError(
                "The minimum size of a dimension must be non-negative. " f"Provided min: {self.min}"
            )

        # Compare against None explicitly: a plain truthiness test would treat
        # `max=0` as "no maximum" and silently skip the checks below.
        if self.max is not None and self.max < 0:
            raise ValueError(
                "The maximum size of a dimension must be at least one. " f"Provided max: {self.max}"
            )

        if self.max is not None and self.max < self.min:
            raise ValueError(
                "The maximum size of a dimension must be at least as large as the minimum size. "
                f"Provided min: {self.min} max: {self.max}"
            )

    @property
    def is_bounded(self):
        """True when the dimension has an upper bound (``max`` is not None)."""
        return self.max is not None

    @property
    def is_fixed(self):
        """True when the dimension is bounded and ``min`` equals ``max``."""
        return self.is_bounded and self.min == self.max

    @property
    def is_variable(self):
        """True when the dimension is not fixed (unbounded, or ``min != max``)."""
        return not self.is_fixed
|
||
|
||
@dataclass(frozen=True)
class Shape:
    """
    The range of potential sizes for all the dimensions of a field or column

    ``dims`` may be given as another ``Shape`` (whose dims are copied) or as an
    iterable whose entries are each one of:

    * a ``Dimension`` (used as-is)
    * a ``(min, max)`` tuple of length two
    * a single integer (a fixed-size dimension)
    * ``None`` (a dimension with the default, unbounded range)

    After construction ``dims`` is either ``None`` (unknown shape) or a tuple
    of ``Dimension`` objects.

    Raises
    ------
    ValueError
        If an entry of ``dims`` is not in one of the formats listed above.
    """

    # None means the shape is entirely unknown.
    dims: Optional[Union[Tuple, "Shape"]] = None

    def __post_init__(self):
        # The dataclass is frozen, so normalised values have to be written
        # with object.__setattr__ rather than plain attribute assignment.
        if isinstance(self.dims, Shape):
            object.__setattr__(self, "dims", self.dims.dims)

        if self.dims is not None:
            new_dims = []
            for dim in self.dims:
                if isinstance(dim, Dimension):
                    new_dim = dim
                elif isinstance(dim, tuple) and len(dim) == 2:
                    new_dim = Dimension(dim[0], dim[1])
                elif isinstance(dim, int):
                    new_dim = Dimension(dim, dim)
                elif dim is None:
                    new_dim = Dimension()
                else:
                    raise ValueError(
                        f"Invalid shape tuple format: {self.dims}. Each dimension is expected "
                        " to be None, a single integer, or a tuple with length 2."
                    )
                new_dims.append(new_dim)

            object.__setattr__(self, "dims", tuple(new_dims))

    def __eq__(self, other):
        """
        Make `dims is None` a wildcard when determining equality

        This definition of equality allows an unknown shape with `dims is None` to be
        considered equal or compatible with a known shape with `dims is not None`.

        NOTE(review): because of the wildcard, equality is not transitive, and two
        shapes that compare equal may not share a hash value. Be careful when using
        shapes as dict keys or set members.
        """
        if not isinstance(other, Shape):
            return False

        if self.dims is None or other.dims is None:
            return True

        return self.dims == other.dims

    def __iter__(self):
        """
        Iterate over the dimensions of this shape

        ``__iter__`` must return an iterator, not the tuple itself, otherwise
        ``iter(shape)`` raises ``TypeError``. An unknown shape (``dims is None``)
        iterates as empty.
        """
        return iter(self.dims if self.dims is not None else ())

    @property
    def min(self) -> Tuple:
        """Minimum size of every dimension. Raises TypeError if ``dims`` is None."""
        return tuple(dim.min for dim in self.dims)

    @property
    def max(self) -> Tuple:
        """Maximum size of every dimension. Raises TypeError if ``dims`` is None."""
        return tuple(dim.max for dim in self.dims)

    @property
    def is_bounded(self):
        """True when every dimension is bounded. Raises TypeError if ``dims`` is None."""
        return all(dim.is_bounded for dim in self.dims)

    @property
    def is_fixed(self):
        """True when every dimension is fixed. Raises TypeError if ``dims`` is None."""
        return all(dim.is_fixed for dim in self.dims)

    @property
    def is_variable(self):
        """True when at least one dimension is not fixed."""
        return not self.is_fixed

    @property
    def is_list(self):
        """True when the shape is known and has more than one dimension."""
        return self.dims is not None and len(self.dims) > 1

    @property
    def is_ragged(self):
        """True for a list shape where any dimension after the first has ``min != max``."""
        return self.is_list and any(dim.min != dim.max for dim in self.dims[1:])

    @property
    def as_tuple(self):
        """
        The shape as a tuple of ``(min, max)`` pairs

        Returns ``None`` when ``dims`` is None or empty. The result is a real
        tuple (not a one-shot generator), so it can be indexed, compared, and
        iterated more than once.
        """
        return tuple((dim.min, dim.max) for dim in self.dims) if self.dims else None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.