Add a new shape field to ColumnSchema (#195)

* Improve robustness of test for converting schemas to dataframes This test was brittle to the addition of new fields in the `ColumnSchema` dataclass, but minor rework avoids that issue. * Add a new `shape` field to `ColumnSchema` This creates a place to store shape information for all dimensions of the data across both array/tensor and dataframe formats. In contrast to the existing "value_count" property (which only records the value counts of the lists in list field, this attribute is intended to capture the size of _all_ dimensions of the data (the batch dimension, the list lengths, embedding sizes, etc.) * Move `Shape` to `merlin.dtypes` and add tests * Compute `is_list` and `is_ragged` from `ColumnSchema.shape` * Remove shape from dtype when translating across frameworks For now, since all existing dtype translations rely on exact matching, we can drop the shape. In the future, when we add translations that need to know whether to use a list dtype or not, we'll have the information available here in the translation code. * Make the default `Shape()` represent unknown shapes * Ignore shapes when validating operator output dtypes * Fall back to the existing shape if there is one * Remove `Shape.fixed` property * Insert missing f-string * Use `DType.without_shape` * Make `None` shorthand for a dimension with unknown or unbounded min/max * Use whatever shape info is provided to fill in the rest This changes the way validation is done so that only the new shape info that's provided gets validated for consistency, and the rest gets inferred and filled in based on what was provided (assuming it's valid.) * Remove the value count min/max test This is now handled by the shape validation * Fix stray linter error * Minor test fix * Disable validation that shape info is provided when `is_ragged=False` * Add few convenience methods to `Shape` * Update `ColumnSchema.with_*` methods to clear existing shape info * Drop shapes from dtypes in `ColumnSchema` constructor * Fix `with_dtype` so dtypes don't overwrite the shape
NVIDIA-Merlin · Feb 7, 2023 · c41f23d · c41f23d
1 parent 6b9019f
commit c41f23d
Show file tree

Hide file tree

Showing 9 changed files with 650 additions and 91 deletions.
diff --git a/merlin/core/dispatch.py b/merlin/core/dispatch.py
@@ -415,7 +415,10 @@ def parquet_writer_dispatch(df: DataFrameLike, path=None, **kwargs):
     elif cudf is not None:
         _cls = cudf.io.parquet.ParquetWriter
     else:
-        ValueError("Unable to load cudf. Please check your environment GPU and cudf available.")
+        raise ValueError(
+            "Unable to load cudf. "
+            "Please check that your environment has GPU(s) and cudf available."
+        )
 
     if not path:
         return _cls

diff --git a/merlin/dag/executors.py b/merlin/dag/executors.py
@@ -196,8 +196,9 @@ def _transform_data(self, node, input_data, capture_dtypes=False):
                     if (
                         output_col_schema.dtype
                         and output_data_schema.dtype
-                        and output_col_schema.dtype != md.string
-                        and output_col_schema.dtype != output_data_schema.dtype
+                        and output_col_schema.dtype.without_shape != md.string
+                        and output_col_schema.dtype.without_shape
+                        != output_data_schema.dtype.without_shape
                     ):
                         raise TypeError(
                             f"Dtype discrepancy detected for column {col_name}: "

diff --git a/merlin/dtypes/base.py b/merlin/dtypes/base.py
@@ -14,11 +14,12 @@
 # limitations under the License.
 #
 
-from dataclasses import dataclass
+from dataclasses import dataclass, replace
 from enum import Enum
-from typing import Optional
+from typing import Optional, Tuple, Union
 
 from merlin.dtypes.registry import _dtype_registry
+from merlin.dtypes.shape import Shape
 
 
 class ElementType(Enum):
@@ -69,6 +70,11 @@ class DType:
     element_size: Optional[int] = None
     element_unit: Optional[ElementUnit] = None
     signed: Optional[bool] = None
+    shape: Optional[Shape] = None
+
+    def __post_init__(self):
+        if not self.shape:
+            object.__setattr__(self, "shape", Shape())
 
     def to(self, mapping_name: str):
         """
@@ -103,7 +109,7 @@ def to(self, mapping_name: str):
             ) from exc
 
         try:
-            return mapping.from_merlin(self)
+            return mapping.from_merlin(self.without_shape)
         except KeyError as exc:
             raise ValueError(
                 f"The registered dtype mapping for {mapping_name} doesn't contain type {self.name}."
@@ -125,3 +131,45 @@ def is_integer(self):
     @property
     def is_float(self):
         return self.element_type.value == "float"
+
+    def with_shape(self, shape: Union[Tuple, Shape]):
+        """
+        Create a copy of this dtype with a new shape
+
+        Parameters
+        ----------
+        shape : Union[Tuple, Shape]
+            Object to set as shape of dtype, must be either a tuple or Shape.
+
+        Returns
+        -------
+        DType
+            A copy of this dtype containing the provided shape value
+
+        Raises
+        ------
+        TypeError
+            If value is not either a tuple or a Shape
+        """
+        if isinstance(shape, tuple):
+            shape = Shape(shape)
+
+        if not isinstance(shape, Shape):
+            raise TypeError(
+                f"Provided value {shape} (of type {type(shape)}) for DType.shape property "
+                "is not of type Shape."
+            )
+
+        return replace(self, shape=shape)
+
+    @property
+    def without_shape(self):
+        """
+        Create a copy of this object without the shape
+
+        Returns
+        -------
+        DType
+            A copy of this object with the shape removed
+        """
+        return self.with_shape(Shape())
diff --git a/merlin/dtypes/shape.py b/merlin/dtypes/shape.py
@@ -0,0 +1,143 @@
+#
+# Copyright (c) 2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+
+@dataclass(frozen=True)
+class Dimension:
+    """
+    The range of potential sizes for a single dimension of a field or column
+    """
+
+    min: int = 0
+    max: Optional[int] = None
+
+    def __post_init__(self):
+        if self.min is None:
+            raise ValueError("The minimum size of a dimension cannot be None. ")
+
+        if self.min < 0:
+            raise ValueError(
+                "The minimum size of a dimension must be non-negative. " f"Provided min: {self.min}"
+            )
+
+        if self.max and self.max < 0:
+            raise ValueError(
+                "The maximum size of a dimension must be at least one. " f"Provided max: {self.max}"
+            )
+
+        if self.max and self.max < self.min:
+            raise ValueError(
+                "The maximum size of a dimension must be at least as large as the minimum size. "
+                f"Provided min: {self.min} max: {self.max}"
+            )
+
+    @property
+    def is_bounded(self):
+        return self.max is not None
+
+    @property
+    def is_fixed(self):
+        return self.is_bounded and self.min == self.max
+
+    @property
+    def is_variable(self):
+        return not self.is_fixed
+
+
+@dataclass(frozen=True)
+class Shape:
+    """
+    The range of potential sizes for all the dimensions of a field or column
+    """
+
+    dims: Optional[Union[Tuple, "Shape"]] = None
+
+    def __post_init__(self):
+        if isinstance(self.dims, Shape):
+            object.__setattr__(self, "dims", self.dims.dims)
+
+        if self.dims is not None:
+            new_dims = []
+            for i, dim in enumerate(self.dims):
+                if isinstance(dim, Dimension):
+                    new_dim = dim
+                elif isinstance(dim, tuple) and len(dim) == 2:
+                    new_dim = Dimension(dim[0], dim[1])
+                elif isinstance(dim, int):
+                    new_dim = Dimension(dim, dim)
+                elif dim is None:
+                    new_dim = Dimension()
+                else:
+                    raise ValueError(
+                        f"Invalid shape tuple format: {self.dims}. Each dimension is expected "
+                        " to be None, a single integer, or a tuple with length 2."
+                    )
+                new_dims.append(new_dim)
+
+            object.__setattr__(self, "dims", tuple(new_dims))
+
+    def __eq__(self, other):
+        """
+        Make `dims is None` a wildcard when determining equality
+
+        This definition of equality allows an unknown shape with `dims is None` to be
+        considered equal or compatible with a known shape with `dims is not None`.
+        """
+        if not isinstance(other, Shape):
+            return False
+
+        if self.dims is None or other.dims is None:
+            return True
+
+        return self.dims == other.dims
+
+    def __iter__(self):
+        return self.dims
+
+    @property
+    def min(self) -> Tuple:
+        return tuple(dim.min for dim in self.dims)
+
+    @property
+    def max(self) -> Tuple:
+        return tuple(dim.max for dim in self.dims)
+
+    @property
+    def is_bounded(self):
+        return all(dim.is_bounded for dim in self.dims)
+
+    @property
+    def is_fixed(self):
+        return all(dim.is_fixed for dim in self.dims)
+
+    @property
+    def is_variable(self):
+        return not self.is_fixed
+
+    @property
+    def is_list(self):
+        return self.dims is not None and len(self.dims) > 1
+
+    @property
+    def is_ragged(self):
+        return self.is_list and any(dim.min != dim.max for dim in self.dims[1:])
+
+    @property
+    def as_tuple(self):
+        return ((dim.min, dim.max) for dim in self.dims) if self.dims else None
diff --git a/merlin/schema/io/tensorflow_metadata.py b/merlin/schema/io/tensorflow_metadata.py
@@ -270,8 +270,8 @@ def _pb_feature(column_schema):
 
     value_count = column_schema.properties.get("value_count", {})
     if value_count:
-        min_length = value_count.get("min", 0)
-        max_length = value_count.get("max", 0)
+        min_length = value_count.get("min", 0) or 0
+        max_length = value_count.get("max", 0) or 0
         feature.value_count = ValueCount(min=min_length, max=max_length)
 
     feature.annotation.tag = _pb_tag(column_schema)
@@ -323,9 +323,9 @@ def _merlin_value_count(feature):
     if proto_utils.has_field(feature, "value_count"):
         value_count = feature.value_count
         value_count_dict = {}
-        if value_count.min > 0:
+        if value_count.min and value_count.min > 0:
             value_count_dict["min"] = value_count.min
-        if value_count.max > 0:
+        if value_count.max and value_count.max > 0:
             value_count_dict["max"] = value_count.max
         return value_count_dict