Renaming ProcessList to PrepareFeatures (#992)

* Fixed error that was causing the broadcast context feature to have a fixed-size first dim in graph mode and to be incompatible with the ragged sequential features * Enforcing non-list (scalar) features to be 2D (batch size, 1) if 1D or with last dim undefined (which happens in graph mode) * Making Continuous support_masking=True (to cascade mask) * Changing BroadcastToSequence to fix some issues and simplify the masking * Fixed tests * Fixed test * Renaming ProcessList to PrepareFeatures, as it now prepares not only list features but also scalar features * Fixed tests
NVIDIA-Merlin · Feb 28, 2023 · 0d28de4 · 0d28de4
1 parent f78bec7
commit 0d28de4
Show file tree

Hide file tree

Showing 14 changed files with 35 additions and 41 deletions.
diff --git a/merlin/models/tf/core/encoder.py b/merlin/models/tf/core/encoder.py
@@ -29,7 +29,7 @@
 from merlin.models.tf.inputs.embedding import CombinerType, EmbeddingTable
 from merlin.models.tf.models.base import BaseModel, get_output_schema
 from merlin.models.tf.outputs.topk import TopKOutput
-from merlin.models.tf.transforms.tensor import ProcessList
+from merlin.models.tf.transforms.tensor import PrepareFeatures
 from merlin.models.tf.utils import tf_utils
 from merlin.schema import ColumnSchema, Schema, Tags
 
@@ -73,7 +73,7 @@ def __init__(
         self.blocks = [input_block] + list(blocks) if blocks else [input_block]
         self.pre = pre
         self.post = post
-        self.process_list = ProcessList(self._schema)
+        self.prepare_features = PrepareFeatures(self._schema)
 
     def encode(
         self,
@@ -163,7 +163,7 @@ def call(self, inputs, training=False, testing=False, targets=None, **kwargs):
         return combinators.call_sequentially(
             list(self.to_call),
             inputs=inputs,
-            features=self.process_list(inputs),
+            features=self.prepare_features(inputs),
             targets=targets,
             training=training,
             testing=testing,

diff --git a/merlin/models/tf/inputs/base.py b/merlin/models/tf/inputs/base.py
@@ -31,7 +31,7 @@
     Embeddings,
     SequenceEmbeddingFeatures,
 )
-from merlin.models.tf.transforms.tensor import ListToDense, ProcessList
+from merlin.models.tf.transforms.tensor import ListToDense, PrepareFeatures
 from merlin.schema import Schema, Tags, TagsType
 
 LOG = logging.getLogger("merlin-models")
@@ -325,7 +325,7 @@ def InputBlockV2(
     if not parsed:
         raise ValueError("No columns selected for the input block")
 
-    _pre = ProcessList(schema)
+    _pre = PrepareFeatures(schema)
     if pre:
         _pre = _pre.connect(pre)
 

diff --git a/merlin/models/tf/loader.py b/merlin/models/tf/loader.py
@@ -369,7 +369,7 @@ def sample_batch(
     include_targets: bool = True,
     to_ragged: bool = False,
     to_dense: bool = False,
-    process_lists=False,
+    prepare_features=False,
 ):
     """Util function to generate a batch of input tensors from a merlin.io.Dataset instance
 
@@ -387,6 +387,11 @@ def sample_batch(
         Whether to convert the tuple of sparse tensors into ragged tensors, by default False.
     to_dense: bool
         Whether to convert the tuple of sparse tensors into dense tensors, by default False.
+    prepare_features: bool
+        Whether to prepare features from dataloader for the model, by default False.
+        If enabled, it converts multi-hot/list features to dense or ragged based on the schema.
+        It also ensures that scalar features are converted to 2D (batch size, 1).
+        P.s. The features are automatically prepared by InputBlockV2 if it is used
     Returns:
     -------
     batch: Dict[tf.tensor]
@@ -397,7 +402,7 @@ def sample_batch(
             "Sparse values cannot be converted to both ragged tensors and dense tensors"
         )
 
-    from merlin.models.tf.transforms.tensor import ListToDense, ListToRagged, ProcessList
+    from merlin.models.tf.transforms.tensor import ListToDense, ListToRagged, PrepareFeatures
 
     if isinstance(dataset_or_loader, Dataset):
         if not batch_size:
@@ -414,8 +419,8 @@ def sample_batch(
         inputs = ListToRagged()(inputs)
     elif to_dense:
         inputs = ListToDense()(inputs)
-    if process_lists:
-        inputs = ProcessList(loader.schema)(inputs)
+    if prepare_features:
+        inputs = PrepareFeatures(loader.schema)(inputs)
     if not include_targets:
         return inputs
     return inputs, targets

diff --git a/merlin/models/tf/models/base.py b/merlin/models/tf/models/base.py
@@ -1476,7 +1476,7 @@ def __init__(
             ]
             self.schema = sum(input_block_schemas, Schema())
 
-        self.process_list = ProcessList(self.schema)
+        self.prepare_features = PrepareFeatures(self.schema)
         self._frozen_blocks = set()
 
     def save(
@@ -1543,7 +1543,7 @@ def _maybe_build(self, inputs):
                     f"\n\t{call_input_features.difference(model_input_features)}"
                 )
 
-            _ragged_inputs = self.process_list(inputs)
+            _ragged_inputs = self.prepare_features(inputs)
             feature_shapes = {k: v.shape for k, v in _ragged_inputs.items()}
             feature_dtypes = {k: v.dtype for k, v in _ragged_inputs.items()}
 
@@ -1565,7 +1565,7 @@ def build(self, input_shape=None):
         """
         last_layer = None
 
-        input_shape = self.process_list.compute_output_shape(input_shape)
+        input_shape = self.prepare_features.compute_output_shape(input_shape)
 
         if self.pre is not None:
             self.pre.build(input_shape)
@@ -1592,7 +1592,7 @@ def build(self, input_shape=None):
 
     def call(self, inputs, targets=None, training=False, testing=False, output_context=False):
         context = self._create_context(
-            self.process_list(inputs),
+            self.prepare_features(inputs),
             targets=targets,
             training=training,
             testing=testing,

diff --git a/merlin/models/tf/transforms/tensor.py b/merlin/models/tf/transforms/tensor.py
@@ -66,7 +66,7 @@ def compute_call_output_shape(self, input_shapes):
 
 
 @tf.keras.utils.register_keras_serializable(package="merlin.models")
-class ProcessList(TabularBlock):
+class PrepareFeatures(TabularBlock):
     """Process all list (multi-hot/sequential) features.add()
 
     In NVTabular, list-columns are represented as a tuple of (values, offsets).

diff --git a/merlin/models/tf/utils/testing_utils.py b/merlin/models/tf/utils/testing_utils.py
@@ -103,7 +103,7 @@ def model_test(
 
         assert isinstance(loaded_model, type(model))
 
-        x, y = sample_batch(dataloader, batch_size=50, to_ragged=False, process_lists=False)
+        x, y = sample_batch(dataloader, batch_size=50, to_ragged=False, prepare_features=False)
         batch = [(x, y)]
 
         model_preds = model.predict(iter(batch))

diff --git a/tests/unit/datasets/test_synthetic.py b/tests/unit/datasets/test_synthetic.py
@@ -46,7 +46,7 @@ def test_tf_tensors_generation_cpu():
 
     from merlin.models.tf import sample_batch
 
-    tensors, _ = sample_batch(data, batch_size=100, process_lists=False)
+    tensors, _ = sample_batch(data, batch_size=100)
     assert tensors["user_id"].shape == (100, 1)
     assert tensors["user_age"].dtype == tf.float64
     for name, val in filter_dict_by_schema(tensors, schema.select_by_tag(Tags.LIST)).items():
@@ -68,7 +68,7 @@ def test_sequence_data_length(generate_data_kwargs, expected_sequence_length):
 
     from merlin.models.tf import sample_batch
 
-    tensors, y = sample_batch(data, batch_size=1, process_lists=False)
+    tensors, y = sample_batch(data, batch_size=1)
 
     for col in ["item_id_seq", "categories"]:
         assert all(tensors[col][1] == expected_sequence_length)

diff --git a/tests/unit/tf/blocks/retrieval/test_two_tower.py b/tests/unit/tf/blocks/retrieval/test_two_tower.py
@@ -112,7 +112,7 @@ def test_two_tower_block_with_l2_norm_on_towers_outputs(testing_data: Dataset):
 
 def test_two_tower_block_tower_save(testing_data: Dataset, tmp_path):
     two_tower = ml.TwoTowerBlock(testing_data.schema, query_tower=ml.MLPBlock([64, 128]))
-    features, _ = ml.sample_batch(testing_data, batch_size=100, process_lists=False)
+    features, _ = ml.sample_batch(testing_data, batch_size=100)
     two_tower(features)
 
     query_tower = two_tower.query_block()

diff --git a/tests/unit/tf/blocks/test_interactions.py b/tests/unit/tf/blocks/test_interactions.py
@@ -87,7 +87,7 @@ def test_fm_block_with_multi_hot_categ_features(testing_data: Dataset):
         factors_dim=32,
     )
 
-    batch, _ = mm.sample_batch(testing_data, batch_size=16, process_lists=False)
+    batch, _ = mm.sample_batch(testing_data, batch_size=16)
 
     output = fm_block(batch)
     assert output.shape.as_list() == [16, 1]
diff --git a/tests/unit/tf/inputs/test_base.py b/tests/unit/tf/inputs/test_base.py
@@ -30,7 +30,7 @@ def test_concat_sequence(sequence_testing_data):
         ),
     )
 
-    inputs = mm.sample_batch(sequence_testing_data, 8, include_targets=False, process_lists=True)
+    inputs = mm.sample_batch(sequence_testing_data, 8, include_targets=False, prepare_features=True)
 
     outputs = seq_inputs(inputs)
 

diff --git a/tests/unit/tf/inputs/test_continuous.py b/tests/unit/tf/inputs/test_continuous.py
@@ -80,7 +80,7 @@ def test_continuous_features_ragged(sequence_testing_data: Dataset):
     inputs = ml.ContinuousFeatures.from_schema(
         schema, post=ml.BroadcastToSequence(context_schema, seq_schema), aggregation="concat"
     )
-    features, _ = ml.sample_batch(sequence_testing_data, batch_size=100, process_lists=True)
+    features, _ = ml.sample_batch(sequence_testing_data, batch_size=100, prepare_features=True)
     outputs = inputs(features)
 
     assert outputs.to_tensor().shape == (100, 4, 6)
diff --git a/tests/unit/tf/models/test_ranking.py b/tests/unit/tf/models/test_ranking.py
@@ -371,7 +371,7 @@ def test_wide_deep_model_wide_onehot_multihot_feature_interaction(ecommerce_data
         ),
     ]
 
-    batch, _ = mm.sample_batch(ml_dataset, batch_size=100, process_lists=False)
+    batch, _ = mm.sample_batch(ml_dataset, batch_size=100)
 
     output_wide_features = mm.ParallelBlock(wide_preprocessing_blocks)(batch)
     assert set(output_wide_features.keys()) == set(

diff --git a/tests/unit/tf/transforms/test_negative_sampling.py b/tests/unit/tf/transforms/test_negative_sampling.py
@@ -185,7 +185,7 @@ def test_in_model(self, run_eagerly, music_streaming_data: Dataset, tf_random_se
         testing_utils.model_test(model, dataset, run_eagerly=run_eagerly, reload_model=True)
 
         batch_size = 10
-        features, targets = mm.sample_batch(dataset, batch_size=batch_size, process_lists=True)
+        features, targets = mm.sample_batch(dataset, batch_size=batch_size, prepare_features=True)
 
         with_negatives = model(features, targets=targets, training=True)
         assert with_negatives.predictions.shape[0] >= 50

diff --git a/tests/unit/tf/transforms/test_sequence.py b/tests/unit/tf/transforms/test_sequence.py
@@ -26,16 +26,13 @@
 def test_seq_predict_next(sequence_testing_data: Dataset):
     seq_schema = sequence_testing_data.schema.select_by_tag(Tags.SEQUENCE)
     target = sequence_testing_data.schema.select_by_tag(Tags.ITEM_ID).column_names[0]
-    predict_next = mm.SequencePredictNext(schema=seq_schema, target=target, pre=mm.ListToRagged())
+    predict_next = mm.SequencePredictNext(schema=seq_schema, target=target)
 
-    batch, _ = mm.sample_batch(sequence_testing_data, batch_size=8, process_lists=False)
+    batch, _ = mm.sample_batch(sequence_testing_data, batch_size=8, prepare_features=True)
     output = predict_next(batch)
     output_x, output_y = output
     output_y = output_y[target]
 
-    as_ragged = mm.ListToRagged()
-    batch = as_ragged(batch)
-
     # Checks if sequential input features were truncated in the last position
     for k, v in batch.items():
         if k in seq_schema.column_names:
@@ -55,14 +52,11 @@ def test_seq_predict_last(sequence_testing_data: Dataset):
     target = sequence_testing_data.schema.select_by_tag(Tags.ITEM_ID).column_names[0]
     predict_last = mm.SequencePredictLast(schema=seq_schema, target=target)
 
-    batch, _ = mm.sample_batch(sequence_testing_data, batch_size=8, process_lists=False)
+    batch, _ = mm.sample_batch(sequence_testing_data, batch_size=8, prepare_features=True)
     output = predict_last(batch)
     output_x, output_y = output
     output_y = output_y[target]
 
-    as_ragged = mm.ListToRagged()
-    batch = as_ragged(batch)
-
     # Checks if sequential input features were truncated in the last position
     for k, v in batch.items():
         if k in seq_schema.column_names:
@@ -83,13 +77,11 @@ def test_seq_predict_random(sequence_testing_data: Dataset):
     target = sequence_testing_data.schema.select_by_tag(Tags.ITEM_ID).column_names[0]
     predict_random = mm.SequencePredictRandom(schema=seq_schema, target=target)
 
-    batch, _ = mm.sample_batch(sequence_testing_data, batch_size=8, process_lists=False)
+    batch, _ = mm.sample_batch(sequence_testing_data, batch_size=8, prepare_features=True)
     output = predict_random(batch)
     output_x, output_y = output
     output_y = output_y[target]
 
-    as_ragged = mm.ListToRagged()
-    batch = as_ragged(batch)
     batch_size = batch[target].shape[0]
 
     for k, v in batch.items():
@@ -108,7 +100,7 @@ def test_seq_predict_random(sequence_testing_data: Dataset):
             tf.Assert(tf.reduce_all(output_x[k] == v), [output_x[k], v])
 
     # Checks if the target has the right shape
-    tf.Assert(tf.reduce_all(tf.shape(output_y) == batch_size), [])
+    tf.Assert(tf.reduce_all(tf.shape(output_y) == tf.TensorShape((batch_size, 1))), [])
 
 
 def test_seq_predict_next_output_shape(sequence_testing_data):
@@ -160,7 +152,7 @@ def test_seq_random_masking(sequence_testing_data: Dataset):
     target = sequence_testing_data.schema.select_by_tag(Tags.ITEM_ID).column_names[0]
     predict_masked = mm.SequenceMaskRandom(schema=seq_schema, target=target, masking_prob=0.3)
 
-    batch, _ = mm.sample_batch(sequence_testing_data, batch_size=8, process_lists=False)
+    batch, _ = mm.sample_batch(sequence_testing_data, batch_size=8, prepare_features=True)
 
     output_x, output_y = predict_masked(batch)
     output_y = output_y[target]
@@ -171,9 +163,6 @@ def test_seq_random_masking(sequence_testing_data: Dataset):
 
     asserts_mlm_target_mask(target_mask)
 
-    as_ragged = mm.ListToRagged()
-    batch = as_ragged(batch)
-
     for k, v in batch.items():
         # Checking if inputs values didn't change
         tf.Assert(tf.reduce_all(output_x[k] == v), [output_x[k], v])
@@ -215,7 +204,7 @@ def test_seq_mask_random_replace_embeddings(
     target = sequence_testing_data.schema.select_by_tag(Tags.ITEM_ID).column_names[0]
     predict_masked = mm.SequenceMaskRandom(schema=seq_schema, target=target, masking_prob=0.3)
 
-    batch, _ = mm.sample_batch(sequence_testing_data, batch_size=8, process_lists=False)
+    batch, _ = mm.sample_batch(sequence_testing_data, batch_size=8, prepare_features=False)
 
     inputs, targets = predict_masked(batch)
     targets = targets[target]
-Original file line number
+Diff line change
@@ Expand Up / @@ -30,7 +30,7 @@ def test_concat_sequence(sequence_testing_data): @@
             ),
         )
-        inputs = mm.sample_batch(sequence_testing_data, 8, include_targets=False, process_lists=True)
+        inputs = mm.sample_batch(sequence_testing_data, 8, include_targets=False, prepare_features=True)
         outputs = seq_inputs(inputs)
@@ Expand Down @@