Merge branch 'main' into fix-transformer-version-test-errs

NVIDIA-Merlin · Nov 1, 2023 · 93a5aff · 93a5aff
2 parents f24ea77 + 5bef974
commit 93a5aff
Show file tree

Hide file tree

Showing 33 changed files with 2,279 additions and 497 deletions.
diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml
@@ -0,0 +1,4 @@
+# Configuration file for `copy-pr-bot` GitHub App
+# https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/
+
+enabled: true
diff --git a/.github/ops-bot.yaml b/.github/ops-bot.yaml
diff --git a/.github/workflows/gpu-ci-integration.yml b/.github/workflows/gpu-ci-integration.yml
@@ -3,16 +3,23 @@ name: GPU NOTEBOOK CI
 on:
   workflow_dispatch:
   push:
-    branches: [main]
+    branches:
+      - main
+      - "pull-request/[0-9]+"
     tags:
       - "v[0-9]+.[0-9]+.[0-9]+"
-  pull_request:
-    branches: [main]
-    types: [opened, synchronize, reopened]
 
 jobs:
   gpu-ci-integration:
-    runs-on: 1GPU
+    runs-on: linux-amd64-gpu-p100-latest-1
+    container:
+      image: nvcr.io/nvstaging/merlin/merlin-ci-runner:latest
+      env:
+        NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
+      options: --shm-size=1G
+      credentials:
+        username: $oauthtoken
+        password: ${{ secrets.NGC_TOKEN }}
 
     steps:
       - uses: actions/checkout@v3
@@ -29,4 +36,4 @@ jobs:
             # find the release branch that we're pointing at
             branch=$(git branch -r --contains ${{ github.ref_name }} --list '*release*' --format "%(refname:short)" | sed -e 's/^origin\///')
           fi
-          cd ${{ github.workspace }}; tox -e test-gpu-integration -- $branch
+          tox -e test-gpu-integration -- $branch
diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml
@@ -3,34 +3,37 @@ name: GPU CI
 on:
   workflow_dispatch:
   push:
-    branches: [ main ]
+    branches:
+      - main
+      - "pull-request/[0-9]+"
     tags:
       - "v[0-9]+.[0-9]+.[0-9]+"
-  pull_request:
-    branches: [ main ]
-    types: [opened, synchronize, reopened]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
 
 jobs:
   gpu-ci:
-    runs-on: 2GPU
+    runs-on: linux-amd64-gpu-p100-latest-1
+    container:
+      image: nvcr.io/nvstaging/merlin/merlin-ci-runner:latest
+      env:
+        NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
+      options: --shm-size=1G
+      credentials:
+        username: $oauthtoken
+        password: ${{ secrets.NGC_TOKEN }}
 
     steps:
-    - uses: actions/checkout@v3
-      with:
-        fetch-depth: 0
-    - name: Run tests
-      run: |
-        ref_type=${{ github.ref_type }}
-        branch=main
-        if [[ $ref_type == "tag"* ]]
-        then
-          # fetch release branches (the branch name is not automatically fetched by the actions/checkout step)
-          git -c protocol.version=2 fetch --no-tags --prune --progress --no-recurse-submodules --depth=1 origin +refs/heads/release*:refs/remotes/origin/release*
-          # find the release branch that we're pointing at
-          branch=$(git branch -r --contains ${{ github.ref_name }} --list '*release*' --format "%(refname:short)" | sed -e 's/^origin\///')
-        fi
-        cd ${{ github.workspace }}; tox -e test-gpu -- $branch
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+      - name: Run tests
+        run: |
+          ref_type=${{ github.ref_type }}
+          branch=main
+          if [[ $ref_type == "tag"* ]]
+          then
+            # fetch release branches (the branch name is not automatically fetched by the actions/checkout step)
+            git -c protocol.version=2 fetch --no-tags --prune --progress --no-recurse-submodules --depth=1 origin +refs/heads/release*:refs/remotes/origin/release*
+            # find the release branch that we're pointing at
+            branch=$(git branch -r --contains ${{ github.ref_name }} --list '*release*' --format "%(refname:short)" | sed -e 's/^origin\///')
+          fi
+          tox -e test-gpu -- $branch
diff --git a/examples/end-to-end-session-based/01-ETL-with-NVTabular.ipynb b/examples/end-to-end-session-based/01-ETL-with-NVTabular.ipynb
diff --git a/examples/end-to-end-session-based/02-End-to-end-session-based-with-Yoochoose-PyT.ipynb b/examples/end-to-end-session-based/02-End-to-end-session-based-with-Yoochoose-PyT.ipynb
diff --git a/examples/end-to-end-session-based/03-Session-based-Yoochoose-multigpu-training-PyT.ipynb b/examples/end-to-end-session-based/03-Session-based-Yoochoose-multigpu-training-PyT.ipynb
diff --git a/requirements/base_merlin.txt b/requirements/base_merlin.txt
@@ -1 +1 @@
-merlin-models>=0.11
+merlin-models>=23.4.0
diff --git a/tests/integration/notebooks/test_end_to_end_session_based.py b/tests/integration/notebooks/test_end_to_end_session_based.py
@@ -0,0 +1,85 @@
+import os
+
+import pytest
+from merlin.core.dispatch import HAS_GPU
+from testbook import testbook
+
+from tests.conftest import REPO_ROOT
+
+pytest.importorskip("transformers")
+
+# flake8: noqa
+
+
+@pytest.mark.notebook
+@pytest.mark.skipif(not HAS_GPU, reason="No GPU available")
+def test_func(tmp_path):
+    """Run the three end-to-end session-based example notebooks back to back.
+
+    Every notebook receives its settings through an injected cell that sets
+    environment variables, and all three share ``<tmp_path>/data`` as their
+    data directory.  After each notebook the test asserts that the expected
+    directories or files exist there.
+
+    Parameters
+    ----------
+    tmp_path : pathlib.Path
+        Per-test temporary directory supplied by pytest.
+    """
+    notebooks_dir = REPO_ROOT / "examples" / "end-to-end-session-based"
+    data_dir = f"{tmp_path}/data"
+
+    # Notebook 01: ETL, configured to use synthetic data.
+    with testbook(notebooks_dir / "01-ETL-with-NVTabular.ipynb", execute=False) as etl_nb:
+        os.mkdir(data_dir)
+        etl_nb.inject(
+            f"""
+            import os
+            os.environ["DATA_FOLDER"] = f"{data_dir}"
+            os.environ["USE_SYNTHETIC"] = "True"
+            os.environ["START_DATE"] = "2014/4/1"
+            os.environ["END_DATE"] = "2014/4/5"
+            os.environ["THRESHOLD_DAY_INDEX"] = "1"
+            """
+        )
+        etl_nb.execute()
+        for produced in ("processed_nvt", "preproc_sessions_by_day", "workflow_etl"):
+            assert os.path.isdir(f"{data_dir}/{produced}")
+
+    # Notebook 02: every cell except the final 20 entries of the cell list is executed.
+    training_path = notebooks_dir / "02-End-to-end-session-based-with-Yoochoose-PyT.ipynb"
+    with testbook(training_path, timeout=720, execute=False) as training_nb:
+        training_nb.inject(
+            f"""
+            import os
+            os.environ["INPUT_DATA_DIR"] = f"{data_dir}"
+            os.environ["OUTPUT_DIR"] = f"{data_dir}/preproc_sessions_by_day"
+            os.environ["START_TIME_INDEX"] = "1"
+            os.environ["END_TIME_INDEX"] = "3"
+            os.environ["BATCH_SIZE_TRAIN"] = "64"
+            os.environ["BATCH_SIZE_VALID"] = "32"
+            """
+        )
+        cells_to_run = len(training_nb.cells) - 20
+        training_nb.execute_cell(list(range(cells_to_run)))
+        models_dir = f"{data_dir}/models"
+        assert os.path.isdir(models_dir)
+        assert os.listdir(models_dir)
+
+    # Notebook 03: multi-GPU training notebook, executed in full.
+    multigpu_path = notebooks_dir / "03-Session-based-Yoochoose-multigpu-training-PyT.ipynb"
+    with testbook(multigpu_path, timeout=720, execute=False) as multigpu_nb:
+        multigpu_nb.inject(
+            f"""
+            import os
+            os.environ["INPUT_DATA_DIR"] = f"{data_dir}"
+            os.environ["OUTPUT_DIR"] = f"{data_dir}/preproc_sessions_by_day"
+            os.environ["START_TIME_INDEX"] = "1"
+            os.environ["END_TIME_INDEX"] = "4"
+            os.environ["LEARNING_RATE"] = "0.0005"
+            os.environ["BATCH_SIZE_TRAIN"] = "64"
+            os.environ["BATCH_SIZE_VALID"] = "32"
+            """
+        )
+        multigpu_nb.execute()
+        assert os.path.isfile(f"{data_dir}/eval_metrics.txt")
diff --git a/tests/unit/torch/features/test_sequential.py b/tests/unit/torch/features/test_sequential.py
@@ -14,12 +14,17 @@
 # limitations under the License.
 #
 
+import numpy as np
 import pytest
+from merlin.dataloader.ops.embeddings import EmbeddingOperator
+from merlin.io import Dataset
+from merlin.schema import ColumnSchema
 from merlin.schema import Schema as CoreSchema
 from merlin.schema import Tags
 
 import transformers4rec.torch as tr
 from tests.conftest import parametrize_schemas
+from transformers4rec.torch.utils.data_utils import MerlinDataLoader
 
 
 @parametrize_schemas("yoochoose")
@@ -131,7 +136,6 @@ def test_sequential_tabular_features_ignore_masking(schema, torch_yoochoose_like
         input_module(torch_yoochoose_like, training=False, testing=True).detach().cpu().numpy()
     )
 
-    assert np.allclose(output_wo_masking, output_inference_masking, rtol=1e-04, atol=1e-08)
     assert not np.allclose(output_wo_masking, output_clm_masking, rtol=1e-04, atol=1e-08)
 
     input_module._masking = MaskedLanguageModeling(hidden_size=100)
@@ -217,3 +221,173 @@ def test_sequential_and_non_sequential_tabular_features(schema, torch_yoochoose_
     outputs = tab_module(torch_yoochoose_like)
 
     assert list(outputs.shape) == [100, 20, 203]
+
+
+@pytest.mark.parametrize(
+    "pretrained_dim",
+    [None, 128, {"pretrained_item_id_embeddings": 128, "pretrained_user_id_embeddings": 128}],
+)
+def test_sequential_input_block_with_pretrained_embeddings(pretrained_dim):
+    """Check the output shapes of ``TabularSequenceFeatures`` with pre-trained embeddings.
+
+    Two pre-trained tables (keyed by "item_id" and "user_id") are attached to the
+    dataloader through ``EmbeddingOperator`` transforms.  The block is built once without
+    aggregation, to inspect each pre-trained output, and once with "concat" aggregation.
+
+    Parameters
+    ----------
+    pretrained_dim : None, int or dict
+        Forwarded as ``pretrained_output_dims``.  With ``None`` the outputs are expected
+        to keep the width of their table; with an int, or a dict keyed by embedding
+        name, they are expected to have the requested width.
+    """
+    data = tr.data.music_streaming_testing_data
+    batch_size, max_length = 128, 20
+    embedding_dim_default, item_dim, user_dim = 8, 32, 16
+    item_name, user_name = "pretrained_item_id_embeddings", "pretrained_user_id_embeddings"
+
+    # Set the property `dims` for the non-sequential feature: "user_id"
+    user_cardinality = data.merlin_schema["user_id"].int_domain.max + 1
+    user_column = ColumnSchema(
+        "user_id",
+        dtype=np.int32,
+        tags=[Tags.USER, Tags.CATEGORICAL],
+        properties={"domain": {"name": "user_id", "min": 0, "max": user_cardinality}},
+        dims=(None,),
+    )
+    seq_schema = data.merlin_schema.select_by_name(["item_id"]) + CoreSchema([user_column])
+
+    # Generate the pre-trained embedding tables (random values; only shapes are checked).
+    item_cardinality = seq_schema["item_id"].int_domain.max + 1
+    item_table = np.random.rand(item_cardinality, item_dim)
+    user_table = np.random.rand(user_cardinality, user_dim)
+    transforms = [
+        EmbeddingOperator(item_table, lookup_key="item_id", embedding_name=item_name),
+        EmbeddingOperator(user_table, lookup_key="user_id", embedding_name=user_name),
+    ]
+
+    # Dataloader with the pre-trained embedding transforms attached.
+    data_loader = MerlinDataLoader.from_schema(
+        seq_schema,
+        data.path,
+        batch_size=batch_size,
+        max_sequence_length=max_length,
+        transforms=transforms,
+        shuffle=False,
+    )
+    batch, _ = next(iter(data_loader))
+    device = batch["item_id"].device
+
+    # Sequential input block with pre-trained features
+    inputs = tr.TabularSequenceFeatures.from_schema(
+        data_loader.output_schema,
+        max_sequence_length=max_length,
+        pretrained_output_dims=pretrained_dim,
+        aggregation=None,
+    )
+    # Sequential input + concat aggregation, which inherently performs broadcasting of 2-D features.
+    inputs_with_concat = tr.TabularSequenceFeatures.from_schema(
+        data_loader.output_schema,
+        embedding_dim_default=embedding_dim_default,
+        max_sequence_length=max_length,
+        aggregation="concat",
+    )
+
+    output = inputs.to(device).double()(batch)
+    concat_output = inputs_with_concat.to(device).double()(batch)
+
+    # No `pretrained_output_dims` was given to the concat block, so both tables
+    # contribute their original width on top of two default-size embeddings.
+    assert concat_output.shape[-1] == embedding_dim_default * 2 + item_dim + user_dim
+
+    # Derive the expected widths from the parametrized value instead of repeating literals.
+    if pretrained_dim is None:
+        expected_item_dim, expected_user_dim = item_dim, user_dim
+    elif isinstance(pretrained_dim, dict):
+        expected_item_dim = pretrained_dim[item_name]
+        expected_user_dim = pretrained_dim[user_name]
+    else:
+        expected_item_dim = expected_user_dim = pretrained_dim
+
+    assert item_name in output
+    # The item feature keeps its sequence axis; the user feature stays 2-D.
+    assert list(output[item_name].shape) == [batch_size, max_length, expected_item_dim]
+    assert list(output[user_name].shape) == [batch_size, expected_user_dim]
+
+
+@pytest.mark.parametrize(
+    "pretrained_dim",
+    [None, 128, {"pretrained_item_id_embeddings": 128, "pretrained_user_id_embeddings": 128}],
+)
+def test_non_sequential_input_block_with_pretrained_embeddings(pretrained_dim):
+    """Check the output shapes of ``TabularFeatures`` with pre-trained embeddings.
+
+    The block is built with ``sequence_combiner="mean"`` and no aggregation.  Both
+    pre-trained outputs are then expected to be 2-D, ``[batch_size, width]``, i.e. the
+    item feature is expected to come out without a sequence axis.
+
+    Parameters
+    ----------
+    pretrained_dim : None, int or dict
+        Forwarded as ``pretrained_output_dims``.  With ``None`` the outputs are expected
+        to keep the width of their table; with an int, or a dict keyed by embedding
+        name, they are expected to have the requested width.
+    """
+    data = tr.data.music_streaming_testing_data
+    batch_size, max_length = 128, 20
+    item_dim, user_dim = 32, 16
+    item_name, user_name = "pretrained_item_id_embeddings", "pretrained_user_id_embeddings"
+
+    # Set the property `dims` for the non-sequential feature: "user_id"
+    user_cardinality = data.merlin_schema["user_id"].int_domain.max + 1
+    user_column = ColumnSchema(
+        "user_id",
+        dtype=np.int32,
+        tags=[Tags.USER, Tags.CATEGORICAL],
+        properties={"domain": {"name": "user_id", "min": 0, "max": user_cardinality}},
+        dims=(None,),
+    )
+    seq_schema = data.merlin_schema.select_by_name(["item_id"]) + CoreSchema([user_column])
+
+    # Generate the pre-trained embedding tables (random values; only shapes are checked).
+    item_cardinality = seq_schema["item_id"].int_domain.max + 1
+    item_table = np.random.rand(item_cardinality, item_dim)
+    user_table = np.random.rand(user_cardinality, user_dim)
+    transforms = [
+        EmbeddingOperator(item_table, lookup_key="item_id", embedding_name=item_name),
+        EmbeddingOperator(user_table, lookup_key="user_id", embedding_name=user_name),
+    ]
+
+    # Dataloader with the pre-trained embedding transforms attached.
+    data_loader = MerlinDataLoader.from_schema(
+        seq_schema,
+        Dataset(data.path, schema=seq_schema),
+        batch_size=batch_size,
+        max_sequence_length=max_length,
+        transforms=transforms,
+        shuffle=False,
+    )
+    batch, _ = next(iter(data_loader))
+
+    # Non-Sequential input block with a 3-D pre-trained feature
+    inputs = tr.TabularFeatures.from_schema(
+        data_loader.output_schema,
+        pretrained_output_dims=pretrained_dim,
+        sequence_combiner="mean",
+        aggregation=None,
+    )
+    output = inputs.to(batch["item_id"].device).double()(batch)
+
+    # Derive the expected widths from the parametrized value instead of repeating literals.
+    if pretrained_dim is None:
+        expected_item_dim, expected_user_dim = item_dim, user_dim
+    elif isinstance(pretrained_dim, dict):
+        expected_item_dim = pretrained_dim[item_name]
+        expected_user_dim = pretrained_dim[user_name]
+    else:
+        expected_item_dim = expected_user_dim = pretrained_dim
+
+    assert item_name in output
+    # Shape checks: [batch_size, width] for both pre-trained features.
+    assert list(output[item_name].shape) == [batch_size, expected_item_dim]
+    assert list(output[user_name].shape) == [batch_size, expected_user_dim]