Skip to content

Commit

Permalink
Merge branch 'main' into fix-transformer-version-test-errs
Browse files Browse the repository at this point in the history
  • Loading branch information
jperez999 authored Nov 1, 2023
2 parents f24ea77 + 5bef974 commit 93a5aff
Show file tree
Hide file tree
Showing 33 changed files with 2,279 additions and 497 deletions.
4 changes: 4 additions & 0 deletions .github/copy-pr-bot.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Configuration file for `copy-pr-bot` GitHub App
# https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/

enabled: true
4 changes: 0 additions & 4 deletions .github/ops-bot.yaml

This file was deleted.

19 changes: 13 additions & 6 deletions .github/workflows/gpu-ci-integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,23 @@ name: GPU NOTEBOOK CI
on:
workflow_dispatch:
push:
branches: [main]
branches:
- main
- "pull-request/[0-9]+"
tags:
- "v[0-9]+.[0-9]+.[0-9]+"
pull_request:
branches: [main]
types: [opened, synchronize, reopened]

jobs:
gpu-ci-integration:
runs-on: 1GPU
runs-on: linux-amd64-gpu-p100-latest-1
container:
image: nvcr.io/nvstaging/merlin/merlin-ci-runner:latest
env:
NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
options: --shm-size=1G
credentials:
username: $oauthtoken
password: ${{ secrets.NGC_TOKEN }}

steps:
- uses: actions/checkout@v3
Expand All @@ -29,4 +36,4 @@ jobs:
# find the release branch that we're pointing at
branch=$(git branch -r --contains ${{ github.ref_name }} --list '*release*' --format "%(refname:short)" | sed -e 's/^origin\///')
fi
cd ${{ github.workspace }}; tox -e test-gpu-integration -- $branch
tox -e test-gpu-integration -- $branch
51 changes: 27 additions & 24 deletions .github/workflows/gpu-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,34 +3,37 @@ name: GPU CI
on:
workflow_dispatch:
push:
branches: [ main ]
branches:
- main
- "pull-request/[0-9]+"
tags:
- "v[0-9]+.[0-9]+.[0-9]+"
pull_request:
branches: [ main ]
types: [opened, synchronize, reopened]

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
gpu-ci:
runs-on: 2GPU
runs-on: linux-amd64-gpu-p100-latest-1
container:
image: nvcr.io/nvstaging/merlin/merlin-ci-runner:latest
env:
NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
options: --shm-size=1G
credentials:
username: $oauthtoken
password: ${{ secrets.NGC_TOKEN }}

steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Run tests
run: |
ref_type=${{ github.ref_type }}
branch=main
if [[ $ref_type == "tag"* ]]
then
# fetch release branches (the branch name is not automatically fetched by the actions/checkout step)
git -c protocol.version=2 fetch --no-tags --prune --progress --no-recurse-submodules --depth=1 origin +refs/heads/release*:refs/remotes/origin/release*
# find the release branch that we're pointing at
branch=$(git branch -r --contains ${{ github.ref_name }} --list '*release*' --format "%(refname:short)" | sed -e 's/^origin\///')
fi
cd ${{ github.workspace }}; tox -e test-gpu -- $branch
- uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Run tests
run: |
ref_type=${{ github.ref_type }}
branch=main
if [[ $ref_type == "tag"* ]]
then
# fetch release branches (the branch name is not automatically fetched by the actions/checkout step)
git -c protocol.version=2 fetch --no-tags --prune --progress --no-recurse-submodules --depth=1 origin +refs/heads/release*:refs/remotes/origin/release*
# find the release branch that we're pointing at
branch=$(git branch -r --contains ${{ github.ref_name }} --list '*release*' --format "%(refname:short)" | sed -e 's/^origin\///')
fi
tox -e test-gpu -- $branch
160 changes: 123 additions & 37 deletions examples/end-to-end-session-based/01-ETL-with-NVTabular.ipynb

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion requirements/base_merlin.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
merlin-models>=0.11
merlin-models>=23.4.0
85 changes: 85 additions & 0 deletions tests/integration/notebooks/test_end_to_end_session_based.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import os

import pytest
from merlin.core.dispatch import HAS_GPU
from testbook import testbook

from tests.conftest import REPO_ROOT

pytest.importorskip("transformers")

# flake8: noqa


@pytest.mark.notebook
@pytest.mark.skipif(not HAS_GPU, reason="No GPU available")
def test_func(tmp_path):
    """Run the three end-to-end session-based example notebooks back to back.

    Each notebook is opened with ``testbook`` without being executed, a small
    snippet setting environment variables is injected, and then the notebook
    (or part of it) is executed. All notebooks share the same data folder
    under ``tmp_path``. The test is skipped when ``HAS_GPU`` is false.

    NOTE: the injected snippets are kept flush-left so that the injected text
    is exactly the text of the f-string literal.
    """
    notebooks_dir = REPO_ROOT / "examples" / "end-to-end-session-based"
    dirname = f"{tmp_path}/data"

    # --- Notebook 01: ETL -------------------------------------------------
    etl_notebook = notebooks_dir / "01-ETL-with-NVTabular.ipynb"
    with testbook(etl_notebook, execute=False) as tb1:
        os.mkdir(dirname)
        tb1.inject(
            f"""
import os
os.environ["DATA_FOLDER"] = f"{dirname}"
os.environ["USE_SYNTHETIC"] = "True"
os.environ["START_DATE"] = "2014/4/1"
os.environ["END_DATE"] = "2014/4/5"
os.environ["THRESHOLD_DAY_INDEX"] = "1"
"""
        )
        tb1.execute()
        # The ETL notebook is expected to leave these folders behind.
        assert os.path.isdir(f"{dirname}/processed_nvt")
        assert os.path.isdir(f"{dirname}/preproc_sessions_by_day")
        assert os.path.isdir(f"{dirname}/workflow_etl")

    # --- Notebook 02: single-GPU training ---------------------------------
    training_notebook = notebooks_dir / "02-End-to-end-session-based-with-Yoochoose-PyT.ipynb"
    with testbook(training_notebook, timeout=720, execute=False) as tb2:
        tb2.inject(
            f"""
import os
os.environ["INPUT_DATA_DIR"] = f"{dirname}"
os.environ["OUTPUT_DIR"] = f"{dirname}/preproc_sessions_by_day"
os.environ["START_TIME_INDEX"] = "1"
os.environ["END_TIME_INDEX"] = "3"
os.environ["BATCH_SIZE_TRAIN"] = "64"
os.environ["BATCH_SIZE_VALID"] = "32"
"""
        )
        # Execute every cell except the last 20 of the notebook.
        cells_to_run = list(range(len(tb2.cells) - 20))
        tb2.execute_cell(cells_to_run)
        # The models folder must exist and must not be empty.
        assert os.path.isdir(f"{dirname}/models")
        assert os.listdir(f"{dirname}/models")

    # --- Notebook 03: multi-GPU training ----------------------------------
    multigpu_notebook = (
        notebooks_dir / "03-Session-based-Yoochoose-multigpu-training-PyT.ipynb"
    )
    with testbook(multigpu_notebook, timeout=720, execute=False) as tb3:
        tb3.inject(
            f"""
import os
os.environ["INPUT_DATA_DIR"] = f"{dirname}"
os.environ["OUTPUT_DIR"] = f"{dirname}/preproc_sessions_by_day"
os.environ["START_TIME_INDEX"] = "1"
os.environ["END_TIME_INDEX"] = "4"
os.environ["LEARNING_RATE"] = "0.0005"
os.environ["BATCH_SIZE_TRAIN"] = "64"
os.environ["BATCH_SIZE_VALID"] = "32"
"""
        )
        tb3.execute()
        assert os.path.isfile(f"{dirname}/eval_metrics.txt")
176 changes: 175 additions & 1 deletion tests/unit/torch/features/test_sequential.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,17 @@
# limitations under the License.
#

import numpy as np
import pytest
from merlin.dataloader.ops.embeddings import EmbeddingOperator
from merlin.io import Dataset
from merlin.schema import ColumnSchema
from merlin.schema import Schema as CoreSchema
from merlin.schema import Tags

import transformers4rec.torch as tr
from tests.conftest import parametrize_schemas
from transformers4rec.torch.utils.data_utils import MerlinDataLoader


@parametrize_schemas("yoochoose")
Expand Down Expand Up @@ -131,7 +136,6 @@ def test_sequential_tabular_features_ignore_masking(schema, torch_yoochoose_like
input_module(torch_yoochoose_like, training=False, testing=True).detach().cpu().numpy()
)

assert np.allclose(output_wo_masking, output_inference_masking, rtol=1e-04, atol=1e-08)
assert not np.allclose(output_wo_masking, output_clm_masking, rtol=1e-04, atol=1e-08)

input_module._masking = MaskedLanguageModeling(hidden_size=100)
Expand Down Expand Up @@ -217,3 +221,173 @@ def test_sequential_and_non_sequential_tabular_features(schema, torch_yoochoose_
outputs = tab_module(torch_yoochoose_like)

assert list(outputs.shape) == [100, 20, 203]


@pytest.mark.parametrize(
    "pretrained_dim",
    [None, 128, {"pretrained_item_id_embeddings": 128, "pretrained_user_id_embeddings": 128}],
)
def test_sequential_input_block_with_pretrained_embeddings(pretrained_dim):
    """Check output shapes of a sequential input block fed with pre-trained embeddings.

    A data loader is built with two ``EmbeddingOperator`` transforms (one keyed
    on ``item_id``, one on ``user_id``), and a batch from it is passed through
    ``TabularSequenceFeatures`` with and without ``"concat"`` aggregation.

    Parameters
    ----------
    pretrained_dim : None, int or dict
        Value forwarded as ``pretrained_output_dims``. With ``None`` the
        pre-trained features are expected to keep the width of their
        embedding table; with an int (or a per-feature dict) they are
        expected to come out with that width.
    """
    data = tr.data.music_streaming_testing_data
    seq_schema = data.merlin_schema.select_by_name(["item_id"])
    # Set the property `dims` for the non-sequential feature: "user_id"
    user_cardinality = data.merlin_schema["user_id"].int_domain.max + 1
    seq_schema = seq_schema + CoreSchema(
        [
            ColumnSchema(
                "user_id",
                dtype=np.int32,
                tags=[Tags.USER, Tags.CATEGORICAL],
                properties={
                    "domain": {"name": "user_id", "min": 0, "max": user_cardinality},
                },
                dims=(None,),
            )
        ]
    )
    batch_size, max_length = 128, 20
    embedding_dim_default, item_dim, user_dim = 8, 32, 16

    # generate pre-trained embeddings tables
    item_cardinality = seq_schema["item_id"].int_domain.max + 1
    np_emb_item_id = np.random.rand(item_cardinality, item_dim)
    np_emb_user_id = np.random.rand(user_cardinality, user_dim)
    embeddings_op_item = EmbeddingOperator(
        np_emb_item_id, lookup_key="item_id", embedding_name="pretrained_item_id_embeddings"
    )
    embeddings_op_user = EmbeddingOperator(
        np_emb_user_id, lookup_key="user_id", embedding_name="pretrained_user_id_embeddings"
    )

    # set dataloader with pre-trained embeddings
    data_loader = MerlinDataLoader.from_schema(
        seq_schema,
        data.path,
        batch_size=batch_size,
        max_sequence_length=max_length,
        transforms=[embeddings_op_item, embeddings_op_user],
        shuffle=False,
    )

    batch, _ = next(iter(data_loader))

    # Sequential input block with pre-trained features.
    # `max_length` is reused here so the block and the data loader cannot
    # drift apart if the sequence length is ever changed.
    inputs = tr.TabularSequenceFeatures.from_schema(
        data_loader.output_schema,
        max_sequence_length=max_length,
        pretrained_output_dims=pretrained_dim,
        aggregation=None,
    )

    # Sequential input + concat aggregation, which inherently performs broadcasting of 2-D features.
    inputs_with_concat = tr.TabularSequenceFeatures.from_schema(
        data_loader.output_schema,
        embedding_dim_default=embedding_dim_default,
        max_sequence_length=max_length,
        aggregation="concat",
    )

    output = inputs.to(batch["item_id"].device).double()(batch)
    concat_output = inputs_with_concat.to(batch["item_id"].device).double()(batch)

    assert concat_output.shape[-1] == embedding_dim_default * 2 + item_dim + user_dim

    def expected_dim(feature_name, table_dim):
        """Return the width expected for a pre-trained feature in `output`."""
        if pretrained_dim is None:
            # No output dims requested: the table width is expected.
            return table_dim
        if isinstance(pretrained_dim, dict):
            return pretrained_dim[feature_name]
        return pretrained_dim

    assert "pretrained_item_id_embeddings" in output
    # Expected widths are derived from the parametrized value instead of a
    # hard-coded literal, so new parametrize entries need no assertion edits.
    assert list(output["pretrained_item_id_embeddings"].shape) == [
        batch_size,
        max_length,
        expected_dim("pretrained_item_id_embeddings", item_dim),
    ]
    assert list(output["pretrained_user_id_embeddings"].shape) == [
        batch_size,
        expected_dim("pretrained_user_id_embeddings", user_dim),
    ]


@pytest.mark.parametrize(
    "pretrained_dim",
    [None, 128, {"pretrained_item_id_embeddings": 128, "pretrained_user_id_embeddings": 128}],
)
def test_non_sequential_input_block_with_pretrained_embeddings(pretrained_dim):
    """Check output shapes of a non-sequential input block fed with pre-trained embeddings.

    A data loader is built with two ``EmbeddingOperator`` transforms (one keyed
    on ``item_id``, one on ``user_id``), and a batch from it is passed through
    ``TabularFeatures`` configured with ``sequence_combiner="mean"`` and no
    aggregation. Both pre-trained features are asserted to come out 2-D.

    Parameters
    ----------
    pretrained_dim : None, int or dict
        Value forwarded as ``pretrained_output_dims``. With ``None`` the
        pre-trained features are expected to keep the width of their
        embedding table; with an int (or a per-feature dict) they are
        expected to come out with that width.
    """
    data = tr.data.music_streaming_testing_data
    seq_schema = data.merlin_schema.select_by_name(["item_id"])
    # Set the property `dims` for the non-sequential feature: "user_id"
    user_cardinality = data.merlin_schema["user_id"].int_domain.max + 1
    seq_schema = seq_schema + CoreSchema(
        [
            ColumnSchema(
                "user_id",
                dtype=np.int32,
                tags=[Tags.USER, Tags.CATEGORICAL],
                properties={
                    "domain": {"name": "user_id", "min": 0, "max": user_cardinality},
                },
                dims=(None,),
            )
        ]
    )
    batch_size, max_length = 128, 20
    item_dim, user_dim = 32, 16

    # generate pre-trained embeddings tables
    item_cardinality = seq_schema["item_id"].int_domain.max + 1
    np_emb_item_id = np.random.rand(item_cardinality, item_dim)
    np_emb_user_id = np.random.rand(user_cardinality, user_dim)
    embeddings_op_item = EmbeddingOperator(
        np_emb_item_id, lookup_key="item_id", embedding_name="pretrained_item_id_embeddings"
    )
    embeddings_op_user = EmbeddingOperator(
        np_emb_user_id, lookup_key="user_id", embedding_name="pretrained_user_id_embeddings"
    )

    # set dataloader with pre-trained embeddings
    data_loader = MerlinDataLoader.from_schema(
        seq_schema,
        Dataset(data.path, schema=seq_schema),
        batch_size=batch_size,
        max_sequence_length=max_length,
        transforms=[embeddings_op_item, embeddings_op_user],
        shuffle=False,
    )

    batch, _ = next(iter(data_loader))

    # Non-Sequential input block with a 3-D pre-trained feature
    inputs = tr.TabularFeatures.from_schema(
        data_loader.output_schema,
        pretrained_output_dims=pretrained_dim,
        sequence_combiner="mean",
        aggregation=None,
    )
    output = inputs.to(batch["item_id"].device).double()(batch)

    def expected_dim(feature_name, table_dim):
        """Return the width expected for a pre-trained feature in `output`."""
        if pretrained_dim is None:
            # No output dims requested: the table width is expected.
            return table_dim
        if isinstance(pretrained_dim, dict):
            return pretrained_dim[feature_name]
        return pretrained_dim

    assert "pretrained_item_id_embeddings" in output
    # Expected widths are derived from the parametrized value instead of a
    # hard-coded literal, so new parametrize entries need no assertion edits.
    assert list(output["pretrained_item_id_embeddings"].shape) == [
        batch_size,
        expected_dim("pretrained_item_id_embeddings", item_dim),
    ]
    assert list(output["pretrained_user_id_embeddings"].shape) == [
        batch_size,
        expected_dim("pretrained_user_id_embeddings", user_dim),
    ]
Loading

0 comments on commit 93a5aff

Please sign in to comment.