From 979a5f9c5f51b61e4188b67865e7486849a22a47 Mon Sep 17 00:00:00 2001
From: Oliver Holworthy
Date: Wed, 12 Apr 2023 11:07:02 +0100
Subject: [PATCH 1/2] Enable CategorifyTransform cpp op to run on int16 types

---
 cpp/nvtabular/inference/categorify.cc | 10 ++++++++++
 1 file changed, 10 insertions(+)

NOTE(review): each switch below dispatches on the dtype kind ('u' = unsigned,
'i' = signed) and then on the item size in bytes; the new `case 2` arms add
the 16-bit variants next to the existing 4-byte arms. The explicit template
arguments and the leading whitespace of these hunks were reconstructed from
the surrounding case labels -- confirm both against the target file before
applying (or apply with --ignore-whitespace).

diff --git a/cpp/nvtabular/inference/categorify.cc b/cpp/nvtabular/inference/categorify.cc
index 734610f073e..90c3c336228 100644
--- a/cpp/nvtabular/inference/categorify.cc
+++ b/cpp/nvtabular/inference/categorify.cc
@@ -93,6 +93,9 @@ namespace nvtabular
         case 'u':
           switch (dtype.itemsize())
           {
+          case 2:
+            insert_int_mapping<uint16_t>(values);
+            return;
           case 4:
             insert_int_mapping<uint32_t>(values);
             return;
@@ -104,6 +107,9 @@ namespace nvtabular
         case 'i':
           switch (dtype.itemsize())
           {
+          case 2:
+            insert_int_mapping<int16_t>(values);
+            return;
           case 4:
             insert_int_mapping<int32_t>(values);
             return;
@@ -198,6 +204,8 @@ namespace nvtabular
         case 'u':
           switch (itemsize)
           {
+          case 2:
+            return transform_int<uint16_t>(input);
           case 4:
             return transform_int<uint32_t>(input);
           case 8:
@@ -207,6 +215,8 @@ namespace nvtabular
         case 'i':
           switch (itemsize)
           {
+          case 2:
+            return transform_int<int16_t>(input);
           case 4:
             return transform_int<int32_t>(input);
           case 8:

From 82fd2f341cc63aaef77d780b0fe601dea302faf2 Mon Sep 17 00:00:00 2001
From: Oliver Holworthy
Date: Wed, 12 Apr 2023 11:51:32 +0100
Subject: [PATCH 2/2] Add test for categorify inference op with different types

---
 tests/unit/ops/test_categorify.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

NOTE(review): the new test fits Categorify on one string column plus signed
and unsigned 16/32/64-bit integer columns, runs the inference op on the same
tensors, and asserts that every output column has dtype int64. Leading
whitespace (including the blank context line) was reconstructed -- confirm
against the target file before applying.

diff --git a/tests/unit/ops/test_categorify.py b/tests/unit/ops/test_categorify.py
index c92030c7e05..2c90488b2b0 100644
--- a/tests/unit/ops/test_categorify.py
+++ b/tests/unit/ops/test_categorify.py
@@ -695,3 +695,29 @@ def test_categorify_joint_list(cpu):
 
     assert compare_a == [1, 5, 2, 3]
     assert compare_e == [2, 3, 1, 4, 1]
+
+
+def test_categorify_inference():
+    num_rows = 100
+    a_char, z_char = np.array(["a", "z"]).view("int32")
+    input_tensors = {
+        "unicode_string": np.random.randint(
+            low=a_char, high=z_char, size=num_rows * 10, dtype="int32"
+        ).view("U10"),
+        "int16_feature": np.random.randint(0, 10, dtype="int16", size=num_rows),
+        "int32_feature": np.random.randint(0, 10, dtype="int32", size=num_rows),
+        "int64_feature": np.random.randint(0, 10, dtype="int64", size=num_rows),
+        "uint16_feature": np.random.randint(0, 10, dtype="uint16", size=num_rows),
+        "uint32_feature": np.random.randint(0, 10, dtype="uint32", size=num_rows),
+        "uint64_feature": np.random.randint(0, 10, dtype="uint64", size=num_rows),
+    }
+    df = dispatch.make_df(input_tensors)
+    cat_names = df.columns
+    cats = cat_names >> nvt.ops.Categorify()
+    workflow = nvt.Workflow(cats)
+    workflow.fit(nvt.Dataset(df))
+    model_config = {}
+    inference_op = cats.op.inference_initialize(cats.input_columns, model_config)
+    output_tensors = inference_op.transform(cats.input_columns, input_tensors)
+    for key in input_tensors:
+        assert output_tensors[key].dtype == np.dtype("int64")