Commit 2d16f61

allow FP16-type grad_t (#1072)
Summary: Pull Request resolved: #1072. This diff partially revives D31432199 (127f813), but enables only `grad_t = FP16` (no `BF16` support) in order to limit adverse side effects such as increased binary size and compilation time. Specifically, D31432199 (127f813) provided FP32, FP16, and BF16 options for `grad_t`; this diff drops the BF16 option, leaving only FP32 and FP16 for `grad_t`.

Reviewed By: jianyuh

Differential Revision: D35120293

fbshipit-source-id: b9a1d35f901b26277a220360a2a68583c65c8554
1 parent 4454ac5 commit 2d16f61
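The binary-size and compile-time concern in the summary comes from dispatch fan-out: every dtype admitted for `grad_t` is combined with every embedding and cache dtype, and each combination instantiates the kernel template. A back-of-the-envelope sketch; the type counts here are illustrative assumptions, not the exact FBGEMM configuration:

```cpp
// Hypothetical fan-out arithmetic: each extra grad_t dtype multiplies the
// number of kernel template instantiations the dispatch macros emit.
#include <cstdio>

int main() {
  const int emb_types = 2;            // e.g., float and at::Half (illustrative)
  const int cache_types = 2;          // e.g., float and at::Half (illustrative)
  const int grad_types_fp32_fp16 = 2; // this diff: float, at::Half
  const int grad_types_with_bf16 = 3; // D31432199: float, at::Half, at::BFloat16

  std::printf("instantiations per kernel: %d with FP32+FP16, %d with BF16 too\n",
              emb_types * cache_types * grad_types_fp32_fp16,
              emb_types * cache_types * grad_types_with_bf16);
  return 0;
}
```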

File tree

4 files changed: 116 additions & 48 deletions

fbgemm_gpu/codegen/embedding_backward_split_cpu_template.cpp

Lines changed: 28 additions & 24 deletions
```diff
@@ -344,31 +344,35 @@ void split_embedding_backward_exact_cpu_dense_kernel(
 
   grad_output = grad_output.contiguous();
 
+
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      host_weights.scalar_type(), "split_embedding_backward_exact_cpu", [&] {
-        // TODO: respect output_dtype
-        using grad_t = float;
-        split_embedding_backward_exact_cpu_kernel<scalar_t, grad_t>(
-            grad_output,
-            host_weights,
-            weights_offsets_data,
-            D_offsets_data,
-            hash_size_cumsum,
-            indices,
-            offsets,
-            pooling_mode,
-            indice_weights,
-            num_tables,
-            B,
-            table_to_feature_offset,
-            {% if "momentum1_offsets" in args.split_function_arg_names %}
-            momentum1_offsets_data,
-            {% endif %}
-            {% if "momentum2_offsets" in args.split_function_arg_names %}
-            momentum2_offsets_data,
-            {% endif %}
-            {{ args.split_cpu_kernel_arg_constructors | join(", ") }});
-      });
+      grad_output.scalar_type(),
+      "split_embedding_backward_exact_cpu_outer", [&]() {
+        using grad_t = scalar_t;
+        AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+            host_weights.scalar_type(), "split_embedding_backward_exact_cpu", [&] {
+              split_embedding_backward_exact_cpu_kernel<scalar_t, grad_t>(
+                  grad_output,
+                  host_weights,
+                  weights_offsets_data,
+                  D_offsets_data,
+                  hash_size_cumsum,
+                  indices,
+                  offsets,
+                  pooling_mode,
+                  indice_weights,
+                  num_tables,
+                  B,
+                  table_to_feature_offset,
+                  {% if "momentum1_offsets" in args.split_function_arg_names %}
+                  momentum1_offsets_data,
+                  {% endif %}
+                  {% if "momentum2_offsets" in args.split_function_arg_names %}
+                  momentum2_offsets_data,
+                  {% endif %}
+                  {{ args.split_cpu_kernel_arg_constructors | join(", ") }});
+            });
+      });
 
   return;
 
```
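The key move in this hunk is the nested dispatch: the outer `AT_DISPATCH_FLOATING_TYPES_AND_HALF` binds the gradient dtype, and `using grad_t = scalar_t;` captures it before the inner dispatch re-binds `scalar_t` to the weight dtype. A minimal standalone sketch of that shadowing pattern, assuming libtorch is available (`toy_backward` and `toy_backward_kernel` are hypothetical stand-ins for the generated code):

```cpp
#include <ATen/ATen.h>
#include <ATen/Dispatch.h>
#include <cstdio>

// Hypothetical stand-in for split_embedding_backward_exact_cpu_kernel.
template <typename weight_t, typename grad_t>
void toy_backward_kernel(const at::Tensor& grad, const at::Tensor& weights) {
  std::printf("sizeof(grad_t)=%zu sizeof(weight_t)=%zu\n",
              sizeof(grad_t), sizeof(weight_t));
}

void toy_backward(const at::Tensor& grad, const at::Tensor& weights) {
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad.scalar_type(), "toy_backward_outer", [&] {
        // Capture the outer dispatch's type now; the inner dispatch below
        // shadows scalar_t with the weight dtype.
        using grad_t = scalar_t;
        AT_DISPATCH_FLOATING_TYPES_AND_HALF(
            weights.scalar_type(), "toy_backward_inner", [&] {
              toy_backward_kernel<scalar_t, grad_t>(grad, weights);
            });
      });
}
```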
fbgemm_gpu/codegen/embedding_forward_split_cpu.cpp

Lines changed: 2 additions & 2 deletions
```diff
@@ -194,8 +194,8 @@ Tensor split_embedding_codegen_forward_cpu(
   // It is assumed that the indice_weights will always be float
   TORCH_CHECK(
       !indice_weights.defined() || indice_weights.scalar_type() != at::kHalf);
-  AT_DISPATCH_FLOATING_TYPES(
-      output.scalar_type(), "split_embedding_cpu_forward", [&] {
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      output.scalar_type(), "split_embedding_cpu_forward", [&]() {
         using output_t = scalar_t;
         AT_DISPATCH_FLOATING_TYPES_AND2(
             at::ScalarType::Half,
```
fbgemm_gpu/include/fbgemm_gpu/dispatch_macros.h

Lines changed: 18 additions & 16 deletions
```diff
@@ -137,20 +137,22 @@
   }                                                                         \
 }
 
-#define DISPATCH_EMB_GRAD_CACHE_TYPES(                                      \
-    EMB_TYPE, GRAD_TYPE, CACHE_TYPE, NAME, ...)                             \
-  [&] {                                                                     \
-    const auto& emb_type = EMB_TYPE;                                        \
-    const auto& grad_type = GRAD_TYPE;                                      \
-    const auto& cache_type = CACHE_TYPE;                                    \
-    at::ScalarType _emb_t = ::detail::scalar_type(emb_type);                \
-    at::ScalarType _grad_t = ::detail::scalar_type(grad_type);              \
-    at::ScalarType _cache_t = ::detail::scalar_type(cache_type);            \
-    switch (_grad_t) {                                                      \
-      PRIVATE_CASE_TYPE_CACHE_EMB(                                          \
-          at::ScalarType::Float, _cache_t, _emb_t, float, NAME, __VA_ARGS__) \
-      default:                                                              \
-        AT_ERROR(                                                           \
-            #NAME, " not implemented for grad_t '", toString(_grad_t), "'"); \
-    }                                                                       \
+#define DISPATCH_EMB_GRAD_CACHE_TYPES(                                       \
+    EMB_TYPE, GRAD_TYPE, CACHE_TYPE, NAME, ...)                              \
+  [&] {                                                                      \
+    const auto& emb_type = EMB_TYPE;                                         \
+    const auto& grad_type = GRAD_TYPE;                                       \
+    const auto& cache_type = CACHE_TYPE;                                     \
+    at::ScalarType _emb_t = ::detail::scalar_type(emb_type);                 \
+    at::ScalarType _grad_t = ::detail::scalar_type(grad_type);               \
+    at::ScalarType _cache_t = ::detail::scalar_type(cache_type);             \
+    switch (_grad_t) {                                                       \
+      PRIVATE_CASE_TYPE_CACHE_EMB(                                           \
+          at::ScalarType::Float, _cache_t, _emb_t, float, NAME, __VA_ARGS__) \
+      PRIVATE_CASE_TYPE_CACHE_EMB(                                           \
+          at::ScalarType::Half, _cache_t, _emb_t, at::Half, NAME, __VA_ARGS__) \
+      default:                                                               \
+        AT_ERROR(                                                            \
+            #NAME, " not implemented for grad_t '", toString(_grad_t), "'"); \
+    }                                                                        \
   }()
```
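The macro change follows the same switch-on-`ScalarType` pattern PyTorch's own dispatch macros use: each `PRIVATE_CASE_TYPE_CACHE_EMB` expands to a `case` that binds the C++ type and then fans out over the cache and embedding types. A simplified single-level sketch of the pattern (this `DEMO_DISPATCH_GRAD_TYPE` macro is illustrative, not FBGEMM's actual helper):

```cpp
#include <ATen/ATen.h>

// Simplified: dispatch only over grad_t; the real macro nests further
// switches for cache_t and emb_t inside each case.
#define DEMO_DISPATCH_GRAD_TYPE(GRAD_TYPE, NAME, ...)            \
  [&] {                                                          \
    const at::ScalarType _grad_t = GRAD_TYPE;                    \
    switch (_grad_t) {                                           \
      case at::ScalarType::Float: {                              \
        using grad_t = float;                                    \
        return __VA_ARGS__();                                    \
      }                                                          \
      case at::ScalarType::Half: { /* the case this diff adds */ \
        using grad_t = at::Half;                                 \
        return __VA_ARGS__();                                    \
      }                                                          \
      default:                                                   \
        AT_ERROR(#NAME, " not implemented for grad_t '",         \
                 toString(_grad_t), "'");                        \
    }                                                            \
  }()
```

A call site would look like `DEMO_DISPATCH_GRAD_TYPE(grad.scalar_type(), demo, [&] { /* grad_t is bound here */ });`, where the lambda body is pasted into each `case` so the locally defined `grad_t` alias is in scope.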
