pytorch · spcyppt · Dec 20, 2024
diff --git a/fbgemm_gpu/codegen/genscript/optimizer_args.py b/fbgemm_gpu/codegen/genscript/optimizer_args.py
diff --git a/fbgemm_gpu/codegen/genscript/optimizers.py b/fbgemm_gpu/codegen/genscript/optimizers.py
@@ -1001,6 +1001,29 @@ def partial_rowwise_lamb() -> Dict[str, Any]:
 
 
 def adam() -> Dict[str, Any]:
+    split_precomputation = """
+    at::acc_type<cache_t, true>* __restrict__ row_counter;
+    at::acc_type<cache_t, true> _row_counter = iter;
+    if (use_rowwise_bias_correction) {
+        const auto row_counter_placement = static_cast<PlacementType>(row_counter_placements[t]);
+        const int64_t row_counter_offset = row_counter_offsets[t];
+        if (row_counter_placement == PlacementType::DEVICE) {
+            row_counter = &row_counter_dev[row_counter_offset];
+        } else {
+            row_counter = &row_counter_uvm[row_counter_offset];
+        }
+
+        // need to compute bias correction for each row
+        if (threadIdx.x == 0) {
+            _row_counter = row_counter[idx] + 1;
+            row_counter[idx] = _row_counter;
+        }
+
+        // broadcast bias correction to all threads
+        _row_counter = SHFL_SYNC(_row_counter, 0);
+    }
+    """
+
     split_weight_update = """
       Vec4T<cache_t> m_t(&momentum1[idx * D + d]);
       m_t.acc.x *= beta1;
@@ -1023,10 +1046,10 @@ def adam() -> Dict[str, Any]:
       v_t.fma_(grad, 1.0 - beta2);
       v_t.store(&momentum2[idx * D + d]);
 
-      weight_new.acc.x -= learning_rate * (m_t.acc.x / (1.0 - powf(beta1, iter)) / (sqrtf((v_t.acc.x / (1.0 - powf(beta2, iter)))) + eps) + weight_decay * weight_new.acc.x);
-      weight_new.acc.y -= learning_rate * (m_t.acc.y / (1.0 - powf(beta1, iter)) / (sqrtf((v_t.acc.y / (1.0 - powf(beta2, iter)))) + eps) + weight_decay * weight_new.acc.y);
-      weight_new.acc.z -= learning_rate * (m_t.acc.z / (1.0 - powf(beta1, iter)) / (sqrtf((v_t.acc.z / (1.0 - powf(beta2, iter)))) + eps) + weight_decay * weight_new.acc.z);
-      weight_new.acc.w -= learning_rate * (m_t.acc.w / (1.0 - powf(beta1, iter)) / (sqrtf((v_t.acc.w / (1.0 - powf(beta2, iter)))) + eps) + weight_decay * weight_new.acc.w);
+      weight_new.acc.x -= learning_rate * (m_t.acc.x / (1.0 - powf(beta1, _row_counter)) / (sqrtf((v_t.acc.x / (1.0 - powf(beta2, _row_counter)))) + eps) + weight_decay * weight_new.acc.x);
+      weight_new.acc.y -= learning_rate * (m_t.acc.y / (1.0 - powf(beta1, _row_counter)) / (sqrtf((v_t.acc.y / (1.0 - powf(beta2, _row_counter)))) + eps) + weight_decay * weight_new.acc.y);
+      weight_new.acc.z -= learning_rate * (m_t.acc.z / (1.0 - powf(beta1, _row_counter)) / (sqrtf((v_t.acc.z / (1.0 - powf(beta2, _row_counter)))) + eps) + weight_decay * weight_new.acc.z);
+      weight_new.acc.w -= learning_rate * (m_t.acc.w / (1.0 - powf(beta1, _row_counter)) / (sqrtf((v_t.acc.w / (1.0 - powf(beta2, _row_counter)))) + eps) + weight_decay * weight_new.acc.w);
     """
     split_weight_update_cpu = ""  # TODO
 
@@ -1043,12 +1066,14 @@ def adam() -> Dict[str, Any]:
                 OptimItem(ArgType.FLOAT, "beta2"),
                 OptimItem(ArgType.FLOAT, "weight_decay"),
                 OptimItem(ArgType.INT, "iter"),
+                OptimItem(ArgType.BOOL, "use_rowwise_bias_correction"),
+                OptimItem(ArgType.TENSOR, "row_counter", is_optional=True),
             ],
             {
-                "v1": "Tensor momentum1, Tensor momentum2, float learning_rate = 0, float eps = 0, float beta1 = 0, float beta2 = 0, float weight_decay = 0, int iter = 0"
+                "v1": "Tensor momentum1, Tensor momentum2, float learning_rate = 0, float eps = 0, float beta1 = 0, float beta2 = 0, float weight_decay = 0, int iter = 0, bool use_rowwise_bias_correction = False, Tensor? row_counter = None",
             },
         ),
-        "split_precomputation": "",
+        "split_precomputation": split_precomputation,
         "split_weight_update": split_weight_update,
         "split_post_update": "",
         "split_weight_update_cpu": split_weight_update_cpu,

diff --git a/fbgemm_gpu/codegen/genscript/torch_type_utils.py b/fbgemm_gpu/codegen/genscript/torch_type_utils.py
@@ -26,6 +26,7 @@ class ArgType(IntEnum):
     INT = 7
     FLOAT = 8
     SYM_INT = 9
+    BOOL = 10
 
 
 @dataclass

diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_host_cpu_template.cpp b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_host_cpu_template.cpp
@@ -64,14 +64,14 @@ class SplitLookupFunction_{{ optimizer }}_Op : public torch::autograd::Function<
     bool gradient_clipping,
     double max_gradient,
     bool stochastic_rounding,
-    {{ args.split_function_args | join(", ") }},
+    {{ args.split_function_args_autograd | join(", ") }},
     int64_t output_dtype = static_cast<int64_t>(SparseType::FP32)) {
     Tensor indice_weights_value = indice_weights.value_or(Tensor());
     Tensor feature_requires_grad_value =
         feature_requires_grad.value_or(Tensor());
     ctx->save_for_backward({
         host_weights, weights_placements, weights_offsets, D_offsets, hash_size_cumsum,
-        indices, offsets, indice_weights_value, feature_requires_grad_value, {{ args.split_saved_tensors | join(", ") }} });
+        indices, offsets, indice_weights_value, feature_requires_grad_value, {{ args.split_saved_tensors_optional | join(", ") }} });
 
     ctx->saved_data["total_D"] = total_D;
     ctx->saved_data["max_D"] = max_D;
@@ -242,7 +242,7 @@ Tensor split_embedding_codegen_lookup_{{ optimizer }}_function_cpu(
       gradient_clipping,
       max_gradient,
       stochastic_rounding,
-      {{ args.split_function_arg_names | join(", ") }},
+      {{ args.split_function_arg_names_autograd | join(", ") }},
       output_dtype)[0];
   {% else %}
   TORCH_CHECK(false, "split_embedding_codegen_lookup_{{ optimizer }}_function_cpu is deprecated. Please see https://github.com/pytorch/FBGEMM/discussions/1727 for more detail.");

diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_host_template.cpp b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_host_template.cpp
@@ -363,7 +363,7 @@ enum SSDTensor {
           {%- if ssd %}
           ssd_tensors.value(),
           {%- endif  %}
-          {{ args.split_function_arg_names | join(", ") }}
+          {{ args.split_function_arg_names_autograd | join(", ") }}
           {%- endif %}
           )[0];
 {%- endmacro %}
@@ -623,7 +623,7 @@ class {{ autograd_func }} :
     {%- if ssd %}
     const at::TensorList& ssd_tensors,
     {%- endif %}
-    {{ args.split_function_args | join(", ") }}
+    {{ args.split_function_args_autograd | join(", ") }}
     {%- else %}
     {%- if vbe %}
     const std::optional<Tensor>& B_offsets,
@@ -762,7 +762,7 @@ class {{ autograd_func }} :
         ssd_tensors[SSDTensor::{{ tensor | upper }}],
         {%- endfor %}
         {%- endif %}
-        {{ args.split_saved_tensors | join(", ") }}
+        {{ args.split_saved_tensors_optional | join(", ") }}
     });
 
     {%- if not nobag %}

diff --git a/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp b/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp
@@ -476,12 +476,35 @@ enum SSDTensor {
 
 /* This macro generates a code blob for unpacking the tensor list
 */
-{%- macro unpack_tensor_list(tensor_list) %}
-    const Tensor {{ tensor_list }}_host = {{ tensor_list }}[0];
-    const Tensor {{ tensor_list }}_dev = {{ tensor_list }}[1];
-    const Tensor {{ tensor_list }}_uvm = {{ tensor_list }}[2];
-    const Tensor {{ tensor_list }}_placements = {{ tensor_list }}[3];
-    const Tensor {{ tensor_list }}_offsets = {{ tensor_list }}[4];
+{%- macro unpack_tensorlist(name) %}
+    const Tensor {{ name }}_host = {{ name }}[0];
+    const Tensor {{ name }}_dev = {{ name }}[1];
+    const Tensor {{ name }}_uvm = {{ name }}[2];
+    const Tensor {{ name }}_placements = {{ name }}[3];
+    const Tensor {{ name }}_offsets = {{ name }}[4];
+{%- endmacro %}
+
+{%- macro unpack_tensorlist_optional(name) %}
+    Tensor {{ name }}_host;
+    Tensor {{ name }}_dev;
+    Tensor {{ name }}_uvm;
+    Tensor {{ name }}_placements;
+    Tensor {{ name }}_offsets;
+    if ({{ name }}.has_value()) {
+      at::TensorList _{{ name }} = {{ name }}.value();
+      {{ name }}_host = _{{ name }}[0];
+      {{ name }}_dev = _{{ name }}[1];
+      {{ name }}_uvm = _{{ name }}[2];
+      {{ name }}_placements = _{{ name }}[3];
+      {{ name }}_offsets = _{{ name }}[4];
+    }
+    else{
+      {{ name }}_host = at::empty({0}, weights_host.options());
+      {{ name }}_dev = at::empty({0}, weights_dev.options());
+      {{ name }}_uvm = at::empty({0}, weights_uvm.options());
+      {{ name }}_placements = at::empty({0}, weights_placements.options());
+      {{ name }}_offsets = at::empty({0}, weights_offsets.options());
+    }
 {%- endmacro %}
 
 
@@ -582,9 +605,12 @@ class {{ autograd_func }} :
     {{ args_pt2.unified_pt2.split_function_args | join(", ") }}) {
 
     // unpack Tensor lists
-    {{ unpack_tensor_list("weights") }}
-    {%- for arg_name in args_pt2.unified_pt2.split_saved_tensor_list %}
-    {{ unpack_tensor_list(arg_name) }}
+    {{ unpack_tensorlist("weights") }}
+    {%- for arg_name in args_pt2.unified_pt2.split_saved_tensorlist %}
+    {{ unpack_tensorlist(arg_name) }}
+    {%- endfor %}
+    {%- for arg_name in args_pt2.unified_pt2.split_saved_tensorlist_optional %}
+    {{ unpack_tensorlist_optional(arg_name) }}
     {%- endfor %}
 
     const auto T = weights_offsets.sym_numel();

diff --git a/fbgemm_gpu/codegen/training/python/split_embedding_codegen_lookup_invoker.template b/fbgemm_gpu/codegen/training/python/split_embedding_codegen_lookup_invoker.template
@@ -60,7 +60,7 @@ def invoke(
     {%- if "prev_iter_dev" in args.split_function_arg_names %}
     prev_iter: Momentum,
     {%- endif %}
-    {%- if "row_counter_dev" in args.split_function_arg_names %}
+    {%- if "row_counter_dev" in args.split_function_arg_names and "row_counter" not in args_pt2.unified_pt2.split_saved_tensorlist_optional %}
     row_counter: Momentum,
     {%- endif %}
     {%- if "iter" in args.split_function_arg_names %}
@@ -209,7 +209,7 @@ def invoke(
             prev_iter_placements=prev_iter.placements,
             {%- endif %}
             # row_counter
-            {%- if "row_counter_dev" in args.split_function_arg_names %}
+            {%- if "row_counter_dev" in args.split_function_arg_names and "row_counter" not in args_pt2.unified_pt2.split_saved_tensorlist_optional %}
             row_counter_host=row_counter.host,
             row_counter_offsets=row_counter.offsets,
             row_counter_placements=row_counter.placements,
@@ -387,7 +387,7 @@ def invoke(
         prev_iter_dev=prev_iter_dev,
         {%- endif %}
         # row_counter
-        {%- if "row_counter_dev" in args.split_function_arg_names %}
+        {%- if "row_counter_dev" in args.split_function_arg_names and "row_counter" not in args_pt2.unified_pt2.split_saved_tensorlist_optional %}
         row_counter_dev=row_counter.dev,
         row_counter_uvm=row_counter.uvm,
         row_counter_offsets=row_counter.offsets,

diff --git a/fbgemm_gpu/test/tbe/training/forward_test.py b/fbgemm_gpu/test/tbe/training/forward_test.py
@@ -76,6 +76,13 @@
         "test_faketensor__test_forward_gpu_uvm_cache_int8": [
             unittest.skip("Operator not implemented for Meta tensors"),
         ],
+        # learning rate tensor needs to be on CPU to avoid D->H sync point since it will be used as float in the kernel
+        # this fails fake_tensor test as the test expects all tensors to be on the same device
+        "test_pt2_compliant_tag_fbgemm_split_embedding_codegen_lookup_rowwise_adagrad_function": [
+            unittest.skip(
+                "Operator failed on FakeTensor test since learning rate tensor is always on CPU regardless of other tensors"
+            ),
+        ],
     }
 )