From 75dd112513a0574feb39a031272b8b1b11e06239 Mon Sep 17 00:00:00 2001
From: Banit Agrawal
Date: Fri, 10 Mar 2023 13:04:26 -0800
Subject: [PATCH 01/34] using different mechanism for host mapped pinned memory (#1638)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1638

This diff adds another mechanism for allocating host-mapped pinned memory, to reduce the adverse effect on other processes running on the same host when one process is doing some large allocations.

Reviewed By: zyan0, jianyuh

Differential Revision: D43950253

fbshipit-source-id: 41a434cb63354509d32e00c851c5f3a2d68be686
---
 fbgemm_gpu/src/cumem_utils.cu | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/fbgemm_gpu/src/cumem_utils.cu b/fbgemm_gpu/src/cumem_utils.cu
index 7a060681f0..7b49040a83 100644
--- a/fbgemm_gpu/src/cumem_utils.cu
+++ b/fbgemm_gpu/src/cumem_utils.cu
@@ -41,7 +41,8 @@ struct CUDAHostMappedContext {
   ~CUDAHostMappedContext() {
     at::cuda::OptionalCUDAGuard device_guard;
     device_guard.set_index(cuda_device_);
-    AT_CUDA_CHECK(cudaFreeHost(ptr_));
+    AT_CUDA_CHECK(cudaHostUnregister(ptr_));
+    free(ptr_);
   }

   static void release(void* ptr) {
@@ -206,9 +207,28 @@ Tensor new_host_mapped_tensor(
   auto strides = defaultStrides(sizes);
   size_t size_bytes =
       at::detail::computeStorageNbytes(sizes, strides, self.dtype().itemsize());
-  void* ptr;
-  AT_CUDA_CHECK(cudaHostAlloc(
-      &ptr, size_bytes, cudaHostAllocWriteCombined | cudaHostAllocMapped));
+
+  // When using cudaHostAlloc for large allocations, we found that it can
+  // potentially take a global lock and lock out CUDA APIs from other processes.
+  // The main cost in cudaHostAlloc is faulting/mapping the pages. So, instead
+  // of using this CUDA API, we can do a regular malloc, pre-fault the pages, and
+  // then do cudaHostRegister with GPU mapping flags to lock the pages, so we
+  // can minimize the cost while holding this global lock.
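+  //
+  // Note: madvise(MADV_HUGEPAGE) is advisory and only takes effect when
+  // transparent hugepages are enabled on the host. If THP is disabled, the
+  // pre-faulting loop below touches just one 4KB page per 2MB stride, and
+  // cudaHostRegister faults in the remaining pages.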
+  void* const ptr = malloc(size_bytes);
+
+  // advise the kernel to allocate large 2M pages
+  madvise(ptr, size_bytes, MADV_HUGEPAGE);
+
+  // pre-fault/map the pages by setting the first byte of each page
+  size_t pageSize = (1 << 21);
+  uintptr_t alignedPtr = (((uintptr_t)ptr + pageSize - 1) & ~(pageSize - 1));
+  for (uintptr_t p = alignedPtr; p < ((uintptr_t)ptr + size_bytes);
+       p += pageSize) {
+    memset((void*)p, 0, 1);
+  }
+
+  AT_CUDA_CHECK(cudaHostRegister(
+      ptr, size_bytes, cudaHostRegisterMapped | cudaHostRegisterPortable));

   void* dev_ptr;
   AT_CUDA_CHECK(cudaHostGetDevicePointer(&dev_ptr, ptr, 0));

From 30833faf5fcea8917d3f22e69f1826c1431e8e3c Mon Sep 17 00:00:00 2001
From: Li Li
Date: Mon, 13 Mar 2023 15:19:53 -0700
Subject: [PATCH 02/34] disable use_cpu test (#1635)

Summary:
This PR addresses issue https://github.com/pytorch/FBGEMM/issues/1636, akin to https://github.com/pytorch/FBGEMM/blob/8616ed701015f8b9e4c2825ce592b204b4cfaf28/fbgemm_gpu/test/split_table_batched_embeddings_test.py#L1009

Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1635

Reviewed By: shintaro-iwasaki

Differential Revision: D44033725

Pulled By: q10

fbshipit-source-id: 49f28fc2f1c20948a42728eebf3defc5195baa5d
---
 fbgemm_gpu/test/jagged_tensor_ops_test.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/fbgemm_gpu/test/jagged_tensor_ops_test.py b/fbgemm_gpu/test/jagged_tensor_ops_test.py
index 9ed5a39f47..98021007f4 100644
--- a/fbgemm_gpu/test/jagged_tensor_ops_test.py
+++ b/fbgemm_gpu/test/jagged_tensor_ops_test.py
@@ -20,7 +20,12 @@
     from fbgemm_gpu import open_source  # noqa: F401

     # pyre-ignore[21]
-    from test_utils import gpu_available, gpu_unavailable, running_on_github
+    from test_utils import (
+        gpu_available,
+        gpu_unavailable,
+        running_on_github,
+        TEST_WITH_ROCM,
+    )
 except Exception:
     torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
     torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
@@ -28,6 +33,7 @@
         gpu_available,
         gpu_unavailable,
         running_on_github,
+        TEST_WITH_ROCM,
    )


@@ -1466,7 +1472,11 @@ def jagged_index_select_2d_ref(
                 torch.long,
             ]  # Disable torch.bfloat16 due to large error bound
         ),
-        use_cpu=st.booleans() if gpu_available else st.just(True),
+        use_cpu=st.booleans()
+        if (gpu_available and not TEST_WITH_ROCM)
+        else st.just(False)
+        if (gpu_available and TEST_WITH_ROCM)
+        else st.just(True),
     )
     @settings(max_examples=20, deadline=None)
     def test_jagged_index_select_2d(

From b8241da8ab33b3093cb01f57af2d991f9686a457 Mon Sep 17 00:00:00 2001
From: Sabin Devkota
Date: Mon, 13 Mar 2023 18:52:06 -0700
Subject: [PATCH 03/34] Update API interface and reroute backend for exact_rowwise_adagrad FE when using freq based methods (#1352)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1352

1. Update interface to accommodate rowwise_adagrad_with_counter.
2. Route backend for rowwise_adagrad to the new rowwise_adagrad_with_counter when freq based methods (e.g. freq sgd, counter adjusted regularization) are used.
Reviewed By: csmiler Differential Revision: D36788395 fbshipit-source-id: 8eb5da8a5c8b52bc1e237af1054aac9f7245c443 --- fbgemm_gpu/codegen/__init__.template | 2 + .../embedding_backward_code_generator.py | 11 +- fbgemm_gpu/codegen/lookup_args.py | 7 + ..._embedding_codegen_lookup_invoker.template | 85 +++++ .../split_table_batched_embeddings_ops.py | 349 +++++++++++++++--- .../ssd_split_table_batched_embeddings_ops.py | 19 + .../split_table_batched_embeddings_test.py | 219 ++++++++++- 7 files changed, 618 insertions(+), 74 deletions(-) diff --git a/fbgemm_gpu/codegen/__init__.template b/fbgemm_gpu/codegen/__init__.template index de8bf21dd0..661622eff9 100644 --- a/fbgemm_gpu/codegen/__init__.template +++ b/fbgemm_gpu/codegen/__init__.template @@ -13,7 +13,9 @@ import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_lars_sgd as loo import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_partial_rowwise_adam as lookup_partial_rowwise_adam # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_partial_rowwise_lamb as lookup_partial_rowwise_lamb # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_rowwise_adagrad as lookup_rowwise_adagrad # noqa: F401 +import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_rowwise_adagrad_with_counter as lookup_rowwise_adagrad_with_counter # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_sgd as lookup_sgd # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_approx_sgd as lookup_approx_sgd # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_approx_rowwise_adagrad as lookup_approx_rowwise_adagrad # noqa: F401 +import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_approx_rowwise_adagrad_with_counter as lookup_approx_rowwise_adagrad_with_counter # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_rowwise_weighted_adagrad as lookup_rowwise_weighted_adagrad # noqa: F401 diff --git a/fbgemm_gpu/codegen/embedding_backward_code_generator.py b/fbgemm_gpu/codegen/embedding_backward_code_generator.py index 9d67358902..fd69a22f6e 100644 --- a/fbgemm_gpu/codegen/embedding_backward_code_generator.py +++ b/fbgemm_gpu/codegen/embedding_backward_code_generator.py @@ -646,6 +646,11 @@ def rowwise_adagrad_with_counter() -> None: split_precomputation = """ at::acc_type freq = 1.0; at::acc_type l2_wd = 0.0; + at::acc_type tail_id_threshold_val = tail_id_threshold; + CUDA_KERNEL_ASSERT(max_counter > 0.0); // avoid divide by zero error + if (is_tail_id_thresh_ratio == 1){ + tail_id_threshold_val = floorf(tail_id_threshold * max_counter); + } if (counter_halflife > 0 && threadIdx.x == 0) { // if id occurs multiple times in a batch, iter_delta=1 const auto iter_delta = prev_iter[idx] == 0 ? 
1.0 : iter * 1.0 - prev_iter[idx]; @@ -660,6 +665,7 @@ def rowwise_adagrad_with_counter() -> None: } freq = SHFL_SYNC(freq, 0); l2_wd = SHFL_SYNC(l2_wd, 0); + tail_id_threshold_val = SHFL_SYNC(tail_id_threshold_val, 0); at::acc_type g_local_sum_square = 0.0; @@ -682,10 +688,7 @@ def rowwise_adagrad_with_counter() -> None: at::acc_type multiplier; at::acc_type adjusted_multiplier; at::acc_type exp_reg_correction; - at::acc_type tail_id_threshold_val = tail_id_threshold; - if (is_tail_id_thresh_ratio == 1){ - tail_id_threshold_val = floorf(tail_id_threshold * max_counter); - } + if (threadIdx.x == 0) { at::acc_type new_sum_square_grads = momentum1[idx] + g_avg_square; momentum1[idx] = new_sum_square_grads; diff --git a/fbgemm_gpu/codegen/lookup_args.py b/fbgemm_gpu/codegen/lookup_args.py index c5a3d465e9..8c98a96a1a 100644 --- a/fbgemm_gpu/codegen/lookup_args.py +++ b/fbgemm_gpu/codegen/lookup_args.py @@ -44,6 +44,13 @@ class OptimizerArgs(NamedTuple): weight_decay_mode: int eta: float momentum: float + counter_halflife: int + adjustment_iter: int + adjustment_ub: float + learning_rate_mode: int + grad_sum_decay: int + tail_id_threshold: float + is_tail_id_thresh_ratio: int class Momentum(NamedTuple): diff --git a/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template b/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template index 4cdc5b8766..bd406d39fa 100644 --- a/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template +++ b/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template @@ -36,9 +36,18 @@ def invoke( {% if "momentum2_dev" in args.split_function_arg_names %} momentum2: Momentum, {% endif %} + {% if "prev_iter_dev" in args.split_function_arg_names %} + prev_iter: Momentum, + {% endif %} + {% if "row_counter_dev" in args.split_function_arg_names %} + row_counter: Momentum, + {% endif %} {% if "iter" in args.split_function_arg_names %} iter: int, {% endif %} + {% if "max_counter" in args.split_function_arg_names %} + max_counter: float, + {% endif %} ) -> torch.Tensor: if (common_args.host_weights.numel() > 0): return torch.ops.fbgemm.split_embedding_codegen_lookup_{{ optimizer }}_function_cpu( @@ -84,6 +93,27 @@ def invoke( {% if "momentum" in args.split_function_arg_names %} momentum=optimizer_args.momentum, {% endif %} + {% if "counter_halflife" in args.split_function_arg_names %} + counter_halflife=optimizer_args.counter_halflife, + {% endif %} + {% if "adjustment_iter" in args.split_function_arg_names %} + adjustment_iter=optimizer_args.adjustment_iter, + {% endif %} + {% if "adjustment_ub" in args.split_function_arg_names %} + adjustment_ub=optimizer_args.adjustment_ub, + {% endif %} + {% if "learning_rate_mode" in args.split_function_arg_names %} + learning_rate_mode=optimizer_args.learning_rate_mode, + {% endif %} + {% if "grad_sum_decay" in args.split_function_arg_names %} + grad_sum_decay=optimizer_args.grad_sum_decay, + {% endif %} + {% if "tail_id_threshold" in args.split_function_arg_names %} + tail_id_threshold=optimizer_args.tail_id_threshold, + {% endif %} + {% if "is_tail_id_thresh_ratio" in args.split_function_arg_names %} + is_tail_id_thresh_ratio=optimizer_args.is_tail_id_thresh_ratio, + {% endif %} # momentum1 {% if "momentum1_dev" in args.split_function_arg_names %} momentum1_host=momentum1.host, @@ -96,10 +126,26 @@ def invoke( momentum2_offsets=momentum2.offsets, momentum2_placements=momentum2.placements, {% endif %} + # prev_iter + {% if "prev_iter_dev" in args.split_function_arg_names %} + 
prev_iter_host=prev_iter.host, + prev_iter_offsets=prev_iter.offsets, + prev_iter_placements=prev_iter.placements, + {% endif %} + # row_counter + {% if "row_counter_dev" in args.split_function_arg_names %} + row_counter_host=row_counter.host, + row_counter_offsets=row_counter.offsets, + row_counter_placements=row_counter.placements, + {% endif %} # iter {% if "iter" in args.split_function_arg_names %} iter=iter, {% endif %} + # max counter + {% if "max_counter" in args.split_function_arg_names %} + max_counter=max_counter, + {% endif %} ) else: return torch.ops.fbgemm.split_embedding_codegen_lookup_{{ optimizer }}_function( @@ -151,6 +197,27 @@ def invoke( {% if "momentum" in args.split_function_arg_names %} momentum=optimizer_args.momentum, {% endif %} + {% if "counter_halflife" in args.split_function_arg_names %} + counter_halflife=optimizer_args.counter_halflife, + {% endif %} + {% if "adjustment_iter" in args.split_function_arg_names %} + adjustment_iter=optimizer_args.adjustment_iter, + {% endif %} + {% if "adjustment_ub" in args.split_function_arg_names %} + adjustment_ub=optimizer_args.adjustment_ub, + {% endif %} + {% if "learning_rate_mode" in args.split_function_arg_names %} + learning_rate_mode=optimizer_args.learning_rate_mode, + {% endif %} + {% if "grad_sum_decay" in args.split_function_arg_names %} + grad_sum_decay=optimizer_args.grad_sum_decay, + {% endif %} + {% if "tail_id_threshold" in args.split_function_arg_names %} + tail_id_threshold=optimizer_args.tail_id_threshold, + {% endif %} + {% if "is_tail_id_thresh_ratio" in args.split_function_arg_names %} + is_tail_id_thresh_ratio=optimizer_args.is_tail_id_thresh_ratio, + {% endif %} # momentum1 {% if "momentum1_dev" in args.split_function_arg_names %} momentum1_dev=momentum1.dev, @@ -165,9 +232,27 @@ def invoke( momentum2_offsets=momentum2.offsets, momentum2_placements=momentum2.placements, {% endif %} + # prev_iter + {% if "prev_iter_dev" in args.split_function_arg_names %} + prev_iter_dev=prev_iter.dev, + prev_iter_uvm=prev_iter.uvm, + prev_iter_offsets=prev_iter.offsets, + prev_iter_placements=prev_iter.placements, + {% endif %} + # row_counter + {% if "row_counter_dev" in args.split_function_arg_names %} + row_counter_dev=row_counter.dev, + row_counter_uvm=row_counter.uvm, + row_counter_offsets=row_counter.offsets, + row_counter_placements=row_counter.placements, + {% endif %} # iter {% if "iter" in args.split_function_arg_names %} iter=iter, {% endif %} + # max counter + {% if "max_counter" in args.split_function_arg_names %} + max_counter=max_counter, + {% endif %} output_dtype=common_args.output_dtype, ) diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py index 0552e9c981..87b9b1a559 100644 --- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py @@ -71,6 +71,43 @@ class WeightDecayMode(enum.IntEnum): NONE = 0 L2 = 1 DECOUPLE = 2 + COUNTER = 3 + + +class CounterWeightDecayMode(enum.IntEnum): + NONE = 0 + L2 = 1 + DECOUPLE = 2 + + +class LearningRateMode(enum.IntEnum): + EQUAL = -1 + TAIL_ID_LR_INCREASE = 0 + TAIL_ID_LR_DECREASE = 1 + COUNTER_SGD = 2 + + +class GradSumDecay(enum.IntEnum): + NO_DECAY = -1 + CTR_DECAY = 0 + + +@dataclass +class TailIdThreshold: + val: float = 0 + is_ratio: bool = False + + +@dataclass +class CounterBasedRegularizationDefinition: + counter_weight_decay_mode: CounterWeightDecayMode = CounterWeightDecayMode.NONE + counter_halflife: 
int = -1 + adjustment_iter: int = -1 + adjustment_ub: float = 1.0 + learning_rate_mode: LearningRateMode = LearningRateMode.EQUAL + grad_sum_decay: GradSumDecay = GradSumDecay.NO_DECAY + tail_id_threshold: TailIdThreshold = TailIdThreshold(val=0, is_ratio=False) + max_counter_update_freq: int = 1000 RecordCacheMetrics: NamedTuple = NamedTuple( @@ -235,6 +272,9 @@ def __init__( # noqa C901 eta: float = 0.001, # used by LARS-SGD, beta1: float = 0.9, # used by LAMB and ADAM beta2: float = 0.999, # used by LAMB and ADAM + counter_based_regularization: Optional[ + CounterBasedRegularizationDefinition + ] = None, # used by Rowwise Adagrad pooling_mode: PoolingMode = PoolingMode.SUM, device: Optional[Union[str, int, torch.device]] = None, bounds_check_mode: BoundsCheckMode = BoundsCheckMode.WARNING, @@ -408,6 +448,34 @@ def __init__( # noqa C901 self.stochastic_rounding = stochastic_rounding self.optimizer = optimizer + self.weight_decay_mode = weight_decay_mode + if ( + weight_decay_mode == WeightDecayMode.COUNTER + and counter_based_regularization is None + ): + raise AssertionError( + "weight_decay_mode is set to WeightDecayMode.COUNTER but counter_based_regularization is None" + ) + + self._used_rowwise_adagrad_with_counter: bool = ( + optimizer in (OptimType.EXACT_ROWWISE_ADAGRAD, OptimType.ROWWISE_ADAGRAD) + and weight_decay_mode == WeightDecayMode.COUNTER + and counter_based_regularization is not None + ) + + if counter_based_regularization is None: + counter_based_regularization = CounterBasedRegularizationDefinition() + self._max_counter_update_freq: int = -1 + if self._used_rowwise_adagrad_with_counter: + self._max_counter_update_freq = ( + counter_based_regularization.max_counter_update_freq + ) + opt_arg_weight_decay_mode = ( + counter_based_regularization.counter_weight_decay_mode + ) + else: + opt_arg_weight_decay_mode = weight_decay_mode + self.optimizer_args = invokers.lookup_args.OptimizerArgs( stochastic_rounding=stochastic_rounding, gradient_clipping=gradient_clipping, @@ -417,9 +485,18 @@ def __init__( # noqa C901 beta1=beta1, beta2=beta2, weight_decay=weight_decay, - weight_decay_mode=weight_decay_mode.value, + weight_decay_mode=opt_arg_weight_decay_mode.value, eta=eta, momentum=momentum, + counter_halflife=counter_based_regularization.counter_halflife, + adjustment_iter=counter_based_regularization.adjustment_iter, + adjustment_ub=counter_based_regularization.adjustment_ub, + learning_rate_mode=counter_based_regularization.learning_rate_mode.value, + grad_sum_decay=counter_based_regularization.grad_sum_decay.value, + tail_id_threshold=counter_based_regularization.tail_id_threshold.val, + is_tail_id_thresh_ratio=int( + counter_based_regularization.tail_id_threshold.is_ratio + ), ) if optimizer in ( @@ -427,25 +504,7 @@ def __init__( # noqa C901 OptimType.EXACT_SGD, ): # NOTE: make TorchScript work! 
- self.register_buffer( - "momentum1_dev", torch.tensor([0], dtype=torch.int64), persistent=False - ) - self.register_buffer( - "momentum1_host", torch.tensor([0], dtype=torch.int64), persistent=False - ) - self.register_buffer( - "momentum1_uvm", torch.tensor([0], dtype=torch.int64), persistent=False - ) - self.register_buffer( - "momentum1_placements", - torch.tensor([0], dtype=torch.int64), - persistent=False, - ) - self.register_buffer( - "momentum1_offsets", - torch.tensor([0], dtype=torch.int64), - persistent=False, - ) + self._register_nonpersistent_buffers("momentum1") else: self._apply_split( construct_split_state( @@ -484,29 +543,40 @@ def __init__( # noqa C901 ) else: # NOTE: make TorchScript work! - self.register_buffer( - "momentum2_dev", - torch.zeros(1, dtype=torch.int64, device=self.current_device), - persistent=False, - ) - self.register_buffer( - "momentum2_host", - torch.zeros(1, dtype=torch.int64, device=self.current_device), - persistent=False, - ) - self.register_buffer( - "momentum2_uvm", - torch.zeros(1, dtype=torch.int64, device=self.current_device), - persistent=False, + self._register_nonpersistent_buffers("momentum2") + if self._used_rowwise_adagrad_with_counter: + self._apply_split( + construct_split_state( + embedding_specs, + rowwise=True, + cacheable=False, + ), + prefix="prev_iter", + # TODO: ideally we should use int64 to track iter but it failed to compile. + # It may be related to low precision training code. Currently using float32 + # as a workaround while investigating the issue. + # pyre-fixme[6]: Expected `Type[Type[torch._dtype]]` for 3rd param + # but got `Type[torch.float32]`. + dtype=torch.float32, ) - self.register_buffer( - "momentum2_placements", - torch.zeros(1, dtype=torch.int64, device=self.current_device), - persistent=False, + self._apply_split( + construct_split_state( + embedding_specs, + rowwise=True, + cacheable=False, + ), + prefix="row_counter", + # pyre-fixme[6]: Expected `Type[Type[torch._dtype]]` for 3rd param + # but got `Type[torch.float32]`. + dtype=torch.float32, ) + self.register_buffer("max_counter", torch.tensor([1], dtype=torch.float32)) + else: + self._register_nonpersistent_buffers("prev_iter") + self._register_nonpersistent_buffers("row_counter") self.register_buffer( - "momentum2_offsets", - torch.zeros(1, dtype=torch.int64, device=self.current_device), + "max_counter", + torch.ones(1, dtype=torch.float32, device=self.current_device), persistent=False, ) if optimizer in ( @@ -519,6 +589,7 @@ def __init__( # noqa C901 self.register_buffer( "iter", torch.zeros(1, dtype=torch.int64, device=self.current_device) ) + else: self.register_buffer( "iter", @@ -572,6 +643,34 @@ def __init__( # noqa C901 self.step = 0 + def _register_nonpersistent_buffers(self, prefix: str) -> None: + # NOTE: make TorchScript work! 
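+        # Register zero-filled, non-persistent placeholder buffers
+        # ({prefix}_dev/_host/_uvm/_placements/_offsets) so that attribute
+        # lookups still resolve under TorchScript when this optimizer state
+        # is unused.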
+ self.register_buffer( + f"{prefix}_dev", + torch.zeros(1, dtype=torch.int64, device=self.current_device), + persistent=False, + ) + self.register_buffer( + f"{prefix}_host", + torch.zeros(1, dtype=torch.int64, device=self.current_device), + persistent=False, + ) + self.register_buffer( + f"{prefix}_uvm", + torch.zeros(1, dtype=torch.int64, device=self.current_device), + persistent=False, + ) + self.register_buffer( + f"{prefix}_placements", + torch.zeros(1, dtype=torch.int64, device=self.current_device), + persistent=False, + ) + self.register_buffer( + f"{prefix}_offsets", + torch.zeros(1, dtype=torch.int64, device=self.current_device), + persistent=False, + ) + def get_states(self, prefix: str) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]: if not hasattr(self, f"{prefix}_physical_placements"): raise DoesNotHavePrefix() @@ -590,7 +689,7 @@ def get_states(self, prefix: str) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tenso def get_all_states(self) -> List[Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]]: all_states = [] - for prefix in ["weights", "momentum1", "momentum2"]: + for prefix in ["weights", "momentum1", "momentum2", "prev_iter", "row_counter"]: try: all_states.append(self.get_states(prefix)) except DoesNotHavePrefix: @@ -681,10 +780,20 @@ def forward( return invokers.lookup_approx_sgd.invoke(common_args, self.optimizer_args) momentum1 = invokers.lookup_args.Momentum( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got `Union[Tensor, + # nn.Module]`. dev=self.momentum1_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got `Union[Tensor, + # nn.Module]`. host=self.momentum1_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got `Union[Tensor, + # nn.Module]`. uvm=self.momentum1_uvm, + # pyre-fixme[6]: Expected `Tensor` for 4th param but got `Union[Tensor, + # nn.Module]`. offsets=self.momentum1_offsets, + # pyre-fixme[6]: Expected `Tensor` for 5th param but got `Union[Tensor, + # nn.Module]`. placements=self.momentum1_placements, ) @@ -696,21 +805,22 @@ def forward( return invokers.lookup_adagrad.invoke( common_args, self.optimizer_args, momentum1 ) - if self.optimizer == OptimType.EXACT_ROWWISE_ADAGRAD: - return invokers.lookup_rowwise_adagrad.invoke( - common_args, self.optimizer_args, momentum1 - ) - if self.optimizer == OptimType.ROWWISE_ADAGRAD: - assert self.use_cpu, "Approx rowwise AdaGrad is only supported in CPU mode" - return invokers.lookup_approx_rowwise_adagrad.invoke( - common_args, self.optimizer_args, momentum1 - ) momentum2 = invokers.lookup_args.Momentum( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got `Union[Tensor, + # nn.Module]`. dev=self.momentum2_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got `Union[Tensor, + # nn.Module]`. host=self.momentum2_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got `Union[Tensor, + # nn.Module]`. uvm=self.momentum2_uvm, + # pyre-fixme[6]: Expected `Tensor` for 4th param but got `Union[Tensor, + # nn.Module]`. offsets=self.momentum2_offsets, + # pyre-fixme[6]: Expected `Tensor` for 5th param but got `Union[Tensor, + # nn.Module]`. placements=self.momentum2_placements, ) # Ensure iter is always on CPU so the increment doesn't synchronize. @@ -768,6 +878,79 @@ def forward( self.iter.item(), ) + prev_iter = invokers.lookup_args.Momentum( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got `Union[Tensor, + # nn.Module]`. + dev=self.prev_iter_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got `Union[Tensor, + # nn.Module]`. 
+ host=self.prev_iter_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got `Union[Tensor, + # nn.Module]`. + uvm=self.prev_iter_uvm, + # pyre-fixme[6]: Expected `Tensor` for 4th param but got `Union[Tensor, + # nn.Module]`. + offsets=self.prev_iter_offsets, + # pyre-fixme[6]: Expected `Tensor` for 5th param but got `Union[Tensor, + # nn.Module]`. + placements=self.prev_iter_placements, + ) + row_counter = invokers.lookup_args.Momentum( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got `Union[Tensor, + # nn.Module]`. + dev=self.row_counter_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got `Union[Tensor, + # nn.Module]`. + host=self.row_counter_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got `Union[Tensor, + # nn.Module]`. + uvm=self.row_counter_uvm, + # pyre-fixme[6]: Expected `Tensor` for 4th param but got `Union[Tensor, + # nn.Module]`. + offsets=self.row_counter_offsets, + # pyre-fixme[6]: Expected `Tensor` for 5th param but got `Union[Tensor, + # nn.Module]`. + placements=self.row_counter_placements, + ) + if self._used_rowwise_adagrad_with_counter: + if self.iter.item() % self._max_counter_update_freq == 0: + max_counter = torch.max(self.row_counter_dev.detach()) + self.max_counter = max_counter.cpu() + 1 + + if self.optimizer == OptimType.EXACT_ROWWISE_ADAGRAD: + if self._used_rowwise_adagrad_with_counter: + return invokers.lookup_rowwise_adagrad_with_counter.invoke( + common_args, + self.optimizer_args, + momentum1, + prev_iter, + row_counter, + # pyre-fixme[6]: Expected `int` for 6th param but got `Union[float, int]`. + self.iter.item(), + self.max_counter.item(), + ) + else: + return invokers.lookup_rowwise_adagrad.invoke( + common_args, self.optimizer_args, momentum1 + ) + if self.optimizer == OptimType.ROWWISE_ADAGRAD: + assert self.use_cpu, "Approx rowwise AdaGrad is only supported in CPU mode" + if self._used_rowwise_adagrad_with_counter: + return invokers.lookup_approx_rowwise_adagrad_with_counter.invoke( + common_args, + self.optimizer_args, + momentum1, + prev_iter, + row_counter, + # pyre-fixme[6]: Expected `int` for 6th param but got `Union[float, int]`. + self.iter.item(), + self.max_counter.item(), + ) + else: + return invokers.lookup_approx_rowwise_adagrad.invoke( + common_args, self.optimizer_args, momentum1 + ) + raise ValueError(f"Invalid OptimType: {self.optimizer}") def reset_uvm_cache_stats(self) -> None: @@ -1013,8 +1196,12 @@ def get_optimizer_state(self) -> List[Dict[str, torch.Tensor]]: or self.optimizer == OptimType.ROWWISE_ADAGRAD or self.optimizer == OptimType.EXACT_ROWWISE_WEIGHTED_ADAGRAD ): + split_optimizer_states = self.split_optimizer_states() list_of_state_dict = [ - {"sum": _sum[0]} for _sum in self.split_optimizer_states() + {"sum": states[0], "prev_iter": states[1], "row_counter": states[2]} + if self._used_rowwise_adagrad_with_counter + else {"sum": states[0]} + for states in split_optimizer_states ] else: raise NotImplementedError( @@ -1024,7 +1211,9 @@ def get_optimizer_state(self) -> List[Dict[str, torch.Tensor]]: return list_of_state_dict @torch.jit.ignore - def split_optimizer_states(self) -> List[Tuple[torch.Tensor]]: + def split_optimizer_states( + self, + ) -> List[List[torch.Tensor]]: """ Returns a list of states, split by table """ @@ -1062,8 +1251,14 @@ def get_optimizer_states( ): states.append( get_optimizer_states( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got + # `Union[Tensor, nn.Module]`. 
self.momentum1_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got + # `Union[Tensor, nn.Module]`. self.momentum1_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got + # `Union[Tensor, nn.Module]`. self.momentum1_uvm, # pyre-fixme[6]: Expected `Tensor` for 4th param but got # `Union[Tensor, nn.Module]`. @@ -1087,8 +1282,14 @@ def get_optimizer_states( ): states.append( get_optimizer_states( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got + # `Union[Tensor, nn.Module]`. self.momentum2_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got + # `Union[Tensor, nn.Module]`. self.momentum2_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got + # `Union[Tensor, nn.Module]`. self.momentum2_uvm, # pyre-fixme[6]: Expected `Tensor` for 4th param but got # `Union[Tensor, nn.Module]`. @@ -1100,7 +1301,49 @@ def get_optimizer_states( in (OptimType.PARTIAL_ROWWISE_ADAM, OptimType.PARTIAL_ROWWISE_LAMB), ) ) - return list(zip(*states)) + if self._used_rowwise_adagrad_with_counter: + states.append( + get_optimizer_states( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got + # `Union[Tensor, nn.Module]`. + self.prev_iter_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got + # `Union[Tensor, nn.Module]`. + self.prev_iter_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got + # `Union[Tensor, nn.Module]`. + self.prev_iter_uvm, + # pyre-fixme[6]: Expected `Tensor` for 4th param but got + # `Union[Tensor, nn.Module]`. + self.prev_iter_physical_offsets, + # pyre-fixme[6]: Expected `Tensor` for 5th param but got + # `Union[Tensor, nn.Module]`. + self.prev_iter_physical_placements, + rowwise=True, + ) + ) + states.append( + get_optimizer_states( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got + # `Union[Tensor, nn.Module]`. + self.row_counter_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got + # `Union[Tensor, nn.Module]`. + self.row_counter_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got + # `Union[Tensor, nn.Module]`. + self.row_counter_uvm, + # pyre-fixme[6]: Expected `Tensor` for 4th param but got + # `Union[Tensor, nn.Module]`. + self.row_counter_physical_offsets, + # pyre-fixme[6]: Expected `Tensor` for 5th param but got + # `Union[Tensor, nn.Module]`. 
+ self.row_counter_physical_placements, + rowwise=True, + ) + ) + return_states = [list(s) for s in zip(*states)] + return return_states @torch.jit.export def set_learning_rate(self, lr: float) -> None: diff --git a/fbgemm_gpu/fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py b/fbgemm_gpu/fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py index 1eec03fdd9..250f84abb6 100644 --- a/fbgemm_gpu/fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py @@ -18,6 +18,7 @@ from fbgemm_gpu.split_table_batched_embeddings_ops import ( align_to_cacheline, CacheAlgorithm, + CounterBasedRegularizationDefinition, DEFAULT_SCALE_BIAS_SIZE_IN_BYTES, EmbeddingLocation, PoolingMode, @@ -88,6 +89,9 @@ def __init__( eta: float = 0.001, # used by LARS-SGD, beta1: float = 0.9, # used by LAMB and ADAM beta2: float = 0.999, # used by LAMB and ADAM + counter_based_regularization: Optional[ + CounterBasedRegularizationDefinition + ] = None, # used by Rowwise Adagrad pooling_mode: PoolingMode = PoolingMode.SUM, ) -> None: super(SSDTableBatchedEmbeddingBags, self).__init__() @@ -217,6 +221,12 @@ def __init__( self.ssd_set_end = torch.cuda.Event() self.timesteps_prefetched: List[int] = [] + if weight_decay_mode == WeightDecayMode.COUNTER or counter_based_regularization: + raise AssertionError( + "weight_decay_mode = WeightDecayMode.COUNTER is not supported for SSD TBE." + ) + counter_based_regularization = CounterBasedRegularizationDefinition() + self.optimizer_args = invokers.lookup_args.OptimizerArgs( stochastic_rounding=stochastic_rounding, gradient_clipping=gradient_clipping, @@ -229,6 +239,15 @@ def __init__( weight_decay_mode=weight_decay_mode.value, eta=eta, momentum=momentum, + counter_halflife=counter_based_regularization.counter_halflife, + adjustment_iter=counter_based_regularization.adjustment_iter, + adjustment_ub=counter_based_regularization.adjustment_ub, + learning_rate_mode=counter_based_regularization.learning_rate_mode.value, + grad_sum_decay=counter_based_regularization.grad_sum_decay.value, + tail_id_threshold=counter_based_regularization.tail_id_threshold.val, + is_tail_id_thresh_ratio=int( + counter_based_regularization.tail_id_threshold.is_ratio + ), ) self.weights_dev = nn.Parameter( torch.empty((0,), device=self.current_device, dtype=torch.float32) diff --git a/fbgemm_gpu/test/split_table_batched_embeddings_test.py b/fbgemm_gpu/test/split_table_batched_embeddings_test.py index c0ba5f6f64..6a4d299b80 100644 --- a/fbgemm_gpu/test/split_table_batched_embeddings_test.py +++ b/fbgemm_gpu/test/split_table_batched_embeddings_test.py @@ -9,10 +9,11 @@ import copy +import math import pickle import random import unittest -from typing import List, Optional +from typing import List, Optional, Tuple import fbgemm_gpu import fbgemm_gpu.split_table_batched_embeddings_ops as split_table_batched_embeddings_ops @@ -31,11 +32,16 @@ ) from fbgemm_gpu.split_table_batched_embeddings_ops import ( BoundsCheckMode, + CounterBasedRegularizationDefinition, + CounterWeightDecayMode, + GradSumDecay, INT8_EMB_ROW_DIM_OFFSET, + LearningRateMode, OptimType, RecordCacheMetrics, rounded_row_size_in_bytes, SparseType, + TailIdThreshold, WeightDecayMode, ) from hypothesis import assume, given, HealthCheck, settings, Verbosity @@ -1627,6 +1633,7 @@ def execute_backward_adagrad_( # noqa C901 use_cpu: bool, exact: bool, output_dtype: SparseType, + weight_decay_mode: WeightDecayMode = WeightDecayMode.NONE, ) -> None: # NOTE: cache is not applicable to CPU version. 
assume(not use_cpu or not use_cache) @@ -1826,31 +1833,39 @@ def execute_backward_adagrad_( # noqa C901 goc = torch.cat(gos, dim=0) fc2.backward(goc) cc.flush() - split_optimizer_states = [s for (s,) in cc.split_optimizer_states()] + split_optimizer_states = cc.split_optimizer_states() + assert len(split_optimizer_states) == T tolerance = ( 1.0e-4 if weights_precision == SparseType.FP32 and output_dtype == SparseType.FP32 else 1.0e-2 ) for t in range(T): + if row_wise and weight_decay_mode == WeightDecayMode.COUNTER: + (m1, c1, c2) = split_optimizer_states[t] + else: + (m1,) = split_optimizer_states[t] # pyre-fixme[16]: `Optional` has no attribute `float`. ref_optimizer_state = bs[t].weight.grad.float().cpu().to_dense().pow(2) torch.testing.assert_close( - split_optimizer_states[t].float().cpu(), + m1.float().cpu(), ref_optimizer_state.mean(dim=1) if row_wise else ref_optimizer_state, atol=tolerance, rtol=tolerance, ) for t in range(T): # optimizer_state = squares (no row-wise) or sum squares (row-wise) + if row_wise and weight_decay_mode == WeightDecayMode.COUNTER: + (m1, c1, c2) = split_optimizer_states[t] + else: + (m1,) = split_optimizer_states[t] torch.testing.assert_close( cc.split_embedding_weights()[t].float().cpu(), torch.addcdiv( bs[t].weight.float().cpu(), value=-lr, tensor1=bs[t].weight.grad.float().cpu().to_dense(), - tensor2=split_optimizer_states[t] - .float() + tensor2=m1.float() .sqrt_() .add_(eps) .view(Es[t], 1 if row_wise else Ds[t]) @@ -2589,6 +2604,8 @@ def execute_backward_optimizers_( # noqa C901 0.9, 0.01, ) + counter_based_regularization: CounterBasedRegularizationDefinition + if optimizer == OptimType.EXACT_ADAGRAD: optimizer_kwargs["eps"] = eps @@ -2596,6 +2613,21 @@ def execute_backward_optimizers_( # noqa C901 optimizer_kwargs["eps"] = eps optimizer_kwargs["weight_decay"] = weight_decay optimizer_kwargs["weight_decay_mode"] = weight_decay_mode + if weight_decay_mode == WeightDecayMode.COUNTER: + counter_based_regularization = CounterBasedRegularizationDefinition( + counter_weight_decay_mode=CounterWeightDecayMode.DECOUPLE, + counter_halflife=20000, + adjustment_iter=24000, + adjustment_ub=0.1, + learning_rate_mode=LearningRateMode.TAIL_ID_LR_DECREASE, + grad_sum_decay=GradSumDecay.NO_DECAY, + tail_id_threshold=TailIdThreshold(val=1000, is_ratio=False), + ) + + optimizer_kwargs[ + "counter_based_regularization" + # pyre-fixme[6]: Expected `float` for 2nd param but got `CounterBasedRegularizationDefinition`. + ] = counter_based_regularization if optimizer == OptimType.EXACT_ROWWISE_WEIGHTED_ADAGRAD: optimizer_kwargs["eps"] = eps @@ -2654,15 +2686,39 @@ def execute_backward_optimizers_( # noqa C901 if optimizer in (OptimType.EXACT_ROWWISE_ADAGRAD, OptimType.EXACT_ADAGRAD): rowwise = optimizer == OptimType.EXACT_ROWWISE_ADAGRAD for t in range(T): - (m1,) = split_optimizer_states[t] + row_counter: Optional[torch.Tensor] = None + freq: Optional[torch.Tensor] = None + iter_: int = -1 + + if rowwise and weight_decay_mode == WeightDecayMode.COUNTER: + (m1, prev_iter, row_counter) = split_optimizer_states[t] + else: + (m1,) = split_optimizer_states[t] # to_dense in GPU is non-deterministic due to atmomics used in # coalescing and floating point non-associativity. # pyre-fixme[16]: `Optional` has no attribute `cpu`. 
dense_cpu_grad = bs[t].weight.grad.cpu().to_dense() - if rowwise and not use_cpu and weight_decay_mode == WeightDecayMode.L2: + if rowwise and not use_cpu: # We need to skip when using cpu because use_fbgemm (https://fburl.com/code/12131iub) # is true and the template code (https://fburl.com/code/1kctlup3) is not executed. - dense_cpu_grad += weight_decay * bs[t].weight.cpu() + if weight_decay_mode == WeightDecayMode.L2: + dense_cpu_grad += weight_decay * bs[t].weight.cpu() + elif weight_decay_mode == WeightDecayMode.COUNTER: + iter_ = int(cc.iter.item()) + ( + dense_cpu_grad, + row_counter, + freq, + ) = self.get_grad_from_counter_adagrad( + dense_cpu_grad, + bs[t].weight.cpu(), + counter_based_regularization, + row_counter.cpu(), + prev_iter.cpu(), + iter_, + weight_decay, + ) + m1_ref = ( dense_cpu_grad.pow(2) if not rowwise @@ -2681,14 +2737,31 @@ def execute_backward_optimizers_( # noqa C901 ) + eps ) - if ( - rowwise - and not use_cpu - and weight_decay_mode == WeightDecayMode.DECOUPLE - ): - weights_ref = bs[t].weight.cpu() - lr * ( - dense_cpu_grad / denom + weight_decay * bs[t].weight.cpu() - ) + if rowwise and not use_cpu: + if weight_decay_mode == WeightDecayMode.DECOUPLE: + weights_ref = bs[t].weight.cpu() - lr * ( + dense_cpu_grad / denom + weight_decay * bs[t].weight.cpu() + ) + elif weight_decay_mode == WeightDecayMode.L2: + # pyre-fixme[58]: `/` is not supported for operand types `float` + # and `Tensor`. + weights_ref = bs[t].weight.cpu() - lr * dense_cpu_grad / denom + elif weight_decay_mode == WeightDecayMode.COUNTER: + max_counter = cc.max_counter.item() + weights_ref = self.get_wts_from_counter_adagrad( + dense_cpu_grad, + bs[t].weight.cpu(), + denom, + counter_based_regularization, + row_counter, + # pyre-fixme[6]: Expected `Tensor` for 6th param but got `Optional[Tensor]` + freq, + max_counter, + iter_, + eps, + lr, + weight_decay, + ) else: # pyre-fixme[58]: `/` is not supported for operand types `float` # and `Tensor`. 
@@ -2833,6 +2906,117 @@ def execute_backward_optimizers_( # noqa C901 rtol=1.0e-4, ) + def get_grad_from_counter_adagrad( + self, + dense_cpu_grad: torch.Tensor, + weights: torch.Tensor, + counter_based_regularization: CounterBasedRegularizationDefinition, + row_counter: torch.Tensor, + prev_iter: torch.Tensor, + iter_: int, + weight_decay: float, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + row_counter = row_counter.view(row_counter.numel(), 1) + prev_iter = prev_iter.view(prev_iter.numel(), 1) + freq = torch.ones_like(row_counter) + counter_weight_decay_mode = ( + counter_based_regularization.counter_weight_decay_mode + ) + counter_halflife = counter_based_regularization.counter_halflife + l2_wd = 1.0 if counter_weight_decay_mode == CounterWeightDecayMode.L2 else 0.0 + + if counter_halflife > 0: + counter_log_rho = math.log(2.0) / counter_halflife + # if id occurs multiple times in a batch, iter_delta=1 + iter_delta = torch.where(prev_iter == 0.0, 1.0, iter_ * 1.0 - prev_iter) + prev_iter = iter_ * torch.ones_like(prev_iter) + row_counter = 1.0 + torch.exp(-iter_delta * counter_log_rho) * row_counter + freq = torch.tensor([counter_halflife]) / row_counter + + dense_cpu_grad += l2_wd * freq * weight_decay * weights + return dense_cpu_grad, row_counter, freq + + def get_wts_from_counter_adagrad( + self, + dense_cpu_grad: torch.Tensor, + weights: torch.Tensor, + denom: torch.Tensor, + counter_based_regularization: CounterBasedRegularizationDefinition, + row_counter: torch.Tensor, + freq: torch.Tensor, + max_counter: float, + iter_: int, + eps: float, + learning_rate: float, + weight_decay: float, + ) -> torch.Tensor: + counter_weight_decay_mode = ( + counter_based_regularization.counter_weight_decay_mode + ) + counter_halflife = counter_based_regularization.counter_halflife + tail_id_threshold_val = counter_based_regularization.tail_id_threshold.val + if counter_based_regularization.tail_id_threshold.is_ratio: + tail_id_threshold_val = math.floor(tail_id_threshold_val * max_counter) + learning_rate_mode = counter_based_regularization.learning_rate_mode + adjustment_iter = counter_based_regularization.adjustment_iter + adjustment_ub = counter_based_regularization.adjustment_ub + + multiplier = torch.tensor([learning_rate]) / denom + adjusted_multiplier = multiplier + exp_reg_correction = torch.ones_like(row_counter) + + if counter_halflife > 0: + if adjustment_iter <= 0 or ( + adjustment_iter > 0 and iter_ > adjustment_iter + ): + if learning_rate_mode == LearningRateMode.TAIL_ID_LR_INCREASE: + adjusted_multiplier = torch.where( + row_counter > tail_id_threshold_val, + multiplier + * torch.maximum( + torch.minimum( + torch.pow( + torch.tensor([max_counter]) / (row_counter + 1.0), + adjustment_ub, + ), + torch.Tensor([10.0]), + ), + torch.Tensor([1.0]), + ), + multiplier, + ) + elif learning_rate_mode == LearningRateMode.TAIL_ID_LR_DECREASE: + adjusted_multiplier = torch.where( + row_counter > tail_id_threshold_val, + multiplier + * torch.minimum( + torch.maximum( + torch.pow( + (row_counter + 1.0) / max_counter, + adjustment_ub, + ), + torch.Tensor([0.1]), + ), + torch.Tensor([1.0]), + ), + multiplier, + ) + elif learning_rate_mode == LearningRateMode.COUNTER_SGD: + adjusted_multiplier = torch.where( + row_counter > tail_id_threshold_val, + torch.Tensor([learning_rate]) + / (torch.sqrt(adjustment_ub * row_counter) + eps), + multiplier, + ) + + if counter_weight_decay_mode == CounterWeightDecayMode.DECOUPLE: + exp_reg_correction = 1.0 - freq * weight_decay * learning_rate + elif 
counter_weight_decay_mode == CounterWeightDecayMode.L2: + exp_reg_correction = 1.0 - freq * weight_decay * multiplier + + weights = exp_reg_correction * weights - adjusted_multiplier * dense_cpu_grad + return weights + @given( T=st.integers(min_value=1, max_value=5), D=st.integers(min_value=2, max_value=256), @@ -2901,7 +3085,7 @@ def test_backward_optimizers_adam( # noqa C901 D=st.integers(min_value=2, max_value=256), B=st.integers(min_value=1, max_value=128), log_E=st.integers(min_value=3, max_value=5), - L=st.integers(min_value=0, max_value=20), + L=st.integers(min_value=2, max_value=20), weighted=st.booleans(), mixed=st.booleans(), optimizer=st.sampled_from( @@ -2928,6 +3112,7 @@ def test_backward_optimizers_adam( # noqa C901 [ WeightDecayMode.L2, WeightDecayMode.DECOUPLE, + WeightDecayMode.COUNTER, ] ), ) From f388b955cf5d3f080e750fb3fd445cf69d523f65 Mon Sep 17 00:00:00 2001 From: Junjie Yang Date: Tue, 14 Mar 2023 00:08:01 -0700 Subject: [PATCH 04/34] Remove sync point in jagged_dense_elementwise_add_jagged_output backward (#1642) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1642 Remove sync point in jagged_dense_elementwise_add_jagged_output backward Reviewed By: brad-mengchi Differential Revision: D44039901 fbshipit-source-id: 8e7e23e4d9e01359e67e5b166adc57f894a1224d --- fbgemm_gpu/src/jagged_tensor_ops_autograd.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fbgemm_gpu/src/jagged_tensor_ops_autograd.cpp b/fbgemm_gpu/src/jagged_tensor_ops_autograd.cpp index 283422b7ae..347ec089e0 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops_autograd.cpp +++ b/fbgemm_gpu/src/jagged_tensor_ops_autograd.cpp @@ -644,7 +644,7 @@ jagged_dense_elementwise_add_jagged_output( const Tensor& y) { // Convert to jagged auto jagged_values = - DenseToJaggedOp::apply(y, x_offsets, c10::optional())[0]; + DenseToJaggedOp::apply(y, x_offsets, x_values.size(0))[0]; // Add jagged_values + x_values -> sum_values auto sum_values = x_values + jagged_values; From f158490f6fe6e1ce3646a2f1dc80c1da86d1ad53 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Tue, 14 Mar 2023 19:28:13 -0700 Subject: [PATCH 05/34] Add Comprehensive Build Instructions and Isolate CPU and ROCm Builds (#1639) Summary: - Remove `.post0` suffix from the autogenerated package version - Document the full FBGEMM_GPU OSS build process in a separate Markdown file - Remove installation of packages not needed for ROCm builds - Migrate CPU and ROCm jobs to run on top of Docker containers instead of bare metal instances - Update GitHub workflow configuration to cancel previous jobs for a PR if a new commit is pushed to the PR Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1639 Reviewed By: shintaro-iwasaki Differential Revision: D44076312 Pulled By: q10 fbshipit-source-id: 6b2d083022feb7421b26da2d998678e00c11f283 --- .github/scripts/setup_env.bash | 213 +++++---- .github/workflows/fbgemm_ci.yml | 5 + .github/workflows/fbgemm_gpu_ci.yml | 19 +- .github/workflows/fbgemm_gpu_lint.yml | 5 + .github/workflows/fbgemm_nightly_build.yml | 5 + .../workflows/fbgemm_nightly_build_cpu.yml | 18 + .github/workflows/fbgemm_release_build.yml | 5 + .../workflows/fbgemm_release_build_cpu.yml | 17 + fbgemm_gpu/docs/BuildInstructions.md | 430 ++++++++++++++++++ fbgemm_gpu/docs/README.md | 2 +- fbgemm_gpu/setup.py | 6 +- 11 files changed, 639 insertions(+), 86 deletions(-) create mode 100644 fbgemm_gpu/docs/BuildInstructions.md diff --git a/.github/scripts/setup_env.bash b/.github/scripts/setup_env.bash index 
4f1c808598..ccdac79097 100755 --- a/.github/scripts/setup_env.bash +++ b/.github/scripts/setup_env.bash @@ -13,8 +13,13 @@ print_exec () { echo "+ $*" echo "" - "$@" + if "$@"; then + local retcode=0 + else + local retcode=$? + fi echo "" + return $retcode } exec_with_retries () { @@ -205,7 +210,7 @@ run_python_test () { echo "################################################################################" fi - if conda run -n "${env_name}" python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then + if print_exec conda run -n "${env_name}" python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then echo "[TEST] Python test suite PASSED: ${python_test_file}" else echo "[TEST] Python test suite FAILED: ${python_test_file}" @@ -313,7 +318,7 @@ print_ec2_info () { ################################################################################ -# Environment Setup and Install Functions +# Miniconda Setup Functions ################################################################################ setup_miniconda () { @@ -398,6 +403,11 @@ create_conda_environment () { echo "[SETUP] Successfully created Conda environment: ${env_name}" } + +################################################################################ +# PyTorch Setup Functions +################################################################################ + install_pytorch_conda () { local env_name="$1" local pytorch_version="$2" @@ -553,6 +563,28 @@ install_pytorch_pip () { echo "[INSTALL] NOTE: The installed version is: ${installed_pytorch_version}" } + +################################################################################ +# CUDA Setup Functions +################################################################################ + +install_nvidia_drivers_centos () { + echo "################################################################################" + echo "# Install NVIDIA Drivers" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + + echo "[SETUP] Adding NVIDIA repos to yum ..." + print_exec sudo yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm + print_exec sudo yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo + print_exec sudo yum clean expire-cache + + echo "[SETUP] Installing NVIDIA drivers ..." 
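+  # The -dkms driver variant rebuilds the NVIDIA kernel module against the
+  # running kernel via DKMS, so the driver survives kernel upgrades.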
+ install_system_packages nvidia-driver-latest-dkms +} + install_cuda () { local env_name="$1" local cuda_version="$2" @@ -604,6 +636,86 @@ install_cuda () { echo "[INSTALL] Successfully installed CUDA ${cuda_version}" } +install_cudnn () { + local env_name="$1" + local install_path="$2" + local cuda_version="$3" + if [ "$cuda_version" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME INSTALL_PATH CUDA_VERSION" + echo "Example:" + echo " ${FUNCNAME[0]} build_env \$(pwd)/cudnn_install 11.7" + return 1 + else + echo "################################################################################" + echo "# Install cuDNN" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + fi + + # Install cuDNN manually + # Based on install script in https://github.com/pytorch/builder/blob/main/common/install_cuda.sh + local cudnn_packages=( + ["115"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz" + ["116"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz" + ["117"]="https://ossci-linux.s3.amazonaws.com/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz" + ["118"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz" + ) + + # Split version string by dot into array, i.e. 11.7.1 => [11, 7, 1] + # shellcheck disable=SC2206 + local cuda_version_arr=(${cuda_version//./ }) + # Fetch the major and minor version to concat + local cuda_concat_version="${cuda_version_arr[0]}${cuda_version_arr[1]}" + + # Get the URL + local cudnn_url="${cudnn_packages[cuda_concat_version]}" + if [ "$cudnn_url" == "" ]; then + # Default to cuDNN for 11.7 if no CUDA version fits + echo "[INSTALL] Defaulting to cuDNN for CUDA 11.7" + cudnn_url="${cudnn_packages[117]}" + fi + + # Clear the install path + rm -rf "$install_path" + mkdir -p "$install_path" + + # Create temporary directory + # shellcheck disable=SC2155 + local tmp_dir=$(mktemp -d) + cd "$tmp_dir" || return 1 + + # Download cuDNN + echo "[INSTALL] Downloading cuDNN to ${tmp_dir} ..." + (exec_with_retries wget -q "$cudnn_url" -O cudnn.tar.xz) || return 1 + + # Unpack the tarball + echo "[INSTALL] Unpacking cuDNN ..." + tar -xvf cudnn.tar.xz + + # Copy the includes and libs over to the install path + echo "[INSTALL] Moving cuDNN files to ${install_path} ..." + rm -rf "${install_path:?}/include" + rm -rf "${install_path:?}/lib" + mv cudnn-linux-*/include "$install_path" + mv cudnn-linux-*/lib "$install_path" + + # Delete the temporary directory + cd - || return 1 + rm -rf "$tmp_dir" + + # Export the environment variables to the Conda environment + echo "[INSTALL] Set environment variables CUDNN_INCLUDE_DIR and CUDNN_LIBRARY ..." 
+ print_exec conda env config vars set -n "${env_name}" CUDNN_INCLUDE_DIR="${install_path}/include" CUDNN_LIBRARY="${install_path}/lib" + + echo "[INSTALL] Successfully installed cuDNN (for CUDA ${cuda_version})" +} + +################################################################################ +# ROCm Setup Functions +################################################################################ + install_rocm_ubuntu () { local env_name="$1" local rocm_version="$2" @@ -652,15 +764,25 @@ install_rocm_ubuntu () { (exec_with_retries amdgpu-install -y --usecase=hiplibsdk,rocm --no-dkms) || return 1 echo "[INSTALL] Installing HIP-relevant packages ..." - install_system_packages mesa-common-dev clang comgr libopenblas-dev jp intel-mkl-full locales libnuma-dev install_system_packages hipify-clang miopen-hip miopen-hip-dev + # There is no need to install these packages for ROCm + # install_system_packages mesa-common-dev clang comgr libopenblas-dev jp intel-mkl-full locales libnuma-dev + echo "[INSTALL] Cleaning up ..." print_exec rm -f "${package_name}" + echo "[INFO] Check ROCM GPU info ..." + print_exec rocm-smi + echo "[INSTALL] Successfully installed ROCm ${rocm_version}" } + +################################################################################ +# Build Tools Setup Functions +################################################################################ + install_cxx_compiler () { local env_name="$1" local use_system_package_manager="$2" @@ -759,82 +881,6 @@ install_build_tools () { echo "[INSTALL] Successfully installed all the build tools" } -install_cudnn () { - local env_name="$1" - local install_path="$2" - local cuda_version="$3" - if [ "$cuda_version" == "" ]; then - echo "Usage: ${FUNCNAME[0]} ENV_NAME INSTALL_PATH CUDA_VERSION" - echo "Example:" - echo " ${FUNCNAME[0]} build_env \$(pwd)/cudnn_install 11.7" - return 1 - else - echo "################################################################################" - echo "# Install cuDNN" - echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" - echo "################################################################################" - echo "" - fi - - # Install cuDNN manually - # Based on install script in https://github.com/pytorch/builder/blob/main/common/install_cuda.sh - local cudnn_packages=( - ["115"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz" - ["116"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz" - ["117"]="https://ossci-linux.s3.amazonaws.com/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz" - ["118"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz" - ) - - # Split version string by dot into array, i.e. 
11.7.1 => [11, 7, 1] - # shellcheck disable=SC2206 - local cuda_version_arr=(${cuda_version//./ }) - # Fetch the major and minor version to concat - local cuda_concat_version="${cuda_version_arr[0]}${cuda_version_arr[1]}" - - # Get the URL - local cudnn_url="${cudnn_packages[cuda_concat_version]}" - if [ "$cudnn_url" == "" ]; then - # Default to cuDNN for 11.7 if no CUDA version fits - echo "[INSTALL] Defaulting to cuDNN for CUDA 11.7" - cudnn_url="${cudnn_packages[117]}" - fi - - # Clear the install path - rm -rf "$install_path" - mkdir -p "$install_path" - - # Create temporary directory - # shellcheck disable=SC2155 - local tmp_dir=$(mktemp -d) - cd "$tmp_dir" || return 1 - - # Download cuDNN - echo "[INSTALL] Downloading cuDNN to ${tmp_dir} ..." - (exec_with_retries wget -q "$cudnn_url" -O cudnn.tar.xz) || return 1 - - # Unpack the tarball - echo "[INSTALL] Unpacking cuDNN ..." - tar -xvf cudnn.tar.xz - - # Copy the includes and libs over to the install path - echo "[INSTALL] Moving cuDNN files to ${install_path} ..." - rm -rf "${install_path:?}/include" - rm -rf "${install_path:?}/lib" - mv cudnn-linux-*/include "$install_path" - mv cudnn-linux-*/lib "$install_path" - - # Delete the temporary directory - cd - || return 1 - rm -rf "$tmp_dir" - - # Export the environment variables to the Conda environment - echo "[INSTALL] Set environment variables CUDNN_INCLUDE_DIR and CUDNN_LIBRARY ..." - print_exec conda env config vars set -n "${env_name}" CUDNN_INCLUDE_DIR="${install_path}/include" CUDNN_LIBRARY="${install_path}/lib" - - echo "[INSTALL] Successfully installed cuDNN (for CUDA ${cuda_version})" -} - ################################################################################ # Combination Functions @@ -876,7 +922,7 @@ create_conda_pytorch_environment () { ################################################################################ -# Build Functions +# FBGEMM_GPU Build Functions ################################################################################ prepare_fbgemm_gpu_build () { @@ -895,6 +941,11 @@ prepare_fbgemm_gpu_build () { echo "" fi + if [[ "${GITHUB_WORKSPACE}" ]]; then + # https://github.com/actions/checkout/issues/841 + git config --global --add safe.directory "${GITHUB_WORKSPACE}" + fi + echo "[BUILD] Running git submodules update ..." 
git submodule sync git submodule update --init --recursive diff --git a/.github/workflows/fbgemm_ci.yml b/.github/workflows/fbgemm_ci.yml index f6bae56123..977b443a2b 100644 --- a/.github/workflows/fbgemm_ci.yml +++ b/.github/workflows/fbgemm_ci.yml @@ -13,6 +13,11 @@ on: branches: - main +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: build-posix: runs-on: ${{ matrix.os }} diff --git a/.github/workflows/fbgemm_gpu_ci.yml b/.github/workflows/fbgemm_gpu_ci.yml index 8e021c4451..bd62f23761 100644 --- a/.github/workflows/fbgemm_gpu_ci.yml +++ b/.github/workflows/fbgemm_gpu_ci.yml @@ -13,9 +13,17 @@ on: branches: - main +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: build_and_test_amd: runs-on: ${{ matrix.os }} + container: + image: ${{ matrix.container-image }} + options: --user root defaults: run: shell: bash @@ -25,11 +33,18 @@ jobs: strategy: fail-fast: false matrix: - os: [ ubuntu-20.04 ] + os: [ linux.12xlarge ] + container-image: [ "ubuntu:20.04" ] python-version: [ "3.10" ] rocm-version: [ "5.3" ] steps: + - name: Setup Build Container + run: | + apt update -y + apt install -y binutils git sudo wget + git config --global --add safe.directory '*' + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -74,7 +89,7 @@ jobs: print_exec conda env config vars set -n $BUILD_ENV PYTORCH_ROCM_ARCH=gfx90a print_exec conda run -n $BUILD_ENV python setup.py build develop - - name: Test FBGEMM_GPU-ROCM Nightly installation + - name: Test FBGEMM_GPU-ROCM Nightly Installation timeout-minutes: 10 run: . 
$PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm diff --git a/.github/workflows/fbgemm_gpu_lint.yml b/.github/workflows/fbgemm_gpu_lint.yml index dc2b6344ce..1ff7203108 100644 --- a/.github/workflows/fbgemm_gpu_lint.yml +++ b/.github/workflows/fbgemm_gpu_lint.yml @@ -14,6 +14,11 @@ on: branches: - main +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: run_pylint: runs-on: ubuntu-latest diff --git a/.github/workflows/fbgemm_nightly_build.yml b/.github/workflows/fbgemm_nightly_build.yml index 4cdb10aaa8..bc699ef62b 100644 --- a/.github/workflows/fbgemm_nightly_build.yml +++ b/.github/workflows/fbgemm_nightly_build.yml @@ -30,6 +30,11 @@ on: # workflow_dispatch: +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: # Build on CPU hosts and upload to GHA build_artifact: diff --git a/.github/workflows/fbgemm_nightly_build_cpu.yml b/.github/workflows/fbgemm_nightly_build_cpu.yml index 72a0af01e7..1125b17a0d 100644 --- a/.github/workflows/fbgemm_nightly_build_cpu.yml +++ b/.github/workflows/fbgemm_nightly_build_cpu.yml @@ -30,10 +30,19 @@ on: # workflow_dispatch: +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + # https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: # Build on CPU hosts, run tests, and upload to GHA build_artifact: runs-on: ${{ matrix.os }} + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash @@ -48,6 +57,9 @@ jobs: python-version: [ "3.8", "3.9", "3.10" ] steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -93,6 +105,9 @@ jobs: # Download the built artifact from GHA, test on GPU, and push to PyPI test_and_publish_artifact: runs-on: ${{ matrix.os }} + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash @@ -107,6 +122,9 @@ jobs: needs: build_artifact steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: diff --git a/.github/workflows/fbgemm_release_build.yml b/.github/workflows/fbgemm_release_build.yml index 5e3d369fe4..def6002a76 100644 --- a/.github/workflows/fbgemm_release_build.yml +++ b/.github/workflows/fbgemm_release_build.yml @@ -22,6 +22,11 @@ on: # workflow_dispatch: +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: # Build on CPU hosts and upload to GHA build_artifact: diff --git a/.github/workflows/fbgemm_release_build_cpu.yml b/.github/workflows/fbgemm_release_build_cpu.yml index a652c89854..c7fb53cabd 100644 --- a/.github/workflows/fbgemm_release_build_cpu.yml +++ b/.github/workflows/fbgemm_release_build_cpu.yml @@ -22,10 +22,18 @@ on: # workflow_dispatch: +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ 
github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:
   # Build on CPU hosts, run tests, and upload to GHA
   build_artifact:
     runs-on: ${{ matrix.os }}
+    container:
+      image: amazonlinux:2023
+      options: --user root
     defaults:
       run:
         shell: bash
@@ -40,6 +48,9 @@ jobs:
         python-version: [ "3.8", "3.9", "3.10" ]

     steps:
+    - name: Setup Build Container
+      run: yum update -y; yum install -y binutils findutils git sudo wget which
+
     - name: Checkout the Repository
       uses: actions/checkout@v3
       with:
@@ -85,6 +96,9 @@ jobs:
   # Download the built artifact from GHA, test on GPU, and push to PyPI
   test_and_publish_artifact:
     runs-on: ${{ matrix.os }}
+    container:
+      image: amazonlinux:2023
+      options: --user root
     defaults:
       run:
         shell: bash
@@ -99,6 +113,9 @@ jobs:
     needs: build_artifact

     steps:
+    - name: Setup Build Container
+      run: yum update -y; yum install -y binutils findutils git sudo wget which
+
     - name: Checkout the Repository
       uses: actions/checkout@v3
       with:
diff --git a/fbgemm_gpu/docs/BuildInstructions.md b/fbgemm_gpu/docs/BuildInstructions.md
new file mode 100644
index 0000000000..a90a059b40
--- /dev/null
+++ b/fbgemm_gpu/docs/BuildInstructions.md
@@ -0,0 +1,430 @@
+# FBGEMM_GPU Build Instructions
+
+The most up-to-date instructions are embedded in
+[`setup_env.bash`](../../.github/scripts/setup_env.bash). The general steps for
+building FBGEMM_GPU are as follows:
+
+1. Set up an isolated environment for building (Miniconda)
+1. Install the relevant build tools (C/C++ compiler)
+1. Set up for either CUDA, ROCm, or CPU build
+1. Install PyTorch
+1. Run the build
+
+
+## Set Up an Isolated Build Environment
+
+### Install Miniconda
+
+Setting up a [Miniconda](https://docs.conda.io/en/latest/miniconda.html)
+environment is recommended for reproducible builds:
+
+```sh
+# Set the Miniconda prefix directory
+miniconda_prefix=$HOME/miniconda
+
+# Download the Miniconda installer
+wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
+
+# Run the installer
+bash miniconda.sh -b -p "$miniconda_prefix" -u
+
+# Load the shortcuts
+. ~/.bashrc
+
+# Run updates
+conda update -n base -c defaults -y conda
+```
+
+From here on out, all installation commands will be run against or inside a
+Conda environment.
+
+
+### Set Up the Conda Environment
+
+Create a Conda environment with the specified Python version:
+
+```sh
+env_name=
+python_version=3.10
+
+# Create the environment
+conda create -y --name "${env_name}" python="${python_version}"
+
+# Upgrade PIP and the pyOpenSSL package
+conda run -n "${env_name}" pip install --upgrade pip
+conda run -n "${env_name}" python -m pip install "pyOpenSSL>22.1.0"
+```
+
+## Install the Build Tools
+
+### C/C++ Compiler
+
+Install the GCC toolchain. Note that GCC (as opposed to LLVM, for example) is
+required for GPU (CUDA) builds because NVIDIA's `nvcc` relies on `gcc` and `g++`
+being in the path.
+
+```sh
+conda install -n "${env_name}" -y gxx_linux-64=9.3.0
+```
+
+Note that while newer versions of GCC can be used, binaries compiled under newer
+versions of GCC will not be compatible with older systems such as Ubuntu 20.04
+or CentOS Stream 8, because the compiled library will reference symbols from
+versions of `GLIBCXX` that the system's `libstdc++.so.6` will not support.
+To see which versions of GLIBCXX the available `libstdc++.so.6` supports:
+
+```sh
+libcxx_path=/path/to/libstdc++.so.6
+objdump -TC "${libcxx_path}" | grep GLIBCXX | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat
+```
+
+### Other Build Tools
+
+Install the other necessary build tools such as `ninja`, `cmake`, etc:
+
+```sh
+conda install -n "${env_name}" -y \
+    click \
+    cmake \
+    hypothesis \
+    jinja2 \
+    ninja \
+    numpy \
+    scikit-build \
+    wheel
+```
+
+
+## Set Up for CUDA Build
+
+The CUDA build of FBGEMM_GPU requires a version of `nvcc` that supports compute
+capability 3.5+. Setting the machine up for CUDA builds of FBGEMM_GPU can be
+done either through pre-built Docker images or through Conda installation on
+bare metal. Note that neither a GPU nor the NVIDIA drivers need to be present
+for builds, since they are only used at runtime.
+
+### Docker Image
+
+For setups through Docker, simply pull the pre-installed
+[Docker image for CUDA](https://hub.docker.com/r/nvidia/cuda) for the desired
+Linux distribution and CUDA version.
+
+```sh
+# Run for Ubuntu 22.04, CUDA 11.8
+docker run -it --entrypoint "/bin/bash" nvidia/cuda:11.8.0-devel-ubuntu22.04
+```
+
+From there, the rest of the build environment may be constructed through Conda.
+
+### Install CUDA
+
+Install the full CUDA package through Conda, which includes
+[NVML](https://developer.nvidia.com/nvidia-management-library-nvml):
+
+```sh
+cuda_version=11.7.1
+
+# Install the full CUDA package
+conda install -n "${env_name}" -y cuda -c "nvidia/label/cuda-${cuda_version}"
+```
+
+Ensure that, at the minimum, **`cuda_runtime.h`** and **`libnvidia-ml.so`** are
+found:
+
+```sh
+conda_prefix=$(conda run -n "${env_name}" printenv CONDA_PREFIX)
+find "${conda_prefix}" -name cuda_runtime.h
+find "${conda_prefix}" -name libnvidia-ml.so
+```
+
+### Install cuDNN
+
+[cuDNN](https://developer.nvidia.com/cudnn) is a build-time dependency for the
+CUDA variant of FBGEMM_GPU. Download and extract the cuDNN package for the
+given CUDA version:
+
+```sh
+# cuDNN package URLs can be found in: https://github.com/pytorch/builder/blob/main/common/install_cuda.sh
+cudnn_url=https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz
+
+# Download and unpack cuDNN
+wget -q "${cudnn_url}" -O cudnn.tar.xz
+```
+
+### [OPTIONAL] Install CUB
+
+[CUB](https://docs.nvidia.com/cuda/cub/index.html) is a build-time dependency for
+the CUDA variant of FBGEMM_GPU. It must be installed separately for
+**older versions of CUDA (prior to 11.1)**, since those versions did not come
+packaged with CUB.
+
+To install CUB through Conda:
+
+```sh
+conda install -c bottler nvidiacub
+```
+
+Alternatively, CUB may be installed manually by downloading it from the
+[GitHub Releases](https://github.com/NVIDIA/cub/releases) page and unpacking
+the package:
+
+```sh
+# Download and unpack CUB
+wget -q https://github.com/NVIDIA/cub/archive/1.10.0.tar.gz
+```
+
+
+## Set Up for ROCm Build
+
+Setting the machine up for ROCm builds of FBGEMM_GPU can be done either through
+pre-built Docker images or through bare metal.
+
+### Docker Image
+
+For setups through Docker, simply pull the pre-installed
+[Docker image for ROCm](https://hub.docker.com/r/rocm/rocm-terminal) for the
+desired ROCm version.
+
+```sh
+# Run for ROCm 5.4.2
+docker run -it --entrypoint "/bin/bash" rocm/rocm-terminal:5.4.2
+```
+
+From there, the rest of the build environment may be constructed through Conda.
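+
+As a quick sanity check, one can confirm the ROCm toolchain inside the
+container before proceeding with the rest of the setup (a minimal sketch,
+assuming the image ships `hipcc` on the `PATH` and installs ROCm under
+`/opt/rocm`):
+
+```sh
+# Confirm that the HIP compiler is present (no GPU is needed for this check)
+hipcc --version
+
+# List the installed ROCm components
+ls /opt/rocm
+```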
+
+### Install ROCm
+
+Install the full ROCm package through the operating system package manager. The
+full instructions can be found in the
+[ROCm installation guide](https://docs.amd.com/bundle/ROCm-Installation-Guide-v5.4.3/page/How_to_Install_ROCm.html):
+
+```sh
+# [OPTIONAL] Disable apt installation prompts
+export DEBIAN_FRONTEND=noninteractive
+
+# Update the repo DB
+apt update
+
+# Download the installer
+wget https://repo.radeon.com/amdgpu-install/5.4.3/ubuntu/focal/amdgpu-install_5.4.50403-1_all.deb
+
+# Run the installer
+apt install ./amdgpu-install_5.4.50403-1_all.deb
+
+# Install ROCm
+amdgpu-install -y --usecase=hiplibsdk,rocm --no-dkms
+```
+
+### Install MIOpen
+
+[MIOpen](https://github.com/ROCmSoftwarePlatform/MIOpen) is a dependency for the
+ROCm variant of FBGEMM_GPU that needs to be installed:
+
+```sh
+apt install hipify-clang miopen-hip miopen-hip-dev
+```
+
+
+## Install PyTorch
+
+The official [PyTorch Homepage](https://pytorch.org/get-started/locally/) contains
+the most authoritative instructions on how to install PyTorch, either through
+Conda or through PIP.
+
+### Installation Through Conda
+
+```sh
+# Install the latest nightly
+conda install -n "${env_name}" -y pytorch -c pytorch-nightly
+# Install the latest test (RC)
+conda install -n "${env_name}" -y pytorch -c pytorch-test
+# Install a specific version
+conda install -n "${env_name}" -y pytorch==1.13.1 -c pytorch
+```
+
+Note that installing PyTorch through Conda without specifying a version (as in
+the case of nightly builds) may not always be reliable. For example, it is known
+that the GPU builds for PyTorch nightlies arrive in Conda 2 hours later than the
+CPU-only builds. As such, a Conda installation of `pytorch-nightly` in that time
+window will silently fall back to installing the CPU-only version.
+
+Also note that, because both the GPU and CPU-only versions of PyTorch are placed
+into the same artifact bucket, the PyTorch variant that is selected during
+installation will depend on whether or not CUDA is installed on the system. Thus
+for GPU builds, it is important to install CUDA first, prior to PyTorch.
+
+### Installation Through PIP
+
+Note that PIP is the only supported method of installing PyTorch for ROCm builds.
+
+```sh
+# Install the latest nightly
+conda run -n "${env_name}" pip install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu117/
+# Install the latest test (RC)
+conda run -n "${env_name}" pip install --pre torch --extra-index-url https://download.pytorch.org/whl/test/cu117/
+# Install a specific version
+conda run -n "${env_name}" pip install torch==1.13.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117/
+# Install the latest nightly (ROCm 5.3)
+conda run -n "${env_name}" pip install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/rocm5.3/
+```
+
+### Post-Install Checks
+
+Verify the PyTorch installation with an `import` test:
+
+```sh
+conda run -n "${env_name}" python -c "import torch.distributed"
+```
+
+For the GPU variant of PyTorch, ensure that, at the minimum, **`cuda_cmake_macros.h`**
+is found:
+
+```sh
+conda_prefix=$(conda run -n "${env_name}" printenv CONDA_PREFIX)
+find "${conda_prefix}" -name cuda_cmake_macros.h
+```
+
+
+## Build the FBGEMM_GPU Package
+
+### Preparing the Build
+
+Clone the repo along with its submodules, and install the packages listed in
+`requirements.txt`:
+
+```sh
+# !! Run inside the Conda environment !!
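+
+# [NOTE] The clone step below must use --recursive: FBGEMM's third-party
+# dependencies (e.g. asmjit, cpuinfo, googletest) are vendored as git
+# submodules, and the build will fail if they are missing.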
+
+# Select a version tag
+FBGEMM_VERSION=v0.4.0
+
+# Clone the repo along with its submodules
+git clone --recursive -b ${FBGEMM_VERSION} https://github.com/pytorch/FBGEMM.git fbgemm_${FBGEMM_VERSION}
+
+# Install additional required packages for building and testing
+cd fbgemm_${FBGEMM_VERSION}/fbgemm_gpu
+pip install -r requirements.txt
+```
+
+### The Build Process
+
+The FBGEMM_GPU build process uses a scikit-build, CMake-based build flow, and it
+keeps state across install runs. As such, builds can become stale and can cause
+problems when re-runs are attempted after a build failure due to missing
+dependencies, etc. To address this, simply clear the build cache:
+
+```sh
+# !! Run in fbgemm_gpu/ directory inside the Conda environment !!
+
+python setup.py clean
+```
+
+### CUDA Build
+
+Building FBGEMM_GPU for CUDA requires both NVML and cuDNN to be installed and
+made available to the build through environment variables:
+
+```sh
+# !! Run in fbgemm_gpu/ directory inside the Conda environment !!
+
+# [OPTIONAL] Specify the CUDA installation paths
+# This may be required if CMake is unable to find nvcc
+export CUDACXX=/path/to/nvcc
+export CUDA_BIN_PATH=/path/to/cuda/installation
+
+# [OPTIONAL] Provide the CUB installation directory (applicable only to CUDA versions prior to 11.1)
+export CUB_DIR=/path/to/cub
+
+# Specify cuDNN header and library paths
+export CUDNN_INCLUDE_DIR=/path/to/cudnn/include
+export CUDNN_LIBRARY=/path/to/cudnn/lib
+
+# Specify NVML path
+export NVML_LIB_PATH=/path/to/libnvidia-ml.so
+
+# Update to reflect the version of Python in the Conda environment
+python_tag=py310
+package_name=fbgemm_gpu
+
+# Build for SM70/80 (V100/A100 GPU); update as needed
+# If not specified, only the CUDA architecture supported by the current system will be targeted
+# If no CUDA device is present either, all CUDA architectures will be targeted
+cuda_arch_list="7.0;8.0"
+
+# Build the wheel artifact only
+python setup.py bdist_wheel \
+    --package_name="${package_name}" \
+    --python-tag="${python_tag}" \
+    --plat-name=manylinux1_x86_64 \
+    --nvml_lib_path=${NVML_LIB_PATH} \
+    -DTORCH_CUDA_ARCH_LIST="${cuda_arch_list}"
+
+# Build and install the library into the Conda environment
+python setup.py install \
+    --nvml_lib_path=${NVML_LIB_PATH} \
+    -DTORCH_CUDA_ARCH_LIST="${cuda_arch_list}"
+```
+
+### ROCm Build
+
+For ROCm builds, `ROCM_PATH` and `PYTORCH_ROCM_ARCH` need to be specified:
+
+```sh
+# !! Run in fbgemm_gpu/ directory inside the Conda environment !!
+
+export ROCM_PATH=/path/to/rocm
+
+# Build for the ROCm architecture on the current machine; update as needed (e.g. 'gfx906;gfx908;gfx90a')
+export PYTORCH_ROCM_ARCH=$(${ROCM_PATH}/bin/rocminfo | grep -o -m 1 'gfx.*')
+
+python_tag=py310
+package_name=fbgemm_gpu_rocm
+
+# Build the wheel artifact only
+python setup.py bdist_wheel \
+    --package_name="${package_name}" \
+    --python-tag="${python_tag}" \
+    --plat-name=manylinux1_x86_64
+
+# Build and install the library into the Conda environment
+python setup.py build develop
+```
+
+### CPU-Only Build
+
+For CPU-only builds, the `--cpu_only` flag needs to be specified:
+
+```sh
+# !! Run in fbgemm_gpu/ directory inside the Conda environment !!
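+
+# [NOTE] The package name below distinguishes the CPU-only artifact
+# (fbgemm_gpu_cpu) from the CUDA (fbgemm_gpu) and ROCm (fbgemm_gpu_rocm)
+# variants built in the preceding sections.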
+ +python_tag=py310 +package_name=fbgemm_gpu_cpu + +# Build the wheel artifact only +python setup.py bdist_wheel \ + --package_name="${package_name}" \ + --python-tag="${python_tag}" \ + --plat-name=manylinux1_x86_64 \ + --cpu_only + +# Build and install the library into the Conda environment +python setup.py install --cpu_only +``` + +### Post-Build Checks + +After the build completes, it is useful to check the built library and verify +the version numbers of GLIBCXX referenced as well as the availability of certain +function symbols: + +```sh +# !! Run in fbgemm_gpu/ directory inside the Conda environment !! + +# Locate the built .SO file +fbgemm_gpu_lib_path=$(find . -name fbgemm_gpu_py.so) + +# Note the versions of GLIBCXX referenced by the .SO +# The libstdc++.so.6 available on the install target must support these versions +objdump -TC "${fbgemm_gpu_lib_path}" | grep GLIBCXX | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat + +# Test for the existence of a given function symbol in the .SO +nm -gDC "${fbgemm_gpu_lib_path}" | grep " fbgemm_gpu::merge_pooled_embeddings(" +nm -gDC "${fbgemm_gpu_lib_path}" | grep " fbgemm_gpu::jagged_2d_to_dense(" +``` diff --git a/fbgemm_gpu/docs/README.md b/fbgemm_gpu/docs/README.md index 097cde17dc..e2b0c81ae7 100644 --- a/fbgemm_gpu/docs/README.md +++ b/fbgemm_gpu/docs/README.md @@ -123,7 +123,7 @@ Follow these instructions to document, generate, and publish a new C++ descripti ``` pip3 install -r requirements.txt - doxygen Doxygen.ini + doxygen Doxyfile.in make html ``` diff --git a/fbgemm_gpu/setup.py b/fbgemm_gpu/setup.py index 6b8ebbb570..2b34cb240a 100644 --- a/fbgemm_gpu/setup.py +++ b/fbgemm_gpu/setup.py @@ -7,6 +7,7 @@ import argparse import os import random +import re import subprocess import sys @@ -38,8 +39,9 @@ def generate_package_version(package_name: str): print( f"[SETUP.PY] TAG: {gitversion.get_tag()}, BRANCH: {gitversion.get_branch()}, SHA: {gitversion.get_sha()}" ) - # Remove the local version identifier, if any (0.4.0rc0.post0+git.6a63116c.dirty => 0.4.0rc0.post0) - version = gitversion.version_from_git().split("+")[0] + # Remove the local version identifier, if any (e.g. 0.4.0rc0.post0+git.6a63116c.dirty => 0.4.0rc0.post0) + # Then remove post0 (keep postN for N > 0) (e.g. 0.4.0rc0.post0 => 0.4.0rc0) + version = re.sub(".post0$", "", gitversion.version_from_git().split("+")[0]) print(f"[SETUP.PY] Setting the package version: {version}") return version From da01a59556fec9776733bf20aea8fe8fb29cdd3d Mon Sep 17 00:00:00 2001 From: Alfredo Tupone Date: Tue, 14 Mar 2023 19:39:40 -0700 Subject: [PATCH 06/34] include cstdint (#1640) Summary: fix build with gcc-13 Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1640 Reviewed By: shintaro-iwasaki Differential Revision: D44044422 Pulled By: q10 fbshipit-source-id: 692ec9c34f4aaf726294a2b643fbceabf8159033 --- include/fbgemm/UtilsAvx2.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/fbgemm/UtilsAvx2.h b/include/fbgemm/UtilsAvx2.h index a1af6078a8..4fb1220eba 100644 --- a/include/fbgemm/UtilsAvx2.h +++ b/include/fbgemm/UtilsAvx2.h @@ -8,6 +8,7 @@ // This file defines common utilities used in code compiled with avx2/avx512 // flags. 
+#include #include namespace fbgemm { From ae6235bc4b521102155fad4b54f92df34b5a6afe Mon Sep 17 00:00:00 2001 From: Sarunya Pumma Date: Wed, 15 Mar 2023 13:55:25 -0700 Subject: [PATCH 07/34] Add support for group size > 54 in group_index_select (#1611) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1611 If group size is larger than 54, internally breaks the group down into smaller groups (each subgroup size is less than or equal to 54). Reviewed By: jianyuh Differential Revision: D43585937 fbshipit-source-id: bf14eeb79881a5737dcf7660e3e0f56d21f7b326 --- fbgemm_gpu/src/sparse_ops_gpu.cpp | 39 +++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/fbgemm_gpu/src/sparse_ops_gpu.cpp b/fbgemm_gpu/src/sparse_ops_gpu.cpp index e3e1225fb9..0126ff414f 100644 --- a/fbgemm_gpu/src/sparse_ops_gpu.cpp +++ b/fbgemm_gpu/src/sparse_ops_gpu.cpp @@ -500,12 +500,41 @@ Tensor index_select_dim0_gpu( std::vector group_index_select_dim0_gpu( const std::vector& input_group, const std::vector& indices_group) { + const auto group_size = input_group.size(); std::vector output_group; - apply_( - [&](auto&&... args) { - output_group = GroupIndexSelectDim0GPUOp::apply(indices_group, args...); - }, - input_group); + // We use the APPLY_AUTOGRAD_FN macros to instantiate + // GroupIndexSelectDim0GPUOp for different group sizes. We only instantiate + // up to group size of 54. + constexpr size_t max_group_size = 54; + // Specialize this path to avoid copy + if (group_size <= max_group_size) { + apply_( + [&](auto&&... args) { + output_group = + GroupIndexSelectDim0GPUOp::apply(indices_group, args...); + }, + input_group); + return output_group; + } + + const auto input_itr = input_group.begin(); + const auto indices_itr = indices_group.begin(); + + for (size_t start = 0; start < group_size; start += max_group_size) { + const auto end = std::min(start + max_group_size, group_size); + std::vector input_subgroup(input_itr + start, input_itr + end); + std::vector indices_subgroup( + indices_itr + start, indices_itr + end); + std::vector output_subgroup; + apply_( + [&](auto&&... args) { + output_subgroup = + GroupIndexSelectDim0GPUOp::apply(indices_subgroup, args...); + }, + input_subgroup); + output_group.insert( + output_group.end(), output_subgroup.begin(), output_subgroup.end()); + } return output_group; } From fd0eb83bd68a0980a03d64da2e846675777a095c Mon Sep 17 00:00:00 2001 From: Doe Hyun Yoon Date: Thu, 16 Mar 2023 10:18:15 -0700 Subject: [PATCH 08/34] Implement cache miss emulation in UVM_CACHING (#1637) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1637 Enforce cache misses (even if trace-driven testing doesn't experience cache miss due to limited trace size) so that we can evaluate performance under cache misses. Note that it's not exactly cache misses; enforce access to UVM by overriding lxu_cache_locations -- N / 256 requests. 
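For example, a hypothetical invocation (the benchmark binary name here is
illustrative only; `tbe_uvm_cache_enforced_misses` is the gflag added in this
diff) that forces 32 out of every 256 lookups (12.5%) to be treated as misses:

```sh
./tbe_benchmark --tbe_uvm_cache_enforced_misses=32
```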
Reviewed By: YuzeDaiMeta Differential Revision: D42194019 fbshipit-source-id: ab04c1cc7a749e84d605cfe4f1687489ceab5725 --- .../embedding_forward_quantized_host.cpp | 65 ++++++++-- .../split_embeddings_cache_cuda.cuh | 6 + fbgemm_gpu/src/split_embeddings_cache_cuda.cu | 92 +++++++++++--- .../test/uvm_cache_miss_emulate_test.cpp | 119 ++++++++++++++++++ 4 files changed, 258 insertions(+), 24 deletions(-) create mode 100644 fbgemm_gpu/test/uvm_cache_miss_emulate_test.cpp diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp b/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp index 6d4426cb27..43a182b6b1 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp @@ -4,12 +4,12 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ + #include #include #include #include #include -#include #include "c10/core/ScalarType.h" #ifdef FBCODE_CAFFE2 #include "common/stats/Stats.h" @@ -18,6 +18,8 @@ #include "fbgemm_gpu/sparse_ops_utils.h" #include "fbgemm_gpu/split_embeddings_cache_cuda.cuh" +#include + using Tensor = at::Tensor; using namespace fbgemm_gpu; @@ -37,7 +39,7 @@ DEFINE_quantile_stat( facebook::fb303::ExportTypeConsts::kNone, std::array{{.25, .50, .75, .99}}); -// Miss rate due to conflict in cache associativity. +// (Unique) Miss rate due to conflict in cache associativity. // # unique misses due to conflict / # requested indices. DEFINE_quantile_stat( tbe_uvm_cache_conflict_unique_miss_rate, @@ -45,6 +47,21 @@ DEFINE_quantile_stat( facebook::fb303::ExportTypeConsts::kNone, std::array{{.25, .50, .75, .99}}); +// Miss rate due to conflict in cache associativity. +// # misses due to conflict / # requested indices. +DEFINE_quantile_stat( + tbe_uvm_cache_conflict_miss_rate, + "tbe_uvm_cache_conflict_miss_rate_per_mille", + facebook::fb303::ExportTypeConsts::kNone, + std::array{{.25, .50, .75, .99}}); + +// Total miss rate. +DEFINE_quantile_stat( + tbe_uvm_cache_total_miss_rate, + "tbe_uvm_cache_total_miss_rate_per_mille", + facebook::fb303::ExportTypeConsts::kNone, + std::array{{.25, .50, .75, .99}}); + // FLAGs to control UVMCacheStats. DEFINE_int32( tbe_uvm_cache_stat_report, @@ -58,6 +75,12 @@ DEFINE_int32( "If tbe_uvm_cache_stat_report is enabled, more detailed raw stats will be printed with this " "period. This should be an integer multiple of tbe_uvm_cache_stat_report."); +DEFINE_int32( + tbe_uvm_cache_enforced_misses, + 0, + "If set to non-zero, some cache lookups (tbe_uvm_cache_enforced_misses / 256) are enforced to be misses; " + "this is performance evaluation purposes only; and should be zero otherwise."); + // TODO: align this with uvm_cache_stats_index in // split_embeddings_cache_cuda.cu. const int kUvmCacheStatsSize = 6; @@ -84,10 +107,11 @@ void process_uvm_cache_stats( // uvm_cache_stats_counters[0]: num_req_indices // uvm_cache_stats_counters[1]: num_unique_indices // uvm_cache_stats_counters[2]: num_unique_misses - // uvm_cache_stats_counters[3]: num_unique_conflict_misses + // uvm_cache_stats_counters[3]: num_conflict_unique_misses + // uvm_cache_stats_counters[4]: num_conflict_misses // They should be zero-out after the calculated rates are populated into // cache counters. - static std::vector uvm_cache_stats_counters(4); + static std::vector uvm_cache_stats_counters(5); // Export cache stats. 
auto uvm_cache_stats_cpu = uvm_cache_stats.cpu(); @@ -107,19 +131,32 @@ void process_uvm_cache_stats( // Calculate cache related ratios based on the cumulated numbers and // push them into the counter pools. if (populate_uvm_stats && uvm_cache_stats_counters[0] > 0) { - double unique_rate = + const double unique_rate = static_cast(uvm_cache_stats_counters[1]) / uvm_cache_stats_counters[0] * 1000; - double unique_miss_rate = + const double unique_miss_rate = static_cast(uvm_cache_stats_counters[2]) / uvm_cache_stats_counters[0] * 1000; - double unique_conflict_miss_rate = + const double conflict_unique_miss_rate = static_cast(uvm_cache_stats_counters[3]) / uvm_cache_stats_counters[0] * 1000; + const double conflict_miss_rate = + static_cast(uvm_cache_stats_counters[4]) / + uvm_cache_stats_counters[0] * 1000; + // total # misses = unique misses - conflict_unique_misses + conflict + // misses. + const double total_miss_rate = + static_cast( + uvm_cache_stats_counters[2] - uvm_cache_stats_counters[3] + + uvm_cache_stats_counters[4]) / + uvm_cache_stats_counters[0] * 1000; + STATS_tbe_uvm_cache_unique_rate.addValue(unique_rate); STATS_tbe_uvm_cache_unique_miss_rate.addValue(unique_miss_rate); STATS_tbe_uvm_cache_conflict_unique_miss_rate.addValue( - unique_conflict_miss_rate); + conflict_unique_miss_rate); + STATS_tbe_uvm_cache_conflict_miss_rate.addValue(conflict_miss_rate); + STATS_tbe_uvm_cache_total_miss_rate.addValue(total_miss_rate); // Fill all the elements of the vector uvm_cache_stats_counters as 0 // to zero out the cumulated counters. @@ -365,7 +402,7 @@ Tensor int_nbit_split_embedding_uvm_caching_codegen_lookup_function( // cache_index_table_map: (linearized) index to table number map. // 1D tensor, dtype=int32. c10::optional cache_index_table_map, - // lxu_cache_state: Cache state (cached idnex, or invalid). + // lxu_cache_state: Cache state (cached index, or invalid). // 2D tensor: # sets x assoc. dtype=int64. c10::optional lxu_cache_state, // lxu_state: meta info for replacement (time stamp for LRU). @@ -461,6 +498,16 @@ Tensor int_nbit_split_embedding_uvm_caching_codegen_lookup_function( uvm_cache_stats); #ifdef FBCODE_CAFFE2 + if (FLAGS_tbe_uvm_cache_enforced_misses > 0) { + // Override some lxu_cache_locations (N for every 256 indices) with cache + // miss to enforce access to UVM. + lxu_cache_locations = emulate_cache_miss( + lxu_cache_locations.value(), + FLAGS_tbe_uvm_cache_enforced_misses, + gather_uvm_stats, + uvm_cache_stats); + } + process_uvm_cache_stats( signature, total_cache_hash_size.value(), diff --git a/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh b/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh index 52854a4f2e..3532928963 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh +++ b/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh @@ -155,6 +155,12 @@ at::Tensor lxu_cache_lookup_cuda( bool gather_cache_stats, c10::optional uvm_cache_stats); +at::Tensor emulate_cache_miss( + at::Tensor lxu_cache_locations, + const int64_t enforced_misses_per_256, + const bool gather_cache_stats, + at::Tensor uvm_cache_stats); + ///@ingroup table-batched-embed-cuda /// Lookup the LRU/LFU cache: find the cache weights location for all indices. 
/// Look up the slots in the cache corresponding to `linear_cache_indices`, with diff --git a/fbgemm_gpu/src/split_embeddings_cache_cuda.cu b/fbgemm_gpu/src/split_embeddings_cache_cuda.cu index 9d23ee9fff..e5930ab745 100644 --- a/fbgemm_gpu/src/split_embeddings_cache_cuda.cu +++ b/fbgemm_gpu/src/split_embeddings_cache_cuda.cu @@ -79,6 +79,18 @@ enum uvm_cache_stats_index { num_conflict_misses = 5, }; +// Experiments showed that performance of lru/lxu_cache_find_uncached_kernel is +// not sensitive to grid size as long as the number thread blocks per SM is not +// too small nor too big. +constexpr int MAX_THREAD_BLOCKS_PER_SM_FOR_CACHE_KERNELS = 16; + +int get_max_thread_blocks_for_cache_kernels_() { + cudaDeviceProp* deviceProp = + at::cuda::getDeviceProperties(c10::cuda::current_device()); + return deviceProp->multiProcessorCount * + MAX_THREAD_BLOCKS_PER_SM_FOR_CACHE_KERNELS; +} + } // namespace int64_t host_lxu_cache_slot(int64_t h_in, int64_t C) { @@ -495,6 +507,69 @@ std::tuple> get_unique_indices_cuda( namespace { +template +__global__ __launch_bounds__(kMaxThreads) void emulate_cache_miss_kernel( + at::PackedTensorAccessor32 + lxu_cache_locations, + const int64_t enforced_misses_per_256, + const bool gather_cache_stats, + at::PackedTensorAccessor32 + uvm_cache_stats) { + const int32_t N = lxu_cache_locations.size(0); + int64_t n_enforced_misses = 0; + CUDA_KERNEL_LOOP(n, N) { + if ((n & 0x00FF) < enforced_misses_per_256) { + if (lxu_cache_locations[n] >= 0) { + n_enforced_misses++; + } + lxu_cache_locations[n] = kCacheLocationMissing; + } + } + if (gather_cache_stats && n_enforced_misses > 0) { + atomicAdd( + &uvm_cache_stats[uvm_cache_stats_index::num_conflict_misses], + n_enforced_misses); + } +} +} // namespace + +Tensor emulate_cache_miss( + Tensor lxu_cache_locations, + const int64_t enforced_misses_per_256, + const bool gather_cache_stats, + Tensor uvm_cache_stats) { + TENSOR_ON_CUDA_GPU(lxu_cache_locations); + TENSOR_ON_CUDA_GPU(uvm_cache_stats); + + const auto N = lxu_cache_locations.numel(); + if (lxu_cache_locations.numel() == 0) { + // nothing to do + return lxu_cache_locations; + } + + const dim3 blocks(std::min( + div_round_up(N, kMaxThreads), + get_max_thread_blocks_for_cache_kernels_())); + + AT_DISPATCH_INDEX_TYPES( + lxu_cache_locations.scalar_type(), "emulate_cache_miss", [&] { + emulate_cache_miss_kernel<<< + blocks, + kMaxThreads, + 0, + at::cuda::getCurrentCUDAStream()>>>( + lxu_cache_locations + .packed_accessor32(), + enforced_misses_per_256, + gather_cache_stats, + uvm_cache_stats + .packed_accessor32()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + return lxu_cache_locations; +} + +namespace { template __global__ __launch_bounds__(kMaxThreads) void lru_cache_find_uncached_kernel( const at::PackedTensorAccessor32 @@ -622,19 +697,6 @@ __launch_bounds__(kMaxThreads) void direct_mapped_lru_cache_find_uncached_kernel } } } - -// Experiments showed that performance of lru/lxu_cache_find_uncached_kernel is -// not sensitive to grid size as long as the number thread blocks per SM is not -// too small nor too big. 
-constexpr int MAX_THREAD_BLOCKS_PER_SM_FOR_CACHE_KERNELS = 16; - -int get_max_thread_blocks_for_cache_kernels_() { - cudaDeviceProp* deviceProp = - at::cuda::getDeviceProperties(c10::cuda::current_device()); - return deviceProp->multiProcessorCount * - MAX_THREAD_BLOCKS_PER_SM_FOR_CACHE_KERNELS; -} - } // namespace std::pair lru_cache_find_uncached_cuda( @@ -798,8 +860,8 @@ __global__ __launch_bounds__(kMaxThreads) void lru_cache_insert_kernel( at::PackedTensorAccessor32 uvm_cache_stats) { const int32_t C = lxu_cache_state.size(0); - int64_t n_conflict_misses = 0; - int64_t n_inserted = 0; + int32_t n_conflict_misses = 0; + int32_t n_inserted = 0; for (int32_t n = blockIdx.x * blockDim.y + threadIdx.y; n < *N_unique; n += gridDim.x * blockDim.y) { // check if this warp is responsible for this whole segment. diff --git a/fbgemm_gpu/test/uvm_cache_miss_emulate_test.cpp b/fbgemm_gpu/test/uvm_cache_miss_emulate_test.cpp new file mode 100644 index 0000000000..808ed33624 --- /dev/null +++ b/fbgemm_gpu/test/uvm_cache_miss_emulate_test.cpp @@ -0,0 +1,119 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include + +#include "fbgemm_gpu/split_embeddings_cache_cuda.cuh" + +using namespace ::testing; + +// Helper function that generates input tensor for emulate_cache_miss testing. +at::Tensor generate_lxu_cache_locations( + const int64_t num_requests, + const int64_t num_sets, + const int64_t associativity = 32) { + const auto lxu_cache_locations = at::randint( + 0, + num_sets * associativity, + {num_requests}, + at::device(at::kCPU).dtype(at::kInt)); + return lxu_cache_locations; +} + +// Wrapper function that takes lxu_cache_locations on CPU, copies it to GPU, +// runs emulate_cache_miss(), and then returns the result, placed on CPU. +std::pair run_emulate_cache_miss( + at::Tensor lxu_cache_locations, + const int64_t enforced_misses_per_256, + const bool gather_uvm_stats = false) { + at::Tensor lxu_cache_locations_copy = at::_to_copy(lxu_cache_locations); + const auto options = + lxu_cache_locations.options().device(at::kCUDA).dtype(at::kInt); + const auto uvm_cache_stats = + gather_uvm_stats ? 
at::zeros({6}, options) : at::empty({0}, options); + + const auto lxu_cache_location_with_cache_misses = emulate_cache_miss( + lxu_cache_locations_copy.to(at::kCUDA), + enforced_misses_per_256, + gather_uvm_stats, + uvm_cache_stats); + return {lxu_cache_location_with_cache_misses.cpu(), uvm_cache_stats.cpu()}; +} + +TEST(uvm_cache_miss_emulate_test, no_cache_miss) { + constexpr int64_t num_requests = 10000; + constexpr int64_t num_sets = 32768; + constexpr int64_t associativity = 32; + + auto lxu_cache_locations_cpu = + generate_lxu_cache_locations(num_requests, num_sets, associativity); + auto lxu_cache_location_with_cache_misses_and_uvm_cache_stats = + run_emulate_cache_miss(lxu_cache_locations_cpu, 0); + auto lxu_cache_location_with_cache_misses = + lxu_cache_location_with_cache_misses_and_uvm_cache_stats.first; + EXPECT_TRUE( + at::equal(lxu_cache_locations_cpu, lxu_cache_location_with_cache_misses)); +} + +TEST(uvm_cache_miss_emulate_test, enforced_cache_miss) { + constexpr int64_t num_requests = 10000; + constexpr int64_t num_sets = 32768; + constexpr int64_t associativity = 32; + constexpr std::array enforced_misses_per_256_for_testing = { + 1, 5, 7, 33, 100, 256}; + + for (const bool miss_in_lxu_cache_locations : {false, true}) { + for (const bool gather_cache_stats : {false, true}) { + for (const auto enforced_misses_per_256 : + enforced_misses_per_256_for_testing) { + auto lxu_cache_locations_cpu = + generate_lxu_cache_locations(num_requests, num_sets, associativity); + if (miss_in_lxu_cache_locations) { + // one miss in the original lxu_cache_locations; shouldn't be counted + // as enforced misses from emulate_cache_miss(). + auto z = lxu_cache_locations_cpu.data_ptr(); + z[0] = -1; + } + auto lxu_cache_location_with_cache_misses_and_uvm_cache_stats = + run_emulate_cache_miss( + lxu_cache_locations_cpu, + enforced_misses_per_256, + gather_cache_stats); + auto lxu_cache_location_with_cache_misses = + lxu_cache_location_with_cache_misses_and_uvm_cache_stats.first; + EXPECT_FALSE(at::equal( + lxu_cache_locations_cpu, lxu_cache_location_with_cache_misses)); + + auto x = lxu_cache_locations_cpu.data_ptr(); + auto y = lxu_cache_location_with_cache_misses.data_ptr(); + int64_t enforced_misses = 0; + for (int32_t i = 0; i < lxu_cache_locations_cpu.numel(); ++i) { + if (x[i] != y[i]) { + EXPECT_EQ(y[i], -1); + enforced_misses++; + } + } + int64_t num_requests_over_256 = + static_cast(num_requests / 256); + int64_t expected_misses = num_requests_over_256 * + enforced_misses_per_256 + + std::min((num_requests - num_requests_over_256 * 256), + enforced_misses_per_256); + if (miss_in_lxu_cache_locations) { + expected_misses--; + } + EXPECT_EQ(expected_misses, enforced_misses); + if (gather_cache_stats) { + auto uvm_cache_stats = + lxu_cache_location_with_cache_misses_and_uvm_cache_stats.second; + auto cache_stats_ptr = uvm_cache_stats.data_ptr(); + // enforced misses are recorded as conflict misses. + EXPECT_EQ(expected_misses, cache_stats_ptr[5]); + } + } + } + } +} From 39c5aa4f9fc32202b13a33af9bebdd3dd8266b9a Mon Sep 17 00:00:00 2001 From: Sarunya Pumma Date: Thu, 16 Mar 2023 11:01:37 -0700 Subject: [PATCH 09/34] Add TensorAccessor with memcheck (#1602) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1602 Illegal memory access is a common problem during GPU kernel execution. The FBGEMM GPU relies on PyTorch's `C10_CUDA_KERNEL_LAUNCH_CHECK()` and the CUDA runtime to detect such problems and throw an error. However, there are a few known issues with this approach. 
(1) `C10_CUDA_KERNEL_LAUNCH_CHECK()` detects errors on the host. However, due to the non-blocking, asynchronous nature of GPU kernel execution, the error is caught on the host at a later point than where the problematic kernel was launched. This can cause the stack trace to be inaccurate and make debugging more difficult. Although the issue can be fixed by running the code with `CUDA_LAUNCH_BLOCKING=1`, this can change the state of the execution and cause Heisenbugs. (2) Not all illegal memory accesses are caught by the runtime. This means that the system may not always throw an error when illegal memory access occurs. (3) Although the runtime throws an error for illegal memory access, it is difficult to pinpoint the specific kernel and memory buffer/address that is causing the problem. For all the aforementioned reasons, we attempt to catch and throw an error as soon as possible in the kernel when illegal memory accesses occur in FBGEMM GPU. We introduce the `FBGEMM_GPU_MEMCHECK` flag to enable memory checking during compile time. We copy PyTorch's `TensorAccessor.h` into the FBGEMM GPU and extend it to check every memory access through the `PackedTensorAccessor`. If an invalid memory access occurs, we throw an error using `CUDA_KERNEL_ASSERT`. The error message includes the name of the tensor and the kernel that caused the problem. If `FBGEMM_GPU_MEMCHECK` is enabled, FBGEMM operators will use `fbgemm::PackedTensorAccessor`. Otherwise, they will use `at::PackedTensorAccessor` `FBGEMM_GPU_MEMCHECK` integration in FBGEMM ops will be done in subsequent diffs Reviewed By: r-barnes Differential Revision: D43421838 fbshipit-source-id: c8ef04970d94bb097cb5f09b42f994db72845167 --- fbgemm_gpu/CMakeLists.txt | 3 +- .../fbgemm_gpu/fbgemm_tensor_accessor.h | 575 ++++++++++++++++++ 2 files changed, 577 insertions(+), 1 deletion(-) create mode 100644 fbgemm_gpu/include/fbgemm_gpu/fbgemm_tensor_accessor.h diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 5f393b0010..036470adf2 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -167,7 +167,8 @@ set(codegen_dependencies ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/quantize_ops_utils.h ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/split_embeddings_utils.cuh ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/sparse_ops_utils.h - ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh) + ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh + ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/fbgemm_tensor_accessor.h) if(USE_ROCM) message(STATUS "${PYTHON_EXECUTABLE}" "${CMAKE_CODEGEN_DIR}/embedding_backward_code_generator.py" "--opensource --is_rocm") diff --git a/fbgemm_gpu/include/fbgemm_gpu/fbgemm_tensor_accessor.h b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_tensor_accessor.h new file mode 100644 index 0000000000..750d315d05 --- /dev/null +++ b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_tensor_accessor.h @@ -0,0 +1,575 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +namespace fbgemm_gpu { + +static constexpr size_t PTR_NAME_MAX_LEN = 16; +static constexpr size_t FUNC_NAME_MAX_LEN = 64; + +// The PtrTraits argument to the TensorAccessor/GenericPackedTensorAccessor +// is used to enable the __restrict__ keyword/modifier for the data +// passed to cuda. +template +struct DefaultPtrTraits { + typedef T* PtrType; +}; + +#if defined(__CUDACC__) || defined(__HIPCC__) +template +struct RestrictPtrTraits { + typedef T* __restrict__ PtrType; +}; +#endif + +// TensorAccessorBase and TensorAccessor are used for both CPU and CUDA tensors. +// For CUDA tensors it is used in device code (only). This means that we +// restrict ourselves to functions and types available there (e.g. +// at::IntArrayRef isn't). + +// The PtrTraits argument is only relevant to cuda to support `__restrict__` +// pointers. +template < + typename T, + size_t N, + template class PtrTraits = DefaultPtrTraits, + typename index_t = int64_t> +class TensorAccessorBase { + public: + typedef typename PtrTraits::PtrType PtrType; + + C10_HOST_DEVICE TensorAccessorBase( + PtrType data, + const index_t* const sizes, + const index_t* const strides, + const char* const ptr_name, + const char* const func_name) + : data_(data), + sizes_(sizes), + strides_(strides), + ptr_name_(ptr_name), + func_name_(func_name) { + numel_ = 0; + for (size_t d = 0; d < N; d++) { + numel_ += sizes[d]; + } + } + C10_HOST at::IntArrayRef sizes() const { + return at::IntArrayRef(sizes_, N); + } + C10_HOST at::IntArrayRef strides() const { + return at::IntArrayRef(strides_, N); + } + C10_HOST_DEVICE index_t stride(index_t i) const { + return strides_[i]; + } + C10_HOST_DEVICE index_t size(index_t i) const { + return sizes_[i]; + } + C10_HOST_DEVICE PtrType data() { + return data_; + } + C10_HOST_DEVICE const PtrType data() const { + return data_; + } + C10_HOST_DEVICE T& at(index_t idx) const { + if (idx < 0) { + printf( + "ERROR: idx < 0, tensor %s in %s, idx %lld\n", + ptr_name_, + func_name_, + static_cast(idx)); + CUDA_KERNEL_ASSERT(idx >= 0) + } else if (idx >= numel_) { + printf( + "ERROR: idx >= numel, tensor %s in %s, idx %lld, numel %lld\n", + ptr_name_, + func_name_, + static_cast(idx), + static_cast(numel_)); + CUDA_KERNEL_ASSERT(idx < numel_); + } + return data_[idx]; + } + + protected: + PtrType data_; + const index_t* const sizes_; + const index_t* const strides_; + index_t numel_; + const char* const ptr_name_; + const char* const func_name_; +}; + +// The `TensorAccessor` is typically instantiated for CPU `Tensor`s using +// `Tensor.accessor()`. +// For CUDA `Tensor`s, `GenericPackedTensorAccessor` is used on the host and +// only indexing on the device uses `TensorAccessor`s. 
+template < + typename T, + size_t N, + template class PtrTraits = DefaultPtrTraits, + typename index_t = int64_t> +class TensorAccessor : public TensorAccessorBase { + public: + typedef typename PtrTraits::PtrType PtrType; + + C10_HOST_DEVICE TensorAccessor( + PtrType data, + const index_t* const sizes, + const index_t* const strides, + const char* const ptr_name, + const char* const func_name) + : TensorAccessorBase( + data, + sizes, + strides, + ptr_name, + func_name) {} + + C10_HOST_DEVICE TensorAccessor operator[]( + index_t i) { + return TensorAccessor( + this->data_ + this->strides_[0] * i, + this->sizes_ + 1, + this->strides_ + 1, + this->ptr_name_, + this->func_name); + } + + C10_HOST_DEVICE const TensorAccessor operator[]( + index_t i) const { + return TensorAccessor( + this->data_ + this->strides_[0] * i, + this->sizes_ + 1, + this->strides_ + 1, + this->ptr_name_, + this->func_name); + } +}; + +template class PtrTraits, typename index_t> +class TensorAccessor + : public TensorAccessorBase { + public: + typedef typename PtrTraits::PtrType PtrType; + + C10_HOST_DEVICE TensorAccessor( + PtrType data, + const index_t* const sizes, + const index_t* const strides, + const char* const ptr_name, + const char* func_name) + : TensorAccessorBase( + data, + sizes, + strides, + ptr_name, + func_name) {} + C10_HOST_DEVICE T& operator[](index_t i) { + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + return this->at(this->strides_[0] * i); + } + C10_HOST_DEVICE const T& operator[](index_t i) const { + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + return this->at(this->strides_[0] * i); + } +}; + +// GenericPackedTensorAccessorBase and GenericPackedTensorAccessor are used on +// for CUDA `Tensor`s on the host and as In contrast to `TensorAccessor`s, they +// copy the strides and sizes on instantiation (on the host) in order to +// transfer them on the device when calling kernels. On the device, indexing of +// multidimensional tensors gives to `TensorAccessor`s. Use RestrictPtrTraits as +// PtrTraits if you want the tensor's data pointer to be marked as __restrict__. +// Instantiation from data, sizes, strides is only needed on the host and +// std::copy isn't available on the device, so those functions are host only. 
+template < + typename T, + size_t N, + template class PtrTraits = DefaultPtrTraits, + typename index_t = int64_t> +class GenericPackedTensorAccessorBase { + public: + typedef typename PtrTraits::PtrType PtrType; + C10_HOST GenericPackedTensorAccessorBase( + PtrType data, + const index_t* const sizes, + const index_t* const strides, + const char* const ptr_name, + const char* const func_name) + : data_(data) { + std::copy(sizes, sizes + N, std::begin(sizes_)); + std::copy(strides, strides + N, std::begin(strides_)); + // Compute numel_ + numel_ = 0; + for (size_t d = 0; d < N; d++) { + numel_ += sizes[d]; + } + copy_str(ptr_name_, ptr_name, PTR_NAME_MAX_LEN); + copy_str(func_name_, func_name, FUNC_NAME_MAX_LEN); + } + + // if index_t is not int64_t, we want to have an int64_t constructor + template < + typename source_index_t, + class = typename std::enable_if< + std::is_same::value>::type> + C10_HOST GenericPackedTensorAccessorBase( + PtrType data, + const source_index_t* const sizes, + const source_index_t* const strides, + const char* const ptr_name, + const char* const func_name) + : data_(data) { + for (const auto i : c10::irange(N)) { + this->sizes_[i] = sizes[i]; + this->strides_[i] = strides[i]; + } + // Compute numel_ + numel_ = 0; + for (size_t d = 0; d < N; d++) { + numel_ += sizes[d]; + } + copy_str(ptr_name_, ptr_name, PTR_NAME_MAX_LEN); + copy_str(func_name_, func_name, FUNC_NAME_MAX_LEN); + } + + C10_HOST void copy_str(char* dst, const char* src, const size_t max_len) { + const auto len = std::min(strlen(src), max_len - 1); + std::memcpy(dst, src, sizeof(char) * len); + dst[len] = '\0'; + } + + C10_HOST_DEVICE T& at(index_t idx) const { + if (idx < 0) { + printf( + "ERROR: idx < 0, tensor %s in %s, idx %lld\n", + ptr_name_, + func_name_, + static_cast(idx)); + CUDA_KERNEL_ASSERT(idx >= 0) + } else if (idx >= numel_) { + printf( + "ERROR: idx >= numel, tensor %s in %s, idx %lld, numel %lld\n", + ptr_name_, + func_name_, + static_cast(idx), + static_cast(numel_)); + CUDA_KERNEL_ASSERT(idx < numel_) + } + return data_[idx]; + } + + C10_HOST_DEVICE index_t stride(index_t i) const { + return strides_[i]; + } + C10_HOST_DEVICE index_t size(index_t i) const { + return sizes_[i]; + } + C10_HOST_DEVICE PtrType data() { + return data_; + } + C10_HOST_DEVICE const PtrType data() const { + return data_; + } + + protected: + PtrType data_; + index_t sizes_[N]; + index_t strides_[N]; + index_t numel_; + char ptr_name_[PTR_NAME_MAX_LEN]; + char func_name_[FUNC_NAME_MAX_LEN]; + C10_HOST void bounds_check_(index_t i) const { + TORCH_CHECK_INDEX( + 0 <= i && i < index_t{N}, + "Index ", + i, + " is not within bounds of a tensor of dimension ", + N); + } +}; + +template < + typename T, + size_t N, + template class PtrTraits = DefaultPtrTraits, + typename index_t = int64_t> +class GenericPackedTensorAccessor + : public GenericPackedTensorAccessorBase { + public: + typedef typename PtrTraits::PtrType PtrType; + + C10_HOST GenericPackedTensorAccessor( + PtrType data, + const index_t* const sizes, + const index_t* const strides, + const char* const ptr_name, + const char* const func_name) + : GenericPackedTensorAccessorBase( + data, + sizes, + strides, + ptr_name, + func_name) {} + + // if index_t is not int64_t, we want to have an int64_t constructor + template < + typename source_index_t, + class = typename std::enable_if< + std::is_same::value>::type> + C10_HOST GenericPackedTensorAccessor( + PtrType data, + const source_index_t* const sizes, + const source_index_t* const strides, + const char* 
const ptr_name, + const char* const func_name) + : GenericPackedTensorAccessorBase( + data, + sizes, + strides, + ptr_name, + func_name) {} + + C10_DEVICE TensorAccessor operator[]( + index_t i) { + index_t* new_sizes = this->sizes_ + 1; + index_t* new_strides = this->strides_ + 1; + return TensorAccessor( + this->data_ + this->strides_[0] * i, + new_sizes, + new_strides, + this->ptr_name_, + this->func_name_); + } + + C10_DEVICE const TensorAccessor operator[]( + index_t i) const { + const index_t* const new_sizes = this->sizes_ + 1; + const index_t* const new_strides = this->strides_ + 1; + return TensorAccessor( + this->data_ + this->strides_[0] * i, + new_sizes, + new_strides, + this->ptr_name_, + this->func_name_); + } + + /// Returns a PackedTensorAccessor of the same dimension after transposing the + /// two dimensions given. Does not actually move elements; transposition is + /// made by permuting the size/stride arrays. If the dimensions are not valid, + /// asserts. + C10_HOST GenericPackedTensorAccessor transpose( + index_t dim1, + index_t dim2) const { + this->bounds_check_(dim1); + this->bounds_check_(dim2); + GenericPackedTensorAccessor result( + this->data_, this->sizes_, this->strides_); + std::swap(result.strides_[dim1], result.strides_[dim2]); + std::swap(result.sizes_[dim1], result.sizes_[dim2]); + return result; + } +}; + +template class PtrTraits, typename index_t> +class GenericPackedTensorAccessor + : public GenericPackedTensorAccessorBase { + public: + typedef typename PtrTraits::PtrType PtrType; + C10_HOST GenericPackedTensorAccessor( + PtrType data, + const index_t* const sizes, + const index_t* const strides, + const char* const ptr_name, + const char* const func_name) + : GenericPackedTensorAccessorBase( + data, + sizes, + strides, + ptr_name, + func_name) {} + + // if index_t is not int64_t, we want to have an int64_t constructor + template < + typename source_index_t, + class = typename std::enable_if< + std::is_same::value>::type> + C10_HOST GenericPackedTensorAccessor( + PtrType data, + const source_index_t* const sizes, + const source_index_t* const strides, + const char* const ptr_name, + const char* const func_name) + : GenericPackedTensorAccessorBase( + data, + sizes, + strides, + ptr_name, + func_name) {} + + C10_DEVICE T& operator[](index_t i) { + return this->at(this->strides_[0] * i); + } + C10_DEVICE const T& operator[](index_t i) const { + return this->at(this->strides_[0] * i); + } + + // Same as in the general N-dimensional case, but note that in the + // 1-dimensional case the returned PackedTensorAccessor will always be an + // identical copy of the original + C10_HOST GenericPackedTensorAccessor transpose( + index_t dim1, + index_t dim2) const { + this->bounds_check_(dim1); + this->bounds_check_(dim2); + return GenericPackedTensorAccessor( + this->data_, this->sizes_, this->strides_); + } +}; + +// Can't put this directly into the macro function args because of commas +#define AT_X GenericPackedTensorAccessor + +// Old name for `GenericPackedTensorAccessor` +template < + typename T, + size_t N, + template class PtrTraits = DefaultPtrTraits, + typename index_t = int64_t> +C10_DEFINE_DEPRECATED_USING(PackedTensorAccessor, AT_X) + +#undef AT_X + +template < + typename T, + size_t N, + template class PtrTraits = DefaultPtrTraits> +using PackedTensorAccessor32 = + GenericPackedTensorAccessor; + +template < + typename T, + size_t N, + template class PtrTraits = DefaultPtrTraits> +using PackedTensorAccessor64 = + GenericPackedTensorAccessor; + +} // 
namespace fbgemm_gpu + +#ifdef FBGEMM_GPU_MEMCHECK +namespace pta = fbgemm_gpu; +#else +namespace pta = at; +#endif + +#ifdef FBGEMM_GPU_MEMCHECK +template < + typename T, + size_t N, + template class PtrTraits = at::DefaultPtrTraits, + typename index_t = int64_t> +const fbgemm_gpu::GenericPackedTensorAccessor +make_generic_packed_tensor_accessor( + at::Tensor& tensor, + const char* const ptr_name, + const char* const func_name) { + static_assert( + N > 0, + "accessor is used for indexing tensor, for scalars use *data_ptr()"); + TORCH_CHECK( + tensor.dim() == N, + "TensorAccessor expected ", + N, + " dims but tensor has ", + tensor.dim()); + return fbgemm_gpu::GenericPackedTensorAccessor( + static_cast::PtrType>(tensor.data_ptr()), + tensor.sizes().data(), + tensor.strides().data(), + ptr_name, + func_name); +} +#endif + +template < + typename T, + size_t N, + template class PtrTraits = at::DefaultPtrTraits> +const pta::PackedTensorAccessor32 +make_packed_tensor_accessor32( +#ifdef FBGEMM_GPU_MEMCHECK + at::Tensor& tensor, + const char* const ptr_name, + const char* const func_name) { +#else + at::Tensor& tensor) { +#endif + TORCH_CHECK( + tensor.numel() <= + static_cast(std::numeric_limits::max()), + "numel needs to be smaller than int32_t max; otherwise, please use packed_accessor64"); +#ifdef FBGEMM_GPU_MEMCHECK + return make_generic_packed_tensor_accessor( + tensor, ptr_name, func_name); +#else + return tensor.packed_accessor32(); +#endif +} + +template < + typename T, + size_t N, + template class PtrTraits = at::DefaultPtrTraits> +const pta::PackedTensorAccessor64 +make_packed_tensor_accessor64( +#ifdef FBGEMM_GPU_MEMCHECK + at::Tensor& tensor, + const char* const ptr_name, + const char* const func_name) { + return make_generic_packed_tensor_accessor( + tensor, ptr_name, func_name); +#else + at::Tensor& tensor) { + return tensor.packed_accessor64(); +#endif +} + +#ifdef FBGEMM_GPU_MEMCHECK +#define MAKE_PACKED_TENSOR_ACCESSOR_BASE( \ + FUNC_NAME, TENSOR, T, N, PTR_TRAITS, INDEX_NBITS) \ + make_packed_tensor_accessor##INDEX_NBITS( \ + TENSOR, #TENSOR, FUNC_NAME) + +#define MAKE_PACKED_TENSOR_ACCESSOR_ACC_TYPE_BASE( \ + FUNC_NAME, TENSOR, T, N, PTR_TRAITS, INDEX_NBITS) \ + make_packed_tensor_accessor##INDEX_NBITS< \ + at::acc_type, \ + N, \ + PTR_TRAITS>(TENSOR, #TENSOR, FUNC_NAME) +#else +#define MAKE_PACKED_TENSOR_ACCESSOR_BASE( \ + FUNC_NAME, TENSOR, T, N, PTR_TRAITS, INDEX_NBITS) \ + make_packed_tensor_accessor##INDEX_NBITS(TENSOR) + +#define MAKE_PACKED_TENSOR_ACCESSOR_ACC_TYPE_BASE( \ + FUNC_NAME, TENSOR, T, N, PTR_TRAITS, INDEX_NBITS) \ + make_packed_tensor_accessor##INDEX_NBITS< \ + at::acc_type, \ + N, \ + PTR_TRAITS>(TENSOR) +#endif From c7cddecb8ddeaad71bb42f53f2e2cebc20a0cf91 Mon Sep 17 00:00:00 2001 From: Matt Galloway Date: Thu, 16 Mar 2023 11:37:50 -0700 Subject: [PATCH 10/34] Fix compiling with Xcode 14.3 (#1648) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1648 This hack is not needed in Xcode 14.3 anymore, where the clang version is 14.0.3. So change the workaround to only include up to 14.0.2. 
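To check whether a local toolchain is affected, one can inspect the Apple clang
version directly (a quick, illustrative check; Xcode 14.3 ships clang 14.0.3,
while Xcode 14.0 through 14.2 ship clang 14.0.0):

```sh
clang --version | head -1
```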
Reviewed By: MatzeB Differential Revision: D44130421 fbshipit-source-id: 1fb2948567941bdf6ee9487ccfaa9dfb2caf92dd --- src/InlineAsmDefines.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/InlineAsmDefines.h b/src/InlineAsmDefines.h index 80612536b7..fa3f706602 100644 --- a/src/InlineAsmDefines.h +++ b/src/InlineAsmDefines.h @@ -10,13 +10,14 @@ // We need to do a hack in inline assembly in some clang versions where we have // to do `.intel_syntax noprefix`. This was fixed in clang in // https://reviews.llvm.org/D113707, which made it into clang-14, but not in -// Apple's clang-14 that ships with Xcode 14. +// Apple's clang-14 that ships with Xcode 14.2. It was first fixed in Xcode 14.3 +// where the clang version is 14.0.3. #if defined(__clang__) #if ( \ defined(__apple_build_version__) || \ (defined(__has_builtin) && __has_builtin(__builtin_pika_xxhash64))) && \ - (__clang_major__ < 15) + (__clang_major__ < 15 && __clang_minor__ == 0 && __clang_patchlevel__ < 3) #define FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK 1 #elif (__clang_major__ < 14) #define FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK 1 From 64833b5185893cbc71ea80c9b01443f762b5cba4 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Fri, 17 Mar 2023 12:21:44 -0700 Subject: [PATCH 11/34] Add support for building FBGEMM_GPU against Python 3.11 in OSS (#1646) Summary: - Parallelize the FBGEMM CI builds to build and test static and shared libraries independently instead of in serial - Move the FBGEMM CI builds to run inside Docker containers - Add support for building FBGEMM_GPU against Python 3.11 in OSS - Move all FBGEMM_GPU nightly and release build jobs to run inside `amazonlinux:2023` Docker container - Assuming no build errors or resource starvation, the full OSS build process now runs under 30 minutes. Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1646 Reviewed By: shintaro-iwasaki Differential Revision: D44157228 Pulled By: q10 fbshipit-source-id: 6403ea9955856157785c50837b0b8e4c0cd26d53 --- .github/scripts/setup_env.bash | 100 +++++-- .github/workflows/fbgemm_ci.yml | 244 ++++++++---------- .github/workflows/fbgemm_gpu_ci.yml | 32 +-- .github/workflows/fbgemm_nightly_build.yml | 27 +- .../workflows/fbgemm_nightly_build_cpu.yml | 24 +- .github/workflows/fbgemm_release_build.yml | 27 +- .../workflows/fbgemm_release_build_cpu.yml | 24 +- fbgemm_gpu/docs/BuildInstructions.md | 10 +- .../split_table_batched_embeddings_ops.py | 4 +- 9 files changed, 257 insertions(+), 235 deletions(-) diff --git a/.github/scripts/setup_env.bash b/.github/scripts/setup_env.bash index ccdac79097..a22a09b19e 100755 --- a/.github/scripts/setup_env.bash +++ b/.github/scripts/setup_env.bash @@ -264,22 +264,13 @@ print_gpu_info () { if which nvidia-smi; then # If nvidia-smi is installed on a machine without GPUs, this will return error (print_exec nvidia-smi) || true + else + echo "[CHECK] nvidia-smi not found" fi fi } -print_system_info () { - echo "################################################################################" - echo "# Print System Info" - echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" - echo "################################################################################" - echo "" - - echo "################################################################################" - echo "[INFO] Printing environment variables ..." 
- print_exec printenv - +__print_system_info_linux () { echo "################################################################################" echo "[INFO] Check ldd version ..." print_exec ldd --version @@ -296,6 +287,36 @@ print_system_info () { print_exec cat /etc/os-release } +__print_system_info_macos () { + echo "################################################################################" + echo "[INFO] Check CPU info ..." + sysctl -a | grep machdep.cpu + + echo "################################################################################" + echo "[INFO] Check MacOS version info ..." + print_exec uname -a + print_exec sw_vers +} + +print_system_info () { + echo "################################################################################" + echo "# Print System Info" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + + echo "################################################################################" + echo "[INFO] Printing environment variables ..." + print_exec printenv + + if [[ $OSTYPE == 'darwin'* ]]; then + __print_system_info_macos + else + __print_system_info_linux + fi +} + print_ec2_info () { echo "################################################################################" echo "# Print EC2 Instance Info" @@ -316,6 +337,30 @@ print_ec2_info () { echo "instance-type: $(get_ec2_metadata instance-type)" } +print_glibc_info () { + local library_path="$1" + if [ "$library_path" == "" ]; then + echo "Usage: ${FUNCNAME[0]} LIBRARY_PATH" + echo "Example(s):" + echo " ${FUNCNAME[0]} /usr/lib/x86_64-linux-gnu/libstdc++.so.6" + return 1 + fi + + if [ -f "${library_path}" ]; then + echo "[CHECK] Listing out the GLIBC versions referenced by: ${library_path}" + objdump -TC "${library_path}" | grep GLIBC_ | sed 's/.*GLIBC_\([.0-9]*\).*/GLIBC_\1/g' | sort -Vu | cat + echo "" + + echo "[CHECK] Listing out the GLIBCXX versions referenced by: ${library_path}" + objdump -TC "${library_path}" | grep GLIBCXX_ | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat + echo "" + + else + echo "[CHECK] No file at path: ${library_path}" + return 1 + fi +} + ################################################################################ # Miniconda Setup Functions @@ -342,7 +387,7 @@ setup_miniconda () { print_exec mkdir -p "$miniconda_prefix" echo "[SETUP] Downloading the Miniconda installer ..." - print_exec wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh + (exec_with_retries wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh) || return 1 echo "[SETUP] Installing Miniconda ..." print_exec bash miniconda.sh -b -p "$miniconda_prefix" -u @@ -360,9 +405,16 @@ setup_miniconda () { print_exec conda info # These variables will be exported outside + echo "[SETUP] Exporting Miniconda variables ..." export PATH="${miniconda_prefix}/bin:${PATH}" export CONDA="${miniconda_prefix}" + if [ -f "${GITHUB_PATH}" ]; then + echo "[SETUP] Saving Miniconda variables to ${GITHUB_PATH} ..." 
+ echo "${miniconda_prefix}/bin" >> "${GITHUB_PATH}" + echo "CONDA=${miniconda_prefix}" >> "${GITHUB_PATH}" + fi + echo "[SETUP] Successfully set up Miniconda at ${miniconda_prefix}" } @@ -448,9 +500,11 @@ install_pytorch_conda () { fi # Install PyTorch packages + # NOTE: Installation of large package might fail due to corrupt package download + # Use --force-reinstall to address this on retries - https://datascience.stackexchange.com/questions/41732/conda-verification-failed echo "[INSTALL] Attempting to install '${pytorch_package}' (${pytorch_version}, CPU=${pytorch_cpu:-0}) through Conda using channel '${pytorch_channel}' ..." # shellcheck disable=SC2086 - (exec_with_retries conda install -n "${env_name}" -y ${pytorch_package} -c "${pytorch_channel}") || return 1 + (exec_with_retries conda install --force-reinstall -n "${env_name}" -y ${pytorch_package} -c "${pytorch_channel}") || return 1 # Run check for GPU variant if [ "$pytorch_cpu" == "" ]; then @@ -612,7 +666,7 @@ install_cuda () { # Install CUDA packages echo "[INSTALL] Installing CUDA ${cuda_version} ..." - (exec_with_retries conda install -n "${env_name}" -y cuda -c "nvidia/label/cuda-${cuda_version}") || return 1 + (exec_with_retries conda install --force-reinstall -n "${env_name}" -y cuda -c "nvidia/label/cuda-${cuda_version}") || return 1 # Ensure that nvcc is properly installed (test_binpath "${env_name}" nvcc) || return 1 @@ -806,15 +860,19 @@ install_cxx_compiler () { install_system_packages gcc gcc-c++ else - # Install gxx_linux-64 from main instead of cxx-compiler from conda-forge, as - # the latter breaks builds: + # Install gxx_linux-64 from conda-forge instead of from anaconda channel. + # sysroot_linux-64 needs to be installed alongside this: + # # https://root-forum.cern.ch/t/error-timespec-get-has-not-been-declared-with-conda-root-package/45712/6 + # https://github.com/conda-forge/conda-forge.github.io/issues/1625 + # https://conda-forge.org/docs/maintainer/knowledge_base.html#using-centos-7 + # https://github.com/conda/conda-build/issues/4371 # - # NOTE: Install g++ 9.x instead of 11.x becaue 11.x builds libraries with - # references to GLIBCXX_3.4.29, which is not available on systems with older + # NOTE: We install g++ 10.x instead of 11.x becaue 11.x builds binaries that + # reference GLIBCXX_3.4.29, which may not be available on systems with older # versions of libstdc++.so.6 such as CentOS Stream 8 and Ubuntu 20.04 echo "[INSTALL] Installing C/C++ compilers through Conda ..." - (exec_with_retries conda install -n "${env_name}" -y gxx_linux-64=9.3.0) || return 1 + (exec_with_retries conda install -n "${env_name}" -y gxx_linux-64=10.4.0 sysroot_linux-64=2.17 -c conda-forge) || return 1 # The compilers are visible in the PATH as `x86_64-conda-linux-gnu-cc` and # `x86_64-conda-linux-gnu-c++`, so symlinks will need to be created @@ -1055,7 +1113,7 @@ check_fbgemm_gpu_build () { for library in "${fbgemm_gpu_so_files[@]}"; do echo "[CHECK] Listing out the GLIBCXX versions referenced by the library: ${library}" - objdump -TC "${library}" | grep GLIBCXX | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat + print_glibc_info "${library}" echo "[CHECK] Verifying sample subset of symbols in the library ..." 
for symbol in "${lib_symbols_to_check[@]}"; do diff --git a/.github/workflows/fbgemm_ci.yml b/.github/workflows/fbgemm_ci.yml index 977b443a2b..9b18dfb884 100644 --- a/.github/workflows/fbgemm_ci.yml +++ b/.github/workflows/fbgemm_ci.yml @@ -19,185 +19,165 @@ concurrency: cancel-in-progress: true jobs: - build-posix: - runs-on: ${{ matrix.os }} + build-linux: + runs-on: linux.12xlarge + container: + image: ${{ matrix.container-image }} + options: --user root + defaults: + run: + shell: bash + env: + PRELUDE: .github/scripts/setup_env.bash + BUILD_DIR: build_${{ matrix.library-type }} + DEBIAN_FRONTEND: noninteractive strategy: + fail-fast: false matrix: - os: [ ubuntu-latest, macos-latest ] + container-image: [ "ubuntu:20.04" ] + library-type: [ static, shared ] steps: - - uses: actions/checkout@v3 - - name: Checkout submodules - shell: bash + - name: Setup Build Container run: | - auth_header="$(git config --local --get http.https://github.com/.extraheader)" - git submodule sync --recursive - git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive --depth=1 + apt update -y + apt install -y binutils build-essential cmake git libblas-dev python3 sudo wget + git config --global --add safe.directory '*' - - name: Get CPU info on Ubuntu - if: contains(runner.os, 'linux') - run: | - cat /proc/cpuinfo + - name: Checkout the Repository + uses: actions/checkout@v3 + with: + submodules: true - - name: Get CPU info on macOS - if: contains(runner.os, 'macOs') - run: | - sysctl -a | grep machdep.cpu + - name: Display System Info + run: . $PRELUDE; print_system_info - - name: Get env vars - run: | - echo GITHUB_WORKFLOW = $GITHUB_WORKFLOW - echo HOME = $HOME - echo GITHUB_ACTION = $GITHUB_ACTION - echo GITHUB_ACTIONS = $GITHUB_ACTIONS - echo GITHUB_REPOSITORY = $GITHUB_REPOSITORY - echo GITHUB_EVENT_NAME = $GITHUB_EVENT_NAME - echo GITHUB_EVENT_PATH = $GITHUB_EVENT_PATH - echo GITHUB_WORKSPACE = $GITHUB_WORKSPACE - echo GITHUB_SHA = $GITHUB_SHA - echo GITHUB_REF = $GITHUB_REF - c++ --verbose - - - name: Build static FBGEMM lib + - name: Build FBGEMM Library (${{ matrix.library-type }}) run: | set -e - mkdir build_static - cd build_static - cmake -DUSE_SANITIZER=address -DFBGEMM_LIBRARY_TYPE=static .. - make + mkdir $BUILD_DIR; cd $BUILD_DIR + cmake -DUSE_SANITIZER=address -DFBGEMM_LIBRARY_TYPE=${{ matrix.library-type }} -DPYTHON_EXECUTABLE=/usr/bin/python3 .. + make -j - - name: Test static FBGEMM lib - if: contains(runner.os, 'linux') # not run on macos-latest now due to supporting AVX2 + - name: Test FBGEMM Library (${{ matrix.library-type }}) run: | set -e - cd build_static + cd $BUILD_DIR ctest --rerun-failed --output-on-failure - - name: Build shared FBGEMM lib + + build-macos: + runs-on: ${{ matrix.os }} + defaults: + run: + shell: bash + env: + PRELUDE: .github/scripts/setup_env.bash + BUILD_DIR: build_${{ matrix.library-type }} + strategy: + fail-fast: false + matrix: + os: [ macos-latest ] + library-type: [ static, shared ] + + steps: + - name: Checkout the Repository + uses: actions/checkout@v3 + with: + submodules: true + + - name: Display System Info + run: . $PRELUDE; print_system_info + + # Build but skip tests due to lack of support for AVX2 + - name: Build FBGEMM Library (${{ matrix.library-type }}) run: | set -e - mkdir build_shared - cd build_shared - cmake -DUSE_SANITIZER=address -DFBGEMM_LIBRARY_TYPE=shared .. - make + mkdir $BUILD_DIR; cd $BUILD_DIR + cmake -DUSE_SANITIZER=address -DFBGEMM_LIBRARY_TYPE=${{ matrix.library-type }} .. 
+ make -j + + + build-bazel: + runs-on: ${{ matrix.os }} + defaults: + run: + shell: bash + env: + PRELUDE: .github/scripts/setup_env.bash + strategy: + fail-fast: false + matrix: + os: [ ubuntu-latest ] - - name: Test shared FBGEMM lib - if: contains(runner.os, 'linux') # not run on macos-latest now due to supporting AVX2 + steps: + - name: Checkout the Repository + uses: actions/checkout@v3 + with: + submodules: true + + - name: Display System Info + run: . $PRELUDE; print_system_info + + - name: Download bazel run: | set -e - cd build_shared - ctest --rerun-failed --output-on-failure + wget https://github.com/bazelbuild/bazel/releases/download/2.2.0/bazel-2.2.0-linux-x86_64 -O bazel + # verify content + echo 'b2f002ea0e6194a181af6ac84cd94bd8dc797722eb2354690bebac92dda233ff bazel' | sha256sum --quiet -c + chmod +x bazel + + - name: Build FBGEMM with bazel + run: ./bazel build --verbose_explanations --verbose_failures --compilation_mode opt :* + + - name: Test FBGEMM bazel build + run: ./bazel test --test_output=all --verbose_explanations --verbose_failures --compilation_mode opt :* + build-windows: runs-on: ${{ matrix.os }} + defaults: + run: + shell: cmd + env: + BUILD_DIR: build_${{ matrix.library-type }} strategy: + fail-fast: false matrix: - os: [windows-2019] + os: [ windows-2019 ] + library-type: [ static, shared ] steps: - - uses: actions/checkout@v3 - - name: Checkout submodules - shell: bash - run: | - auth_header="$(git config --local --get http.https://github.com/.extraheader)" - git submodule sync --recursive - git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive --depth=1 + - name: Checkout the Repository + uses: actions/checkout@v3 + with: + submodules: true - name: Get CPU info on Windows shell: cmd run: | wmic cpu list full - - name: Build static FBGEMM lib + - name: Build FBGEMM Library (${{ matrix.library-type }}) shell: cmd run: | call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 echo "INSTALL NINJA:" pip install ninja which ninja - mkdir build_static - cd build_static + mkdir %BUILD_DIR% + cd %BUILD_DIR% echo "STARTING CMAKE" - cmake -G Ninja -DFBGEMM_BUILD_BENCHMARKS=OFF -DFBGEMM_LIBRARY_TYPE=static -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_CXX_COMPILER="cl.exe" .. + cmake -G Ninja -DFBGEMM_BUILD_BENCHMARKS=OFF -DFBGEMM_LIBRARY_TYPE=${{ matrix.library-type }} -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_CXX_COMPILER="cl.exe" .. ninja all echo "Build Success" - - name: Test static FBGEMM lib - shell: cmd - run: | - echo %cd% - cd build_static - ctest --rerun-failed --output-on-failure - if errorlevel 1 exit /b 1 - - - name: Build shared FBGEMM lib - shell: cmd - run: | - call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 - echo "INSTALL NINJA:" - pip install ninja - which ninja - mkdir build_shared - cd build_shared - echo "STARTING CMAKE" - cmake -G Ninja -DFBGEMM_BUILD_BENCHMARKS=OFF -DFBGEMM_LIBRARY_TYPE=shared -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_CXX_COMPILER="cl.exe" .. 
- ninja all - if errorlevel 1 exit /b 1 - - - name: Test shared FBGEMM lib + - name: Test FBGEMM Library (${{ matrix.library-type }}) shell: cmd run: | echo %cd% - cd build_shared + cd %BUILD_DIR% set PATH=%PATH%;%cd%;%cd%\asmjit echo %PATH% ctest --rerun-failed --output-on-failure if errorlevel 1 exit /b 1 - - build-bazel: - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ ubuntu-latest ] - - steps: - - uses: actions/checkout@v3 - - name: Checkout submodules - shell: bash - run: | - auth_header="$(git config --local --get http.https://github.com/.extraheader)" - git submodule sync --recursive - git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive --depth=1 - - - name: Get env vars - run: | - echo GITHUB_WORKFLOW = $GITHUB_WORKFLOW - echo HOME = $HOME - echo GITHUB_ACTION = $GITHUB_ACTION - echo GITHUB_ACTIONS = $GITHUB_ACTIONS - echo GITHUB_REPOSITORY = $GITHUB_REPOSITORY - echo GITHUB_EVENT_NAME = $GITHUB_EVENT_NAME - echo GITHUB_EVENT_PATH = $GITHUB_EVENT_PATH - echo GITHUB_WORKSPACE = $GITHUB_WORKSPACE - echo GITHUB_SHA = $GITHUB_SHA - echo GITHUB_REF = $GITHUB_REF - c++ --verbose - - - name: Download bazel - run: | - set -e - wget https://github.com/bazelbuild/bazel/releases/download/2.2.0/bazel-2.2.0-linux-x86_64 -O bazel - # verify content - echo 'b2f002ea0e6194a181af6ac84cd94bd8dc797722eb2354690bebac92dda233ff bazel' | sha256sum --quiet -c - chmod +x bazel - - - - name: Build FBGEMM with bazel - run: | - set -e - ./bazel build --verbose_explanations --verbose_failures --compilation_mode opt :* - - - name: Test FBGEMM bazel build - run: | - set -e - ./bazel test --test_output=all --verbose_explanations --verbose_failures --compilation_mode opt :* diff --git a/.github/workflows/fbgemm_gpu_ci.yml b/.github/workflows/fbgemm_gpu_ci.yml index bd62f23761..adf8443eae 100644 --- a/.github/workflows/fbgemm_gpu_ci.yml +++ b/.github/workflows/fbgemm_gpu_ci.yml @@ -20,7 +20,7 @@ concurrency: jobs: build_and_test_amd: - runs-on: ${{ matrix.os }} + runs-on: linux.12xlarge container: image: ${{ matrix.container-image }} options: --user root @@ -33,9 +33,8 @@ jobs: strategy: fail-fast: false matrix: - os: [ linux.12xlarge ] container-image: [ "ubuntu:20.04" ] - python-version: [ "3.10" ] + python-version: [ "3.8", "3.9", "3.10" ] rocm-version: [ "5.3" ] steps: @@ -60,10 +59,7 @@ jobs: run: . $PRELUDE; free_disk_space - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -77,7 +73,7 @@ jobs: - name: Install PyTorch-ROCm Nightly run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly rocm ${{ matrix.rocm-version }} - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . 
$PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU-ROCM Nightly @@ -146,7 +142,10 @@ jobs: build_and_test_cpu: - runs-on: ${{ matrix.os }} + runs-on: linux.12xlarge + container: + image: ${{ matrix.container-image }} + options: --user root defaults: run: shell: bash @@ -156,10 +155,16 @@ jobs: strategy: fail-fast: false matrix: - os: [ ubuntu-20.04, ubuntu-latest ] + container-image: [ "ubuntu:20.04", "ubuntu:22.04" ] python-version: [ "3.8", "3.9", "3.10" ] steps: + - name: Setup Build Container + run: | + apt update -y + apt install -y binutils build-essential git sudo wget + git config --global --add safe.directory '*' + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -172,10 +177,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -186,7 +188,7 @@ jobs: - name: Install PyTorch run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cpu - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build and Install FBGEMM_GPU (CPU version) diff --git a/.github/workflows/fbgemm_nightly_build.yml b/.github/workflows/fbgemm_nightly_build.yml index bc699ef62b..b0ac76900c 100644 --- a/.github/workflows/fbgemm_nightly_build.yml +++ b/.github/workflows/fbgemm_nightly_build.yml @@ -38,7 +38,10 @@ concurrency: jobs: # Build on CPU hosts and upload to GHA build_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.24xlarge + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash @@ -49,11 +52,13 @@ jobs: # Don't fast-fail all the other builds if one of the them fails fail-fast: false matrix: - os: [ linux.12xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] cuda-version: [ "11.7.1", "11.8.0" ] steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo tar wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -66,10 +71,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -89,7 +91,7 @@ jobs: - name: Install cuDNN run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }} - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU Nightly @@ -116,7 +118,7 @@ jobs: fail-fast: false matrix: os: [ linux.g5.4xlarge.nvidia.gpu ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] cuda-version: [ "11.7.1", "11.8.0" ] # Specify exactly ONE CUDA version for artifact publish cuda-version-publish: [ "11.7.1" ] @@ -135,10 +137,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . 
$PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -149,7 +148,7 @@ jobs: - name: Install PyTorch Nightly run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Download Wheel Artifact from GHA diff --git a/.github/workflows/fbgemm_nightly_build_cpu.yml b/.github/workflows/fbgemm_nightly_build_cpu.yml index 1125b17a0d..d99c3f73ee 100644 --- a/.github/workflows/fbgemm_nightly_build_cpu.yml +++ b/.github/workflows/fbgemm_nightly_build_cpu.yml @@ -39,7 +39,7 @@ concurrency: jobs: # Build on CPU hosts, run tests, and upload to GHA build_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.4xlarge container: image: amazonlinux:2023 options: --user root @@ -53,8 +53,7 @@ jobs: # Don't fast-fail all the other builds if one of the them fails fail-fast: false matrix: - os: [ linux.4xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] steps: - name: Setup Build Container @@ -72,10 +71,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -89,7 +85,7 @@ jobs: - name: Install PyTorch-CPU Nightly run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly cpuonly - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU Nightly (CPU version) @@ -104,7 +100,7 @@ jobs: # Download the built artifact from GHA, test on GPU, and push to PyPI test_and_publish_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.4xlarge container: image: amazonlinux:2023 options: --user root @@ -117,8 +113,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ linux.4xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] needs: build_artifact steps: @@ -137,10 +132,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -148,7 +140,7 @@ jobs: - name: Install PyTorch Nightly run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly cpuonly - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . 
$PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Download Wheel Artifact from GHA diff --git a/.github/workflows/fbgemm_release_build.yml b/.github/workflows/fbgemm_release_build.yml index def6002a76..75d5235b69 100644 --- a/.github/workflows/fbgemm_release_build.yml +++ b/.github/workflows/fbgemm_release_build.yml @@ -30,7 +30,10 @@ concurrency: jobs: # Build on CPU hosts and upload to GHA build_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.24xlarge + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash @@ -41,11 +44,13 @@ jobs: # Don't fast-fail all the other builds if one of the them fails fail-fast: false matrix: - os: [ linux.12xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] cuda-version: [ "11.7.1", "11.8.0" ] steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo tar wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -58,10 +63,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -81,7 +83,7 @@ jobs: - name: Install cuDNN run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }} - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU @@ -108,7 +110,7 @@ jobs: fail-fast: false matrix: os: [ linux.g5.4xlarge.nvidia.gpu ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] cuda-version: [ "11.7.1", "11.8.0" ] # Specify exactly ONE CUDA version for artifact publish cuda-version-publish: [ "11.7.1" ] @@ -126,10 +128,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -140,7 +139,7 @@ jobs: - name: Install PyTorch Test run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Download Wheel Artifact from GHA diff --git a/.github/workflows/fbgemm_release_build_cpu.yml b/.github/workflows/fbgemm_release_build_cpu.yml index c7fb53cabd..f13ebd32c9 100644 --- a/.github/workflows/fbgemm_release_build_cpu.yml +++ b/.github/workflows/fbgemm_release_build_cpu.yml @@ -30,7 +30,7 @@ concurrency: jobs: # Build on CPU hosts, run tests, and upload to GHA build_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.4xlarge container: image: amazonlinux:2023 options: --user root @@ -44,8 +44,7 @@ jobs: # Don't fast-fail all the other builds if one of the them fails fail-fast: false matrix: - os: [ linux.4xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] steps: - name: Setup Build Container @@ -63,10 +62,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . 
$PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -80,7 +76,7 @@ jobs: - name: Install PyTorch-CPU Test run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test cpuonly - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU (CPU version) @@ -95,7 +91,7 @@ jobs: # Download the built artifact from GHA, test on GPU, and push to PyPI test_and_publish_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.4xlarge container: image: amazonlinux:2023 options: --user root @@ -108,8 +104,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ linux.4xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] needs: build_artifact steps: @@ -128,10 +123,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -139,7 +131,7 @@ jobs: - name: Install PyTorch Test run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test cpuonly - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Download Wheel Artifact from GHA diff --git a/fbgemm_gpu/docs/BuildInstructions.md b/fbgemm_gpu/docs/BuildInstructions.md index a90a059b40..56aa780fe3 100644 --- a/fbgemm_gpu/docs/BuildInstructions.md +++ b/fbgemm_gpu/docs/BuildInstructions.md @@ -22,7 +22,7 @@ environment is recommended for reproducible builds: # Set the Miniconda prefix directory miniconda_prefix=$HOME/miniconda -# Download the Miniconfs installer +# Download the Miniconda installer wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh # Run the installer @@ -59,7 +59,7 @@ conda run -n "${env_name}" python -m pip install pyOpenSSL>22.1.0 ### C/C++ Compiler -Install the GCC toolchain. Note that GCC (as opposed to LLVM for example) is +Install the GCC toolchain. Note that GCC (as opposed to Clang for example) is required for GPU (CUDA) builds because NVIDIA's `nvcc` relies on `gcc` and `g++` in the path. @@ -71,7 +71,7 @@ Note that while newer versions of GCC can be used, binaries compiled under newer versions of GCC will not be compatible with older systems such as Ubuntu 20.04 or CentOS Stream 8, because the compiled library will reference symbols from versions of `GLIBCXX` that the system's `libstdc++.so.6` will not support. To -see what versions of GLIBCXX that the available `libstdc++.so.6` supports: +see what versions of GLIBCXX the available `libstdc++.so.6` supports: ```sh libcxx_path=/path/to/libstdc++.so.6 @@ -193,7 +193,7 @@ From there, the rest of the build environment may be constructed through Conda. ### Install ROCm -Install the full ROCm package through the operating system package manger. The +Install the full ROCm package through the operating system package manager. 
The full instructions can be found in the [ROCm installation guide](https://docs.amd.com/bundle/ROCm-Installation-Guide-v5.4.3/page/How_to_Install_ROCm.html): @@ -346,7 +346,7 @@ package_name=fbgemm_gpu # Build for SM70/80 (V100/A100 GPU); update as needed # If not specified, only the CUDA architecture supported by current system will be targeted -# Ifo CUDA device is present either, all CUDA architectures will be targeted +# If no CUDA device is present either, all CUDA architectures will be targeted cuda_arch_list=7.0;8.0 # Build the wheel artifact only diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py index 87b9b1a559..8120cdcb03 100644 --- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py @@ -9,7 +9,7 @@ import enum import logging -from dataclasses import dataclass +from dataclasses import dataclass, field from itertools import accumulate from math import log2 from typing import Dict, List, NamedTuple, Optional, Tuple, Type, Union @@ -106,7 +106,7 @@ class CounterBasedRegularizationDefinition: adjustment_ub: float = 1.0 learning_rate_mode: LearningRateMode = LearningRateMode.EQUAL grad_sum_decay: GradSumDecay = GradSumDecay.NO_DECAY - tail_id_threshold: TailIdThreshold = TailIdThreshold(val=0, is_ratio=False) + tail_id_threshold: TailIdThreshold = field(default_factory=TailIdThreshold) max_counter_update_freq: int = 1000 From 54eeae214af0834cc07632eb916879d71468a4cd Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Fri, 17 Mar 2023 19:37:29 -0700 Subject: [PATCH 12/34] Remove magic numbers from fbgemm/Types.h (#1629) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1629 Replaces magic numbers with constexpr variables Reviewed By: sryap Differential Revision: D43776442 fbshipit-source-id: 5cef7566816f8730f5daa08948ee3260367787aa --- include/fbgemm/Types.h | 189 +++++++++++++++++++++++++---------------- 1 file changed, 114 insertions(+), 75 deletions(-) diff --git a/include/fbgemm/Types.h b/include/fbgemm/Types.h index be8ac4ec8b..e5daa28d8b 100644 --- a/include/fbgemm/Types.h +++ b/include/fbgemm/Types.h @@ -15,145 +15,184 @@ namespace fbgemm { using float16 = std::uint16_t; using bfloat16 = std::uint16_t; +// The IEEE754 standard species a binary16 as having the following format: +// SEEEEEMMMMMMMMMM +// 0432109876543210 +// That is: +// * 1 sign bit +// * 5 exponent bits +// * 10 mantissa/significand bits (an 11th bit is implicit) +constexpr uint32_t f16_num_bits = 16; +constexpr uint32_t f16_num_exponent_bits = 5; +constexpr uint32_t f16_num_mantissa_bits = 10; +constexpr uint32_t f16_num_non_sign_bits = + f16_num_exponent_bits + f16_num_mantissa_bits; +constexpr uint32_t f16_exponent_mask = 0x1F; // 5 bits +constexpr uint32_t f16_sign_bit = 1u + << (f16_num_exponent_bits + f16_num_mantissa_bits); +constexpr uint32_t f16_exponent_bits = f16_exponent_mask + << f16_num_mantissa_bits; +constexpr uint32_t f16_mantissa_mask = 0x3FF; // 10 bits +constexpr uint32_t f16_exponent_bias = 15; +constexpr uint32_t f16_nan = 0x7FFF; + +// The IEEE754 standard specifies a binary32 as having: +// SEEEEEEEEMMMMMMMMMMMMMMMMMMMMMMM +// That is: +// * 1 sign bit +// * 8 exponent bits +// * 23 mantissa/significand bits (a 24th bit is implicit) +constexpr uint32_t f32_num_exponent_bits = 8; +constexpr uint32_t f32_num_mantissa_bits = 23; +constexpr uint32_t f32_exponent_mask = 0xFF; // 8 bits +constexpr uint32_t 
f32_mantissa_mask = 0x7FFFFF; // 23 bits
+constexpr uint32_t f32_exponent_bias = 127;
+constexpr uint32_t f32_all_non_sign_mask = 0x7FFFFFFF; // 31 bits
+constexpr uint32_t f32_most_significant_bit = 1u << 22; // Turn on 23rd bit
+constexpr uint32_t f32_num_non_sign_bits =
+    f32_num_exponent_bits + f32_num_mantissa_bits;
+
 // Round to nearest even
 static inline float16 cpu_float2half_rn(float f) {
-  float16 ret;
-
   static_assert(
-      sizeof(unsigned int) == sizeof(float),
-      "Programming error sizeof(unsigned int) != sizeof(float)");
+      sizeof(uint32_t) == sizeof(float),
+      "Programming error sizeof(uint32_t) != sizeof(float)");
 
-  unsigned* xp = reinterpret_cast<unsigned*>(&f);
-  unsigned x = *xp;
-  unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
-  unsigned sign, exponent, mantissa;
+  uint32_t* xp = reinterpret_cast<uint32_t*>(&f);
+  uint32_t x = *xp;
+  uint32_t u = (x & f32_all_non_sign_mask);
 
   // Get rid of +NaN/-NaN case first.
   if (u > 0x7f800000) {
-    ret = 0x7fffU;
-    return ret;
+    return static_cast<float16>(f16_nan);
   }
 
-  sign = ((x >> 16) & 0x8000);
+  uint32_t sign = ((x >> f16_num_bits) & f16_sign_bit);
 
   // Get rid of +Inf/-Inf, +0/-0.
   if (u > 0x477fefff) {
-    ret = static_cast<float16>(sign | 0x7c00U);
-    return ret;
+    return static_cast<float16>(sign | f16_exponent_bits);
   }
   if (u < 0x33000001) {
-    ret = static_cast<float16>(sign | 0x0000);
-    return ret;
+    return static_cast<float16>(sign | 0x0000);
   }
 
-  exponent = ((u >> 23) & 0xff);
-  mantissa = (u & 0x7fffff);
+  uint32_t exponent = ((u >> f32_num_mantissa_bits) & f32_exponent_mask);
+  uint32_t mantissa = (u & f32_mantissa_mask);
 
-  if (exponent > 0x70) {
-    shift = 13;
-    exponent -= 0x70;
+  uint32_t shift;
+  if (exponent > f32_exponent_bias - f16_exponent_bias) {
+    shift = f32_num_mantissa_bits - f16_num_mantissa_bits;
+    exponent -= f32_exponent_bias - f16_exponent_bias;
   } else {
-    shift = 0x7e - exponent;
+    shift = (f32_exponent_bias - 1) - exponent;
     exponent = 0;
-    mantissa |= 0x800000;
+    mantissa |=
+        (1u
+         << f32_num_mantissa_bits); // Bump the least significant exponent bit
   }
-  lsb = (1 << shift);
-  lsb_s1 = (lsb >> 1);
-  lsb_m1 = (lsb - 1);
+  const uint32_t lsb = (1u << shift);
+  const uint32_t lsb_s1 = (lsb >> 1);
+  const uint32_t lsb_m1 = (lsb - 1);
 
   // Round to nearest even.
-  remainder = (mantissa & lsb_m1);
+  const uint32_t remainder = (mantissa & lsb_m1);
   mantissa >>= shift;
   if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
     ++mantissa;
-    if (!(mantissa & 0x3ff)) {
+    if (!(mantissa & f16_mantissa_mask)) {
       ++exponent;
       mantissa = 0;
     }
   }
 
-  ret = static_cast<float16>(sign | (exponent << 10) | mantissa);
-
-  return ret;
+  return static_cast<float16>(
+      sign | (exponent << f16_num_mantissa_bits) | mantissa);
 }
 
 // Round to zero
 static inline float16 cpu_float2half_rz(float f) {
-  float16 ret;
-
   static_assert(
-      sizeof(unsigned int) == sizeof(float),
-      "Programming error sizeof(unsigned int) != sizeof(float)");
+      sizeof(uint32_t) == sizeof(float),
+      "Programming error sizeof(uint32_t) != sizeof(float)");
 
-  unsigned* xp = reinterpret_cast<unsigned*>(&f);
-  unsigned x = *xp;
-  unsigned u = (x & 0x7fffffff);
-  unsigned shift, sign, exponent, mantissa;
+  const uint32_t* xp = reinterpret_cast<const uint32_t*>(&f);
+  const uint32_t x = *xp;
+  const uint32_t u = (x & f32_all_non_sign_mask);
 
   // Get rid of +NaN/-NaN case first.
   if (u > 0x7f800000) {
-    ret = static_cast<float16>(0x7fffU);
-    return ret;
+    return static_cast<float16>(f16_nan);
   }
 
-  sign = ((x >> 16) & 0x8000);
+  uint32_t sign = ((x >> f16_num_bits) & f16_sign_bit);
 
   // Get rid of +Inf/-Inf, +0/-0.
   if (u > 0x477fefff) {
-    ret = static_cast<float16>(sign | 0x7c00U);
-    return ret;
+    return static_cast<float16>(sign | f16_exponent_bits);
   }
   if (u < 0x33000001) {
-    ret = static_cast<float16>(sign | 0x0000);
-    return ret;
+    return static_cast<float16>(sign | 0x0000);
   }
 
-  exponent = ((u >> 23) & 0xff);
-  mantissa = (u & 0x7fffff);
+  uint32_t exponent = ((u >> f32_num_mantissa_bits) & f32_exponent_mask);
+  uint32_t mantissa = (u & f32_mantissa_mask);
 
-  if (exponent > 0x70) {
-    shift = 13;
-    exponent -= 0x70;
+  uint32_t shift;
+  if (exponent > f32_exponent_bias - f16_exponent_bias) {
+    shift = f32_num_mantissa_bits - f16_num_mantissa_bits;
+    exponent -= f32_exponent_bias - f16_exponent_bias;
   } else {
-    shift = 0x7e - exponent;
+    shift = (f32_exponent_bias - 1) - exponent;
     exponent = 0;
-    mantissa |= 0x800000;
+    mantissa |=
+        (1u
+         << f32_num_mantissa_bits); // Bump the least significant exponent bit
   }
 
   // Round to zero.
   mantissa >>= shift;
 
-  ret = static_cast<float16>(sign | (exponent << 10) | mantissa);
-
-  return ret;
+  return static_cast<float16>(
+      sign | (exponent << f16_num_mantissa_bits) | mantissa);
 }
 
-static inline float cpu_half2float(float16 h) {
-  unsigned sign = ((h >> 15) & 1);
-  unsigned exponent = ((h >> 10) & 0x1f);
-  unsigned mantissa = ((h & 0x3ff) << 13);
-
-  if (exponent == 0x1f) { /* NaN or Inf */
-    mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0);
-    exponent = 0xff;
-  } else if (!exponent) { /* Denorm or Zero */
+// Converts a 16-bit unsigned integer representation of a IEEE754 half-precision
+// float into an IEEE754 32-bit single-precision float
+static inline float cpu_half2float(const float16 h) {
+  // Get sign and exponent alone by themselves
+  uint32_t sign_bit = (h >> f16_num_non_sign_bits) & 1;
+  uint32_t exponent = (h >> f16_num_mantissa_bits) & f16_exponent_mask;
+  // Shift mantissa so that it fills the most significant bits of a float32
+  uint32_t mantissa = (h & f16_mantissa_mask)
+      << (f32_num_mantissa_bits - f16_num_mantissa_bits);
+
+  if (exponent == f16_exponent_mask) { // NaN or Inf
     if (mantissa) {
-      unsigned int msb;
-      exponent = 0x71;
+      mantissa = f32_mantissa_mask;
+      sign_bit = 0;
+    }
+    exponent = f32_exponent_mask;
+  } else if (!exponent) { // Denorm or Zero
+    if (mantissa) {
+      uint32_t msb;
+      exponent = f32_exponent_bias - f16_exponent_bias + 1;
       do {
-        msb = (mantissa & 0x400000);
-        mantissa <<= 1; /* normalize */
+        msb = mantissa & f32_most_significant_bit;
+        mantissa <<= 1; // normalize
         --exponent;
       } while (!msb);
-      mantissa &= 0x7fffff; /* 1.mantissa is implicit */
+      mantissa &= f32_mantissa_mask; // 1.mantissa is implicit
     }
   } else {
-    exponent += 0x70;
+    exponent += f32_exponent_bias - f16_exponent_bias;
   }
 
-  unsigned i = ((sign << 31) | (exponent << 23) | mantissa);
+  const uint32_t i = (sign_bit << f32_num_non_sign_bits) |
+      (exponent << f32_num_mantissa_bits) | mantissa;
+
   float ret;
-  memcpy(&ret, &i, sizeof(i));
+  std::memcpy(&ret, &i, sizeof(float));
   return ret;
 }
@@ -161,14 +200,14 @@
 static inline float cpu_bf162float(bfloat16 src) {
   float ret;
   uint32_t val_fp32 =
       static_cast<uint32_t>(reinterpret_cast<const uint16_t*>(&src)[0]) << 16;
-  memcpy(&ret, &val_fp32, sizeof(ret));
+  memcpy(&ret, &val_fp32, sizeof(float));
   return ret;
 }
 
 static inline bfloat16 cpu_float2bfloat16(float src) {
   uint32_t temp;
-  memcpy(&temp, &src, sizeof(temp));
+  memcpy(&temp, &src, sizeof(uint32_t));
-  return (temp + (1 << 15)) >> 16;
+  return (temp + (1u << 15)) >> 16;
 }
 
 } // namespace fbgemm

From 35bdd402608b552c44087d2a68de7ddb6e488d3a Mon Sep 17 00:00:00 2001
From: Xiao Sun
Date: Sat, 18 Mar 2023 14:44:44 -0700
Subject: [PATCH 13/34] added
check to avoid div 0 errors in cache report (#1645) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1645 as in title Reviewed By: jianyuh Differential Revision: D44096435 fbshipit-source-id: a7a87a14ffecc2fb6e0be74d199d385357946672 --- .../split_table_batched_embeddings_ops.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py index 8120cdcb03..ff8ce4d094 100644 --- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py @@ -979,10 +979,11 @@ def print_uvm_cache_stats(self) -> None: f"N_conflict_unique_misses: {uvm_cache_stats[4]}\n" f"N_conflict_misses: {uvm_cache_stats[5]}\n" ) - logging.info( - f"unique indices / requested indices: {uvm_cache_stats[2]/uvm_cache_stats[1]}\n" - f"unique misses / requested indices: {uvm_cache_stats[3]/uvm_cache_stats[1]}\n" - ) + if uvm_cache_stats[1]: + logging.info( + f"unique indices / requested indices: {uvm_cache_stats[2]/uvm_cache_stats[1]}\n" + f"unique misses / requested indices: {uvm_cache_stats[3]/uvm_cache_stats[1]}\n" + ) def prefetch(self, indices: Tensor, offsets: Tensor) -> None: self.timestep += 1 @@ -2347,10 +2348,11 @@ def print_uvm_cache_stats(self) -> None: f"N_conflict_unique_misses: {uvm_cache_stats[4]}\n" f"N_conflict_misses: {uvm_cache_stats[5]}\n" ) - logging.info( - f"unique indices / requested indices: {uvm_cache_stats[2]/uvm_cache_stats[1]}\n" - f"unique misses / requested indices: {uvm_cache_stats[3]/uvm_cache_stats[1]}\n" - ) + if uvm_cache_stats[1]: + logging.info( + f"unique indices / requested indices: {uvm_cache_stats[2]/uvm_cache_stats[1]}\n" + f"unique misses / requested indices: {uvm_cache_stats[3]/uvm_cache_stats[1]}\n" + ) @torch.jit.export def prefetch(self, indices: Tensor, offsets: Tensor) -> None: From 125ce44718023e0025d0cc154f17f04b072b73f1 Mon Sep 17 00:00:00 2001 From: Rengan Xu Date: Mon, 20 Mar 2023 11:10:46 -0700 Subject: [PATCH 14/34] jagged_dense_bmm operator optimization (#1643) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1643 This diff optimizes the jagged_dense_bmm operator with the following optimizations: * tiling across thread blocks, and use GPU shared memory for thread block * tiling across threads within a thread block, and use registers for each thread Reviewed By: brad-mengchi Differential Revision: D43674845 fbshipit-source-id: 85f0abf89fa958f79636ef59c3070a1c569b73c2 --- .../include/fbgemm_gpu/fbgemm_cuda_utils.cuh | 5 + fbgemm_gpu/src/jagged_tensor_ops.cu | 183 +++++++++++++++--- 2 files changed, 158 insertions(+), 30 deletions(-) diff --git a/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh index 5ce7d4f5d1..c21057ac49 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh +++ b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh @@ -62,6 +62,11 @@ static constexpr int32_t kWarpSize = 32; #endif // Max thread num in one thread block static constexpr int32_t kMaxThreads = 1024; +// Max block size in Y dimension of a grid +static constexpr int32_t kMaxBlockYDim = 65535; +// Max block size in Z dimension of a grid +static constexpr int32_t kMaxBlockZDim = 65535; + static constexpr float kQParamEps = 1e-8f; /* For rowwise int8 quantization, two quantization parameters (qparams) diff --git a/fbgemm_gpu/src/jagged_tensor_ops.cu 
b/fbgemm_gpu/src/jagged_tensor_ops.cu
index 4e93d08a65..860a83ecd6 100644
--- a/fbgemm_gpu/src/jagged_tensor_ops.cu
+++ b/fbgemm_gpu/src/jagged_tensor_ops.cu
@@ -2071,36 +2071,135 @@ Tensor jagged_jagged_bmm_forward(
   return output;
 }
 
-template <typename index_t, typename scalar_t>
+template <
+    const int BLOCK_TILE_M, // tile height of C that each thread block
+                            // calculates
+    const int BLOCK_TILE_N, // tile width of C that each thread block
+                            // calculates
+    const int BLOCK_TILE_K, // tile width of A that each thread block calculates
+    const int THREAD_TILE_M, // tile height of C that each thread
+                             // calculates
+    const int THREAD_TILE_N, // tile width of C that each thread calculates
+    typename index_t,
+    typename scalar_t>
 __global__ __launch_bounds__(kMaxThreads) void jagged_dense_bmm_kernel(
-    const at::PackedTensorAccessor32<scalar_t, 2> x_values,
-    const at::PackedTensorAccessor32<index_t, 1> x_offsets,
-    const at::PackedTensorAccessor32<scalar_t, 3> y,
-    at::PackedTensorAccessor32<scalar_t, 2> output,
+    const at::PackedTensorAccessor32<scalar_t, 2> __restrict__ x_values,
+    const at::PackedTensorAccessor32<index_t, 1> __restrict__ x_offsets,
+    const at::PackedTensorAccessor32<scalar_t, 3> __restrict__ y,
+    at::PackedTensorAccessor32<scalar_t, 2> __restrict__ output,
     const int max_L) {
   const int B = x_offsets.size(0) - 1;
   const int K = x_values.size(1);
   const int N = y.size(2);
 
-  const int b_l_begin = blockIdx.x * blockDim.y + threadIdx.y;
-  const int b_l_step = gridDim.x * blockDim.y;
-  for (int b_l = b_l_begin; b_l < B * max_L; b_l += b_l_step) {
-    const int b = b_l / max_L;
-    const int l = b_l % max_L;
+  const auto block_row = blockIdx.y;
+  const auto block_col = blockIdx.x;
+
+  const int THREADS_X_PER_BLOCK = BLOCK_TILE_N / THREAD_TILE_N;
+  const int THREADS_Y_PER_BLOCK = BLOCK_TILE_M / THREAD_TILE_M;
+  const int THREADS_PER_BLOCK = THREADS_X_PER_BLOCK * THREADS_Y_PER_BLOCK;
+  const auto thread_row = threadIdx.x / THREADS_X_PER_BLOCK;
+  const auto thread_col = threadIdx.x % THREADS_X_PER_BLOCK;
+  const auto NUM_K_BLOCKS = (K + BLOCK_TILE_K - 1) / BLOCK_TILE_K;
+
+  __shared__ scalar_t As[BLOCK_TILE_M][BLOCK_TILE_K];
+  __shared__ scalar_t Bs[BLOCK_TILE_K][BLOCK_TILE_N];
+
+  for (auto b = blockIdx.z; b < B; b += gridDim.z) {
+    const index_t row_start = x_offsets[b];
+    const index_t row_end = x_offsets[b + 1];
+    const auto length = min(row_end - row_start, (index_t)max_L);
+
+    // the indices that this current thread will load into shared mem
+    const auto inner_row_a = threadIdx.x / BLOCK_TILE_K;
+    const auto inner_col_a = threadIdx.x % BLOCK_TILE_K;
+    // the number of rows of As that will be loaded per step by a thread block
+    const auto A_TILE_ROW_STRIDE = THREADS_PER_BLOCK / BLOCK_TILE_K;
+
+    const auto inner_row_b = threadIdx.x / BLOCK_TILE_N;
+    const auto inner_col_b = threadIdx.x % BLOCK_TILE_N;
+    const auto B_TILE_ROW_STRIDE = THREADS_PER_BLOCK / BLOCK_TILE_N;
+
+    // registers for C
+    scalar_t accum[THREAD_TILE_M][THREAD_TILE_N] = {0};
+
+    // registers for As and Bs
+    scalar_t fragment_a[THREAD_TILE_M] = {0};
+    scalar_t fragment_b[THREAD_TILE_N] = {0};
+
+    // loop for block tiles in K dimension
+    for (auto block = 0; block < NUM_K_BLOCKS; block++) {
+// load a block of x_values from global memory to shared memory
+// apply tiling for threads in a block
+#pragma unroll
+      for (auto offset = 0; offset < BLOCK_TILE_M;
+           offset += A_TILE_ROW_STRIDE) {
+        auto x_row_offset = block_row * BLOCK_TILE_M + inner_row_a + offset;
+        auto x_col_offset = block * BLOCK_TILE_K + inner_col_a;
+        if ((x_row_offset < length) && (x_col_offset < K)) {
+          As[inner_row_a + offset][inner_col_a] =
+              x_values[row_start + x_row_offset][x_col_offset];
+        } else {
+          As[inner_row_a + offset][inner_col_a] = 0;
+        }
+      }
 
-    const int row_start = x_offsets[b];
-    const int row_end = x_offsets[b + 1];
-    const int length = min(row_end - row_start, max_L);
-    if (length == 0 || l >= length) {
-      return;
-    } else {
-      // TODO: use shared memory and better reduction
-      for (int n = threadIdx.x; n < N; n += blockDim.x) {
-        at::acc_type<scalar_t, true> acc = 0;
-        for (int k = 0; k < K; ++k) {
-          acc += x_values[row_start + l][k] * y[b][k][n];
+// load a block of y from global memory to shared memory
+// apply tiling for threads in a block
+#pragma unroll
+      for (auto offset = 0; offset < BLOCK_TILE_K;
+           offset += B_TILE_ROW_STRIDE) {
+        auto y_row_offset = block * BLOCK_TILE_K + inner_row_b + offset;
+        auto y_col_offset = block_col * BLOCK_TILE_N + inner_col_b;
+        if ((y_row_offset < K) && (y_col_offset < N)) {
+          Bs[inner_row_b + offset][inner_col_b] =
+              y[b][y_row_offset][y_col_offset];
+        } else {
+          Bs[inner_row_b + offset][inner_col_b] = 0;
+        }
+      }
+
+      __syncthreads();
+
+// calculate the results per thread
+#pragma unroll
+      for (auto k = 0; k < BLOCK_TILE_K; k++) {
+        // load values from shared memory to registers for x_values
+        for (auto row = 0; row < THREAD_TILE_M; row++) {
+          fragment_a[row] = As[thread_row * THREAD_TILE_M + row][k];
+        }
+
+// load values from shared memory to registers for y
+#pragma unroll
+        for (auto col = 0; col < THREAD_TILE_N; col++) {
+          fragment_b[col] = Bs[k][thread_col * THREAD_TILE_N + col];
+        }
+
+// each thread calculates THREAD_TILE_M * THREAD_TILE_N elements
+#pragma unroll
+        for (auto row = 0; row < THREAD_TILE_M; row++) {
+#pragma unroll
+          for (auto col = 0; col < THREAD_TILE_N; col++) {
+            accum[row][col] += fragment_a[row] * fragment_b[col];
+          }
+        }
+      }
+
+      __syncthreads();
+    }
+
+// write the result to the output
+#pragma unroll
+    for (auto row = 0; row < THREAD_TILE_M; row++) {
+#pragma unroll
+      for (auto col = 0; col < THREAD_TILE_N; col++) {
+        auto out_row_offset =
+            block_row * BLOCK_TILE_M + thread_row * THREAD_TILE_M + row;
+        auto out_col_offset =
+            block_col * BLOCK_TILE_N + thread_col * THREAD_TILE_N + col;
+        if ((out_row_offset < length) && (out_col_offset < N)) {
+          output[row_start + out_row_offset][out_col_offset] = accum[row][col];
         }
-        output[row_start + l][n] = acc;
       }
     }
   }
@@ -2124,9 +2223,29 @@ Tensor jagged_dense_bmm_forward(
   const int total_L = x_values.size(0);
   auto output = at::zeros({total_L, N}, x_values.options());
   if (B > 0 && M > 0 && N > 0) {
-    const int block_dim_x =
-        std::min(div_round_up(N, kWarpSize) * kWarpSize, kMaxThreads);
-    const int block_dim_y = kMaxThreads / block_dim_x;
+    // The shared memory size is (BLOCK_TILE_M + BLOCK_TILE_N) * BLOCK_TILE_K
+    // BLOCK_TILE_M needs to be multiple of THREAD_TILE_M, and
+    // BLOCK_TILE_N needs to be multiple of THREAD_TILE_N
+    // The setting of these parameters needs to balance the hardware's shared
+    // memory size limit and occupancy
+    // TODO: autotune these parameters based on max_L and input and output
+    // tensor sizes
+    constexpr int BLOCK_TILE_M = 64;
+    constexpr int BLOCK_TILE_N = 8;
+    constexpr int BLOCK_TILE_K = 8;
+    constexpr int THREAD_TILE_M = 4;
+    constexpr int THREAD_TILE_N = 4;
+
+    const dim3 block(
+        (BLOCK_TILE_M * BLOCK_TILE_N) / (THREAD_TILE_M * THREAD_TILE_N));
+    const auto grid_dim_x = div_round_up(N, BLOCK_TILE_N);
+    const auto grid_dim_y = div_round_up(max_L, BLOCK_TILE_M);
+    TORCH_CHECK(
+        grid_dim_y <= kMaxBlockYDim,
+        "max_L cannot be larger than",
+        grid_dim_y * BLOCK_TILE_M + 1 - BLOCK_TILE_M);
+    const auto grid_dim_z = std::min(B, kMaxBlockZDim);
+    const dim3 grid(grid_dim_x, grid_dim_y, grid_dim_z);
 
     AT_DISPATCH_INDEX_TYPES(
         x_offsets.scalar_type(), "jagged_dense_bmm_kernel_1", [&] {
@@ -2136,11 +2255,15 @@ Tensor jagged_dense_bmm_forward(
               x_values.scalar_type(),
               "jagged_dense_bmm_kernel_2",
               [&] {
-                jagged_dense_bmm_kernel<index_t, scalar_t>
-                    <<<div_round_up(B * max_L, block_dim_y),
-                       dim3(block_dim_x, block_dim_y),
-                       0,
-                       at::cuda::getCurrentCUDAStream()>>>(
+                jagged_dense_bmm_kernel<
+                    BLOCK_TILE_M,
+                    BLOCK_TILE_N,
+                    BLOCK_TILE_K,
+                    THREAD_TILE_M,
+                    THREAD_TILE_N,
+                    index_t,
+                    scalar_t>
+                    <<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
                         x_values.packed_accessor32<scalar_t, 2>(),
                         x_offsets.packed_accessor32<index_t, 1>(),
                         y.packed_accessor32<scalar_t, 3>(),

From f3af571783d80bb23acbb1d3b9584320bb23e4db Mon Sep 17 00:00:00 2001
From: siwasaki
Date: Tue, 21 Mar 2023 20:37:59 -0700
Subject: [PATCH 15/34] jagged_dense_bmm: fix ROCm test failures (#1655)

Summary:
This patch fixes test failures on AMD GPUs.

1. Remove `__restrict__`. I don't think it is needed even for CUDA, but it
confuses HIPCC.
2. Use `uint32_t` instead of `auto`: old ROCm (including ROCm <= 5.3) does not
have a `+=` operator for the type of `blockIdx.z`, causing a compilation
error. We observed that this issue is fixed in ROCm 5.4.3, but let's use
`uint32_t` for now. We should revisit and use `auto` later. See this for
details:
https://github.com/ROCm-Developer-Tools/hipamd/commit/86a1634c642daeda1e984d4124bcc2aeba5c4e19

Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1655

Test Plan: GitHub Actions' AMD CI

Reviewed By: q10, brad-mengchi

Differential Revision: D44242622

Pulled By: shintaro-iwasaki

fbshipit-source-id: c9b88155ebf1ed881b2d03e3be0e8991b4b30174
---
 fbgemm_gpu/src/jagged_tensor_ops.cu | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/fbgemm_gpu/src/jagged_tensor_ops.cu b/fbgemm_gpu/src/jagged_tensor_ops.cu
index 860a83ecd6..0282fa9f19 100644
--- a/fbgemm_gpu/src/jagged_tensor_ops.cu
+++ b/fbgemm_gpu/src/jagged_tensor_ops.cu
@@ -2083,10 +2083,10 @@ template <
     typename index_t,
     typename scalar_t>
 __global__ __launch_bounds__(kMaxThreads) void jagged_dense_bmm_kernel(
-    const at::PackedTensorAccessor32<scalar_t, 2> __restrict__ x_values,
-    const at::PackedTensorAccessor32<index_t, 1> __restrict__ x_offsets,
-    const at::PackedTensorAccessor32<scalar_t, 3> __restrict__ y,
-    at::PackedTensorAccessor32<scalar_t, 2> __restrict__ output,
+    const at::PackedTensorAccessor32<scalar_t, 2> x_values,
+    const at::PackedTensorAccessor32<index_t, 1> x_offsets,
+    const at::PackedTensorAccessor32<scalar_t, 3> y,
+    at::PackedTensorAccessor32<scalar_t, 2> output,
     const int max_L) {
   const int B = x_offsets.size(0) - 1;
   const int K = x_values.size(1);
@@ -2105,7 +2105,9 @@ __global__ __launch_bounds__(kMaxThreads) void jagged_dense_bmm_kernel(
   __shared__ scalar_t As[BLOCK_TILE_M][BLOCK_TILE_K];
   __shared__ scalar_t Bs[BLOCK_TILE_K][BLOCK_TILE_N];
 
-  for (auto b = blockIdx.z; b < B; b += gridDim.z) {
+  // Once we remove ROCm<=5.3 support, we should replace uint32_t with auto.
+  // See #1655
+  for (uint32_t b = blockIdx.z; b < B; b += gridDim.z) {
     const index_t row_start = x_offsets[b];
     const index_t row_end = x_offsets[b + 1];
     const auto length = min(row_end - row_start, (index_t)max_L);

From 22c97d54f0d76385750401cef7d986f9554c0643 Mon Sep 17 00:00:00 2001
From: Jianyu Huang
Date: Wed, 22 Mar 2023 13:11:40 -0700
Subject: [PATCH 16/34] Support embedding dim 1024 ~ 2048 (#1656)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1656

wushirong reported the failure on https://fburl.com/code/hae91ra7.
- The embedding config is from f418615450.
- `max_int8_128b_rows` is 10 --> D = 1280.

Our embedding dim has grown past 1024.
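As a hedged back-of-the-envelope check of the shared-memory footprint (the
buffer shape is quoted from the kernel, see the snippet later in this message;
the `smem_bytes` helper and the concrete parameter values are illustrative,
not the exact failing instantiation):

```
#include <cstddef>

constexpr std::size_t kUint4Bytes = 16; // sizeof(uint4) on CUDA

// Static shared-memory footprint of
//   __shared__ uint4 buffers[WarpsPerBlock][OutputRowsPerThread]
//                           [InputRowsInFlight][NumUint4LoadsPerRow];
constexpr std::size_t smem_bytes(
    std::size_t warps,
    std::size_t out_rows,
    std::size_t in_rows,
    std::size_t loads_per_row) {
  return warps * out_rows * in_rows * loads_per_row * kUint4Bytes;
}

// Wider rows (larger D => larger NumUint4LoadsPerRow) blow the 48 KB static
// limit unless InputRowsInFlight shrinks to compensate:
static_assert(smem_bytes(4, 4, 32, 8) > 48 * 1024, "over budget");
static_assert(smem_bytes(4, 4, 16, 8) <= 48 * 1024, "halved rows-in-flight fits");
```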
Note that the static shared memory can only go up to 48 KB; from https://docs.nvidia.com/cuda/cuda-c-programming-guide/: > Kernels relying on shared memory allocations over 48 KB per block are architecture-specific, as such they must use dynamic shared memory (rather than statically sized arrays) The resulting ptxas shared memory error: ``` [2023-03-21T22:04:33.899-07:00] ptxas error : Entry function '_ZN4nbit60INT8_split_embedding_codegen_forward_weighted_kernel_small_LIiN3c104HalfELm2ELm4ELm4E Lm8ELm16ELb1EEEvN2at27GenericPackedTensorAccessorIhLm1ENS3_17RestrictPtrTraitsElEES6_NS4_IiLm1ES5_iEENS4_IlLm1ES5_iEENS4_IhLm1ES5_iEES7_N10fbgemm_gpu12FixedDiv isorENS4_IT_Lm1ES5_iEESD_llNS4_IfLm1ES5_iEENS4_IT0_Lm2ES5_iEENS4_IhLm2ES5_lEES7_' uses too much shared data (0x10080 bytes, 0xc000 max) ``` Currently we reduce `InputRowsInFlight` to bypass the issue; the static shared memory used in the kernel is ``` typedef uint4 AllBuffers[WarpsPerBlock][OutputRowsPerThread][InputRowsInFlight][NumUint4LoadsPerRow]; __shared__ AllBuffers buffers; ``` Long term, we can change the static shared memory to dynamic shared memory and increase the shared memory size to 64 KB+. Reviewed By: wushirong Differential Revision: D44270081 fbshipit-source-id: 367ae838ea073dfe58d859ea3c0e6c7190beca6a --- ...edding_forward_quantized_split_template.cu | 30 +++++++++++++++---- .../split_table_batched_embeddings_test.py | 4 +-- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu b/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu index 6ac2b2d3c0..4b4345f1cc 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu @@ -737,13 +737,16 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ DISPATCH_OUTPUT_TYPES(output.scalar_type(), "int2_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { if (max_int2_D > 0) { auto max_int2_128b_rows = nbit::div_round_up(nbit::padded_row_size_in_bytes(max_int2_D, SparseType::INT2, row_alignment), 128); - TORCH_CHECK(max_int2_128b_rows <= 2); + TORCH_CHECK(max_int2_128b_rows <= 4); if (max_int2_128b_rows > 0) { Y(2, 16, 0, 1); } if (max_int2_128b_rows > 1) { Y(2, 8, 1, 2); } + if (max_int2_128b_rows > 2) { + Y(2, 8, 2, 4); + } } })); #undef X @@ -783,7 +786,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ DISPATCH_OUTPUT_TYPES(output.scalar_type(), "int4_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { if (max_int4_D > 0) { auto max_int4_128b_rows = nbit::div_round_up(nbit::padded_row_size_in_bytes(max_int4_D, SparseType::INT4, row_alignment), 128); - TORCH_CHECK(max_int4_128b_rows <= 4); + TORCH_CHECK(max_int4_128b_rows <= 8); if (max_int4_128b_rows > 0) { Y(4, 8, 0, 1); } @@ -793,6 +796,9 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ if (max_int4_128b_rows > 2) { Y(1, 4, 2, 4); } + if (max_int4_128b_rows > 4) { + Y(1, 4, 4, 8); + } } })); #undef X @@ -831,7 +837,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ DISPATCH_OUTPUT_TYPES(output.scalar_type(), "int8_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { if (max_int8_D > 0) { auto max_int8_128b_rows = nbit::div_round_up(nbit::padded_row_size_in_bytes(max_int8_D, SparseType::INT8, row_alignment), 128); - TORCH_CHECK(max_int8_128b_rows <= 8); +
TORCH_CHECK(max_int8_128b_rows <= 16); if (max_int8_128b_rows > 0) { Y(2, 8, 0, 1); } @@ -844,6 +850,9 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ if (max_int8_128b_rows > 4) { Y(2, 4, 4, 8); } + if (max_int8_128b_rows > 8) { + Y(2, 2, 8, 16); + } } })); #undef X @@ -884,7 +893,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ DISPATCH_OUTPUT_TYPES(output.scalar_type(), "fp8_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { if (max_float8_D > 0) { auto max_fp8_128b_rows = nbit::div_round_up(nbit::padded_row_size_in_bytes(max_float8_D, SparseType::FP8, row_alignment), 128); - TORCH_CHECK(max_fp8_128b_rows <= 8); + TORCH_CHECK(max_fp8_128b_rows <= 16); if (max_fp8_128b_rows > 0) { Y(2, 8, 0, 1); } @@ -897,6 +906,9 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ if (max_fp8_128b_rows > 4) { Y(2, 4, 4, 8); } + if (max_fp8_128b_rows > 8) { + Y(2, 2, 4, 8); + } } })); #undef X @@ -935,7 +947,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ DISPATCH_OUTPUT_TYPES(output.scalar_type(), "fp16_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { if (max_float16_D > 0) { auto max_fp16_128b_rows = nbit::div_round_up(nbit::padded_row_size_in_bytes(max_float16_D, SparseType::FP16, row_alignment), 128); - TORCH_CHECK(max_fp16_128b_rows <= 16); + TORCH_CHECK(max_fp16_128b_rows <= 32); if (max_fp16_128b_rows > 0) { Y(2, 8, 0, 2); } @@ -948,6 +960,9 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ if (max_fp16_128b_rows > 8) { Y(2, 2, 8, 16); } + if (max_fp16_128b_rows > 16) { + Y(2, 1, 16, 32); + } } })); #undef X @@ -986,7 +1001,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ DISPATCH_OUTPUT_TYPES(output.scalar_type(), "fp32_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { if (max_float32_D > 0) { auto max_fp32_128b_rows = nbit::div_round_up(nbit::padded_row_size_in_bytes(max_float32_D, SparseType::FP32, row_alignment), 128); - TORCH_CHECK(max_fp32_128b_rows <= 32); + TORCH_CHECK(max_fp32_128b_rows <= 64); if (max_fp32_128b_rows > 0) { Y(2, 4, 0, 4); } @@ -996,6 +1011,9 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ if (max_fp32_128b_rows > 16) { Y(1, 1, 16, 32); } + if (max_fp32_128b_rows > 32) { + Y(1, 1, 32, 64); + } } })); #undef X diff --git a/fbgemm_gpu/test/split_table_batched_embeddings_test.py b/fbgemm_gpu/test/split_table_batched_embeddings_test.py index 6a4d299b80..ddab386bf0 100644 --- a/fbgemm_gpu/test/split_table_batched_embeddings_test.py +++ b/fbgemm_gpu/test/split_table_batched_embeddings_test.py @@ -3579,7 +3579,7 @@ def test_nbit_forward_cpu( T = random.randint(1, 50) B = random.randint(0, 128) L = random.randint(0, 32) - D = random.randint(2, 1024) + D = random.randint(2, 2048) log_E = random.randint(2, 4) use_cache = False @@ -3660,7 +3660,7 @@ def test_nbit_forward_gpu_no_cache( T = random.randint(1, 50) B = random.randint(0, 128) L = random.randint(0, 32) - D = random.randint(2, 1024) + D = random.randint(2, 2048) log_E = random.randint(2, 4) use_cache = False From db9eee1c532135a17e83450162d82d371ca94c8a Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Wed, 22 Mar 2023 13:23:22 -0700 Subject: [PATCH 17/34] Containerize the remaining FBGEMM_GPU CI jobs (#1658) Summary: - Containerize the remaining FBGEMM_GPU CI jobs - Add Conda cleanups to make 
PyTorch and CUDA installs more reliable - Update post-install checks for PyTorch to work with ROCm - Update the CI to continue running on jobs that fail on just a few variants - Use PIP to install PyTorch GPU nightly as the nightly packages show up in PIP more reliably than in Conda Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1658 Reviewed By: shintaro-iwasaki Differential Revision: D44306708 Pulled By: q10 fbshipit-source-id: 5f0862f18eca7151759d9983aa97849222539d7d --- .github/scripts/setup_env.bash | 83 ++++++++++++------- .github/workflows/fbgemm_gpu_ci.yml | 15 +++- .github/workflows/fbgemm_nightly_build.yml | 32 ++++--- .../workflows/fbgemm_nightly_build_cpu.yml | 17 ++-- .github/workflows/fbgemm_release_build.yml | 27 ++++-- .../workflows/fbgemm_release_build_cpu.yml | 17 ++-- fbgemm_gpu/docs/BuildInstructions.md | 8 +- 7 files changed, 127 insertions(+), 72 deletions(-) diff --git a/.github/scripts/setup_env.bash b/.github/scripts/setup_env.bash index a22a09b19e..8329d661cc 100755 --- a/.github/scripts/setup_env.bash +++ b/.github/scripts/setup_env.bash @@ -212,8 +212,10 @@ run_python_test () { if print_exec conda run -n "${env_name}" python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then echo "[TEST] Python test suite PASSED: ${python_test_file}" + echo "" else echo "[TEST] Python test suite FAILED: ${python_test_file}" + echo "" return 1 fi } @@ -366,6 +368,12 @@ print_glibc_info () { # Miniconda Setup Functions ################################################################################ +__conda_cleanup () { + echo "[SETUP] Cleaning up Conda packages ..." + (print_exec conda clean --packages --tarball -y) || return 1 + (print_exec conda clean --all -y) || return 1 +} + setup_miniconda () { local miniconda_prefix="$1" if [ "$miniconda_prefix" == "" ]; then @@ -399,7 +407,10 @@ setup_miniconda () { print_exec . ~/.bashrc echo "[SETUP] Updating Miniconda base packages ..." 
- (exec_with_retries conda update -n base -c defaults -y conda) || return 1 + (exec_with_retries conda update -n base -c defaults --update-deps -y conda) || return 1 + + # Clean up packages + __conda_cleanup # Print Conda info print_exec conda info @@ -463,14 +474,14 @@ create_conda_environment () { install_pytorch_conda () { local env_name="$1" local pytorch_version="$2" - local pytorch_cpu="$3" + local pytorch_variant_type="$3" if [ "$pytorch_version" == "" ]; then echo "Usage: ${FUNCNAME[0]} ENV_NAME PYTORCH_VERSION [CPU]" echo "Example(s):" - echo " ${FUNCNAME[0]} build_env 1.11.0 # Install a specific version" - echo " ${FUNCNAME[0]} build_env latest # Install the latest stable release" - echo " ${FUNCNAME[0]} build_env test # Install the pre-release" - echo " ${FUNCNAME[0]} build_env nightly 1 # Install the CPU variant of the nightly" + echo " ${FUNCNAME[0]} build_env 1.11.0 # Install a specific version" + echo " ${FUNCNAME[0]} build_env latest # Install the latest stable release" + echo " ${FUNCNAME[0]} build_env test # Install the pre-release" + echo " ${FUNCNAME[0]} build_env nightly cpu # Install the CPU variant of the nightly" return 1 else echo "################################################################################" @@ -481,11 +492,11 @@ install_pytorch_conda () { echo "" fi - # Install cpuonly if needed - if [ "$pytorch_cpu" != "" ]; then - pytorch_cpu=1 + # Install the cpuonly package if needed + if [ "$pytorch_variant_type" == "cpu" ]; then local pytorch_package="cpuonly pytorch" else + pytorch_variant_type="cuda" local pytorch_package="pytorch" fi @@ -499,15 +510,25 @@ install_pytorch_conda () { local pytorch_channel="pytorch" fi + # Clean up packages before installation + __conda_cleanup + # Install PyTorch packages # NOTE: Installation of large package might fail due to corrupt package download # Use --force-reinstall to address this on retries - https://datascience.stackexchange.com/questions/41732/conda-verification-failed - echo "[INSTALL] Attempting to install '${pytorch_package}' (${pytorch_version}, CPU=${pytorch_cpu:-0}) through Conda using channel '${pytorch_channel}' ..." + echo "[INSTALL] Attempting to install '${pytorch_package}' (${pytorch_version}, variant = ${pytorch_variant_type}) through Conda using channel '${pytorch_channel}' ..." # shellcheck disable=SC2086 (exec_with_retries conda install --force-reinstall -n "${env_name}" -y ${pytorch_package} -c "${pytorch_channel}") || return 1 + # Check that PyTorch is importable + (test_python_import "${env_name}" torch.distributed) || return 1 + + # Print out the actual installed PyTorch version + installed_pytorch_version=$(conda run -n "${env_name}" python -c "import torch; print(torch.__version__)") + echo "[CHECK] NOTE: The installed version is: ${installed_pytorch_version}" + # Run check for GPU variant - if [ "$pytorch_cpu" == "" ]; then + if [ "$pytorch_variant_type" == "cuda" ]; then # Ensure that the PyTorch build is the GPU variant (i.e. 
contains cuDNN reference) # This test usually applies to the PyTorch nightly builds if conda list -n "${env_name}" pytorch | grep cudnn; then @@ -526,13 +547,7 @@ install_pytorch_conda () { (test_filepath "${env_name}" cuda_cmake_macros.h) || return 1 fi - # Check that PyTorch is importable - (test_python_import "${env_name}" torch.distributed) || return 1 - - # Print out the actual installed PyTorch version - installed_pytorch_version=$(conda run -n "${env_name}" python -c "import torch; print(torch.__version__)") - echo "[INSTALL] Installed PyTorch through Conda" - echo "[INSTALL] NOTE: The installed version is: ${installed_pytorch_version}" + echo "[INSTALL] Successfully installed PyTorch through Conda" } install_pytorch_pip () { @@ -591,30 +606,31 @@ install_pytorch_pip () { # shellcheck disable=SC2086 (exec_with_retries conda run -n "${env_name}" pip install ${pytorch_package} --extra-index-url ${pytorch_channel}) || return 1 - if [ "$pytorch_variant_type" != "cpu" ]; then - if [ "$pytorch_variant_type" == "cuda" ]; then - # Ensure that the PyTorch-CUDA headers are properly installed - (test_filepath "${env_name}" cuda_cmake_macros.h) || return 1 - fi + # Check that PyTorch is importable + (test_python_import "${env_name}" torch.distributed) || return 1 + # Print out the actual installed PyTorch version + installed_pytorch_version=$(conda run -n "${env_name}" python -c "import torch; print(torch.__version__)") + echo "[CHECK] NOTE: The installed version is: ${installed_pytorch_version}" + + if [ "$pytorch_variant_type" != "cpu" ]; then # Ensure that the PyTorch build is of the correct variant # This test usually applies to the PyTorch nightly builds - if conda run -n build_binary pip list torch | grep torch | grep "${pytorch_variant}"; then + if conda run -n "${env_name}" pip list torch | grep torch | grep "${pytorch_variant}"; then echo "[CHECK] The installed PyTorch ${pytorch_version} is the correct variant (${pytorch_variant})" else echo "[CHECK] The installed PyTorch ${pytorch_version} appears to be an incorrect variant as it is missing references to ${pytorch_variant}!" - echo "[CHECK] This can happen if the variant of PyTorch (e.g. GPU, nightly) for the MAJOR.MINOR version of CUDA presently installed on the system has not been published yet." + echo "[CHECK] This can happen if the variant of PyTorch (e.g. GPU, nightly) for the MAJOR.MINOR version of CUDA or ROCm presently installed on the system is not available." return 1 fi fi - # Check that PyTorch is importable - (test_python_import "${env_name}" torch.distributed) || return 1 + if [ "$pytorch_variant_type" == "cuda" ]; then + # Ensure that the PyTorch-CUDA headers are properly installed + (test_filepath "${env_name}" cuda_cmake_macros.h) || return 1 + fi - # Print out the actual installed PyTorch version - installed_pytorch_version=$(conda run -n "${env_name}" python -c "import torch; print(torch.__version__)") - echo "[INSTALL] Installed PyTorch through PIP" - echo "[INSTALL] NOTE: The installed version is: ${installed_pytorch_version}" + echo "[INSTALL] Successfully installed PyTorch through PIP" } @@ -664,6 +680,9 @@ install_cuda () { return 1 fi + # Clean up packages before installation + __conda_cleanup + # Install CUDA packages echo "[INSTALL] Installing CUDA ${cuda_version} ..." 
(exec_with_retries conda install --force-reinstall -n "${env_name}" -y cuda -c "nvidia/label/cuda-${cuda_version}") || return 1 @@ -970,7 +989,7 @@ create_conda_pytorch_environment () { if [ "${cuda_version}" == "" ]; then # Install the CPU variant of PyTorch - install_pytorch_conda "${env_name}" "${pytorch_version}" 1 + install_pytorch_conda "${env_name}" "${pytorch_version}" cpu else # Install CUDA and the GPU variant of PyTorch install_cuda "${env_name}" "${cuda_version}" diff --git a/.github/workflows/fbgemm_gpu_ci.yml b/.github/workflows/fbgemm_gpu_ci.yml index adf8443eae..b7dea4093a 100644 --- a/.github/workflows/fbgemm_gpu_ci.yml +++ b/.github/workflows/fbgemm_gpu_ci.yml @@ -6,13 +6,22 @@ name: FBGEMM_GPU CI on: - push: + # PR Trigger + # + pull_request: branches: - main - pull_request: + + # Push Trigger (enable to catch errors coming out of multiple merges) + # + push: branches: - main + # Manual Trigger (for testing only) + # + workflow_dispatch: + concurrency: # Cancel previous runs in the PR if a new commit is pushed group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -35,7 +44,7 @@ jobs: matrix: container-image: [ "ubuntu:20.04" ] python-version: [ "3.8", "3.9", "3.10" ] - rocm-version: [ "5.3" ] + rocm-version: [ "5.3", "5.4.2" ] steps: - name: Setup Build Container diff --git a/.github/workflows/fbgemm_nightly_build.yml b/.github/workflows/fbgemm_nightly_build.yml index b0ac76900c..0d9257d554 100644 --- a/.github/workflows/fbgemm_nightly_build.yml +++ b/.github/workflows/fbgemm_nightly_build.yml @@ -48,6 +48,7 @@ jobs: env: PRELUDE: .github/scripts/setup_env.bash BUILD_ENV: build_binary + continue-on-error: true strategy: # Don't fast-fail all the other builds if one of the them fails fail-fast: false @@ -85,8 +86,9 @@ jobs: - name: Install CUDA run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }} + # Install via PIP to avoid defaulting to the CPU variant if the GPU variant of the day is not ready - name: Install PyTorch Nightly - run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly + run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda ${{ matrix.cuda-version }} - name: Install cuDNN run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }} @@ -106,7 +108,10 @@ jobs: # Download the built artifact from GHA, test on GPU, and push to PyPI test_and_publish_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.g5.4xlarge.nvidia.gpu + container: + image: ${{ matrix.container-image }} + options: --user root --gpus all defaults: run: shell: bash @@ -117,7 +122,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ linux.g5.4xlarge.nvidia.gpu ] + container-image: [ "nvidia/cuda:11.8.0-base-ubuntu20.04" ] python-version: [ "3.8", "3.9", "3.10", "3.11" ] cuda-version: [ "11.7.1", "11.8.0" ] # Specify exactly ONE CUDA version for artifact publish @@ -125,11 +130,22 @@ jobs: needs: build_artifact steps: + - name: Setup Build Container + run: | + apt update -y + apt install -y binutils curl git sudo wget + git config --global --add safe.directory '*' + - name: Checkout the Repository uses: actions/checkout@v3 with: submodules: true + - name: Download Wheel Artifact from GHA + uses: actions/download-artifact@v3 + with: + name: fbgemm_gpu_nightly_${{ matrix.python-version }}_cuda${{ matrix.cuda-version }}.whl + - name: Display System Info run: . $PRELUDE; print_system_info; print_ec2_info @@ -145,21 +161,17 @@ jobs: - name: Install CUDA run: . 
$PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }} + # Install via PIP to avoid defaulting to the CPU variant if the GPU variant of the day is not ready - name: Install PyTorch Nightly - run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly + run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda ${{ matrix.cuda-version }} - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - - name: Download Wheel Artifact from GHA - uses: actions/download-artifact@v3 - with: - name: fbgemm_gpu_nightly_${{ matrix.python-version }}_cuda${{ matrix.cuda-version }}.whl - - name: Install FBGEMM_GPU Nightly run: | . $PRELUDE - ls . + pwd; ls -la . install_fbgemm_gpu_package $BUILD_ENV *.whl - name: Test with PyTest diff --git a/.github/workflows/fbgemm_nightly_build_cpu.yml b/.github/workflows/fbgemm_nightly_build_cpu.yml index d99c3f73ee..8d1d39805f 100644 --- a/.github/workflows/fbgemm_nightly_build_cpu.yml +++ b/.github/workflows/fbgemm_nightly_build_cpu.yml @@ -49,6 +49,7 @@ jobs: env: PRELUDE: .github/scripts/setup_env.bash BUILD_ENV: build_binary + continue-on-error: true strategy: # Don't fast-fail all the other builds if one of the them fails fail-fast: false @@ -83,7 +84,7 @@ jobs: run: . $PRELUDE; install_build_tools $BUILD_ENV - name: Install PyTorch-CPU Nightly - run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly cpuonly + run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly cpu - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV @@ -125,6 +126,11 @@ jobs: with: submodules: true + - name: Download Wheel Artifact from GHA + uses: actions/download-artifact@v3 + with: + name: fbgemm_gpu_nightly_cpu_${{ matrix.python-version }}.whl + - name: Display System Info run: . $PRELUDE; print_system_info; print_ec2_info @@ -138,20 +144,15 @@ jobs: run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} - name: Install PyTorch Nightly - run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly cpuonly + run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly cpu - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - - name: Download Wheel Artifact from GHA - uses: actions/download-artifact@v3 - with: - name: fbgemm_gpu_nightly_cpu_${{ matrix.python-version }}.whl - - name: Install FBGEMM_GPU Nightly (CPU version) run: | . $PRELUDE - ls . + pwd; ls -la . 
install_fbgemm_gpu_package $BUILD_ENV *.whl - name: Test with PyTest diff --git a/.github/workflows/fbgemm_release_build.yml b/.github/workflows/fbgemm_release_build.yml index 75d5235b69..b909cec274 100644 --- a/.github/workflows/fbgemm_release_build.yml +++ b/.github/workflows/fbgemm_release_build.yml @@ -40,6 +40,7 @@ jobs: env: PRELUDE: .github/scripts/setup_env.bash BUILD_ENV: build_binary + continue-on-error: true strategy: # Don't fast-fail all the other builds if one of the them fails fail-fast: false @@ -98,7 +99,10 @@ jobs: # Download the built artifact from GHA, test on GPU, and push to PyPI test_and_publish_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.g5.4xlarge.nvidia.gpu + container: + image: ${{ matrix.container-image }} + options: --user root --gpus all defaults: run: shell: bash @@ -109,18 +113,30 @@ jobs: strategy: fail-fast: false matrix: - os: [ linux.g5.4xlarge.nvidia.gpu ] + container-image: [ "nvidia/cuda:11.8.0-base-ubuntu20.04" ] python-version: [ "3.8", "3.9", "3.10", "3.11" ] cuda-version: [ "11.7.1", "11.8.0" ] # Specify exactly ONE CUDA version for artifact publish cuda-version-publish: [ "11.7.1" ] needs: build_artifact + steps: + - name: Setup Build Container + run: | + apt update -y + apt install -y binutils curl git sudo wget + git config --global --add safe.directory '*' + - name: Checkout the Repository uses: actions/checkout@v3 with: submodules: true + - name: Download Wheel Artifact from GHA + uses: actions/download-artifact@v3 + with: + name: fbgemm_gpu_${{ matrix.python-version }}_cuda${{ matrix.cuda-version }}.whl + - name: Display System Info run: . $PRELUDE; print_system_info; print_ec2_info @@ -142,15 +158,10 @@ jobs: - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - - name: Download Wheel Artifact from GHA - uses: actions/download-artifact@v3 - with: - name: fbgemm_gpu_${{ matrix.python-version }}_cuda${{ matrix.cuda-version }}.whl - - name: Install FBGEMM_GPU run: | . $PRELUDE - ls . + pwd; ls -la . install_fbgemm_gpu_package $BUILD_ENV *.whl - name: Test with PyTest diff --git a/.github/workflows/fbgemm_release_build_cpu.yml b/.github/workflows/fbgemm_release_build_cpu.yml index f13ebd32c9..577f0b5e88 100644 --- a/.github/workflows/fbgemm_release_build_cpu.yml +++ b/.github/workflows/fbgemm_release_build_cpu.yml @@ -40,6 +40,7 @@ jobs: env: PRELUDE: .github/scripts/setup_env.bash BUILD_ENV: build_binary + continue-on-error: true strategy: # Don't fast-fail all the other builds if one of the them fails fail-fast: false @@ -74,7 +75,7 @@ jobs: run: . $PRELUDE; install_build_tools $BUILD_ENV - name: Install PyTorch-CPU Test - run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test cpuonly + run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test cpu - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV @@ -116,6 +117,11 @@ jobs: with: submodules: true + - name: Download Wheel Artifact from GHA + uses: actions/download-artifact@v3 + with: + name: fbgemm_gpu_cpu_${{ matrix.python-version }}.whl + - name: Display System Info run: . $PRELUDE; print_system_info; print_ec2_info @@ -129,20 +135,15 @@ jobs: run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} - name: Install PyTorch Test - run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test cpuonly + run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test cpu - name: Prepare FBGEMM_GPU Build run: . 
$PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - - name: Download Wheel Artifact from GHA - uses: actions/download-artifact@v3 - with: - name: fbgemm_gpu_cpu_${{ matrix.python-version }}.whl - - name: Install FBGEMM_GPU (CPU version) run: | . $PRELUDE - ls . + pwd; ls -la . install_fbgemm_gpu_package $BUILD_ENV *.whl - name: Test with PyTest diff --git a/fbgemm_gpu/docs/BuildInstructions.md b/fbgemm_gpu/docs/BuildInstructions.md index 56aa780fe3..c50bd50d03 100644 --- a/fbgemm_gpu/docs/BuildInstructions.md +++ b/fbgemm_gpu/docs/BuildInstructions.md @@ -59,9 +59,11 @@ conda run -n "${env_name}" python -m pip install pyOpenSSL>22.1.0 ### C/C++ Compiler -Install the GCC toolchain. Note that GCC (as opposed to Clang for example) is -required for GPU (CUDA) builds because NVIDIA's `nvcc` relies on `gcc` and `g++` -in the path. +Install a version of the GCC toolchain that supports **C++17**. Note that GCC +(as opposed to Clang for example) is required for GPU (CUDA) builds because +NVIDIA's `nvcc` relies on `gcc` and `g++` in the path. The `sysroot` package +will also need to be installed to avoid issues with missing versioned symbols +when compiling FBGEMM_CPU: ```sh conda install -n "${env_name}" -y gxx_linux-64=9.3.0 From 7dc393295c70126bb8f14f5c64538e12e92ecaca Mon Sep 17 00:00:00 2001 From: Sarunya Pumma Date: Wed, 22 Mar 2023 19:06:59 -0700 Subject: [PATCH 18/34] Add tbe_input_combine_with_length for GPU (#1647) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1647 Implement `tbe_input_combine_with_length` for GPU. The operator takes 3 lists of tensors (`indices`, `lengths`, and `per_sample_weights`) and concatenates each one into a single tensor. Implicit type casting is also performed if the input types are different from the output types. `indices` and `lengths` tensors can be of type `int32_t` or `int64_t`. The outputs for `indices` concatenation and `lengths` concatenation are fixed to `int32_t`. `per_sample_weights` must be `float`. 
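To make the interface concrete, here is a minimal usage sketch, assuming the operator is exposed to Python as `torch.ops.fbgemm.tbe_input_combine_with_length` (the name registered in this diff); the tensor values below are made up for illustration:

```
import torch
import fbgemm_gpu  # noqa: F401  -- assumed; loads the fbgemm operator library

# Per-list TBE inputs with mixed int32/int64 types; all tensors on the same GPU.
indices_list = [
    torch.tensor([1, 2, 3], dtype=torch.int64, device="cuda"),
    torch.tensor([4, 5, 6, 7], dtype=torch.int32, device="cuda"),
]
lengths_list = [
    torch.tensor([2, 1], dtype=torch.int32, device="cuda"),
    torch.tensor([1, 3], dtype=torch.int64, device="cuda"),
]
# Each weights tensor must be float and match its indices tensor's numel.
per_sample_weights = [
    torch.tensor([1.0, 2.0, 1.0], device="cuda"),
    torch.tensor([1.0, 2.0, 1.0, 3.0], device="cuda"),
]

combined_indices, combined_lengths, combined_weights = (
    torch.ops.fbgemm.tbe_input_combine_with_length(
        indices_list, lengths_list, per_sample_weights
    )
)
# combined_indices and combined_lengths are int32 regardless of the input
# dtypes; combined_weights is float32.
```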
Reviewed By: bangshengtang Differential Revision: D44076452 fbshipit-source-id: f6ce8628e7345093bb55835f9523870c2914516f --- fbgemm_gpu/CMakeLists.txt | 6 +- fbgemm_gpu/include/fbgemm_gpu/input_combine.h | 15 ++ fbgemm_gpu/src/input_combine.cu | 160 +++++++++++++ fbgemm_gpu/src/input_combine_gpu.cpp | 226 ++++++++++++++++++ fbgemm_gpu/test/input_combine_test.py | 61 +++-- 5 files changed, 446 insertions(+), 22 deletions(-) create mode 100644 fbgemm_gpu/src/input_combine.cu create mode 100644 fbgemm_gpu/src/input_combine_gpu.cpp diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 036470adf2..51348505c4 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -318,7 +318,8 @@ if(NOT FBGEMM_CPU_ONLY) src/split_embeddings_utils.cpp src/split_table_batched_embeddings.cpp src/metric_ops_host.cpp - src/embedding_inplace_update_gpu.cpp) + src/embedding_inplace_update_gpu.cpp + src/input_combine_gpu.cpp) if(NVML_LIB_PATH) message(STATUS "Found NVML_LIB_PATH: ${NVML_LIB_PATH}") @@ -352,7 +353,8 @@ if(NOT FBGEMM_CPU_ONLY) src/split_embeddings_cache_cuda.cu src/split_embeddings_utils.cu src/metric_ops.cu - src/embedding_inplace_update.cu) + src/embedding_inplace_update.cu + src/input_combine.cu) set_source_files_properties( ${fbgemm_gpu_sources_gpu} PROPERTIES COMPILE_OPTIONS diff --git a/fbgemm_gpu/include/fbgemm_gpu/input_combine.h b/fbgemm_gpu/include/fbgemm_gpu/input_combine.h index 348e0bebfc..c329d6c9d9 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/input_combine.h +++ b/fbgemm_gpu/include/fbgemm_gpu/input_combine.h @@ -30,4 +30,19 @@ padding_fused_tbe_input_combine_cpu( const at::Tensor& include_last_offsets, int64_t batch_size); +std::tuple +tbe_input_combine_with_length_cuda( + const uint64_t* const indices_addrs, + const uint64_t* const lengths_addrs, + const uint64_t* const per_sample_weights_addrs, + const uint32_t* const indices_is_long, + const uint32_t* const lengths_is_long, + const uint64_t* const indices_offsets, + const uint64_t* const lengths_offsets, + const uint64_t num_lists, + const uint64_t total_indices, + const uint64_t total_lengths, + const uint64_t max_list_size, + const c10::DeviceIndex& device); + } // namespace fbgemm_gpu diff --git a/fbgemm_gpu/src/input_combine.cu b/fbgemm_gpu/src/input_combine.cu new file mode 100644 index 0000000000..040ca14bbf --- /dev/null +++ b/fbgemm_gpu/src/input_combine.cu @@ -0,0 +1,160 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include "fbgemm_gpu/fbgemm_cuda_utils.cuh" +#include "fbgemm_gpu/input_combine.h" + +using Tensor = at::Tensor; + +namespace fbgemm_gpu { + +template +DEVICE_INLINE void vec_copy_with_implicit_type_cast( + dst_t* const __restrict__ dst, + const uint64_t src_addr, + const uint64_t src_offset, + const uint64_t dst_offset, + const uint64_t src_bound) { + // TODO: Use vector load/store if address aligns with the vector type + const src_t* const src = reinterpret_cast(src_addr); +#pragma unroll + for (uint64_t i = 0; i < VEC_WIDTH && src_offset + i < src_bound; i++) { + dst[dst_offset + i] = src[src_offset + i]; + } +} + +template +__global__ +__launch_bounds__(kMaxThreads) void tbe_input_combine_with_length_kernel( + int32_t* const __restrict__ combined_indices, + int32_t* const __restrict__ combined_lengths, + float* const __restrict__ combined_weights, + const uint64_t* const __restrict__ indices_addrs, + const uint64_t* const __restrict__ lengths_addrs, + const uint64_t* const __restrict__ per_sample_weights_addrs, + const uint32_t* const __restrict__ indices_is_long, + const uint32_t* const __restrict__ lengths_is_long, + const uint64_t* const __restrict__ indices_offsets, + const uint64_t* const __restrict__ lengths_offsets, + const uint64_t num_lists, + const FixedDivisor fd_num_warps_per_list) { + const auto global_warp_id = blockIdx.x * blockDim.y + threadIdx.y; + uint32_t list_id; + uint32_t warp_id; + fd_num_warps_per_list.DivMod( + global_warp_id, + reinterpret_cast(&list_id), + reinterpret_cast(&warp_id)); + + if (list_id >= num_lists) { + return; + } + + // IS_LONG_NUM_BITS is power of 2 (default = 32); div and mod should be cheap + const uint32_t is_long_idx = list_id / IS_LONG_NUM_BITS; + const uint32_t is_long_mask = 1u << (list_id % IS_LONG_NUM_BITS); + const uint64_t src_idx = (warp_id * kWarpSize + threadIdx.x) * VEC_WIDTH; + const auto indices_start = indices_offsets[list_id]; + const auto indices_end = indices_offsets[list_id + 1]; + const auto lengths_start = lengths_offsets[list_id]; + const auto lengths_end = lengths_offsets[list_id + 1]; + + // Invoke a function based on the indices type + ((indices_is_long[is_long_idx] & is_long_mask) + ? vec_copy_with_implicit_type_cast + : vec_copy_with_implicit_type_cast< + int32_t, + int32_t, + VEC_WIDTH>)(combined_indices, indices_addrs[list_id], src_idx, indices_start + src_idx, indices_end - indices_start); + + // Invoke a function based on the lengths type + ((lengths_is_long[is_long_idx] & is_long_mask) + ? 
vec_copy_with_implicit_type_cast + : vec_copy_with_implicit_type_cast< + int32_t, + int32_t, + VEC_WIDTH>)(combined_lengths, lengths_addrs[list_id], src_idx, lengths_start + src_idx, lengths_end - lengths_start); + + if (per_sample_weights_addrs) { + vec_copy_with_implicit_type_cast( + combined_weights, + per_sample_weights_addrs[list_id], + src_idx, + indices_start + src_idx, + indices_end - indices_start); + } +} + +std::tuple tbe_input_combine_with_length_cuda( + const uint64_t* const indices_addrs, + const uint64_t* const lengths_addrs, + const uint64_t* const per_sample_weights_addrs, + const uint32_t* const indices_is_long, + const uint32_t* const lengths_is_long, + const uint64_t* const indices_offsets, + const uint64_t* const lengths_offsets, + const uint64_t num_lists, + const uint64_t total_indices, + const uint64_t total_lengths, + const uint64_t max_list_size, + const c10::DeviceIndex& device) { + constexpr uint32_t IS_LONG_NUM_BITS = 32; + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(device); + + // combined_indices and combined_lengths are int tensors + const auto int_options = at::TensorOptions().dtype(at::kInt).device( + at::kCUDA, at::cuda::current_device()); + Tensor combined_indices = + at::empty({static_cast(total_indices)}, int_options); + Tensor combined_lengths = + at::empty({static_cast(total_lengths)}, int_options); + // combined_weights is a float tensor + Tensor combined_weights = at::empty( + {per_sample_weights_addrs ? static_cast(total_indices) + : static_cast(0)}, + at::TensorOptions() + .dtype(at::kFloat) + .device(at::kCUDA, at::cuda::current_device())); + + // Each thread loads 4 elements (rule of thumb; should work well with 32-bit + // inputs) + constexpr uint32_t VEC_WIDTH = 4; + constexpr uint32_t NUM_WARPS_PER_BLOCK = kMaxThreads / kWarpSize; + const auto num_warps_per_list = + div_round_up(max_list_size, kWarpSize * VEC_WIDTH); + const auto num_blocks = + div_round_up(num_warps_per_list * num_lists, NUM_WARPS_PER_BLOCK); + + tbe_input_combine_with_length_kernel + <<>>( + combined_indices.data_ptr(), + combined_lengths.data_ptr(), + per_sample_weights_addrs ? combined_weights.data_ptr() + : nullptr, + indices_addrs, + lengths_addrs, + per_sample_weights_addrs, + indices_is_long, + lengths_is_long, + indices_offsets, + lengths_offsets, + num_lists, + FixedDivisor(num_warps_per_list)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + return { + std::move(combined_indices), + std::move(combined_lengths), + std::move(combined_weights)}; +} + +} // namespace fbgemm_gpu diff --git a/fbgemm_gpu/src/input_combine_gpu.cpp b/fbgemm_gpu/src/input_combine_gpu.cpp new file mode 100644 index 0000000000..482cabd963 --- /dev/null +++ b/fbgemm_gpu/src/input_combine_gpu.cpp @@ -0,0 +1,226 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree.
+ */ + +#include "fbgemm_gpu/input_combine.h" +#include "fbgemm_gpu/sparse_ops_utils.h" + +#include +#include +#include + +using Tensor = at::Tensor; + +namespace fbgemm_gpu { + +constexpr uint32_t IS_LONG_NUM_BITS = 32; +constexpr uint32_t NUM_ARGS = 7; +enum args_pos { + P_indices_prts = 0, + P_lengths_addrs = 1, + P_indices_offsets = 2, + P_lengths_offsets = 3, + P_per_sample_weight = 4, + P_indices_is_long = 5, + P_lengths_is_long = 6 +}; + +template +uint64_t compute_num_uint64s(const uint64_t num_elements) { + const uint64_t ratio = sizeof(uint64_t) / sizeof(T); + return (num_elements + ratio - 1) / ratio; +} + +void offset_tbe_input_combine_with_length_args( + uint64_t** indices_addrs, + uint64_t** lengths_addrs, + uint64_t** indices_offsets, + uint64_t** lengths_offsets, + uint64_t** per_sample_weights_addrs, + uint32_t** indices_is_long, + uint32_t** lengths_is_long, + uint64_t* base_addr, + const uint64_t* const ptr_offsets, + const bool need_weights) { + *indices_addrs = base_addr + ptr_offsets[P_indices_prts]; + *lengths_addrs = base_addr + ptr_offsets[P_lengths_addrs]; + *indices_offsets = base_addr + ptr_offsets[P_indices_offsets]; + *lengths_offsets = base_addr + ptr_offsets[P_lengths_offsets]; + *per_sample_weights_addrs = + need_weights ? (base_addr + ptr_offsets[P_per_sample_weight]) : nullptr; + *indices_is_long = + reinterpret_cast(base_addr + ptr_offsets[P_indices_is_long]); + *lengths_is_long = + reinterpret_cast(base_addr + ptr_offsets[P_lengths_is_long]); +} + +std::tuple tbe_input_combine_with_length_gpu( + const std::vector& indices_list, + const std::vector& lengths_list, + const std::vector& per_sample_weights) { + const auto num_lists = indices_list.size(); + TORCH_CHECK(num_lists > 0); + TORCH_CHECK(lengths_list.size() == num_lists); + TORCH_CHECK(per_sample_weights.size() == num_lists); + const bool need_weights = std::any_of( + per_sample_weights.begin(), per_sample_weights.end(), [](const auto& x) { + return x.numel() > 0; + }); + + // Store is_longs in 32-bit variables. i-th bit (LSB) indicates if + // list i-th is long. + const uint64_t num_is_longs = + (num_lists + IS_LONG_NUM_BITS - 1) / IS_LONG_NUM_BITS; + const uint64_t num_is_longs_64 = compute_num_uint64s(num_is_longs); + // args_tensor stores kernel arguments: + // - indices_prts (num_lists uint64_t elements) + // - lengths_addrs (num_lists uint64_t elements) + // - indices_offsets (num_lists + 1 uint64_t elements) + // - lengths_offsets (num_lists + 1 uint64_t elements) + // - per_sample_weight (num_lists uint64_t elements; optional) + // - indices_is_long (num_is_longs uint32_t elements) + // - lengths_is_long (num_is_longs uint32_t elements) + uint64_t args_offsets[NUM_ARGS + 1]; + // Initialize offsets with lengths first + args_offsets[P_indices_prts] = num_lists; + args_offsets[P_lengths_addrs] = num_lists; + args_offsets[P_indices_offsets] = num_lists + 1; + args_offsets[P_lengths_offsets] = num_lists + 1; + args_offsets[P_per_sample_weight] = need_weights ? 
num_lists : 0; + args_offsets[P_indices_is_long] = num_is_longs_64; + args_offsets[P_lengths_is_long] = num_is_longs_64; + + // Compute offsets + uint64_t offset = 0; + auto next = args_offsets[0]; + for (uint32_t i = 0; i < NUM_ARGS; i++) { + args_offsets[i] = offset; + offset += next; + next = args_offsets[i + 1]; + } + args_offsets[NUM_ARGS] = offset; // total number of uint64_t elements required + + Tensor args_tensor = at::empty( + {static_cast(args_offsets[NUM_ARGS] * sizeof(uint64_t))}, + at::TensorOptions().dtype(at::kByte).pinned_memory(true)); + + uint64_t* indices_addrs = nullptr; + uint64_t* lengths_addrs = nullptr; + uint64_t* indices_offsets = nullptr; + uint64_t* lengths_offsets = nullptr; + uint64_t* per_sample_weights_addrs = nullptr; + uint32_t* indices_is_long = nullptr; + uint32_t* lengths_is_long = nullptr; + + // Offset host pointers + offset_tbe_input_combine_with_length_args( + &indices_addrs, + &lengths_addrs, + &indices_offsets, + &lengths_offsets, + &per_sample_weights_addrs, + &indices_is_long, + &lengths_is_long, + reinterpret_cast(args_tensor.data_ptr()), + args_offsets, + need_weights); + + const auto& indices_0 = indices_list[0]; + uint64_t total_indices = 0; + uint64_t total_lengths = 0; + uint64_t max_list_size = 0; + for (uint64_t i = 0; i < num_lists; i++) { + const uint64_t is_long_idx = i / IS_LONG_NUM_BITS; + auto& indices_is_long_ = indices_is_long[is_long_idx]; + auto& lengths_is_long_ = lengths_is_long[is_long_idx]; + if (i % IS_LONG_NUM_BITS == 0) { + indices_is_long_ = 0; + lengths_is_long_ = 0; + } + const auto& indices = indices_list[i]; + const auto& lengths = lengths_list[i]; + TENSOR_CONTIGUOUS_AND_ON_CUDA_GPU(indices); + TENSOR_CONTIGUOUS_AND_ON_CUDA_GPU(lengths); + TENSORS_ON_SAME_DEVICE(indices, indices_0); + TENSORS_ON_SAME_DEVICE(lengths, indices_0); + TORCH_CHECK(indices.dtype() == c10::kInt || indices.dtype() == c10::kLong); + TORCH_CHECK(lengths.dtype() == c10::kInt || lengths.dtype() == c10::kLong); + TENSOR_NDIM_EQUALS(indices, 1); + TENSOR_NDIM_EQUALS(lengths, 1); + + const auto indices_numel = indices.numel(); + const auto lengths_numel = lengths.numel(); + indices_offsets[i] = total_indices; + lengths_offsets[i] = total_lengths; + total_indices += indices_numel; + total_lengths += lengths_numel; + max_list_size = + std::max(max_list_size, static_cast(indices_numel)); + max_list_size = + std::max(max_list_size, static_cast(lengths_numel)); + + // Store pointers in args_tensor + indices_addrs[i] = reinterpret_cast(indices.data_ptr()); + lengths_addrs[i] = reinterpret_cast(lengths.data_ptr()); + indices_is_long_ |= static_cast(indices.dtype() == c10::kLong) + << (i % IS_LONG_NUM_BITS); + lengths_is_long_ |= static_cast(lengths.dtype() == c10::kLong) + << (i % IS_LONG_NUM_BITS); + + const auto& weights = per_sample_weights[i]; + if (weights.numel() > 0) { + TENSOR_CONTIGUOUS_AND_ON_CUDA_GPU(weights); + TENSORS_ON_SAME_DEVICE(weights, indices_0); + TENSOR_TYPE_MUST_BE(weights, c10::kFloat); + TENSOR_NDIM_EQUALS(weights, 1); + TENSORS_HAVE_SAME_NUMEL(weights, indices); + + per_sample_weights_addrs[i] = + reinterpret_cast(weights.data_ptr()); + } + } + indices_offsets[num_lists] = total_indices; + lengths_offsets[num_lists] = total_lengths; + + const auto& device = indices_0.device(); + // Transfer args_tensor from host to device + args_tensor = args_tensor.to(device, /*non_blocking=*/true); + + // Offset device pointers + offset_tbe_input_combine_with_length_args( + &indices_addrs, + &lengths_addrs, + &indices_offsets, + 
&lengths_offsets, + &per_sample_weights_addrs, + &indices_is_long, + &lengths_is_long, + reinterpret_cast(args_tensor.data_ptr()), + args_offsets, + need_weights); + + return tbe_input_combine_with_length_cuda( + indices_addrs, + lengths_addrs, + per_sample_weights_addrs, + indices_is_long, + lengths_is_long, + indices_offsets, + lengths_offsets, + num_lists, + total_indices, + total_lengths, + max_list_size, + device.index()); +} + +TORCH_LIBRARY_IMPL(fbgemm, CUDA, m) { + DISPATCH_TO_CUDA( + "tbe_input_combine_with_length", + fbgemm_gpu::tbe_input_combine_with_length_gpu); +}; + +} // namespace fbgemm_gpu diff --git a/fbgemm_gpu/test/input_combine_test.py b/fbgemm_gpu/test/input_combine_test.py index 74f7581576..07102aec90 100644 --- a/fbgemm_gpu/test/input_combine_test.py +++ b/fbgemm_gpu/test/input_combine_test.py @@ -11,12 +11,20 @@ from typing import List, Optional, Tuple import torch +from hypothesis import given, settings try: # pyre-ignore[21] from fbgemm_gpu import open_source # noqa: F401 + + # pyre-ignore[21] + from test_utils import cpu_and_maybe_gpu except Exception: + torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:input_combine") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:input_combine_cpu") + from fbgemm_gpu.test.test_utils import cpu_and_maybe_gpu + +DEFAULT_DEVICE = torch.device("cpu") class TBEInputPrepareReference(torch.nn.Module): @@ -120,23 +128,23 @@ def forward( class InputCombineTest(unittest.TestCase): - def _get_inputs(self, dtypes): + def _get_inputs(self, dtypes, device=DEFAULT_DEVICE): indices_list = [ - torch.tensor([1, 2, 3], dtype=dtypes[0]), - torch.tensor([1, 2, 3, 4], dtype=dtypes[1]), + torch.tensor([1, 2, 3], dtype=dtypes[0], device=device), + torch.tensor([1, 2, 3, 4], dtype=dtypes[1], device=device), ] offsets_list = [ - torch.tensor([0, 2], dtype=dtypes[0]), - torch.tensor([0, 1, 4], dtype=dtypes[1]), + torch.tensor([0, 2], dtype=dtypes[0], device=device), + torch.tensor([0, 1, 4], dtype=dtypes[1], device=device), ] include_last_offsets = [False, True] per_sample_weights = [ - torch.tensor([1, 2, 1], dtype=torch.float), - torch.tensor([1, 2, 1, 3], dtype=torch.float), + torch.tensor([1, 2, 1], dtype=torch.float, device=device), + torch.tensor([1, 2, 1, 3], dtype=torch.float, device=device), ] empty_per_sample_weights = [ - torch.tensor([], dtype=torch.float), - torch.tensor([], dtype=torch.float), + torch.tensor([], dtype=torch.float, device=device), + torch.tensor([], dtype=torch.float, device=device), ] return ( indices_list, @@ -226,27 +234,34 @@ def _run_padding_fused_test(self, dtypes, batch_size) -> None: self.assertTrue(outputs[1].dtype == torch.int32) self.assertTrue(outputs[-1].size(0) == 0) - def _offsets_to_lengths(self, offsets, indices, include_last_offsets): + def _offsets_to_lengths( + self, offsets, indices, include_last_offsets, device=DEFAULT_DEVICE + ): if include_last_offsets: offsets_complete = offsets else: offsets_complete = torch.cat( - [offsets, torch.tensor([indices.numel()], dtype=offsets.dtype)] + [ + offsets, + torch.tensor([indices.numel()], dtype=offsets.dtype, device=device), + ] ) return offsets_complete[1:] - offsets_complete[:-1] - def _run_test_with_length(self, dtypes) -> None: + def _run_test_with_length(self, dtypes, device=DEFAULT_DEVICE) -> None: ( indices_list, offsets_list, per_sample_weights, empty_per_sample_weights, include_last_offsets, - ) = self._get_inputs(dtypes) + ) = self._get_inputs(dtypes, device=device) ref_mod = TBEInputPrepareReference(include_last_offsets) lengths_list 
= [ - self._offsets_to_lengths(offsets, indices, include_last_offsets) + self._offsets_to_lengths( + offsets, indices, include_last_offsets, device=device + ) for offsets, indices, include_last_offsets in zip( offsets_list, indices_list, include_last_offsets ) @@ -307,14 +322,20 @@ def test_input_combine_int32(self) -> None: def test_input_combined_mix(self) -> None: self._run_test((torch.int64, torch.int32)) - def test_input_combine_int64_with_length(self) -> None: - self._run_test_with_length((torch.int64, torch.int64)) + @given(device=cpu_and_maybe_gpu()) + @settings(deadline=None) + def test_input_combine_int64_with_length(self, device: torch.device) -> None: + self._run_test_with_length((torch.int64, torch.int64), device=device) - def test_input_combine_int32_with_length(self) -> None: - self._run_test_with_length((torch.int64, torch.int64)) + @given(device=cpu_and_maybe_gpu()) + @settings(deadline=None) + def test_input_combine_int32_with_length(self, device: torch.device) -> None: + self._run_test_with_length((torch.int32, torch.int32), device=device) - def test_input_combined_mix_with_length(self) -> None: - self._run_test_with_length((torch.int64, torch.int32)) + @given(device=cpu_and_maybe_gpu()) + @settings(deadline=None) + def test_input_combine_mix_with_length(self, device: torch.device) -> None: + self._run_test_with_length((torch.int64, torch.int32), device=device) def test_padding_fused_input_combine_int64(self) -> None: self._run_padding_fused_test((torch.int64, torch.int64), 64) From c960b4595bdb52d3fc3b5ca02a976705fb48132f Mon Sep 17 00:00:00 2001 From: Rengan Xu Date: Wed, 22 Mar 2023 22:47:28 -0700 Subject: [PATCH 19/34] jagged_jagged_bmm operator optimization (#1644) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1644 This diff optimizes the jagged_jagged_bmm operator using tiling across thread blocks and GPU shared memory. 
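For reference, below is a minimal pure-PyTorch sketch of the semantics the tiled kernel implements: for each batch b, the output is X_b^T @ Y_b computed over that batch's jagged segment, truncated to max_L. The function name and variable names here are ours, inferred from the kernel in the diff that follows:

```
import torch

def jagged_jagged_bmm_ref(x_values, y_values, offsets, max_L):
    # x_values: (total_L, M), y_values: (total_L, N), offsets: (B + 1,)
    B = offsets.numel() - 1
    M, N = x_values.size(1), y_values.size(1)
    out = torch.zeros(B, M, N, dtype=x_values.dtype, device=x_values.device)
    for b in range(B):
        start, end = int(offsets[b]), int(offsets[b + 1])
        L = min(end - start, max_L)  # each jagged segment is truncated to max_L
        if L > 0:
            # (M, L) @ (L, N) -> (M, N) for this batch's segment
            out[b] = x_values[start:start + L].t() @ y_values[start:start + L]
    return out
```

The tiled kernel computes the same result, but stages BLOCK_SIZE x BLOCK_SIZE tiles of X_b and Y_b in shared memory so that each element fetched from global memory is reused across a whole tile of output elements.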
Reviewed By: brad-mengchi Differential Revision: D44029528 fbshipit-source-id: fa5cd5a26893f935427bce5efb7dfcc731c3f47d --- fbgemm_gpu/src/jagged_tensor_ops.cu | 85 +++++++++++++++++++---------- 1 file changed, 56 insertions(+), 29 deletions(-) diff --git a/fbgemm_gpu/src/jagged_tensor_ops.cu b/fbgemm_gpu/src/jagged_tensor_ops.cu index 0282fa9f19..e646d28be2 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops.cu +++ b/fbgemm_gpu/src/jagged_tensor_ops.cu @@ -1986,7 +1986,7 @@ Tensor jagged_softmax_backward( return grad_input; } -template +template __global__ __launch_bounds__(kMaxThreads) void jagged_jagged_bmm_kernel( const at::PackedTensorAccessor32 x_values, const at::PackedTensorAccessor32 y_values, @@ -1997,30 +1997,53 @@ __global__ __launch_bounds__(kMaxThreads) void jagged_jagged_bmm_kernel( const int M = x_values.size(1); const int N = y_values.size(1); - const int b_m_begin = blockIdx.x * blockDim.y + threadIdx.y; - const int b_m_step = gridDim.x * blockDim.y; - for (int b_m = b_m_begin; b_m < B * M; b_m += b_m_step) { - const int b = b_m / M; - const int m = b_m % M; + const auto block_row = blockIdx.y; + const auto block_col = blockIdx.x; + const auto row = threadIdx.y; + const auto col = threadIdx.x; + __shared__ scalar_t Xs[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ scalar_t Ys[BLOCK_SIZE][BLOCK_SIZE]; - const int row_start = offsets[b]; - const int row_end = offsets[b + 1]; - const int length = min(row_end - row_start, max_L); - if (length == 0) { - for (int n = threadIdx.x; n < N; n += blockDim.x) { - output[b][m][n] = 0; + for (uint32_t b = blockIdx.z; b < B; b += gridDim.z) { + const index_t row_start = offsets[b]; + const index_t row_end = offsets[b + 1]; + const auto length = min(row_end - row_start, (index_t)max_L); + auto num_l_blocks = (length + BLOCK_SIZE - 1) / BLOCK_SIZE; + + at::acc_type acc = 0; + + const auto row_offset = block_row * BLOCK_SIZE + row; + const auto col_offset = block_col * BLOCK_SIZE + col; + + // for loop block tile in length dimension + for (auto bk_l = 0; bk_l < num_l_blocks; bk_l++) { + Xs[row][col] = 0; + Ys[row][col] = 0; + const auto bk_offset = bk_l * BLOCK_SIZE; + + // load data from global memory to shared memory + const auto l_x = bk_offset + col; + if (row_offset < M && l_x < length) { + Xs[row][col] = x_values[row_start + l_x][row_offset]; } - } else { - // TODO: use shared memory and better reduction - for (int n = threadIdx.x; n < N; n += blockDim.x) { - at::acc_type acc = - x_values[row_start][m] * y_values[row_start][n]; - for (int l = 1; l < length; ++l) { - acc += x_values[row_start + l][m] * y_values[row_start + l][n]; - } - output[b][m][n] = acc; + + const auto l_y = bk_offset + row; + if (l_y < length && col_offset < N) { + Ys[row][col] = y_values[row_start + l_y][col_offset]; } + + __syncthreads(); + +#pragma unroll + for (auto e = 0; e < BLOCK_SIZE; e++) { + acc += Xs[row][e] * Ys[e][col]; + } + __syncthreads(); } + + // write the result to the output + if ((row_offset < M) && (col_offset < N)) + output[b][row_offset][col_offset] = acc; } } @@ -2042,9 +2065,16 @@ Tensor jagged_jagged_bmm_forward( auto output = at::zeros({B, M, N}, x_values.options()); if (B > 0 && M > 0 && N > 0) { - const int block_dim_x = - std::min(div_round_up(N, kWarpSize) * kWarpSize, kMaxThreads); - const int block_dim_y = kMaxThreads / block_dim_x; + constexpr int BLOCK_SIZE = 16; + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const auto grid_dim_x = div_round_up(N, BLOCK_SIZE); + const auto grid_dim_y = div_round_up(M, BLOCK_SIZE); + TORCH_CHECK( + grid_dim_y 
<= kMaxBlockYDim, + "M cannot be larger than", + grid_dim_y * BLOCK_SIZE + 1 - BLOCK_SIZE); + const auto grid_dim_z = std::min(B, kMaxBlockZDim); + const dim3 grid(grid_dim_x, grid_dim_y, grid_dim_z); AT_DISPATCH_INDEX_TYPES( offsets.scalar_type(), "jagged_jagged_bmm_kernel_1", [&] { @@ -2054,11 +2084,8 @@ Tensor jagged_jagged_bmm_forward( x_values.scalar_type(), "jagged_jagged_bmm_kernel_2", [&] { - jagged_jagged_bmm_kernel - <<>>( + jagged_jagged_bmm_kernel + <<>>( x_values.packed_accessor32(), y_values.packed_accessor32(), offsets.packed_accessor32(), From edc23dc13c261f4d296788f86cc7e7f3311762b7 Mon Sep 17 00:00:00 2001 From: Doe Hyun Yoon Date: Thu, 23 Mar 2023 14:40:53 -0700 Subject: [PATCH 20/34] Specify device to emulate_cache_miss kernel (#1660) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1660 When emulate cache miss was enabled, it caused an illegal memory access if more than one GPU was in use. It turns out that the previous diff didn't specify the device within the emulate_cache_miss kernel. This diff fixes it. In addition, this diff cleans things up a bit (e.g., there is no need to use an index_t-based kernel launch for the emulate_cache_miss kernel, as lxu_cache_locations is always int32_t). Reviewed By: sryap, YuzeDaiMeta Differential Revision: D44340131 fbshipit-source-id: d99ba2364e9030cbca6c1166e578d24d99646bb1 --- fbgemm_gpu/src/split_embeddings_cache_cuda.cu | 34 +++++++++---------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/fbgemm_gpu/src/split_embeddings_cache_cuda.cu b/fbgemm_gpu/src/split_embeddings_cache_cuda.cu index e5930ab745..513f32cf8e 100644 --- a/fbgemm_gpu/src/split_embeddings_cache_cuda.cu +++ b/fbgemm_gpu/src/split_embeddings_cache_cuda.cu @@ -507,9 +507,8 @@ std::tuple> get_unique_indices_cuda( namespace { -template __global__ __launch_bounds__(kMaxThreads) void emulate_cache_miss_kernel( - at::PackedTensorAccessor32 + at::PackedTensorAccessor32 lxu_cache_locations, const int64_t enforced_misses_per_256, const bool gather_cache_stats, @@ -541,8 +540,11 @@ Tensor emulate_cache_miss( TENSOR_ON_CUDA_GPU(lxu_cache_locations); TENSOR_ON_CUDA_GPU(uvm_cache_stats); + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(lxu_cache_locations.get_device()); + const auto N = lxu_cache_locations.numel(); - if (lxu_cache_locations.numel() == 0) { + if (N == 0) { // nothing to do return lxu_cache_locations; } @@ -551,21 +553,17 @@ Tensor emulate_cache_miss( div_round_up(N, kMaxThreads), get_max_thread_blocks_for_cache_kernels_())); - AT_DISPATCH_INDEX_TYPES( - lxu_cache_locations.scalar_type(), "emulate_cache_miss", [&] { - emulate_cache_miss_kernel<<< - blocks, - kMaxThreads, - 0, - at::cuda::getCurrentCUDAStream()>>>( - lxu_cache_locations - .packed_accessor32(), - enforced_misses_per_256, - gather_cache_stats, - uvm_cache_stats - .packed_accessor32()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); + emulate_cache_miss_kernel<<< + blocks, + kMaxThreads, + 0, + at::cuda::getCurrentCUDAStream()>>>( + lxu_cache_locations + .packed_accessor32(), + enforced_misses_per_256, + gather_cache_stats, + uvm_cache_stats.packed_accessor32()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return lxu_cache_locations; } From d62b5cf5a311578fd47485b9fe3cbb6e66640e19 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Fri, 24 Mar 2023 12:33:10 -0700 Subject: [PATCH 21/34] Add C++17 Support to FBGEMM and FBGEMM_GPU OSS builds (#1652) Summary: - Add C++17 support for the entire FBGEMM_GPU build - Add C++17 support for the entire FBGEMM build - Update FBGEMM tests and benchmarks to be
C++17-compatible - Make FBGEMM builds output more logging - Cherry-pick code changes from D43776442 v4 now that C++17 is fully supported Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1652 Reviewed By: shintaro-iwasaki Differential Revision: D44287321 Pulled By: q10 fbshipit-source-id: 4bf2bcf66d528939865d42b6deafc470bee55d17 --- .bazelrc | 48 +++++++++++++++ .github/scripts/setup_env.bash | 41 +++++++++++++ .github/workflows/fbgemm_ci.yml | 38 +++++++----- BUILD.bazel | 20 +++---- CMakeLists.txt | 35 ++++++----- WORKSPACE.bazel | 4 +- bench/CMakeLists.txt | 15 +++-- bench/EmbeddingSpMDM8BitBenchmark.cc | 2 +- bench/EmbeddingSpMDMBenchmark.cc | 2 +- bench/EmbeddingSpMDMNBitBenchmark.cc | 2 +- ...mbeddingSpMDMNBitRowWiseSparseBenchmark.cc | 2 +- bench/RowwiseAdagradFusedBenchmark.cc | 2 +- fbgemm_gpu/CMakeLists.txt | 59 +++++++++++-------- fbgemm_gpu/docs/BuildInstructions.md | 19 +++--- include/fbgemm/Types.h | 12 ++-- test/CMakeLists.txt | 19 +++--- third_party/asmjit.BUILD | 2 - 17 files changed, 226 insertions(+), 96 deletions(-) create mode 100644 .bazelrc diff --git a/.bazelrc b/.bazelrc new file mode 100644 index 0000000000..1e5dbcfcb7 --- /dev/null +++ b/.bazelrc @@ -0,0 +1,48 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +################################################################################ +# FBGEMM Bazel configuration file +# +# Based on MozoLM build options: +# https://github.com/google-research/mozolm/blob/main/.bazelrc +# +# Documentation for Bazel configuration options can be found in: +# https://bazel.build/reference/command-line-reference +################################################################################ + +# Automatically picks up host-OS-specific config lines from bazelrc files +# Enabling this is equivalent to auto-calling --config=linux on Linux, --config=windows, etc +build --enable_platform_specific_config + +# Print logs for all tests +test --test_output=all + +# Build with verbose logging +build --verbose_explanations --verbose_failures +test --verbose_explanations --verbose_failures + +# Build with optimization mode turned on +build --compilation_mode opt +test --compilation_mode opt + +# Build FBGEMM with C17 and C++17 +build:linux --cxxopt=-std=c++17 +build:linux --host_cxxopt=-std=c++17 +build:linux --conlyopt=-std=c17 +build:linux --host_conlyopt=-std=c17 +build:macos --cxxopt=-std=c++17 +build:macos --host_cxxopt=-std=c++17 +build:macos --conlyopt=-std=c17 +build:macos --host_conlyopt=-std=c17 +build:windows --cxxopt=/std:c++17 +build:windows --host_cxxopt=/std:c++17 +build:windows --conlyopt=/std:c17 +build:windows --host_conlyopt=/std:c17 + +# Generation of `runfiles` directories on Windows has to be explicitly enabled. +# See https://github.com/bazelbuild/bazel/issues/8843. 
+build:windows --enable_runfiles +test:windows --enable_runfiles diff --git a/.github/scripts/setup_env.bash b/.github/scripts/setup_env.bash index 8329d661cc..f998bdba3f 100755 --- a/.github/scripts/setup_env.bash +++ b/.github/scripts/setup_env.bash @@ -364,6 +364,38 @@ print_glibc_info () { } +################################################################################ +# Bazel Setup Functions +################################################################################ + +setup_bazel () { + echo "################################################################################" + echo "# Setup Bazel" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + + local bazel_version="6.1.1" + + if [[ $OSTYPE == 'darwin'* ]]; then + local bazel_variant="darwin-$(uname -m)" + else + local bazel_variant="linux-x86_64" + fi + + echo "[SETUP] Downloading installer Bazel ${bazel_version} (${bazel_variant}) ..." + print_exec wget -q "https://github.com/bazelbuild/bazel/releases/download/${bazel_version}/bazel-${bazel_version}-installer-${bazel_variant}.sh" -O install-bazel.sh + + echo "[SETUP] Installing Bazel ..." + print_exec bash install-bazel.sh + print_exec rm -f install-bazel.sh + + print_exec bazel --version + echo "[SETUP] Successfully set up Bazel" +} + + ################################################################################ # Miniconda Setup Functions ################################################################################ @@ -915,6 +947,15 @@ install_cxx_compiler () { # Print out the C++ version print_exec conda run -n "${env_name}" c++ --version + + # https://stackoverflow.com/questions/2324658/how-to-determine-the-version-of-the-c-standard-used-by-the-compiler + echo "[INSTALL] Printing the default version of the C++ standard used by the compiler ..." + print_exec conda run -n "${env_name}" c++ -x c++ /dev/null -E -dM | grep __cplusplus + + # https://stackoverflow.com/questions/4991707/how-to-find-my-current-compilers-standard-like-if-it-is-c90-etc + echo "[INSTALL] Printing the default version of the C standard used by the compiler ..." + print_exec conda run -n "${env_name}" cc -dM -E - < /dev/null | grep __STDC_VERSION__ + echo "[INSTALL] Successfully installed C/C++ compilers" } diff --git a/.github/workflows/fbgemm_ci.yml b/.github/workflows/fbgemm_ci.yml index 9b18dfb884..79561102af 100644 --- a/.github/workflows/fbgemm_ci.yml +++ b/.github/workflows/fbgemm_ci.yml @@ -56,8 +56,9 @@ jobs: run: | set -e mkdir $BUILD_DIR; cd $BUILD_DIR + cmake --version cmake -DUSE_SANITIZER=address -DFBGEMM_LIBRARY_TYPE=${{ matrix.library-type }} -DPYTHON_EXECUTABLE=/usr/bin/python3 .. - make -j + make -j VERBOSE=1 - name: Test FBGEMM Library (${{ matrix.library-type }}) run: | @@ -94,23 +95,34 @@ jobs: run: | set -e mkdir $BUILD_DIR; cd $BUILD_DIR + cmake --version cmake -DUSE_SANITIZER=address -DFBGEMM_LIBRARY_TYPE=${{ matrix.library-type }} .. 
- make -j + make -j VERBOSE=1 build-bazel: - runs-on: ${{ matrix.os }} + runs-on: linux.12xlarge + container: + image: ${{ matrix.container-image }} + options: --user root defaults: run: shell: bash env: PRELUDE: .github/scripts/setup_env.bash + DEBIAN_FRONTEND: noninteractive strategy: fail-fast: false matrix: - os: [ ubuntu-latest ] + container-image: [ "ubuntu:20.04" ] steps: + - name: Setup Build Container + run: | + apt update -y + apt install -y binutils build-essential cmake git libblas-dev python3 sudo unzip wget + git config --global --add safe.directory '*' + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -120,18 +132,13 @@ jobs: run: . $PRELUDE; print_system_info - name: Download bazel - run: | - set -e - wget https://github.com/bazelbuild/bazel/releases/download/2.2.0/bazel-2.2.0-linux-x86_64 -O bazel - # verify content - echo 'b2f002ea0e6194a181af6ac84cd94bd8dc797722eb2354690bebac92dda233ff bazel' | sha256sum --quiet -c - chmod +x bazel + run: . $PRELUDE; setup_bazel - - name: Build FBGEMM with bazel - run: ./bazel build --verbose_explanations --verbose_failures --compilation_mode opt :* + - name: Build FBGEMM Library + run: bazel build -s :* - - name: Test FBGEMM bazel build - run: ./bazel test --test_output=all --verbose_explanations --verbose_failures --compilation_mode opt :* + - name: Test FBGEMM Library + run: bazel test -s :* build-windows: @@ -168,8 +175,9 @@ jobs: mkdir %BUILD_DIR% cd %BUILD_DIR% echo "STARTING CMAKE" + cmake --version cmake -G Ninja -DFBGEMM_BUILD_BENCHMARKS=OFF -DFBGEMM_LIBRARY_TYPE=${{ matrix.library-type }} -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_CXX_COMPILER="cl.exe" .. - ninja all + ninja -v all echo "Build Success" - name: Test FBGEMM Library (${{ matrix.library-type }}) diff --git a/BUILD.bazel b/BUILD.bazel index e998487255..12e05c4522 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -159,14 +159,14 @@ cc_library( ) [ - cc_test( - name = paths.split_extension(paths.basename(filename))[0], - size = "medium", - srcs = [ - filename, - ], - deps = [ - ":test_utils", - ], - ) for filename in get_fbgemm_tests() + cc_test( + name = paths.split_extension(paths.basename(filename))[0], + size = "medium", + srcs = [ + filename, + ], + deps = [ + ":test_utils", + ], + ) for filename in get_fbgemm_tests() ] diff --git a/CMakeLists.txt b/CMakeLists.txt index 58dcb9aeb0..32920d1d48 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,19 @@ -cmake_minimum_required(VERSION 3.5 FATAL_ERROR) +cmake_minimum_required(VERSION 3.16 FATAL_ERROR) + +# Set the default C++ standard to C++17 +# Individual targets can have this value overridden; see +# https://cmake.org/cmake/help/latest/prop_tgt/CXX_STANDARD.html +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_VISIBILITY_PRESET hidden) + +# Set the default C standard to C11 +# Individual targets can have this value overridden; see +# https://cmake.org/cmake/help/latest/prop_tgt/C_STANDARD.html +set(CMAKE_C_STANDARD 11) +set(CMAKE_C_EXTENSIONS OFF) +set(CMAKE_C_STANDARD_REQUIRED ON) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules") @@ -114,17 +129,11 @@ add_dependencies(fbgemm_generic defs.bzl) add_dependencies(fbgemm_avx2 defs.bzl) add_dependencies(fbgemm_avx512 defs.bzl) -set_target_properties(fbgemm_generic fbgemm_avx2 fbgemm_avx512 PROPERTIES - CXX_STANDARD 14 - CXX_STANDARD_REQUIRED YES - CXX_EXTENSIONS NO - CXX_VISIBILITY_PRESET hidden) - -#On Windows: -#1) Adding definition of 
ASMJIT_STATIC to avoid generating asmjit function -#calls with _dllimport attribute -#2) MSVC uses /MD in default cxx compiling flags, -#need to change it to /MT in static case +# On Windows: +# 1) Adding definition of ASMJIT_STATIC to avoid generating asmjit function +# calls with _dllimport attribute +# 2) MSVC uses /MD in default cxx compiling flags, +# Need to change it to /MT in static case if(MSVC) set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267 /wd4305 /wd4309") if(FBGEMM_LIBRARY_TYPE STREQUAL "static") @@ -267,8 +276,6 @@ elseif(FBGEMM_LIBRARY_TYPE STREQUAL "shared") set_property(TARGET fbgemm_generic PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET fbgemm_avx2 PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET fbgemm_avx512 PROPERTY POSITION_INDEPENDENT_CODE ON) - set_target_properties(fbgemm PROPERTIES - CXX_VISIBILITY_PRESET hidden) elseif(FBGEMM_LIBRARY_TYPE STREQUAL "static") add_library(fbgemm STATIC $ diff --git a/WORKSPACE.bazel b/WORKSPACE.bazel index 30b1a80424..aff61b2b94 100644 --- a/WORKSPACE.bazel +++ b/WORKSPACE.bazel @@ -16,9 +16,9 @@ http_archive( http_archive( name = "com_google_googletest", - strip_prefix = "googletest-cd6b9ae3243985d4dc725abd513a874ab4161f3e", + strip_prefix = "googletest-1.13.0", urls = [ - "https://github.com/google/googletest/archive/cd6b9ae3243985d4dc725abd513a874ab4161f3e.tar.gz", + "https://github.com/google/googletest/archive/refs/tags/v1.13.0.tar.gz", ], ) diff --git a/bench/CMakeLists.txt b/bench/CMakeLists.txt index b4fad7510a..49f9e38fa2 100644 --- a/bench/CMakeLists.txt +++ b/bench/CMakeLists.txt @@ -1,4 +1,12 @@ -cmake_minimum_required(VERSION 3.5 FATAL_ERROR) +cmake_minimum_required(VERSION 3.16 FATAL_ERROR) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_VISIBILITY_PRESET hidden) +set(CMAKE_C_STANDARD 11) +set(CMAKE_C_EXTENSIONS OFF) +set(CMAKE_C_STANDARD_REQUIRED ON) find_package(MKL) if (NOT ${MKL_FOUND}) @@ -21,15 +29,12 @@ if (${BLAS_FOUND}) message(STATUS "BLAS_LIBRARIES= ${BLAS_LIBRARIES}") endif() -#benchmarks +# Benchmarks macro(add_benchmark BENCHNAME) add_executable(${BENCHNAME} ${ARGN} BenchUtils.cc ../test/QuantizationHelpers.cc ../test/EmbeddingSpMDMTestUtils.cc) - set_target_properties(${BENCHNAME} PROPERTIES - CXX_STANDARD 11 - CXX_EXTENSIONS NO) target_compile_options(${BENCHNAME} PRIVATE "-m64" "-mavx2" "-mfma" "-masm=intel") target_link_libraries(${BENCHNAME} fbgemm) diff --git a/bench/EmbeddingSpMDM8BitBenchmark.cc b/bench/EmbeddingSpMDM8BitBenchmark.cc index 1fcf4607de..17934b6101 100644 --- a/bench/EmbeddingSpMDM8BitBenchmark.cc +++ b/bench/EmbeddingSpMDM8BitBenchmark.cc @@ -111,7 +111,7 @@ int run_benchmark( // please note we generate unique indices for (int i = 0; i < batch_size; ++i) { iota(container.begin(), container.end(), 0); - random_shuffle(container.begin(), container.end()); + shuffle(container.begin(), container.end(), generator); copy( container.begin(), container.begin() + (offsets[i + 1] - offsets[i]), diff --git a/bench/EmbeddingSpMDMBenchmark.cc b/bench/EmbeddingSpMDMBenchmark.cc index b987586aac..246549f6a7 100644 --- a/bench/EmbeddingSpMDMBenchmark.cc +++ b/bench/EmbeddingSpMDMBenchmark.cc @@ -104,7 +104,7 @@ void run_benchmark( // please note we generate unique indices for (int i = 0; i < batch_size; ++i) { iota(container.begin(), container.end(), 0); - random_shuffle(container.begin(), container.end()); + shuffle(container.begin(), container.end(), generator); copy( container.begin(), 
container.begin() + (offsets[i + 1] - offsets[i]), diff --git a/bench/EmbeddingSpMDMNBitBenchmark.cc b/bench/EmbeddingSpMDMNBitBenchmark.cc index ed5485ae29..fff665babb 100644 --- a/bench/EmbeddingSpMDMNBitBenchmark.cc +++ b/bench/EmbeddingSpMDMNBitBenchmark.cc @@ -116,7 +116,7 @@ int run_benchmark( // please note we generate unique indices for (int i = 0; i < batch_size; ++i) { iota(container.begin(), container.end(), 0); - random_shuffle(container.begin(), container.end()); + shuffle(container.begin(), container.end(), generator); copy( container.begin(), container.begin() + (offsets[i + 1] - offsets[i]), diff --git a/bench/EmbeddingSpMDMNBitRowWiseSparseBenchmark.cc b/bench/EmbeddingSpMDMNBitRowWiseSparseBenchmark.cc index d1b28f54b5..c50500768d 100644 --- a/bench/EmbeddingSpMDMNBitRowWiseSparseBenchmark.cc +++ b/bench/EmbeddingSpMDMNBitRowWiseSparseBenchmark.cc @@ -131,7 +131,7 @@ int run_benchmark( // please note we generate unique indices for (int i = 0; i < batch_size; ++i) { iota(container.begin(), container.end(), 0); - random_shuffle(container.begin(), container.end()); + shuffle(container.begin(), container.end(), generator); copy( container.begin(), container.begin() + (offsets[i + 1] - offsets[i]), diff --git a/bench/RowwiseAdagradFusedBenchmark.cc b/bench/RowwiseAdagradFusedBenchmark.cc index 6f1203e6ab..a0524afaa5 100644 --- a/bench/RowwiseAdagradFusedBenchmark.cc +++ b/bench/RowwiseAdagradFusedBenchmark.cc @@ -90,7 +90,7 @@ void run_benchmark( // please note we generate unique indices for (int i = 0; i < batch_size; ++i) { iota(container.begin(), container.end(), 0); - random_shuffle(container.begin(), container.end()); + shuffle(container.begin(), container.end(), generator); copy( container.begin(), container.begin() + (offsets[i + 1] - offsets[i]), diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 51348505c4..1fb8f397e0 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -1,15 +1,34 @@ -cmake_minimum_required(VERSION 3.11.0 FATAL_ERROR) - -option(FBGEMM_CPU_ONLY "Build fbgemm_gpu without GPU support" OFF) - -set(message_line - "-------------------------------------------------------------") -message("${message_line}") +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +# Set the default C++ standard to C++17 +# Individual targets can have this value overridden; see +# https://cmake.org/cmake/help/latest/prop_tgt/CXX_STANDARD.html +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# Set the default C standard to C17 +# Individual targets can have this value overridden; see +# https://cmake.org/cmake/help/latest/prop_tgt/C_STANDARD.html +set(CMAKE_C_STANDARD 17) +set(CMAKE_C_EXTENSIONS OFF) +set(CMAKE_C_STANDARD_REQUIRED ON) + +function(BLOCK_PRINT) + message("================================================================================") + foreach(ARG IN LISTS ARGN) + message("${ARG}") + endforeach() + message("================================================================================") + message("") +endfunction() if(SKBUILD) - message("The project is built using scikit-build") + BLOCK_PRINT("The project is built using scikit-build") endif() +# Build options +option(FBGEMM_CPU_ONLY "Build FBGEMM_GPU without GPU support" OFF) option(USE_CUDA "Use CUDA" ON) option(USE_ROCM "Use ROCm" OFF) @@ -21,11 +40,10 @@ if(((EXISTS "/opt/rocm/") OR (EXISTS $ENV{ROCM_PATH})) endif() if(FBGEMM_CPU_ONLY) - message("Building for CPU-only") + BLOCK_PRINT("Building the CPU-only variant 
of FBGEMM-GPU") endif() -message("${message_line}") -message(STATUS "USE_ROCM ${USE_ROCM}") +BLOCK_PRINT("USE_ROCM: ${USE_ROCM}") if(FBGEMM_CPU_ONLY OR USE_ROCM) project( @@ -46,12 +64,16 @@ set(THIRDPARTY ${FBGEMM}/third_party) if(DEFINED GLIBCXX_USE_CXX11_ABI) if(${GLIBCXX_USE_CXX11_ABI} EQUAL 1) - set(CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1") else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") endif() - message("${CMAKE_CXX_FLAGS}") + BLOCK_PRINT( + "Default C++ compiler flags" + "(values may be overridden by CMAKE_CXX_STANDARD and CXX_STANDARD):" + "" + "${CMAKE_CXX_FLAGS}" + ) endif() # @@ -72,8 +94,7 @@ if(USE_ROCM) include(Hip) include(Hipify) - message("${message_line}") - message(STATUS "hip found ${HIP_FOUND}") + BLOCK_PRINT("HIP found: ${HIP_FOUND}") endif() # @@ -414,13 +435,6 @@ if(USE_ROCM) else() add_library(fbgemm_gpu_py MODULE ${fbgemm_gpu_sources} ${gen_source_files} ${cpp_asmjit_files} ${cpp_fbgemm_files}) - set_property(TARGET fbgemm_gpu_py PROPERTY CUDA_ARCHITECTURES - "${cuda_architectures}") - - # FBGEMM_CUB_USE_NAMESPACE will cause compilation errors on CUB for CUDA 12+ - # if(NOT FBGEMM_CPU_ONLY) - # target_compile_definitions(fbgemm_gpu_py PRIVATE FBGEMM_CUB_USE_NAMESPACE) - # endif() endif() set_target_properties(fbgemm_gpu_py PROPERTIES PREFIX "") @@ -430,7 +444,6 @@ if(NVML_LIB_PATH) target_link_libraries(fbgemm_gpu_py ${NVML_LIB_PATH}) endif() target_include_directories(fbgemm_gpu_py PRIVATE ${TORCH_INCLUDE_DIRS}) -set_property(TARGET fbgemm_gpu_py PROPERTY CXX_STANDARD 17) install(TARGETS fbgemm_gpu_py DESTINATION fbgemm_gpu) diff --git a/fbgemm_gpu/docs/BuildInstructions.md b/fbgemm_gpu/docs/BuildInstructions.md index c50bd50d03..4f2c9c142b 100644 --- a/fbgemm_gpu/docs/BuildInstructions.md +++ b/fbgemm_gpu/docs/BuildInstructions.md @@ -66,18 +66,23 @@ will also need to be installed to avoid issues with missing versioned symbols when compiling FBGEMM_CPU: ```sh -conda install -n "${env_name}" -y gxx_linux-64=9.3.0 +conda install -n "${env_name}" -y gxx_linux-64=10.4.0 sysroot_linux-64=2.17 -c conda-forge ``` -Note that while newer versions of GCC can be used, binaries compiled under newer -versions of GCC will not be compatible with older systems such as Ubuntu 20.04 -or CentOS Stream 8, because the compiled library will reference symbols from -versions of `GLIBCXX` that the system's `libstdc++.so.6` will not support. To -see what versions of GLIBCXX the available `libstdc++.so.6` supports: +While newer versions of GCC can be used, binaries compiled under newer versions +of GCC will not be compatible with older systems such as Ubuntu 20.04 or CentOS +Stream 8, because the compiled library will reference symbols from versions of +`GLIBCXX` that the system's `libstdc++.so.6` will not support. 
To see what +versions of GLIBC and GLIBCXX the available `libstdc++.so.6` supports: ```sh libcxx_path=/path/to/libstdc++.so.6 -objdump -TC "${libcxx_path}" | grep GLIBCXX | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat + +# Print supported for GLIBC versions +objdump -TC "${libcxx_path}" | grep GLIBC_ | sed 's/.*GLIBC_\([.0-9]*\).*/GLIBC_\1/g' | sort -Vu | cat + +# Print supported for GLIBCXX versions +objdump -TC "${libcxx_path}" | grep GLIBCXX_ | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat ``` ### Other Build Tools diff --git a/include/fbgemm/Types.h b/include/fbgemm/Types.h index e5daa28d8b..e7d8278464 100644 --- a/include/fbgemm/Types.h +++ b/include/fbgemm/Types.h @@ -27,14 +27,14 @@ constexpr uint32_t f16_num_exponent_bits = 5; constexpr uint32_t f16_num_mantissa_bits = 10; constexpr uint32_t f16_num_non_sign_bits = f16_num_exponent_bits + f16_num_mantissa_bits; -constexpr uint32_t f16_exponent_mask = 0x1F; // 5 bits +constexpr uint32_t f16_exponent_mask = 0b1'1111; // 5 bits constexpr uint32_t f16_sign_bit = 1u << (f16_num_exponent_bits + f16_num_mantissa_bits); constexpr uint32_t f16_exponent_bits = f16_exponent_mask << f16_num_mantissa_bits; -constexpr uint32_t f16_mantissa_mask = 0x3FF; // 10 bits +constexpr uint32_t f16_mantissa_mask = 0b11'1111'1111; // 10 bits constexpr uint32_t f16_exponent_bias = 15; -constexpr uint32_t f16_nan = 0x7FFF; +constexpr uint32_t f16_nan = 0x7F'FF; // The IEEE754 standard specifies a binary32 as having: // SEEEEEEEEMMMMMMMMMMMMMMMMMMMMMMM @@ -44,10 +44,10 @@ constexpr uint32_t f16_nan = 0x7FFF; // * 23 mantissa/significand bits (a 24th bit is implicit) constexpr uint32_t f32_num_exponent_bits = 8; constexpr uint32_t f32_num_mantissa_bits = 23; -constexpr uint32_t f32_exponent_mask = 0xFF; // 8 bits -constexpr uint32_t f32_mantissa_mask = 0x7FFFFF; // 23 bits +constexpr uint32_t f32_exponent_mask = 0b1111'1111; // 8 bits +constexpr uint32_t f32_mantissa_mask = 0x7F'FF'FF; // 23 bits constexpr uint32_t f32_exponent_bias = 127; -constexpr uint32_t f32_all_non_sign_mask = 0x7FFFFFFF; // 31 bits +constexpr uint32_t f32_all_non_sign_mask = 0x7F'FF'FF'FF; // 31 bits constexpr uint32_t f32_most_significant_bit = 1u << 22; // Turn on 23rd bit constexpr uint32_t f32_num_non_sign_bits = f32_num_exponent_bits + f32_num_mantissa_bits; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index a30735354a..1e996256bf 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,4 +1,12 @@ -cmake_minimum_required(VERSION 3.5 FATAL_ERROR) +cmake_minimum_required(VERSION 3.16 FATAL_ERROR) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_VISIBILITY_PRESET hidden) +set(CMAKE_C_STANDARD 11) +set(CMAKE_C_EXTENSIONS OFF) +set(CMAKE_C_STANDARD_REQUIRED ON) if(FBGEMM_BUILD_TESTS AND NOT TARGET gtest) #Download Googletest framework from github if @@ -38,12 +46,9 @@ macro(add_gtest TESTNAME) EmbeddingSpMDMTestUtils.cc QuantizationHelpers.cc TestUtils.cc) - set_target_properties(${TESTNAME} PROPERTIES - CXX_STANDARD 11 - CXX_EXTENSIONS NO) - #To compile test files with AVX2 turned on - #For static build, defining FBGEMM_STATIC to avoid generating - #functions with _dllimport attributes. + # To compile test files with AVX2 turned on + # For static build, defining FBGEMM_STATIC to avoid generating + # functions with _dllimport attributes. 
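Two of the mechanical C++17-era changes in this patch are easy to verify in isolation: the benchmarks swap the removed std::random_shuffle for std::shuffle with an explicit engine, and Types.h rewrites its masks with binary literals and digit separators (both C++14 features, usable project-wide now that the baseline is C++17). A condensed, compilable sketch of both, for illustration only:

```cpp
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <random>
#include <vector>

// The separator-based literals are bit-for-bit identical to the old hex forms.
static_assert(0b1'1111 == 0x1F, "f16 exponent mask unchanged");
static_assert(0b11'1111'1111 == 0x3FF, "f16 mantissa mask unchanged");
static_assert(0x7F'FF == 0x7FFF, "f16 NaN pattern unchanged");
static_assert(0x7F'FF'FF == 0x7FFFFF, "f32 mantissa mask unchanged");
static_assert(0x7F'FF'FF'FF == 0x7FFFFFFF, "f32 non-sign mask unchanged");

int main() {
  // std::random_shuffle was removed in C++17; std::shuffle takes the engine
  // explicitly, which also makes the benchmarks' index shuffling seedable.
  std::vector<int> container(256);
  std::iota(container.begin(), container.end(), 0);
  std::mt19937 generator(1234);
  std::shuffle(container.begin(), container.end(), generator);
  return 0;
}
```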
if(MSVC)
     target_compile_options(${TESTNAME}
       PRIVATE "/arch:AVX2" "/wd4244" "/wd4267" "/wd4305" "/wd4309")
diff --git a/third_party/asmjit.BUILD b/third_party/asmjit.BUILD
index 71dc5c7e6c..c2764a97c4 100644
--- a/third_party/asmjit.BUILD
+++ b/third_party/asmjit.BUILD
@@ -16,9 +16,7 @@ cc_library(
     copts = [
         "-DASMJIT_STATIC",
         "-fno-tree-vectorize",
-        "-std=c++17",
         "-fmerge-all-constants",
-        "-std=gnu++11",
         "-DTH_BLAS_MKL",
     ],
     includes = [

From 277677039bae25b2570a73013b03bfaa9d2a523e Mon Sep 17 00:00:00 2001
From: Sarunya Pumma
Date: Mon, 27 Mar 2023 09:10:01 -0700
Subject: [PATCH 22/34] Prune CPU/GPU TBE optimizer codegen (#1659)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1659

This diff aims to reduce the build time and library size of `//deeplearning/fbgemm/fbgemm_gpu/codegen:embedding_ops`.

The diff modifies the build target to generate and compile only the necessary files. This is based on the fact that CPU and GPU do not support all optimizers in `SplitTBE`. (Before this diff, all optimizers were generated and compiled for both CPU and GPU.)

The following is the list of supported optimizers:

|OptimType|Generated optimizer|Supported on CPU|Supported on GPU|
|---|---|---|---|
|EXACT_ADAGRAD|adagrad|x|x|
|EXACT_ROWWISE_ADAGRAD|rowwise_adagrad_with_counter|x|x|
||rowwise_adagrad|x|x|
|EXACT_ROWWISE_WEIGHTED_ADAGRAD|rowwise_weighted_adagrad|x|x|
|EXACT_SGD|sgd|x|x|
|SGD|approx_sgd|x|x|
|ROWWISE_ADAGRAD|approx_rowwise_adagrad_with_counter|x||
||approx_rowwise_adagrad|x||
|ADAM|adam||x|
|LAMB|lamb||x|
|LARS_SGD|lars_sgd||x|
|PARTIAL_ROWWISE_ADAM|partial_rowwise_adam||x|
|PARTIAL_ROWWISE_LAMB|partial_rowwise_lamb||x|
|-|rowwise_adagrad_with_weight_decay|||
|-|approx_rowwise_adagrad_with_weight_decay|||

Note: x = supported

Reviewed By: jianyuh

Differential Revision: D44326540

fbshipit-source-id: 02413256b4a675f13ada8e8820820cb5112cb405
---
 fbgemm_gpu/CMakeLists.txt                     |  38 +--
 .../embedding_backward_code_generator.py      | 109 ++++++---
 ..._embedding_codegen_lookup_invoker.template | 224 +++++++++---------
 3 files changed, 215 insertions(+), 156 deletions(-)

diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt
index 1fb8f397e0..b30bc1eab4 100644
--- a/fbgemm_gpu/CMakeLists.txt
+++ b/fbgemm_gpu/CMakeLists.txt
@@ -103,21 +103,27 @@ endif()

 set(OPTIMIZERS
     adagrad
-    adam
-    approx_rowwise_adagrad
-    approx_rowwise_adagrad_with_weight_decay
-    approx_rowwise_adagrad_with_counter
     approx_sgd
-    lamb
-    lars_sgd
-    partial_rowwise_adam
-    partial_rowwise_lamb
     rowwise_adagrad
-    rowwise_adagrad_with_weight_decay
     rowwise_adagrad_with_counter
     rowwise_weighted_adagrad
     sgd)

+set(CPU_ONLY_OPTIMIZERS
+    approx_rowwise_adagrad
+    approx_rowwise_adagrad_with_counter)
+
+set(GPU_ONLY_OPTIMIZERS
+    adam
+    lamb
+    lars_sgd
+    partial_rowwise_adam
+    partial_rowwise_lamb)
+
+set(CPU_OPTIMIZERS ${OPTIMIZERS} ${CPU_ONLY_OPTIMIZERS})
+set(GPU_OPTIMIZERS ${OPTIMIZERS} ${GPU_ONLY_OPTIMIZERS})
+set(ALL_OPTIMIZERS ${OPTIMIZERS} ${CPU_ONLY_OPTIMIZERS} ${GPU_ONLY_OPTIMIZERS})
+
 set(gen_gpu_source_files
     "gen_embedding_forward_dense_weighted_codegen_cuda.cu"
     "gen_embedding_forward_dense_unweighted_codegen_cuda.cu"
@@ -137,16 +143,16 @@ set(gen_cpu_source_files

 set(gen_python_files ${CMAKE_BINARY_DIR}/__init__.py)

-foreach(optimizer ${OPTIMIZERS})
-  list(APPEND gen_gpu_host_source_files
-      "gen_embedding_backward_split_${optimizer}.cpp")
-
+foreach(optimizer ${CPU_OPTIMIZERS})
   list(APPEND gen_cpu_source_files
       "gen_embedding_backward_split_${optimizer}_cpu.cpp")
   list(APPEND gen_cpu_source_files
"gen_embedding_backward_${optimizer}_split_cpu.cpp") +endforeach() - list(APPEND gen_python_files "${CMAKE_BINARY_DIR}/lookup_${optimizer}.py") +foreach(optimizer ${GPU_OPTIMIZERS}) + list(APPEND gen_gpu_host_source_files + "gen_embedding_backward_split_${optimizer}.cpp") foreach(weight weighted unweighted) list(APPEND gen_gpu_source_files @@ -154,6 +160,10 @@ foreach(optimizer ${OPTIMIZERS}) endforeach() endforeach() +foreach(optimizer ${ALL_OPTIMIZERS}) + list(APPEND gen_python_files "${CMAKE_BINARY_DIR}/lookup_${optimizer}.py") +endforeach() + set(CMAKE_CODEGEN_DIR ${CMAKE_CURRENT_SOURCE_DIR}/codegen) set(codegen_dependencies diff --git a/fbgemm_gpu/codegen/embedding_backward_code_generator.py b/fbgemm_gpu/codegen/embedding_backward_code_generator.py index fd69a22f6e..aa832947c3 100644 --- a/fbgemm_gpu/codegen/embedding_backward_code_generator.py +++ b/fbgemm_gpu/codegen/embedding_backward_code_generator.py @@ -127,53 +127,60 @@ def int_arg(name: str, default: int = 0) -> str: def generate(**kwargs: Any) -> None: gen_args = kwargs["args"] - # Generates CUDA variants. kwargs["args"] = gen_args["cuda"] + if kwargs.get("has_gpu_support"): + # Generates CUDA variants. + template = env.get_template("embedding_backward_split_template.cu") + src_cu = template.render(weighted=False, **kwargs) + write( + f"gen_embedding_backward_{kwargs.get('optimizer')}_split_unweighted_cuda.cu", + src_cu, + ) + src_cu = template.render(weighted=True, **kwargs) + write( + f"gen_embedding_backward_{kwargs.get('optimizer')}_split_weighted_cuda.cu", + src_cu, + ) + if not kwargs.get("dense"): + template = env.get_template("embedding_backward_split_host_template.cpp") + src_cpp = template.render(**kwargs) + write( + f"gen_embedding_backward_split_{kwargs.get('optimizer')}.cpp", src_cpp + ) - template = env.get_template("embedding_backward_split_template.cu") - src_cu = template.render(weighted=False, **kwargs) - write( - f"gen_embedding_backward_{kwargs.get('optimizer')}_split_unweighted_cuda.cu", - src_cu, - ) - src_cu = template.render(weighted=True, **kwargs) - write( - f"gen_embedding_backward_{kwargs.get('optimizer')}_split_weighted_cuda.cu", - src_cu, - ) if not kwargs.get("dense"): - template = env.get_template("embedding_backward_split_host_template.cpp") - src_cpp = template.render(**kwargs) - write(f"gen_embedding_backward_split_{kwargs.get('optimizer')}.cpp", src_cpp) - # Generates Python invoker for CUDA + CPU template = env.get_template("split_embedding_codegen_lookup_invoker.template") src_py = template.render(is_fbcode=args.is_fbcode, **kwargs) write(f"lookup_{kwargs.get('optimizer')}.py", src_py) - # Generates CPU variants. - kwargs["args"] = gen_args["cpu"] + if kwargs.get("has_cpu_support"): + # Generates CPU variants. 
+ kwargs["args"] = gen_args["cpu"] - is_approx = "approx" in kwargs.get("optimizer") - template = ( - env.get_template("embedding_backward_split_cpu_approx_template.cpp") - if is_approx - else env.get_template("embedding_backward_split_cpu_template.cpp") - ) - - src_cpp = template.render(**kwargs) - write( - f"gen_embedding_backward_{kwargs.get('optimizer')}_split_cpu.cpp", - src_cpp, - ) + is_approx = "approx" in kwargs.get("optimizer") + template = ( + env.get_template("embedding_backward_split_cpu_approx_template.cpp") + if is_approx + else env.get_template("embedding_backward_split_cpu_template.cpp") + ) - if not kwargs.get("dense"): - template = env.get_template("embedding_backward_split_host_cpu_template.cpp") src_cpp = template.render(**kwargs) write( - f"gen_embedding_backward_split_{kwargs.get('optimizer')}_cpu.cpp", src_cpp + f"gen_embedding_backward_{kwargs.get('optimizer')}_split_cpu.cpp", + src_cpp, ) + if not kwargs.get("dense"): + template = env.get_template( + "embedding_backward_split_host_cpu_template.cpp" + ) + src_cpp = template.render(**kwargs) + write( + f"gen_embedding_backward_split_{kwargs.get('optimizer')}_cpu.cpp", + src_cpp, + ) + @dataclass class Args: @@ -369,6 +376,8 @@ def adagrad() -> None: split_precomputation="", split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + has_cpu_support=True, + has_gpu_support=True, ) @@ -490,6 +499,8 @@ def rowwise_adagrad() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + has_cpu_support=True, + has_gpu_support=True, ) approx_split_weight_update = """ @@ -512,6 +523,8 @@ def rowwise_adagrad() -> None: split_precomputation=split_precomputation, split_weight_update=approx_split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + has_cpu_support=True, + has_gpu_support=False, ) @@ -611,6 +624,9 @@ def rowwise_adagrad_with_weight_decay() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + # Disable both CPU and GPU support + has_cpu_support=False, + has_gpu_support=False, ) approx_split_weight_update = """ @@ -633,6 +649,9 @@ def rowwise_adagrad_with_weight_decay() -> None: split_precomputation=split_precomputation, split_weight_update=approx_split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + # Disable both CPU and GPU support + has_cpu_support=False, + has_gpu_support=False, ) @@ -771,6 +790,8 @@ def rowwise_adagrad_with_counter() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + has_cpu_support=True, + has_gpu_support=True, ) approx_split_weight_update = """ @@ -804,6 +825,8 @@ def rowwise_adagrad_with_counter() -> None: split_precomputation=split_precomputation, split_weight_update=approx_split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + has_cpu_support=True, + has_gpu_support=False, ) @@ -874,6 +897,8 @@ def rowwise_weighted_adagrad() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + has_cpu_support=True, + has_gpu_support=True, ) @@ -893,6 +918,8 @@ def sgd() -> None: split_precomputation="", split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + has_cpu_support=True, + has_gpu_support=True, ) approx_split_weight_update = """ 
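The has_cpu_support / has_gpu_support flags threaded through generate() above decide, per optimizer, which backend files get generated at all, and the generated Python invoker (further down in this diff) fails loudly when a CPU-only optimizer is invoked without host weights. As a C++-level analogy only (the real pruning happens in the Python codegen and CMake, not in C++), the same shape of compile-time gating looks like this:

```cpp
// Compile-time gating analogous to the per-optimizer support flags above.
// Illustrative sketch, not FBGEMM code.
#include <stdexcept>
#include <string>

template <bool kHasCpu, bool kHasGpu>
void lookup(bool host_weights_present, const std::string& optimizer) {
  if constexpr (kHasCpu) {
    if (host_weights_present) {
      // ... call the generated CPU kernel ...
      return;
    }
  }
  if constexpr (kHasGpu) {
    // ... call the generated GPU kernel ...
  } else {
    // Mirrors the generated `assert False, "... has only CPU support"`.
    throw std::runtime_error(optimizer + " has only CPU support");
  }
}

int main() {
  lookup<true, true>(true, "rowwise_adagrad");          // CPU and GPU
  lookup<true, false>(true, "approx_rowwise_adagrad");  // CPU-only
  lookup<false, true>(false, "adam");                   // GPU-only
  return 0;
}
```

Unsupported backends are discarded branches rather than compiled code, which is the same effect the patch achieves by never emitting the corresponding source files in the first place.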
@@ -908,6 +935,8 @@ def sgd() -> None: split_precomputation="", split_weight_update=approx_split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + has_cpu_support=True, + has_gpu_support=True, ) @@ -978,6 +1007,8 @@ def lamb() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + has_cpu_support=False, + has_gpu_support=True, ) @@ -1064,6 +1095,8 @@ def partial_rowwise_lamb() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + has_cpu_support=False, + has_gpu_support=True, ) @@ -1114,6 +1147,8 @@ def adam() -> None: split_precomputation="", split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + has_cpu_support=False, + has_gpu_support=True, ) @@ -1174,6 +1209,8 @@ def partial_rowwise_adam() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + has_cpu_support=False, + has_gpu_support=True, ) @@ -1232,6 +1269,8 @@ def lars_sgd() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + has_cpu_support=False, + has_gpu_support=True, ) @@ -1296,6 +1335,8 @@ def backward_dense() -> None: (FLOAT, "unused"), ] ), + has_cpu_support=True, + has_gpu_support=True, ) @@ -1323,7 +1364,7 @@ def emb_codegen( partial_rowwise_adam() partial_rowwise_lamb() rowwise_adagrad() - rowwise_adagrad_with_weight_decay() + # rowwise_adagrad_with_weight_decay() # Disabled rowwise_adagrad_with_counter() rowwise_weighted_adagrad() sgd() diff --git a/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template b/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template index bd406d39fa..844f04782b 100644 --- a/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template +++ b/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template @@ -49,6 +49,7 @@ def invoke( max_counter: float, {% endif %} ) -> torch.Tensor: + {% if has_cpu_support %} if (common_args.host_weights.numel() > 0): return torch.ops.fbgemm.split_embedding_codegen_lookup_{{ optimizer }}_function_cpu( # common_args @@ -147,112 +148,119 @@ def invoke( max_counter=max_counter, {% endif %} ) + {% if not has_gpu_support %} else: - return torch.ops.fbgemm.split_embedding_codegen_lookup_{{ optimizer }}_function( - # common_args - {% if not dense %} - placeholder_autograd_tensor=common_args.placeholder_autograd_tensor, - {% endif %} - dev_weights=common_args.dev_weights, - uvm_weights=common_args.uvm_weights, - lxu_cache_weights=common_args.lxu_cache_weights, - weights_placements=common_args.weights_placements, - weights_offsets=common_args.weights_offsets, - D_offsets=common_args.D_offsets, - total_D=common_args.total_D, - max_D=common_args.max_D, - hash_size_cumsum=common_args.hash_size_cumsum, - total_hash_size_bits=common_args.total_hash_size_bits, - indices=common_args.indices, - offsets=common_args.offsets, - pooling_mode=common_args.pooling_mode, - indice_weights=common_args.indice_weights, - feature_requires_grad=common_args.feature_requires_grad, - lxu_cache_locations=common_args.lxu_cache_locations, - # optimizer_args - gradient_clipping = optimizer_args.gradient_clipping, - max_gradient=optimizer_args.max_gradient, - stochastic_rounding=optimizer_args.stochastic_rounding, - {% if "learning_rate" in 
args.split_function_arg_names %} - learning_rate=optimizer_args.learning_rate, - {% endif %} - {% if "eps" in args.split_function_arg_names %} - eps=optimizer_args.eps, - {% endif %} - {% if "beta1" in args.split_function_arg_names %} - beta1=optimizer_args.beta1, - {% endif %} - {% if "beta2" in args.split_function_arg_names %} - beta2=optimizer_args.beta2, - {% endif %} - {% if "weight_decay" in args.split_function_arg_names %} - weight_decay=optimizer_args.weight_decay, - {% endif %} - {% if "weight_decay_mode" in args.split_function_arg_names %} - weight_decay_mode=optimizer_args.weight_decay_mode, - {% endif %} - {% if "eta" in args.split_function_arg_names %} - eta=optimizer_args.eta, - {% endif %} - {% if "momentum" in args.split_function_arg_names %} - momentum=optimizer_args.momentum, - {% endif %} - {% if "counter_halflife" in args.split_function_arg_names %} - counter_halflife=optimizer_args.counter_halflife, - {% endif %} - {% if "adjustment_iter" in args.split_function_arg_names %} - adjustment_iter=optimizer_args.adjustment_iter, - {% endif %} - {% if "adjustment_ub" in args.split_function_arg_names %} - adjustment_ub=optimizer_args.adjustment_ub, - {% endif %} - {% if "learning_rate_mode" in args.split_function_arg_names %} - learning_rate_mode=optimizer_args.learning_rate_mode, - {% endif %} - {% if "grad_sum_decay" in args.split_function_arg_names %} - grad_sum_decay=optimizer_args.grad_sum_decay, - {% endif %} - {% if "tail_id_threshold" in args.split_function_arg_names %} - tail_id_threshold=optimizer_args.tail_id_threshold, - {% endif %} - {% if "is_tail_id_thresh_ratio" in args.split_function_arg_names %} - is_tail_id_thresh_ratio=optimizer_args.is_tail_id_thresh_ratio, - {% endif %} - # momentum1 - {% if "momentum1_dev" in args.split_function_arg_names %} - momentum1_dev=momentum1.dev, - momentum1_uvm=momentum1.uvm, - momentum1_offsets=momentum1.offsets, - momentum1_placements=momentum1.placements, - {% endif %} - # momentum2 - {% if "momentum2_dev" in args.split_function_arg_names %} - momentum2_dev=momentum2.dev, - momentum2_uvm=momentum2.uvm, - momentum2_offsets=momentum2.offsets, - momentum2_placements=momentum2.placements, - {% endif %} - # prev_iter - {% if "prev_iter_dev" in args.split_function_arg_names %} - prev_iter_dev=prev_iter.dev, - prev_iter_uvm=prev_iter.uvm, - prev_iter_offsets=prev_iter.offsets, - prev_iter_placements=prev_iter.placements, - {% endif %} - # row_counter - {% if "row_counter_dev" in args.split_function_arg_names %} - row_counter_dev=row_counter.dev, - row_counter_uvm=row_counter.uvm, - row_counter_offsets=row_counter.offsets, - row_counter_placements=row_counter.placements, - {% endif %} - # iter - {% if "iter" in args.split_function_arg_names %} - iter=iter, - {% endif %} - # max counter - {% if "max_counter" in args.split_function_arg_names %} - max_counter=max_counter, - {% endif %} - output_dtype=common_args.output_dtype, - ) + assert False, "{{ optimizer }} has only CPU support. host_weights.numel() must be greater than 0." 
+ {% endif %} + {% endif %} + + {% if has_gpu_support %} + return torch.ops.fbgemm.split_embedding_codegen_lookup_{{ optimizer }}_function( + # common_args + {% if not dense %} + placeholder_autograd_tensor=common_args.placeholder_autograd_tensor, + {% endif %} + dev_weights=common_args.dev_weights, + uvm_weights=common_args.uvm_weights, + lxu_cache_weights=common_args.lxu_cache_weights, + weights_placements=common_args.weights_placements, + weights_offsets=common_args.weights_offsets, + D_offsets=common_args.D_offsets, + total_D=common_args.total_D, + max_D=common_args.max_D, + hash_size_cumsum=common_args.hash_size_cumsum, + total_hash_size_bits=common_args.total_hash_size_bits, + indices=common_args.indices, + offsets=common_args.offsets, + pooling_mode=common_args.pooling_mode, + indice_weights=common_args.indice_weights, + feature_requires_grad=common_args.feature_requires_grad, + lxu_cache_locations=common_args.lxu_cache_locations, + # optimizer_args + gradient_clipping = optimizer_args.gradient_clipping, + max_gradient=optimizer_args.max_gradient, + stochastic_rounding=optimizer_args.stochastic_rounding, + {% if "learning_rate" in args.split_function_arg_names %} + learning_rate=optimizer_args.learning_rate, + {% endif %} + {% if "eps" in args.split_function_arg_names %} + eps=optimizer_args.eps, + {% endif %} + {% if "beta1" in args.split_function_arg_names %} + beta1=optimizer_args.beta1, + {% endif %} + {% if "beta2" in args.split_function_arg_names %} + beta2=optimizer_args.beta2, + {% endif %} + {% if "weight_decay" in args.split_function_arg_names %} + weight_decay=optimizer_args.weight_decay, + {% endif %} + {% if "weight_decay_mode" in args.split_function_arg_names %} + weight_decay_mode=optimizer_args.weight_decay_mode, + {% endif %} + {% if "eta" in args.split_function_arg_names %} + eta=optimizer_args.eta, + {% endif %} + {% if "momentum" in args.split_function_arg_names %} + momentum=optimizer_args.momentum, + {% endif %} + {% if "counter_halflife" in args.split_function_arg_names %} + counter_halflife=optimizer_args.counter_halflife, + {% endif %} + {% if "adjustment_iter" in args.split_function_arg_names %} + adjustment_iter=optimizer_args.adjustment_iter, + {% endif %} + {% if "adjustment_ub" in args.split_function_arg_names %} + adjustment_ub=optimizer_args.adjustment_ub, + {% endif %} + {% if "learning_rate_mode" in args.split_function_arg_names %} + learning_rate_mode=optimizer_args.learning_rate_mode, + {% endif %} + {% if "grad_sum_decay" in args.split_function_arg_names %} + grad_sum_decay=optimizer_args.grad_sum_decay, + {% endif %} + {% if "tail_id_threshold" in args.split_function_arg_names %} + tail_id_threshold=optimizer_args.tail_id_threshold, + {% endif %} + {% if "is_tail_id_thresh_ratio" in args.split_function_arg_names %} + is_tail_id_thresh_ratio=optimizer_args.is_tail_id_thresh_ratio, + {% endif %} + # momentum1 + {% if "momentum1_dev" in args.split_function_arg_names %} + momentum1_dev=momentum1.dev, + momentum1_uvm=momentum1.uvm, + momentum1_offsets=momentum1.offsets, + momentum1_placements=momentum1.placements, + {% endif %} + # momentum2 + {% if "momentum2_dev" in args.split_function_arg_names %} + momentum2_dev=momentum2.dev, + momentum2_uvm=momentum2.uvm, + momentum2_offsets=momentum2.offsets, + momentum2_placements=momentum2.placements, + {% endif %} + # prev_iter + {% if "prev_iter_dev" in args.split_function_arg_names %} + prev_iter_dev=prev_iter.dev, + prev_iter_uvm=prev_iter.uvm, + prev_iter_offsets=prev_iter.offsets, + 
prev_iter_placements=prev_iter.placements, + {% endif %} + # row_counter + {% if "row_counter_dev" in args.split_function_arg_names %} + row_counter_dev=row_counter.dev, + row_counter_uvm=row_counter.uvm, + row_counter_offsets=row_counter.offsets, + row_counter_placements=row_counter.placements, + {% endif %} + # iter + {% if "iter" in args.split_function_arg_names %} + iter=iter, + {% endif %} + # max counter + {% if "max_counter" in args.split_function_arg_names %} + max_counter=max_counter, + {% endif %} + output_dtype=common_args.output_dtype, + ) + {% endif %} From dde6d13814a8323fd690af3d42842c53f3acd862 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Tue, 28 Mar 2023 16:41:50 -0700 Subject: [PATCH 23/34] Fix the Documentation Build Job (#1673) Summary: - Rewrite the documentation builds job to use the build infrastructure tooling - Rename workflow files for consistency Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1673 Reviewed By: shintaro-iwasaki Differential Revision: D44472660 Pulled By: q10 fbshipit-source-id: 60434c1f7098b7efa8c750133bb22f14fc98d5dc --- .github/scripts/setup_env.bash | 70 +++++++++++++- .github/workflows/fbgemm_docs.yml | 91 ------------------- .github/workflows/fbgemm_gpu_ci.yml | 2 +- ...ild_cpu.yml => fbgemm_gpu_cpu_nightly.yml} | 0 ...ild_cpu.yml => fbgemm_gpu_cpu_release.yml} | 0 ..._build.yml => fbgemm_gpu_cuda_nightly.yml} | 2 +- ..._build.yml => fbgemm_gpu_cuda_release.yml} | 2 +- .github/workflows/fbgemm_gpu_docs.yml | 89 ++++++++++++++++++ 8 files changed, 159 insertions(+), 97 deletions(-) delete mode 100644 .github/workflows/fbgemm_docs.yml rename .github/workflows/{fbgemm_nightly_build_cpu.yml => fbgemm_gpu_cpu_nightly.yml} (100%) rename .github/workflows/{fbgemm_release_build_cpu.yml => fbgemm_gpu_cpu_release.yml} (100%) rename .github/workflows/{fbgemm_nightly_build.yml => fbgemm_gpu_cuda_nightly.yml} (99%) rename .github/workflows/{fbgemm_release_build.yml => fbgemm_gpu_cuda_release.yml} (99%) create mode 100644 .github/workflows/fbgemm_gpu_docs.yml diff --git a/.github/scripts/setup_env.bash b/.github/scripts/setup_env.bash index f998bdba3f..57da549463 100755 --- a/.github/scripts/setup_env.bash +++ b/.github/scripts/setup_env.bash @@ -369,6 +369,7 @@ print_glibc_info () { ################################################################################ setup_bazel () { + local bazel_version="${1:-6.1.1}" echo "################################################################################" echo "# Setup Bazel" echo "#" @@ -376,9 +377,8 @@ setup_bazel () { echo "################################################################################" echo "" - local bazel_version="6.1.1" - if [[ $OSTYPE == 'darwin'* ]]; then + # shellcheck disable=SC2155 local bazel_variant="darwin-$(uname -m)" else local bazel_variant="linux-x86_64" @@ -999,6 +999,31 @@ install_build_tools () { echo "[INSTALL] Successfully installed all the build tools" } +install_docs_tools () { + local env_name="$1" + if [ "$env_name" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME" + echo "Example(s):" + echo " ${FUNCNAME[0]} build_env" + return 1 + else + echo "################################################################################" + echo "# Install Documentation Tools" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + fi + + echo "[INSTALL] Installing docs tools ..." 
+ (exec_with_retries conda install -n "${env_name}" -c conda-forge -y \ + doxygen) || return 1 + + # Check binaries are visible in the PAATH + (test_binpath "${env_name}" doxygen) || return 1 + + echo "[INSTALL] Successfully installed all the build tools" +} ################################################################################ # Combination Functions @@ -1087,12 +1112,16 @@ __build_fbgemm_gpu_common_pre_steps () { (test_binpath "${env_name}" g++) || return 1 if [ "$fbgemm_variant" == "cpu" ]; then + echo "[BUILD] Proceeding to build CPU variant" + # Update the package name and build args depending on if CUDA is specified echo "[BUILD] Applying CPU-only build args ..." build_args=(--cpu_only) package_name="${package_name}-cpu" elif [ "$fbgemm_variant" == "rocm" ]; then + echo "[BUILD] Proceeding to build ROCm variant" + (test_env_var "${env_name}" PYTORCH_ROCM_ARCH) || return 1 echo "[BUILD] Applying ROCm build args ..." @@ -1102,6 +1131,7 @@ __build_fbgemm_gpu_common_pre_steps () { else # Set to the default variant fbgemm_variant="gpu" + echo "[BUILD] Proceeding to build GPU variant (default)" # Check nvcc is visible (test_binpath "${env_name}" nvcc) || return 1 @@ -1247,7 +1277,7 @@ build_fbgemm_gpu_install () { fi # Run all the common FBGEMM-GPU build pre-steps (set up variables) - __build_fbgemm_gpu_common_pre_steps + __build_fbgemm_gpu_common_pre_steps || return 1 # Parallelism may need to be limited to prevent the build from being # canceled for going over ulimits @@ -1258,9 +1288,43 @@ build_fbgemm_gpu_install () { # Run checks on the built libraries (check_fbgemm_gpu_build "${fbgemm_variant}") || return 1 + echo "[INSTALL] Checking imports ..." + # Exit this directory to prevent import clashing, since there is an + # fbgemm_gpu/ subdirectory present + cd - || return 1 + (test_python_import "${env_name}" fbgemm_gpu) || return 1 + echo "[BUILD] FBGEMM-GPU build + install completed" } +build_fbgemm_gpu_docs () { + env_name="$1" + if [ "$env_name" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME" + echo "Example(s):" + echo " ${FUNCNAME[0]} build_env # Build the docs" + return 1 + else + echo "################################################################################" + echo "# Build FBGEMM-GPU Documentation" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + fi + + echo "[BUILD] Installing docs-build dependencies ..." + (exec_with_retries conda run -n "${env_name}" python -m pip install -r requirements.txt) || return 1 + + echo "[BUILD] Running Doxygen build ..." + (exec_with_retries conda run -n "${env_name}" doxygen Doxyfile.in) || return 1 + + echo "[BUILD] Building HTML pages ..." + (exec_with_retries conda run -n "${env_name}" make html) || return 1 + + echo "[INSTALL] FBGEMM-GPU documentation build completed" +} + install_fbgemm_gpu_package () { local env_name="$1" local package_name="$2" diff --git a/.github/workflows/fbgemm_docs.yml b/.github/workflows/fbgemm_docs.yml deleted file mode 100644 index 06e2045a03..0000000000 --- a/.github/workflows/fbgemm_docs.yml +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -name: FBGEMM Documentation -on: - push: - branches: - - main -jobs: - build_docs_job: - runs-on: linux.2xlarge - steps: - # Checkout the repository to the GitHub Actions runner - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - # Update references - # TODO: update the git submodule sync after we fixed the auto-sync part - - name: Git Sumbodule Update - run: | - git submodule init - git submodule update --remote --recursive - git log - - name: Update pip - run: | - sudo yum update -y - sudo yum -y install git python3-pip - sudo pip3 install --upgrade pip - - name: Setup conda - run: | - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh - bash ~/miniconda.sh -b -p $HOME/miniconda -u - - name: setup Path - run: | - echo "/home/ec2-user/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=/home/ec2-user/miniconda" >> $GITHUB_PATH - - name: create conda env - run: | - conda create --name build_binary python=3.9 - conda info - - name: check python version - run: | - conda run -n build_binary python --version - - name: Install gcc - shell: bash - run: | - sudo yum group install -y "Development Tools" - - name: Setup Path - run: | - echo /usr/local/bin >> $GITHUB_PATH - - name: Install PyTorch - shell: bash - run: | - conda run -n build_binary python -m pip install --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html - - name: Test PyTorch Installation - run: | - conda run -n build_binary python -c "import torch.distributed" - echo "torch.distributed succeeded" - - name: Install fbgemm_gpu nightly - run: | - cd ./fbgemm_gpu - conda run -n build_binary python -m pip install -r requirements.txt - conda run -n build_binary python setup.py install --cpu_only - - name: Test fbgemm_gpu installation - shell: bash - run: | - cd ./fbgemm_gpu/docs - conda run -n build_binary \ - python -c "import fbgemm_gpu" - - name: Install Doxygen - run: | - conda install -n build_binary -c conda-forge doxygen - which doxygen - - name: Build the docset - run: | - cd ./fbgemm_gpu/docs - conda run -n build_binary python -m pip install -r requirements.txt - conda run -n build_binary doxygen Doxyfile.in - conda run -n build_binary make html - cd .. - - name: Get output time - run: echo "The time was ${{ steps.build.outputs.time }}" - - name: Deploy - uses: JamesIves/github-pages-deploy-action@releases/v3 - with: - ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }} - BRANCH: gh-pages # The branch the action should deploy to. - FOLDER: fbgemm_gpu/docs/build/html # The folder the action should deploy. diff --git a/.github/workflows/fbgemm_gpu_ci.yml b/.github/workflows/fbgemm_gpu_ci.yml index b7dea4093a..646c9de168 100644 --- a/.github/workflows/fbgemm_gpu_ci.yml +++ b/.github/workflows/fbgemm_gpu_ci.yml @@ -200,7 +200,7 @@ jobs: - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - - name: Build and Install FBGEMM_GPU (CPU version) + - name: Build + Install FBGEMM_GPU (CPU version) run: . 
$PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_install $BUILD_ENV cpu - name: Test with PyTest diff --git a/.github/workflows/fbgemm_nightly_build_cpu.yml b/.github/workflows/fbgemm_gpu_cpu_nightly.yml similarity index 100% rename from .github/workflows/fbgemm_nightly_build_cpu.yml rename to .github/workflows/fbgemm_gpu_cpu_nightly.yml diff --git a/.github/workflows/fbgemm_release_build_cpu.yml b/.github/workflows/fbgemm_gpu_cpu_release.yml similarity index 100% rename from .github/workflows/fbgemm_release_build_cpu.yml rename to .github/workflows/fbgemm_gpu_cpu_release.yml diff --git a/.github/workflows/fbgemm_nightly_build.yml b/.github/workflows/fbgemm_gpu_cuda_nightly.yml similarity index 99% rename from .github/workflows/fbgemm_nightly_build.yml rename to .github/workflows/fbgemm_gpu_cuda_nightly.yml index 0d9257d554..7ccdbcbf3e 100644 --- a/.github/workflows/fbgemm_nightly_build.yml +++ b/.github/workflows/fbgemm_gpu_cuda_nightly.yml @@ -3,7 +3,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -name: FBGEMM_GPU Nightly Build +name: FBGEMM_GPU-CUDA Nightly Build on: # PR Trigger (enabled only for debugging) diff --git a/.github/workflows/fbgemm_release_build.yml b/.github/workflows/fbgemm_gpu_cuda_release.yml similarity index 99% rename from .github/workflows/fbgemm_release_build.yml rename to .github/workflows/fbgemm_gpu_cuda_release.yml index b909cec274..7516e6a021 100644 --- a/.github/workflows/fbgemm_release_build.yml +++ b/.github/workflows/fbgemm_gpu_cuda_release.yml @@ -3,7 +3,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -name: FBGEMM_GPU Release Build +name: FBGEMM_GPU-CUDA Release Build on: # PR Trigger (enabled only for debugging) diff --git a/.github/workflows/fbgemm_gpu_docs.yml b/.github/workflows/fbgemm_gpu_docs.yml new file mode 100644 index 0000000000..fb63995752 --- /dev/null +++ b/.github/workflows/fbgemm_gpu_docs.yml @@ -0,0 +1,89 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +name: FBGEMM_GPU Documentation + +on: + # PR Trigger + # + pull_request: + branches: + - main + + # Push Trigger (enable to catch errors coming out of multiple merges) + # + push: + branches: + - main + + # Manual Trigger (for testing only) + # + workflow_dispatch: + +jobs: + build-docs: + runs-on: linux.2xlarge + container: + image: amazonlinux:2023 + options: --user root + defaults: + run: + shell: bash + env: + PRELUDE: .github/scripts/setup_env.bash + BUILD_ENV: build_binary + strategy: + fail-fast: false + matrix: + python-version: [ "3.11" ] + + steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git rsync sudo tar wget which + + - name: Checkout the Repository + uses: actions/checkout@v3 + with: + submodules: true + + - name: Display System Info + run: . $PRELUDE; print_system_info + + - name: Display GPU Info + run: . $PRELUDE; print_gpu_info + + - name: Setup Miniconda + run: . $PRELUDE; setup_miniconda $HOME/miniconda + + - name: Create Conda Environment + run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} + + - name: Install C/C++ Compilers + run: . $PRELUDE; install_cxx_compiler $BUILD_ENV + + - name: Install Build Tools + run: . 
$PRELUDE; install_build_tools $BUILD_ENV + + - name: Install Documentation Tools + run: . $PRELUDE; install_docs_tools $BUILD_ENV + + - name: Install PyTorch-CPU Nightly + run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cpu + + - name: Prepare FBGEMM_GPU Build + run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV + + - name: Build + Install FBGEMM_GPU (CPU version) + run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_install $BUILD_ENV cpu + + - name: Build FBGEMM_GPU Documentation + run: . $PRELUDE; cd fbgemm_gpu/docs; build_fbgemm_gpu_docs $BUILD_ENV + + - name: Deploy FBGEMM_GPU Documentation + if: ${{ github.event_name != 'pull_request' }} + uses: JamesIves/github-pages-deploy-action@releases/v4 + with: + branch: gh-pages # The branch the action should deploy to + folder: fbgemm_gpu/docs/build/html # The folder the action should deploy From 7ed2a096af1cac33aeb16cadf7a367fdd5b85def Mon Sep 17 00:00:00 2001 From: Sarunya Pumma Date: Tue, 28 Mar 2023 19:54:47 -0700 Subject: [PATCH 24/34] Back out "Prune CPU/GPU TBE optimizer codegen" (#1675) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1675 Original commit changeset: 02413256b4a6 Original Phabricator Diff: D44326540 Reviewed By: q10, jianyuh Differential Revision: D44475251 fbshipit-source-id: 5be66944a833e03a2737fc6d1baaa5c351455b2c --- fbgemm_gpu/CMakeLists.txt | 38 ++- .../embedding_backward_code_generator.py | 109 +++------ ..._embedding_codegen_lookup_invoker.template | 224 +++++++++--------- 3 files changed, 156 insertions(+), 215 deletions(-) diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index b30bc1eab4..1fb8f397e0 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -103,27 +103,21 @@ endif() set(OPTIMIZERS adagrad + adam + approx_rowwise_adagrad + approx_rowwise_adagrad_with_weight_decay + approx_rowwise_adagrad_with_counter approx_sgd + lamb + lars_sgd + partial_rowwise_adam + partial_rowwise_lamb rowwise_adagrad + rowwise_adagrad_with_weight_decay rowwise_adagrad_with_counter rowwise_weighted_adagrad sgd) -set(CPU_ONLY_OPTIMIZERS - approx_rowwise_adagrad - approx_rowwise_adagrad_with_counter) - -set(GPU_ONLY_OPTIMIZERS - adam - lamb - lars_sgd - partial_rowwise_adam - partial_rowwise_lamb) - -set(CPU_OPTIMIZERS ${OPTIMIZERS} ${CPU_ONLY_OPTIMIZERS}) -set(GPU_OPTIMIZERS ${OPTIMIZERS} ${GPU_ONLY_OPTIMIZERS}) -set(ALL_OPTIMIZERS ${OPTIMIZERS} ${CPU_ONLY_OPTIMIZERS} ${GPU_ONLY_OPTIMIZERS}) - set(gen_gpu_source_files "gen_embedding_forward_dense_weighted_codegen_cuda.cu" "gen_embedding_forward_dense_unweighted_codegen_cuda.cu" @@ -143,16 +137,16 @@ set(gen_cpu_source_files set(gen_python_files ${CMAKE_BINARY_DIR}/__init__.py) -foreach(optimizer ${CPU_OPTIMIZERS}) +foreach(optimizer ${OPTIMIZERS}) + list(APPEND gen_gpu_host_source_files + "gen_embedding_backward_split_${optimizer}.cpp") + list(APPEND gen_cpu_source_files "gen_embedding_backward_split_${optimizer}_cpu.cpp") list(APPEND gen_cpu_source_files "gen_embedding_backward_${optimizer}_split_cpu.cpp") -endforeach() -foreach(optimizer ${GPU_OPTIMIZERS}) - list(APPEND gen_gpu_host_source_files - "gen_embedding_backward_split_${optimizer}.cpp") + list(APPEND gen_python_files "${CMAKE_BINARY_DIR}/lookup_${optimizer}.py") foreach(weight weighted unweighted) list(APPEND gen_gpu_source_files @@ -160,10 +154,6 @@ foreach(optimizer ${GPU_OPTIMIZERS}) endforeach() endforeach() -foreach(optimizer ${ALL_OPTIMIZERS}) - list(APPEND gen_python_files "${CMAKE_BINARY_DIR}/lookup_${optimizer}.py") 
-endforeach() - set(CMAKE_CODEGEN_DIR ${CMAKE_CURRENT_SOURCE_DIR}/codegen) set(codegen_dependencies diff --git a/fbgemm_gpu/codegen/embedding_backward_code_generator.py b/fbgemm_gpu/codegen/embedding_backward_code_generator.py index aa832947c3..fd69a22f6e 100644 --- a/fbgemm_gpu/codegen/embedding_backward_code_generator.py +++ b/fbgemm_gpu/codegen/embedding_backward_code_generator.py @@ -127,60 +127,53 @@ def int_arg(name: str, default: int = 0) -> str: def generate(**kwargs: Any) -> None: gen_args = kwargs["args"] + # Generates CUDA variants. kwargs["args"] = gen_args["cuda"] - if kwargs.get("has_gpu_support"): - # Generates CUDA variants. - template = env.get_template("embedding_backward_split_template.cu") - src_cu = template.render(weighted=False, **kwargs) - write( - f"gen_embedding_backward_{kwargs.get('optimizer')}_split_unweighted_cuda.cu", - src_cu, - ) - src_cu = template.render(weighted=True, **kwargs) - write( - f"gen_embedding_backward_{kwargs.get('optimizer')}_split_weighted_cuda.cu", - src_cu, - ) - if not kwargs.get("dense"): - template = env.get_template("embedding_backward_split_host_template.cpp") - src_cpp = template.render(**kwargs) - write( - f"gen_embedding_backward_split_{kwargs.get('optimizer')}.cpp", src_cpp - ) + template = env.get_template("embedding_backward_split_template.cu") + src_cu = template.render(weighted=False, **kwargs) + write( + f"gen_embedding_backward_{kwargs.get('optimizer')}_split_unweighted_cuda.cu", + src_cu, + ) + src_cu = template.render(weighted=True, **kwargs) + write( + f"gen_embedding_backward_{kwargs.get('optimizer')}_split_weighted_cuda.cu", + src_cu, + ) if not kwargs.get("dense"): + template = env.get_template("embedding_backward_split_host_template.cpp") + src_cpp = template.render(**kwargs) + write(f"gen_embedding_backward_split_{kwargs.get('optimizer')}.cpp", src_cpp) + # Generates Python invoker for CUDA + CPU template = env.get_template("split_embedding_codegen_lookup_invoker.template") src_py = template.render(is_fbcode=args.is_fbcode, **kwargs) write(f"lookup_{kwargs.get('optimizer')}.py", src_py) - if kwargs.get("has_cpu_support"): - # Generates CPU variants. - kwargs["args"] = gen_args["cpu"] + # Generates CPU variants. 
+ kwargs["args"] = gen_args["cpu"] - is_approx = "approx" in kwargs.get("optimizer") - template = ( - env.get_template("embedding_backward_split_cpu_approx_template.cpp") - if is_approx - else env.get_template("embedding_backward_split_cpu_template.cpp") - ) + is_approx = "approx" in kwargs.get("optimizer") + template = ( + env.get_template("embedding_backward_split_cpu_approx_template.cpp") + if is_approx + else env.get_template("embedding_backward_split_cpu_template.cpp") + ) + + src_cpp = template.render(**kwargs) + write( + f"gen_embedding_backward_{kwargs.get('optimizer')}_split_cpu.cpp", + src_cpp, + ) + if not kwargs.get("dense"): + template = env.get_template("embedding_backward_split_host_cpu_template.cpp") src_cpp = template.render(**kwargs) write( - f"gen_embedding_backward_{kwargs.get('optimizer')}_split_cpu.cpp", - src_cpp, + f"gen_embedding_backward_split_{kwargs.get('optimizer')}_cpu.cpp", src_cpp ) - if not kwargs.get("dense"): - template = env.get_template( - "embedding_backward_split_host_cpu_template.cpp" - ) - src_cpp = template.render(**kwargs) - write( - f"gen_embedding_backward_split_{kwargs.get('optimizer')}_cpu.cpp", - src_cpp, - ) - @dataclass class Args: @@ -376,8 +369,6 @@ def adagrad() -> None: split_precomputation="", split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - has_cpu_support=True, - has_gpu_support=True, ) @@ -499,8 +490,6 @@ def rowwise_adagrad() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - has_cpu_support=True, - has_gpu_support=True, ) approx_split_weight_update = """ @@ -523,8 +512,6 @@ def rowwise_adagrad() -> None: split_precomputation=split_precomputation, split_weight_update=approx_split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - has_cpu_support=True, - has_gpu_support=False, ) @@ -624,9 +611,6 @@ def rowwise_adagrad_with_weight_decay() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - # Disable both CPU and GPU support - has_cpu_support=False, - has_gpu_support=False, ) approx_split_weight_update = """ @@ -649,9 +633,6 @@ def rowwise_adagrad_with_weight_decay() -> None: split_precomputation=split_precomputation, split_weight_update=approx_split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - # Disable both CPU and GPU support - has_cpu_support=False, - has_gpu_support=False, ) @@ -790,8 +771,6 @@ def rowwise_adagrad_with_counter() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - has_cpu_support=True, - has_gpu_support=True, ) approx_split_weight_update = """ @@ -825,8 +804,6 @@ def rowwise_adagrad_with_counter() -> None: split_precomputation=split_precomputation, split_weight_update=approx_split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - has_cpu_support=True, - has_gpu_support=False, ) @@ -897,8 +874,6 @@ def rowwise_weighted_adagrad() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - has_cpu_support=True, - has_gpu_support=True, ) @@ -918,8 +893,6 @@ def sgd() -> None: split_precomputation="", split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - has_cpu_support=True, - has_gpu_support=True, ) approx_split_weight_update = """ 
@@ -935,8 +908,6 @@ def sgd() -> None: split_precomputation="", split_weight_update=approx_split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - has_cpu_support=True, - has_gpu_support=True, ) @@ -1007,8 +978,6 @@ def lamb() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - has_cpu_support=False, - has_gpu_support=True, ) @@ -1095,8 +1064,6 @@ def partial_rowwise_lamb() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - has_cpu_support=False, - has_gpu_support=True, ) @@ -1147,8 +1114,6 @@ def adam() -> None: split_precomputation="", split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - has_cpu_support=False, - has_gpu_support=True, ) @@ -1209,8 +1174,6 @@ def partial_rowwise_adam() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - has_cpu_support=False, - has_gpu_support=True, ) @@ -1269,8 +1232,6 @@ def lars_sgd() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - has_cpu_support=False, - has_gpu_support=True, ) @@ -1335,8 +1296,6 @@ def backward_dense() -> None: (FLOAT, "unused"), ] ), - has_cpu_support=True, - has_gpu_support=True, ) @@ -1364,7 +1323,7 @@ def emb_codegen( partial_rowwise_adam() partial_rowwise_lamb() rowwise_adagrad() - # rowwise_adagrad_with_weight_decay() # Disabled + rowwise_adagrad_with_weight_decay() rowwise_adagrad_with_counter() rowwise_weighted_adagrad() sgd() diff --git a/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template b/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template index 844f04782b..bd406d39fa 100644 --- a/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template +++ b/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template @@ -49,7 +49,6 @@ def invoke( max_counter: float, {% endif %} ) -> torch.Tensor: - {% if has_cpu_support %} if (common_args.host_weights.numel() > 0): return torch.ops.fbgemm.split_embedding_codegen_lookup_{{ optimizer }}_function_cpu( # common_args @@ -148,119 +147,112 @@ def invoke( max_counter=max_counter, {% endif %} ) - {% if not has_gpu_support %} else: - assert False, "{{ optimizer }} has only CPU support. host_weights.numel() must be greater than 0." 
- {% endif %} - {% endif %} - - {% if has_gpu_support %} - return torch.ops.fbgemm.split_embedding_codegen_lookup_{{ optimizer }}_function( - # common_args - {% if not dense %} - placeholder_autograd_tensor=common_args.placeholder_autograd_tensor, - {% endif %} - dev_weights=common_args.dev_weights, - uvm_weights=common_args.uvm_weights, - lxu_cache_weights=common_args.lxu_cache_weights, - weights_placements=common_args.weights_placements, - weights_offsets=common_args.weights_offsets, - D_offsets=common_args.D_offsets, - total_D=common_args.total_D, - max_D=common_args.max_D, - hash_size_cumsum=common_args.hash_size_cumsum, - total_hash_size_bits=common_args.total_hash_size_bits, - indices=common_args.indices, - offsets=common_args.offsets, - pooling_mode=common_args.pooling_mode, - indice_weights=common_args.indice_weights, - feature_requires_grad=common_args.feature_requires_grad, - lxu_cache_locations=common_args.lxu_cache_locations, - # optimizer_args - gradient_clipping = optimizer_args.gradient_clipping, - max_gradient=optimizer_args.max_gradient, - stochastic_rounding=optimizer_args.stochastic_rounding, - {% if "learning_rate" in args.split_function_arg_names %} - learning_rate=optimizer_args.learning_rate, - {% endif %} - {% if "eps" in args.split_function_arg_names %} - eps=optimizer_args.eps, - {% endif %} - {% if "beta1" in args.split_function_arg_names %} - beta1=optimizer_args.beta1, - {% endif %} - {% if "beta2" in args.split_function_arg_names %} - beta2=optimizer_args.beta2, - {% endif %} - {% if "weight_decay" in args.split_function_arg_names %} - weight_decay=optimizer_args.weight_decay, - {% endif %} - {% if "weight_decay_mode" in args.split_function_arg_names %} - weight_decay_mode=optimizer_args.weight_decay_mode, - {% endif %} - {% if "eta" in args.split_function_arg_names %} - eta=optimizer_args.eta, - {% endif %} - {% if "momentum" in args.split_function_arg_names %} - momentum=optimizer_args.momentum, - {% endif %} - {% if "counter_halflife" in args.split_function_arg_names %} - counter_halflife=optimizer_args.counter_halflife, - {% endif %} - {% if "adjustment_iter" in args.split_function_arg_names %} - adjustment_iter=optimizer_args.adjustment_iter, - {% endif %} - {% if "adjustment_ub" in args.split_function_arg_names %} - adjustment_ub=optimizer_args.adjustment_ub, - {% endif %} - {% if "learning_rate_mode" in args.split_function_arg_names %} - learning_rate_mode=optimizer_args.learning_rate_mode, - {% endif %} - {% if "grad_sum_decay" in args.split_function_arg_names %} - grad_sum_decay=optimizer_args.grad_sum_decay, - {% endif %} - {% if "tail_id_threshold" in args.split_function_arg_names %} - tail_id_threshold=optimizer_args.tail_id_threshold, - {% endif %} - {% if "is_tail_id_thresh_ratio" in args.split_function_arg_names %} - is_tail_id_thresh_ratio=optimizer_args.is_tail_id_thresh_ratio, - {% endif %} - # momentum1 - {% if "momentum1_dev" in args.split_function_arg_names %} - momentum1_dev=momentum1.dev, - momentum1_uvm=momentum1.uvm, - momentum1_offsets=momentum1.offsets, - momentum1_placements=momentum1.placements, - {% endif %} - # momentum2 - {% if "momentum2_dev" in args.split_function_arg_names %} - momentum2_dev=momentum2.dev, - momentum2_uvm=momentum2.uvm, - momentum2_offsets=momentum2.offsets, - momentum2_placements=momentum2.placements, - {% endif %} - # prev_iter - {% if "prev_iter_dev" in args.split_function_arg_names %} - prev_iter_dev=prev_iter.dev, - prev_iter_uvm=prev_iter.uvm, - prev_iter_offsets=prev_iter.offsets, - 
prev_iter_placements=prev_iter.placements, - {% endif %} - # row_counter - {% if "row_counter_dev" in args.split_function_arg_names %} - row_counter_dev=row_counter.dev, - row_counter_uvm=row_counter.uvm, - row_counter_offsets=row_counter.offsets, - row_counter_placements=row_counter.placements, - {% endif %} - # iter - {% if "iter" in args.split_function_arg_names %} - iter=iter, - {% endif %} - # max counter - {% if "max_counter" in args.split_function_arg_names %} - max_counter=max_counter, - {% endif %} - output_dtype=common_args.output_dtype, - ) - {% endif %} + return torch.ops.fbgemm.split_embedding_codegen_lookup_{{ optimizer }}_function( + # common_args + {% if not dense %} + placeholder_autograd_tensor=common_args.placeholder_autograd_tensor, + {% endif %} + dev_weights=common_args.dev_weights, + uvm_weights=common_args.uvm_weights, + lxu_cache_weights=common_args.lxu_cache_weights, + weights_placements=common_args.weights_placements, + weights_offsets=common_args.weights_offsets, + D_offsets=common_args.D_offsets, + total_D=common_args.total_D, + max_D=common_args.max_D, + hash_size_cumsum=common_args.hash_size_cumsum, + total_hash_size_bits=common_args.total_hash_size_bits, + indices=common_args.indices, + offsets=common_args.offsets, + pooling_mode=common_args.pooling_mode, + indice_weights=common_args.indice_weights, + feature_requires_grad=common_args.feature_requires_grad, + lxu_cache_locations=common_args.lxu_cache_locations, + # optimizer_args + gradient_clipping = optimizer_args.gradient_clipping, + max_gradient=optimizer_args.max_gradient, + stochastic_rounding=optimizer_args.stochastic_rounding, + {% if "learning_rate" in args.split_function_arg_names %} + learning_rate=optimizer_args.learning_rate, + {% endif %} + {% if "eps" in args.split_function_arg_names %} + eps=optimizer_args.eps, + {% endif %} + {% if "beta1" in args.split_function_arg_names %} + beta1=optimizer_args.beta1, + {% endif %} + {% if "beta2" in args.split_function_arg_names %} + beta2=optimizer_args.beta2, + {% endif %} + {% if "weight_decay" in args.split_function_arg_names %} + weight_decay=optimizer_args.weight_decay, + {% endif %} + {% if "weight_decay_mode" in args.split_function_arg_names %} + weight_decay_mode=optimizer_args.weight_decay_mode, + {% endif %} + {% if "eta" in args.split_function_arg_names %} + eta=optimizer_args.eta, + {% endif %} + {% if "momentum" in args.split_function_arg_names %} + momentum=optimizer_args.momentum, + {% endif %} + {% if "counter_halflife" in args.split_function_arg_names %} + counter_halflife=optimizer_args.counter_halflife, + {% endif %} + {% if "adjustment_iter" in args.split_function_arg_names %} + adjustment_iter=optimizer_args.adjustment_iter, + {% endif %} + {% if "adjustment_ub" in args.split_function_arg_names %} + adjustment_ub=optimizer_args.adjustment_ub, + {% endif %} + {% if "learning_rate_mode" in args.split_function_arg_names %} + learning_rate_mode=optimizer_args.learning_rate_mode, + {% endif %} + {% if "grad_sum_decay" in args.split_function_arg_names %} + grad_sum_decay=optimizer_args.grad_sum_decay, + {% endif %} + {% if "tail_id_threshold" in args.split_function_arg_names %} + tail_id_threshold=optimizer_args.tail_id_threshold, + {% endif %} + {% if "is_tail_id_thresh_ratio" in args.split_function_arg_names %} + is_tail_id_thresh_ratio=optimizer_args.is_tail_id_thresh_ratio, + {% endif %} + # momentum1 + {% if "momentum1_dev" in args.split_function_arg_names %} + momentum1_dev=momentum1.dev, + momentum1_uvm=momentum1.uvm, + 
momentum1_offsets=momentum1.offsets, + momentum1_placements=momentum1.placements, + {% endif %} + # momentum2 + {% if "momentum2_dev" in args.split_function_arg_names %} + momentum2_dev=momentum2.dev, + momentum2_uvm=momentum2.uvm, + momentum2_offsets=momentum2.offsets, + momentum2_placements=momentum2.placements, + {% endif %} + # prev_iter + {% if "prev_iter_dev" in args.split_function_arg_names %} + prev_iter_dev=prev_iter.dev, + prev_iter_uvm=prev_iter.uvm, + prev_iter_offsets=prev_iter.offsets, + prev_iter_placements=prev_iter.placements, + {% endif %} + # row_counter + {% if "row_counter_dev" in args.split_function_arg_names %} + row_counter_dev=row_counter.dev, + row_counter_uvm=row_counter.uvm, + row_counter_offsets=row_counter.offsets, + row_counter_placements=row_counter.placements, + {% endif %} + # iter + {% if "iter" in args.split_function_arg_names %} + iter=iter, + {% endif %} + # max counter + {% if "max_counter" in args.split_function_arg_names %} + max_counter=max_counter, + {% endif %} + output_dtype=common_args.output_dtype, + ) From a49926789619fbb864ecf49f4e3a9e81315149c3 Mon Sep 17 00:00:00 2001 From: Sarunya Pumma Date: Tue, 28 Mar 2023 22:53:26 -0700 Subject: [PATCH 25/34] Prepare bounds_check_indices for VBE (#1633) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1633 Prepare `bounds_check_indices` for variable batch size TBE (VBE). - Update the frontend API to accept VBE args - Update the backend logic to process VBE data Reviewed By: jianyuh Differential Revision: D43253703 fbshipit-source-id: 2870f0c41a96265650281a9b6362d4e6dc48009b --- fbgemm_gpu/codegen/embedding_bounds_check.cu | 146 +++++++++++------- .../codegen/embedding_bounds_check_host.cpp | 4 +- .../embedding_bounds_check_host_cpu.cpp | 11 +- 3 files changed, 103 insertions(+), 58 deletions(-) diff --git a/fbgemm_gpu/codegen/embedding_bounds_check.cu b/fbgemm_gpu/codegen/embedding_bounds_check.cu index 4d77d2b508..bc18695ece 100644 --- a/fbgemm_gpu/codegen/embedding_bounds_check.cu +++ b/fbgemm_gpu/codegen/embedding_bounds_check.cu @@ -23,31 +23,52 @@ __device__ void adjust_offset_kernel( *offset_acc_end = indices_end; } -template +template __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel( const at::PackedTensorAccessor32 rows_per_table, at::PackedTensorAccessor32 indices, at::PackedTensorAccessor32 offsets, + const int32_t* const vbe_metadata, const int64_t bounds_check_mode_, at::PackedTensorAccessor32 warning, FixedDivisor fd) { int32_t T = rows_per_table.size(0); - int32_t B = (offsets.size(0) - 1) / T; - int32_t b_t = blockIdx.x * blockDim.y + threadIdx.y; - int32_t b; // = b_t % B; - int32_t t; // = b_t / B; - fd.DivMod(b_t, &t, &b); - if (t >= T) { + int32_t b; + int32_t t; + int32_t B = 0; + int32_t total_B = offsets.size(0) - 1; + + if (!vbe && b_t >= total_B) { return; } - auto bounds_check_mode = static_cast(bounds_check_mode_); - auto num_rows = rows_per_table[t]; - auto indices_start = offsets[t * B + b]; - auto indices_end = offsets[t * B + b + 1]; - index_t num_indices = indices.size(0); + fd.DivMod(b_t, &t, &b); + + if (vbe) { + // Check if t is valid + if (t >= T) { + return; + } + const auto B_start = vbe_metadata[t]; + B = vbe_metadata[t + 1] - B_start; + // Check if b is valid + if (b >= B) { + return; + } + // Update b_t value + b_t = B_start + b; + } else { + B = total_B / T; + } + + const auto bounds_check_mode = + static_cast(bounds_check_mode_); + const auto num_rows = rows_per_table[t]; + auto indices_start = offsets[b_t]; + 
auto indices_end = offsets[b_t + 1]; + const index_t num_indices = indices.size(0); if (bounds_check_mode == BoundsCheckMode::FATAL) { CUDA_KERNEL_ASSERT(indices_start >= 0); @@ -58,12 +79,13 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel( indices_end > num_indices) { if (gpuAtomicIncrement(&warning[0]) == 0) { printf( - "EmbeddingBoundsCheck: (at least one) Out of bounds access for " - "batch: %lld, table: %lld, indices_start: %lld, indices_end: %lld," + "EmbeddingBoundsCheck (VBE %s): (at least one) Out of bounds access for " + "batch: %d, table: %d, indices_start: %lld, indices_end: %lld," " num_indices: %lld. Setting indices_start and indices_end within " "the range.\n", - static_cast(b), - static_cast(t), + vbe ? "true" : "false", + b, + t, static_cast(indices_start), static_cast(indices_end), static_cast(num_indices)); @@ -72,16 +94,16 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel( indices_start, indices_end, num_indices, - &offsets[t * B + b], - &offsets[t * B + b + 1]); + &offsets[b_t], + &offsets[b_t + 1]); } } else if (bounds_check_mode == BoundsCheckMode::IGNORE) { adjust_offset_kernel( indices_start, indices_end, num_indices, - &offsets[t * B + b], - &offsets[t * B + b + 1]); + &offsets[b_t], + &offsets[b_t + 1]); } const auto L = indices_end - indices_start; @@ -100,9 +122,10 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel( if (idx < 0 || idx >= num_rows) { if (gpuAtomicIncrement(&warning[0]) == 0) { printf( - "EmbeddingBoundsCheck: (at least one) Out of bounds access for batch: %lld, table: %lld, bag element: %lld, idx: %lld, num_rows: %lld, indices_start: %lld, indices_end: %lld, T: %d, B: %d, b_t: %d. Setting idx to zero.\n", - static_cast(b), - static_cast(t), + "EmbeddingBoundsCheck (VBE %s): (at least one) Out of bounds access for batch: %d, table: %d, bag element: %lld, idx: %lld, num_rows: %lld, indices_start: %lld, indices_end: %lld, T: %d, B: %d, b_t: %d. Setting idx to zero.\n", + vbe ? "true" : "false", + b, + t, static_cast(i), static_cast(idx), num_rows, @@ -122,25 +145,27 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel( } if (bounds_check_mode == BoundsCheckMode::FATAL) { - CUDA_KERNEL_ASSERT(num_indices == offsets[B * T]); + CUDA_KERNEL_ASSERT(num_indices == offsets[total_B]); } else if (bounds_check_mode == BoundsCheckMode::WARNING) { - if (num_indices != offsets[B * T]) { + if (num_indices != offsets[total_B]) { if (gpuAtomicIncrement(&warning[0]) == 0) { printf( - "EmbeddingBoundsCheck: the last element in offsets is incorrect for " - "total batch size B: %lld, total table num T: %lld, " + "EmbeddingBoundsCheck (VBE %s): the last element in offsets is incorrect for " + "total batch size %s: %d, total table num T: %d, " " last element in offsets: %lld, indices size: %lld. " " Setting the last element in offsets to be indices size.\n", - static_cast(B), - static_cast(T), - static_cast(offsets[B * T]), + vbe ? "true" : "false", + vbe ? "total_B" : "B", + vbe ? 
total_B : B, + T, + static_cast(offsets[total_B]), static_cast(num_indices)); } - offsets[B * T] = num_indices; + offsets[total_B] = num_indices; } } else if (bounds_check_mode == BoundsCheckMode::IGNORE) { - if (num_indices != offsets[B * T]) { - offsets[B * T] = num_indices; + if (num_indices != offsets[total_B]) { + offsets[total_B] = num_indices; } } } @@ -151,19 +176,23 @@ void bounds_check_indices_cuda( Tensor& offsets, int64_t bounds_check_mode_, Tensor& warning, - c10::optional weights) { + const c10::optional& weights, + const c10::optional& vbe_metadata, + const int64_t max_B) { TENSOR_ON_CUDA_GPU(rows_per_table); TENSOR_ON_CUDA_GPU(indices); TENSOR_ON_CUDA_GPU(offsets); TENSOR_ON_CUDA_GPU(warning); TENSOR_EMPTY_OR_ON_CUDA_GPU(weights); + TENSOR_EMPTY_OR_ON_CUDA_GPU(vbe_metadata); at::cuda::OptionalCUDAGuard device_guard; device_guard.set_index(rows_per_table.get_device()); const int32_t T = rows_per_table.size(0); - const int32_t B = (offsets.size(0) - 1) / T; - if (B == 0 || T == 0) { + const int32_t total_B = offsets.size(0) - 1; + const int32_t B = (total_B) / T; + if (total_B == 0 || T == 0) { return; } const auto bounds_check_mode = @@ -172,12 +201,17 @@ void bounds_check_indices_cuda( warning.zero_(); } const int64_t num_indices = indices.size(0); + const auto vbe = vbe_metadata.has_value(); - TORCH_CHECK( - offsets.size(0) == B * T + 1, - "offsets size " + std::to_string(offsets.size(0)) + - " is not equal to B (" + std::to_string(B) + ") * T (" + - std::to_string(T) + ") + 1"); + if (vbe) { + TORCH_CHECK(max_B >= 0); + } else { + TORCH_CHECK( + offsets.size(0) == B * T + 1, + "offsets size " + std::to_string(offsets.size(0)) + + " is not equal to B (" + std::to_string(B) + ") * T (" + + std::to_string(T) + ") + 1"); + } if (weights.has_value()) { TORCH_CHECK( weights.value().size(0) == num_indices, @@ -186,20 +220,24 @@ void bounds_check_indices_cuda( } constexpr size_t kNumThreads = 256; + const auto max_B_ = vbe ? max_B : B; AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "bounds_check_indices", [&] { - bounds_check_indices_kernel - <<>>( - rows_per_table - .packed_accessor32(), - indices.packed_accessor32(), - offsets.packed_accessor32(), - bounds_check_mode_, - warning.packed_accessor32(), - FixedDivisor(B)); + const auto bounds_check_kernel = + (vbe ? bounds_check_indices_kernel + : bounds_check_indices_kernel); + bounds_check_kernel<<< + div_round_up(max_B_ * T, kNumThreads / fbgemm_gpu::kWarpSize), + dim3(fbgemm_gpu::kWarpSize, kNumThreads / fbgemm_gpu::kWarpSize), + 0, + at::cuda::getCurrentCUDAStream()>>>( + rows_per_table.packed_accessor32(), + indices.packed_accessor32(), + offsets.packed_accessor32(), + vbe ? vbe_metadata.value().data_ptr() : nullptr, + bounds_check_mode_, + warning.packed_accessor32(), + FixedDivisor(max_B_)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); - C10_CUDA_KERNEL_LAUNCH_CHECK(); } diff --git a/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp b/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp index 84575a3361..87e3cd7521 100644 --- a/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp +++ b/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp @@ -23,7 +23,9 @@ void bounds_check_indices_cuda( Tensor& offsets, int64_t bounds_check_mode, Tensor& warning, - c10::optional weights); + const c10::optional& weights, + const c10::optional& vbe_metadata, + const int64_t max_B); // Deprecated for fb namespace! Please use fbgemm namespace instead! 
TORCH_LIBRARY_FRAGMENT(fb, m) { diff --git a/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp b/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp index a2dd19a75e..a33e02e164 100644 --- a/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp +++ b/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp @@ -42,7 +42,12 @@ void bounds_check_indices_cpu( Tensor& offsets, int64_t bounds_check_mode_, Tensor& warning, - c10::optional weights) { + const c10::optional& weights, + const c10::optional& vbe_metadata, + const int64_t /*max_B*/) { + TORCH_CHECK( + !vbe_metadata.has_value(), + "bounds_check_indices on CPU does not support variable length (batch size)"); auto bounds_check_mode = static_cast(bounds_check_mode_); if (bounds_check_mode == BoundsCheckMode::WARNING) { warning.zero_(); @@ -163,7 +168,7 @@ TORCH_LIBRARY_FRAGMENT(fb, m) { // The (a!) tells PyTorch this is an impure operation and so cannot be CSE'd // or DCE'd, etc. m.def( - "bounds_check_indices(Tensor rows_per_table, Tensor(a!) indices, Tensor(b!) offsets, int bounds_check_mode, Tensor(c!) warning, Tensor(d!)? weights=None) -> ()"); + "bounds_check_indices(Tensor rows_per_table, Tensor(a!) indices, Tensor(b!) offsets, int bounds_check_mode, Tensor(c!) warning, Tensor(d!)? weights=None, Tensor? vbe_metadata=None, int max_B=-1) -> ()"); DISPATCH_TO_CPU("bounds_check_indices", bounds_check_indices_cpu); } @@ -171,6 +176,6 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { // The (a!) tells PyTorch this is an impure operation and so cannot be CSE'd // or DCE'd, etc. m.def( - "bounds_check_indices(Tensor rows_per_table, Tensor(a!) indices, Tensor(b!) offsets, int bounds_check_mode, Tensor(c!) warning, Tensor(d!)? weights=None) -> ()"); + "bounds_check_indices(Tensor rows_per_table, Tensor(a!) indices, Tensor(b!) offsets, int bounds_check_mode, Tensor(c!) warning, Tensor(d!)? weights=None, Tensor? vbe_metadata=None, int max_B=-1) -> ()"); DISPATCH_TO_CPU("bounds_check_indices", bounds_check_indices_cpu); } From c2d6c5e0d16425b77ba15106fdd0be0f47878cac Mon Sep 17 00:00:00 2001 From: Yue Dong Date: Wed, 29 Mar 2023 05:29:26 -0700 Subject: [PATCH 26/34] Move pruning/index_remapping support to embedding inplace update files (#1667) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1667 As title. This diff moves pruning/index_remapping support to embedding inplace update files. 
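For reference, the remapping semantics that move with this diff: each table owns a slice of the concatenated `index_remappings` buffer, delimited by `index_remappings_offsets`, and a table whose slice is empty is treated as unpruned, passing row indices through unchanged. A minimal Python sketch of the lookup (illustrative only; it mirrors the CPU kernel relocated below and is not a drop-in implementation):

    import torch

    def pruned_array_lookup_from_row_idx_ref(
        update_row_indices: torch.Tensor,        # row index of every new row
        update_table_indices: torch.Tensor,      # table index of every new row
        index_remappings: torch.Tensor,          # concatenated per-table remappings
        index_remappings_offsets: torch.Tensor,  # slice boundaries, length T + 1
    ) -> torch.Tensor:
        dense_indices = torch.empty_like(update_row_indices)
        for idx in range(update_row_indices.numel()):
            t = int(update_table_indices[idx])
            row = int(update_row_indices[idx])
            start = int(index_remappings_offsets[t])
            end = int(index_remappings_offsets[t + 1])
            # Non-empty slice: the table is pruned, so remap the row index.
            # Empty slice: the table is unpruned, so pass the index through.
            dense_indices[idx] = index_remappings[start + row] if end > start else row
        return dense_indices
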
Reviewed By: jianyuh Differential Revision: D44409419 fbshipit-source-id: 93fc91d83502eb95cb0feca2a8a03b003c336078 --- ...bedding_forward_quantized_cpu_template.cpp | 39 -------- .../embedding_forward_quantized_host.cpp | 10 -- .../embedding_forward_quantized_host_cpu.cpp | 13 --- ...edding_forward_quantized_split_template.cu | 77 --------------- .../fbgemm_gpu/embedding_inplace_update.h | 24 +++++ fbgemm_gpu/src/embedding_inplace_update.cu | 94 +++++++++++++++++++ .../src/embedding_inplace_update_cpu.cpp | 58 ++++++++++++ .../src/embedding_inplace_update_gpu.cpp | 6 ++ 8 files changed, 182 insertions(+), 139 deletions(-) diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp b/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp index 9caaacbfb8..829249b297 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp @@ -534,44 +534,5 @@ Tensor pruned_array_lookup_cpu( return dense_indices; } -Tensor pruned_array_lookup_from_row_idx_cpu( - Tensor update_row_indices, - Tensor update_table_indices, - Tensor index_remappings, - Tensor index_remappings_offsets) { - TENSOR_ON_CPU(update_row_indices); - TENSOR_ON_CPU(update_table_indices); - TENSOR_ON_CPU(index_remappings); - TENSOR_ON_CPU(index_remappings_offsets); - - int32_t T = index_remappings_offsets.size(0) - 1; - auto dense_indices = empty_like(update_row_indices); - const auto num_indices = update_row_indices.numel(); - - AT_DISPATCH_INDEX_TYPES( - update_row_indices.scalar_type(), "pruned_array_lookup_from_row_idx_cpu_kernel", [&] { - const auto update_row_indices_acc = update_row_indices.accessor(); - auto dense_indices_acc = dense_indices.accessor(); - const auto update_table_indices_acc = update_table_indices.accessor(); - - const auto index_remappings_acc = index_remappings.accessor(); - const auto index_remappings_offsets_acc = index_remappings_offsets.accessor(); - - for (int64_t idx = 0; idx < num_indices; idx++) { - const int table_idx = update_table_indices_acc[idx]; - const auto row_idx = update_row_indices_acc[idx]; - int64_t index_remappings_start = index_remappings_offsets_acc[table_idx]; - int64_t index_remappings_end = index_remappings_offsets_acc[table_idx + 1]; - int64_t capacity = index_remappings_end - index_remappings_start; - if (capacity > 0) { - dense_indices_acc[idx] = index_remappings_acc[index_remappings_start + row_idx]; - } else { - dense_indices_acc[idx] = row_idx; - } - } - }); - return dense_indices; -} - {% endif %} // clang-format on diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp b/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp index 43a182b6b1..01c054f818 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp @@ -558,13 +558,6 @@ Tensor pruned_array_lookup_cuda( Tensor index_remappings, Tensor index_remappings_offsets); -///@ingroup embedding-cuda -Tensor pruned_array_lookup_from_row_idx_cuda( - Tensor update_row_indices, - Tensor update_table_indices, - Tensor index_remappings, - Tensor index_remappings_offsets); - TORCH_LIBRARY_FRAGMENT(fbgemm, m) { DISPATCH_TO_CUDA( "int_nbit_split_embedding_codegen_lookup_function", @@ -576,7 +569,4 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { "pruned_hashmap_lookup", pruned_hashmap_lookup_unweighted_cuda); DISPATCH_TO_CUDA("pruned_array_lookup", pruned_array_lookup_cuda); - DISPATCH_TO_CUDA( - "pruned_array_lookup_from_row_idx", - 
pruned_array_lookup_from_row_idx_cuda); } diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_host_cpu.cpp b/fbgemm_gpu/codegen/embedding_forward_quantized_host_cpu.cpp index 93db44ac76..a43671f880 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_host_cpu.cpp +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_host_cpu.cpp @@ -240,13 +240,6 @@ Tensor pruned_array_lookup_cpu( Tensor index_remappings, Tensor index_remappings_offsets); -///@ingroup embedding-cpu -Tensor pruned_array_lookup_from_row_idx_cpu( - Tensor update_row_indices, - Tensor update_table_indices, - Tensor index_remappings, - Tensor index_remappings_offsets); - TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "int_nbit_split_embedding_codegen_lookup_function(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int total_D, int max_int2_D, int max_int4_D, int max_int8_D, int max_float16_D, int max_float32_D, Tensor indices, Tensor offsets, int pooling_mode, Tensor? indice_weights, int output_dtype=1, Tensor? lxu_cache_weights=None, Tensor? lxu_cache_locations=None, int? row_alignment = None, int? max_float8_D=0, int? fp8_exponent_bits=-1, int? fp8_exponent_bias=-1) -> Tensor"); @@ -278,12 +271,6 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "pruned_array_lookup(Tensor indices, Tensor offsets, Tensor index_remappings, Tensor index_remappings_offsets) -> Tensor"); DISPATCH_TO_CPU("pruned_array_lookup", pruned_array_lookup_cpu); - - // GPU version of array lookup. - m.def( - "pruned_array_lookup_from_row_idx(Tensor update_row_indices, Tensor update_table_indices, Tensor index_remappings, Tensor index_remappings_offsets) -> Tensor"); - DISPATCH_TO_CPU( - "pruned_array_lookup_from_row_idx", pruned_array_lookup_from_row_idx_cpu); } class PrunedMapCPU : public torch::jit::CustomClassHolder { diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu b/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu index 4b4345f1cc..e0a2f04ee8 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu @@ -552,36 +552,6 @@ __global__ __launch_bounds__(kMaxThreads) void int_nbit_split_embedding_codegen_ } {% endif %} -{% if not weighted %} -template -__global__ __launch_bounds__(kMaxThreads) void int_nbit_split_embedding_codegen_forward_pruned_array_lookup_from_row_idx_kernel( - const at::PackedTensorAccessor32 update_row_indices, - const at::PackedTensorAccessor32 update_table_indices, - const at::PackedTensorAccessor32 index_remappings, - const at::PackedTensorAccessor32 index_remappings_offsets, - at::PackedTensorAccessor32 dense_indices) { - - const int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= update_row_indices.size(0)) { - return; - } - const int table_idx = update_table_indices[idx]; - const auto row_idx = update_row_indices[idx]; - - const int64_t index_remappings_start = index_remappings_offsets[table_idx]; - const int64_t index_remappings_end = index_remappings_offsets[table_idx + 1]; - const int64_t capacity = index_remappings_end - index_remappings_start; - - if (capacity > 0) { - dense_indices[idx] = index_remappings[index_remappings_start + row_idx]; - } else { - dense_indices[idx] = row_idx; - } -} -{% endif %} - - - } {% for nobag in [True, False] %} @@ -1107,53 +1077,6 @@ Tensor pruned_array_lookup_cuda( C10_CUDA_KERNEL_LAUNCH_CHECK(); return dense_indices; } - -Tensor 
pruned_array_lookup_from_row_idx_cuda( - Tensor update_row_indices, - Tensor update_table_indices, - Tensor index_remappings, - Tensor index_remappings_offsets) { - - TENSOR_ON_CUDA_GPU(update_row_indices); - TENSOR_ON_CUDA_GPU(update_table_indices); - TENSOR_ON_CUDA_GPU(index_remappings); - TENSOR_ON_CUDA_GPU(index_remappings_offsets); - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(update_table_indices.get_device()); - auto dense_indices = at::empty_like(update_row_indices); - const int32_t T = index_remappings_offsets.size(0) - 1; - - const auto num_indices = update_row_indices.numel(); - if (num_indices == 0) { - return dense_indices; - } - - TORCH_CHECK(index_remappings.size(0) < std::numeric_limits::max()); - TORCH_CHECK(update_row_indices.dim() == 1, "Tensor dim: ", update_row_indices.dim()); - TORCH_CHECK(update_table_indices.dim() == 1, "Tensor dim: ", update_table_indices.dim()); - TORCH_CHECK(index_remappings.dim() == 1, "Tensor dim: ", index_remappings.dim()); - TORCH_CHECK(index_remappings_offsets.dim() == 1, "Tensor dim: ", index_remappings_offsets.dim()); - TORCH_CHECK(dense_indices.dim() == 1, "Tensor dim: ", dense_indices.dim()); - constexpr size_t kForwardMaxThreads = 256; - - AT_DISPATCH_INDEX_TYPES( - update_row_indices.scalar_type(), "embedding_inplace_update_kernel", [&] { - nbit::int_nbit_split_embedding_codegen_forward_pruned_array_lookup_from_row_idx_kernel<<< - nbit::div_round_up(num_indices, kForwardMaxThreads), - kForwardMaxThreads, - 0, - at::cuda::getCurrentCUDAStream()>>>( - update_row_indices.packed_accessor32(), - update_table_indices.packed_accessor32(), - index_remappings.packed_accessor32(), - index_remappings_offsets.packed_accessor32(), - dense_indices.packed_accessor32() - ); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); - return dense_indices; -} {% endif %} // clang-format on diff --git a/fbgemm_gpu/include/fbgemm_gpu/embedding_inplace_update.h b/fbgemm_gpu/include/fbgemm_gpu/embedding_inplace_update.h index 10670b48d4..cfa457d04b 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/embedding_inplace_update.h +++ b/fbgemm_gpu/include/fbgemm_gpu/embedding_inplace_update.h @@ -75,4 +75,28 @@ void embedding_inplace_update_cpu( c10::nullopt // Not used, to match cache interface for CUDA op ); +/** + * Index remapping function that returns the remapped indices. + * + * Args: + * update_row_indices: row indices for every new row + * update_table_indices: table indices for every new row + * index_remappings: concated index remapping for every embedding table + * index_remappings_offsets: offset for each embedding table + * + * Returns: + * remapped indices for each new row. 
+ */ +Tensor pruned_array_lookup_from_row_idx_cuda( + const Tensor& update_row_indices, + const Tensor& update_table_indices, + const Tensor& index_remappings, + const Tensor& index_remappings_offsets); + +Tensor pruned_array_lookup_from_row_idx_cpu( + const Tensor& update_row_indices, + const Tensor& update_table_indices, + const Tensor& index_remappings, + const Tensor& index_remappings_offsets); + } // namespace fbgemm_gpu diff --git a/fbgemm_gpu/src/embedding_inplace_update.cu b/fbgemm_gpu/src/embedding_inplace_update.cu index 1d0e394919..f301576a49 100644 --- a/fbgemm_gpu/src/embedding_inplace_update.cu +++ b/fbgemm_gpu/src/embedding_inplace_update.cu @@ -186,4 +186,98 @@ void embedding_inplace_update_cuda( }); } +template +__global__ +__launch_bounds__(kMaxThreads) void pruned_array_lookup_from_row_idx_kernel( + const at::PackedTensorAccessor32 + update_row_indices, + const at::PackedTensorAccessor32 + update_table_indices, + const at::PackedTensorAccessor32 + index_remappings, + const at::PackedTensorAccessor32 + index_remappings_offsets, + at::PackedTensorAccessor32 + dense_indices) { + const int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= update_row_indices.size(0)) { + return; + } + const auto row_idx = update_row_indices[idx]; + if (idx >= update_table_indices.size(0)) { + return; + } + const int table_idx = update_table_indices[idx]; + + const int64_t index_remappings_start = index_remappings_offsets[table_idx]; + const int64_t index_remappings_end = index_remappings_offsets[table_idx + 1]; + const int64_t capacity = index_remappings_end - index_remappings_start; + + if (capacity > 0) { + dense_indices[idx] = index_remappings[index_remappings_start + row_idx]; + } else { + dense_indices[idx] = row_idx; + } +} + +Tensor pruned_array_lookup_from_row_idx_cuda( + const Tensor& update_row_indices, + const Tensor& update_table_indices, + const Tensor& index_remappings, + const Tensor& index_remappings_offsets) { + TENSOR_ON_CUDA_GPU(update_row_indices); + TENSOR_ON_CUDA_GPU(update_table_indices); + TENSOR_ON_CUDA_GPU(index_remappings); + TENSOR_ON_CUDA_GPU(index_remappings_offsets); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(update_table_indices.get_device()); + auto dense_indices = at::empty_like(update_row_indices); + const int32_t T = index_remappings_offsets.size(0) - 1; + + const auto num_indices = update_row_indices.numel(); + if (num_indices == 0) { + return dense_indices; + } + + TORCH_CHECK(index_remappings.size(0) < std::numeric_limits::max()); + TORCH_CHECK( + update_row_indices.dim() == 1, "Tensor dim: ", update_row_indices.dim()); + TORCH_CHECK( + update_table_indices.dim() == 1, + "Tensor dim: ", + update_table_indices.dim()); + TORCH_CHECK( + index_remappings.dim() == 1, "Tensor dim: ", index_remappings.dim()); + TORCH_CHECK( + index_remappings_offsets.dim() == 1, + "Tensor dim: ", + index_remappings_offsets.dim()); + TORCH_CHECK(dense_indices.dim() == 1, "Tensor dim: ", dense_indices.dim()); + constexpr size_t kForwardMaxThreads = 256; + + AT_DISPATCH_INDEX_TYPES( + update_row_indices.scalar_type(), + "pruned_array_lookup_from_row_idx_kernel", + [&] { + pruned_array_lookup_from_row_idx_kernel<<< + nbit::div_round_up(num_indices, kForwardMaxThreads), + kForwardMaxThreads, + 0, + at::cuda::getCurrentCUDAStream()>>>( + update_row_indices + .packed_accessor32(), + update_table_indices + .packed_accessor32(), + index_remappings + .packed_accessor32(), + index_remappings_offsets + .packed_accessor32(), + dense_indices + 
.packed_accessor32()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + return dense_indices; +} + } // namespace fbgemm_gpu diff --git a/fbgemm_gpu/src/embedding_inplace_update_cpu.cpp b/fbgemm_gpu/src/embedding_inplace_update_cpu.cpp index bd1315e023..5f3a648872 100644 --- a/fbgemm_gpu/src/embedding_inplace_update_cpu.cpp +++ b/fbgemm_gpu/src/embedding_inplace_update_cpu.cpp @@ -116,6 +116,53 @@ void embedding_inplace_update_cpu( }); } +Tensor pruned_array_lookup_from_row_idx_cpu( + const Tensor& update_row_indices, + const Tensor& update_table_indices, + const Tensor& index_remappings, + const Tensor& index_remappings_offsets) { + TENSOR_ON_CPU(update_row_indices); + TENSOR_ON_CPU(update_table_indices); + TENSOR_ON_CPU(index_remappings); + TENSOR_ON_CPU(index_remappings_offsets); + + auto dense_indices = empty_like(update_row_indices); + const auto num_indices = update_row_indices.numel(); + + AT_DISPATCH_INDEX_TYPES( + update_row_indices.scalar_type(), + "pruned_array_lookup_from_row_idx_cpu_kernel", + [&] { + const auto update_row_indices_acc = + update_row_indices.accessor(); + auto dense_indices_acc = dense_indices.accessor(); + const auto update_table_indices_acc = + update_table_indices.accessor(); + + const auto index_remappings_acc = + index_remappings.accessor(); + const auto index_remappings_offsets_acc = + index_remappings_offsets.accessor(); + + for (int64_t idx = 0; idx < num_indices; idx++) { + const int table_idx = update_table_indices_acc[idx]; + const auto row_idx = update_row_indices_acc[idx]; + int64_t index_remappings_start = + index_remappings_offsets_acc[table_idx]; + int64_t index_remappings_end = + index_remappings_offsets_acc[table_idx + 1]; + int64_t capacity = index_remappings_end - index_remappings_start; + if (capacity > 0) { + dense_indices_acc[idx] = + index_remappings_acc[index_remappings_start + row_idx]; + } else { + dense_indices_acc[idx] = row_idx; + } + } + }); + return dense_indices; +} + } // namespace fbgemm_gpu TORCH_LIBRARY_FRAGMENT(fbgemm, m) { @@ -127,3 +174,14 @@ TORCH_LIBRARY_IMPL(fbgemm, CPU, m) { DISPATCH_TO_CPU( "emb_inplace_update", fbgemm_gpu::embedding_inplace_update_cpu); } + +TORCH_LIBRARY_FRAGMENT(fbgemm, m) { + m.def( + "pruned_array_lookup_from_row_idx(Tensor update_row_indices, Tensor update_table_indices, Tensor index_remappings, Tensor index_remappings_offsets) -> Tensor"); +} + +TORCH_LIBRARY_IMPL(fbgemm, CPU, m) { + DISPATCH_TO_CPU( + "pruned_array_lookup_from_row_idx", + fbgemm_gpu::pruned_array_lookup_from_row_idx_cpu); +} diff --git a/fbgemm_gpu/src/embedding_inplace_update_gpu.cpp b/fbgemm_gpu/src/embedding_inplace_update_gpu.cpp index 743a902b68..cfb48c2427 100644 --- a/fbgemm_gpu/src/embedding_inplace_update_gpu.cpp +++ b/fbgemm_gpu/src/embedding_inplace_update_gpu.cpp @@ -14,3 +14,9 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { DISPATCH_TO_CUDA( "emb_inplace_update", fbgemm_gpu::embedding_inplace_update_cuda); } + +TORCH_LIBRARY_FRAGMENT(fbgemm, m) { + DISPATCH_TO_CUDA( + "pruned_array_lookup_from_row_idx", + fbgemm_gpu::pruned_array_lookup_from_row_idx_cuda); +} From 92305da6dfa1b9845b55fad85edbaf9374092eef Mon Sep 17 00:00:00 2001 From: Rengan Xu Date: Wed, 29 Mar 2023 22:05:47 -0700 Subject: [PATCH 27/34] jagged_softmax forward optimization (#1661) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1661 This diff optimizes jagged_softmax forward with more efficient reduction from cub library. 
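For context, the computation is a per-bag softmax over a jagged tensor: for each bag b, the rows offsets[b]..offsets[b+1] (capped at max_L) are normalized independently for each of the D feature columns. A minimal PyTorch reference of the forward semantics (illustrative; the kernel change below replaces the naive per-thread reduction loops with cub::BlockReduce block-wide reductions, the math itself is unchanged):

    import torch

    def jagged_softmax_forward_ref(values, offsets, max_L):
        # values: (total_rows, D), offsets: (B + 1,)
        output = torch.empty_like(values)
        for b in range(offsets.numel() - 1):
            start = int(offsets[b])
            length = min(int(offsets[b + 1]) - start, max_L)
            if length > 0:
                seg = values[start : start + length]
                # Numerically stable softmax along the jagged (row) dimension:
                # subtract the per-feature max before exponentiating.
                seg = seg - seg.max(dim=0, keepdim=True).values
                e = seg.exp()
                output[start : start + length] = e / e.sum(dim=0, keepdim=True)
        return output
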
Reviewed By: brad-mengchi Differential Revision: D44161021 fbshipit-source-id: bf2e059d14ef4d7ad311edac65155a463ba653ff --- fbgemm_gpu/src/jagged_tensor_ops.cu | 122 +++++++++++++++++++++------- 1 file changed, 92 insertions(+), 30 deletions(-) diff --git a/fbgemm_gpu/src/jagged_tensor_ops.cu b/fbgemm_gpu/src/jagged_tensor_ops.cu index e646d28be2..94400a58c6 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops.cu +++ b/fbgemm_gpu/src/jagged_tensor_ops.cu @@ -12,6 +12,7 @@ #include #include #include +#include // clang-format off #include "fbgemm_gpu/cub_namespace_prefix.cuh" @@ -1824,39 +1825,101 @@ std::tuple batched_dense_vec_jagged_2d_mul_backward( return {v_grad, a_values_grad}; } -template +template __global__ __launch_bounds__(kMaxThreads) void jagged_softmax_kernel( const at::PackedTensorAccessor32 values, const at::PackedTensorAccessor32 offsets, at::PackedTensorAccessor32 output, const int max_L) { - const int B = offsets.size(0) - 1; - const int D = output.size(1); + const auto B = offsets.size(0) - 1; + const auto D = output.size(1); - const int b_begin = blockIdx.x * blockDim.y + threadIdx.y; - const int b_step = gridDim.x * blockDim.y; - for (int b = b_begin; b < B; b += b_step) { - const int row_start = offsets[b]; - const int row_end = offsets[b + 1]; - const int length = min(row_end - row_start, max_L); - if (length != 0) { - // TODO: use shared memory and better reduction - for (int d = threadIdx.x; d < D; d += blockDim.x) { - scalar_t max_value = values[row_start][d]; - for (int l = 1; l < length; ++l) { - max_value = max(max_value, values[row_start + l][d]); + // Specialize BlockReduce type for our thread block + typedef cub::BlockReduce BlockReduceT; + + // Allocate shared memory for BlockReduce + __shared__ typename BlockReduceT::TempStorage temp_storage; + + __shared__ scalar_t max_value; + __shared__ scalar_t exp_sum; + + const auto tid = threadIdx.x; + for (auto b = blockIdx.y; b < B; b += gridDim.y) { + const index_t row_start = offsets[b]; + const index_t row_end = offsets[b + 1]; + const auto length = min(row_end - row_start, (index_t)max_L); + + if (length > 0) { + const auto num_l_blocks = + (length + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + + for (auto d = blockIdx.x; d < D; d += gridDim.x) { + if (tid == 0) { + max_value = values[row_start][d]; + exp_sum = 0; } - at::acc_type acc = - exp(values[row_start][d] - max_value); - for (int l = 1; l < length; ++l) { - acc += exp(values[row_start + l][d] - max_value); + // Loop through all blocks to calculate the max value + // Each block has its own max value block_max_value, and + // max_value is the max value across all blocks + for (auto bk_l = 0; bk_l < num_l_blocks; bk_l++) { + const auto l = bk_l * blockDim.x + tid; + scalar_t thread_val = values[row_start][d]; + if (l < length) { + thread_val = values[row_start + l][d]; + } + + // Collectively compute the block-wide max reduction + scalar_t block_max_value = + BlockReduceT(temp_storage).Reduce(thread_val, cub::Max()); + __syncthreads(); + + if (tid == 0) { + max_value = max(max_value, block_max_value); + } } - for (int l = 0; l < length; ++l) { - output[row_start + l][d] = - exp(values[row_start + l][d] - max_value) / acc; + // The max_value was updated by thread 0 in the last loop, sync here to + // make sure the next loop uses the updated max_value + __syncthreads(); + + // Loop through all blocks to calculate the sum of exp + // Each block has its own sum block_exp_acc, and + // exp_sum is the sum across all blocks + for (auto bk_l = 0; bk_l < num_l_blocks; 
bk_l++) { + auto l = bk_l * blockDim.x + tid; + + scalar_t thread_exp = 0; + if (l < length) { + thread_exp = std::exp(values[row_start + l][d] - max_value); + } + + // Collectively compute the block-wide sum reduction + scalar_t block_exp_sum = BlockReduceT(temp_storage).Sum(thread_exp); + __syncthreads(); + + if (tid == 0) { + exp_sum += block_exp_sum; + } } + + // The exp_sum was updated by thread 0 in the last loop, sync here to + // make sure the next loop uses the updated exp_sum + __syncthreads(); + + for (auto bk_l = 0; bk_l < num_l_blocks; bk_l++) { + auto l = bk_l * blockDim.x + tid; + scalar_t thread_exp = 0; + if (l < length) { + thread_exp = std::exp(values[row_start + l][d] - max_value); + output[row_start + l][d] = thread_exp / exp_sum; + } + } + + // The max_value and exp_sum will be reinitialized by thread 0 in the + // next d iteration, sync here to make sure the last loop still uses the + // reduced values before reinitialization + __syncthreads(); } } } @@ -1872,14 +1935,13 @@ Tensor jagged_softmax_forward( at::cuda::OptionalCUDAGuard device_guard; device_guard.set_index(values.get_device()); - const int B = offsets.numel() - 1; - const int D = values.size(1); + const auto B = offsets.numel() - 1; + const auto D = values.size(1); auto output = at::empty_like(values); if (B > 0 && D > 0) { - const int block_dim_x = - std::min(div_round_up(D, kWarpSize) * kWarpSize, kMaxThreads); - const int block_dim_y = kMaxThreads / block_dim_x; + constexpr int THREADS_PER_BLOCK = 128; + const dim3 grid(D, std::min((int32_t)B, (int32_t)kMaxBlockYDim), 1); AT_DISPATCH_INDEX_TYPES( offsets.scalar_type(), "jagged_softmax_kernel_1", [&] { @@ -1889,9 +1951,9 @@ Tensor jagged_softmax_forward( values.scalar_type(), "jagged_softmax_kernel_2", [&] { - jagged_softmax_kernel - << + <<>>( values.packed_accessor32(), From 802b8dc454c9248b79bad51b91b291202d82106b Mon Sep 17 00:00:00 2001 From: Rengan Xu Date: Wed, 29 Mar 2023 22:05:47 -0700 Subject: [PATCH 28/34] jagged_softmax backward optimization (#1662) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1662 This diff optimizes jagged_softmax backward with more efficient reduction from cub library Reviewed By: brad-mengchi Differential Revision: D44205819 fbshipit-source-id: cd1d7a886d6ba68201dc1ad782c2e8cde7ff706b --- fbgemm_gpu/src/jagged_tensor_ops.cu | 97 +++++++++++++++++++++-------- 1 file changed, 70 insertions(+), 27 deletions(-) diff --git a/fbgemm_gpu/src/jagged_tensor_ops.cu b/fbgemm_gpu/src/jagged_tensor_ops.cu index 94400a58c6..4e249d9553 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops.cu +++ b/fbgemm_gpu/src/jagged_tensor_ops.cu @@ -1968,35 +1968,76 @@ Tensor jagged_softmax_forward( return output; } -template +template __global__ __launch_bounds__(kMaxThreads) void jagged_softmax_backward_kernel( const at::PackedTensorAccessor32 grad_output, const at::PackedTensorAccessor32 output, const at::PackedTensorAccessor32 offsets, at::PackedTensorAccessor32 grad_input, const int max_L) { - const int B = offsets.size(0) - 1; - const int D = grad_output.size(1); + const auto B = offsets.size(0) - 1; + const auto D = grad_output.size(1); - const int b_begin = blockIdx.x * blockDim.y + threadIdx.y; - const int b_step = gridDim.x * blockDim.y; - for (int b = b_begin; b < B; b += b_step) { - const int row_start = offsets[b]; - const int row_end = offsets[b + 1]; - const int length = min(row_end - row_start, max_L); - if (length != 0) { - // TODO: use shared memory and better reduction - for (int d = threadIdx.x; d < D; d += 
blockDim.x) { - scalar_t sum_value = grad_output[row_start][d] * output[row_start][d]; - for (int l = 1; l < length; ++l) { - sum_value += grad_output[row_start + l][d] * output[row_start + l][d]; + // Specialize BlockReduce type for our thread block + typedef cub::BlockReduce BlockReduceT; + + // Allocate shared memory for BlockReduce + __shared__ typename BlockReduceT::TempStorage temp_storage; + + __shared__ scalar_t sum_value; + + const auto tid = threadIdx.x; + for (auto b = blockIdx.y; b < B; b += gridDim.y) { + const index_t row_start = offsets[b]; + const index_t row_end = offsets[b + 1]; + const auto length = min(row_end - row_start, (index_t)max_L); + + if (length > 0) { + const auto num_l_blocks = + (length + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + + for (auto d = blockIdx.x; d < D; d += gridDim.x) { + if (tid == 0) { + sum_value = 0; + } + + // Loop through all blocks to calculate the sum value + // Each block has its own sum, and sum_value is the sum value across all + // blocks + for (auto bk_l = 0; bk_l < num_l_blocks; bk_l++) { + const auto l = bk_l * blockDim.x + tid; + scalar_t thread_val = 0; + if (l < length) { + thread_val = + grad_output[row_start + l][d] * output[row_start + l][d]; + } + + // Collectively compute the block-wide sum reduction + scalar_t block_sum_value = BlockReduceT(temp_storage).Sum(thread_val); + __syncthreads(); + + if (tid == 0) { + sum_value += block_sum_value; + } } - for (int l = 0; l < length; ++l) { - grad_input[row_start + l][d] = - (grad_output[row_start + l][d] - sum_value) * - output[row_start + l][d]; + // The sum_value was updated by thread 0 in the last loop, sync here to + // make sure the next loop uses the updated sum_value + __syncthreads(); + + for (auto bk_l = 0; bk_l < num_l_blocks; bk_l++) { + const auto l = bk_l * blockDim.x + tid; + if (l < length) { + grad_input[row_start + l][d] = + (grad_output[row_start + l][d] - sum_value) * + output[row_start + l][d]; + } } + + // The sum_value will be reinitialized by thread 0 in the + // next d iteration, sync here to make sure the last loop still uses the + // reduced value before reinitialization + __syncthreads(); } } } @@ -2014,14 +2055,13 @@ Tensor jagged_softmax_backward( at::cuda::OptionalCUDAGuard device_guard; device_guard.set_index(grad_output.get_device()); - const int B = offsets.numel() - 1; - const int D = grad_output.size(1); + const auto B = offsets.numel() - 1; + const auto D = grad_output.size(1); auto grad_input = at::empty_like(grad_output); if (B > 0 && D > 0) { - const int block_dim_x = - std::min(div_round_up(D, kWarpSize) * kWarpSize, kMaxThreads); - const int block_dim_y = kMaxThreads / block_dim_x; + constexpr int THREADS_PER_BLOCK = 128; + const dim3 grid(D, std::min((int32_t)B, (int32_t)kMaxBlockYDim), 1); AT_DISPATCH_INDEX_TYPES( offsets.scalar_type(), "jagged_softmax_backward_kernel_1", [&] { @@ -2031,9 +2071,12 @@ Tensor jagged_softmax_backward( grad_output.scalar_type(), "jagged_softmax_backward_kernel_2", [&] { - jagged_softmax_backward_kernel - << + <<>>( grad_output.packed_accessor32(), From b74d407863c30b7e4eb7048cce1d270139d1154b Mon Sep 17 00:00:00 2001 From: Geet Sethi Date: Thu, 30 Mar 2023 02:26:36 -0700 Subject: [PATCH 29/34] multi-gpu all_to_one improvements (#1674) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1674 improved multi-gpu all_to_one with: 1. new intermediate hop selection taking advantage of distinct NVLinks 2. 
overlapping of intermediate hop transfers with each-other and with direct-peer transfers Reviewed By: doehyun Differential Revision: D44285941 fbshipit-source-id: 0202083f04388b5ba60b8155809433f334993ef4 --- .../src/merge_pooled_embeddings_gpu.cpp | 219 ++++++++++++------ .../test/merge_pooled_embeddings_test.py | 2 +- 2 files changed, 154 insertions(+), 67 deletions(-) diff --git a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp index ed3c075bd0..d03b961a79 100644 --- a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp +++ b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp @@ -23,72 +23,85 @@ using Tensor = at::Tensor; namespace { -// Hilariously unoptimized, but algorithmic correctness matters more here, and -// we only do it once. -AdjacencyMatrix get_intermediate_node(AdjacencyMatrix links) { - auto world_size = at::cuda::getNumGPUs(); - auto intermediate_node = [&](Node i, Node j) { - if (i == j) { - return std::vector{-1}; - } - if (links(i, j) != 0) { - return std::vector{-1}; - } +struct DirectConnectedPeer { + int64_t num_peer_links; + int64_t peer_id; + // number of transfers from peer + int32_t peer_transfers; +}; - std::vector> paths; - for (const auto k : c10::irange(world_size)) { - if (k != i && k != j && links(i, k) != 0 && links(k, j) != 0) { - paths.push_back({k, links(i, k) + links(k, j)}); - } - } - if (paths.empty()) { - LOG(WARNING) - << "Expect very bad performance for p2p copies, we are going via sys path for GPU " - << i << " -> GPU " << j; - return std::vector{-1}; - } - auto mp = std::max_element( - paths.begin(), - paths.end(), - [](std::pair a, std::pair b) { - return a.second < b.second; - }) - ->second; - std::vector candidates; - for (const auto& p : paths) { - if (p.second == mp) { - candidates.push_back(p.first); - } - } - return candidates; - }; +struct TwoHopTransferContainer { + Tensor intermediate_tensor; + uint64_t output_idx; + std::unique_ptr transfer_cuda_event; +}; - std::vector assignments(world_size * world_size); - // Use a two-phase assignment protocol as the greedy approach - // can lead to unbalanced usage. 
- std::unordered_map uses; +AdjacencyMatrix get_intermediate_node( + const AdjacencyMatrix& links) { + const auto world_size = at::cuda::getNumGPUs(); + std::vector link_vec(static_cast(world_size * world_size)); for (const auto i : c10::irange(world_size)) { for (const auto j : c10::irange(world_size)) { - auto ims = intermediate_node(i, j); - if (ims.size() == 1) { - auto v = ims.front(); - if (v != -1) { - uses[v] += 1; - } - assignments[i * world_size + j] = v; - } + link_vec[i * world_size + j] = links(i, j); } } + auto link_tensor = at::from_blob( + link_vec.data(), + {world_size, world_size}, + at::TensorOptions().dtype(at::kLong)); + LOG(INFO) << "NVLink Topology Matrix: \n" << link_tensor; + std::vector assignments( + static_cast(world_size * world_size), -1); + for (const auto dst_rank_id : c10::irange(world_size)) { + std::vector non_direct_src_ids; + non_direct_src_ids.reserve(world_size); + std::vector direct_connected_peers; + direct_connected_peers.reserve(world_size); + for (const auto src_rank_id : c10::irange(world_size)) { + if (dst_rank_id == src_rank_id) { + continue; + } - for (const auto i : c10::irange(world_size)) { - for (const auto j : c10::irange(world_size)) { - auto ims = intermediate_node(i, j); - if (ims.size() > 1) { - auto v = *std::min_element(ims.begin(), ims.end(), [&](Node a, Node b) { - return uses[a] < uses[b]; - }); - uses[v] += 1; - assignments[i * world_size + j] = v; + const auto num_peer_links = links(dst_rank_id, src_rank_id); + if (num_peer_links > 0) { + direct_connected_peers.push_back( + {.num_peer_links = num_peer_links, + .peer_id = src_rank_id, + .peer_transfers = 1}); + } else { + non_direct_src_ids.push_back(src_rank_id); + } + } + + // Assign intermediate hop ranks for non-directly connected peers. + // Assigns intermediate hops based on the number of links from the + // potential intermediate rank to target rank, as well as + // the number of two_hop connections already assigned to the + // intermediate rank. + for (const auto i : c10::irange(non_direct_src_ids.size())) { + std::sort( + direct_connected_peers.begin(), + direct_connected_peers.end(), + [](const auto& a, const auto& b) { + if (a.num_peer_links > b.num_peer_links) { + return true; + } else if (a.num_peer_links == b.num_peer_links) { + return a.peer_transfers < b.peer_transfers; + } else { + return false; + } + }); + const auto non_direct_src_id = non_direct_src_ids.at(i); + for (auto& j : direct_connected_peers) { + const auto potential_hop_id = j.peer_id; + const auto potential_hop_peer_links = + links(potential_hop_id, non_direct_src_id); + if (potential_hop_peer_links > 0) { + assignments[dst_rank_id * world_size + non_direct_src_id] = + potential_hop_id; + j.peer_transfers += 1; + break; + } } } } @@ -100,7 +113,8 @@ AdjacencyMatrix get_intermediate_node(AdjacencyMatrix links) { {world_size, world_size}, at::TensorOptions().dtype(at::kLong)); LOG(INFO) << "Detected a multi-hop NVLink configuration: \n" << tensor; - return [=](Node i, Node j) { return assignments[i * world_size + j]; }; + return + [=](Node src, Node dst) { return assignments[dst * world_size + src]; }; } else { return [](Node, Node) { return -1; }; } @@ -111,7 +125,7 @@ AdjacencyMatrix get_intermediate_node(AdjacencyMatrix links) { // tensor in `input_tensors` is already in the `target_device`, we will skip // copy it if `skip_if_same_device` is true. 
 void all_to_one(
-    std::vector<Tensor>& input_tensors,
+    const std::vector<Tensor>& input_tensors,
     std::vector<Tensor>& output_tensors,
     at::Device target_device,
     bool skip_if_same_device) {
@@ -119,19 +133,48 @@ void all_to_one(
   std::vector<at::cuda::CUDAEvent> copy_begin_events(num_gpus);
   std::vector<at::cuda::CUDAEvent> copy_completion_events(num_gpus);
 
+  std::vector<TwoHopTransferContainer> two_hop_transfers;
+  two_hop_transfers.reserve(input_tensors.size());
+  std::vector<bool> is_two_hop_transfer;
+  is_two_hop_transfer.reserve(input_tensors.size());
+
   static auto intermediate_nodes =
       get_intermediate_node(fbgemm_gpu::get_nvlink_matrix());
-  for (auto& ten : input_tensors) {
-    Node src_device_id = ten.get_device();
+  for (const auto i : c10::irange(input_tensors.size())) {
+    const auto& src = input_tensors.at(i);
+    Node src_device_id = src.get_device();
     auto intermediate_node =
         intermediate_nodes(src_device_id, target_device.index());
     if (intermediate_node != -1) {
-      ten = ten.to(at::Device(at::kCUDA, intermediate_node));
+      two_hop_transfers.push_back(
+          {.intermediate_tensor = at::empty(
+               src.sizes(),
+               src.options().device(at::Device(at::kCUDA, intermediate_node))),
+           .output_idx = i,
+           .transfer_cuda_event =
+               std::make_unique<at::cuda::CUDAEvent>(cudaEventDisableTiming)});
+      auto& dst = two_hop_transfers.back().intermediate_tensor;
+      at::cuda::CUDAStream copy_stream =
+          at::cuda::getCurrentCUDAStream(src_device_id);
+      AT_CUDA_CHECK(cudaMemcpy2DAsync(
+          dst.data_ptr(),
+          dst.stride(0) * dst.element_size(),
+          src.data_ptr(),
+          src.stride(0) * src.element_size(),
+          src.size(1) * src.element_size(),
+          src.size(0),
+          cudaMemcpyDeviceToDevice,
+          copy_stream));
+      two_hop_transfers.back().transfer_cuda_event->record(copy_stream);
+      is_two_hop_transfer.push_back(true);
+    } else {
+      is_two_hop_transfer.push_back(false);
     }
   }
 
-  // For each source device, we sync its current stream and launch all the
-  // copies that are from that device.
+  // For each source device directly connected to the destination device, we
+  // sync its current stream and launch all the copies that are from that
+  // device.
for (const auto device_id : c10::irange(num_gpus)) { auto src_device = at::Device(at::kCUDA, device_id); if (src_device == target_device) { @@ -160,6 +203,13 @@ void all_to_one( device_guard.set_device(src_device); dst_ready.block(copy_stream); for (const auto i : c10::irange(input_tensors.size())) { + const auto metadata = is_two_hop_transfer.at(i); + // Initiate all transfer for tensors with direct + // NVLink connection to target rank + if (metadata) { + continue; + } + auto& src = input_tensors[i]; if (src.device() != src_device) { continue; @@ -179,6 +229,43 @@ void all_to_one( } } + // Complete 2-hop transfers to target rank + for (auto& two_hop_transfer : two_hop_transfers) { + const auto& src = two_hop_transfer.intermediate_tensor; + const auto src_device_id = src.get_device(); + const auto src_device = at::Device(at::kCUDA, src_device_id); + if (src_device == target_device) { + continue; + } + + // intermediate rank + at::cuda::CUDAGuard device_guard(src_device); + // intermediate rank stream + at::cuda::CUDAStream copy_stream = + at::cuda::getCurrentCUDAStream(src_device_id); + // wait on first hop transfer + two_hop_transfer.transfer_cuda_event->block(copy_stream); + // synchronize with target rank + auto& dst_ready = copy_begin_events[src_device_id]; + device_guard.set_device(target_device); + dst_ready.record(at::cuda::getCurrentCUDAStream(target_device.index())); + device_guard.set_device(src_device); + dst_ready.block(copy_stream); + // originating tensor output position + const auto output_index = two_hop_transfer.output_idx; + auto& dst = output_tensors.at(output_index); + // on source device, launch memcpy. + AT_CUDA_CHECK(cudaMemcpy2DAsync( + dst.data_ptr(), + dst.stride(0) * dst.element_size(), + src.data_ptr(), + src.stride(0) * src.element_size(), + src.size(1) * src.element_size(), + src.size(0), + cudaMemcpyDeviceToDevice, + copy_stream)); + } + // Do the same-GPU cases. if (!skip_if_same_device) { for (const auto i : c10::irange(input_tensors.size())) { diff --git a/fbgemm_gpu/test/merge_pooled_embeddings_test.py b/fbgemm_gpu/test/merge_pooled_embeddings_test.py index de7c80b79b..98e1ede2ee 100644 --- a/fbgemm_gpu/test/merge_pooled_embeddings_test.py +++ b/fbgemm_gpu/test/merge_pooled_embeddings_test.py @@ -100,7 +100,7 @@ def ref(pooled_ad_embeddings, batch_indices): r=st.randoms(use_true_random=False), ) # Can instantiate 8 contexts which takes a long time. - @settings(verbosity=Verbosity.verbose, max_examples=10, deadline=None) + @settings(verbosity=Verbosity.verbose, max_examples=40, deadline=None) def test_all_to_one_device( self, num_inputs, From 177ba08dbc3bae56d11e4c89bdf19b04974374dc Mon Sep 17 00:00:00 2001 From: Janet Yang Date: Thu, 30 Mar 2023 07:33:20 -0700 Subject: [PATCH 30/34] Extract and export weights offsets/placements initialization functions (#1669) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1669 Extract portions initializing the weights_placements/offsets tensors into separate functions and jit.export them. SplitState is converted to a NamedTuple since we can't jit.script a dataclass that also holds an enum. 
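
For readers skimming the patch: the NamedTuple keeps the same field names and
access pattern as the dataclass it replaces, so call sites are unchanged; only
the construction form changes so TorchScript can handle it. A minimal
standalone sketch, editorially added with illustrative sizes (the
EmbeddingLocation values mirror the DEVICE/MANAGED/MANAGED_CACHING/HOST
mapping used elsewhere in this file):

    import enum
    from typing import List, NamedTuple

    class EmbeddingLocation(enum.IntEnum):
        DEVICE = 0
        MANAGED = 1
        MANAGED_CACHING = 2
        HOST = 3

    # Functional NamedTuple form, same shape as the patch's SplitState.
    SplitState = NamedTuple(
        "SplitState",
        [
            ("dev_size", int),
            ("host_size", int),
            ("uvm_size", int),
            ("placements", List[EmbeddingLocation]),
            ("offsets", List[int]),
        ],
    )

    # Fields are read exactly like the old dataclass attributes.
    state = SplitState(
        dev_size=128,
        host_size=0,
        uvm_size=64,
        placements=[EmbeddingLocation.DEVICE, EmbeddingLocation.MANAGED],
        offsets=[0, 128],
    )
    assert state.placements[1] == EmbeddingLocation.MANAGED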
Reviewed By: houseroad Differential Revision: D44338256 fbshipit-source-id: e1c12e5956f7217d51cd190958c3764d220e521d --- .../split_table_batched_embeddings_ops.py | 107 +++++++++++------- 1 file changed, 68 insertions(+), 39 deletions(-) diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py index ff8ce4d094..c327d359cc 100644 --- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py @@ -115,14 +115,16 @@ class CounterBasedRegularizationDefinition: [("record_cache_miss_counter", bool), ("record_tablewise_cache_miss", bool)], ) - -@dataclass -class SplitState: - dev_size: int - host_size: int - uvm_size: int - placements: List[EmbeddingLocation] - offsets: List[int] +SplitState: NamedTuple = NamedTuple( + "SplitState", + [ + ("dev_size", int), + ("host_size", int), + ("uvm_size", int), + ("placements", List[EmbeddingLocation]), + ("offsets", List[int]), + ], +) def construct_split_state( @@ -132,11 +134,11 @@ def construct_split_state( precision: SparseType = SparseType.FP32, int8_emb_row_dim_offset: int = INT8_EMB_ROW_DIM_OFFSET, ) -> SplitState: - placements = [] - offsets = [] - dev_size = 0 - host_size = 0 - uvm_size = 0 + placements: List[EmbeddingLocation] = [] + offsets: List[int] = [] + dev_size: int = 0 + host_size: int = 0 + uvm_size: int = 0 for num_embeddings, embedding_dim, location, _ in embedding_specs: assert ( embedding_dim % 4 == 0 @@ -1935,8 +1937,8 @@ def nbit_construct_split_state( scale_bias_size_in_bytes: int = DEFAULT_SCALE_BIAS_SIZE_IN_BYTES, cacheline_alignment: bool = True, ) -> SplitState: - placements = [] - offsets = [] + placements = torch.jit.annotate(List[EmbeddingLocation], []) + offsets = torch.jit.annotate(List[int], []) dev_size = 0 host_size = 0 uvm_size = 0 @@ -1984,6 +1986,8 @@ class IntNBitTableBatchedEmbeddingBagsCodegen(nn.Module): cache_miss_counter: torch.Tensor uvm_cache_stats: torch.Tensor local_uvm_cache_stats: torch.Tensor + weights_offsets: torch.Tensor + weights_placements: torch.Tensor def __init__( self, @@ -2165,21 +2169,7 @@ def max_ty_D(ty: SparseType) -> int: ] self.max_D_cache: int = max(cached_dims) if len(cached_dims) > 0 else 0 - weight_split: SplitState = nbit_construct_split_state( - self.embedding_specs, - cacheable=True, - row_alignment=self.row_alignment, - scale_bias_size_in_bytes=self.scale_bias_size_in_bytes, - cacheline_alignment=cacheline_alignment, - ) - - self.weights_physical_placements: List[int] = [ - t.value for t in weight_split.placements - ] - self.weights_physical_offsets: List[int] = weight_split.offsets - self.host_size: int = weight_split.host_size - self.dev_size: int = weight_split.dev_size - self.uvm_size: int = weight_split.uvm_size + self.initialize_physical_weights_placements_and_offsets(cacheline_alignment) self.enforce_hbm: bool = enforce_hbm # Assign weights after weights and weights_offsets are initialized. @@ -2192,7 +2182,8 @@ def max_ty_D(ty: SparseType) -> int: self.weights_physical_offsets, self.enforce_hbm, ) - self.assign_embedding_weights(weight_lists) # type: ignore + # pyre-fixme [6]: In call `IntNBitTableBatchedEmbeddingBagsCodegen.assign_embedding_weights`, for 1st positional argument, expected `List[Tuple[Tensor, Optional[Tensor]]]` but got `List[Tuple[Tensor, Tensor]]`. + self.assign_embedding_weights(weight_lists) # Handle index remapping for embedding pruning. 
self.register_buffer( @@ -2654,6 +2645,51 @@ def forward( fp8_exponent_bias=self.fp8_exponent_bias, ) + def initialize_logical_weights_placements_and_offsets( + self, + ) -> None: + assert len(self.weights_physical_offsets) == len(self.embedding_specs) + assert len(self.weights_physical_offsets) == len( + self.weights_physical_placements + ) + offsets = [self.weights_physical_offsets[t] for t in self.feature_table_map] + placements = [ + self.weights_physical_placements[t] for t in self.feature_table_map + ] + self.weights_offsets = torch.tensor( + offsets, device=self.current_device, dtype=torch.int64 + ) + self.weights_placements = torch.tensor( + placements, device=self.current_device, dtype=torch.int32 + ) + + def initialize_physical_weights_placements_and_offsets( + self, + cacheline_alignment: bool = True, + ) -> None: + # Initialize physical weights placements and offsets + # and host/dev/uvm sizes + weight_split: SplitState = nbit_construct_split_state( + self.embedding_specs, + cacheable=True, + row_alignment=self.row_alignment, + scale_bias_size_in_bytes=self.scale_bias_size_in_bytes, + cacheline_alignment=cacheline_alignment, + ) + self.weights_physical_placements = [t.value for t in weight_split.placements] + self.weights_physical_offsets = weight_split.offsets + self.host_size = weight_split.host_size + self.dev_size = weight_split.dev_size + self.uvm_size = weight_split.uvm_size + + @torch.jit.export + def reset_weights_placements_and_offsets( + self, + ) -> None: + # Initialize all physical/logical weights placements and offsets without initializing large dev weights tensor + self.initialize_physical_weights_placements_and_offsets() + self.initialize_logical_weights_placements_and_offsets() + def _apply_split( self, dev_size: int, @@ -2672,14 +2708,7 @@ def _apply_split( self.dev_size = dev_size self.uvm_size = uvm_size - offsets = [offsets[t] for t in self.feature_table_map] - placements = [placements[t] for t in self.feature_table_map] - self.weights_offsets = torch.tensor( - offsets, device=self.current_device, dtype=torch.int64 - ) - self.weights_placements = torch.tensor( - placements, device=self.current_device, dtype=torch.int32 - ) + self.initialize_logical_weights_placements_and_offsets() if dev_size > 0: self.weights_dev = torch.zeros( From d559a109432222fb827f8cc462eb298d04900901 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Thu, 30 Mar 2023 22:55:25 -0700 Subject: [PATCH 31/34] Fix the ROCm Test Job (#1668) Summary: - Clean up the ROCm test job and re-enable ROCm testing on the rocm instances. - Update the build scripts framework to build FBGEMM_GPU against the correct hardware target that it is intended to be tested on. One thing that was discovered was that if FBGEMM_GPU was built with `PYTORCH_ROCM_ARCH=gfx90a` but run on `gfx908` target, the tests will fail with a segfault. While the failure is expected, the segfault can be unfriendly and confusing for users. 
- Enable correct compilation of `merge_pooled_embeddings` operator under ROCm - Fix existing code in `jagged_tensor_ops` from PR https://github.com/pytorch/FBGEMM/issues/1661 and https://github.com/pytorch/FBGEMM/issues/1662 that break its compilation under ROCm 5.3 Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1668 Reviewed By: shintaro-iwasaki Differential Revision: D44453594 Pulled By: q10 fbshipit-source-id: 2030cd0e00c6ff9694c2783dfd62c31cf5543da2 --- .github/scripts/setup_env.bash | 273 +++++++++++++----- .github/workflows/fbgemm_gpu_ci.yml | 100 ++++--- .github/workflows/fbgemm_gpu_cuda_nightly.yml | 2 +- .github/workflows/fbgemm_gpu_cuda_release.yml | 2 +- .github/workflows/fbgemm_gpu_lint.yml | 14 +- fbgemm_gpu/CMakeLists.txt | 7 +- fbgemm_gpu/src/jagged_tensor_ops.cu | 8 +- .../src/merge_pooled_embeddings_gpu.cpp | 8 + 8 files changed, 287 insertions(+), 127 deletions(-) diff --git a/.github/scripts/setup_env.bash b/.github/scripts/setup_env.bash index 57da549463..9cf928883c 100755 --- a/.github/scripts/setup_env.bash +++ b/.github/scripts/setup_env.bash @@ -261,7 +261,6 @@ print_gpu_info () { echo "[CHECK] NVIDIA driver is required, but does not appear to have been installed. This will cause FBGEMM_GPU installation to fail!" return 1 fi - else if which nvidia-smi; then # If nvidia-smi is installed on a machine without GPUs, this will return error @@ -270,6 +269,21 @@ print_gpu_info () { echo "[CHECK] nvidia-smi not found" fi fi + + if [[ "${ENFORCE_AMD_GPU}" ]]; then + # Ensure that rocm-smi is available and returns GPU entries + if ! rocm-smi; then + echo "[CHECK] AMD driver is required, but does not appear to have been installed. This will cause FBGEMM_GPU installation to fail!" + return 1 + fi + else + if which rocm-smi; then + # If rocm-smi is installed on a machine without GPUs, this will return error + (print_exec rocm-smi) || true + else + echo "[CHECK] rocm-smi not found" + fi + fi } __print_system_info_linux () { @@ -1102,6 +1116,103 @@ prepare_fbgemm_gpu_build () { echo "[BUILD] Successfully ran git submodules update" } +__configure_fbgemm_gpu_build_cpu () { + # Update the package name and build args depending on if CUDA is specified + echo "[BUILD] Setting CPU-only build args ..." + build_args=(--cpu_only) +} + +__configure_fbgemm_gpu_build_rocm () { + local fbgemm_variant_targets="$1" + + # Fetch available ROCm architectures on the machine + if [ "$fbgemm_variant_targets" != "" ]; then + echo "[BUILD] ROCm targets have been manually provided: ${fbgemm_variant_targets}" + local arch_list="${fbgemm_variant_targets}" + else + if which rocminfo; then + # shellcheck disable=SC2155 + local arch_list=$(rocminfo | grep -o -m 1 'gfx.*') + echo "[BUILD] Architectures list from rocminfo: ${arch_list}" + + if [ "$arch_list" == "" ]; then + # By default, build for MI250 only to save time + local arch_list=gfx90a + fi + else + echo "[BUILD] rocminfo not found in PATH!" + fi + fi + + echo "[BUILD] Setting the following ROCm targets: ${arch_list}" + print_exec conda env config vars set -n "${env_name}" PYTORCH_ROCM_ARCH="${arch_list}" + + echo "[BUILD] Setting ROCm build args ..." 
+ build_args=() +} + +__configure_fbgemm_gpu_build_cuda () { + local fbgemm_variant_targets="$1" + + # Check nvcc is visible + (test_binpath "${env_name}" nvcc) || return 1 + + # Check that cuDNN environment variables are available + (test_env_var "${env_name}" CUDNN_INCLUDE_DIR) || return 1 + (test_env_var "${env_name}" CUDNN_LIBRARY) || return 1 + (test_env_var "${env_name}" NVML_LIB_PATH) || return 1 + + local arch_list="${fbgemm_variant_targets:-7.0;8.0}" + echo "[BUILD] Setting the following CUDA targets: ${arch_list}" + + # Build only CUDA 7.0 and 8.0 (i.e. V100 and A100) because of 100 MB binary size limits from PyPI. + echo "[BUILD] Setting CUDA build args ..." + # shellcheck disable=SC2155 + local nvml_lib_path=$(conda run -n "${env_name}" printenv NVML_LIB_PATH) + build_args=( + --nvml_lib_path="${nvml_lib_path}" + -DTORCH_CUDA_ARCH_LIST="'${arch_list}'" + ) +} + +__configure_fbgemm_gpu_build () { + local fbgemm_variant="$1" + local fbgemm_variant_targets="$2" + if [ "$fbgemm_variant" == "" ]; then + echo "Usage: ${FUNCNAME[0]} FBGEMM_VARIANT" + echo "Example(s):" + echo " ${FUNCNAME[0]} cpu # CPU-only variant" + echo " ${FUNCNAME[0]} cuda # CUDA variant for default target(s)" + echo " ${FUNCNAME[0]} cuda '7.0;8.0' # CUDA variant for custom target(s)" + echo " ${FUNCNAME[0]} rocm # ROCm variant for default target(s)" + echo " ${FUNCNAME[0]} rocm 'gfx906;gfx908;gfx90a' # ROCm variant for custom target(s)" + return 1 + else + echo "################################################################################" + echo "# Configure FBGEMM-GPU Build" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + fi + + if [ "$fbgemm_variant" == "cpu" ]; then + echo "[BUILD] Configuring build as CPU variant ..." + __configure_fbgemm_gpu_build_cpu + + elif [ "$fbgemm_variant" == "rocm" ]; then + echo "[BUILD] Configuring build as ROCm variant ..." + __configure_fbgemm_gpu_build_rocm "${fbgemm_variant_targets}" + + else + echo "[BUILD] Configuring build as CUDA variant (this is the default behavior) ..." + __configure_fbgemm_gpu_build_cuda "${fbgemm_variant_targets}" + fi + + # shellcheck disable=SC2145 + echo "[BUILD] FBGEMM_GPU build arguments have been set: ${build_args[@]}" +} + __build_fbgemm_gpu_common_pre_steps () { # Private function that uses variables instantiated by its caller @@ -1112,43 +1223,12 @@ __build_fbgemm_gpu_common_pre_steps () { (test_binpath "${env_name}" g++) || return 1 if [ "$fbgemm_variant" == "cpu" ]; then - echo "[BUILD] Proceeding to build CPU variant" - - # Update the package name and build args depending on if CUDA is specified - echo "[BUILD] Applying CPU-only build args ..." - build_args=(--cpu_only) package_name="${package_name}-cpu" - elif [ "$fbgemm_variant" == "rocm" ]; then - echo "[BUILD] Proceeding to build ROCm variant" - - (test_env_var "${env_name}" PYTORCH_ROCM_ARCH) || return 1 - - echo "[BUILD] Applying ROCm build args ..." - build_args=() package_name="${package_name}-rocm" - else # Set to the default variant - fbgemm_variant="gpu" - echo "[BUILD] Proceeding to build GPU variant (default)" - - # Check nvcc is visible - (test_binpath "${env_name}" nvcc) || return 1 - - # Check that cuDNN environment variables are available - (test_env_var "${env_name}" CUDNN_INCLUDE_DIR) || return 1 - (test_env_var "${env_name}" CUDNN_LIBRARY) || return 1 - (test_env_var "${env_name}" NVML_LIB_PATH) || return 1 - - # Build only CUDA 7.0 and 8.0 (i.e. 
V100 and A100) because of 100 MB binary size limits from PyPI. - echo "[BUILD] Applying GPU build args ..." - # shellcheck disable=SC2155 - local nvml_lib_path=$(conda run -n "${env_name}" printenv NVML_LIB_PATH) - build_args=( - --nvml_lib_path="${nvml_lib_path}" - -DTORCH_CUDA_ARCH_LIST='7.0;8.0' - ) + fbgemm_variant="cuda" fi # Extract the Python tag @@ -1168,12 +1248,14 @@ __build_fbgemm_gpu_common_pre_steps () { print_exec git diff } -check_fbgemm_gpu_build () { +run_fbgemm_gpu_postbuild_checks () { local fbgemm_variant="$1" if [ "$fbgemm_variant" == "" ]; then echo "Usage: ${FUNCNAME[0]} FBGEMM_VARIANT" echo "Example(s):" echo " ${FUNCNAME[0]} cpu" + echo " ${FUNCNAME[0]} cuda" + echo " ${FUNCNAME[0]} rocm" return 1 fi @@ -1194,7 +1276,13 @@ check_fbgemm_gpu_build () { ) # Add more symbols to check for if it's a non-CPU variant - if [ "${fbgemm_variant}" != "cpu" ]; then + if [ "${fbgemm_variant}" == "cuda" ]; then + lib_symbols_to_check+=( + fbgemm_gpu::asynchronous_inclusive_cumsum_gpu + fbgemm_gpu::merge_pooled_embeddings + ) + elif [ "${fbgemm_variant}" == "rocm" ]; then + # merge_pooled_embeddings is missing in ROCm builds bc it requires NVML lib_symbols_to_check+=( fbgemm_gpu::asynchronous_inclusive_cumsum_gpu fbgemm_gpu::merge_pooled_embeddings @@ -1218,27 +1306,32 @@ build_fbgemm_gpu_package () { env_name="$1" package_name="$2" fbgemm_variant="$3" - if [ "$package_name" == "" ]; then - echo "Usage: ${FUNCNAME[0]} ENV_NAME PACKAGE_NAME [CPU_ONLY]" + fbgemm_variant_targets="$4" + if [ "$fbgemm_variant" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME PACKAGE_NAME VARIANT [TARGETS]" echo "Example(s):" - echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly # Build the full wheel package" - echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly cpu # Build the CPU-only variant of the wheel package" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly cpu # CPU-only variant" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly cuda # CUDA variant for default target(s)" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly cuda '7.0;8.0' # CUDA variant for custom target(s)" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly rocm # ROCm variant for default target(s)" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly rocm 'gfx906;gfx908;gfx90a' # ROCm variant for custom target(s)" return 1 - else - echo "################################################################################" - echo "# Build FBGEMM-GPU Package (Wheel)" - echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" - echo "################################################################################" - echo "" fi - # Run all the common FBGEMM-GPU build pre-steps (set up variables) + # Set up and configure the build __build_fbgemm_gpu_common_pre_steps || return 1 + __configure_fbgemm_gpu_build "${fbgemm_variant}" "${fbgemm_variant_targets}" || return 1 + + echo "################################################################################" + echo "# Build FBGEMM-GPU Package (Wheel)" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" # manylinux1_x86_64 is specified for PyPI upload # Distribute Python extensions as wheels on Linux - echo "[BUILD] Building FBGEMM-GPU (VARIANT=${fbgemm_variant}) wheel ..." + echo "[BUILD] Building FBGEMM-GPU wheel (VARIANT=${fbgemm_variant}) ..." 
print_exec conda run -n "${env_name}" \ python setup.py bdist_wheel \ --package_name="${package_name}" \ @@ -1247,7 +1340,7 @@ build_fbgemm_gpu_package () { "${build_args[@]}" # Run checks on the built libraries - (check_fbgemm_gpu_build "${fbgemm_variant}") || return 1 + (run_fbgemm_gpu_postbuild_checks "${fbgemm_variant}") || return 1 echo "[BUILD] Enumerating the built wheels ..." print_exec ls -lth dist/*.whl @@ -1261,32 +1354,37 @@ build_fbgemm_gpu_package () { build_fbgemm_gpu_install () { env_name="$1" fbgemm_variant="$2" - if [ "$env_name" == "" ]; then - echo "Usage: ${FUNCNAME[0]} ENV_NAME [CPU_ONLY]" + fbgemm_variant_targets="$3" + if [ "$fbgemm_variant" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME VARIANT [TARGETS]" echo "Example(s):" - echo " ${FUNCNAME[0]} build_env # Build + install the package" - echo " ${FUNCNAME[0]} build_env cpu # Build + Install the CPU-only variant of the package" + echo " ${FUNCNAME[0]} build_env cpu # CPU-only variant" + echo " ${FUNCNAME[0]} build_env cuda # CUDA variant for default target(s)" + echo " ${FUNCNAME[0]} build_env cuda '7.0;8.0' # CUDA variant for custom target(s)" + echo " ${FUNCNAME[0]} build_env rocm # ROCm variant for default target(s)" + echo " ${FUNCNAME[0]} build_env rocm 'gfx906;gfx908;gfx90a' # ROCm variant for custom target(s)" return 1 - else - echo "################################################################################" - echo "# Build + Install FBGEMM-GPU Package" - echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" - echo "################################################################################" - echo "" fi - # Run all the common FBGEMM-GPU build pre-steps (set up variables) + # Set up and configure the build __build_fbgemm_gpu_common_pre_steps || return 1 + __configure_fbgemm_gpu_build "${fbgemm_variant}" "${fbgemm_variant_targets}" || return 1 + + echo "################################################################################" + echo "# Build + Install FBGEMM-GPU Package" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" # Parallelism may need to be limited to prevent the build from being # canceled for going over ulimits - echo "[BUILD] Building and installing FBGEMM-GPU (VARIANT=${fbgemm_variant}) ..." + echo "[BUILD] Building + installing FBGEMM-GPU (VARIANT=${fbgemm_variant}) ..." print_exec conda run -n "${env_name}" \ python setup.py install "${build_args[@]}" # Run checks on the built libraries - (check_fbgemm_gpu_build "${fbgemm_variant}") || return 1 + (run_fbgemm_gpu_postbuild_checks "${fbgemm_variant}") || return 1 echo "[INSTALL] Checking imports ..." 
# Exit this directory to prevent import clashing, since there is an @@ -1297,6 +1395,44 @@ build_fbgemm_gpu_install () { echo "[BUILD] FBGEMM-GPU build + install completed" } +build_fbgemm_gpu_develop () { + env_name="$1" + fbgemm_variant="$2" + fbgemm_variant_targets="$3" + if [ "$fbgemm_variant" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME VARIANT [TARGETS]" + echo "Example(s):" + echo " ${FUNCNAME[0]} build_env cpu # CPU-only variant" + echo " ${FUNCNAME[0]} build_env cuda # CUDA variant for default target(s)" + echo " ${FUNCNAME[0]} build_env cuda '7.0;8.0' # CUDA variant for custom target(s)" + echo " ${FUNCNAME[0]} build_env rocm # ROCm variant for default target(s)" + echo " ${FUNCNAME[0]} build_env rocm 'gfx906;gfx908;gfx90a' # ROCm variant for custom target(s)" + return 1 + fi + + # Set up and configure the build + __build_fbgemm_gpu_common_pre_steps || return 1 + __configure_fbgemm_gpu_build "${fbgemm_variant}" "${fbgemm_variant_targets}" || return 1 + + echo "################################################################################" + echo "# Build + Install FBGEMM-GPU Package" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + + # Parallelism may need to be limited to prevent the build from being + # canceled for going over ulimits + echo "[BUILD] Building (develop) FBGEMM-GPU (VARIANT=${fbgemm_variant}) ..." + print_exec conda run -n "${env_name}" \ + python setup.py build develop "${build_args[@]}" + + # Run checks on the built libraries + (run_fbgemm_gpu_postbuild_checks "${fbgemm_variant}") || return 1 + + echo "[BUILD] FBGEMM-GPU build + develop completed" +} + build_fbgemm_gpu_docs () { env_name="$1" if [ "$env_name" == "" ]; then @@ -1357,7 +1493,7 @@ install_fbgemm_gpu_package () { ################################################################################ -# Test Functions +# FBGEMM_GPU Test Functions ################################################################################ run_fbgemm_gpu_tests () { @@ -1366,7 +1502,7 @@ run_fbgemm_gpu_tests () { if [ "$env_name" == "" ]; then echo "Usage: ${FUNCNAME[0]} ENV_NAME [FBGEMM_VARIANT]" echo "Example(s):" - echo " ${FUNCNAME[0]} build_env # Run all tests applicable to GPU (Nvidia)" + echo " ${FUNCNAME[0]} build_env # Run all tests applicable to CUDA" echo " ${FUNCNAME[0]} build_env cpu # Run all tests applicable to CPU" echo " ${FUNCNAME[0]} build_env rocm # Run all tests applicable to ROCm" return 1 @@ -1398,7 +1534,10 @@ run_fbgemm_gpu_tests () { uvm_test.py ) elif [ "$fbgemm_variant" == "rocm" ]; then - local ignored_tests=() + # https://github.com/pytorch/FBGEMM/issues/1559 + local ignored_tests=( + batched_unary_embeddings_test.py + ) else local ignored_tests=() fi @@ -1430,7 +1569,7 @@ run_fbgemm_gpu_tests () { ################################################################################ -# Publish Functions +# FBGEMM_GPU Publish Functions ################################################################################ publish_to_pypi () { diff --git a/.github/workflows/fbgemm_gpu_ci.yml b/.github/workflows/fbgemm_gpu_ci.yml index 646c9de168..50e7c3814b 100644 --- a/.github/workflows/fbgemm_gpu_ci.yml +++ b/.github/workflows/fbgemm_gpu_ci.yml @@ -86,13 +86,7 @@ jobs: run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU-ROCM Nightly - run: | - . $PRELUDE - cd fbgemm_gpu - - # Build for MI250 only to save time. 
- print_exec conda env config vars set -n $BUILD_ENV PYTORCH_ROCM_ARCH=gfx90a - print_exec conda run -n $BUILD_ENV python setup.py build develop + run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_develop $BUILD_ENV rocm gfx90a - name: Test FBGEMM_GPU-ROCM Nightly Installation timeout-minutes: 10 @@ -100,54 +94,66 @@ jobs: test_amd_gpu: - if: ${{ false }} # Disable the job for now runs-on: rocm + container: + image: "rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}-complete" + options: --user root --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size 16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined + defaults: + run: + shell: bash + env: + PRELUDE: .github/scripts/setup_env.bash + BUILD_ENV: build_binary + ENFORCE_AMD_GPU: 1 strategy: fail-fast: false matrix: - os: [ubuntu-latest] + # ROCm machines are limited, so we only test against Python 3.10 + python-version: [ "3.10" ] + rocm-version: [ "5.3", "5.4.2" ] steps: - - name: pre-checkout - shell: bash + - name: Setup Build Container run: | - if [ -d ${{ github.workspace }} ] - then - sudo chown -R $USER:$USER ${{ github.workspace }} - fi - sudo add-apt-repository ppa:git-core/ppa - sudo apt update - sudo apt -y install --only-upgrade git - - - uses: actions/checkout@v3 + apt update -y + apt install -y git wget + git config --global --add safe.directory '*' + + - name: Checkout the Repository + uses: actions/checkout@v3 with: - ref: ${{ github.ref }} - submodules: 'true' + submodules: true - - name: build fbgemm_gpu and test - shell: bash - run: | - set -eux - env - ls -l - DOCKER_IMAGE=rocm/pytorch:rocm5.4_ubuntu20.04_py3.8_pytorch_staging_base - docker pull $DOCKER_IMAGE - JENKINS_REPO_DIR=fbgemm-private-jenkins - JENKINS_REPO_DIR_BAREMETAL=$PWD - JENKINS_REPO_DIR_DOCKER=/workspace/$JENKINS_REPO_DIR - DOCKER_OPTIONS="\ - --user 0 \ - --network=host \ - --ipc=host \ - --shm-size 16G \ - --group-add video \ - --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --device=/dev/kfd \ - --device=/dev/dri \ - -v $JENKINS_REPO_DIR_BAREMETAL:$JENKINS_REPO_DIR_DOCKER - " - docker run $DOCKER_OPTIONS $DOCKER_IMAGE $JENKINS_REPO_DIR_DOCKER/.jenkins/rocm/build_and_test.sh $JENKINS_REPO_DIR_DOCKER + - name: Display System Info + run: . $PRELUDE; print_system_info + + - name: Display GPU Info + run: . $PRELUDE; print_gpu_info + + - name: Free Disk Space + run: . $PRELUDE; free_disk_space + + - name: Setup Miniconda + run: . $PRELUDE; setup_miniconda $HOME/miniconda + + - name: Create Conda Environment + run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} + + - name: Install Build Tools + run: . $PRELUDE; install_build_tools $BUILD_ENV + + - name: Install PyTorch-ROCm Nightly + run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly rocm ${{ matrix.rocm-version }} + + - name: Prepare FBGEMM_GPU Build + run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV + + - name: Build FBGEMM_GPU-ROCM Nightly + run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_develop $BUILD_ENV rocm + + - name: Test FBGEMM_GPU-ROCM Nightly Installation + timeout-minutes: 15 + run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm build_and_test_cpu: @@ -203,6 +209,6 @@ jobs: - name: Build + Install FBGEMM_GPU (CPU version) run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_install $BUILD_ENV cpu - - name: Test with PyTest + - name: Test FBGEMM_GPU-CPU Nightly Installation timeout-minutes: 10 run: . 
$PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu diff --git a/.github/workflows/fbgemm_gpu_cuda_nightly.yml b/.github/workflows/fbgemm_gpu_cuda_nightly.yml index 7ccdbcbf3e..c08d088991 100644 --- a/.github/workflows/fbgemm_gpu_cuda_nightly.yml +++ b/.github/workflows/fbgemm_gpu_cuda_nightly.yml @@ -97,7 +97,7 @@ jobs: run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU Nightly - run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV fbgemm_gpu_nightly + run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV fbgemm_gpu_nightly cuda - name: Upload Built Wheel as GHA Artifact uses: actions/upload-artifact@v3 diff --git a/.github/workflows/fbgemm_gpu_cuda_release.yml b/.github/workflows/fbgemm_gpu_cuda_release.yml index 7516e6a021..3a41125170 100644 --- a/.github/workflows/fbgemm_gpu_cuda_release.yml +++ b/.github/workflows/fbgemm_gpu_cuda_release.yml @@ -88,7 +88,7 @@ jobs: run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU - run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV fbgemm_gpu + run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV fbgemm_gpu cuda - name: Upload Built Wheel as GHA Artifact uses: actions/upload-artifact@v3 diff --git a/.github/workflows/fbgemm_gpu_lint.yml b/.github/workflows/fbgemm_gpu_lint.yml index 1ff7203108..8a484e9844 100644 --- a/.github/workflows/fbgemm_gpu_lint.yml +++ b/.github/workflows/fbgemm_gpu_lint.yml @@ -6,10 +6,14 @@ name: FBGEMM_GPU Lint on: + # PR Trigger + # push: branches: - main + # Push Trigger (enable to catch errors coming out of multiple merges) + # pull_request: branches: - main @@ -20,11 +24,11 @@ concurrency: cancel-in-progress: true jobs: - run_pylint: + run-lint: runs-on: ubuntu-latest strategy: matrix: - python-version: [ "3.8" ] + python-version: [ "3.10" ] steps: - uses: actions/checkout@v3 @@ -38,7 +42,7 @@ jobs: python -m pip install --upgrade pip pip install click flake8 ufmt - - name: Analyzing the code with flake8 + - name: Analyzing the Code with flake8 run: | echo "::add-matcher::fbgemm_gpu/test/lint/flake8_problem_matcher.json" flake8 --ignore=E501,W503,E203 . 
@@ -46,13 +50,13 @@ jobs: # W503 = line break before binary operator (deprecated) # E203 = whitespace before ":" - - name: Analyzing the code with ufmt + - name: Analyzing the Code with ufmt run: | ufmt diff fbgemm_gpu/fbgemm_gpu ufmt diff fbgemm_gpu/test ufmt diff fbgemm_gpu/bench - - name: Check Meta copyright header + - name: Check Meta Copyright Header run: | python fbgemm_gpu/test/lint/check_meta_header.py --path=./fbgemm_gpu/fbgemm_gpu --fixit=False python fbgemm_gpu/test/lint/check_meta_header.py --path=./fbgemm_gpu/test --fixit=False diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 1fb8f397e0..2276ca9ff2 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -344,6 +344,10 @@ if(NOT FBGEMM_CPU_ONLY) if(NVML_LIB_PATH) message(STATUS "Found NVML_LIB_PATH: ${NVML_LIB_PATH}") + endif() + + if(NVML_LIB_PATH OR USE_ROCM) + message(STATUS "Adding merge_pooled_embeddings sources") list( APPEND fbgemm_gpu_sources_cpu @@ -351,8 +355,7 @@ if(NOT FBGEMM_CPU_ONLY) src/merge_pooled_embeddings_gpu.cpp src/topology_utils.cpp) else() - message(STATUS - "Could not find NVML_LIB_PATH; skipping certain sources into the build") + message(STATUS "Skipping merge_pooled_embeddings sources") endif() endif() diff --git a/fbgemm_gpu/src/jagged_tensor_ops.cu b/fbgemm_gpu/src/jagged_tensor_ops.cu index 4e249d9553..62cef01113 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops.cu +++ b/fbgemm_gpu/src/jagged_tensor_ops.cu @@ -1844,7 +1844,7 @@ __global__ __launch_bounds__(kMaxThreads) void jagged_softmax_kernel( __shared__ scalar_t exp_sum; const auto tid = threadIdx.x; - for (auto b = blockIdx.y; b < B; b += gridDim.y) { + for (uint32_t b = blockIdx.y; b < B; b += gridDim.y) { const index_t row_start = offsets[b]; const index_t row_end = offsets[b + 1]; const auto length = min(row_end - row_start, (index_t)max_L); @@ -1853,7 +1853,7 @@ __global__ __launch_bounds__(kMaxThreads) void jagged_softmax_kernel( const auto num_l_blocks = (length + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - for (auto d = blockIdx.x; d < D; d += gridDim.x) { + for (uint32_t d = blockIdx.x; d < D; d += gridDim.x) { if (tid == 0) { max_value = values[row_start][d]; exp_sum = 0; @@ -1987,7 +1987,7 @@ __global__ __launch_bounds__(kMaxThreads) void jagged_softmax_backward_kernel( __shared__ scalar_t sum_value; const auto tid = threadIdx.x; - for (auto b = blockIdx.y; b < B; b += gridDim.y) { + for (uint32_t b = blockIdx.y; b < B; b += gridDim.y) { const index_t row_start = offsets[b]; const index_t row_end = offsets[b + 1]; const auto length = min(row_end - row_start, (index_t)max_L); @@ -1996,7 +1996,7 @@ __global__ __launch_bounds__(kMaxThreads) void jagged_softmax_backward_kernel( const auto num_l_blocks = (length + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - for (auto d = blockIdx.x; d < D; d += gridDim.x) { + for (uint32_t d = blockIdx.x; d < D; d += gridDim.x) { if (tid == 0) { sum_value = 0; } diff --git a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp index d03b961a79..8257faff9b 100644 --- a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp +++ b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp @@ -20,6 +20,14 @@ #include "fbgemm_gpu/sparse_ops_utils.h" #include "fbgemm_gpu/topology_utils.h" +// For some reason, hipify fails to replace the macro names when compiling for +// ROCm, so we manually replace it here. 
Name mapping based on: +// https://github.com/pytorch/pytorch/blob/master/torch/utils/hipify/cuda_to_hip_mappings.py +#ifdef __HIP_PLATFORM_HCC__ +#define C10_CUDA_CLEAR_ERROR C10_HIP_CLEAR_ERROR +#define C10_CUDA_ERROR_HANDLED C10_HIP_ERROR_HANDLED +#endif + using Tensor = at::Tensor; namespace { From 1ac526f7935c432911f887a0a113f53934a1ea98 Mon Sep 17 00:00:00 2001 From: Janet Yang Date: Fri, 31 Mar 2023 09:36:10 -0700 Subject: [PATCH 32/34] Use exported functions instead of calling initialize_weights in weights loading (#1676) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1676 Export a function to reset the embedding specs by target location Reviewed By: RoshanPAN, houseroad Differential Revision: D44338258 fbshipit-source-id: 502733e9f3a164450a02656d2822492fbf69f994 --- .../split_table_batched_embeddings_ops.py | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py index c327d359cc..f8ad2ccaf2 100644 --- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py @@ -2684,12 +2684,33 @@ def initialize_physical_weights_placements_and_offsets( @torch.jit.export def reset_weights_placements_and_offsets( - self, + self, device: torch.device, location: int ) -> None: + # Reset device/location denoted in embedding specs + self.reset_embedding_spec_location(device, location) # Initialize all physical/logical weights placements and offsets without initializing large dev weights tensor self.initialize_physical_weights_placements_and_offsets() self.initialize_logical_weights_placements_and_offsets() + def reset_embedding_spec_location( + self, device: torch.device, location: int + ) -> None: + # Overwrite location in embedding_specs with new location + # Use map since can't script enum call (ie. 
EmbeddingLocation(value)) + INT_TO_EMBEDDING_LOCATION = { + 0: EmbeddingLocation.DEVICE, + 1: EmbeddingLocation.MANAGED, + 2: EmbeddingLocation.MANAGED_CACHING, + 3: EmbeddingLocation.HOST, + } + target_location = INT_TO_EMBEDDING_LOCATION[location] + self.current_device = device + self.row_alignment = 1 if target_location == EmbeddingLocation.HOST else 16 + self.embedding_specs = [ + (spec[0], spec[1], spec[2], spec[3], target_location) + for spec in self.embedding_specs + ] + def _apply_split( self, dev_size: int, From 99edf260151a463d95f35d45897281ae25c0d65b Mon Sep 17 00:00:00 2001 From: Janet Yang Date: Fri, 31 Mar 2023 09:36:10 -0700 Subject: [PATCH 33/34] Extract index remappings array initialization and jit.export it (#1670) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1670 ATT Reviewed By: RoshanPAN, houseroad Differential Revision: D44338257 fbshipit-source-id: c091666c7a4d294c283f5e3774d0494089fc3478 --- .../split_table_batched_embeddings_ops.py | 75 +++++++++++-------- 1 file changed, 44 insertions(+), 31 deletions(-) diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py index f8ad2ccaf2..2c7d99610f 100644 --- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py @@ -3111,6 +3111,49 @@ def assign_embedding_weights( else: assert dest_weight[1] is None + @torch.jit.export + def set_index_remappings_array( + self, + index_remapping: List[Tensor], + ) -> None: + rows: List[int] = [e[1] for e in self.embedding_specs] + index_remappings_array_offsets = [0] + original_feature_rows = torch.jit.annotate(List[int], []) + last_offset = 0 + for t, mapping in enumerate(index_remapping): + if mapping is not None: + current_original_row = mapping.numel() + last_offset += current_original_row + original_feature_rows.append(current_original_row) + else: + original_feature_rows.append(rows[t]) + index_remappings_array_offsets.append(last_offset) + + self.index_remappings_array_offsets = torch.tensor( + index_remappings_array_offsets, + device=self.current_device, + dtype=torch.int64, + ) + if len(original_feature_rows) == 0: + original_feature_rows = rows + self.original_rows_per_table = torch.tensor( + [original_feature_rows[t] for t in self.feature_table_map], + device=self.current_device, + dtype=torch.int64, + ) + if self.index_remappings_array_offsets[-1] == 0: + self.index_remappings_array = torch.empty( + 0, dtype=torch.int32, device=self.current_device + ) + else: + index_remappings_filter_nones = [] + for mapping in index_remapping: + if mapping is not None: + index_remappings_filter_nones.append(mapping) + self.index_remappings_array = torch.cat(index_remappings_filter_nones).to( + self.current_device + ) + def set_index_remappings( self, index_remapping: List[Tensor], @@ -3177,37 +3220,7 @@ def set_index_remappings( self.index_remapping_hash_table_cpu = None # Array mapping pruning else: - index_remappings_array_offsets = [0] - original_feature_rows = [] - last_offset = 0 - for t, mapping in enumerate(index_remapping): - if mapping is not None: - current_original_row = mapping.numel() - last_offset += current_original_row - original_feature_rows.append(current_original_row) - else: - original_feature_rows.append(rows[t]) - index_remappings_array_offsets.append(last_offset) - - self.index_remappings_array_offsets = torch.tensor( - index_remappings_array_offsets, - device=self.current_device, - 
dtype=torch.int64, - ) - if len(original_feature_rows) == 0: - original_feature_rows = rows - self.original_rows_per_table = torch.tensor( - [original_feature_rows[t] for t in self.feature_table_map], - device=self.current_device, - dtype=torch.int64, - ) - self.index_remappings_array = ( - torch.empty(0, dtype=torch.int32, device=self.current_device) - if self.index_remappings_array_offsets[-1] == 0 - else torch.cat( - [mapping for mapping in index_remapping if mapping is not None] - ).to(self.current_device) - ) + self.set_index_remappings_array(index_remapping) def _embedding_inplace_update_per_table( self, From 49c1fc811ec9131b9adcb00d4d7fdf59f50205a5 Mon Sep 17 00:00:00 2001 From: Li Li Date: Fri, 31 Mar 2023 18:13:47 +0000 Subject: [PATCH 34/34] update hipify_torch and remove the manually mapping of the C10 macros --- fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp | 8 -------- third_party/hipify_torch | 2 +- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp index 8257faff9b..d03b961a79 100644 --- a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp +++ b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp @@ -20,14 +20,6 @@ #include "fbgemm_gpu/sparse_ops_utils.h" #include "fbgemm_gpu/topology_utils.h" -// For some reason, hipify fails to replace the macro names when compiling for -// ROCm, so we manually replace it here. Name mapping based on: -// https://github.com/pytorch/pytorch/blob/master/torch/utils/hipify/cuda_to_hip_mappings.py -#ifdef __HIP_PLATFORM_HCC__ -#define C10_CUDA_CLEAR_ERROR C10_HIP_CLEAR_ERROR -#define C10_CUDA_ERROR_HANDLED C10_HIP_ERROR_HANDLED -#endif - using Tensor = at::Tensor; namespace { diff --git a/third_party/hipify_torch b/third_party/hipify_torch index 1840658c18..23f53b025b 160000 --- a/third_party/hipify_torch +++ b/third_party/hipify_torch @@ -1 +1 @@ -Subproject commit 1840658c184f3eeba787dae0f06c45756c1daaf5 +Subproject commit 23f53b025b466d8ec3c45d52290d3442f7fbe6b1
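
As a closing illustration of the array-based pruning-remap layout that PATCH 33
factors out into set_index_remappings_array: each table owns a slice of one
flat remapping array, delimited by a prefix-offset tensor, and tables without a
remapping own an empty slice while recording their original row counts. A
condensed editorial sketch of that layout (the function name and toy values
below are illustrative, not from the patch):

    from typing import List, Optional

    import torch

    def build_remappings(
        index_remapping: List[Optional[torch.Tensor]], rows: List[int]
    ):
        # Prefix offsets into the flat remapping array; tables whose entry
        # is None advance the offset by zero, i.e. they own an empty slice.
        offsets = [0]
        last = 0
        original_rows = []
        for t, mapping in enumerate(index_remapping):
            if mapping is not None:
                last += mapping.numel()
                original_rows.append(mapping.numel())
            else:
                original_rows.append(rows[t])
            offsets.append(last)
        # Concatenate only the tables that actually have a remapping.
        flat = (
            torch.cat([m for m in index_remapping if m is not None])
            if last > 0
            else torch.empty(0, dtype=torch.int32)
        )
        return torch.tensor(offsets, dtype=torch.int64), flat, original_rows

    # Table 0 is pruned from 4 rows to 2 (rows 1 and 3 map to -1);
    # table 1 keeps its 3 rows and needs no remapping.
    offsets, flat, original_rows = build_remappings(
        [torch.tensor([0, -1, 1, -1], dtype=torch.int32), None], rows=[4, 3]
    )
    assert offsets.tolist() == [0, 4, 4] and original_rows == [4, 3]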