diff --git a/.bazelrc b/.bazelrc new file mode 100644 index 0000000000..1e5dbcfcb7 --- /dev/null +++ b/.bazelrc @@ -0,0 +1,48 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +################################################################################ +# FBGEMM Bazel configuration file +# +# Based on MozoLM build options: +# https://github.com/google-research/mozolm/blob/main/.bazelrc +# +# Documentation for Bazel configuration options can be found in: +# https://bazel.build/reference/command-line-reference +################################################################################ + +# Automatically picks up host-OS-specific config lines from bazelrc files +# Enabling this is equivalent to auto-calling --config=linux on Linux, --config=windows, etc +build --enable_platform_specific_config + +# Print logs for all tests +test --test_output=all + +# Build with verbose logging +build --verbose_explanations --verbose_failures +test --verbose_explanations --verbose_failures + +# Build with optimization mode turned on +build --compilation_mode opt +test --compilation_mode opt + +# Build FBGEMM with C17 and C++17 +build:linux --cxxopt=-std=c++17 +build:linux --host_cxxopt=-std=c++17 +build:linux --conlyopt=-std=c17 +build:linux --host_conlyopt=-std=c17 +build:macos --cxxopt=-std=c++17 +build:macos --host_cxxopt=-std=c++17 +build:macos --conlyopt=-std=c17 +build:macos --host_conlyopt=-std=c17 +build:windows --cxxopt=/std:c++17 +build:windows --host_cxxopt=/std:c++17 +build:windows --conlyopt=/std:c17 +build:windows --host_conlyopt=/std:c17 + +# Generation of `runfiles` directories on Windows has to be explicitly enabled. +# See https://github.com/bazelbuild/bazel/issues/8843. +build:windows --enable_runfiles +test:windows --enable_runfiles diff --git a/.github/scripts/setup_env.bash b/.github/scripts/setup_env.bash index 4f1c808598..9cf928883c 100755 --- a/.github/scripts/setup_env.bash +++ b/.github/scripts/setup_env.bash @@ -13,8 +13,13 @@ print_exec () { echo "+ $*" echo "" - "$@" + if "$@"; then + local retcode=0 + else + local retcode=$? + fi echo "" + return $retcode } exec_with_retries () { @@ -205,10 +210,12 @@ run_python_test () { echo "################################################################################" fi - if conda run -n "${env_name}" python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then + if print_exec conda run -n "${env_name}" python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then echo "[TEST] Python test suite PASSED: ${python_test_file}" + echo "" else echo "[TEST] Python test suite FAILED: ${python_test_file}" + echo "" return 1 fi } @@ -254,27 +261,32 @@ print_gpu_info () { echo "[CHECK] NVIDIA driver is required, but does not appear to have been installed. This will cause FBGEMM_GPU installation to fail!" 
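As a rough illustration of the revised print_exec above (command echoed, output printed, and the wrapped command's exit status now propagated), this is what lets run_python_test keep branching on the pytest result; the test file name here is illustrative only:

    . .github/scripts/setup_env.bash
    # print_exec echoes the command, runs it, and returns its exit code
    if print_exec conda run -n build_env python -m pytest -v illustrative_test.py; then
      echo "[TEST] Python test suite PASSED"
    else
      echo "[TEST] Python test suite FAILED"
    fi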
return 1 fi - else if which nvidia-smi; then # If nvidia-smi is installed on a machine without GPUs, this will return error (print_exec nvidia-smi) || true + else + echo "[CHECK] nvidia-smi not found" fi fi -} -print_system_info () { - echo "################################################################################" - echo "# Print System Info" - echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" - echo "################################################################################" - echo "" - - echo "################################################################################" - echo "[INFO] Printing environment variables ..." - print_exec printenv + if [[ "${ENFORCE_AMD_GPU}" ]]; then + # Ensure that rocm-smi is available and returns GPU entries + if ! rocm-smi; then + echo "[CHECK] AMD driver is required, but does not appear to have been installed. This will cause FBGEMM_GPU installation to fail!" + return 1 + fi + else + if which rocm-smi; then + # If rocm-smi is installed on a machine without GPUs, this will return error + (print_exec rocm-smi) || true + else + echo "[CHECK] rocm-smi not found" + fi + fi +} +__print_system_info_linux () { echo "################################################################################" echo "[INFO] Check ldd version ..." print_exec ldd --version @@ -291,6 +303,36 @@ print_system_info () { print_exec cat /etc/os-release } +__print_system_info_macos () { + echo "################################################################################" + echo "[INFO] Check CPU info ..." + sysctl -a | grep machdep.cpu + + echo "################################################################################" + echo "[INFO] Check MacOS version info ..." + print_exec uname -a + print_exec sw_vers +} + +print_system_info () { + echo "################################################################################" + echo "# Print System Info" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + + echo "################################################################################" + echo "[INFO] Printing environment variables ..." 
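A minimal usage sketch for the GPU checks above, mirroring how the ROCm CI job later in this patch drives them (ENFORCE_AMD_GPU comes from that job's env; on a CPU-only host the call just logs that nvidia-smi / rocm-smi were not found):

    . .github/scripts/setup_env.bash
    # Hard-fail if AMD drivers are expected but missing
    ENFORCE_AMD_GPU=1 print_gpu_info
    # On a CPU-only machine this only prints the "not found" notices
    print_gpu_info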
+ print_exec printenv + + if [[ $OSTYPE == 'darwin'* ]]; then + __print_system_info_macos + else + __print_system_info_linux + fi +} + print_ec2_info () { echo "################################################################################" echo "# Print EC2 Instance Info" @@ -311,11 +353,73 @@ print_ec2_info () { echo "instance-type: $(get_ec2_metadata instance-type)" } +print_glibc_info () { + local library_path="$1" + if [ "$library_path" == "" ]; then + echo "Usage: ${FUNCNAME[0]} LIBRARY_PATH" + echo "Example(s):" + echo " ${FUNCNAME[0]} /usr/lib/x86_64-linux-gnu/libstdc++.so.6" + return 1 + fi + + if [ -f "${library_path}" ]; then + echo "[CHECK] Listing out the GLIBC versions referenced by: ${library_path}" + objdump -TC "${library_path}" | grep GLIBC_ | sed 's/.*GLIBC_\([.0-9]*\).*/GLIBC_\1/g' | sort -Vu | cat + echo "" + + echo "[CHECK] Listing out the GLIBCXX versions referenced by: ${library_path}" + objdump -TC "${library_path}" | grep GLIBCXX_ | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat + echo "" + + else + echo "[CHECK] No file at path: ${library_path}" + return 1 + fi +} + + +################################################################################ +# Bazel Setup Functions +################################################################################ + +setup_bazel () { + local bazel_version="${1:-6.1.1}" + echo "################################################################################" + echo "# Setup Bazel" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + + if [[ $OSTYPE == 'darwin'* ]]; then + # shellcheck disable=SC2155 + local bazel_variant="darwin-$(uname -m)" + else + local bazel_variant="linux-x86_64" + fi + + echo "[SETUP] Downloading installer Bazel ${bazel_version} (${bazel_variant}) ..." + print_exec wget -q "https://github.com/bazelbuild/bazel/releases/download/${bazel_version}/bazel-${bazel_version}-installer-${bazel_variant}.sh" -O install-bazel.sh + + echo "[SETUP] Installing Bazel ..." + print_exec bash install-bazel.sh + print_exec rm -f install-bazel.sh + + print_exec bazel --version + echo "[SETUP] Successfully set up Bazel" +} + ################################################################################ -# Environment Setup and Install Functions +# Miniconda Setup Functions ################################################################################ +__conda_cleanup () { + echo "[SETUP] Cleaning up Conda packages ..." + (print_exec conda clean --packages --tarball -y) || return 1 + (print_exec conda clean --all -y) || return 1 +} + setup_miniconda () { local miniconda_prefix="$1" if [ "$miniconda_prefix" == "" ]; then @@ -337,7 +441,7 @@ setup_miniconda () { print_exec mkdir -p "$miniconda_prefix" echo "[SETUP] Downloading the Miniconda installer ..." - print_exec wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh + (exec_with_retries wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh) || return 1 echo "[SETUP] Installing Miniconda ..." print_exec bash miniconda.sh -b -p "$miniconda_prefix" -u @@ -349,15 +453,25 @@ setup_miniconda () { print_exec . ~/.bashrc echo "[SETUP] Updating Miniconda base packages ..." 
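A hedged usage sketch for the helpers above; the arguments mirror the functions' own usage strings and the Bazel CI job added later in this patch (the library path is the example from print_glibc_info's help text):

    . .github/scripts/setup_env.bash
    setup_bazel              # defaults to Bazel 6.1.1 for the host platform
    setup_bazel 6.1.1        # or pin the version explicitly
    print_glibc_info /usr/lib/x86_64-linux-gnu/libstdc++.so.6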
- (exec_with_retries conda update -n base -c defaults -y conda) || return 1 + (exec_with_retries conda update -n base -c defaults --update-deps -y conda) || return 1 + + # Clean up packages + __conda_cleanup # Print Conda info print_exec conda info # These variables will be exported outside + echo "[SETUP] Exporting Miniconda variables ..." export PATH="${miniconda_prefix}/bin:${PATH}" export CONDA="${miniconda_prefix}" + if [ -f "${GITHUB_PATH}" ]; then + echo "[SETUP] Saving Miniconda variables to ${GITHUB_PATH} ..." + echo "${miniconda_prefix}/bin" >> "${GITHUB_PATH}" + echo "CONDA=${miniconda_prefix}" >> "${GITHUB_PATH}" + fi + echo "[SETUP] Successfully set up Miniconda at ${miniconda_prefix}" } @@ -398,17 +512,22 @@ create_conda_environment () { echo "[SETUP] Successfully created Conda environment: ${env_name}" } + +################################################################################ +# PyTorch Setup Functions +################################################################################ + install_pytorch_conda () { local env_name="$1" local pytorch_version="$2" - local pytorch_cpu="$3" + local pytorch_variant_type="$3" if [ "$pytorch_version" == "" ]; then echo "Usage: ${FUNCNAME[0]} ENV_NAME PYTORCH_VERSION [CPU]" echo "Example(s):" - echo " ${FUNCNAME[0]} build_env 1.11.0 # Install a specific version" - echo " ${FUNCNAME[0]} build_env latest # Install the latest stable release" - echo " ${FUNCNAME[0]} build_env test # Install the pre-release" - echo " ${FUNCNAME[0]} build_env nightly 1 # Install the CPU variant of the nightly" + echo " ${FUNCNAME[0]} build_env 1.11.0 # Install a specific version" + echo " ${FUNCNAME[0]} build_env latest # Install the latest stable release" + echo " ${FUNCNAME[0]} build_env test # Install the pre-release" + echo " ${FUNCNAME[0]} build_env nightly cpu # Install the CPU variant of the nightly" return 1 else echo "################################################################################" @@ -419,11 +538,11 @@ install_pytorch_conda () { echo "" fi - # Install cpuonly if needed - if [ "$pytorch_cpu" != "" ]; then - pytorch_cpu=1 + # Install the cpuonly package if needed + if [ "$pytorch_variant_type" == "cpu" ]; then local pytorch_package="cpuonly pytorch" else + pytorch_variant_type="cuda" local pytorch_package="pytorch" fi @@ -437,13 +556,25 @@ install_pytorch_conda () { local pytorch_channel="pytorch" fi + # Clean up packages before installation + __conda_cleanup + # Install PyTorch packages - echo "[INSTALL] Attempting to install '${pytorch_package}' (${pytorch_version}, CPU=${pytorch_cpu:-0}) through Conda using channel '${pytorch_channel}' ..." + # NOTE: Installation of large package might fail due to corrupt package download + # Use --force-reinstall to address this on retries - https://datascience.stackexchange.com/questions/41732/conda-verification-failed + echo "[INSTALL] Attempting to install '${pytorch_package}' (${pytorch_version}, variant = ${pytorch_variant_type}) through Conda using channel '${pytorch_channel}' ..." 
# shellcheck disable=SC2086 - (exec_with_retries conda install -n "${env_name}" -y ${pytorch_package} -c "${pytorch_channel}") || return 1 + (exec_with_retries conda install --force-reinstall -n "${env_name}" -y ${pytorch_package} -c "${pytorch_channel}") || return 1 + + # Check that PyTorch is importable + (test_python_import "${env_name}" torch.distributed) || return 1 + + # Print out the actual installed PyTorch version + installed_pytorch_version=$(conda run -n "${env_name}" python -c "import torch; print(torch.__version__)") + echo "[CHECK] NOTE: The installed version is: ${installed_pytorch_version}" # Run check for GPU variant - if [ "$pytorch_cpu" == "" ]; then + if [ "$pytorch_variant_type" == "cuda" ]; then # Ensure that the PyTorch build is the GPU variant (i.e. contains cuDNN reference) # This test usually applies to the PyTorch nightly builds if conda list -n "${env_name}" pytorch | grep cudnn; then @@ -462,13 +593,7 @@ install_pytorch_conda () { (test_filepath "${env_name}" cuda_cmake_macros.h) || return 1 fi - # Check that PyTorch is importable - (test_python_import "${env_name}" torch.distributed) || return 1 - - # Print out the actual installed PyTorch version - installed_pytorch_version=$(conda run -n "${env_name}" python -c "import torch; print(torch.__version__)") - echo "[INSTALL] Installed PyTorch through Conda" - echo "[INSTALL] NOTE: The installed version is: ${installed_pytorch_version}" + echo "[INSTALL] Successfully installed PyTorch through Conda" } install_pytorch_pip () { @@ -527,30 +652,53 @@ install_pytorch_pip () { # shellcheck disable=SC2086 (exec_with_retries conda run -n "${env_name}" pip install ${pytorch_package} --extra-index-url ${pytorch_channel}) || return 1 - if [ "$pytorch_variant_type" != "cpu" ]; then - if [ "$pytorch_variant_type" == "cuda" ]; then - # Ensure that the PyTorch-CUDA headers are properly installed - (test_filepath "${env_name}" cuda_cmake_macros.h) || return 1 - fi + # Check that PyTorch is importable + (test_python_import "${env_name}" torch.distributed) || return 1 + # Print out the actual installed PyTorch version + installed_pytorch_version=$(conda run -n "${env_name}" python -c "import torch; print(torch.__version__)") + echo "[CHECK] NOTE: The installed version is: ${installed_pytorch_version}" + + if [ "$pytorch_variant_type" != "cpu" ]; then # Ensure that the PyTorch build is of the correct variant # This test usually applies to the PyTorch nightly builds - if conda run -n build_binary pip list torch | grep torch | grep "${pytorch_variant}"; then + if conda run -n "${env_name}" pip list torch | grep torch | grep "${pytorch_variant}"; then echo "[CHECK] The installed PyTorch ${pytorch_version} is the correct variant (${pytorch_variant})" else echo "[CHECK] The installed PyTorch ${pytorch_version} appears to be an incorrect variant as it is missing references to ${pytorch_variant}!" - echo "[CHECK] This can happen if the variant of PyTorch (e.g. GPU, nightly) for the MAJOR.MINOR version of CUDA presently installed on the system has not been published yet." + echo "[CHECK] This can happen if the variant of PyTorch (e.g. GPU, nightly) for the MAJOR.MINOR version of CUDA or ROCm presently installed on the system is not available." 
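A rough sketch of the variant-aware PyTorch install entry points above, using invocations taken from the usage strings and the CI workflows in this patch:

    . .github/scripts/setup_env.bash
    install_pytorch_conda build_env nightly cpu          # Conda path, CPU variant
    install_pytorch_pip   build_env nightly cpu          # pip path, CPU variant
    install_pytorch_pip   build_env nightly rocm 5.4.2   # pip path, ROCm variant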
return 1 fi fi - # Check that PyTorch is importable - (test_python_import "${env_name}" torch.distributed) || return 1 + if [ "$pytorch_variant_type" == "cuda" ]; then + # Ensure that the PyTorch-CUDA headers are properly installed + (test_filepath "${env_name}" cuda_cmake_macros.h) || return 1 + fi - # Print out the actual installed PyTorch version - installed_pytorch_version=$(conda run -n "${env_name}" python -c "import torch; print(torch.__version__)") - echo "[INSTALL] Installed PyTorch through PIP" - echo "[INSTALL] NOTE: The installed version is: ${installed_pytorch_version}" + echo "[INSTALL] Successfully installed PyTorch through PIP" +} + + +################################################################################ +# CUDA Setup Functions +################################################################################ + +install_nvidia_drivers_centos () { + echo "################################################################################" + echo "# Install NVIDIA Drivers" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + + echo "[SETUP] Adding NVIDIA repos to yum ..." + print_exec sudo yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm + print_exec sudo yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo + print_exec sudo yum clean expire-cache + + echo "[SETUP] Installing NVIDIA drivers ..." + install_system_packages nvidia-driver-latest-dkms } install_cuda () { @@ -578,9 +726,12 @@ install_cuda () { return 1 fi + # Clean up packages before installation + __conda_cleanup + # Install CUDA packages echo "[INSTALL] Installing CUDA ${cuda_version} ..." 
- (exec_with_retries conda install -n "${env_name}" -y cuda -c "nvidia/label/cuda-${cuda_version}") || return 1 + (exec_with_retries conda install --force-reinstall -n "${env_name}" -y cuda -c "nvidia/label/cuda-${cuda_version}") || return 1 # Ensure that nvcc is properly installed (test_binpath "${env_name}" nvcc) || return 1 @@ -604,6 +755,86 @@ install_cuda () { echo "[INSTALL] Successfully installed CUDA ${cuda_version}" } +install_cudnn () { + local env_name="$1" + local install_path="$2" + local cuda_version="$3" + if [ "$cuda_version" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME INSTALL_PATH CUDA_VERSION" + echo "Example:" + echo " ${FUNCNAME[0]} build_env \$(pwd)/cudnn_install 11.7" + return 1 + else + echo "################################################################################" + echo "# Install cuDNN" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + fi + + # Install cuDNN manually + # Based on install script in https://github.com/pytorch/builder/blob/main/common/install_cuda.sh + local cudnn_packages=( + ["115"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz" + ["116"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz" + ["117"]="https://ossci-linux.s3.amazonaws.com/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz" + ["118"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz" + ) + + # Split version string by dot into array, i.e. 11.7.1 => [11, 7, 1] + # shellcheck disable=SC2206 + local cuda_version_arr=(${cuda_version//./ }) + # Fetch the major and minor version to concat + local cuda_concat_version="${cuda_version_arr[0]}${cuda_version_arr[1]}" + + # Get the URL + local cudnn_url="${cudnn_packages[cuda_concat_version]}" + if [ "$cudnn_url" == "" ]; then + # Default to cuDNN for 11.7 if no CUDA version fits + echo "[INSTALL] Defaulting to cuDNN for CUDA 11.7" + cudnn_url="${cudnn_packages[117]}" + fi + + # Clear the install path + rm -rf "$install_path" + mkdir -p "$install_path" + + # Create temporary directory + # shellcheck disable=SC2155 + local tmp_dir=$(mktemp -d) + cd "$tmp_dir" || return 1 + + # Download cuDNN + echo "[INSTALL] Downloading cuDNN to ${tmp_dir} ..." + (exec_with_retries wget -q "$cudnn_url" -O cudnn.tar.xz) || return 1 + + # Unpack the tarball + echo "[INSTALL] Unpacking cuDNN ..." + tar -xvf cudnn.tar.xz + + # Copy the includes and libs over to the install path + echo "[INSTALL] Moving cuDNN files to ${install_path} ..." + rm -rf "${install_path:?}/include" + rm -rf "${install_path:?}/lib" + mv cudnn-linux-*/include "$install_path" + mv cudnn-linux-*/lib "$install_path" + + # Delete the temporary directory + cd - || return 1 + rm -rf "$tmp_dir" + + # Export the environment variables to the Conda environment + echo "[INSTALL] Set environment variables CUDNN_INCLUDE_DIR and CUDNN_LIBRARY ..." 
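A small sketch of how the cuDNN helper above resolves its download URL: the CUDA version string is split on dots and the major/minor parts are concatenated into the key used to index cudnn_packages. The install call itself is the example from the function's usage string:

    cuda_version="11.7.1"
    # shellcheck disable=SC2206
    cuda_version_arr=(${cuda_version//./ })
    echo "${cuda_version_arr[0]}${cuda_version_arr[1]}"   # prints 117

    . .github/scripts/setup_env.bash
    install_cudnn build_env "$(pwd)/cudnn_install" 11.7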
+ print_exec conda env config vars set -n "${env_name}" CUDNN_INCLUDE_DIR="${install_path}/include" CUDNN_LIBRARY="${install_path}/lib" + + echo "[INSTALL] Successfully installed cuDNN (for CUDA ${cuda_version})" +} + +################################################################################ +# ROCm Setup Functions +################################################################################ + install_rocm_ubuntu () { local env_name="$1" local rocm_version="$2" @@ -652,15 +883,25 @@ install_rocm_ubuntu () { (exec_with_retries amdgpu-install -y --usecase=hiplibsdk,rocm --no-dkms) || return 1 echo "[INSTALL] Installing HIP-relevant packages ..." - install_system_packages mesa-common-dev clang comgr libopenblas-dev jp intel-mkl-full locales libnuma-dev install_system_packages hipify-clang miopen-hip miopen-hip-dev + # There is no need to install these packages for ROCm + # install_system_packages mesa-common-dev clang comgr libopenblas-dev jp intel-mkl-full locales libnuma-dev + echo "[INSTALL] Cleaning up ..." print_exec rm -f "${package_name}" + echo "[INFO] Check ROCM GPU info ..." + print_exec rocm-smi + echo "[INSTALL] Successfully installed ROCm ${rocm_version}" } + +################################################################################ +# Build Tools Setup Functions +################################################################################ + install_cxx_compiler () { local env_name="$1" local use_system_package_manager="$2" @@ -684,15 +925,19 @@ install_cxx_compiler () { install_system_packages gcc gcc-c++ else - # Install gxx_linux-64 from main instead of cxx-compiler from conda-forge, as - # the latter breaks builds: + # Install gxx_linux-64 from conda-forge instead of from anaconda channel. + # sysroot_linux-64 needs to be installed alongside this: + # # https://root-forum.cern.ch/t/error-timespec-get-has-not-been-declared-with-conda-root-package/45712/6 + # https://github.com/conda-forge/conda-forge.github.io/issues/1625 + # https://conda-forge.org/docs/maintainer/knowledge_base.html#using-centos-7 + # https://github.com/conda/conda-build/issues/4371 # - # NOTE: Install g++ 9.x instead of 11.x becaue 11.x builds libraries with - # references to GLIBCXX_3.4.29, which is not available on systems with older + # NOTE: We install g++ 10.x instead of 11.x becaue 11.x builds binaries that + # reference GLIBCXX_3.4.29, which may not be available on systems with older # versions of libstdc++.so.6 such as CentOS Stream 8 and Ubuntu 20.04 echo "[INSTALL] Installing C/C++ compilers through Conda ..." - (exec_with_retries conda install -n "${env_name}" -y gxx_linux-64=9.3.0) || return 1 + (exec_with_retries conda install -n "${env_name}" -y gxx_linux-64=10.4.0 sysroot_linux-64=2.17 -c conda-forge) || return 1 # The compilers are visible in the PATH as `x86_64-conda-linux-gnu-cc` and # `x86_64-conda-linux-gnu-c++`, so symlinks will need to be created @@ -716,6 +961,15 @@ install_cxx_compiler () { # Print out the C++ version print_exec conda run -n "${env_name}" c++ --version + + # https://stackoverflow.com/questions/2324658/how-to-determine-the-version-of-the-c-standard-used-by-the-compiler + echo "[INSTALL] Printing the default version of the C++ standard used by the compiler ..." + print_exec conda run -n "${env_name}" c++ -x c++ /dev/null -E -dM | grep __cplusplus + + # https://stackoverflow.com/questions/4991707/how-to-find-my-current-compilers-standard-like-if-it-is-c90-etc + echo "[INSTALL] Printing the default version of the C standard used by the compiler ..." 
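To make the GLIBCXX note above concrete, here is a hedged follow-up check one could run after installing the compilers; print_glibc_info is the helper added earlier in this patch, and the .so path is purely illustrative:

    . .github/scripts/setup_env.bash
    install_cxx_compiler build_env
    # Illustrative path: list the GLIBC/GLIBCXX versions a freshly built library references
    print_glibc_info fbgemm_gpu/fbgemm_gpu_py.so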
+ print_exec conda run -n "${env_name}" cc -dM -E - < /dev/null | grep __STDC_VERSION__ + echo "[INSTALL] Successfully installed C/C++ compilers" } @@ -759,83 +1013,32 @@ install_build_tools () { echo "[INSTALL] Successfully installed all the build tools" } -install_cudnn () { +install_docs_tools () { local env_name="$1" - local install_path="$2" - local cuda_version="$3" - if [ "$cuda_version" == "" ]; then - echo "Usage: ${FUNCNAME[0]} ENV_NAME INSTALL_PATH CUDA_VERSION" - echo "Example:" - echo " ${FUNCNAME[0]} build_env \$(pwd)/cudnn_install 11.7" + if [ "$env_name" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME" + echo "Example(s):" + echo " ${FUNCNAME[0]} build_env" return 1 else echo "################################################################################" - echo "# Install cuDNN" + echo "# Install Documentation Tools" echo "#" echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" echo "################################################################################" echo "" fi - # Install cuDNN manually - # Based on install script in https://github.com/pytorch/builder/blob/main/common/install_cuda.sh - local cudnn_packages=( - ["115"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz" - ["116"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz" - ["117"]="https://ossci-linux.s3.amazonaws.com/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz" - ["118"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz" - ) - - # Split version string by dot into array, i.e. 11.7.1 => [11, 7, 1] - # shellcheck disable=SC2206 - local cuda_version_arr=(${cuda_version//./ }) - # Fetch the major and minor version to concat - local cuda_concat_version="${cuda_version_arr[0]}${cuda_version_arr[1]}" - - # Get the URL - local cudnn_url="${cudnn_packages[cuda_concat_version]}" - if [ "$cudnn_url" == "" ]; then - # Default to cuDNN for 11.7 if no CUDA version fits - echo "[INSTALL] Defaulting to cuDNN for CUDA 11.7" - cudnn_url="${cudnn_packages[117]}" - fi - - # Clear the install path - rm -rf "$install_path" - mkdir -p "$install_path" - - # Create temporary directory - # shellcheck disable=SC2155 - local tmp_dir=$(mktemp -d) - cd "$tmp_dir" || return 1 - - # Download cuDNN - echo "[INSTALL] Downloading cuDNN to ${tmp_dir} ..." - (exec_with_retries wget -q "$cudnn_url" -O cudnn.tar.xz) || return 1 - - # Unpack the tarball - echo "[INSTALL] Unpacking cuDNN ..." - tar -xvf cudnn.tar.xz + echo "[INSTALL] Installing docs tools ..." + (exec_with_retries conda install -n "${env_name}" -c conda-forge -y \ + doxygen) || return 1 - # Copy the includes and libs over to the install path - echo "[INSTALL] Moving cuDNN files to ${install_path} ..." - rm -rf "${install_path:?}/include" - rm -rf "${install_path:?}/lib" - mv cudnn-linux-*/include "$install_path" - mv cudnn-linux-*/lib "$install_path" - - # Delete the temporary directory - cd - || return 1 - rm -rf "$tmp_dir" - - # Export the environment variables to the Conda environment - echo "[INSTALL] Set environment variables CUDNN_INCLUDE_DIR and CUDNN_LIBRARY ..." 
-  print_exec conda env config vars set -n "${env_name}" CUDNN_INCLUDE_DIR="${install_path}/include" CUDNN_LIBRARY="${install_path}/lib"
+  # Check binaries are visible in the PATH
+  (test_binpath "${env_name}" doxygen) || return 1
 
-  echo "[INSTALL] Successfully installed cuDNN (for CUDA ${cuda_version})"
+  echo "[INSTALL] Successfully installed all the docs tools"
 }
 
-
 ################################################################################
 # Combination Functions
 ################################################################################
@@ -866,7 +1069,7 @@ create_conda_pytorch_environment () {
 
   if [ "${cuda_version}" == "" ]; then
     # Install the CPU variant of PyTorch
-    install_pytorch_conda "${env_name}" "${pytorch_version}" 1
+    install_pytorch_conda "${env_name}" "${pytorch_version}" cpu
   else
     # Install CUDA and the GPU variant of PyTorch
     install_cuda "${env_name}" "${cuda_version}"
@@ -876,7 +1079,7 @@
 
 ################################################################################
-# Build Functions
+# FBGEMM_GPU Build Functions
 ################################################################################
 
 prepare_fbgemm_gpu_build () {
@@ -895,6 +1098,11 @@ prepare_fbgemm_gpu_build () {
     echo ""
   fi
 
+  if [[ "${GITHUB_WORKSPACE}" ]]; then
+    # https://github.com/actions/checkout/issues/841
+    git config --global --add safe.directory "${GITHUB_WORKSPACE}"
+  fi
+
   echo "[BUILD] Running git submodules update ..."
   git submodule sync
   git submodule update --init --recursive
@@ -908,6 +1116,103 @@
   echo "[BUILD] Successfully ran git submodules update"
 }
 
+__configure_fbgemm_gpu_build_cpu () {
+  # Update the package name and build args depending on if CUDA is specified
+  echo "[BUILD] Setting CPU-only build args ..."
+  build_args=(--cpu_only)
+}
+
+__configure_fbgemm_gpu_build_rocm () {
+  local fbgemm_variant_targets="$1"
+
+  # Fetch available ROCm architectures on the machine
+  if [ "$fbgemm_variant_targets" != "" ]; then
+    echo "[BUILD] ROCm targets have been manually provided: ${fbgemm_variant_targets}"
+    local arch_list="${fbgemm_variant_targets}"
+  else
+    if which rocminfo; then
+      # shellcheck disable=SC2155
+      local arch_list=$(rocminfo | grep -o -m 1 'gfx.*')
+      echo "[BUILD] Architectures list from rocminfo: ${arch_list}"
+
+      if [ "$arch_list" == "" ]; then
+        # By default, build for MI250 only to save time
+        local arch_list=gfx90a
+      fi
+    else
+      echo "[BUILD] rocminfo not found in PATH!"
+    fi
+  fi
+
+  echo "[BUILD] Setting the following ROCm targets: ${arch_list}"
+  print_exec conda env config vars set -n "${env_name}" PYTORCH_ROCM_ARCH="${arch_list}"
+
+  echo "[BUILD] Setting ROCm build args ..."
+  build_args=()
+}
+
+__configure_fbgemm_gpu_build_cuda () {
+  local fbgemm_variant_targets="$1"
+
+  # Check nvcc is visible
+  (test_binpath "${env_name}" nvcc) || return 1
+
+  # Check that cuDNN environment variables are available
+  (test_env_var "${env_name}" CUDNN_INCLUDE_DIR) || return 1
+  (test_env_var "${env_name}" CUDNN_LIBRARY) || return 1
+  (test_env_var "${env_name}" NVML_LIB_PATH) || return 1
+
+  local arch_list="${fbgemm_variant_targets:-7.0;8.0}"
+  echo "[BUILD] Setting the following CUDA targets: ${arch_list}"
+
+  # Build only CUDA 7.0 and 8.0 (i.e. V100 and A100) because of 100 MB binary size limits from PyPI.
+  echo "[BUILD] Setting CUDA build args ..."
+ # shellcheck disable=SC2155 + local nvml_lib_path=$(conda run -n "${env_name}" printenv NVML_LIB_PATH) + build_args=( + --nvml_lib_path="${nvml_lib_path}" + -DTORCH_CUDA_ARCH_LIST="'${arch_list}'" + ) +} + +__configure_fbgemm_gpu_build () { + local fbgemm_variant="$1" + local fbgemm_variant_targets="$2" + if [ "$fbgemm_variant" == "" ]; then + echo "Usage: ${FUNCNAME[0]} FBGEMM_VARIANT" + echo "Example(s):" + echo " ${FUNCNAME[0]} cpu # CPU-only variant" + echo " ${FUNCNAME[0]} cuda # CUDA variant for default target(s)" + echo " ${FUNCNAME[0]} cuda '7.0;8.0' # CUDA variant for custom target(s)" + echo " ${FUNCNAME[0]} rocm # ROCm variant for default target(s)" + echo " ${FUNCNAME[0]} rocm 'gfx906;gfx908;gfx90a' # ROCm variant for custom target(s)" + return 1 + else + echo "################################################################################" + echo "# Configure FBGEMM-GPU Build" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + fi + + if [ "$fbgemm_variant" == "cpu" ]; then + echo "[BUILD] Configuring build as CPU variant ..." + __configure_fbgemm_gpu_build_cpu + + elif [ "$fbgemm_variant" == "rocm" ]; then + echo "[BUILD] Configuring build as ROCm variant ..." + __configure_fbgemm_gpu_build_rocm "${fbgemm_variant_targets}" + + else + echo "[BUILD] Configuring build as CUDA variant (this is the default behavior) ..." + __configure_fbgemm_gpu_build_cuda "${fbgemm_variant_targets}" + fi + + # shellcheck disable=SC2145 + echo "[BUILD] FBGEMM_GPU build arguments have been set: ${build_args[@]}" +} + __build_fbgemm_gpu_common_pre_steps () { # Private function that uses variables instantiated by its caller @@ -918,38 +1223,12 @@ __build_fbgemm_gpu_common_pre_steps () { (test_binpath "${env_name}" g++) || return 1 if [ "$fbgemm_variant" == "cpu" ]; then - # Update the package name and build args depending on if CUDA is specified - echo "[BUILD] Applying CPU-only build args ..." - build_args=(--cpu_only) package_name="${package_name}-cpu" - elif [ "$fbgemm_variant" == "rocm" ]; then - (test_env_var "${env_name}" PYTORCH_ROCM_ARCH) || return 1 - - echo "[BUILD] Applying ROCm build args ..." - build_args=() package_name="${package_name}-rocm" - else # Set to the default variant - fbgemm_variant="gpu" - - # Check nvcc is visible - (test_binpath "${env_name}" nvcc) || return 1 - - # Check that cuDNN environment variables are available - (test_env_var "${env_name}" CUDNN_INCLUDE_DIR) || return 1 - (test_env_var "${env_name}" CUDNN_LIBRARY) || return 1 - (test_env_var "${env_name}" NVML_LIB_PATH) || return 1 - - # Build only CUDA 7.0 and 8.0 (i.e. V100 and A100) because of 100 MB binary size limits from PyPI. - echo "[BUILD] Applying GPU build args ..." 
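A hedged sketch of the new configure step above, using the variant/target combinations from its own usage text; env_name is assumed to be set by the caller, as it is in the build functions below:

    env_name=build_env
    __configure_fbgemm_gpu_build cpu                          # sets build_args=(--cpu_only)
    __configure_fbgemm_gpu_build cuda '7.0;8.0'               # CUDA with explicit targets
    __configure_fbgemm_gpu_build rocm 'gfx906;gfx908;gfx90a'  # ROCm with explicit targets
    echo "${build_args[@]}"                                   # args later passed to setup.py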
- # shellcheck disable=SC2155 - local nvml_lib_path=$(conda run -n "${env_name}" printenv NVML_LIB_PATH) - build_args=( - --nvml_lib_path="${nvml_lib_path}" - -DTORCH_CUDA_ARCH_LIST='7.0;8.0' - ) + fbgemm_variant="cuda" fi # Extract the Python tag @@ -969,12 +1248,14 @@ __build_fbgemm_gpu_common_pre_steps () { print_exec git diff } -check_fbgemm_gpu_build () { +run_fbgemm_gpu_postbuild_checks () { local fbgemm_variant="$1" if [ "$fbgemm_variant" == "" ]; then echo "Usage: ${FUNCNAME[0]} FBGEMM_VARIANT" echo "Example(s):" echo " ${FUNCNAME[0]} cpu" + echo " ${FUNCNAME[0]} cuda" + echo " ${FUNCNAME[0]} rocm" return 1 fi @@ -995,7 +1276,13 @@ check_fbgemm_gpu_build () { ) # Add more symbols to check for if it's a non-CPU variant - if [ "${fbgemm_variant}" != "cpu" ]; then + if [ "${fbgemm_variant}" == "cuda" ]; then + lib_symbols_to_check+=( + fbgemm_gpu::asynchronous_inclusive_cumsum_gpu + fbgemm_gpu::merge_pooled_embeddings + ) + elif [ "${fbgemm_variant}" == "rocm" ]; then + # merge_pooled_embeddings is missing in ROCm builds bc it requires NVML lib_symbols_to_check+=( fbgemm_gpu::asynchronous_inclusive_cumsum_gpu fbgemm_gpu::merge_pooled_embeddings @@ -1004,7 +1291,7 @@ check_fbgemm_gpu_build () { for library in "${fbgemm_gpu_so_files[@]}"; do echo "[CHECK] Listing out the GLIBCXX versions referenced by the library: ${library}" - objdump -TC "${library}" | grep GLIBCXX | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat + print_glibc_info "${library}" echo "[CHECK] Verifying sample subset of symbols in the library ..." for symbol in "${lib_symbols_to_check[@]}"; do @@ -1019,27 +1306,32 @@ build_fbgemm_gpu_package () { env_name="$1" package_name="$2" fbgemm_variant="$3" - if [ "$package_name" == "" ]; then - echo "Usage: ${FUNCNAME[0]} ENV_NAME PACKAGE_NAME [CPU_ONLY]" + fbgemm_variant_targets="$4" + if [ "$fbgemm_variant" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME PACKAGE_NAME VARIANT [TARGETS]" echo "Example(s):" - echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly # Build the full wheel package" - echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly cpu # Build the CPU-only variant of the wheel package" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly cpu # CPU-only variant" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly cuda # CUDA variant for default target(s)" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly cuda '7.0;8.0' # CUDA variant for custom target(s)" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly rocm # ROCm variant for default target(s)" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly rocm 'gfx906;gfx908;gfx90a' # ROCm variant for custom target(s)" return 1 - else - echo "################################################################################" - echo "# Build FBGEMM-GPU Package (Wheel)" - echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" - echo "################################################################################" - echo "" fi - # Run all the common FBGEMM-GPU build pre-steps (set up variables) + # Set up and configure the build __build_fbgemm_gpu_common_pre_steps || return 1 + __configure_fbgemm_gpu_build "${fbgemm_variant}" "${fbgemm_variant_targets}" || return 1 + + echo "################################################################################" + echo "# Build FBGEMM-GPU Package (Wheel)" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" # manylinux1_x86_64 is specified for PyPI 
upload # Distribute Python extensions as wheels on Linux - echo "[BUILD] Building FBGEMM-GPU (VARIANT=${fbgemm_variant}) wheel ..." + echo "[BUILD] Building FBGEMM-GPU wheel (VARIANT=${fbgemm_variant}) ..." print_exec conda run -n "${env_name}" \ python setup.py bdist_wheel \ --package_name="${package_name}" \ @@ -1048,7 +1340,7 @@ build_fbgemm_gpu_package () { "${build_args[@]}" # Run checks on the built libraries - (check_fbgemm_gpu_build "${fbgemm_variant}") || return 1 + (run_fbgemm_gpu_postbuild_checks "${fbgemm_variant}") || return 1 echo "[BUILD] Enumerating the built wheels ..." print_exec ls -lth dist/*.whl @@ -1062,34 +1354,111 @@ build_fbgemm_gpu_package () { build_fbgemm_gpu_install () { env_name="$1" fbgemm_variant="$2" + fbgemm_variant_targets="$3" + if [ "$fbgemm_variant" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME VARIANT [TARGETS]" + echo "Example(s):" + echo " ${FUNCNAME[0]} build_env cpu # CPU-only variant" + echo " ${FUNCNAME[0]} build_env cuda # CUDA variant for default target(s)" + echo " ${FUNCNAME[0]} build_env cuda '7.0;8.0' # CUDA variant for custom target(s)" + echo " ${FUNCNAME[0]} build_env rocm # ROCm variant for default target(s)" + echo " ${FUNCNAME[0]} build_env rocm 'gfx906;gfx908;gfx90a' # ROCm variant for custom target(s)" + return 1 + fi + + # Set up and configure the build + __build_fbgemm_gpu_common_pre_steps || return 1 + __configure_fbgemm_gpu_build "${fbgemm_variant}" "${fbgemm_variant_targets}" || return 1 + + echo "################################################################################" + echo "# Build + Install FBGEMM-GPU Package" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + + # Parallelism may need to be limited to prevent the build from being + # canceled for going over ulimits + echo "[BUILD] Building + installing FBGEMM-GPU (VARIANT=${fbgemm_variant}) ..." + print_exec conda run -n "${env_name}" \ + python setup.py install "${build_args[@]}" + + # Run checks on the built libraries + (run_fbgemm_gpu_postbuild_checks "${fbgemm_variant}") || return 1 + + echo "[INSTALL] Checking imports ..." 
+ # Exit this directory to prevent import clashing, since there is an + # fbgemm_gpu/ subdirectory present + cd - || return 1 + (test_python_import "${env_name}" fbgemm_gpu) || return 1 + + echo "[BUILD] FBGEMM-GPU build + install completed" +} + +build_fbgemm_gpu_develop () { + env_name="$1" + fbgemm_variant="$2" + fbgemm_variant_targets="$3" + if [ "$fbgemm_variant" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME VARIANT [TARGETS]" + echo "Example(s):" + echo " ${FUNCNAME[0]} build_env cpu # CPU-only variant" + echo " ${FUNCNAME[0]} build_env cuda # CUDA variant for default target(s)" + echo " ${FUNCNAME[0]} build_env cuda '7.0;8.0' # CUDA variant for custom target(s)" + echo " ${FUNCNAME[0]} build_env rocm # ROCm variant for default target(s)" + echo " ${FUNCNAME[0]} build_env rocm 'gfx906;gfx908;gfx90a' # ROCm variant for custom target(s)" + return 1 + fi + + # Set up and configure the build + __build_fbgemm_gpu_common_pre_steps || return 1 + __configure_fbgemm_gpu_build "${fbgemm_variant}" "${fbgemm_variant_targets}" || return 1 + + echo "################################################################################" + echo "# Build + Install FBGEMM-GPU Package" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + + # Parallelism may need to be limited to prevent the build from being + # canceled for going over ulimits + echo "[BUILD] Building (develop) FBGEMM-GPU (VARIANT=${fbgemm_variant}) ..." + print_exec conda run -n "${env_name}" \ + python setup.py build develop "${build_args[@]}" + + # Run checks on the built libraries + (run_fbgemm_gpu_postbuild_checks "${fbgemm_variant}") || return 1 + + echo "[BUILD] FBGEMM-GPU build + develop completed" +} + +build_fbgemm_gpu_docs () { + env_name="$1" if [ "$env_name" == "" ]; then - echo "Usage: ${FUNCNAME[0]} ENV_NAME [CPU_ONLY]" + echo "Usage: ${FUNCNAME[0]} ENV_NAME" echo "Example(s):" - echo " ${FUNCNAME[0]} build_env # Build + install the package" - echo " ${FUNCNAME[0]} build_env cpu # Build + Install the CPU-only variant of the package" + echo " ${FUNCNAME[0]} build_env # Build the docs" return 1 else echo "################################################################################" - echo "# Build + Install FBGEMM-GPU Package" + echo "# Build FBGEMM-GPU Documentation" echo "#" echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" echo "################################################################################" echo "" fi - # Run all the common FBGEMM-GPU build pre-steps (set up variables) - __build_fbgemm_gpu_common_pre_steps + echo "[BUILD] Installing docs-build dependencies ..." + (exec_with_retries conda run -n "${env_name}" python -m pip install -r requirements.txt) || return 1 - # Parallelism may need to be limited to prevent the build from being - # canceled for going over ulimits - echo "[BUILD] Building and installing FBGEMM-GPU (VARIANT=${fbgemm_variant}) ..." - print_exec conda run -n "${env_name}" \ - python setup.py install "${build_args[@]}" + echo "[BUILD] Running Doxygen build ..." + (exec_with_retries conda run -n "${env_name}" doxygen Doxyfile.in) || return 1 - # Run checks on the built libraries - (check_fbgemm_gpu_build "${fbgemm_variant}") || return 1 + echo "[BUILD] Building HTML pages ..." 
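Putting the build entry points above together, a rough end-to-end sketch that mirrors the CI workflows later in this patch:

    . .github/scripts/setup_env.bash
    cd fbgemm_gpu
    prepare_fbgemm_gpu_build build_env
    build_fbgemm_gpu_install build_env cpu                               # CPU-only build + install
    build_fbgemm_gpu_develop build_env rocm gfx90a                       # editable ROCm build (MI250 target)
    build_fbgemm_gpu_package build_env fbgemm_gpu_nightly cuda '7.0;8.0' # CUDA wheel build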
+ (exec_with_retries conda run -n "${env_name}" make html) || return 1 - echo "[BUILD] FBGEMM-GPU build + install completed" + echo "[INSTALL] FBGEMM-GPU documentation build completed" } install_fbgemm_gpu_package () { @@ -1124,7 +1493,7 @@ install_fbgemm_gpu_package () { ################################################################################ -# Test Functions +# FBGEMM_GPU Test Functions ################################################################################ run_fbgemm_gpu_tests () { @@ -1133,7 +1502,7 @@ run_fbgemm_gpu_tests () { if [ "$env_name" == "" ]; then echo "Usage: ${FUNCNAME[0]} ENV_NAME [FBGEMM_VARIANT]" echo "Example(s):" - echo " ${FUNCNAME[0]} build_env # Run all tests applicable to GPU (Nvidia)" + echo " ${FUNCNAME[0]} build_env # Run all tests applicable to CUDA" echo " ${FUNCNAME[0]} build_env cpu # Run all tests applicable to CPU" echo " ${FUNCNAME[0]} build_env rocm # Run all tests applicable to ROCm" return 1 @@ -1165,7 +1534,10 @@ run_fbgemm_gpu_tests () { uvm_test.py ) elif [ "$fbgemm_variant" == "rocm" ]; then - local ignored_tests=() + # https://github.com/pytorch/FBGEMM/issues/1559 + local ignored_tests=( + batched_unary_embeddings_test.py + ) else local ignored_tests=() fi @@ -1197,7 +1569,7 @@ run_fbgemm_gpu_tests () { ################################################################################ -# Publish Functions +# FBGEMM_GPU Publish Functions ################################################################################ publish_to_pypi () { diff --git a/.github/workflows/fbgemm_ci.yml b/.github/workflows/fbgemm_ci.yml index f6bae56123..79561102af 100644 --- a/.github/workflows/fbgemm_ci.yml +++ b/.github/workflows/fbgemm_ci.yml @@ -13,186 +13,179 @@ on: branches: - main +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: - build-posix: - runs-on: ${{ matrix.os }} + build-linux: + runs-on: linux.12xlarge + container: + image: ${{ matrix.container-image }} + options: --user root + defaults: + run: + shell: bash + env: + PRELUDE: .github/scripts/setup_env.bash + BUILD_DIR: build_${{ matrix.library-type }} + DEBIAN_FRONTEND: noninteractive strategy: + fail-fast: false matrix: - os: [ ubuntu-latest, macos-latest ] + container-image: [ "ubuntu:20.04" ] + library-type: [ static, shared ] steps: - - uses: actions/checkout@v3 - - name: Checkout submodules - shell: bash + - name: Setup Build Container run: | - auth_header="$(git config --local --get http.https://github.com/.extraheader)" - git submodule sync --recursive - git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive --depth=1 + apt update -y + apt install -y binutils build-essential cmake git libblas-dev python3 sudo wget + git config --global --add safe.directory '*' - - name: Get CPU info on Ubuntu - if: contains(runner.os, 'linux') - run: | - cat /proc/cpuinfo + - name: Checkout the Repository + uses: actions/checkout@v3 + with: + submodules: true - - name: Get CPU info on macOS - if: contains(runner.os, 'macOs') - run: | - sysctl -a | grep machdep.cpu + - name: Display System Info + run: . 
$PRELUDE; print_system_info - - name: Get env vars - run: | - echo GITHUB_WORKFLOW = $GITHUB_WORKFLOW - echo HOME = $HOME - echo GITHUB_ACTION = $GITHUB_ACTION - echo GITHUB_ACTIONS = $GITHUB_ACTIONS - echo GITHUB_REPOSITORY = $GITHUB_REPOSITORY - echo GITHUB_EVENT_NAME = $GITHUB_EVENT_NAME - echo GITHUB_EVENT_PATH = $GITHUB_EVENT_PATH - echo GITHUB_WORKSPACE = $GITHUB_WORKSPACE - echo GITHUB_SHA = $GITHUB_SHA - echo GITHUB_REF = $GITHUB_REF - c++ --verbose - - - name: Build static FBGEMM lib + - name: Build FBGEMM Library (${{ matrix.library-type }}) run: | set -e - mkdir build_static - cd build_static - cmake -DUSE_SANITIZER=address -DFBGEMM_LIBRARY_TYPE=static .. - make + mkdir $BUILD_DIR; cd $BUILD_DIR + cmake --version + cmake -DUSE_SANITIZER=address -DFBGEMM_LIBRARY_TYPE=${{ matrix.library-type }} -DPYTHON_EXECUTABLE=/usr/bin/python3 .. + make -j VERBOSE=1 - - name: Test static FBGEMM lib - if: contains(runner.os, 'linux') # not run on macos-latest now due to supporting AVX2 + - name: Test FBGEMM Library (${{ matrix.library-type }}) run: | set -e - cd build_static + cd $BUILD_DIR ctest --rerun-failed --output-on-failure - - name: Build shared FBGEMM lib + + build-macos: + runs-on: ${{ matrix.os }} + defaults: + run: + shell: bash + env: + PRELUDE: .github/scripts/setup_env.bash + BUILD_DIR: build_${{ matrix.library-type }} + strategy: + fail-fast: false + matrix: + os: [ macos-latest ] + library-type: [ static, shared ] + + steps: + - name: Checkout the Repository + uses: actions/checkout@v3 + with: + submodules: true + + - name: Display System Info + run: . $PRELUDE; print_system_info + + # Build but skip tests due to lack of support for AVX2 + - name: Build FBGEMM Library (${{ matrix.library-type }}) run: | set -e - mkdir build_shared - cd build_shared - cmake -DUSE_SANITIZER=address -DFBGEMM_LIBRARY_TYPE=shared .. - make + mkdir $BUILD_DIR; cd $BUILD_DIR + cmake --version + cmake -DUSE_SANITIZER=address -DFBGEMM_LIBRARY_TYPE=${{ matrix.library-type }} .. + make -j VERBOSE=1 - - name: Test shared FBGEMM lib - if: contains(runner.os, 'linux') # not run on macos-latest now due to supporting AVX2 + + build-bazel: + runs-on: linux.12xlarge + container: + image: ${{ matrix.container-image }} + options: --user root + defaults: + run: + shell: bash + env: + PRELUDE: .github/scripts/setup_env.bash + DEBIAN_FRONTEND: noninteractive + strategy: + fail-fast: false + matrix: + container-image: [ "ubuntu:20.04" ] + + steps: + - name: Setup Build Container run: | - set -e - cd build_shared - ctest --rerun-failed --output-on-failure + apt update -y + apt install -y binutils build-essential cmake git libblas-dev python3 sudo unzip wget + git config --global --add safe.directory '*' + + - name: Checkout the Repository + uses: actions/checkout@v3 + with: + submodules: true + + - name: Display System Info + run: . $PRELUDE; print_system_info + + - name: Download bazel + run: . 
$PRELUDE; setup_bazel + + - name: Build FBGEMM Library + run: bazel build -s :* + + - name: Test FBGEMM Library + run: bazel test -s :* + build-windows: runs-on: ${{ matrix.os }} + defaults: + run: + shell: cmd + env: + BUILD_DIR: build_${{ matrix.library-type }} strategy: + fail-fast: false matrix: - os: [windows-2019] + os: [ windows-2019 ] + library-type: [ static, shared ] steps: - - uses: actions/checkout@v3 - - name: Checkout submodules - shell: bash - run: | - auth_header="$(git config --local --get http.https://github.com/.extraheader)" - git submodule sync --recursive - git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive --depth=1 + - name: Checkout the Repository + uses: actions/checkout@v3 + with: + submodules: true - name: Get CPU info on Windows shell: cmd run: | wmic cpu list full - - name: Build static FBGEMM lib + - name: Build FBGEMM Library (${{ matrix.library-type }}) shell: cmd run: | call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 echo "INSTALL NINJA:" pip install ninja which ninja - mkdir build_static - cd build_static + mkdir %BUILD_DIR% + cd %BUILD_DIR% echo "STARTING CMAKE" - cmake -G Ninja -DFBGEMM_BUILD_BENCHMARKS=OFF -DFBGEMM_LIBRARY_TYPE=static -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_CXX_COMPILER="cl.exe" .. - ninja all + cmake --version + cmake -G Ninja -DFBGEMM_BUILD_BENCHMARKS=OFF -DFBGEMM_LIBRARY_TYPE=${{ matrix.library-type }} -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_CXX_COMPILER="cl.exe" .. + ninja -v all echo "Build Success" - - name: Test static FBGEMM lib + - name: Test FBGEMM Library (${{ matrix.library-type }}) shell: cmd run: | echo %cd% - cd build_static - ctest --rerun-failed --output-on-failure - if errorlevel 1 exit /b 1 - - - name: Build shared FBGEMM lib - shell: cmd - run: | - call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 - echo "INSTALL NINJA:" - pip install ninja - which ninja - mkdir build_shared - cd build_shared - echo "STARTING CMAKE" - cmake -G Ninja -DFBGEMM_BUILD_BENCHMARKS=OFF -DFBGEMM_LIBRARY_TYPE=shared -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_CXX_COMPILER="cl.exe" .. 
- ninja all - if errorlevel 1 exit /b 1 - - - name: Test shared FBGEMM lib - shell: cmd - run: | - echo %cd% - cd build_shared + cd %BUILD_DIR% set PATH=%PATH%;%cd%;%cd%\asmjit echo %PATH% ctest --rerun-failed --output-on-failure if errorlevel 1 exit /b 1 - - build-bazel: - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ ubuntu-latest ] - - steps: - - uses: actions/checkout@v3 - - name: Checkout submodules - shell: bash - run: | - auth_header="$(git config --local --get http.https://github.com/.extraheader)" - git submodule sync --recursive - git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive --depth=1 - - - name: Get env vars - run: | - echo GITHUB_WORKFLOW = $GITHUB_WORKFLOW - echo HOME = $HOME - echo GITHUB_ACTION = $GITHUB_ACTION - echo GITHUB_ACTIONS = $GITHUB_ACTIONS - echo GITHUB_REPOSITORY = $GITHUB_REPOSITORY - echo GITHUB_EVENT_NAME = $GITHUB_EVENT_NAME - echo GITHUB_EVENT_PATH = $GITHUB_EVENT_PATH - echo GITHUB_WORKSPACE = $GITHUB_WORKSPACE - echo GITHUB_SHA = $GITHUB_SHA - echo GITHUB_REF = $GITHUB_REF - c++ --verbose - - - name: Download bazel - run: | - set -e - wget https://github.com/bazelbuild/bazel/releases/download/2.2.0/bazel-2.2.0-linux-x86_64 -O bazel - # verify content - echo 'b2f002ea0e6194a181af6ac84cd94bd8dc797722eb2354690bebac92dda233ff bazel' | sha256sum --quiet -c - chmod +x bazel - - - - name: Build FBGEMM with bazel - run: | - set -e - ./bazel build --verbose_explanations --verbose_failures --compilation_mode opt :* - - - name: Test FBGEMM bazel build - run: | - set -e - ./bazel test --test_output=all --verbose_explanations --verbose_failures --compilation_mode opt :* diff --git a/.github/workflows/fbgemm_docs.yml b/.github/workflows/fbgemm_docs.yml deleted file mode 100644 index 06e2045a03..0000000000 --- a/.github/workflows/fbgemm_docs.yml +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -name: FBGEMM Documentation -on: - push: - branches: - - main -jobs: - build_docs_job: - runs-on: linux.2xlarge - steps: - # Checkout the repository to the GitHub Actions runner - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - # Update references - # TODO: update the git submodule sync after we fixed the auto-sync part - - name: Git Sumbodule Update - run: | - git submodule init - git submodule update --remote --recursive - git log - - name: Update pip - run: | - sudo yum update -y - sudo yum -y install git python3-pip - sudo pip3 install --upgrade pip - - name: Setup conda - run: | - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh - bash ~/miniconda.sh -b -p $HOME/miniconda -u - - name: setup Path - run: | - echo "/home/ec2-user/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=/home/ec2-user/miniconda" >> $GITHUB_PATH - - name: create conda env - run: | - conda create --name build_binary python=3.9 - conda info - - name: check python version - run: | - conda run -n build_binary python --version - - name: Install gcc - shell: bash - run: | - sudo yum group install -y "Development Tools" - - name: Setup Path - run: | - echo /usr/local/bin >> $GITHUB_PATH - - name: Install PyTorch - shell: bash - run: | - conda run -n build_binary python -m pip install --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html - - name: Test PyTorch Installation - run: | - conda run -n build_binary python -c "import torch.distributed" - echo "torch.distributed succeeded" - - name: Install fbgemm_gpu nightly - run: | - cd ./fbgemm_gpu - conda run -n build_binary python -m pip install -r requirements.txt - conda run -n build_binary python setup.py install --cpu_only - - name: Test fbgemm_gpu installation - shell: bash - run: | - cd ./fbgemm_gpu/docs - conda run -n build_binary \ - python -c "import fbgemm_gpu" - - name: Install Doxygen - run: | - conda install -n build_binary -c conda-forge doxygen - which doxygen - - name: Build the docset - run: | - cd ./fbgemm_gpu/docs - conda run -n build_binary python -m pip install -r requirements.txt - conda run -n build_binary doxygen Doxyfile.in - conda run -n build_binary make html - cd .. - - name: Get output time - run: echo "The time was ${{ steps.build.outputs.time }}" - - name: Deploy - uses: JamesIves/github-pages-deploy-action@releases/v3 - with: - ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }} - BRANCH: gh-pages # The branch the action should deploy to. - FOLDER: fbgemm_gpu/docs/build/html # The folder the action should deploy. 
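The docs workflow deleted above is effectively replaced by the scripted helpers added to setup_env.bash; a hedged sketch of the equivalent steps, assuming a Conda environment named build_env already exists and that the docs build is run from fbgemm_gpu/docs as the old workflow did:

    . .github/scripts/setup_env.bash
    install_docs_tools build_env        # installs doxygen from conda-forge
    cd fbgemm_gpu/docs
    build_fbgemm_gpu_docs build_env     # pip requirements, Doxygen, then make html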
diff --git a/.github/workflows/fbgemm_gpu_ci.yml b/.github/workflows/fbgemm_gpu_ci.yml index 8e021c4451..50e7c3814b 100644 --- a/.github/workflows/fbgemm_gpu_ci.yml +++ b/.github/workflows/fbgemm_gpu_ci.yml @@ -6,16 +6,33 @@ name: FBGEMM_GPU CI on: - push: + # PR Trigger + # + pull_request: branches: - main - pull_request: + + # Push Trigger (enable to catch errors coming out of multiple merges) + # + push: branches: - main + # Manual Trigger (for testing only) + # + workflow_dispatch: + +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: build_and_test_amd: - runs-on: ${{ matrix.os }} + runs-on: linux.12xlarge + container: + image: ${{ matrix.container-image }} + options: --user root defaults: run: shell: bash @@ -25,11 +42,17 @@ jobs: strategy: fail-fast: false matrix: - os: [ ubuntu-20.04 ] - python-version: [ "3.10" ] - rocm-version: [ "5.3" ] + container-image: [ "ubuntu:20.04" ] + python-version: [ "3.8", "3.9", "3.10" ] + rocm-version: [ "5.3", "5.4.2" ] steps: + - name: Setup Build Container + run: | + apt update -y + apt install -y binutils git sudo wget + git config --global --add safe.directory '*' + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -45,10 +68,7 @@ jobs: run: . $PRELUDE; free_disk_space - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -62,76 +82,85 @@ jobs: - name: Install PyTorch-ROCm Nightly run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly rocm ${{ matrix.rocm-version }} - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU-ROCM Nightly - run: | - . $PRELUDE - cd fbgemm_gpu - - # Build for MI250 only to save time. - print_exec conda env config vars set -n $BUILD_ENV PYTORCH_ROCM_ARCH=gfx90a - print_exec conda run -n $BUILD_ENV python setup.py build develop + run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_develop $BUILD_ENV rocm gfx90a - - name: Test FBGEMM_GPU-ROCM Nightly installation + - name: Test FBGEMM_GPU-ROCM Nightly Installation timeout-minutes: 10 run: . 
$PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm test_amd_gpu: - if: ${{ false }} # Disable the job for now runs-on: rocm + container: + image: "rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}-complete" + options: --user root --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size 16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined + defaults: + run: + shell: bash + env: + PRELUDE: .github/scripts/setup_env.bash + BUILD_ENV: build_binary + ENFORCE_AMD_GPU: 1 strategy: fail-fast: false matrix: - os: [ubuntu-latest] + # ROCm machines are limited, so we only test against Python 3.10 + python-version: [ "3.10" ] + rocm-version: [ "5.3", "5.4.2" ] steps: - - name: pre-checkout - shell: bash + - name: Setup Build Container run: | - if [ -d ${{ github.workspace }} ] - then - sudo chown -R $USER:$USER ${{ github.workspace }} - fi - sudo add-apt-repository ppa:git-core/ppa - sudo apt update - sudo apt -y install --only-upgrade git - - - uses: actions/checkout@v3 + apt update -y + apt install -y git wget + git config --global --add safe.directory '*' + + - name: Checkout the Repository + uses: actions/checkout@v3 with: - ref: ${{ github.ref }} - submodules: 'true' + submodules: true - - name: build fbgemm_gpu and test - shell: bash - run: | - set -eux - env - ls -l - DOCKER_IMAGE=rocm/pytorch:rocm5.4_ubuntu20.04_py3.8_pytorch_staging_base - docker pull $DOCKER_IMAGE - JENKINS_REPO_DIR=fbgemm-private-jenkins - JENKINS_REPO_DIR_BAREMETAL=$PWD - JENKINS_REPO_DIR_DOCKER=/workspace/$JENKINS_REPO_DIR - DOCKER_OPTIONS="\ - --user 0 \ - --network=host \ - --ipc=host \ - --shm-size 16G \ - --group-add video \ - --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --device=/dev/kfd \ - --device=/dev/dri \ - -v $JENKINS_REPO_DIR_BAREMETAL:$JENKINS_REPO_DIR_DOCKER - " - docker run $DOCKER_OPTIONS $DOCKER_IMAGE $JENKINS_REPO_DIR_DOCKER/.jenkins/rocm/build_and_test.sh $JENKINS_REPO_DIR_DOCKER + - name: Display System Info + run: . $PRELUDE; print_system_info + + - name: Display GPU Info + run: . $PRELUDE; print_gpu_info + + - name: Free Disk Space + run: . $PRELUDE; free_disk_space + + - name: Setup Miniconda + run: . $PRELUDE; setup_miniconda $HOME/miniconda + + - name: Create Conda Environment + run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} + + - name: Install Build Tools + run: . $PRELUDE; install_build_tools $BUILD_ENV + + - name: Install PyTorch-ROCm Nightly + run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly rocm ${{ matrix.rocm-version }} + + - name: Prepare FBGEMM_GPU Build + run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV + + - name: Build FBGEMM_GPU-ROCM Nightly + run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_develop $BUILD_ENV rocm + + - name: Test FBGEMM_GPU-ROCM Nightly Installation + timeout-minutes: 15 + run: . 
$PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm build_and_test_cpu: - runs-on: ${{ matrix.os }} + runs-on: linux.12xlarge + container: + image: ${{ matrix.container-image }} + options: --user root defaults: run: shell: bash @@ -141,10 +170,16 @@ jobs: strategy: fail-fast: false matrix: - os: [ ubuntu-20.04, ubuntu-latest ] + container-image: [ "ubuntu:20.04", "ubuntu:22.04" ] python-version: [ "3.8", "3.9", "3.10" ] steps: + - name: Setup Build Container + run: | + apt update -y + apt install -y binutils build-essential git sudo wget + git config --global --add safe.directory '*' + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -157,10 +192,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -171,12 +203,12 @@ jobs: - name: Install PyTorch run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cpu - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - - name: Build and Install FBGEMM_GPU (CPU version) + - name: Build + Install FBGEMM_GPU (CPU version) run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_install $BUILD_ENV cpu - - name: Test with PyTest + - name: Test FBGEMM_GPU-CPU Nightly Installation timeout-minutes: 10 run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu diff --git a/.github/workflows/fbgemm_nightly_build_cpu.yml b/.github/workflows/fbgemm_gpu_cpu_nightly.yml similarity index 77% rename from .github/workflows/fbgemm_nightly_build_cpu.yml rename to .github/workflows/fbgemm_gpu_cpu_nightly.yml index 72a0af01e7..8d1d39805f 100644 --- a/.github/workflows/fbgemm_nightly_build_cpu.yml +++ b/.github/workflows/fbgemm_gpu_cpu_nightly.yml @@ -30,24 +30,36 @@ on: # workflow_dispatch: +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + # https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: # Build on CPU hosts, run tests, and upload to GHA build_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.4xlarge + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash env: PRELUDE: .github/scripts/setup_env.bash BUILD_ENV: build_binary + continue-on-error: true strategy: # Don't fast-fail all the other builds if one of the them fails fail-fast: false matrix: - os: [ linux.4xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -60,10 +72,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -75,9 +84,9 @@ jobs: run: . 
$PRELUDE; install_build_tools $BUILD_ENV - name: Install PyTorch-CPU Nightly - run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly cpuonly + run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly cpu - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU Nightly (CPU version) @@ -92,7 +101,10 @@ jobs: # Download the built artifact from GHA, test on GPU, and push to PyPI test_and_publish_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.4xlarge + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash @@ -102,16 +114,23 @@ jobs: strategy: fail-fast: false matrix: - os: [ linux.4xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] needs: build_artifact steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: submodules: true + - name: Download Wheel Artifact from GHA + uses: actions/download-artifact@v3 + with: + name: fbgemm_gpu_nightly_cpu_${{ matrix.python-version }}.whl + - name: Display System Info run: . $PRELUDE; print_system_info; print_ec2_info @@ -119,29 +138,21 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} - name: Install PyTorch Nightly - run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly cpuonly + run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly cpu - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - - name: Download Wheel Artifact from GHA - uses: actions/download-artifact@v3 - with: - name: fbgemm_gpu_nightly_cpu_${{ matrix.python-version }}.whl - - name: Install FBGEMM_GPU Nightly (CPU version) run: | . $PRELUDE - ls . + pwd; ls -la . 
install_fbgemm_gpu_package $BUILD_ENV *.whl - name: Test with PyTest diff --git a/.github/workflows/fbgemm_release_build_cpu.yml b/.github/workflows/fbgemm_gpu_cpu_release.yml similarity index 78% rename from .github/workflows/fbgemm_release_build_cpu.yml rename to .github/workflows/fbgemm_gpu_cpu_release.yml index a652c89854..577f0b5e88 100644 --- a/.github/workflows/fbgemm_release_build_cpu.yml +++ b/.github/workflows/fbgemm_gpu_cpu_release.yml @@ -22,24 +22,35 @@ on: # workflow_dispatch: +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: # Build on CPU hosts, run tests, and upload to GHA build_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.4xlarge + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash env: PRELUDE: .github/scripts/setup_env.bash BUILD_ENV: build_binary + continue-on-error: true strategy: # Don't fast-fail all the other builds if one of the them fails fail-fast: false matrix: - os: [ linux.4xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -52,10 +63,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -67,9 +75,9 @@ jobs: run: . $PRELUDE; install_build_tools $BUILD_ENV - name: Install PyTorch-CPU Test - run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test cpuonly + run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test cpu - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU (CPU version) @@ -84,7 +92,10 @@ jobs: # Download the built artifact from GHA, test on GPU, and push to PyPI test_and_publish_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.4xlarge + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash @@ -94,16 +105,23 @@ jobs: strategy: fail-fast: false matrix: - os: [ linux.4xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] needs: build_artifact steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: submodules: true + - name: Download Wheel Artifact from GHA + uses: actions/download-artifact@v3 + with: + name: fbgemm_gpu_cpu_${{ matrix.python-version }}.whl + - name: Display System Info run: . $PRELUDE; print_system_info; print_ec2_info @@ -111,29 +129,21 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} - name: Install PyTorch Test - run: . 
$PRELUDE; install_pytorch_conda $BUILD_ENV test cpuonly + run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test cpu - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - - name: Download Wheel Artifact from GHA - uses: actions/download-artifact@v3 - with: - name: fbgemm_gpu_cpu_${{ matrix.python-version }}.whl - - name: Install FBGEMM_GPU (CPU version) run: | . $PRELUDE - ls . + pwd; ls -la . install_fbgemm_gpu_package $BUILD_ENV *.whl - name: Test with PyTest diff --git a/.github/workflows/fbgemm_nightly_build.yml b/.github/workflows/fbgemm_gpu_cuda_nightly.yml similarity index 72% rename from .github/workflows/fbgemm_nightly_build.yml rename to .github/workflows/fbgemm_gpu_cuda_nightly.yml index 4cdb10aaa8..c08d088991 100644 --- a/.github/workflows/fbgemm_nightly_build.yml +++ b/.github/workflows/fbgemm_gpu_cuda_nightly.yml @@ -3,7 +3,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -name: FBGEMM_GPU Nightly Build +name: FBGEMM_GPU-CUDA Nightly Build on: # PR Trigger (enabled only for debugging) @@ -30,25 +30,36 @@ on: # workflow_dispatch: +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: # Build on CPU hosts and upload to GHA build_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.24xlarge + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash env: PRELUDE: .github/scripts/setup_env.bash BUILD_ENV: build_binary + continue-on-error: true strategy: # Don't fast-fail all the other builds if one of the them fails fail-fast: false matrix: - os: [ linux.12xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] cuda-version: [ "11.7.1", "11.8.0" ] steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo tar wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -61,10 +72,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -78,17 +86,18 @@ jobs: - name: Install CUDA run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }} + # Install via PIP to avoid defaulting to the CPU variant if the GPU variant of the day is not ready - name: Install PyTorch Nightly - run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly + run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda ${{ matrix.cuda-version }} - name: Install cuDNN run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }} - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU Nightly - run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV fbgemm_gpu_nightly + run: . 
$PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV fbgemm_gpu_nightly cuda - name: Upload Built Wheel as GHA Artifact uses: actions/upload-artifact@v3 @@ -99,7 +108,10 @@ jobs: # Download the built artifact from GHA, test on GPU, and push to PyPI test_and_publish_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.g5.4xlarge.nvidia.gpu + container: + image: ${{ matrix.container-image }} + options: --user root --gpus all defaults: run: shell: bash @@ -110,19 +122,30 @@ jobs: strategy: fail-fast: false matrix: - os: [ linux.g5.4xlarge.nvidia.gpu ] - python-version: [ "3.8", "3.9", "3.10" ] + container-image: [ "nvidia/cuda:11.8.0-base-ubuntu20.04" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] cuda-version: [ "11.7.1", "11.8.0" ] # Specify exactly ONE CUDA version for artifact publish cuda-version-publish: [ "11.7.1" ] needs: build_artifact steps: + - name: Setup Build Container + run: | + apt update -y + apt install -y binutils curl git sudo wget + git config --global --add safe.directory '*' + - name: Checkout the Repository uses: actions/checkout@v3 with: submodules: true + - name: Download Wheel Artifact from GHA + uses: actions/download-artifact@v3 + with: + name: fbgemm_gpu_nightly_${{ matrix.python-version }}_cuda${{ matrix.cuda-version }}.whl + - name: Display System Info run: . $PRELUDE; print_system_info; print_ec2_info @@ -130,10 +153,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -141,21 +161,17 @@ jobs: - name: Install CUDA run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }} + # Install via PIP to avoid defaulting to the CPU variant if the GPU variant of the day is not ready - name: Install PyTorch Nightly - run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly + run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda ${{ matrix.cuda-version }} - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - - name: Download Wheel Artifact from GHA - uses: actions/download-artifact@v3 - with: - name: fbgemm_gpu_nightly_${{ matrix.python-version }}_cuda${{ matrix.cuda-version }}.whl - - name: Install FBGEMM_GPU Nightly run: | . $PRELUDE - ls . + pwd; ls -la . install_fbgemm_gpu_package $BUILD_ENV *.whl - name: Test with PyTest diff --git a/.github/workflows/fbgemm_release_build.yml b/.github/workflows/fbgemm_gpu_cuda_release.yml similarity index 77% rename from .github/workflows/fbgemm_release_build.yml rename to .github/workflows/fbgemm_gpu_cuda_release.yml index 5e3d369fe4..3a41125170 100644 --- a/.github/workflows/fbgemm_release_build.yml +++ b/.github/workflows/fbgemm_gpu_cuda_release.yml @@ -3,7 +3,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-name: FBGEMM_GPU Release Build +name: FBGEMM_GPU-CUDA Release Build on: # PR Trigger (enabled only for debugging) @@ -22,25 +22,36 @@ on: # workflow_dispatch: +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: # Build on CPU hosts and upload to GHA build_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.24xlarge + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash env: PRELUDE: .github/scripts/setup_env.bash BUILD_ENV: build_binary + continue-on-error: true strategy: # Don't fast-fail all the other builds if one of the them fails fail-fast: false matrix: - os: [ linux.12xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] cuda-version: [ "11.7.1", "11.8.0" ] steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo tar wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -53,10 +64,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -76,11 +84,11 @@ jobs: - name: Install cuDNN run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }} - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU - run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV fbgemm_gpu + run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV fbgemm_gpu cuda - name: Upload Built Wheel as GHA Artifact uses: actions/upload-artifact@v3 @@ -91,7 +99,10 @@ jobs: # Download the built artifact from GHA, test on GPU, and push to PyPI test_and_publish_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.g5.4xlarge.nvidia.gpu + container: + image: ${{ matrix.container-image }} + options: --user root --gpus all defaults: run: shell: bash @@ -102,18 +113,30 @@ jobs: strategy: fail-fast: false matrix: - os: [ linux.g5.4xlarge.nvidia.gpu ] - python-version: [ "3.8", "3.9", "3.10" ] + container-image: [ "nvidia/cuda:11.8.0-base-ubuntu20.04" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] cuda-version: [ "11.7.1", "11.8.0" ] # Specify exactly ONE CUDA version for artifact publish cuda-version-publish: [ "11.7.1" ] needs: build_artifact + steps: + - name: Setup Build Container + run: | + apt update -y + apt install -y binutils curl git sudo wget + git config --global --add safe.directory '*' + - name: Checkout the Repository uses: actions/checkout@v3 with: submodules: true + - name: Download Wheel Artifact from GHA + uses: actions/download-artifact@v3 + with: + name: fbgemm_gpu_${{ matrix.python-version }}_cuda${{ matrix.cuda-version }}.whl + - name: Display System Info run: . $PRELUDE; print_system_info; print_ec2_info @@ -121,10 +144,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . 
$PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -135,18 +155,13 @@ jobs: - name: Install PyTorch Test run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - - name: Download Wheel Artifact from GHA - uses: actions/download-artifact@v3 - with: - name: fbgemm_gpu_${{ matrix.python-version }}_cuda${{ matrix.cuda-version }}.whl - - name: Install FBGEMM_GPU run: | . $PRELUDE - ls . + pwd; ls -la . install_fbgemm_gpu_package $BUILD_ENV *.whl - name: Test with PyTest diff --git a/.github/workflows/fbgemm_gpu_docs.yml b/.github/workflows/fbgemm_gpu_docs.yml new file mode 100644 index 0000000000..fb63995752 --- /dev/null +++ b/.github/workflows/fbgemm_gpu_docs.yml @@ -0,0 +1,89 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +name: FBGEMM_GPU Documentation + +on: + # PR Trigger + # + pull_request: + branches: + - main + + # Push Trigger (enable to catch errors coming out of multiple merges) + # + push: + branches: + - main + + # Manual Trigger (for testing only) + # + workflow_dispatch: + +jobs: + build-docs: + runs-on: linux.2xlarge + container: + image: amazonlinux:2023 + options: --user root + defaults: + run: + shell: bash + env: + PRELUDE: .github/scripts/setup_env.bash + BUILD_ENV: build_binary + strategy: + fail-fast: false + matrix: + python-version: [ "3.11" ] + + steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git rsync sudo tar wget which + + - name: Checkout the Repository + uses: actions/checkout@v3 + with: + submodules: true + + - name: Display System Info + run: . $PRELUDE; print_system_info + + - name: Display GPU Info + run: . $PRELUDE; print_gpu_info + + - name: Setup Miniconda + run: . $PRELUDE; setup_miniconda $HOME/miniconda + + - name: Create Conda Environment + run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} + + - name: Install C/C++ Compilers + run: . $PRELUDE; install_cxx_compiler $BUILD_ENV + + - name: Install Build Tools + run: . $PRELUDE; install_build_tools $BUILD_ENV + + - name: Install Documentation Tools + run: . $PRELUDE; install_docs_tools $BUILD_ENV + + - name: Install PyTorch-CPU Nightly + run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cpu + + - name: Prepare FBGEMM_GPU Build + run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV + + - name: Build + Install FBGEMM_GPU (CPU version) + run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_install $BUILD_ENV cpu + + - name: Build FBGEMM_GPU Documentation + run: . 
$PRELUDE; cd fbgemm_gpu/docs; build_fbgemm_gpu_docs $BUILD_ENV + + - name: Deploy FBGEMM_GPU Documentation + if: ${{ github.event_name != 'pull_request' }} + uses: JamesIves/github-pages-deploy-action@releases/v4 + with: + branch: gh-pages # The branch the action should deploy to + folder: fbgemm_gpu/docs/build/html # The folder the action should deploy diff --git a/.github/workflows/fbgemm_gpu_lint.yml b/.github/workflows/fbgemm_gpu_lint.yml index dc2b6344ce..8a484e9844 100644 --- a/.github/workflows/fbgemm_gpu_lint.yml +++ b/.github/workflows/fbgemm_gpu_lint.yml @@ -6,20 +6,29 @@ name: FBGEMM_GPU Lint on: + # PR Trigger + # push: branches: - main + # Push Trigger (enable to catch errors coming out of multiple merges) + # pull_request: branches: - main +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: - run_pylint: + run-lint: runs-on: ubuntu-latest strategy: matrix: - python-version: [ "3.8" ] + python-version: [ "3.10" ] steps: - uses: actions/checkout@v3 @@ -33,7 +42,7 @@ jobs: python -m pip install --upgrade pip pip install click flake8 ufmt - - name: Analyzing the code with flake8 + - name: Analyzing the Code with flake8 run: | echo "::add-matcher::fbgemm_gpu/test/lint/flake8_problem_matcher.json" flake8 --ignore=E501,W503,E203 . @@ -41,13 +50,13 @@ jobs: # W503 = line break before binary operator (deprecated) # E203 = whitespace before ":" - - name: Analyzing the code with ufmt + - name: Analyzing the Code with ufmt run: | ufmt diff fbgemm_gpu/fbgemm_gpu ufmt diff fbgemm_gpu/test ufmt diff fbgemm_gpu/bench - - name: Check Meta copyright header + - name: Check Meta Copyright Header run: | python fbgemm_gpu/test/lint/check_meta_header.py --path=./fbgemm_gpu/fbgemm_gpu --fixit=False python fbgemm_gpu/test/lint/check_meta_header.py --path=./fbgemm_gpu/test --fixit=False diff --git a/BUILD.bazel b/BUILD.bazel index e998487255..12e05c4522 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -159,14 +159,14 @@ cc_library( ) [ - cc_test( - name = paths.split_extension(paths.basename(filename))[0], - size = "medium", - srcs = [ - filename, - ], - deps = [ - ":test_utils", - ], - ) for filename in get_fbgemm_tests() + cc_test( + name = paths.split_extension(paths.basename(filename))[0], + size = "medium", + srcs = [ + filename, + ], + deps = [ + ":test_utils", + ], + ) for filename in get_fbgemm_tests() ] diff --git a/CMakeLists.txt b/CMakeLists.txt index 58dcb9aeb0..32920d1d48 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,19 @@ -cmake_minimum_required(VERSION 3.5 FATAL_ERROR) +cmake_minimum_required(VERSION 3.16 FATAL_ERROR) + +# Set the default C++ standard to C++17 +# Individual targets can have this value overridden; see +# https://cmake.org/cmake/help/latest/prop_tgt/CXX_STANDARD.html +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_VISIBILITY_PRESET hidden) + +# Set the default C standard to C11 +# Individual targets can have this value overridden; see +# https://cmake.org/cmake/help/latest/prop_tgt/C_STANDARD.html +set(CMAKE_C_STANDARD 11) +set(CMAKE_C_EXTENSIONS OFF) +set(CMAKE_C_STANDARD_REQUIRED ON) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules") @@ -114,17 +129,11 @@ add_dependencies(fbgemm_generic defs.bzl) add_dependencies(fbgemm_avx2 defs.bzl) add_dependencies(fbgemm_avx512 defs.bzl) -set_target_properties(fbgemm_generic fbgemm_avx2 
fbgemm_avx512 PROPERTIES - CXX_STANDARD 14 - CXX_STANDARD_REQUIRED YES - CXX_EXTENSIONS NO - CXX_VISIBILITY_PRESET hidden) - -#On Windows: -#1) Adding definition of ASMJIT_STATIC to avoid generating asmjit function -#calls with _dllimport attribute -#2) MSVC uses /MD in default cxx compiling flags, -#need to change it to /MT in static case +# On Windows: +# 1) Adding definition of ASMJIT_STATIC to avoid generating asmjit function +# calls with _dllimport attribute +# 2) MSVC uses /MD in default cxx compiling flags, +# Need to change it to /MT in static case if(MSVC) set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267 /wd4305 /wd4309") if(FBGEMM_LIBRARY_TYPE STREQUAL "static") @@ -267,8 +276,6 @@ elseif(FBGEMM_LIBRARY_TYPE STREQUAL "shared") set_property(TARGET fbgemm_generic PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET fbgemm_avx2 PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET fbgemm_avx512 PROPERTY POSITION_INDEPENDENT_CODE ON) - set_target_properties(fbgemm PROPERTIES - CXX_VISIBILITY_PRESET hidden) elseif(FBGEMM_LIBRARY_TYPE STREQUAL "static") add_library(fbgemm STATIC $ diff --git a/WORKSPACE.bazel b/WORKSPACE.bazel index 30b1a80424..aff61b2b94 100644 --- a/WORKSPACE.bazel +++ b/WORKSPACE.bazel @@ -16,9 +16,9 @@ http_archive( http_archive( name = "com_google_googletest", - strip_prefix = "googletest-cd6b9ae3243985d4dc725abd513a874ab4161f3e", + strip_prefix = "googletest-1.13.0", urls = [ - "https://github.com/google/googletest/archive/cd6b9ae3243985d4dc725abd513a874ab4161f3e.tar.gz", + "https://github.com/google/googletest/archive/refs/tags/v1.13.0.tar.gz", ], ) diff --git a/bench/CMakeLists.txt b/bench/CMakeLists.txt index b4fad7510a..49f9e38fa2 100644 --- a/bench/CMakeLists.txt +++ b/bench/CMakeLists.txt @@ -1,4 +1,12 @@ -cmake_minimum_required(VERSION 3.5 FATAL_ERROR) +cmake_minimum_required(VERSION 3.16 FATAL_ERROR) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_VISIBILITY_PRESET hidden) +set(CMAKE_C_STANDARD 11) +set(CMAKE_C_EXTENSIONS OFF) +set(CMAKE_C_STANDARD_REQUIRED ON) find_package(MKL) if (NOT ${MKL_FOUND}) @@ -21,15 +29,12 @@ if (${BLAS_FOUND}) message(STATUS "BLAS_LIBRARIES= ${BLAS_LIBRARIES}") endif() -#benchmarks +# Benchmarks macro(add_benchmark BENCHNAME) add_executable(${BENCHNAME} ${ARGN} BenchUtils.cc ../test/QuantizationHelpers.cc ../test/EmbeddingSpMDMTestUtils.cc) - set_target_properties(${BENCHNAME} PROPERTIES - CXX_STANDARD 11 - CXX_EXTENSIONS NO) target_compile_options(${BENCHNAME} PRIVATE "-m64" "-mavx2" "-mfma" "-masm=intel") target_link_libraries(${BENCHNAME} fbgemm) diff --git a/bench/EmbeddingSpMDM8BitBenchmark.cc b/bench/EmbeddingSpMDM8BitBenchmark.cc index 1fcf4607de..17934b6101 100644 --- a/bench/EmbeddingSpMDM8BitBenchmark.cc +++ b/bench/EmbeddingSpMDM8BitBenchmark.cc @@ -111,7 +111,7 @@ int run_benchmark( // please note we generate unique indices for (int i = 0; i < batch_size; ++i) { iota(container.begin(), container.end(), 0); - random_shuffle(container.begin(), container.end()); + shuffle(container.begin(), container.end(), generator); copy( container.begin(), container.begin() + (offsets[i + 1] - offsets[i]), diff --git a/bench/EmbeddingSpMDMBenchmark.cc b/bench/EmbeddingSpMDMBenchmark.cc index b987586aac..246549f6a7 100644 --- a/bench/EmbeddingSpMDMBenchmark.cc +++ b/bench/EmbeddingSpMDMBenchmark.cc @@ -104,7 +104,7 @@ void run_benchmark( // please note we generate unique indices for (int i = 0; i < batch_size; ++i) { iota(container.begin(), 
container.end(), 0); - random_shuffle(container.begin(), container.end()); + shuffle(container.begin(), container.end(), generator); copy( container.begin(), container.begin() + (offsets[i + 1] - offsets[i]), diff --git a/bench/EmbeddingSpMDMNBitBenchmark.cc b/bench/EmbeddingSpMDMNBitBenchmark.cc index ed5485ae29..fff665babb 100644 --- a/bench/EmbeddingSpMDMNBitBenchmark.cc +++ b/bench/EmbeddingSpMDMNBitBenchmark.cc @@ -116,7 +116,7 @@ int run_benchmark( // please note we generate unique indices for (int i = 0; i < batch_size; ++i) { iota(container.begin(), container.end(), 0); - random_shuffle(container.begin(), container.end()); + shuffle(container.begin(), container.end(), generator); copy( container.begin(), container.begin() + (offsets[i + 1] - offsets[i]), diff --git a/bench/EmbeddingSpMDMNBitRowWiseSparseBenchmark.cc b/bench/EmbeddingSpMDMNBitRowWiseSparseBenchmark.cc index d1b28f54b5..c50500768d 100644 --- a/bench/EmbeddingSpMDMNBitRowWiseSparseBenchmark.cc +++ b/bench/EmbeddingSpMDMNBitRowWiseSparseBenchmark.cc @@ -131,7 +131,7 @@ int run_benchmark( // please note we generate unique indices for (int i = 0; i < batch_size; ++i) { iota(container.begin(), container.end(), 0); - random_shuffle(container.begin(), container.end()); + shuffle(container.begin(), container.end(), generator); copy( container.begin(), container.begin() + (offsets[i + 1] - offsets[i]), diff --git a/bench/RowwiseAdagradFusedBenchmark.cc b/bench/RowwiseAdagradFusedBenchmark.cc index 6f1203e6ab..a0524afaa5 100644 --- a/bench/RowwiseAdagradFusedBenchmark.cc +++ b/bench/RowwiseAdagradFusedBenchmark.cc @@ -90,7 +90,7 @@ void run_benchmark( // please note we generate unique indices for (int i = 0; i < batch_size; ++i) { iota(container.begin(), container.end(), 0); - random_shuffle(container.begin(), container.end()); + shuffle(container.begin(), container.end(), generator); copy( container.begin(), container.begin() + (offsets[i + 1] - offsets[i]), diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 5f393b0010..2276ca9ff2 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -1,15 +1,34 @@ -cmake_minimum_required(VERSION 3.11.0 FATAL_ERROR) - -option(FBGEMM_CPU_ONLY "Build fbgemm_gpu without GPU support" OFF) - -set(message_line - "-------------------------------------------------------------") -message("${message_line}") +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +# Set the default C++ standard to C++17 +# Individual targets can have this value overridden; see +# https://cmake.org/cmake/help/latest/prop_tgt/CXX_STANDARD.html +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# Set the default C standard to C17 +# Individual targets can have this value overridden; see +# https://cmake.org/cmake/help/latest/prop_tgt/C_STANDARD.html +set(CMAKE_C_STANDARD 17) +set(CMAKE_C_EXTENSIONS OFF) +set(CMAKE_C_STANDARD_REQUIRED ON) + +function(BLOCK_PRINT) + message("================================================================================") + foreach(ARG IN LISTS ARGN) + message("${ARG}") + endforeach() + message("================================================================================") + message("") +endfunction() if(SKBUILD) - message("The project is built using scikit-build") + BLOCK_PRINT("The project is built using scikit-build") endif() +# Build options +option(FBGEMM_CPU_ONLY "Build FBGEMM_GPU without GPU support" OFF) option(USE_CUDA "Use CUDA" ON) option(USE_ROCM "Use ROCm" OFF) @@ -21,11 +40,10 @@ 
if(((EXISTS "/opt/rocm/") OR (EXISTS $ENV{ROCM_PATH})) endif() if(FBGEMM_CPU_ONLY) - message("Building for CPU-only") + BLOCK_PRINT("Building the CPU-only variant of FBGEMM-GPU") endif() -message("${message_line}") -message(STATUS "USE_ROCM ${USE_ROCM}") +BLOCK_PRINT("USE_ROCM: ${USE_ROCM}") if(FBGEMM_CPU_ONLY OR USE_ROCM) project( @@ -46,12 +64,16 @@ set(THIRDPARTY ${FBGEMM}/third_party) if(DEFINED GLIBCXX_USE_CXX11_ABI) if(${GLIBCXX_USE_CXX11_ABI} EQUAL 1) - set(CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1") else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") endif() - message("${CMAKE_CXX_FLAGS}") + BLOCK_PRINT( + "Default C++ compiler flags" + "(values may be overridden by CMAKE_CXX_STANDARD and CXX_STANDARD):" + "" + "${CMAKE_CXX_FLAGS}" + ) endif() # @@ -72,8 +94,7 @@ if(USE_ROCM) include(Hip) include(Hipify) - message("${message_line}") - message(STATUS "hip found ${HIP_FOUND}") + BLOCK_PRINT("HIP found: ${HIP_FOUND}") endif() # @@ -167,7 +188,8 @@ set(codegen_dependencies ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/quantize_ops_utils.h ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/split_embeddings_utils.cuh ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/sparse_ops_utils.h - ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh) + ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh + ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/fbgemm_tensor_accessor.h) if(USE_ROCM) message(STATUS "${PYTHON_EXECUTABLE}" "${CMAKE_CODEGEN_DIR}/embedding_backward_code_generator.py" "--opensource --is_rocm") @@ -317,10 +339,15 @@ if(NOT FBGEMM_CPU_ONLY) src/split_embeddings_utils.cpp src/split_table_batched_embeddings.cpp src/metric_ops_host.cpp - src/embedding_inplace_update_gpu.cpp) + src/embedding_inplace_update_gpu.cpp + src/input_combine_gpu.cpp) if(NVML_LIB_PATH) message(STATUS "Found NVML_LIB_PATH: ${NVML_LIB_PATH}") + endif() + + if(NVML_LIB_PATH OR USE_ROCM) + message(STATUS "Adding merge_pooled_embeddings sources") list( APPEND fbgemm_gpu_sources_cpu @@ -328,8 +355,7 @@ if(NOT FBGEMM_CPU_ONLY) src/merge_pooled_embeddings_gpu.cpp src/topology_utils.cpp) else() - message(STATUS - "Could not find NVML_LIB_PATH; skipping certain sources into the build") + message(STATUS "Skipping merge_pooled_embeddings sources") endif() endif() @@ -351,7 +377,8 @@ if(NOT FBGEMM_CPU_ONLY) src/split_embeddings_cache_cuda.cu src/split_embeddings_utils.cu src/metric_ops.cu - src/embedding_inplace_update.cu) + src/embedding_inplace_update.cu + src/input_combine.cu) set_source_files_properties( ${fbgemm_gpu_sources_gpu} PROPERTIES COMPILE_OPTIONS @@ -411,13 +438,6 @@ if(USE_ROCM) else() add_library(fbgemm_gpu_py MODULE ${fbgemm_gpu_sources} ${gen_source_files} ${cpp_asmjit_files} ${cpp_fbgemm_files}) - set_property(TARGET fbgemm_gpu_py PROPERTY CUDA_ARCHITECTURES - "${cuda_architectures}") - - # FBGEMM_CUB_USE_NAMESPACE will cause compilation errors on CUB for CUDA 12+ - # if(NOT FBGEMM_CPU_ONLY) - # target_compile_definitions(fbgemm_gpu_py PRIVATE FBGEMM_CUB_USE_NAMESPACE) - # endif() endif() set_target_properties(fbgemm_gpu_py PROPERTIES PREFIX "") @@ -427,7 +447,6 @@ if(NVML_LIB_PATH) target_link_libraries(fbgemm_gpu_py ${NVML_LIB_PATH}) endif() target_include_directories(fbgemm_gpu_py PRIVATE ${TORCH_INCLUDE_DIRS}) -set_property(TARGET fbgemm_gpu_py PROPERTY CXX_STANDARD 17) install(TARGETS fbgemm_gpu_py DESTINATION fbgemm_gpu) diff --git a/fbgemm_gpu/codegen/__init__.template 
b/fbgemm_gpu/codegen/__init__.template index de8bf21dd0..661622eff9 100644 --- a/fbgemm_gpu/codegen/__init__.template +++ b/fbgemm_gpu/codegen/__init__.template @@ -13,7 +13,9 @@ import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_lars_sgd as loo import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_partial_rowwise_adam as lookup_partial_rowwise_adam # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_partial_rowwise_lamb as lookup_partial_rowwise_lamb # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_rowwise_adagrad as lookup_rowwise_adagrad # noqa: F401 +import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_rowwise_adagrad_with_counter as lookup_rowwise_adagrad_with_counter # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_sgd as lookup_sgd # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_approx_sgd as lookup_approx_sgd # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_approx_rowwise_adagrad as lookup_approx_rowwise_adagrad # noqa: F401 +import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_approx_rowwise_adagrad_with_counter as lookup_approx_rowwise_adagrad_with_counter # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_rowwise_weighted_adagrad as lookup_rowwise_weighted_adagrad # noqa: F401 diff --git a/fbgemm_gpu/codegen/embedding_backward_code_generator.py b/fbgemm_gpu/codegen/embedding_backward_code_generator.py index 9d67358902..fd69a22f6e 100644 --- a/fbgemm_gpu/codegen/embedding_backward_code_generator.py +++ b/fbgemm_gpu/codegen/embedding_backward_code_generator.py @@ -646,6 +646,11 @@ def rowwise_adagrad_with_counter() -> None: split_precomputation = """ at::acc_type freq = 1.0; at::acc_type l2_wd = 0.0; + at::acc_type tail_id_threshold_val = tail_id_threshold; + CUDA_KERNEL_ASSERT(max_counter > 0.0); // avoid divide by zero error + if (is_tail_id_thresh_ratio == 1){ + tail_id_threshold_val = floorf(tail_id_threshold * max_counter); + } if (counter_halflife > 0 && threadIdx.x == 0) { // if id occurs multiple times in a batch, iter_delta=1 const auto iter_delta = prev_iter[idx] == 0 ? 
1.0 : iter * 1.0 - prev_iter[idx]; @@ -660,6 +665,7 @@ def rowwise_adagrad_with_counter() -> None: } freq = SHFL_SYNC(freq, 0); l2_wd = SHFL_SYNC(l2_wd, 0); + tail_id_threshold_val = SHFL_SYNC(tail_id_threshold_val, 0); at::acc_type g_local_sum_square = 0.0; @@ -682,10 +688,7 @@ def rowwise_adagrad_with_counter() -> None: at::acc_type multiplier; at::acc_type adjusted_multiplier; at::acc_type exp_reg_correction; - at::acc_type tail_id_threshold_val = tail_id_threshold; - if (is_tail_id_thresh_ratio == 1){ - tail_id_threshold_val = floorf(tail_id_threshold * max_counter); - } + if (threadIdx.x == 0) { at::acc_type new_sum_square_grads = momentum1[idx] + g_avg_square; momentum1[idx] = new_sum_square_grads; diff --git a/fbgemm_gpu/codegen/embedding_bounds_check.cu b/fbgemm_gpu/codegen/embedding_bounds_check.cu index 4d77d2b508..bc18695ece 100644 --- a/fbgemm_gpu/codegen/embedding_bounds_check.cu +++ b/fbgemm_gpu/codegen/embedding_bounds_check.cu @@ -23,31 +23,52 @@ __device__ void adjust_offset_kernel( *offset_acc_end = indices_end; } -template +template __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel( const at::PackedTensorAccessor32 rows_per_table, at::PackedTensorAccessor32 indices, at::PackedTensorAccessor32 offsets, + const int32_t* const vbe_metadata, const int64_t bounds_check_mode_, at::PackedTensorAccessor32 warning, FixedDivisor fd) { int32_t T = rows_per_table.size(0); - int32_t B = (offsets.size(0) - 1) / T; - int32_t b_t = blockIdx.x * blockDim.y + threadIdx.y; - int32_t b; // = b_t % B; - int32_t t; // = b_t / B; - fd.DivMod(b_t, &t, &b); - if (t >= T) { + int32_t b; + int32_t t; + int32_t B = 0; + int32_t total_B = offsets.size(0) - 1; + + if (!vbe && b_t >= total_B) { return; } - auto bounds_check_mode = static_cast(bounds_check_mode_); - auto num_rows = rows_per_table[t]; - auto indices_start = offsets[t * B + b]; - auto indices_end = offsets[t * B + b + 1]; - index_t num_indices = indices.size(0); + fd.DivMod(b_t, &t, &b); + + if (vbe) { + // Check if t is valid + if (t >= T) { + return; + } + const auto B_start = vbe_metadata[t]; + B = vbe_metadata[t + 1] - B_start; + // Check if b is valid + if (b >= B) { + return; + } + // Update b_t value + b_t = B_start + b; + } else { + B = total_B / T; + } + + const auto bounds_check_mode = + static_cast(bounds_check_mode_); + const auto num_rows = rows_per_table[t]; + auto indices_start = offsets[b_t]; + auto indices_end = offsets[b_t + 1]; + const index_t num_indices = indices.size(0); if (bounds_check_mode == BoundsCheckMode::FATAL) { CUDA_KERNEL_ASSERT(indices_start >= 0); @@ -58,12 +79,13 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel( indices_end > num_indices) { if (gpuAtomicIncrement(&warning[0]) == 0) { printf( - "EmbeddingBoundsCheck: (at least one) Out of bounds access for " - "batch: %lld, table: %lld, indices_start: %lld, indices_end: %lld," + "EmbeddingBoundsCheck (VBE %s): (at least one) Out of bounds access for " + "batch: %d, table: %d, indices_start: %lld, indices_end: %lld," " num_indices: %lld. Setting indices_start and indices_end within " "the range.\n", - static_cast(b), - static_cast(t), + vbe ? 
"true" : "false", + b, + t, static_cast(indices_start), static_cast(indices_end), static_cast(num_indices)); @@ -72,16 +94,16 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel( indices_start, indices_end, num_indices, - &offsets[t * B + b], - &offsets[t * B + b + 1]); + &offsets[b_t], + &offsets[b_t + 1]); } } else if (bounds_check_mode == BoundsCheckMode::IGNORE) { adjust_offset_kernel( indices_start, indices_end, num_indices, - &offsets[t * B + b], - &offsets[t * B + b + 1]); + &offsets[b_t], + &offsets[b_t + 1]); } const auto L = indices_end - indices_start; @@ -100,9 +122,10 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel( if (idx < 0 || idx >= num_rows) { if (gpuAtomicIncrement(&warning[0]) == 0) { printf( - "EmbeddingBoundsCheck: (at least one) Out of bounds access for batch: %lld, table: %lld, bag element: %lld, idx: %lld, num_rows: %lld, indices_start: %lld, indices_end: %lld, T: %d, B: %d, b_t: %d. Setting idx to zero.\n", - static_cast(b), - static_cast(t), + "EmbeddingBoundsCheck (VBE %s): (at least one) Out of bounds access for batch: %d, table: %d, bag element: %lld, idx: %lld, num_rows: %lld, indices_start: %lld, indices_end: %lld, T: %d, B: %d, b_t: %d. Setting idx to zero.\n", + vbe ? "true" : "false", + b, + t, static_cast(i), static_cast(idx), num_rows, @@ -122,25 +145,27 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel( } if (bounds_check_mode == BoundsCheckMode::FATAL) { - CUDA_KERNEL_ASSERT(num_indices == offsets[B * T]); + CUDA_KERNEL_ASSERT(num_indices == offsets[total_B]); } else if (bounds_check_mode == BoundsCheckMode::WARNING) { - if (num_indices != offsets[B * T]) { + if (num_indices != offsets[total_B]) { if (gpuAtomicIncrement(&warning[0]) == 0) { printf( - "EmbeddingBoundsCheck: the last element in offsets is incorrect for " - "total batch size B: %lld, total table num T: %lld, " + "EmbeddingBoundsCheck (VBE %s): the last element in offsets is incorrect for " + "total batch size %s: %d, total table num T: %d, " " last element in offsets: %lld, indices size: %lld. " " Setting the last element in offsets to be indices size.\n", - static_cast(B), - static_cast(T), - static_cast(offsets[B * T]), + vbe ? "true" : "false", + vbe ? "total_B" : "B", + vbe ? 
total_B : B, + T, + static_cast(offsets[total_B]), static_cast(num_indices)); } - offsets[B * T] = num_indices; + offsets[total_B] = num_indices; } } else if (bounds_check_mode == BoundsCheckMode::IGNORE) { - if (num_indices != offsets[B * T]) { - offsets[B * T] = num_indices; + if (num_indices != offsets[total_B]) { + offsets[total_B] = num_indices; } } } @@ -151,19 +176,23 @@ void bounds_check_indices_cuda( Tensor& offsets, int64_t bounds_check_mode_, Tensor& warning, - c10::optional weights) { + const c10::optional& weights, + const c10::optional& vbe_metadata, + const int64_t max_B) { TENSOR_ON_CUDA_GPU(rows_per_table); TENSOR_ON_CUDA_GPU(indices); TENSOR_ON_CUDA_GPU(offsets); TENSOR_ON_CUDA_GPU(warning); TENSOR_EMPTY_OR_ON_CUDA_GPU(weights); + TENSOR_EMPTY_OR_ON_CUDA_GPU(vbe_metadata); at::cuda::OptionalCUDAGuard device_guard; device_guard.set_index(rows_per_table.get_device()); const int32_t T = rows_per_table.size(0); - const int32_t B = (offsets.size(0) - 1) / T; - if (B == 0 || T == 0) { + const int32_t total_B = offsets.size(0) - 1; + const int32_t B = (total_B) / T; + if (total_B == 0 || T == 0) { return; } const auto bounds_check_mode = @@ -172,12 +201,17 @@ void bounds_check_indices_cuda( warning.zero_(); } const int64_t num_indices = indices.size(0); + const auto vbe = vbe_metadata.has_value(); - TORCH_CHECK( - offsets.size(0) == B * T + 1, - "offsets size " + std::to_string(offsets.size(0)) + - " is not equal to B (" + std::to_string(B) + ") * T (" + - std::to_string(T) + ") + 1"); + if (vbe) { + TORCH_CHECK(max_B >= 0); + } else { + TORCH_CHECK( + offsets.size(0) == B * T + 1, + "offsets size " + std::to_string(offsets.size(0)) + + " is not equal to B (" + std::to_string(B) + ") * T (" + + std::to_string(T) + ") + 1"); + } if (weights.has_value()) { TORCH_CHECK( weights.value().size(0) == num_indices, @@ -186,20 +220,24 @@ void bounds_check_indices_cuda( } constexpr size_t kNumThreads = 256; + const auto max_B_ = vbe ? max_B : B; AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "bounds_check_indices", [&] { - bounds_check_indices_kernel - <<>>( - rows_per_table - .packed_accessor32(), - indices.packed_accessor32(), - offsets.packed_accessor32(), - bounds_check_mode_, - warning.packed_accessor32(), - FixedDivisor(B)); + const auto bounds_check_kernel = + (vbe ? bounds_check_indices_kernel + : bounds_check_indices_kernel); + bounds_check_kernel<<< + div_round_up(max_B_ * T, kNumThreads / fbgemm_gpu::kWarpSize), + dim3(fbgemm_gpu::kWarpSize, kNumThreads / fbgemm_gpu::kWarpSize), + 0, + at::cuda::getCurrentCUDAStream()>>>( + rows_per_table.packed_accessor32(), + indices.packed_accessor32(), + offsets.packed_accessor32(), + vbe ? vbe_metadata.value().data_ptr() : nullptr, + bounds_check_mode_, + warning.packed_accessor32(), + FixedDivisor(max_B_)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); - C10_CUDA_KERNEL_LAUNCH_CHECK(); } diff --git a/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp b/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp index 84575a3361..87e3cd7521 100644 --- a/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp +++ b/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp @@ -23,7 +23,9 @@ void bounds_check_indices_cuda( Tensor& offsets, int64_t bounds_check_mode, Tensor& warning, - c10::optional weights); + const c10::optional& weights, + const c10::optional& vbe_metadata, + const int64_t max_B); // Deprecated for fb namespace! Please use fbgemm namespace instead! 
TORCH_LIBRARY_FRAGMENT(fb, m) { diff --git a/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp b/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp index a2dd19a75e..a33e02e164 100644 --- a/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp +++ b/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp @@ -42,7 +42,12 @@ void bounds_check_indices_cpu( Tensor& offsets, int64_t bounds_check_mode_, Tensor& warning, - c10::optional weights) { + const c10::optional& weights, + const c10::optional& vbe_metadata, + const int64_t /*max_B*/) { + TORCH_CHECK( + !vbe_metadata.has_value(), + "bounds_check_indices on CPU does not support variable length (batch size)"); auto bounds_check_mode = static_cast(bounds_check_mode_); if (bounds_check_mode == BoundsCheckMode::WARNING) { warning.zero_(); @@ -163,7 +168,7 @@ TORCH_LIBRARY_FRAGMENT(fb, m) { // The (a!) tells PyTorch this is an impure operation and so cannot be CSE'd // or DCE'd, etc. m.def( - "bounds_check_indices(Tensor rows_per_table, Tensor(a!) indices, Tensor(b!) offsets, int bounds_check_mode, Tensor(c!) warning, Tensor(d!)? weights=None) -> ()"); + "bounds_check_indices(Tensor rows_per_table, Tensor(a!) indices, Tensor(b!) offsets, int bounds_check_mode, Tensor(c!) warning, Tensor(d!)? weights=None, Tensor? vbe_metadata=None, int max_B=-1) -> ()"); DISPATCH_TO_CPU("bounds_check_indices", bounds_check_indices_cpu); } @@ -171,6 +176,6 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { // The (a!) tells PyTorch this is an impure operation and so cannot be CSE'd // or DCE'd, etc. m.def( - "bounds_check_indices(Tensor rows_per_table, Tensor(a!) indices, Tensor(b!) offsets, int bounds_check_mode, Tensor(c!) warning, Tensor(d!)? weights=None) -> ()"); + "bounds_check_indices(Tensor rows_per_table, Tensor(a!) indices, Tensor(b!) offsets, int bounds_check_mode, Tensor(c!) warning, Tensor(d!)? weights=None, Tensor? 
vbe_metadata=None, int max_B=-1) -> ()"); DISPATCH_TO_CPU("bounds_check_indices", bounds_check_indices_cpu); } diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp b/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp index 9caaacbfb8..829249b297 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp @@ -534,44 +534,5 @@ Tensor pruned_array_lookup_cpu( return dense_indices; } -Tensor pruned_array_lookup_from_row_idx_cpu( - Tensor update_row_indices, - Tensor update_table_indices, - Tensor index_remappings, - Tensor index_remappings_offsets) { - TENSOR_ON_CPU(update_row_indices); - TENSOR_ON_CPU(update_table_indices); - TENSOR_ON_CPU(index_remappings); - TENSOR_ON_CPU(index_remappings_offsets); - - int32_t T = index_remappings_offsets.size(0) - 1; - auto dense_indices = empty_like(update_row_indices); - const auto num_indices = update_row_indices.numel(); - - AT_DISPATCH_INDEX_TYPES( - update_row_indices.scalar_type(), "pruned_array_lookup_from_row_idx_cpu_kernel", [&] { - const auto update_row_indices_acc = update_row_indices.accessor(); - auto dense_indices_acc = dense_indices.accessor(); - const auto update_table_indices_acc = update_table_indices.accessor(); - - const auto index_remappings_acc = index_remappings.accessor(); - const auto index_remappings_offsets_acc = index_remappings_offsets.accessor(); - - for (int64_t idx = 0; idx < num_indices; idx++) { - const int table_idx = update_table_indices_acc[idx]; - const auto row_idx = update_row_indices_acc[idx]; - int64_t index_remappings_start = index_remappings_offsets_acc[table_idx]; - int64_t index_remappings_end = index_remappings_offsets_acc[table_idx + 1]; - int64_t capacity = index_remappings_end - index_remappings_start; - if (capacity > 0) { - dense_indices_acc[idx] = index_remappings_acc[index_remappings_start + row_idx]; - } else { - dense_indices_acc[idx] = row_idx; - } - } - }); - return dense_indices; -} - {% endif %} // clang-format on diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp b/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp index 6d4426cb27..01c054f818 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp @@ -4,12 +4,12 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ + #include #include #include #include #include -#include #include "c10/core/ScalarType.h" #ifdef FBCODE_CAFFE2 #include "common/stats/Stats.h" @@ -18,6 +18,8 @@ #include "fbgemm_gpu/sparse_ops_utils.h" #include "fbgemm_gpu/split_embeddings_cache_cuda.cuh" +#include + using Tensor = at::Tensor; using namespace fbgemm_gpu; @@ -37,7 +39,7 @@ DEFINE_quantile_stat( facebook::fb303::ExportTypeConsts::kNone, std::array{{.25, .50, .75, .99}}); -// Miss rate due to conflict in cache associativity. +// (Unique) Miss rate due to conflict in cache associativity. // # unique misses due to conflict / # requested indices. DEFINE_quantile_stat( tbe_uvm_cache_conflict_unique_miss_rate, @@ -45,6 +47,21 @@ DEFINE_quantile_stat( facebook::fb303::ExportTypeConsts::kNone, std::array{{.25, .50, .75, .99}}); +// Miss rate due to conflict in cache associativity. +// # misses due to conflict / # requested indices. 
+DEFINE_quantile_stat( + tbe_uvm_cache_conflict_miss_rate, + "tbe_uvm_cache_conflict_miss_rate_per_mille", + facebook::fb303::ExportTypeConsts::kNone, + std::array{{.25, .50, .75, .99}}); + +// Total miss rate. +DEFINE_quantile_stat( + tbe_uvm_cache_total_miss_rate, + "tbe_uvm_cache_total_miss_rate_per_mille", + facebook::fb303::ExportTypeConsts::kNone, + std::array{{.25, .50, .75, .99}}); + // FLAGs to control UVMCacheStats. DEFINE_int32( tbe_uvm_cache_stat_report, @@ -58,6 +75,12 @@ DEFINE_int32( "If tbe_uvm_cache_stat_report is enabled, more detailed raw stats will be printed with this " "period. This should be an integer multiple of tbe_uvm_cache_stat_report."); +DEFINE_int32( + tbe_uvm_cache_enforced_misses, + 0, + "If set to non-zero, some cache lookups (tbe_uvm_cache_enforced_misses / 256) are enforced to be misses; " + "this is performance evaluation purposes only; and should be zero otherwise."); + // TODO: align this with uvm_cache_stats_index in // split_embeddings_cache_cuda.cu. const int kUvmCacheStatsSize = 6; @@ -84,10 +107,11 @@ void process_uvm_cache_stats( // uvm_cache_stats_counters[0]: num_req_indices // uvm_cache_stats_counters[1]: num_unique_indices // uvm_cache_stats_counters[2]: num_unique_misses - // uvm_cache_stats_counters[3]: num_unique_conflict_misses + // uvm_cache_stats_counters[3]: num_conflict_unique_misses + // uvm_cache_stats_counters[4]: num_conflict_misses // They should be zero-out after the calculated rates are populated into // cache counters. - static std::vector uvm_cache_stats_counters(4); + static std::vector uvm_cache_stats_counters(5); // Export cache stats. auto uvm_cache_stats_cpu = uvm_cache_stats.cpu(); @@ -107,19 +131,32 @@ void process_uvm_cache_stats( // Calculate cache related ratios based on the cumulated numbers and // push them into the counter pools. if (populate_uvm_stats && uvm_cache_stats_counters[0] > 0) { - double unique_rate = + const double unique_rate = static_cast(uvm_cache_stats_counters[1]) / uvm_cache_stats_counters[0] * 1000; - double unique_miss_rate = + const double unique_miss_rate = static_cast(uvm_cache_stats_counters[2]) / uvm_cache_stats_counters[0] * 1000; - double unique_conflict_miss_rate = + const double conflict_unique_miss_rate = static_cast(uvm_cache_stats_counters[3]) / uvm_cache_stats_counters[0] * 1000; + const double conflict_miss_rate = + static_cast(uvm_cache_stats_counters[4]) / + uvm_cache_stats_counters[0] * 1000; + // total # misses = unique misses - conflict_unique_misses + conflict + // misses. + const double total_miss_rate = + static_cast( + uvm_cache_stats_counters[2] - uvm_cache_stats_counters[3] + + uvm_cache_stats_counters[4]) / + uvm_cache_stats_counters[0] * 1000; + STATS_tbe_uvm_cache_unique_rate.addValue(unique_rate); STATS_tbe_uvm_cache_unique_miss_rate.addValue(unique_miss_rate); STATS_tbe_uvm_cache_conflict_unique_miss_rate.addValue( - unique_conflict_miss_rate); + conflict_unique_miss_rate); + STATS_tbe_uvm_cache_conflict_miss_rate.addValue(conflict_miss_rate); + STATS_tbe_uvm_cache_total_miss_rate.addValue(total_miss_rate); // Fill all the elements of the vector uvm_cache_stats_counters as 0 // to zero out the cumulated counters. @@ -365,7 +402,7 @@ Tensor int_nbit_split_embedding_uvm_caching_codegen_lookup_function( // cache_index_table_map: (linearized) index to table number map. // 1D tensor, dtype=int32. c10::optional cache_index_table_map, - // lxu_cache_state: Cache state (cached idnex, or invalid). + // lxu_cache_state: Cache state (cached index, or invalid). 
// 2D tensor: # sets x assoc. dtype=int64. c10::optional lxu_cache_state, // lxu_state: meta info for replacement (time stamp for LRU). @@ -461,6 +498,16 @@ Tensor int_nbit_split_embedding_uvm_caching_codegen_lookup_function( uvm_cache_stats); #ifdef FBCODE_CAFFE2 + if (FLAGS_tbe_uvm_cache_enforced_misses > 0) { + // Override some lxu_cache_locations (N for every 256 indices) with cache + // miss to enforce access to UVM. + lxu_cache_locations = emulate_cache_miss( + lxu_cache_locations.value(), + FLAGS_tbe_uvm_cache_enforced_misses, + gather_uvm_stats, + uvm_cache_stats); + } + process_uvm_cache_stats( signature, total_cache_hash_size.value(), @@ -511,13 +558,6 @@ Tensor pruned_array_lookup_cuda( Tensor index_remappings, Tensor index_remappings_offsets); -///@ingroup embedding-cuda -Tensor pruned_array_lookup_from_row_idx_cuda( - Tensor update_row_indices, - Tensor update_table_indices, - Tensor index_remappings, - Tensor index_remappings_offsets); - TORCH_LIBRARY_FRAGMENT(fbgemm, m) { DISPATCH_TO_CUDA( "int_nbit_split_embedding_codegen_lookup_function", @@ -529,7 +569,4 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { "pruned_hashmap_lookup", pruned_hashmap_lookup_unweighted_cuda); DISPATCH_TO_CUDA("pruned_array_lookup", pruned_array_lookup_cuda); - DISPATCH_TO_CUDA( - "pruned_array_lookup_from_row_idx", - pruned_array_lookup_from_row_idx_cuda); } diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_host_cpu.cpp b/fbgemm_gpu/codegen/embedding_forward_quantized_host_cpu.cpp index 93db44ac76..a43671f880 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_host_cpu.cpp +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_host_cpu.cpp @@ -240,13 +240,6 @@ Tensor pruned_array_lookup_cpu( Tensor index_remappings, Tensor index_remappings_offsets); -///@ingroup embedding-cpu -Tensor pruned_array_lookup_from_row_idx_cpu( - Tensor update_row_indices, - Tensor update_table_indices, - Tensor index_remappings, - Tensor index_remappings_offsets); - TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "int_nbit_split_embedding_codegen_lookup_function(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int total_D, int max_int2_D, int max_int4_D, int max_int8_D, int max_float16_D, int max_float32_D, Tensor indices, Tensor offsets, int pooling_mode, Tensor? indice_weights, int output_dtype=1, Tensor? lxu_cache_weights=None, Tensor? lxu_cache_locations=None, int? row_alignment = None, int? max_float8_D=0, int? fp8_exponent_bits=-1, int? fp8_exponent_bias=-1) -> Tensor"); @@ -278,12 +271,6 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "pruned_array_lookup(Tensor indices, Tensor offsets, Tensor index_remappings, Tensor index_remappings_offsets) -> Tensor"); DISPATCH_TO_CPU("pruned_array_lookup", pruned_array_lookup_cpu); - - // GPU version of array lookup. 
- m.def( - "pruned_array_lookup_from_row_idx(Tensor update_row_indices, Tensor update_table_indices, Tensor index_remappings, Tensor index_remappings_offsets) -> Tensor"); - DISPATCH_TO_CPU( - "pruned_array_lookup_from_row_idx", pruned_array_lookup_from_row_idx_cpu); } class PrunedMapCPU : public torch::jit::CustomClassHolder { diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu b/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu index 6ac2b2d3c0..e0a2f04ee8 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu @@ -552,36 +552,6 @@ __global__ __launch_bounds__(kMaxThreads) void int_nbit_split_embedding_codegen_ } {% endif %} -{% if not weighted %} -template -__global__ __launch_bounds__(kMaxThreads) void int_nbit_split_embedding_codegen_forward_pruned_array_lookup_from_row_idx_kernel( - const at::PackedTensorAccessor32 update_row_indices, - const at::PackedTensorAccessor32 update_table_indices, - const at::PackedTensorAccessor32 index_remappings, - const at::PackedTensorAccessor32 index_remappings_offsets, - at::PackedTensorAccessor32 dense_indices) { - - const int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= update_row_indices.size(0)) { - return; - } - const int table_idx = update_table_indices[idx]; - const auto row_idx = update_row_indices[idx]; - - const int64_t index_remappings_start = index_remappings_offsets[table_idx]; - const int64_t index_remappings_end = index_remappings_offsets[table_idx + 1]; - const int64_t capacity = index_remappings_end - index_remappings_start; - - if (capacity > 0) { - dense_indices[idx] = index_remappings[index_remappings_start + row_idx]; - } else { - dense_indices[idx] = row_idx; - } -} -{% endif %} - - - } {% for nobag in [True, False] %} @@ -737,13 +707,16 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ DISPATCH_OUTPUT_TYPES(output.scalar_type(), "int2_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { if (max_int2_D > 0) { auto max_int2_128b_rows = nbit::div_round_up(nbit::padded_row_size_in_bytes(max_int2_D, SparseType::INT2, row_alignment), 128); - TORCH_CHECK(max_int2_128b_rows <= 2); + TORCH_CHECK(max_int2_128b_rows <= 4); if (max_int2_128b_rows > 0) { Y(2, 16, 0, 1); } if (max_int2_128b_rows > 1) { Y(2, 8, 1, 2); } + if (max_int2_128b_rows > 2) { + Y(2, 8, 2, 4); + } } })); #undef X @@ -783,7 +756,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ DISPATCH_OUTPUT_TYPES(output.scalar_type(), "int4_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { if (max_int4_D > 0) { auto max_int4_128b_rows = nbit::div_round_up(nbit::padded_row_size_in_bytes(max_int4_D, SparseType::INT4, row_alignment), 128); - TORCH_CHECK(max_int4_128b_rows <= 4); + TORCH_CHECK(max_int4_128b_rows <= 8); if (max_int4_128b_rows > 0) { Y(4, 8, 0, 1); } @@ -793,6 +766,9 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ if (max_int4_128b_rows > 2) { Y(1, 4, 2, 4); } + if (max_int4_128b_rows > 4) { + Y(1, 4, 4, 8); + } } })); #undef X @@ -831,7 +807,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ DISPATCH_OUTPUT_TYPES(output.scalar_type(), "int8_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { if (max_int8_D > 0) { auto max_int8_128b_rows = nbit::div_round_up(nbit::padded_row_size_in_bytes(max_int8_D, 
SparseType::INT8, row_alignment), 128); - TORCH_CHECK(max_int8_128b_rows <= 8); + TORCH_CHECK(max_int8_128b_rows <= 16); if (max_int8_128b_rows > 0) { Y(2, 8, 0, 1); } @@ -844,6 +820,9 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ if (max_int8_128b_rows > 4) { Y(2, 4, 4, 8); } + if (max_int8_128b_rows > 8) { + Y(2, 2, 8, 16); + } } })); #undef X @@ -884,7 +863,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ DISPATCH_OUTPUT_TYPES(output.scalar_type(), "fp8_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { if (max_float8_D > 0) { auto max_fp8_128b_rows = nbit::div_round_up(nbit::padded_row_size_in_bytes(max_float8_D, SparseType::FP8, row_alignment), 128); - TORCH_CHECK(max_fp8_128b_rows <= 8); + TORCH_CHECK(max_fp8_128b_rows <= 16); if (max_fp8_128b_rows > 0) { Y(2, 8, 0, 1); } @@ -897,6 +876,9 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ if (max_fp8_128b_rows > 4) { Y(2, 4, 4, 8); } + if (max_fp8_128b_rows > 8) { + Y(2, 2, 4, 8); + } } })); #undef X @@ -935,7 +917,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ DISPATCH_OUTPUT_TYPES(output.scalar_type(), "fp16_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { if (max_float16_D > 0) { auto max_fp16_128b_rows = nbit::div_round_up(nbit::padded_row_size_in_bytes(max_float16_D, SparseType::FP16, row_alignment), 128); - TORCH_CHECK(max_fp16_128b_rows <= 16); + TORCH_CHECK(max_fp16_128b_rows <= 32); if (max_fp16_128b_rows > 0) { Y(2, 8, 0, 2); } @@ -948,6 +930,9 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ if (max_fp16_128b_rows > 8) { Y(2, 2, 8, 16); } + if (max_fp16_128b_rows > 16) { + Y(2, 1, 16, 32); + } } })); #undef X @@ -986,7 +971,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ DISPATCH_OUTPUT_TYPES(output.scalar_type(), "fp32_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { if (max_float32_D > 0) { auto max_fp32_128b_rows = nbit::div_round_up(nbit::padded_row_size_in_bytes(max_float32_D, SparseType::FP32, row_alignment), 128); - TORCH_CHECK(max_fp32_128b_rows <= 32); + TORCH_CHECK(max_fp32_128b_rows <= 64); if (max_fp32_128b_rows > 0) { Y(2, 4, 0, 4); } @@ -996,6 +981,9 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ if (max_fp32_128b_rows > 16) { Y(1, 1, 16, 32); } + if (max_fp32_128b_rows > 32) { + Y(1, 1, 32, 64); + } } })); #undef X @@ -1089,53 +1077,6 @@ Tensor pruned_array_lookup_cuda( C10_CUDA_KERNEL_LAUNCH_CHECK(); return dense_indices; } - -Tensor pruned_array_lookup_from_row_idx_cuda( - Tensor update_row_indices, - Tensor update_table_indices, - Tensor index_remappings, - Tensor index_remappings_offsets) { - - TENSOR_ON_CUDA_GPU(update_row_indices); - TENSOR_ON_CUDA_GPU(update_table_indices); - TENSOR_ON_CUDA_GPU(index_remappings); - TENSOR_ON_CUDA_GPU(index_remappings_offsets); - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(update_table_indices.get_device()); - auto dense_indices = at::empty_like(update_row_indices); - const int32_t T = index_remappings_offsets.size(0) - 1; - - const auto num_indices = update_row_indices.numel(); - if (num_indices == 0) { - return dense_indices; - } - - TORCH_CHECK(index_remappings.size(0) < std::numeric_limits::max()); - TORCH_CHECK(update_row_indices.dim() == 1, "Tensor dim: ", update_row_indices.dim()); - 
TORCH_CHECK(update_table_indices.dim() == 1, "Tensor dim: ", update_table_indices.dim()); - TORCH_CHECK(index_remappings.dim() == 1, "Tensor dim: ", index_remappings.dim()); - TORCH_CHECK(index_remappings_offsets.dim() == 1, "Tensor dim: ", index_remappings_offsets.dim()); - TORCH_CHECK(dense_indices.dim() == 1, "Tensor dim: ", dense_indices.dim()); - constexpr size_t kForwardMaxThreads = 256; - - AT_DISPATCH_INDEX_TYPES( - update_row_indices.scalar_type(), "embedding_inplace_update_kernel", [&] { - nbit::int_nbit_split_embedding_codegen_forward_pruned_array_lookup_from_row_idx_kernel<<< - nbit::div_round_up(num_indices, kForwardMaxThreads), - kForwardMaxThreads, - 0, - at::cuda::getCurrentCUDAStream()>>>( - update_row_indices.packed_accessor32(), - update_table_indices.packed_accessor32(), - index_remappings.packed_accessor32(), - index_remappings_offsets.packed_accessor32(), - dense_indices.packed_accessor32() - ); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); - return dense_indices; -} {% endif %} // clang-format on diff --git a/fbgemm_gpu/codegen/lookup_args.py b/fbgemm_gpu/codegen/lookup_args.py index c5a3d465e9..8c98a96a1a 100644 --- a/fbgemm_gpu/codegen/lookup_args.py +++ b/fbgemm_gpu/codegen/lookup_args.py @@ -44,6 +44,13 @@ class OptimizerArgs(NamedTuple): weight_decay_mode: int eta: float momentum: float + counter_halflife: int + adjustment_iter: int + adjustment_ub: float + learning_rate_mode: int + grad_sum_decay: int + tail_id_threshold: float + is_tail_id_thresh_ratio: int class Momentum(NamedTuple): diff --git a/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template b/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template index 4cdc5b8766..bd406d39fa 100644 --- a/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template +++ b/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template @@ -36,9 +36,18 @@ def invoke( {% if "momentum2_dev" in args.split_function_arg_names %} momentum2: Momentum, {% endif %} + {% if "prev_iter_dev" in args.split_function_arg_names %} + prev_iter: Momentum, + {% endif %} + {% if "row_counter_dev" in args.split_function_arg_names %} + row_counter: Momentum, + {% endif %} {% if "iter" in args.split_function_arg_names %} iter: int, {% endif %} + {% if "max_counter" in args.split_function_arg_names %} + max_counter: float, + {% endif %} ) -> torch.Tensor: if (common_args.host_weights.numel() > 0): return torch.ops.fbgemm.split_embedding_codegen_lookup_{{ optimizer }}_function_cpu( @@ -84,6 +93,27 @@ def invoke( {% if "momentum" in args.split_function_arg_names %} momentum=optimizer_args.momentum, {% endif %} + {% if "counter_halflife" in args.split_function_arg_names %} + counter_halflife=optimizer_args.counter_halflife, + {% endif %} + {% if "adjustment_iter" in args.split_function_arg_names %} + adjustment_iter=optimizer_args.adjustment_iter, + {% endif %} + {% if "adjustment_ub" in args.split_function_arg_names %} + adjustment_ub=optimizer_args.adjustment_ub, + {% endif %} + {% if "learning_rate_mode" in args.split_function_arg_names %} + learning_rate_mode=optimizer_args.learning_rate_mode, + {% endif %} + {% if "grad_sum_decay" in args.split_function_arg_names %} + grad_sum_decay=optimizer_args.grad_sum_decay, + {% endif %} + {% if "tail_id_threshold" in args.split_function_arg_names %} + tail_id_threshold=optimizer_args.tail_id_threshold, + {% endif %} + {% if "is_tail_id_thresh_ratio" in args.split_function_arg_names %} + is_tail_id_thresh_ratio=optimizer_args.is_tail_id_thresh_ratio, + {% endif %} # momentum1 
{% if "momentum1_dev" in args.split_function_arg_names %} momentum1_host=momentum1.host, @@ -96,10 +126,26 @@ def invoke( momentum2_offsets=momentum2.offsets, momentum2_placements=momentum2.placements, {% endif %} + # prev_iter + {% if "prev_iter_dev" in args.split_function_arg_names %} + prev_iter_host=prev_iter.host, + prev_iter_offsets=prev_iter.offsets, + prev_iter_placements=prev_iter.placements, + {% endif %} + # row_counter + {% if "row_counter_dev" in args.split_function_arg_names %} + row_counter_host=row_counter.host, + row_counter_offsets=row_counter.offsets, + row_counter_placements=row_counter.placements, + {% endif %} # iter {% if "iter" in args.split_function_arg_names %} iter=iter, {% endif %} + # max counter + {% if "max_counter" in args.split_function_arg_names %} + max_counter=max_counter, + {% endif %} ) else: return torch.ops.fbgemm.split_embedding_codegen_lookup_{{ optimizer }}_function( @@ -151,6 +197,27 @@ def invoke( {% if "momentum" in args.split_function_arg_names %} momentum=optimizer_args.momentum, {% endif %} + {% if "counter_halflife" in args.split_function_arg_names %} + counter_halflife=optimizer_args.counter_halflife, + {% endif %} + {% if "adjustment_iter" in args.split_function_arg_names %} + adjustment_iter=optimizer_args.adjustment_iter, + {% endif %} + {% if "adjustment_ub" in args.split_function_arg_names %} + adjustment_ub=optimizer_args.adjustment_ub, + {% endif %} + {% if "learning_rate_mode" in args.split_function_arg_names %} + learning_rate_mode=optimizer_args.learning_rate_mode, + {% endif %} + {% if "grad_sum_decay" in args.split_function_arg_names %} + grad_sum_decay=optimizer_args.grad_sum_decay, + {% endif %} + {% if "tail_id_threshold" in args.split_function_arg_names %} + tail_id_threshold=optimizer_args.tail_id_threshold, + {% endif %} + {% if "is_tail_id_thresh_ratio" in args.split_function_arg_names %} + is_tail_id_thresh_ratio=optimizer_args.is_tail_id_thresh_ratio, + {% endif %} # momentum1 {% if "momentum1_dev" in args.split_function_arg_names %} momentum1_dev=momentum1.dev, @@ -165,9 +232,27 @@ def invoke( momentum2_offsets=momentum2.offsets, momentum2_placements=momentum2.placements, {% endif %} + # prev_iter + {% if "prev_iter_dev" in args.split_function_arg_names %} + prev_iter_dev=prev_iter.dev, + prev_iter_uvm=prev_iter.uvm, + prev_iter_offsets=prev_iter.offsets, + prev_iter_placements=prev_iter.placements, + {% endif %} + # row_counter + {% if "row_counter_dev" in args.split_function_arg_names %} + row_counter_dev=row_counter.dev, + row_counter_uvm=row_counter.uvm, + row_counter_offsets=row_counter.offsets, + row_counter_placements=row_counter.placements, + {% endif %} # iter {% if "iter" in args.split_function_arg_names %} iter=iter, {% endif %} + # max counter + {% if "max_counter" in args.split_function_arg_names %} + max_counter=max_counter, + {% endif %} output_dtype=common_args.output_dtype, ) diff --git a/fbgemm_gpu/docs/BuildInstructions.md b/fbgemm_gpu/docs/BuildInstructions.md new file mode 100644 index 0000000000..4f2c9c142b --- /dev/null +++ b/fbgemm_gpu/docs/BuildInstructions.md @@ -0,0 +1,437 @@ +# FBGEMM_GPU Build Instructions + +The most up-to-date instructions are embedded in +[`setup_env.bash`](../../.github/scripts/setup_env.bash). The general steps for +building FBGEMM_GPU are as follows: + +1. Set up an isolated environment for building (Miniconda) +1. Install the relevant build tools (C/C++ compiler) +1. Set up for either CUDA, ROCm, or CPU build +1. Install PyTorch +1. 
Run the build + + +## Set Up an Isolated Build Environment + +### Install Miniconda + +Setting up a [Miniconda](https://docs.conda.io/en/latest/miniconda.html) +environment is recommended for reproducible builds: + +```sh +# Set the Miniconda prefix directory +miniconda_prefix=$HOME/miniconda + +# Download the Miniconda installer +wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh + +# Run the installer +bash miniconda.sh -b -p "$miniconda_prefix" -u + +# Load the shortcuts +. ~/.bashrc + +# Run updates +conda update -n base -c defaults -y conda +``` + +From here on out, all installation commands will be run against or inside a +Conda environment. + + +### Set Up the Conda Environment + +Create a Conda environment with the specified Python version: + +```sh +env_name= +python_version=3.10 + +# Create the environment +conda create -y --name "${env_name}" python="${python_version}" + +# Upgrade PIP and pyOpenSSL package +conda run -n "${env_name}" pip install --upgrade pip +conda run -n "${env_name}" python -m pip install pyOpenSSL>22.1.0 +``` + +## Install the Build Tools + +### C/C++ Compiler + +Install a version of the GCC toolchain that supports **C++17**. Note that GCC +(as opposed to Clang for example) is required for GPU (CUDA) builds because +NVIDIA's `nvcc` relies on `gcc` and `g++` in the path. The `sysroot` package +will also need to be installed to avoid issues with missing versioned symbols +when compiling FBGEMM_CPU: + +```sh +conda install -n "${env_name}" -y gxx_linux-64=10.4.0 sysroot_linux-64=2.17 -c conda-forge +``` + +While newer versions of GCC can be used, binaries compiled under newer versions +of GCC will not be compatible with older systems such as Ubuntu 20.04 or CentOS +Stream 8, because the compiled library will reference symbols from versions of +`GLIBCXX` that the system's `libstdc++.so.6` will not support. To see what +versions of GLIBC and GLIBCXX the available `libstdc++.so.6` supports: + +```sh +libcxx_path=/path/to/libstdc++.so.6 + +# Print supported for GLIBC versions +objdump -TC "${libcxx_path}" | grep GLIBC_ | sed 's/.*GLIBC_\([.0-9]*\).*/GLIBC_\1/g' | sort -Vu | cat + +# Print supported for GLIBCXX versions +objdump -TC "${libcxx_path}" | grep GLIBCXX_ | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat +``` + +### Other Build Tools + +Install the other necessary build tools such as `ninja`, `cmake`, etc: + +```sh +conda install -n "${env_name}" -y \ + click \ + cmake \ + hypothesis \ + jinja2 \ + ninja \ + numpy \ + scikit-build \ + wheel +``` + + +## Set Up for CUDA Build + +The CUDA build of FBGEMM_GPU requires `nvcc` that supports compute capability +3.5+. Setting the machine up for CUDA builds of FBGEMM_GPU can be done either +through pre-built Docker images or through Conda installation on bare metal. +Note that neither a GPU nor the NVIDIA drivers need to be present for builds, +since they are only used at runtime. + +### Docker Image + +For setups through Docker, simply pull the pre-installed +[Docker image for CUDA](https://hub.docker.com/r/nvidia/cuda) for the desired +Linux distribution and CUDA version. + +```sh +# Run for Ubuntu 22.04, CUDA 11.8 +docker run -it --entrypoint "/bin/bash" nvidia/cuda:11.8.0-devel-ubuntu22.04 +``` + +From there, the rest of the build environment may be constructed through Conda. 
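For example, as a quick sanity check (an illustrative step, not part of the original instructions), confirm that the container's CUDA toolkit is visible before proceeding:

```sh
# The nvidia/cuda *devel* images ship with the CUDA compiler preinstalled
nvcc --version
```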
+ +### Install CUDA + +Install the full CUDA package through Conda, which includes +[NVML](https://developer.nvidia.com/nvidia-management-library-nvml): + +```sh +cuda_version=11.7.1 + +# Install the full CUDA package +conda install -n "${env_name}" -y cuda -c "nvidia/label/cuda-${cuda_version}" +``` + +Ensure that at the minimum, **`cuda_runtime.h`** and **`libnvidia-ml.so`** are +found: + +```sh +conda_prefix=$(conda run -n "${env_name}" printenv CONDA_PREFIX) +find "${conda_prefix}" -name cuda_runtime.h +find "${conda_prefix}" -name libnvidia-ml.so +``` + +### Install cuDNN + +[cuDNN](https://developer.nvidia.com/cudnn) is a build-time dependency for the +CUDA variant of FBGEMM_GPU. Download and extract the cuDNN package for the +given CUDA version: + +```sh +# cuDNN package URLs can be found in: https://github.com/pytorch/builder/blob/main/common/install_cuda.sh +cudnn_url=https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz + +# Download and unpack cuDNN +wget -q "${cudnn_url}" -O cudnn.tar.xz +``` + +### [OPTIONAL] Install CUB + +[CUB](https://docs.nvidia.com/cuda/cub/index.html) is a build-time dependency for +the CUDA variant of FBGEMM_GPU. This must be installed separately for +**previous versions of CUDA (prior to 11.1)** since they did not come with CUB packaged. + +To install CUB through Conda: + +```sh +conda install -c bottler nvidiacub +``` + +Alternatively, CUB may be installed manually by downloading from the +[GitHub Releases](https://github.com/NVIDIA/cub/releases) page and unpacking +the package: + +```sh +# Download and unpack CUB +wget -q https://github.com/NVIDIA/cub/archive/1.10.0.tar.gz +``` + + +## Set Up for ROCm Build + +Setting the machine up for ROCm builds of FBGEMM_GPU can be done either through +pre-built Docker images or through bare metal. + +### Docker Image + +For setups through Docker, simply pull the pre-installed +[Docker image for ROCm](https://hub.docker.com/r/rocm/rocm-terminal) for the +desired ROCm version. + +```sh +# Run for ROCm 5.4.2 +docker run -it --entrypoint "/bin/bash" rocm/rocm-terminal:5.4.2 +``` + +From there, the rest of the build environment may be constructed through Conda. + +### Install ROCm + +Install the full ROCm package through the operating system package manager. The +full instructions can be found in the +[ROCm installation guide](https://docs.amd.com/bundle/ROCm-Installation-Guide-v5.4.3/page/How_to_Install_ROCm.html): + +```sh +# [OPTIONAL] Disable apt installation prompts +export DEBIAN_FRONTEND=noninteractive + +# Update the repo DB +apt update + +# Download the installer +wget https://repo.radeon.com/amdgpu-install/5.4.3/ubuntu/focal/amdgpu-install_5.4.50403-1_all.deb + +# Run the installer +apt install ./amdgpu-install_5.4.50403-1_all.deb + +# Install ROCm +amdgpu-install -y --usecase=hiplibsdk,rocm --no-dkms +``` + +### Install MIOpen + +[MIOpen](https://github.com/ROCmSoftwarePlatform/MIOpen) is a dependency for the +ROCm variant of FBGEMM_GPU that needs to be installed: + +```sh +apt install hipify-clang miopen-hip miopen-hip-dev +``` + + +## Install PyTorch + +The official [PyTorch Homepage](https://pytorch.org/get-started/locally/) contains +the most authoritative instructions on how to install PyTorch, either through +Conda or through PIP.
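Whichever installation route below is used, a quick follow-up check (illustrative, not part of the original instructions) confirms whether a CUDA-enabled build of PyTorch was actually selected; `torch.version.cuda` prints `None` for CPU-only builds:

```sh
conda run -n "${env_name}" python -c "import torch; print(torch.version.cuda)"
```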
+ +### Installation Through Conda + +```sh +# Install the latest nightly +conda install -n "${env_name}" -y pytorch -c pytorch-nightly +# Install the latest test (RC) +conda install -n "${env_name}" -y pytorch -c pytorch-test +# Install a specific version +conda install -n "${env_name}" -y pytorch==1.13.1 -c pytorch +``` + +Note that installing PyTorch through Conda without specifying a version (as in +the case of nightly builds) may not always be reliable. For example, it is known +that the GPU builds for PyTorch nightlies arrive in Conda 2 hours later than the +CPU-only builds. As such, a Conda installation of `pytorch-nightly` in that time +window will silently fall back to installing the CPU-only version. + +Also note that, because both the GPU and CPU-only versions of PyTorch are placed +into the same artifact bucket, the PyTorch variant that is selected during +installation will depend on whether or not CUDA is installed on the system. Thus +for GPU builds, it is important to install CUDA prior to PyTorch. + +### Installation Through PIP + +Note that PIP is the only supported way to install PyTorch for ROCm builds. + +```sh +# Install the latest nightly +conda run -n "${env_name}" pip install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu117/ +# Install the latest test (RC) +conda run -n "${env_name}" pip install --pre torch --extra-index-url https://download.pytorch.org/whl/test/cu117/ +# Install a specific version +conda run -n "${env_name}" pip install torch==1.13.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117/ +# Install the latest nightly (ROCm 5.3) +conda run -n "${env_name}" pip install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/rocm5.3/ +``` + +### Post-Install Checks + +Verify the PyTorch installation with an `import` test: + +```sh +conda run -n "${env_name}" python -c "import torch.distributed" +``` + +For the GPU variant of PyTorch, ensure that at the minimum, **`cuda_cmake_macros.h`** +is found: + +```sh +conda_prefix=$(conda run -n "${env_name}" printenv CONDA_PREFIX) +find "${conda_prefix}" -name cuda_cmake_macros.h +``` + + +## Build the FBGEMM_GPU Package + +### Preparing the Build + +Clone the repo along with its submodules, and install the packages listed in +`requirements.txt`: + +```sh +# !! Run inside the Conda environment !! + +# Select a version tag +FBGEMM_VERSION=v0.4.0 + +# Clone the repo along with its submodules +git clone --recursive -b ${FBGEMM_VERSION} https://github.com/pytorch/FBGEMM.git fbgemm_${FBGEMM_VERSION} + +# Install additional required packages for building and testing +cd fbgemm_${FBGEMM_VERSION}/fbgemm_gpu +pip install -r requirements.txt +``` + +### The Build Process + +The FBGEMM_GPU build process uses a scikit-build CMake-based build flow, and it +keeps state across install runs. As such, builds can become stale and can cause +problems when re-runs are attempted after a build failure due to missing +dependencies, etc. To address this, simply clear the build cache: + +```sh +# !! Run in fbgemm_gpu/ directory inside the Conda environment !! + +python setup.py clean +``` + +### CUDA Build + +Building FBGEMM_GPU for CUDA requires both NVML and cuDNN to be installed and +made available to the build through environment variables: + +```sh +# !! Run in fbgemm_gpu/ directory inside the Conda environment !!
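# [ILLUSTRATIVE] Not part of the original instructions: if CUDA was installed
# through Conda as described earlier, the NVML library referenced below can
# usually be located under the Conda prefix, e.g.:
#   conda_prefix=$(conda run -n "${env_name}" printenv CONDA_PREFIX)
#   find "${conda_prefix}" -name libnvidia-ml.so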
+ +# [OPTIONAL] Specify the CUDA installation paths +# This may be required if CMake is unable to find nvcc +export CUDACXX=/path/to/nvcc +export CUDA_BIN_PATH=/path/to/cuda/installation + +# [OPTIONAL] Provide the CUB installation directory (applicable only to CUDA versions prior to 11.1) +export CUB_DIR=/path/to/cub + +# Specify cuDNN header and library paths +export CUDNN_INCLUDE_DIR=/path/to/cudnn/include +export CUDNN_LIBRARY=/path/to/cudnn/lib + +# Specify NVML path +export NVML_LIB_PATH=/path/to/libnvidia-ml.so + +# Update to reflect the version of Python in the Conda environment +python_tag=py310 +package_name=fbgemm_gpu + +# Build for SM70/80 (V100/A100 GPU); update as needed +# If not specified, only the CUDA architecture supported by the current system will be targeted +# If no CUDA device is present either, all CUDA architectures will be targeted +cuda_arch_list="7.0;8.0" + +# Build the wheel artifact only +python setup.py bdist_wheel \ + --package_name="${package_name}" \ + --python-tag="${python_tag}" \ + --plat-name=manylinux1_x86_64 \ + --nvml_lib_path=${NVML_LIB_PATH} \ + -DTORCH_CUDA_ARCH_LIST="${cuda_arch_list}" + +# Build and install the library into the Conda environment +python setup.py install \ + --nvml_lib_path=${NVML_LIB_PATH} \ + -DTORCH_CUDA_ARCH_LIST="${cuda_arch_list}" +``` + +### ROCm Build + +For ROCm builds, `ROCM_PATH` and `PYTORCH_ROCM_ARCH` need to be specified: + +```sh +# !! Run in fbgemm_gpu/ directory inside the Conda environment !! + +# Build for the ROCm architecture on the current machine; update as needed (e.g. 'gfx906;gfx908;gfx90a') +export ROCM_PATH=/path/to/rocm +export PYTORCH_ROCM_ARCH=$(${ROCM_PATH}/bin/rocminfo | grep -o -m 1 'gfx.*') + +python_tag=py310 +package_name=fbgemm_gpu_rocm + +# Build the wheel artifact only +python setup.py bdist_wheel \ + --package_name="${package_name}" \ + --python-tag="${python_tag}" \ + --plat-name=manylinux1_x86_64 + +# Build and install the library into the Conda environment +python setup.py install develop +``` + +### CPU-Only Build + +For CPU-only builds, the `--cpu_only` flag needs to be specified: + +```sh +# !! Run in fbgemm_gpu/ directory inside the Conda environment !! + +python_tag=py310 +package_name=fbgemm_gpu_cpu + +# Build the wheel artifact only +python setup.py bdist_wheel \ + --package_name="${package_name}" \ + --python-tag="${python_tag}" \ + --plat-name=manylinux1_x86_64 \ + --cpu_only + +# Build and install the library into the Conda environment +python setup.py install --cpu_only +``` + +### Post-Build Checks + +After the build completes, it is useful to check the built library and verify +the version numbers of GLIBCXX referenced as well as the availability of certain +function symbols: + +```sh +# !! Run in fbgemm_gpu/ directory inside the Conda environment !! + +# Locate the built .SO file +fbgemm_gpu_lib_path=$(find .
-name fbgemm_gpu_py.so) + +# Note the versions of GLIBCXX referenced by the .SO +# The libstdc++.so.6 available on the install target must support these versions +objdump -TC "${fbgemm_gpu_lib_path}" | grep GLIBCXX | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat + +# Test for the existence of a given function symbol in the .SO +nm -gDC "${fbgemm_gpu_lib_path}" | grep " fbgemm_gpu::merge_pooled_embeddings(" +nm -gDC "${fbgemm_gpu_lib_path}" | grep " fbgemm_gpu::jagged_2d_to_dense(" +``` diff --git a/fbgemm_gpu/docs/README.md b/fbgemm_gpu/docs/README.md index 097cde17dc..e2b0c81ae7 100644 --- a/fbgemm_gpu/docs/README.md +++ b/fbgemm_gpu/docs/README.md @@ -123,7 +123,7 @@ Follow these instructions to document, generate, and publish a new C++ descripti ``` pip3 install -r requirements.txt - doxygen Doxygen.ini + doxygen Doxyfile.in make html ``` diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py index 0552e9c981..2c7d99610f 100644 --- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py @@ -9,7 +9,7 @@ import enum import logging -from dataclasses import dataclass +from dataclasses import dataclass, field from itertools import accumulate from math import log2 from typing import Dict, List, NamedTuple, Optional, Tuple, Type, Union @@ -71,6 +71,43 @@ class WeightDecayMode(enum.IntEnum): NONE = 0 L2 = 1 DECOUPLE = 2 + COUNTER = 3 + + +class CounterWeightDecayMode(enum.IntEnum): + NONE = 0 + L2 = 1 + DECOUPLE = 2 + + +class LearningRateMode(enum.IntEnum): + EQUAL = -1 + TAIL_ID_LR_INCREASE = 0 + TAIL_ID_LR_DECREASE = 1 + COUNTER_SGD = 2 + + +class GradSumDecay(enum.IntEnum): + NO_DECAY = -1 + CTR_DECAY = 0 + + +@dataclass +class TailIdThreshold: + val: float = 0 + is_ratio: bool = False + + +@dataclass +class CounterBasedRegularizationDefinition: + counter_weight_decay_mode: CounterWeightDecayMode = CounterWeightDecayMode.NONE + counter_halflife: int = -1 + adjustment_iter: int = -1 + adjustment_ub: float = 1.0 + learning_rate_mode: LearningRateMode = LearningRateMode.EQUAL + grad_sum_decay: GradSumDecay = GradSumDecay.NO_DECAY + tail_id_threshold: TailIdThreshold = field(default_factory=TailIdThreshold) + max_counter_update_freq: int = 1000 RecordCacheMetrics: NamedTuple = NamedTuple( @@ -78,14 +115,16 @@ class WeightDecayMode(enum.IntEnum): [("record_cache_miss_counter", bool), ("record_tablewise_cache_miss", bool)], ) - -@dataclass -class SplitState: - dev_size: int - host_size: int - uvm_size: int - placements: List[EmbeddingLocation] - offsets: List[int] +SplitState: NamedTuple = NamedTuple( + "SplitState", + [ + ("dev_size", int), + ("host_size", int), + ("uvm_size", int), + ("placements", List[EmbeddingLocation]), + ("offsets", List[int]), + ], +) def construct_split_state( @@ -95,11 +134,11 @@ def construct_split_state( precision: SparseType = SparseType.FP32, int8_emb_row_dim_offset: int = INT8_EMB_ROW_DIM_OFFSET, ) -> SplitState: - placements = [] - offsets = [] - dev_size = 0 - host_size = 0 - uvm_size = 0 + placements: List[EmbeddingLocation] = [] + offsets: List[int] = [] + dev_size: int = 0 + host_size: int = 0 + uvm_size: int = 0 for num_embeddings, embedding_dim, location, _ in embedding_specs: assert ( embedding_dim % 4 == 0 @@ -235,6 +274,9 @@ def __init__( # noqa C901 eta: float = 0.001, # used by LARS-SGD, beta1: float = 0.9, # used by LAMB and ADAM beta2: float = 0.999, # used by LAMB and ADAM + 
counter_based_regularization: Optional[ + CounterBasedRegularizationDefinition + ] = None, # used by Rowwise Adagrad pooling_mode: PoolingMode = PoolingMode.SUM, device: Optional[Union[str, int, torch.device]] = None, bounds_check_mode: BoundsCheckMode = BoundsCheckMode.WARNING, @@ -408,6 +450,34 @@ def __init__( # noqa C901 self.stochastic_rounding = stochastic_rounding self.optimizer = optimizer + self.weight_decay_mode = weight_decay_mode + if ( + weight_decay_mode == WeightDecayMode.COUNTER + and counter_based_regularization is None + ): + raise AssertionError( + "weight_decay_mode is set to WeightDecayMode.COUNTER but counter_based_regularization is None" + ) + + self._used_rowwise_adagrad_with_counter: bool = ( + optimizer in (OptimType.EXACT_ROWWISE_ADAGRAD, OptimType.ROWWISE_ADAGRAD) + and weight_decay_mode == WeightDecayMode.COUNTER + and counter_based_regularization is not None + ) + + if counter_based_regularization is None: + counter_based_regularization = CounterBasedRegularizationDefinition() + self._max_counter_update_freq: int = -1 + if self._used_rowwise_adagrad_with_counter: + self._max_counter_update_freq = ( + counter_based_regularization.max_counter_update_freq + ) + opt_arg_weight_decay_mode = ( + counter_based_regularization.counter_weight_decay_mode + ) + else: + opt_arg_weight_decay_mode = weight_decay_mode + self.optimizer_args = invokers.lookup_args.OptimizerArgs( stochastic_rounding=stochastic_rounding, gradient_clipping=gradient_clipping, @@ -417,9 +487,18 @@ def __init__( # noqa C901 beta1=beta1, beta2=beta2, weight_decay=weight_decay, - weight_decay_mode=weight_decay_mode.value, + weight_decay_mode=opt_arg_weight_decay_mode.value, eta=eta, momentum=momentum, + counter_halflife=counter_based_regularization.counter_halflife, + adjustment_iter=counter_based_regularization.adjustment_iter, + adjustment_ub=counter_based_regularization.adjustment_ub, + learning_rate_mode=counter_based_regularization.learning_rate_mode.value, + grad_sum_decay=counter_based_regularization.grad_sum_decay.value, + tail_id_threshold=counter_based_regularization.tail_id_threshold.val, + is_tail_id_thresh_ratio=int( + counter_based_regularization.tail_id_threshold.is_ratio + ), ) if optimizer in ( @@ -427,25 +506,7 @@ def __init__( # noqa C901 OptimType.EXACT_SGD, ): # NOTE: make TorchScript work! - self.register_buffer( - "momentum1_dev", torch.tensor([0], dtype=torch.int64), persistent=False - ) - self.register_buffer( - "momentum1_host", torch.tensor([0], dtype=torch.int64), persistent=False - ) - self.register_buffer( - "momentum1_uvm", torch.tensor([0], dtype=torch.int64), persistent=False - ) - self.register_buffer( - "momentum1_placements", - torch.tensor([0], dtype=torch.int64), - persistent=False, - ) - self.register_buffer( - "momentum1_offsets", - torch.tensor([0], dtype=torch.int64), - persistent=False, - ) + self._register_nonpersistent_buffers("momentum1") else: self._apply_split( construct_split_state( @@ -484,29 +545,40 @@ def __init__( # noqa C901 ) else: # NOTE: make TorchScript work! 
- self.register_buffer( - "momentum2_dev", - torch.zeros(1, dtype=torch.int64, device=self.current_device), - persistent=False, - ) - self.register_buffer( - "momentum2_host", - torch.zeros(1, dtype=torch.int64, device=self.current_device), - persistent=False, - ) - self.register_buffer( - "momentum2_uvm", - torch.zeros(1, dtype=torch.int64, device=self.current_device), - persistent=False, + self._register_nonpersistent_buffers("momentum2") + if self._used_rowwise_adagrad_with_counter: + self._apply_split( + construct_split_state( + embedding_specs, + rowwise=True, + cacheable=False, + ), + prefix="prev_iter", + # TODO: ideally we should use int64 to track iter but it failed to compile. + # It may be related to low precision training code. Currently using float32 + # as a workaround while investigating the issue. + # pyre-fixme[6]: Expected `Type[Type[torch._dtype]]` for 3rd param + # but got `Type[torch.float32]`. + dtype=torch.float32, ) - self.register_buffer( - "momentum2_placements", - torch.zeros(1, dtype=torch.int64, device=self.current_device), - persistent=False, + self._apply_split( + construct_split_state( + embedding_specs, + rowwise=True, + cacheable=False, + ), + prefix="row_counter", + # pyre-fixme[6]: Expected `Type[Type[torch._dtype]]` for 3rd param + # but got `Type[torch.float32]`. + dtype=torch.float32, ) + self.register_buffer("max_counter", torch.tensor([1], dtype=torch.float32)) + else: + self._register_nonpersistent_buffers("prev_iter") + self._register_nonpersistent_buffers("row_counter") self.register_buffer( - "momentum2_offsets", - torch.zeros(1, dtype=torch.int64, device=self.current_device), + "max_counter", + torch.ones(1, dtype=torch.float32, device=self.current_device), persistent=False, ) if optimizer in ( @@ -519,6 +591,7 @@ def __init__( # noqa C901 self.register_buffer( "iter", torch.zeros(1, dtype=torch.int64, device=self.current_device) ) + else: self.register_buffer( "iter", @@ -572,6 +645,34 @@ def __init__( # noqa C901 self.step = 0 + def _register_nonpersistent_buffers(self, prefix: str) -> None: + # NOTE: make TorchScript work! 
+ self.register_buffer( + f"{prefix}_dev", + torch.zeros(1, dtype=torch.int64, device=self.current_device), + persistent=False, + ) + self.register_buffer( + f"{prefix}_host", + torch.zeros(1, dtype=torch.int64, device=self.current_device), + persistent=False, + ) + self.register_buffer( + f"{prefix}_uvm", + torch.zeros(1, dtype=torch.int64, device=self.current_device), + persistent=False, + ) + self.register_buffer( + f"{prefix}_placements", + torch.zeros(1, dtype=torch.int64, device=self.current_device), + persistent=False, + ) + self.register_buffer( + f"{prefix}_offsets", + torch.zeros(1, dtype=torch.int64, device=self.current_device), + persistent=False, + ) + def get_states(self, prefix: str) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]: if not hasattr(self, f"{prefix}_physical_placements"): raise DoesNotHavePrefix() @@ -590,7 +691,7 @@ def get_states(self, prefix: str) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tenso def get_all_states(self) -> List[Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]]: all_states = [] - for prefix in ["weights", "momentum1", "momentum2"]: + for prefix in ["weights", "momentum1", "momentum2", "prev_iter", "row_counter"]: try: all_states.append(self.get_states(prefix)) except DoesNotHavePrefix: @@ -681,10 +782,20 @@ def forward( return invokers.lookup_approx_sgd.invoke(common_args, self.optimizer_args) momentum1 = invokers.lookup_args.Momentum( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got `Union[Tensor, + # nn.Module]`. dev=self.momentum1_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got `Union[Tensor, + # nn.Module]`. host=self.momentum1_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got `Union[Tensor, + # nn.Module]`. uvm=self.momentum1_uvm, + # pyre-fixme[6]: Expected `Tensor` for 4th param but got `Union[Tensor, + # nn.Module]`. offsets=self.momentum1_offsets, + # pyre-fixme[6]: Expected `Tensor` for 5th param but got `Union[Tensor, + # nn.Module]`. placements=self.momentum1_placements, ) @@ -696,21 +807,22 @@ def forward( return invokers.lookup_adagrad.invoke( common_args, self.optimizer_args, momentum1 ) - if self.optimizer == OptimType.EXACT_ROWWISE_ADAGRAD: - return invokers.lookup_rowwise_adagrad.invoke( - common_args, self.optimizer_args, momentum1 - ) - if self.optimizer == OptimType.ROWWISE_ADAGRAD: - assert self.use_cpu, "Approx rowwise AdaGrad is only supported in CPU mode" - return invokers.lookup_approx_rowwise_adagrad.invoke( - common_args, self.optimizer_args, momentum1 - ) momentum2 = invokers.lookup_args.Momentum( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got `Union[Tensor, + # nn.Module]`. dev=self.momentum2_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got `Union[Tensor, + # nn.Module]`. host=self.momentum2_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got `Union[Tensor, + # nn.Module]`. uvm=self.momentum2_uvm, + # pyre-fixme[6]: Expected `Tensor` for 4th param but got `Union[Tensor, + # nn.Module]`. offsets=self.momentum2_offsets, + # pyre-fixme[6]: Expected `Tensor` for 5th param but got `Union[Tensor, + # nn.Module]`. placements=self.momentum2_placements, ) # Ensure iter is always on CPU so the increment doesn't synchronize. @@ -768,6 +880,79 @@ def forward( self.iter.item(), ) + prev_iter = invokers.lookup_args.Momentum( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got `Union[Tensor, + # nn.Module]`. + dev=self.prev_iter_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got `Union[Tensor, + # nn.Module]`. 
+ host=self.prev_iter_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got `Union[Tensor, + # nn.Module]`. + uvm=self.prev_iter_uvm, + # pyre-fixme[6]: Expected `Tensor` for 4th param but got `Union[Tensor, + # nn.Module]`. + offsets=self.prev_iter_offsets, + # pyre-fixme[6]: Expected `Tensor` for 5th param but got `Union[Tensor, + # nn.Module]`. + placements=self.prev_iter_placements, + ) + row_counter = invokers.lookup_args.Momentum( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got `Union[Tensor, + # nn.Module]`. + dev=self.row_counter_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got `Union[Tensor, + # nn.Module]`. + host=self.row_counter_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got `Union[Tensor, + # nn.Module]`. + uvm=self.row_counter_uvm, + # pyre-fixme[6]: Expected `Tensor` for 4th param but got `Union[Tensor, + # nn.Module]`. + offsets=self.row_counter_offsets, + # pyre-fixme[6]: Expected `Tensor` for 5th param but got `Union[Tensor, + # nn.Module]`. + placements=self.row_counter_placements, + ) + if self._used_rowwise_adagrad_with_counter: + if self.iter.item() % self._max_counter_update_freq == 0: + max_counter = torch.max(self.row_counter_dev.detach()) + self.max_counter = max_counter.cpu() + 1 + + if self.optimizer == OptimType.EXACT_ROWWISE_ADAGRAD: + if self._used_rowwise_adagrad_with_counter: + return invokers.lookup_rowwise_adagrad_with_counter.invoke( + common_args, + self.optimizer_args, + momentum1, + prev_iter, + row_counter, + # pyre-fixme[6]: Expected `int` for 6th param but got `Union[float, int]`. + self.iter.item(), + self.max_counter.item(), + ) + else: + return invokers.lookup_rowwise_adagrad.invoke( + common_args, self.optimizer_args, momentum1 + ) + if self.optimizer == OptimType.ROWWISE_ADAGRAD: + assert self.use_cpu, "Approx rowwise AdaGrad is only supported in CPU mode" + if self._used_rowwise_adagrad_with_counter: + return invokers.lookup_approx_rowwise_adagrad_with_counter.invoke( + common_args, + self.optimizer_args, + momentum1, + prev_iter, + row_counter, + # pyre-fixme[6]: Expected `int` for 6th param but got `Union[float, int]`. 
+ self.iter.item(), + self.max_counter.item(), + ) + else: + return invokers.lookup_approx_rowwise_adagrad.invoke( + common_args, self.optimizer_args, momentum1 + ) + raise ValueError(f"Invalid OptimType: {self.optimizer}") def reset_uvm_cache_stats(self) -> None: @@ -796,10 +981,11 @@ def print_uvm_cache_stats(self) -> None: f"N_conflict_unique_misses: {uvm_cache_stats[4]}\n" f"N_conflict_misses: {uvm_cache_stats[5]}\n" ) - logging.info( - f"unique indices / requested indices: {uvm_cache_stats[2]/uvm_cache_stats[1]}\n" - f"unique misses / requested indices: {uvm_cache_stats[3]/uvm_cache_stats[1]}\n" - ) + if uvm_cache_stats[1]: + logging.info( + f"unique indices / requested indices: {uvm_cache_stats[2]/uvm_cache_stats[1]}\n" + f"unique misses / requested indices: {uvm_cache_stats[3]/uvm_cache_stats[1]}\n" + ) def prefetch(self, indices: Tensor, offsets: Tensor) -> None: self.timestep += 1 @@ -1013,8 +1199,12 @@ def get_optimizer_state(self) -> List[Dict[str, torch.Tensor]]: or self.optimizer == OptimType.ROWWISE_ADAGRAD or self.optimizer == OptimType.EXACT_ROWWISE_WEIGHTED_ADAGRAD ): + split_optimizer_states = self.split_optimizer_states() list_of_state_dict = [ - {"sum": _sum[0]} for _sum in self.split_optimizer_states() + {"sum": states[0], "prev_iter": states[1], "row_counter": states[2]} + if self._used_rowwise_adagrad_with_counter + else {"sum": states[0]} + for states in split_optimizer_states ] else: raise NotImplementedError( @@ -1024,7 +1214,9 @@ def get_optimizer_state(self) -> List[Dict[str, torch.Tensor]]: return list_of_state_dict @torch.jit.ignore - def split_optimizer_states(self) -> List[Tuple[torch.Tensor]]: + def split_optimizer_states( + self, + ) -> List[List[torch.Tensor]]: """ Returns a list of states, split by table """ @@ -1062,8 +1254,14 @@ def get_optimizer_states( ): states.append( get_optimizer_states( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got + # `Union[Tensor, nn.Module]`. self.momentum1_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got + # `Union[Tensor, nn.Module]`. self.momentum1_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got + # `Union[Tensor, nn.Module]`. self.momentum1_uvm, # pyre-fixme[6]: Expected `Tensor` for 4th param but got # `Union[Tensor, nn.Module]`. @@ -1087,8 +1285,14 @@ def get_optimizer_states( ): states.append( get_optimizer_states( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got + # `Union[Tensor, nn.Module]`. self.momentum2_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got + # `Union[Tensor, nn.Module]`. self.momentum2_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got + # `Union[Tensor, nn.Module]`. self.momentum2_uvm, # pyre-fixme[6]: Expected `Tensor` for 4th param but got # `Union[Tensor, nn.Module]`. @@ -1100,7 +1304,49 @@ def get_optimizer_states( in (OptimType.PARTIAL_ROWWISE_ADAM, OptimType.PARTIAL_ROWWISE_LAMB), ) ) - return list(zip(*states)) + if self._used_rowwise_adagrad_with_counter: + states.append( + get_optimizer_states( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got + # `Union[Tensor, nn.Module]`. + self.prev_iter_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got + # `Union[Tensor, nn.Module]`. + self.prev_iter_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got + # `Union[Tensor, nn.Module]`. + self.prev_iter_uvm, + # pyre-fixme[6]: Expected `Tensor` for 4th param but got + # `Union[Tensor, nn.Module]`. 
+ self.prev_iter_physical_offsets, + # pyre-fixme[6]: Expected `Tensor` for 5th param but got + # `Union[Tensor, nn.Module]`. + self.prev_iter_physical_placements, + rowwise=True, + ) + ) + states.append( + get_optimizer_states( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got + # `Union[Tensor, nn.Module]`. + self.row_counter_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got + # `Union[Tensor, nn.Module]`. + self.row_counter_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got + # `Union[Tensor, nn.Module]`. + self.row_counter_uvm, + # pyre-fixme[6]: Expected `Tensor` for 4th param but got + # `Union[Tensor, nn.Module]`. + self.row_counter_physical_offsets, + # pyre-fixme[6]: Expected `Tensor` for 5th param but got + # `Union[Tensor, nn.Module]`. + self.row_counter_physical_placements, + rowwise=True, + ) + ) + return_states = [list(s) for s in zip(*states)] + return return_states @torch.jit.export def set_learning_rate(self, lr: float) -> None: @@ -1691,8 +1937,8 @@ def nbit_construct_split_state( scale_bias_size_in_bytes: int = DEFAULT_SCALE_BIAS_SIZE_IN_BYTES, cacheline_alignment: bool = True, ) -> SplitState: - placements = [] - offsets = [] + placements = torch.jit.annotate(List[EmbeddingLocation], []) + offsets = torch.jit.annotate(List[int], []) dev_size = 0 host_size = 0 uvm_size = 0 @@ -1740,6 +1986,8 @@ class IntNBitTableBatchedEmbeddingBagsCodegen(nn.Module): cache_miss_counter: torch.Tensor uvm_cache_stats: torch.Tensor local_uvm_cache_stats: torch.Tensor + weights_offsets: torch.Tensor + weights_placements: torch.Tensor def __init__( self, @@ -1921,21 +2169,7 @@ def max_ty_D(ty: SparseType) -> int: ] self.max_D_cache: int = max(cached_dims) if len(cached_dims) > 0 else 0 - weight_split: SplitState = nbit_construct_split_state( - self.embedding_specs, - cacheable=True, - row_alignment=self.row_alignment, - scale_bias_size_in_bytes=self.scale_bias_size_in_bytes, - cacheline_alignment=cacheline_alignment, - ) - - self.weights_physical_placements: List[int] = [ - t.value for t in weight_split.placements - ] - self.weights_physical_offsets: List[int] = weight_split.offsets - self.host_size: int = weight_split.host_size - self.dev_size: int = weight_split.dev_size - self.uvm_size: int = weight_split.uvm_size + self.initialize_physical_weights_placements_and_offsets(cacheline_alignment) self.enforce_hbm: bool = enforce_hbm # Assign weights after weights and weights_offsets are initialized. @@ -1948,7 +2182,8 @@ def max_ty_D(ty: SparseType) -> int: self.weights_physical_offsets, self.enforce_hbm, ) - self.assign_embedding_weights(weight_lists) # type: ignore + # pyre-fixme [6]: In call `IntNBitTableBatchedEmbeddingBagsCodegen.assign_embedding_weights`, for 1st positional argument, expected `List[Tuple[Tensor, Optional[Tensor]]]` but got `List[Tuple[Tensor, Tensor]]`. + self.assign_embedding_weights(weight_lists) # Handle index remapping for embedding pruning. 
self.register_buffer( @@ -2104,10 +2339,11 @@ def print_uvm_cache_stats(self) -> None: f"N_conflict_unique_misses: {uvm_cache_stats[4]}\n" f"N_conflict_misses: {uvm_cache_stats[5]}\n" ) - logging.info( - f"unique indices / requested indices: {uvm_cache_stats[2]/uvm_cache_stats[1]}\n" - f"unique misses / requested indices: {uvm_cache_stats[3]/uvm_cache_stats[1]}\n" - ) + if uvm_cache_stats[1]: + logging.info( + f"unique indices / requested indices: {uvm_cache_stats[2]/uvm_cache_stats[1]}\n" + f"unique misses / requested indices: {uvm_cache_stats[3]/uvm_cache_stats[1]}\n" + ) @torch.jit.export def prefetch(self, indices: Tensor, offsets: Tensor) -> None: @@ -2409,6 +2645,72 @@ def forward( fp8_exponent_bias=self.fp8_exponent_bias, ) + def initialize_logical_weights_placements_and_offsets( + self, + ) -> None: + assert len(self.weights_physical_offsets) == len(self.embedding_specs) + assert len(self.weights_physical_offsets) == len( + self.weights_physical_placements + ) + offsets = [self.weights_physical_offsets[t] for t in self.feature_table_map] + placements = [ + self.weights_physical_placements[t] for t in self.feature_table_map + ] + self.weights_offsets = torch.tensor( + offsets, device=self.current_device, dtype=torch.int64 + ) + self.weights_placements = torch.tensor( + placements, device=self.current_device, dtype=torch.int32 + ) + + def initialize_physical_weights_placements_and_offsets( + self, + cacheline_alignment: bool = True, + ) -> None: + # Initialize physical weights placements and offsets + # and host/dev/uvm sizes + weight_split: SplitState = nbit_construct_split_state( + self.embedding_specs, + cacheable=True, + row_alignment=self.row_alignment, + scale_bias_size_in_bytes=self.scale_bias_size_in_bytes, + cacheline_alignment=cacheline_alignment, + ) + self.weights_physical_placements = [t.value for t in weight_split.placements] + self.weights_physical_offsets = weight_split.offsets + self.host_size = weight_split.host_size + self.dev_size = weight_split.dev_size + self.uvm_size = weight_split.uvm_size + + @torch.jit.export + def reset_weights_placements_and_offsets( + self, device: torch.device, location: int + ) -> None: + # Reset device/location denoted in embedding specs + self.reset_embedding_spec_location(device, location) + # Initialize all physical/logical weights placements and offsets without initializing large dev weights tensor + self.initialize_physical_weights_placements_and_offsets() + self.initialize_logical_weights_placements_and_offsets() + + def reset_embedding_spec_location( + self, device: torch.device, location: int + ) -> None: + # Overwrite location in embedding_specs with new location + # Use map since can't script enum call (ie. 
EmbeddingLocation(value)) + INT_TO_EMBEDDING_LOCATION = { + 0: EmbeddingLocation.DEVICE, + 1: EmbeddingLocation.MANAGED, + 2: EmbeddingLocation.MANAGED_CACHING, + 3: EmbeddingLocation.HOST, + } + target_location = INT_TO_EMBEDDING_LOCATION[location] + self.current_device = device + self.row_alignment = 1 if target_location == EmbeddingLocation.HOST else 16 + self.embedding_specs = [ + (spec[0], spec[1], spec[2], spec[3], target_location) + for spec in self.embedding_specs + ] + def _apply_split( self, dev_size: int, @@ -2427,14 +2729,7 @@ def _apply_split( self.dev_size = dev_size self.uvm_size = uvm_size - offsets = [offsets[t] for t in self.feature_table_map] - placements = [placements[t] for t in self.feature_table_map] - self.weights_offsets = torch.tensor( - offsets, device=self.current_device, dtype=torch.int64 - ) - self.weights_placements = torch.tensor( - placements, device=self.current_device, dtype=torch.int32 - ) + self.initialize_logical_weights_placements_and_offsets() if dev_size > 0: self.weights_dev = torch.zeros( @@ -2816,6 +3111,49 @@ def assign_embedding_weights( else: assert dest_weight[1] is None + @torch.jit.export + def set_index_remappings_array( + self, + index_remapping: List[Tensor], + ) -> None: + rows: List[int] = [e[1] for e in self.embedding_specs] + index_remappings_array_offsets = [0] + original_feature_rows = torch.jit.annotate(List[int], []) + last_offset = 0 + for t, mapping in enumerate(index_remapping): + if mapping is not None: + current_original_row = mapping.numel() + last_offset += current_original_row + original_feature_rows.append(current_original_row) + else: + original_feature_rows.append(rows[t]) + index_remappings_array_offsets.append(last_offset) + + self.index_remappings_array_offsets = torch.tensor( + index_remappings_array_offsets, + device=self.current_device, + dtype=torch.int64, + ) + if len(original_feature_rows) == 0: + original_feature_rows = rows + self.original_rows_per_table = torch.tensor( + [original_feature_rows[t] for t in self.feature_table_map], + device=self.current_device, + dtype=torch.int64, + ) + if self.index_remappings_array_offsets[-1] == 0: + self.index_remappings_array = torch.empty( + 0, dtype=torch.int32, device=self.current_device + ) + else: + index_remappings_filter_nones = [] + for mapping in index_remapping: + if mapping is not None: + index_remappings_filter_nones.append(mapping) + self.index_remappings_array = torch.cat(index_remappings_filter_nones).to( + self.current_device + ) + def set_index_remappings( self, index_remapping: List[Tensor], @@ -2882,37 +3220,7 @@ def set_index_remappings( self.index_remapping_hash_table_cpu = None # Array mapping pruning else: - index_remappings_array_offsets = [0] - original_feature_rows = [] - last_offset = 0 - for t, mapping in enumerate(index_remapping): - if mapping is not None: - current_original_row = mapping.numel() - last_offset += current_original_row - original_feature_rows.append(current_original_row) - else: - original_feature_rows.append(rows[t]) - index_remappings_array_offsets.append(last_offset) - - self.index_remappings_array_offsets = torch.tensor( - index_remappings_array_offsets, - device=self.current_device, - dtype=torch.int64, - ) - if len(original_feature_rows) == 0: - original_feature_rows = rows - self.original_rows_per_table = torch.tensor( - [original_feature_rows[t] for t in self.feature_table_map], - device=self.current_device, - dtype=torch.int64, - ) - self.index_remappings_array = ( - torch.empty(0, dtype=torch.int32, 
device=self.current_device) - if self.index_remappings_array_offsets[-1] == 0 - else torch.cat( - [mapping for mapping in index_remapping if mapping is not None] - ).to(self.current_device) - ) + self.set_index_remappings_array(index_remapping) def _embedding_inplace_update_per_table( self, diff --git a/fbgemm_gpu/fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py b/fbgemm_gpu/fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py index 1eec03fdd9..250f84abb6 100644 --- a/fbgemm_gpu/fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py @@ -18,6 +18,7 @@ from fbgemm_gpu.split_table_batched_embeddings_ops import ( align_to_cacheline, CacheAlgorithm, + CounterBasedRegularizationDefinition, DEFAULT_SCALE_BIAS_SIZE_IN_BYTES, EmbeddingLocation, PoolingMode, @@ -88,6 +89,9 @@ def __init__( eta: float = 0.001, # used by LARS-SGD, beta1: float = 0.9, # used by LAMB and ADAM beta2: float = 0.999, # used by LAMB and ADAM + counter_based_regularization: Optional[ + CounterBasedRegularizationDefinition + ] = None, # used by Rowwise Adagrad pooling_mode: PoolingMode = PoolingMode.SUM, ) -> None: super(SSDTableBatchedEmbeddingBags, self).__init__() @@ -217,6 +221,12 @@ def __init__( self.ssd_set_end = torch.cuda.Event() self.timesteps_prefetched: List[int] = [] + if weight_decay_mode == WeightDecayMode.COUNTER or counter_based_regularization: + raise AssertionError( + "weight_decay_mode = WeightDecayMode.COUNTER is not supported for SSD TBE." + ) + counter_based_regularization = CounterBasedRegularizationDefinition() + self.optimizer_args = invokers.lookup_args.OptimizerArgs( stochastic_rounding=stochastic_rounding, gradient_clipping=gradient_clipping, @@ -229,6 +239,15 @@ def __init__( weight_decay_mode=weight_decay_mode.value, eta=eta, momentum=momentum, + counter_halflife=counter_based_regularization.counter_halflife, + adjustment_iter=counter_based_regularization.adjustment_iter, + adjustment_ub=counter_based_regularization.adjustment_ub, + learning_rate_mode=counter_based_regularization.learning_rate_mode.value, + grad_sum_decay=counter_based_regularization.grad_sum_decay.value, + tail_id_threshold=counter_based_regularization.tail_id_threshold.val, + is_tail_id_thresh_ratio=int( + counter_based_regularization.tail_id_threshold.is_ratio + ), ) self.weights_dev = nn.Parameter( torch.empty((0,), device=self.current_device, dtype=torch.float32) diff --git a/fbgemm_gpu/include/fbgemm_gpu/embedding_inplace_update.h b/fbgemm_gpu/include/fbgemm_gpu/embedding_inplace_update.h index 10670b48d4..cfa457d04b 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/embedding_inplace_update.h +++ b/fbgemm_gpu/include/fbgemm_gpu/embedding_inplace_update.h @@ -75,4 +75,28 @@ void embedding_inplace_update_cpu( c10::nullopt // Not used, to match cache interface for CUDA op ); +/** + * Index remapping function that returns the remapped indices. + * + * Args: + * update_row_indices: row indices for every new row + * update_table_indices: table indices for every new row + * index_remappings: concated index remapping for every embedding table + * index_remappings_offsets: offset for each embedding table + * + * Returns: + * remapped indices for each new row. 
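+ *
+ * Illustrative example (values assumed, not taken from this change): with
+ * index_remappings_offsets = [0, 4, 4], table 0 has a remapping of capacity
+ * 4 and table 1 has none. A new row (table_idx = 0, row_idx = 2) is remapped
+ * to index_remappings[0 + 2], while (table_idx = 1, row_idx = 7) hits the
+ * capacity == 0 fall-through and is returned unchanged as 7.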
+ */ +Tensor pruned_array_lookup_from_row_idx_cuda( + const Tensor& update_row_indices, + const Tensor& update_table_indices, + const Tensor& index_remappings, + const Tensor& index_remappings_offsets); + +Tensor pruned_array_lookup_from_row_idx_cpu( + const Tensor& update_row_indices, + const Tensor& update_table_indices, + const Tensor& index_remappings, + const Tensor& index_remappings_offsets); + } // namespace fbgemm_gpu diff --git a/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh index 5ce7d4f5d1..c21057ac49 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh +++ b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh @@ -62,6 +62,11 @@ static constexpr int32_t kWarpSize = 32; #endif // Max thread num in one thread block static constexpr int32_t kMaxThreads = 1024; +// Max block size in Y dimension of a grid +static constexpr int32_t kMaxBlockYDim = 65535; +// Max block size in Z dimension of a grid +static constexpr int32_t kMaxBlockZDim = 65535; + static constexpr float kQParamEps = 1e-8f; /* For rowwise int8 quantization, two quantization parameters (qparams) diff --git a/fbgemm_gpu/include/fbgemm_gpu/fbgemm_tensor_accessor.h b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_tensor_accessor.h new file mode 100644 index 0000000000..750d315d05 --- /dev/null +++ b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_tensor_accessor.h @@ -0,0 +1,575 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +namespace fbgemm_gpu { + +static constexpr size_t PTR_NAME_MAX_LEN = 16; +static constexpr size_t FUNC_NAME_MAX_LEN = 64; + +// The PtrTraits argument to the TensorAccessor/GenericPackedTensorAccessor +// is used to enable the __restrict__ keyword/modifier for the data +// passed to cuda. +template +struct DefaultPtrTraits { + typedef T* PtrType; +}; + +#if defined(__CUDACC__) || defined(__HIPCC__) +template +struct RestrictPtrTraits { + typedef T* __restrict__ PtrType; +}; +#endif + +// TensorAccessorBase and TensorAccessor are used for both CPU and CUDA tensors. +// For CUDA tensors it is used in device code (only). This means that we +// restrict ourselves to functions and types available there (e.g. +// at::IntArrayRef isn't). + +// The PtrTraits argument is only relevant to cuda to support `__restrict__` +// pointers. 
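+// Rough sketch of what the two traits select (float element type assumed):
+//
+//   DefaultPtrTraits<float>::PtrType   ->  float*
+//   RestrictPtrTraits<float>::PtrType  ->  float* __restrict__
+//
+// RestrictPtrTraits is only defined under CUDA/HIP compilation; __restrict__
+// is a no-aliasing promise to the compiler, so it is purely an optimization
+// hint and does not change the accessor's behavior.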
+template < + typename T, + size_t N, + template class PtrTraits = DefaultPtrTraits, + typename index_t = int64_t> +class TensorAccessorBase { + public: + typedef typename PtrTraits::PtrType PtrType; + + C10_HOST_DEVICE TensorAccessorBase( + PtrType data, + const index_t* const sizes, + const index_t* const strides, + const char* const ptr_name, + const char* const func_name) + : data_(data), + sizes_(sizes), + strides_(strides), + ptr_name_(ptr_name), + func_name_(func_name) { + numel_ = 0; + for (size_t d = 0; d < N; d++) { + numel_ += sizes[d]; + } + } + C10_HOST at::IntArrayRef sizes() const { + return at::IntArrayRef(sizes_, N); + } + C10_HOST at::IntArrayRef strides() const { + return at::IntArrayRef(strides_, N); + } + C10_HOST_DEVICE index_t stride(index_t i) const { + return strides_[i]; + } + C10_HOST_DEVICE index_t size(index_t i) const { + return sizes_[i]; + } + C10_HOST_DEVICE PtrType data() { + return data_; + } + C10_HOST_DEVICE const PtrType data() const { + return data_; + } + C10_HOST_DEVICE T& at(index_t idx) const { + if (idx < 0) { + printf( + "ERROR: idx < 0, tensor %s in %s, idx %lld\n", + ptr_name_, + func_name_, + static_cast(idx)); + CUDA_KERNEL_ASSERT(idx >= 0) + } else if (idx >= numel_) { + printf( + "ERROR: idx >= numel, tensor %s in %s, idx %lld, numel %lld\n", + ptr_name_, + func_name_, + static_cast(idx), + static_cast(numel_)); + CUDA_KERNEL_ASSERT(idx < numel_); + } + return data_[idx]; + } + + protected: + PtrType data_; + const index_t* const sizes_; + const index_t* const strides_; + index_t numel_; + const char* const ptr_name_; + const char* const func_name_; +}; + +// The `TensorAccessor` is typically instantiated for CPU `Tensor`s using +// `Tensor.accessor()`. +// For CUDA `Tensor`s, `GenericPackedTensorAccessor` is used on the host and +// only indexing on the device uses `TensorAccessor`s. 
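+// Indexing sketch (shape assumed for illustration): for a 2-D accessor over a
+// contiguous [4][8] tensor, sizes_ = {4, 8} and strides_ = {8, 1}, so
+//
+//   acc[i]     ->  1-D accessor viewing data_ + 8 * i, with sizes_ = {8}
+//   acc[i][j]  ->  element data_[8 * i + j], bounds-checked through at()
+//
+// Each operator[] below simply offsets the data pointer by strides_[0] * i
+// and drops the leading entry of sizes_ / strides_.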
+template < + typename T, + size_t N, + template class PtrTraits = DefaultPtrTraits, + typename index_t = int64_t> +class TensorAccessor : public TensorAccessorBase { + public: + typedef typename PtrTraits::PtrType PtrType; + + C10_HOST_DEVICE TensorAccessor( + PtrType data, + const index_t* const sizes, + const index_t* const strides, + const char* const ptr_name, + const char* const func_name) + : TensorAccessorBase( + data, + sizes, + strides, + ptr_name, + func_name) {} + + C10_HOST_DEVICE TensorAccessor operator[]( + index_t i) { + return TensorAccessor( + this->data_ + this->strides_[0] * i, + this->sizes_ + 1, + this->strides_ + 1, + this->ptr_name_, + this->func_name); + } + + C10_HOST_DEVICE const TensorAccessor operator[]( + index_t i) const { + return TensorAccessor( + this->data_ + this->strides_[0] * i, + this->sizes_ + 1, + this->strides_ + 1, + this->ptr_name_, + this->func_name); + } +}; + +template class PtrTraits, typename index_t> +class TensorAccessor + : public TensorAccessorBase { + public: + typedef typename PtrTraits::PtrType PtrType; + + C10_HOST_DEVICE TensorAccessor( + PtrType data, + const index_t* const sizes, + const index_t* const strides, + const char* const ptr_name, + const char* func_name) + : TensorAccessorBase( + data, + sizes, + strides, + ptr_name, + func_name) {} + C10_HOST_DEVICE T& operator[](index_t i) { + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + return this->at(this->strides_[0] * i); + } + C10_HOST_DEVICE const T& operator[](index_t i) const { + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + return this->at(this->strides_[0] * i); + } +}; + +// GenericPackedTensorAccessorBase and GenericPackedTensorAccessor are used on +// for CUDA `Tensor`s on the host and as In contrast to `TensorAccessor`s, they +// copy the strides and sizes on instantiation (on the host) in order to +// transfer them on the device when calling kernels. On the device, indexing of +// multidimensional tensors gives to `TensorAccessor`s. Use RestrictPtrTraits as +// PtrTraits if you want the tensor's data pointer to be marked as __restrict__. +// Instantiation from data, sizes, strides is only needed on the host and +// std::copy isn't available on the device, so those functions are host only. 
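+// Host-side construction sketch (tensor and kernel names are illustrative):
+// when FBGEMM_GPU_MEMCHECK is enabled, kernels receive these accessors via
+// the helpers and macros defined at the end of this header, e.g.
+//
+//   auto weights = MAKE_PACKED_TENSOR_ACCESSOR_BASE(
+//       "my_kernel", weights_tensor, float, 2, at::RestrictPtrTraits, 32);
+//
+// Compared to at::GenericPackedTensorAccessor, the extra numel_, ptr_name_,
+// and func_name_ members let at() report which tensor overflowed and in which
+// kernel before CUDA_KERNEL_ASSERT aborts.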
+template < + typename T, + size_t N, + template class PtrTraits = DefaultPtrTraits, + typename index_t = int64_t> +class GenericPackedTensorAccessorBase { + public: + typedef typename PtrTraits::PtrType PtrType; + C10_HOST GenericPackedTensorAccessorBase( + PtrType data, + const index_t* const sizes, + const index_t* const strides, + const char* const ptr_name, + const char* const func_name) + : data_(data) { + std::copy(sizes, sizes + N, std::begin(sizes_)); + std::copy(strides, strides + N, std::begin(strides_)); + // Compute numel_ + numel_ = 0; + for (size_t d = 0; d < N; d++) { + numel_ += sizes[d]; + } + copy_str(ptr_name_, ptr_name, PTR_NAME_MAX_LEN); + copy_str(func_name_, func_name, FUNC_NAME_MAX_LEN); + } + + // if index_t is not int64_t, we want to have an int64_t constructor + template < + typename source_index_t, + class = typename std::enable_if< + std::is_same::value>::type> + C10_HOST GenericPackedTensorAccessorBase( + PtrType data, + const source_index_t* const sizes, + const source_index_t* const strides, + const char* const ptr_name, + const char* const func_name) + : data_(data) { + for (const auto i : c10::irange(N)) { + this->sizes_[i] = sizes[i]; + this->strides_[i] = strides[i]; + } + // Compute numel_ + numel_ = 0; + for (size_t d = 0; d < N; d++) { + numel_ += sizes[d]; + } + copy_str(ptr_name_, ptr_name, PTR_NAME_MAX_LEN); + copy_str(func_name_, func_name, FUNC_NAME_MAX_LEN); + } + + C10_HOST void copy_str(char* dst, const char* src, const size_t max_len) { + const auto len = std::min(strlen(src), max_len - 1); + std::memcpy(dst, src, sizeof(char) * len); + dst[len] = '\0'; + } + + C10_HOST_DEVICE T& at(index_t idx) const { + if (idx < 0) { + printf( + "ERROR: idx < 0, tensor %s in %s, idx %lld\n", + ptr_name_, + func_name_, + static_cast(idx)); + CUDA_KERNEL_ASSERT(idx >= 0) + } else if (idx >= numel_) { + printf( + "ERROR: idx >= numel, tensor %s in %s, idx %lld, numel %lld\n", + ptr_name_, + func_name_, + static_cast(idx), + static_cast(numel_)); + CUDA_KERNEL_ASSERT(idx < numel_) + } + return data_[idx]; + } + + C10_HOST_DEVICE index_t stride(index_t i) const { + return strides_[i]; + } + C10_HOST_DEVICE index_t size(index_t i) const { + return sizes_[i]; + } + C10_HOST_DEVICE PtrType data() { + return data_; + } + C10_HOST_DEVICE const PtrType data() const { + return data_; + } + + protected: + PtrType data_; + index_t sizes_[N]; + index_t strides_[N]; + index_t numel_; + char ptr_name_[PTR_NAME_MAX_LEN]; + char func_name_[FUNC_NAME_MAX_LEN]; + C10_HOST void bounds_check_(index_t i) const { + TORCH_CHECK_INDEX( + 0 <= i && i < index_t{N}, + "Index ", + i, + " is not within bounds of a tensor of dimension ", + N); + } +}; + +template < + typename T, + size_t N, + template class PtrTraits = DefaultPtrTraits, + typename index_t = int64_t> +class GenericPackedTensorAccessor + : public GenericPackedTensorAccessorBase { + public: + typedef typename PtrTraits::PtrType PtrType; + + C10_HOST GenericPackedTensorAccessor( + PtrType data, + const index_t* const sizes, + const index_t* const strides, + const char* const ptr_name, + const char* const func_name) + : GenericPackedTensorAccessorBase( + data, + sizes, + strides, + ptr_name, + func_name) {} + + // if index_t is not int64_t, we want to have an int64_t constructor + template < + typename source_index_t, + class = typename std::enable_if< + std::is_same::value>::type> + C10_HOST GenericPackedTensorAccessor( + PtrType data, + const source_index_t* const sizes, + const source_index_t* const strides, + const char* 
const ptr_name, + const char* const func_name) + : GenericPackedTensorAccessorBase( + data, + sizes, + strides, + ptr_name, + func_name) {} + + C10_DEVICE TensorAccessor operator[]( + index_t i) { + index_t* new_sizes = this->sizes_ + 1; + index_t* new_strides = this->strides_ + 1; + return TensorAccessor( + this->data_ + this->strides_[0] * i, + new_sizes, + new_strides, + this->ptr_name_, + this->func_name_); + } + + C10_DEVICE const TensorAccessor operator[]( + index_t i) const { + const index_t* const new_sizes = this->sizes_ + 1; + const index_t* const new_strides = this->strides_ + 1; + return TensorAccessor( + this->data_ + this->strides_[0] * i, + new_sizes, + new_strides, + this->ptr_name_, + this->func_name_); + } + + /// Returns a PackedTensorAccessor of the same dimension after transposing the + /// two dimensions given. Does not actually move elements; transposition is + /// made by permuting the size/stride arrays. If the dimensions are not valid, + /// asserts. + C10_HOST GenericPackedTensorAccessor transpose( + index_t dim1, + index_t dim2) const { + this->bounds_check_(dim1); + this->bounds_check_(dim2); + GenericPackedTensorAccessor result( + this->data_, this->sizes_, this->strides_); + std::swap(result.strides_[dim1], result.strides_[dim2]); + std::swap(result.sizes_[dim1], result.sizes_[dim2]); + return result; + } +}; + +template class PtrTraits, typename index_t> +class GenericPackedTensorAccessor + : public GenericPackedTensorAccessorBase { + public: + typedef typename PtrTraits::PtrType PtrType; + C10_HOST GenericPackedTensorAccessor( + PtrType data, + const index_t* const sizes, + const index_t* const strides, + const char* const ptr_name, + const char* const func_name) + : GenericPackedTensorAccessorBase( + data, + sizes, + strides, + ptr_name, + func_name) {} + + // if index_t is not int64_t, we want to have an int64_t constructor + template < + typename source_index_t, + class = typename std::enable_if< + std::is_same::value>::type> + C10_HOST GenericPackedTensorAccessor( + PtrType data, + const source_index_t* const sizes, + const source_index_t* const strides, + const char* const ptr_name, + const char* const func_name) + : GenericPackedTensorAccessorBase( + data, + sizes, + strides, + ptr_name, + func_name) {} + + C10_DEVICE T& operator[](index_t i) { + return this->at(this->strides_[0] * i); + } + C10_DEVICE const T& operator[](index_t i) const { + return this->at(this->strides_[0] * i); + } + + // Same as in the general N-dimensional case, but note that in the + // 1-dimensional case the returned PackedTensorAccessor will always be an + // identical copy of the original + C10_HOST GenericPackedTensorAccessor transpose( + index_t dim1, + index_t dim2) const { + this->bounds_check_(dim1); + this->bounds_check_(dim2); + return GenericPackedTensorAccessor( + this->data_, this->sizes_, this->strides_); + } +}; + +// Can't put this directly into the macro function args because of commas +#define AT_X GenericPackedTensorAccessor + +// Old name for `GenericPackedTensorAccessor` +template < + typename T, + size_t N, + template class PtrTraits = DefaultPtrTraits, + typename index_t = int64_t> +C10_DEFINE_DEPRECATED_USING(PackedTensorAccessor, AT_X) + +#undef AT_X + +template < + typename T, + size_t N, + template class PtrTraits = DefaultPtrTraits> +using PackedTensorAccessor32 = + GenericPackedTensorAccessor; + +template < + typename T, + size_t N, + template class PtrTraits = DefaultPtrTraits> +using PackedTensorAccessor64 = + GenericPackedTensorAccessor; + +} // 
namespace fbgemm_gpu + +#ifdef FBGEMM_GPU_MEMCHECK +namespace pta = fbgemm_gpu; +#else +namespace pta = at; +#endif + +#ifdef FBGEMM_GPU_MEMCHECK +template < + typename T, + size_t N, + template class PtrTraits = at::DefaultPtrTraits, + typename index_t = int64_t> +const fbgemm_gpu::GenericPackedTensorAccessor +make_generic_packed_tensor_accessor( + at::Tensor& tensor, + const char* const ptr_name, + const char* const func_name) { + static_assert( + N > 0, + "accessor is used for indexing tensor, for scalars use *data_ptr()"); + TORCH_CHECK( + tensor.dim() == N, + "TensorAccessor expected ", + N, + " dims but tensor has ", + tensor.dim()); + return fbgemm_gpu::GenericPackedTensorAccessor( + static_cast::PtrType>(tensor.data_ptr()), + tensor.sizes().data(), + tensor.strides().data(), + ptr_name, + func_name); +} +#endif + +template < + typename T, + size_t N, + template class PtrTraits = at::DefaultPtrTraits> +const pta::PackedTensorAccessor32 +make_packed_tensor_accessor32( +#ifdef FBGEMM_GPU_MEMCHECK + at::Tensor& tensor, + const char* const ptr_name, + const char* const func_name) { +#else + at::Tensor& tensor) { +#endif + TORCH_CHECK( + tensor.numel() <= + static_cast(std::numeric_limits::max()), + "numel needs to be smaller than int32_t max; otherwise, please use packed_accessor64"); +#ifdef FBGEMM_GPU_MEMCHECK + return make_generic_packed_tensor_accessor( + tensor, ptr_name, func_name); +#else + return tensor.packed_accessor32(); +#endif +} + +template < + typename T, + size_t N, + template class PtrTraits = at::DefaultPtrTraits> +const pta::PackedTensorAccessor64 +make_packed_tensor_accessor64( +#ifdef FBGEMM_GPU_MEMCHECK + at::Tensor& tensor, + const char* const ptr_name, + const char* const func_name) { + return make_generic_packed_tensor_accessor( + tensor, ptr_name, func_name); +#else + at::Tensor& tensor) { + return tensor.packed_accessor64(); +#endif +} + +#ifdef FBGEMM_GPU_MEMCHECK +#define MAKE_PACKED_TENSOR_ACCESSOR_BASE( \ + FUNC_NAME, TENSOR, T, N, PTR_TRAITS, INDEX_NBITS) \ + make_packed_tensor_accessor##INDEX_NBITS( \ + TENSOR, #TENSOR, FUNC_NAME) + +#define MAKE_PACKED_TENSOR_ACCESSOR_ACC_TYPE_BASE( \ + FUNC_NAME, TENSOR, T, N, PTR_TRAITS, INDEX_NBITS) \ + make_packed_tensor_accessor##INDEX_NBITS< \ + at::acc_type, \ + N, \ + PTR_TRAITS>(TENSOR, #TENSOR, FUNC_NAME) +#else +#define MAKE_PACKED_TENSOR_ACCESSOR_BASE( \ + FUNC_NAME, TENSOR, T, N, PTR_TRAITS, INDEX_NBITS) \ + make_packed_tensor_accessor##INDEX_NBITS(TENSOR) + +#define MAKE_PACKED_TENSOR_ACCESSOR_ACC_TYPE_BASE( \ + FUNC_NAME, TENSOR, T, N, PTR_TRAITS, INDEX_NBITS) \ + make_packed_tensor_accessor##INDEX_NBITS< \ + at::acc_type, \ + N, \ + PTR_TRAITS>(TENSOR) +#endif diff --git a/fbgemm_gpu/include/fbgemm_gpu/input_combine.h b/fbgemm_gpu/include/fbgemm_gpu/input_combine.h index 348e0bebfc..c329d6c9d9 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/input_combine.h +++ b/fbgemm_gpu/include/fbgemm_gpu/input_combine.h @@ -30,4 +30,19 @@ padding_fused_tbe_input_combine_cpu( const at::Tensor& include_last_offsets, int64_t batch_size); +std::tuple +tbe_input_combine_with_length_cuda( + const uint64_t* const indices_addrs, + const uint64_t* const lengths_addrs, + const uint64_t* const per_sample_weights_addrs, + const uint32_t* const indices_is_long, + const uint32_t* const lengths_is_long, + const uint64_t* const indices_offsets, + const uint64_t* const lengths_offsets, + const uint64_t num_lists, + const uint64_t total_indices, + const uint64_t total_lengths, + const uint64_t max_list_size, + const c10::DeviceIndex& 
device); + } // namespace fbgemm_gpu diff --git a/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh b/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh index 52854a4f2e..3532928963 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh +++ b/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh @@ -155,6 +155,12 @@ at::Tensor lxu_cache_lookup_cuda( bool gather_cache_stats, c10::optional uvm_cache_stats); +at::Tensor emulate_cache_miss( + at::Tensor lxu_cache_locations, + const int64_t enforced_misses_per_256, + const bool gather_cache_stats, + at::Tensor uvm_cache_stats); + ///@ingroup table-batched-embed-cuda /// Lookup the LRU/LFU cache: find the cache weights location for all indices. /// Look up the slots in the cache corresponding to `linear_cache_indices`, with diff --git a/fbgemm_gpu/setup.py b/fbgemm_gpu/setup.py index 6b8ebbb570..2b34cb240a 100644 --- a/fbgemm_gpu/setup.py +++ b/fbgemm_gpu/setup.py @@ -7,6 +7,7 @@ import argparse import os import random +import re import subprocess import sys @@ -38,8 +39,9 @@ def generate_package_version(package_name: str): print( f"[SETUP.PY] TAG: {gitversion.get_tag()}, BRANCH: {gitversion.get_branch()}, SHA: {gitversion.get_sha()}" ) - # Remove the local version identifier, if any (0.4.0rc0.post0+git.6a63116c.dirty => 0.4.0rc0.post0) - version = gitversion.version_from_git().split("+")[0] + # Remove the local version identifier, if any (e.g. 0.4.0rc0.post0+git.6a63116c.dirty => 0.4.0rc0.post0) + # Then remove post0 (keep postN for N > 0) (e.g. 0.4.0rc0.post0 => 0.4.0rc0) + version = re.sub(".post0$", "", gitversion.version_from_git().split("+")[0]) print(f"[SETUP.PY] Setting the package version: {version}") return version diff --git a/fbgemm_gpu/src/cumem_utils.cu b/fbgemm_gpu/src/cumem_utils.cu index 7a060681f0..7b49040a83 100644 --- a/fbgemm_gpu/src/cumem_utils.cu +++ b/fbgemm_gpu/src/cumem_utils.cu @@ -41,7 +41,8 @@ struct CUDAHostMappedContext { ~CUDAHostMappedContext() { at::cuda::OptionalCUDAGuard device_guard; device_guard.set_index(cuda_device_); - AT_CUDA_CHECK(cudaFreeHost(ptr_)); + AT_CUDA_CHECK(cudaHostUnregister(ptr_)); + free(ptr_); } static void release(void* ptr) { @@ -206,9 +207,28 @@ Tensor new_host_mapped_tensor( auto strides = defaultStrides(sizes); size_t size_bytes = at::detail::computeStorageNbytes(sizes, strides, self.dtype().itemsize()); - void* ptr; - AT_CUDA_CHECK(cudaHostAlloc( - &ptr, size_bytes, cudaHostAllocWriteCombined | cudaHostAllocMapped)); + + // When using cudaHostAlloc for large allocations, we found that it can + // potentially take a global lock and lock out CUDA APIs from other processes. + // The main cost in cudaHostAlloc is faulting/mapping the pages. So, instead + // of using this cuda API, we can do regular malloc, pre-fault the pages, and + // then do cudaHostRegister with GPU mapping flags to lock the pages, so we + // can minimize the cost while holding this global lock. 
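+ // Note on the alignment arithmetic below: pageSize is 1 << 21 (2 MiB), and
+ // (ptr + pageSize - 1) & ~(pageSize - 1) rounds the start address up to the
+ // next 2 MiB boundary. For example (address assumed for illustration),
+ // ptr = 0x7f0000000100 aligns up to 0x7f0000200000; from there, one byte
+ // per page is written up to ptr + size_bytes so the kernel maps the pages
+ // before cudaHostRegister pins them.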
+ void* const ptr = malloc(size_bytes); + + // advise the kernel to allocate large 2M pages + madvise(ptr, size_bytes, MADV_HUGEPAGE); + + // pre-fault/map the pages by setting the first byte of the page + size_t pageSize = (1 << 21); + uintptr_t alignedPtr = (((uintptr_t)ptr + pageSize - 1) & ~(pageSize - 1)); + for (uintptr_t p = alignedPtr; p < ((uintptr_t)ptr + size_bytes); + p += pageSize) { + memset((void*)p, 0, 1); + } + + AT_CUDA_CHECK(cudaHostRegister( + ptr, size_bytes, cudaHostRegisterMapped | cudaHostRegisterPortable)); void* dev_ptr; AT_CUDA_CHECK(cudaHostGetDevicePointer(&dev_ptr, ptr, 0)); diff --git a/fbgemm_gpu/src/embedding_inplace_update.cu b/fbgemm_gpu/src/embedding_inplace_update.cu index 1d0e394919..f301576a49 100644 --- a/fbgemm_gpu/src/embedding_inplace_update.cu +++ b/fbgemm_gpu/src/embedding_inplace_update.cu @@ -186,4 +186,98 @@ void embedding_inplace_update_cuda( }); } +template +__global__ +__launch_bounds__(kMaxThreads) void pruned_array_lookup_from_row_idx_kernel( + const at::PackedTensorAccessor32 + update_row_indices, + const at::PackedTensorAccessor32 + update_table_indices, + const at::PackedTensorAccessor32 + index_remappings, + const at::PackedTensorAccessor32 + index_remappings_offsets, + at::PackedTensorAccessor32 + dense_indices) { + const int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= update_row_indices.size(0)) { + return; + } + const auto row_idx = update_row_indices[idx]; + if (idx >= update_table_indices.size(0)) { + return; + } + const int table_idx = update_table_indices[idx]; + + const int64_t index_remappings_start = index_remappings_offsets[table_idx]; + const int64_t index_remappings_end = index_remappings_offsets[table_idx + 1]; + const int64_t capacity = index_remappings_end - index_remappings_start; + + if (capacity > 0) { + dense_indices[idx] = index_remappings[index_remappings_start + row_idx]; + } else { + dense_indices[idx] = row_idx; + } +} + +Tensor pruned_array_lookup_from_row_idx_cuda( + const Tensor& update_row_indices, + const Tensor& update_table_indices, + const Tensor& index_remappings, + const Tensor& index_remappings_offsets) { + TENSOR_ON_CUDA_GPU(update_row_indices); + TENSOR_ON_CUDA_GPU(update_table_indices); + TENSOR_ON_CUDA_GPU(index_remappings); + TENSOR_ON_CUDA_GPU(index_remappings_offsets); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(update_table_indices.get_device()); + auto dense_indices = at::empty_like(update_row_indices); + const int32_t T = index_remappings_offsets.size(0) - 1; + + const auto num_indices = update_row_indices.numel(); + if (num_indices == 0) { + return dense_indices; + } + + TORCH_CHECK(index_remappings.size(0) < std::numeric_limits::max()); + TORCH_CHECK( + update_row_indices.dim() == 1, "Tensor dim: ", update_row_indices.dim()); + TORCH_CHECK( + update_table_indices.dim() == 1, + "Tensor dim: ", + update_table_indices.dim()); + TORCH_CHECK( + index_remappings.dim() == 1, "Tensor dim: ", index_remappings.dim()); + TORCH_CHECK( + index_remappings_offsets.dim() == 1, + "Tensor dim: ", + index_remappings_offsets.dim()); + TORCH_CHECK(dense_indices.dim() == 1, "Tensor dim: ", dense_indices.dim()); + constexpr size_t kForwardMaxThreads = 256; + + AT_DISPATCH_INDEX_TYPES( + update_row_indices.scalar_type(), + "pruned_array_lookup_from_row_idx_kernel", + [&] { + pruned_array_lookup_from_row_idx_kernel<<< + nbit::div_round_up(num_indices, kForwardMaxThreads), + kForwardMaxThreads, + 0, + at::cuda::getCurrentCUDAStream()>>>( + update_row_indices + 
.packed_accessor32(), + update_table_indices + .packed_accessor32(), + index_remappings + .packed_accessor32(), + index_remappings_offsets + .packed_accessor32(), + dense_indices + .packed_accessor32()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + return dense_indices; +} + } // namespace fbgemm_gpu diff --git a/fbgemm_gpu/src/embedding_inplace_update_cpu.cpp b/fbgemm_gpu/src/embedding_inplace_update_cpu.cpp index bd1315e023..5f3a648872 100644 --- a/fbgemm_gpu/src/embedding_inplace_update_cpu.cpp +++ b/fbgemm_gpu/src/embedding_inplace_update_cpu.cpp @@ -116,6 +116,53 @@ void embedding_inplace_update_cpu( }); } +Tensor pruned_array_lookup_from_row_idx_cpu( + const Tensor& update_row_indices, + const Tensor& update_table_indices, + const Tensor& index_remappings, + const Tensor& index_remappings_offsets) { + TENSOR_ON_CPU(update_row_indices); + TENSOR_ON_CPU(update_table_indices); + TENSOR_ON_CPU(index_remappings); + TENSOR_ON_CPU(index_remappings_offsets); + + auto dense_indices = empty_like(update_row_indices); + const auto num_indices = update_row_indices.numel(); + + AT_DISPATCH_INDEX_TYPES( + update_row_indices.scalar_type(), + "pruned_array_lookup_from_row_idx_cpu_kernel", + [&] { + const auto update_row_indices_acc = + update_row_indices.accessor(); + auto dense_indices_acc = dense_indices.accessor(); + const auto update_table_indices_acc = + update_table_indices.accessor(); + + const auto index_remappings_acc = + index_remappings.accessor(); + const auto index_remappings_offsets_acc = + index_remappings_offsets.accessor(); + + for (int64_t idx = 0; idx < num_indices; idx++) { + const int table_idx = update_table_indices_acc[idx]; + const auto row_idx = update_row_indices_acc[idx]; + int64_t index_remappings_start = + index_remappings_offsets_acc[table_idx]; + int64_t index_remappings_end = + index_remappings_offsets_acc[table_idx + 1]; + int64_t capacity = index_remappings_end - index_remappings_start; + if (capacity > 0) { + dense_indices_acc[idx] = + index_remappings_acc[index_remappings_start + row_idx]; + } else { + dense_indices_acc[idx] = row_idx; + } + } + }); + return dense_indices; +} + } // namespace fbgemm_gpu TORCH_LIBRARY_FRAGMENT(fbgemm, m) { @@ -127,3 +174,14 @@ TORCH_LIBRARY_IMPL(fbgemm, CPU, m) { DISPATCH_TO_CPU( "emb_inplace_update", fbgemm_gpu::embedding_inplace_update_cpu); } + +TORCH_LIBRARY_FRAGMENT(fbgemm, m) { + m.def( + "pruned_array_lookup_from_row_idx(Tensor update_row_indices, Tensor update_table_indices, Tensor index_remappings, Tensor index_remappings_offsets) -> Tensor"); +} + +TORCH_LIBRARY_IMPL(fbgemm, CPU, m) { + DISPATCH_TO_CPU( + "pruned_array_lookup_from_row_idx", + fbgemm_gpu::pruned_array_lookup_from_row_idx_cpu); +} diff --git a/fbgemm_gpu/src/embedding_inplace_update_gpu.cpp b/fbgemm_gpu/src/embedding_inplace_update_gpu.cpp index 743a902b68..cfb48c2427 100644 --- a/fbgemm_gpu/src/embedding_inplace_update_gpu.cpp +++ b/fbgemm_gpu/src/embedding_inplace_update_gpu.cpp @@ -14,3 +14,9 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { DISPATCH_TO_CUDA( "emb_inplace_update", fbgemm_gpu::embedding_inplace_update_cuda); } + +TORCH_LIBRARY_FRAGMENT(fbgemm, m) { + DISPATCH_TO_CUDA( + "pruned_array_lookup_from_row_idx", + fbgemm_gpu::pruned_array_lookup_from_row_idx_cuda); +} diff --git a/fbgemm_gpu/src/input_combine.cu b/fbgemm_gpu/src/input_combine.cu new file mode 100644 index 0000000000..040ca14bbf --- /dev/null +++ b/fbgemm_gpu/src/input_combine.cu @@ -0,0 +1,160 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include "fbgemm_gpu/fbgemm_cuda_utils.cuh" +#include "fbgemm_gpu/input_combine.h" + +using Tensor = at::Tensor; + +namespace fbgemm_gpu { + +template +DEVICE_INLINE void vec_copy_with_implicit_type_cast( + dst_t* const __restrict__ dst, + const uint64_t src_addr, + const uint64_t src_offset, + const uint64_t dst_offset, + const uint64_t src_bound) { + // TODO: Use vector load/store if address aligns with the vector type + const src_t* const src = reinterpret_cast(src_addr); +#pragma unroll + for (uint64_t i = 0; i < VEC_WIDTH && src_offset + i < src_bound; i++) { + dst[dst_offset + i] = src[src_offset + i]; + } +} + +template +__global__ +__launch_bounds__(kMaxThreads) void tbe_input_combine_with_length_kernel( + int32_t* const __restrict__ combined_indices, + int32_t* const __restrict__ combined_lengths, + float* const __restrict__ combined_weights, + const uint64_t* const __restrict__ indices_addrs, + const uint64_t* const __restrict__ lengths_addrs, + const uint64_t* const __restrict__ per_sample_weights_addrs, + const uint32_t* const __restrict__ indices_is_long, + const uint32_t* const __restrict__ lengths_is_long, + const uint64_t* const __restrict__ indices_offsets, + const uint64_t* const __restrict__ lengths_offsets, + const uint64_t num_lists, + const FixedDivisor fd_num_warps_per_list) { + const auto global_warp_id = blockIdx.x * blockDim.y + threadIdx.y; + uint32_t list_id; + uint32_t warp_id; + fd_num_warps_per_list.DivMod( + global_warp_id, + reinterpret_cast(&list_id), + reinterpret_cast(&warp_id)); + + if (list_id >= num_lists) { + return; + } + + // IS_LONG_NUM_BITS is power of 2 (default = 32); div and mod should be cheap + const uint32_t is_long_idx = list_id / IS_LONG_NUM_BITS; + const uint32_t is_long_mask = 1u << (list_id % IS_LONG_NUM_BITS); + const uint64_t src_idx = (warp_id * kWarpSize + threadIdx.x) * VEC_WIDTH; + const auto indices_start = indices_offsets[list_id]; + const auto indices_end = indices_offsets[list_id + 1]; + const auto lengths_start = lengths_offsets[list_id]; + const auto lengths_end = lengths_offsets[list_id + 1]; + + // Invoke a function based on the indices type + ((indices_is_long[is_long_idx] & is_long_mask) + ? vec_copy_with_implicit_type_cast + : vec_copy_with_implicit_type_cast< + int32_t, + int32_t, + VEC_WIDTH>)(combined_indices, indices_addrs[list_id], src_idx, indices_start + src_idx, indices_end - indices_start); + + // Invoke a function based on the lengths type + ((lengths_is_long[is_long_idx] & is_long_mask) + ? 
vec_copy_with_implicit_type_cast + : vec_copy_with_implicit_type_cast< + int32_t, + int32_t, + VEC_WIDTH>)(combined_lengths, lengths_addrs[list_id], src_idx, lengths_start + src_idx, lengths_end - lengths_start); + + if (per_sample_weights_addrs) { + vec_copy_with_implicit_type_cast( + combined_weights, + per_sample_weights_addrs[list_id], + src_idx, + indices_start + src_idx, + indices_end - indices_start); + } +} + +std::tuple tbe_input_combine_with_length_cuda( + const uint64_t* const indices_addrs, + const uint64_t* const lengths_addrs, + const uint64_t* const per_sample_weights_addrs, + const uint32_t* const indices_is_long, + const uint32_t* const lengths_is_long, + const uint64_t* const indices_offsets, + const uint64_t* const lengths_offsets, + const uint64_t num_lists, + const uint64_t total_indices, + const uint64_t total_lengths, + const uint64_t max_list_size, + const c10::DeviceIndex& device) { + constexpr uint32_t IS_LONG_NUM_BITS = 32; + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(device); + + // combined_indices and combined_legnths are int tensors + const auto int_options = at::TensorOptions().dtype(at::kInt).device( + at::kCUDA, at::cuda::current_device()); + Tensor combined_indices = + at::empty({static_cast(total_indices)}, int_options); + Tensor combined_lengths = + at::empty({static_cast(total_lengths)}, int_options); + // combined_weights is a float tensor + Tensor combined_weights = at::empty( + {per_sample_weights_addrs ? static_cast(total_indices) + : static_cast(0)}, + at::TensorOptions() + .dtype(at::kFloat) + .device(at::kCUDA, at::cuda::current_device())); + + // Each thread loads 4 elements (rule of thumb; should work well with 32-bit + // inputs) + constexpr uint32_t VEC_WIDTH = 4; + constexpr uint32_t NUM_WARPS_PER_BLOCK = kMaxThreads / kWarpSize; + const auto num_warps_per_list = + div_round_up(max_list_size, kWarpSize * VEC_WIDTH); + const auto num_blocks = + div_round_up(num_warps_per_list * num_lists, NUM_WARPS_PER_BLOCK); + + tbe_input_combine_with_length_kernel + <<>>( + combined_indices.data_ptr(), + combined_lengths.data_ptr(), + per_sample_weights_addrs ? combined_weights.data_ptr() + : nullptr, + indices_addrs, + lengths_addrs, + per_sample_weights_addrs, + indices_is_long, + lengths_is_long, + indices_offsets, + lengths_offsets, + num_lists, + FixedDivisor(num_warps_per_list)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + return { + std::move(combined_indices), + std::move(combined_lengths), + std::move(combined_weights)}; +} + +} // namespace fbgemm_gpu diff --git a/fbgemm_gpu/src/input_combine_gpu.cpp b/fbgemm_gpu/src/input_combine_gpu.cpp new file mode 100644 index 0000000000..482cabd963 --- /dev/null +++ b/fbgemm_gpu/src/input_combine_gpu.cpp @@ -0,0 +1,226 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include "fbgemm_gpu/input_combine.h" +#include "fbgemm_gpu/sparse_ops_utils.h" + +#include +#include +#include + +using Tensor = at::Tensor; + +namespace fbgemm_gpu { + +constexpr uint32_t IS_LONG_NUM_BITS = 32; +constexpr uint32_t NUM_ARGS = 7; +enum args_pos { + P_indices_prts = 0, + P_lengths_addrs = 1, + P_indices_offsets = 2, + P_lengths_offsets = 3, + P_per_sample_weight = 4, + P_indices_is_long = 5, + P_lengths_is_long = 6 +}; + +template +uint64_t compute_num_uint64s(const uint64_t num_elements) { + const uint64_t ratio = sizeof(uint64_t) / sizeof(T); + return (num_elements + ratio - 1) / ratio; +} + +void offset_tbe_input_combine_with_length_args( + uint64_t** indices_addrs, + uint64_t** lengths_addrs, + uint64_t** indices_offsets, + uint64_t** lengths_offsets, + uint64_t** per_sample_weights_addrs, + uint32_t** indices_is_long, + uint32_t** lengths_is_long, + uint64_t* base_addr, + const uint64_t* const ptr_offsets, + const bool need_weights) { + *indices_addrs = base_addr + ptr_offsets[P_indices_prts]; + *lengths_addrs = base_addr + ptr_offsets[P_lengths_addrs]; + *indices_offsets = base_addr + ptr_offsets[P_indices_offsets]; + *lengths_offsets = base_addr + ptr_offsets[P_lengths_offsets]; + *per_sample_weights_addrs = + need_weights ? (base_addr + ptr_offsets[P_per_sample_weight]) : nullptr; + *indices_is_long = + reinterpret_cast(base_addr + ptr_offsets[P_indices_is_long]); + *lengths_is_long = + reinterpret_cast(base_addr + ptr_offsets[P_lengths_is_long]); +} + +std::tuple tbe_input_combine_with_length_gpu( + const std::vector& indices_list, + const std::vector& lengths_list, + const std::vector& per_sample_weights) { + const auto num_lists = indices_list.size(); + TORCH_CHECK(num_lists > 0); + TORCH_CHECK(lengths_list.size() == num_lists); + TORCH_CHECK(per_sample_weights.size() == num_lists); + const bool need_weights = std::any_of( + per_sample_weights.begin(), per_sample_weights.end(), [](const auto& x) { + return x.numel() > 0; + }); + + // Store is_longs in 32-bit variables. i-th bit (LSB) indicates if + // list i-th is long. + const uint64_t num_is_longs = + (num_lists + IS_LONG_NUM_BITS - 1) / IS_LONG_NUM_BITS; + const uint64_t num_is_longs_64 = compute_num_uint64s(num_is_longs); + // args_tensor stores kernel arguments: + // - indices_prts (num_lists uint64_t elements) + // - lengths_addrs (num_lists uint64_t elements) + // - indices_offsets (num_lists + 1 uint64_t elements) + // - lengths_offsets (num_lists + 1 uint64_t elements) + // - per_sample_weight (num_lists uint64_t elements; optional) + // - indices_is_long (num_is_longs uint32_t elements) + // - lengths_is_long (num_is_longs uint32_t elements) + uint64_t args_offsets[NUM_ARGS + 1]; + // Initialize offsets with lengths first + args_offsets[P_indices_prts] = num_lists; + args_offsets[P_lengths_addrs] = num_lists; + args_offsets[P_indices_offsets] = num_lists + 1; + args_offsets[P_lengths_offsets] = num_lists + 1; + args_offsets[P_per_sample_weight] = need_weights ? 
num_lists : 0; + args_offsets[P_indices_is_long] = num_is_longs_64; + args_offsets[P_lengths_is_long] = num_is_longs_64; + + // Compute offsets + uint64_t offset = 0; + auto next = args_offsets[0]; + for (uint32_t i = 0; i < NUM_ARGS; i++) { + args_offsets[i] = offset; + offset += next; + next = args_offsets[i + 1]; + } + args_offsets[NUM_ARGS] = offset; // total number of uint64_t elements required + + Tensor args_tensor = at::empty( + {static_cast(args_offsets[NUM_ARGS] * sizeof(uint64_t))}, + at::TensorOptions().dtype(at::kByte).pinned_memory(true)); + + uint64_t* indices_addrs = nullptr; + uint64_t* lengths_addrs = nullptr; + uint64_t* indices_offsets = nullptr; + uint64_t* lengths_offsets = nullptr; + uint64_t* per_sample_weights_addrs = nullptr; + uint32_t* indices_is_long = nullptr; + uint32_t* lengths_is_long = nullptr; + + // Offset host pointers + offset_tbe_input_combine_with_length_args( + &indices_addrs, + &lengths_addrs, + &indices_offsets, + &lengths_offsets, + &per_sample_weights_addrs, + &indices_is_long, + &lengths_is_long, + reinterpret_cast(args_tensor.data_ptr()), + args_offsets, + need_weights); + + const auto& indices_0 = indices_list[0]; + uint64_t total_indices = 0; + uint64_t total_lengths = 0; + uint64_t max_list_size = 0; + for (uint64_t i = 0; i < num_lists; i++) { + const uint64_t is_long_idx = i / IS_LONG_NUM_BITS; + auto& indices_is_long_ = indices_is_long[is_long_idx]; + auto& lengths_is_long_ = lengths_is_long[is_long_idx]; + if (i % IS_LONG_NUM_BITS == 0) { + indices_is_long_ = 0; + lengths_is_long_ = 0; + } + const auto& indices = indices_list[i]; + const auto& lengths = lengths_list[i]; + TENSOR_CONTIGUOUS_AND_ON_CUDA_GPU(indices); + TENSOR_CONTIGUOUS_AND_ON_CUDA_GPU(lengths); + TENSORS_ON_SAME_DEVICE(indices, indices_0); + TENSORS_ON_SAME_DEVICE(lengths, indices_0); + TORCH_CHECK(indices.dtype() == c10::kInt || indices.dtype() == c10::kLong); + TORCH_CHECK(lengths.dtype() == c10::kInt || lengths.dtype() == c10::kLong); + TENSOR_NDIM_EQUALS(indices, 1); + TENSOR_NDIM_EQUALS(lengths, 1); + + const auto indices_numel = indices.numel(); + const auto lengths_numel = lengths.numel(); + indices_offsets[i] = total_indices; + lengths_offsets[i] = total_lengths; + total_indices += indices_numel; + total_lengths += lengths_numel; + max_list_size = + std::max(max_list_size, static_cast(indices_numel)); + max_list_size = + std::max(max_list_size, static_cast(lengths_numel)); + + // Store pointers in args_tensor + indices_addrs[i] = reinterpret_cast(indices.data_ptr()); + lengths_addrs[i] = reinterpret_cast(lengths.data_ptr()); + indices_is_long_ |= static_cast(indices.dtype() == c10::kLong) + << (i % IS_LONG_NUM_BITS); + lengths_is_long_ |= static_cast(lengths.dtype() == c10::kLong) + << (i % IS_LONG_NUM_BITS); + + const auto& weights = per_sample_weights[i]; + if (weights.numel() > 0) { + TENSOR_CONTIGUOUS_AND_ON_CUDA_GPU(weights); + TENSORS_ON_SAME_DEVICE(weights, indices_0); + TENSOR_TYPE_MUST_BE(weights, c10::kFloat); + TENSOR_NDIM_EQUALS(weights, 1); + TENSORS_HAVE_SAME_NUMEL(weights, indices); + + per_sample_weights_addrs[i] = + reinterpret_cast(weights.data_ptr()); + } + } + indices_offsets[num_lists] = total_indices; + lengths_offsets[num_lists] = total_lengths; + + const auto& device = indices_0.device(); + // Transfer args_tensor from host to device + args_tensor = args_tensor.to(device, /*non_blocking=*/true); + + // Offset device pointers + offset_tbe_input_combine_with_length_args( + &indices_addrs, + &lengths_addrs, + &indices_offsets, + 
&lengths_offsets, + &per_sample_weights_addrs, + &indices_is_long, + &lengths_is_long, + reinterpret_cast(args_tensor.data_ptr()), + args_offsets, + need_weights); + + return tbe_input_combine_with_length_cuda( + indices_addrs, + lengths_addrs, + per_sample_weights_addrs, + indices_is_long, + lengths_is_long, + indices_offsets, + lengths_offsets, + num_lists, + total_indices, + total_lengths, + max_list_size, + device.index()); +} + +TORCH_LIBRARY_IMPL(fbgemm, CUDA, m) { + DISPATCH_TO_CUDA( + "tbe_input_combine_with_length", + fbgemm_gpu::tbe_input_combine_with_length_gpu); +}; + +} // namespace fbgemm_gpu diff --git a/fbgemm_gpu/src/jagged_tensor_ops.cu b/fbgemm_gpu/src/jagged_tensor_ops.cu index 4e93d08a65..62cef01113 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops.cu +++ b/fbgemm_gpu/src/jagged_tensor_ops.cu @@ -12,6 +12,7 @@ #include #include #include +#include // clang-format off #include "fbgemm_gpu/cub_namespace_prefix.cuh" @@ -1824,39 +1825,101 @@ std::tuple batched_dense_vec_jagged_2d_mul_backward( return {v_grad, a_values_grad}; } -template +template __global__ __launch_bounds__(kMaxThreads) void jagged_softmax_kernel( const at::PackedTensorAccessor32 values, const at::PackedTensorAccessor32 offsets, at::PackedTensorAccessor32 output, const int max_L) { - const int B = offsets.size(0) - 1; - const int D = output.size(1); + const auto B = offsets.size(0) - 1; + const auto D = output.size(1); - const int b_begin = blockIdx.x * blockDim.y + threadIdx.y; - const int b_step = gridDim.x * blockDim.y; - for (int b = b_begin; b < B; b += b_step) { - const int row_start = offsets[b]; - const int row_end = offsets[b + 1]; - const int length = min(row_end - row_start, max_L); - if (length != 0) { - // TODO: use shared memory and better reduction - for (int d = threadIdx.x; d < D; d += blockDim.x) { - scalar_t max_value = values[row_start][d]; - for (int l = 1; l < length; ++l) { - max_value = max(max_value, values[row_start + l][d]); + // Specialize BlockReduce type for our thread block + typedef cub::BlockReduce BlockReduceT; + + // Allocate shared memory for BlockReduce + __shared__ typename BlockReduceT::TempStorage temp_storage; + + __shared__ scalar_t max_value; + __shared__ scalar_t exp_sum; + + const auto tid = threadIdx.x; + for (uint32_t b = blockIdx.y; b < B; b += gridDim.y) { + const index_t row_start = offsets[b]; + const index_t row_end = offsets[b + 1]; + const auto length = min(row_end - row_start, (index_t)max_L); + + if (length > 0) { + const auto num_l_blocks = + (length + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + + for (uint32_t d = blockIdx.x; d < D; d += gridDim.x) { + if (tid == 0) { + max_value = values[row_start][d]; + exp_sum = 0; } - at::acc_type acc = - exp(values[row_start][d] - max_value); - for (int l = 1; l < length; ++l) { - acc += exp(values[row_start + l][d] - max_value); + // Loop through all blocks to calculate the max value + // Each block has its own max value block_max_value, and + // max_value is the max value across all blocks + for (auto bk_l = 0; bk_l < num_l_blocks; bk_l++) { + const auto l = bk_l * blockDim.x + tid; + scalar_t thread_val = values[row_start][d]; + if (l < length) { + thread_val = values[row_start + l][d]; + } + + // Collectively compute the block-wide max reduction + scalar_t block_max_value = + BlockReduceT(temp_storage).Reduce(thread_val, cub::Max()); + __syncthreads(); + + if (tid == 0) { + max_value = max(max_value, block_max_value); + } + } + + // The max_value was updated by thread 0 in the last loop, sync here to + // 
make sure the next loop uses the updated max_value + __syncthreads(); + + // Loop through all blocks to calculate the sum of exp + // Each block has its own sum block_exp_acc, and + // exp_sum is the sum across all blocks + for (auto bk_l = 0; bk_l < num_l_blocks; bk_l++) { + auto l = bk_l * blockDim.x + tid; + + scalar_t thread_exp = 0; + if (l < length) { + thread_exp = std::exp(values[row_start + l][d] - max_value); + } + + // Collectively compute the block-wide sum reduction + scalar_t block_exp_sum = BlockReduceT(temp_storage).Sum(thread_exp); + __syncthreads(); + + if (tid == 0) { + exp_sum += block_exp_sum; + } } - for (int l = 0; l < length; ++l) { - output[row_start + l][d] = - exp(values[row_start + l][d] - max_value) / acc; + // The exp_sum was updated by thread 0 in the last loop, sync here to + // make sure the next loop uses the updated exp_sum + __syncthreads(); + + for (auto bk_l = 0; bk_l < num_l_blocks; bk_l++) { + auto l = bk_l * blockDim.x + tid; + scalar_t thread_exp = 0; + if (l < length) { + thread_exp = std::exp(values[row_start + l][d] - max_value); + output[row_start + l][d] = thread_exp / exp_sum; + } } + + // The max_value and exp_sum will be reinitialized by thread 0 in the + // next d iteration, sync here to make sure the last loop still uses the + // reduced values before reinitialization + __syncthreads(); } } } @@ -1872,14 +1935,13 @@ Tensor jagged_softmax_forward( at::cuda::OptionalCUDAGuard device_guard; device_guard.set_index(values.get_device()); - const int B = offsets.numel() - 1; - const int D = values.size(1); + const auto B = offsets.numel() - 1; + const auto D = values.size(1); auto output = at::empty_like(values); if (B > 0 && D > 0) { - const int block_dim_x = - std::min(div_round_up(D, kWarpSize) * kWarpSize, kMaxThreads); - const int block_dim_y = kMaxThreads / block_dim_x; + constexpr int THREADS_PER_BLOCK = 128; + const dim3 grid(D, std::min((int32_t)B, (int32_t)kMaxBlockYDim), 1); AT_DISPATCH_INDEX_TYPES( offsets.scalar_type(), "jagged_softmax_kernel_1", [&] { @@ -1889,9 +1951,9 @@ Tensor jagged_softmax_forward( values.scalar_type(), "jagged_softmax_kernel_2", [&] { - jagged_softmax_kernel - << + <<>>( values.packed_accessor32(), @@ -1906,35 +1968,76 @@ Tensor jagged_softmax_forward( return output; } -template +template __global__ __launch_bounds__(kMaxThreads) void jagged_softmax_backward_kernel( const at::PackedTensorAccessor32 grad_output, const at::PackedTensorAccessor32 output, const at::PackedTensorAccessor32 offsets, at::PackedTensorAccessor32 grad_input, const int max_L) { - const int B = offsets.size(0) - 1; - const int D = grad_output.size(1); + const auto B = offsets.size(0) - 1; + const auto D = grad_output.size(1); - const int b_begin = blockIdx.x * blockDim.y + threadIdx.y; - const int b_step = gridDim.x * blockDim.y; - for (int b = b_begin; b < B; b += b_step) { - const int row_start = offsets[b]; - const int row_end = offsets[b + 1]; - const int length = min(row_end - row_start, max_L); - if (length != 0) { - // TODO: use shared memory and better reduction - for (int d = threadIdx.x; d < D; d += blockDim.x) { - scalar_t sum_value = grad_output[row_start][d] * output[row_start][d]; - for (int l = 1; l < length; ++l) { - sum_value += grad_output[row_start + l][d] * output[row_start + l][d]; + // Specialize BlockReduce type for our thread block + typedef cub::BlockReduce BlockReduceT; + + // Allocate shared memory for BlockReduce + __shared__ typename BlockReduceT::TempStorage temp_storage; + + __shared__ scalar_t sum_value; + 
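+ // Math being computed (standard softmax backward, noted here for
+ // reference): with y = softmax(x) over the rows of a bag and upstream
+ // gradient dy, dx_l = (dy_l - sum_j dy_j * y_j) * y_l. sum_value holds the
+ // sum_j dy_j * y_j term for the current (bag b, column d), accumulated one
+ // THREADS_PER_BLOCK-sized chunk of the bag's rows at a time via BlockReduce.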
+ const auto tid = threadIdx.x; + for (uint32_t b = blockIdx.y; b < B; b += gridDim.y) { + const index_t row_start = offsets[b]; + const index_t row_end = offsets[b + 1]; + const auto length = min(row_end - row_start, (index_t)max_L); + + if (length > 0) { + const auto num_l_blocks = + (length + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + + for (uint32_t d = blockIdx.x; d < D; d += gridDim.x) { + if (tid == 0) { + sum_value = 0; + } + + // Loop through all blocks to calculate the sum value + // Each block has its own sum, and sum_value is the sum value across all + // blocks + for (auto bk_l = 0; bk_l < num_l_blocks; bk_l++) { + const auto l = bk_l * blockDim.x + tid; + scalar_t thread_val = 0; + if (l < length) { + thread_val = + grad_output[row_start + l][d] * output[row_start + l][d]; + } + + // Collectively compute the block-wide sum reduction + scalar_t block_sum_value = BlockReduceT(temp_storage).Sum(thread_val); + __syncthreads(); + + if (tid == 0) { + sum_value += block_sum_value; + } } - for (int l = 0; l < length; ++l) { - grad_input[row_start + l][d] = - (grad_output[row_start + l][d] - sum_value) * - output[row_start + l][d]; + // The sum_value was updated by thread 0 in the last loop, sync here to + // make sure the next loop uses the updated sum_value + __syncthreads(); + + for (auto bk_l = 0; bk_l < num_l_blocks; bk_l++) { + const auto l = bk_l * blockDim.x + tid; + if (l < length) { + grad_input[row_start + l][d] = + (grad_output[row_start + l][d] - sum_value) * + output[row_start + l][d]; + } } + + // The sum_value will be reinitialized by thread 0 in the + // next d iteration, sync here to make sure the last loop still uses the + // reduced value before reinitialization + __syncthreads(); } } } @@ -1952,14 +2055,13 @@ Tensor jagged_softmax_backward( at::cuda::OptionalCUDAGuard device_guard; device_guard.set_index(grad_output.get_device()); - const int B = offsets.numel() - 1; - const int D = grad_output.size(1); + const auto B = offsets.numel() - 1; + const auto D = grad_output.size(1); auto grad_input = at::empty_like(grad_output); if (B > 0 && D > 0) { - const int block_dim_x = - std::min(div_round_up(D, kWarpSize) * kWarpSize, kMaxThreads); - const int block_dim_y = kMaxThreads / block_dim_x; + constexpr int THREADS_PER_BLOCK = 128; + const dim3 grid(D, std::min((int32_t)B, (int32_t)kMaxBlockYDim), 1); AT_DISPATCH_INDEX_TYPES( offsets.scalar_type(), "jagged_softmax_backward_kernel_1", [&] { @@ -1969,9 +2071,12 @@ Tensor jagged_softmax_backward( grad_output.scalar_type(), "jagged_softmax_backward_kernel_2", [&] { - jagged_softmax_backward_kernel - << + <<>>( grad_output.packed_accessor32(), @@ -1986,7 +2091,7 @@ Tensor jagged_softmax_backward( return grad_input; } -template +template __global__ __launch_bounds__(kMaxThreads) void jagged_jagged_bmm_kernel( const at::PackedTensorAccessor32 x_values, const at::PackedTensorAccessor32 y_values, @@ -1997,30 +2102,53 @@ __global__ __launch_bounds__(kMaxThreads) void jagged_jagged_bmm_kernel( const int M = x_values.size(1); const int N = y_values.size(1); - const int b_m_begin = blockIdx.x * blockDim.y + threadIdx.y; - const int b_m_step = gridDim.x * blockDim.y; - for (int b_m = b_m_begin; b_m < B * M; b_m += b_m_step) { - const int b = b_m / M; - const int m = b_m % M; + const auto block_row = blockIdx.y; + const auto block_col = blockIdx.x; + const auto row = threadIdx.y; + const auto col = threadIdx.x; + __shared__ scalar_t Xs[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ scalar_t Ys[BLOCK_SIZE][BLOCK_SIZE]; + + for (uint32_t b = 
blockIdx.z; b < B; b += gridDim.z) { + const index_t row_start = offsets[b]; + const index_t row_end = offsets[b + 1]; + const auto length = min(row_end - row_start, (index_t)max_L); + auto num_l_blocks = (length + BLOCK_SIZE - 1) / BLOCK_SIZE; + + at::acc_type acc = 0; + + const auto row_offset = block_row * BLOCK_SIZE + row; + const auto col_offset = block_col * BLOCK_SIZE + col; + + // for loop block tile in length dimension + for (auto bk_l = 0; bk_l < num_l_blocks; bk_l++) { + Xs[row][col] = 0; + Ys[row][col] = 0; + const auto bk_offset = bk_l * BLOCK_SIZE; + + // load data from global memory to shared memory + const auto l_x = bk_offset + col; + if (row_offset < M && l_x < length) { + Xs[row][col] = x_values[row_start + l_x][row_offset]; + } - const int row_start = offsets[b]; - const int row_end = offsets[b + 1]; - const int length = min(row_end - row_start, max_L); - if (length == 0) { - for (int n = threadIdx.x; n < N; n += blockDim.x) { - output[b][m][n] = 0; + const auto l_y = bk_offset + row; + if (l_y < length && col_offset < N) { + Ys[row][col] = y_values[row_start + l_y][col_offset]; } - } else { - // TODO: use shared memory and better reduction - for (int n = threadIdx.x; n < N; n += blockDim.x) { - at::acc_type acc = - x_values[row_start][m] * y_values[row_start][n]; - for (int l = 1; l < length; ++l) { - acc += x_values[row_start + l][m] * y_values[row_start + l][n]; - } - output[b][m][n] = acc; + + __syncthreads(); + +#pragma unroll + for (auto e = 0; e < BLOCK_SIZE; e++) { + acc += Xs[row][e] * Ys[e][col]; } + __syncthreads(); } + + // write the result to the output + if ((row_offset < M) && (col_offset < N)) + output[b][row_offset][col_offset] = acc; } } @@ -2042,9 +2170,16 @@ Tensor jagged_jagged_bmm_forward( auto output = at::zeros({B, M, N}, x_values.options()); if (B > 0 && M > 0 && N > 0) { - const int block_dim_x = - std::min(div_round_up(N, kWarpSize) * kWarpSize, kMaxThreads); - const int block_dim_y = kMaxThreads / block_dim_x; + constexpr int BLOCK_SIZE = 16; + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const auto grid_dim_x = div_round_up(N, BLOCK_SIZE); + const auto grid_dim_y = div_round_up(M, BLOCK_SIZE); + TORCH_CHECK( + grid_dim_y <= kMaxBlockYDim, + "M cannot be larger than", + grid_dim_y * BLOCK_SIZE + 1 - BLOCK_SIZE); + const auto grid_dim_z = std::min(B, kMaxBlockZDim); + const dim3 grid(grid_dim_x, grid_dim_y, grid_dim_z); AT_DISPATCH_INDEX_TYPES( offsets.scalar_type(), "jagged_jagged_bmm_kernel_1", [&] { @@ -2054,11 +2189,8 @@ Tensor jagged_jagged_bmm_forward( x_values.scalar_type(), "jagged_jagged_bmm_kernel_2", [&] { - jagged_jagged_bmm_kernel - <<>>( + jagged_jagged_bmm_kernel + <<>>( x_values.packed_accessor32(), y_values.packed_accessor32(), offsets.packed_accessor32(), @@ -2071,7 +2203,17 @@ Tensor jagged_jagged_bmm_forward( return output; } -template +template < + const int BLOCK_TILE_M, // tile height of C that each thread block + // calculates + const int BLOCK_TILE_N, // tile width of C that each thread block + // calculates + const int BLOCK_TILE_K, // tile width of A that each thread block calculates + const int THREAD_TILE_M, // tile height of C that each thread + // calculates + const int THREAD_TILE_N, // tile width of C that each thread calcualtes + typename index_t, + typename scalar_t> __global__ __launch_bounds__(kMaxThreads) void jagged_dense_bmm_kernel( const at::PackedTensorAccessor32 x_values, const at::PackedTensorAccessor32 x_offsets, @@ -2082,25 +2224,116 @@ __global__ __launch_bounds__(kMaxThreads) void 
jagged_dense_bmm_kernel( const int K = x_values.size(1); const int N = y.size(2); - const int b_l_begin = blockIdx.x * blockDim.y + threadIdx.y; - const int b_l_step = gridDim.x * blockDim.y; - for (int b_l = b_l_begin; b_l < B * max_L; b_l += b_l_step) { - const int b = b_l / max_L; - const int l = b_l % max_L; - - const int row_start = x_offsets[b]; - const int row_end = x_offsets[b + 1]; - const int length = min(row_end - row_start, max_L); - if (length == 0 || l >= length) { - return; - } else { - // TODO: use shared memory and better reduction - for (int n = threadIdx.x; n < N; n += blockDim.x) { - at::acc_type acc = 0; - for (int k = 0; k < K; ++k) { - acc += x_values[row_start + l][k] * y[b][k][n]; + const auto block_row = blockIdx.y; + const auto block_col = blockIdx.x; + + const int THREADS_X_PER_BLOCK = BLOCK_TILE_N / THREAD_TILE_N; + const int THREADS_Y_PER_BLOCK = BLOCK_TILE_M / THREAD_TILE_M; + const int THREADS_PER_BLOCK = THREADS_X_PER_BLOCK * THREADS_Y_PER_BLOCK; + const auto thread_row = threadIdx.x / THREADS_X_PER_BLOCK; + const auto thread_col = threadIdx.x % THREADS_X_PER_BLOCK; + const auto NUM_K_BLOCKS = (K + BLOCK_TILE_K - 1) / BLOCK_TILE_K; + + __shared__ scalar_t As[BLOCK_TILE_M][BLOCK_TILE_K]; + __shared__ scalar_t Bs[BLOCK_TILE_K][BLOCK_TILE_N]; + + // Once we remove ROCm<=5.3 support, we should replace uint32_t with auto. + // See #1655 + for (uint32_t b = blockIdx.z; b < B; b += gridDim.z) { + const index_t row_start = x_offsets[b]; + const index_t row_end = x_offsets[b + 1]; + const auto length = min(row_end - row_start, (index_t)max_L); + + // the indices that this current will load into shared mem + const auto inner_row_a = threadIdx.x / BLOCK_TILE_K; + const auto inner_col_a = threadIdx.x % BLOCK_TILE_K; + // the number of rows of As that will be loaded per step by a thread block + const auto A_TILE_ROW_STRIDE = THREADS_PER_BLOCK / BLOCK_TILE_K; + + const auto inner_row_b = threadIdx.x / BLOCK_TILE_N; + const auto inner_col_b = threadIdx.x % BLOCK_TILE_N; + const auto B_TILE_ROW_STRIDE = THREADS_PER_BLOCK / BLOCK_TILE_N; + + // registers for C + scalar_t accum[THREAD_TILE_M][THREAD_TILE_N] = {0}; + + // registers for As and Bs + scalar_t fragment_a[THREAD_TILE_M] = {0}; + scalar_t fragment_b[THREAD_TILE_N] = {0}; + + // loop for block tiles in K dimension + for (auto block = 0; block < NUM_K_BLOCKS; block++) { +// load a block of x_values from global memory to shared memory +// apply tiling for threads in a block +#pragma unroll + for (auto offset = 0; offset < BLOCK_TILE_M; + offset += A_TILE_ROW_STRIDE) { + auto x_row_offset = block_row * BLOCK_TILE_M + inner_row_a + offset; + auto x_col_offset = block * BLOCK_TILE_K + inner_col_a; + if ((x_row_offset < length) && (x_col_offset < K)) { + As[inner_row_a + offset][inner_col_a] = + x_values[row_start + x_row_offset][x_col_offset]; + } else { + As[inner_row_a + offset][inner_col_a] = 0; + } + } + +// load a block of y from global memory to shared memory +// apply tiling for threads in a block +#pragma unroll + for (auto offset = 0; offset < BLOCK_TILE_K; + offset += B_TILE_ROW_STRIDE) { + auto y_row_offset = block * BLOCK_TILE_K + inner_row_b + offset; + auto y_col_offset = block_col * BLOCK_TILE_N + inner_col_b; + if ((y_row_offset < K) && (y_col_offset < N)) { + Bs[inner_row_b + offset][inner_col_b] = + y[b][y_row_offset][y_col_offset]; + } else { + Bs[inner_row_b + offset][inner_col_b] = 0; + } + } + + __syncthreads(); + +// calculate the results per thread +#pragma unroll + for (auto k = 0; k < 
BLOCK_TILE_K; k++) { + // load values from shared memory to registers for x_values + for (auto row = 0; row < THREAD_TILE_M; row++) { + fragment_a[row] = As[thread_row * THREAD_TILE_M + row][k]; + } + +// load values from shared memory to registers for y +#pragma unroll + for (auto col = 0; col < THREAD_TILE_N; col++) { + fragment_b[col] = Bs[k][thread_col * THREAD_TILE_N + col]; + } + +// each thread calcualtes THREAD_TILE_M * THREAD_TILE_N elements +#pragma unroll + for (auto row = 0; row < THREAD_TILE_M; row++) { +#pragma unroll + for (auto col = 0; col < THREAD_TILE_N; col++) { + accum[row][col] += fragment_a[row] * fragment_b[col]; + } + } + } + + __syncthreads(); + } + +// write the result to the output +#pragma unroll + for (auto row = 0; row < THREAD_TILE_M; row++) { +#pragma unroll + for (auto col = 0; col < THREAD_TILE_N; col++) { + auto out_row_offset = + block_row * BLOCK_TILE_M + thread_row * THREAD_TILE_M + row; + auto out_col_offset = + block_col * BLOCK_TILE_N + thread_col * THREAD_TILE_N + col; + if ((out_row_offset < length) && (out_col_offset < N)) { + output[row_start + out_row_offset][out_col_offset] = accum[row][col]; } - output[row_start + l][n] = acc; } } } @@ -2124,9 +2357,29 @@ Tensor jagged_dense_bmm_forward( const int total_L = x_values.size(0); auto output = at::zeros({total_L, N}, x_values.options()); if (B > 0 && M > 0 && N > 0) { - const int block_dim_x = - std::min(div_round_up(N, kWarpSize) * kWarpSize, kMaxThreads); - const int block_dim_y = kMaxThreads / block_dim_x; + // The shared memory size is (BLOCK_TILE_M + BLOCK_TILE_N) * BLOCK_TILE_K + // BLOCK_TILE_M needs to be multiple of THREAD_TILE_M, and + // BLOCK_TILE_N needs to be multiple of THREAD_TILE_N + // The setting of these parameters needs to balance the hardware's shared + // memory size limit and occupancy + // TODO: autotune these parameters based on max_L and input and output + // tensor sizes + constexpr int BLOCK_TILE_M = 64; + constexpr int BLOCK_TILE_N = 8; + constexpr int BLOCK_TILE_K = 8; + constexpr int THREAD_TILE_M = 4; + constexpr int THREAD_TILE_N = 4; + + const dim3 block( + (BLOCK_TILE_M * BLOCK_TILE_N) / (THREAD_TILE_M * THREAD_TILE_N)); + const auto grid_dim_x = div_round_up(N, BLOCK_TILE_N); + const auto grid_dim_y = div_round_up(max_L, BLOCK_TILE_M); + TORCH_CHECK( + grid_dim_y <= kMaxBlockYDim, + "max_L cannot be larger than", + grid_dim_y * BLOCK_TILE_M + 1 - BLOCK_TILE_M); + const auto grid_dim_z = std::min(B, kMaxBlockZDim); + const dim3 grid(grid_dim_x, grid_dim_y, grid_dim_z); AT_DISPATCH_INDEX_TYPES( x_offsets.scalar_type(), "jagged_dense_bmm_kernel_1", [&] { @@ -2136,11 +2389,15 @@ Tensor jagged_dense_bmm_forward( x_values.scalar_type(), "jagged_dense_bmm_kernel_2", [&] { - jagged_dense_bmm_kernel - <<>>( + jagged_dense_bmm_kernel< + BLOCK_TILE_M, + BLOCK_TILE_N, + BLOCK_TILE_K, + THREAD_TILE_M, + THREAD_TILE_N, + index_t, + scalar_t> + <<>>( x_values.packed_accessor32(), x_offsets.packed_accessor32(), y.packed_accessor32(), diff --git a/fbgemm_gpu/src/jagged_tensor_ops_autograd.cpp b/fbgemm_gpu/src/jagged_tensor_ops_autograd.cpp index 283422b7ae..347ec089e0 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops_autograd.cpp +++ b/fbgemm_gpu/src/jagged_tensor_ops_autograd.cpp @@ -644,7 +644,7 @@ jagged_dense_elementwise_add_jagged_output( const Tensor& y) { // Convert to jagged auto jagged_values = - DenseToJaggedOp::apply(y, x_offsets, c10::optional())[0]; + DenseToJaggedOp::apply(y, x_offsets, x_values.size(0))[0]; // Add jagged_values + x_values -> sum_values auto 
sum_values = x_values + jagged_values; diff --git a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp index ed3c075bd0..d03b961a79 100644 --- a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp +++ b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp @@ -23,72 +23,85 @@ using Tensor = at::Tensor; namespace { -// Hilariously unoptimized, but algorithmic correctness matters more here, and -// we only do it once. -AdjacencyMatrix get_intermediate_node(AdjacencyMatrix links) { - auto world_size = at::cuda::getNumGPUs(); - auto intermediate_node = [&](Node i, Node j) { - if (i == j) { - return std::vector{-1}; - } - if (links(i, j) != 0) { - return std::vector{-1}; - } +struct DirectConnectedPeer { + int64_t num_peer_links; + int64_t peer_id; + // number of transfers from peer + int32_t peer_transfers; +}; - std::vector> paths; - for (const auto k : c10::irange(world_size)) { - if (k != i && k != j && links(i, k) != 0 && links(k, j) != 0) { - paths.push_back({k, links(i, k) + links(k, j)}); - } - } - if (paths.empty()) { - LOG(WARNING) - << "Expect very bad performance for p2p copies, we are going via sys path for GPU " - << i << " -> GPU " << j; - return std::vector{-1}; - } - auto mp = std::max_element( - paths.begin(), - paths.end(), - [](std::pair a, std::pair b) { - return a.second < b.second; - }) - ->second; - std::vector candidates; - for (const auto& p : paths) { - if (p.second == mp) { - candidates.push_back(p.first); - } - } - return candidates; - }; +struct TwoHopTransferContainer { + Tensor intermediate_tensor; + uint64_t output_idx; + std::unique_ptr transfer_cuda_event; +}; - std::vector assignments(world_size * world_size); - // Use a two-phase assignment protocol as the greedy approach - // can lead to unbalanced usage. 
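For context on the BLOCK_TILE_*/THREAD_TILE_* constants chosen in jagged_dense_bmm_forward earlier in this diff: each thread block runs (BLOCK_TILE_M * BLOCK_TILE_N) / (THREAD_TILE_M * THREAD_TILE_N) threads and stages one As tile and one Bs tile in shared memory. A small standalone sketch of that arithmetic, assuming fp32 values:

#include <cstdio>

int main() {
  // Tile constants as chosen in jagged_dense_bmm_forward.
  constexpr int BLOCK_TILE_M = 64, BLOCK_TILE_N = 8, BLOCK_TILE_K = 8;
  constexpr int THREAD_TILE_M = 4, THREAD_TILE_N = 4;

  // Each thread produces a THREAD_TILE_M x THREAD_TILE_N sub-tile of the
  // BLOCK_TILE_M x BLOCK_TILE_N output tile handled by its block.
  constexpr int threads_per_block =
      (BLOCK_TILE_M * BLOCK_TILE_N) / (THREAD_TILE_M * THREAD_TILE_N); // 32
  // As[BLOCK_TILE_M][BLOCK_TILE_K] + Bs[BLOCK_TILE_K][BLOCK_TILE_N], fp32.
  constexpr int smem_bytes_per_block =
      (BLOCK_TILE_M * BLOCK_TILE_K + BLOCK_TILE_K * BLOCK_TILE_N) * 4; // 2304

  std::printf("threads/block = %d, shared mem/block = %d bytes\n",
              threads_per_block, smem_bytes_per_block);
  return 0;
}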
- std::unordered_map uses; +AdjacencyMatrix get_intermediate_node( + const AdjacencyMatrix& links) { + const auto world_size = at::cuda::getNumGPUs(); + std::vector link_vec(static_cast(world_size * world_size)); for (const auto i : c10::irange(world_size)) { for (const auto j : c10::irange(world_size)) { - auto ims = intermediate_node(i, j); - if (ims.size() == 1) { - auto v = ims.front(); - if (v != -1) { - uses[v] += 1; - } - assignments[i * world_size + j] = v; - } + link_vec[i * world_size + j] = links(i, j); } } + auto link_tensor = at::from_blob( + link_vec.data(), + {world_size, world_size}, + at::TensorOptions().dtype(at::kLong)); + LOG(INFO) << "NVLink Topology Matrix: \n" << link_tensor; + std::vector assignments( + static_cast(world_size * world_size), -1); + for (const auto dst_rank_id : c10::irange(world_size)) { + std::vector non_direct_src_ids; + non_direct_src_ids.reserve(world_size); + std::vector direct_connected_peers; + direct_connected_peers.reserve(world_size); + for (const auto src_rank_id : c10::irange(world_size)) { + if (dst_rank_id == src_rank_id) { + continue; + } - for (const auto i : c10::irange(world_size)) { - for (const auto j : c10::irange(world_size)) { - auto ims = intermediate_node(i, j); - if (ims.size() > 1) { - auto v = *std::min_element(ims.begin(), ims.end(), [&](Node a, Node b) { - return uses[a] < uses[b]; - }); - uses[v] += 1; - assignments[i * world_size + j] = v; + const auto num_peer_links = links(dst_rank_id, src_rank_id); + if (num_peer_links > 0) { + direct_connected_peers.push_back( + {.num_peer_links = num_peer_links, + .peer_id = src_rank_id, + .peer_transfers = 1}); + } else { + non_direct_src_ids.push_back(src_rank_id); + } + } + + // Assign intermediate hop ranks for non-directly connected peers. + // Assigns intermediate hops based on the number of links from the + // potential intermediate rank to target rank, as well as + // the number of two_hop connections already assigned to the + // intermediate rank. + for (const auto i : c10::irange(non_direct_src_ids.size())) { + std::sort( + direct_connected_peers.begin(), + direct_connected_peers.end(), + [](const auto& a, const auto& b) { + if (a.num_peer_links > b.num_peer_links) { + return true; + } else if (a.num_peer_links == b.num_peer_links) { + return a.peer_transfers < b.peer_transfers; + } else { + return false; + } + }); + const auto non_direct_src_id = non_direct_src_ids.at(i); + for (auto& j : direct_connected_peers) { + const auto potential_hop_id = j.peer_id; + const auto potential_hop_peer_links = + links(potential_hop_id, non_direct_src_id); + if (potential_hop_peer_links > 0) { + assignments[dst_rank_id * world_size + non_direct_src_id] = + potential_hop_id; + j.peer_transfers += 1; + break; + } } } } @@ -100,7 +113,8 @@ AdjacencyMatrix get_intermediate_node(AdjacencyMatrix links) { {world_size, world_size}, at::TensorOptions().dtype(at::kLong)); LOG(INFO) << "Detected a multi-hop NVLink configuration: \n" << tensor; - return [=](Node i, Node j) { return assignments[i * world_size + j]; }; + return + [=](Node src, Node dst) { return assignments[dst * world_size + src]; }; } else { return [](Node, Node) { return -1; }; } @@ -111,7 +125,7 @@ AdjacencyMatrix get_intermediate_node(AdjacencyMatrix links) { // tensor in `input_tensors` is already in the `target_device`, we will skip // copy it if `skip_if_same_device` is true. 
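The rewritten get_intermediate_node above replaces the old exhaustive path search with a greedy, load-balanced pick: for every (destination, source) pair with no direct link, it prefers the destination's directly connected peer with the most links, breaking ties by how many relays that peer has already been assigned. A minimal standalone sketch of the same heuristic on a hypothetical 4-GPU link matrix (the matrix and values are illustrative only):

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  constexpr int world_size = 4;
  // links[i][j] = number of links between GPU i and GPU j (made-up topology).
  const int links[world_size][world_size] = {
      {0, 2, 1, 0},
      {2, 0, 1, 1},
      {1, 1, 0, 2},
      {0, 1, 2, 0},
  };

  std::vector<int> assignments(world_size * world_size, -1);
  for (int dst = 0; dst < world_size; ++dst) {
    struct Peer { int links_to_dst; int id; int transfers; };
    std::vector<Peer> direct;
    std::vector<int> non_direct;
    for (int src = 0; src < world_size; ++src) {
      if (src == dst) continue;
      if (links[dst][src] > 0) direct.push_back({links[dst][src], src, 1});
      else non_direct.push_back(src);
    }
    for (int src : non_direct) {
      // Most links to dst first; among equals, the least-loaded relay first.
      std::sort(direct.begin(), direct.end(), [](const Peer& a, const Peer& b) {
        return a.links_to_dst != b.links_to_dst ? a.links_to_dst > b.links_to_dst
                                                : a.transfers < b.transfers;
      });
      for (auto& p : direct) {
        if (links[p.id][src] > 0) {  // the relay can reach the source
          assignments[dst * world_size + src] = p.id;
          p.transfers += 1;
          break;
        }
      }
    }
  }
  // GPU 3 -> GPU 0 has no direct link; expect it to be relayed through GPU 1.
  std::printf("hop for src=3, dst=0: GPU %d\n", assignments[0 * world_size + 3]);
  return 0;
}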
void all_to_one( - std::vector& input_tensors, + const std::vector& input_tensors, std::vector& output_tensors, at::Device target_device, bool skip_if_same_device) { @@ -119,19 +133,48 @@ void all_to_one( std::vector copy_begin_events(num_gpus); std::vector copy_completion_events(num_gpus); + std::vector two_hop_transfers; + two_hop_transfers.reserve(input_tensors.size()); + std::vector is_two_hop_transfer; + is_two_hop_transfer.reserve(input_tensors.size()); + static auto intermediate_nodes = get_intermediate_node(fbgemm_gpu::get_nvlink_matrix()); - for (auto& ten : input_tensors) { - Node src_device_id = ten.get_device(); + for (const auto i : c10::irange(input_tensors.size())) { + const auto& src = input_tensors.at(i); + Node src_device_id = src.get_device(); auto intermediate_node = intermediate_nodes(src_device_id, target_device.index()); if (intermediate_node != -1) { - ten = ten.to(at::Device(at::kCUDA, intermediate_node)); + two_hop_transfers.push_back( + {.intermediate_tensor = at::empty( + src.sizes(), + src.options().device(at::Device(at::kCUDA, intermediate_node))), + .output_idx = i, + .transfer_cuda_event = + std::make_unique(cudaEventDisableTiming)}); + auto& dst = two_hop_transfers.back().intermediate_tensor; + at::cuda::CUDAStream copy_stream = + at::cuda::getCurrentCUDAStream(src_device_id); + AT_CUDA_CHECK(cudaMemcpy2DAsync( + dst.data_ptr(), + dst.stride(0) * dst.element_size(), + src.data_ptr(), + src.stride(0) * src.element_size(), + src.size(1) * src.element_size(), + src.size(0), + cudaMemcpyDeviceToDevice, + copy_stream)); + two_hop_transfers.back().transfer_cuda_event->record(copy_stream); + is_two_hop_transfer.push_back(true); + } else { + is_two_hop_transfer.push_back(false); } } - // For each source device, we sync its current stream and launch all the - // copies that are from that device. + // For each source device directly connected to the destination device, we + // sync its current stream and launch all the copies that are from that + // device. 
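The two-hop path set up above stages the source tensor on an intermediate GPU and records a CUDA event on the source stream; the second hop, issued further below, makes the intermediate GPU's stream wait on that event before forwarding to the target. A simplified host-side sketch of that ordering, written against the raw CUDA runtime rather than the ATen stream/event wrappers, assuming contiguous buffers and omitting error checks (the real code uses cudaMemcpy2DAsync for strided tensors plus an extra begin-event handshake with the target stream):

#include <cuda_runtime.h>

void two_hop_copy(
    const void* src,          // tensor on the source GPU
    void* staging,            // buffer on the intermediate GPU
    void* dst,                // output slice on the target GPU
    size_t nbytes,
    int src_dev,
    int hop_dev,
    cudaStream_t src_stream,  // current stream on src_dev
    cudaStream_t hop_stream)  // current stream on hop_dev
{
  // First hop: copy to the intermediate GPU on the source GPU's stream and
  // mark completion with an event (timing disabled, as in the patch).
  cudaEvent_t first_hop_done;
  cudaSetDevice(src_dev);
  cudaEventCreateWithFlags(&first_hop_done, cudaEventDisableTiming);
  cudaMemcpyAsync(staging, src, nbytes, cudaMemcpyDeviceToDevice, src_stream);
  cudaEventRecord(first_hop_done, src_stream);

  // Second hop: the intermediate GPU's stream waits for the first hop to
  // land before forwarding the staged bytes to the target GPU.
  cudaSetDevice(hop_dev);
  cudaStreamWaitEvent(hop_stream, first_hop_done, 0);
  cudaMemcpyAsync(dst, staging, nbytes, cudaMemcpyDeviceToDevice, hop_stream);
  cudaEventDestroy(first_hop_done);
}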
for (const auto device_id : c10::irange(num_gpus)) { auto src_device = at::Device(at::kCUDA, device_id); if (src_device == target_device) { @@ -160,6 +203,13 @@ void all_to_one( device_guard.set_device(src_device); dst_ready.block(copy_stream); for (const auto i : c10::irange(input_tensors.size())) { + const auto metadata = is_two_hop_transfer.at(i); + // Initiate all transfer for tensors with direct + // NVLink connection to target rank + if (metadata) { + continue; + } + auto& src = input_tensors[i]; if (src.device() != src_device) { continue; @@ -179,6 +229,43 @@ void all_to_one( } } + // Complete 2-hop transfers to target rank + for (auto& two_hop_transfer : two_hop_transfers) { + const auto& src = two_hop_transfer.intermediate_tensor; + const auto src_device_id = src.get_device(); + const auto src_device = at::Device(at::kCUDA, src_device_id); + if (src_device == target_device) { + continue; + } + + // intermediate rank + at::cuda::CUDAGuard device_guard(src_device); + // intermediate rank stream + at::cuda::CUDAStream copy_stream = + at::cuda::getCurrentCUDAStream(src_device_id); + // wait on first hop transfer + two_hop_transfer.transfer_cuda_event->block(copy_stream); + // synchronize with target rank + auto& dst_ready = copy_begin_events[src_device_id]; + device_guard.set_device(target_device); + dst_ready.record(at::cuda::getCurrentCUDAStream(target_device.index())); + device_guard.set_device(src_device); + dst_ready.block(copy_stream); + // originating tensor output position + const auto output_index = two_hop_transfer.output_idx; + auto& dst = output_tensors.at(output_index); + // on source device, launch memcpy. + AT_CUDA_CHECK(cudaMemcpy2DAsync( + dst.data_ptr(), + dst.stride(0) * dst.element_size(), + src.data_ptr(), + src.stride(0) * src.element_size(), + src.size(1) * src.element_size(), + src.size(0), + cudaMemcpyDeviceToDevice, + copy_stream)); + } + // Do the same-GPU cases. if (!skip_if_same_device) { for (const auto i : c10::irange(input_tensors.size())) { diff --git a/fbgemm_gpu/src/sparse_ops_gpu.cpp b/fbgemm_gpu/src/sparse_ops_gpu.cpp index e3e1225fb9..0126ff414f 100644 --- a/fbgemm_gpu/src/sparse_ops_gpu.cpp +++ b/fbgemm_gpu/src/sparse_ops_gpu.cpp @@ -500,12 +500,41 @@ Tensor index_select_dim0_gpu( std::vector group_index_select_dim0_gpu( const std::vector& input_group, const std::vector& indices_group) { + const auto group_size = input_group.size(); std::vector output_group; - apply_( - [&](auto&&... args) { - output_group = GroupIndexSelectDim0GPUOp::apply(indices_group, args...); - }, - input_group); + // We use the APPLY_AUTOGRAD_FN macros to instantiate + // GroupIndexSelectDim0GPUOp for different group sizes. We only instantiate + // up to group size of 54. + constexpr size_t max_group_size = 54; + // Specialize this path to avoid copy + if (group_size <= max_group_size) { + apply_( + [&](auto&&... args) { + output_group = + GroupIndexSelectDim0GPUOp::apply(indices_group, args...); + }, + input_group); + return output_group; + } + + const auto input_itr = input_group.begin(); + const auto indices_itr = indices_group.begin(); + + for (size_t start = 0; start < group_size; start += max_group_size) { + const auto end = std::min(start + max_group_size, group_size); + std::vector input_subgroup(input_itr + start, input_itr + end); + std::vector indices_subgroup( + indices_itr + start, indices_itr + end); + std::vector output_subgroup; + apply_( + [&](auto&&... 
args) { + output_subgroup = + GroupIndexSelectDim0GPUOp::apply(indices_subgroup, args...); + }, + input_subgroup); + output_group.insert( + output_group.end(), output_subgroup.begin(), output_subgroup.end()); + } return output_group; } diff --git a/fbgemm_gpu/src/split_embeddings_cache_cuda.cu b/fbgemm_gpu/src/split_embeddings_cache_cuda.cu index 9d23ee9fff..513f32cf8e 100644 --- a/fbgemm_gpu/src/split_embeddings_cache_cuda.cu +++ b/fbgemm_gpu/src/split_embeddings_cache_cuda.cu @@ -79,6 +79,18 @@ enum uvm_cache_stats_index { num_conflict_misses = 5, }; +// Experiments showed that performance of lru/lxu_cache_find_uncached_kernel is +// not sensitive to grid size as long as the number thread blocks per SM is not +// too small nor too big. +constexpr int MAX_THREAD_BLOCKS_PER_SM_FOR_CACHE_KERNELS = 16; + +int get_max_thread_blocks_for_cache_kernels_() { + cudaDeviceProp* deviceProp = + at::cuda::getDeviceProperties(c10::cuda::current_device()); + return deviceProp->multiProcessorCount * + MAX_THREAD_BLOCKS_PER_SM_FOR_CACHE_KERNELS; +} + } // namespace int64_t host_lxu_cache_slot(int64_t h_in, int64_t C) { @@ -495,6 +507,67 @@ std::tuple> get_unique_indices_cuda( namespace { +__global__ __launch_bounds__(kMaxThreads) void emulate_cache_miss_kernel( + at::PackedTensorAccessor32 + lxu_cache_locations, + const int64_t enforced_misses_per_256, + const bool gather_cache_stats, + at::PackedTensorAccessor32 + uvm_cache_stats) { + const int32_t N = lxu_cache_locations.size(0); + int64_t n_enforced_misses = 0; + CUDA_KERNEL_LOOP(n, N) { + if ((n & 0x00FF) < enforced_misses_per_256) { + if (lxu_cache_locations[n] >= 0) { + n_enforced_misses++; + } + lxu_cache_locations[n] = kCacheLocationMissing; + } + } + if (gather_cache_stats && n_enforced_misses > 0) { + atomicAdd( + &uvm_cache_stats[uvm_cache_stats_index::num_conflict_misses], + n_enforced_misses); + } +} +} // namespace + +Tensor emulate_cache_miss( + Tensor lxu_cache_locations, + const int64_t enforced_misses_per_256, + const bool gather_cache_stats, + Tensor uvm_cache_stats) { + TENSOR_ON_CUDA_GPU(lxu_cache_locations); + TENSOR_ON_CUDA_GPU(uvm_cache_stats); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(lxu_cache_locations.get_device()); + + const auto N = lxu_cache_locations.numel(); + if (N == 0) { + // nothing to do + return lxu_cache_locations; + } + + const dim3 blocks(std::min( + div_round_up(N, kMaxThreads), + get_max_thread_blocks_for_cache_kernels_())); + + emulate_cache_miss_kernel<<< + blocks, + kMaxThreads, + 0, + at::cuda::getCurrentCUDAStream()>>>( + lxu_cache_locations + .packed_accessor32(), + enforced_misses_per_256, + gather_cache_stats, + uvm_cache_stats.packed_accessor32()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + return lxu_cache_locations; +} + +namespace { template __global__ __launch_bounds__(kMaxThreads) void lru_cache_find_uncached_kernel( const at::PackedTensorAccessor32 @@ -622,19 +695,6 @@ __launch_bounds__(kMaxThreads) void direct_mapped_lru_cache_find_uncached_kernel } } } - -// Experiments showed that performance of lru/lxu_cache_find_uncached_kernel is -// not sensitive to grid size as long as the number thread blocks per SM is not -// too small nor too big. 
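emulate_cache_miss above forces a miss for every position n with (n & 0x00FF) < enforced_misses_per_256, i.e. the first enforced_misses_per_256 slots of each 256-entry window, counting only slots that were hits to begin with. A small standalone check of the resulting count, which matches the closed form asserted by the new uvm_cache_miss_emulate_test later in this diff:

#include <algorithm>
#include <cassert>
#include <cstdint>

int64_t expected_enforced_misses(int64_t N, int64_t enforced_misses_per_256) {
  const int64_t full_windows = N / 256;
  const int64_t tail = N - full_windows * 256;
  return full_windows * enforced_misses_per_256 +
      std::min(tail, enforced_misses_per_256);
}

int main() {
  // 10000 requests = 39 full windows of 256 plus a tail of 16 indices.
  assert(expected_enforced_misses(10000, 5) == 39 * 5 + 5);  // 200
  assert(expected_enforced_misses(10000, 256) == 10000);     // every slot misses
  assert(expected_enforced_misses(100, 7) == 7);             // tail-only case
  return 0;
}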
-constexpr int MAX_THREAD_BLOCKS_PER_SM_FOR_CACHE_KERNELS = 16; - -int get_max_thread_blocks_for_cache_kernels_() { - cudaDeviceProp* deviceProp = - at::cuda::getDeviceProperties(c10::cuda::current_device()); - return deviceProp->multiProcessorCount * - MAX_THREAD_BLOCKS_PER_SM_FOR_CACHE_KERNELS; -} - } // namespace std::pair lru_cache_find_uncached_cuda( @@ -798,8 +858,8 @@ __global__ __launch_bounds__(kMaxThreads) void lru_cache_insert_kernel( at::PackedTensorAccessor32 uvm_cache_stats) { const int32_t C = lxu_cache_state.size(0); - int64_t n_conflict_misses = 0; - int64_t n_inserted = 0; + int32_t n_conflict_misses = 0; + int32_t n_inserted = 0; for (int32_t n = blockIdx.x * blockDim.y + threadIdx.y; n < *N_unique; n += gridDim.x * blockDim.y) { // check if this warp is responsible for this whole segment. diff --git a/fbgemm_gpu/test/input_combine_test.py b/fbgemm_gpu/test/input_combine_test.py index 74f7581576..07102aec90 100644 --- a/fbgemm_gpu/test/input_combine_test.py +++ b/fbgemm_gpu/test/input_combine_test.py @@ -11,12 +11,20 @@ from typing import List, Optional, Tuple import torch +from hypothesis import given, settings try: # pyre-ignore[21] from fbgemm_gpu import open_source # noqa: F401 + + # pyre-ignore[21] + from test_utils import cpu_and_maybe_gpu except Exception: + torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:input_combine") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:input_combine_cpu") + from fbgemm_gpu.test.test_utils import cpu_and_maybe_gpu + +DEFAULT_DEVICE = torch.device("cpu") class TBEInputPrepareReference(torch.nn.Module): @@ -120,23 +128,23 @@ def forward( class InputCombineTest(unittest.TestCase): - def _get_inputs(self, dtypes): + def _get_inputs(self, dtypes, device=DEFAULT_DEVICE): indices_list = [ - torch.tensor([1, 2, 3], dtype=dtypes[0]), - torch.tensor([1, 2, 3, 4], dtype=dtypes[1]), + torch.tensor([1, 2, 3], dtype=dtypes[0], device=device), + torch.tensor([1, 2, 3, 4], dtype=dtypes[1], device=device), ] offsets_list = [ - torch.tensor([0, 2], dtype=dtypes[0]), - torch.tensor([0, 1, 4], dtype=dtypes[1]), + torch.tensor([0, 2], dtype=dtypes[0], device=device), + torch.tensor([0, 1, 4], dtype=dtypes[1], device=device), ] include_last_offsets = [False, True] per_sample_weights = [ - torch.tensor([1, 2, 1], dtype=torch.float), - torch.tensor([1, 2, 1, 3], dtype=torch.float), + torch.tensor([1, 2, 1], dtype=torch.float, device=device), + torch.tensor([1, 2, 1, 3], dtype=torch.float, device=device), ] empty_per_sample_weights = [ - torch.tensor([], dtype=torch.float), - torch.tensor([], dtype=torch.float), + torch.tensor([], dtype=torch.float, device=device), + torch.tensor([], dtype=torch.float, device=device), ] return ( indices_list, @@ -226,27 +234,34 @@ def _run_padding_fused_test(self, dtypes, batch_size) -> None: self.assertTrue(outputs[1].dtype == torch.int32) self.assertTrue(outputs[-1].size(0) == 0) - def _offsets_to_lengths(self, offsets, indices, include_last_offsets): + def _offsets_to_lengths( + self, offsets, indices, include_last_offsets, device=DEFAULT_DEVICE + ): if include_last_offsets: offsets_complete = offsets else: offsets_complete = torch.cat( - [offsets, torch.tensor([indices.numel()], dtype=offsets.dtype)] + [ + offsets, + torch.tensor([indices.numel()], dtype=offsets.dtype, device=device), + ] ) return offsets_complete[1:] - offsets_complete[:-1] - def _run_test_with_length(self, dtypes) -> None: + def _run_test_with_length(self, dtypes, device=DEFAULT_DEVICE) -> None: ( indices_list, offsets_list, 
per_sample_weights, empty_per_sample_weights, include_last_offsets, - ) = self._get_inputs(dtypes) + ) = self._get_inputs(dtypes, device=device) ref_mod = TBEInputPrepareReference(include_last_offsets) lengths_list = [ - self._offsets_to_lengths(offsets, indices, include_last_offsets) + self._offsets_to_lengths( + offsets, indices, include_last_offsets, device=device + ) for offsets, indices, include_last_offsets in zip( offsets_list, indices_list, include_last_offsets ) @@ -307,14 +322,20 @@ def test_input_combine_int32(self) -> None: def test_input_combined_mix(self) -> None: self._run_test((torch.int64, torch.int32)) - def test_input_combine_int64_with_length(self) -> None: - self._run_test_with_length((torch.int64, torch.int64)) + @given(device=cpu_and_maybe_gpu()) + @settings(deadline=None) + def test_input_combine_int64_with_length(self, device: torch.device) -> None: + self._run_test_with_length((torch.int64, torch.int64), device=device) - def test_input_combine_int32_with_length(self) -> None: - self._run_test_with_length((torch.int64, torch.int64)) + @given(device=cpu_and_maybe_gpu()) + @settings(deadline=None) + def test_input_combine_int32_with_length(self, device: torch.device) -> None: + self._run_test_with_length((torch.int32, torch.int32), device=device) - def test_input_combined_mix_with_length(self) -> None: - self._run_test_with_length((torch.int64, torch.int32)) + @given(device=cpu_and_maybe_gpu()) + @settings(deadline=None) + def test_input_combine_mix_with_length(self, device: torch.device) -> None: + self._run_test_with_length((torch.int64, torch.int32), device=device) def test_padding_fused_input_combine_int64(self) -> None: self._run_padding_fused_test((torch.int64, torch.int64), 64) diff --git a/fbgemm_gpu/test/jagged_tensor_ops_test.py b/fbgemm_gpu/test/jagged_tensor_ops_test.py index fa65a8bb49..98021007f4 100644 --- a/fbgemm_gpu/test/jagged_tensor_ops_test.py +++ b/fbgemm_gpu/test/jagged_tensor_ops_test.py @@ -20,7 +20,12 @@ from fbgemm_gpu import open_source # noqa: F401 # pyre-ignore[21] - from test_utils import gpu_available, gpu_unavailable, running_on_github, TEST_WITH_ROCM + from test_utils import ( + gpu_available, + gpu_unavailable, + running_on_github, + TEST_WITH_ROCM, + ) except Exception: torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu") diff --git a/fbgemm_gpu/test/merge_pooled_embeddings_test.py b/fbgemm_gpu/test/merge_pooled_embeddings_test.py index de7c80b79b..98e1ede2ee 100644 --- a/fbgemm_gpu/test/merge_pooled_embeddings_test.py +++ b/fbgemm_gpu/test/merge_pooled_embeddings_test.py @@ -100,7 +100,7 @@ def ref(pooled_ad_embeddings, batch_indices): r=st.randoms(use_true_random=False), ) # Can instantiate 8 contexts which takes a long time. 
- @settings(verbosity=Verbosity.verbose, max_examples=10, deadline=None) + @settings(verbosity=Verbosity.verbose, max_examples=40, deadline=None) def test_all_to_one_device( self, num_inputs, diff --git a/fbgemm_gpu/test/split_table_batched_embeddings_test.py b/fbgemm_gpu/test/split_table_batched_embeddings_test.py index c0ba5f6f64..ddab386bf0 100644 --- a/fbgemm_gpu/test/split_table_batched_embeddings_test.py +++ b/fbgemm_gpu/test/split_table_batched_embeddings_test.py @@ -9,10 +9,11 @@ import copy +import math import pickle import random import unittest -from typing import List, Optional +from typing import List, Optional, Tuple import fbgemm_gpu import fbgemm_gpu.split_table_batched_embeddings_ops as split_table_batched_embeddings_ops @@ -31,11 +32,16 @@ ) from fbgemm_gpu.split_table_batched_embeddings_ops import ( BoundsCheckMode, + CounterBasedRegularizationDefinition, + CounterWeightDecayMode, + GradSumDecay, INT8_EMB_ROW_DIM_OFFSET, + LearningRateMode, OptimType, RecordCacheMetrics, rounded_row_size_in_bytes, SparseType, + TailIdThreshold, WeightDecayMode, ) from hypothesis import assume, given, HealthCheck, settings, Verbosity @@ -1627,6 +1633,7 @@ def execute_backward_adagrad_( # noqa C901 use_cpu: bool, exact: bool, output_dtype: SparseType, + weight_decay_mode: WeightDecayMode = WeightDecayMode.NONE, ) -> None: # NOTE: cache is not applicable to CPU version. assume(not use_cpu or not use_cache) @@ -1826,31 +1833,39 @@ def execute_backward_adagrad_( # noqa C901 goc = torch.cat(gos, dim=0) fc2.backward(goc) cc.flush() - split_optimizer_states = [s for (s,) in cc.split_optimizer_states()] + split_optimizer_states = cc.split_optimizer_states() + assert len(split_optimizer_states) == T tolerance = ( 1.0e-4 if weights_precision == SparseType.FP32 and output_dtype == SparseType.FP32 else 1.0e-2 ) for t in range(T): + if row_wise and weight_decay_mode == WeightDecayMode.COUNTER: + (m1, c1, c2) = split_optimizer_states[t] + else: + (m1,) = split_optimizer_states[t] # pyre-fixme[16]: `Optional` has no attribute `float`. 
ref_optimizer_state = bs[t].weight.grad.float().cpu().to_dense().pow(2) torch.testing.assert_close( - split_optimizer_states[t].float().cpu(), + m1.float().cpu(), ref_optimizer_state.mean(dim=1) if row_wise else ref_optimizer_state, atol=tolerance, rtol=tolerance, ) for t in range(T): # optimizer_state = squares (no row-wise) or sum squares (row-wise) + if row_wise and weight_decay_mode == WeightDecayMode.COUNTER: + (m1, c1, c2) = split_optimizer_states[t] + else: + (m1,) = split_optimizer_states[t] torch.testing.assert_close( cc.split_embedding_weights()[t].float().cpu(), torch.addcdiv( bs[t].weight.float().cpu(), value=-lr, tensor1=bs[t].weight.grad.float().cpu().to_dense(), - tensor2=split_optimizer_states[t] - .float() + tensor2=m1.float() .sqrt_() .add_(eps) .view(Es[t], 1 if row_wise else Ds[t]) @@ -2589,6 +2604,8 @@ def execute_backward_optimizers_( # noqa C901 0.9, 0.01, ) + counter_based_regularization: CounterBasedRegularizationDefinition + if optimizer == OptimType.EXACT_ADAGRAD: optimizer_kwargs["eps"] = eps @@ -2596,6 +2613,21 @@ def execute_backward_optimizers_( # noqa C901 optimizer_kwargs["eps"] = eps optimizer_kwargs["weight_decay"] = weight_decay optimizer_kwargs["weight_decay_mode"] = weight_decay_mode + if weight_decay_mode == WeightDecayMode.COUNTER: + counter_based_regularization = CounterBasedRegularizationDefinition( + counter_weight_decay_mode=CounterWeightDecayMode.DECOUPLE, + counter_halflife=20000, + adjustment_iter=24000, + adjustment_ub=0.1, + learning_rate_mode=LearningRateMode.TAIL_ID_LR_DECREASE, + grad_sum_decay=GradSumDecay.NO_DECAY, + tail_id_threshold=TailIdThreshold(val=1000, is_ratio=False), + ) + + optimizer_kwargs[ + "counter_based_regularization" + # pyre-fixme[6]: Expected `float` for 2nd param but got `CounterBasedRegularizationDefinition`. + ] = counter_based_regularization if optimizer == OptimType.EXACT_ROWWISE_WEIGHTED_ADAGRAD: optimizer_kwargs["eps"] = eps @@ -2654,15 +2686,39 @@ def execute_backward_optimizers_( # noqa C901 if optimizer in (OptimType.EXACT_ROWWISE_ADAGRAD, OptimType.EXACT_ADAGRAD): rowwise = optimizer == OptimType.EXACT_ROWWISE_ADAGRAD for t in range(T): - (m1,) = split_optimizer_states[t] + row_counter: Optional[torch.Tensor] = None + freq: Optional[torch.Tensor] = None + iter_: int = -1 + + if rowwise and weight_decay_mode == WeightDecayMode.COUNTER: + (m1, prev_iter, row_counter) = split_optimizer_states[t] + else: + (m1,) = split_optimizer_states[t] # to_dense in GPU is non-deterministic due to atmomics used in # coalescing and floating point non-associativity. # pyre-fixme[16]: `Optional` has no attribute `cpu`. dense_cpu_grad = bs[t].weight.grad.cpu().to_dense() - if rowwise and not use_cpu and weight_decay_mode == WeightDecayMode.L2: + if rowwise and not use_cpu: # We need to skip when using cpu because use_fbgemm (https://fburl.com/code/12131iub) # is true and the template code (https://fburl.com/code/1kctlup3) is not executed. 
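For the new WeightDecayMode.COUNTER coverage, the reference helpers defined below (get_grad_from_counter_adagrad / get_wts_from_counter_adagrad) reduce, per embedding row, to a counter-decayed frequency estimate plus a decoupled weight-decay correction on top of row-wise Adagrad. A scalar sketch of that update under the configuration used in this test (DECOUPLE counter mode, TAIL_ID_LR_DECREASE), ignoring the adjustment_iter warm-up gate; all numbers are illustrative:

#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  // Hypothetical state for one row.
  double row_counter = 3.0, prev_iter = 10.0, iter = 25.0;
  double grad = 0.1, weight = 0.5, momentum = 0.02;
  const double lr = 0.05, eps = 1e-8, weight_decay = 0.01;
  const double counter_halflife = 20000, adjustment_ub = 0.1;
  const double tail_id_threshold = 1000, max_counter = 5000;

  // Decay the per-row counter by the elapsed iterations, then derive the
  // inverse-frequency factor that scales the decoupled weight decay.
  const double iter_delta = (prev_iter == 0.0) ? 1.0 : iter - prev_iter;
  row_counter =
      1.0 + std::exp(-iter_delta * std::log(2.0) / counter_halflife) * row_counter;
  const double freq = counter_halflife / row_counter;

  // Row-wise Adagrad moment (mean of grad^2 over the row; a single value here).
  momentum += grad * grad;
  double multiplier = lr / (std::sqrt(momentum) + eps);
  // Counter-based LR adjustment for rows whose counter exceeds the threshold.
  if (row_counter > tail_id_threshold) {
    multiplier *= std::clamp(
        std::pow((row_counter + 1.0) / max_counter, adjustment_ub), 0.1, 1.0);
  }
  // Decoupled counter weight decay, then the gradient step.
  weight = (1.0 - freq * weight_decay * lr) * weight - multiplier * grad;

  std::printf("updated weight = %f\n", weight);
  return 0;
}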
- dense_cpu_grad += weight_decay * bs[t].weight.cpu() + if weight_decay_mode == WeightDecayMode.L2: + dense_cpu_grad += weight_decay * bs[t].weight.cpu() + elif weight_decay_mode == WeightDecayMode.COUNTER: + iter_ = int(cc.iter.item()) + ( + dense_cpu_grad, + row_counter, + freq, + ) = self.get_grad_from_counter_adagrad( + dense_cpu_grad, + bs[t].weight.cpu(), + counter_based_regularization, + row_counter.cpu(), + prev_iter.cpu(), + iter_, + weight_decay, + ) + m1_ref = ( dense_cpu_grad.pow(2) if not rowwise @@ -2681,14 +2737,31 @@ def execute_backward_optimizers_( # noqa C901 ) + eps ) - if ( - rowwise - and not use_cpu - and weight_decay_mode == WeightDecayMode.DECOUPLE - ): - weights_ref = bs[t].weight.cpu() - lr * ( - dense_cpu_grad / denom + weight_decay * bs[t].weight.cpu() - ) + if rowwise and not use_cpu: + if weight_decay_mode == WeightDecayMode.DECOUPLE: + weights_ref = bs[t].weight.cpu() - lr * ( + dense_cpu_grad / denom + weight_decay * bs[t].weight.cpu() + ) + elif weight_decay_mode == WeightDecayMode.L2: + # pyre-fixme[58]: `/` is not supported for operand types `float` + # and `Tensor`. + weights_ref = bs[t].weight.cpu() - lr * dense_cpu_grad / denom + elif weight_decay_mode == WeightDecayMode.COUNTER: + max_counter = cc.max_counter.item() + weights_ref = self.get_wts_from_counter_adagrad( + dense_cpu_grad, + bs[t].weight.cpu(), + denom, + counter_based_regularization, + row_counter, + # pyre-fixme[6]: Expected `Tensor` for 6th param but got `Optional[Tensor]` + freq, + max_counter, + iter_, + eps, + lr, + weight_decay, + ) else: # pyre-fixme[58]: `/` is not supported for operand types `float` # and `Tensor`. @@ -2833,6 +2906,117 @@ def execute_backward_optimizers_( # noqa C901 rtol=1.0e-4, ) + def get_grad_from_counter_adagrad( + self, + dense_cpu_grad: torch.Tensor, + weights: torch.Tensor, + counter_based_regularization: CounterBasedRegularizationDefinition, + row_counter: torch.Tensor, + prev_iter: torch.Tensor, + iter_: int, + weight_decay: float, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + row_counter = row_counter.view(row_counter.numel(), 1) + prev_iter = prev_iter.view(prev_iter.numel(), 1) + freq = torch.ones_like(row_counter) + counter_weight_decay_mode = ( + counter_based_regularization.counter_weight_decay_mode + ) + counter_halflife = counter_based_regularization.counter_halflife + l2_wd = 1.0 if counter_weight_decay_mode == CounterWeightDecayMode.L2 else 0.0 + + if counter_halflife > 0: + counter_log_rho = math.log(2.0) / counter_halflife + # if id occurs multiple times in a batch, iter_delta=1 + iter_delta = torch.where(prev_iter == 0.0, 1.0, iter_ * 1.0 - prev_iter) + prev_iter = iter_ * torch.ones_like(prev_iter) + row_counter = 1.0 + torch.exp(-iter_delta * counter_log_rho) * row_counter + freq = torch.tensor([counter_halflife]) / row_counter + + dense_cpu_grad += l2_wd * freq * weight_decay * weights + return dense_cpu_grad, row_counter, freq + + def get_wts_from_counter_adagrad( + self, + dense_cpu_grad: torch.Tensor, + weights: torch.Tensor, + denom: torch.Tensor, + counter_based_regularization: CounterBasedRegularizationDefinition, + row_counter: torch.Tensor, + freq: torch.Tensor, + max_counter: float, + iter_: int, + eps: float, + learning_rate: float, + weight_decay: float, + ) -> torch.Tensor: + counter_weight_decay_mode = ( + counter_based_regularization.counter_weight_decay_mode + ) + counter_halflife = counter_based_regularization.counter_halflife + tail_id_threshold_val = counter_based_regularization.tail_id_threshold.val + if 
counter_based_regularization.tail_id_threshold.is_ratio: + tail_id_threshold_val = math.floor(tail_id_threshold_val * max_counter) + learning_rate_mode = counter_based_regularization.learning_rate_mode + adjustment_iter = counter_based_regularization.adjustment_iter + adjustment_ub = counter_based_regularization.adjustment_ub + + multiplier = torch.tensor([learning_rate]) / denom + adjusted_multiplier = multiplier + exp_reg_correction = torch.ones_like(row_counter) + + if counter_halflife > 0: + if adjustment_iter <= 0 or ( + adjustment_iter > 0 and iter_ > adjustment_iter + ): + if learning_rate_mode == LearningRateMode.TAIL_ID_LR_INCREASE: + adjusted_multiplier = torch.where( + row_counter > tail_id_threshold_val, + multiplier + * torch.maximum( + torch.minimum( + torch.pow( + torch.tensor([max_counter]) / (row_counter + 1.0), + adjustment_ub, + ), + torch.Tensor([10.0]), + ), + torch.Tensor([1.0]), + ), + multiplier, + ) + elif learning_rate_mode == LearningRateMode.TAIL_ID_LR_DECREASE: + adjusted_multiplier = torch.where( + row_counter > tail_id_threshold_val, + multiplier + * torch.minimum( + torch.maximum( + torch.pow( + (row_counter + 1.0) / max_counter, + adjustment_ub, + ), + torch.Tensor([0.1]), + ), + torch.Tensor([1.0]), + ), + multiplier, + ) + elif learning_rate_mode == LearningRateMode.COUNTER_SGD: + adjusted_multiplier = torch.where( + row_counter > tail_id_threshold_val, + torch.Tensor([learning_rate]) + / (torch.sqrt(adjustment_ub * row_counter) + eps), + multiplier, + ) + + if counter_weight_decay_mode == CounterWeightDecayMode.DECOUPLE: + exp_reg_correction = 1.0 - freq * weight_decay * learning_rate + elif counter_weight_decay_mode == CounterWeightDecayMode.L2: + exp_reg_correction = 1.0 - freq * weight_decay * multiplier + + weights = exp_reg_correction * weights - adjusted_multiplier * dense_cpu_grad + return weights + @given( T=st.integers(min_value=1, max_value=5), D=st.integers(min_value=2, max_value=256), @@ -2901,7 +3085,7 @@ def test_backward_optimizers_adam( # noqa C901 D=st.integers(min_value=2, max_value=256), B=st.integers(min_value=1, max_value=128), log_E=st.integers(min_value=3, max_value=5), - L=st.integers(min_value=0, max_value=20), + L=st.integers(min_value=2, max_value=20), weighted=st.booleans(), mixed=st.booleans(), optimizer=st.sampled_from( @@ -2928,6 +3112,7 @@ def test_backward_optimizers_adam( # noqa C901 [ WeightDecayMode.L2, WeightDecayMode.DECOUPLE, + WeightDecayMode.COUNTER, ] ), ) @@ -3394,7 +3579,7 @@ def test_nbit_forward_cpu( T = random.randint(1, 50) B = random.randint(0, 128) L = random.randint(0, 32) - D = random.randint(2, 1024) + D = random.randint(2, 2048) log_E = random.randint(2, 4) use_cache = False @@ -3475,7 +3660,7 @@ def test_nbit_forward_gpu_no_cache( T = random.randint(1, 50) B = random.randint(0, 128) L = random.randint(0, 32) - D = random.randint(2, 1024) + D = random.randint(2, 2048) log_E = random.randint(2, 4) use_cache = False diff --git a/fbgemm_gpu/test/uvm_cache_miss_emulate_test.cpp b/fbgemm_gpu/test/uvm_cache_miss_emulate_test.cpp new file mode 100644 index 0000000000..808ed33624 --- /dev/null +++ b/fbgemm_gpu/test/uvm_cache_miss_emulate_test.cpp @@ -0,0 +1,119 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#include + +#include "fbgemm_gpu/split_embeddings_cache_cuda.cuh" + +using namespace ::testing; + +// Helper function that generates input tensor for emulate_cache_miss testing. +at::Tensor generate_lxu_cache_locations( + const int64_t num_requests, + const int64_t num_sets, + const int64_t associativity = 32) { + const auto lxu_cache_locations = at::randint( + 0, + num_sets * associativity, + {num_requests}, + at::device(at::kCPU).dtype(at::kInt)); + return lxu_cache_locations; +} + +// Wrapper function that takes lxu_cache_locations on CPU, copies it to GPU, +// runs emulate_cache_miss(), and then returns the result, placed on CPU. +std::pair run_emulate_cache_miss( + at::Tensor lxu_cache_locations, + const int64_t enforced_misses_per_256, + const bool gather_uvm_stats = false) { + at::Tensor lxu_cache_locations_copy = at::_to_copy(lxu_cache_locations); + const auto options = + lxu_cache_locations.options().device(at::kCUDA).dtype(at::kInt); + const auto uvm_cache_stats = + gather_uvm_stats ? at::zeros({6}, options) : at::empty({0}, options); + + const auto lxu_cache_location_with_cache_misses = emulate_cache_miss( + lxu_cache_locations_copy.to(at::kCUDA), + enforced_misses_per_256, + gather_uvm_stats, + uvm_cache_stats); + return {lxu_cache_location_with_cache_misses.cpu(), uvm_cache_stats.cpu()}; +} + +TEST(uvm_cache_miss_emulate_test, no_cache_miss) { + constexpr int64_t num_requests = 10000; + constexpr int64_t num_sets = 32768; + constexpr int64_t associativity = 32; + + auto lxu_cache_locations_cpu = + generate_lxu_cache_locations(num_requests, num_sets, associativity); + auto lxu_cache_location_with_cache_misses_and_uvm_cache_stats = + run_emulate_cache_miss(lxu_cache_locations_cpu, 0); + auto lxu_cache_location_with_cache_misses = + lxu_cache_location_with_cache_misses_and_uvm_cache_stats.first; + EXPECT_TRUE( + at::equal(lxu_cache_locations_cpu, lxu_cache_location_with_cache_misses)); +} + +TEST(uvm_cache_miss_emulate_test, enforced_cache_miss) { + constexpr int64_t num_requests = 10000; + constexpr int64_t num_sets = 32768; + constexpr int64_t associativity = 32; + constexpr std::array enforced_misses_per_256_for_testing = { + 1, 5, 7, 33, 100, 256}; + + for (const bool miss_in_lxu_cache_locations : {false, true}) { + for (const bool gather_cache_stats : {false, true}) { + for (const auto enforced_misses_per_256 : + enforced_misses_per_256_for_testing) { + auto lxu_cache_locations_cpu = + generate_lxu_cache_locations(num_requests, num_sets, associativity); + if (miss_in_lxu_cache_locations) { + // one miss in the original lxu_cache_locations; shouldn't be counted + // as enforced misses from emulate_cache_miss(). 
+ auto z = lxu_cache_locations_cpu.data_ptr(); + z[0] = -1; + } + auto lxu_cache_location_with_cache_misses_and_uvm_cache_stats = + run_emulate_cache_miss( + lxu_cache_locations_cpu, + enforced_misses_per_256, + gather_cache_stats); + auto lxu_cache_location_with_cache_misses = + lxu_cache_location_with_cache_misses_and_uvm_cache_stats.first; + EXPECT_FALSE(at::equal( + lxu_cache_locations_cpu, lxu_cache_location_with_cache_misses)); + + auto x = lxu_cache_locations_cpu.data_ptr(); + auto y = lxu_cache_location_with_cache_misses.data_ptr(); + int64_t enforced_misses = 0; + for (int32_t i = 0; i < lxu_cache_locations_cpu.numel(); ++i) { + if (x[i] != y[i]) { + EXPECT_EQ(y[i], -1); + enforced_misses++; + } + } + int64_t num_requests_over_256 = + static_cast(num_requests / 256); + int64_t expected_misses = num_requests_over_256 * + enforced_misses_per_256 + + std::min((num_requests - num_requests_over_256 * 256), + enforced_misses_per_256); + if (miss_in_lxu_cache_locations) { + expected_misses--; + } + EXPECT_EQ(expected_misses, enforced_misses); + if (gather_cache_stats) { + auto uvm_cache_stats = + lxu_cache_location_with_cache_misses_and_uvm_cache_stats.second; + auto cache_stats_ptr = uvm_cache_stats.data_ptr(); + // enforced misses are recorded as conflict misses. + EXPECT_EQ(expected_misses, cache_stats_ptr[5]); + } + } + } + } +} diff --git a/include/fbgemm/Types.h b/include/fbgemm/Types.h index be8ac4ec8b..e7d8278464 100644 --- a/include/fbgemm/Types.h +++ b/include/fbgemm/Types.h @@ -15,145 +15,184 @@ namespace fbgemm { using float16 = std::uint16_t; using bfloat16 = std::uint16_t; +// The IEEE754 standard species a binary16 as having the following format: +// SEEEEEMMMMMMMMMM +// 0432109876543210 +// That is: +// * 1 sign bit +// * 5 exponent bits +// * 10 mantissa/significand bits (an 11th bit is implicit) +constexpr uint32_t f16_num_bits = 16; +constexpr uint32_t f16_num_exponent_bits = 5; +constexpr uint32_t f16_num_mantissa_bits = 10; +constexpr uint32_t f16_num_non_sign_bits = + f16_num_exponent_bits + f16_num_mantissa_bits; +constexpr uint32_t f16_exponent_mask = 0b1'1111; // 5 bits +constexpr uint32_t f16_sign_bit = 1u + << (f16_num_exponent_bits + f16_num_mantissa_bits); +constexpr uint32_t f16_exponent_bits = f16_exponent_mask + << f16_num_mantissa_bits; +constexpr uint32_t f16_mantissa_mask = 0b11'1111'1111; // 10 bits +constexpr uint32_t f16_exponent_bias = 15; +constexpr uint32_t f16_nan = 0x7F'FF; + +// The IEEE754 standard specifies a binary32 as having: +// SEEEEEEEEMMMMMMMMMMMMMMMMMMMMMMM +// That is: +// * 1 sign bit +// * 8 exponent bits +// * 23 mantissa/significand bits (a 24th bit is implicit) +constexpr uint32_t f32_num_exponent_bits = 8; +constexpr uint32_t f32_num_mantissa_bits = 23; +constexpr uint32_t f32_exponent_mask = 0b1111'1111; // 8 bits +constexpr uint32_t f32_mantissa_mask = 0x7F'FF'FF; // 23 bits +constexpr uint32_t f32_exponent_bias = 127; +constexpr uint32_t f32_all_non_sign_mask = 0x7F'FF'FF'FF; // 31 bits +constexpr uint32_t f32_most_significant_bit = 1u << 22; // Turn on 23rd bit +constexpr uint32_t f32_num_non_sign_bits = + f32_num_exponent_bits + f32_num_mantissa_bits; + // Round to nearest even static inline float16 cpu_float2half_rn(float f) { - float16 ret; - static_assert( - sizeof(unsigned int) == sizeof(float), - "Programming error sizeof(unsigned int) != sizeof(float)"); + sizeof(uint32_t) == sizeof(float), + "Programming error sizeof(uint32_t) != sizeof(float)"); - unsigned* xp = reinterpret_cast(&f); - unsigned x = *xp; - unsigned 
u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1; - unsigned sign, exponent, mantissa; + uint32_t* xp = reinterpret_cast(&f); + uint32_t x = *xp; + uint32_t u = (x & f32_all_non_sign_mask); // Get rid of +NaN/-NaN case first. if (u > 0x7f800000) { - ret = 0x7fffU; - return ret; + return static_cast(f16_nan); } - sign = ((x >> 16) & 0x8000); + uint32_t sign = ((x >> f16_num_bits) & f16_sign_bit); // Get rid of +Inf/-Inf, +0/-0. if (u > 0x477fefff) { - ret = static_cast(sign | 0x7c00U); - return ret; + return static_cast(sign | f16_exponent_bits); } if (u < 0x33000001) { - ret = static_cast(sign | 0x0000); - return ret; + return static_cast(sign | 0x0000); } - exponent = ((u >> 23) & 0xff); - mantissa = (u & 0x7fffff); + uint32_t exponent = ((u >> f32_num_mantissa_bits) & f32_exponent_mask); + uint32_t mantissa = (u & f32_mantissa_mask); - if (exponent > 0x70) { - shift = 13; - exponent -= 0x70; + uint32_t shift; + if (exponent > f32_exponent_bias - f16_exponent_bias) { + shift = f32_num_mantissa_bits - f16_num_mantissa_bits; + exponent -= f32_exponent_bias - f16_exponent_bias; } else { - shift = 0x7e - exponent; + shift = (f32_exponent_bias - 1) - exponent; exponent = 0; - mantissa |= 0x800000; + mantissa |= + (1u + << f32_num_mantissa_bits); // Bump the least significant exponent bit } - lsb = (1 << shift); - lsb_s1 = (lsb >> 1); - lsb_m1 = (lsb - 1); + const uint32_t lsb = (1u << shift); + const uint32_t lsb_s1 = (lsb >> 1); + const uint32_t lsb_m1 = (lsb - 1); // Round to nearest even. - remainder = (mantissa & lsb_m1); + const uint32_t remainder = (mantissa & lsb_m1); mantissa >>= shift; if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { ++mantissa; - if (!(mantissa & 0x3ff)) { + if (!(mantissa & f16_mantissa_mask)) { ++exponent; mantissa = 0; } } - ret = static_cast(sign | (exponent << 10) | mantissa); - - return ret; + return static_cast( + sign | (exponent << f16_num_mantissa_bits) | mantissa); } // Round to zero static inline float16 cpu_float2half_rz(float f) { - float16 ret; - static_assert( - sizeof(unsigned int) == sizeof(float), - "Programming error sizeof(unsigned int) != sizeof(float)"); + sizeof(uint32_t) == sizeof(float), + "Programming error sizeof(uint32_t) != sizeof(float)"); - unsigned* xp = reinterpret_cast(&f); - unsigned x = *xp; - unsigned u = (x & 0x7fffffff); - unsigned shift, sign, exponent, mantissa; + const uint32_t* xp = reinterpret_cast(&f); + const uint32_t x = *xp; + const uint32_t u = (x & f32_all_non_sign_mask); // Get rid of +NaN/-NaN case first. if (u > 0x7f800000) { - ret = static_cast(0x7fffU); - return ret; + return static_cast(f16_nan); } - sign = ((x >> 16) & 0x8000); + uint32_t sign = ((x >> f16_num_bits) & f16_sign_bit); // Get rid of +Inf/-Inf, +0/-0. 
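// For reference, the magic constants used by these float -> half conversions
// decode as follows:
//   0x7f800000 : bit pattern of +Inf; any non-sign pattern above it is a NaN.
//   0x477fefff : last pattern below 65520.0f (0x477ff000). Magnitudes of
//                65520 and up are mapped to +/-Inf; 65504 is the largest
//                finite half-precision value.
//   0x33000001 : first pattern above 2^-25 (0x33000000). Magnitudes of 2^-25
//                and below are flushed to signed zero; the smallest
//                half-precision subnormal is 2^-24.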
if (u > 0x477fefff) { - ret = static_cast(sign | 0x7c00U); - return ret; + return static_cast(sign | f16_exponent_bits); } if (u < 0x33000001) { - ret = static_cast(sign | 0x0000); - return ret; + return static_cast(sign | 0x0000); } - exponent = ((u >> 23) & 0xff); - mantissa = (u & 0x7fffff); + uint32_t exponent = ((u >> f32_num_mantissa_bits) & f32_exponent_mask); + uint32_t mantissa = (u & f32_mantissa_mask); - if (exponent > 0x70) { - shift = 13; - exponent -= 0x70; + uint32_t shift; + if (exponent > f32_exponent_bias - f16_exponent_bias) { + shift = f32_num_mantissa_bits - f16_num_mantissa_bits; + exponent -= f32_exponent_bias - f16_exponent_bias; } else { - shift = 0x7e - exponent; + shift = (f32_exponent_bias - 1) - exponent; exponent = 0; - mantissa |= 0x800000; + mantissa |= + (1u + << f32_num_mantissa_bits); // Bump the least significant exponent bit } // Round to zero. mantissa >>= shift; - ret = static_cast(sign | (exponent << 10) | mantissa); - - return ret; + return static_cast( + sign | (exponent << f16_num_mantissa_bits) | mantissa); } -static inline float cpu_half2float(float16 h) { - unsigned sign = ((h >> 15) & 1); - unsigned exponent = ((h >> 10) & 0x1f); - unsigned mantissa = ((h & 0x3ff) << 13); - - if (exponent == 0x1f) { /* NaN or Inf */ - mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0); - exponent = 0xff; - } else if (!exponent) { /* Denorm or Zero */ +// Converts a 16-bit unsigned integer representation of a IEEE754 half-precision +// float into an IEEE754 32-bit single-precision float +static inline float cpu_half2float(const float16 h) { + // Get sign and exponent alone by themselves + uint32_t sign_bit = (h >> f16_num_non_sign_bits) & 1; + uint32_t exponent = (h >> f16_num_mantissa_bits) & f16_exponent_mask; + // Shift mantissa so that it fills the most significant bits of a float32 + uint32_t mantissa = (h & f16_mantissa_mask) + << (f32_num_mantissa_bits - f16_num_mantissa_bits); + + if (exponent == f16_exponent_mask) { // NaN or Inf if (mantissa) { - unsigned int msb; - exponent = 0x71; + mantissa = f32_mantissa_mask; + sign_bit = 0; + } + exponent = f32_exponent_mask; + } else if (!exponent) { // Denorm or Zero + if (mantissa) { + uint32_t msb; + exponent = f32_exponent_bias - f16_exponent_bias + 1; do { - msb = (mantissa & 0x400000); - mantissa <<= 1; /* normalize */ + msb = mantissa & f32_most_significant_bit; + mantissa <<= 1; // normalize --exponent; } while (!msb); - mantissa &= 0x7fffff; /* 1.mantissa is implicit */ + mantissa &= f32_mantissa_mask; // 1.mantissa is implicit } } else { - exponent += 0x70; + exponent += f32_exponent_bias - f16_exponent_bias; } - unsigned i = ((sign << 31) | (exponent << 23) | mantissa); + const uint32_t i = (sign_bit << f32_num_non_sign_bits) | + (exponent << f32_num_mantissa_bits) | mantissa; + float ret; - memcpy(&ret, &i, sizeof(i)); + std::memcpy(&ret, &i, sizeof(float)); return ret; } @@ -161,14 +200,14 @@ static inline float cpu_bf162float(bfloat16 src) { float ret; uint32_t val_fp32 = static_cast(reinterpret_cast(&src)[0]) << 16; - memcpy(&ret, &val_fp32, sizeof(ret)); + memcpy(&ret, &val_fp32, sizeof(float)); return ret; } static inline bfloat16 cpu_float2bfloat16(float src) { uint32_t temp; - memcpy(&temp, &src, sizeof(temp)); - return (temp + (1 << 15)) >> 16; + memcpy(&temp, &src, sizeof(uint32_t)); + return (temp + (1u << 15)) >> 16; } } // namespace fbgemm diff --git a/include/fbgemm/UtilsAvx2.h b/include/fbgemm/UtilsAvx2.h index a1af6078a8..4fb1220eba 100644 --- a/include/fbgemm/UtilsAvx2.h +++ 
b/include/fbgemm/UtilsAvx2.h @@ -8,6 +8,7 @@ // This file defines common utilities used in code compiled with avx2/avx512 // flags. +#include #include namespace fbgemm { diff --git a/src/InlineAsmDefines.h b/src/InlineAsmDefines.h index 80612536b7..fa3f706602 100644 --- a/src/InlineAsmDefines.h +++ b/src/InlineAsmDefines.h @@ -10,13 +10,14 @@ // We need to do a hack in inline assembly in some clang versions where we have // to do `.intel_syntax noprefix`. This was fixed in clang in // https://reviews.llvm.org/D113707, which made it into clang-14, but not in -// Apple's clang-14 that ships with Xcode 14. +// Apple's clang-14 that ships with Xcode 14.2. It was first fixed in Xcode 14.3 +// where the clang version is 14.0.3. #if defined(__clang__) #if ( \ defined(__apple_build_version__) || \ (defined(__has_builtin) && __has_builtin(__builtin_pika_xxhash64))) && \ - (__clang_major__ < 15) + (__clang_major__ < 15 && __clang_minor__ == 0 && __clang_patchlevel__ < 3) #define FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK 1 #elif (__clang_major__ < 14) #define FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK 1 diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index a30735354a..1e996256bf 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,4 +1,12 @@ -cmake_minimum_required(VERSION 3.5 FATAL_ERROR) +cmake_minimum_required(VERSION 3.16 FATAL_ERROR) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_VISIBILITY_PRESET hidden) +set(CMAKE_C_STANDARD 11) +set(CMAKE_C_EXTENSIONS OFF) +set(CMAKE_C_STANDARD_REQUIRED ON) if(FBGEMM_BUILD_TESTS AND NOT TARGET gtest) #Download Googletest framework from github if @@ -38,12 +46,9 @@ macro(add_gtest TESTNAME) EmbeddingSpMDMTestUtils.cc QuantizationHelpers.cc TestUtils.cc) - set_target_properties(${TESTNAME} PROPERTIES - CXX_STANDARD 11 - CXX_EXTENSIONS NO) - #To compile test files with AVX2 turned on - #For static build, defining FBGEMM_STATIC to avoid generating - #functions with _dllimport attributes. + # To compile test files with AVX2 turned on + # For static build, defining FBGEMM_STATIC to avoid generating + # functions with _dllimport attributes. if(MSVC) target_compile_options(${TESTNAME} PRIVATE "/arch:AVX2" "/wd4244" "/wd4267" "/wd4305" "/wd4309") diff --git a/third_party/asmjit.BUILD b/third_party/asmjit.BUILD index 71dc5c7e6c..c2764a97c4 100644 --- a/third_party/asmjit.BUILD +++ b/third_party/asmjit.BUILD @@ -16,9 +16,7 @@ cc_library( copts = [ "-DASMJIT_STATIC", "-fno-tree-vectorize", - "-std=c++17", "-fmerge-all-constants", - "-std=gnu++11", "-DTH_BLAS_MKL", ], includes = [ diff --git a/third_party/hipify_torch b/third_party/hipify_torch index 1840658c18..23f53b025b 160000 --- a/third_party/hipify_torch +++ b/third_party/hipify_torch @@ -1 +1 @@ -Subproject commit 1840658c184f3eeba787dae0f06c45756c1daaf5 +Subproject commit 23f53b025b466d8ec3c45d52290d3442f7fbe6b1