diff --git a/.bazelrc b/.bazelrc new file mode 100644 index 0000000000..1e5dbcfcb7 --- /dev/null +++ b/.bazelrc @@ -0,0 +1,48 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +################################################################################ +# FBGEMM Bazel configuration file +# +# Based on MozoLM build options: +# https://github.com/google-research/mozolm/blob/main/.bazelrc +# +# Documentation for Bazel configuration options can be found in: +# https://bazel.build/reference/command-line-reference +################################################################################ + +# Automatically picks up host-OS-specific config lines from bazelrc files +# Enabling this is equivalent to auto-calling --config=linux on Linux, --config=windows, etc +build --enable_platform_specific_config + +# Print logs for all tests +test --test_output=all + +# Build with verbose logging +build --verbose_explanations --verbose_failures +test --verbose_explanations --verbose_failures + +# Build with optimization mode turned on +build --compilation_mode opt +test --compilation_mode opt + +# Build FBGEMM with C17 and C++17 +build:linux --cxxopt=-std=c++17 +build:linux --host_cxxopt=-std=c++17 +build:linux --conlyopt=-std=c17 +build:linux --host_conlyopt=-std=c17 +build:macos --cxxopt=-std=c++17 +build:macos --host_cxxopt=-std=c++17 +build:macos --conlyopt=-std=c17 +build:macos --host_conlyopt=-std=c17 +build:windows --cxxopt=/std:c++17 +build:windows --host_cxxopt=/std:c++17 +build:windows --conlyopt=/std:c17 +build:windows --host_conlyopt=/std:c17 + +# Generation of `runfiles` directories on Windows has to be explicitly enabled. +# See https://github.com/bazelbuild/bazel/issues/8843. +build:windows --enable_runfiles +test:windows --enable_runfiles diff --git a/.github/scripts/setup_env.bash b/.github/scripts/setup_env.bash index 4f1c808598..9cf928883c 100755 --- a/.github/scripts/setup_env.bash +++ b/.github/scripts/setup_env.bash @@ -13,8 +13,13 @@ print_exec () { echo "+ $*" echo "" - "$@" + if "$@"; then + local retcode=0 + else + local retcode=$? + fi echo "" + return $retcode } exec_with_retries () { @@ -205,10 +210,12 @@ run_python_test () { echo "################################################################################" fi - if conda run -n "${env_name}" python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then + if print_exec conda run -n "${env_name}" python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then echo "[TEST] Python test suite PASSED: ${python_test_file}" + echo "" else echo "[TEST] Python test suite FAILED: ${python_test_file}" + echo "" return 1 fi } @@ -254,27 +261,32 @@ print_gpu_info () { echo "[CHECK] NVIDIA driver is required, but does not appear to have been installed. This will cause FBGEMM_GPU installation to fail!" 
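As a rough illustration of the revised print_exec above (command echoed, output printed, and the wrapped command's exit status now propagated), this is what lets run_python_test keep branching on the pytest result; the test file name here is illustrative only:

    . .github/scripts/setup_env.bash
    # print_exec echoes the command, runs it, and returns its exit code
    if print_exec conda run -n build_env python -m pytest -v illustrative_test.py; then
      echo "[TEST] Python test suite PASSED"
    else
      echo "[TEST] Python test suite FAILED"
    fi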
return 1 fi - else if which nvidia-smi; then # If nvidia-smi is installed on a machine without GPUs, this will return error (print_exec nvidia-smi) || true + else + echo "[CHECK] nvidia-smi not found" fi fi -} -print_system_info () { - echo "################################################################################" - echo "# Print System Info" - echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" - echo "################################################################################" - echo "" - - echo "################################################################################" - echo "[INFO] Printing environment variables ..." - print_exec printenv + if [[ "${ENFORCE_AMD_GPU}" ]]; then + # Ensure that rocm-smi is available and returns GPU entries + if ! rocm-smi; then + echo "[CHECK] AMD driver is required, but does not appear to have been installed. This will cause FBGEMM_GPU installation to fail!" + return 1 + fi + else + if which rocm-smi; then + # If rocm-smi is installed on a machine without GPUs, this will return error + (print_exec rocm-smi) || true + else + echo "[CHECK] rocm-smi not found" + fi + fi +} +__print_system_info_linux () { echo "################################################################################" echo "[INFO] Check ldd version ..." print_exec ldd --version @@ -291,6 +303,36 @@ print_system_info () { print_exec cat /etc/os-release } +__print_system_info_macos () { + echo "################################################################################" + echo "[INFO] Check CPU info ..." + sysctl -a | grep machdep.cpu + + echo "################################################################################" + echo "[INFO] Check MacOS version info ..." + print_exec uname -a + print_exec sw_vers +} + +print_system_info () { + echo "################################################################################" + echo "# Print System Info" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + + echo "################################################################################" + echo "[INFO] Printing environment variables ..." 
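A minimal usage sketch for the GPU checks above, mirroring how the ROCm CI job later in this patch drives them (ENFORCE_AMD_GPU comes from that job's env; on a CPU-only host the call just logs that nvidia-smi / rocm-smi were not found):

    . .github/scripts/setup_env.bash
    # Hard-fail if AMD drivers are expected but missing
    ENFORCE_AMD_GPU=1 print_gpu_info
    # On a CPU-only machine this only prints the "not found" notices
    print_gpu_info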
+ print_exec printenv + + if [[ $OSTYPE == 'darwin'* ]]; then + __print_system_info_macos + else + __print_system_info_linux + fi +} + print_ec2_info () { echo "################################################################################" echo "# Print EC2 Instance Info" @@ -311,11 +353,73 @@ print_ec2_info () { echo "instance-type: $(get_ec2_metadata instance-type)" } +print_glibc_info () { + local library_path="$1" + if [ "$library_path" == "" ]; then + echo "Usage: ${FUNCNAME[0]} LIBRARY_PATH" + echo "Example(s):" + echo " ${FUNCNAME[0]} /usr/lib/x86_64-linux-gnu/libstdc++.so.6" + return 1 + fi + + if [ -f "${library_path}" ]; then + echo "[CHECK] Listing out the GLIBC versions referenced by: ${library_path}" + objdump -TC "${library_path}" | grep GLIBC_ | sed 's/.*GLIBC_\([.0-9]*\).*/GLIBC_\1/g' | sort -Vu | cat + echo "" + + echo "[CHECK] Listing out the GLIBCXX versions referenced by: ${library_path}" + objdump -TC "${library_path}" | grep GLIBCXX_ | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat + echo "" + + else + echo "[CHECK] No file at path: ${library_path}" + return 1 + fi +} + + +################################################################################ +# Bazel Setup Functions +################################################################################ + +setup_bazel () { + local bazel_version="${1:-6.1.1}" + echo "################################################################################" + echo "# Setup Bazel" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + + if [[ $OSTYPE == 'darwin'* ]]; then + # shellcheck disable=SC2155 + local bazel_variant="darwin-$(uname -m)" + else + local bazel_variant="linux-x86_64" + fi + + echo "[SETUP] Downloading installer Bazel ${bazel_version} (${bazel_variant}) ..." + print_exec wget -q "https://github.com/bazelbuild/bazel/releases/download/${bazel_version}/bazel-${bazel_version}-installer-${bazel_variant}.sh" -O install-bazel.sh + + echo "[SETUP] Installing Bazel ..." + print_exec bash install-bazel.sh + print_exec rm -f install-bazel.sh + + print_exec bazel --version + echo "[SETUP] Successfully set up Bazel" +} + ################################################################################ -# Environment Setup and Install Functions +# Miniconda Setup Functions ################################################################################ +__conda_cleanup () { + echo "[SETUP] Cleaning up Conda packages ..." + (print_exec conda clean --packages --tarball -y) || return 1 + (print_exec conda clean --all -y) || return 1 +} + setup_miniconda () { local miniconda_prefix="$1" if [ "$miniconda_prefix" == "" ]; then @@ -337,7 +441,7 @@ setup_miniconda () { print_exec mkdir -p "$miniconda_prefix" echo "[SETUP] Downloading the Miniconda installer ..." - print_exec wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh + (exec_with_retries wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh) || return 1 echo "[SETUP] Installing Miniconda ..." print_exec bash miniconda.sh -b -p "$miniconda_prefix" -u @@ -349,15 +453,25 @@ setup_miniconda () { print_exec . ~/.bashrc echo "[SETUP] Updating Miniconda base packages ..." 
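A hedged usage sketch for the helpers above; the arguments mirror the functions' own usage strings and the Bazel CI job added later in this patch (the library path is the example from print_glibc_info's help text):

    . .github/scripts/setup_env.bash
    setup_bazel              # defaults to Bazel 6.1.1 for the host platform
    setup_bazel 6.1.1        # or pin the version explicitly
    print_glibc_info /usr/lib/x86_64-linux-gnu/libstdc++.so.6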
- (exec_with_retries conda update -n base -c defaults -y conda) || return 1 + (exec_with_retries conda update -n base -c defaults --update-deps -y conda) || return 1 + + # Clean up packages + __conda_cleanup # Print Conda info print_exec conda info # These variables will be exported outside + echo "[SETUP] Exporting Miniconda variables ..." export PATH="${miniconda_prefix}/bin:${PATH}" export CONDA="${miniconda_prefix}" + if [ -f "${GITHUB_PATH}" ]; then + echo "[SETUP] Saving Miniconda variables to ${GITHUB_PATH} ..." + echo "${miniconda_prefix}/bin" >> "${GITHUB_PATH}" + echo "CONDA=${miniconda_prefix}" >> "${GITHUB_PATH}" + fi + echo "[SETUP] Successfully set up Miniconda at ${miniconda_prefix}" } @@ -398,17 +512,22 @@ create_conda_environment () { echo "[SETUP] Successfully created Conda environment: ${env_name}" } + +################################################################################ +# PyTorch Setup Functions +################################################################################ + install_pytorch_conda () { local env_name="$1" local pytorch_version="$2" - local pytorch_cpu="$3" + local pytorch_variant_type="$3" if [ "$pytorch_version" == "" ]; then echo "Usage: ${FUNCNAME[0]} ENV_NAME PYTORCH_VERSION [CPU]" echo "Example(s):" - echo " ${FUNCNAME[0]} build_env 1.11.0 # Install a specific version" - echo " ${FUNCNAME[0]} build_env latest # Install the latest stable release" - echo " ${FUNCNAME[0]} build_env test # Install the pre-release" - echo " ${FUNCNAME[0]} build_env nightly 1 # Install the CPU variant of the nightly" + echo " ${FUNCNAME[0]} build_env 1.11.0 # Install a specific version" + echo " ${FUNCNAME[0]} build_env latest # Install the latest stable release" + echo " ${FUNCNAME[0]} build_env test # Install the pre-release" + echo " ${FUNCNAME[0]} build_env nightly cpu # Install the CPU variant of the nightly" return 1 else echo "################################################################################" @@ -419,11 +538,11 @@ install_pytorch_conda () { echo "" fi - # Install cpuonly if needed - if [ "$pytorch_cpu" != "" ]; then - pytorch_cpu=1 + # Install the cpuonly package if needed + if [ "$pytorch_variant_type" == "cpu" ]; then local pytorch_package="cpuonly pytorch" else + pytorch_variant_type="cuda" local pytorch_package="pytorch" fi @@ -437,13 +556,25 @@ install_pytorch_conda () { local pytorch_channel="pytorch" fi + # Clean up packages before installation + __conda_cleanup + # Install PyTorch packages - echo "[INSTALL] Attempting to install '${pytorch_package}' (${pytorch_version}, CPU=${pytorch_cpu:-0}) through Conda using channel '${pytorch_channel}' ..." + # NOTE: Installation of large package might fail due to corrupt package download + # Use --force-reinstall to address this on retries - https://datascience.stackexchange.com/questions/41732/conda-verification-failed + echo "[INSTALL] Attempting to install '${pytorch_package}' (${pytorch_version}, variant = ${pytorch_variant_type}) through Conda using channel '${pytorch_channel}' ..." 
# shellcheck disable=SC2086 - (exec_with_retries conda install -n "${env_name}" -y ${pytorch_package} -c "${pytorch_channel}") || return 1 + (exec_with_retries conda install --force-reinstall -n "${env_name}" -y ${pytorch_package} -c "${pytorch_channel}") || return 1 + + # Check that PyTorch is importable + (test_python_import "${env_name}" torch.distributed) || return 1 + + # Print out the actual installed PyTorch version + installed_pytorch_version=$(conda run -n "${env_name}" python -c "import torch; print(torch.__version__)") + echo "[CHECK] NOTE: The installed version is: ${installed_pytorch_version}" # Run check for GPU variant - if [ "$pytorch_cpu" == "" ]; then + if [ "$pytorch_variant_type" == "cuda" ]; then # Ensure that the PyTorch build is the GPU variant (i.e. contains cuDNN reference) # This test usually applies to the PyTorch nightly builds if conda list -n "${env_name}" pytorch | grep cudnn; then @@ -462,13 +593,7 @@ install_pytorch_conda () { (test_filepath "${env_name}" cuda_cmake_macros.h) || return 1 fi - # Check that PyTorch is importable - (test_python_import "${env_name}" torch.distributed) || return 1 - - # Print out the actual installed PyTorch version - installed_pytorch_version=$(conda run -n "${env_name}" python -c "import torch; print(torch.__version__)") - echo "[INSTALL] Installed PyTorch through Conda" - echo "[INSTALL] NOTE: The installed version is: ${installed_pytorch_version}" + echo "[INSTALL] Successfully installed PyTorch through Conda" } install_pytorch_pip () { @@ -527,30 +652,53 @@ install_pytorch_pip () { # shellcheck disable=SC2086 (exec_with_retries conda run -n "${env_name}" pip install ${pytorch_package} --extra-index-url ${pytorch_channel}) || return 1 - if [ "$pytorch_variant_type" != "cpu" ]; then - if [ "$pytorch_variant_type" == "cuda" ]; then - # Ensure that the PyTorch-CUDA headers are properly installed - (test_filepath "${env_name}" cuda_cmake_macros.h) || return 1 - fi + # Check that PyTorch is importable + (test_python_import "${env_name}" torch.distributed) || return 1 + # Print out the actual installed PyTorch version + installed_pytorch_version=$(conda run -n "${env_name}" python -c "import torch; print(torch.__version__)") + echo "[CHECK] NOTE: The installed version is: ${installed_pytorch_version}" + + if [ "$pytorch_variant_type" != "cpu" ]; then # Ensure that the PyTorch build is of the correct variant # This test usually applies to the PyTorch nightly builds - if conda run -n build_binary pip list torch | grep torch | grep "${pytorch_variant}"; then + if conda run -n "${env_name}" pip list torch | grep torch | grep "${pytorch_variant}"; then echo "[CHECK] The installed PyTorch ${pytorch_version} is the correct variant (${pytorch_variant})" else echo "[CHECK] The installed PyTorch ${pytorch_version} appears to be an incorrect variant as it is missing references to ${pytorch_variant}!" - echo "[CHECK] This can happen if the variant of PyTorch (e.g. GPU, nightly) for the MAJOR.MINOR version of CUDA presently installed on the system has not been published yet." + echo "[CHECK] This can happen if the variant of PyTorch (e.g. GPU, nightly) for the MAJOR.MINOR version of CUDA or ROCm presently installed on the system is not available." 
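A rough sketch of the variant-aware PyTorch install entry points above, using invocations taken from the usage strings and the CI workflows in this patch:

    . .github/scripts/setup_env.bash
    install_pytorch_conda build_env nightly cpu          # Conda path, CPU variant
    install_pytorch_pip   build_env nightly cpu          # pip path, CPU variant
    install_pytorch_pip   build_env nightly rocm 5.4.2   # pip path, ROCm variant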
return 1 fi fi - # Check that PyTorch is importable - (test_python_import "${env_name}" torch.distributed) || return 1 + if [ "$pytorch_variant_type" == "cuda" ]; then + # Ensure that the PyTorch-CUDA headers are properly installed + (test_filepath "${env_name}" cuda_cmake_macros.h) || return 1 + fi - # Print out the actual installed PyTorch version - installed_pytorch_version=$(conda run -n "${env_name}" python -c "import torch; print(torch.__version__)") - echo "[INSTALL] Installed PyTorch through PIP" - echo "[INSTALL] NOTE: The installed version is: ${installed_pytorch_version}" + echo "[INSTALL] Successfully installed PyTorch through PIP" +} + + +################################################################################ +# CUDA Setup Functions +################################################################################ + +install_nvidia_drivers_centos () { + echo "################################################################################" + echo "# Install NVIDIA Drivers" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + + echo "[SETUP] Adding NVIDIA repos to yum ..." + print_exec sudo yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm + print_exec sudo yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo + print_exec sudo yum clean expire-cache + + echo "[SETUP] Installing NVIDIA drivers ..." + install_system_packages nvidia-driver-latest-dkms } install_cuda () { @@ -578,9 +726,12 @@ install_cuda () { return 1 fi + # Clean up packages before installation + __conda_cleanup + # Install CUDA packages echo "[INSTALL] Installing CUDA ${cuda_version} ..." 
- (exec_with_retries conda install -n "${env_name}" -y cuda -c "nvidia/label/cuda-${cuda_version}") || return 1 + (exec_with_retries conda install --force-reinstall -n "${env_name}" -y cuda -c "nvidia/label/cuda-${cuda_version}") || return 1 # Ensure that nvcc is properly installed (test_binpath "${env_name}" nvcc) || return 1 @@ -604,6 +755,86 @@ install_cuda () { echo "[INSTALL] Successfully installed CUDA ${cuda_version}" } +install_cudnn () { + local env_name="$1" + local install_path="$2" + local cuda_version="$3" + if [ "$cuda_version" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME INSTALL_PATH CUDA_VERSION" + echo "Example:" + echo " ${FUNCNAME[0]} build_env \$(pwd)/cudnn_install 11.7" + return 1 + else + echo "################################################################################" + echo "# Install cuDNN" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + fi + + # Install cuDNN manually + # Based on install script in https://github.com/pytorch/builder/blob/main/common/install_cuda.sh + local cudnn_packages=( + ["115"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz" + ["116"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz" + ["117"]="https://ossci-linux.s3.amazonaws.com/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz" + ["118"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz" + ) + + # Split version string by dot into array, i.e. 11.7.1 => [11, 7, 1] + # shellcheck disable=SC2206 + local cuda_version_arr=(${cuda_version//./ }) + # Fetch the major and minor version to concat + local cuda_concat_version="${cuda_version_arr[0]}${cuda_version_arr[1]}" + + # Get the URL + local cudnn_url="${cudnn_packages[cuda_concat_version]}" + if [ "$cudnn_url" == "" ]; then + # Default to cuDNN for 11.7 if no CUDA version fits + echo "[INSTALL] Defaulting to cuDNN for CUDA 11.7" + cudnn_url="${cudnn_packages[117]}" + fi + + # Clear the install path + rm -rf "$install_path" + mkdir -p "$install_path" + + # Create temporary directory + # shellcheck disable=SC2155 + local tmp_dir=$(mktemp -d) + cd "$tmp_dir" || return 1 + + # Download cuDNN + echo "[INSTALL] Downloading cuDNN to ${tmp_dir} ..." + (exec_with_retries wget -q "$cudnn_url" -O cudnn.tar.xz) || return 1 + + # Unpack the tarball + echo "[INSTALL] Unpacking cuDNN ..." + tar -xvf cudnn.tar.xz + + # Copy the includes and libs over to the install path + echo "[INSTALL] Moving cuDNN files to ${install_path} ..." + rm -rf "${install_path:?}/include" + rm -rf "${install_path:?}/lib" + mv cudnn-linux-*/include "$install_path" + mv cudnn-linux-*/lib "$install_path" + + # Delete the temporary directory + cd - || return 1 + rm -rf "$tmp_dir" + + # Export the environment variables to the Conda environment + echo "[INSTALL] Set environment variables CUDNN_INCLUDE_DIR and CUDNN_LIBRARY ..." 
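A small sketch of how the cuDNN helper above resolves its download URL: the CUDA version string is split on dots and the major/minor parts are concatenated into the key used to index cudnn_packages. The install call itself is the example from the function's usage string:

    cuda_version="11.7.1"
    # shellcheck disable=SC2206
    cuda_version_arr=(${cuda_version//./ })
    echo "${cuda_version_arr[0]}${cuda_version_arr[1]}"   # prints 117

    . .github/scripts/setup_env.bash
    install_cudnn build_env "$(pwd)/cudnn_install" 11.7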
+ print_exec conda env config vars set -n "${env_name}" CUDNN_INCLUDE_DIR="${install_path}/include" CUDNN_LIBRARY="${install_path}/lib" + + echo "[INSTALL] Successfully installed cuDNN (for CUDA ${cuda_version})" +} + +################################################################################ +# ROCm Setup Functions +################################################################################ + install_rocm_ubuntu () { local env_name="$1" local rocm_version="$2" @@ -652,15 +883,25 @@ install_rocm_ubuntu () { (exec_with_retries amdgpu-install -y --usecase=hiplibsdk,rocm --no-dkms) || return 1 echo "[INSTALL] Installing HIP-relevant packages ..." - install_system_packages mesa-common-dev clang comgr libopenblas-dev jp intel-mkl-full locales libnuma-dev install_system_packages hipify-clang miopen-hip miopen-hip-dev + # There is no need to install these packages for ROCm + # install_system_packages mesa-common-dev clang comgr libopenblas-dev jp intel-mkl-full locales libnuma-dev + echo "[INSTALL] Cleaning up ..." print_exec rm -f "${package_name}" + echo "[INFO] Check ROCM GPU info ..." + print_exec rocm-smi + echo "[INSTALL] Successfully installed ROCm ${rocm_version}" } + +################################################################################ +# Build Tools Setup Functions +################################################################################ + install_cxx_compiler () { local env_name="$1" local use_system_package_manager="$2" @@ -684,15 +925,19 @@ install_cxx_compiler () { install_system_packages gcc gcc-c++ else - # Install gxx_linux-64 from main instead of cxx-compiler from conda-forge, as - # the latter breaks builds: + # Install gxx_linux-64 from conda-forge instead of from anaconda channel. + # sysroot_linux-64 needs to be installed alongside this: + # # https://root-forum.cern.ch/t/error-timespec-get-has-not-been-declared-with-conda-root-package/45712/6 + # https://github.com/conda-forge/conda-forge.github.io/issues/1625 + # https://conda-forge.org/docs/maintainer/knowledge_base.html#using-centos-7 + # https://github.com/conda/conda-build/issues/4371 # - # NOTE: Install g++ 9.x instead of 11.x becaue 11.x builds libraries with - # references to GLIBCXX_3.4.29, which is not available on systems with older + # NOTE: We install g++ 10.x instead of 11.x becaue 11.x builds binaries that + # reference GLIBCXX_3.4.29, which may not be available on systems with older # versions of libstdc++.so.6 such as CentOS Stream 8 and Ubuntu 20.04 echo "[INSTALL] Installing C/C++ compilers through Conda ..." - (exec_with_retries conda install -n "${env_name}" -y gxx_linux-64=9.3.0) || return 1 + (exec_with_retries conda install -n "${env_name}" -y gxx_linux-64=10.4.0 sysroot_linux-64=2.17 -c conda-forge) || return 1 # The compilers are visible in the PATH as `x86_64-conda-linux-gnu-cc` and # `x86_64-conda-linux-gnu-c++`, so symlinks will need to be created @@ -716,6 +961,15 @@ install_cxx_compiler () { # Print out the C++ version print_exec conda run -n "${env_name}" c++ --version + + # https://stackoverflow.com/questions/2324658/how-to-determine-the-version-of-the-c-standard-used-by-the-compiler + echo "[INSTALL] Printing the default version of the C++ standard used by the compiler ..." + print_exec conda run -n "${env_name}" c++ -x c++ /dev/null -E -dM | grep __cplusplus + + # https://stackoverflow.com/questions/4991707/how-to-find-my-current-compilers-standard-like-if-it-is-c90-etc + echo "[INSTALL] Printing the default version of the C standard used by the compiler ..." 
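To make the GLIBCXX note above concrete, here is a hedged follow-up check one could run after installing the compilers; print_glibc_info is the helper added earlier in this patch, and the .so path is purely illustrative:

    . .github/scripts/setup_env.bash
    install_cxx_compiler build_env
    # Illustrative path: list the GLIBC/GLIBCXX versions a freshly built library references
    print_glibc_info fbgemm_gpu/fbgemm_gpu_py.so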
+ print_exec conda run -n "${env_name}" cc -dM -E - < /dev/null | grep __STDC_VERSION__ + echo "[INSTALL] Successfully installed C/C++ compilers" } @@ -759,83 +1013,32 @@ install_build_tools () { echo "[INSTALL] Successfully installed all the build tools" } -install_cudnn () { +install_docs_tools () { local env_name="$1" - local install_path="$2" - local cuda_version="$3" - if [ "$cuda_version" == "" ]; then - echo "Usage: ${FUNCNAME[0]} ENV_NAME INSTALL_PATH CUDA_VERSION" - echo "Example:" - echo " ${FUNCNAME[0]} build_env \$(pwd)/cudnn_install 11.7" + if [ "$env_name" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME" + echo "Example(s):" + echo " ${FUNCNAME[0]} build_env" return 1 else echo "################################################################################" - echo "# Install cuDNN" + echo "# Install Documentation Tools" echo "#" echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" echo "################################################################################" echo "" fi - # Install cuDNN manually - # Based on install script in https://github.com/pytorch/builder/blob/main/common/install_cuda.sh - local cudnn_packages=( - ["115"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz" - ["116"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz" - ["117"]="https://ossci-linux.s3.amazonaws.com/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz" - ["118"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz" - ) - - # Split version string by dot into array, i.e. 11.7.1 => [11, 7, 1] - # shellcheck disable=SC2206 - local cuda_version_arr=(${cuda_version//./ }) - # Fetch the major and minor version to concat - local cuda_concat_version="${cuda_version_arr[0]}${cuda_version_arr[1]}" - - # Get the URL - local cudnn_url="${cudnn_packages[cuda_concat_version]}" - if [ "$cudnn_url" == "" ]; then - # Default to cuDNN for 11.7 if no CUDA version fits - echo "[INSTALL] Defaulting to cuDNN for CUDA 11.7" - cudnn_url="${cudnn_packages[117]}" - fi - - # Clear the install path - rm -rf "$install_path" - mkdir -p "$install_path" - - # Create temporary directory - # shellcheck disable=SC2155 - local tmp_dir=$(mktemp -d) - cd "$tmp_dir" || return 1 - - # Download cuDNN - echo "[INSTALL] Downloading cuDNN to ${tmp_dir} ..." - (exec_with_retries wget -q "$cudnn_url" -O cudnn.tar.xz) || return 1 - - # Unpack the tarball - echo "[INSTALL] Unpacking cuDNN ..." - tar -xvf cudnn.tar.xz + echo "[INSTALL] Installing docs tools ..." + (exec_with_retries conda install -n "${env_name}" -c conda-forge -y \ + doxygen) || return 1 - # Copy the includes and libs over to the install path - echo "[INSTALL] Moving cuDNN files to ${install_path} ..." - rm -rf "${install_path:?}/include" - rm -rf "${install_path:?}/lib" - mv cudnn-linux-*/include "$install_path" - mv cudnn-linux-*/lib "$install_path" - - # Delete the temporary directory - cd - || return 1 - rm -rf "$tmp_dir" - - # Export the environment variables to the Conda environment - echo "[INSTALL] Set environment variables CUDNN_INCLUDE_DIR and CUDNN_LIBRARY ..." 
-  print_exec conda env config vars set -n "${env_name}" CUDNN_INCLUDE_DIR="${install_path}/include" CUDNN_LIBRARY="${install_path}/lib"
+  # Check binaries are visible in the PATH
+  (test_binpath "${env_name}" doxygen) || return 1
 
-  echo "[INSTALL] Successfully installed cuDNN (for CUDA ${cuda_version})"
+  echo "[INSTALL] Successfully installed all the docs tools"
 }
 
-
 ################################################################################
 # Combination Functions
 ################################################################################
@@ -866,7 +1069,7 @@ create_conda_pytorch_environment () {
 
   if [ "${cuda_version}" == "" ]; then
     # Install the CPU variant of PyTorch
-    install_pytorch_conda "${env_name}" "${pytorch_version}" 1
+    install_pytorch_conda "${env_name}" "${pytorch_version}" cpu
   else
     # Install CUDA and the GPU variant of PyTorch
     install_cuda "${env_name}" "${cuda_version}"
@@ -876,7 +1079,7 @@
 
 ################################################################################
-# Build Functions
+# FBGEMM_GPU Build Functions
 ################################################################################
 
 prepare_fbgemm_gpu_build () {
@@ -895,6 +1098,11 @@ prepare_fbgemm_gpu_build () {
     echo ""
   fi
 
+  if [[ "${GITHUB_WORKSPACE}" ]]; then
+    # https://github.com/actions/checkout/issues/841
+    git config --global --add safe.directory "${GITHUB_WORKSPACE}"
+  fi
+
   echo "[BUILD] Running git submodules update ..."
   git submodule sync
   git submodule update --init --recursive
@@ -908,6 +1116,103 @@
   echo "[BUILD] Successfully ran git submodules update"
 }
 
+__configure_fbgemm_gpu_build_cpu () {
+  # Update the package name and build args depending on if CUDA is specified
+  echo "[BUILD] Setting CPU-only build args ..."
+  build_args=(--cpu_only)
+}
+
+__configure_fbgemm_gpu_build_rocm () {
+  local fbgemm_variant_targets="$1"
+
+  # Fetch available ROCm architectures on the machine
+  if [ "$fbgemm_variant_targets" != "" ]; then
+    echo "[BUILD] ROCm targets have been manually provided: ${fbgemm_variant_targets}"
+    local arch_list="${fbgemm_variant_targets}"
+  else
+    if which rocminfo; then
+      # shellcheck disable=SC2155
+      local arch_list=$(rocminfo | grep -o -m 1 'gfx.*')
+      echo "[BUILD] Architectures list from rocminfo: ${arch_list}"
+
+      if [ "$arch_list" == "" ]; then
+        # By default, build for MI250 only to save time
+        local arch_list=gfx90a
+      fi
+    else
+      echo "[BUILD] rocminfo not found in PATH!"
+    fi
+  fi
+
+  echo "[BUILD] Setting the following ROCm targets: ${arch_list}"
+  print_exec conda env config vars set -n "${env_name}" PYTORCH_ROCM_ARCH="${arch_list}"
+
+  echo "[BUILD] Setting ROCm build args ..."
+  build_args=()
+}
+
+__configure_fbgemm_gpu_build_cuda () {
+  local fbgemm_variant_targets="$1"
+
+  # Check nvcc is visible
+  (test_binpath "${env_name}" nvcc) || return 1
+
+  # Check that cuDNN environment variables are available
+  (test_env_var "${env_name}" CUDNN_INCLUDE_DIR) || return 1
+  (test_env_var "${env_name}" CUDNN_LIBRARY) || return 1
+  (test_env_var "${env_name}" NVML_LIB_PATH) || return 1
+
+  local arch_list="${fbgemm_variant_targets:-7.0;8.0}"
+  echo "[BUILD] Setting the following CUDA targets: ${arch_list}"
+
+  # Build only CUDA 7.0 and 8.0 (i.e. V100 and A100) because of 100 MB binary size limits from PyPI.
+  echo "[BUILD] Setting CUDA build args ..."
+ # shellcheck disable=SC2155 + local nvml_lib_path=$(conda run -n "${env_name}" printenv NVML_LIB_PATH) + build_args=( + --nvml_lib_path="${nvml_lib_path}" + -DTORCH_CUDA_ARCH_LIST="'${arch_list}'" + ) +} + +__configure_fbgemm_gpu_build () { + local fbgemm_variant="$1" + local fbgemm_variant_targets="$2" + if [ "$fbgemm_variant" == "" ]; then + echo "Usage: ${FUNCNAME[0]} FBGEMM_VARIANT" + echo "Example(s):" + echo " ${FUNCNAME[0]} cpu # CPU-only variant" + echo " ${FUNCNAME[0]} cuda # CUDA variant for default target(s)" + echo " ${FUNCNAME[0]} cuda '7.0;8.0' # CUDA variant for custom target(s)" + echo " ${FUNCNAME[0]} rocm # ROCm variant for default target(s)" + echo " ${FUNCNAME[0]} rocm 'gfx906;gfx908;gfx90a' # ROCm variant for custom target(s)" + return 1 + else + echo "################################################################################" + echo "# Configure FBGEMM-GPU Build" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + fi + + if [ "$fbgemm_variant" == "cpu" ]; then + echo "[BUILD] Configuring build as CPU variant ..." + __configure_fbgemm_gpu_build_cpu + + elif [ "$fbgemm_variant" == "rocm" ]; then + echo "[BUILD] Configuring build as ROCm variant ..." + __configure_fbgemm_gpu_build_rocm "${fbgemm_variant_targets}" + + else + echo "[BUILD] Configuring build as CUDA variant (this is the default behavior) ..." + __configure_fbgemm_gpu_build_cuda "${fbgemm_variant_targets}" + fi + + # shellcheck disable=SC2145 + echo "[BUILD] FBGEMM_GPU build arguments have been set: ${build_args[@]}" +} + __build_fbgemm_gpu_common_pre_steps () { # Private function that uses variables instantiated by its caller @@ -918,38 +1223,12 @@ __build_fbgemm_gpu_common_pre_steps () { (test_binpath "${env_name}" g++) || return 1 if [ "$fbgemm_variant" == "cpu" ]; then - # Update the package name and build args depending on if CUDA is specified - echo "[BUILD] Applying CPU-only build args ..." - build_args=(--cpu_only) package_name="${package_name}-cpu" - elif [ "$fbgemm_variant" == "rocm" ]; then - (test_env_var "${env_name}" PYTORCH_ROCM_ARCH) || return 1 - - echo "[BUILD] Applying ROCm build args ..." - build_args=() package_name="${package_name}-rocm" - else # Set to the default variant - fbgemm_variant="gpu" - - # Check nvcc is visible - (test_binpath "${env_name}" nvcc) || return 1 - - # Check that cuDNN environment variables are available - (test_env_var "${env_name}" CUDNN_INCLUDE_DIR) || return 1 - (test_env_var "${env_name}" CUDNN_LIBRARY) || return 1 - (test_env_var "${env_name}" NVML_LIB_PATH) || return 1 - - # Build only CUDA 7.0 and 8.0 (i.e. V100 and A100) because of 100 MB binary size limits from PyPI. - echo "[BUILD] Applying GPU build args ..." 
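A hedged sketch of the new configure step above, using the variant/target combinations from its own usage text; env_name is assumed to be set by the caller, as it is in the build functions below:

    env_name=build_env
    __configure_fbgemm_gpu_build cpu                          # sets build_args=(--cpu_only)
    __configure_fbgemm_gpu_build cuda '7.0;8.0'               # CUDA with explicit targets
    __configure_fbgemm_gpu_build rocm 'gfx906;gfx908;gfx90a'  # ROCm with explicit targets
    echo "${build_args[@]}"                                   # args later passed to setup.py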
- # shellcheck disable=SC2155 - local nvml_lib_path=$(conda run -n "${env_name}" printenv NVML_LIB_PATH) - build_args=( - --nvml_lib_path="${nvml_lib_path}" - -DTORCH_CUDA_ARCH_LIST='7.0;8.0' - ) + fbgemm_variant="cuda" fi # Extract the Python tag @@ -969,12 +1248,14 @@ __build_fbgemm_gpu_common_pre_steps () { print_exec git diff } -check_fbgemm_gpu_build () { +run_fbgemm_gpu_postbuild_checks () { local fbgemm_variant="$1" if [ "$fbgemm_variant" == "" ]; then echo "Usage: ${FUNCNAME[0]} FBGEMM_VARIANT" echo "Example(s):" echo " ${FUNCNAME[0]} cpu" + echo " ${FUNCNAME[0]} cuda" + echo " ${FUNCNAME[0]} rocm" return 1 fi @@ -995,7 +1276,13 @@ check_fbgemm_gpu_build () { ) # Add more symbols to check for if it's a non-CPU variant - if [ "${fbgemm_variant}" != "cpu" ]; then + if [ "${fbgemm_variant}" == "cuda" ]; then + lib_symbols_to_check+=( + fbgemm_gpu::asynchronous_inclusive_cumsum_gpu + fbgemm_gpu::merge_pooled_embeddings + ) + elif [ "${fbgemm_variant}" == "rocm" ]; then + # merge_pooled_embeddings is missing in ROCm builds bc it requires NVML lib_symbols_to_check+=( fbgemm_gpu::asynchronous_inclusive_cumsum_gpu fbgemm_gpu::merge_pooled_embeddings @@ -1004,7 +1291,7 @@ check_fbgemm_gpu_build () { for library in "${fbgemm_gpu_so_files[@]}"; do echo "[CHECK] Listing out the GLIBCXX versions referenced by the library: ${library}" - objdump -TC "${library}" | grep GLIBCXX | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat + print_glibc_info "${library}" echo "[CHECK] Verifying sample subset of symbols in the library ..." for symbol in "${lib_symbols_to_check[@]}"; do @@ -1019,27 +1306,32 @@ build_fbgemm_gpu_package () { env_name="$1" package_name="$2" fbgemm_variant="$3" - if [ "$package_name" == "" ]; then - echo "Usage: ${FUNCNAME[0]} ENV_NAME PACKAGE_NAME [CPU_ONLY]" + fbgemm_variant_targets="$4" + if [ "$fbgemm_variant" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME PACKAGE_NAME VARIANT [TARGETS]" echo "Example(s):" - echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly # Build the full wheel package" - echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly cpu # Build the CPU-only variant of the wheel package" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly cpu # CPU-only variant" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly cuda # CUDA variant for default target(s)" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly cuda '7.0;8.0' # CUDA variant for custom target(s)" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly rocm # ROCm variant for default target(s)" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly rocm 'gfx906;gfx908;gfx90a' # ROCm variant for custom target(s)" return 1 - else - echo "################################################################################" - echo "# Build FBGEMM-GPU Package (Wheel)" - echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" - echo "################################################################################" - echo "" fi - # Run all the common FBGEMM-GPU build pre-steps (set up variables) + # Set up and configure the build __build_fbgemm_gpu_common_pre_steps || return 1 + __configure_fbgemm_gpu_build "${fbgemm_variant}" "${fbgemm_variant_targets}" || return 1 + + echo "################################################################################" + echo "# Build FBGEMM-GPU Package (Wheel)" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" # manylinux1_x86_64 is specified for PyPI 
upload # Distribute Python extensions as wheels on Linux - echo "[BUILD] Building FBGEMM-GPU (VARIANT=${fbgemm_variant}) wheel ..." + echo "[BUILD] Building FBGEMM-GPU wheel (VARIANT=${fbgemm_variant}) ..." print_exec conda run -n "${env_name}" \ python setup.py bdist_wheel \ --package_name="${package_name}" \ @@ -1048,7 +1340,7 @@ build_fbgemm_gpu_package () { "${build_args[@]}" # Run checks on the built libraries - (check_fbgemm_gpu_build "${fbgemm_variant}") || return 1 + (run_fbgemm_gpu_postbuild_checks "${fbgemm_variant}") || return 1 echo "[BUILD] Enumerating the built wheels ..." print_exec ls -lth dist/*.whl @@ -1062,34 +1354,111 @@ build_fbgemm_gpu_package () { build_fbgemm_gpu_install () { env_name="$1" fbgemm_variant="$2" + fbgemm_variant_targets="$3" + if [ "$fbgemm_variant" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME VARIANT [TARGETS]" + echo "Example(s):" + echo " ${FUNCNAME[0]} build_env cpu # CPU-only variant" + echo " ${FUNCNAME[0]} build_env cuda # CUDA variant for default target(s)" + echo " ${FUNCNAME[0]} build_env cuda '7.0;8.0' # CUDA variant for custom target(s)" + echo " ${FUNCNAME[0]} build_env rocm # ROCm variant for default target(s)" + echo " ${FUNCNAME[0]} build_env rocm 'gfx906;gfx908;gfx90a' # ROCm variant for custom target(s)" + return 1 + fi + + # Set up and configure the build + __build_fbgemm_gpu_common_pre_steps || return 1 + __configure_fbgemm_gpu_build "${fbgemm_variant}" "${fbgemm_variant_targets}" || return 1 + + echo "################################################################################" + echo "# Build + Install FBGEMM-GPU Package" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + + # Parallelism may need to be limited to prevent the build from being + # canceled for going over ulimits + echo "[BUILD] Building + installing FBGEMM-GPU (VARIANT=${fbgemm_variant}) ..." + print_exec conda run -n "${env_name}" \ + python setup.py install "${build_args[@]}" + + # Run checks on the built libraries + (run_fbgemm_gpu_postbuild_checks "${fbgemm_variant}") || return 1 + + echo "[INSTALL] Checking imports ..." 
+ # Exit this directory to prevent import clashing, since there is an + # fbgemm_gpu/ subdirectory present + cd - || return 1 + (test_python_import "${env_name}" fbgemm_gpu) || return 1 + + echo "[BUILD] FBGEMM-GPU build + install completed" +} + +build_fbgemm_gpu_develop () { + env_name="$1" + fbgemm_variant="$2" + fbgemm_variant_targets="$3" + if [ "$fbgemm_variant" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME VARIANT [TARGETS]" + echo "Example(s):" + echo " ${FUNCNAME[0]} build_env cpu # CPU-only variant" + echo " ${FUNCNAME[0]} build_env cuda # CUDA variant for default target(s)" + echo " ${FUNCNAME[0]} build_env cuda '7.0;8.0' # CUDA variant for custom target(s)" + echo " ${FUNCNAME[0]} build_env rocm # ROCm variant for default target(s)" + echo " ${FUNCNAME[0]} build_env rocm 'gfx906;gfx908;gfx90a' # ROCm variant for custom target(s)" + return 1 + fi + + # Set up and configure the build + __build_fbgemm_gpu_common_pre_steps || return 1 + __configure_fbgemm_gpu_build "${fbgemm_variant}" "${fbgemm_variant_targets}" || return 1 + + echo "################################################################################" + echo "# Build + Install FBGEMM-GPU Package" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + + # Parallelism may need to be limited to prevent the build from being + # canceled for going over ulimits + echo "[BUILD] Building (develop) FBGEMM-GPU (VARIANT=${fbgemm_variant}) ..." + print_exec conda run -n "${env_name}" \ + python setup.py build develop "${build_args[@]}" + + # Run checks on the built libraries + (run_fbgemm_gpu_postbuild_checks "${fbgemm_variant}") || return 1 + + echo "[BUILD] FBGEMM-GPU build + develop completed" +} + +build_fbgemm_gpu_docs () { + env_name="$1" if [ "$env_name" == "" ]; then - echo "Usage: ${FUNCNAME[0]} ENV_NAME [CPU_ONLY]" + echo "Usage: ${FUNCNAME[0]} ENV_NAME" echo "Example(s):" - echo " ${FUNCNAME[0]} build_env # Build + install the package" - echo " ${FUNCNAME[0]} build_env cpu # Build + Install the CPU-only variant of the package" + echo " ${FUNCNAME[0]} build_env # Build the docs" return 1 else echo "################################################################################" - echo "# Build + Install FBGEMM-GPU Package" + echo "# Build FBGEMM-GPU Documentation" echo "#" echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" echo "################################################################################" echo "" fi - # Run all the common FBGEMM-GPU build pre-steps (set up variables) - __build_fbgemm_gpu_common_pre_steps + echo "[BUILD] Installing docs-build dependencies ..." + (exec_with_retries conda run -n "${env_name}" python -m pip install -r requirements.txt) || return 1 - # Parallelism may need to be limited to prevent the build from being - # canceled for going over ulimits - echo "[BUILD] Building and installing FBGEMM-GPU (VARIANT=${fbgemm_variant}) ..." - print_exec conda run -n "${env_name}" \ - python setup.py install "${build_args[@]}" + echo "[BUILD] Running Doxygen build ..." + (exec_with_retries conda run -n "${env_name}" doxygen Doxyfile.in) || return 1 - # Run checks on the built libraries - (check_fbgemm_gpu_build "${fbgemm_variant}") || return 1 + echo "[BUILD] Building HTML pages ..." 
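Putting the build entry points above together, a rough end-to-end sketch that mirrors the CI workflows later in this patch:

    . .github/scripts/setup_env.bash
    cd fbgemm_gpu
    prepare_fbgemm_gpu_build build_env
    build_fbgemm_gpu_install build_env cpu                               # CPU-only build + install
    build_fbgemm_gpu_develop build_env rocm gfx90a                       # editable ROCm build (MI250 target)
    build_fbgemm_gpu_package build_env fbgemm_gpu_nightly cuda '7.0;8.0' # CUDA wheel build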
+ (exec_with_retries conda run -n "${env_name}" make html) || return 1 - echo "[BUILD] FBGEMM-GPU build + install completed" + echo "[INSTALL] FBGEMM-GPU documentation build completed" } install_fbgemm_gpu_package () { @@ -1124,7 +1493,7 @@ install_fbgemm_gpu_package () { ################################################################################ -# Test Functions +# FBGEMM_GPU Test Functions ################################################################################ run_fbgemm_gpu_tests () { @@ -1133,7 +1502,7 @@ run_fbgemm_gpu_tests () { if [ "$env_name" == "" ]; then echo "Usage: ${FUNCNAME[0]} ENV_NAME [FBGEMM_VARIANT]" echo "Example(s):" - echo " ${FUNCNAME[0]} build_env # Run all tests applicable to GPU (Nvidia)" + echo " ${FUNCNAME[0]} build_env # Run all tests applicable to CUDA" echo " ${FUNCNAME[0]} build_env cpu # Run all tests applicable to CPU" echo " ${FUNCNAME[0]} build_env rocm # Run all tests applicable to ROCm" return 1 @@ -1165,7 +1534,10 @@ run_fbgemm_gpu_tests () { uvm_test.py ) elif [ "$fbgemm_variant" == "rocm" ]; then - local ignored_tests=() + # https://github.com/pytorch/FBGEMM/issues/1559 + local ignored_tests=( + batched_unary_embeddings_test.py + ) else local ignored_tests=() fi @@ -1197,7 +1569,7 @@ run_fbgemm_gpu_tests () { ################################################################################ -# Publish Functions +# FBGEMM_GPU Publish Functions ################################################################################ publish_to_pypi () { diff --git a/.github/workflows/fbgemm_ci.yml b/.github/workflows/fbgemm_ci.yml index f6bae56123..79561102af 100644 --- a/.github/workflows/fbgemm_ci.yml +++ b/.github/workflows/fbgemm_ci.yml @@ -13,186 +13,179 @@ on: branches: - main +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: - build-posix: - runs-on: ${{ matrix.os }} + build-linux: + runs-on: linux.12xlarge + container: + image: ${{ matrix.container-image }} + options: --user root + defaults: + run: + shell: bash + env: + PRELUDE: .github/scripts/setup_env.bash + BUILD_DIR: build_${{ matrix.library-type }} + DEBIAN_FRONTEND: noninteractive strategy: + fail-fast: false matrix: - os: [ ubuntu-latest, macos-latest ] + container-image: [ "ubuntu:20.04" ] + library-type: [ static, shared ] steps: - - uses: actions/checkout@v3 - - name: Checkout submodules - shell: bash + - name: Setup Build Container run: | - auth_header="$(git config --local --get http.https://github.com/.extraheader)" - git submodule sync --recursive - git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive --depth=1 + apt update -y + apt install -y binutils build-essential cmake git libblas-dev python3 sudo wget + git config --global --add safe.directory '*' - - name: Get CPU info on Ubuntu - if: contains(runner.os, 'linux') - run: | - cat /proc/cpuinfo + - name: Checkout the Repository + uses: actions/checkout@v3 + with: + submodules: true - - name: Get CPU info on macOS - if: contains(runner.os, 'macOs') - run: | - sysctl -a | grep machdep.cpu + - name: Display System Info + run: . 
$PRELUDE; print_system_info - - name: Get env vars - run: | - echo GITHUB_WORKFLOW = $GITHUB_WORKFLOW - echo HOME = $HOME - echo GITHUB_ACTION = $GITHUB_ACTION - echo GITHUB_ACTIONS = $GITHUB_ACTIONS - echo GITHUB_REPOSITORY = $GITHUB_REPOSITORY - echo GITHUB_EVENT_NAME = $GITHUB_EVENT_NAME - echo GITHUB_EVENT_PATH = $GITHUB_EVENT_PATH - echo GITHUB_WORKSPACE = $GITHUB_WORKSPACE - echo GITHUB_SHA = $GITHUB_SHA - echo GITHUB_REF = $GITHUB_REF - c++ --verbose - - - name: Build static FBGEMM lib + - name: Build FBGEMM Library (${{ matrix.library-type }}) run: | set -e - mkdir build_static - cd build_static - cmake -DUSE_SANITIZER=address -DFBGEMM_LIBRARY_TYPE=static .. - make + mkdir $BUILD_DIR; cd $BUILD_DIR + cmake --version + cmake -DUSE_SANITIZER=address -DFBGEMM_LIBRARY_TYPE=${{ matrix.library-type }} -DPYTHON_EXECUTABLE=/usr/bin/python3 .. + make -j VERBOSE=1 - - name: Test static FBGEMM lib - if: contains(runner.os, 'linux') # not run on macos-latest now due to supporting AVX2 + - name: Test FBGEMM Library (${{ matrix.library-type }}) run: | set -e - cd build_static + cd $BUILD_DIR ctest --rerun-failed --output-on-failure - - name: Build shared FBGEMM lib + + build-macos: + runs-on: ${{ matrix.os }} + defaults: + run: + shell: bash + env: + PRELUDE: .github/scripts/setup_env.bash + BUILD_DIR: build_${{ matrix.library-type }} + strategy: + fail-fast: false + matrix: + os: [ macos-latest ] + library-type: [ static, shared ] + + steps: + - name: Checkout the Repository + uses: actions/checkout@v3 + with: + submodules: true + + - name: Display System Info + run: . $PRELUDE; print_system_info + + # Build but skip tests due to lack of support for AVX2 + - name: Build FBGEMM Library (${{ matrix.library-type }}) run: | set -e - mkdir build_shared - cd build_shared - cmake -DUSE_SANITIZER=address -DFBGEMM_LIBRARY_TYPE=shared .. - make + mkdir $BUILD_DIR; cd $BUILD_DIR + cmake --version + cmake -DUSE_SANITIZER=address -DFBGEMM_LIBRARY_TYPE=${{ matrix.library-type }} .. + make -j VERBOSE=1 - - name: Test shared FBGEMM lib - if: contains(runner.os, 'linux') # not run on macos-latest now due to supporting AVX2 + + build-bazel: + runs-on: linux.12xlarge + container: + image: ${{ matrix.container-image }} + options: --user root + defaults: + run: + shell: bash + env: + PRELUDE: .github/scripts/setup_env.bash + DEBIAN_FRONTEND: noninteractive + strategy: + fail-fast: false + matrix: + container-image: [ "ubuntu:20.04" ] + + steps: + - name: Setup Build Container run: | - set -e - cd build_shared - ctest --rerun-failed --output-on-failure + apt update -y + apt install -y binutils build-essential cmake git libblas-dev python3 sudo unzip wget + git config --global --add safe.directory '*' + + - name: Checkout the Repository + uses: actions/checkout@v3 + with: + submodules: true + + - name: Display System Info + run: . $PRELUDE; print_system_info + + - name: Download bazel + run: . 
$PRELUDE; setup_bazel + + - name: Build FBGEMM Library + run: bazel build -s :* + + - name: Test FBGEMM Library + run: bazel test -s :* + build-windows: runs-on: ${{ matrix.os }} + defaults: + run: + shell: cmd + env: + BUILD_DIR: build_${{ matrix.library-type }} strategy: + fail-fast: false matrix: - os: [windows-2019] + os: [ windows-2019 ] + library-type: [ static, shared ] steps: - - uses: actions/checkout@v3 - - name: Checkout submodules - shell: bash - run: | - auth_header="$(git config --local --get http.https://github.com/.extraheader)" - git submodule sync --recursive - git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive --depth=1 + - name: Checkout the Repository + uses: actions/checkout@v3 + with: + submodules: true - name: Get CPU info on Windows shell: cmd run: | wmic cpu list full - - name: Build static FBGEMM lib + - name: Build FBGEMM Library (${{ matrix.library-type }}) shell: cmd run: | call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 echo "INSTALL NINJA:" pip install ninja which ninja - mkdir build_static - cd build_static + mkdir %BUILD_DIR% + cd %BUILD_DIR% echo "STARTING CMAKE" - cmake -G Ninja -DFBGEMM_BUILD_BENCHMARKS=OFF -DFBGEMM_LIBRARY_TYPE=static -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_CXX_COMPILER="cl.exe" .. - ninja all + cmake --version + cmake -G Ninja -DFBGEMM_BUILD_BENCHMARKS=OFF -DFBGEMM_LIBRARY_TYPE=${{ matrix.library-type }} -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_CXX_COMPILER="cl.exe" .. + ninja -v all echo "Build Success" - - name: Test static FBGEMM lib + - name: Test FBGEMM Library (${{ matrix.library-type }}) shell: cmd run: | echo %cd% - cd build_static - ctest --rerun-failed --output-on-failure - if errorlevel 1 exit /b 1 - - - name: Build shared FBGEMM lib - shell: cmd - run: | - call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 - echo "INSTALL NINJA:" - pip install ninja - which ninja - mkdir build_shared - cd build_shared - echo "STARTING CMAKE" - cmake -G Ninja -DFBGEMM_BUILD_BENCHMARKS=OFF -DFBGEMM_LIBRARY_TYPE=shared -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_CXX_COMPILER="cl.exe" .. 
- ninja all - if errorlevel 1 exit /b 1 - - - name: Test shared FBGEMM lib - shell: cmd - run: | - echo %cd% - cd build_shared + cd %BUILD_DIR% set PATH=%PATH%;%cd%;%cd%\asmjit echo %PATH% ctest --rerun-failed --output-on-failure if errorlevel 1 exit /b 1 - - build-bazel: - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ ubuntu-latest ] - - steps: - - uses: actions/checkout@v3 - - name: Checkout submodules - shell: bash - run: | - auth_header="$(git config --local --get http.https://github.com/.extraheader)" - git submodule sync --recursive - git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive --depth=1 - - - name: Get env vars - run: | - echo GITHUB_WORKFLOW = $GITHUB_WORKFLOW - echo HOME = $HOME - echo GITHUB_ACTION = $GITHUB_ACTION - echo GITHUB_ACTIONS = $GITHUB_ACTIONS - echo GITHUB_REPOSITORY = $GITHUB_REPOSITORY - echo GITHUB_EVENT_NAME = $GITHUB_EVENT_NAME - echo GITHUB_EVENT_PATH = $GITHUB_EVENT_PATH - echo GITHUB_WORKSPACE = $GITHUB_WORKSPACE - echo GITHUB_SHA = $GITHUB_SHA - echo GITHUB_REF = $GITHUB_REF - c++ --verbose - - - name: Download bazel - run: | - set -e - wget https://github.com/bazelbuild/bazel/releases/download/2.2.0/bazel-2.2.0-linux-x86_64 -O bazel - # verify content - echo 'b2f002ea0e6194a181af6ac84cd94bd8dc797722eb2354690bebac92dda233ff bazel' | sha256sum --quiet -c - chmod +x bazel - - - - name: Build FBGEMM with bazel - run: | - set -e - ./bazel build --verbose_explanations --verbose_failures --compilation_mode opt :* - - - name: Test FBGEMM bazel build - run: | - set -e - ./bazel test --test_output=all --verbose_explanations --verbose_failures --compilation_mode opt :* diff --git a/.github/workflows/fbgemm_docs.yml b/.github/workflows/fbgemm_docs.yml deleted file mode 100644 index 06e2045a03..0000000000 --- a/.github/workflows/fbgemm_docs.yml +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -name: FBGEMM Documentation -on: - push: - branches: - - main -jobs: - build_docs_job: - runs-on: linux.2xlarge - steps: - # Checkout the repository to the GitHub Actions runner - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - # Update references - # TODO: update the git submodule sync after we fixed the auto-sync part - - name: Git Sumbodule Update - run: | - git submodule init - git submodule update --remote --recursive - git log - - name: Update pip - run: | - sudo yum update -y - sudo yum -y install git python3-pip - sudo pip3 install --upgrade pip - - name: Setup conda - run: | - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh - bash ~/miniconda.sh -b -p $HOME/miniconda -u - - name: setup Path - run: | - echo "/home/ec2-user/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=/home/ec2-user/miniconda" >> $GITHUB_PATH - - name: create conda env - run: | - conda create --name build_binary python=3.9 - conda info - - name: check python version - run: | - conda run -n build_binary python --version - - name: Install gcc - shell: bash - run: | - sudo yum group install -y "Development Tools" - - name: Setup Path - run: | - echo /usr/local/bin >> $GITHUB_PATH - - name: Install PyTorch - shell: bash - run: | - conda run -n build_binary python -m pip install --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html - - name: Test PyTorch Installation - run: | - conda run -n build_binary python -c "import torch.distributed" - echo "torch.distributed succeeded" - - name: Install fbgemm_gpu nightly - run: | - cd ./fbgemm_gpu - conda run -n build_binary python -m pip install -r requirements.txt - conda run -n build_binary python setup.py install --cpu_only - - name: Test fbgemm_gpu installation - shell: bash - run: | - cd ./fbgemm_gpu/docs - conda run -n build_binary \ - python -c "import fbgemm_gpu" - - name: Install Doxygen - run: | - conda install -n build_binary -c conda-forge doxygen - which doxygen - - name: Build the docset - run: | - cd ./fbgemm_gpu/docs - conda run -n build_binary python -m pip install -r requirements.txt - conda run -n build_binary doxygen Doxyfile.in - conda run -n build_binary make html - cd .. - - name: Get output time - run: echo "The time was ${{ steps.build.outputs.time }}" - - name: Deploy - uses: JamesIves/github-pages-deploy-action@releases/v3 - with: - ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }} - BRANCH: gh-pages # The branch the action should deploy to. - FOLDER: fbgemm_gpu/docs/build/html # The folder the action should deploy. 
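The docs workflow deleted above is effectively replaced by the scripted helpers added to setup_env.bash; a hedged sketch of the equivalent steps, assuming a Conda environment named build_env already exists and that the docs build is run from fbgemm_gpu/docs as the old workflow did:

    . .github/scripts/setup_env.bash
    install_docs_tools build_env        # installs doxygen from conda-forge
    cd fbgemm_gpu/docs
    build_fbgemm_gpu_docs build_env     # pip requirements, Doxygen, then make html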
diff --git a/.github/workflows/fbgemm_gpu_ci.yml b/.github/workflows/fbgemm_gpu_ci.yml index 8e021c4451..50e7c3814b 100644 --- a/.github/workflows/fbgemm_gpu_ci.yml +++ b/.github/workflows/fbgemm_gpu_ci.yml @@ -6,16 +6,33 @@ name: FBGEMM_GPU CI on: - push: + # PR Trigger + # + pull_request: branches: - main - pull_request: + + # Push Trigger (enable to catch errors coming out of multiple merges) + # + push: branches: - main + # Manual Trigger (for testing only) + # + workflow_dispatch: + +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: build_and_test_amd: - runs-on: ${{ matrix.os }} + runs-on: linux.12xlarge + container: + image: ${{ matrix.container-image }} + options: --user root defaults: run: shell: bash @@ -25,11 +42,17 @@ jobs: strategy: fail-fast: false matrix: - os: [ ubuntu-20.04 ] - python-version: [ "3.10" ] - rocm-version: [ "5.3" ] + container-image: [ "ubuntu:20.04" ] + python-version: [ "3.8", "3.9", "3.10" ] + rocm-version: [ "5.3", "5.4.2" ] steps: + - name: Setup Build Container + run: | + apt update -y + apt install -y binutils git sudo wget + git config --global --add safe.directory '*' + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -45,10 +68,7 @@ jobs: run: . $PRELUDE; free_disk_space - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -62,76 +82,85 @@ jobs: - name: Install PyTorch-ROCm Nightly run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly rocm ${{ matrix.rocm-version }} - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU-ROCM Nightly - run: | - . $PRELUDE - cd fbgemm_gpu - - # Build for MI250 only to save time. - print_exec conda env config vars set -n $BUILD_ENV PYTORCH_ROCM_ARCH=gfx90a - print_exec conda run -n $BUILD_ENV python setup.py build develop + run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_develop $BUILD_ENV rocm gfx90a - - name: Test FBGEMM_GPU-ROCM Nightly installation + - name: Test FBGEMM_GPU-ROCM Nightly Installation timeout-minutes: 10 run: . 
$PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm test_amd_gpu: - if: ${{ false }} # Disable the job for now runs-on: rocm + container: + image: "rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}-complete" + options: --user root --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size 16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined + defaults: + run: + shell: bash + env: + PRELUDE: .github/scripts/setup_env.bash + BUILD_ENV: build_binary + ENFORCE_AMD_GPU: 1 strategy: fail-fast: false matrix: - os: [ubuntu-latest] + # ROCm machines are limited, so we only test against Python 3.10 + python-version: [ "3.10" ] + rocm-version: [ "5.3", "5.4.2" ] steps: - - name: pre-checkout - shell: bash + - name: Setup Build Container run: | - if [ -d ${{ github.workspace }} ] - then - sudo chown -R $USER:$USER ${{ github.workspace }} - fi - sudo add-apt-repository ppa:git-core/ppa - sudo apt update - sudo apt -y install --only-upgrade git - - - uses: actions/checkout@v3 + apt update -y + apt install -y git wget + git config --global --add safe.directory '*' + + - name: Checkout the Repository + uses: actions/checkout@v3 with: - ref: ${{ github.ref }} - submodules: 'true' + submodules: true - - name: build fbgemm_gpu and test - shell: bash - run: | - set -eux - env - ls -l - DOCKER_IMAGE=rocm/pytorch:rocm5.4_ubuntu20.04_py3.8_pytorch_staging_base - docker pull $DOCKER_IMAGE - JENKINS_REPO_DIR=fbgemm-private-jenkins - JENKINS_REPO_DIR_BAREMETAL=$PWD - JENKINS_REPO_DIR_DOCKER=/workspace/$JENKINS_REPO_DIR - DOCKER_OPTIONS="\ - --user 0 \ - --network=host \ - --ipc=host \ - --shm-size 16G \ - --group-add video \ - --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --device=/dev/kfd \ - --device=/dev/dri \ - -v $JENKINS_REPO_DIR_BAREMETAL:$JENKINS_REPO_DIR_DOCKER - " - docker run $DOCKER_OPTIONS $DOCKER_IMAGE $JENKINS_REPO_DIR_DOCKER/.jenkins/rocm/build_and_test.sh $JENKINS_REPO_DIR_DOCKER + - name: Display System Info + run: . $PRELUDE; print_system_info + + - name: Display GPU Info + run: . $PRELUDE; print_gpu_info + + - name: Free Disk Space + run: . $PRELUDE; free_disk_space + + - name: Setup Miniconda + run: . $PRELUDE; setup_miniconda $HOME/miniconda + + - name: Create Conda Environment + run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} + + - name: Install Build Tools + run: . $PRELUDE; install_build_tools $BUILD_ENV + + - name: Install PyTorch-ROCm Nightly + run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly rocm ${{ matrix.rocm-version }} + + - name: Prepare FBGEMM_GPU Build + run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV + + - name: Build FBGEMM_GPU-ROCM Nightly + run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_develop $BUILD_ENV rocm + + - name: Test FBGEMM_GPU-ROCM Nightly Installation + timeout-minutes: 15 + run: . 
$PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm build_and_test_cpu: - runs-on: ${{ matrix.os }} + runs-on: linux.12xlarge + container: + image: ${{ matrix.container-image }} + options: --user root defaults: run: shell: bash @@ -141,10 +170,16 @@ jobs: strategy: fail-fast: false matrix: - os: [ ubuntu-20.04, ubuntu-latest ] + container-image: [ "ubuntu:20.04", "ubuntu:22.04" ] python-version: [ "3.8", "3.9", "3.10" ] steps: + - name: Setup Build Container + run: | + apt update -y + apt install -y binutils build-essential git sudo wget + git config --global --add safe.directory '*' + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -157,10 +192,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -171,12 +203,12 @@ jobs: - name: Install PyTorch run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cpu - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - - name: Build and Install FBGEMM_GPU (CPU version) + - name: Build + Install FBGEMM_GPU (CPU version) run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_install $BUILD_ENV cpu - - name: Test with PyTest + - name: Test FBGEMM_GPU-CPU Nightly Installation timeout-minutes: 10 run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu diff --git a/.github/workflows/fbgemm_nightly_build_cpu.yml b/.github/workflows/fbgemm_gpu_cpu_nightly.yml similarity index 77% rename from .github/workflows/fbgemm_nightly_build_cpu.yml rename to .github/workflows/fbgemm_gpu_cpu_nightly.yml index 72a0af01e7..8d1d39805f 100644 --- a/.github/workflows/fbgemm_nightly_build_cpu.yml +++ b/.github/workflows/fbgemm_gpu_cpu_nightly.yml @@ -30,24 +30,36 @@ on: # workflow_dispatch: +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + # https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: # Build on CPU hosts, run tests, and upload to GHA build_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.4xlarge + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash env: PRELUDE: .github/scripts/setup_env.bash BUILD_ENV: build_binary + continue-on-error: true strategy: # Don't fast-fail all the other builds if one of the them fails fail-fast: false matrix: - os: [ linux.4xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -60,10 +72,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -75,9 +84,9 @@ jobs: run: . 
$PRELUDE; install_build_tools $BUILD_ENV - name: Install PyTorch-CPU Nightly - run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly cpuonly + run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly cpu - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU Nightly (CPU version) @@ -92,7 +101,10 @@ jobs: # Download the built artifact from GHA, test on GPU, and push to PyPI test_and_publish_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.4xlarge + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash @@ -102,16 +114,23 @@ jobs: strategy: fail-fast: false matrix: - os: [ linux.4xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] needs: build_artifact steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: submodules: true + - name: Download Wheel Artifact from GHA + uses: actions/download-artifact@v3 + with: + name: fbgemm_gpu_nightly_cpu_${{ matrix.python-version }}.whl + - name: Display System Info run: . $PRELUDE; print_system_info; print_ec2_info @@ -119,29 +138,21 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} - name: Install PyTorch Nightly - run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly cpuonly + run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly cpu - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - - name: Download Wheel Artifact from GHA - uses: actions/download-artifact@v3 - with: - name: fbgemm_gpu_nightly_cpu_${{ matrix.python-version }}.whl - - name: Install FBGEMM_GPU Nightly (CPU version) run: | . $PRELUDE - ls . + pwd; ls -la . 
install_fbgemm_gpu_package $BUILD_ENV *.whl - name: Test with PyTest diff --git a/.github/workflows/fbgemm_release_build_cpu.yml b/.github/workflows/fbgemm_gpu_cpu_release.yml similarity index 78% rename from .github/workflows/fbgemm_release_build_cpu.yml rename to .github/workflows/fbgemm_gpu_cpu_release.yml index a652c89854..577f0b5e88 100644 --- a/.github/workflows/fbgemm_release_build_cpu.yml +++ b/.github/workflows/fbgemm_gpu_cpu_release.yml @@ -22,24 +22,35 @@ on: # workflow_dispatch: +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: # Build on CPU hosts, run tests, and upload to GHA build_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.4xlarge + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash env: PRELUDE: .github/scripts/setup_env.bash BUILD_ENV: build_binary + continue-on-error: true strategy: # Don't fast-fail all the other builds if one of the them fails fail-fast: false matrix: - os: [ linux.4xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -52,10 +63,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -67,9 +75,9 @@ jobs: run: . $PRELUDE; install_build_tools $BUILD_ENV - name: Install PyTorch-CPU Test - run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test cpuonly + run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test cpu - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU (CPU version) @@ -84,7 +92,10 @@ jobs: # Download the built artifact from GHA, test on GPU, and push to PyPI test_and_publish_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.4xlarge + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash @@ -94,16 +105,23 @@ jobs: strategy: fail-fast: false matrix: - os: [ linux.4xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] needs: build_artifact steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: submodules: true + - name: Download Wheel Artifact from GHA + uses: actions/download-artifact@v3 + with: + name: fbgemm_gpu_cpu_${{ matrix.python-version }}.whl + - name: Display System Info run: . $PRELUDE; print_system_info; print_ec2_info @@ -111,29 +129,21 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} - name: Install PyTorch Test - run: . 
$PRELUDE; install_pytorch_conda $BUILD_ENV test cpuonly + run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test cpu - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - - name: Download Wheel Artifact from GHA - uses: actions/download-artifact@v3 - with: - name: fbgemm_gpu_cpu_${{ matrix.python-version }}.whl - - name: Install FBGEMM_GPU (CPU version) run: | . $PRELUDE - ls . + pwd; ls -la . install_fbgemm_gpu_package $BUILD_ENV *.whl - name: Test with PyTest diff --git a/.github/workflows/fbgemm_nightly_build.yml b/.github/workflows/fbgemm_gpu_cuda_nightly.yml similarity index 72% rename from .github/workflows/fbgemm_nightly_build.yml rename to .github/workflows/fbgemm_gpu_cuda_nightly.yml index 4cdb10aaa8..c08d088991 100644 --- a/.github/workflows/fbgemm_nightly_build.yml +++ b/.github/workflows/fbgemm_gpu_cuda_nightly.yml @@ -3,7 +3,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -name: FBGEMM_GPU Nightly Build +name: FBGEMM_GPU-CUDA Nightly Build on: # PR Trigger (enabled only for debugging) @@ -30,25 +30,36 @@ on: # workflow_dispatch: +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: # Build on CPU hosts and upload to GHA build_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.24xlarge + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash env: PRELUDE: .github/scripts/setup_env.bash BUILD_ENV: build_binary + continue-on-error: true strategy: # Don't fast-fail all the other builds if one of the them fails fail-fast: false matrix: - os: [ linux.12xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] cuda-version: [ "11.7.1", "11.8.0" ] steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo tar wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -61,10 +72,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -78,17 +86,18 @@ jobs: - name: Install CUDA run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }} + # Install via PIP to avoid defaulting to the CPU variant if the GPU variant of the day is not ready - name: Install PyTorch Nightly - run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly + run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda ${{ matrix.cuda-version }} - name: Install cuDNN run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }} - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU Nightly - run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV fbgemm_gpu_nightly + run: . 
$PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV fbgemm_gpu_nightly cuda - name: Upload Built Wheel as GHA Artifact uses: actions/upload-artifact@v3 @@ -99,7 +108,10 @@ jobs: # Download the built artifact from GHA, test on GPU, and push to PyPI test_and_publish_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.g5.4xlarge.nvidia.gpu + container: + image: ${{ matrix.container-image }} + options: --user root --gpus all defaults: run: shell: bash @@ -110,19 +122,30 @@ jobs: strategy: fail-fast: false matrix: - os: [ linux.g5.4xlarge.nvidia.gpu ] - python-version: [ "3.8", "3.9", "3.10" ] + container-image: [ "nvidia/cuda:11.8.0-base-ubuntu20.04" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] cuda-version: [ "11.7.1", "11.8.0" ] # Specify exactly ONE CUDA version for artifact publish cuda-version-publish: [ "11.7.1" ] needs: build_artifact steps: + - name: Setup Build Container + run: | + apt update -y + apt install -y binutils curl git sudo wget + git config --global --add safe.directory '*' + - name: Checkout the Repository uses: actions/checkout@v3 with: submodules: true + - name: Download Wheel Artifact from GHA + uses: actions/download-artifact@v3 + with: + name: fbgemm_gpu_nightly_${{ matrix.python-version }}_cuda${{ matrix.cuda-version }}.whl + - name: Display System Info run: . $PRELUDE; print_system_info; print_ec2_info @@ -130,10 +153,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -141,21 +161,17 @@ jobs: - name: Install CUDA run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }} + # Install via PIP to avoid defaulting to the CPU variant if the GPU variant of the day is not ready - name: Install PyTorch Nightly - run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly + run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda ${{ matrix.cuda-version }} - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - - name: Download Wheel Artifact from GHA - uses: actions/download-artifact@v3 - with: - name: fbgemm_gpu_nightly_${{ matrix.python-version }}_cuda${{ matrix.cuda-version }}.whl - - name: Install FBGEMM_GPU Nightly run: | . $PRELUDE - ls . + pwd; ls -la . install_fbgemm_gpu_package $BUILD_ENV *.whl - name: Test with PyTest diff --git a/.github/workflows/fbgemm_release_build.yml b/.github/workflows/fbgemm_gpu_cuda_release.yml similarity index 77% rename from .github/workflows/fbgemm_release_build.yml rename to .github/workflows/fbgemm_gpu_cuda_release.yml index 5e3d369fe4..3a41125170 100644 --- a/.github/workflows/fbgemm_release_build.yml +++ b/.github/workflows/fbgemm_gpu_cuda_release.yml @@ -3,7 +3,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-name: FBGEMM_GPU Release Build +name: FBGEMM_GPU-CUDA Release Build on: # PR Trigger (enabled only for debugging) @@ -22,25 +22,36 @@ on: # workflow_dispatch: +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: # Build on CPU hosts and upload to GHA build_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.24xlarge + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash env: PRELUDE: .github/scripts/setup_env.bash BUILD_ENV: build_binary + continue-on-error: true strategy: # Don't fast-fail all the other builds if one of the them fails fail-fast: false matrix: - os: [ linux.12xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] cuda-version: [ "11.7.1", "11.8.0" ] steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo tar wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -53,10 +64,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -76,11 +84,11 @@ jobs: - name: Install cuDNN run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }} - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU - run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV fbgemm_gpu + run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV fbgemm_gpu cuda - name: Upload Built Wheel as GHA Artifact uses: actions/upload-artifact@v3 @@ -91,7 +99,10 @@ jobs: # Download the built artifact from GHA, test on GPU, and push to PyPI test_and_publish_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.g5.4xlarge.nvidia.gpu + container: + image: ${{ matrix.container-image }} + options: --user root --gpus all defaults: run: shell: bash @@ -102,18 +113,30 @@ jobs: strategy: fail-fast: false matrix: - os: [ linux.g5.4xlarge.nvidia.gpu ] - python-version: [ "3.8", "3.9", "3.10" ] + container-image: [ "nvidia/cuda:11.8.0-base-ubuntu20.04" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] cuda-version: [ "11.7.1", "11.8.0" ] # Specify exactly ONE CUDA version for artifact publish cuda-version-publish: [ "11.7.1" ] needs: build_artifact + steps: + - name: Setup Build Container + run: | + apt update -y + apt install -y binutils curl git sudo wget + git config --global --add safe.directory '*' + - name: Checkout the Repository uses: actions/checkout@v3 with: submodules: true + - name: Download Wheel Artifact from GHA + uses: actions/download-artifact@v3 + with: + name: fbgemm_gpu_${{ matrix.python-version }}_cuda${{ matrix.cuda-version }}.whl + - name: Display System Info run: . $PRELUDE; print_system_info; print_ec2_info @@ -121,10 +144,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . 
$PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -135,18 +155,13 @@ jobs: - name: Install PyTorch Test run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - - name: Download Wheel Artifact from GHA - uses: actions/download-artifact@v3 - with: - name: fbgemm_gpu_${{ matrix.python-version }}_cuda${{ matrix.cuda-version }}.whl - - name: Install FBGEMM_GPU run: | . $PRELUDE - ls . + pwd; ls -la . install_fbgemm_gpu_package $BUILD_ENV *.whl - name: Test with PyTest diff --git a/.github/workflows/fbgemm_gpu_docs.yml b/.github/workflows/fbgemm_gpu_docs.yml new file mode 100644 index 0000000000..fb63995752 --- /dev/null +++ b/.github/workflows/fbgemm_gpu_docs.yml @@ -0,0 +1,89 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +name: FBGEMM_GPU Documentation + +on: + # PR Trigger + # + pull_request: + branches: + - main + + # Push Trigger (enable to catch errors coming out of multiple merges) + # + push: + branches: + - main + + # Manual Trigger (for testing only) + # + workflow_dispatch: + +jobs: + build-docs: + runs-on: linux.2xlarge + container: + image: amazonlinux:2023 + options: --user root + defaults: + run: + shell: bash + env: + PRELUDE: .github/scripts/setup_env.bash + BUILD_ENV: build_binary + strategy: + fail-fast: false + matrix: + python-version: [ "3.11" ] + + steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git rsync sudo tar wget which + + - name: Checkout the Repository + uses: actions/checkout@v3 + with: + submodules: true + + - name: Display System Info + run: . $PRELUDE; print_system_info + + - name: Display GPU Info + run: . $PRELUDE; print_gpu_info + + - name: Setup Miniconda + run: . $PRELUDE; setup_miniconda $HOME/miniconda + + - name: Create Conda Environment + run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} + + - name: Install C/C++ Compilers + run: . $PRELUDE; install_cxx_compiler $BUILD_ENV + + - name: Install Build Tools + run: . $PRELUDE; install_build_tools $BUILD_ENV + + - name: Install Documentation Tools + run: . $PRELUDE; install_docs_tools $BUILD_ENV + + - name: Install PyTorch-CPU Nightly + run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cpu + + - name: Prepare FBGEMM_GPU Build + run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV + + - name: Build + Install FBGEMM_GPU (CPU version) + run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_install $BUILD_ENV cpu + + - name: Build FBGEMM_GPU Documentation + run: . 
$PRELUDE; cd fbgemm_gpu/docs; build_fbgemm_gpu_docs $BUILD_ENV + + - name: Deploy FBGEMM_GPU Documentation + if: ${{ github.event_name != 'pull_request' }} + uses: JamesIves/github-pages-deploy-action@releases/v4 + with: + branch: gh-pages # The branch the action should deploy to + folder: fbgemm_gpu/docs/build/html # The folder the action should deploy diff --git a/.github/workflows/fbgemm_gpu_lint.yml b/.github/workflows/fbgemm_gpu_lint.yml index dc2b6344ce..8a484e9844 100644 --- a/.github/workflows/fbgemm_gpu_lint.yml +++ b/.github/workflows/fbgemm_gpu_lint.yml @@ -6,20 +6,29 @@ name: FBGEMM_GPU Lint on: + # PR Trigger + # push: branches: - main + # Push Trigger (enable to catch errors coming out of multiple merges) + # pull_request: branches: - main +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: - run_pylint: + run-lint: runs-on: ubuntu-latest strategy: matrix: - python-version: [ "3.8" ] + python-version: [ "3.10" ] steps: - uses: actions/checkout@v3 @@ -33,7 +42,7 @@ jobs: python -m pip install --upgrade pip pip install click flake8 ufmt - - name: Analyzing the code with flake8 + - name: Analyzing the Code with flake8 run: | echo "::add-matcher::fbgemm_gpu/test/lint/flake8_problem_matcher.json" flake8 --ignore=E501,W503,E203 . @@ -41,13 +50,13 @@ jobs: # W503 = line break before binary operator (deprecated) # E203 = whitespace before ":" - - name: Analyzing the code with ufmt + - name: Analyzing the Code with ufmt run: | ufmt diff fbgemm_gpu/fbgemm_gpu ufmt diff fbgemm_gpu/test ufmt diff fbgemm_gpu/bench - - name: Check Meta copyright header + - name: Check Meta Copyright Header run: | python fbgemm_gpu/test/lint/check_meta_header.py --path=./fbgemm_gpu/fbgemm_gpu --fixit=False python fbgemm_gpu/test/lint/check_meta_header.py --path=./fbgemm_gpu/test --fixit=False diff --git a/BUILD.bazel b/BUILD.bazel index e998487255..12e05c4522 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -159,14 +159,14 @@ cc_library( ) [ - cc_test( - name = paths.split_extension(paths.basename(filename))[0], - size = "medium", - srcs = [ - filename, - ], - deps = [ - ":test_utils", - ], - ) for filename in get_fbgemm_tests() + cc_test( + name = paths.split_extension(paths.basename(filename))[0], + size = "medium", + srcs = [ + filename, + ], + deps = [ + ":test_utils", + ], + ) for filename in get_fbgemm_tests() ] diff --git a/CMakeLists.txt b/CMakeLists.txt index 58dcb9aeb0..32920d1d48 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,19 @@ -cmake_minimum_required(VERSION 3.5 FATAL_ERROR) +cmake_minimum_required(VERSION 3.16 FATAL_ERROR) + +# Set the default C++ standard to C++17 +# Individual targets can have this value overridden; see +# https://cmake.org/cmake/help/latest/prop_tgt/CXX_STANDARD.html +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_VISIBILITY_PRESET hidden) + +# Set the default C standard to C11 +# Individual targets can have this value overridden; see +# https://cmake.org/cmake/help/latest/prop_tgt/C_STANDARD.html +set(CMAKE_C_STANDARD 11) +set(CMAKE_C_EXTENSIONS OFF) +set(CMAKE_C_STANDARD_REQUIRED ON) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules") @@ -114,17 +129,11 @@ add_dependencies(fbgemm_generic defs.bzl) add_dependencies(fbgemm_avx2 defs.bzl) add_dependencies(fbgemm_avx512 defs.bzl) -set_target_properties(fbgemm_generic fbgemm_avx2 
fbgemm_avx512 PROPERTIES - CXX_STANDARD 14 - CXX_STANDARD_REQUIRED YES - CXX_EXTENSIONS NO - CXX_VISIBILITY_PRESET hidden) - -#On Windows: -#1) Adding definition of ASMJIT_STATIC to avoid generating asmjit function -#calls with _dllimport attribute -#2) MSVC uses /MD in default cxx compiling flags, -#need to change it to /MT in static case +# On Windows: +# 1) Adding definition of ASMJIT_STATIC to avoid generating asmjit function +# calls with _dllimport attribute +# 2) MSVC uses /MD in default cxx compiling flags, +# Need to change it to /MT in static case if(MSVC) set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267 /wd4305 /wd4309") if(FBGEMM_LIBRARY_TYPE STREQUAL "static") @@ -267,8 +276,6 @@ elseif(FBGEMM_LIBRARY_TYPE STREQUAL "shared") set_property(TARGET fbgemm_generic PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET fbgemm_avx2 PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET fbgemm_avx512 PROPERTY POSITION_INDEPENDENT_CODE ON) - set_target_properties(fbgemm PROPERTIES - CXX_VISIBILITY_PRESET hidden) elseif(FBGEMM_LIBRARY_TYPE STREQUAL "static") add_library(fbgemm STATIC $ diff --git a/WORKSPACE.bazel b/WORKSPACE.bazel index 30b1a80424..aff61b2b94 100644 --- a/WORKSPACE.bazel +++ b/WORKSPACE.bazel @@ -16,9 +16,9 @@ http_archive( http_archive( name = "com_google_googletest", - strip_prefix = "googletest-cd6b9ae3243985d4dc725abd513a874ab4161f3e", + strip_prefix = "googletest-1.13.0", urls = [ - "https://github.com/google/googletest/archive/cd6b9ae3243985d4dc725abd513a874ab4161f3e.tar.gz", + "https://github.com/google/googletest/archive/refs/tags/v1.13.0.tar.gz", ], ) diff --git a/bench/CMakeLists.txt b/bench/CMakeLists.txt index b4fad7510a..49f9e38fa2 100644 --- a/bench/CMakeLists.txt +++ b/bench/CMakeLists.txt @@ -1,4 +1,12 @@ -cmake_minimum_required(VERSION 3.5 FATAL_ERROR) +cmake_minimum_required(VERSION 3.16 FATAL_ERROR) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_VISIBILITY_PRESET hidden) +set(CMAKE_C_STANDARD 11) +set(CMAKE_C_EXTENSIONS OFF) +set(CMAKE_C_STANDARD_REQUIRED ON) find_package(MKL) if (NOT ${MKL_FOUND}) @@ -21,15 +29,12 @@ if (${BLAS_FOUND}) message(STATUS "BLAS_LIBRARIES= ${BLAS_LIBRARIES}") endif() -#benchmarks +# Benchmarks macro(add_benchmark BENCHNAME) add_executable(${BENCHNAME} ${ARGN} BenchUtils.cc ../test/QuantizationHelpers.cc ../test/EmbeddingSpMDMTestUtils.cc) - set_target_properties(${BENCHNAME} PROPERTIES - CXX_STANDARD 11 - CXX_EXTENSIONS NO) target_compile_options(${BENCHNAME} PRIVATE "-m64" "-mavx2" "-mfma" "-masm=intel") target_link_libraries(${BENCHNAME} fbgemm) diff --git a/bench/EmbeddingSpMDM8BitBenchmark.cc b/bench/EmbeddingSpMDM8BitBenchmark.cc index 1fcf4607de..17934b6101 100644 --- a/bench/EmbeddingSpMDM8BitBenchmark.cc +++ b/bench/EmbeddingSpMDM8BitBenchmark.cc @@ -111,7 +111,7 @@ int run_benchmark( // please note we generate unique indices for (int i = 0; i < batch_size; ++i) { iota(container.begin(), container.end(), 0); - random_shuffle(container.begin(), container.end()); + shuffle(container.begin(), container.end(), generator); copy( container.begin(), container.begin() + (offsets[i + 1] - offsets[i]), diff --git a/bench/EmbeddingSpMDMBenchmark.cc b/bench/EmbeddingSpMDMBenchmark.cc index b987586aac..246549f6a7 100644 --- a/bench/EmbeddingSpMDMBenchmark.cc +++ b/bench/EmbeddingSpMDMBenchmark.cc @@ -104,7 +104,7 @@ void run_benchmark( // please note we generate unique indices for (int i = 0; i < batch_size; ++i) { iota(container.begin(), 
container.end(), 0); - random_shuffle(container.begin(), container.end()); + shuffle(container.begin(), container.end(), generator); copy( container.begin(), container.begin() + (offsets[i + 1] - offsets[i]), diff --git a/bench/EmbeddingSpMDMNBitBenchmark.cc b/bench/EmbeddingSpMDMNBitBenchmark.cc index ed5485ae29..fff665babb 100644 --- a/bench/EmbeddingSpMDMNBitBenchmark.cc +++ b/bench/EmbeddingSpMDMNBitBenchmark.cc @@ -116,7 +116,7 @@ int run_benchmark( // please note we generate unique indices for (int i = 0; i < batch_size; ++i) { iota(container.begin(), container.end(), 0); - random_shuffle(container.begin(), container.end()); + shuffle(container.begin(), container.end(), generator); copy( container.begin(), container.begin() + (offsets[i + 1] - offsets[i]), diff --git a/bench/EmbeddingSpMDMNBitRowWiseSparseBenchmark.cc b/bench/EmbeddingSpMDMNBitRowWiseSparseBenchmark.cc index d1b28f54b5..c50500768d 100644 --- a/bench/EmbeddingSpMDMNBitRowWiseSparseBenchmark.cc +++ b/bench/EmbeddingSpMDMNBitRowWiseSparseBenchmark.cc @@ -131,7 +131,7 @@ int run_benchmark( // please note we generate unique indices for (int i = 0; i < batch_size; ++i) { iota(container.begin(), container.end(), 0); - random_shuffle(container.begin(), container.end()); + shuffle(container.begin(), container.end(), generator); copy( container.begin(), container.begin() + (offsets[i + 1] - offsets[i]), diff --git a/bench/RowwiseAdagradFusedBenchmark.cc b/bench/RowwiseAdagradFusedBenchmark.cc index 6f1203e6ab..a0524afaa5 100644 --- a/bench/RowwiseAdagradFusedBenchmark.cc +++ b/bench/RowwiseAdagradFusedBenchmark.cc @@ -90,7 +90,7 @@ void run_benchmark( // please note we generate unique indices for (int i = 0; i < batch_size; ++i) { iota(container.begin(), container.end(), 0); - random_shuffle(container.begin(), container.end()); + shuffle(container.begin(), container.end(), generator); copy( container.begin(), container.begin() + (offsets[i + 1] - offsets[i]), diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 5f393b0010..2276ca9ff2 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -1,15 +1,34 @@ -cmake_minimum_required(VERSION 3.11.0 FATAL_ERROR) - -option(FBGEMM_CPU_ONLY "Build fbgemm_gpu without GPU support" OFF) - -set(message_line - "-------------------------------------------------------------") -message("${message_line}") +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +# Set the default C++ standard to C++17 +# Individual targets can have this value overridden; see +# https://cmake.org/cmake/help/latest/prop_tgt/CXX_STANDARD.html +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# Set the default C standard to C17 +# Individual targets can have this value overridden; see +# https://cmake.org/cmake/help/latest/prop_tgt/C_STANDARD.html +set(CMAKE_C_STANDARD 17) +set(CMAKE_C_EXTENSIONS OFF) +set(CMAKE_C_STANDARD_REQUIRED ON) + +function(BLOCK_PRINT) + message("================================================================================") + foreach(ARG IN LISTS ARGN) + message("${ARG}") + endforeach() + message("================================================================================") + message("") +endfunction() if(SKBUILD) - message("The project is built using scikit-build") + BLOCK_PRINT("The project is built using scikit-build") endif() +# Build options +option(FBGEMM_CPU_ONLY "Build FBGEMM_GPU without GPU support" OFF) option(USE_CUDA "Use CUDA" ON) option(USE_ROCM "Use ROCm" OFF) @@ -21,11 +40,10 @@ 
if(((EXISTS "/opt/rocm/") OR (EXISTS $ENV{ROCM_PATH})) endif() if(FBGEMM_CPU_ONLY) - message("Building for CPU-only") + BLOCK_PRINT("Building the CPU-only variant of FBGEMM-GPU") endif() -message("${message_line}") -message(STATUS "USE_ROCM ${USE_ROCM}") +BLOCK_PRINT("USE_ROCM: ${USE_ROCM}") if(FBGEMM_CPU_ONLY OR USE_ROCM) project( @@ -46,12 +64,16 @@ set(THIRDPARTY ${FBGEMM}/third_party) if(DEFINED GLIBCXX_USE_CXX11_ABI) if(${GLIBCXX_USE_CXX11_ABI} EQUAL 1) - set(CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1") else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") endif() - message("${CMAKE_CXX_FLAGS}") + BLOCK_PRINT( + "Default C++ compiler flags" + "(values may be overridden by CMAKE_CXX_STANDARD and CXX_STANDARD):" + "" + "${CMAKE_CXX_FLAGS}" + ) endif() # @@ -72,8 +94,7 @@ if(USE_ROCM) include(Hip) include(Hipify) - message("${message_line}") - message(STATUS "hip found ${HIP_FOUND}") + BLOCK_PRINT("HIP found: ${HIP_FOUND}") endif() # @@ -167,7 +188,8 @@ set(codegen_dependencies ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/quantize_ops_utils.h ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/split_embeddings_utils.cuh ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/sparse_ops_utils.h - ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh) + ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh + ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/fbgemm_tensor_accessor.h) if(USE_ROCM) message(STATUS "${PYTHON_EXECUTABLE}" "${CMAKE_CODEGEN_DIR}/embedding_backward_code_generator.py" "--opensource --is_rocm") @@ -317,10 +339,15 @@ if(NOT FBGEMM_CPU_ONLY) src/split_embeddings_utils.cpp src/split_table_batched_embeddings.cpp src/metric_ops_host.cpp - src/embedding_inplace_update_gpu.cpp) + src/embedding_inplace_update_gpu.cpp + src/input_combine_gpu.cpp) if(NVML_LIB_PATH) message(STATUS "Found NVML_LIB_PATH: ${NVML_LIB_PATH}") + endif() + + if(NVML_LIB_PATH OR USE_ROCM) + message(STATUS "Adding merge_pooled_embeddings sources") list( APPEND fbgemm_gpu_sources_cpu @@ -328,8 +355,7 @@ if(NOT FBGEMM_CPU_ONLY) src/merge_pooled_embeddings_gpu.cpp src/topology_utils.cpp) else() - message(STATUS - "Could not find NVML_LIB_PATH; skipping certain sources into the build") + message(STATUS "Skipping merge_pooled_embeddings sources") endif() endif() @@ -351,7 +377,8 @@ if(NOT FBGEMM_CPU_ONLY) src/split_embeddings_cache_cuda.cu src/split_embeddings_utils.cu src/metric_ops.cu - src/embedding_inplace_update.cu) + src/embedding_inplace_update.cu + src/input_combine.cu) set_source_files_properties( ${fbgemm_gpu_sources_gpu} PROPERTIES COMPILE_OPTIONS @@ -411,13 +438,6 @@ if(USE_ROCM) else() add_library(fbgemm_gpu_py MODULE ${fbgemm_gpu_sources} ${gen_source_files} ${cpp_asmjit_files} ${cpp_fbgemm_files}) - set_property(TARGET fbgemm_gpu_py PROPERTY CUDA_ARCHITECTURES - "${cuda_architectures}") - - # FBGEMM_CUB_USE_NAMESPACE will cause compilation errors on CUB for CUDA 12+ - # if(NOT FBGEMM_CPU_ONLY) - # target_compile_definitions(fbgemm_gpu_py PRIVATE FBGEMM_CUB_USE_NAMESPACE) - # endif() endif() set_target_properties(fbgemm_gpu_py PROPERTIES PREFIX "") @@ -427,7 +447,6 @@ if(NVML_LIB_PATH) target_link_libraries(fbgemm_gpu_py ${NVML_LIB_PATH}) endif() target_include_directories(fbgemm_gpu_py PRIVATE ${TORCH_INCLUDE_DIRS}) -set_property(TARGET fbgemm_gpu_py PROPERTY CXX_STANDARD 17) install(TARGETS fbgemm_gpu_py DESTINATION fbgemm_gpu) diff --git a/fbgemm_gpu/codegen/__init__.template 
b/fbgemm_gpu/codegen/__init__.template index de8bf21dd0..661622eff9 100644 --- a/fbgemm_gpu/codegen/__init__.template +++ b/fbgemm_gpu/codegen/__init__.template @@ -13,7 +13,9 @@ import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_lars_sgd as loo import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_partial_rowwise_adam as lookup_partial_rowwise_adam # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_partial_rowwise_lamb as lookup_partial_rowwise_lamb # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_rowwise_adagrad as lookup_rowwise_adagrad # noqa: F401 +import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_rowwise_adagrad_with_counter as lookup_rowwise_adagrad_with_counter # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_sgd as lookup_sgd # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_approx_sgd as lookup_approx_sgd # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_approx_rowwise_adagrad as lookup_approx_rowwise_adagrad # noqa: F401 +import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_approx_rowwise_adagrad_with_counter as lookup_approx_rowwise_adagrad_with_counter # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_rowwise_weighted_adagrad as lookup_rowwise_weighted_adagrad # noqa: F401 diff --git a/fbgemm_gpu/codegen/embedding_backward_code_generator.py b/fbgemm_gpu/codegen/embedding_backward_code_generator.py index 9d67358902..fd69a22f6e 100644 --- a/fbgemm_gpu/codegen/embedding_backward_code_generator.py +++ b/fbgemm_gpu/codegen/embedding_backward_code_generator.py @@ -646,6 +646,11 @@ def rowwise_adagrad_with_counter() -> None: split_precomputation = """ at::acc_type freq = 1.0; at::acc_type l2_wd = 0.0; + at::acc_type tail_id_threshold_val = tail_id_threshold; + CUDA_KERNEL_ASSERT(max_counter > 0.0); // avoid divide by zero error + if (is_tail_id_thresh_ratio == 1){ + tail_id_threshold_val = floorf(tail_id_threshold * max_counter); + } if (counter_halflife > 0 && threadIdx.x == 0) { // if id occurs multiple times in a batch, iter_delta=1 const auto iter_delta = prev_iter[idx] == 0 ? 
1.0 : iter * 1.0 - prev_iter[idx]; @@ -660,6 +665,7 @@ def rowwise_adagrad_with_counter() -> None: } freq = SHFL_SYNC(freq, 0); l2_wd = SHFL_SYNC(l2_wd, 0); + tail_id_threshold_val = SHFL_SYNC(tail_id_threshold_val, 0); at::acc_type g_local_sum_square = 0.0; @@ -682,10 +688,7 @@ def rowwise_adagrad_with_counter() -> None: at::acc_type multiplier; at::acc_type adjusted_multiplier; at::acc_type exp_reg_correction; - at::acc_type tail_id_threshold_val = tail_id_threshold; - if (is_tail_id_thresh_ratio == 1){ - tail_id_threshold_val = floorf(tail_id_threshold * max_counter); - } + if (threadIdx.x == 0) { at::acc_type new_sum_square_grads = momentum1[idx] + g_avg_square; momentum1[idx] = new_sum_square_grads; diff --git a/fbgemm_gpu/codegen/embedding_bounds_check.cu b/fbgemm_gpu/codegen/embedding_bounds_check.cu index 4d77d2b508..bc18695ece 100644 --- a/fbgemm_gpu/codegen/embedding_bounds_check.cu +++ b/fbgemm_gpu/codegen/embedding_bounds_check.cu @@ -23,31 +23,52 @@ __device__ void adjust_offset_kernel( *offset_acc_end = indices_end; } -template +template __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel( const at::PackedTensorAccessor32 rows_per_table, at::PackedTensorAccessor32 indices, at::PackedTensorAccessor32 offsets, + const int32_t* const vbe_metadata, const int64_t bounds_check_mode_, at::PackedTensorAccessor32 warning, FixedDivisor fd) { int32_t T = rows_per_table.size(0); - int32_t B = (offsets.size(0) - 1) / T; - int32_t b_t = blockIdx.x * blockDim.y + threadIdx.y; - int32_t b; // = b_t % B; - int32_t t; // = b_t / B; - fd.DivMod(b_t, &t, &b); - if (t >= T) { + int32_t b; + int32_t t; + int32_t B = 0; + int32_t total_B = offsets.size(0) - 1; + + if (!vbe && b_t >= total_B) { return; } - auto bounds_check_mode = static_cast(bounds_check_mode_); - auto num_rows = rows_per_table[t]; - auto indices_start = offsets[t * B + b]; - auto indices_end = offsets[t * B + b + 1]; - index_t num_indices = indices.size(0); + fd.DivMod(b_t, &t, &b); + + if (vbe) { + // Check if t is valid + if (t >= T) { + return; + } + const auto B_start = vbe_metadata[t]; + B = vbe_metadata[t + 1] - B_start; + // Check if b is valid + if (b >= B) { + return; + } + // Update b_t value + b_t = B_start + b; + } else { + B = total_B / T; + } + + const auto bounds_check_mode = + static_cast(bounds_check_mode_); + const auto num_rows = rows_per_table[t]; + auto indices_start = offsets[b_t]; + auto indices_end = offsets[b_t + 1]; + const index_t num_indices = indices.size(0); if (bounds_check_mode == BoundsCheckMode::FATAL) { CUDA_KERNEL_ASSERT(indices_start >= 0); @@ -58,12 +79,13 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel( indices_end > num_indices) { if (gpuAtomicIncrement(&warning[0]) == 0) { printf( - "EmbeddingBoundsCheck: (at least one) Out of bounds access for " - "batch: %lld, table: %lld, indices_start: %lld, indices_end: %lld," + "EmbeddingBoundsCheck (VBE %s): (at least one) Out of bounds access for " + "batch: %d, table: %d, indices_start: %lld, indices_end: %lld," " num_indices: %lld. Setting indices_start and indices_end within " "the range.\n", - static_cast(b), - static_cast(t), + vbe ? 
"true" : "false", + b, + t, static_cast(indices_start), static_cast(indices_end), static_cast(num_indices)); @@ -72,16 +94,16 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel( indices_start, indices_end, num_indices, - &offsets[t * B + b], - &offsets[t * B + b + 1]); + &offsets[b_t], + &offsets[b_t + 1]); } } else if (bounds_check_mode == BoundsCheckMode::IGNORE) { adjust_offset_kernel( indices_start, indices_end, num_indices, - &offsets[t * B + b], - &offsets[t * B + b + 1]); + &offsets[b_t], + &offsets[b_t + 1]); } const auto L = indices_end - indices_start; @@ -100,9 +122,10 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel( if (idx < 0 || idx >= num_rows) { if (gpuAtomicIncrement(&warning[0]) == 0) { printf( - "EmbeddingBoundsCheck: (at least one) Out of bounds access for batch: %lld, table: %lld, bag element: %lld, idx: %lld, num_rows: %lld, indices_start: %lld, indices_end: %lld, T: %d, B: %d, b_t: %d. Setting idx to zero.\n", - static_cast(b), - static_cast(t), + "EmbeddingBoundsCheck (VBE %s): (at least one) Out of bounds access for batch: %d, table: %d, bag element: %lld, idx: %lld, num_rows: %lld, indices_start: %lld, indices_end: %lld, T: %d, B: %d, b_t: %d. Setting idx to zero.\n", + vbe ? "true" : "false", + b, + t, static_cast(i), static_cast(idx), num_rows, @@ -122,25 +145,27 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel( } if (bounds_check_mode == BoundsCheckMode::FATAL) { - CUDA_KERNEL_ASSERT(num_indices == offsets[B * T]); + CUDA_KERNEL_ASSERT(num_indices == offsets[total_B]); } else if (bounds_check_mode == BoundsCheckMode::WARNING) { - if (num_indices != offsets[B * T]) { + if (num_indices != offsets[total_B]) { if (gpuAtomicIncrement(&warning[0]) == 0) { printf( - "EmbeddingBoundsCheck: the last element in offsets is incorrect for " - "total batch size B: %lld, total table num T: %lld, " + "EmbeddingBoundsCheck (VBE %s): the last element in offsets is incorrect for " + "total batch size %s: %d, total table num T: %d, " " last element in offsets: %lld, indices size: %lld. " " Setting the last element in offsets to be indices size.\n", - static_cast(B), - static_cast(T), - static_cast(offsets[B * T]), + vbe ? "true" : "false", + vbe ? "total_B" : "B", + vbe ? 
total_B : B, + T, + static_cast(offsets[total_B]), static_cast(num_indices)); } - offsets[B * T] = num_indices; + offsets[total_B] = num_indices; } } else if (bounds_check_mode == BoundsCheckMode::IGNORE) { - if (num_indices != offsets[B * T]) { - offsets[B * T] = num_indices; + if (num_indices != offsets[total_B]) { + offsets[total_B] = num_indices; } } } @@ -151,19 +176,23 @@ void bounds_check_indices_cuda( Tensor& offsets, int64_t bounds_check_mode_, Tensor& warning, - c10::optional weights) { + const c10::optional& weights, + const c10::optional& vbe_metadata, + const int64_t max_B) { TENSOR_ON_CUDA_GPU(rows_per_table); TENSOR_ON_CUDA_GPU(indices); TENSOR_ON_CUDA_GPU(offsets); TENSOR_ON_CUDA_GPU(warning); TENSOR_EMPTY_OR_ON_CUDA_GPU(weights); + TENSOR_EMPTY_OR_ON_CUDA_GPU(vbe_metadata); at::cuda::OptionalCUDAGuard device_guard; device_guard.set_index(rows_per_table.get_device()); const int32_t T = rows_per_table.size(0); - const int32_t B = (offsets.size(0) - 1) / T; - if (B == 0 || T == 0) { + const int32_t total_B = offsets.size(0) - 1; + const int32_t B = (total_B) / T; + if (total_B == 0 || T == 0) { return; } const auto bounds_check_mode = @@ -172,12 +201,17 @@ void bounds_check_indices_cuda( warning.zero_(); } const int64_t num_indices = indices.size(0); + const auto vbe = vbe_metadata.has_value(); - TORCH_CHECK( - offsets.size(0) == B * T + 1, - "offsets size " + std::to_string(offsets.size(0)) + - " is not equal to B (" + std::to_string(B) + ") * T (" + - std::to_string(T) + ") + 1"); + if (vbe) { + TORCH_CHECK(max_B >= 0); + } else { + TORCH_CHECK( + offsets.size(0) == B * T + 1, + "offsets size " + std::to_string(offsets.size(0)) + + " is not equal to B (" + std::to_string(B) + ") * T (" + + std::to_string(T) + ") + 1"); + } if (weights.has_value()) { TORCH_CHECK( weights.value().size(0) == num_indices, @@ -186,20 +220,24 @@ void bounds_check_indices_cuda( } constexpr size_t kNumThreads = 256; + const auto max_B_ = vbe ? max_B : B; AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "bounds_check_indices", [&] { - bounds_check_indices_kernel - <<>>( - rows_per_table - .packed_accessor32(), - indices.packed_accessor32(), - offsets.packed_accessor32(), - bounds_check_mode_, - warning.packed_accessor32(), - FixedDivisor(B)); + const auto bounds_check_kernel = + (vbe ? bounds_check_indices_kernel + : bounds_check_indices_kernel); + bounds_check_kernel<<< + div_round_up(max_B_ * T, kNumThreads / fbgemm_gpu::kWarpSize), + dim3(fbgemm_gpu::kWarpSize, kNumThreads / fbgemm_gpu::kWarpSize), + 0, + at::cuda::getCurrentCUDAStream()>>>( + rows_per_table.packed_accessor32(), + indices.packed_accessor32(), + offsets.packed_accessor32(), + vbe ? vbe_metadata.value().data_ptr() : nullptr, + bounds_check_mode_, + warning.packed_accessor32(), + FixedDivisor(max_B_)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); - C10_CUDA_KERNEL_LAUNCH_CHECK(); } diff --git a/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp b/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp index 84575a3361..87e3cd7521 100644 --- a/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp +++ b/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp @@ -23,7 +23,9 @@ void bounds_check_indices_cuda( Tensor& offsets, int64_t bounds_check_mode, Tensor& warning, - c10::optional weights); + const c10::optional& weights, + const c10::optional& vbe_metadata, + const int64_t max_B); // Deprecated for fb namespace! Please use fbgemm namespace instead! 
TORCH_LIBRARY_FRAGMENT(fb, m) { diff --git a/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp b/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp index a2dd19a75e..a33e02e164 100644 --- a/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp +++ b/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp @@ -42,7 +42,12 @@ void bounds_check_indices_cpu( Tensor& offsets, int64_t bounds_check_mode_, Tensor& warning, - c10::optional weights) { + const c10::optional& weights, + const c10::optional& vbe_metadata, + const int64_t /*max_B*/) { + TORCH_CHECK( + !vbe_metadata.has_value(), + "bounds_check_indices on CPU does not support variable length (batch size)"); auto bounds_check_mode = static_cast(bounds_check_mode_); if (bounds_check_mode == BoundsCheckMode::WARNING) { warning.zero_(); @@ -163,7 +168,7 @@ TORCH_LIBRARY_FRAGMENT(fb, m) { // The (a!) tells PyTorch this is an impure operation and so cannot be CSE'd // or DCE'd, etc. m.def( - "bounds_check_indices(Tensor rows_per_table, Tensor(a!) indices, Tensor(b!) offsets, int bounds_check_mode, Tensor(c!) warning, Tensor(d!)? weights=None) -> ()"); + "bounds_check_indices(Tensor rows_per_table, Tensor(a!) indices, Tensor(b!) offsets, int bounds_check_mode, Tensor(c!) warning, Tensor(d!)? weights=None, Tensor? vbe_metadata=None, int max_B=-1) -> ()"); DISPATCH_TO_CPU("bounds_check_indices", bounds_check_indices_cpu); } @@ -171,6 +176,6 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { // The (a!) tells PyTorch this is an impure operation and so cannot be CSE'd // or DCE'd, etc. m.def( - "bounds_check_indices(Tensor rows_per_table, Tensor(a!) indices, Tensor(b!) offsets, int bounds_check_mode, Tensor(c!) warning, Tensor(d!)? weights=None) -> ()"); + "bounds_check_indices(Tensor rows_per_table, Tensor(a!) indices, Tensor(b!) offsets, int bounds_check_mode, Tensor(c!) warning, Tensor(d!)? weights=None, Tensor? 
vbe_metadata=None, int max_B=-1) -> ()"); DISPATCH_TO_CPU("bounds_check_indices", bounds_check_indices_cpu); } diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp b/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp index 9caaacbfb8..829249b297 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp @@ -534,44 +534,5 @@ Tensor pruned_array_lookup_cpu( return dense_indices; } -Tensor pruned_array_lookup_from_row_idx_cpu( - Tensor update_row_indices, - Tensor update_table_indices, - Tensor index_remappings, - Tensor index_remappings_offsets) { - TENSOR_ON_CPU(update_row_indices); - TENSOR_ON_CPU(update_table_indices); - TENSOR_ON_CPU(index_remappings); - TENSOR_ON_CPU(index_remappings_offsets); - - int32_t T = index_remappings_offsets.size(0) - 1; - auto dense_indices = empty_like(update_row_indices); - const auto num_indices = update_row_indices.numel(); - - AT_DISPATCH_INDEX_TYPES( - update_row_indices.scalar_type(), "pruned_array_lookup_from_row_idx_cpu_kernel", [&] { - const auto update_row_indices_acc = update_row_indices.accessor(); - auto dense_indices_acc = dense_indices.accessor(); - const auto update_table_indices_acc = update_table_indices.accessor(); - - const auto index_remappings_acc = index_remappings.accessor(); - const auto index_remappings_offsets_acc = index_remappings_offsets.accessor(); - - for (int64_t idx = 0; idx < num_indices; idx++) { - const int table_idx = update_table_indices_acc[idx]; - const auto row_idx = update_row_indices_acc[idx]; - int64_t index_remappings_start = index_remappings_offsets_acc[table_idx]; - int64_t index_remappings_end = index_remappings_offsets_acc[table_idx + 1]; - int64_t capacity = index_remappings_end - index_remappings_start; - if (capacity > 0) { - dense_indices_acc[idx] = index_remappings_acc[index_remappings_start + row_idx]; - } else { - dense_indices_acc[idx] = row_idx; - } - } - }); - return dense_indices; -} - {% endif %} // clang-format on diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp b/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp index 6d4426cb27..01c054f818 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp @@ -4,12 +4,12 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ + #include #include #include #include #include -#include #include "c10/core/ScalarType.h" #ifdef FBCODE_CAFFE2 #include "common/stats/Stats.h" @@ -18,6 +18,8 @@ #include "fbgemm_gpu/sparse_ops_utils.h" #include "fbgemm_gpu/split_embeddings_cache_cuda.cuh" +#include + using Tensor = at::Tensor; using namespace fbgemm_gpu; @@ -37,7 +39,7 @@ DEFINE_quantile_stat( facebook::fb303::ExportTypeConsts::kNone, std::array{{.25, .50, .75, .99}}); -// Miss rate due to conflict in cache associativity. +// (Unique) Miss rate due to conflict in cache associativity. // # unique misses due to conflict / # requested indices. DEFINE_quantile_stat( tbe_uvm_cache_conflict_unique_miss_rate, @@ -45,6 +47,21 @@ DEFINE_quantile_stat( facebook::fb303::ExportTypeConsts::kNone, std::array{{.25, .50, .75, .99}}); +// Miss rate due to conflict in cache associativity. +// # misses due to conflict / # requested indices. 
+DEFINE_quantile_stat( + tbe_uvm_cache_conflict_miss_rate, + "tbe_uvm_cache_conflict_miss_rate_per_mille", + facebook::fb303::ExportTypeConsts::kNone, + std::array{{.25, .50, .75, .99}}); + +// Total miss rate. +DEFINE_quantile_stat( + tbe_uvm_cache_total_miss_rate, + "tbe_uvm_cache_total_miss_rate_per_mille", + facebook::fb303::ExportTypeConsts::kNone, + std::array{{.25, .50, .75, .99}}); + // FLAGs to control UVMCacheStats. DEFINE_int32( tbe_uvm_cache_stat_report, @@ -58,6 +75,12 @@ DEFINE_int32( "If tbe_uvm_cache_stat_report is enabled, more detailed raw stats will be printed with this " "period. This should be an integer multiple of tbe_uvm_cache_stat_report."); +DEFINE_int32( + tbe_uvm_cache_enforced_misses, + 0, + "If set to non-zero, some cache lookups (tbe_uvm_cache_enforced_misses / 256) are enforced to be misses; " + "this is performance evaluation purposes only; and should be zero otherwise."); + // TODO: align this with uvm_cache_stats_index in // split_embeddings_cache_cuda.cu. const int kUvmCacheStatsSize = 6; @@ -84,10 +107,11 @@ void process_uvm_cache_stats( // uvm_cache_stats_counters[0]: num_req_indices // uvm_cache_stats_counters[1]: num_unique_indices // uvm_cache_stats_counters[2]: num_unique_misses - // uvm_cache_stats_counters[3]: num_unique_conflict_misses + // uvm_cache_stats_counters[3]: num_conflict_unique_misses + // uvm_cache_stats_counters[4]: num_conflict_misses // They should be zero-out after the calculated rates are populated into // cache counters. - static std::vector uvm_cache_stats_counters(4); + static std::vector uvm_cache_stats_counters(5); // Export cache stats. auto uvm_cache_stats_cpu = uvm_cache_stats.cpu(); @@ -107,19 +131,32 @@ void process_uvm_cache_stats( // Calculate cache related ratios based on the cumulated numbers and // push them into the counter pools. if (populate_uvm_stats && uvm_cache_stats_counters[0] > 0) { - double unique_rate = + const double unique_rate = static_cast(uvm_cache_stats_counters[1]) / uvm_cache_stats_counters[0] * 1000; - double unique_miss_rate = + const double unique_miss_rate = static_cast(uvm_cache_stats_counters[2]) / uvm_cache_stats_counters[0] * 1000; - double unique_conflict_miss_rate = + const double conflict_unique_miss_rate = static_cast(uvm_cache_stats_counters[3]) / uvm_cache_stats_counters[0] * 1000; + const double conflict_miss_rate = + static_cast(uvm_cache_stats_counters[4]) / + uvm_cache_stats_counters[0] * 1000; + // total # misses = unique misses - conflict_unique_misses + conflict + // misses. + const double total_miss_rate = + static_cast( + uvm_cache_stats_counters[2] - uvm_cache_stats_counters[3] + + uvm_cache_stats_counters[4]) / + uvm_cache_stats_counters[0] * 1000; + STATS_tbe_uvm_cache_unique_rate.addValue(unique_rate); STATS_tbe_uvm_cache_unique_miss_rate.addValue(unique_miss_rate); STATS_tbe_uvm_cache_conflict_unique_miss_rate.addValue( - unique_conflict_miss_rate); + conflict_unique_miss_rate); + STATS_tbe_uvm_cache_conflict_miss_rate.addValue(conflict_miss_rate); + STATS_tbe_uvm_cache_total_miss_rate.addValue(total_miss_rate); // Fill all the elements of the vector uvm_cache_stats_counters as 0 // to zero out the cumulated counters. @@ -365,7 +402,7 @@ Tensor int_nbit_split_embedding_uvm_caching_codegen_lookup_function( // cache_index_table_map: (linearized) index to table number map. // 1D tensor, dtype=int32. c10::optional cache_index_table_map, - // lxu_cache_state: Cache state (cached idnex, or invalid). + // lxu_cache_state: Cache state (cached index, or invalid). 
// 2D tensor: # sets x assoc. dtype=int64. c10::optional lxu_cache_state, // lxu_state: meta info for replacement (time stamp for LRU). @@ -461,6 +498,16 @@ Tensor int_nbit_split_embedding_uvm_caching_codegen_lookup_function( uvm_cache_stats); #ifdef FBCODE_CAFFE2 + if (FLAGS_tbe_uvm_cache_enforced_misses > 0) { + // Override some lxu_cache_locations (N for every 256 indices) with cache + // miss to enforce access to UVM. + lxu_cache_locations = emulate_cache_miss( + lxu_cache_locations.value(), + FLAGS_tbe_uvm_cache_enforced_misses, + gather_uvm_stats, + uvm_cache_stats); + } + process_uvm_cache_stats( signature, total_cache_hash_size.value(), @@ -511,13 +558,6 @@ Tensor pruned_array_lookup_cuda( Tensor index_remappings, Tensor index_remappings_offsets); -///@ingroup embedding-cuda -Tensor pruned_array_lookup_from_row_idx_cuda( - Tensor update_row_indices, - Tensor update_table_indices, - Tensor index_remappings, - Tensor index_remappings_offsets); - TORCH_LIBRARY_FRAGMENT(fbgemm, m) { DISPATCH_TO_CUDA( "int_nbit_split_embedding_codegen_lookup_function", @@ -529,7 +569,4 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { "pruned_hashmap_lookup", pruned_hashmap_lookup_unweighted_cuda); DISPATCH_TO_CUDA("pruned_array_lookup", pruned_array_lookup_cuda); - DISPATCH_TO_CUDA( - "pruned_array_lookup_from_row_idx", - pruned_array_lookup_from_row_idx_cuda); } diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_host_cpu.cpp b/fbgemm_gpu/codegen/embedding_forward_quantized_host_cpu.cpp index 93db44ac76..a43671f880 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_host_cpu.cpp +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_host_cpu.cpp @@ -240,13 +240,6 @@ Tensor pruned_array_lookup_cpu( Tensor index_remappings, Tensor index_remappings_offsets); -///@ingroup embedding-cpu -Tensor pruned_array_lookup_from_row_idx_cpu( - Tensor update_row_indices, - Tensor update_table_indices, - Tensor index_remappings, - Tensor index_remappings_offsets); - TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "int_nbit_split_embedding_codegen_lookup_function(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int total_D, int max_int2_D, int max_int4_D, int max_int8_D, int max_float16_D, int max_float32_D, Tensor indices, Tensor offsets, int pooling_mode, Tensor? indice_weights, int output_dtype=1, Tensor? lxu_cache_weights=None, Tensor? lxu_cache_locations=None, int? row_alignment = None, int? max_float8_D=0, int? fp8_exponent_bits=-1, int? fp8_exponent_bias=-1) -> Tensor"); @@ -278,12 +271,6 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "pruned_array_lookup(Tensor indices, Tensor offsets, Tensor index_remappings, Tensor index_remappings_offsets) -> Tensor"); DISPATCH_TO_CPU("pruned_array_lookup", pruned_array_lookup_cpu); - - // GPU version of array lookup. 
- m.def( - "pruned_array_lookup_from_row_idx(Tensor update_row_indices, Tensor update_table_indices, Tensor index_remappings, Tensor index_remappings_offsets) -> Tensor"); - DISPATCH_TO_CPU( - "pruned_array_lookup_from_row_idx", pruned_array_lookup_from_row_idx_cpu); } class PrunedMapCPU : public torch::jit::CustomClassHolder { diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu b/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu index 6ac2b2d3c0..e0a2f04ee8 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu @@ -552,36 +552,6 @@ __global__ __launch_bounds__(kMaxThreads) void int_nbit_split_embedding_codegen_ } {% endif %} -{% if not weighted %} -template -__global__ __launch_bounds__(kMaxThreads) void int_nbit_split_embedding_codegen_forward_pruned_array_lookup_from_row_idx_kernel( - const at::PackedTensorAccessor32 update_row_indices, - const at::PackedTensorAccessor32 update_table_indices, - const at::PackedTensorAccessor32 index_remappings, - const at::PackedTensorAccessor32 index_remappings_offsets, - at::PackedTensorAccessor32 dense_indices) { - - const int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= update_row_indices.size(0)) { - return; - } - const int table_idx = update_table_indices[idx]; - const auto row_idx = update_row_indices[idx]; - - const int64_t index_remappings_start = index_remappings_offsets[table_idx]; - const int64_t index_remappings_end = index_remappings_offsets[table_idx + 1]; - const int64_t capacity = index_remappings_end - index_remappings_start; - - if (capacity > 0) { - dense_indices[idx] = index_remappings[index_remappings_start + row_idx]; - } else { - dense_indices[idx] = row_idx; - } -} -{% endif %} - - - } {% for nobag in [True, False] %} @@ -737,13 +707,16 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ DISPATCH_OUTPUT_TYPES(output.scalar_type(), "int2_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { if (max_int2_D > 0) { auto max_int2_128b_rows = nbit::div_round_up(nbit::padded_row_size_in_bytes(max_int2_D, SparseType::INT2, row_alignment), 128); - TORCH_CHECK(max_int2_128b_rows <= 2); + TORCH_CHECK(max_int2_128b_rows <= 4); if (max_int2_128b_rows > 0) { Y(2, 16, 0, 1); } if (max_int2_128b_rows > 1) { Y(2, 8, 1, 2); } + if (max_int2_128b_rows > 2) { + Y(2, 8, 2, 4); + } } })); #undef X @@ -783,7 +756,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ DISPATCH_OUTPUT_TYPES(output.scalar_type(), "int4_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { if (max_int4_D > 0) { auto max_int4_128b_rows = nbit::div_round_up(nbit::padded_row_size_in_bytes(max_int4_D, SparseType::INT4, row_alignment), 128); - TORCH_CHECK(max_int4_128b_rows <= 4); + TORCH_CHECK(max_int4_128b_rows <= 8); if (max_int4_128b_rows > 0) { Y(4, 8, 0, 1); } @@ -793,6 +766,9 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ if (max_int4_128b_rows > 2) { Y(1, 4, 2, 4); } + if (max_int4_128b_rows > 4) { + Y(1, 4, 4, 8); + } } })); #undef X @@ -831,7 +807,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ DISPATCH_OUTPUT_TYPES(output.scalar_type(), "int8_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { if (max_int8_D > 0) { auto max_int8_128b_rows = nbit::div_round_up(nbit::padded_row_size_in_bytes(max_int8_D, 
SparseType::INT8, row_alignment), 128); - TORCH_CHECK(max_int8_128b_rows <= 8); + TORCH_CHECK(max_int8_128b_rows <= 16); if (max_int8_128b_rows > 0) { Y(2, 8, 0, 1); } @@ -844,6 +820,9 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ if (max_int8_128b_rows > 4) { Y(2, 4, 4, 8); } + if (max_int8_128b_rows > 8) { + Y(2, 2, 8, 16); + } } })); #undef X @@ -884,7 +863,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ DISPATCH_OUTPUT_TYPES(output.scalar_type(), "fp8_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { if (max_float8_D > 0) { auto max_fp8_128b_rows = nbit::div_round_up(nbit::padded_row_size_in_bytes(max_float8_D, SparseType::FP8, row_alignment), 128); - TORCH_CHECK(max_fp8_128b_rows <= 8); + TORCH_CHECK(max_fp8_128b_rows <= 16); if (max_fp8_128b_rows > 0) { Y(2, 8, 0, 1); } @@ -897,6 +876,9 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ if (max_fp8_128b_rows > 4) { Y(2, 4, 4, 8); } + if (max_fp8_128b_rows > 8) { + Y(2, 2, 4, 8); + } } })); #undef X @@ -935,7 +917,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ DISPATCH_OUTPUT_TYPES(output.scalar_type(), "fp16_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { if (max_float16_D > 0) { auto max_fp16_128b_rows = nbit::div_round_up(nbit::padded_row_size_in_bytes(max_float16_D, SparseType::FP16, row_alignment), 128); - TORCH_CHECK(max_fp16_128b_rows <= 16); + TORCH_CHECK(max_fp16_128b_rows <= 32); if (max_fp16_128b_rows > 0) { Y(2, 8, 0, 2); } @@ -948,6 +930,9 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ if (max_fp16_128b_rows > 8) { Y(2, 2, 8, 16); } + if (max_fp16_128b_rows > 16) { + Y(2, 1, 16, 32); + } } })); #undef X @@ -986,7 +971,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ DISPATCH_OUTPUT_TYPES(output.scalar_type(), "fp32_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { if (max_float32_D > 0) { auto max_fp32_128b_rows = nbit::div_round_up(nbit::padded_row_size_in_bytes(max_float32_D, SparseType::FP32, row_alignment), 128); - TORCH_CHECK(max_fp32_128b_rows <= 32); + TORCH_CHECK(max_fp32_128b_rows <= 64); if (max_fp32_128b_rows > 0) { Y(2, 4, 0, 4); } @@ -996,6 +981,9 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ if (max_fp32_128b_rows > 16) { Y(1, 1, 16, 32); } + if (max_fp32_128b_rows > 32) { + Y(1, 1, 32, 64); + } } })); #undef X @@ -1089,53 +1077,6 @@ Tensor pruned_array_lookup_cuda( C10_CUDA_KERNEL_LAUNCH_CHECK(); return dense_indices; } - -Tensor pruned_array_lookup_from_row_idx_cuda( - Tensor update_row_indices, - Tensor update_table_indices, - Tensor index_remappings, - Tensor index_remappings_offsets) { - - TENSOR_ON_CUDA_GPU(update_row_indices); - TENSOR_ON_CUDA_GPU(update_table_indices); - TENSOR_ON_CUDA_GPU(index_remappings); - TENSOR_ON_CUDA_GPU(index_remappings_offsets); - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(update_table_indices.get_device()); - auto dense_indices = at::empty_like(update_row_indices); - const int32_t T = index_remappings_offsets.size(0) - 1; - - const auto num_indices = update_row_indices.numel(); - if (num_indices == 0) { - return dense_indices; - } - - TORCH_CHECK(index_remappings.size(0) < std::numeric_limits::max()); - TORCH_CHECK(update_row_indices.dim() == 1, "Tensor dim: ", update_row_indices.dim()); - 
TORCH_CHECK(update_table_indices.dim() == 1, "Tensor dim: ", update_table_indices.dim()); - TORCH_CHECK(index_remappings.dim() == 1, "Tensor dim: ", index_remappings.dim()); - TORCH_CHECK(index_remappings_offsets.dim() == 1, "Tensor dim: ", index_remappings_offsets.dim()); - TORCH_CHECK(dense_indices.dim() == 1, "Tensor dim: ", dense_indices.dim()); - constexpr size_t kForwardMaxThreads = 256; - - AT_DISPATCH_INDEX_TYPES( - update_row_indices.scalar_type(), "embedding_inplace_update_kernel", [&] { - nbit::int_nbit_split_embedding_codegen_forward_pruned_array_lookup_from_row_idx_kernel<<< - nbit::div_round_up(num_indices, kForwardMaxThreads), - kForwardMaxThreads, - 0, - at::cuda::getCurrentCUDAStream()>>>( - update_row_indices.packed_accessor32(), - update_table_indices.packed_accessor32(), - index_remappings.packed_accessor32(), - index_remappings_offsets.packed_accessor32(), - dense_indices.packed_accessor32() - ); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); - return dense_indices; -} {% endif %} // clang-format on diff --git a/fbgemm_gpu/codegen/lookup_args.py b/fbgemm_gpu/codegen/lookup_args.py index c5a3d465e9..8c98a96a1a 100644 --- a/fbgemm_gpu/codegen/lookup_args.py +++ b/fbgemm_gpu/codegen/lookup_args.py @@ -44,6 +44,13 @@ class OptimizerArgs(NamedTuple): weight_decay_mode: int eta: float momentum: float + counter_halflife: int + adjustment_iter: int + adjustment_ub: float + learning_rate_mode: int + grad_sum_decay: int + tail_id_threshold: float + is_tail_id_thresh_ratio: int class Momentum(NamedTuple): diff --git a/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template b/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template index 4cdc5b8766..bd406d39fa 100644 --- a/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template +++ b/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template @@ -36,9 +36,18 @@ def invoke( {% if "momentum2_dev" in args.split_function_arg_names %} momentum2: Momentum, {% endif %} + {% if "prev_iter_dev" in args.split_function_arg_names %} + prev_iter: Momentum, + {% endif %} + {% if "row_counter_dev" in args.split_function_arg_names %} + row_counter: Momentum, + {% endif %} {% if "iter" in args.split_function_arg_names %} iter: int, {% endif %} + {% if "max_counter" in args.split_function_arg_names %} + max_counter: float, + {% endif %} ) -> torch.Tensor: if (common_args.host_weights.numel() > 0): return torch.ops.fbgemm.split_embedding_codegen_lookup_{{ optimizer }}_function_cpu( @@ -84,6 +93,27 @@ def invoke( {% if "momentum" in args.split_function_arg_names %} momentum=optimizer_args.momentum, {% endif %} + {% if "counter_halflife" in args.split_function_arg_names %} + counter_halflife=optimizer_args.counter_halflife, + {% endif %} + {% if "adjustment_iter" in args.split_function_arg_names %} + adjustment_iter=optimizer_args.adjustment_iter, + {% endif %} + {% if "adjustment_ub" in args.split_function_arg_names %} + adjustment_ub=optimizer_args.adjustment_ub, + {% endif %} + {% if "learning_rate_mode" in args.split_function_arg_names %} + learning_rate_mode=optimizer_args.learning_rate_mode, + {% endif %} + {% if "grad_sum_decay" in args.split_function_arg_names %} + grad_sum_decay=optimizer_args.grad_sum_decay, + {% endif %} + {% if "tail_id_threshold" in args.split_function_arg_names %} + tail_id_threshold=optimizer_args.tail_id_threshold, + {% endif %} + {% if "is_tail_id_thresh_ratio" in args.split_function_arg_names %} + is_tail_id_thresh_ratio=optimizer_args.is_tail_id_thresh_ratio, + {% endif %} # momentum1 
{% if "momentum1_dev" in args.split_function_arg_names %} momentum1_host=momentum1.host, @@ -96,10 +126,26 @@ def invoke( momentum2_offsets=momentum2.offsets, momentum2_placements=momentum2.placements, {% endif %} + # prev_iter + {% if "prev_iter_dev" in args.split_function_arg_names %} + prev_iter_host=prev_iter.host, + prev_iter_offsets=prev_iter.offsets, + prev_iter_placements=prev_iter.placements, + {% endif %} + # row_counter + {% if "row_counter_dev" in args.split_function_arg_names %} + row_counter_host=row_counter.host, + row_counter_offsets=row_counter.offsets, + row_counter_placements=row_counter.placements, + {% endif %} # iter {% if "iter" in args.split_function_arg_names %} iter=iter, {% endif %} + # max counter + {% if "max_counter" in args.split_function_arg_names %} + max_counter=max_counter, + {% endif %} ) else: return torch.ops.fbgemm.split_embedding_codegen_lookup_{{ optimizer }}_function( @@ -151,6 +197,27 @@ def invoke( {% if "momentum" in args.split_function_arg_names %} momentum=optimizer_args.momentum, {% endif %} + {% if "counter_halflife" in args.split_function_arg_names %} + counter_halflife=optimizer_args.counter_halflife, + {% endif %} + {% if "adjustment_iter" in args.split_function_arg_names %} + adjustment_iter=optimizer_args.adjustment_iter, + {% endif %} + {% if "adjustment_ub" in args.split_function_arg_names %} + adjustment_ub=optimizer_args.adjustment_ub, + {% endif %} + {% if "learning_rate_mode" in args.split_function_arg_names %} + learning_rate_mode=optimizer_args.learning_rate_mode, + {% endif %} + {% if "grad_sum_decay" in args.split_function_arg_names %} + grad_sum_decay=optimizer_args.grad_sum_decay, + {% endif %} + {% if "tail_id_threshold" in args.split_function_arg_names %} + tail_id_threshold=optimizer_args.tail_id_threshold, + {% endif %} + {% if "is_tail_id_thresh_ratio" in args.split_function_arg_names %} + is_tail_id_thresh_ratio=optimizer_args.is_tail_id_thresh_ratio, + {% endif %} # momentum1 {% if "momentum1_dev" in args.split_function_arg_names %} momentum1_dev=momentum1.dev, @@ -165,9 +232,27 @@ def invoke( momentum2_offsets=momentum2.offsets, momentum2_placements=momentum2.placements, {% endif %} + # prev_iter + {% if "prev_iter_dev" in args.split_function_arg_names %} + prev_iter_dev=prev_iter.dev, + prev_iter_uvm=prev_iter.uvm, + prev_iter_offsets=prev_iter.offsets, + prev_iter_placements=prev_iter.placements, + {% endif %} + # row_counter + {% if "row_counter_dev" in args.split_function_arg_names %} + row_counter_dev=row_counter.dev, + row_counter_uvm=row_counter.uvm, + row_counter_offsets=row_counter.offsets, + row_counter_placements=row_counter.placements, + {% endif %} # iter {% if "iter" in args.split_function_arg_names %} iter=iter, {% endif %} + # max counter + {% if "max_counter" in args.split_function_arg_names %} + max_counter=max_counter, + {% endif %} output_dtype=common_args.output_dtype, ) diff --git a/fbgemm_gpu/docs/BuildInstructions.md b/fbgemm_gpu/docs/BuildInstructions.md new file mode 100644 index 0000000000..4f2c9c142b --- /dev/null +++ b/fbgemm_gpu/docs/BuildInstructions.md @@ -0,0 +1,437 @@ +# FBGEMM_GPU Build Instructions + +The most up-to-date instructions are embedded in +[`setup_env.bash`](../../.github/scripts/setup_env.bash). The general steps for +building FBGEMM_GPU are as follows: + +1. Set up an isolated environment for building (Miniconda) +1. Install the relevant build tools (C/C++ compiler) +1. Set up for either CUDA, ROCm, or CPU build +1. Install PyTorch +1. 
Run the build + + +## Set Up an Isolated Build Environment + +### Install Miniconda + +Setting up a [Miniconda](https://docs.conda.io/en/latest/miniconda.html) +environment is recommended for reproducible builds: + +```sh +# Set the Miniconda prefix directory +miniconda_prefix=$HOME/miniconda + +# Download the Miniconda installer +wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh + +# Run the installer +bash miniconda.sh -b -p "$miniconda_prefix" -u + +# Load the shortcuts +. ~/.bashrc + +# Run updates +conda update -n base -c defaults -y conda +``` + +From here on out, all installation commands will be run against or inside a +Conda environment. + + +### Set Up the Conda Environment + +Create a Conda environment with the specified Python version: + +```sh +env_name= +python_version=3.10 + +# Create the environment +conda create -y --name "${env_name}" python="${python_version}" + +# Upgrade PIP and pyOpenSSL package +conda run -n "${env_name}" pip install --upgrade pip +conda run -n "${env_name}" python -m pip install pyOpenSSL>22.1.0 +``` + +## Install the Build Tools + +### C/C++ Compiler + +Install a version of the GCC toolchain that supports **C++17**. Note that GCC +(as opposed to Clang for example) is required for GPU (CUDA) builds because +NVIDIA's `nvcc` relies on `gcc` and `g++` in the path. The `sysroot` package +will also need to be installed to avoid issues with missing versioned symbols +when compiling FBGEMM_CPU: + +```sh +conda install -n "${env_name}" -y gxx_linux-64=10.4.0 sysroot_linux-64=2.17 -c conda-forge +``` + +While newer versions of GCC can be used, binaries compiled under newer versions +of GCC will not be compatible with older systems such as Ubuntu 20.04 or CentOS +Stream 8, because the compiled library will reference symbols from versions of +`GLIBCXX` that the system's `libstdc++.so.6` will not support. To see what +versions of GLIBC and GLIBCXX the available `libstdc++.so.6` supports: + +```sh +libcxx_path=/path/to/libstdc++.so.6 + +# Print supported for GLIBC versions +objdump -TC "${libcxx_path}" | grep GLIBC_ | sed 's/.*GLIBC_\([.0-9]*\).*/GLIBC_\1/g' | sort -Vu | cat + +# Print supported for GLIBCXX versions +objdump -TC "${libcxx_path}" | grep GLIBCXX_ | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat +``` + +### Other Build Tools + +Install the other necessary build tools such as `ninja`, `cmake`, etc: + +```sh +conda install -n "${env_name}" -y \ + click \ + cmake \ + hypothesis \ + jinja2 \ + ninja \ + numpy \ + scikit-build \ + wheel +``` + + +## Set Up for CUDA Build + +The CUDA build of FBGEMM_GPU requires `nvcc` that supports compute capability +3.5+. Setting the machine up for CUDA builds of FBGEMM_GPU can be done either +through pre-built Docker images or through Conda installation on bare metal. +Note that neither a GPU nor the NVIDIA drivers need to be present for builds, +since they are only used at runtime. + +### Docker Image + +For setups through Docker, simply pull the pre-installed +[Docker image for CUDA](https://hub.docker.com/r/nvidia/cuda) for the desired +Linux distribution and CUDA version. + +```sh +# Run for Ubuntu 22.04, CUDA 11.8 +docker run -it --entrypoint "/bin/bash" nvidia/cuda:11.8.0-devel-ubuntu22.04 +``` + +From there, the rest of the build environment may be constructed through Conda. 
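For example, as a quick sanity check (an illustrative step, not part of the original instructions), confirm that the container's CUDA toolkit is visible before proceeding:

```sh
# The nvidia/cuda *devel* images ship with the CUDA compiler preinstalled
nvcc --version
```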
+ +### Install CUDA + +Install the full CUDA package through Conda, which includes +[NVML](https://developer.nvidia.com/nvidia-management-library-nvml): + +```sh +cuda_version=11.7.1 + +# Install the full CUDA package +conda install -n "${env_name}" -y cuda -c "nvidia/label/cuda-${cuda_version}" +``` + +Ensure that at the minimum, **`cuda_runtime.h`** and **`libnvidia-ml.so`** are +found: + +```sh +conda_prefix=$(conda run -n "${env_name}" printenv CONDA_PREFIX) +find "${conda_prefix}" -name cuda_runtime.h +find "${conda_prefix}" -name libnvidia-ml.so +``` + +### Install cuDNN + +[cuDNN](https://developer.nvidia.com/cudnn) is a build-time dependency for the +CUDA variant of FBGEMM_GPU. Download and extract the cuDNN package for the +given CUDA version: + +```sh +# cuDNN package URLs can be found in: https://github.com/pytorch/builder/blob/main/common/install_cuda.sh +cudnn_url=https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz + +# Download and unpack cuDNN +wget -q "${cudnn_url}" -O cudnn.tar.xz +``` + +### [OPTIONAL] Install CUB + +[CUB](https://docs.nvidia.com/cuda/cub/index.html) is a build-time dependency for +the CUDA variant of FBGEMM_GPU. This must be installed separately for +**previous versions of CUDA (prior to 11.1)** since they did not come with CUB packaged. + +To install CUB through Conda: + +```sh +conda install -c bottler nvidiacub +``` + +Alternatively, CUB may be installed manually by downloading from the +[GitHub Releases](https://github.com/NVIDIA/cub/releases) page and unpacking +the package: + +```sh +# Download and unpack CUB +wget -q https://github.com/NVIDIA/cub/archive/1.10.0.tar.gz +``` + + +## Set Up for ROCm Build + +Setting the machine up for ROCm builds of FBGEMM_GPU can be done either through +pre-built Docker images or through bare metal. + +### Docker Image + +For setups through Docker, simply pull the pre-installed +[Docker image for ROCm](https://hub.docker.com/r/rocm/rocm-terminal) for the +desired ROCm version. + +```sh +# Run for ROCm 5.4.2 +docker run -it --entrypoint "/bin/bash" rocm/rocm-terminal:5.4.2 +``` + +From there, the rest of the build environment may be constructed through Conda. + +### Install ROCm + +Install the full ROCm package through the operating system package manager. The +full instructions can be found in the +[ROCm installation guide](https://docs.amd.com/bundle/ROCm-Installation-Guide-v5.4.3/page/How_to_Install_ROCm.html): + +```sh +# [OPTIONAL] Disable apt installation prompts +export DEBIAN_FRONTEND=noninteractive + +# Update the repo DB +apt update + +# Download the installer +wget https://repo.radeon.com/amdgpu-install/5.4.3/ubuntu/focal/amdgpu-install_5.4.50403-1_all.deb + +# Run the installer +apt install ./amdgpu-install_5.4.50403-1_all.deb + +# Install ROCm +amdgpu-install -y --usecase=hiplibsdk,rocm --no-dkms +``` + +### Install MIOpen + +[MIOpen](https://github.com/ROCmSoftwarePlatform/MIOpen) is a dependency for the +ROCm variant of FBGEMM_GPU that needs to be installed: + +```sh +apt install hipify-clang miopen-hip miopen-hip-dev +``` + + +## Install PyTorch + +The official [PyTorch Homepage](https://pytorch.org/get-started/locally/) contains +the most authoritative instructions on how to install PyTorch, either through +Conda or through PIP.
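Whichever installation route below is used, a quick follow-up check (illustrative, not part of the original instructions) confirms whether a CUDA-enabled build of PyTorch was actually selected; `torch.version.cuda` prints `None` for CPU-only builds:

```sh
conda run -n "${env_name}" python -c "import torch; print(torch.version.cuda)"
```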
+ +### Installation Through Conda + +```sh +# Install the latest nightly +conda install -n "${env_name}" -y pytorch -c pytorch-nightly +# Install the latest test (RC) +conda install -n "${env_name}" -y pytorch -c pytorch-test +# Install a specific version +conda install -n "${env_name}" -y pytorch==1.13.1 -c pytorch +``` + +Note that installing PyTorch through Conda without specifying a version (as in +the case of nightly builds) may not always be reliable. For example, it is known +that the GPU builds for PyTorch nightlies arrive in Conda 2 hours later than the +CPU-only builds. As such, a Conda installation of `pytorch-nightly` in that time +window will silently fall back to installing the CPU-only version. + +Also note that, because both the GPU and CPU-only versions of PyTorch are placed +into the same artifact bucket, the PyTorch variant that is selected during +installation will depend on whether or not CUDA is installed on the system. Thus +for GPU builds, it is important to install CUDA prior to PyTorch. + +### Installation Through PIP + +Note that PIP is the only supported way to install PyTorch for ROCm builds. + +```sh +# Install the latest nightly +conda run -n "${env_name}" pip install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu117/ +# Install the latest test (RC) +conda run -n "${env_name}" pip install --pre torch --extra-index-url https://download.pytorch.org/whl/test/cu117/ +# Install a specific version +conda run -n "${env_name}" pip install torch==1.13.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117/ +# Install the latest nightly (ROCm 5.3) +conda run -n "${env_name}" pip install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/rocm5.3/ +``` + +### Post-Install Checks + +Verify the PyTorch installation with an `import` test: + +```sh +conda run -n "${env_name}" python -c "import torch.distributed" +``` + +For the GPU variant of PyTorch, ensure that at the minimum, **`cuda_cmake_macros.h`** +is found: + +```sh +conda_prefix=$(conda run -n "${env_name}" printenv CONDA_PREFIX) +find "${conda_prefix}" -name cuda_cmake_macros.h +``` + + +## Build the FBGEMM_GPU Package + +### Preparing the Build + +Clone the repo along with its submodules, and install the packages listed in +`requirements.txt`: + +```sh +# !! Run inside the Conda environment !! + +# Select a version tag +FBGEMM_VERSION=v0.4.0 + +# Clone the repo along with its submodules +git clone --recursive -b ${FBGEMM_VERSION} https://github.com/pytorch/FBGEMM.git fbgemm_${FBGEMM_VERSION} + +# Install additional required packages for building and testing +cd fbgemm_${FBGEMM_VERSION}/fbgemm_gpu +pip install -r requirements.txt +``` + +### The Build Process + +The FBGEMM_GPU build process uses a scikit-build CMake-based build flow, and it +keeps state across install runs. As such, builds can become stale and can cause +problems when re-runs are attempted after a build failure due to missing +dependencies, etc. To address this, simply clear the build cache: + +```sh +# !! Run in fbgemm_gpu/ directory inside the Conda environment !! + +python setup.py clean +``` + +### CUDA Build + +Building FBGEMM_GPU for CUDA requires both NVML and cuDNN to be installed and +made available to the build through environment variables: + +```sh +# !! Run in fbgemm_gpu/ directory inside the Conda environment !!
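# [ILLUSTRATIVE] Not part of the original instructions: if CUDA was installed
# through Conda as described earlier, the NVML library referenced below can
# usually be located under the Conda prefix, e.g.:
#   conda_prefix=$(conda run -n "${env_name}" printenv CONDA_PREFIX)
#   find "${conda_prefix}" -name libnvidia-ml.so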
+ +# [OPTIONAL] Specify the CUDA installation paths +# This may be required if CMake is unable to find nvcc +export CUDACXX=/path/to/nvcc +export CUDA_BIN_PATH=/path/to/cuda/installation + +# [OPTIONAL] Provide the CUB installation directory (applicable only to CUDA versions prior to 11.1) +export CUB_DIR=/path/to/cub + +# Specify cuDNN header and library paths +export CUDNN_INCLUDE_DIR=/path/to/cudnn/include +export CUDNN_LIBRARY=/path/to/cudnn/lib + +# Specify NVML path +export NVML_LIB_PATH=/path/to/libnvidia-ml.so + +# Update to reflect the version of Python in the Conda environment +python_tag=py310 +package_name=fbgemm_gpu + +# Build for SM70/80 (V100/A100 GPU); update as needed +# If not specified, only the CUDA architecture supported by the current system will be targeted +# If no CUDA device is present either, all CUDA architectures will be targeted +cuda_arch_list="7.0;8.0" + +# Build the wheel artifact only +python setup.py bdist_wheel \ + --package_name="${package_name}" \ + --python-tag="${python_tag}" \ + --plat-name=manylinux1_x86_64 \ + --nvml_lib_path=${NVML_LIB_PATH} \ + -DTORCH_CUDA_ARCH_LIST="${cuda_arch_list}" + +# Build and install the library into the Conda environment +python setup.py install \ + --nvml_lib_path=${NVML_LIB_PATH} \ + -DTORCH_CUDA_ARCH_LIST="${cuda_arch_list}" +``` + +### ROCm Build + +For ROCm builds, `ROCM_PATH` and `PYTORCH_ROCM_ARCH` need to be specified: + +```sh +# !! Run in fbgemm_gpu/ directory inside the Conda environment !! + +# Build for the ROCm architecture on the current machine; update as needed (e.g. 'gfx906;gfx908;gfx90a') +export ROCM_PATH=/path/to/rocm +export PYTORCH_ROCM_ARCH=$(${ROCM_PATH}/bin/rocminfo | grep -o -m 1 'gfx.*') + +python_tag=py310 +package_name=fbgemm_gpu_rocm + +# Build the wheel artifact only +python setup.py bdist_wheel \ + --package_name="${package_name}" \ + --python-tag="${python_tag}" \ + --plat-name=manylinux1_x86_64 + +# Build and install the library into the Conda environment +python setup.py install develop +``` + +### CPU-Only Build + +For CPU-only builds, the `--cpu_only` flag needs to be specified: + +```sh +# !! Run in fbgemm_gpu/ directory inside the Conda environment !! + +python_tag=py310 +package_name=fbgemm_gpu_cpu + +# Build the wheel artifact only +python setup.py bdist_wheel \ + --package_name="${package_name}" \ + --python-tag="${python_tag}" \ + --plat-name=manylinux1_x86_64 \ + --cpu_only + +# Build and install the library into the Conda environment +python setup.py install --cpu_only +``` + +### Post-Build Checks + +After the build completes, it is useful to check the built library and verify +the version numbers of GLIBCXX referenced as well as the availability of certain +function symbols: + +```sh +# !! Run in fbgemm_gpu/ directory inside the Conda environment !! + +# Locate the built .SO file +fbgemm_gpu_lib_path=$(find .
-name fbgemm_gpu_py.so) + +# Note the versions of GLIBCXX referenced by the .SO +# The libstdc++.so.6 available on the install target must support these versions +objdump -TC "${fbgemm_gpu_lib_path}" | grep GLIBCXX | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat + +# Test for the existence of a given function symbol in the .SO +nm -gDC "${fbgemm_gpu_lib_path}" | grep " fbgemm_gpu::merge_pooled_embeddings(" +nm -gDC "${fbgemm_gpu_lib_path}" | grep " fbgemm_gpu::jagged_2d_to_dense(" +``` diff --git a/fbgemm_gpu/docs/README.md b/fbgemm_gpu/docs/README.md index 097cde17dc..e2b0c81ae7 100644 --- a/fbgemm_gpu/docs/README.md +++ b/fbgemm_gpu/docs/README.md @@ -123,7 +123,7 @@ Follow these instructions to document, generate, and publish a new C++ descripti ``` pip3 install -r requirements.txt - doxygen Doxygen.ini + doxygen Doxyfile.in make html ``` diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py index 0552e9c981..2c7d99610f 100644 --- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py @@ -9,7 +9,7 @@ import enum import logging -from dataclasses import dataclass +from dataclasses import dataclass, field from itertools import accumulate from math import log2 from typing import Dict, List, NamedTuple, Optional, Tuple, Type, Union @@ -71,6 +71,43 @@ class WeightDecayMode(enum.IntEnum): NONE = 0 L2 = 1 DECOUPLE = 2 + COUNTER = 3 + + +class CounterWeightDecayMode(enum.IntEnum): + NONE = 0 + L2 = 1 + DECOUPLE = 2 + + +class LearningRateMode(enum.IntEnum): + EQUAL = -1 + TAIL_ID_LR_INCREASE = 0 + TAIL_ID_LR_DECREASE = 1 + COUNTER_SGD = 2 + + +class GradSumDecay(enum.IntEnum): + NO_DECAY = -1 + CTR_DECAY = 0 + + +@dataclass +class TailIdThreshold: + val: float = 0 + is_ratio: bool = False + + +@dataclass +class CounterBasedRegularizationDefinition: + counter_weight_decay_mode: CounterWeightDecayMode = CounterWeightDecayMode.NONE + counter_halflife: int = -1 + adjustment_iter: int = -1 + adjustment_ub: float = 1.0 + learning_rate_mode: LearningRateMode = LearningRateMode.EQUAL + grad_sum_decay: GradSumDecay = GradSumDecay.NO_DECAY + tail_id_threshold: TailIdThreshold = field(default_factory=TailIdThreshold) + max_counter_update_freq: int = 1000 RecordCacheMetrics: NamedTuple = NamedTuple( @@ -78,14 +115,16 @@ class WeightDecayMode(enum.IntEnum): [("record_cache_miss_counter", bool), ("record_tablewise_cache_miss", bool)], ) - -@dataclass -class SplitState: - dev_size: int - host_size: int - uvm_size: int - placements: List[EmbeddingLocation] - offsets: List[int] +SplitState: NamedTuple = NamedTuple( + "SplitState", + [ + ("dev_size", int), + ("host_size", int), + ("uvm_size", int), + ("placements", List[EmbeddingLocation]), + ("offsets", List[int]), + ], +) def construct_split_state( @@ -95,11 +134,11 @@ def construct_split_state( precision: SparseType = SparseType.FP32, int8_emb_row_dim_offset: int = INT8_EMB_ROW_DIM_OFFSET, ) -> SplitState: - placements = [] - offsets = [] - dev_size = 0 - host_size = 0 - uvm_size = 0 + placements: List[EmbeddingLocation] = [] + offsets: List[int] = [] + dev_size: int = 0 + host_size: int = 0 + uvm_size: int = 0 for num_embeddings, embedding_dim, location, _ in embedding_specs: assert ( embedding_dim % 4 == 0 @@ -235,6 +274,9 @@ def __init__( # noqa C901 eta: float = 0.001, # used by LARS-SGD, beta1: float = 0.9, # used by LAMB and ADAM beta2: float = 0.999, # used by LAMB and ADAM + 
counter_based_regularization: Optional[ + CounterBasedRegularizationDefinition + ] = None, # used by Rowwise Adagrad pooling_mode: PoolingMode = PoolingMode.SUM, device: Optional[Union[str, int, torch.device]] = None, bounds_check_mode: BoundsCheckMode = BoundsCheckMode.WARNING, @@ -408,6 +450,34 @@ def __init__( # noqa C901 self.stochastic_rounding = stochastic_rounding self.optimizer = optimizer + self.weight_decay_mode = weight_decay_mode + if ( + weight_decay_mode == WeightDecayMode.COUNTER + and counter_based_regularization is None + ): + raise AssertionError( + "weight_decay_mode is set to WeightDecayMode.COUNTER but counter_based_regularization is None" + ) + + self._used_rowwise_adagrad_with_counter: bool = ( + optimizer in (OptimType.EXACT_ROWWISE_ADAGRAD, OptimType.ROWWISE_ADAGRAD) + and weight_decay_mode == WeightDecayMode.COUNTER + and counter_based_regularization is not None + ) + + if counter_based_regularization is None: + counter_based_regularization = CounterBasedRegularizationDefinition() + self._max_counter_update_freq: int = -1 + if self._used_rowwise_adagrad_with_counter: + self._max_counter_update_freq = ( + counter_based_regularization.max_counter_update_freq + ) + opt_arg_weight_decay_mode = ( + counter_based_regularization.counter_weight_decay_mode + ) + else: + opt_arg_weight_decay_mode = weight_decay_mode + self.optimizer_args = invokers.lookup_args.OptimizerArgs( stochastic_rounding=stochastic_rounding, gradient_clipping=gradient_clipping, @@ -417,9 +487,18 @@ def __init__( # noqa C901 beta1=beta1, beta2=beta2, weight_decay=weight_decay, - weight_decay_mode=weight_decay_mode.value, + weight_decay_mode=opt_arg_weight_decay_mode.value, eta=eta, momentum=momentum, + counter_halflife=counter_based_regularization.counter_halflife, + adjustment_iter=counter_based_regularization.adjustment_iter, + adjustment_ub=counter_based_regularization.adjustment_ub, + learning_rate_mode=counter_based_regularization.learning_rate_mode.value, + grad_sum_decay=counter_based_regularization.grad_sum_decay.value, + tail_id_threshold=counter_based_regularization.tail_id_threshold.val, + is_tail_id_thresh_ratio=int( + counter_based_regularization.tail_id_threshold.is_ratio + ), ) if optimizer in ( @@ -427,25 +506,7 @@ def __init__( # noqa C901 OptimType.EXACT_SGD, ): # NOTE: make TorchScript work! - self.register_buffer( - "momentum1_dev", torch.tensor([0], dtype=torch.int64), persistent=False - ) - self.register_buffer( - "momentum1_host", torch.tensor([0], dtype=torch.int64), persistent=False - ) - self.register_buffer( - "momentum1_uvm", torch.tensor([0], dtype=torch.int64), persistent=False - ) - self.register_buffer( - "momentum1_placements", - torch.tensor([0], dtype=torch.int64), - persistent=False, - ) - self.register_buffer( - "momentum1_offsets", - torch.tensor([0], dtype=torch.int64), - persistent=False, - ) + self._register_nonpersistent_buffers("momentum1") else: self._apply_split( construct_split_state( @@ -484,29 +545,40 @@ def __init__( # noqa C901 ) else: # NOTE: make TorchScript work! 
- self.register_buffer( - "momentum2_dev", - torch.zeros(1, dtype=torch.int64, device=self.current_device), - persistent=False, - ) - self.register_buffer( - "momentum2_host", - torch.zeros(1, dtype=torch.int64, device=self.current_device), - persistent=False, - ) - self.register_buffer( - "momentum2_uvm", - torch.zeros(1, dtype=torch.int64, device=self.current_device), - persistent=False, + self._register_nonpersistent_buffers("momentum2") + if self._used_rowwise_adagrad_with_counter: + self._apply_split( + construct_split_state( + embedding_specs, + rowwise=True, + cacheable=False, + ), + prefix="prev_iter", + # TODO: ideally we should use int64 to track iter but it failed to compile. + # It may be related to low precision training code. Currently using float32 + # as a workaround while investigating the issue. + # pyre-fixme[6]: Expected `Type[Type[torch._dtype]]` for 3rd param + # but got `Type[torch.float32]`. + dtype=torch.float32, ) - self.register_buffer( - "momentum2_placements", - torch.zeros(1, dtype=torch.int64, device=self.current_device), - persistent=False, + self._apply_split( + construct_split_state( + embedding_specs, + rowwise=True, + cacheable=False, + ), + prefix="row_counter", + # pyre-fixme[6]: Expected `Type[Type[torch._dtype]]` for 3rd param + # but got `Type[torch.float32]`. + dtype=torch.float32, ) + self.register_buffer("max_counter", torch.tensor([1], dtype=torch.float32)) + else: + self._register_nonpersistent_buffers("prev_iter") + self._register_nonpersistent_buffers("row_counter") self.register_buffer( - "momentum2_offsets", - torch.zeros(1, dtype=torch.int64, device=self.current_device), + "max_counter", + torch.ones(1, dtype=torch.float32, device=self.current_device), persistent=False, ) if optimizer in ( @@ -519,6 +591,7 @@ def __init__( # noqa C901 self.register_buffer( "iter", torch.zeros(1, dtype=torch.int64, device=self.current_device) ) + else: self.register_buffer( "iter", @@ -572,6 +645,34 @@ def __init__( # noqa C901 self.step = 0 + def _register_nonpersistent_buffers(self, prefix: str) -> None: + # NOTE: make TorchScript work! 
+ self.register_buffer( + f"{prefix}_dev", + torch.zeros(1, dtype=torch.int64, device=self.current_device), + persistent=False, + ) + self.register_buffer( + f"{prefix}_host", + torch.zeros(1, dtype=torch.int64, device=self.current_device), + persistent=False, + ) + self.register_buffer( + f"{prefix}_uvm", + torch.zeros(1, dtype=torch.int64, device=self.current_device), + persistent=False, + ) + self.register_buffer( + f"{prefix}_placements", + torch.zeros(1, dtype=torch.int64, device=self.current_device), + persistent=False, + ) + self.register_buffer( + f"{prefix}_offsets", + torch.zeros(1, dtype=torch.int64, device=self.current_device), + persistent=False, + ) + def get_states(self, prefix: str) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]: if not hasattr(self, f"{prefix}_physical_placements"): raise DoesNotHavePrefix() @@ -590,7 +691,7 @@ def get_states(self, prefix: str) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tenso def get_all_states(self) -> List[Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]]: all_states = [] - for prefix in ["weights", "momentum1", "momentum2"]: + for prefix in ["weights", "momentum1", "momentum2", "prev_iter", "row_counter"]: try: all_states.append(self.get_states(prefix)) except DoesNotHavePrefix: @@ -681,10 +782,20 @@ def forward( return invokers.lookup_approx_sgd.invoke(common_args, self.optimizer_args) momentum1 = invokers.lookup_args.Momentum( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got `Union[Tensor, + # nn.Module]`. dev=self.momentum1_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got `Union[Tensor, + # nn.Module]`. host=self.momentum1_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got `Union[Tensor, + # nn.Module]`. uvm=self.momentum1_uvm, + # pyre-fixme[6]: Expected `Tensor` for 4th param but got `Union[Tensor, + # nn.Module]`. offsets=self.momentum1_offsets, + # pyre-fixme[6]: Expected `Tensor` for 5th param but got `Union[Tensor, + # nn.Module]`. placements=self.momentum1_placements, ) @@ -696,21 +807,22 @@ def forward( return invokers.lookup_adagrad.invoke( common_args, self.optimizer_args, momentum1 ) - if self.optimizer == OptimType.EXACT_ROWWISE_ADAGRAD: - return invokers.lookup_rowwise_adagrad.invoke( - common_args, self.optimizer_args, momentum1 - ) - if self.optimizer == OptimType.ROWWISE_ADAGRAD: - assert self.use_cpu, "Approx rowwise AdaGrad is only supported in CPU mode" - return invokers.lookup_approx_rowwise_adagrad.invoke( - common_args, self.optimizer_args, momentum1 - ) momentum2 = invokers.lookup_args.Momentum( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got `Union[Tensor, + # nn.Module]`. dev=self.momentum2_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got `Union[Tensor, + # nn.Module]`. host=self.momentum2_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got `Union[Tensor, + # nn.Module]`. uvm=self.momentum2_uvm, + # pyre-fixme[6]: Expected `Tensor` for 4th param but got `Union[Tensor, + # nn.Module]`. offsets=self.momentum2_offsets, + # pyre-fixme[6]: Expected `Tensor` for 5th param but got `Union[Tensor, + # nn.Module]`. placements=self.momentum2_placements, ) # Ensure iter is always on CPU so the increment doesn't synchronize. @@ -768,6 +880,79 @@ def forward( self.iter.item(), ) + prev_iter = invokers.lookup_args.Momentum( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got `Union[Tensor, + # nn.Module]`. + dev=self.prev_iter_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got `Union[Tensor, + # nn.Module]`. 
+ host=self.prev_iter_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got `Union[Tensor, + # nn.Module]`. + uvm=self.prev_iter_uvm, + # pyre-fixme[6]: Expected `Tensor` for 4th param but got `Union[Tensor, + # nn.Module]`. + offsets=self.prev_iter_offsets, + # pyre-fixme[6]: Expected `Tensor` for 5th param but got `Union[Tensor, + # nn.Module]`. + placements=self.prev_iter_placements, + ) + row_counter = invokers.lookup_args.Momentum( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got `Union[Tensor, + # nn.Module]`. + dev=self.row_counter_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got `Union[Tensor, + # nn.Module]`. + host=self.row_counter_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got `Union[Tensor, + # nn.Module]`. + uvm=self.row_counter_uvm, + # pyre-fixme[6]: Expected `Tensor` for 4th param but got `Union[Tensor, + # nn.Module]`. + offsets=self.row_counter_offsets, + # pyre-fixme[6]: Expected `Tensor` for 5th param but got `Union[Tensor, + # nn.Module]`. + placements=self.row_counter_placements, + ) + if self._used_rowwise_adagrad_with_counter: + if self.iter.item() % self._max_counter_update_freq == 0: + max_counter = torch.max(self.row_counter_dev.detach()) + self.max_counter = max_counter.cpu() + 1 + + if self.optimizer == OptimType.EXACT_ROWWISE_ADAGRAD: + if self._used_rowwise_adagrad_with_counter: + return invokers.lookup_rowwise_adagrad_with_counter.invoke( + common_args, + self.optimizer_args, + momentum1, + prev_iter, + row_counter, + # pyre-fixme[6]: Expected `int` for 6th param but got `Union[float, int]`. + self.iter.item(), + self.max_counter.item(), + ) + else: + return invokers.lookup_rowwise_adagrad.invoke( + common_args, self.optimizer_args, momentum1 + ) + if self.optimizer == OptimType.ROWWISE_ADAGRAD: + assert self.use_cpu, "Approx rowwise AdaGrad is only supported in CPU mode" + if self._used_rowwise_adagrad_with_counter: + return invokers.lookup_approx_rowwise_adagrad_with_counter.invoke( + common_args, + self.optimizer_args, + momentum1, + prev_iter, + row_counter, + # pyre-fixme[6]: Expected `int` for 6th param but got `Union[float, int]`. 
+ self.iter.item(), + self.max_counter.item(), + ) + else: + return invokers.lookup_approx_rowwise_adagrad.invoke( + common_args, self.optimizer_args, momentum1 + ) + raise ValueError(f"Invalid OptimType: {self.optimizer}") def reset_uvm_cache_stats(self) -> None: @@ -796,10 +981,11 @@ def print_uvm_cache_stats(self) -> None: f"N_conflict_unique_misses: {uvm_cache_stats[4]}\n" f"N_conflict_misses: {uvm_cache_stats[5]}\n" ) - logging.info( - f"unique indices / requested indices: {uvm_cache_stats[2]/uvm_cache_stats[1]}\n" - f"unique misses / requested indices: {uvm_cache_stats[3]/uvm_cache_stats[1]}\n" - ) + if uvm_cache_stats[1]: + logging.info( + f"unique indices / requested indices: {uvm_cache_stats[2]/uvm_cache_stats[1]}\n" + f"unique misses / requested indices: {uvm_cache_stats[3]/uvm_cache_stats[1]}\n" + ) def prefetch(self, indices: Tensor, offsets: Tensor) -> None: self.timestep += 1 @@ -1013,8 +1199,12 @@ def get_optimizer_state(self) -> List[Dict[str, torch.Tensor]]: or self.optimizer == OptimType.ROWWISE_ADAGRAD or self.optimizer == OptimType.EXACT_ROWWISE_WEIGHTED_ADAGRAD ): + split_optimizer_states = self.split_optimizer_states() list_of_state_dict = [ - {"sum": _sum[0]} for _sum in self.split_optimizer_states() + {"sum": states[0], "prev_iter": states[1], "row_counter": states[2]} + if self._used_rowwise_adagrad_with_counter + else {"sum": states[0]} + for states in split_optimizer_states ] else: raise NotImplementedError( @@ -1024,7 +1214,9 @@ def get_optimizer_state(self) -> List[Dict[str, torch.Tensor]]: return list_of_state_dict @torch.jit.ignore - def split_optimizer_states(self) -> List[Tuple[torch.Tensor]]: + def split_optimizer_states( + self, + ) -> List[List[torch.Tensor]]: """ Returns a list of states, split by table """ @@ -1062,8 +1254,14 @@ def get_optimizer_states( ): states.append( get_optimizer_states( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got + # `Union[Tensor, nn.Module]`. self.momentum1_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got + # `Union[Tensor, nn.Module]`. self.momentum1_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got + # `Union[Tensor, nn.Module]`. self.momentum1_uvm, # pyre-fixme[6]: Expected `Tensor` for 4th param but got # `Union[Tensor, nn.Module]`. @@ -1087,8 +1285,14 @@ def get_optimizer_states( ): states.append( get_optimizer_states( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got + # `Union[Tensor, nn.Module]`. self.momentum2_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got + # `Union[Tensor, nn.Module]`. self.momentum2_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got + # `Union[Tensor, nn.Module]`. self.momentum2_uvm, # pyre-fixme[6]: Expected `Tensor` for 4th param but got # `Union[Tensor, nn.Module]`. @@ -1100,7 +1304,49 @@ def get_optimizer_states( in (OptimType.PARTIAL_ROWWISE_ADAM, OptimType.PARTIAL_ROWWISE_LAMB), ) ) - return list(zip(*states)) + if self._used_rowwise_adagrad_with_counter: + states.append( + get_optimizer_states( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got + # `Union[Tensor, nn.Module]`. + self.prev_iter_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got + # `Union[Tensor, nn.Module]`. + self.prev_iter_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got + # `Union[Tensor, nn.Module]`. + self.prev_iter_uvm, + # pyre-fixme[6]: Expected `Tensor` for 4th param but got + # `Union[Tensor, nn.Module]`. 
+ self.prev_iter_physical_offsets, + # pyre-fixme[6]: Expected `Tensor` for 5th param but got + # `Union[Tensor, nn.Module]`. + self.prev_iter_physical_placements, + rowwise=True, + ) + ) + states.append( + get_optimizer_states( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got + # `Union[Tensor, nn.Module]`. + self.row_counter_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got + # `Union[Tensor, nn.Module]`. + self.row_counter_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got + # `Union[Tensor, nn.Module]`. + self.row_counter_uvm, + # pyre-fixme[6]: Expected `Tensor` for 4th param but got + # `Union[Tensor, nn.Module]`. + self.row_counter_physical_offsets, + # pyre-fixme[6]: Expected `Tensor` for 5th param but got + # `Union[Tensor, nn.Module]`. + self.row_counter_physical_placements, + rowwise=True, + ) + ) + return_states = [list(s) for s in zip(*states)] + return return_states @torch.jit.export def set_learning_rate(self, lr: float) -> None: @@ -1691,8 +1937,8 @@ def nbit_construct_split_state( scale_bias_size_in_bytes: int = DEFAULT_SCALE_BIAS_SIZE_IN_BYTES, cacheline_alignment: bool = True, ) -> SplitState: - placements = [] - offsets = [] + placements = torch.jit.annotate(List[EmbeddingLocation], []) + offsets = torch.jit.annotate(List[int], []) dev_size = 0 host_size = 0 uvm_size = 0 @@ -1740,6 +1986,8 @@ class IntNBitTableBatchedEmbeddingBagsCodegen(nn.Module): cache_miss_counter: torch.Tensor uvm_cache_stats: torch.Tensor local_uvm_cache_stats: torch.Tensor + weights_offsets: torch.Tensor + weights_placements: torch.Tensor def __init__( self, @@ -1921,21 +2169,7 @@ def max_ty_D(ty: SparseType) -> int: ] self.max_D_cache: int = max(cached_dims) if len(cached_dims) > 0 else 0 - weight_split: SplitState = nbit_construct_split_state( - self.embedding_specs, - cacheable=True, - row_alignment=self.row_alignment, - scale_bias_size_in_bytes=self.scale_bias_size_in_bytes, - cacheline_alignment=cacheline_alignment, - ) - - self.weights_physical_placements: List[int] = [ - t.value for t in weight_split.placements - ] - self.weights_physical_offsets: List[int] = weight_split.offsets - self.host_size: int = weight_split.host_size - self.dev_size: int = weight_split.dev_size - self.uvm_size: int = weight_split.uvm_size + self.initialize_physical_weights_placements_and_offsets(cacheline_alignment) self.enforce_hbm: bool = enforce_hbm # Assign weights after weights and weights_offsets are initialized. @@ -1948,7 +2182,8 @@ def max_ty_D(ty: SparseType) -> int: self.weights_physical_offsets, self.enforce_hbm, ) - self.assign_embedding_weights(weight_lists) # type: ignore + # pyre-fixme [6]: In call `IntNBitTableBatchedEmbeddingBagsCodegen.assign_embedding_weights`, for 1st positional argument, expected `List[Tuple[Tensor, Optional[Tensor]]]` but got `List[Tuple[Tensor, Tensor]]`. + self.assign_embedding_weights(weight_lists) # Handle index remapping for embedding pruning. 
self.register_buffer( @@ -2104,10 +2339,11 @@ def print_uvm_cache_stats(self) -> None: f"N_conflict_unique_misses: {uvm_cache_stats[4]}\n" f"N_conflict_misses: {uvm_cache_stats[5]}\n" ) - logging.info( - f"unique indices / requested indices: {uvm_cache_stats[2]/uvm_cache_stats[1]}\n" - f"unique misses / requested indices: {uvm_cache_stats[3]/uvm_cache_stats[1]}\n" - ) + if uvm_cache_stats[1]: + logging.info( + f"unique indices / requested indices: {uvm_cache_stats[2]/uvm_cache_stats[1]}\n" + f"unique misses / requested indices: {uvm_cache_stats[3]/uvm_cache_stats[1]}\n" + ) @torch.jit.export def prefetch(self, indices: Tensor, offsets: Tensor) -> None: @@ -2409,6 +2645,72 @@ def forward( fp8_exponent_bias=self.fp8_exponent_bias, ) + def initialize_logical_weights_placements_and_offsets( + self, + ) -> None: + assert len(self.weights_physical_offsets) == len(self.embedding_specs) + assert len(self.weights_physical_offsets) == len( + self.weights_physical_placements + ) + offsets = [self.weights_physical_offsets[t] for t in self.feature_table_map] + placements = [ + self.weights_physical_placements[t] for t in self.feature_table_map + ] + self.weights_offsets = torch.tensor( + offsets, device=self.current_device, dtype=torch.int64 + ) + self.weights_placements = torch.tensor( + placements, device=self.current_device, dtype=torch.int32 + ) + + def initialize_physical_weights_placements_and_offsets( + self, + cacheline_alignment: bool = True, + ) -> None: + # Initialize physical weights placements and offsets + # and host/dev/uvm sizes + weight_split: SplitState = nbit_construct_split_state( + self.embedding_specs, + cacheable=True, + row_alignment=self.row_alignment, + scale_bias_size_in_bytes=self.scale_bias_size_in_bytes, + cacheline_alignment=cacheline_alignment, + ) + self.weights_physical_placements = [t.value for t in weight_split.placements] + self.weights_physical_offsets = weight_split.offsets + self.host_size = weight_split.host_size + self.dev_size = weight_split.dev_size + self.uvm_size = weight_split.uvm_size + + @torch.jit.export + def reset_weights_placements_and_offsets( + self, device: torch.device, location: int + ) -> None: + # Reset device/location denoted in embedding specs + self.reset_embedding_spec_location(device, location) + # Initialize all physical/logical weights placements and offsets without initializing large dev weights tensor + self.initialize_physical_weights_placements_and_offsets() + self.initialize_logical_weights_placements_and_offsets() + + def reset_embedding_spec_location( + self, device: torch.device, location: int + ) -> None: + # Overwrite location in embedding_specs with new location + # Use map since can't script enum call (ie. 
EmbeddingLocation(value)) + INT_TO_EMBEDDING_LOCATION = { + 0: EmbeddingLocation.DEVICE, + 1: EmbeddingLocation.MANAGED, + 2: EmbeddingLocation.MANAGED_CACHING, + 3: EmbeddingLocation.HOST, + } + target_location = INT_TO_EMBEDDING_LOCATION[location] + self.current_device = device + self.row_alignment = 1 if target_location == EmbeddingLocation.HOST else 16 + self.embedding_specs = [ + (spec[0], spec[1], spec[2], spec[3], target_location) + for spec in self.embedding_specs + ] + def _apply_split( self, dev_size: int, @@ -2427,14 +2729,7 @@ def _apply_split( self.dev_size = dev_size self.uvm_size = uvm_size - offsets = [offsets[t] for t in self.feature_table_map] - placements = [placements[t] for t in self.feature_table_map] - self.weights_offsets = torch.tensor( - offsets, device=self.current_device, dtype=torch.int64 - ) - self.weights_placements = torch.tensor( - placements, device=self.current_device, dtype=torch.int32 - ) + self.initialize_logical_weights_placements_and_offsets() if dev_size > 0: self.weights_dev = torch.zeros( @@ -2816,6 +3111,49 @@ def assign_embedding_weights( else: assert dest_weight[1] is None + @torch.jit.export + def set_index_remappings_array( + self, + index_remapping: List[Tensor], + ) -> None: + rows: List[int] = [e[1] for e in self.embedding_specs] + index_remappings_array_offsets = [0] + original_feature_rows = torch.jit.annotate(List[int], []) + last_offset = 0 + for t, mapping in enumerate(index_remapping): + if mapping is not None: + current_original_row = mapping.numel() + last_offset += current_original_row + original_feature_rows.append(current_original_row) + else: + original_feature_rows.append(rows[t]) + index_remappings_array_offsets.append(last_offset) + + self.index_remappings_array_offsets = torch.tensor( + index_remappings_array_offsets, + device=self.current_device, + dtype=torch.int64, + ) + if len(original_feature_rows) == 0: + original_feature_rows = rows + self.original_rows_per_table = torch.tensor( + [original_feature_rows[t] for t in self.feature_table_map], + device=self.current_device, + dtype=torch.int64, + ) + if self.index_remappings_array_offsets[-1] == 0: + self.index_remappings_array = torch.empty( + 0, dtype=torch.int32, device=self.current_device + ) + else: + index_remappings_filter_nones = [] + for mapping in index_remapping: + if mapping is not None: + index_remappings_filter_nones.append(mapping) + self.index_remappings_array = torch.cat(index_remappings_filter_nones).to( + self.current_device + ) + def set_index_remappings( self, index_remapping: List[Tensor], @@ -2882,37 +3220,7 @@ def set_index_remappings( self.index_remapping_hash_table_cpu = None # Array mapping pruning else: - index_remappings_array_offsets = [0] - original_feature_rows = [] - last_offset = 0 - for t, mapping in enumerate(index_remapping): - if mapping is not None: - current_original_row = mapping.numel() - last_offset += current_original_row - original_feature_rows.append(current_original_row) - else: - original_feature_rows.append(rows[t]) - index_remappings_array_offsets.append(last_offset) - - self.index_remappings_array_offsets = torch.tensor( - index_remappings_array_offsets, - device=self.current_device, - dtype=torch.int64, - ) - if len(original_feature_rows) == 0: - original_feature_rows = rows - self.original_rows_per_table = torch.tensor( - [original_feature_rows[t] for t in self.feature_table_map], - device=self.current_device, - dtype=torch.int64, - ) - self.index_remappings_array = ( - torch.empty(0, dtype=torch.int32, 
device=self.current_device) - if self.index_remappings_array_offsets[-1] == 0 - else torch.cat( - [mapping for mapping in index_remapping if mapping is not None] - ).to(self.current_device) - ) + self.set_index_remappings_array(index_remapping) def _embedding_inplace_update_per_table( self, diff --git a/fbgemm_gpu/fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py b/fbgemm_gpu/fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py index 1eec03fdd9..250f84abb6 100644 --- a/fbgemm_gpu/fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py @@ -18,6 +18,7 @@ from fbgemm_gpu.split_table_batched_embeddings_ops import ( align_to_cacheline, CacheAlgorithm, + CounterBasedRegularizationDefinition, DEFAULT_SCALE_BIAS_SIZE_IN_BYTES, EmbeddingLocation, PoolingMode, @@ -88,6 +89,9 @@ def __init__( eta: float = 0.001, # used by LARS-SGD, beta1: float = 0.9, # used by LAMB and ADAM beta2: float = 0.999, # used by LAMB and ADAM + counter_based_regularization: Optional[ + CounterBasedRegularizationDefinition + ] = None, # used by Rowwise Adagrad pooling_mode: PoolingMode = PoolingMode.SUM, ) -> None: super(SSDTableBatchedEmbeddingBags, self).__init__() @@ -217,6 +221,12 @@ def __init__( self.ssd_set_end = torch.cuda.Event() self.timesteps_prefetched: List[int] = [] + if weight_decay_mode == WeightDecayMode.COUNTER or counter_based_regularization: + raise AssertionError( + "weight_decay_mode = WeightDecayMode.COUNTER is not supported for SSD TBE." + ) + counter_based_regularization = CounterBasedRegularizationDefinition() + self.optimizer_args = invokers.lookup_args.OptimizerArgs( stochastic_rounding=stochastic_rounding, gradient_clipping=gradient_clipping, @@ -229,6 +239,15 @@ def __init__( weight_decay_mode=weight_decay_mode.value, eta=eta, momentum=momentum, + counter_halflife=counter_based_regularization.counter_halflife, + adjustment_iter=counter_based_regularization.adjustment_iter, + adjustment_ub=counter_based_regularization.adjustment_ub, + learning_rate_mode=counter_based_regularization.learning_rate_mode.value, + grad_sum_decay=counter_based_regularization.grad_sum_decay.value, + tail_id_threshold=counter_based_regularization.tail_id_threshold.val, + is_tail_id_thresh_ratio=int( + counter_based_regularization.tail_id_threshold.is_ratio + ), ) self.weights_dev = nn.Parameter( torch.empty((0,), device=self.current_device, dtype=torch.float32) diff --git a/fbgemm_gpu/include/fbgemm_gpu/embedding_inplace_update.h b/fbgemm_gpu/include/fbgemm_gpu/embedding_inplace_update.h index 10670b48d4..cfa457d04b 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/embedding_inplace_update.h +++ b/fbgemm_gpu/include/fbgemm_gpu/embedding_inplace_update.h @@ -75,4 +75,28 @@ void embedding_inplace_update_cpu( c10::nullopt // Not used, to match cache interface for CUDA op ); +/** + * Index remapping function that returns the remapped indices. + * + * Args: + * update_row_indices: row indices for every new row + * update_table_indices: table indices for every new row + * index_remappings: concated index remapping for every embedding table + * index_remappings_offsets: offset for each embedding table + * + * Returns: + * remapped indices for each new row. 
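+ *
+ * Illustrative example (values assumed, not taken from this change): with
+ * index_remappings_offsets = [0, 4, 4], table 0 has a remapping of capacity
+ * 4 and table 1 has none. A new row (table_idx = 0, row_idx = 2) is remapped
+ * to index_remappings[0 + 2], while (table_idx = 1, row_idx = 7) hits the
+ * capacity == 0 fall-through and is returned unchanged as 7.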
+ */ +Tensor pruned_array_lookup_from_row_idx_cuda( + const Tensor& update_row_indices, + const Tensor& update_table_indices, + const Tensor& index_remappings, + const Tensor& index_remappings_offsets); + +Tensor pruned_array_lookup_from_row_idx_cpu( + const Tensor& update_row_indices, + const Tensor& update_table_indices, + const Tensor& index_remappings, + const Tensor& index_remappings_offsets); + } // namespace fbgemm_gpu diff --git a/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh index 5ce7d4f5d1..c21057ac49 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh +++ b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh @@ -62,6 +62,11 @@ static constexpr int32_t kWarpSize = 32; #endif // Max thread num in one thread block static constexpr int32_t kMaxThreads = 1024; +// Max block size in Y dimension of a grid +static constexpr int32_t kMaxBlockYDim = 65535; +// Max block size in Z dimension of a grid +static constexpr int32_t kMaxBlockZDim = 65535; + static constexpr float kQParamEps = 1e-8f; /* For rowwise int8 quantization, two quantization parameters (qparams) diff --git a/fbgemm_gpu/include/fbgemm_gpu/fbgemm_tensor_accessor.h b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_tensor_accessor.h new file mode 100644 index 0000000000..750d315d05 --- /dev/null +++ b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_tensor_accessor.h @@ -0,0 +1,575 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +namespace fbgemm_gpu { + +static constexpr size_t PTR_NAME_MAX_LEN = 16; +static constexpr size_t FUNC_NAME_MAX_LEN = 64; + +// The PtrTraits argument to the TensorAccessor/GenericPackedTensorAccessor +// is used to enable the __restrict__ keyword/modifier for the data +// passed to cuda. +template +struct DefaultPtrTraits { + typedef T* PtrType; +}; + +#if defined(__CUDACC__) || defined(__HIPCC__) +template +struct RestrictPtrTraits { + typedef T* __restrict__ PtrType; +}; +#endif + +// TensorAccessorBase and TensorAccessor are used for both CPU and CUDA tensors. +// For CUDA tensors it is used in device code (only). This means that we +// restrict ourselves to functions and types available there (e.g. +// at::IntArrayRef isn't). + +// The PtrTraits argument is only relevant to cuda to support `__restrict__` +// pointers. 
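+// Rough sketch of what the two traits select (float element type assumed):
+//
+//   DefaultPtrTraits<float>::PtrType   ->  float*
+//   RestrictPtrTraits<float>::PtrType  ->  float* __restrict__
+//
+// RestrictPtrTraits is only defined under CUDA/HIP compilation; __restrict__
+// is a no-aliasing promise to the compiler, so it is purely an optimization
+// hint and does not change the accessor's behavior.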
+template < + typename T, + size_t N, + template class PtrTraits = DefaultPtrTraits, + typename index_t = int64_t> +class TensorAccessorBase { + public: + typedef typename PtrTraits::PtrType PtrType; + + C10_HOST_DEVICE TensorAccessorBase( + PtrType data, + const index_t* const sizes, + const index_t* const strides, + const char* const ptr_name, + const char* const func_name) + : data_(data), + sizes_(sizes), + strides_(strides), + ptr_name_(ptr_name), + func_name_(func_name) { + numel_ = 0; + for (size_t d = 0; d < N; d++) { + numel_ += sizes[d]; + } + } + C10_HOST at::IntArrayRef sizes() const { + return at::IntArrayRef(sizes_, N); + } + C10_HOST at::IntArrayRef strides() const { + return at::IntArrayRef(strides_, N); + } + C10_HOST_DEVICE index_t stride(index_t i) const { + return strides_[i]; + } + C10_HOST_DEVICE index_t size(index_t i) const { + return sizes_[i]; + } + C10_HOST_DEVICE PtrType data() { + return data_; + } + C10_HOST_DEVICE const PtrType data() const { + return data_; + } + C10_HOST_DEVICE T& at(index_t idx) const { + if (idx < 0) { + printf( + "ERROR: idx < 0, tensor %s in %s, idx %lld\n", + ptr_name_, + func_name_, + static_cast(idx)); + CUDA_KERNEL_ASSERT(idx >= 0) + } else if (idx >= numel_) { + printf( + "ERROR: idx >= numel, tensor %s in %s, idx %lld, numel %lld\n", + ptr_name_, + func_name_, + static_cast(idx), + static_cast(numel_)); + CUDA_KERNEL_ASSERT(idx < numel_); + } + return data_[idx]; + } + + protected: + PtrType data_; + const index_t* const sizes_; + const index_t* const strides_; + index_t numel_; + const char* const ptr_name_; + const char* const func_name_; +}; + +// The `TensorAccessor` is typically instantiated for CPU `Tensor`s using +// `Tensor.accessor()`. +// For CUDA `Tensor`s, `GenericPackedTensorAccessor` is used on the host and +// only indexing on the device uses `TensorAccessor`s. 
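+// Indexing sketch (shape assumed for illustration): for a 2-D accessor over a
+// contiguous [4][8] tensor, sizes_ = {4, 8} and strides_ = {8, 1}, so
+//
+//   acc[i]     ->  1-D accessor viewing data_ + 8 * i, with sizes_ = {8}
+//   acc[i][j]  ->  element data_[8 * i + j], bounds-checked through at()
+//
+// Each operator[] below simply offsets the data pointer by strides_[0] * i
+// and drops the leading entry of sizes_ / strides_.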
+template < + typename T, + size_t N, + template class PtrTraits = DefaultPtrTraits, + typename index_t = int64_t> +class TensorAccessor : public TensorAccessorBase { + public: + typedef typename PtrTraits::PtrType PtrType; + + C10_HOST_DEVICE TensorAccessor( + PtrType data, + const index_t* const sizes, + const index_t* const strides, + const char* const ptr_name, + const char* const func_name) + : TensorAccessorBase( + data, + sizes, + strides, + ptr_name, + func_name) {} + + C10_HOST_DEVICE TensorAccessor operator[]( + index_t i) { + return TensorAccessor( + this->data_ + this->strides_[0] * i, + this->sizes_ + 1, + this->strides_ + 1, + this->ptr_name_, + this->func_name); + } + + C10_HOST_DEVICE const TensorAccessor operator[]( + index_t i) const { + return TensorAccessor( + this->data_ + this->strides_[0] * i, + this->sizes_ + 1, + this->strides_ + 1, + this->ptr_name_, + this->func_name); + } +}; + +template class PtrTraits, typename index_t> +class TensorAccessor + : public TensorAccessorBase { + public: + typedef typename PtrTraits::PtrType PtrType; + + C10_HOST_DEVICE TensorAccessor( + PtrType data, + const index_t* const sizes, + const index_t* const strides, + const char* const ptr_name, + const char* func_name) + : TensorAccessorBase( + data, + sizes, + strides, + ptr_name, + func_name) {} + C10_HOST_DEVICE T& operator[](index_t i) { + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + return this->at(this->strides_[0] * i); + } + C10_HOST_DEVICE const T& operator[](index_t i) const { + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + return this->at(this->strides_[0] * i); + } +}; + +// GenericPackedTensorAccessorBase and GenericPackedTensorAccessor are used on +// for CUDA `Tensor`s on the host and as In contrast to `TensorAccessor`s, they +// copy the strides and sizes on instantiation (on the host) in order to +// transfer them on the device when calling kernels. On the device, indexing of +// multidimensional tensors gives to `TensorAccessor`s. Use RestrictPtrTraits as +// PtrTraits if you want the tensor's data pointer to be marked as __restrict__. +// Instantiation from data, sizes, strides is only needed on the host and +// std::copy isn't available on the device, so those functions are host only. 
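+// Host-side construction sketch (tensor and kernel names are illustrative):
+// when FBGEMM_GPU_MEMCHECK is enabled, kernels receive these accessors via
+// the helpers and macros defined at the end of this header, e.g.
+//
+//   auto weights = MAKE_PACKED_TENSOR_ACCESSOR_BASE(
+//       "my_kernel", weights_tensor, float, 2, at::RestrictPtrTraits, 32);
+//
+// Compared to at::GenericPackedTensorAccessor, the extra numel_, ptr_name_,
+// and func_name_ members let at() report which tensor overflowed and in which
+// kernel before CUDA_KERNEL_ASSERT aborts.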
+template < + typename T, + size_t N, + template class PtrTraits = DefaultPtrTraits, + typename index_t = int64_t> +class GenericPackedTensorAccessorBase { + public: + typedef typename PtrTraits::PtrType PtrType; + C10_HOST GenericPackedTensorAccessorBase( + PtrType data, + const index_t* const sizes, + const index_t* const strides, + const char* const ptr_name, + const char* const func_name) + : data_(data) { + std::copy(sizes, sizes + N, std::begin(sizes_)); + std::copy(strides, strides + N, std::begin(strides_)); + // Compute numel_ + numel_ = 0; + for (size_t d = 0; d < N; d++) { + numel_ += sizes[d]; + } + copy_str(ptr_name_, ptr_name, PTR_NAME_MAX_LEN); + copy_str(func_name_, func_name, FUNC_NAME_MAX_LEN); + } + + // if index_t is not int64_t, we want to have an int64_t constructor + template < + typename source_index_t, + class = typename std::enable_if< + std::is_same::value>::type> + C10_HOST GenericPackedTensorAccessorBase( + PtrType data, + const source_index_t* const sizes, + const source_index_t* const strides, + const char* const ptr_name, + const char* const func_name) + : data_(data) { + for (const auto i : c10::irange(N)) { + this->sizes_[i] = sizes[i]; + this->strides_[i] = strides[i]; + } + // Compute numel_ + numel_ = 0; + for (size_t d = 0; d < N; d++) { + numel_ += sizes[d]; + } + copy_str(ptr_name_, ptr_name, PTR_NAME_MAX_LEN); + copy_str(func_name_, func_name, FUNC_NAME_MAX_LEN); + } + + C10_HOST void copy_str(char* dst, const char* src, const size_t max_len) { + const auto len = std::min(strlen(src), max_len - 1); + std::memcpy(dst, src, sizeof(char) * len); + dst[len] = '\0'; + } + + C10_HOST_DEVICE T& at(index_t idx) const { + if (idx < 0) { + printf( + "ERROR: idx < 0, tensor %s in %s, idx %lld\n", + ptr_name_, + func_name_, + static_cast(idx)); + CUDA_KERNEL_ASSERT(idx >= 0) + } else if (idx >= numel_) { + printf( + "ERROR: idx >= numel, tensor %s in %s, idx %lld, numel %lld\n", + ptr_name_, + func_name_, + static_cast(idx), + static_cast(numel_)); + CUDA_KERNEL_ASSERT(idx < numel_) + } + return data_[idx]; + } + + C10_HOST_DEVICE index_t stride(index_t i) const { + return strides_[i]; + } + C10_HOST_DEVICE index_t size(index_t i) const { + return sizes_[i]; + } + C10_HOST_DEVICE PtrType data() { + return data_; + } + C10_HOST_DEVICE const PtrType data() const { + return data_; + } + + protected: + PtrType data_; + index_t sizes_[N]; + index_t strides_[N]; + index_t numel_; + char ptr_name_[PTR_NAME_MAX_LEN]; + char func_name_[FUNC_NAME_MAX_LEN]; + C10_HOST void bounds_check_(index_t i) const { + TORCH_CHECK_INDEX( + 0 <= i && i < index_t{N}, + "Index ", + i, + " is not within bounds of a tensor of dimension ", + N); + } +}; + +template < + typename T, + size_t N, + template class PtrTraits = DefaultPtrTraits, + typename index_t = int64_t> +class GenericPackedTensorAccessor + : public GenericPackedTensorAccessorBase { + public: + typedef typename PtrTraits::PtrType PtrType; + + C10_HOST GenericPackedTensorAccessor( + PtrType data, + const index_t* const sizes, + const index_t* const strides, + const char* const ptr_name, + const char* const func_name) + : GenericPackedTensorAccessorBase( + data, + sizes, + strides, + ptr_name, + func_name) {} + + // if index_t is not int64_t, we want to have an int64_t constructor + template < + typename source_index_t, + class = typename std::enable_if< + std::is_same::value>::type> + C10_HOST GenericPackedTensorAccessor( + PtrType data, + const source_index_t* const sizes, + const source_index_t* const strides, + const char* 
const ptr_name, + const char* const func_name) + : GenericPackedTensorAccessorBase( + data, + sizes, + strides, + ptr_name, + func_name) {} + + C10_DEVICE TensorAccessor operator[]( + index_t i) { + index_t* new_sizes = this->sizes_ + 1; + index_t* new_strides = this->strides_ + 1; + return TensorAccessor( + this->data_ + this->strides_[0] * i, + new_sizes, + new_strides, + this->ptr_name_, + this->func_name_); + } + + C10_DEVICE const TensorAccessor operator[]( + index_t i) const { + const index_t* const new_sizes = this->sizes_ + 1; + const index_t* const new_strides = this->strides_ + 1; + return TensorAccessor( + this->data_ + this->strides_[0] * i, + new_sizes, + new_strides, + this->ptr_name_, + this->func_name_); + } + + /// Returns a PackedTensorAccessor of the same dimension after transposing the + /// two dimensions given. Does not actually move elements; transposition is + /// made by permuting the size/stride arrays. If the dimensions are not valid, + /// asserts. + C10_HOST GenericPackedTensorAccessor transpose( + index_t dim1, + index_t dim2) const { + this->bounds_check_(dim1); + this->bounds_check_(dim2); + GenericPackedTensorAccessor result( + this->data_, this->sizes_, this->strides_); + std::swap(result.strides_[dim1], result.strides_[dim2]); + std::swap(result.sizes_[dim1], result.sizes_[dim2]); + return result; + } +}; + +template class PtrTraits, typename index_t> +class GenericPackedTensorAccessor + : public GenericPackedTensorAccessorBase { + public: + typedef typename PtrTraits::PtrType PtrType; + C10_HOST GenericPackedTensorAccessor( + PtrType data, + const index_t* const sizes, + const index_t* const strides, + const char* const ptr_name, + const char* const func_name) + : GenericPackedTensorAccessorBase( + data, + sizes, + strides, + ptr_name, + func_name) {} + + // if index_t is not int64_t, we want to have an int64_t constructor + template < + typename source_index_t, + class = typename std::enable_if< + std::is_same::value>::type> + C10_HOST GenericPackedTensorAccessor( + PtrType data, + const source_index_t* const sizes, + const source_index_t* const strides, + const char* const ptr_name, + const char* const func_name) + : GenericPackedTensorAccessorBase( + data, + sizes, + strides, + ptr_name, + func_name) {} + + C10_DEVICE T& operator[](index_t i) { + return this->at(this->strides_[0] * i); + } + C10_DEVICE const T& operator[](index_t i) const { + return this->at(this->strides_[0] * i); + } + + // Same as in the general N-dimensional case, but note that in the + // 1-dimensional case the returned PackedTensorAccessor will always be an + // identical copy of the original + C10_HOST GenericPackedTensorAccessor transpose( + index_t dim1, + index_t dim2) const { + this->bounds_check_(dim1); + this->bounds_check_(dim2); + return GenericPackedTensorAccessor( + this->data_, this->sizes_, this->strides_); + } +}; + +// Can't put this directly into the macro function args because of commas +#define AT_X GenericPackedTensorAccessor + +// Old name for `GenericPackedTensorAccessor` +template < + typename T, + size_t N, + template class PtrTraits = DefaultPtrTraits, + typename index_t = int64_t> +C10_DEFINE_DEPRECATED_USING(PackedTensorAccessor, AT_X) + +#undef AT_X + +template < + typename T, + size_t N, + template class PtrTraits = DefaultPtrTraits> +using PackedTensorAccessor32 = + GenericPackedTensorAccessor; + +template < + typename T, + size_t N, + template class PtrTraits = DefaultPtrTraits> +using PackedTensorAccessor64 = + GenericPackedTensorAccessor; + +} // 
namespace fbgemm_gpu + +#ifdef FBGEMM_GPU_MEMCHECK +namespace pta = fbgemm_gpu; +#else +namespace pta = at; +#endif + +#ifdef FBGEMM_GPU_MEMCHECK +template < + typename T, + size_t N, + template class PtrTraits = at::DefaultPtrTraits, + typename index_t = int64_t> +const fbgemm_gpu::GenericPackedTensorAccessor +make_generic_packed_tensor_accessor( + at::Tensor& tensor, + const char* const ptr_name, + const char* const func_name) { + static_assert( + N > 0, + "accessor is used for indexing tensor, for scalars use *data_ptr()"); + TORCH_CHECK( + tensor.dim() == N, + "TensorAccessor expected ", + N, + " dims but tensor has ", + tensor.dim()); + return fbgemm_gpu::GenericPackedTensorAccessor( + static_cast::PtrType>(tensor.data_ptr()), + tensor.sizes().data(), + tensor.strides().data(), + ptr_name, + func_name); +} +#endif + +template < + typename T, + size_t N, + template class PtrTraits = at::DefaultPtrTraits> +const pta::PackedTensorAccessor32 +make_packed_tensor_accessor32( +#ifdef FBGEMM_GPU_MEMCHECK + at::Tensor& tensor, + const char* const ptr_name, + const char* const func_name) { +#else + at::Tensor& tensor) { +#endif + TORCH_CHECK( + tensor.numel() <= + static_cast(std::numeric_limits::max()), + "numel needs to be smaller than int32_t max; otherwise, please use packed_accessor64"); +#ifdef FBGEMM_GPU_MEMCHECK + return make_generic_packed_tensor_accessor( + tensor, ptr_name, func_name); +#else + return tensor.packed_accessor32(); +#endif +} + +template < + typename T, + size_t N, + template class PtrTraits = at::DefaultPtrTraits> +const pta::PackedTensorAccessor64 +make_packed_tensor_accessor64( +#ifdef FBGEMM_GPU_MEMCHECK + at::Tensor& tensor, + const char* const ptr_name, + const char* const func_name) { + return make_generic_packed_tensor_accessor( + tensor, ptr_name, func_name); +#else + at::Tensor& tensor) { + return tensor.packed_accessor64(); +#endif +} + +#ifdef FBGEMM_GPU_MEMCHECK +#define MAKE_PACKED_TENSOR_ACCESSOR_BASE( \ + FUNC_NAME, TENSOR, T, N, PTR_TRAITS, INDEX_NBITS) \ + make_packed_tensor_accessor##INDEX_NBITS( \ + TENSOR, #TENSOR, FUNC_NAME) + +#define MAKE_PACKED_TENSOR_ACCESSOR_ACC_TYPE_BASE( \ + FUNC_NAME, TENSOR, T, N, PTR_TRAITS, INDEX_NBITS) \ + make_packed_tensor_accessor##INDEX_NBITS< \ + at::acc_type, \ + N, \ + PTR_TRAITS>(TENSOR, #TENSOR, FUNC_NAME) +#else +#define MAKE_PACKED_TENSOR_ACCESSOR_BASE( \ + FUNC_NAME, TENSOR, T, N, PTR_TRAITS, INDEX_NBITS) \ + make_packed_tensor_accessor##INDEX_NBITS(TENSOR) + +#define MAKE_PACKED_TENSOR_ACCESSOR_ACC_TYPE_BASE( \ + FUNC_NAME, TENSOR, T, N, PTR_TRAITS, INDEX_NBITS) \ + make_packed_tensor_accessor##INDEX_NBITS< \ + at::acc_type, \ + N, \ + PTR_TRAITS>(TENSOR) +#endif diff --git a/fbgemm_gpu/include/fbgemm_gpu/input_combine.h b/fbgemm_gpu/include/fbgemm_gpu/input_combine.h index 348e0bebfc..c329d6c9d9 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/input_combine.h +++ b/fbgemm_gpu/include/fbgemm_gpu/input_combine.h @@ -30,4 +30,19 @@ padding_fused_tbe_input_combine_cpu( const at::Tensor& include_last_offsets, int64_t batch_size); +std::tuple +tbe_input_combine_with_length_cuda( + const uint64_t* const indices_addrs, + const uint64_t* const lengths_addrs, + const uint64_t* const per_sample_weights_addrs, + const uint32_t* const indices_is_long, + const uint32_t* const lengths_is_long, + const uint64_t* const indices_offsets, + const uint64_t* const lengths_offsets, + const uint64_t num_lists, + const uint64_t total_indices, + const uint64_t total_lengths, + const uint64_t max_list_size, + const c10::DeviceIndex& 
device); + } // namespace fbgemm_gpu diff --git a/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh b/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh index 52854a4f2e..3532928963 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh +++ b/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh @@ -155,6 +155,12 @@ at::Tensor lxu_cache_lookup_cuda( bool gather_cache_stats, c10::optional uvm_cache_stats); +at::Tensor emulate_cache_miss( + at::Tensor lxu_cache_locations, + const int64_t enforced_misses_per_256, + const bool gather_cache_stats, + at::Tensor uvm_cache_stats); + ///@ingroup table-batched-embed-cuda /// Lookup the LRU/LFU cache: find the cache weights location for all indices. /// Look up the slots in the cache corresponding to `linear_cache_indices`, with diff --git a/fbgemm_gpu/setup.py b/fbgemm_gpu/setup.py index 6b8ebbb570..2b34cb240a 100644 --- a/fbgemm_gpu/setup.py +++ b/fbgemm_gpu/setup.py @@ -7,6 +7,7 @@ import argparse import os import random +import re import subprocess import sys @@ -38,8 +39,9 @@ def generate_package_version(package_name: str): print( f"[SETUP.PY] TAG: {gitversion.get_tag()}, BRANCH: {gitversion.get_branch()}, SHA: {gitversion.get_sha()}" ) - # Remove the local version identifier, if any (0.4.0rc0.post0+git.6a63116c.dirty => 0.4.0rc0.post0) - version = gitversion.version_from_git().split("+")[0] + # Remove the local version identifier, if any (e.g. 0.4.0rc0.post0+git.6a63116c.dirty => 0.4.0rc0.post0) + # Then remove post0 (keep postN for N > 0) (e.g. 0.4.0rc0.post0 => 0.4.0rc0) + version = re.sub(".post0$", "", gitversion.version_from_git().split("+")[0]) print(f"[SETUP.PY] Setting the package version: {version}") return version diff --git a/fbgemm_gpu/src/cumem_utils.cu b/fbgemm_gpu/src/cumem_utils.cu index 7a060681f0..7b49040a83 100644 --- a/fbgemm_gpu/src/cumem_utils.cu +++ b/fbgemm_gpu/src/cumem_utils.cu @@ -41,7 +41,8 @@ struct CUDAHostMappedContext { ~CUDAHostMappedContext() { at::cuda::OptionalCUDAGuard device_guard; device_guard.set_index(cuda_device_); - AT_CUDA_CHECK(cudaFreeHost(ptr_)); + AT_CUDA_CHECK(cudaHostUnregister(ptr_)); + free(ptr_); } static void release(void* ptr) { @@ -206,9 +207,28 @@ Tensor new_host_mapped_tensor( auto strides = defaultStrides(sizes); size_t size_bytes = at::detail::computeStorageNbytes(sizes, strides, self.dtype().itemsize()); - void* ptr; - AT_CUDA_CHECK(cudaHostAlloc( - &ptr, size_bytes, cudaHostAllocWriteCombined | cudaHostAllocMapped)); + + // When using cudaHostAlloc for large allocations, we found that it can + // potentially take a global lock and lock out CUDA APIs from other processes. + // The main cost in cudaHostAlloc is faulting/mapping the pages. So, instead + // of using this cuda API, we can do regular malloc, pre-fault the pages, and + // then do cudaHostRegister with GPU mapping flags to lock the pages, so we + // can minimize the cost while holding this global lock. 
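+ // Note on the alignment arithmetic below: pageSize is 1 << 21 (2 MiB), and
+ // (ptr + pageSize - 1) & ~(pageSize - 1) rounds the start address up to the
+ // next 2 MiB boundary. For example (address assumed for illustration),
+ // ptr = 0x7f0000000100 aligns up to 0x7f0000200000; from there, one byte
+ // per page is written up to ptr + size_bytes so the kernel maps the pages
+ // before cudaHostRegister pins them.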
+ void* const ptr = malloc(size_bytes); + + // advise the kernel to allocate large 2M pages + madvise(ptr, size_bytes, MADV_HUGEPAGE); + + // pre-fault/map the pages by setting the first byte of the page + size_t pageSize = (1 << 21); + uintptr_t alignedPtr = (((uintptr_t)ptr + pageSize - 1) & ~(pageSize - 1)); + for (uintptr_t p = alignedPtr; p < ((uintptr_t)ptr + size_bytes); + p += pageSize) { + memset((void*)p, 0, 1); + } + + AT_CUDA_CHECK(cudaHostRegister( + ptr, size_bytes, cudaHostRegisterMapped | cudaHostRegisterPortable)); void* dev_ptr; AT_CUDA_CHECK(cudaHostGetDevicePointer(&dev_ptr, ptr, 0)); diff --git a/fbgemm_gpu/src/embedding_inplace_update.cu b/fbgemm_gpu/src/embedding_inplace_update.cu index 1d0e394919..f301576a49 100644 --- a/fbgemm_gpu/src/embedding_inplace_update.cu +++ b/fbgemm_gpu/src/embedding_inplace_update.cu @@ -186,4 +186,98 @@ void embedding_inplace_update_cuda( }); } +template +__global__ +__launch_bounds__(kMaxThreads) void pruned_array_lookup_from_row_idx_kernel( + const at::PackedTensorAccessor32 + update_row_indices, + const at::PackedTensorAccessor32 + update_table_indices, + const at::PackedTensorAccessor32 + index_remappings, + const at::PackedTensorAccessor32 + index_remappings_offsets, + at::PackedTensorAccessor32 + dense_indices) { + const int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= update_row_indices.size(0)) { + return; + } + const auto row_idx = update_row_indices[idx]; + if (idx >= update_table_indices.size(0)) { + return; + } + const int table_idx = update_table_indices[idx]; + + const int64_t index_remappings_start = index_remappings_offsets[table_idx]; + const int64_t index_remappings_end = index_remappings_offsets[table_idx + 1]; + const int64_t capacity = index_remappings_end - index_remappings_start; + + if (capacity > 0) { + dense_indices[idx] = index_remappings[index_remappings_start + row_idx]; + } else { + dense_indices[idx] = row_idx; + } +} + +Tensor pruned_array_lookup_from_row_idx_cuda( + const Tensor& update_row_indices, + const Tensor& update_table_indices, + const Tensor& index_remappings, + const Tensor& index_remappings_offsets) { + TENSOR_ON_CUDA_GPU(update_row_indices); + TENSOR_ON_CUDA_GPU(update_table_indices); + TENSOR_ON_CUDA_GPU(index_remappings); + TENSOR_ON_CUDA_GPU(index_remappings_offsets); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(update_table_indices.get_device()); + auto dense_indices = at::empty_like(update_row_indices); + const int32_t T = index_remappings_offsets.size(0) - 1; + + const auto num_indices = update_row_indices.numel(); + if (num_indices == 0) { + return dense_indices; + } + + TORCH_CHECK(index_remappings.size(0) < std::numeric_limits::max()); + TORCH_CHECK( + update_row_indices.dim() == 1, "Tensor dim: ", update_row_indices.dim()); + TORCH_CHECK( + update_table_indices.dim() == 1, + "Tensor dim: ", + update_table_indices.dim()); + TORCH_CHECK( + index_remappings.dim() == 1, "Tensor dim: ", index_remappings.dim()); + TORCH_CHECK( + index_remappings_offsets.dim() == 1, + "Tensor dim: ", + index_remappings_offsets.dim()); + TORCH_CHECK(dense_indices.dim() == 1, "Tensor dim: ", dense_indices.dim()); + constexpr size_t kForwardMaxThreads = 256; + + AT_DISPATCH_INDEX_TYPES( + update_row_indices.scalar_type(), + "pruned_array_lookup_from_row_idx_kernel", + [&] { + pruned_array_lookup_from_row_idx_kernel<<< + nbit::div_round_up(num_indices, kForwardMaxThreads), + kForwardMaxThreads, + 0, + at::cuda::getCurrentCUDAStream()>>>( + update_row_indices + 
.packed_accessor32(), + update_table_indices + .packed_accessor32(), + index_remappings + .packed_accessor32(), + index_remappings_offsets + .packed_accessor32(), + dense_indices + .packed_accessor32()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + return dense_indices; +} + } // namespace fbgemm_gpu diff --git a/fbgemm_gpu/src/embedding_inplace_update_cpu.cpp b/fbgemm_gpu/src/embedding_inplace_update_cpu.cpp index bd1315e023..5f3a648872 100644 --- a/fbgemm_gpu/src/embedding_inplace_update_cpu.cpp +++ b/fbgemm_gpu/src/embedding_inplace_update_cpu.cpp @@ -116,6 +116,53 @@ void embedding_inplace_update_cpu( }); } +Tensor pruned_array_lookup_from_row_idx_cpu( + const Tensor& update_row_indices, + const Tensor& update_table_indices, + const Tensor& index_remappings, + const Tensor& index_remappings_offsets) { + TENSOR_ON_CPU(update_row_indices); + TENSOR_ON_CPU(update_table_indices); + TENSOR_ON_CPU(index_remappings); + TENSOR_ON_CPU(index_remappings_offsets); + + auto dense_indices = empty_like(update_row_indices); + const auto num_indices = update_row_indices.numel(); + + AT_DISPATCH_INDEX_TYPES( + update_row_indices.scalar_type(), + "pruned_array_lookup_from_row_idx_cpu_kernel", + [&] { + const auto update_row_indices_acc = + update_row_indices.accessor(); + auto dense_indices_acc = dense_indices.accessor(); + const auto update_table_indices_acc = + update_table_indices.accessor(); + + const auto index_remappings_acc = + index_remappings.accessor(); + const auto index_remappings_offsets_acc = + index_remappings_offsets.accessor(); + + for (int64_t idx = 0; idx < num_indices; idx++) { + const int table_idx = update_table_indices_acc[idx]; + const auto row_idx = update_row_indices_acc[idx]; + int64_t index_remappings_start = + index_remappings_offsets_acc[table_idx]; + int64_t index_remappings_end = + index_remappings_offsets_acc[table_idx + 1]; + int64_t capacity = index_remappings_end - index_remappings_start; + if (capacity > 0) { + dense_indices_acc[idx] = + index_remappings_acc[index_remappings_start + row_idx]; + } else { + dense_indices_acc[idx] = row_idx; + } + } + }); + return dense_indices; +} + } // namespace fbgemm_gpu TORCH_LIBRARY_FRAGMENT(fbgemm, m) { @@ -127,3 +174,14 @@ TORCH_LIBRARY_IMPL(fbgemm, CPU, m) { DISPATCH_TO_CPU( "emb_inplace_update", fbgemm_gpu::embedding_inplace_update_cpu); } + +TORCH_LIBRARY_FRAGMENT(fbgemm, m) { + m.def( + "pruned_array_lookup_from_row_idx(Tensor update_row_indices, Tensor update_table_indices, Tensor index_remappings, Tensor index_remappings_offsets) -> Tensor"); +} + +TORCH_LIBRARY_IMPL(fbgemm, CPU, m) { + DISPATCH_TO_CPU( + "pruned_array_lookup_from_row_idx", + fbgemm_gpu::pruned_array_lookup_from_row_idx_cpu); +} diff --git a/fbgemm_gpu/src/embedding_inplace_update_gpu.cpp b/fbgemm_gpu/src/embedding_inplace_update_gpu.cpp index 743a902b68..cfb48c2427 100644 --- a/fbgemm_gpu/src/embedding_inplace_update_gpu.cpp +++ b/fbgemm_gpu/src/embedding_inplace_update_gpu.cpp @@ -14,3 +14,9 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { DISPATCH_TO_CUDA( "emb_inplace_update", fbgemm_gpu::embedding_inplace_update_cuda); } + +TORCH_LIBRARY_FRAGMENT(fbgemm, m) { + DISPATCH_TO_CUDA( + "pruned_array_lookup_from_row_idx", + fbgemm_gpu::pruned_array_lookup_from_row_idx_cuda); +} diff --git a/fbgemm_gpu/src/input_combine.cu b/fbgemm_gpu/src/input_combine.cu new file mode 100644 index 0000000000..040ca14bbf --- /dev/null +++ b/fbgemm_gpu/src/input_combine.cu @@ -0,0 +1,160 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include "fbgemm_gpu/fbgemm_cuda_utils.cuh" +#include "fbgemm_gpu/input_combine.h" + +using Tensor = at::Tensor; + +namespace fbgemm_gpu { + +template +DEVICE_INLINE void vec_copy_with_implicit_type_cast( + dst_t* const __restrict__ dst, + const uint64_t src_addr, + const uint64_t src_offset, + const uint64_t dst_offset, + const uint64_t src_bound) { + // TODO: Use vector load/store if address aligns with the vector type + const src_t* const src = reinterpret_cast(src_addr); +#pragma unroll + for (uint64_t i = 0; i < VEC_WIDTH && src_offset + i < src_bound; i++) { + dst[dst_offset + i] = src[src_offset + i]; + } +} + +template +__global__ +__launch_bounds__(kMaxThreads) void tbe_input_combine_with_length_kernel( + int32_t* const __restrict__ combined_indices, + int32_t* const __restrict__ combined_lengths, + float* const __restrict__ combined_weights, + const uint64_t* const __restrict__ indices_addrs, + const uint64_t* const __restrict__ lengths_addrs, + const uint64_t* const __restrict__ per_sample_weights_addrs, + const uint32_t* const __restrict__ indices_is_long, + const uint32_t* const __restrict__ lengths_is_long, + const uint64_t* const __restrict__ indices_offsets, + const uint64_t* const __restrict__ lengths_offsets, + const uint64_t num_lists, + const FixedDivisor fd_num_warps_per_list) { + const auto global_warp_id = blockIdx.x * blockDim.y + threadIdx.y; + uint32_t list_id; + uint32_t warp_id; + fd_num_warps_per_list.DivMod( + global_warp_id, + reinterpret_cast(&list_id), + reinterpret_cast(&warp_id)); + + if (list_id >= num_lists) { + return; + } + + // IS_LONG_NUM_BITS is power of 2 (default = 32); div and mod should be cheap + const uint32_t is_long_idx = list_id / IS_LONG_NUM_BITS; + const uint32_t is_long_mask = 1u << (list_id % IS_LONG_NUM_BITS); + const uint64_t src_idx = (warp_id * kWarpSize + threadIdx.x) * VEC_WIDTH; + const auto indices_start = indices_offsets[list_id]; + const auto indices_end = indices_offsets[list_id + 1]; + const auto lengths_start = lengths_offsets[list_id]; + const auto lengths_end = lengths_offsets[list_id + 1]; + + // Invoke a function based on the indices type + ((indices_is_long[is_long_idx] & is_long_mask) + ? vec_copy_with_implicit_type_cast + : vec_copy_with_implicit_type_cast< + int32_t, + int32_t, + VEC_WIDTH>)(combined_indices, indices_addrs[list_id], src_idx, indices_start + src_idx, indices_end - indices_start); + + // Invoke a function based on the lengths type + ((lengths_is_long[is_long_idx] & is_long_mask) + ? 
vec_copy_with_implicit_type_cast + : vec_copy_with_implicit_type_cast< + int32_t, + int32_t, + VEC_WIDTH>)(combined_lengths, lengths_addrs[list_id], src_idx, lengths_start + src_idx, lengths_end - lengths_start); + + if (per_sample_weights_addrs) { + vec_copy_with_implicit_type_cast( + combined_weights, + per_sample_weights_addrs[list_id], + src_idx, + indices_start + src_idx, + indices_end - indices_start); + } +} + +std::tuple tbe_input_combine_with_length_cuda( + const uint64_t* const indices_addrs, + const uint64_t* const lengths_addrs, + const uint64_t* const per_sample_weights_addrs, + const uint32_t* const indices_is_long, + const uint32_t* const lengths_is_long, + const uint64_t* const indices_offsets, + const uint64_t* const lengths_offsets, + const uint64_t num_lists, + const uint64_t total_indices, + const uint64_t total_lengths, + const uint64_t max_list_size, + const c10::DeviceIndex& device) { + constexpr uint32_t IS_LONG_NUM_BITS = 32; + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(device); + + // combined_indices and combined_legnths are int tensors + const auto int_options = at::TensorOptions().dtype(at::kInt).device( + at::kCUDA, at::cuda::current_device()); + Tensor combined_indices = + at::empty({static_cast(total_indices)}, int_options); + Tensor combined_lengths = + at::empty({static_cast(total_lengths)}, int_options); + // combined_weights is a float tensor + Tensor combined_weights = at::empty( + {per_sample_weights_addrs ? static_cast(total_indices) + : static_cast(0)}, + at::TensorOptions() + .dtype(at::kFloat) + .device(at::kCUDA, at::cuda::current_device())); + + // Each thread loads 4 elements (rule of thumb; should work well with 32-bit + // inputs) + constexpr uint32_t VEC_WIDTH = 4; + constexpr uint32_t NUM_WARPS_PER_BLOCK = kMaxThreads / kWarpSize; + const auto num_warps_per_list = + div_round_up(max_list_size, kWarpSize * VEC_WIDTH); + const auto num_blocks = + div_round_up(num_warps_per_list * num_lists, NUM_WARPS_PER_BLOCK); + + tbe_input_combine_with_length_kernel + <<>>( + combined_indices.data_ptr(), + combined_lengths.data_ptr(), + per_sample_weights_addrs ? combined_weights.data_ptr() + : nullptr, + indices_addrs, + lengths_addrs, + per_sample_weights_addrs, + indices_is_long, + lengths_is_long, + indices_offsets, + lengths_offsets, + num_lists, + FixedDivisor(num_warps_per_list)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + return { + std::move(combined_indices), + std::move(combined_lengths), + std::move(combined_weights)}; +} + +} // namespace fbgemm_gpu diff --git a/fbgemm_gpu/src/input_combine_gpu.cpp b/fbgemm_gpu/src/input_combine_gpu.cpp new file mode 100644 index 0000000000..482cabd963 --- /dev/null +++ b/fbgemm_gpu/src/input_combine_gpu.cpp @@ -0,0 +1,226 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include "fbgemm_gpu/input_combine.h" +#include "fbgemm_gpu/sparse_ops_utils.h" + +#include +#include +#include + +using Tensor = at::Tensor; + +namespace fbgemm_gpu { + +constexpr uint32_t IS_LONG_NUM_BITS = 32; +constexpr uint32_t NUM_ARGS = 7; +enum args_pos { + P_indices_prts = 0, + P_lengths_addrs = 1, + P_indices_offsets = 2, + P_lengths_offsets = 3, + P_per_sample_weight = 4, + P_indices_is_long = 5, + P_lengths_is_long = 6 +}; + +template +uint64_t compute_num_uint64s(const uint64_t num_elements) { + const uint64_t ratio = sizeof(uint64_t) / sizeof(T); + return (num_elements + ratio - 1) / ratio; +} + +void offset_tbe_input_combine_with_length_args( + uint64_t** indices_addrs, + uint64_t** lengths_addrs, + uint64_t** indices_offsets, + uint64_t** lengths_offsets, + uint64_t** per_sample_weights_addrs, + uint32_t** indices_is_long, + uint32_t** lengths_is_long, + uint64_t* base_addr, + const uint64_t* const ptr_offsets, + const bool need_weights) { + *indices_addrs = base_addr + ptr_offsets[P_indices_prts]; + *lengths_addrs = base_addr + ptr_offsets[P_lengths_addrs]; + *indices_offsets = base_addr + ptr_offsets[P_indices_offsets]; + *lengths_offsets = base_addr + ptr_offsets[P_lengths_offsets]; + *per_sample_weights_addrs = + need_weights ? (base_addr + ptr_offsets[P_per_sample_weight]) : nullptr; + *indices_is_long = + reinterpret_cast(base_addr + ptr_offsets[P_indices_is_long]); + *lengths_is_long = + reinterpret_cast(base_addr + ptr_offsets[P_lengths_is_long]); +} + +std::tuple tbe_input_combine_with_length_gpu( + const std::vector& indices_list, + const std::vector& lengths_list, + const std::vector& per_sample_weights) { + const auto num_lists = indices_list.size(); + TORCH_CHECK(num_lists > 0); + TORCH_CHECK(lengths_list.size() == num_lists); + TORCH_CHECK(per_sample_weights.size() == num_lists); + const bool need_weights = std::any_of( + per_sample_weights.begin(), per_sample_weights.end(), [](const auto& x) { + return x.numel() > 0; + }); + + // Store is_longs in 32-bit variables. i-th bit (LSB) indicates if + // list i-th is long. + const uint64_t num_is_longs = + (num_lists + IS_LONG_NUM_BITS - 1) / IS_LONG_NUM_BITS; + const uint64_t num_is_longs_64 = compute_num_uint64s(num_is_longs); + // args_tensor stores kernel arguments: + // - indices_prts (num_lists uint64_t elements) + // - lengths_addrs (num_lists uint64_t elements) + // - indices_offsets (num_lists + 1 uint64_t elements) + // - lengths_offsets (num_lists + 1 uint64_t elements) + // - per_sample_weight (num_lists uint64_t elements; optional) + // - indices_is_long (num_is_longs uint32_t elements) + // - lengths_is_long (num_is_longs uint32_t elements) + uint64_t args_offsets[NUM_ARGS + 1]; + // Initialize offsets with lengths first + args_offsets[P_indices_prts] = num_lists; + args_offsets[P_lengths_addrs] = num_lists; + args_offsets[P_indices_offsets] = num_lists + 1; + args_offsets[P_lengths_offsets] = num_lists + 1; + args_offsets[P_per_sample_weight] = need_weights ? 
num_lists : 0; + args_offsets[P_indices_is_long] = num_is_longs_64; + args_offsets[P_lengths_is_long] = num_is_longs_64; + + // Compute offsets + uint64_t offset = 0; + auto next = args_offsets[0]; + for (uint32_t i = 0; i < NUM_ARGS; i++) { + args_offsets[i] = offset; + offset += next; + next = args_offsets[i + 1]; + } + args_offsets[NUM_ARGS] = offset; // total number of uint64_t elements required + + Tensor args_tensor = at::empty( + {static_cast(args_offsets[NUM_ARGS] * sizeof(uint64_t))}, + at::TensorOptions().dtype(at::kByte).pinned_memory(true)); + + uint64_t* indices_addrs = nullptr; + uint64_t* lengths_addrs = nullptr; + uint64_t* indices_offsets = nullptr; + uint64_t* lengths_offsets = nullptr; + uint64_t* per_sample_weights_addrs = nullptr; + uint32_t* indices_is_long = nullptr; + uint32_t* lengths_is_long = nullptr; + + // Offset host pointers + offset_tbe_input_combine_with_length_args( + &indices_addrs, + &lengths_addrs, + &indices_offsets, + &lengths_offsets, + &per_sample_weights_addrs, + &indices_is_long, + &lengths_is_long, + reinterpret_cast(args_tensor.data_ptr()), + args_offsets, + need_weights); + + const auto& indices_0 = indices_list[0]; + uint64_t total_indices = 0; + uint64_t total_lengths = 0; + uint64_t max_list_size = 0; + for (uint64_t i = 0; i < num_lists; i++) { + const uint64_t is_long_idx = i / IS_LONG_NUM_BITS; + auto& indices_is_long_ = indices_is_long[is_long_idx]; + auto& lengths_is_long_ = lengths_is_long[is_long_idx]; + if (i % IS_LONG_NUM_BITS == 0) { + indices_is_long_ = 0; + lengths_is_long_ = 0; + } + const auto& indices = indices_list[i]; + const auto& lengths = lengths_list[i]; + TENSOR_CONTIGUOUS_AND_ON_CUDA_GPU(indices); + TENSOR_CONTIGUOUS_AND_ON_CUDA_GPU(lengths); + TENSORS_ON_SAME_DEVICE(indices, indices_0); + TENSORS_ON_SAME_DEVICE(lengths, indices_0); + TORCH_CHECK(indices.dtype() == c10::kInt || indices.dtype() == c10::kLong); + TORCH_CHECK(lengths.dtype() == c10::kInt || lengths.dtype() == c10::kLong); + TENSOR_NDIM_EQUALS(indices, 1); + TENSOR_NDIM_EQUALS(lengths, 1); + + const auto indices_numel = indices.numel(); + const auto lengths_numel = lengths.numel(); + indices_offsets[i] = total_indices; + lengths_offsets[i] = total_lengths; + total_indices += indices_numel; + total_lengths += lengths_numel; + max_list_size = + std::max(max_list_size, static_cast(indices_numel)); + max_list_size = + std::max(max_list_size, static_cast(lengths_numel)); + + // Store pointers in args_tensor + indices_addrs[i] = reinterpret_cast(indices.data_ptr()); + lengths_addrs[i] = reinterpret_cast(lengths.data_ptr()); + indices_is_long_ |= static_cast(indices.dtype() == c10::kLong) + << (i % IS_LONG_NUM_BITS); + lengths_is_long_ |= static_cast(lengths.dtype() == c10::kLong) + << (i % IS_LONG_NUM_BITS); + + const auto& weights = per_sample_weights[i]; + if (weights.numel() > 0) { + TENSOR_CONTIGUOUS_AND_ON_CUDA_GPU(weights); + TENSORS_ON_SAME_DEVICE(weights, indices_0); + TENSOR_TYPE_MUST_BE(weights, c10::kFloat); + TENSOR_NDIM_EQUALS(weights, 1); + TENSORS_HAVE_SAME_NUMEL(weights, indices); + + per_sample_weights_addrs[i] = + reinterpret_cast(weights.data_ptr()); + } + } + indices_offsets[num_lists] = total_indices; + lengths_offsets[num_lists] = total_lengths; + + const auto& device = indices_0.device(); + // Transfer args_tensor from host to device + args_tensor = args_tensor.to(device, /*non_blocking=*/true); + + // Offset device pointers + offset_tbe_input_combine_with_length_args( + &indices_addrs, + &lengths_addrs, + &indices_offsets, + 
&lengths_offsets, + &per_sample_weights_addrs, + &indices_is_long, + &lengths_is_long, + reinterpret_cast(args_tensor.data_ptr()), + args_offsets, + need_weights); + + return tbe_input_combine_with_length_cuda( + indices_addrs, + lengths_addrs, + per_sample_weights_addrs, + indices_is_long, + lengths_is_long, + indices_offsets, + lengths_offsets, + num_lists, + total_indices, + total_lengths, + max_list_size, + device.index()); +} + +TORCH_LIBRARY_IMPL(fbgemm, CUDA, m) { + DISPATCH_TO_CUDA( + "tbe_input_combine_with_length", + fbgemm_gpu::tbe_input_combine_with_length_gpu); +}; + +} // namespace fbgemm_gpu diff --git a/fbgemm_gpu/src/jagged_tensor_ops.cu b/fbgemm_gpu/src/jagged_tensor_ops.cu index 4e93d08a65..62cef01113 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops.cu +++ b/fbgemm_gpu/src/jagged_tensor_ops.cu @@ -12,6 +12,7 @@ #include #include #include +#include // clang-format off #include "fbgemm_gpu/cub_namespace_prefix.cuh" @@ -1824,39 +1825,101 @@ std::tuple batched_dense_vec_jagged_2d_mul_backward( return {v_grad, a_values_grad}; } -template +template __global__ __launch_bounds__(kMaxThreads) void jagged_softmax_kernel( const at::PackedTensorAccessor32 values, const at::PackedTensorAccessor32 offsets, at::PackedTensorAccessor32 output, const int max_L) { - const int B = offsets.size(0) - 1; - const int D = output.size(1); + const auto B = offsets.size(0) - 1; + const auto D = output.size(1); - const int b_begin = blockIdx.x * blockDim.y + threadIdx.y; - const int b_step = gridDim.x * blockDim.y; - for (int b = b_begin; b < B; b += b_step) { - const int row_start = offsets[b]; - const int row_end = offsets[b + 1]; - const int length = min(row_end - row_start, max_L); - if (length != 0) { - // TODO: use shared memory and better reduction - for (int d = threadIdx.x; d < D; d += blockDim.x) { - scalar_t max_value = values[row_start][d]; - for (int l = 1; l < length; ++l) { - max_value = max(max_value, values[row_start + l][d]); + // Specialize BlockReduce type for our thread block + typedef cub::BlockReduce BlockReduceT; + + // Allocate shared memory for BlockReduce + __shared__ typename BlockReduceT::TempStorage temp_storage; + + __shared__ scalar_t max_value; + __shared__ scalar_t exp_sum; + + const auto tid = threadIdx.x; + for (uint32_t b = blockIdx.y; b < B; b += gridDim.y) { + const index_t row_start = offsets[b]; + const index_t row_end = offsets[b + 1]; + const auto length = min(row_end - row_start, (index_t)max_L); + + if (length > 0) { + const auto num_l_blocks = + (length + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + + for (uint32_t d = blockIdx.x; d < D; d += gridDim.x) { + if (tid == 0) { + max_value = values[row_start][d]; + exp_sum = 0; } - at::acc_type acc = - exp(values[row_start][d] - max_value); - for (int l = 1; l < length; ++l) { - acc += exp(values[row_start + l][d] - max_value); + // Loop through all blocks to calculate the max value + // Each block has its own max value block_max_value, and + // max_value is the max value across all blocks + for (auto bk_l = 0; bk_l < num_l_blocks; bk_l++) { + const auto l = bk_l * blockDim.x + tid; + scalar_t thread_val = values[row_start][d]; + if (l < length) { + thread_val = values[row_start + l][d]; + } + + // Collectively compute the block-wide max reduction + scalar_t block_max_value = + BlockReduceT(temp_storage).Reduce(thread_val, cub::Max()); + __syncthreads(); + + if (tid == 0) { + max_value = max(max_value, block_max_value); + } + } + + // The max_value was updated by thread 0 in the last loop, sync here to + // 
make sure the next loop uses the updated max_value + __syncthreads(); + + // Loop through all blocks to calculate the sum of exp + // Each block has its own sum block_exp_acc, and + // exp_sum is the sum across all blocks + for (auto bk_l = 0; bk_l < num_l_blocks; bk_l++) { + auto l = bk_l * blockDim.x + tid; + + scalar_t thread_exp = 0; + if (l < length) { + thread_exp = std::exp(values[row_start + l][d] - max_value); + } + + // Collectively compute the block-wide sum reduction + scalar_t block_exp_sum = BlockReduceT(temp_storage).Sum(thread_exp); + __syncthreads(); + + if (tid == 0) { + exp_sum += block_exp_sum; + } } - for (int l = 0; l < length; ++l) { - output[row_start + l][d] = - exp(values[row_start + l][d] - max_value) / acc; + // The exp_sum was updated by thread 0 in the last loop, sync here to + // make sure the next loop uses the updated exp_sum + __syncthreads(); + + for (auto bk_l = 0; bk_l < num_l_blocks; bk_l++) { + auto l = bk_l * blockDim.x + tid; + scalar_t thread_exp = 0; + if (l < length) { + thread_exp = std::exp(values[row_start + l][d] - max_value); + output[row_start + l][d] = thread_exp / exp_sum; + } } + + // The max_value and exp_sum will be reinitialized by thread 0 in the + // next d iteration, sync here to make sure the last loop still uses the + // reduced values before reinitialization + __syncthreads(); } } } @@ -1872,14 +1935,13 @@ Tensor jagged_softmax_forward( at::cuda::OptionalCUDAGuard device_guard; device_guard.set_index(values.get_device()); - const int B = offsets.numel() - 1; - const int D = values.size(1); + const auto B = offsets.numel() - 1; + const auto D = values.size(1); auto output = at::empty_like(values); if (B > 0 && D > 0) { - const int block_dim_x = - std::min(div_round_up(D, kWarpSize) * kWarpSize, kMaxThreads); - const int block_dim_y = kMaxThreads / block_dim_x; + constexpr int THREADS_PER_BLOCK = 128; + const dim3 grid(D, std::min((int32_t)B, (int32_t)kMaxBlockYDim), 1); AT_DISPATCH_INDEX_TYPES( offsets.scalar_type(), "jagged_softmax_kernel_1", [&] { @@ -1889,9 +1951,9 @@ Tensor jagged_softmax_forward( values.scalar_type(), "jagged_softmax_kernel_2", [&] { - jagged_softmax_kernel - << + <<>>( values.packed_accessor32(), @@ -1906,35 +1968,76 @@ Tensor jagged_softmax_forward( return output; } -template +template __global__ __launch_bounds__(kMaxThreads) void jagged_softmax_backward_kernel( const at::PackedTensorAccessor32 grad_output, const at::PackedTensorAccessor32 output, const at::PackedTensorAccessor32 offsets, at::PackedTensorAccessor32 grad_input, const int max_L) { - const int B = offsets.size(0) - 1; - const int D = grad_output.size(1); + const auto B = offsets.size(0) - 1; + const auto D = grad_output.size(1); - const int b_begin = blockIdx.x * blockDim.y + threadIdx.y; - const int b_step = gridDim.x * blockDim.y; - for (int b = b_begin; b < B; b += b_step) { - const int row_start = offsets[b]; - const int row_end = offsets[b + 1]; - const int length = min(row_end - row_start, max_L); - if (length != 0) { - // TODO: use shared memory and better reduction - for (int d = threadIdx.x; d < D; d += blockDim.x) { - scalar_t sum_value = grad_output[row_start][d] * output[row_start][d]; - for (int l = 1; l < length; ++l) { - sum_value += grad_output[row_start + l][d] * output[row_start + l][d]; + // Specialize BlockReduce type for our thread block + typedef cub::BlockReduce BlockReduceT; + + // Allocate shared memory for BlockReduce + __shared__ typename BlockReduceT::TempStorage temp_storage; + + __shared__ scalar_t sum_value; + 
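+ // Math being computed (standard softmax backward, noted here for
+ // reference): with y = softmax(x) over the rows of a bag and upstream
+ // gradient dy, dx_l = (dy_l - sum_j dy_j * y_j) * y_l. sum_value holds the
+ // sum_j dy_j * y_j term for the current (bag b, column d), accumulated one
+ // THREADS_PER_BLOCK-sized chunk of the bag's rows at a time via BlockReduce.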
+ const auto tid = threadIdx.x; + for (uint32_t b = blockIdx.y; b < B; b += gridDim.y) { + const index_t row_start = offsets[b]; + const index_t row_end = offsets[b + 1]; + const auto length = min(row_end - row_start, (index_t)max_L); + + if (length > 0) { + const auto num_l_blocks = + (length + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + + for (uint32_t d = blockIdx.x; d < D; d += gridDim.x) { + if (tid == 0) { + sum_value = 0; + } + + // Loop through all blocks to calculate the sum value + // Each block has its own sum, and sum_value is the sum value across all + // blocks + for (auto bk_l = 0; bk_l < num_l_blocks; bk_l++) { + const auto l = bk_l * blockDim.x + tid; + scalar_t thread_val = 0; + if (l < length) { + thread_val = + grad_output[row_start + l][d] * output[row_start + l][d]; + } + + // Collectively compute the block-wide sum reduction + scalar_t block_sum_value = BlockReduceT(temp_storage).Sum(thread_val); + __syncthreads(); + + if (tid == 0) { + sum_value += block_sum_value; + } } - for (int l = 0; l < length; ++l) { - grad_input[row_start + l][d] = - (grad_output[row_start + l][d] - sum_value) * - output[row_start + l][d]; + // The sum_value was updated by thread 0 in the last loop, sync here to + // make sure the next loop uses the updated sum_value + __syncthreads(); + + for (auto bk_l = 0; bk_l < num_l_blocks; bk_l++) { + const auto l = bk_l * blockDim.x + tid; + if (l < length) { + grad_input[row_start + l][d] = + (grad_output[row_start + l][d] - sum_value) * + output[row_start + l][d]; + } } + + // The sum_value will be reinitialized by thread 0 in the + // next d iteration, sync here to make sure the last loop still uses the + // reduced value before reinitialization + __syncthreads(); } } } @@ -1952,14 +2055,13 @@ Tensor jagged_softmax_backward( at::cuda::OptionalCUDAGuard device_guard; device_guard.set_index(grad_output.get_device()); - const int B = offsets.numel() - 1; - const int D = grad_output.size(1); + const auto B = offsets.numel() - 1; + const auto D = grad_output.size(1); auto grad_input = at::empty_like(grad_output); if (B > 0 && D > 0) { - const int block_dim_x = - std::min(div_round_up(D, kWarpSize) * kWarpSize, kMaxThreads); - const int block_dim_y = kMaxThreads / block_dim_x; + constexpr int THREADS_PER_BLOCK = 128; + const dim3 grid(D, std::min((int32_t)B, (int32_t)kMaxBlockYDim), 1); AT_DISPATCH_INDEX_TYPES( offsets.scalar_type(), "jagged_softmax_backward_kernel_1", [&] { @@ -1969,9 +2071,12 @@ Tensor jagged_softmax_backward( grad_output.scalar_type(), "jagged_softmax_backward_kernel_2", [&] { - jagged_softmax_backward_kernel - << + <<>>( grad_output.packed_accessor32(), @@ -1986,7 +2091,7 @@ Tensor jagged_softmax_backward( return grad_input; } -template +template __global__ __launch_bounds__(kMaxThreads) void jagged_jagged_bmm_kernel( const at::PackedTensorAccessor32 x_values, const at::PackedTensorAccessor32 y_values, @@ -1997,30 +2102,53 @@ __global__ __launch_bounds__(kMaxThreads) void jagged_jagged_bmm_kernel( const int M = x_values.size(1); const int N = y_values.size(1); - const int b_m_begin = blockIdx.x * blockDim.y + threadIdx.y; - const int b_m_step = gridDim.x * blockDim.y; - for (int b_m = b_m_begin; b_m < B * M; b_m += b_m_step) { - const int b = b_m / M; - const int m = b_m % M; + const auto block_row = blockIdx.y; + const auto block_col = blockIdx.x; + const auto row = threadIdx.y; + const auto col = threadIdx.x; + __shared__ scalar_t Xs[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ scalar_t Ys[BLOCK_SIZE][BLOCK_SIZE]; + + for (uint32_t b = 
blockIdx.z; b < B; b += gridDim.z) { + const index_t row_start = offsets[b]; + const index_t row_end = offsets[b + 1]; + const auto length = min(row_end - row_start, (index_t)max_L); + auto num_l_blocks = (length + BLOCK_SIZE - 1) / BLOCK_SIZE; + + at::acc_type acc = 0; + + const auto row_offset = block_row * BLOCK_SIZE + row; + const auto col_offset = block_col * BLOCK_SIZE + col; + + // for loop block tile in length dimension + for (auto bk_l = 0; bk_l < num_l_blocks; bk_l++) { + Xs[row][col] = 0; + Ys[row][col] = 0; + const auto bk_offset = bk_l * BLOCK_SIZE; + + // load data from global memory to shared memory + const auto l_x = bk_offset + col; + if (row_offset < M && l_x < length) { + Xs[row][col] = x_values[row_start + l_x][row_offset]; + } - const int row_start = offsets[b]; - const int row_end = offsets[b + 1]; - const int length = min(row_end - row_start, max_L); - if (length == 0) { - for (int n = threadIdx.x; n < N; n += blockDim.x) { - output[b][m][n] = 0; + const auto l_y = bk_offset + row; + if (l_y < length && col_offset < N) { + Ys[row][col] = y_values[row_start + l_y][col_offset]; } - } else { - // TODO: use shared memory and better reduction - for (int n = threadIdx.x; n < N; n += blockDim.x) { - at::acc_type acc = - x_values[row_start][m] * y_values[row_start][n]; - for (int l = 1; l < length; ++l) { - acc += x_values[row_start + l][m] * y_values[row_start + l][n]; - } - output[b][m][n] = acc; + + __syncthreads(); + +#pragma unroll + for (auto e = 0; e < BLOCK_SIZE; e++) { + acc += Xs[row][e] * Ys[e][col]; } + __syncthreads(); } + + // write the result to the output + if ((row_offset < M) && (col_offset < N)) + output[b][row_offset][col_offset] = acc; } } @@ -2042,9 +2170,16 @@ Tensor jagged_jagged_bmm_forward( auto output = at::zeros({B, M, N}, x_values.options()); if (B > 0 && M > 0 && N > 0) { - const int block_dim_x = - std::min(div_round_up(N, kWarpSize) * kWarpSize, kMaxThreads); - const int block_dim_y = kMaxThreads / block_dim_x; + constexpr int BLOCK_SIZE = 16; + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const auto grid_dim_x = div_round_up(N, BLOCK_SIZE); + const auto grid_dim_y = div_round_up(M, BLOCK_SIZE); + TORCH_CHECK( + grid_dim_y <= kMaxBlockYDim, + "M cannot be larger than", + grid_dim_y * BLOCK_SIZE + 1 - BLOCK_SIZE); + const auto grid_dim_z = std::min(B, kMaxBlockZDim); + const dim3 grid(grid_dim_x, grid_dim_y, grid_dim_z); AT_DISPATCH_INDEX_TYPES( offsets.scalar_type(), "jagged_jagged_bmm_kernel_1", [&] { @@ -2054,11 +2189,8 @@ Tensor jagged_jagged_bmm_forward( x_values.scalar_type(), "jagged_jagged_bmm_kernel_2", [&] { - jagged_jagged_bmm_kernel - <<>>( + jagged_jagged_bmm_kernel + <<>>( x_values.packed_accessor32(), y_values.packed_accessor32(), offsets.packed_accessor32(), @@ -2071,7 +2203,17 @@ Tensor jagged_jagged_bmm_forward( return output; } -template +template < + const int BLOCK_TILE_M, // tile height of C that each thread block + // calculates + const int BLOCK_TILE_N, // tile width of C that each thread block + // calculates + const int BLOCK_TILE_K, // tile width of A that each thread block calculates + const int THREAD_TILE_M, // tile height of C that each thread + // calculates + const int THREAD_TILE_N, // tile width of C that each thread calcualtes + typename index_t, + typename scalar_t> __global__ __launch_bounds__(kMaxThreads) void jagged_dense_bmm_kernel( const at::PackedTensorAccessor32 x_values, const at::PackedTensorAccessor32 x_offsets, @@ -2082,25 +2224,116 @@ __global__ __launch_bounds__(kMaxThreads) void 
jagged_dense_bmm_kernel( const int K = x_values.size(1); const int N = y.size(2); - const int b_l_begin = blockIdx.x * blockDim.y + threadIdx.y; - const int b_l_step = gridDim.x * blockDim.y; - for (int b_l = b_l_begin; b_l < B * max_L; b_l += b_l_step) { - const int b = b_l / max_L; - const int l = b_l % max_L; - - const int row_start = x_offsets[b]; - const int row_end = x_offsets[b + 1]; - const int length = min(row_end - row_start, max_L); - if (length == 0 || l >= length) { - return; - } else { - // TODO: use shared memory and better reduction - for (int n = threadIdx.x; n < N; n += blockDim.x) { - at::acc_type acc = 0; - for (int k = 0; k < K; ++k) { - acc += x_values[row_start + l][k] * y[b][k][n]; + const auto block_row = blockIdx.y; + const auto block_col = blockIdx.x; + + const int THREADS_X_PER_BLOCK = BLOCK_TILE_N / THREAD_TILE_N; + const int THREADS_Y_PER_BLOCK = BLOCK_TILE_M / THREAD_TILE_M; + const int THREADS_PER_BLOCK = THREADS_X_PER_BLOCK * THREADS_Y_PER_BLOCK; + const auto thread_row = threadIdx.x / THREADS_X_PER_BLOCK; + const auto thread_col = threadIdx.x % THREADS_X_PER_BLOCK; + const auto NUM_K_BLOCKS = (K + BLOCK_TILE_K - 1) / BLOCK_TILE_K; + + __shared__ scalar_t As[BLOCK_TILE_M][BLOCK_TILE_K]; + __shared__ scalar_t Bs[BLOCK_TILE_K][BLOCK_TILE_N]; + + // Once we remove ROCm<=5.3 support, we should replace uint32_t with auto. + // See #1655 + for (uint32_t b = blockIdx.z; b < B; b += gridDim.z) { + const index_t row_start = x_offsets[b]; + const index_t row_end = x_offsets[b + 1]; + const auto length = min(row_end - row_start, (index_t)max_L); + + // the indices that this current will load into shared mem + const auto inner_row_a = threadIdx.x / BLOCK_TILE_K; + const auto inner_col_a = threadIdx.x % BLOCK_TILE_K; + // the number of rows of As that will be loaded per step by a thread block + const auto A_TILE_ROW_STRIDE = THREADS_PER_BLOCK / BLOCK_TILE_K; + + const auto inner_row_b = threadIdx.x / BLOCK_TILE_N; + const auto inner_col_b = threadIdx.x % BLOCK_TILE_N; + const auto B_TILE_ROW_STRIDE = THREADS_PER_BLOCK / BLOCK_TILE_N; + + // registers for C + scalar_t accum[THREAD_TILE_M][THREAD_TILE_N] = {0}; + + // registers for As and Bs + scalar_t fragment_a[THREAD_TILE_M] = {0}; + scalar_t fragment_b[THREAD_TILE_N] = {0}; + + // loop for block tiles in K dimension + for (auto block = 0; block < NUM_K_BLOCKS; block++) { +// load a block of x_values from global memory to shared memory +// apply tiling for threads in a block +#pragma unroll + for (auto offset = 0; offset < BLOCK_TILE_M; + offset += A_TILE_ROW_STRIDE) { + auto x_row_offset = block_row * BLOCK_TILE_M + inner_row_a + offset; + auto x_col_offset = block * BLOCK_TILE_K + inner_col_a; + if ((x_row_offset < length) && (x_col_offset < K)) { + As[inner_row_a + offset][inner_col_a] = + x_values[row_start + x_row_offset][x_col_offset]; + } else { + As[inner_row_a + offset][inner_col_a] = 0; + } + } + +// load a block of y from global memory to shared memory +// apply tiling for threads in a block +#pragma unroll + for (auto offset = 0; offset < BLOCK_TILE_K; + offset += B_TILE_ROW_STRIDE) { + auto y_row_offset = block * BLOCK_TILE_K + inner_row_b + offset; + auto y_col_offset = block_col * BLOCK_TILE_N + inner_col_b; + if ((y_row_offset < K) && (y_col_offset < N)) { + Bs[inner_row_b + offset][inner_col_b] = + y[b][y_row_offset][y_col_offset]; + } else { + Bs[inner_row_b + offset][inner_col_b] = 0; + } + } + + __syncthreads(); + +// calculate the results per thread +#pragma unroll + for (auto k = 0; k < 
BLOCK_TILE_K; k++) { + // load values from shared memory to registers for x_values + for (auto row = 0; row < THREAD_TILE_M; row++) { + fragment_a[row] = As[thread_row * THREAD_TILE_M + row][k]; + } + +// load values from shared memory to registers for y +#pragma unroll + for (auto col = 0; col < THREAD_TILE_N; col++) { + fragment_b[col] = Bs[k][thread_col * THREAD_TILE_N + col]; + } + +// each thread calcualtes THREAD_TILE_M * THREAD_TILE_N elements +#pragma unroll + for (auto row = 0; row < THREAD_TILE_M; row++) { +#pragma unroll + for (auto col = 0; col < THREAD_TILE_N; col++) { + accum[row][col] += fragment_a[row] * fragment_b[col]; + } + } + } + + __syncthreads(); + } + +// write the result to the output +#pragma unroll + for (auto row = 0; row < THREAD_TILE_M; row++) { +#pragma unroll + for (auto col = 0; col < THREAD_TILE_N; col++) { + auto out_row_offset = + block_row * BLOCK_TILE_M + thread_row * THREAD_TILE_M + row; + auto out_col_offset = + block_col * BLOCK_TILE_N + thread_col * THREAD_TILE_N + col; + if ((out_row_offset < length) && (out_col_offset < N)) { + output[row_start + out_row_offset][out_col_offset] = accum[row][col]; } - output[row_start + l][n] = acc; } } } @@ -2124,9 +2357,29 @@ Tensor jagged_dense_bmm_forward( const int total_L = x_values.size(0); auto output = at::zeros({total_L, N}, x_values.options()); if (B > 0 && M > 0 && N > 0) { - const int block_dim_x = - std::min(div_round_up(N, kWarpSize) * kWarpSize, kMaxThreads); - const int block_dim_y = kMaxThreads / block_dim_x; + // The shared memory size is (BLOCK_TILE_M + BLOCK_TILE_N) * BLOCK_TILE_K + // BLOCK_TILE_M needs to be multiple of THREAD_TILE_M, and + // BLOCK_TILE_N needs to be multiple of THREAD_TILE_N + // The setting of these parameters needs to balance the hardware's shared + // memory size limit and occupancy + // TODO: autotune these parameters based on max_L and input and output + // tensor sizes + constexpr int BLOCK_TILE_M = 64; + constexpr int BLOCK_TILE_N = 8; + constexpr int BLOCK_TILE_K = 8; + constexpr int THREAD_TILE_M = 4; + constexpr int THREAD_TILE_N = 4; + + const dim3 block( + (BLOCK_TILE_M * BLOCK_TILE_N) / (THREAD_TILE_M * THREAD_TILE_N)); + const auto grid_dim_x = div_round_up(N, BLOCK_TILE_N); + const auto grid_dim_y = div_round_up(max_L, BLOCK_TILE_M); + TORCH_CHECK( + grid_dim_y <= kMaxBlockYDim, + "max_L cannot be larger than", + grid_dim_y * BLOCK_TILE_M + 1 - BLOCK_TILE_M); + const auto grid_dim_z = std::min(B, kMaxBlockZDim); + const dim3 grid(grid_dim_x, grid_dim_y, grid_dim_z); AT_DISPATCH_INDEX_TYPES( x_offsets.scalar_type(), "jagged_dense_bmm_kernel_1", [&] { @@ -2136,11 +2389,15 @@ Tensor jagged_dense_bmm_forward( x_values.scalar_type(), "jagged_dense_bmm_kernel_2", [&] { - jagged_dense_bmm_kernel - <<>>( + jagged_dense_bmm_kernel< + BLOCK_TILE_M, + BLOCK_TILE_N, + BLOCK_TILE_K, + THREAD_TILE_M, + THREAD_TILE_N, + index_t, + scalar_t> + <<>>( x_values.packed_accessor32(), x_offsets.packed_accessor32(), y.packed_accessor32(), diff --git a/fbgemm_gpu/src/jagged_tensor_ops_autograd.cpp b/fbgemm_gpu/src/jagged_tensor_ops_autograd.cpp index 283422b7ae..347ec089e0 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops_autograd.cpp +++ b/fbgemm_gpu/src/jagged_tensor_ops_autograd.cpp @@ -644,7 +644,7 @@ jagged_dense_elementwise_add_jagged_output( const Tensor& y) { // Convert to jagged auto jagged_values = - DenseToJaggedOp::apply(y, x_offsets, c10::optional())[0]; + DenseToJaggedOp::apply(y, x_offsets, x_values.size(0))[0]; // Add jagged_values + x_values -> sum_values auto 
sum_values = x_values + jagged_values; diff --git a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp index ed3c075bd0..d03b961a79 100644 --- a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp +++ b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp @@ -23,72 +23,85 @@ using Tensor = at::Tensor; namespace { -// Hilariously unoptimized, but algorithmic correctness matters more here, and -// we only do it once. -AdjacencyMatrix get_intermediate_node(AdjacencyMatrix links) { - auto world_size = at::cuda::getNumGPUs(); - auto intermediate_node = [&](Node i, Node j) { - if (i == j) { - return std::vector{-1}; - } - if (links(i, j) != 0) { - return std::vector{-1}; - } +struct DirectConnectedPeer { + int64_t num_peer_links; + int64_t peer_id; + // number of transfers from peer + int32_t peer_transfers; +}; - std::vector> paths; - for (const auto k : c10::irange(world_size)) { - if (k != i && k != j && links(i, k) != 0 && links(k, j) != 0) { - paths.push_back({k, links(i, k) + links(k, j)}); - } - } - if (paths.empty()) { - LOG(WARNING) - << "Expect very bad performance for p2p copies, we are going via sys path for GPU " - << i << " -> GPU " << j; - return std::vector{-1}; - } - auto mp = std::max_element( - paths.begin(), - paths.end(), - [](std::pair a, std::pair b) { - return a.second < b.second; - }) - ->second; - std::vector candidates; - for (const auto& p : paths) { - if (p.second == mp) { - candidates.push_back(p.first); - } - } - return candidates; - }; +struct TwoHopTransferContainer { + Tensor intermediate_tensor; + uint64_t output_idx; + std::unique_ptr transfer_cuda_event; +}; - std::vector assignments(world_size * world_size); - // Use a two-phase assignment protocol as the greedy approach - // can lead to unbalanced usage. 
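For context on the BLOCK_TILE_*/THREAD_TILE_* constants chosen in jagged_dense_bmm_forward earlier in this diff: each thread block runs (BLOCK_TILE_M * BLOCK_TILE_N) / (THREAD_TILE_M * THREAD_TILE_N) threads and stages one As tile and one Bs tile in shared memory. A small standalone sketch of that arithmetic, assuming fp32 values:

#include <cstdio>

int main() {
  // Tile constants as chosen in jagged_dense_bmm_forward.
  constexpr int BLOCK_TILE_M = 64, BLOCK_TILE_N = 8, BLOCK_TILE_K = 8;
  constexpr int THREAD_TILE_M = 4, THREAD_TILE_N = 4;

  // Each thread produces a THREAD_TILE_M x THREAD_TILE_N sub-tile of the
  // BLOCK_TILE_M x BLOCK_TILE_N output tile handled by its block.
  constexpr int threads_per_block =
      (BLOCK_TILE_M * BLOCK_TILE_N) / (THREAD_TILE_M * THREAD_TILE_N); // 32
  // As[BLOCK_TILE_M][BLOCK_TILE_K] + Bs[BLOCK_TILE_K][BLOCK_TILE_N], fp32.
  constexpr int smem_bytes_per_block =
      (BLOCK_TILE_M * BLOCK_TILE_K + BLOCK_TILE_K * BLOCK_TILE_N) * 4; // 2304

  std::printf("threads/block = %d, shared mem/block = %d bytes\n",
              threads_per_block, smem_bytes_per_block);
  return 0;
}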
- std::unordered_map uses; +AdjacencyMatrix get_intermediate_node( + const AdjacencyMatrix& links) { + const auto world_size = at::cuda::getNumGPUs(); + std::vector link_vec(static_cast(world_size * world_size)); for (const auto i : c10::irange(world_size)) { for (const auto j : c10::irange(world_size)) { - auto ims = intermediate_node(i, j); - if (ims.size() == 1) { - auto v = ims.front(); - if (v != -1) { - uses[v] += 1; - } - assignments[i * world_size + j] = v; - } + link_vec[i * world_size + j] = links(i, j); } } + auto link_tensor = at::from_blob( + link_vec.data(), + {world_size, world_size}, + at::TensorOptions().dtype(at::kLong)); + LOG(INFO) << "NVLink Topology Matrix: \n" << link_tensor; + std::vector assignments( + static_cast(world_size * world_size), -1); + for (const auto dst_rank_id : c10::irange(world_size)) { + std::vector non_direct_src_ids; + non_direct_src_ids.reserve(world_size); + std::vector direct_connected_peers; + direct_connected_peers.reserve(world_size); + for (const auto src_rank_id : c10::irange(world_size)) { + if (dst_rank_id == src_rank_id) { + continue; + } - for (const auto i : c10::irange(world_size)) { - for (const auto j : c10::irange(world_size)) { - auto ims = intermediate_node(i, j); - if (ims.size() > 1) { - auto v = *std::min_element(ims.begin(), ims.end(), [&](Node a, Node b) { - return uses[a] < uses[b]; - }); - uses[v] += 1; - assignments[i * world_size + j] = v; + const auto num_peer_links = links(dst_rank_id, src_rank_id); + if (num_peer_links > 0) { + direct_connected_peers.push_back( + {.num_peer_links = num_peer_links, + .peer_id = src_rank_id, + .peer_transfers = 1}); + } else { + non_direct_src_ids.push_back(src_rank_id); + } + } + + // Assign intermediate hop ranks for non-directly connected peers. + // Assigns intermediate hops based on the number of links from the + // potential intermediate rank to target rank, as well as + // the number of two_hop connections already assigned to the + // intermediate rank. + for (const auto i : c10::irange(non_direct_src_ids.size())) { + std::sort( + direct_connected_peers.begin(), + direct_connected_peers.end(), + [](const auto& a, const auto& b) { + if (a.num_peer_links > b.num_peer_links) { + return true; + } else if (a.num_peer_links == b.num_peer_links) { + return a.peer_transfers < b.peer_transfers; + } else { + return false; + } + }); + const auto non_direct_src_id = non_direct_src_ids.at(i); + for (auto& j : direct_connected_peers) { + const auto potential_hop_id = j.peer_id; + const auto potential_hop_peer_links = + links(potential_hop_id, non_direct_src_id); + if (potential_hop_peer_links > 0) { + assignments[dst_rank_id * world_size + non_direct_src_id] = + potential_hop_id; + j.peer_transfers += 1; + break; + } } } } @@ -100,7 +113,8 @@ AdjacencyMatrix get_intermediate_node(AdjacencyMatrix links) { {world_size, world_size}, at::TensorOptions().dtype(at::kLong)); LOG(INFO) << "Detected a multi-hop NVLink configuration: \n" << tensor; - return [=](Node i, Node j) { return assignments[i * world_size + j]; }; + return + [=](Node src, Node dst) { return assignments[dst * world_size + src]; }; } else { return [](Node, Node) { return -1; }; } @@ -111,7 +125,7 @@ AdjacencyMatrix get_intermediate_node(AdjacencyMatrix links) { // tensor in `input_tensors` is already in the `target_device`, we will skip // copy it if `skip_if_same_device` is true. 
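The rewritten get_intermediate_node above replaces the old exhaustive path search with a greedy, load-balanced pick: for every (destination, source) pair with no direct link, it prefers the destination's directly connected peer with the most links, breaking ties by how many relays that peer has already been assigned. A minimal standalone sketch of the same heuristic on a hypothetical 4-GPU link matrix (the matrix and values are illustrative only):

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  constexpr int world_size = 4;
  // links[i][j] = number of links between GPU i and GPU j (made-up topology).
  const int links[world_size][world_size] = {
      {0, 2, 1, 0},
      {2, 0, 1, 1},
      {1, 1, 0, 2},
      {0, 1, 2, 0},
  };

  std::vector<int> assignments(world_size * world_size, -1);
  for (int dst = 0; dst < world_size; ++dst) {
    struct Peer { int links_to_dst; int id; int transfers; };
    std::vector<Peer> direct;
    std::vector<int> non_direct;
    for (int src = 0; src < world_size; ++src) {
      if (src == dst) continue;
      if (links[dst][src] > 0) direct.push_back({links[dst][src], src, 1});
      else non_direct.push_back(src);
    }
    for (int src : non_direct) {
      // Most links to dst first; among equals, the least-loaded relay first.
      std::sort(direct.begin(), direct.end(), [](const Peer& a, const Peer& b) {
        return a.links_to_dst != b.links_to_dst ? a.links_to_dst > b.links_to_dst
                                                : a.transfers < b.transfers;
      });
      for (auto& p : direct) {
        if (links[p.id][src] > 0) {  // the relay can reach the source
          assignments[dst * world_size + src] = p.id;
          p.transfers += 1;
          break;
        }
      }
    }
  }
  // GPU 3 -> GPU 0 has no direct link; expect it to be relayed through GPU 1.
  std::printf("hop for src=3, dst=0: GPU %d\n", assignments[0 * world_size + 3]);
  return 0;
}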
void all_to_one( - std::vector& input_tensors, + const std::vector& input_tensors, std::vector& output_tensors, at::Device target_device, bool skip_if_same_device) { @@ -119,19 +133,48 @@ void all_to_one( std::vector copy_begin_events(num_gpus); std::vector copy_completion_events(num_gpus); + std::vector two_hop_transfers; + two_hop_transfers.reserve(input_tensors.size()); + std::vector is_two_hop_transfer; + is_two_hop_transfer.reserve(input_tensors.size()); + static auto intermediate_nodes = get_intermediate_node(fbgemm_gpu::get_nvlink_matrix()); - for (auto& ten : input_tensors) { - Node src_device_id = ten.get_device(); + for (const auto i : c10::irange(input_tensors.size())) { + const auto& src = input_tensors.at(i); + Node src_device_id = src.get_device(); auto intermediate_node = intermediate_nodes(src_device_id, target_device.index()); if (intermediate_node != -1) { - ten = ten.to(at::Device(at::kCUDA, intermediate_node)); + two_hop_transfers.push_back( + {.intermediate_tensor = at::empty( + src.sizes(), + src.options().device(at::Device(at::kCUDA, intermediate_node))), + .output_idx = i, + .transfer_cuda_event = + std::make_unique(cudaEventDisableTiming)}); + auto& dst = two_hop_transfers.back().intermediate_tensor; + at::cuda::CUDAStream copy_stream = + at::cuda::getCurrentCUDAStream(src_device_id); + AT_CUDA_CHECK(cudaMemcpy2DAsync( + dst.data_ptr(), + dst.stride(0) * dst.element_size(), + src.data_ptr(), + src.stride(0) * src.element_size(), + src.size(1) * src.element_size(), + src.size(0), + cudaMemcpyDeviceToDevice, + copy_stream)); + two_hop_transfers.back().transfer_cuda_event->record(copy_stream); + is_two_hop_transfer.push_back(true); + } else { + is_two_hop_transfer.push_back(false); } } - // For each source device, we sync its current stream and launch all the - // copies that are from that device. + // For each source device directly connected to the destination device, we + // sync its current stream and launch all the copies that are from that + // device. 
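The two-hop path set up above stages the source tensor on an intermediate GPU and records a CUDA event on the source stream; the second hop, issued further below, makes the intermediate GPU's stream wait on that event before forwarding to the target. A simplified host-side sketch of that ordering, written against the raw CUDA runtime rather than the ATen stream/event wrappers, assuming contiguous buffers and omitting error checks (the real code uses cudaMemcpy2DAsync for strided tensors plus an extra begin-event handshake with the target stream):

#include <cuda_runtime.h>

void two_hop_copy(
    const void* src,          // tensor on the source GPU
    void* staging,            // buffer on the intermediate GPU
    void* dst,                // output slice on the target GPU
    size_t nbytes,
    int src_dev,
    int hop_dev,
    cudaStream_t src_stream,  // current stream on src_dev
    cudaStream_t hop_stream)  // current stream on hop_dev
{
  // First hop: copy to the intermediate GPU on the source GPU's stream and
  // mark completion with an event (timing disabled, as in the patch).
  cudaEvent_t first_hop_done;
  cudaSetDevice(src_dev);
  cudaEventCreateWithFlags(&first_hop_done, cudaEventDisableTiming);
  cudaMemcpyAsync(staging, src, nbytes, cudaMemcpyDeviceToDevice, src_stream);
  cudaEventRecord(first_hop_done, src_stream);

  // Second hop: the intermediate GPU's stream waits for the first hop to
  // land before forwarding the staged bytes to the target GPU.
  cudaSetDevice(hop_dev);
  cudaStreamWaitEvent(hop_stream, first_hop_done, 0);
  cudaMemcpyAsync(dst, staging, nbytes, cudaMemcpyDeviceToDevice, hop_stream);
  cudaEventDestroy(first_hop_done);
}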
for (const auto device_id : c10::irange(num_gpus)) { auto src_device = at::Device(at::kCUDA, device_id); if (src_device == target_device) { @@ -160,6 +203,13 @@ void all_to_one( device_guard.set_device(src_device); dst_ready.block(copy_stream); for (const auto i : c10::irange(input_tensors.size())) { + const auto metadata = is_two_hop_transfer.at(i); + // Initiate all transfer for tensors with direct + // NVLink connection to target rank + if (metadata) { + continue; + } + auto& src = input_tensors[i]; if (src.device() != src_device) { continue; @@ -179,6 +229,43 @@ void all_to_one( } } + // Complete 2-hop transfers to target rank + for (auto& two_hop_transfer : two_hop_transfers) { + const auto& src = two_hop_transfer.intermediate_tensor; + const auto src_device_id = src.get_device(); + const auto src_device = at::Device(at::kCUDA, src_device_id); + if (src_device == target_device) { + continue; + } + + // intermediate rank + at::cuda::CUDAGuard device_guard(src_device); + // intermediate rank stream + at::cuda::CUDAStream copy_stream = + at::cuda::getCurrentCUDAStream(src_device_id); + // wait on first hop transfer + two_hop_transfer.transfer_cuda_event->block(copy_stream); + // synchronize with target rank + auto& dst_ready = copy_begin_events[src_device_id]; + device_guard.set_device(target_device); + dst_ready.record(at::cuda::getCurrentCUDAStream(target_device.index())); + device_guard.set_device(src_device); + dst_ready.block(copy_stream); + // originating tensor output position + const auto output_index = two_hop_transfer.output_idx; + auto& dst = output_tensors.at(output_index); + // on source device, launch memcpy. + AT_CUDA_CHECK(cudaMemcpy2DAsync( + dst.data_ptr(), + dst.stride(0) * dst.element_size(), + src.data_ptr(), + src.stride(0) * src.element_size(), + src.size(1) * src.element_size(), + src.size(0), + cudaMemcpyDeviceToDevice, + copy_stream)); + } + // Do the same-GPU cases. if (!skip_if_same_device) { for (const auto i : c10::irange(input_tensors.size())) { diff --git a/fbgemm_gpu/src/sparse_ops_gpu.cpp b/fbgemm_gpu/src/sparse_ops_gpu.cpp index e3e1225fb9..0126ff414f 100644 --- a/fbgemm_gpu/src/sparse_ops_gpu.cpp +++ b/fbgemm_gpu/src/sparse_ops_gpu.cpp @@ -500,12 +500,41 @@ Tensor index_select_dim0_gpu( std::vector group_index_select_dim0_gpu( const std::vector& input_group, const std::vector& indices_group) { + const auto group_size = input_group.size(); std::vector output_group; - apply_( - [&](auto&&... args) { - output_group = GroupIndexSelectDim0GPUOp::apply(indices_group, args...); - }, - input_group); + // We use the APPLY_AUTOGRAD_FN macros to instantiate + // GroupIndexSelectDim0GPUOp for different group sizes. We only instantiate + // up to group size of 54. + constexpr size_t max_group_size = 54; + // Specialize this path to avoid copy + if (group_size <= max_group_size) { + apply_( + [&](auto&&... args) { + output_group = + GroupIndexSelectDim0GPUOp::apply(indices_group, args...); + }, + input_group); + return output_group; + } + + const auto input_itr = input_group.begin(); + const auto indices_itr = indices_group.begin(); + + for (size_t start = 0; start < group_size; start += max_group_size) { + const auto end = std::min(start + max_group_size, group_size); + std::vector input_subgroup(input_itr + start, input_itr + end); + std::vector indices_subgroup( + indices_itr + start, indices_itr + end); + std::vector output_subgroup; + apply_( + [&](auto&&... 
args) { + output_subgroup = + GroupIndexSelectDim0GPUOp::apply(indices_subgroup, args...); + }, + input_subgroup); + output_group.insert( + output_group.end(), output_subgroup.begin(), output_subgroup.end()); + } return output_group; } diff --git a/fbgemm_gpu/src/split_embeddings_cache_cuda.cu b/fbgemm_gpu/src/split_embeddings_cache_cuda.cu index 9d23ee9fff..513f32cf8e 100644 --- a/fbgemm_gpu/src/split_embeddings_cache_cuda.cu +++ b/fbgemm_gpu/src/split_embeddings_cache_cuda.cu @@ -79,6 +79,18 @@ enum uvm_cache_stats_index { num_conflict_misses = 5, }; +// Experiments showed that performance of lru/lxu_cache_find_uncached_kernel is +// not sensitive to grid size as long as the number thread blocks per SM is not +// too small nor too big. +constexpr int MAX_THREAD_BLOCKS_PER_SM_FOR_CACHE_KERNELS = 16; + +int get_max_thread_blocks_for_cache_kernels_() { + cudaDeviceProp* deviceProp = + at::cuda::getDeviceProperties(c10::cuda::current_device()); + return deviceProp->multiProcessorCount * + MAX_THREAD_BLOCKS_PER_SM_FOR_CACHE_KERNELS; +} + } // namespace int64_t host_lxu_cache_slot(int64_t h_in, int64_t C) { @@ -495,6 +507,67 @@ std::tuple> get_unique_indices_cuda( namespace { +__global__ __launch_bounds__(kMaxThreads) void emulate_cache_miss_kernel( + at::PackedTensorAccessor32 + lxu_cache_locations, + const int64_t enforced_misses_per_256, + const bool gather_cache_stats, + at::PackedTensorAccessor32 + uvm_cache_stats) { + const int32_t N = lxu_cache_locations.size(0); + int64_t n_enforced_misses = 0; + CUDA_KERNEL_LOOP(n, N) { + if ((n & 0x00FF) < enforced_misses_per_256) { + if (lxu_cache_locations[n] >= 0) { + n_enforced_misses++; + } + lxu_cache_locations[n] = kCacheLocationMissing; + } + } + if (gather_cache_stats && n_enforced_misses > 0) { + atomicAdd( + &uvm_cache_stats[uvm_cache_stats_index::num_conflict_misses], + n_enforced_misses); + } +} +} // namespace + +Tensor emulate_cache_miss( + Tensor lxu_cache_locations, + const int64_t enforced_misses_per_256, + const bool gather_cache_stats, + Tensor uvm_cache_stats) { + TENSOR_ON_CUDA_GPU(lxu_cache_locations); + TENSOR_ON_CUDA_GPU(uvm_cache_stats); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(lxu_cache_locations.get_device()); + + const auto N = lxu_cache_locations.numel(); + if (N == 0) { + // nothing to do + return lxu_cache_locations; + } + + const dim3 blocks(std::min( + div_round_up(N, kMaxThreads), + get_max_thread_blocks_for_cache_kernels_())); + + emulate_cache_miss_kernel<<< + blocks, + kMaxThreads, + 0, + at::cuda::getCurrentCUDAStream()>>>( + lxu_cache_locations + .packed_accessor32(), + enforced_misses_per_256, + gather_cache_stats, + uvm_cache_stats.packed_accessor32()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + return lxu_cache_locations; +} + +namespace { template __global__ __launch_bounds__(kMaxThreads) void lru_cache_find_uncached_kernel( const at::PackedTensorAccessor32 @@ -622,19 +695,6 @@ __launch_bounds__(kMaxThreads) void direct_mapped_lru_cache_find_uncached_kernel } } } - -// Experiments showed that performance of lru/lxu_cache_find_uncached_kernel is -// not sensitive to grid size as long as the number thread blocks per SM is not -// too small nor too big. 
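emulate_cache_miss above forces a miss for every position n with (n & 0x00FF) < enforced_misses_per_256, i.e. the first enforced_misses_per_256 slots of each 256-entry window, counting only slots that were hits to begin with. A small standalone check of the resulting count, which matches the closed form asserted by the new uvm_cache_miss_emulate_test later in this diff:

#include <algorithm>
#include <cassert>
#include <cstdint>

int64_t expected_enforced_misses(int64_t N, int64_t enforced_misses_per_256) {
  const int64_t full_windows = N / 256;
  const int64_t tail = N - full_windows * 256;
  return full_windows * enforced_misses_per_256 +
      std::min(tail, enforced_misses_per_256);
}

int main() {
  // 10000 requests = 39 full windows of 256 plus a tail of 16 indices.
  assert(expected_enforced_misses(10000, 5) == 39 * 5 + 5);  // 200
  assert(expected_enforced_misses(10000, 256) == 10000);     // every slot misses
  assert(expected_enforced_misses(100, 7) == 7);             // tail-only case
  return 0;
}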
-constexpr int MAX_THREAD_BLOCKS_PER_SM_FOR_CACHE_KERNELS = 16; - -int get_max_thread_blocks_for_cache_kernels_() { - cudaDeviceProp* deviceProp = - at::cuda::getDeviceProperties(c10::cuda::current_device()); - return deviceProp->multiProcessorCount * - MAX_THREAD_BLOCKS_PER_SM_FOR_CACHE_KERNELS; -} - } // namespace std::pair lru_cache_find_uncached_cuda( @@ -798,8 +858,8 @@ __global__ __launch_bounds__(kMaxThreads) void lru_cache_insert_kernel( at::PackedTensorAccessor32 uvm_cache_stats) { const int32_t C = lxu_cache_state.size(0); - int64_t n_conflict_misses = 0; - int64_t n_inserted = 0; + int32_t n_conflict_misses = 0; + int32_t n_inserted = 0; for (int32_t n = blockIdx.x * blockDim.y + threadIdx.y; n < *N_unique; n += gridDim.x * blockDim.y) { // check if this warp is responsible for this whole segment. diff --git a/fbgemm_gpu/test/input_combine_test.py b/fbgemm_gpu/test/input_combine_test.py index 74f7581576..07102aec90 100644 --- a/fbgemm_gpu/test/input_combine_test.py +++ b/fbgemm_gpu/test/input_combine_test.py @@ -11,12 +11,20 @@ from typing import List, Optional, Tuple import torch +from hypothesis import given, settings try: # pyre-ignore[21] from fbgemm_gpu import open_source # noqa: F401 + + # pyre-ignore[21] + from test_utils import cpu_and_maybe_gpu except Exception: + torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:input_combine") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:input_combine_cpu") + from fbgemm_gpu.test.test_utils import cpu_and_maybe_gpu + +DEFAULT_DEVICE = torch.device("cpu") class TBEInputPrepareReference(torch.nn.Module): @@ -120,23 +128,23 @@ def forward( class InputCombineTest(unittest.TestCase): - def _get_inputs(self, dtypes): + def _get_inputs(self, dtypes, device=DEFAULT_DEVICE): indices_list = [ - torch.tensor([1, 2, 3], dtype=dtypes[0]), - torch.tensor([1, 2, 3, 4], dtype=dtypes[1]), + torch.tensor([1, 2, 3], dtype=dtypes[0], device=device), + torch.tensor([1, 2, 3, 4], dtype=dtypes[1], device=device), ] offsets_list = [ - torch.tensor([0, 2], dtype=dtypes[0]), - torch.tensor([0, 1, 4], dtype=dtypes[1]), + torch.tensor([0, 2], dtype=dtypes[0], device=device), + torch.tensor([0, 1, 4], dtype=dtypes[1], device=device), ] include_last_offsets = [False, True] per_sample_weights = [ - torch.tensor([1, 2, 1], dtype=torch.float), - torch.tensor([1, 2, 1, 3], dtype=torch.float), + torch.tensor([1, 2, 1], dtype=torch.float, device=device), + torch.tensor([1, 2, 1, 3], dtype=torch.float, device=device), ] empty_per_sample_weights = [ - torch.tensor([], dtype=torch.float), - torch.tensor([], dtype=torch.float), + torch.tensor([], dtype=torch.float, device=device), + torch.tensor([], dtype=torch.float, device=device), ] return ( indices_list, @@ -226,27 +234,34 @@ def _run_padding_fused_test(self, dtypes, batch_size) -> None: self.assertTrue(outputs[1].dtype == torch.int32) self.assertTrue(outputs[-1].size(0) == 0) - def _offsets_to_lengths(self, offsets, indices, include_last_offsets): + def _offsets_to_lengths( + self, offsets, indices, include_last_offsets, device=DEFAULT_DEVICE + ): if include_last_offsets: offsets_complete = offsets else: offsets_complete = torch.cat( - [offsets, torch.tensor([indices.numel()], dtype=offsets.dtype)] + [ + offsets, + torch.tensor([indices.numel()], dtype=offsets.dtype, device=device), + ] ) return offsets_complete[1:] - offsets_complete[:-1] - def _run_test_with_length(self, dtypes) -> None: + def _run_test_with_length(self, dtypes, device=DEFAULT_DEVICE) -> None: ( indices_list, offsets_list, 
per_sample_weights, empty_per_sample_weights, include_last_offsets, - ) = self._get_inputs(dtypes) + ) = self._get_inputs(dtypes, device=device) ref_mod = TBEInputPrepareReference(include_last_offsets) lengths_list = [ - self._offsets_to_lengths(offsets, indices, include_last_offsets) + self._offsets_to_lengths( + offsets, indices, include_last_offsets, device=device + ) for offsets, indices, include_last_offsets in zip( offsets_list, indices_list, include_last_offsets ) @@ -307,14 +322,20 @@ def test_input_combine_int32(self) -> None: def test_input_combined_mix(self) -> None: self._run_test((torch.int64, torch.int32)) - def test_input_combine_int64_with_length(self) -> None: - self._run_test_with_length((torch.int64, torch.int64)) + @given(device=cpu_and_maybe_gpu()) + @settings(deadline=None) + def test_input_combine_int64_with_length(self, device: torch.device) -> None: + self._run_test_with_length((torch.int64, torch.int64), device=device) - def test_input_combine_int32_with_length(self) -> None: - self._run_test_with_length((torch.int64, torch.int64)) + @given(device=cpu_and_maybe_gpu()) + @settings(deadline=None) + def test_input_combine_int32_with_length(self, device: torch.device) -> None: + self._run_test_with_length((torch.int32, torch.int32), device=device) - def test_input_combined_mix_with_length(self) -> None: - self._run_test_with_length((torch.int64, torch.int32)) + @given(device=cpu_and_maybe_gpu()) + @settings(deadline=None) + def test_input_combine_mix_with_length(self, device: torch.device) -> None: + self._run_test_with_length((torch.int64, torch.int32), device=device) def test_padding_fused_input_combine_int64(self) -> None: self._run_padding_fused_test((torch.int64, torch.int64), 64) diff --git a/fbgemm_gpu/test/jagged_tensor_ops_test.py b/fbgemm_gpu/test/jagged_tensor_ops_test.py index fa65a8bb49..98021007f4 100644 --- a/fbgemm_gpu/test/jagged_tensor_ops_test.py +++ b/fbgemm_gpu/test/jagged_tensor_ops_test.py @@ -20,7 +20,12 @@ from fbgemm_gpu import open_source # noqa: F401 # pyre-ignore[21] - from test_utils import gpu_available, gpu_unavailable, running_on_github, TEST_WITH_ROCM + from test_utils import ( + gpu_available, + gpu_unavailable, + running_on_github, + TEST_WITH_ROCM, + ) except Exception: torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu") diff --git a/fbgemm_gpu/test/merge_pooled_embeddings_test.py b/fbgemm_gpu/test/merge_pooled_embeddings_test.py index de7c80b79b..98e1ede2ee 100644 --- a/fbgemm_gpu/test/merge_pooled_embeddings_test.py +++ b/fbgemm_gpu/test/merge_pooled_embeddings_test.py @@ -100,7 +100,7 @@ def ref(pooled_ad_embeddings, batch_indices): r=st.randoms(use_true_random=False), ) # Can instantiate 8 contexts which takes a long time. 
- @settings(verbosity=Verbosity.verbose, max_examples=10, deadline=None) + @settings(verbosity=Verbosity.verbose, max_examples=40, deadline=None) def test_all_to_one_device( self, num_inputs, diff --git a/fbgemm_gpu/test/split_table_batched_embeddings_test.py b/fbgemm_gpu/test/split_table_batched_embeddings_test.py index c0ba5f6f64..ddab386bf0 100644 --- a/fbgemm_gpu/test/split_table_batched_embeddings_test.py +++ b/fbgemm_gpu/test/split_table_batched_embeddings_test.py @@ -9,10 +9,11 @@ import copy +import math import pickle import random import unittest -from typing import List, Optional +from typing import List, Optional, Tuple import fbgemm_gpu import fbgemm_gpu.split_table_batched_embeddings_ops as split_table_batched_embeddings_ops @@ -31,11 +32,16 @@ ) from fbgemm_gpu.split_table_batched_embeddings_ops import ( BoundsCheckMode, + CounterBasedRegularizationDefinition, + CounterWeightDecayMode, + GradSumDecay, INT8_EMB_ROW_DIM_OFFSET, + LearningRateMode, OptimType, RecordCacheMetrics, rounded_row_size_in_bytes, SparseType, + TailIdThreshold, WeightDecayMode, ) from hypothesis import assume, given, HealthCheck, settings, Verbosity @@ -1627,6 +1633,7 @@ def execute_backward_adagrad_( # noqa C901 use_cpu: bool, exact: bool, output_dtype: SparseType, + weight_decay_mode: WeightDecayMode = WeightDecayMode.NONE, ) -> None: # NOTE: cache is not applicable to CPU version. assume(not use_cpu or not use_cache) @@ -1826,31 +1833,39 @@ def execute_backward_adagrad_( # noqa C901 goc = torch.cat(gos, dim=0) fc2.backward(goc) cc.flush() - split_optimizer_states = [s for (s,) in cc.split_optimizer_states()] + split_optimizer_states = cc.split_optimizer_states() + assert len(split_optimizer_states) == T tolerance = ( 1.0e-4 if weights_precision == SparseType.FP32 and output_dtype == SparseType.FP32 else 1.0e-2 ) for t in range(T): + if row_wise and weight_decay_mode == WeightDecayMode.COUNTER: + (m1, c1, c2) = split_optimizer_states[t] + else: + (m1,) = split_optimizer_states[t] # pyre-fixme[16]: `Optional` has no attribute `float`. 
ref_optimizer_state = bs[t].weight.grad.float().cpu().to_dense().pow(2) torch.testing.assert_close( - split_optimizer_states[t].float().cpu(), + m1.float().cpu(), ref_optimizer_state.mean(dim=1) if row_wise else ref_optimizer_state, atol=tolerance, rtol=tolerance, ) for t in range(T): # optimizer_state = squares (no row-wise) or sum squares (row-wise) + if row_wise and weight_decay_mode == WeightDecayMode.COUNTER: + (m1, c1, c2) = split_optimizer_states[t] + else: + (m1,) = split_optimizer_states[t] torch.testing.assert_close( cc.split_embedding_weights()[t].float().cpu(), torch.addcdiv( bs[t].weight.float().cpu(), value=-lr, tensor1=bs[t].weight.grad.float().cpu().to_dense(), - tensor2=split_optimizer_states[t] - .float() + tensor2=m1.float() .sqrt_() .add_(eps) .view(Es[t], 1 if row_wise else Ds[t]) @@ -2589,6 +2604,8 @@ def execute_backward_optimizers_( # noqa C901 0.9, 0.01, ) + counter_based_regularization: CounterBasedRegularizationDefinition + if optimizer == OptimType.EXACT_ADAGRAD: optimizer_kwargs["eps"] = eps @@ -2596,6 +2613,21 @@ def execute_backward_optimizers_( # noqa C901 optimizer_kwargs["eps"] = eps optimizer_kwargs["weight_decay"] = weight_decay optimizer_kwargs["weight_decay_mode"] = weight_decay_mode + if weight_decay_mode == WeightDecayMode.COUNTER: + counter_based_regularization = CounterBasedRegularizationDefinition( + counter_weight_decay_mode=CounterWeightDecayMode.DECOUPLE, + counter_halflife=20000, + adjustment_iter=24000, + adjustment_ub=0.1, + learning_rate_mode=LearningRateMode.TAIL_ID_LR_DECREASE, + grad_sum_decay=GradSumDecay.NO_DECAY, + tail_id_threshold=TailIdThreshold(val=1000, is_ratio=False), + ) + + optimizer_kwargs[ + "counter_based_regularization" + # pyre-fixme[6]: Expected `float` for 2nd param but got `CounterBasedRegularizationDefinition`. + ] = counter_based_regularization if optimizer == OptimType.EXACT_ROWWISE_WEIGHTED_ADAGRAD: optimizer_kwargs["eps"] = eps @@ -2654,15 +2686,39 @@ def execute_backward_optimizers_( # noqa C901 if optimizer in (OptimType.EXACT_ROWWISE_ADAGRAD, OptimType.EXACT_ADAGRAD): rowwise = optimizer == OptimType.EXACT_ROWWISE_ADAGRAD for t in range(T): - (m1,) = split_optimizer_states[t] + row_counter: Optional[torch.Tensor] = None + freq: Optional[torch.Tensor] = None + iter_: int = -1 + + if rowwise and weight_decay_mode == WeightDecayMode.COUNTER: + (m1, prev_iter, row_counter) = split_optimizer_states[t] + else: + (m1,) = split_optimizer_states[t] # to_dense in GPU is non-deterministic due to atmomics used in # coalescing and floating point non-associativity. # pyre-fixme[16]: `Optional` has no attribute `cpu`. dense_cpu_grad = bs[t].weight.grad.cpu().to_dense() - if rowwise and not use_cpu and weight_decay_mode == WeightDecayMode.L2: + if rowwise and not use_cpu: # We need to skip when using cpu because use_fbgemm (https://fburl.com/code/12131iub) # is true and the template code (https://fburl.com/code/1kctlup3) is not executed. 
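For the new WeightDecayMode.COUNTER coverage, the reference helpers defined below (get_grad_from_counter_adagrad / get_wts_from_counter_adagrad) reduce, per embedding row, to a counter-decayed frequency estimate plus a decoupled weight-decay correction on top of row-wise Adagrad. A scalar sketch of that update under the configuration used in this test (DECOUPLE counter mode, TAIL_ID_LR_DECREASE), ignoring the adjustment_iter warm-up gate; all numbers are illustrative:

#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  // Hypothetical state for one row.
  double row_counter = 3.0, prev_iter = 10.0, iter = 25.0;
  double grad = 0.1, weight = 0.5, momentum = 0.02;
  const double lr = 0.05, eps = 1e-8, weight_decay = 0.01;
  const double counter_halflife = 20000, adjustment_ub = 0.1;
  const double tail_id_threshold = 1000, max_counter = 5000;

  // Decay the per-row counter by the elapsed iterations, then derive the
  // inverse-frequency factor that scales the decoupled weight decay.
  const double iter_delta = (prev_iter == 0.0) ? 1.0 : iter - prev_iter;
  row_counter =
      1.0 + std::exp(-iter_delta * std::log(2.0) / counter_halflife) * row_counter;
  const double freq = counter_halflife / row_counter;

  // Row-wise Adagrad moment (mean of grad^2 over the row; a single value here).
  momentum += grad * grad;
  double multiplier = lr / (std::sqrt(momentum) + eps);
  // Counter-based LR adjustment for rows whose counter exceeds the threshold.
  if (row_counter > tail_id_threshold) {
    multiplier *= std::clamp(
        std::pow((row_counter + 1.0) / max_counter, adjustment_ub), 0.1, 1.0);
  }
  // Decoupled counter weight decay, then the gradient step.
  weight = (1.0 - freq * weight_decay * lr) * weight - multiplier * grad;

  std::printf("updated weight = %f\n", weight);
  return 0;
}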
- dense_cpu_grad += weight_decay * bs[t].weight.cpu() + if weight_decay_mode == WeightDecayMode.L2: + dense_cpu_grad += weight_decay * bs[t].weight.cpu() + elif weight_decay_mode == WeightDecayMode.COUNTER: + iter_ = int(cc.iter.item()) + ( + dense_cpu_grad, + row_counter, + freq, + ) = self.get_grad_from_counter_adagrad( + dense_cpu_grad, + bs[t].weight.cpu(), + counter_based_regularization, + row_counter.cpu(), + prev_iter.cpu(), + iter_, + weight_decay, + ) + m1_ref = ( dense_cpu_grad.pow(2) if not rowwise @@ -2681,14 +2737,31 @@ def execute_backward_optimizers_( # noqa C901 ) + eps ) - if ( - rowwise - and not use_cpu - and weight_decay_mode == WeightDecayMode.DECOUPLE - ): - weights_ref = bs[t].weight.cpu() - lr * ( - dense_cpu_grad / denom + weight_decay * bs[t].weight.cpu() - ) + if rowwise and not use_cpu: + if weight_decay_mode == WeightDecayMode.DECOUPLE: + weights_ref = bs[t].weight.cpu() - lr * ( + dense_cpu_grad / denom + weight_decay * bs[t].weight.cpu() + ) + elif weight_decay_mode == WeightDecayMode.L2: + # pyre-fixme[58]: `/` is not supported for operand types `float` + # and `Tensor`. + weights_ref = bs[t].weight.cpu() - lr * dense_cpu_grad / denom + elif weight_decay_mode == WeightDecayMode.COUNTER: + max_counter = cc.max_counter.item() + weights_ref = self.get_wts_from_counter_adagrad( + dense_cpu_grad, + bs[t].weight.cpu(), + denom, + counter_based_regularization, + row_counter, + # pyre-fixme[6]: Expected `Tensor` for 6th param but got `Optional[Tensor]` + freq, + max_counter, + iter_, + eps, + lr, + weight_decay, + ) else: # pyre-fixme[58]: `/` is not supported for operand types `float` # and `Tensor`. @@ -2833,6 +2906,117 @@ def execute_backward_optimizers_( # noqa C901 rtol=1.0e-4, ) + def get_grad_from_counter_adagrad( + self, + dense_cpu_grad: torch.Tensor, + weights: torch.Tensor, + counter_based_regularization: CounterBasedRegularizationDefinition, + row_counter: torch.Tensor, + prev_iter: torch.Tensor, + iter_: int, + weight_decay: float, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + row_counter = row_counter.view(row_counter.numel(), 1) + prev_iter = prev_iter.view(prev_iter.numel(), 1) + freq = torch.ones_like(row_counter) + counter_weight_decay_mode = ( + counter_based_regularization.counter_weight_decay_mode + ) + counter_halflife = counter_based_regularization.counter_halflife + l2_wd = 1.0 if counter_weight_decay_mode == CounterWeightDecayMode.L2 else 0.0 + + if counter_halflife > 0: + counter_log_rho = math.log(2.0) / counter_halflife + # if id occurs multiple times in a batch, iter_delta=1 + iter_delta = torch.where(prev_iter == 0.0, 1.0, iter_ * 1.0 - prev_iter) + prev_iter = iter_ * torch.ones_like(prev_iter) + row_counter = 1.0 + torch.exp(-iter_delta * counter_log_rho) * row_counter + freq = torch.tensor([counter_halflife]) / row_counter + + dense_cpu_grad += l2_wd * freq * weight_decay * weights + return dense_cpu_grad, row_counter, freq + + def get_wts_from_counter_adagrad( + self, + dense_cpu_grad: torch.Tensor, + weights: torch.Tensor, + denom: torch.Tensor, + counter_based_regularization: CounterBasedRegularizationDefinition, + row_counter: torch.Tensor, + freq: torch.Tensor, + max_counter: float, + iter_: int, + eps: float, + learning_rate: float, + weight_decay: float, + ) -> torch.Tensor: + counter_weight_decay_mode = ( + counter_based_regularization.counter_weight_decay_mode + ) + counter_halflife = counter_based_regularization.counter_halflife + tail_id_threshold_val = counter_based_regularization.tail_id_threshold.val + if 
counter_based_regularization.tail_id_threshold.is_ratio: + tail_id_threshold_val = math.floor(tail_id_threshold_val * max_counter) + learning_rate_mode = counter_based_regularization.learning_rate_mode + adjustment_iter = counter_based_regularization.adjustment_iter + adjustment_ub = counter_based_regularization.adjustment_ub + + multiplier = torch.tensor([learning_rate]) / denom + adjusted_multiplier = multiplier + exp_reg_correction = torch.ones_like(row_counter) + + if counter_halflife > 0: + if adjustment_iter <= 0 or ( + adjustment_iter > 0 and iter_ > adjustment_iter + ): + if learning_rate_mode == LearningRateMode.TAIL_ID_LR_INCREASE: + adjusted_multiplier = torch.where( + row_counter > tail_id_threshold_val, + multiplier + * torch.maximum( + torch.minimum( + torch.pow( + torch.tensor([max_counter]) / (row_counter + 1.0), + adjustment_ub, + ), + torch.Tensor([10.0]), + ), + torch.Tensor([1.0]), + ), + multiplier, + ) + elif learning_rate_mode == LearningRateMode.TAIL_ID_LR_DECREASE: + adjusted_multiplier = torch.where( + row_counter > tail_id_threshold_val, + multiplier + * torch.minimum( + torch.maximum( + torch.pow( + (row_counter + 1.0) / max_counter, + adjustment_ub, + ), + torch.Tensor([0.1]), + ), + torch.Tensor([1.0]), + ), + multiplier, + ) + elif learning_rate_mode == LearningRateMode.COUNTER_SGD: + adjusted_multiplier = torch.where( + row_counter > tail_id_threshold_val, + torch.Tensor([learning_rate]) + / (torch.sqrt(adjustment_ub * row_counter) + eps), + multiplier, + ) + + if counter_weight_decay_mode == CounterWeightDecayMode.DECOUPLE: + exp_reg_correction = 1.0 - freq * weight_decay * learning_rate + elif counter_weight_decay_mode == CounterWeightDecayMode.L2: + exp_reg_correction = 1.0 - freq * weight_decay * multiplier + + weights = exp_reg_correction * weights - adjusted_multiplier * dense_cpu_grad + return weights + @given( T=st.integers(min_value=1, max_value=5), D=st.integers(min_value=2, max_value=256), @@ -2901,7 +3085,7 @@ def test_backward_optimizers_adam( # noqa C901 D=st.integers(min_value=2, max_value=256), B=st.integers(min_value=1, max_value=128), log_E=st.integers(min_value=3, max_value=5), - L=st.integers(min_value=0, max_value=20), + L=st.integers(min_value=2, max_value=20), weighted=st.booleans(), mixed=st.booleans(), optimizer=st.sampled_from( @@ -2928,6 +3112,7 @@ def test_backward_optimizers_adam( # noqa C901 [ WeightDecayMode.L2, WeightDecayMode.DECOUPLE, + WeightDecayMode.COUNTER, ] ), ) @@ -3394,7 +3579,7 @@ def test_nbit_forward_cpu( T = random.randint(1, 50) B = random.randint(0, 128) L = random.randint(0, 32) - D = random.randint(2, 1024) + D = random.randint(2, 2048) log_E = random.randint(2, 4) use_cache = False @@ -3475,7 +3660,7 @@ def test_nbit_forward_gpu_no_cache( T = random.randint(1, 50) B = random.randint(0, 128) L = random.randint(0, 32) - D = random.randint(2, 1024) + D = random.randint(2, 2048) log_E = random.randint(2, 4) use_cache = False diff --git a/fbgemm_gpu/test/uvm_cache_miss_emulate_test.cpp b/fbgemm_gpu/test/uvm_cache_miss_emulate_test.cpp new file mode 100644 index 0000000000..808ed33624 --- /dev/null +++ b/fbgemm_gpu/test/uvm_cache_miss_emulate_test.cpp @@ -0,0 +1,119 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#include + +#include "fbgemm_gpu/split_embeddings_cache_cuda.cuh" + +using namespace ::testing; + +// Helper function that generates input tensor for emulate_cache_miss testing. +at::Tensor generate_lxu_cache_locations( + const int64_t num_requests, + const int64_t num_sets, + const int64_t associativity = 32) { + const auto lxu_cache_locations = at::randint( + 0, + num_sets * associativity, + {num_requests}, + at::device(at::kCPU).dtype(at::kInt)); + return lxu_cache_locations; +} + +// Wrapper function that takes lxu_cache_locations on CPU, copies it to GPU, +// runs emulate_cache_miss(), and then returns the result, placed on CPU. +std::pair run_emulate_cache_miss( + at::Tensor lxu_cache_locations, + const int64_t enforced_misses_per_256, + const bool gather_uvm_stats = false) { + at::Tensor lxu_cache_locations_copy = at::_to_copy(lxu_cache_locations); + const auto options = + lxu_cache_locations.options().device(at::kCUDA).dtype(at::kInt); + const auto uvm_cache_stats = + gather_uvm_stats ? at::zeros({6}, options) : at::empty({0}, options); + + const auto lxu_cache_location_with_cache_misses = emulate_cache_miss( + lxu_cache_locations_copy.to(at::kCUDA), + enforced_misses_per_256, + gather_uvm_stats, + uvm_cache_stats); + return {lxu_cache_location_with_cache_misses.cpu(), uvm_cache_stats.cpu()}; +} + +TEST(uvm_cache_miss_emulate_test, no_cache_miss) { + constexpr int64_t num_requests = 10000; + constexpr int64_t num_sets = 32768; + constexpr int64_t associativity = 32; + + auto lxu_cache_locations_cpu = + generate_lxu_cache_locations(num_requests, num_sets, associativity); + auto lxu_cache_location_with_cache_misses_and_uvm_cache_stats = + run_emulate_cache_miss(lxu_cache_locations_cpu, 0); + auto lxu_cache_location_with_cache_misses = + lxu_cache_location_with_cache_misses_and_uvm_cache_stats.first; + EXPECT_TRUE( + at::equal(lxu_cache_locations_cpu, lxu_cache_location_with_cache_misses)); +} + +TEST(uvm_cache_miss_emulate_test, enforced_cache_miss) { + constexpr int64_t num_requests = 10000; + constexpr int64_t num_sets = 32768; + constexpr int64_t associativity = 32; + constexpr std::array enforced_misses_per_256_for_testing = { + 1, 5, 7, 33, 100, 256}; + + for (const bool miss_in_lxu_cache_locations : {false, true}) { + for (const bool gather_cache_stats : {false, true}) { + for (const auto enforced_misses_per_256 : + enforced_misses_per_256_for_testing) { + auto lxu_cache_locations_cpu = + generate_lxu_cache_locations(num_requests, num_sets, associativity); + if (miss_in_lxu_cache_locations) { + // one miss in the original lxu_cache_locations; shouldn't be counted + // as enforced misses from emulate_cache_miss(). 
+ auto z = lxu_cache_locations_cpu.data_ptr(); + z[0] = -1; + } + auto lxu_cache_location_with_cache_misses_and_uvm_cache_stats = + run_emulate_cache_miss( + lxu_cache_locations_cpu, + enforced_misses_per_256, + gather_cache_stats); + auto lxu_cache_location_with_cache_misses = + lxu_cache_location_with_cache_misses_and_uvm_cache_stats.first; + EXPECT_FALSE(at::equal( + lxu_cache_locations_cpu, lxu_cache_location_with_cache_misses)); + + auto x = lxu_cache_locations_cpu.data_ptr(); + auto y = lxu_cache_location_with_cache_misses.data_ptr(); + int64_t enforced_misses = 0; + for (int32_t i = 0; i < lxu_cache_locations_cpu.numel(); ++i) { + if (x[i] != y[i]) { + EXPECT_EQ(y[i], -1); + enforced_misses++; + } + } + int64_t num_requests_over_256 = + static_cast(num_requests / 256); + int64_t expected_misses = num_requests_over_256 * + enforced_misses_per_256 + + std::min((num_requests - num_requests_over_256 * 256), + enforced_misses_per_256); + if (miss_in_lxu_cache_locations) { + expected_misses--; + } + EXPECT_EQ(expected_misses, enforced_misses); + if (gather_cache_stats) { + auto uvm_cache_stats = + lxu_cache_location_with_cache_misses_and_uvm_cache_stats.second; + auto cache_stats_ptr = uvm_cache_stats.data_ptr(); + // enforced misses are recorded as conflict misses. + EXPECT_EQ(expected_misses, cache_stats_ptr[5]); + } + } + } + } +} diff --git a/include/fbgemm/Types.h b/include/fbgemm/Types.h index be8ac4ec8b..e7d8278464 100644 --- a/include/fbgemm/Types.h +++ b/include/fbgemm/Types.h @@ -15,145 +15,184 @@ namespace fbgemm { using float16 = std::uint16_t; using bfloat16 = std::uint16_t; +// The IEEE754 standard species a binary16 as having the following format: +// SEEEEEMMMMMMMMMM +// 0432109876543210 +// That is: +// * 1 sign bit +// * 5 exponent bits +// * 10 mantissa/significand bits (an 11th bit is implicit) +constexpr uint32_t f16_num_bits = 16; +constexpr uint32_t f16_num_exponent_bits = 5; +constexpr uint32_t f16_num_mantissa_bits = 10; +constexpr uint32_t f16_num_non_sign_bits = + f16_num_exponent_bits + f16_num_mantissa_bits; +constexpr uint32_t f16_exponent_mask = 0b1'1111; // 5 bits +constexpr uint32_t f16_sign_bit = 1u + << (f16_num_exponent_bits + f16_num_mantissa_bits); +constexpr uint32_t f16_exponent_bits = f16_exponent_mask + << f16_num_mantissa_bits; +constexpr uint32_t f16_mantissa_mask = 0b11'1111'1111; // 10 bits +constexpr uint32_t f16_exponent_bias = 15; +constexpr uint32_t f16_nan = 0x7F'FF; + +// The IEEE754 standard specifies a binary32 as having: +// SEEEEEEEEMMMMMMMMMMMMMMMMMMMMMMM +// That is: +// * 1 sign bit +// * 8 exponent bits +// * 23 mantissa/significand bits (a 24th bit is implicit) +constexpr uint32_t f32_num_exponent_bits = 8; +constexpr uint32_t f32_num_mantissa_bits = 23; +constexpr uint32_t f32_exponent_mask = 0b1111'1111; // 8 bits +constexpr uint32_t f32_mantissa_mask = 0x7F'FF'FF; // 23 bits +constexpr uint32_t f32_exponent_bias = 127; +constexpr uint32_t f32_all_non_sign_mask = 0x7F'FF'FF'FF; // 31 bits +constexpr uint32_t f32_most_significant_bit = 1u << 22; // Turn on 23rd bit +constexpr uint32_t f32_num_non_sign_bits = + f32_num_exponent_bits + f32_num_mantissa_bits; + // Round to nearest even static inline float16 cpu_float2half_rn(float f) { - float16 ret; - static_assert( - sizeof(unsigned int) == sizeof(float), - "Programming error sizeof(unsigned int) != sizeof(float)"); + sizeof(uint32_t) == sizeof(float), + "Programming error sizeof(uint32_t) != sizeof(float)"); - unsigned* xp = reinterpret_cast(&f); - unsigned x = *xp; - unsigned 
u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1; - unsigned sign, exponent, mantissa; + uint32_t* xp = reinterpret_cast(&f); + uint32_t x = *xp; + uint32_t u = (x & f32_all_non_sign_mask); // Get rid of +NaN/-NaN case first. if (u > 0x7f800000) { - ret = 0x7fffU; - return ret; + return static_cast(f16_nan); } - sign = ((x >> 16) & 0x8000); + uint32_t sign = ((x >> f16_num_bits) & f16_sign_bit); // Get rid of +Inf/-Inf, +0/-0. if (u > 0x477fefff) { - ret = static_cast(sign | 0x7c00U); - return ret; + return static_cast(sign | f16_exponent_bits); } if (u < 0x33000001) { - ret = static_cast(sign | 0x0000); - return ret; + return static_cast(sign | 0x0000); } - exponent = ((u >> 23) & 0xff); - mantissa = (u & 0x7fffff); + uint32_t exponent = ((u >> f32_num_mantissa_bits) & f32_exponent_mask); + uint32_t mantissa = (u & f32_mantissa_mask); - if (exponent > 0x70) { - shift = 13; - exponent -= 0x70; + uint32_t shift; + if (exponent > f32_exponent_bias - f16_exponent_bias) { + shift = f32_num_mantissa_bits - f16_num_mantissa_bits; + exponent -= f32_exponent_bias - f16_exponent_bias; } else { - shift = 0x7e - exponent; + shift = (f32_exponent_bias - 1) - exponent; exponent = 0; - mantissa |= 0x800000; + mantissa |= + (1u + << f32_num_mantissa_bits); // Bump the least significant exponent bit } - lsb = (1 << shift); - lsb_s1 = (lsb >> 1); - lsb_m1 = (lsb - 1); + const uint32_t lsb = (1u << shift); + const uint32_t lsb_s1 = (lsb >> 1); + const uint32_t lsb_m1 = (lsb - 1); // Round to nearest even. - remainder = (mantissa & lsb_m1); + const uint32_t remainder = (mantissa & lsb_m1); mantissa >>= shift; if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { ++mantissa; - if (!(mantissa & 0x3ff)) { + if (!(mantissa & f16_mantissa_mask)) { ++exponent; mantissa = 0; } } - ret = static_cast(sign | (exponent << 10) | mantissa); - - return ret; + return static_cast( + sign | (exponent << f16_num_mantissa_bits) | mantissa); } // Round to zero static inline float16 cpu_float2half_rz(float f) { - float16 ret; - static_assert( - sizeof(unsigned int) == sizeof(float), - "Programming error sizeof(unsigned int) != sizeof(float)"); + sizeof(uint32_t) == sizeof(float), + "Programming error sizeof(uint32_t) != sizeof(float)"); - unsigned* xp = reinterpret_cast(&f); - unsigned x = *xp; - unsigned u = (x & 0x7fffffff); - unsigned shift, sign, exponent, mantissa; + const uint32_t* xp = reinterpret_cast(&f); + const uint32_t x = *xp; + const uint32_t u = (x & f32_all_non_sign_mask); // Get rid of +NaN/-NaN case first. if (u > 0x7f800000) { - ret = static_cast(0x7fffU); - return ret; + return static_cast(f16_nan); } - sign = ((x >> 16) & 0x8000); + uint32_t sign = ((x >> f16_num_bits) & f16_sign_bit); // Get rid of +Inf/-Inf, +0/-0. 
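// For reference, the magic constants used by these float -> half conversions
// decode as follows:
//   0x7f800000 : bit pattern of +Inf; any non-sign pattern above it is a NaN.
//   0x477fefff : last pattern below 65520.0f (0x477ff000). Magnitudes of
//                65520 and up are mapped to +/-Inf; 65504 is the largest
//                finite half-precision value.
//   0x33000001 : first pattern above 2^-25 (0x33000000). Magnitudes of 2^-25
//                and below are flushed to signed zero; the smallest
//                half-precision subnormal is 2^-24.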
if (u > 0x477fefff) { - ret = static_cast(sign | 0x7c00U); - return ret; + return static_cast(sign | f16_exponent_bits); } if (u < 0x33000001) { - ret = static_cast(sign | 0x0000); - return ret; + return static_cast(sign | 0x0000); } - exponent = ((u >> 23) & 0xff); - mantissa = (u & 0x7fffff); + uint32_t exponent = ((u >> f32_num_mantissa_bits) & f32_exponent_mask); + uint32_t mantissa = (u & f32_mantissa_mask); - if (exponent > 0x70) { - shift = 13; - exponent -= 0x70; + uint32_t shift; + if (exponent > f32_exponent_bias - f16_exponent_bias) { + shift = f32_num_mantissa_bits - f16_num_mantissa_bits; + exponent -= f32_exponent_bias - f16_exponent_bias; } else { - shift = 0x7e - exponent; + shift = (f32_exponent_bias - 1) - exponent; exponent = 0; - mantissa |= 0x800000; + mantissa |= + (1u + << f32_num_mantissa_bits); // Bump the least significant exponent bit } // Round to zero. mantissa >>= shift; - ret = static_cast(sign | (exponent << 10) | mantissa); - - return ret; + return static_cast( + sign | (exponent << f16_num_mantissa_bits) | mantissa); } -static inline float cpu_half2float(float16 h) { - unsigned sign = ((h >> 15) & 1); - unsigned exponent = ((h >> 10) & 0x1f); - unsigned mantissa = ((h & 0x3ff) << 13); - - if (exponent == 0x1f) { /* NaN or Inf */ - mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0); - exponent = 0xff; - } else if (!exponent) { /* Denorm or Zero */ +// Converts a 16-bit unsigned integer representation of a IEEE754 half-precision +// float into an IEEE754 32-bit single-precision float +static inline float cpu_half2float(const float16 h) { + // Get sign and exponent alone by themselves + uint32_t sign_bit = (h >> f16_num_non_sign_bits) & 1; + uint32_t exponent = (h >> f16_num_mantissa_bits) & f16_exponent_mask; + // Shift mantissa so that it fills the most significant bits of a float32 + uint32_t mantissa = (h & f16_mantissa_mask) + << (f32_num_mantissa_bits - f16_num_mantissa_bits); + + if (exponent == f16_exponent_mask) { // NaN or Inf if (mantissa) { - unsigned int msb; - exponent = 0x71; + mantissa = f32_mantissa_mask; + sign_bit = 0; + } + exponent = f32_exponent_mask; + } else if (!exponent) { // Denorm or Zero + if (mantissa) { + uint32_t msb; + exponent = f32_exponent_bias - f16_exponent_bias + 1; do { - msb = (mantissa & 0x400000); - mantissa <<= 1; /* normalize */ + msb = mantissa & f32_most_significant_bit; + mantissa <<= 1; // normalize --exponent; } while (!msb); - mantissa &= 0x7fffff; /* 1.mantissa is implicit */ + mantissa &= f32_mantissa_mask; // 1.mantissa is implicit } } else { - exponent += 0x70; + exponent += f32_exponent_bias - f16_exponent_bias; } - unsigned i = ((sign << 31) | (exponent << 23) | mantissa); + const uint32_t i = (sign_bit << f32_num_non_sign_bits) | + (exponent << f32_num_mantissa_bits) | mantissa; + float ret; - memcpy(&ret, &i, sizeof(i)); + std::memcpy(&ret, &i, sizeof(float)); return ret; } @@ -161,14 +200,14 @@ static inline float cpu_bf162float(bfloat16 src) { float ret; uint32_t val_fp32 = static_cast(reinterpret_cast(&src)[0]) << 16; - memcpy(&ret, &val_fp32, sizeof(ret)); + memcpy(&ret, &val_fp32, sizeof(float)); return ret; } static inline bfloat16 cpu_float2bfloat16(float src) { uint32_t temp; - memcpy(&temp, &src, sizeof(temp)); - return (temp + (1 << 15)) >> 16; + memcpy(&temp, &src, sizeof(uint32_t)); + return (temp + (1u << 15)) >> 16; } } // namespace fbgemm diff --git a/include/fbgemm/UtilsAvx2.h b/include/fbgemm/UtilsAvx2.h index a1af6078a8..4fb1220eba 100644 --- a/include/fbgemm/UtilsAvx2.h +++ 
b/include/fbgemm/UtilsAvx2.h @@ -8,6 +8,7 @@ // This file defines common utilities used in code compiled with avx2/avx512 // flags. +#include #include namespace fbgemm { diff --git a/src/InlineAsmDefines.h b/src/InlineAsmDefines.h index 80612536b7..fa3f706602 100644 --- a/src/InlineAsmDefines.h +++ b/src/InlineAsmDefines.h @@ -10,13 +10,14 @@ // We need to do a hack in inline assembly in some clang versions where we have // to do `.intel_syntax noprefix`. This was fixed in clang in // https://reviews.llvm.org/D113707, which made it into clang-14, but not in -// Apple's clang-14 that ships with Xcode 14. +// Apple's clang-14 that ships with Xcode 14.2. It was first fixed in Xcode 14.3 +// where the clang version is 14.0.3. #if defined(__clang__) #if ( \ defined(__apple_build_version__) || \ (defined(__has_builtin) && __has_builtin(__builtin_pika_xxhash64))) && \ - (__clang_major__ < 15) + (__clang_major__ < 15 && __clang_minor__ == 0 && __clang_patchlevel__ < 3) #define FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK 1 #elif (__clang_major__ < 14) #define FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK 1 diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index a30735354a..1e996256bf 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,4 +1,12 @@ -cmake_minimum_required(VERSION 3.5 FATAL_ERROR) +cmake_minimum_required(VERSION 3.16 FATAL_ERROR) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_VISIBILITY_PRESET hidden) +set(CMAKE_C_STANDARD 11) +set(CMAKE_C_EXTENSIONS OFF) +set(CMAKE_C_STANDARD_REQUIRED ON) if(FBGEMM_BUILD_TESTS AND NOT TARGET gtest) #Download Googletest framework from github if @@ -38,12 +46,9 @@ macro(add_gtest TESTNAME) EmbeddingSpMDMTestUtils.cc QuantizationHelpers.cc TestUtils.cc) - set_target_properties(${TESTNAME} PROPERTIES - CXX_STANDARD 11 - CXX_EXTENSIONS NO) - #To compile test files with AVX2 turned on - #For static build, defining FBGEMM_STATIC to avoid generating - #functions with _dllimport attributes. + # To compile test files with AVX2 turned on + # For static build, defining FBGEMM_STATIC to avoid generating + # functions with _dllimport attributes. if(MSVC) target_compile_options(${TESTNAME} PRIVATE "/arch:AVX2" "/wd4244" "/wd4267" "/wd4305" "/wd4309") diff --git a/third_party/asmjit.BUILD b/third_party/asmjit.BUILD index 71dc5c7e6c..c2764a97c4 100644 --- a/third_party/asmjit.BUILD +++ b/third_party/asmjit.BUILD @@ -16,9 +16,7 @@ cc_library( copts = [ "-DASMJIT_STATIC", "-fno-tree-vectorize", - "-std=c++17", "-fmerge-all-constants", - "-std=gnu++11", "-DTH_BLAS_MKL", ], includes = [ diff --git a/third_party/hipify_torch b/third_party/hipify_torch index 1840658c18..23f53b025b 160000 --- a/third_party/hipify_torch +++ b/third_party/hipify_torch @@ -1 +1 @@ -Subproject commit 1840658c184f3eeba787dae0f06c45756c1daaf5 +Subproject commit 23f53b025b466d8ec3c45d52290d3442f7fbe6b1