59 changes: 37 additions & 22 deletions .github/workflows/release.yml
@@ -42,25 +42,40 @@ jobs:

wheel:
name: Build Wheel
runs-on: ${{ matrix.os }}
needs: release

runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: ['ubuntu-20.04']
python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
pytorch-version: ['2.2.2', '2.3.1', '2.4.0', '2.5.1', '2.6.0']
cuda-version: ['12.4.0']
exclude:
# see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
# Pytorch < 2.5 does not support Python 3.13
- pytorch-version: '2.2.2'
python-version: '3.13'
- pytorch-version: '2.3.1'
python-version: '3.13'
- pytorch-version: '2.4.0'
python-version: '3.13'
# Using ubuntu-22.04 instead of 24.04 for more compatibility (glibc). Ideally we'd use the
# manylinux docker image, but I haven't figured out how to install CUDA on manylinux.
os: [ ubuntu-22.04, ubuntu-22.04-arm ]
python-version: [ '3.9', '3.10', '3.11', '3.12', '3.13' ]
pytorch-version: [ '2.4.0', '2.5.1', '2.6.0', '2.7.0' ]
cuda-version: [ '12.4.1', '12.8.1' ]
# We need separate wheels that either use the C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not.
# Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI.
# Without this we get import error (undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationESs)
# when building without C++11 ABI and using it on nvcr images.
cxx11_abi: [ 'FALSE', 'TRUE' ]
exclude:
# see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
# PyTorch < 2.5 does not support Python 3.13
- pytorch-version: '2.4.0'
python-version: '3.13'

# PyTorch 2.7.0 must only use CUDA 12.8.1
- pytorch-version: '2.7.0'
cuda-version: '12.4.1'

# All other PyTorch (< 2.7.0) must only use CUDA 12.4.1
- pytorch-version: '2.4.0'
cuda-version: '12.8.1'
- pytorch-version: '2.5.1'
cuda-version: '12.8.1'
- pytorch-version: '2.6.0'
cuda-version: '12.8.1'
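
The cxx11_abi axis above is needed because an extension built against one ABI setting cannot be loaded by a PyTorch build that uses the other, which is exactly the undefined-symbol error mentioned in the comment. A quick way to check which flavor a given environment needs (a minimal sketch, not part of the workflow, assuming PyTorch is already installed):

```python
import torch

# True when PyTorch was compiled with -D_GLIBCXX_USE_CXX11_ABI=1 (e.g. the nvcr
# images mentioned above), False when it was built without the C++11 ABI.
print(torch.compiled_with_cxx11_abi())
```

Picking the wheel whose cxx11_abi tag matches this value avoids the import error described in the comment.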

steps:
- name: Checkout
@@ -90,7 +105,7 @@ jobs:

- name: Install CUDA ${{ matrix.cuda-version }}
if: ${{ matrix.cuda-version != 'cpu' }}
uses: Jimver/cuda-toolkit@v0.2.19
uses: Jimver/cuda-toolkit@v0.2.23
id: cuda-toolkit
with:
cuda: ${{ matrix.cuda-version }}
@@ -142,10 +157,10 @@ jobs:
strategy:
fail-fast: false
matrix:
os: ['ubuntu-20.04']
python-version: ['3.10']
pytorch-version: ['2.3.0'] # Must be the most recent version that meets requirements-cuda.txt.
cuda-version: ['12.2.2']
os: ['ubuntu-latest']
python-version: ['3.12']
pytorch-version: ['2.7.0'] # Must be the most recent version that meets requirements-cuda.txt.
cuda-version: [ '12.4.1' ]

steps:
- name: Checkout
@@ -163,7 +178,7 @@ jobs:
bash -x .github/workflows/scripts/env.sh

- name: Set up Python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}

@@ -175,7 +190,7 @@

- name: Install CUDA ${{ matrix.cuda-version }}
if: ${{ matrix.cuda-version != 'cpu' }}
uses: Jimver/cuda-toolkit@v0.2.14
uses: Jimver/cuda-toolkit@v0.2.23
id: cuda-toolkit
with:
cuda: ${{ matrix.cuda-version }}
15 changes: 12 additions & 3 deletions .github/workflows/scripts/build.sh
@@ -3,6 +3,17 @@
python_executable=python$1
cuda_home=/usr/local/cuda-$2

# Select the target GPU architectures based on the CUDA version
if [ "$2" = "12.8.1" ]; then
    echo "CUDA version is 12.8.1, including Blackwell (10.0, 12.0) in the release wheels."
    # Make sure release wheels are built for the following architectures
    export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0 10.0 12.0+PTX"
else
    echo "CUDA version is $2, building release wheels up to Hopper (9.0)."
    # Make sure release wheels are built for the following architectures
    export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
fi
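
For orientation, the same selection expressed in Python, with the GPU generation each compute capability corresponds to (a sketch for illustration only; nothing in the repository defines this mapping):

```python
# Compute capability -> GPU generation (NVIDIA's usual naming).
GENERATIONS = {
    "7.0": "Volta", "7.5": "Turing",
    "8.0": "Ampere", "8.6": "Ampere", "8.9": "Ada Lovelace",
    "9.0": "Hopper", "10.0": "Blackwell", "12.0": "Blackwell",
}

def arch_list(cuda_version: str) -> str:
    # CUDA 12.8 is the first toolkit release that can target Blackwell (10.0, 12.0).
    if cuda_version == "12.8.1":
        return "7.0 7.5 8.0 8.6 8.9 9.0 10.0 12.0+PTX"
    return "7.0 7.5 8.0 8.6 8.9 9.0+PTX"

print(arch_list("12.4.1"))  # 7.0 7.5 8.0 8.6 8.9 9.0+PTX
```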

# Update paths
PATH=${cuda_home}/bin:$PATH
LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
@@ -13,8 +24,6 @@ $python_executable -m pip install flash_attn triton

# Limit the number of parallel jobs to avoid OOM
export MAX_JOBS=1
# Make sure release wheels are built for the following architectures
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
# Build
if [ "$3" = sdist ];
then
@@ -24,4 +33,4 @@ MINFERENCE_FORCE_BUILD="TRUE" $python_executable setup.py $3 --dist-dir=dist
tmpname=cu${MATRIX_CUDA_VERSION}torch${MATRIX_TORCH_VERSION}
wheel_name=$(ls dist/*whl | xargs -n 1 basename | sed "s/-/+$tmpname-/2")
ls dist/*whl |xargs -I {} mv {} dist/${wheel_name}
fi
fi
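
The sed expression above turns the CUDA/torch combination into a local version tag by rewriting only the second '-' in the wheel's basename. The same transformation in Python, with a hypothetical wheel name for illustration:

```python
tmpname = "cu124torch2.6"  # cu${MATRIX_CUDA_VERSION}torch${MATRIX_TORCH_VERSION}
basename = "minference-0.1.6-cp310-cp310-linux_x86_64.whl"  # hypothetical dist/ output

# Equivalent of `sed "s/-/+$tmpname-/2"`: replace only the 2nd "-".
parts = basename.split("-")
renamed = f"{parts[0]}-{parts[1]}+{tmpname}-" + "-".join(parts[2:])
print(renamed)  # minference-0.1.6+cu124torch2.6-cp310-cp310-linux_x86_64.whl
```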
31 changes: 30 additions & 1 deletion .github/workflows/scripts/cuda-install.sh
@@ -5,8 +5,37 @@ cuda_version=$(echo $1 | tr "." "-")
# Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004
OS=$(echo $2 | tr -d ".\-")

ARCH=$(uname -m)
ARCH_TYPE=$ARCH

# Detect whether this is a Tegra (Jetson) device
if [[ "$ARCH" == "aarch64" ]]; then
if uname -a | grep -qi tegra; then
ARCH_TYPE="tegra-aarch64"
fi
fi

echo "Detected architecture: ${ARCH_TYPE}"

# Installs CUDA
wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb
if [[ "$ARCH_TYPE" == "tegra-aarch64" ]]; then
# Jetson (Tegra)
wget -nv \
https://developer.download.nvidia.com/compute/cuda/repos/${OS}/arm64/cuda-${DISTRO}.pin \
-O /etc/apt/preferences.d/cuda-repository-pin-600

elif [[ "$ARCH_TYPE" == "tegra-aarch64" ]]; then
# Jetson (Tegra)
wget -nv \
https://developer.download.nvidia.com/compute/cuda/repos/${OS}/arm64/cuda-${DISTRO}.pin \
-O /etc/apt/preferences.d/cuda-repository-pin-600
else
# ARM64 SBSA (Grace)
wget -nv \
https://developer.download.nvidia.com/compute/cuda/repos/${OS}/sbsa/cuda-${DISTRO}.pin \
-O /etc/apt/preferences.d/cuda-repository-pin-600
fi

sudo dpkg -i cuda-keyring_1.1-1_all.deb
rm cuda-keyring_1.1-1_all.deb
sudo apt -qq update
4 changes: 2 additions & 2 deletions .github/workflows/scripts/pytorch-install.sh
@@ -20,8 +20,8 @@ pip install typing-extensions==4.12.2
echo $MATRIX_CUDA_VERSION
echo $MATRIX_TORCH_VERSION
export TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \
minv = {'2.2': 118, '2.3': 118, '2.4': 118, '2.5': 118, '2.6': 118}[env['MATRIX_TORCH_VERSION']]; \
maxv = {'2.2': 121, '2.3': 121, '2.4': 124, '2.5': 124, '2.6': 124}[env['MATRIX_TORCH_VERSION']]; \
minv = {'2.4': 118, '2.5': 118, '2.6': 118, '2.7': 118}[env['MATRIX_TORCH_VERSION']]; \
maxv = {'2.4': 124, '2.5': 124, '2.6': 126, '2.7': 128}[env['MATRIX_TORCH_VERSION']]; \
print(max(min(int(env['MATRIX_CUDA_VERSION']), maxv), minv))" \
)
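
The TORCH_CUDA_VERSION one-liner above clamps the matrix CUDA version into the range of CUDA builds that the chosen PyTorch release is expected to ship. Spelled out with a couple of illustrative inputs (the tables mirror the script; the example values are hypothetical matrix entries):

```python
def torch_cuda_version(matrix_torch: str, matrix_cuda: int) -> int:
    minv = {"2.4": 118, "2.5": 118, "2.6": 118, "2.7": 118}[matrix_torch]
    maxv = {"2.4": 124, "2.5": 124, "2.6": 126, "2.7": 128}[matrix_torch]
    return max(min(matrix_cuda, maxv), minv)

print(torch_cuda_version("2.7", 128))  # 128: within torch 2.7's supported range
print(torch_cuda_version("2.4", 128))  # 124: clamped, torch 2.4 tops out at cu124 here
```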
if [[ ${pytorch_version} == *"dev"* ]]; then
4 changes: 2 additions & 2 deletions .github/workflows/unittest.yml
@@ -20,8 +20,8 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, macos-latest, windows-2019]
python-version: ["3.9", "3.10", "3.11"]
os: [ubuntu-latest, ubuntu-24.04-arm, macos-latest, windows-latest, windows-11-arm]
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
exclude:
- os: macos-latest
python-version: '3.9'
33 changes: 29 additions & 4 deletions setup.py
@@ -119,12 +119,15 @@ def get_minference_version() -> str:
return str(version)


def get_platform():
def get_arch():
"""
Returns the platform name as used in wheel filenames.
Returns the machine architecture for the current system.
"""
if sys.platform.startswith("linux"):
return f"linux_{platform.uname().machine}"
if platform.machine() == "x86_64":
return "x86_64"
if platform.machine() == "arm64" or platform.machine() == "aarch64":
return "aarch64"
elif sys.platform == "darwin":
mac_version = ".".join(platform.mac_ver()[0].split(".")[:2])
return f"macosx_{mac_version}_x86_64"
@@ -134,6 +137,28 @@ def get_platform():
raise ValueError("Unsupported platform: {}".format(sys.platform))


def get_system() -> str:
"""
Returns the system name as used in wheel filenames.
"""
if platform.system() == "Windows":
return "win"
elif platform.system() == "Darwin":
mac_version = ".".join(platform.mac_ver()[0].split(".")[:1])
return f"macos_{mac_version}"
elif platform.system() == "Linux":
return "linux"
else:
raise ValueError("Unsupported system: {}".format(platform.system()))


def get_platform() -> str:
"""
Returns the platform name as used in wheel filenames.
"""
return f"{get_system()}_{get_arch()}"
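
Together, get_system() and get_arch() produce the platform tag used in the prebuilt wheel filenames. A standalone sketch of what that tag evaluates to on the Linux CI runners (illustrative only, mirroring the helpers above rather than importing them):

```python
import platform

def expected_platform_tag() -> str:
    """Mirror of get_platform() for a Linux host (sketch only)."""
    machine = platform.machine()
    if machine == "x86_64":
        arch = "x86_64"
    elif machine in ("arm64", "aarch64"):
        arch = "aarch64"
    else:
        raise ValueError(f"Unsupported architecture: {machine}")
    return f"linux_{arch}"

print(expected_platform_tag())  # linux_x86_64 on ubuntu-22.04, linux_aarch64 on ubuntu-22.04-arm
```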


def get_wheel_url():
# Determine the version numbers that will be used to determine the correct wheel
# We're using the CUDA version used to build torch, not the one currently installed
@@ -265,4 +290,4 @@ def __init__(self, *args, **kwargs) -> None:
else {
"bdist_wheel": CachedWheelsCommand,
},
)
)