llama example working, bmm triton kernel
123epsilon committed Jan 26, 2025
1 parent aaef6b9 commit ac54e3e
Showing 10 changed files with 501 additions and 275 deletions.
124 changes: 124 additions & 0 deletions .github/workflows/build_zoom_backend.yml
@@ -0,0 +1,124 @@
name: "Build PyTorch"

on:
workflow_dispatch:
inputs:
force_debug_with_tmate:
type: boolean
description: 'Run the build with tmate session'
required: false
default: false
debug_with_tmate:
type: boolean
description: 'Run the build with a tmate session ONLY in case of failure'
required: false
default: false
pull_request:
push:
branches:
- main

concurrency:
group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
cancel-in-progress: true

jobs:
build:

strategy:
fail-fast: false
matrix:
include:
- name: "ubuntu-22.04"
runs-on: "mi300"
# container: "rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0"
# runs-on: "nod-ai-shared-cpubuilder-manylinux-x86_64"

runs-on: ${{ matrix.runs-on }}

name: ${{ matrix.name }}

env:
CACHE_DIR: ${{ github.workspace }}/.container-cache
# either the PR number or `branch-N` where N always increments
CACHE_KEY: linux-build-test-cpp-asserts-manylinux-v2-${{ format('{0}-{1}', github.ref_name, github.run_number) }}

defaults:
run:
shell: bash

permissions:
id-token: write
contents: write

container:
image: ${{ matrix.container }}

steps:
- name: "Check out repository"
uses: actions/checkout@v4.2.2
with:
submodules: true

- name: Enable cache
uses: actions/cache/restore@v3
with:
path: ${{ env.CACHE_DIR }}
key: ${{ env.CACHE_KEY }}
restore-keys: linux-build-test-cpp-

- name: "Build PyTorch"
id: build
run: |
export CCACHE_DIR="${{ env.CACHE_DIR }}"
export CMAKE_C_COMPILER_LAUNCHER=ccache
export CMAKE_CXX_COMPILER_LAUNCHER=ccache
export CCACHE_SLOPPINESS=include_file_ctime,include_file_mtime,time_macros
python -m venv venv
source venv/bin/activate
pip install -r requirements.txt
./build.sh
- name: "Audit"
id: audit
run: |
sudo apt install patchelf
source venv/bin/activate
pip install auditwheel
auditwheel repair -w dist --plat manylinux_2_39_x86_64 dist/torch*
- name: Save cache
uses: actions/cache/save@v3
if: ${{ !cancelled() }}
with:
path: ${{ env.CACHE_DIR }}
key: ${{ env.CACHE_KEY }}

- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
name: ${{ matrix.name }}_artifact
path: dist
if-no-files-found: warn

- name: Release current commit
uses: ncipollo/release-action@v1.12.0
with:
artifacts: "dist/torch*.whl"
token: "${{ secrets.GITHUB_TOKEN }}"
tag: "latest"
name: "latest"
removeArtifacts: false
allowUpdates: true
replacesArtifacts: true
makeLatest: true

- name: "Setup tmate session"
if: ${{ (failure() && inputs.debug_with_tmate) || inputs.force_debug_with_tmate }}
uses: mxschmitt/action-tmate@v3.18
with:
limit-access-to-actor: true
install-dependencies: ${{ startsWith(matrix.runs-on, 'macos') || startsWith(matrix.runs-on, 'windows') }}
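
Not part of the commit, but as a quick way to verify the wheel produced by the upload/release steps above: a minimal, hypothetical post-install sanity check (the expectation that `git_version` matches this commit is an assumption about how the wheel was built).

```python
# Hypothetical sanity check after installing the wheel built by the workflow
# above; not part of this commit.
import torch

print(torch.__version__)          # version string baked into the local build
print(torch.version.git_version)  # source commit the wheel was built from
```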
36 changes: 17 additions & 19 deletions aten/src/ATen/native/native_functions.yaml
@@ -1352,7 +1352,6 @@
   dispatch:
     CPU: bmm_out_cpu
     CUDA: bmm_out_cuda
-    PrivateUse1: bmm_out_zoom
     MPS: bmm_out_mps
     SparseCPU: bmm_out_sparse_cpu
     SparseCUDA: bmm_out_sparse_cuda
@@ -1513,7 +1512,7 @@
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: clamp_out
+    CPU, CUDA, PrivateUse1: clamp_out
     MPS: clamp_out_mps
   tags: pointwise

@@ -1522,7 +1521,7 @@
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: clamp_Tensor_out
+    CPU, CUDA, PrivateUse1: clamp_Tensor_out
     MPS: clamp_Tensor_out_mps
   tags: pointwise

@@ -1553,7 +1552,7 @@
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: clamp_max_out
+    CPU, CUDA, PrivateUse1: clamp_max_out
     MPS: clamp_max_out_mps
   tags: pointwise

@@ -1562,7 +1561,7 @@
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: clamp_max_Tensor_out
+    CPU, CUDA, PrivateUse1: clamp_max_Tensor_out
     MPS: clamp_max_Tensor_out_mps
   tags: pointwise

@@ -1593,7 +1592,7 @@
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: clamp_min_out
+    CPU, CUDA, PrivateUse1: clamp_min_out
     MPS: clamp_min_out_mps
   tags: pointwise

@@ -1602,7 +1601,7 @@
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: clamp_min_Tensor_out
+    CPU, CUDA, PrivateUse1: clamp_min_Tensor_out
     MPS: clamp_min_Tensor_out_mps
   tags: pointwise

@@ -3168,7 +3167,7 @@
   device_check: NoCheck
   device_guard: False
   dispatch:
-    CPU, CUDA, MPS: isnan
+    CPU, CUDA, MPS, PrivateUse1: isnan
     SparseCPU, SparseCUDA: isnan_sparse
     SparseCsrCPU, SparseCsrCUDA: isnan_sparse_csr
   autogen: isnan.out
@@ -4121,7 +4120,6 @@
   dispatch:
     CPU: mm_out_cpu
     CUDA: mm_out_cuda
-    PrivateUse1: mm_out_zoom
     MPS: mm_out_mps
     SparseCPU, SparseCUDA: _sparse_mm_out
     SparseCsrCPU, SparseCsrCUDA: _sparse_csr_mm_out
@@ -6463,13 +6461,13 @@
   device_check: NoCheck # TensorIterator
   variants: function, method
   dispatch:
-    CPU, CUDA, MPS: where
+    CPU, CUDA, MPS, PrivateUse1: where
   tags: [core, pointwise]

 - func: where.self_out(Tensor condition, Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
   dispatch:
-    CPU, CUDA, MPS: where_self_out
+    CPU, CUDA, MPS, PrivateUse1: where_self_out

 - func: where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor
   variants: function
@@ -7874,7 +7872,7 @@
   device_check: NoCheck
   device_guard: False
   dispatch:
-    CPU, CUDA, Meta, MPS: set_
+    CPU, CUDA, Meta, MPS, PrivateUse1: set_
   autogen: set.source_Storage, set.source_Storage_out
   tags: inplace_view

@@ -7905,7 +7903,7 @@
   device_check: NoCheck
   device_guard: False
   dispatch:
-    CPU, CUDA, Meta, MPS: set_tensor_
+    CPU, CUDA, Meta, MPS, PrivateUse1: set_tensor_
   autogen: set.source_Tensor, set.source_Tensor_out
   tags: inplace_view

@@ -8663,7 +8661,7 @@
   variants: method
   tags: nondeterministic_seeded
   dispatch:
-    CPU, CUDA: random_
+    CPU, CUDA, PrivateUse1: random_
     Meta: random_meta_
     MPS: random_mps_
   autogen: random.from, random.from_out
@@ -8673,7 +8671,7 @@
   tags: nondeterministic_seeded
   variants: method
   dispatch:
-    CPU, CUDA: random_
+    CPU, CUDA, PrivateUse1: random_
     Meta: random_meta_
     MPS: random_mps_
   autogen: random.to, random.to_out
@@ -8683,7 +8681,7 @@
   tags: nondeterministic_seeded
   variants: method
   dispatch:
-    CPU, CUDA: random_
+    CPU, CUDA, PrivateUse1: random_
     MPS: random_mps_
     Meta: random_meta_
   autogen: random, random.out
@@ -8693,7 +8691,7 @@
   tags: nondeterministic_seeded
   variants: method
   dispatch:
-    CPU, CUDA: uniform_
+    CPU, CUDA, PrivateUse1: uniform_
     MPS: uniform_mps_
     Meta: uniform_meta_
   autogen: uniform, uniform.out
@@ -13077,7 +13075,7 @@
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: isposinf_out
+    CPU, CUDA, PrivateUse1: isposinf_out
     SparseCPU, SparseCUDA: isposinf_sparse_out
     SparseCsrCPU, SparseCsrCUDA: isposinf_sparse_csr_out
   tags: pointwise
@@ -13094,7 +13092,7 @@
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: isneginf_out
+    CPU, CUDA, PrivateUse1: isneginf_out
     SparseCPU, SparseCUDA: isneginf_sparse_out
     SparseCsrCPU, SparseCsrCUDA: isneginf_sparse_csr_out
   tags: pointwise
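
An observation on the hunks above rather than something stated in the commit: dropping the `PrivateUse1: bmm_out_zoom` and `PrivateUse1: mm_out_zoom` entries while adding `PrivateUse1` to shared CPU/CUDA kernels suggests the zoom bmm/mm implementations are now registered out of tree (e.g. backed by the Triton kernel named in the commit title). A Python-side sketch of such an out-of-tree registration is shown below; the names and the placeholder body are illustrative, and the repository may well do this in C++ via `TORCH_LIBRARY_IMPL` instead.

```python
# Illustrative only: registering a PrivateUse1 implementation of aten::bmm.out
# outside native_functions.yaml. A real backend would launch its own
# (e.g. Triton/HIP) kernel instead of raising.
import torch

zoom_lib = torch.library.Library("aten", "IMPL")


def bmm_out_zoom(self, mat2, *, out):
    # Placeholder: run the backend's batched-matmul kernel and write the
    # result into `out`.
    raise NotImplementedError("backend bmm kernel goes here")


zoom_lib.impl("bmm.out", bmm_out_zoom, "PrivateUse1")
```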
(The remaining changed files were not rendered on this page.)
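
The commit title mentions a bmm Triton kernel, which presumably lives in one of those unrendered files. As a rough, generic sketch of the technique rather than the author's actual code (block sizes, the float32 accumulator, and the wrapper's contiguity/dtype assumptions are arbitrary choices here):

```python
# Generic Triton batched-matmul sketch: each program instance computes one
# BLOCK_M x BLOCK_N tile of one batch element of C = A @ B.
import torch
import triton
import triton.language as tl


@triton.jit
def bmm_kernel(
    a_ptr, b_ptr, c_ptr,
    M, N, K,
    stride_ab, stride_am, stride_ak,
    stride_bb, stride_bk, stride_bn,
    stride_cb, stride_cm, stride_cn,
    BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
):
    pid_b = tl.program_id(0)  # batch index
    pid_m = tl.program_id(1)  # tile row index
    pid_n = tl.program_id(2)  # tile column index

    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    offs_k = tl.arange(0, BLOCK_K)

    a_ptrs = (a_ptr + pid_b * stride_ab
              + offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak)
    b_ptrs = (b_ptr + pid_b * stride_bb
              + offs_k[:, None] * stride_bk + offs_n[None, :] * stride_bn)

    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
    for k in range(0, K, BLOCK_K):
        a = tl.load(a_ptrs, mask=(offs_m[:, None] < M) & (offs_k[None, :] + k < K), other=0.0)
        b = tl.load(b_ptrs, mask=(offs_k[:, None] + k < K) & (offs_n[None, :] < N), other=0.0)
        acc += tl.dot(a, b)
        a_ptrs += BLOCK_K * stride_ak
        b_ptrs += BLOCK_K * stride_bk

    c_ptrs = (c_ptr + pid_b * stride_cb
              + offs_m[:, None] * stride_cm + offs_n[None, :] * stride_cn)
    tl.store(c_ptrs, acc, mask=(offs_m[:, None] < M) & (offs_n[None, :] < N))


def bmm(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    B, M, K = a.shape
    _, _, N = b.shape
    c = torch.empty((B, M, N), device=a.device, dtype=torch.float32)
    grid = (B, triton.cdiv(M, 64), triton.cdiv(N, 64))
    bmm_kernel[grid](
        a, b, c, M, N, K,
        a.stride(0), a.stride(1), a.stride(2),
        b.stride(0), b.stride(1), b.stride(2),
        c.stride(0), c.stride(1), c.stride(2),
        BLOCK_M=64, BLOCK_N=64, BLOCK_K=32,
    )
    return c
```

A kernel of this shape would typically sit behind a PrivateUse1 registration like the one sketched earlier, with the wrapper writing into the caller-provided `out` tensor for `bmm.out`.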
