llama example working, bmm triton kernel
123epsilon committed Jan 26, 2025
1 parent aaef6b9 commit ac54e3e
Showing 10 changed files with 501 additions and 275 deletions.
124 changes: 124 additions & 0 deletions .github/workflows/build_zoom_backend.yml
@@ -0,0 +1,124 @@
name: "Build PyTorch"

on:
workflow_dispatch:
inputs:
force_debug_with_tmate:
type: boolean
description: 'Run the build with tmate session'
required: false
default: false
debug_with_tmate:
type: boolean
description: 'Run the build with a tmate session ONLY in case of failure'
required: false
default: false
pull_request:
push:
branches:
- main

concurrency:
group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
cancel-in-progress: true

jobs:
build:

strategy:
fail-fast: false
matrix:
include:
- name: "ubuntu-22.04"
runs-on: "mi300"
# container: "rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0"
# runs-on: "nod-ai-shared-cpubuilder-manylinux-x86_64"

runs-on: ${{ matrix.runs-on }}

name: ${{ matrix.name }}

env:
CACHE_DIR: ${{ github.workspace }}/.container-cache
# either the PR number or `branch-N` where N always increments
CACHE_KEY: linux-build-test-cpp-asserts-manylinux-v2-${{ format('{0}-{1}', github.ref_name, github.run_number) }}

defaults:
run:
shell: bash

permissions:
id-token: write
contents: write

container:
image: ${{ matrix.container }}

steps:
- name: "Check out repository"
uses: actions/checkout@v4.2.2
with:
submodules: true

- name: Enable cache
uses: actions/cache/restore@v3
with:
path: ${{ env.CACHE_DIR }}
key: ${{ env.CACHE_KEY }}
restore-keys: linux-build-test-cpp-

- name: "Build PyTorch"
id: build
run: |
export CCACHE_DIR="${{ env.CACHE_DIR }}"
export CMAKE_C_COMPILER_LAUNCHER=ccache
export CMAKE_CXX_COMPILER_LAUNCHER=ccache
export CCACHE_SLOPPINESS=include_file_ctime,include_file_mtime,time_macros
python -m venv venv
source venv/bin/activate
pip install -r requirements.txt
./build.sh
- name: "Audit"
id: audit
run: |
sudo apt install patchelf
source venv/bin/activate
pip install auditwheel
auditwheel repair -w dist --plat manylinux_2_39_x86_64 dist/torch*
- name: Save cache
uses: actions/cache/save@v3
if: ${{ !cancelled() }}
with:
path: ${{ env.CACHE_DIR }}
key: ${{ env.CACHE_KEY }}

- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
name: ${{ matrix.name }}_artifact
path: dist
if-no-files-found: warn

- name: Release current commit
uses: ncipollo/release-action@v1.12.0
with:
artifacts: "dist/torch*.whl"
token: "${{ secrets.GITHUB_TOKEN }}"
tag: "latest"
name: "latest"
removeArtifacts: false
allowUpdates: true
replacesArtifacts: true
makeLatest: true

- name: "Setup tmate session"
if: ${{ (failure() && inputs.debug_with_tmate) || inputs.force_debug_with_tmate }}
uses: mxschmitt/action-tmate@v3.18
with:
limit-access-to-actor: true
install-dependencies: ${{ startsWith(matrix.runs-on, 'macos') || startsWith(matrix.runs-on, 'windows') }}
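
Not part of the commit, but as a quick way to verify the wheel produced by the upload/release steps above: a minimal, hypothetical post-install sanity check (the expectation that `git_version` matches this commit is an assumption about how the wheel was built).

```python
# Hypothetical sanity check after installing the wheel built by the workflow
# above; not part of this commit.
import torch

print(torch.__version__)          # version string baked into the local build
print(torch.version.git_version)  # source commit the wheel was built from
```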
36 changes: 17 additions & 19 deletions aten/src/ATen/native/native_functions.yaml
@@ -1352,7 +1352,6 @@
   dispatch:
     CPU: bmm_out_cpu
     CUDA: bmm_out_cuda
-    PrivateUse1: bmm_out_zoom
     MPS: bmm_out_mps
     SparseCPU: bmm_out_sparse_cpu
     SparseCUDA: bmm_out_sparse_cuda
@@ -1513,7 +1512,7 @@
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: clamp_out
+    CPU, CUDA, PrivateUse1: clamp_out
     MPS: clamp_out_mps
   tags: pointwise

@@ -1522,7 +1521,7 @@
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: clamp_Tensor_out
+    CPU, CUDA, PrivateUse1: clamp_Tensor_out
     MPS: clamp_Tensor_out_mps
   tags: pointwise

@@ -1553,7 +1552,7 @@
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: clamp_max_out
+    CPU, CUDA, PrivateUse1: clamp_max_out
     MPS: clamp_max_out_mps
   tags: pointwise

@@ -1562,7 +1561,7 @@
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: clamp_max_Tensor_out
+    CPU, CUDA, PrivateUse1: clamp_max_Tensor_out
     MPS: clamp_max_Tensor_out_mps
   tags: pointwise

@@ -1593,7 +1592,7 @@
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: clamp_min_out
+    CPU, CUDA, PrivateUse1: clamp_min_out
     MPS: clamp_min_out_mps
   tags: pointwise

@@ -1602,7 +1601,7 @@
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: clamp_min_Tensor_out
+    CPU, CUDA, PrivateUse1: clamp_min_Tensor_out
     MPS: clamp_min_Tensor_out_mps
   tags: pointwise

@@ -3168,7 +3167,7 @@
   device_check: NoCheck
   device_guard: False
   dispatch:
-    CPU, CUDA, MPS: isnan
+    CPU, CUDA, MPS, PrivateUse1: isnan
     SparseCPU, SparseCUDA: isnan_sparse
     SparseCsrCPU, SparseCsrCUDA: isnan_sparse_csr
   autogen: isnan.out
@@ -4121,7 +4120,6 @@
   dispatch:
     CPU: mm_out_cpu
     CUDA: mm_out_cuda
-    PrivateUse1: mm_out_zoom
     MPS: mm_out_mps
     SparseCPU, SparseCUDA: _sparse_mm_out
     SparseCsrCPU, SparseCsrCUDA: _sparse_csr_mm_out
@@ -6463,13 +6461,13 @@
   device_check: NoCheck # TensorIterator
   variants: function, method
   dispatch:
-    CPU, CUDA, MPS: where
+    CPU, CUDA, MPS, PrivateUse1: where
   tags: [core, pointwise]

 - func: where.self_out(Tensor condition, Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
   dispatch:
-    CPU, CUDA, MPS: where_self_out
+    CPU, CUDA, MPS, PrivateUse1: where_self_out

 - func: where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor
   variants: function
@@ -7874,7 +7872,7 @@
   device_check: NoCheck
   device_guard: False
   dispatch:
-    CPU, CUDA, Meta, MPS: set_
+    CPU, CUDA, Meta, MPS, PrivateUse1: set_
   autogen: set.source_Storage, set.source_Storage_out
   tags: inplace_view

@@ -7905,7 +7903,7 @@
   device_check: NoCheck
   device_guard: False
   dispatch:
-    CPU, CUDA, Meta, MPS: set_tensor_
+    CPU, CUDA, Meta, MPS, PrivateUse1: set_tensor_
   autogen: set.source_Tensor, set.source_Tensor_out
   tags: inplace_view

@@ -8663,7 +8661,7 @@
   variants: method
   tags: nondeterministic_seeded
   dispatch:
-    CPU, CUDA: random_
+    CPU, CUDA, PrivateUse1: random_
     Meta: random_meta_
     MPS: random_mps_
   autogen: random.from, random.from_out
@@ -8673,7 +8671,7 @@
   tags: nondeterministic_seeded
   variants: method
   dispatch:
-    CPU, CUDA: random_
+    CPU, CUDA, PrivateUse1: random_
     Meta: random_meta_
     MPS: random_mps_
   autogen: random.to, random.to_out
@@ -8683,7 +8681,7 @@
   tags: nondeterministic_seeded
   variants: method
   dispatch:
-    CPU, CUDA: random_
+    CPU, CUDA, PrivateUse1: random_
     MPS: random_mps_
     Meta: random_meta_
   autogen: random, random.out
@@ -8693,7 +8691,7 @@
   tags: nondeterministic_seeded
   variants: method
   dispatch:
-    CPU, CUDA: uniform_
+    CPU, CUDA, PrivateUse1: uniform_
     MPS: uniform_mps_
     Meta: uniform_meta_
   autogen: uniform, uniform.out
@@ -13077,7 +13075,7 @@
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: isposinf_out
+    CPU, CUDA, PrivateUse1: isposinf_out
     SparseCPU, SparseCUDA: isposinf_sparse_out
     SparseCsrCPU, SparseCsrCUDA: isposinf_sparse_csr_out
   tags: pointwise
@@ -13094,7 +13092,7 @@
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: isneginf_out
+    CPU, CUDA, PrivateUse1: isneginf_out
     SparseCPU, SparseCUDA: isneginf_sparse_out
     SparseCsrCPU, SparseCsrCUDA: isneginf_sparse_csr_out
   tags: pointwise
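
An observation on the hunks above rather than something stated in the commit: dropping the `PrivateUse1: bmm_out_zoom` and `PrivateUse1: mm_out_zoom` entries while adding `PrivateUse1` to shared CPU/CUDA kernels suggests the zoom bmm/mm implementations are now registered out of tree (e.g. backed by the Triton kernel named in the commit title). A Python-side sketch of such an out-of-tree registration is shown below; the names and the placeholder body are illustrative, and the repository may well do this in C++ via `TORCH_LIBRARY_IMPL` instead.

```python
# Illustrative only: registering a PrivateUse1 implementation of aten::bmm.out
# outside native_functions.yaml. A real backend would launch its own
# (e.g. Triton/HIP) kernel instead of raising.
import torch

zoom_lib = torch.library.Library("aten", "IMPL")


def bmm_out_zoom(self, mat2, *, out):
    # Placeholder: run the backend's batched-matmul kernel and write the
    # result into `out`.
    raise NotImplementedError("backend bmm kernel goes here")


zoom_lib.impl("bmm.out", bmm_out_zoom, "PrivateUse1")
```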
(The remaining changed files were not rendered on this page.)
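
The commit title mentions a bmm Triton kernel, which presumably lives in one of those unrendered files. As a rough, generic sketch of the technique rather than the author's actual code (block sizes, the float32 accumulator, and the wrapper's contiguity/dtype assumptions are arbitrary choices here):

```python
# Generic Triton batched-matmul sketch: each program instance computes one
# BLOCK_M x BLOCK_N tile of one batch element of C = A @ B.
import torch
import triton
import triton.language as tl


@triton.jit
def bmm_kernel(
    a_ptr, b_ptr, c_ptr,
    M, N, K,
    stride_ab, stride_am, stride_ak,
    stride_bb, stride_bk, stride_bn,
    stride_cb, stride_cm, stride_cn,
    BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
):
    pid_b = tl.program_id(0)  # batch index
    pid_m = tl.program_id(1)  # tile row index
    pid_n = tl.program_id(2)  # tile column index

    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    offs_k = tl.arange(0, BLOCK_K)

    a_ptrs = (a_ptr + pid_b * stride_ab
              + offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak)
    b_ptrs = (b_ptr + pid_b * stride_bb
              + offs_k[:, None] * stride_bk + offs_n[None, :] * stride_bn)

    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
    for k in range(0, K, BLOCK_K):
        a = tl.load(a_ptrs, mask=(offs_m[:, None] < M) & (offs_k[None, :] + k < K), other=0.0)
        b = tl.load(b_ptrs, mask=(offs_k[:, None] + k < K) & (offs_n[None, :] < N), other=0.0)
        acc += tl.dot(a, b)
        a_ptrs += BLOCK_K * stride_ak
        b_ptrs += BLOCK_K * stride_bk

    c_ptrs = (c_ptr + pid_b * stride_cb
              + offs_m[:, None] * stride_cm + offs_n[None, :] * stride_cn)
    tl.store(c_ptrs, acc, mask=(offs_m[:, None] < M) & (offs_n[None, :] < N))


def bmm(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    B, M, K = a.shape
    _, _, N = b.shape
    c = torch.empty((B, M, N), device=a.device, dtype=torch.float32)
    grid = (B, triton.cdiv(M, 64), triton.cdiv(N, 64))
    bmm_kernel[grid](
        a, b, c, M, N, K,
        a.stride(0), a.stride(1), a.stride(2),
        b.stride(0), b.stride(1), b.stride(2),
        c.stride(0), c.stride(1), c.stride(2),
        BLOCK_M=64, BLOCK_N=64, BLOCK_K=32,
    )
    return c
```

A kernel of this shape would typically sit behind a PrivateUse1 registration like the one sketched earlier, with the wrapper writing into the caller-provided `out` tensor for `bmm.out`.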
