Build zoom backend #7

Closed
wants to merge 77 commits into from
77 commits
14e709f
initial impl w/ empty tensor
stephen-youn Jun 13, 2024
ba0e00e
add setup.py and hook to device unit tests
stephen-youn Jun 13, 2024
6450ba4
update some impls
stephen-youn Jun 14, 2024
80a2762
caching allocator, rng, guard, events
123epsilon Jun 21, 2024
08d7fdf
linker bugfixes
123epsilon Jun 21, 2024
6b5f78b
register hooks properly
123epsilon Jun 24, 2024
5924e75
add copy op
123epsilon Jun 26, 2024
acc00ef
initial JIT infra for HIP kernels
123epsilon Jun 27, 2024
88710f3
renaming gpu headers
123epsilon Jun 28, 2024
e8a9444
impl abs kernel
123epsilon Jun 28, 2024
fa57b0a
add more unary kernels
123epsilon Jul 1, 2024
a7bd747
fill kernel
123epsilon Jul 2, 2024
ada6861
impl distribution kernels
123epsilon Jul 3, 2024
13dcc0e
cleanup distribution files + add test helper script
123epsilon Jul 3, 2024
fcbe9da
add storage and view manipulation kernels
123epsilon Jul 5, 2024
2e29a9c
init blas kernels
123epsilon Jul 10, 2024
cea0cc8
blas kernel checks
123epsilon Jul 10, 2024
2ada398
renaming blas kernels
123epsilon Jul 10, 2024
a14e444
add more view kernels
123epsilon Jul 11, 2024
32016cd
update test script to track test failures
123epsilon Jul 11, 2024
c48f947
check in hipify shotgun
123epsilon Jul 11, 2024
0a9c10a
add various indexing kernels
123epsilon Jul 11, 2024
555659d
more indexing kernels
123epsilon Jul 12, 2024
754e629
add masked select and scatter kernels
123epsilon Jul 12, 2024
cf7af68
add equals kernels
123epsilon Jul 12, 2024
36c251e
move zoom backend in-tree
123epsilon Jul 13, 2024
c1db8c0
fix some build errors
123epsilon Jul 15, 2024
5b1ac21
fix more build issues
123epsilon Jul 16, 2024
2a51c4b
more in-tree setup
123epsilon Jul 16, 2024
b2c8082
implement codegen for structured/ufunc kernels for zoom backend
123epsilon Jul 18, 2024
2516cf9
impl structured generation for existing ops
123epsilon Jul 19, 2024
fe7b061
add reduction op kernels
123epsilon Jul 22, 2024
0a4b238
add binary op kernels
123epsilon Jul 22, 2024
6fdd791
add more shape kernels + randperm kernels
123epsilon Jul 22, 2024
b4669b8
add unpooling and nonzero kernels
123epsilon Jul 22, 2024
625fd00
add scan and elementwise kernels
123epsilon Jul 22, 2024
d025428
fix some dispatch stubs
123epsilon Jul 23, 2024
2bdb970
fix stub generation for PU1
123epsilon Jul 23, 2024
4dca1e6
add compare kernels
123epsilon Jul 23, 2024
00b1721
more unary + multinomial kernels
123epsilon Jul 23, 2024
b0e30f7
add distance and foreach kernels
123epsilon Jul 23, 2024
afb1b36
add activation kernels
123epsilon Jul 23, 2024
12156d8
add adaptive pool and AMP kernels
123epsilon Jul 23, 2024
d7f1032
add some polynomial kernels
123epsilon Jul 24, 2024
a48a87d
complex, conv2d kernels
123epsilon Jul 24, 2024
2fd3520
add depthwise conv, maxpool, dropout, embedding kernels
123epsilon Jul 24, 2024
5344965
fused optim, grid sample, and hermite kernels
123epsilon Jul 26, 2024
4a10624
edit compiler message for fused sgd
123epsilon Jul 26, 2024
14fa31c
loss kernels, more polynomials, cutlass header stubs
123epsilon Jul 26, 2024
59362f3
norm, reflection, rrelu kernels
123epsilon Jul 26, 2024
c34d984
add zoom to some non-deterministic tests
123epsilon Jul 29, 2024
d46ad44
add more operators - sorting, comparison, shape, and unary
123epsilon Jul 29, 2024
2305755
add linalg operators with hipsolver and hipblas, unique kernels, upsa…
123epsilon Jul 30, 2024
6c56d5f
add python module infra
123epsilon Jul 31, 2024
c84e248
more python infra, get all applicable tests running
123epsilon Aug 1, 2024
2e45079
update setup notes, edit version guard in fused sgd kernel
123epsilon Aug 1, 2024
3ce3bfc
clean copy kernel, reroute to deterministic index_copy, disable gesvd…
123epsilon Aug 2, 2024
90e2130
add some zoom rerouting logic for nondeterministic operators and tests
123epsilon Aug 3, 2024
ea1456f
more misc test fixes, rerouting nondeterministic ops through decompos…
123epsilon Aug 6, 2024
d25a78c
add dispatch for autocast kernels
123epsilon Aug 6, 2024
98a3b20
fix indexing logic with scalar index_put, add is_zoom to torch Tensor
123epsilon Aug 6, 2024
14fd1d5
catch all errors in test script
123epsilon Aug 6, 2024
3ad5551
add some determinism checks
123epsilon Aug 7, 2024
be4531c
various op dispatch fixes
123epsilon Aug 8, 2024
19f8223
add sparse operators - device tests at 100%
123epsilon Sep 5, 2024
1e1f2af
hiprtc minimal test (fails due to SIGBUS)
123epsilon Nov 20, 2024
1560c0e
dot+vdot HIP jit kernels + some misc native_functions mappings for op…
123epsilon Nov 27, 2024
0322861
fix some failing op tests
123epsilon Nov 29, 2024
afec805
expose current_stream to python
123epsilon Dec 8, 2024
768efe2
fix stream init python side
123epsilon Dec 10, 2024
080e7a3
add instructions for running triton llama, separate build docs
123epsilon Dec 10, 2024
7e0fa56
add build
makslevental Dec 16, 2024
99a54d4
add GH workflow
makslevental Dec 17, 2024
9e08eb9
compile without hipblas etc
makslevental Dec 17, 2024
7f95e5e
use rocm docker
makslevental Dec 17, 2024
1d36920
Trigger Build
makslevental Dec 17, 2024
503b87b
audit wheel
makslevental Dec 17, 2024
124 changes: 124 additions & 0 deletions .github/workflows/build_zoom_backend.yml
@@ -0,0 +1,124 @@
name: "Build PyTorch"

on:
workflow_dispatch:
inputs:
force_debug_with_tmate:
type: boolean
description: 'Run the build with tmate session'
required: false
default: false
debug_with_tmate:
type: boolean
description: 'Run the build with a tmate session ONLY in case of failure'
required: false
default: false
pull_request:
push:
branches:
- main

concurrency:
group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
cancel-in-progress: true

jobs:
build:

strategy:
fail-fast: false
matrix:
include:
- name: "ubuntu-22.04"
runs-on: "mi300"
# container: "rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0"
# runs-on: "nod-ai-shared-cpubuilder-manylinux-x86_64"

runs-on: ${{ matrix.runs-on }}

name: ${{ matrix.name }}

env:
CACHE_DIR: ${{ github.workspace }}/.container-cache
# either the PR number or `branch-N` where N always increments
CACHE_KEY: linux-build-test-cpp-asserts-manylinux-v2-${{ format('{0}-{1}', github.ref_name, github.run_number) }}

defaults:
run:
shell: bash

permissions:
id-token: write
contents: write

container:
image: ${{ matrix.container }}

steps:
- name: "Check out repository"
uses: actions/checkout@v4.2.2
with:
submodules: true

- name: Enable cache
uses: actions/cache/restore@v3
with:
path: ${{ env.CACHE_DIR }}
key: ${{ env.CACHE_KEY }}
restore-keys: linux-build-test-cpp-

- name: "Build PyTorch"
id: build
run: |

export CCACHE_DIR="${{ env.CACHE_DIR }}"
export CMAKE_C_COMPILER_LAUNCHER=ccache
export CMAKE_CXX_COMPILER_LAUNCHER=ccache
export CCACHE_SLOPPINESS=include_file_ctime,include_file_mtime,time_macros

python -m venv venv
source venv/bin/activate
pip install -r requirements.txt
./build.sh

- name: "Audit"
id: audit
run: |

sudo apt install patchelf
source venv/bin/activate
pip install auditwheel
auditwheel repair -w dist --plat manylinux_2_39_x86_64 dist/torch*

- name: Save cache
uses: actions/cache/save@v3
if: ${{ !cancelled() }}
with:
path: ${{ env.CACHE_DIR }}
key: ${{ env.CACHE_KEY }}

- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
name: ${{ matrix.name }}_artifact
path: dist
if-no-files-found: warn

- name: Release current commit
uses: ncipollo/release-action@v1.12.0
with:
artifacts: "dist/torch*.whl"
token: "${{ secrets.GITHUB_TOKEN }}"
tag: "latest"
name: "latest"
removeArtifacts: false
allowUpdates: true
replacesArtifacts: true
makeLatest: true

- name: "Setup tmate session"
if: ${{ (failure() && inputs.debug_with_tmate) || inputs.force_debug_with_tmate }}
uses: mxschmitt/action-tmate@v3.18
with:
limit-access-to-actor: true
install-dependencies: ${{ startsWith(matrix.runs-on, 'macos') || startsWith(matrix.runs-on, 'windows') }}
16 changes: 12 additions & 4 deletions BUILD.bazel
@@ -8,8 +8,8 @@ load("@pytorch//tools/rules:cu.bzl", "cu_library")
load("@pytorch//tools/config:defs.bzl", "if_cuda")
load("@pytorch//:aten.bzl", "generate_aten", "intern_build_aten_ops")
load(":build.bzl", "GENERATED_AUTOGRAD_CPP", "GENERATED_AUTOGRAD_PYTHON", "define_targets")
load(":build_variables.bzl", "jit_core_sources", "lazy_tensor_ts_sources", "libtorch_core_sources", "libtorch_cuda_sources", "libtorch_distributed_sources", "libtorch_extra_sources", "libtorch_python_core_sources", "torch_cpp_srcs", "libtorch_python_cuda_sources", "libtorch_python_distributed_sources")
load(":ufunc_defs.bzl", "aten_ufunc_generated_cpu_kernel_sources", "aten_ufunc_generated_cpu_sources", "aten_ufunc_generated_cuda_sources")
load(":build_variables.bzl", "jit_core_sources", "lazy_tensor_ts_sources", "libtorch_core_sources", "libtorch_cuda_sources", "libtorch_distributed_sources", "libtorch_extra_sources", "libtorch_python_core_sources", "torch_cpp_srcs", "libtorch_python_cuda_sources", "libtorch_python_zoom_sources", "libtorch_python_distributed_sources")
load(":ufunc_defs.bzl", "aten_ufunc_generated_cpu_kernel_sources", "aten_ufunc_generated_cpu_sources", "aten_ufunc_generated_cuda_sources", "aten_ufunc_generated_zoom_sources")
load("//:tools/bazel.bzl", "rules")

define_targets(rules = rules)
@@ -104,15 +104,23 @@ generated_cuda_cpp = [
"aten/src/ATen/RegisterSparseCsrCUDA.cpp",
]

generated_zoom_cpp = [
"aten/src/ATen/ZoomFunctions.h",
"aten/src/ATen/ZoomFunctions_inl.h",
"aten/src/ATen/RegisterPrivateUse1.cpp",
]

generate_aten(
name = "generated_aten_cpp",
srcs = aten_generation_srcs,
outs = (
generated_cpu_cpp +
generated_cuda_cpp +
generated_zoom_cpp +
aten_ufunc_generated_cpu_sources("aten/src/ATen/{}") +
aten_ufunc_generated_cpu_kernel_sources("aten/src/ATen/{}") +
aten_ufunc_generated_cuda_sources("aten/src/ATen/{}") + [
aten_ufunc_generated_cuda_sources("aten/src/ATen/{}") +
aten_ufunc_generated_zoom_sources("aten/src/ATen/{}") + [
"aten/src/ATen/Declarations.yaml",
]
),
@@ -888,7 +896,7 @@ cc_library(
name = "torch_python",
srcs = libtorch_python_core_sources
+ if_cuda(libtorch_python_cuda_sources)
+ if_cuda(libtorch_python_distributed_sources)
+ if_cuda(libtorch_python_distributed_sources)
+ GENERATED_AUTOGRAD_PYTHON,
hdrs = glob([
"torch/csrc/generic/*.cpp",
90 changes: 90 additions & 0 deletions BuildingZoom.md
@@ -0,0 +1,90 @@
# Setup Python Env

To start out, we just need to follow the normal procedure to build PyTorch from source. For convenience I've included these steps here:

```bash
conda create -n nod-pytorch python==3.10
conda activate nod-pytorch
conda install cmake ninja
pip install -r requirements.txt
export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
python setup.py develop
```

# CMake

Using the `USE_ZOOM` flag with CMake enables building with HIP for ROCm without requiring any of the "HIPify" scripts. This will include the HIP libraries and populate `torch.version.hip` appropriately. The flag is NOT yet wired into the `setup.py` script, so for now it needs to be added manually via `cmake` or `ccmake`.

You'll need to set the `ROCM_PATH` and `HIP_ROOT_DIR` environment variables appropriately; by default on Linux these are `/opt/rocm/` and `/opt/rocm/hip`, respectively.

```bash
cd build/
export PYTORCH_ROCM_ARCH=gfx90a
export ROCM_PATH=/opt/rocm
export HIP_ROOT_DIR=/opt/rocm/hip
cmake -DUSE_ZOOM=ON ..
cmake --build . --target install
```

# Running PyTorch with Zoom

Programs using the zoom backend must be prefaced with this stub until we register a proper dispatch key in PyTorch:

```python
import torch
import torch.zoom
torch.utils.rename_privateuse1_backend('zoom')
torch.utils.generate_methods_for_privateuse1_backend(unsupported_dtype=None)
```
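
Once that stub has run, zoom tensors can be created through the usual factory functions. A minimal sketch, assuming the zoom build is installed and a supported GPU is visible (the ops available depend on which kernels have been implemented):

```python
import torch
import torch.zoom
torch.utils.rename_privateuse1_backend('zoom')
torch.utils.generate_methods_for_privateuse1_backend(unsupported_dtype=None)

# Allocate on the zoom device and run a couple of simple ops there.
x = torch.randn(4, 4, device='zoom')
y = torch.ones_like(x)
z = (x + y).abs()

print(z.device)   # e.g. zoom:0
print(z.cpu())    # copy back to the host to inspect values
```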

# Installing Triton

Since mainline Triton currently treats ROCm as if it's masquerading as `torch.cuda`, we need a custom installation:

```bash
git clone https://github.com/123epsilon/triton.git
cd triton/
git checkout zoom
pip install pybind11
pip install python/
```

# Running Llama 3 with Triton using Liger kernels and HuggingFace

```bash
pip install liger-kernel
```

```python
# pytorch/zoom_extension/examples/ligerllama.py
import torch
from transformers import AutoTokenizer
from liger_kernel.transformers import AutoLigerKernelForCausalLM
from time import perf_counter as pf
torch.utils.rename_privateuse1_backend('zoom')

# Set up the model and tokenizer
model_id = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoLigerKernelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16,
device_map="zoom"
)

# Function to generate text
def generate_text(prompt, max_length=30):
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=max_length)
return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage
prompt = "Hey, how are you doing today?"
s = pf()
response = generate_text(prompt)
e = pf()
print(f"Prompt: {prompt}")
print(f"Response: {response}")

print(f"{e-s} seconds")
```

1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -203,6 +203,7 @@ option(USE_CPP_CODE_COVERAGE "Compile C/C++ with code coverage flags" OFF)
option(USE_COLORIZE_OUTPUT "Colorize output during compilation" ON)
option(USE_ASAN "Use Address+Undefined Sanitizers" OFF)
option(USE_TSAN "Use Thread Sanitizer" OFF)
option(USE_ZOOM "Use ZOOM HIP Backend" OFF)
option(USE_CUDA "Use CUDA" ON)
cmake_dependent_option(
USE_XPU "Use XPU. Only available on Linux." ON
19 changes: 19 additions & 0 deletions CUDA.md
@@ -0,0 +1,19 @@
# Context
A [Context](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#context) represents all the relevant state required on an accelerator in order to instantiate and perform tasks. A context includes data, variables, conditions, and more, which together define the environment in which the provided tasks are executed. Commands such as launching a kernel on a GPU are executed within a context. Once a context is destroyed, CUDA cleans up all the resources associated with it. Therefore, pointers originating from different contexts reference distinct address spaces (memory locations). Contexts are managed in a stack, and each host (CPU) thread scheduling tasks has its own stack of contexts. Contexts can be exchanged between host threads. For instance, popping `ctx` from HostA and pushing it onto HostB forces operations executed from HostB to run in `ctx`, while HostA operates under the previous context in its stack.

The context utilized for a device by the runtime API is the device's primary context. From the perspective of the runtime API a device and its primary context are synonymous.

# Module
Modules are dynamically loadable packages akin to DLLs or shared libraries. They include symbols, functions, and global variables that users can call. Modules maintain a module scope to avoid namespace collisions with other concurrently loaded modules.

# Hooks
Inheriting from `AcceleratorHooksInterface`, Hook implementations in PyTorch provide a generic interface through which host (CPU) code can query and set properties for the provided accelerators.

# CUDAStream
A stream is a structure that accepts events and commands into a FIFO queue and executes them in order; it can be thought of as a queue or pipeline for scheduling tasks on an accelerator. Spinning up multiple concurrent streams can enable task parallelism, for instance when we have multiple devices. In that case, each stream is uniquely associated with a device, and queueing tasks to a stream will execute them on that device. Really, streams are specific to a context, which is in turn specific to a device. Streams have an associated integer priority; lower values are considered "high priority" by the accelerator's scheduling algorithm.

CUDAStream abstracts the concept of a CUDA stream (`cudaStream_t`); it maintains several pools of streams to reduce the overhead associated with common stream operations such as creation and destruction. Each device maintains three lazily initialized pools of streams: the first pool contains the default stream, the second contains low-priority streams, and the third contains high-priority streams. Despite the fact that each thread in principle has its own "current stream," this stream pool is global across threads, so many host threads can potentially dispatch kernels and synchronize on the same stream. Synchronization can have [different meanings](https://leimao.github.io/blog/CUDA-Default-Stream/) depending on whether we are synchronizing to the legacy stream or via per-thread streams.
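
The same ideas are visible from Python through the public `torch.cuda` stream API. A minimal sketch, assuming a CUDA (or ROCm) build of PyTorch with at least one visible device:

```python
import torch

assert torch.cuda.is_available()

# Streams are handed out from the per-device pools described above;
# priority -1 requests a "high priority" stream (lower value = higher priority).
high_prio = torch.cuda.Stream(priority=-1)
default = torch.cuda.current_stream()

x = torch.randn(1024, 1024, device="cuda")

with torch.cuda.stream(high_prio):
    # Work queued inside this block goes onto `high_prio`, not the default stream.
    y = x @ x

# Make the default stream wait for the matmul before anything queued on it uses `y`.
default.wait_stream(high_prio)
z = y.sum()

torch.cuda.synchronize()  # block the host until all work on the device is done
print(z.item())
```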

# CUDACachingAllocator
https://cs.stackexchange.com/questions/143650/difference-between-caching-and-slab-allocator
https://zdevito.github.io/2022/08/04/cuda-caching-allocator.html
51 changes: 51 additions & 0 deletions ZoomNotes.md
@@ -0,0 +1,51 @@
# Running Device Type Tests
Set up the environment using `env.sh`. You may have to edit its variables for your own checkout; `TORCH_TEST_DEVICES` should point to `zoom_extension/test/pytorch_test_base.py`.

Then you can run `test.sh` to run the PyTorch device test suite. The script produces a few output artifacts: `test.log`, a verbose log of the `unittest` output from the test suite; `zoom_unimplemented_operators.log`, which lists the operators not yet implemented in the zoom backend along with the frequency with which each was called in the test suite; and `zoom_test_errors.log`, a list of test failures (i.e. `AssertionError`s) encountered in the test suite.

The unimplemented operator log should not be considered exhaustive as additional operator failures may occur once the offending operator is implemented. This is just meant to be a tool to drive development.

# HIP Library Dependencies
Since these run on ROCm, each also implies a dependency on its `roc*` equivalent (e.g. hipBLAS requires rocBLAS):

* HIP - runtime, dtypes
* hipBLAS
* hipBLASLt
* hipRand
* hipSparse
* hipFFT
* rccl - TODO: add this in lieu of NCCL functionality
* hipThrust
* hipCub
* hipSolver

# HIPBlasLt

This is temporarily disabled via the macro `DISABLE_HIPBLASLT` in `ZoomContextLight.h`; we can re-enable it by undef'ing that macro. This means that right now `scaledgemm` and `intmm` don't work, but we can implement hipBLAS versions of them and/or just enable hipBLASLt.

# JITerator Notes:
https://dev-discuss.pytorch.org/t/keeping-pytorchs-ops-maintainable-the-jiterator/468


# Zoom JIT
Kernels are run via hiprtc and use a template specifier `scalar_t` which is filled in by `zoom_generate_code`. JIT functions are in `ATen/zoom/jit/jit_utils.*`. Kernels need to be defined with `extern "C"` to prevent name mangling, otherwise we can't retrieve our kernel properly at launch time with `hipModuleGetFunction`. See `ATen/native/zoom/Blas.cpp:dot_hip` for an example implementation.
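
To make the `extern "C"` requirement concrete, here is a hypothetical sketch of what a specialized kernel template could look like, written as a Python string for illustration; the kernel name, placeholder syntax, and substitution step are assumptions — the real templates and codegen live in `ATen/zoom/jit/jit_utils.*` and `zoom_generate_code`.

```python
from string import Template

# Hypothetical template: extern "C" keeps the symbol unmangled so it can later be
# looked up by name (e.g. hipModuleGetFunction) after hiprtc compiles the source.
ABS_KERNEL_TEMPLATE = Template(r"""
extern "C" __global__ void abs_kernel(const ${scalar_t}* in, ${scalar_t}* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = in[i] < 0 ? -in[i] : in[i];
  }
}
""")

# The scalar_t placeholder is filled in per dtype before compilation.
source_for_float = ABS_KERNEL_TEMPLATE.substitute(scalar_t="float")
print(source_for_float)
```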

## Testing Operators on Zoom
See `test/test_ops.py`, `test_numpy_ref` and `test_compare_cpu`.
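
As a rough sketch of how one might drive those against the zoom device type (assuming the device tests are hooked in via `TORCH_TEST_DEVICES` as described above; the exact test selection is an assumption):

```python
# Hypothetical invocation of the reference/consistency tests from Python.
import pytest

pytest.main([
    "test/test_ops.py",
    "-k", "test_numpy_ref or test_compare_cpu",
    "-x",   # stop at the first failure while iterating on a kernel
    "-v",
])
```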

TODO List:

- Add RCCL
- Determine rocBLAS determinism requirements as far as config and versions (necessary to throw determinism errors when appropriate)

Note on error in test suite: `RuntimeError: t.use_count() <= 1`
This error is thrown in the `test_parallel_cow_materialize_error` test in the torch device type tests because
of many parallel references being held on the same tensor. This will only throw in debug mode. I think we can ignore this since
this same error is thrown on the CPU backend in debug mode, and passes in release.

Note on the error in `test_grad_scaling_state_dict`: it occurs in the instance check `isinstance(s1._scale, torch.FloatTensor)`
because, despite their datatypes being equal, the PU1 dispatch key is a mismatch with the CPU dispatch key of the `FloatTensor` class.
These tensor types are deprecated anyway, and the rest of the test works, so we can just ignore it - if we want to, we can add a
`torch.zoom.FloatTensor` (though this is a deprecated design pattern and likely frowned upon). The real fix is to refactor the instance check; see `python_tensor.cpp:Tensor_instancecheck`.

For now, I've added a macro in `Allocator.h` that registers a functor to retrieve the `ZoomCachingAllocator` for us, since we're currently implemented as an external backend (i.e. using the PU1 dispatch key). Once we're in the main repo, we can replace it with the proper logic for retrieving the allocator for the Zoom backend.