From 663eea1b535f4e1967e559c45926c6415fc0d91e Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Sun, 1 Dec 2024 20:09:40 +0100 Subject: [PATCH 1/7] Fix 64-bit dtype for MSVC --- exllamav2/exllamav2_ext/ext_rope.cpp | 42 ++++++++++++++-------------- exllamav2/exllamav2_ext/ext_rope.h | 6 ++-- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/exllamav2/exllamav2_ext/ext_rope.cpp b/exllamav2/exllamav2_ext/ext_rope.cpp index d79a22b8..c7d58025 100644 --- a/exllamav2/exllamav2_ext/ext_rope.cpp +++ b/exllamav2/exllamav2_ext/ext_rope.cpp @@ -58,50 +58,50 @@ void rope_ ); } -long gen_mrope_pos_ids +int64_t gen_mrope_pos_ids ( torch::Tensor mrope_pos_ids, torch::Tensor ids, int merge_size, - const std::vector> &spans, - const std::vector> &grids + const std::vector> &spans, + const std::vector> &grids ) { int max_length = mrope_pos_ids.size(1); int in_length = ids.size(0); - long* in_ids = (long*) ids.data_ptr(); - long* pos_ids = (long*) mrope_pos_ids.data_ptr(); + int64_t* in_ids = (int64_t*) ids.data_ptr(); + int64_t* pos_ids = (int64_t*) mrope_pos_ids.data_ptr(); - long* out_t = pos_ids; - long* out_h = pos_ids + max_length; - long* out_w = pos_ids + 2 * max_length; + int64_t* out_t = pos_ids; + int64_t* out_h = pos_ids + max_length; + int64_t* out_w = pos_ids + 2 * max_length; - long base_t = 0; - long next_base_t = 0; + int64_t base_t = 0; + int64_t next_base_t = 0; for (int i = 0; i < max_length; ++i) { bool is_emb = false; if (i < in_length) { - long id = in_ids[i]; + int64_t id = in_ids[i]; for (int j = 0; j < spans.size(); ++j) { - long span_start = std::get<0>(spans[j]); - long span_end = std::get<1>(spans[j]); - long span = span_end - span_start; + int64_t span_start = std::get<0>(spans[j]); + int64_t span_end = std::get<1>(spans[j]); + int64_t span = span_end - span_start; if (id >= span_start && id < span_end) { is_emb = true; - long k = id - span_start; - long grid_t = std::get<0>(grids[j]); - long grid_h = std::get<1>(grids[j]) / (long)merge_size; - long grid_w = std::get<2>(grids[j]) / (long)merge_size; - long k_t = base_t + (k / grid_w / grid_h) % grid_t; - long k_h = base_t + (k / grid_w) % grid_h; - long k_w = base_t + k % grid_w; + int64_t k = id - span_start; + int64_t grid_t = std::get<0>(grids[j]); + int64_t grid_h = std::get<1>(grids[j]) / (int64_t)merge_size; + int64_t grid_w = std::get<2>(grids[j]) / (int64_t)merge_size; + int64_t k_t = base_t + (k / grid_w / grid_h) % grid_t; + int64_t k_h = base_t + (k / grid_w) % grid_h; + int64_t k_w = base_t + k % grid_w; *out_t++ = k_t; *out_h++ = k_h; *out_w++ = k_w; diff --git a/exllamav2/exllamav2_ext/ext_rope.h b/exllamav2/exllamav2_ext/ext_rope.h index 17adebd4..2a41b22c 100644 --- a/exllamav2/exllamav2_ext/ext_rope.h +++ b/exllamav2/exllamav2_ext/ext_rope.h @@ -11,11 +11,11 @@ void rope_ bool neox_style ); -long gen_mrope_pos_ids +int64_t gen_mrope_pos_ids ( torch::Tensor mrope_pos_ids, torch::Tensor ids, int merge_size, - const std::vector> &spans, - const std::vector> &grids + const std::vector> &spans, + const std::vector> &grids ); \ No newline at end of file From db78601226c3f9ad92b55e1d08087072559356a7 Mon Sep 17 00:00:00 2001 From: Philipp Emanuel Weidmann Date: Mon, 2 Dec 2024 02:32:32 +0530 Subject: [PATCH 2/7] Prevent NPE in `deallocate_pages` (#688) Prevent NPE in `deallocate_pages` If `deallocate_pages` is called on a job for which `allocate_pages` has not been called (see `iterate_start_jobs` for conditions under which this is true), `allocated_pages` is `None`, raising a NPE when attempting to iterate. In particular, this prevents `clear_queue` from working. In practice, this problem readily occurs when starting a few jobs and then calling `clear_queue`. --- .github/workflows/build-temp.yml | 345 +++++++++++++++++++++++++++++++ exllamav2/generator/dynamic.py | 7 +- 2 files changed, 349 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/build-temp.yml diff --git a/.github/workflows/build-temp.yml b/.github/workflows/build-temp.yml new file mode 100644 index 00000000..30a8421a --- /dev/null +++ b/.github/workflows/build-temp.yml @@ -0,0 +1,345 @@ +name: Build Wheels & Release + +on: + workflow_dispatch: + inputs: + release: + description: 'Release? 1 = yes, 0 = no' + default: '0' + required: true + type: string + +permissions: + contents: write + +jobs: + build_wheels: + name: ${{ matrix.os }} P${{ matrix.pyver }} C${{ matrix.cuda }} R${{ matrix.rocm }} T${{ matrix.torch }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + include: + + # Ubuntu 20.04 CUDA + + # Python 3.8 +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.8', cuda: '11.7.0', rocm: '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.8', cuda: '11.8.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.8', cuda: '12.1.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.8', cuda: '11.8.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.8', cuda: '12.1.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } + - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.8', cuda: '11.8.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } + - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.8', cuda: '12.1.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } + + # Python 3.9 +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.9', cuda: '11.7.0', rocm: '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.9', cuda: '11.8.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.9', cuda: '12.1.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.9', cuda: '11.8.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.9', cuda: '12.1.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.9', cuda: '11.8.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.9', cuda: '12.1.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } + + # Python 3.10 +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '11.7.0', rocm: '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '11.8.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '11.8.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '11.8.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } + + # Python 3.11 +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '11.7.0', rocm: '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '11.8.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '11.8.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '11.8.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } + + # Python 3.12 +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '11.8.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '11.8.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '11.8.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } + + # Windows 2022 CUDA + + # Python 3.8 +# - { artname: 'wheel', os: windows-2022, pyver: '3.8', cuda: '11.7.0', rocm: '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX' } +# - { artname: 'wheel', os: windows-2022, pyver: '3.8', cuda: '11.8.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: windows-2022, pyver: '3.8', cuda: '12.1.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: windows-2022, pyver: '3.8', cuda: '11.8.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: windows-2022, pyver: '3.8', cuda: '12.1.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } + - { artname: 'wheel', os: windows-2022, pyver: '3.8', cuda: '11.8.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } + - { artname: 'wheel', os: windows-2022, pyver: '3.8', cuda: '12.1.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } + + # Python 3.9 +# - { artname: 'wheel', os: windows-2022, pyver: '3.9', cuda: '11.7.0', rocm: '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX' } +# - { artname: 'wheel', os: windows-2022, pyver: '3.9', cuda: '11.8.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: windows-2022, pyver: '3.9', cuda: '12.1.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: windows-2022, pyver: '3.9', cuda: '11.8.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: windows-2022, pyver: '3.9', cuda: '12.1.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: windows-2022, pyver: '3.9', cuda: '11.8.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: windows-2022, pyver: '3.9', cuda: '12.1.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } + + # Python 3.10 +# - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '11.7.0', rocm: '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX' } +# - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '11.8.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '12.1.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '11.8.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '12.1.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '11.8.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '12.1.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } + + # Python 3.11 +# - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '11.7.0', rocm: '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX' } +# - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '11.8.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '12.1.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } + - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '11.8.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } + - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '12.1.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '11.8.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '12.1.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } + + # Python 3.12 + - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '11.8.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } + - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '12.1.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } + - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '11.8.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } + - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '12.1.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } + - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '11.8.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } + - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '12.1.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } + + # Ubuntu 20.04 ROCm + + # ROCm 5.6 +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '', rocm: '5.6', torch: '2.2.2', cudaarch: '' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '', rocm: '5.6', torch: '2.2.2', cudaarch: '' } + + # ROCm 6.0 +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '', rocm: '6.0', torch: '2.3.1', cudaarch: '' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '', rocm: '6.0', torch: '2.3.1', cudaarch: '' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '', rocm: '6.0', torch: '2.3.1', cudaarch: '' } + + # ROCm 6.1 +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '', rocm: '6.1', torch: '2.4.0', cudaarch: '' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '', rocm: '6.1', torch: '2.4.0', cudaarch: '' } +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '', rocm: '6.1', torch: '2.4.0', cudaarch: '' } + + # ROCm 6.2 + - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '', rocm: '6.2', torch: '2.5.0', cudaarch: '' } + - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '', rocm: '6.2', torch: '2.5.0', cudaarch: '' } + - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '', rocm: '6.2', torch: '2.5.0', cudaarch: '' } + + # sdist +# - { artname: 'sdist', os: ubuntu-20.04, pyver: '3.11', cuda: '', rocm: '', torch: '2.3.1', cudaarch: '' } + + # Extra Torch 2.2 wheels for Windows 2022 until PyTorch resolves the shm.dll issue + + # Python 3.8 +# - { artname: 'wheel', os: windows-2022, pyver: '3.8', cuda: '11.8.0', rocm: '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: windows-2022, pyver: '3.8', cuda: '12.1.0', rocm: '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } + +# Python 3.9 +# - { artname: 'wheel', os: windows-2022, pyver: '3.9', cuda: '11.8.0', rocm: '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: windows-2022, pyver: '3.9', cuda: '12.1.0', rocm: '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } + + # Python 3.10 +# - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '11.8.0', rocm: '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '12.1.0', rocm: '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } + + # Python 3.11 +# - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '11.8.0', rocm: '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '12.1.0', rocm: '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } + + # Python 3.12 +# - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '11.8.0', rocm: '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } +# - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '12.1.0', rocm: '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } + + # Extra wheel for HF spaces +# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.0', rocm: '', torch: '2.2.2', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } + + fail-fast: false + + defaults: + run: + shell: pwsh + + steps: + # Free disk space + + - name: Free Disk Space + uses: jlumbroso/free-disk-space@v1.3.1 + if: runner.os == 'Linux' + with: + tool-cache: true + android: true + dotnet: true + haskell: true + large-packages: false + swap-storage: true + + # Setup Python + + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.pyver }} + + # Get version string from package + + - name: Get version string + id: package_version + run: | + $versionString = Get-Content $(Join-Path 'exllamav2' 'version.py') -raw + if ($versionString -match '__version__ = "(\d+\.(?:\d+\.?(?:dev\d+)?)*)"') + { + Write-Output $('::notice file=build-wheels-release.yml,line=200,title=Package Version::Detected package version is: {0}' -f $Matches[1]) + Write-Output "PACKAGE_VERSION=$($Matches[1])" >> "$env:GITHUB_OUTPUT" + } + else + { + Write-Output '::error file=build-wheels-release.yml,line=203::Could not parse version from exllamav2/version.py! You must upload wheels manually!' + Write-Output "PACKAGE_VERSION=None" >> "$env:GITHUB_OUTPUT" + } + + # Pin VS build tools to 17.9 so builds won't fail + + - name: Install VS2022 BuildTools 17.9.7 + run: choco install -y visualstudio2022buildtools --version=117.9.7.0 --params "--add Microsoft.VisualStudio.Component.VC.Tools.x86.x64 --installChannelUri https://aka.ms/vs/17/release/180911598_-255012421/channel" + if: runner.os == 'Windows' + + # Install ROCm SDK, apparently needs to happen before setting up Python + + - name: Build for ROCm + if: matrix.rocm != '' + shell: bash + run: | + # --- Install ROCm SDK + + export ROCM_VERSION=${{ matrix.rocm }} + export TORCH_VERSION=${{ matrix.torch }} + + [ ! -d /etc/apt/keyrings ] && sudo mkdir --parents --mode=0755 /etc/apt/keyrings + wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null + echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/$ROCM_VERSION focal main" | sudo tee --append /etc/apt/sources.list.d/rocm.list + echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600 + + sudo apt update + sudo apt install rocm-hip-sdk -y + sudo apt clean -y + + echo "/opt/rocm/bin" >> $GITHUB_PATH + echo "ROCM_PATH=/opt/rocm" >> $GITHUB_ENV + echo "ROCM_VERSION=$ROCM_VERSION" >> $GITHUB_ENV + echo "USE_ROCM=1" >> $GITHUB_ENV + + # --- Install dependencies + + python3 -m ensurepip --upgrade + pip3 install torch==${{ matrix.torch }} --index-url="https://download.pytorch.org/whl/rocm$ROCM_VERSION" + pip3 install --upgrade setuptools==69.5.1 build wheel safetensors sentencepiece ninja + pip3 cache purge + + # --- Build wheel + + python3 -m build -n --wheel -C--build-option=egg_info "-C--build-option=--tag-build=+rocm${{ matrix.rocm }}-torch${{ matrix.torch }}" + + # Build for CUDA + + - name: Setup Mamba + if: matrix.cuda != '' + uses: conda-incubator/setup-miniconda@v2.3.0 + with: + activate-environment: "exllama" + python-version: ${{ matrix.pyver }} + miniforge-variant: Mambaforge + miniforge-version: latest + use-mamba: true + add-pip-as-python-dependency: true + auto-activate-base: false + + - name: Build for CUDA + if: matrix.cuda != '' + run: | + # --- Spawn the VS shell + if ($IsWindows) { + Import-Module 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools\Microsoft.VisualStudio.DevShell.dll' + Enter-VsDevShell -VsInstallPath 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools' -DevCmdArguments '-arch=x64 -host_arch=x64' + $env:DISTUTILS_USE_SDK=1 + } + + # --- Install CUDA using Conda + $cudaVersion = '${{ matrix.cuda }}' + $cudaVersionPytorch = '${{ matrix.cuda }}'.Remove('${{ matrix.cuda }}'.LastIndexOf('.')).Replace('.','') + + $env:MAMBA_NO_LOW_SPEED_LIMIT = 1 + mamba install -y -c nvidia/label/cuda-$cudaVersion cuda-toolkit cuda-runtime + + if (!(mamba list cuda)[-1].contains('cuda')) {sleep -s 10; mamba install -y 'cuda' $cudaVersion} + if (!(mamba list cuda)[-1].contains('cuda')) {throw 'CUDA Toolkit failed to install!'} + + $env:CUDA_PATH = $env:CONDA_PREFIX + $env:CUDA_HOME = $env:CONDA_PREFIX + if ($IsLinux) {$env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH} + + # --- Install dependencies + + python -m ensurepip --upgrade + python -m pip install torch==${{ matrix.torch }} --index-url https://download.pytorch.org/whl/cu$cudaVersionPytorch + python -m pip install --upgrade setuptools==69.5.1 build wheel safetensors sentencepiece ninja + + # --- Build wheel + + $BUILDTAG = "+cu$cudaVersionPytorch-torch${{ matrix.torch }}" + $env:TORCH_CUDA_ARCH_LIST = '${{ matrix.cudaarch }}' + python -m build -n --wheel -C--build-option=egg_info "-C--build-option=--tag-build=$BUILDTAG" + + # Build sdist + + - name: Build sdist + if: matrix.cuda == '' && matrix.rocm == '' + run: | + # --- Spawn the VS shell + if ($IsWindows) { + Import-Module 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools\Microsoft.VisualStudio.DevShell.dll' + Enter-VsDevShell -VsInstallPath 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools' -DevCmdArguments '-arch=x64 -host_arch=x64' + $env:DISTUTILS_USE_SDK=1 + } + + # --- Install dependencies + + python -m pip install torch==${{ matrix.torch }} --index-url https://download.pytorch.org/whl/cpu + python -m pip install build wheel ninja + + # --- Build wheel + + $env:EXLLAMA_NOCOMPILE=1 + python -m build -n + + # Upload files + + - uses: actions/upload-artifact@v3 + if: matrix.artname == 'wheel' + with: + name: 'wheel' + path: ./dist/* + + - uses: actions/upload-artifact@v3 + if: matrix.artname == 'sdist' + with: + name: 'sdist' + path: ./dist/* + + - name: Upload files to GitHub release + if: steps.package_version.outputs.PACKAGE_VERSION != 'None' && inputs.release == '1' + uses: svenstaro/upload-release-action@2.6.1 + with: + file: ./dist/*.whl + tag: ${{ format('v{0}', steps.package_version.outputs.PACKAGE_VERSION) }} + file_glob: true + overwrite: true + release_name: ${{ steps.package_version.outputs.PACKAGE_VERSION }} diff --git a/exllamav2/generator/dynamic.py b/exllamav2/generator/dynamic.py index 602420da..86dcf35f 100644 --- a/exllamav2/generator/dynamic.py +++ b/exllamav2/generator/dynamic.py @@ -2589,8 +2589,9 @@ def deallocate_pages(self): self.generator.all_pages[0].backup() for seq in self.sequences: - for page in seq.allocated_pages: - page.sub_ref() - seq.allocated_pages = [] + if seq.allocated_pages is not None: + for page in seq.allocated_pages: + page.sub_ref() + seq.allocated_pages = [] self.generator.validate_cache() From c86f62c3b8d829ae3a9d06882a1a47812f393115 Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Thu, 5 Dec 2024 18:02:02 +0100 Subject: [PATCH 3/7] Ensure MRoPE ID tensor is contiguous --- exllamav2/mrope.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exllamav2/mrope.py b/exllamav2/mrope.py index 16ef31de..3b925314 100644 --- a/exllamav2/mrope.py +++ b/exllamav2/mrope.py @@ -36,7 +36,7 @@ def gen_mrope_embed( # Create 3D position IDs - ids = input_ids.squeeze(0) + ids = input_ids.squeeze(0).contiguous() mrope_pos_ids = torch.zeros((3, max_length), dtype = torch.long).contiguous() merge_size = 1 if not embeddings else embeddings[0].model.config.vision_spatial_merge_size spans = [] From c55656cc0ce38e8d32ce399df4bb5fc320873e17 Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Thu, 5 Dec 2024 21:16:36 +0100 Subject: [PATCH 4/7] Fix system RAM consumption while quantizing, fixes #692 --- exllamav2/embedding.py | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/exllamav2/embedding.py b/exllamav2/embedding.py index 9fd67997..f5571f0a 100644 --- a/exllamav2/embedding.py +++ b/exllamav2/embedding.py @@ -186,16 +186,40 @@ def forward( if self.archparams.normalize_embeddings: hidden_states *= cfg.hidden_size ** 0.5 - # Negative tokens during quantization are noise tokens + # Rows with negative tokens during quantization are noise tokens if kwargs.get("negative_ids_noise"): - mask = (input_ids < 0).unsqueeze(-1) - unmasked_values = hidden_states[~mask.expand_as(hidden_states)].float() - mean, std = unmasked_values.mean(), unmasked_values.std() - noise = torch.randn_like(hidden_states, dtype = torch.float) - noise = noise * std + mean - noise = noise.half() - hidden_states = torch.where(mask, noise, hidden_states) + + n = 0 + mean = torch.tensor([0.0], dtype = torch.float, device = hidden_states.device) + M2 = torch.tensor([0.0], dtype = torch.float, device = hidden_states.device) + + for i in range(input_ids.shape[0]): + if input_ids[i][0] < 0: + continue + + er = hidden_states[i].float() + n += er.numel() + delta = er - mean + mean += delta.sum() / n + delta2 = er - mean + M2 += (delta * delta2).sum() + del er + del delta + del delta2 + + if n > 1: + std = torch.sqrt(M2 / (n - 1)) + + for i in range(input_ids.shape[0]): + if input_ids[i][0] >= 0: + continue + + er = hidden_states[i] + noise = torch.randn(er.size(), dtype = torch.float, device = hidden_states.device) * std + mean + er.copy_(noise.half()) + del er + del noise # Move to pinned temp buffer for TP From ba9774f1c828aae75f022c4e3aabc8879854ac9e Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Sat, 7 Dec 2024 15:53:52 +0100 Subject: [PATCH 5/7] Enable noise tokens for Qwen2-VL quantizatino --- exllamav2/architecture.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/exllamav2/architecture.py b/exllamav2/architecture.py index ea7e2941..b2a62803 100644 --- a/exllamav2/architecture.py +++ b/exllamav2/architecture.py @@ -402,6 +402,8 @@ class Params: self.mmp.mlp_bias = True self.mmp.norm = "layernorm" + self.standard_calib_noise = (5, 30) + # Gemma if arch_string == "GemmaForCausalLM": From 83a57c74ed79d8237fbf7cd7b0e28c42eacccef9 Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Sat, 7 Dec 2024 15:55:11 +0100 Subject: [PATCH 6/7] Bump to v0.2.6 --- exllamav2/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exllamav2/version.py b/exllamav2/version.py index 845be453..d1eb7428 100644 --- a/exllamav2/version.py +++ b/exllamav2/version.py @@ -1 +1 @@ -__version__ = "0.2.5" \ No newline at end of file +__version__ = "0.2.6" \ No newline at end of file From 15b5df784ad51c3e38229934eda831eeaa522b66 Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Sat, 7 Dec 2024 15:55:53 +0100 Subject: [PATCH 7/7] Cleanup build actions --- .github/workflows/build-temp.yml | 345 ------------------------------- 1 file changed, 345 deletions(-) delete mode 100644 .github/workflows/build-temp.yml diff --git a/.github/workflows/build-temp.yml b/.github/workflows/build-temp.yml deleted file mode 100644 index 30a8421a..00000000 --- a/.github/workflows/build-temp.yml +++ /dev/null @@ -1,345 +0,0 @@ -name: Build Wheels & Release - -on: - workflow_dispatch: - inputs: - release: - description: 'Release? 1 = yes, 0 = no' - default: '0' - required: true - type: string - -permissions: - contents: write - -jobs: - build_wheels: - name: ${{ matrix.os }} P${{ matrix.pyver }} C${{ matrix.cuda }} R${{ matrix.rocm }} T${{ matrix.torch }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - include: - - # Ubuntu 20.04 CUDA - - # Python 3.8 -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.8', cuda: '11.7.0', rocm: '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.8', cuda: '11.8.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.8', cuda: '12.1.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.8', cuda: '11.8.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.8', cuda: '12.1.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } - - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.8', cuda: '11.8.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } - - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.8', cuda: '12.1.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } - - # Python 3.9 -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.9', cuda: '11.7.0', rocm: '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.9', cuda: '11.8.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.9', cuda: '12.1.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.9', cuda: '11.8.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.9', cuda: '12.1.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.9', cuda: '11.8.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.9', cuda: '12.1.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } - - # Python 3.10 -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '11.7.0', rocm: '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '11.8.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '11.8.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '11.8.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } - - # Python 3.11 -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '11.7.0', rocm: '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '11.8.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '11.8.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '11.8.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } - - # Python 3.12 -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '11.8.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '11.8.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '11.8.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } - - # Windows 2022 CUDA - - # Python 3.8 -# - { artname: 'wheel', os: windows-2022, pyver: '3.8', cuda: '11.7.0', rocm: '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX' } -# - { artname: 'wheel', os: windows-2022, pyver: '3.8', cuda: '11.8.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: windows-2022, pyver: '3.8', cuda: '12.1.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: windows-2022, pyver: '3.8', cuda: '11.8.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: windows-2022, pyver: '3.8', cuda: '12.1.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } - - { artname: 'wheel', os: windows-2022, pyver: '3.8', cuda: '11.8.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } - - { artname: 'wheel', os: windows-2022, pyver: '3.8', cuda: '12.1.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } - - # Python 3.9 -# - { artname: 'wheel', os: windows-2022, pyver: '3.9', cuda: '11.7.0', rocm: '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX' } -# - { artname: 'wheel', os: windows-2022, pyver: '3.9', cuda: '11.8.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: windows-2022, pyver: '3.9', cuda: '12.1.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: windows-2022, pyver: '3.9', cuda: '11.8.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: windows-2022, pyver: '3.9', cuda: '12.1.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: windows-2022, pyver: '3.9', cuda: '11.8.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: windows-2022, pyver: '3.9', cuda: '12.1.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } - - # Python 3.10 -# - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '11.7.0', rocm: '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX' } -# - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '11.8.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '12.1.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '11.8.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '12.1.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '11.8.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '12.1.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } - - # Python 3.11 -# - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '11.7.0', rocm: '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX' } -# - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '11.8.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '12.1.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } - - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '11.8.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } - - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '12.1.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '11.8.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '12.1.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } - - # Python 3.12 - - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '11.8.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } - - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '12.1.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } - - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '11.8.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } - - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '12.1.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } - - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '11.8.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } - - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '12.1.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } - - # Ubuntu 20.04 ROCm - - # ROCm 5.6 -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '', rocm: '5.6', torch: '2.2.2', cudaarch: '' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '', rocm: '5.6', torch: '2.2.2', cudaarch: '' } - - # ROCm 6.0 -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '', rocm: '6.0', torch: '2.3.1', cudaarch: '' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '', rocm: '6.0', torch: '2.3.1', cudaarch: '' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '', rocm: '6.0', torch: '2.3.1', cudaarch: '' } - - # ROCm 6.1 -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '', rocm: '6.1', torch: '2.4.0', cudaarch: '' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '', rocm: '6.1', torch: '2.4.0', cudaarch: '' } -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '', rocm: '6.1', torch: '2.4.0', cudaarch: '' } - - # ROCm 6.2 - - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '', rocm: '6.2', torch: '2.5.0', cudaarch: '' } - - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '', rocm: '6.2', torch: '2.5.0', cudaarch: '' } - - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '', rocm: '6.2', torch: '2.5.0', cudaarch: '' } - - # sdist -# - { artname: 'sdist', os: ubuntu-20.04, pyver: '3.11', cuda: '', rocm: '', torch: '2.3.1', cudaarch: '' } - - # Extra Torch 2.2 wheels for Windows 2022 until PyTorch resolves the shm.dll issue - - # Python 3.8 -# - { artname: 'wheel', os: windows-2022, pyver: '3.8', cuda: '11.8.0', rocm: '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: windows-2022, pyver: '3.8', cuda: '12.1.0', rocm: '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } - -# Python 3.9 -# - { artname: 'wheel', os: windows-2022, pyver: '3.9', cuda: '11.8.0', rocm: '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: windows-2022, pyver: '3.9', cuda: '12.1.0', rocm: '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } - - # Python 3.10 -# - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '11.8.0', rocm: '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '12.1.0', rocm: '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } - - # Python 3.11 -# - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '11.8.0', rocm: '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '12.1.0', rocm: '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } - - # Python 3.12 -# - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '11.8.0', rocm: '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } -# - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '12.1.0', rocm: '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } - - # Extra wheel for HF spaces -# - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.0', rocm: '', torch: '2.2.2', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' } - - fail-fast: false - - defaults: - run: - shell: pwsh - - steps: - # Free disk space - - - name: Free Disk Space - uses: jlumbroso/free-disk-space@v1.3.1 - if: runner.os == 'Linux' - with: - tool-cache: true - android: true - dotnet: true - haskell: true - large-packages: false - swap-storage: true - - # Setup Python - - - uses: actions/checkout@v3 - - - uses: actions/setup-python@v3 - with: - python-version: ${{ matrix.pyver }} - - # Get version string from package - - - name: Get version string - id: package_version - run: | - $versionString = Get-Content $(Join-Path 'exllamav2' 'version.py') -raw - if ($versionString -match '__version__ = "(\d+\.(?:\d+\.?(?:dev\d+)?)*)"') - { - Write-Output $('::notice file=build-wheels-release.yml,line=200,title=Package Version::Detected package version is: {0}' -f $Matches[1]) - Write-Output "PACKAGE_VERSION=$($Matches[1])" >> "$env:GITHUB_OUTPUT" - } - else - { - Write-Output '::error file=build-wheels-release.yml,line=203::Could not parse version from exllamav2/version.py! You must upload wheels manually!' - Write-Output "PACKAGE_VERSION=None" >> "$env:GITHUB_OUTPUT" - } - - # Pin VS build tools to 17.9 so builds won't fail - - - name: Install VS2022 BuildTools 17.9.7 - run: choco install -y visualstudio2022buildtools --version=117.9.7.0 --params "--add Microsoft.VisualStudio.Component.VC.Tools.x86.x64 --installChannelUri https://aka.ms/vs/17/release/180911598_-255012421/channel" - if: runner.os == 'Windows' - - # Install ROCm SDK, apparently needs to happen before setting up Python - - - name: Build for ROCm - if: matrix.rocm != '' - shell: bash - run: | - # --- Install ROCm SDK - - export ROCM_VERSION=${{ matrix.rocm }} - export TORCH_VERSION=${{ matrix.torch }} - - [ ! -d /etc/apt/keyrings ] && sudo mkdir --parents --mode=0755 /etc/apt/keyrings - wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null - echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/$ROCM_VERSION focal main" | sudo tee --append /etc/apt/sources.list.d/rocm.list - echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600 - - sudo apt update - sudo apt install rocm-hip-sdk -y - sudo apt clean -y - - echo "/opt/rocm/bin" >> $GITHUB_PATH - echo "ROCM_PATH=/opt/rocm" >> $GITHUB_ENV - echo "ROCM_VERSION=$ROCM_VERSION" >> $GITHUB_ENV - echo "USE_ROCM=1" >> $GITHUB_ENV - - # --- Install dependencies - - python3 -m ensurepip --upgrade - pip3 install torch==${{ matrix.torch }} --index-url="https://download.pytorch.org/whl/rocm$ROCM_VERSION" - pip3 install --upgrade setuptools==69.5.1 build wheel safetensors sentencepiece ninja - pip3 cache purge - - # --- Build wheel - - python3 -m build -n --wheel -C--build-option=egg_info "-C--build-option=--tag-build=+rocm${{ matrix.rocm }}-torch${{ matrix.torch }}" - - # Build for CUDA - - - name: Setup Mamba - if: matrix.cuda != '' - uses: conda-incubator/setup-miniconda@v2.3.0 - with: - activate-environment: "exllama" - python-version: ${{ matrix.pyver }} - miniforge-variant: Mambaforge - miniforge-version: latest - use-mamba: true - add-pip-as-python-dependency: true - auto-activate-base: false - - - name: Build for CUDA - if: matrix.cuda != '' - run: | - # --- Spawn the VS shell - if ($IsWindows) { - Import-Module 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools\Microsoft.VisualStudio.DevShell.dll' - Enter-VsDevShell -VsInstallPath 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools' -DevCmdArguments '-arch=x64 -host_arch=x64' - $env:DISTUTILS_USE_SDK=1 - } - - # --- Install CUDA using Conda - $cudaVersion = '${{ matrix.cuda }}' - $cudaVersionPytorch = '${{ matrix.cuda }}'.Remove('${{ matrix.cuda }}'.LastIndexOf('.')).Replace('.','') - - $env:MAMBA_NO_LOW_SPEED_LIMIT = 1 - mamba install -y -c nvidia/label/cuda-$cudaVersion cuda-toolkit cuda-runtime - - if (!(mamba list cuda)[-1].contains('cuda')) {sleep -s 10; mamba install -y 'cuda' $cudaVersion} - if (!(mamba list cuda)[-1].contains('cuda')) {throw 'CUDA Toolkit failed to install!'} - - $env:CUDA_PATH = $env:CONDA_PREFIX - $env:CUDA_HOME = $env:CONDA_PREFIX - if ($IsLinux) {$env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH} - - # --- Install dependencies - - python -m ensurepip --upgrade - python -m pip install torch==${{ matrix.torch }} --index-url https://download.pytorch.org/whl/cu$cudaVersionPytorch - python -m pip install --upgrade setuptools==69.5.1 build wheel safetensors sentencepiece ninja - - # --- Build wheel - - $BUILDTAG = "+cu$cudaVersionPytorch-torch${{ matrix.torch }}" - $env:TORCH_CUDA_ARCH_LIST = '${{ matrix.cudaarch }}' - python -m build -n --wheel -C--build-option=egg_info "-C--build-option=--tag-build=$BUILDTAG" - - # Build sdist - - - name: Build sdist - if: matrix.cuda == '' && matrix.rocm == '' - run: | - # --- Spawn the VS shell - if ($IsWindows) { - Import-Module 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools\Microsoft.VisualStudio.DevShell.dll' - Enter-VsDevShell -VsInstallPath 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools' -DevCmdArguments '-arch=x64 -host_arch=x64' - $env:DISTUTILS_USE_SDK=1 - } - - # --- Install dependencies - - python -m pip install torch==${{ matrix.torch }} --index-url https://download.pytorch.org/whl/cpu - python -m pip install build wheel ninja - - # --- Build wheel - - $env:EXLLAMA_NOCOMPILE=1 - python -m build -n - - # Upload files - - - uses: actions/upload-artifact@v3 - if: matrix.artname == 'wheel' - with: - name: 'wheel' - path: ./dist/* - - - uses: actions/upload-artifact@v3 - if: matrix.artname == 'sdist' - with: - name: 'sdist' - path: ./dist/* - - - name: Upload files to GitHub release - if: steps.package_version.outputs.PACKAGE_VERSION != 'None' && inputs.release == '1' - uses: svenstaro/upload-release-action@2.6.1 - with: - file: ./dist/*.whl - tag: ${{ format('v{0}', steps.package_version.outputs.PACKAGE_VERSION) }} - file_glob: true - overwrite: true - release_name: ${{ steps.package_version.outputs.PACKAGE_VERSION }}