From 663eea1b535f4e1967e559c45926c6415fc0d91e Mon Sep 17 00:00:00 2001
From: turboderp <11859846+turboderp@users.noreply.github.com>
Date: Sun, 1 Dec 2024 20:09:40 +0100
Subject: [PATCH 1/7] Fix 64-bit dtype for MSVC

---
 exllamav2/exllamav2_ext/ext_rope.cpp | 42 ++++++++++++++--------------
 exllamav2/exllamav2_ext/ext_rope.h   |  6 ++--
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/exllamav2/exllamav2_ext/ext_rope.cpp b/exllamav2/exllamav2_ext/ext_rope.cpp
index d79a22b8..c7d58025 100644
--- a/exllamav2/exllamav2_ext/ext_rope.cpp
+++ b/exllamav2/exllamav2_ext/ext_rope.cpp
@@ -58,50 +58,50 @@ void rope_
     );
 }
 
-long gen_mrope_pos_ids
+int64_t gen_mrope_pos_ids
 (
     torch::Tensor mrope_pos_ids,
     torch::Tensor ids,
     int merge_size,
-    const std::vector<std::tuple<long, long>> &spans,
-    const std::vector<std::tuple<long, long, long>> &grids
+    const std::vector<std::tuple<int64_t, int64_t>> &spans,
+    const std::vector<std::tuple<int64_t, int64_t, int64_t>> &grids
 )
 {
     int max_length = mrope_pos_ids.size(1);
     int in_length = ids.size(0);
 
-    long* in_ids = (long*) ids.data_ptr();
-    long* pos_ids = (long*) mrope_pos_ids.data_ptr();
+    int64_t* in_ids = (int64_t*) ids.data_ptr();
+    int64_t* pos_ids = (int64_t*) mrope_pos_ids.data_ptr();
 
-    long* out_t = pos_ids;
-    long* out_h = pos_ids + max_length;
-    long* out_w = pos_ids + 2 * max_length;
+    int64_t* out_t = pos_ids;
+    int64_t* out_h = pos_ids + max_length;
+    int64_t* out_w = pos_ids + 2 * max_length;
 
-    long base_t = 0;
-    long next_base_t = 0;
+    int64_t base_t = 0;
+    int64_t next_base_t = 0;
 
     for (int i = 0; i < max_length; ++i)
     {
         bool is_emb = false;
         if (i < in_length)
         {
-            long id = in_ids[i];
+            int64_t id = in_ids[i];
 
             for (int j = 0; j < spans.size(); ++j)
             {
-                long span_start = std::get<0>(spans[j]);
-                long span_end = std::get<1>(spans[j]);
-                long span = span_end - span_start;
+                int64_t span_start = std::get<0>(spans[j]);
+                int64_t span_end = std::get<1>(spans[j]);
+                int64_t span = span_end - span_start;
                 if (id >= span_start && id < span_end)
                 {
                     is_emb = true;
-                    long k = id - span_start;
-                    long grid_t = std::get<0>(grids[j]);
-                    long grid_h = std::get<1>(grids[j]) / (long)merge_size;
-                    long grid_w = std::get<2>(grids[j]) / (long)merge_size;
-                    long k_t = base_t + (k / grid_w / grid_h) % grid_t;
-                    long k_h = base_t + (k / grid_w) % grid_h;
-                    long k_w = base_t + k % grid_w;
+                    int64_t k = id - span_start;
+                    int64_t grid_t = std::get<0>(grids[j]);
+                    int64_t grid_h = std::get<1>(grids[j]) / (int64_t)merge_size;
+                    int64_t grid_w = std::get<2>(grids[j]) / (int64_t)merge_size;
+                    int64_t k_t = base_t + (k / grid_w / grid_h) % grid_t;
+                    int64_t k_h = base_t + (k / grid_w) % grid_h;
+                    int64_t k_w = base_t + k % grid_w;
                     *out_t++ = k_t;
                     *out_h++ = k_h;
                     *out_w++ = k_w;
diff --git a/exllamav2/exllamav2_ext/ext_rope.h b/exllamav2/exllamav2_ext/ext_rope.h
index 17adebd4..2a41b22c 100644
--- a/exllamav2/exllamav2_ext/ext_rope.h
+++ b/exllamav2/exllamav2_ext/ext_rope.h
@@ -11,11 +11,11 @@ void rope_
     bool neox_style
 );
 
-long gen_mrope_pos_ids
+int64_t gen_mrope_pos_ids
 (
     torch::Tensor mrope_pos_ids,
     torch::Tensor ids,
     int merge_size,
-    const std::vector<std::tuple<long, long>> &spans,
-    const std::vector<std::tuple<long, long, long>> &grids
+    const std::vector<std::tuple<int64_t, int64_t>> &spans,
+    const std::vector<std::tuple<int64_t, int64_t, int64_t>> &grids
 );
\ No newline at end of file

From db78601226c3f9ad92b55e1d08087072559356a7 Mon Sep 17 00:00:00 2001
From: Philipp Emanuel Weidmann <pew@worldwidemann.com>
Date: Mon, 2 Dec 2024 02:32:32 +0530
Subject: [PATCH 2/7] Prevent NPE in `deallocate_pages` (#688)

Prevent NPE in `deallocate_pages`

If `deallocate_pages` is called on a job for which `allocate_pages`
has not been called (see `iterate_start_jobs` for conditions under
which this is true), `allocated_pages` is `None`, raising an NPE
(in Python, a `TypeError`: 'NoneType' object is not iterable) when
attempting to iterate.

In particular, this prevents `clear_queue` from working. In
practice, this problem readily occurs when starting a few jobs
and then calling `clear_queue`.
---
 .github/workflows/build-temp.yml | 345 +++++++++++++++++++++++++++++++
 exllamav2/generator/dynamic.py   |   7 +-
 2 files changed, 349 insertions(+), 3 deletions(-)
 create mode 100644 .github/workflows/build-temp.yml

diff --git a/.github/workflows/build-temp.yml b/.github/workflows/build-temp.yml
new file mode 100644
index 00000000..30a8421a
--- /dev/null
+++ b/.github/workflows/build-temp.yml
@@ -0,0 +1,345 @@
+name: Build Wheels & Release
+
+on:
+  workflow_dispatch:
+    inputs:
+      release:
+        description: 'Release? 1 = yes, 0 = no'
+        default: '0'
+        required: true
+        type: string
+
+permissions:
+  contents: write
+
+jobs:
+  build_wheels:
+    name: ${{ matrix.os }} P${{ matrix.pyver }} C${{ matrix.cuda }} R${{ matrix.rocm }} T${{ matrix.torch }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        include:
+
+        # Ubuntu 20.04 CUDA
+
+        # Python 3.8
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver:  '3.8', cuda: '11.7.0', rocm:    '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX'         }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver:  '3.8', cuda: '11.8.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver:  '3.8', cuda: '12.1.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver:  '3.8', cuda: '11.8.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver:  '3.8', cuda: '12.1.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+         - { artname: 'wheel', os: ubuntu-20.04, pyver:  '3.8', cuda: '11.8.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+         - { artname: 'wheel', os: ubuntu-20.04, pyver:  '3.8', cuda: '12.1.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+
+        # Python 3.9
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver:  '3.9', cuda: '11.7.0', rocm:    '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX'         }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver:  '3.9', cuda: '11.8.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver:  '3.9', cuda: '12.1.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver:  '3.9', cuda: '11.8.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver:  '3.9', cuda: '12.1.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver:  '3.9', cuda: '11.8.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver:  '3.9', cuda: '12.1.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+
+        # Python 3.10
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '11.7.0', rocm:    '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX'         }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '11.8.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '11.8.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '11.8.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+
+        # Python 3.11
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '11.7.0', rocm:    '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX'         }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '11.8.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '11.8.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '11.8.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+
+        # Python 3.12
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '11.8.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '11.8.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '11.8.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+
+        # Windows 2022 CUDA
+
+        # Python 3.8
+#         - { artname: 'wheel', os: windows-2022, pyver:  '3.8', cuda: '11.7.0', rocm:    '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX'         }
+#         - { artname: 'wheel', os: windows-2022, pyver:  '3.8', cuda: '11.8.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: windows-2022, pyver:  '3.8', cuda: '12.1.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: windows-2022, pyver:  '3.8', cuda: '11.8.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: windows-2022, pyver:  '3.8', cuda: '12.1.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+         - { artname: 'wheel', os: windows-2022, pyver:  '3.8', cuda: '11.8.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+         - { artname: 'wheel', os: windows-2022, pyver:  '3.8', cuda: '12.1.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+
+        # Python 3.9
+#         - { artname: 'wheel', os: windows-2022, pyver:  '3.9', cuda: '11.7.0', rocm:    '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX'         }
+#         - { artname: 'wheel', os: windows-2022, pyver:  '3.9', cuda: '11.8.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: windows-2022, pyver:  '3.9', cuda: '12.1.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: windows-2022, pyver:  '3.9', cuda: '11.8.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: windows-2022, pyver:  '3.9', cuda: '12.1.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: windows-2022, pyver:  '3.9', cuda: '11.8.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: windows-2022, pyver:  '3.9', cuda: '12.1.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+
+        # Python 3.10
+#         - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '11.7.0', rocm:    '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX'         }
+#         - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '11.8.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '12.1.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '11.8.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '12.1.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '11.8.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '12.1.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+
+        # Python 3.11
+#         - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '11.7.0', rocm:    '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX'         }
+#         - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '11.8.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '12.1.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+         - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '11.8.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+         - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '12.1.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '11.8.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '12.1.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+
+        # Python 3.12
+         - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '11.8.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+         - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '12.1.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+         - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '11.8.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+         - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '12.1.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+         - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '11.8.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+         - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '12.1.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+
+        # Ubuntu 20.04 ROCm
+
+        # ROCm 5.6
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '',       rocm: '5.6', torch: '2.2.2', cudaarch: ''                                    }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '',       rocm: '5.6', torch: '2.2.2', cudaarch: ''                                    }
+
+        # ROCm 6.0
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '',       rocm: '6.0', torch: '2.3.1', cudaarch: ''                                    }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '',       rocm: '6.0', torch: '2.3.1', cudaarch: ''                                    }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '',       rocm: '6.0', torch: '2.3.1', cudaarch: ''                                    }
+
+        # ROCm 6.1
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '',       rocm: '6.1', torch: '2.4.0', cudaarch: ''                                    }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '',       rocm: '6.1', torch: '2.4.0', cudaarch: ''                                    }
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '',       rocm: '6.1', torch: '2.4.0', cudaarch: ''                                    }
+
+        # ROCm 6.2
+         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '',       rocm: '6.2', torch: '2.5.0', cudaarch: ''                                    }
+         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '',       rocm: '6.2', torch: '2.5.0', cudaarch: ''                                    }
+         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '',       rocm: '6.2', torch: '2.5.0', cudaarch: ''                                    }
+
+         # sdist
+#         - { artname: 'sdist', os: ubuntu-20.04, pyver: '3.11', cuda: '',       rocm:    '', torch: '2.3.1', cudaarch: ''                                    }
+
+         # Extra Torch 2.2 wheels for Windows 2022 until PyTorch resolves the shm.dll issue
+
+        # Python 3.8
+#         - { artname: 'wheel', os: windows-2022, pyver:  '3.8', cuda: '11.8.0', rocm:    '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: windows-2022, pyver:  '3.8', cuda: '12.1.0', rocm:    '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+
+#         Python 3.9
+#         - { artname: 'wheel', os: windows-2022, pyver:  '3.9', cuda: '11.8.0', rocm:    '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: windows-2022, pyver:  '3.9', cuda: '12.1.0', rocm:    '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+
+        # Python 3.10
+#         - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '11.8.0', rocm:    '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '12.1.0', rocm:    '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+
+        # Python 3.11
+#         - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '11.8.0', rocm:    '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '12.1.0', rocm:    '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+
+        # Python 3.12
+#         - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '11.8.0', rocm:    '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+#         - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '12.1.0', rocm:    '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+
+         # Extra wheel for HF spaces
+#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.0', rocm:    '', torch: '2.2.2', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
+
+      fail-fast: false
+
+    defaults:
+      run:
+        shell: pwsh
+
+    steps:
+      # Free disk space
+
+      - name: Free Disk Space
+        uses: jlumbroso/free-disk-space@v1.3.1
+        if: runner.os == 'Linux'
+        with:
+          tool-cache: true
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: false
+          swap-storage: true
+
+      # Setup Python
+
+      - uses: actions/checkout@v3
+
+      - uses: actions/setup-python@v3
+        with:
+          python-version: ${{ matrix.pyver }}
+
+      # Get version string from package
+
+      - name: Get version string
+        id: package_version
+        run: | 
+          $versionString = Get-Content $(Join-Path 'exllamav2' 'version.py') -raw
+          if ($versionString -match '__version__ = "(\d+\.(?:\d+\.?(?:dev\d+)?)*)"') 
+          {
+            Write-Output $('::notice file=build-wheels-release.yml,line=200,title=Package Version::Detected package version is: {0}' -f $Matches[1])
+            Write-Output "PACKAGE_VERSION=$($Matches[1])" >> "$env:GITHUB_OUTPUT"
+          }
+          else
+          {
+            Write-Output '::error file=build-wheels-release.yml,line=203::Could not parse version from exllamav2/version.py! You must upload wheels manually!'
+            Write-Output "PACKAGE_VERSION=None" >> "$env:GITHUB_OUTPUT"
+          }
+
+      # Pin VS build tools to 17.9 so builds won't fail
+
+      - name: Install VS2022 BuildTools 17.9.7
+        run: choco install -y visualstudio2022buildtools --version=117.9.7.0 --params "--add Microsoft.VisualStudio.Component.VC.Tools.x86.x64 --installChannelUri https://aka.ms/vs/17/release/180911598_-255012421/channel"
+        if: runner.os == 'Windows'
+
+      # Install ROCm SDK, apparently needs to happen before setting up Python
+
+      - name: Build for ROCm
+        if: matrix.rocm != ''
+        shell: bash
+        run: |
+          # --- Install ROCm SDK
+
+          export ROCM_VERSION=${{ matrix.rocm }}
+          export TORCH_VERSION=${{ matrix.torch }}
+
+          [ ! -d /etc/apt/keyrings ] && sudo mkdir --parents --mode=0755 /etc/apt/keyrings
+          wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
+          echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/$ROCM_VERSION focal main" | sudo tee --append /etc/apt/sources.list.d/rocm.list
+          echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
+          
+          sudo apt update
+          sudo apt install rocm-hip-sdk -y
+          sudo apt clean -y
+
+          echo "/opt/rocm/bin" >> $GITHUB_PATH
+          echo "ROCM_PATH=/opt/rocm" >> $GITHUB_ENV
+          echo "ROCM_VERSION=$ROCM_VERSION" >> $GITHUB_ENV
+          echo "USE_ROCM=1" >> $GITHUB_ENV
+
+          # --- Install dependencies
+
+          python3 -m ensurepip --upgrade
+          pip3 install torch==${{ matrix.torch }} --index-url="https://download.pytorch.org/whl/rocm$ROCM_VERSION"
+          pip3 install --upgrade setuptools==69.5.1 build wheel safetensors sentencepiece ninja
+          pip3 cache purge
+
+          # --- Build wheel 
+
+          python3 -m build -n --wheel -C--build-option=egg_info "-C--build-option=--tag-build=+rocm${{ matrix.rocm }}-torch${{ matrix.torch }}"
+
+      # Build for CUDA
+
+      - name: Setup Mamba
+        if: matrix.cuda != ''
+        uses: conda-incubator/setup-miniconda@v2.3.0
+        with:
+          activate-environment: "exllama"
+          python-version: ${{ matrix.pyver }}
+          miniforge-variant: Mambaforge
+          miniforge-version: latest
+          use-mamba: true
+          add-pip-as-python-dependency: true
+          auto-activate-base: false
+
+      - name: Build for CUDA
+        if: matrix.cuda != ''
+        run: |
+          # --- Spawn the VS shell
+          if ($IsWindows) {
+            Import-Module 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+            Enter-VsDevShell -VsInstallPath 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools' -DevCmdArguments '-arch=x64 -host_arch=x64'
+            $env:DISTUTILS_USE_SDK=1
+          }
+  
+          # --- Install CUDA using Conda
+          $cudaVersion = '${{ matrix.cuda }}'
+          $cudaVersionPytorch = '${{ matrix.cuda }}'.Remove('${{ matrix.cuda }}'.LastIndexOf('.')).Replace('.','')
+
+          $env:MAMBA_NO_LOW_SPEED_LIMIT = 1
+          mamba install -y -c nvidia/label/cuda-$cudaVersion cuda-toolkit cuda-runtime
+
+          if (!(mamba list cuda)[-1].contains('cuda')) {sleep -s 10; mamba install -y 'cuda' $cudaVersion}
+          if (!(mamba list cuda)[-1].contains('cuda')) {throw 'CUDA Toolkit failed to install!'}
+
+          $env:CUDA_PATH = $env:CONDA_PREFIX
+          $env:CUDA_HOME = $env:CONDA_PREFIX
+          if ($IsLinux) {$env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH}
+          
+          # --- Install dependencies
+          
+          python -m ensurepip --upgrade
+          python -m pip install torch==${{ matrix.torch }} --index-url https://download.pytorch.org/whl/cu$cudaVersionPytorch
+          python -m pip install --upgrade setuptools==69.5.1 build wheel safetensors sentencepiece ninja
+
+          # --- Build wheel
+                  
+          $BUILDTAG = "+cu$cudaVersionPytorch-torch${{ matrix.torch }}"
+          $env:TORCH_CUDA_ARCH_LIST = '${{ matrix.cudaarch }}'
+          python -m build -n --wheel -C--build-option=egg_info "-C--build-option=--tag-build=$BUILDTAG"
+
+      # Build sdist
+
+      - name: Build sdist
+        if: matrix.cuda == '' && matrix.rocm == ''
+        run: |
+          # --- Spawn the VS shell
+          if ($IsWindows) {
+            Import-Module 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+            Enter-VsDevShell -VsInstallPath 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools' -DevCmdArguments '-arch=x64 -host_arch=x64'
+            $env:DISTUTILS_USE_SDK=1
+          }
+
+          # --- Install dependencies
+          
+          python -m pip install torch==${{ matrix.torch }} --index-url https://download.pytorch.org/whl/cpu
+          python -m pip install build wheel ninja
+
+          # --- Build wheel
+           
+          $env:EXLLAMA_NOCOMPILE=1
+          python -m build -n
+
+      # Upload files
+
+      - uses: actions/upload-artifact@v3
+        if: matrix.artname == 'wheel'
+        with:
+          name: 'wheel'
+          path: ./dist/*
+
+      - uses: actions/upload-artifact@v3
+        if: matrix.artname == 'sdist'
+        with:
+          name: 'sdist'
+          path: ./dist/*
+
+      - name: Upload files to GitHub release
+        if: steps.package_version.outputs.PACKAGE_VERSION != 'None' && inputs.release == '1'
+        uses: svenstaro/upload-release-action@2.6.1
+        with:
+          file: ./dist/*.whl
+          tag: ${{ format('v{0}', steps.package_version.outputs.PACKAGE_VERSION) }}
+          file_glob: true
+          overwrite: true
+          release_name: ${{ steps.package_version.outputs.PACKAGE_VERSION }}
diff --git a/exllamav2/generator/dynamic.py b/exllamav2/generator/dynamic.py
index 602420da..86dcf35f 100644
--- a/exllamav2/generator/dynamic.py
+++ b/exllamav2/generator/dynamic.py
@@ -2589,8 +2589,9 @@ def deallocate_pages(self):
             self.generator.all_pages[0].backup()
 
         for seq in self.sequences:
-            for page in seq.allocated_pages:
-                page.sub_ref()
-            seq.allocated_pages = []
+            if seq.allocated_pages is not None:
+                for page in seq.allocated_pages:
+                    page.sub_ref()
+                seq.allocated_pages = []
 
         self.generator.validate_cache()

From c86f62c3b8d829ae3a9d06882a1a47812f393115 Mon Sep 17 00:00:00 2001
From: turboderp <11859846+turboderp@users.noreply.github.com>
Date: Thu, 5 Dec 2024 18:02:02 +0100
Subject: [PATCH 3/7] Ensure MRoPE ID tensor is contiguous

---
 exllamav2/mrope.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/exllamav2/mrope.py b/exllamav2/mrope.py
index 16ef31de..3b925314 100644
--- a/exllamav2/mrope.py
+++ b/exllamav2/mrope.py
@@ -36,7 +36,7 @@ def gen_mrope_embed(
 
     # Create 3D position IDs
 
-    ids = input_ids.squeeze(0)
+    ids = input_ids.squeeze(0).contiguous()
     mrope_pos_ids = torch.zeros((3, max_length), dtype = torch.long).contiguous()
     merge_size = 1 if not embeddings else embeddings[0].model.config.vision_spatial_merge_size
     spans = []

From c55656cc0ce38e8d32ce399df4bb5fc320873e17 Mon Sep 17 00:00:00 2001
From: turboderp <11859846+turboderp@users.noreply.github.com>
Date: Thu, 5 Dec 2024 21:16:36 +0100
Subject: [PATCH 4/7] Fix system RAM consumption while quantizing, fixes #692

---
 exllamav2/embedding.py | 40 ++++++++++++++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 8 deletions(-)

diff --git a/exllamav2/embedding.py b/exllamav2/embedding.py
index 9fd67997..f5571f0a 100644
--- a/exllamav2/embedding.py
+++ b/exllamav2/embedding.py
@@ -186,16 +186,40 @@ def forward(
             if self.archparams.normalize_embeddings:
                 hidden_states *= cfg.hidden_size ** 0.5
 
-        # Negative tokens during quantization are noise tokens
+        # Rows with negative tokens during quantization are noise tokens
 
         if kwargs.get("negative_ids_noise"):
-            mask = (input_ids < 0).unsqueeze(-1)
-            unmasked_values = hidden_states[~mask.expand_as(hidden_states)].float()
-            mean, std = unmasked_values.mean(), unmasked_values.std()
-            noise = torch.randn_like(hidden_states, dtype = torch.float)
-            noise = noise * std + mean
-            noise = noise.half()
-            hidden_states = torch.where(mask, noise, hidden_states)
+
+            n = 0
+            mean = torch.tensor([0.0], dtype = torch.float, device = hidden_states.device)
+            M2 = torch.tensor([0.0], dtype = torch.float, device = hidden_states.device)
+
+            for i in range(input_ids.shape[0]):
+                if input_ids[i][0] < 0:
+                    continue
+
+                er = hidden_states[i].float()
+                n += er.numel()
+                delta = er - mean
+                mean += delta.sum() / n
+                delta2 = er - mean
+                M2 += (delta * delta2).sum()
+                del er
+                del delta
+                del delta2
+
+            if n > 1:
+                std = torch.sqrt(M2 / (n - 1))
+
+            for i in range(input_ids.shape[0]):
+                if input_ids[i][0] >= 0:
+                    continue
+
+                er = hidden_states[i]
+                noise = torch.randn(er.size(), dtype = torch.float, device = hidden_states.device) * std + mean
+                er.copy_(noise.half())
+                del er
+                del noise
 
         # Move to pinned temp buffer for TP
 

From ba9774f1c828aae75f022c4e3aabc8879854ac9e Mon Sep 17 00:00:00 2001
From: turboderp <11859846+turboderp@users.noreply.github.com>
Date: Sat, 7 Dec 2024 15:53:52 +0100
Subject: [PATCH 5/7] Enable noise tokens for Qwen2-VL quantization

---
 exllamav2/architecture.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/exllamav2/architecture.py b/exllamav2/architecture.py
index ea7e2941..b2a62803 100644
--- a/exllamav2/architecture.py
+++ b/exllamav2/architecture.py
@@ -402,6 +402,8 @@ class Params:
             self.mmp.mlp_bias = True
             self.mmp.norm = "layernorm"
 
+            self.standard_calib_noise = (5, 30)
+
         # Gemma
 
         if arch_string == "GemmaForCausalLM":

From 83a57c74ed79d8237fbf7cd7b0e28c42eacccef9 Mon Sep 17 00:00:00 2001
From: turboderp <11859846+turboderp@users.noreply.github.com>
Date: Sat, 7 Dec 2024 15:55:11 +0100
Subject: [PATCH 6/7] Bump to v0.2.6

---
 exllamav2/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/exllamav2/version.py b/exllamav2/version.py
index 845be453..d1eb7428 100644
--- a/exllamav2/version.py
+++ b/exllamav2/version.py
@@ -1 +1 @@
-__version__ = "0.2.5"
\ No newline at end of file
+__version__ = "0.2.6"
\ No newline at end of file

From 15b5df784ad51c3e38229934eda831eeaa522b66 Mon Sep 17 00:00:00 2001
From: turboderp <11859846+turboderp@users.noreply.github.com>
Date: Sat, 7 Dec 2024 15:55:53 +0100
Subject: [PATCH 7/7] Cleanup build actions

---
 .github/workflows/build-temp.yml | 345 -------------------------------
 1 file changed, 345 deletions(-)
 delete mode 100644 .github/workflows/build-temp.yml

diff --git a/.github/workflows/build-temp.yml b/.github/workflows/build-temp.yml
deleted file mode 100644
index 30a8421a..00000000
--- a/.github/workflows/build-temp.yml
+++ /dev/null
@@ -1,345 +0,0 @@
-name: Build Wheels & Release
-
-on:
-  workflow_dispatch:
-    inputs:
-      release:
-        description: 'Release? 1 = yes, 0 = no'
-        default: '0'
-        required: true
-        type: string
-
-permissions:
-  contents: write
-
-jobs:
-  build_wheels:
-    name: ${{ matrix.os }} P${{ matrix.pyver }} C${{ matrix.cuda }} R${{ matrix.rocm }} T${{ matrix.torch }}
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        include:
-
-        # Ubuntu 20.04 CUDA
-
-        # Python 3.8
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver:  '3.8', cuda: '11.7.0', rocm:    '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX'         }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver:  '3.8', cuda: '11.8.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver:  '3.8', cuda: '12.1.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver:  '3.8', cuda: '11.8.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver:  '3.8', cuda: '12.1.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-         - { artname: 'wheel', os: ubuntu-20.04, pyver:  '3.8', cuda: '11.8.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-         - { artname: 'wheel', os: ubuntu-20.04, pyver:  '3.8', cuda: '12.1.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-
-        # Python 3.9
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver:  '3.9', cuda: '11.7.0', rocm:    '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX'         }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver:  '3.9', cuda: '11.8.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver:  '3.9', cuda: '12.1.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver:  '3.9', cuda: '11.8.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver:  '3.9', cuda: '12.1.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver:  '3.9', cuda: '11.8.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver:  '3.9', cuda: '12.1.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-
-        # Python 3.10
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '11.7.0', rocm:    '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX'         }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '11.8.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '11.8.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '11.8.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-
-        # Python 3.11
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '11.7.0', rocm:    '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX'         }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '11.8.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '11.8.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '11.8.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-
-        # Python 3.12
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '11.8.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '11.8.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '11.8.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-
-        # Windows 2022 CUDA
-
-        # Python 3.8
-#         - { artname: 'wheel', os: windows-2022, pyver:  '3.8', cuda: '11.7.0', rocm:    '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX'         }
-#         - { artname: 'wheel', os: windows-2022, pyver:  '3.8', cuda: '11.8.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: windows-2022, pyver:  '3.8', cuda: '12.1.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: windows-2022, pyver:  '3.8', cuda: '11.8.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: windows-2022, pyver:  '3.8', cuda: '12.1.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-         - { artname: 'wheel', os: windows-2022, pyver:  '3.8', cuda: '11.8.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-         - { artname: 'wheel', os: windows-2022, pyver:  '3.8', cuda: '12.1.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-
-        # Python 3.9
-#         - { artname: 'wheel', os: windows-2022, pyver:  '3.9', cuda: '11.7.0', rocm:    '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX'         }
-#         - { artname: 'wheel', os: windows-2022, pyver:  '3.9', cuda: '11.8.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: windows-2022, pyver:  '3.9', cuda: '12.1.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: windows-2022, pyver:  '3.9', cuda: '11.8.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: windows-2022, pyver:  '3.9', cuda: '12.1.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: windows-2022, pyver:  '3.9', cuda: '11.8.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: windows-2022, pyver:  '3.9', cuda: '12.1.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-
-        # Python 3.10
-#         - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '11.7.0', rocm:    '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX'         }
-#         - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '11.8.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '12.1.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '11.8.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '12.1.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '11.8.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '12.1.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-
-        # Python 3.11
-#         - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '11.7.0', rocm:    '', torch: '2.0.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6+PTX'         }
-#         - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '11.8.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '12.1.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-         - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '11.8.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-         - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '12.1.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '11.8.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '12.1.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-
-        # Python 3.12
-         - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '11.8.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-         - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '12.1.0', rocm:    '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-         - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '11.8.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-         - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '12.1.0', rocm:    '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-         - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '11.8.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-         - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '12.1.0', rocm:    '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-
-        # Ubuntu 20.04 ROCm
-
-        # ROCm 5.6
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '',       rocm: '5.6', torch: '2.2.2', cudaarch: ''                                    }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '',       rocm: '5.6', torch: '2.2.2', cudaarch: ''                                    }
-
-        # ROCm 6.0
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '',       rocm: '6.0', torch: '2.3.1', cudaarch: ''                                    }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '',       rocm: '6.0', torch: '2.3.1', cudaarch: ''                                    }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '',       rocm: '6.0', torch: '2.3.1', cudaarch: ''                                    }
-
-        # ROCm 6.1
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '',       rocm: '6.1', torch: '2.4.0', cudaarch: ''                                    }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '',       rocm: '6.1', torch: '2.4.0', cudaarch: ''                                    }
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '',       rocm: '6.1', torch: '2.4.0', cudaarch: ''                                    }
-
-        # ROCm 6.2
-         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '',       rocm: '6.2', torch: '2.5.0', cudaarch: ''                                    }
-         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.11', cuda: '',       rocm: '6.2', torch: '2.5.0', cudaarch: ''                                    }
-         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.12', cuda: '',       rocm: '6.2', torch: '2.5.0', cudaarch: ''                                    }
-
-         # sdist
-#         - { artname: 'sdist', os: ubuntu-20.04, pyver: '3.11', cuda: '',       rocm:    '', torch: '2.3.1', cudaarch: ''                                    }
-
-         # Extra Torch 2.2 wheels for Windows 2022 until PyTorch resolves the shm.dll issue
-
-        # Python 3.8
-#         - { artname: 'wheel', os: windows-2022, pyver:  '3.8', cuda: '11.8.0', rocm:    '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: windows-2022, pyver:  '3.8', cuda: '12.1.0', rocm:    '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-
-#         Python 3.9
-#         - { artname: 'wheel', os: windows-2022, pyver:  '3.9', cuda: '11.8.0', rocm:    '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: windows-2022, pyver:  '3.9', cuda: '12.1.0', rocm:    '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-
-        # Python 3.10
-#         - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '11.8.0', rocm:    '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '12.1.0', rocm:    '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-
-        # Python 3.11
-#         - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '11.8.0', rocm:    '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '12.1.0', rocm:    '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-
-        # Python 3.12
-#         - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '11.8.0', rocm:    '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-#         - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '12.1.0', rocm:    '', torch: '2.2.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-
-         # Extra wheel for HF spaces
-#         - { artname: 'wheel', os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.0', rocm:    '', torch: '2.2.2', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-
-      fail-fast: false
-
-    defaults:
-      run:
-        shell: pwsh
-
-    steps:
-      # Free disk space
-
-      - name: Free Disk Space
-        uses: jlumbroso/free-disk-space@v1.3.1
-        if: runner.os == 'Linux'
-        with:
-          tool-cache: true
-          android: true
-          dotnet: true
-          haskell: true
-          large-packages: false
-          swap-storage: true
-
-      # Setup Python
-
-      - uses: actions/checkout@v3
-
-      - uses: actions/setup-python@v3
-        with:
-          python-version: ${{ matrix.pyver }}
-
-      # Get version string from package
-
-      - name: Get version string
-        id: package_version
-        run: | 
-          $versionString = Get-Content $(Join-Path 'exllamav2' 'version.py') -raw
-          if ($versionString -match '__version__ = "(\d+\.(?:\d+\.?(?:dev\d+)?)*)"') 
-          {
-            Write-Output $('::notice file=build-wheels-release.yml,line=200,title=Package Version::Detected package version is: {0}' -f $Matches[1])
-            Write-Output "PACKAGE_VERSION=$($Matches[1])" >> "$env:GITHUB_OUTPUT"
-          }
-          else
-          {
-            Write-Output '::error file=build-wheels-release.yml,line=203::Could not parse version from exllamav2/version.py! You must upload wheels manually!'
-            Write-Output "PACKAGE_VERSION=None" >> "$env:GITHUB_OUTPUT"
-          }
-
-      # Pin VS build tools to 17.9 so builds won't fail
-
-      - name: Install VS2022 BuildTools 17.9.7
-        run: choco install -y visualstudio2022buildtools --version=117.9.7.0 --params "--add Microsoft.VisualStudio.Component.VC.Tools.x86.x64 --installChannelUri https://aka.ms/vs/17/release/180911598_-255012421/channel"
-        if: runner.os == 'Windows'
-
-      # Install ROCm SDK, apparently needs to happen before setting up Python
-
-      - name: Build for ROCm
-        if: matrix.rocm != ''
-        shell: bash
-        run: |
-          # --- Install ROCm SDK
-
-          export ROCM_VERSION=${{ matrix.rocm }}
-          export TORCH_VERSION=${{ matrix.torch }}
-
-          [ ! -d /etc/apt/keyrings ] && sudo mkdir --parents --mode=0755 /etc/apt/keyrings
-          wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
-          echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/$ROCM_VERSION focal main" | sudo tee --append /etc/apt/sources.list.d/rocm.list
-          echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
-          
-          sudo apt update
-          sudo apt install rocm-hip-sdk -y
-          sudo apt clean -y
-
-          echo "/opt/rocm/bin" >> $GITHUB_PATH
-          echo "ROCM_PATH=/opt/rocm" >> $GITHUB_ENV
-          echo "ROCM_VERSION=$ROCM_VERSION" >> $GITHUB_ENV
-          echo "USE_ROCM=1" >> $GITHUB_ENV
-
-          # --- Install dependencies
-
-          python3 -m ensurepip --upgrade
-          pip3 install torch==${{ matrix.torch }} --index-url="https://download.pytorch.org/whl/rocm$ROCM_VERSION"
-          pip3 install --upgrade setuptools==69.5.1 build wheel safetensors sentencepiece ninja
-          pip3 cache purge
-
-          # --- Build wheel 
-
-          python3 -m build -n --wheel -C--build-option=egg_info "-C--build-option=--tag-build=+rocm${{ matrix.rocm }}-torch${{ matrix.torch }}"
-
-      # Build for CUDA
-
-      - name: Setup Mamba
-        if: matrix.cuda != ''
-        uses: conda-incubator/setup-miniconda@v2.3.0
-        with:
-          activate-environment: "exllama"
-          python-version: ${{ matrix.pyver }}
-          miniforge-variant: Mambaforge
-          miniforge-version: latest
-          use-mamba: true
-          add-pip-as-python-dependency: true
-          auto-activate-base: false
-
-      - name: Build for CUDA
-        if: matrix.cuda != ''
-        run: |
-          # --- Spawn the VS shell
-          if ($IsWindows) {
-            Import-Module 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
-            Enter-VsDevShell -VsInstallPath 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools' -DevCmdArguments '-arch=x64 -host_arch=x64'
-            $env:DISTUTILS_USE_SDK=1
-          }
-  
-          # --- Install CUDA using Conda
-          $cudaVersion = '${{ matrix.cuda }}'
-          $cudaVersionPytorch = '${{ matrix.cuda }}'.Remove('${{ matrix.cuda }}'.LastIndexOf('.')).Replace('.','')
-
-          $env:MAMBA_NO_LOW_SPEED_LIMIT = 1
-          mamba install -y -c nvidia/label/cuda-$cudaVersion cuda-toolkit cuda-runtime
-
-          if (!(mamba list cuda)[-1].contains('cuda')) {sleep -s 10; mamba install -y 'cuda' $cudaVersion}
-          if (!(mamba list cuda)[-1].contains('cuda')) {throw 'CUDA Toolkit failed to install!'}
-
-          $env:CUDA_PATH = $env:CONDA_PREFIX
-          $env:CUDA_HOME = $env:CONDA_PREFIX
-          if ($IsLinux) {$env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH}
-          
-          # --- Install dependencies
-          
-          python -m ensurepip --upgrade
-          python -m pip install torch==${{ matrix.torch }} --index-url https://download.pytorch.org/whl/cu$cudaVersionPytorch
-          python -m pip install --upgrade setuptools==69.5.1 build wheel safetensors sentencepiece ninja
-
-          # --- Build wheel
-                  
-          $BUILDTAG = "+cu$cudaVersionPytorch-torch${{ matrix.torch }}"
-          $env:TORCH_CUDA_ARCH_LIST = '${{ matrix.cudaarch }}'
-          python -m build -n --wheel -C--build-option=egg_info "-C--build-option=--tag-build=$BUILDTAG"
-
-      # Build sdist
-
-      - name: Build sdist
-        if: matrix.cuda == '' && matrix.rocm == ''
-        run: |
-          # --- Spawn the VS shell
-          if ($IsWindows) {
-            Import-Module 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
-            Enter-VsDevShell -VsInstallPath 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools' -DevCmdArguments '-arch=x64 -host_arch=x64'
-            $env:DISTUTILS_USE_SDK=1
-          }
-
-          # --- Install dependencies
-          
-          python -m pip install torch==${{ matrix.torch }} --index-url https://download.pytorch.org/whl/cpu
-          python -m pip install build wheel ninja
-
-          # --- Build wheel
-           
-          $env:EXLLAMA_NOCOMPILE=1
-          python -m build -n
-
-      # Upload files
-
-      - uses: actions/upload-artifact@v3
-        if: matrix.artname == 'wheel'
-        with:
-          name: 'wheel'
-          path: ./dist/*
-
-      - uses: actions/upload-artifact@v3
-        if: matrix.artname == 'sdist'
-        with:
-          name: 'sdist'
-          path: ./dist/*
-
-      - name: Upload files to GitHub release
-        if: steps.package_version.outputs.PACKAGE_VERSION != 'None' && inputs.release == '1'
-        uses: svenstaro/upload-release-action@2.6.1
-        with:
-          file: ./dist/*.whl
-          tag: ${{ format('v{0}', steps.package_version.outputs.PACKAGE_VERSION) }}
-          file_glob: true
-          overwrite: true
-          release_name: ${{ steps.package_version.outputs.PACKAGE_VERSION }}