From d6cc466bd48dd27474ecb00c3baba2e8a887f6c4 Mon Sep 17 00:00:00 2001
From: Swetha B S <swetha@multicorewareinc.com>
Date: Tue, 3 Jun 2025 02:55:32 -0700
Subject: [PATCH 01/11] Get_Rows & Dequantize implementation adapted to work
 for repacked weights of type q4_0

---
 ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp | 145 ++++++++++++++++++++++++-
 src/whisper.cpp                        |  43 +++++---
 2 files changed, 172 insertions(+), 16 deletions(-)
diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
index 0a3ff867cfe..5f143e25477 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
@@ -6046,6 +6046,9 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
                 size = GGML_PAD(size, sizeof(int64_t));  // + padding for next bloc.
                 size += sizeof(int64_t) * (1+op->src[0]->ne[2]) * op->src[1]->ne[2];
                 return true;
+            case GGML_OP_GET_ROWS:
+                size = 0;  // GET_ROWS (standard and repacked) doesn't need a work buffer
+                return true;
             default:
                 // GGML_ABORT("fatal error");
                 break;
@@ -6061,6 +6064,9 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
             case GGML_OP_MUL_MAT_ID:
                 forward_mul_mat_id(params, op);
                 return true;
+            case GGML_OP_GET_ROWS:
+                forward_get_rows(params, op);
+                return true;
             default:
                 // GGML_ABORT("fatal error");
                 break;
@@ -6068,6 +6074,132 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
         return false;
     }
 
+    void forward_get_rows(const ggml_compute_params *params,
+                          ggml_tensor *dst) {
+        const ggml_tensor *src0 = dst->src[0];
+
+        switch (src0->type) {
+            case GGML_TYPE_Q4_0: {
+                ggml_compute_forward_get_rows_q4_0x8(params, dst);  // reads src0->data as block_q4_0x8 row groups
+            } break;
+            default:
+                GGML_ABORT("fatal error");  // no GET_ROWS kernel here for any other src0 type
+                break;
+        }
+    }
+
+    static void ggml_compute_forward_get_rows_q4_0x8(
+        const ggml_compute_params *params,
+        ggml_tensor *dst) {
+        const ggml_tensor *src0 = dst->src[0];
+        const ggml_tensor *src1 = dst->src[1];
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        const int64_t nc = ne00;
+        const int64_t nr = ggml_nelements(src1);
+        // Gathers the rows of src0 selected by the int32 indices in src1 and writes them to dst as floats.
+        assert(ne0 == nc);
+        assert(ne02 == ne11);
+        assert(nb00 == ggml_type_size(src0->type));
+        assert(ggml_nrows(dst) == nr);
+        GGML_ASSERT(NB_COLS == 8 && INTER_SIZE == 8);  // addressing below is only valid for the 8x8 (block_q4_0x8) repack
+        const int ith = params->ith;
+        const int nth = params->nth;
+
+        // rows per thread (kept 64-bit: nr is int64_t)
+        const int64_t dr = (nr + nth - 1) / nth;
+
+        // row range for this thread
+        const int64_t ir0 = dr * ith;
+        const int64_t ir1 = MIN(ir0 + dr, nr);
+
+        constexpr int nrows_interleaved = 8;
+        const size_t sizeof_one_repacked_block = sizeof(block_q4_0x8);
+
+        const int64_t num_repacked_blocks_per_row_width = nc / QK4_0;
+
+        const size_t stride_between_actual_row_groups = num_repacked_blocks_per_row_width * sizeof_one_repacked_block;
+        // Each index i maps to one src1 element (i10, i11, i12) and therefore to one output row.
+        for (int64_t i = ir0; i < ir1; ++i) {
+            const int64_t i12 = i / (ne11 * ne10);
+            const int64_t i11 = (i - i12 * ne11 * ne10) / ne10;
+            const int64_t i10 = (i - i12 * ne11 * ne10 - i11 * ne10);
+            const int64_t i01 = *(int32_t *)((char *)src1->data + i10 * nb10 + i11 * nb11 + i12 * nb12);  // original logical row
+
+            GGML_ASSERT(i01 >= 0 && i01 < ne01);
+
+            int row_group_idx = i01 / nrows_interleaved;
+            const int row_idx_in_group = i01 % nrows_interleaved;
+
+            const char *base_ptr_for_higher_dims_in_src0 = (const char *)src0->data + i11 * nb02 + i12 * nb03;
+
+            // Pointer to the first block_q4_0x8 of the identified row_group_idx
+            const block_q4_0x8 *p_first_repacked_block_of_group_x8 = (const block_q4_0x8 *)(base_ptr_for_higher_dims_in_src0 + row_group_idx * stride_between_actual_row_groups);
+
+            dequantize_row_q4_0x8(
+                p_first_repacked_block_of_group_x8,
+                (float *)((char *)dst->data + i10 * nb1 + i11 * nb2 + i12 * nb3), nc, row_idx_in_group);
+        }
+    }
+
+    /**
+     * Dequantizes one logical row out of a group of rows stored interleaved as 'block_q4_0x8'.
+     *
+     * @param p_repacked_group_column_blocks First 'block_q4_0x8' of the row group; k / QK4_0 consecutive blocks are read.
+     * @param y                              Output buffer; receives k dequantized float values.
+     * @param k                              Number of elements (columns) in the logical row; asserted to be a multiple of QK4_0.
+     * @param row_idx_in_group               Index (0-7) of the logical row to dequantize.
+     */
+    static void dequantize_row_q4_0x8(
+        const block_q4_0x8 *GGML_RESTRICT p_repacked_group_column_blocks,
+        float *GGML_RESTRICT y,
+        int64_t k,
+        int row_idx_in_group) {
+        const int GGML_Q4_0_X8_INTERLEAVE_SIZE = 8;
+        assert(k % QK4_0 == 0);
+        assert(row_idx_in_group >= 0 && row_idx_in_group < GGML_Q4_0_X8_INTERLEAVE_SIZE);
+
+        const int nb = k / QK4_0;  // number of blocks along the row
+        const int bytes_for_half_elements = (QK4_0 / 2) / 2;  // bytes consumed per chunk; must be <= 8 since each chunk is an 8-byte copy
+
+        const int offset_to_second_half_data = bytes_for_half_elements * GGML_Q4_0_X8_INTERLEAVE_SIZE;  // qs offset of row 0's second chunk
+        const uint64_t xor_mask = 0x8888888888888888ULL;  // toggles the top bit of every nibble
+        const int qk4_0_half_elements = QK4_0 / 2;
+
+        for (int i = 0; i < nb; ++i) {
+            const block_q4_0x8 *current_column_repacked_block = &p_repacked_group_column_blocks[i];
+            const float d_val = GGML_FP16_TO_FP32(current_column_repacked_block->d[row_idx_in_group]);
+            float *y_curr = y + i * QK4_0;
+
+            const int8_t *qs_first_half_repacked_ptr = &(current_column_repacked_block->qs[row_idx_in_group * bytes_for_half_elements]);
+
+            uint64_t first_half_chunk_u64;
+            memcpy(&first_half_chunk_u64, qs_first_half_repacked_ptr, sizeof(uint64_t));
+            first_half_chunk_u64 ^= xor_mask;  // Reverse the XOR
+            const uint8_t *original_qs_first_half_bytes = (const uint8_t *)&first_half_chunk_u64;
+
+            const int8_t *qs_second_half_repacked_ptr = &(current_column_repacked_block->qs[offset_to_second_half_data + (row_idx_in_group * bytes_for_half_elements)]);
+
+            uint64_t second_half_chunk_u64;
+            memcpy(&second_half_chunk_u64, qs_second_half_repacked_ptr, sizeof(uint64_t));
+            second_half_chunk_u64 ^= xor_mask;  // Reverse the XOR
+            const uint8_t *original_qs_second_half_bytes = (const uint8_t *)&second_half_chunk_u64;
+
+            // dequantizing all QK4_0's for this block.
+            for (int j = 0; j < bytes_for_half_elements; ++j) {
+                const uint8_t quant_byte_first = original_qs_first_half_bytes[j];
+                y_curr[j] = ((quant_byte_first & 0x0F) - 8) * d_val;  // low nibble -> element j
+                y_curr[j + qk4_0_half_elements] = ((quant_byte_first >> 4) - 8) * d_val;  // high nibble -> element j + QK4_0/2
+
+                const uint8_t quant_byte_second = original_qs_second_half_bytes[j];
+                const int out_idx_base_second_half = j + bytes_for_half_elements;  // Offset for the second set of low nibbles
+                y_curr[out_idx_base_second_half] = ((quant_byte_second & 0x0F) - 8) * d_val;
+                y_curr[out_idx_base_second_half + qk4_0_half_elements] = ((quant_byte_second >> 4) - 8) * d_val;
+            }
+        }
+    }
+
     void forward_mul_mat(ggml_compute_params * params, ggml_tensor * op) {
         const ggml_tensor * src0 = op->src[0];
         const ggml_tensor * src1 = op->src[1];
@@ -6398,12 +6530,23 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
             //if (op->src[1]->type == GGML_TYPE_Q8_0) {
             //    return true;
             //}
+        } else if (op->op == GGML_OP_GET_ROWS 
+            && op->src[0]->buffer 
+            && (ggml_n_dims(op->src[0]) == 2) 
+            && op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type() 
+            && ggml_aarch64_get_optimal_repack_type(op->src[0])) {
+            if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
+                return false;
+            }
+            if (op->src[0]->type == GGML_TYPE_Q4_0) {
+                return true;
+            }
         }
         return false;
     }
 
     ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
-        if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID) {
+        if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID || op->op == GGML_OP_GET_ROWS) {
             if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type()) {
                 return (ggml::cpu::tensor_traits *) op->src[0]->extra;
             }
diff --git a/src/whisper.cpp b/src/whisper.cpp
index a2f28d7db54..3ac77dc7003 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -1437,24 +1437,25 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor *
         // GPU and default CPU backend support all operators
         op_supported = true;
     } else {
-        switch (op) {
-            // The current extra_buffer_type implementations only support GGML_OP_MUL_MAT
-            case GGML_OP_MUL_MAT: {
-                ggml_init_params params = {
-                    /*.mem_size   =*/ 2 * ggml_tensor_overhead(),
-                    /*.mem_buffer =*/ nullptr,
-                    /*.no_alloc   =*/ true,
-                };
+        ggml_init_params params = {
+            /*.mem_size   =*/ 2 * ggml_tensor_overhead(),
+            /*.mem_buffer =*/ nullptr,
+            /*.no_alloc   =*/ true,
+        };
 
-                ggml_context_ptr ctx_ptr { ggml_init(params) };
-                if (!ctx_ptr) {
-                    throw std::runtime_error("failed to create ggml context");
-                }
-                ggml_context * ctx = ctx_ptr.get();
+        ggml_context_ptr ctx_ptr { ggml_init(params) };
+        if (!ctx_ptr) {
+            throw std::runtime_error("failed to create ggml context");
+        }
+        ggml_context * ctx = ctx_ptr.get();
 
-                ggml_tensor * op_tensor = nullptr;
+        ggml_tensor * op_tensor = nullptr;
+        
+        int64_t n_ctx = hparams.n_audio_ctx;
 
-                int64_t n_ctx = hparams.n_audio_ctx;
+        switch (op) {
+            // The current extra_buffer_type implementations only support GGML_OP_MUL_MAT & GGML_OP_GET_ROWS (q4_0)
+            case GGML_OP_MUL_MAT: {
                 ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], n_ctx, w->ne[2], w->ne[3]);
                 op_tensor = ggml_mul_mat(ctx, w, b);
 
@@ -1466,6 +1467,18 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor *
                 w->buffer = nullptr;
                 break;
             }
+            case GGML_OP_GET_ROWS: {
+                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
+                op_tensor = ggml_get_rows(ctx, w, b);
+
+                // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
+                GGML_ASSERT(w->buffer == nullptr);
+                w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
+                op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
+                ggml_backend_buffer_free(w->buffer);
+                w->buffer = nullptr;
+                break;
+            }
             default: {
                 op_supported = false;
                 break;

From 994e02a5eb1526fb9dcdb7448c930fe88fb19095 Mon Sep 17 00:00:00 2001
From: Swetha B S <swetha@multicorewareinc.com>
Date: Fri, 6 Jun 2025 03:46:05 -0700
Subject: [PATCH 02/11] Resolve PR comments

---
 ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp | 42 +++++++++++++-------------
 src/whisper.cpp                        |  2 +-
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
index 5f143e25477..a2b5492543b 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
@@ -6074,9 +6074,9 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
         return false;
     }
 
-    void forward_get_rows(const ggml_compute_params *params,
-                          ggml_tensor *dst) {
-        const ggml_tensor *src0 = dst->src[0];
+    void forward_get_rows(const ggml_compute_params * params,
+                          ggml_tensor * dst) {
+        const ggml_tensor * src0 = dst->src[0];
 
         switch (src0->type) {
             case GGML_TYPE_Q4_0: {
@@ -6089,10 +6089,10 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
     }
 
     static void ggml_compute_forward_get_rows_q4_0x8(
-        const ggml_compute_params *params,
-        ggml_tensor *dst) {
-        const ggml_tensor *src0 = dst->src[0];
-        const ggml_tensor *src1 = dst->src[1];
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+        const ggml_tensor * src0 = dst->src[0];
+        const ggml_tensor * src1 = dst->src[1];
 
         GGML_TENSOR_BINARY_OP_LOCALS
 
@@ -6132,10 +6132,10 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
             int row_group_idx = i01 / nrows_interleaved;
             const int row_idx_in_group = i01 % nrows_interleaved;
 
-            const char *base_ptr_for_higher_dims_in_src0 = (const char *)src0->data + i11 * nb02 + i12 * nb03;
+            const char * base_ptr_for_higher_dims_in_src0 = (const char *)src0->data + i11 * nb02 + i12 * nb03;
 
             // Pointer to the first block_q4_0x8 of the identified row_group_idx
-            const block_q4_0x8 *p_first_repacked_block_of_group_x8 = (const block_q4_0x8 *)(base_ptr_for_higher_dims_in_src0 + row_group_idx * stride_between_actual_row_groups);
+            const block_q4_0x8 * p_first_repacked_block_of_group_x8 = (const block_q4_0x8 *)(base_ptr_for_higher_dims_in_src0 + row_group_idx * stride_between_actual_row_groups);
 
             dequantize_row_q4_0x8(
                 p_first_repacked_block_of_group_x8,
@@ -6152,8 +6152,8 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
      * @param row_idx_in_group               Index (0-7) of the logical row to dequantize.
      */
     static void dequantize_row_q4_0x8(
-        const block_q4_0x8 *GGML_RESTRICT p_repacked_group_column_blocks,
-        float *GGML_RESTRICT y,
+        const block_q4_0x8 * GGML_RESTRICT p_repacked_group_column_blocks,
+        float * GGML_RESTRICT y,
         int64_t k,
         int row_idx_in_group) {
         const int GGML_Q4_0_X8_INTERLEAVE_SIZE = 8;
@@ -6168,23 +6168,23 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
         const int qk4_0_half_elements = QK4_0 / 2;
 
         for (int i = 0; i < nb; ++i) {
-            const block_q4_0x8 *current_column_repacked_block = &p_repacked_group_column_blocks[i];
+            const block_q4_0x8 * current_column_repacked_block = &p_repacked_group_column_blocks[i];
             const float d_val = GGML_FP16_TO_FP32(current_column_repacked_block->d[row_idx_in_group]);
-            float *y_curr = y + i * QK4_0;
+            float * y_curr = y + i * QK4_0;
 
-            const int8_t *qs_first_half_repacked_ptr = &(current_column_repacked_block->qs[row_idx_in_group * bytes_for_half_elements]);
+            const int8_t * qs_first_half_repacked_ptr = &(current_column_repacked_block->qs[row_idx_in_group * bytes_for_half_elements]);
 
             uint64_t first_half_chunk_u64;
             memcpy(&first_half_chunk_u64, qs_first_half_repacked_ptr, sizeof(uint64_t));
             first_half_chunk_u64 ^= xor_mask;  // Reverse the XOR
-            const uint8_t *original_qs_first_half_bytes = (const uint8_t *)&first_half_chunk_u64;
+            const uint8_t * original_qs_first_half_bytes = (const uint8_t *)&first_half_chunk_u64;
 
-            const int8_t *qs_second_half_repacked_ptr = &(current_column_repacked_block->qs[offset_to_second_half_data + (row_idx_in_group * bytes_for_half_elements)]);
+            const int8_t * qs_second_half_repacked_ptr = &(current_column_repacked_block->qs[offset_to_second_half_data + (row_idx_in_group * bytes_for_half_elements)]);
 
             uint64_t second_half_chunk_u64;
             memcpy(&second_half_chunk_u64, qs_second_half_repacked_ptr, sizeof(uint64_t));
             second_half_chunk_u64 ^= xor_mask;  // Reverse the XOR
-            const uint8_t *original_qs_second_half_bytes = (const uint8_t *)&second_half_chunk_u64;
+            const uint8_t * original_qs_second_half_bytes = (const uint8_t *)&second_half_chunk_u64;
 
             // dequantizing all QK4_0's for this block.
             for (int j = 0; j < bytes_for_half_elements; ++j) {
@@ -6530,10 +6530,10 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
             //if (op->src[1]->type == GGML_TYPE_Q8_0) {
             //    return true;
             //}
-        } else if (op->op == GGML_OP_GET_ROWS 
-            && op->src[0]->buffer 
-            && (ggml_n_dims(op->src[0]) == 2) 
-            && op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type() 
+        } else if (op->op == GGML_OP_GET_ROWS
+            && op->src[0]->buffer
+            && (ggml_n_dims(op->src[0]) == 2)
+            && op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type()
             && ggml_aarch64_get_optimal_repack_type(op->src[0])) {
             if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
                 return false;
diff --git a/src/whisper.cpp b/src/whisper.cpp
index 3ac77dc7003..30bfa107838 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -1450,7 +1450,7 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor *
         ggml_context * ctx = ctx_ptr.get();
 
         ggml_tensor * op_tensor = nullptr;
-        
+
         int64_t n_ctx = hparams.n_audio_ctx;
 
         switch (op) {

From ed1d3a2beb1065538994e07a9cf9584b18f8f452 Mon Sep 17 00:00:00 2001
From: Swetha B S <swetha@multicorewareinc.com>
Date: Tue, 3 Jun 2025 02:55:32 -0700
Subject: [PATCH 03/11] Get_Rows & Dequantize implementation adapted to work
 for repacked weights of type q4_0

---
 ggml/src/ggml-cpu/repack.cpp | 145 ++++++++++++++++++++++++++++++++++-
 src/whisper.cpp              |  43 +++++++----
 2 files changed, 172 insertions(+), 16 deletions(-)

diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
index 628142d5f63..5a80365d09b 100644
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -1181,6 +1181,9 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
                 size = GGML_PAD(size, sizeof(int64_t));  // + padding for next bloc.
                 size += sizeof(int64_t) * (1+op->src[0]->ne[2]) * op->src[1]->ne[2];
                 return true;
+            case GGML_OP_GET_ROWS:
+                size = 0;  // GET_ROWS (standard and repacked) doesn't need a work buffer
+                return true;
             default:
                 // GGML_ABORT("fatal error");
                 break;
@@ -1196,6 +1199,9 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
             case GGML_OP_MUL_MAT_ID:
                 forward_mul_mat_id(params, op);
                 return true;
+            case GGML_OP_GET_ROWS:
+                forward_get_rows(params, op);
+                return true;
             default:
                 // GGML_ABORT("fatal error");
                 break;
@@ -1401,6 +1407,132 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
 #undef MMID_MATRIX_ROW
     }
 
+    void forward_get_rows(const ggml_compute_params * params,
+                          ggml_tensor * dst) {
+        const ggml_tensor * src0 = dst->src[0];
+        // Route GET_ROWS to the kernel matching the type of dst->src[0]; any type without a kernel aborts.
+        switch (src0->type) {
+            case GGML_TYPE_Q4_0: {
+                ggml_compute_forward_get_rows_q4_0x8(params, dst);
+            } break;
+            default:
+                GGML_ABORT("fatal error");
+                break;
+        }
+    }
+
+    static void ggml_compute_forward_get_rows_q4_0x8(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+        const ggml_tensor * src0 = dst->src[0];
+        const ggml_tensor * src1 = dst->src[1];
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        const int64_t nc = ne00;
+        const int64_t nr = ggml_nelements(src1);
+        // Gathers the rows of src0 selected by the int32 indices in src1 and writes them to dst as floats.
+        assert(ne0 == nc);
+        assert(ne02 == ne11);
+        assert(nb00 == ggml_type_size(src0->type));
+        assert(ggml_nrows(dst) == nr);
+        GGML_ASSERT(NB_COLS == 8 && INTER_SIZE == 8);  // addressing below is only valid for the 8x8 (block_q4_0x8) repack
+        const int ith = params->ith;
+        const int nth = params->nth;
+
+        // rows per thread (kept 64-bit: nr is int64_t)
+        const int64_t dr = (nr + nth - 1) / nth;
+
+        // row range for this thread
+        const int64_t ir0 = dr * ith;
+        const int64_t ir1 = MIN(ir0 + dr, nr);
+
+        constexpr int nrows_interleaved = 8;
+        const size_t sizeof_one_repacked_block = sizeof(block_q4_0x8);
+
+        const int64_t num_repacked_blocks_per_row_width = nc / QK4_0;
+
+        const size_t stride_between_actual_row_groups = num_repacked_blocks_per_row_width * sizeof_one_repacked_block;
+        // Each index i maps to one src1 element (i10, i11, i12) and therefore to one output row.
+        for (int64_t i = ir0; i < ir1; ++i) {
+            const int64_t i12 = i / (ne11 * ne10);
+            const int64_t i11 = (i - i12 * ne11 * ne10) / ne10;
+            const int64_t i10 = (i - i12 * ne11 * ne10 - i11 * ne10);
+            const int64_t i01 = *(int32_t *)((char *)src1->data + i10 * nb10 + i11 * nb11 + i12 * nb12);  // original logical row
+
+            GGML_ASSERT(i01 >= 0 && i01 < ne01);
+
+            int row_group_idx = i01 / nrows_interleaved;
+            const int row_idx_in_group = i01 % nrows_interleaved;
+
+            const char * base_ptr_for_higher_dims_in_src0 = (const char *)src0->data + i11 * nb02 + i12 * nb03;
+
+            // Pointer to the first block_q4_0x8 of the identified row_group_idx
+            const block_q4_0x8 * p_first_repacked_block_of_group_x8 = (const block_q4_0x8 *)(base_ptr_for_higher_dims_in_src0 + row_group_idx * stride_between_actual_row_groups);
+
+            dequantize_row_q4_0x8(
+                p_first_repacked_block_of_group_x8,
+                (float *)((char *)dst->data + i10 * nb1 + i11 * nb2 + i12 * nb3), nc, row_idx_in_group);
+        }
+    }
+
+    /**
+     * Dequantizes one logical row out of a group of rows stored interleaved as 'block_q4_0x8'.
+     *
+     * @param p_repacked_group_column_blocks First 'block_q4_0x8' of the row group; k / QK4_0 consecutive blocks are read.
+     * @param y                              Output buffer; receives k dequantized float values.
+     * @param k                              Number of elements (columns) in the logical row; asserted to be a multiple of QK4_0.
+     * @param row_idx_in_group               Index (0-7) of the logical row to dequantize.
+     */
+    static void dequantize_row_q4_0x8(
+        const block_q4_0x8 * GGML_RESTRICT p_repacked_group_column_blocks,
+        float * GGML_RESTRICT y,
+        int64_t k,
+        int row_idx_in_group) {
+        const int GGML_Q4_0_X8_INTERLEAVE_SIZE = 8;
+        assert(k % QK4_0 == 0);
+        assert(row_idx_in_group >= 0 && row_idx_in_group < GGML_Q4_0_X8_INTERLEAVE_SIZE);
+
+        const int nb = k / QK4_0;  // number of blocks along the row
+        const int bytes_for_half_elements = (QK4_0 / 2) / 2;  // bytes consumed per chunk; must be <= 8 since each chunk is an 8-byte copy
+
+        const int offset_to_second_half_data = bytes_for_half_elements * GGML_Q4_0_X8_INTERLEAVE_SIZE;  // qs offset of row 0's second chunk
+        const uint64_t xor_mask = 0x8888888888888888ULL;  // toggles the top bit of every nibble
+        const int qk4_0_half_elements = QK4_0 / 2;
+
+        for (int i = 0; i < nb; ++i) {
+            const block_q4_0x8 * current_column_repacked_block = &p_repacked_group_column_blocks[i];
+            const float d_val = GGML_FP16_TO_FP32(current_column_repacked_block->d[row_idx_in_group]);
+            float * y_curr = y + i * QK4_0;
+
+            const int8_t * qs_first_half_repacked_ptr = &(current_column_repacked_block->qs[row_idx_in_group * bytes_for_half_elements]);
+
+            uint64_t first_half_chunk_u64;
+            memcpy(&first_half_chunk_u64, qs_first_half_repacked_ptr, sizeof(uint64_t));
+            first_half_chunk_u64 ^= xor_mask;  // Reverse the XOR
+            const uint8_t * original_qs_first_half_bytes = (const uint8_t *)&first_half_chunk_u64;
+
+            const int8_t * qs_second_half_repacked_ptr = &(current_column_repacked_block->qs[offset_to_second_half_data + (row_idx_in_group * bytes_for_half_elements)]);
+
+            uint64_t second_half_chunk_u64;
+            memcpy(&second_half_chunk_u64, qs_second_half_repacked_ptr, sizeof(uint64_t));
+            second_half_chunk_u64 ^= xor_mask;  // Reverse the XOR
+            const uint8_t * original_qs_second_half_bytes = (const uint8_t *)&second_half_chunk_u64;
+
+            // Each pass emits four outputs: low/high nibble of one byte from each of the two chunks.
+            for (int j = 0; j < bytes_for_half_elements; ++j) {
+                const uint8_t quant_byte_first = original_qs_first_half_bytes[j];
+                y_curr[j] = ((quant_byte_first & 0x0F) - 8) * d_val;  // low nibble -> element j
+                y_curr[j + qk4_0_half_elements] = ((quant_byte_first >> 4) - 8) * d_val;  // high nibble -> element j + QK4_0/2
+
+                const uint8_t quant_byte_second = original_qs_second_half_bytes[j];
+                const int out_idx_base_second_half = j + bytes_for_half_elements;  // Offset for the second set of low nibbles
+                y_curr[out_idx_base_second_half] = ((quant_byte_second & 0x0F) - 8) * d_val;
+                y_curr[out_idx_base_second_half + qk4_0_half_elements] = ((quant_byte_second >> 4) - 8) * d_val;
+            }
+        }
+    }
+
     int repack(struct ggml_tensor * t, const void * data, size_t data_size) override {
         GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type),
                        (int) NB_COLS, (int) INTER_SIZE);
@@ -1533,12 +1665,23 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
             //if (op->src[1]->type == GGML_TYPE_Q8_0) {
             //    return true;
             //}
+        } else if (op->op == GGML_OP_GET_ROWS
+            && op->src[0]->buffer
+            && (ggml_n_dims(op->src[0]) == 2)
+            && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()
+            && ggml_repack_get_optimal_repack_type(op->src[0])) {
+            if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
+                return false;
+            }
+            if (op->src[0]->type == GGML_TYPE_Q4_0) {
+                return true;
+            }
         }
         return false;
     }
 
     ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
-        if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID) {
+        if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID || op->op == GGML_OP_GET_ROWS) {
             if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()) {
                 return (ggml::cpu::tensor_traits *) op->src[0]->extra;
             }
diff --git a/src/whisper.cpp b/src/whisper.cpp
index 6483ae8ab9d..41174b37e41 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -1437,24 +1437,25 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor *
         // GPU and default CPU backend support all operators
         op_supported = true;
     } else {
-        switch (op) {
-            // The current extra_buffer_type implementations only support GGML_OP_MUL_MAT
-            case GGML_OP_MUL_MAT: {
-                ggml_init_params params = {
-                    /*.mem_size   =*/ 2 * ggml_tensor_overhead(),
-                    /*.mem_buffer =*/ nullptr,
-                    /*.no_alloc   =*/ true,
-                };
+        ggml_init_params params = {
+            /*.mem_size   =*/ 2 * ggml_tensor_overhead(),
+            /*.mem_buffer =*/ nullptr,
+            /*.no_alloc   =*/ true,
+        };
 
-                ggml_context_ptr ctx_ptr { ggml_init(params) };
-                if (!ctx_ptr) {
-                    throw std::runtime_error("failed to create ggml context");
-                }
-                ggml_context * ctx = ctx_ptr.get();
+        ggml_context_ptr ctx_ptr { ggml_init(params) };
+        if (!ctx_ptr) {
+            throw std::runtime_error("failed to create ggml context");
+        }
+        ggml_context * ctx = ctx_ptr.get();
 
-                ggml_tensor * op_tensor = nullptr;
+        ggml_tensor * op_tensor = nullptr;
+        
+        int64_t n_ctx = hparams.n_audio_ctx;
 
-                int64_t n_ctx = hparams.n_audio_ctx;
+        switch (op) {
+            // The current extra_buffer_type implementations only support GGML_OP_MUL_MAT & GGML_OP_GET_ROWS (q4_0)
+            case GGML_OP_MUL_MAT: {
                 ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], n_ctx, w->ne[2], w->ne[3]);
                 op_tensor = ggml_mul_mat(ctx, w, b);
 
@@ -1466,6 +1467,18 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor *
                 w->buffer = nullptr;
                 break;
             }
+            case GGML_OP_GET_ROWS: {
+                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
+                op_tensor = ggml_get_rows(ctx, w, b);
+
+                // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
+                GGML_ASSERT(w->buffer == nullptr);
+                w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
+                op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
+                ggml_backend_buffer_free(w->buffer);
+                w->buffer = nullptr;
+                break;
+            }
             default: {
                 op_supported = false;
                 break;

From 6959d4190e2647589a5a3e46ba73810288db54ba Mon Sep 17 00:00:00 2001
From: Swetha B S <swetha@multicorewareinc.com>
Date: Fri, 6 Jun 2025 03:46:05 -0700
Subject: [PATCH 04/11] Resolve PR comments

---
 src/whisper.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/whisper.cpp b/src/whisper.cpp
index 41174b37e41..31c5ee3ba61 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -1450,7 +1450,7 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor *
         ggml_context * ctx = ctx_ptr.get();
 
         ggml_tensor * op_tensor = nullptr;
-        
+
         int64_t n_ctx = hparams.n_audio_ctx;
 
         switch (op) {

From 066b47ad60305d296d08b6b9e730a5d1078af731 Mon Sep 17 00:00:00 2001
From: Swetha B S <swetha@multicorewareinc.com>
Date: Wed, 18 Jun 2025 05:33:50 -0700
Subject: [PATCH 05/11] Add the Get_Rows & Dequantize implementation adapted to
 work for repacked weights of type q4_K

---
 ggml/src/ggml-cpu/repack.cpp | 130 ++++++++++++++++++++++++++++++++++-
 src/whisper.cpp              |   2 +-
 2 files changed, 130 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
index 5a80365d09b..df7e7a29021 100644
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -1415,6 +1415,9 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
             case GGML_TYPE_Q4_0: {
                 ggml_compute_forward_get_rows_q4_0x8(params, dst);
             } break;
+            case GGML_TYPE_Q4_K: {
+                ggml_compute_forward_get_rows_q4_Kx8(params, dst);
+            } break;
             default:
                 GGML_ABORT("fatal error");
                 break;
@@ -1533,6 +1536,131 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
         }
     }
 
+    static void ggml_compute_forward_get_rows_q4_Kx8(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+        const ggml_tensor * src0 = dst->src[0];  // source rows, read below as block_q4_Kx8 (8 rows interleaved)
+        const ggml_tensor * src1 = dst->src[1];  // row indices to gather, read below as int32_t
+
+        GGML_TENSOR_BINARY_OP_LOCALS  // provides the ne*/nb* shape and stride locals used below
+        const int64_t nc = ne00;  // elements per logical row
+        const int64_t nr = ggml_nelements(src1);  // one output row per index in src1
+
+        assert(ne0 == nc);
+        assert(ne02 == ne11);
+        assert(nb00 == ggml_type_size(src0->type));
+        assert(ggml_nrows(dst) == nr);
+
+        const int ith = params->ith;
+        const int nth = params->nth;
+
+        // rows per thread (ceiling division so every row is assigned)
+        const int dr = (nr + nth - 1) / nth;
+
+        // row range [ir0, ir1) handled by this thread; empty when ir0 >= nr
+        const int ir0 = dr * ith;
+        const int ir1 = MIN(ir0 + dr, nr);
+
+        constexpr int nrows_interleaved = 8;  // logical rows sharing one block_q4_Kx8
+        const size_t sizeof_one_repacked_block = sizeof(block_q4_Kx8);
+
+        const int num_repacked_blocks_per_row_width = nc / QK_K;  // blocks needed to span one row
+
+        const size_t stride_between_actual_row_groups = num_repacked_blocks_per_row_width * sizeof_one_repacked_block;  // bytes per group of 8 rows
+
+        for (int64_t i = ir0; i < ir1; ++i) {
+            const int64_t i12 = i / (ne11 * ne10);  // unflatten i into src1 coordinates (i10, i11, i12)
+            const int64_t i11 = (i - i12 * ne11 * ne10) / ne10;
+            const int64_t i10 = (i - i12 * ne11 * ne10 - i11 * ne10);
+            const int64_t i01 = *(int32_t *)((char *)src1->data + i10 * nb10 + i11 * nb11 + i12 * nb12);  // original logical row
+
+            GGML_ASSERT(i01 >= 0 && i01 < ne01);
+
+            int row_group_idx = i01 / nrows_interleaved;  // which group of 8 rows holds i01
+            const int row_idx_in_group = i01 % nrows_interleaved;  // position of i01 inside that group
+
+            const char * base_ptr_for_higher_dims_in_src0 = (const char *)src0->data + i11 * nb02 + i12 * nb03;
+
+            // Pointer to the first block_q4_Kx8 of the identified row_group_idx
+            const block_q4_Kx8 * p_first_repacked_block_of_group_x8 = (const block_q4_Kx8 *)(base_ptr_for_higher_dims_in_src0 + row_group_idx * stride_between_actual_row_groups);
+
+            dequantize_row_q4_Kx8(
+                p_first_repacked_block_of_group_x8,
+                (float *)((char *)dst->data + i10 * nb1 + i11 * nb2 + i12 * nb3), nc, row_idx_in_group);  // writes nc floats into the dst row
+        }
+    }
+
+    /**
+     * Dequantizes one logical row out of an 8-row interleaved group stored as 'block_q4_Kx8' blocks.
+     *
+     * @param p_repacked_blocks Pointer to the first of the k / QK_K consecutive 'block_q4_Kx8' structures of the row group.
+     * @param y                 Output buffer; receives k dequantized float values.
+     * @param k                 Total number of elements (columns) in the logical row; must be a multiple of QK_K.
+     * @param row_idx_in_group  The index (0-7) of the logical row to extract from the interleaved data.
+     */
+
+    static void dequantize_row_q4_Kx8(
+        const void * GGML_RESTRICT p_repacked_blocks,
+        float * GGML_RESTRICT y,
+        int64_t k,
+        int row_idx_in_group) {
+        constexpr int nrows_interleaved = 8;
+        assert(k % QK_K == 0);
+        assert(row_idx_in_group >= 0 && row_idx_in_group < nrows_interleaved);
+
+        const int nb = k / QK_K;
+        const block_q4_Kx8 * blocks = (const block_q4_Kx8 *)p_repacked_blocks;
+
+        for (int i = 0; i < nb; i++) {
+            const block_q4_Kx8 * current_block = &blocks[i];
+
+            const float d_super_block = GGML_FP16_TO_FP32(current_block->d[row_idx_in_group]);  // this row's multiplier for the 6-bit scales
+            const float dmin_super_block = GGML_FP16_TO_FP32(current_block->dmin[row_idx_in_group]);  // this row's multiplier for the 6-bit mins
+
+            const uint8_t * ptr_qs_base = current_block->qs;
+            const uint8_t * ptr_repacked_scales = (const uint8_t *)current_block->scales;
+            int is = 0, chunk_group_start_idx = 0;  // is: scale-set index (2 per iteration); chunk_group_start_idx: first qs chunk (4 per iteration)
+            for (int j = 0; j < QK_K; j += 64) {  // each iteration emits 64 floats: 32 from low nibbles, then 32 from high nibbles
+
+                uint8_t sc1, m1_val, sc2, m2_val;
+                const uint8_t *scales_repacked_data;
+                
+                scales_repacked_data = &ptr_repacked_scales[(is + 0) * 12];  // scale sets are 12 bytes apart; the row index selects the entry
+                get_scale_min_k4(row_idx_in_group, scales_repacked_data, &sc1, &m1_val);
+            
+                scales_repacked_data = &ptr_repacked_scales[(is + 1) * 12];
+                get_scale_min_k4(row_idx_in_group, scales_repacked_data, &sc2, &m2_val);
+                
+                const float d1 = d_super_block * sc1;  // (d1, m1) apply to the low-nibble values
+                const float m1 = dmin_super_block * m1_val;
+                const float d2 = d_super_block * sc2;  // (d2, m2) apply to the high-nibble values
+                const float m2 = dmin_super_block * m2_val;
+
+                for (int idx = 0; idx < 4; idx++) {
+                    const uint8_t * ptr_qs_chunk = ptr_qs_base + ((chunk_group_start_idx + idx) * 64) + row_idx_in_group * 8;  // chunks are 64 bytes apart; this row owns 8 bytes of each
+                    for (int l = 0; l < 8; ++l) *y++ = d1 * (ptr_qs_chunk[l] & 0xF) - m1; // low nibbles: 8 values per chunk, 32 over the 4 chunks
+                }
+ 
+                for (int idx = 0; idx < 4; idx++) {
+                    const uint8_t * ptr_qs_chunk = ptr_qs_base + ((chunk_group_start_idx + idx) * 64) + row_idx_in_group * 8;  // same 4 chunks as the loop above
+                    for (int l = 0; l < 8; ++l) *y++ = d2 * (ptr_qs_chunk[l] >> 4) - m2; // high nibbles: 8 values per chunk, 32 over the 4 chunks
+                }
+                is += 2;
+                chunk_group_start_idx += 4;
+            }
+        }
+    }
+
+    static inline void get_scale_min_k4(int j, const uint8_t *GGML_RESTRICT s, uint8_t *GGML_RESTRICT d, uint8_t *GGML_RESTRICT m) {  // unpack the j-th 6-bit (scale, min) pair from s into *d and *m
+        if (j < 4) {
+            *d = s[j] & 63;  // scale: low 6 bits of s[j]
+            *m = s[j + 4] & 63;  // min: low 6 bits of s[j + 4]
+        } else {
+            *d = (s[j + 4] & 0xF) | ((s[j - 4] >> 6) << 4);  // scale: low nibble of s[j + 4], plus top 2 bits of s[j - 4] as bits 4-5
+            *m = (s[j + 4] >> 4) | ((s[j - 0] >> 6) << 4);  // min: high nibble of s[j + 4], plus top 2 bits of s[j] as bits 4-5
+        }
+    }
+
     int repack(struct ggml_tensor * t, const void * data, size_t data_size) override {
         GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type),
                        (int) NB_COLS, (int) INTER_SIZE);
@@ -1673,7 +1801,7 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
             if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
                 return false;
             }
-            if (op->src[0]->type == GGML_TYPE_Q4_0) {
+            if (op->src[0]->type == GGML_TYPE_Q4_0 || op->src[0]->type == GGML_TYPE_Q4_K) {
                 return true;
             }
         }
diff --git a/src/whisper.cpp b/src/whisper.cpp
index 31c5ee3ba61..b4e178e2a17 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -1454,7 +1454,7 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor *
         int64_t n_ctx = hparams.n_audio_ctx;
 
         switch (op) {
-            // The current extra_buffer_type implementations only support GGML_OP_MUL_MAT & GGML_OP_GET_ROWS (q4_0)
+            // The current extra_buffer_type implementations only support GGML_OP_MUL_MAT & GGML_OP_GET_ROWS (repacked - q4_0, q4_K)
             case GGML_OP_MUL_MAT: {
                 ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], n_ctx, w->ne[2], w->ne[3]);
                 op_tensor = ggml_mul_mat(ctx, w, b);

From b9e152def9b3c83651f9fce334b3de8610f85bfd Mon Sep 17 00:00:00 2001
From: Swetha B S <swetha@multicorewareinc.com>
Date: Tue, 3 Jun 2025 02:55:32 -0700
Subject: [PATCH 06/11] Get_Rows & Dequantize implementation adapted to work
 for repacked weights of type q4_0

---
 ggml/src/ggml-cpu/repack.cpp | 145 ++++++++++++++++++++++++++++++++++-
 src/whisper.cpp              |  43 +++++++----
 2 files changed, 172 insertions(+), 16 deletions(-)

diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
index 5c6715d5c01..ae1fe2336f4 100644
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -1170,6 +1170,9 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
                 size = GGML_PAD(size, sizeof(int64_t));  // + padding for next bloc.
                 size += sizeof(int64_t) * (1+op->src[0]->ne[2]) * op->src[1]->ne[2];
                 return true;
+            case GGML_OP_GET_ROWS:
+                size = 0;  // GET_ROWS (standard and repacked) doesn't need a work buffer
+                return true;
             default:
                 // GGML_ABORT("fatal error");
                 break;
@@ -1185,6 +1188,9 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
             case GGML_OP_MUL_MAT_ID:
                 forward_mul_mat_id(params, op);
                 return true;
+            case GGML_OP_GET_ROWS:
+                forward_get_rows(params, op);
+                return true;
             default:
                 // GGML_ABORT("fatal error");
                 break;
@@ -1390,6 +1396,132 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
 #undef MMID_MATRIX_ROW
     }
 
+    void forward_get_rows(const ggml_compute_params * params,
+                          ggml_tensor * dst) {
+        const ggml_tensor * src0 = dst->src[0];
+
+        switch (src0->type) {
+            case GGML_TYPE_Q4_0: {
+                ggml_compute_forward_get_rows_q4_0x8(params, dst);
+            } break;
+            default:
+                GGML_ABORT("fatal error");
+                break;
+        }
+    }
+
+    static void ggml_compute_forward_get_rows_q4_0x8(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+        const ggml_tensor * src0 = dst->src[0];
+        const ggml_tensor * src1 = dst->src[1];
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        const int64_t nc = ne00;
+        const int64_t nr = ggml_nelements(src1);
+
+        assert(ne0 == nc);
+        assert(ne02 == ne11);
+        assert(nb00 == ggml_type_size(src0->type));
+        assert(ggml_nrows(dst) == nr);
+
+        const int ith = params->ith;
+        const int nth = params->nth;
+
+        // rows per thread
+        const int dr = (nr + nth - 1) / nth;
+
+        // row range for this thread
+        const int ir0 = dr * ith;
+        const int ir1 = MIN(ir0 + dr, nr);
+
+        constexpr int nrows_interleaved = 8;
+        const size_t sizeof_one_repacked_block = sizeof(block_q4_0x8);
+
+        const int num_repacked_blocks_per_row_width = nc / QK4_0;
+
+        const size_t stride_between_actual_row_groups = num_repacked_blocks_per_row_width * sizeof_one_repacked_block;
+
+        for (int64_t i = ir0; i < ir1; ++i) {
+            const int64_t i12 = i / (ne11 * ne10);
+            const int64_t i11 = (i - i12 * ne11 * ne10) / ne10;
+            const int64_t i10 = (i - i12 * ne11 * ne10 - i11 * ne10);
+            const int64_t i01 = *(int32_t *)((char *)src1->data + i10 * nb10 + i11 * nb11 + i12 * nb12);  // original logical row
+
+            GGML_ASSERT(i01 >= 0 && i01 < ne01);
+
+            int row_group_idx = i01 / nrows_interleaved;
+            const int row_idx_in_group = i01 % nrows_interleaved;
+
+            const char * base_ptr_for_higher_dims_in_src0 = (const char *)src0->data + i11 * nb02 + i12 * nb03;
+
+            // Pointer to the first block_q4_0x8 of the identified row_group_idx
+            const block_q4_0x8 * p_first_repacked_block_of_group_x8 = (const block_q4_0x8 *)(base_ptr_for_higher_dims_in_src0 + row_group_idx * stride_between_actual_row_groups);
+
+            dequantize_row_q4_0x8(
+                p_first_repacked_block_of_group_x8,
+                (float *)((char *)dst->data + i10 * nb1 + i11 * nb2 + i12 * nb3), nc, row_idx_in_group);
+        }
+    }
+
+    /**
+     * Dequantizes a single logical row from data repacked with quant interleaving.
+     *
+     * @param p_repacked_group_column_blocks Pointer to the start of 'block_q4_0x8' for the row group.
+     * @param y                              Output buffer for the dequantized float values.
+     * @param k                              Total number of elements (columns) in the logical row.
+     * @param row_idx_in_group               Index (0-7) of the logical row to dequantize.
+     */
+    static void dequantize_row_q4_0x8(
+        const block_q4_0x8 * GGML_RESTRICT p_repacked_group_column_blocks,
+        float * GGML_RESTRICT y,
+        int64_t k,
+        int row_idx_in_group) {
+        const int GGML_Q4_0_X8_INTERLEAVE_SIZE = 8;
+        assert(k % QK4_0 == 0);
+        assert(row_idx_in_group >= 0 && row_idx_in_group < GGML_Q4_0_X8_INTERLEAVE_SIZE);
+
+        const int nb = k / QK4_0;
+        const int bytes_for_half_elements = (QK4_0 / 2) / 2;
+
+        const int offset_to_second_half_data = bytes_for_half_elements * GGML_Q4_0_X8_INTERLEAVE_SIZE;
+        const uint64_t xor_mask = 0x8888888888888888ULL;
+        const int qk4_0_half_elements = QK4_0 / 2;
+
+        for (int i = 0; i < nb; ++i) {
+            const block_q4_0x8 * current_column_repacked_block = &p_repacked_group_column_blocks[i];
+            const float d_val = GGML_FP16_TO_FP32(current_column_repacked_block->d[row_idx_in_group]);
+            float * y_curr = y + i * QK4_0;
+
+            const int8_t * qs_first_half_repacked_ptr = &(current_column_repacked_block->qs[row_idx_in_group * bytes_for_half_elements]);
+
+            uint64_t first_half_chunk_u64;
+            memcpy(&first_half_chunk_u64, qs_first_half_repacked_ptr, sizeof(uint64_t));
+            first_half_chunk_u64 ^= xor_mask;  // Reverse the XOR
+            const uint8_t * original_qs_first_half_bytes = (const uint8_t *)&first_half_chunk_u64;
+
+            const int8_t * qs_second_half_repacked_ptr = &(current_column_repacked_block->qs[offset_to_second_half_data + (row_idx_in_group * bytes_for_half_elements)]);
+
+            uint64_t second_half_chunk_u64;
+            memcpy(&second_half_chunk_u64, qs_second_half_repacked_ptr, sizeof(uint64_t));
+            second_half_chunk_u64 ^= xor_mask;  // Reverse the XOR
+            const uint8_t * original_qs_second_half_bytes = (const uint8_t *)&second_half_chunk_u64;
+
+            // dequantizing all QK4_0's for this block.
+            for (int j = 0; j < bytes_for_half_elements; ++j) {
+                const uint8_t quant_byte_first = original_qs_first_half_bytes[j];
+                y_curr[j] = ((quant_byte_first & 0x0F) - 8) * d_val;
+                y_curr[j + qk4_0_half_elements] = ((quant_byte_first >> 4) - 8) * d_val;
+
+                const uint8_t quant_byte_second = original_qs_second_half_bytes[j];
+                const int out_idx_base_second_half = j + bytes_for_half_elements;  // Offset for the second set of low nibbles
+                y_curr[out_idx_base_second_half] = ((quant_byte_second & 0x0F) - 8) * d_val;
+                y_curr[out_idx_base_second_half + qk4_0_half_elements] = ((quant_byte_second >> 4) - 8) * d_val;
+            }
+        }
+    }
+
     int repack(struct ggml_tensor * t, const void * data, size_t data_size) override {
         GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type),
                        (int) NB_COLS, (int) INTER_SIZE);
@@ -1522,12 +1654,23 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
             //if (op->src[1]->type == GGML_TYPE_Q8_0) {
             //    return true;
             //}
+        } else if (op->op == GGML_OP_GET_ROWS
+            && op->src[0]->buffer
+            && (ggml_n_dims(op->src[0]) == 2)
+            && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()
+            && ggml_repack_get_optimal_repack_type(op->src[0])) {
+            if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
+                return false;
+            }
+            if (op->src[0]->type == GGML_TYPE_Q4_0) {
+                return true;
+            }
         }
         return false;
     }
 
     ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
-        if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID) {
+        if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID || op->op == GGML_OP_GET_ROWS) {
             if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()) {
                 return (ggml::cpu::tensor_traits *) op->src[0]->extra;
             }
diff --git a/src/whisper.cpp b/src/whisper.cpp
index fe3e135bee6..00d3f14f05c 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -1437,24 +1437,25 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor *
         // GPU and default CPU backend support all operators
         op_supported = true;
     } else {
-        switch (op) {
-            // The current extra_buffer_type implementations only support GGML_OP_MUL_MAT
-            case GGML_OP_MUL_MAT: {
-                ggml_init_params params = {
-                    /*.mem_size   =*/ 2 * ggml_tensor_overhead(),
-                    /*.mem_buffer =*/ nullptr,
-                    /*.no_alloc   =*/ true,
-                };
+        ggml_init_params params = {
+            /*.mem_size   =*/ 2 * ggml_tensor_overhead(),
+            /*.mem_buffer =*/ nullptr,
+            /*.no_alloc   =*/ true,
+        };
 
-                ggml_context_ptr ctx_ptr { ggml_init(params) };
-                if (!ctx_ptr) {
-                    throw std::runtime_error("failed to create ggml context");
-                }
-                ggml_context * ctx = ctx_ptr.get();
+        ggml_context_ptr ctx_ptr { ggml_init(params) };
+        if (!ctx_ptr) {
+            throw std::runtime_error("failed to create ggml context");
+        }
+        ggml_context * ctx = ctx_ptr.get();
 
-                ggml_tensor * op_tensor = nullptr;
+        ggml_tensor * op_tensor = nullptr;
+        
+        int64_t n_ctx = hparams.n_audio_ctx;
 
-                int64_t n_ctx = hparams.n_audio_ctx;
+        switch (op) {
+            // The current extra_buffer_type implementations only support GGML_OP_MUL_MAT & GGML_OP_GET_ROWS (q4_0)
+            case GGML_OP_MUL_MAT: {
                 ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], n_ctx, w->ne[2], w->ne[3]);
                 op_tensor = ggml_mul_mat(ctx, w, b);
 
@@ -1466,6 +1467,18 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor *
                 w->buffer = nullptr;
                 break;
             }
+            case GGML_OP_GET_ROWS: {
+                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
+                op_tensor = ggml_get_rows(ctx, w, b);
+
+                // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
+                GGML_ASSERT(w->buffer == nullptr);
+                w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
+                op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
+                ggml_backend_buffer_free(w->buffer);
+                w->buffer = nullptr;
+                break;
+            }
             default: {
                 op_supported = false;
                 break;

From 2705c086b4e5b1681e86061b1f2e97591ca9a5c3 Mon Sep 17 00:00:00 2001
From: Swetha B S <swetha@multicorewareinc.com>
Date: Fri, 6 Jun 2025 03:46:05 -0700
Subject: [PATCH 07/11] Resolve PR comments

---
 src/whisper.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/whisper.cpp b/src/whisper.cpp
index 00d3f14f05c..46bc934ed9a 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -1450,7 +1450,7 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor *
         ggml_context * ctx = ctx_ptr.get();
 
         ggml_tensor * op_tensor = nullptr;
-        
+
         int64_t n_ctx = hparams.n_audio_ctx;
 
         switch (op) {

From 70cf05ae71fbabb1f943e13e4d27f41f890197f7 Mon Sep 17 00:00:00 2001
From: Swetha B S <swetha@multicorewareinc.com>
Date: Wed, 18 Jun 2025 05:33:50 -0700
Subject: [PATCH 08/11] Add the Get_Rows & Dequantize implementation adapted to
 work for repacked weights of type q4_K

---
 ggml/src/ggml-cpu/repack.cpp | 130 ++++++++++++++++++++++++++++++++++-
 src/whisper.cpp              |   2 +-
 2 files changed, 130 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
index ae1fe2336f4..e06071bee8f 100644
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -1404,6 +1404,9 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
             case GGML_TYPE_Q4_0: {
                 ggml_compute_forward_get_rows_q4_0x8(params, dst);
             } break;
+            case GGML_TYPE_Q4_K: {
+                ggml_compute_forward_get_rows_q4_Kx8(params, dst);
+            } break;
             default:
                 GGML_ABORT("fatal error");
                 break;
@@ -1522,6 +1525,131 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
         }
     }
 
+    static void ggml_compute_forward_get_rows_q4_Kx8(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+        const ggml_tensor * src0 = dst->src[0];  // source rows, read below as block_q4_Kx8 (8 rows interleaved)
+        const ggml_tensor * src1 = dst->src[1];  // row indices to gather, read below as int32_t
+
+        GGML_TENSOR_BINARY_OP_LOCALS  // provides the ne*/nb* shape and stride locals used below
+        const int64_t nc = ne00;  // elements per logical row
+        const int64_t nr = ggml_nelements(src1);  // one output row per index in src1
+
+        assert(ne0 == nc);
+        assert(ne02 == ne11);
+        assert(nb00 == ggml_type_size(src0->type));
+        assert(ggml_nrows(dst) == nr);
+
+        const int ith = params->ith;
+        const int nth = params->nth;
+
+        // rows per thread (ceiling division so every row is assigned)
+        const int dr = (nr + nth - 1) / nth;
+
+        // row range [ir0, ir1) handled by this thread; empty when ir0 >= nr
+        const int ir0 = dr * ith;
+        const int ir1 = MIN(ir0 + dr, nr);
+
+        constexpr int nrows_interleaved = 8;  // logical rows sharing one block_q4_Kx8
+        const size_t sizeof_one_repacked_block = sizeof(block_q4_Kx8);
+
+        const int num_repacked_blocks_per_row_width = nc / QK_K;  // blocks needed to span one row
+
+        const size_t stride_between_actual_row_groups = num_repacked_blocks_per_row_width * sizeof_one_repacked_block;  // bytes per group of 8 rows
+
+        for (int64_t i = ir0; i < ir1; ++i) {
+            const int64_t i12 = i / (ne11 * ne10);  // unflatten i into src1 coordinates (i10, i11, i12)
+            const int64_t i11 = (i - i12 * ne11 * ne10) / ne10;
+            const int64_t i10 = (i - i12 * ne11 * ne10 - i11 * ne10);
+            const int64_t i01 = *(int32_t *)((char *)src1->data + i10 * nb10 + i11 * nb11 + i12 * nb12);  // original logical row
+
+            GGML_ASSERT(i01 >= 0 && i01 < ne01);
+
+            int row_group_idx = i01 / nrows_interleaved;  // which group of 8 rows holds i01
+            const int row_idx_in_group = i01 % nrows_interleaved;  // position of i01 inside that group
+
+            const char * base_ptr_for_higher_dims_in_src0 = (const char *)src0->data + i11 * nb02 + i12 * nb03;
+
+            // Pointer to the first block_q4_Kx8 of the identified row_group_idx
+            const block_q4_Kx8 * p_first_repacked_block_of_group_x8 = (const block_q4_Kx8 *)(base_ptr_for_higher_dims_in_src0 + row_group_idx * stride_between_actual_row_groups);
+
+            dequantize_row_q4_Kx8(
+                p_first_repacked_block_of_group_x8,
+                (float *)((char *)dst->data + i10 * nb1 + i11 * nb2 + i12 * nb3), nc, row_idx_in_group);  // writes nc floats into the dst row
+        }
+    }
+
+    /**
+     * Dequantizes one logical row out of an 8-row interleaved group stored as 'block_q4_Kx8' blocks.
+     *
+     * @param p_repacked_blocks Pointer to the first of the k / QK_K consecutive 'block_q4_Kx8' structures of the row group.
+     * @param y                 Output buffer; receives k dequantized float values.
+     * @param k                 Total number of elements (columns) in the logical row; must be a multiple of QK_K.
+     * @param row_idx_in_group  The index (0-7) of the logical row to extract from the interleaved data.
+     */
+
+    static void dequantize_row_q4_Kx8(
+        const void * GGML_RESTRICT p_repacked_blocks,
+        float * GGML_RESTRICT y,
+        int64_t k,
+        int row_idx_in_group) {
+        constexpr int nrows_interleaved = 8;
+        assert(k % QK_K == 0);
+        assert(row_idx_in_group >= 0 && row_idx_in_group < nrows_interleaved);
+
+        const int nb = k / QK_K;
+        const block_q4_Kx8 * blocks = (const block_q4_Kx8 *)p_repacked_blocks;
+
+        for (int i = 0; i < nb; i++) {
+            const block_q4_Kx8 * current_block = &blocks[i];
+
+            const float d_super_block = GGML_FP16_TO_FP32(current_block->d[row_idx_in_group]);  // this row's multiplier for the 6-bit scales
+            const float dmin_super_block = GGML_FP16_TO_FP32(current_block->dmin[row_idx_in_group]);  // this row's multiplier for the 6-bit mins
+
+            const uint8_t * ptr_qs_base = current_block->qs;
+            const uint8_t * ptr_repacked_scales = (const uint8_t *)current_block->scales;
+            int is = 0, chunk_group_start_idx = 0;  // is: scale-set index (2 per iteration); chunk_group_start_idx: first qs chunk (4 per iteration)
+            for (int j = 0; j < QK_K; j += 64) {  // each iteration emits 64 floats: 32 from low nibbles, then 32 from high nibbles
+
+                uint8_t sc1, m1_val, sc2, m2_val;
+                const uint8_t *scales_repacked_data;
+                
+                scales_repacked_data = &ptr_repacked_scales[(is + 0) * 12];  // scale sets are 12 bytes apart; the row index selects the entry
+                get_scale_min_k4(row_idx_in_group, scales_repacked_data, &sc1, &m1_val);
+            
+                scales_repacked_data = &ptr_repacked_scales[(is + 1) * 12];
+                get_scale_min_k4(row_idx_in_group, scales_repacked_data, &sc2, &m2_val);
+                
+                const float d1 = d_super_block * sc1;  // (d1, m1) apply to the low-nibble values
+                const float m1 = dmin_super_block * m1_val;
+                const float d2 = d_super_block * sc2;  // (d2, m2) apply to the high-nibble values
+                const float m2 = dmin_super_block * m2_val;
+
+                for (int idx = 0; idx < 4; idx++) {
+                    const uint8_t * ptr_qs_chunk = ptr_qs_base + ((chunk_group_start_idx + idx) * 64) + row_idx_in_group * 8;  // chunks are 64 bytes apart; this row owns 8 bytes of each
+                    for (int l = 0; l < 8; ++l) *y++ = d1 * (ptr_qs_chunk[l] & 0xF) - m1; // low nibbles: 8 values per chunk, 32 over the 4 chunks
+                }
+ 
+                for (int idx = 0; idx < 4; idx++) {
+                    const uint8_t * ptr_qs_chunk = ptr_qs_base + ((chunk_group_start_idx + idx) * 64) + row_idx_in_group * 8;  // same 4 chunks as the loop above
+                    for (int l = 0; l < 8; ++l) *y++ = d2 * (ptr_qs_chunk[l] >> 4) - m2; // high nibbles: 8 values per chunk, 32 over the 4 chunks
+                }
+                is += 2;
+                chunk_group_start_idx += 4;
+            }
+        }
+    }
+
+    static inline void get_scale_min_k4(int j, const uint8_t *GGML_RESTRICT s, uint8_t *GGML_RESTRICT d, uint8_t *GGML_RESTRICT m) {  // unpack the j-th 6-bit (scale, min) pair from s into *d and *m
+        if (j < 4) {
+            *d = s[j] & 63;  // scale: low 6 bits of s[j]
+            *m = s[j + 4] & 63;  // min: low 6 bits of s[j + 4]
+        } else {
+            *d = (s[j + 4] & 0xF) | ((s[j - 4] >> 6) << 4);  // scale: low nibble of s[j + 4], plus top 2 bits of s[j - 4] as bits 4-5
+            *m = (s[j + 4] >> 4) | ((s[j - 0] >> 6) << 4);  // min: high nibble of s[j + 4], plus top 2 bits of s[j] as bits 4-5
+        }
+    }
+
     int repack(struct ggml_tensor * t, const void * data, size_t data_size) override {
         GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type),
                        (int) NB_COLS, (int) INTER_SIZE);
@@ -1662,7 +1790,7 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
             if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
                 return false;
             }
-            if (op->src[0]->type == GGML_TYPE_Q4_0) {
+            if (op->src[0]->type == GGML_TYPE_Q4_0 || op->src[0]->type == GGML_TYPE_Q4_K) {
                 return true;
             }
         }
diff --git a/src/whisper.cpp b/src/whisper.cpp
index 46bc934ed9a..c8db6fb850a 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -1454,7 +1454,7 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor *
         int64_t n_ctx = hparams.n_audio_ctx;
 
         switch (op) {
-            // The current extra_buffer_type implementations only support GGML_OP_MUL_MAT & GGML_OP_GET_ROWS (q4_0)
+            // The current extra_buffer_type implementations only support GGML_OP_MUL_MAT & GGML_OP_GET_ROWS (repacked - q4_0, q4_K)
             case GGML_OP_MUL_MAT: {
                 ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], n_ctx, w->ne[2], w->ne[3]);
                 op_tensor = ggml_mul_mat(ctx, w, b);

From 1446e6468f2eb1cadea231725b3ec43c941b4bc5 Mon Sep 17 00:00:00 2001
From: Swetha B S <swetha@multicorewareinc.com>
Date: Thu, 19 Jun 2025 23:25:51 -0700
Subject: [PATCH 09/11] Remove q4_0 code implementation for get_rows &
 dequantize

---
 ggml/src/ggml-cpu/repack.cpp | 117 +----------------------------------
 src/whisper.cpp              |   2 +-
 2 files changed, 2 insertions(+), 117 deletions(-)

diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
index e06071bee8f..6bb57a9fb12 100644
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -1401,9 +1401,6 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
         const ggml_tensor * src0 = dst->src[0];
 
         switch (src0->type) {
-            case GGML_TYPE_Q4_0: {
-                ggml_compute_forward_get_rows_q4_0x8(params, dst);
-            } break;
             case GGML_TYPE_Q4_K: {
                 ggml_compute_forward_get_rows_q4_Kx8(params, dst);
             } break;
@@ -1413,118 +1410,6 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
         }
     }
 
-    static void ggml_compute_forward_get_rows_q4_0x8(
-        const ggml_compute_params * params,
-        ggml_tensor * dst) {
-        const ggml_tensor * src0 = dst->src[0];
-        const ggml_tensor * src1 = dst->src[1];
-
-        GGML_TENSOR_BINARY_OP_LOCALS
-
-        const int64_t nc = ne00;
-        const int64_t nr = ggml_nelements(src1);
-
-        assert(ne0 == nc);
-        assert(ne02 == ne11);
-        assert(nb00 == ggml_type_size(src0->type));
-        assert(ggml_nrows(dst) == nr);
-
-        const int ith = params->ith;
-        const int nth = params->nth;
-
-        // rows per thread
-        const int dr = (nr + nth - 1) / nth;
-
-        // row range for this thread
-        const int ir0 = dr * ith;
-        const int ir1 = MIN(ir0 + dr, nr);
-
-        constexpr int nrows_interleaved = 8;
-        const size_t sizeof_one_repacked_block = sizeof(block_q4_0x8);
-
-        const int num_repacked_blocks_per_row_width = nc / QK4_0;
-
-        const size_t stride_between_actual_row_groups = num_repacked_blocks_per_row_width * sizeof_one_repacked_block;
-
-        for (int64_t i = ir0; i < ir1; ++i) {
-            const int64_t i12 = i / (ne11 * ne10);
-            const int64_t i11 = (i - i12 * ne11 * ne10) / ne10;
-            const int64_t i10 = (i - i12 * ne11 * ne10 - i11 * ne10);
-            const int64_t i01 = *(int32_t *)((char *)src1->data + i10 * nb10 + i11 * nb11 + i12 * nb12);  // original logical row
-
-            GGML_ASSERT(i01 >= 0 && i01 < ne01);
-
-            int row_group_idx = i01 / nrows_interleaved;
-            const int row_idx_in_group = i01 % nrows_interleaved;
-
-            const char * base_ptr_for_higher_dims_in_src0 = (const char *)src0->data + i11 * nb02 + i12 * nb03;
-
-            // Pointer to the first block_q4_0x8 of the identified row_group_idx
-            const block_q4_0x8 * p_first_repacked_block_of_group_x8 = (const block_q4_0x8 *)(base_ptr_for_higher_dims_in_src0 + row_group_idx * stride_between_actual_row_groups);
-
-            dequantize_row_q4_0x8(
-                p_first_repacked_block_of_group_x8,
-                (float *)((char *)dst->data + i10 * nb1 + i11 * nb2 + i12 * nb3), nc, row_idx_in_group);
-        }
-    }
-
-    /**
-     * Dequantizes a single logical row from data repacked with quant interleaving.
-     *
-     * @param p_repacked_group_column_blocks Pointer to the start of 'block_q4_0x8' for the row group.
-     * @param y                              Output buffer for the dequantized float values.
-     * @param k                              Total number of elements (columns) in the logical row.
-     * @param row_idx_in_group               Index (0-7) of the logical row to dequantize.
-     */
-    static void dequantize_row_q4_0x8(
-        const block_q4_0x8 * GGML_RESTRICT p_repacked_group_column_blocks,
-        float * GGML_RESTRICT y,
-        int64_t k,
-        int row_idx_in_group) {
-        const int GGML_Q4_0_X8_INTERLEAVE_SIZE = 8;
-        assert(k % QK4_0 == 0);
-        assert(row_idx_in_group >= 0 && row_idx_in_group < GGML_Q4_0_X8_INTERLEAVE_SIZE);
-
-        const int nb = k / QK4_0;
-        const int bytes_for_half_elements = (QK4_0 / 2) / 2;
-
-        const int offset_to_second_half_data = bytes_for_half_elements * GGML_Q4_0_X8_INTERLEAVE_SIZE;
-        const uint64_t xor_mask = 0x8888888888888888ULL;
-        const int qk4_0_half_elements = QK4_0 / 2;
-
-        for (int i = 0; i < nb; ++i) {
-            const block_q4_0x8 * current_column_repacked_block = &p_repacked_group_column_blocks[i];
-            const float d_val = GGML_FP16_TO_FP32(current_column_repacked_block->d[row_idx_in_group]);
-            float * y_curr = y + i * QK4_0;
-
-            const int8_t * qs_first_half_repacked_ptr = &(current_column_repacked_block->qs[row_idx_in_group * bytes_for_half_elements]);
-
-            uint64_t first_half_chunk_u64;
-            memcpy(&first_half_chunk_u64, qs_first_half_repacked_ptr, sizeof(uint64_t));
-            first_half_chunk_u64 ^= xor_mask;  // Reverse the XOR
-            const uint8_t * original_qs_first_half_bytes = (const uint8_t *)&first_half_chunk_u64;
-
-            const int8_t * qs_second_half_repacked_ptr = &(current_column_repacked_block->qs[offset_to_second_half_data + (row_idx_in_group * bytes_for_half_elements)]);
-
-            uint64_t second_half_chunk_u64;
-            memcpy(&second_half_chunk_u64, qs_second_half_repacked_ptr, sizeof(uint64_t));
-            second_half_chunk_u64 ^= xor_mask;  // Reverse the XOR
-            const uint8_t * original_qs_second_half_bytes = (const uint8_t *)&second_half_chunk_u64;
-
-            // dequantizing all QK4_0's for this block.
-            for (int j = 0; j < bytes_for_half_elements; ++j) {
-                const uint8_t quant_byte_first = original_qs_first_half_bytes[j];
-                y_curr[j] = ((quant_byte_first & 0x0F) - 8) * d_val;
-                y_curr[j + qk4_0_half_elements] = ((quant_byte_first >> 4) - 8) * d_val;
-
-                const uint8_t quant_byte_second = original_qs_second_half_bytes[j];
-                const int out_idx_base_second_half = j + bytes_for_half_elements;  // Offset for the second set of low nibbles
-                y_curr[out_idx_base_second_half] = ((quant_byte_second & 0x0F) - 8) * d_val;
-                y_curr[out_idx_base_second_half + qk4_0_half_elements] = ((quant_byte_second >> 4) - 8) * d_val;
-            }
-        }
-    }
-
     static void ggml_compute_forward_get_rows_q4_Kx8(
         const ggml_compute_params * params,
         ggml_tensor * dst) {
@@ -1790,7 +1675,7 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
             if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
                 return false;
             }
-            if (op->src[0]->type == GGML_TYPE_Q4_0 || op->src[0]->type == GGML_TYPE_Q4_K) {
+            if (op->src[0]->type == GGML_TYPE_Q4_K) {
                 return true;
             }
         }
diff --git a/src/whisper.cpp b/src/whisper.cpp
index c8db6fb850a..6a8d86059e4 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -1454,7 +1454,7 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor *
         int64_t n_ctx = hparams.n_audio_ctx;
 
         switch (op) {
-            // The current extra_buffer_type implementations only support GGML_OP_MUL_MAT & GGML_OP_GET_ROWS (repacked - q4_0, q4_K)
+            // The current extra_buffer_type implementations only support GGML_OP_MUL_MAT and GGML_OP_GET_ROWS (repacked q4_K only)
             case GGML_OP_MUL_MAT: {
                 ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], n_ctx, w->ne[2], w->ne[3]);
                 op_tensor = ggml_mul_mat(ctx, w, b);

From 099aa24134364233dc12796ed4b7a63e602a7afa Mon Sep 17 00:00:00 2001
From: Swetha B S <swetha@multicorewareinc.com>
Date: Thu, 19 Jun 2025 23:27:38 -0700
Subject: [PATCH 10/11] Fix warning for assert-only variable nrows_interleaved

---
 ggml/src/ggml-cpu/repack.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
index 6bb57a9fb12..fc30e6a9944 100644
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -1478,9 +1478,9 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
         float * GGML_RESTRICT y,
         int64_t k,
         int row_idx_in_group) {
-        constexpr int nrows_interleaved = 8;
+
         assert(k % QK_K == 0);
-        assert(row_idx_in_group >= 0 && row_idx_in_group < nrows_interleaved);
+        assert(row_idx_in_group >= 0 && row_idx_in_group < 8);
 
         const int nb = k / QK_K;
         const block_q4_Kx8 * blocks = (const block_q4_Kx8 *)p_repacked_blocks;

From 37303ab6e2c2663b8ede24e7491f599db4c985d5 Mon Sep 17 00:00:00 2001
From: Swetha B S <swetha@multicorewareinc.com>
Date: Thu, 26 Jun 2025 06:52:29 -0700
Subject: [PATCH 11/11] Resolve minor PR comments

---
 ggml/src/ggml-cpu/repack.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
index fc30e6a9944..5737300640a 100644
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -1401,9 +1401,9 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
         const ggml_tensor * src0 = dst->src[0];
 
         switch (src0->type) {
-            case GGML_TYPE_Q4_K: {
+            case GGML_TYPE_Q4_K:
                 ggml_compute_forward_get_rows_q4_Kx8(params, dst);
-            } break;
+                break;
             default:
                 GGML_ABORT("fatal error");
                 break;
@@ -1450,7 +1450,7 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
 
             GGML_ASSERT(i01 >= 0 && i01 < ne01);
 
-            int row_group_idx = i01 / nrows_interleaved;
+            const int row_group_idx = i01 / nrows_interleaved;
             const int row_idx_in_group = i01 % nrows_interleaved;
 
             const char * base_ptr_for_higher_dims_in_src0 = (const char *)src0->data + i11 * nb02 + i12 * nb03;