From 3c50e46ee471e5f2abeee639d277d62810680c99 Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Mon, 9 Oct 2023 12:17:54 -0400 Subject: [PATCH 01/26] added conv2d stage 0 - 1 cuda kernels --- src/ggml-cuda.cu | 122 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 5bd83bb5c..1d878ab49 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -4585,6 +4585,28 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale dst[i] = scale * x[i]; } +static __global__ void gemm_f16_f32(const half *x,const half *y, float *dst, int N, int M, int K) { + int row = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * blockDim.x + threadIdx.x; + if (row < N && col < K) { + float sum = 0.0f; + for (int i = 0; i < M; ++i) { + sum += __half2float(x[row * M + i]) * __half2float(y[col * M + i]); + } + dst[row * K + col] = sum; + } +} + +static __global__ void im2col_f32_f16(const float* x, half* dst, int nb12, int nb13, int IW,int IH,int CHW,int s0,int s1,int p0,int p1,int d0,int d1) { + int iiw = blockIdx.z * s0 + threadIdx.z * d0 - p0; + int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1; + __syncthreads(); + if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { + int offset_dst = (threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW; + int offset_src = threadIdx.x * nb13 + blockIdx.x * nb12; + dst[offset_dst + (blockIdx.x * (blockDim.y * blockDim.z) + threadIdx.y * blockDim.z + threadIdx.z)] = __float2half(x[offset_src + iih * IW + iiw]); + } +} template static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) { @@ -5534,6 +5556,35 @@ static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, c soft_max_f32<<>>(x, dst, ncols_x); } +static void im2col_f32_f16_cuda(const float* x, half* dst, + int OC, int OH, + int IW, int IH, + int OW, int IC, + int KH, int KW, int N, + int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) { + + int nb11 = IW; + int nb12 = nb11 * IH; + int nb13 = nb12 * IC; + + int CHW = IC * KH * KW; + dim3 block_nums(IC, OH, OW); + dim3 block_dims(N, KH, KW); + im2col_f32_f16<<>>(x, dst, nb12, nb13, IW, IH, CHW, s0, s1, p0, p1, d0, d1); +} + +// GEMM +static void gemm_f16_f32_cuda(const half* x,const half* y, float* dst, int OC, int OH, int OW,int IC, int KH, int KW, int N, cudaStream_t stream) { + int m = OC; + int n = OH * OW; + int k = IC * KH * KW; + for(int i = 0; i < N; i++) { + dim3 block_dims(16, 16); + dim3 block_nums((n + block_dims.x - 1) / block_dims.x, (m + block_dims.y - 1) / block_dims.y); + gemm_f16_f32<<>>(x, y + i * m * k, dst + i * m * n, m, k, n); + } +} + // buffer pool for cuda #define MAX_CUDA_BUFFERS 256 @@ -6438,6 +6489,63 @@ inline void ggml_cuda_op_alibi( (void) src1_dd; } +inline void ggml_cuda_op_conv2d_stage_0( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F16); + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; + const 
int32_t d1 = ((const int32_t*)(dst->op_params))[5]; + + const int64_t N = src1->ne[3]; + const int64_t IC = src1->ne[2]; + const int64_t IH = src1->ne[1]; + const int64_t IW = src1->ne[0]; + + const int64_t OC = src0->ne[3]; + // const int64_t IC = ne02; + const int64_t KH = src0->ne[1]; + const int64_t KW = src0->ne[0]; + + const int64_t OH = dst->ne[2]; + const int64_t OW = dst->ne[1]; + + im2col_f32_f16_cuda(src1_dd, (half*) dst_dd, + OC, OH, IW, IH, OW, IC, KH, KW, N, s0, s1, p0, p1, d0, d1, main_stream); + + (void) src0; + (void) src0_dd; +} + +inline void ggml_cuda_op_conv2d_stage_1( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F16); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int N = src1->ne[3]; + const int OH = src1->ne[2]; + const int OW = src1->ne[1]; + + const int OC = src0->ne[3]; + const int IC = src0->ne[2]; + const int KH = src0->ne[1]; + const int KW = src0->ne[0]; + + gemm_f16_f32_cuda( + (const half*)src0_dd, (const half*)src1_dd, + dst_dd, OC, OH, OW, IC, KH, KW, N, main_stream); +} + inline void ggml_cuda_op_diag_mask_inf( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { @@ -7133,6 +7241,14 @@ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi); } +void ggml_cuda_conv2d_stage_0(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_conv2d_stage_0); +} + +void ggml_cuda_conv2d_stage_1(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_conv2d_stage_1); +} + static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { (void) src0; (void) src1; @@ -7494,6 +7610,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ case GGML_OP_ALIBI: func = ggml_cuda_alibi; break; + case GGML_OP_CONV_2D_STAGE_0: + func = ggml_cuda_conv2d_stage_0; + break; + case GGML_OP_CONV_2D_STAGE_1: + func = ggml_cuda_conv2d_stage_1; + break; default: return false; } From 27b3ab3d70336fda302a00bbd0da346e6b8a3f77 Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Mon, 9 Oct 2023 14:14:06 -0400 Subject: [PATCH 02/26] add im2col + refactor conv1d and conv2d --- include/ggml/ggml.h | 17 +- src/ggml-cuda.cu | 21 +- src/ggml.c | 692 +++++++++++++++----------------------------- 3 files changed, 252 insertions(+), 478 deletions(-) diff --git a/include/ggml/ggml.h b/include/ggml/ggml.h index 4b16032f0..d187620e1 100644 --- a/include/ggml/ggml.h +++ b/include/ggml/ggml.h @@ -400,12 +400,9 @@ extern "C" { GGML_OP_ALIBI, GGML_OP_CLAMP, GGML_OP_CONV_1D, - GGML_OP_CONV_1D_STAGE_0, // internal - GGML_OP_CONV_1D_STAGE_1, // internal GGML_OP_CONV_TRANSPOSE_1D, GGML_OP_CONV_2D, - GGML_OP_CONV_2D_STAGE_0, // internal - GGML_OP_CONV_2D_STAGE_1, // internal + GGML_OP_IM2COL, GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_POOL_1D, GGML_OP_POOL_2D, @@ -1376,6 +1373,18 @@ extern "C" { float min, float max); + GGML_API struct ggml_tensor * ggml_im2col( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1, + bool is_2D); + 
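+ // im2col + ggml_mul_mat is how ggml_conv_1d and ggml_conv_2d are implemented
+ // in this patch. A minimal usage sketch, assuming a context `ctx`, an F16
+ // kernel `a` of shape [OC, IC, KH, KW] and an F32 image `b` of shape
+ // [N, IC, IH, IW]:
+ //
+ //   struct ggml_tensor * col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true);
+ //   // col: [N, OH, OW, IC*KH*KW]
+ //   struct ggml_tensor * out = ggml_mul_mat(ctx, a, col);
+ //   // out: [N, OC, OH, OW] = [OC, IC*KH*KW] x [N*OH*OW, IC*KH*KW]
+ //
+ // for the 1-D case, pass is_2D = false and zero for s1, p1 and d1, as
+ // ggml_conv_1d does below
+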
GGML_API struct ggml_tensor * ggml_conv_1d( struct ggml_context * ctx, struct ggml_tensor * a, diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 1d878ab49..f8e9fe220 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -6489,13 +6489,13 @@ inline void ggml_cuda_op_alibi( (void) src1_dd; } -inline void ggml_cuda_op_conv2d_stage_0( +inline void ggml_cuda_op_im2col( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { - GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F16); + GGML_ASSERT( dst->type == GGML_TYPE_F32); const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; @@ -7241,12 +7241,8 @@ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi); } -void ggml_cuda_conv2d_stage_0(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_conv2d_stage_0); -} - -void ggml_cuda_conv2d_stage_1(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_conv2d_stage_1); +void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col); } static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -7610,11 +7606,8 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ case GGML_OP_ALIBI: func = ggml_cuda_alibi; break; - case GGML_OP_CONV_2D_STAGE_0: - func = ggml_cuda_conv2d_stage_0; - break; - case GGML_OP_CONV_2D_STAGE_1: - func = ggml_cuda_conv2d_stage_1; + case GGML_OP_IM2COL: + func = ggml_cuda_im2col; break; default: return false; diff --git a/src/ggml.c b/src/ggml.c index 2d3c7b801..fd9165efe 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -4055,12 +4055,9 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "ALIBI", "CLAMP", "CONV_1D", - "CONV_1D_STAGE_0", - "CONV_1D_STAGE_1", "CONV_TRANSPOSE_1D", "CONV_2D", - "CONV_2D_STAGE_0", - "CONV_2D_STAGE_1", + "IM2COL", "CONV_TRANSPOSE_2D", "POOL_1D", "POOL_2D", @@ -4091,7 +4088,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73"); +static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -4142,12 +4139,9 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "alibi(x)", "clamp(x)", "conv_1d(x)", - "conv_1d_stage_0(x)", - "conv_1d_stage_1(x)", "conv_transpose_1d(x)", "conv_2d(x)", - "conv_2d_stage_0(x)", - "conv_2d_stage_1(x)", + "im2col(x)", "conv_transpose_2d(x)", "pool_1d(x)", "pool_2d(x)", @@ -4178,7 +4172,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73"); +static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 72"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -4207,12 +4201,9 @@ static void ggml_setup_op_has_task_pass(void) { p[GGML_OP_DIAG_MASK_INF ] = true; p[GGML_OP_DIAG_MASK_ZERO ] = true; p[GGML_OP_CONV_1D ] = true; - p[GGML_OP_CONV_1D_STAGE_0 ] = true; - p[GGML_OP_CONV_1D_STAGE_1 ] = true; 
p[GGML_OP_CONV_TRANSPOSE_1D ] = true; p[GGML_OP_CONV_2D ] = true; - p[GGML_OP_CONV_2D_STAGE_0 ] = true; - p[GGML_OP_CONV_2D_STAGE_1 ] = true; + p[GGML_OP_IM2COL ] = true; p[GGML_OP_CONV_TRANSPOSE_2D ] = true; p[GGML_OP_FLASH_ATTN_BACK ] = true; p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; @@ -7487,80 +7478,6 @@ static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, return (ins + 2 * p - d * (ks - 1) - 1) / s + 1; } -// im2col: [N, IC, IL] => [N, OL, IC*K] -// a: [OC,IC, K] -// b: [N, IC, IL] -// result: [N, OL, IC*K] -static struct ggml_tensor * ggml_conv_1d_stage_0( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int s0, - int p0, - int d0) { - GGML_ASSERT(a->ne[1] == b->ne[1]); - bool is_node = false; - - if (a->grad || b->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - const int64_t OL = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); - - const int64_t ne[4] = { - a->ne[1] * a->ne[0], - OL, - b->ne[2], - 1, - }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne); - - int32_t params[] = { s0, p0, d0 }; - ggml_set_op_params(result, params, sizeof(params)); - - result->op = GGML_OP_CONV_1D_STAGE_0; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = a; - result->src[1] = b; - - return result; -} - -// ggml_conv_1d_stage_1 - -// gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] -// a: [OC, IC, K] -// b: [N, OL, IC * K] -// result: [N, OC, OL] -static struct ggml_tensor * ggml_conv_1d_stage_1( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - - bool is_node = false; - - if (a->grad || b->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - const int64_t ne[4] = { - b->ne[1], - a->ne[2], - b->ne[2], - 1, - }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - - result->op = GGML_OP_CONV_1D_STAGE_1; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = a; - result->src[1] = b; - - return result; -} - // ggml_conv_1d GGML_API struct ggml_tensor * ggml_conv_1d( @@ -7570,8 +7487,8 @@ GGML_API struct ggml_tensor * ggml_conv_1d( int s0, int p0, int d0) { - struct ggml_tensor * result = ggml_conv_1d_stage_0(ctx, a, b, s0, p0, d0); - result = ggml_conv_1d_stage_1(ctx, a, result); + struct ggml_tensor * result = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); + result = ggml_mul_mat(ctx, a, result); return result; } @@ -7669,7 +7586,7 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d( // a: [OC,IC, KH, KW] // b: [N, IC, IH, IW] // result: [N, OH, OW, IC*KH*KW] -static struct ggml_tensor * ggml_conv_2d_stage_0( +static struct ggml_tensor * ggml_im2col( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, @@ -7678,9 +7595,14 @@ static struct ggml_tensor * ggml_conv_2d_stage_0( int p0, int p1, int d0, - int d1) { + int d1, + bool is_2D) { - GGML_ASSERT(a->ne[2] == b->ne[2]); + if(is_2D) { + GGML_ASSERT(a->ne[2] == b->ne[2]); + } else { + GGML_ASSERT(a->ne[1] == b->ne[1]); + } bool is_node = false; if (a->grad || b->grad) { @@ -7692,50 +7614,18 @@ static struct ggml_tensor * ggml_conv_2d_stage_0( const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); const int64_t ne[4] = { - a->ne[2] * a->ne[1] * a->ne[0], - OW, - OH, - b->ne[3], + is_2D ? a->ne[2] * a->ne[1] * a->ne[0] : a->ne[1] * a->ne[0], + is_2D ? OW : OH, + is_2D ? OH : b->ne[2], + is_2D ? 
b->ne[3] : 1, }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne); - int32_t params[] = { s0, s1, p0, p1, d0, d1 }; + int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_CONV_2D_STAGE_0; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = a; - result->src[1] = b; - - return result; - -} - -// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] -// a: [OC, IC, KH, KW] -// b: [N, OH, OW, IC * KH * KW] -// result: [N, OC, OH, OW] -static struct ggml_tensor * ggml_conv_2d_stage_1( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - - bool is_node = false; - - if (a->grad || b->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - const int64_t ne[4] = { - b->ne[1], - b->ne[2], - a->ne[3], - b->ne[3], - }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - - result->op = GGML_OP_CONV_2D_STAGE_1; + result->op = GGML_OP_IM2COL; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; @@ -7758,8 +7648,8 @@ struct ggml_tensor * ggml_conv_2d( int d0, int d1) { - struct ggml_tensor * result = ggml_conv_2d_stage_0(ctx, a, b, s0, s1, p0, p1, d0, d1); // [N, OH, OW, IC * KH * KW] - result = ggml_conv_2d_stage_1(ctx, a, result); + struct ggml_tensor * result = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW] + result = ggml_mul_mat(ctx, a, result); return result; @@ -11801,7 +11691,9 @@ static bool ggml_compute_forward_mul_mat_use_blas( } #endif -static void ggml_compute_forward_mul_mat( +// legacy multiplication matrix just float 32 data type + +static void ggml_compute_forward_mul_mat_f32_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -12012,6 +11904,123 @@ static void ggml_compute_forward_mul_mat( } } + +// GEMM +// TODO: compare gemm op with the current implementation of mul_mat + +static void ggml_compute_forward_mul_mat_f16_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb0 == sizeof(float)); + + const int N = ne13; + const int OH = ne12; + const int OW = ne11; + + const int OC = ne03; + const int IC = ne02; + const int KH = ne01; + const int KW = ne00; + + const int ith = params->ith; + const int nth = params->nth; + + int64_t m = OC; + int64_t n = OH * OW; + int64_t k = IC * KH * KW; + + // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] + for (int i = 0; i < N; i++) { + ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k] + ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k] + float * C = (float *)dst->data + i * m * n; // [m, n] + // does not seem to make a difference + int64_t m0, m1, n0, n1; + // patches per thread + if (m > n) { + n0 = 0; + n1 = n; + + // total patches in dst + const int np = m; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + m0 = dp*ith; + m1 = MIN(m0 + dp, np); + } else { + m0 = 0; + m1 = m; + + // 
total patches in dst + const int np = n; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + n0 = dp*ith; + n1 = MIN(n0 + dp, np); + } + + // block-tiling attempt + int64_t blck_n = 16; + int64_t blck_m = 16; + + for (int j = n0; j < n1; j+=blck_n) { + for (int i = m0; i < m1; i+=blck_m) { + // printf("i j k => %d %d %d\n", i, j, K); + for (int ii = i; ii < i + blck_m && ii < m1; ii++) { + for (int jj = j; jj < j + blck_n && jj < n1; jj++) { + ggml_vec_dot_f16(k, + C + ii*n + jj, + A + ii * k, + B + jj * k); + } + } + } + } + } +} + + +static void ggml_compute_forward_mul_mat( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT( + src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 || + src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + if(src0->type == GGML_TYPE_F32) { + // full precision + ggml_compute_forward_mul_mat_f32_f32(params, src0, src1, dst); + } else { + // reduce memory usage + ggml_compute_forward_mul_mat_f16_f32(params, src0, src1, dst); + } +} + // ggml_compute_forward_out_prod static void ggml_compute_forward_out_prod_f32( @@ -14013,143 +14022,7 @@ static void ggml_compute_forward_conv_1d_f32( } } -// TODO: reuse ggml_mul_mat or implement ggml_im2col and remove stage_0 and stage_1 -static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k, - ggml_fp16_t * A, - ggml_fp16_t * B, - float * C, - const int ith, const int nth) { - // does not seem to make a difference - int64_t m0, m1, n0, n1; - // patches per thread - if (m > n) { - n0 = 0; - n1 = n; - - // total patches in dst - const int np = m; - - // patches per thread - const int dp = (np + nth - 1)/nth; - - // patch range for this thread - m0 = dp*ith; - m1 = MIN(m0 + dp, np); - } else { - m0 = 0; - m1 = m; - - // total patches in dst - const int np = n; - - // patches per thread - const int dp = (np + nth - 1)/nth; - - // patch range for this thread - n0 = dp*ith; - n1 = MIN(n0 + dp, np); - } - - // block-tiling attempt - int64_t blck_n = 16; - int64_t blck_m = 16; - - // int64_t CACHE_SIZE = 2 * 1024 * 1024; // 2MB - // int64_t blck_size = CACHE_SIZE / (sizeof(float) + 2 * sizeof(ggml_fp16_t) * K); - // if (blck_size > 0) { - // blck_0 = 4; - // blck_1 = blck_size / blck_0; - // if (blck_1 < 0) { - // blck_1 = 1; - // } - // // blck_0 = (int64_t)sqrt(blck_size); - // // blck_1 = blck_0; - // } - // // printf("%zd %zd %zd %zd\n", blck_size, K, blck_0, blck_1); - - for (int j = n0; j < n1; j+=blck_n) { - for (int i = m0; i < m1; i+=blck_m) { - // printf("i j k => %d %d %d\n", i, j, K); - for (int ii = i; ii < i + blck_m && ii < m1; ii++) { - for (int jj = j; jj < j + blck_n && jj < n1; jj++) { - ggml_vec_dot_f16(k, - C + ii*n + jj, - A + ii * k, - B + jj * k); - } - } - } - } -} - -// src0: kernel [OC, IC, K] -// src1: signal [N, IC, IL] -// dst: result [N, OL, IC*K] -static void ggml_compute_forward_conv_1d_stage_0_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F16); - - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - GGML_TENSOR_BINARY_OP_LOCALS; - - const int64_t N = ne12; - const int64_t IC = ne11; - const int64_t IL = ne10; - - const int64_t K = ne00; - - const int64_t OL 
= ne1; - - const int ith = params->ith; - const int nth = params->nth; - - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; - const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; - - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb10 == sizeof(float)); - - if (params->type == GGML_TASK_INIT) { - memset(dst->data, 0, ggml_nbytes(dst)); - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - // im2col: [N, IC, IL] => [N, OL, IC*K] - { - ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data; - - for (int64_t in = 0; in < N; in++) { - for (int64_t iol = 0; iol < OL; iol++) { - for (int64_t iic = ith; iic < IC; iic+=nth) { - - // micro kernel - ggml_fp16_t * dst_data = wdata + (in*OL + iol)*(IC*K); // [IC, K] - const float * const src_data = (float *)((char *) src1->data + in*nb12 + iic*nb11); // [IL] - - for (int64_t ik = 0; ik < K; ik++) { - const int64_t iil = iol*s0 + ik*d0 - p0; - - if (!(iil < 0 || iil >= IL)) { - dst_data[iic*K + ik] = GGML_FP32_TO_FP16(src_data[iil]); - } - } - } - } - } - } -} +// TODO: GEMM conv1d differ GEMM conv2d // gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] // src0: [OC, IC, K] @@ -14201,7 +14074,7 @@ static void ggml_compute_forward_conv_1d_stage_1_f16( ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k] float * C = (float *)dst->data + i * m * n; // [m, n] - gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); + //gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); } } @@ -14226,40 +14099,6 @@ static void ggml_compute_forward_conv_1d( } } -static void ggml_compute_forward_conv_1d_stage_0( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - switch(src0->type) { - case GGML_TYPE_F16: - { - ggml_compute_forward_conv_1d_stage_0_f32(params, src0, src1, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -static void ggml_compute_forward_conv_1d_stage_1( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - switch(src0->type) { - case GGML_TYPE_F16: - { - ggml_compute_forward_conv_1d_stage_1_f16(params, src0, src1, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - // ggml_compute_forward_conv_transpose_1d static void ggml_compute_forward_conv_transpose_1d_f16_f32( @@ -14470,7 +14309,7 @@ static void ggml_compute_forward_conv_transpose_1d( // src0: kernel [OC, IC, KH, KW] // src1: image [N, IC, IH, IW] // dst: result [N, OH, OW, IC*KH*KW] -static void ggml_compute_forward_conv_2d_stage_0_f32( +static void ggml_compute_forward_im2col_f16( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -14484,61 +14323,64 @@ static void ggml_compute_forward_conv_2d_stage_0_f32( GGML_TENSOR_BINARY_OP_LOCALS; - const int64_t N = ne13; - const int64_t IC = ne12; - const int64_t IH = ne11; - const int64_t IW = ne10; - - // const int64_t OC = ne03; - // const int64_t IC = ne02; - const int64_t KH = ne01; - const int64_t KW = ne00; - - const int64_t OH = ne2; - const int64_t OW = ne1; - - const int ith = params->ith; - const int nth = params->nth; - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; const 
int32_t d0 = ((const int32_t*)(dst->op_params))[4]; const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; + const bool is_2D = ((const int32_t*)(dst->op_params))[5] == 1; - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb10 == sizeof(float)); + const int ith = params->ith; + const int nth = params->nth; - if (params->type == GGML_TASK_INIT) { - memset(dst->data, 0, ggml_nbytes(dst)); - return; - } + if(is_2D) { + const int64_t N = ne13; + const int64_t IC = ne12; + const int64_t IH = ne11; + const int64_t IW = ne10; - if (params->type == GGML_TASK_FINALIZE) { - return; - } + // const int64_t OC = ne03; + // const int64_t IC = ne02; + const int64_t KH = ne01; + const int64_t KW = ne00; - // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] - { - ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data; + const int64_t OH = ne2; + const int64_t OW = ne1; - for (int64_t in = 0; in < N; in++) { - for (int64_t ioh = 0; ioh < OH; ioh++) { - for (int64_t iow = 0; iow < OW; iow++) { - for (int64_t iic = ith; iic < IC; iic+=nth) { + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); - // micro kernel - ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] - const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW] + if (params->type == GGML_TASK_INIT) { + memset(dst->data, 0, ggml_nbytes(dst)); + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data; + + for (int64_t in = 0; in < N; in++) { + for (int64_t ioh = 0; ioh < OH; ioh++) { + for (int64_t iow = 0; iow < OW; iow++) { + for (int64_t iic = ith; iic < IC; iic+=nth) { - for (int64_t ikh = 0; ikh < KH; ikh++) { - for (int64_t ikw = 0; ikw < KW; ikw++) { - const int64_t iiw = iow*s0 + ikw*d0 - p0; - const int64_t iih = ioh*s1 + ikh*d1 - p1; + // micro kernel + ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] + const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW] - if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { - dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); + for (int64_t ikh = 0; ikh < KH; ikh++) { + for (int64_t ikw = 0; ikw < KW; ikw++) { + const int64_t iiw = iow*s0 + ikw*d0 - p0; + const int64_t iih = ioh*s1 + ikh*d1 - p1; + + if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); + } } } } @@ -14546,62 +14388,49 @@ static void ggml_compute_forward_conv_2d_stage_0_f32( } } } - } -} - -// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] -// src0: [OC, IC, KH, KW] -// src1: [N, OH, OW, IC * KH * KW] -// result: [N, OC, OH, OW] -static void ggml_compute_forward_conv_2d_stage_1_f16( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F16); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - if (params->type == GGML_TASK_INIT) { - return; - } + } else { + const int64_t N = ne12; + const int64_t IC = ne11; + const int64_t IL = ne10; - if (params->type == GGML_TASK_FINALIZE) { - return; - } + const int64_t K = ne00; - GGML_TENSOR_BINARY_OP_LOCALS; + const int64_t OL 
= ne1; + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb10 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb0 == sizeof(float)); + if (params->type == GGML_TASK_INIT) { + memset(dst->data, 0, ggml_nbytes(dst)); + return; + } - const int N = ne13; - const int OH = ne12; - const int OW = ne11; + if (params->type == GGML_TASK_FINALIZE) { + return; + } - const int OC = ne03; - const int IC = ne02; - const int KH = ne01; - const int KW = ne00; + // im2col: [N, IC, IL] => [N, OL, IC*K] + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data; - const int ith = params->ith; - const int nth = params->nth; + for (int64_t in = 0; in < N; in++) { + for (int64_t iol = 0; iol < OL; iol++) { + for (int64_t iic = ith; iic < IC; iic+=nth) { - int64_t m = OC; - int64_t n = OH * OW; - int64_t k = IC * KH * KW; + // micro kernel + ggml_fp16_t * dst_data = wdata + (in*OL + iol)*(IC*K); // [IC, K] + const float * const src_data = (float *)((char *) src1->data + in*nb12 + iic*nb11); // [IL] - // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] - for (int i = 0; i < N; i++) { - ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k] - ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k] - float * C = (float *)dst->data + i * m * n; // [m, n] + for (int64_t ik = 0; ik < K; ik++) { + const int64_t iil = iol*s0 + ik*d0 - p0; - gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); + if (!(iil < 0 || iil >= IL)) { + dst_data[iic*K + ik] = GGML_FP32_TO_FP16(src_data[iil]); + } + } + } + } + } + } } } @@ -14718,7 +14547,7 @@ static void ggml_compute_forward_conv_2d_f16_f32( ggml_fp16_t * B = (ggml_fp16_t *)wdata + i * m * k; // [n, k] float * C = (float *)dst->data + i * m * n; // [m * k] - gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); + //gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); } } @@ -14744,28 +14573,7 @@ static void ggml_compute_forward_conv_2d( } } -static void ggml_compute_forward_conv_2d_stage_0( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F16: - { - ggml_compute_forward_conv_2d_stage_0_f32(params, src0, src1, dst); - } break; - case GGML_TYPE_F32: - { - GGML_ASSERT(false); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -static void ggml_compute_forward_conv_2d_stage_1( +static void ggml_compute_forward_im2col( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -14773,7 +14581,7 @@ static void ggml_compute_forward_conv_2d_stage_1( switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_conv_2d_stage_1_f16(params, src0, src1, dst); + ggml_compute_forward_im2col_f16(params, src0, src1, dst); } break; case GGML_TYPE_F32: { @@ -16998,14 +16806,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor); } break; - case GGML_OP_CONV_1D_STAGE_0: - { - ggml_compute_forward_conv_1d_stage_0(params, tensor->src[0], tensor->src[1], tensor); - } break; - case GGML_OP_CONV_1D_STAGE_1: - { - ggml_compute_forward_conv_1d_stage_1(params, tensor->src[0], tensor->src[1], tensor); - } break; case GGML_OP_CONV_TRANSPOSE_1D: { ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor); @@ -17014,13 +16814,9 @@ static void ggml_compute_forward(struct 
ggml_compute_params * params, struct ggm { ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor); } break; - case GGML_OP_CONV_2D_STAGE_0: - { - ggml_compute_forward_conv_2d_stage_0(params, tensor->src[0], tensor->src[1], tensor); - } break; - case GGML_OP_CONV_2D_STAGE_1: + case GGML_OP_IM2COL: { - ggml_compute_forward_conv_2d_stage_1(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_CONV_TRANSPOSE_2D: { @@ -17943,14 +17739,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_CONV_1D_STAGE_0: - { - GGML_ASSERT(false); // TODO: not implemented - } break; - case GGML_OP_CONV_1D_STAGE_1: - { - GGML_ASSERT(false); // TODO: not implemented - } break; case GGML_OP_CONV_TRANSPOSE_1D: { GGML_ASSERT(false); // TODO: not implemented @@ -17959,11 +17747,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_CONV_2D_STAGE_0: - { - GGML_ASSERT(false); // TODO: not implemented - } break; - case GGML_OP_CONV_2D_STAGE_1: + case GGML_OP_IM2COL: { GGML_ASSERT(false); // TODO: not implemented } break; @@ -18838,14 +18622,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { work_size = MAX(work_size, cur); } break; - case GGML_OP_CONV_1D_STAGE_0: - { - n_tasks = n_threads; - } break; - case GGML_OP_CONV_1D_STAGE_1: - { - n_tasks = n_threads; - } break; case GGML_OP_CONV_TRANSPOSE_1D: { n_tasks = n_threads; @@ -18914,11 +18690,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { work_size = MAX(work_size, cur); } break; - case GGML_OP_CONV_2D_STAGE_0: - { - n_tasks = n_threads; - } break; - case GGML_OP_CONV_2D_STAGE_1: + case GGML_OP_IM2COL: { n_tasks = n_threads; } break; From d5c329bb891396082c87f47ef8450b86c3b5ca75 Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Mon, 9 Oct 2023 14:21:11 -0400 Subject: [PATCH 03/26] fix params invalid index --- src/ggml-cuda.cu | 2 ++ src/ggml.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index f8e9fe220..bbf33ab64 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -6503,6 +6503,7 @@ inline void ggml_cuda_op_im2col( const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; + const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1; const int64_t N = src1->ne[3]; const int64_t IC = src1->ne[2]; @@ -6510,6 +6511,7 @@ inline void ggml_cuda_op_im2col( const int64_t IW = src1->ne[0]; const int64_t OC = src0->ne[3]; + // const int64_t IC = ne02; const int64_t KH = src0->ne[1]; const int64_t KW = src0->ne[0]; diff --git a/src/ggml.c b/src/ggml.c index fd9165efe..534382d6f 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -14329,7 +14329,7 @@ static void ggml_compute_forward_im2col_f16( const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; - const bool is_2D = ((const int32_t*)(dst->op_params))[5] == 1; + const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1; const int ith = params->ith; const int nth = params->nth; From 574735c3d2aba284c34899aebc4fb336fcb23e35 Mon Sep 17 00:00:00 2001 From: 
FSSRepo Date: Mon, 9 Oct 2023 15:51:49 -0400 Subject: [PATCH 04/26] add conv1d and conv2d unit tests --- tests/CMakeLists.txt | 19 ++++ tests/test-conv1d.cpp | 242 +++++++++++++++++++++++++++++++++++++++ tests/test-conv2d.cpp | 255 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 516 insertions(+) create mode 100644 tests/test-conv1d.cpp create mode 100644 tests/test-conv2d.cpp diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a1cedf0f8..2646d49a5 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -355,3 +355,22 @@ add_executable(${TEST_TARGET} ${TEST_TARGET}.c) target_link_libraries(${TEST_TARGET} PRIVATE ggml) add_test(NAME ${TEST_TARGET} COMMAND $) set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + +# +# test-conv1d + +set(TEST_TARGET test-conv1d) +add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + + +# +# test-conv2d + +set(TEST_TARGET test-conv2d) +add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") diff --git a/tests/test-conv1d.cpp b/tests/test-conv1d.cpp new file mode 100644 index 000000000..eba41bbe8 --- /dev/null +++ b/tests/test-conv1d.cpp @@ -0,0 +1,242 @@ +#include "ggml.h" +#include "ggml/ggml-alloc.h" +#include "ggml/ggml-backend.h" + +//#define GGML_USE_CUBLAS + +#ifdef GGML_USE_CUBLAS +#include "ggml-cuda.h" +#endif + +#ifdef GGML_USE_METAL +#include "ggml-metal.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +struct test_model { + struct ggml_tensor * a; + struct ggml_tensor * b; + ggml_backend_t backend = NULL; + ggml_backend_buffer_t buffer; + struct ggml_context * ctx; +}; + +void load_model(test_model & model, bool use_gpu = false) { + // create data + int K = 3, IC = 3, OC = 3; + int IL = 5, N = 1; + + // Initialize adata + float* adata = new float[K * IC * OC]; + for (size_t i = 0; i < K * IC * OC; i++) { + adata[i] = 2.0f; + } + + // Convert adata to fp16 format + uint16_t* hadata = new uint16_t[K * IC * OC]; + ggml_fp32_to_fp16_row(adata, hadata, K * IC * OC); + + // Initialize bdata + float* bdata = new float[IL * IC * N]; + for (size_t i = 0; i < IL * IC * N; i++) { + bdata[i] = 3.0f; + } + + size_t buffer_size = 0; + { + buffer_size += K * IC * OC * ggml_type_sizef(GGML_TYPE_F16); // tensor a + buffer_size += IL * IC * N * ggml_type_sizef(GGML_TYPE_F32); // tensor b + buffer_size += 1024; // overhead + } + + printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); + printf("%s: backend buffer size = %0.2f MB\n", __func__, (int) (buffer_size/ 1024.f/ 1024.f)); + + int num_tensors = 2; + struct ggml_init_params params { + /*.mem_size =*/ ggml_tensor_overhead() * num_tensors, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + // initialize the backend +#ifdef GGML_USE_CUBLAS + if (use_gpu) { + fprintf(stderr, "%s: using CUDA backend\n", __func__); + model.backend = ggml_backend_cuda_init(); + if (!model.backend) { + fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__); + } + } +#endif + +#ifdef GGML_USE_METAL + if (use_gpu) { + fprintf(stderr, "%s: using Metal backend\n", __func__); + 
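+ // note: if neither the CUDA nor the Metal backend initializes,
+ // model.backend stays NULL and the test falls back to the CPU
+ // backend below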
ggml_metal_log_set_callback(ggml_log_callback_default, nullptr); + model.backend = ggml_backend_metal_init(); + if (!model.backend) { + fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); + } + } +#endif + + if(!model.backend) { + // fallback to CPU backend + model.backend = ggml_backend_cpu_init(); + } + + model.buffer = ggml_backend_alloc_buffer(model.backend, buffer_size); + + // create context + model.ctx = ggml_init(params); + + // create tensors + model.a = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F16, K, IC, OC); + model.b = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, IL, IC, N); + + // create a allocator + ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer); + + // alloc memory + ggml_allocr_alloc(alloc, model.a); + + // load data to buffer + if(ggml_backend_is_cpu(model.backend)) { + memcpy(model.a->data, hadata, ggml_nbytes(model.a)); + } else { + ggml_backend_tensor_set(model.a, hadata, 0, ggml_nbytes(model.a)); + } + + // alloc memory + ggml_allocr_alloc(alloc, model.b); + + if(ggml_backend_is_cpu(model.backend) +#ifdef GGML_USE_METAL + || ggml_backend_is_metal(model.backend) +#endif + ) { + memcpy(model.b->data, bdata, ggml_nbytes(model.b)); + } else { + ggml_backend_tensor_set(model.b, bdata, 0, ggml_nbytes(model.b)); + } + + ggml_allocr_free(alloc); +} + +struct ggml_cgraph * build_graph(const test_model& model, struct ggml_allocr * allocr) { + static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead(); + static std::vector buf(buf_size); + + struct ggml_init_params params0 = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf.data(), + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() + }; + + // create a temporally context to build the graph + struct ggml_context * ctx0 = ggml_init(params0); + + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + int s0 = 1; + int p0 = 1; + int d0 = 1; + + struct ggml_tensor* result = ggml_conv_1d(ctx0, model.a, model.b, s0, p0, d0); + + ggml_build_forward_expand(gf, result); + + // delete the temporally context used to build the graph + ggml_free(ctx0); + return gf; +} + +struct ggml_tensor* compute(const test_model & model, struct ggml_allocr * allocr) { + // reset the allocator to free all the memory allocated during the previous inference + ggml_allocr_reset(allocr); + + struct ggml_cgraph * gf = build_graph(model, allocr); + + // allocate tensors + ggml_allocr_alloc_graph(allocr, gf); + int n_threads = 1; + + if (ggml_backend_is_cpu(model.backend)) { + ggml_backend_cpu_set_n_threads(model.backend, n_threads); + } + +#ifdef GGML_USE_METAL + if (ggml_backend_is_metal(model.backend)) { + ggml_backend_metal_set_n_cb(model.backend, n_threads); + } +#endif + + ggml_backend_graph_compute(model.backend, gf); + + //ggml_graph_print(gf); + + // in this case, the output tensor is the last one in the graph + return gf->nodes[gf->n_nodes - 1]; +} + +int main(void) +{ + ggml_time_init(); + + test_model model; + load_model(model, true); + + ggml_backend_buffer_t buf_compute; // for compute + struct ggml_allocr * allocr = NULL; + + { + size_t align = ggml_backend_get_alignment(model.backend); + allocr = ggml_allocr_new_measure(align); + + //create the worst case graph for memory usage estimation + struct ggml_cgraph * gf = build_graph(model, allocr); + size_t mem_size = ggml_allocr_alloc_graph(allocr, gf); + ggml_allocr_free(allocr); + + // compute the required memory + buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size); + allocr = 
ggml_allocr_new_from_buffer(buf_compute); + fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0f/1024.0f); + } + + struct ggml_tensor * result = compute(model, allocr); + + float* out_data = new float[ggml_nelements(result)]; + + ggml_backend_tensor_get(result, out_data, 0, ggml_nbytes(result)); + const int n_test = 15; + float expected[n_test] = { + 36.00f, 54.00f, 54.00f, 54.00f, 36.00f, 36.00f, + 54.00f, 54.00f, 54.00f, 36.00f, 36.00f, 54.00f, + 54.00f, 54.00f, 36.00f + }; + bool passed = true; + for(int i = 0; i < n_test; i++) { + if(out_data[i] != expected[i]) { + passed = false; + break; + } + } + + printf("ggml_conv1d (%i): %s", ggml_nelements(result), passed && (ggml_nelements(result) == n_test) ? "PASS" : "FAILED"); + ggml_free(model.ctx); + + ggml_backend_buffer_free(model.buffer); + ggml_backend_buffer_free(buf_compute); + ggml_backend_free(model.backend); + return 0; +} diff --git a/tests/test-conv2d.cpp b/tests/test-conv2d.cpp new file mode 100644 index 000000000..e129f36ef --- /dev/null +++ b/tests/test-conv2d.cpp @@ -0,0 +1,255 @@ +#include "ggml.h" +#include "ggml/ggml-alloc.h" +#include "ggml/ggml-backend.h" + +//#define GGML_USE_CUBLAS + +#ifdef GGML_USE_CUBLAS +#include "ggml-cuda.h" +#endif + +#ifdef GGML_USE_METAL +#include "ggml-metal.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +struct test_model { + struct ggml_tensor * a; + struct ggml_tensor * b; + ggml_backend_t backend = NULL; + ggml_backend_buffer_t buffer; + struct ggml_context * ctx; +}; + +void load_model(test_model & model, bool use_gpu = false) { + // create data + int KW = 3, KH = 3, IC = 4, OC = 4; + int IW = 4, IH = 4, /* IC = 640 */ N = 1; + + // Initialize adata + float* adata = new float[KW * KH * IC * OC]; + for (size_t i = 0; i < KW * KH * IC * OC; i++) { + adata[i] = 2.0f; + } + + // Convert adata to fp16 format + uint16_t* hadata = new uint16_t[KW * KH * IC * OC]; + ggml_fp32_to_fp16_row(adata, hadata, KW * KH * IC * OC); + + // Initialize bdata + float* bdata = new float[IW * IH * IC * N]; + for (size_t i = 0; i < IW * IH * IC * N; i++) { + bdata[i] = 1.0f; + } + + size_t buffer_size = 0; + { + buffer_size += KW * KH * IC * OC * ggml_type_sizef(GGML_TYPE_F16); // tensor a + buffer_size += IW * IH * IC * N * ggml_type_sizef(GGML_TYPE_F32); // tensor b + buffer_size += 1024; // overhead + } + + printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); + printf("%s: backend buffer size = %0.2f MB\n", __func__, (int) (buffer_size/ 1024.f/ 1024.f)); + + int num_tensors = 2; + struct ggml_init_params params { + /*.mem_size =*/ ggml_tensor_overhead() * num_tensors, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + // initialize the backend +#ifdef GGML_USE_CUBLAS + if (use_gpu) { + fprintf(stderr, "%s: using CUDA backend\n", __func__); + model.backend = ggml_backend_cuda_init(); + if (!model.backend) { + fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__); + } + } +#endif + +#ifdef GGML_USE_METAL + if (use_gpu) { + fprintf(stderr, "%s: using Metal backend\n", __func__); + ggml_metal_log_set_callback(ggml_log_callback_default, nullptr); + model.backend = ggml_backend_metal_init(); + if (!model.backend) { + fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); + } + } +#endif + + if(!model.backend) { + // fallback to CPU backend + model.backend = ggml_backend_cpu_init(); + } + + model.buffer = ggml_backend_alloc_buffer(model.backend, buffer_size); + + // create 
context + model.ctx = ggml_init(params); + + // create tensors + model.a = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F16, KW, KH, IC, OC); + model.b = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F32, IW, IH, IC, N); + + // create a allocator + ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer); + + // alloc memory + ggml_allocr_alloc(alloc, model.a); + + // load data to buffer + if(ggml_backend_is_cpu(model.backend)) { + memcpy(model.a->data, hadata, ggml_nbytes(model.a)); + } else { + ggml_backend_tensor_set(model.a, hadata, 0, ggml_nbytes(model.a)); + } + + // alloc memory + ggml_allocr_alloc(alloc, model.b); + + if(ggml_backend_is_cpu(model.backend) +#ifdef GGML_USE_METAL + || ggml_backend_is_metal(model.backend) +#endif + ) { + memcpy(model.b->data, bdata, ggml_nbytes(model.b)); + } else { + ggml_backend_tensor_set(model.b, bdata, 0, ggml_nbytes(model.b)); + } + + ggml_allocr_free(alloc); +} + +struct ggml_cgraph * build_graph(const test_model& model, struct ggml_allocr * allocr) { + static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead(); + static std::vector buf(buf_size); + + struct ggml_init_params params0 = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf.data(), + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() + }; + + // create a temporally context to build the graph + struct ggml_context * ctx0 = ggml_init(params0); + + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + int s0 = 1; + int s1 = 1; + int p0 = 1; + int p1 = 1; + int d0 = 1; + int d1 = 1; + + struct ggml_tensor* result = ggml_conv_2d(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1); + + ggml_build_forward_expand(gf, result); + + // delete the temporally context used to build the graph + ggml_free(ctx0); + return gf; +} + +struct ggml_tensor* compute(const test_model & model, struct ggml_allocr * allocr) { + // reset the allocator to free all the memory allocated during the previous inference + ggml_allocr_reset(allocr); + + struct ggml_cgraph * gf = build_graph(model, allocr); + + // allocate tensors + ggml_allocr_alloc_graph(allocr, gf); + int n_threads = 1; + + if (ggml_backend_is_cpu(model.backend)) { + ggml_backend_cpu_set_n_threads(model.backend, n_threads); + } + +#ifdef GGML_USE_METAL + if (ggml_backend_is_metal(model.backend)) { + ggml_backend_metal_set_n_cb(model.backend, n_threads); + } +#endif + + ggml_backend_graph_compute(model.backend, gf); + + //ggml_graph_print(gf); + + // in this case, the output tensor is the last one in the graph + return gf->nodes[gf->n_nodes - 1]; +} + +int main(void) +{ + ggml_time_init(); + + test_model model; + load_model(model, true); + + ggml_backend_buffer_t buf_compute; // for compute + struct ggml_allocr * allocr = NULL; + + { + size_t align = ggml_backend_get_alignment(model.backend); + allocr = ggml_allocr_new_measure(align); + + //create the worst case graph for memory usage estimation + struct ggml_cgraph * gf = build_graph(model, allocr); + size_t mem_size = ggml_allocr_alloc_graph(allocr, gf); + ggml_allocr_free(allocr); + + // compute the required memory + buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size); + allocr = ggml_allocr_new_from_buffer(buf_compute); + fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0f/1024.0f); + } + + struct ggml_tensor * result = compute(model, allocr); + + float* out_data = new float[ggml_nelements(result)]; + + ggml_backend_tensor_get(result, out_data, 0, ggml_nbytes(result)); + + const int n_test = 64; + 
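+ // with the all-2.0f kernel and all-1.0f input above, each output value is
+ // 2*1 summed over the 3x3 taps that land inside the zero-padded 4x4 input,
+ // times IC = 4 channels:
+ //   corners  (4 taps in range): 2*4*4 = 32
+ //   edges    (6 taps in range): 2*6*4 = 48
+ //   interior (9 taps in range): 2*9*4 = 72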
+ float expected [n_test] = { + 32.00f, 48.00f, 48.00f, 32.00f, 48.00f, 72.00f, + 72.00f, 48.00f, 48.00f, 72.00f, 72.00f, 48.00f, + 32.00f, 48.00f, 48.00f, 32.00f, 32.00f, 48.00f, + 48.00f, 32.00f, 48.00f, 72.00f, 72.00f, 48.00f, + 48.00f, 72.00f, 72.00f, 48.00f, 32.00f, 48.00f, + 48.00f, 32.00f, 32.00f, 48.00f, 48.00f, 32.00f, + 48.00f, 72.00f, 72.00f, 48.00f, 48.00f, 72.00f, + 72.00f, 48.00f, 32.00f, 48.00f, 48.00f, 32.00f, + 32.00f, 48.00f, 48.00f, 32.00f, 48.00f, 72.00f, + 72.00f, 48.00f, 48.00f, 72.00f, 72.00f, 48.00f, + 32.00f, 48.00f, 48.00f, 32.00f }; + + bool passed = true; + for(int i = 0; i < n_test; i++) { + if(out_data[i] != expected[i]) { + passed = false; + break; + } + } + + printf("ggml_conv2d (%i): %s", ggml_nelements(result), passed && (ggml_nelements(result) == n_test) ? "PASS" : "FAILED"); + ggml_free(model.ctx); + + ggml_backend_buffer_free(model.buffer); + ggml_backend_buffer_free(buf_compute); + ggml_backend_free(model.backend); + return 0; +} From 2358d1571a20c75a0fa72a650f513703f993c131 Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Mon, 9 Oct 2023 17:24:18 -0400 Subject: [PATCH 05/26] resolving wrong values and fix mul_mat validation --- src/ggml.c | 45 ++++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/src/ggml.c b/src/ggml.c index 534382d6f..5eaa3be38 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -6422,7 +6422,10 @@ struct ggml_tensor * ggml_mul_mat( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { - GGML_ASSERT(ggml_can_mul_mat(a, b)); + // hack to admit GEMM custom multiplication + bool special_case0 = (a->ne[0] * a->ne[1] * a->ne[2]) == b->ne[0]; + bool special_case1 = (a->ne[0] * a->ne[1]) == b->ne[0]; + GGML_ASSERT(ggml_can_mul_mat(a, b) || special_case0 || special_case1); GGML_ASSERT(!ggml_is_transposed(a)); bool is_node = false; @@ -6431,7 +6434,11 @@ struct ggml_tensor * ggml_mul_mat( is_node = true; } - const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] }; + const int64_t ne[4] = { + special_case0 || special_case1 ? b->ne[1] : a->ne[1], + special_case1 ? a->ne[2] : b->ne[special_case0 ? 2 : 1], + special_case0 ? a->ne[3] : b->ne[2], + special_case1 ? 1 : b->ne[3] }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne); result->op = GGML_OP_MUL_MAT; @@ -7487,8 +7494,8 @@ GGML_API struct ggml_tensor * ggml_conv_1d( int s0, int p0, int d0) { - struct ggml_tensor * result = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); - result = ggml_mul_mat(ctx, a, result); + struct ggml_tensor * result = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OH, OW, IC * KH * KW] + result = ggml_mul_mat(ctx, a, result); // [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] return result; } @@ -7610,18 +7617,17 @@ static struct ggml_tensor * ggml_im2col( is_node = true; } - const int64_t OH = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1); + const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0; const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); const int64_t ne[4] = { - is_2D ? a->ne[2] * a->ne[1] * a->ne[0] : a->ne[1] * a->ne[0], - is_2D ? OW : OH, + is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0], + OW, is_2D ? OH : b->ne[2], is_2D ? b->ne[3] : 1, }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne); - int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 
1 : 0) }; ggml_set_op_params(result, params, sizeof(params)); @@ -7649,10 +7655,8 @@ struct ggml_tensor * ggml_conv_2d( int d1) { struct ggml_tensor * result = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW] - result = ggml_mul_mat(ctx, a, result); - + result = ggml_mul_mat(ctx, a, result); // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] return result; - } // ggml_conv_2d_sk_p0 @@ -11930,28 +11934,23 @@ static void ggml_compute_forward_mul_mat_f16_f32( GGML_ASSERT(nb10 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb0 == sizeof(float)); - const int N = ne13; - const int OH = ne12; - const int OW = ne11; - - const int OC = ne03; - const int IC = ne02; - const int KH = ne01; - const int KW = ne00; + bool case_conv_2d = (ne00 * ne01 * ne02) == ne10; const int ith = params->ith; const int nth = params->nth; - int64_t m = OC; - int64_t n = OH * OW; - int64_t k = IC * KH * KW; + int64_t m = case_conv_2d ? ne03 : ne02; + int64_t n = (case_conv_2d ? ne12 : 1) * ne11; + int64_t k = (case_conv_2d ? ne02 : 1) * ne01 * ne00; // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] + int64_t N = case_conv_2d ? ne13 : ne12; for (int i = 0; i < N; i++) { ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k] ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k] float * C = (float *)dst->data + i * m * n; // [m, n] - // does not seem to make a difference + + // does not seem to make a difference int64_t m0, m1, n0, n1; // patches per thread if (m > n) { From ca56f51070f22424e799d5d1da58da52a73c04ce Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Tue, 10 Oct 2023 13:24:08 -0400 Subject: [PATCH 06/26] improve tests + reduce code duplication --- src/ggml.c | 193 ++++++++++-------------------------------- tests/test-conv1d.cpp | 66 ++++++++++++--- tests/test-conv2d.cpp | 70 ++++++++++++--- 3 files changed, 154 insertions(+), 175 deletions(-) diff --git a/src/ggml.c b/src/ggml.c index 5eaa3be38..51057abed 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -6422,10 +6422,11 @@ struct ggml_tensor * ggml_mul_mat( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { - // hack to admit GEMM custom multiplication - bool special_case0 = (a->ne[0] * a->ne[1] * a->ne[2]) == b->ne[0]; - bool special_case1 = (a->ne[0] * a->ne[1]) == b->ne[0]; - GGML_ASSERT(ggml_can_mul_mat(a, b) || special_case0 || special_case1); + // hack to admit GEMM custom operator + bool mult_mat_conv1d = (a->ne[0] * a->ne[1]) == b->ne[0]; + bool mult_mat_conv2d = (a->ne[0] * a->ne[1] * a->ne[2]) == b->ne[0]; + + GGML_ASSERT(ggml_can_mul_mat(a, b) || mult_mat_conv1d || mult_mat_conv2d); GGML_ASSERT(!ggml_is_transposed(a)); bool is_node = false; @@ -6435,10 +6436,10 @@ struct ggml_tensor * ggml_mul_mat( } const int64_t ne[4] = { - special_case0 || special_case1 ? b->ne[1] : a->ne[1], - special_case1 ? a->ne[2] : b->ne[special_case0 ? 2 : 1], - special_case0 ? a->ne[3] : b->ne[2], - special_case1 ? 1 : b->ne[3] }; + mult_mat_conv2d || mult_mat_conv1d ? b->ne[1] : a->ne[1], + mult_mat_conv1d ? a->ne[2] : b->ne[mult_mat_conv2d ? 2 : 1], + mult_mat_conv2d ? a->ne[3] : b->ne[2], + mult_mat_conv1d ? 
1 : b->ne[3] }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne); result->op = GGML_OP_MUL_MAT; @@ -11908,7 +11909,6 @@ static void ggml_compute_forward_mul_mat_f32_f32( } } - // GEMM // TODO: compare gemm op with the current implementation of mul_mat @@ -11942,9 +11942,9 @@ static void ggml_compute_forward_mul_mat_f16_f32( int64_t m = case_conv_2d ? ne03 : ne02; int64_t n = (case_conv_2d ? ne12 : 1) * ne11; int64_t k = (case_conv_2d ? ne02 : 1) * ne01 * ne00; + int64_t N = case_conv_2d ? ne13 : ne12; // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] - int64_t N = case_conv_2d ? ne13 : ne12; for (int i = 0; i < N; i++) { ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k] ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k] @@ -14021,62 +14021,6 @@ static void ggml_compute_forward_conv_1d_f32( } } -// TODO: GEMM conv1d differ GEMM conv2d - -// gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] -// src0: [OC, IC, K] -// src1: [N, OL, IC * K] -// result: [N, OC, OL] -static void ggml_compute_forward_conv_1d_stage_1_f16( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F16); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - if (params->type == GGML_TASK_INIT) { - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - GGML_TENSOR_BINARY_OP_LOCALS; - - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb10 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb0 == sizeof(float)); - - const int N = ne12; - const int OL = ne11; - - const int OC = ne02; - const int IC = ne01; - const int K = ne00; - - const int ith = params->ith; - const int nth = params->nth; - - int64_t m = OC; - int64_t n = OL; - int64_t k = IC * K; - - // [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] - for (int i = 0; i < N; i++) { - ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k] - ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k] - float * C = (float *)dst->data + i * m * n; // [m, n] - - //gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); - } -} - static void ggml_compute_forward_conv_1d( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -14303,8 +14247,6 @@ static void ggml_compute_forward_conv_transpose_1d( } } -// ggml_compute_forward_conv_2d - // src0: kernel [OC, IC, KH, KW] // src1: image [N, IC, IH, IW] // dst: result [N, OH, OW, IC*KH*KW] @@ -14332,98 +14274,53 @@ static void ggml_compute_forward_im2col_f16( const int ith = params->ith; const int nth = params->nth; + const int64_t N = is_2D ? ne13 : ne12; + const int64_t IC = is_2D ? ne12 : ne11; + const int64_t IH = is_2D ? 
ne11 : 1; + const int64_t IW = ne10; - if(is_2D) { - const int64_t N = ne13; - const int64_t IC = ne12; - const int64_t IH = ne11; - const int64_t IW = ne10; - - // const int64_t OC = ne03; - // const int64_t IC = ne02; - const int64_t KH = ne01; - const int64_t KW = ne00; - - const int64_t OH = ne2; - const int64_t OW = ne1; - - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb10 == sizeof(float)); - - if (params->type == GGML_TASK_INIT) { - memset(dst->data, 0, ggml_nbytes(dst)); - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] - { - ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data; - - for (int64_t in = 0; in < N; in++) { - for (int64_t ioh = 0; ioh < OH; ioh++) { - for (int64_t iow = 0; iow < OW; iow++) { - for (int64_t iic = ith; iic < IC; iic+=nth) { - - // micro kernel - ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] - const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW] + const int64_t KH = is_2D ? ne01 : 1; + const int64_t KW = ne00; - for (int64_t ikh = 0; ikh < KH; ikh++) { - for (int64_t ikw = 0; ikw < KW; ikw++) { - const int64_t iiw = iow*s0 + ikw*d0 - p0; - const int64_t iih = ioh*s1 + ikh*d1 - p1; - - if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { - dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); - } - } - } - } - } - } - } - } - } else { - const int64_t N = ne12; - const int64_t IC = ne11; - const int64_t IL = ne10; + const int64_t OH = is_2D ? ne2 : 1; + const int64_t OW = ne1; - const int64_t K = ne00; + int ofs0 = is_2D ? nb13 : nb12; + int ofs1 = is_2D ? nb12 : nb11; - const int64_t OL = ne1; - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb10 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); - if (params->type == GGML_TASK_INIT) { - memset(dst->data, 0, ggml_nbytes(dst)); - return; - } + if (params->type == GGML_TASK_INIT) { + memset(dst->data, 0, ggml_nbytes(dst)); + return; + } - if (params->type == GGML_TASK_FINALIZE) { - return; - } + if (params->type == GGML_TASK_FINALIZE) { + return; + } - // im2col: [N, IC, IL] => [N, OL, IC*K] - { - ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data; + // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data; - for (int64_t in = 0; in < N; in++) { - for (int64_t iol = 0; iol < OL; iol++) { - for (int64_t iic = ith; iic < IC; iic+=nth) { + for (int64_t in = 0; in < N; in++) { + for (int64_t ioh = 0; ioh < OH; ioh++) { // 1 + for (int64_t iow = 0; iow < OW; iow++) { + for (int64_t iic = ith; iic < IC; iic += nth) { // micro kernel - ggml_fp16_t * dst_data = wdata + (in*OL + iol)*(IC*K); // [IC, K] - const float * const src_data = (float *)((char *) src1->data + in*nb12 + iic*nb11); // [IL] + ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] + const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW] - for (int64_t ik = 0; ik < K; ik++) { - const int64_t iil = iol*s0 + ik*d0 - p0; + for (int64_t ikh = 0; ikh < KH; ikh++) { // 1 + for (int64_t ikw = 0; ikw < KW; ikw++) { + const int64_t iiw = iow*s0 + ikw*d0 - p0; + const int64_t iih = ioh*s1 + ikh*d1 - p1; - if (!(iil < 0 || iil >= IL)) { - dst_data[iic*K + ik] = GGML_FP32_TO_FP16(src_data[iil]); + if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { + 
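/* Editor's note (not part of the patch): taps that fall outside the input
   are simply skipped here. They still read as zeros in the subsequent GEMM
   because dst was cleared with memset() during GGML_TASK_INIT above, which
   is how the zero padding (p0/p1) is realized without per-element writes. */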
dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); + } } } } diff --git a/tests/test-conv1d.cpp b/tests/test-conv1d.cpp index eba41bbe8..d692c36eb 100644 --- a/tests/test-conv1d.cpp +++ b/tests/test-conv1d.cpp @@ -151,16 +151,19 @@ struct ggml_cgraph * build_graph(const test_model& model, struct ggml_allocr * a int p0 = 1; int d0 = 1; - struct ggml_tensor* result = ggml_conv_1d(ctx0, model.a, model.b, s0, p0, d0); - - ggml_build_forward_expand(gf, result); + // split conv1d in fundamental methods for test unit + struct ggml_tensor* im2col_res = ggml_im2col(ctx0, model.a, model.b, s0, 0, p0, 0, d0, 0, false); + ggml_set_name(im2col_res, "im2col_res"); + struct ggml_tensor* conv1d_res = ggml_mul_mat(ctx0, model.a, im2col_res); + ggml_set_name(conv1d_res, "conv1d_res"); + ggml_build_forward_expand(gf, conv1d_res); // delete the temporally context used to build the graph ggml_free(ctx0); return gf; } -struct ggml_tensor* compute(const test_model & model, struct ggml_allocr * allocr) { +struct ggml_cgraph* compute_graph(const test_model & model, struct ggml_allocr * allocr) { // reset the allocator to free all the memory allocated during the previous inference ggml_allocr_reset(allocr); @@ -185,7 +188,7 @@ struct ggml_tensor* compute(const test_model & model, struct ggml_allocr * alloc //ggml_graph_print(gf); // in this case, the output tensor is the last one in the graph - return gf->nodes[gf->n_nodes - 1]; + return gf; } int main(void) @@ -213,26 +216,63 @@ int main(void) fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0f/1024.0f); } - struct ggml_tensor * result = compute(model, allocr); + struct ggml_cgraph * gf_res = compute_graph(model, allocr); + + struct ggml_tensor * im2col_res = NULL; + struct ggml_tensor * conv1d_res = NULL; - float* out_data = new float[ggml_nelements(result)]; + for(int i = 0; i < gf_res->n_nodes; i++) { + if(strcmp(ggml_get_name(gf_res->nodes[i]), "im2col_res") == 0) { + im2col_res = gf_res->nodes[i]; + } else if(strcmp(ggml_get_name(gf_res->nodes[i]), "conv1d_res") == 0) { + conv1d_res = gf_res->nodes[i]; + } + } - ggml_backend_tensor_get(result, out_data, 0, ggml_nbytes(result)); - const int n_test = 15; - float expected[n_test] = { + ggml_fp16_t* im2col_data = new ggml_fp16_t[ggml_nelements(im2col_res)]; + float* conv2d_data = new float[ggml_nelements(conv1d_res)]; + + ggml_backend_tensor_get(im2col_res, im2col_data, 0, ggml_nbytes(im2col_res)); + ggml_backend_tensor_get(conv1d_res, conv2d_data, 0, ggml_nbytes(conv1d_res)); + + const int n_conv1d_test = 15; + const int n_im2col_test = 45; + + float expected_conv1d[n_conv1d_test] = { 36.00f, 54.00f, 54.00f, 54.00f, 36.00f, 36.00f, 54.00f, 54.00f, 54.00f, 36.00f, 36.00f, 54.00f, 54.00f, 54.00f, 36.00f }; + // first im2col test + + ggml_fp16_t expected_im2col[n_conv1d_test] = { + 0, 16896, 16896, 0, 16896, 16896, 0, + 16896, 16896, 16896, 16896, 16896, + 16896, 16896, 16896 + }; + + printf("\nPerforming test:\n"); + bool passed = true; - for(int i = 0; i < n_test; i++) { - if(out_data[i] != expected[i]) { + for(int i = 0; i < n_conv1d_test; i++) { + if( + im2col_data[i] != expected_im2col[i]) { + passed = false; + break; + } + } + + printf("ggml_im2col (%i): %s\n", ggml_nelements(im2col_res), passed && (ggml_nelements(im2col_res) == n_im2col_test) ? 
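/* Editor's note (not part of the patch): the expected_im2col values above
   are raw IEEE-754 half bit patterns, since ggml_fp16_t is a plain uint16_t
   on the host side: 16896 == 0x4200 == fp16(3.0f), matching the 3.0f input
   fill, and the interleaved zeros are the columns contributed by the p0 = 1
   padding. */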
"PASS" : "FAILED"); + + passed = true; + for(int i = 0; i < n_conv1d_test; i++) { + if(conv2d_data[i] != expected_conv1d[i]) { passed = false; break; } } - printf("ggml_conv1d (%i): %s", ggml_nelements(result), passed && (ggml_nelements(result) == n_test) ? "PASS" : "FAILED"); + printf("ggml_conv1d (%i): %s\n", ggml_nelements(conv1d_res), passed && (ggml_nelements(conv1d_res) == n_conv1d_test) ? "PASS" : "FAILED"); ggml_free(model.ctx); ggml_backend_buffer_free(model.buffer); diff --git a/tests/test-conv2d.cpp b/tests/test-conv2d.cpp index e129f36ef..6382a0397 100644 --- a/tests/test-conv2d.cpp +++ b/tests/test-conv2d.cpp @@ -154,16 +154,19 @@ struct ggml_cgraph * build_graph(const test_model& model, struct ggml_allocr * a int d0 = 1; int d1 = 1; - struct ggml_tensor* result = ggml_conv_2d(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1); - - ggml_build_forward_expand(gf, result); + // split conv2d in fundamental methods for test unit + struct ggml_tensor* im2col_res = ggml_im2col(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1, true); + ggml_set_name(im2col_res, "im2col_res"); + struct ggml_tensor* conv2d_res = ggml_mul_mat(ctx0, model.a, im2col_res); + ggml_set_name(conv2d_res, "conv2d_res"); + ggml_build_forward_expand(gf, conv2d_res); // delete the temporally context used to build the graph ggml_free(ctx0); return gf; } -struct ggml_tensor* compute(const test_model & model, struct ggml_allocr * allocr) { +struct ggml_cgraph * compute_graph(const test_model & model, struct ggml_allocr * allocr) { // reset the allocator to free all the memory allocated during the previous inference ggml_allocr_reset(allocr); @@ -188,7 +191,7 @@ struct ggml_tensor* compute(const test_model & model, struct ggml_allocr * alloc //ggml_graph_print(gf); // in this case, the output tensor is the last one in the graph - return gf->nodes[gf->n_nodes - 1]; + return gf; } int main(void) @@ -196,7 +199,7 @@ int main(void) ggml_time_init(); test_model model; - load_model(model, true); + load_model(model, false); ggml_backend_buffer_t buf_compute; // for compute struct ggml_allocr * allocr = NULL; @@ -216,15 +219,29 @@ int main(void) fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0f/1024.0f); } - struct ggml_tensor * result = compute(model, allocr); + struct ggml_cgraph * gf_res = compute_graph(model, allocr); + + struct ggml_tensor * im2col_res = NULL; + struct ggml_tensor * conv2d_res = NULL; + + for(int i = 0; i < gf_res->n_nodes; i++) { + if(strcmp(ggml_get_name(gf_res->nodes[i]), "im2col_res") == 0) { + im2col_res = gf_res->nodes[i]; + } else if(strcmp(ggml_get_name(gf_res->nodes[i]), "conv2d_res") == 0) { + conv2d_res = gf_res->nodes[i]; + } + } - float* out_data = new float[ggml_nelements(result)]; + ggml_fp16_t* im2col_data = new ggml_fp16_t[ggml_nelements(im2col_res)]; + float* conv2d_data = new float[ggml_nelements(conv2d_res)]; - ggml_backend_tensor_get(result, out_data, 0, ggml_nbytes(result)); + ggml_backend_tensor_get(im2col_res, im2col_data, 0, ggml_nbytes(im2col_res)); + ggml_backend_tensor_get(conv2d_res, conv2d_data, 0, ggml_nbytes(conv2d_res)); - const int n_test = 64; + const int n_conv2d_test = 64; + const int n_im2col_test = 576; - float expected [n_test] = { + float expected_conv2d [n_conv2d_test] = { 32.00f, 48.00f, 48.00f, 32.00f, 48.00f, 72.00f, 72.00f, 48.00f, 48.00f, 72.00f, 72.00f, 48.00f, 32.00f, 48.00f, 48.00f, 32.00f, 32.00f, 48.00f, @@ -237,15 +254,40 @@ int main(void) 72.00f, 48.00f, 48.00f, 72.00f, 72.00f, 48.00f, 32.00f, 48.00f, 48.00f, 32.00f }; + 
ggml_fp16_t expected_im2col[n_conv2d_test] = { + 0, 0, 0, 0, 15360, 15360, 0, 15360, + 15360, 0, 0, 0, 0, 15360, 15360, 0, + 15360, 15360, 0, 0, 0, 0, 15360, + 15360, 0, 15360, 15360, 0, 0, 0, 0, + 15360, 15360, 0, 15360, 15360, 0, 0, 0, + 15360, 15360, 15360, 15360, 15360, 15360, + 0, 0, 0, 15360, 15360, 15360, 15360, 15360, + 15360, 0, 0, 0, 15360, 15360, 15360, 15360, + 15360, 15360, 0 + }; + + printf("\nPerforming test:\n"); + bool passed = true; - for(int i = 0; i < n_test; i++) { - if(out_data[i] != expected[i]) { + for(int i = 0; i < n_conv2d_test; i++) { + if( + im2col_data[i] != expected_im2col[i]) { + passed = false; + break; + } + } + + printf("ggml_im2col (%i): %s\n", ggml_nelements(im2col_res), passed && (ggml_nelements(im2col_res) == n_im2col_test) ? "PASS" : "FAILED"); + + passed = true; + for(int i = 0; i < n_conv2d_test; i++) { + if(conv2d_data[i] != expected_conv2d[i]) { passed = false; break; } } - printf("ggml_conv2d (%i): %s", ggml_nelements(result), passed && (ggml_nelements(result) == n_test) ? "PASS" : "FAILED"); + printf("ggml_conv2d (%i): %s\n", ggml_nelements(conv2d_res), passed && (ggml_nelements(conv2d_res) == n_conv2d_test) ? "PASS" : "FAILED"); ggml_free(model.ctx); ggml_backend_buffer_free(model.buffer); From 15ceadb4d2e5d878d406c24502fd4c23f0e25442 Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Tue, 10 Oct 2023 15:04:55 -0400 Subject: [PATCH 07/26] add cuda kernels --- src/ggml-cuda.cu | 76 +++++++++++++++++++------------------------ src/ggml.c | 3 +- tests/test-conv1d.cpp | 2 +- tests/test-conv2d.cpp | 4 +-- 4 files changed, 38 insertions(+), 47 deletions(-) diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index bbf33ab64..92caab2b0 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -4597,13 +4597,13 @@ static __global__ void gemm_f16_f32(const half *x,const half *y, float *dst, i } } -static __global__ void im2col_f32_f16(const float* x, half* dst, int nb12, int nb13, int IW,int IH,int CHW,int s0,int s1,int p0,int p1,int d0,int d1) { +static __global__ void im2col_f32_f16(const float* x, half* dst, int ofs0, int ofs1, int IW,int IH,int CHW,int s0,int s1,int p0,int p1,int d0,int d1) { int iiw = blockIdx.z * s0 + threadIdx.z * d0 - p0; int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1; __syncthreads(); if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { int offset_dst = (threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW; - int offset_src = threadIdx.x * nb13 + blockIdx.x * nb12; + int offset_src = threadIdx.x * ofs0 + blockIdx.x * ofs1; dst[offset_dst + (blockIdx.x * (blockDim.y * blockDim.z) + threadIdx.y * blockDim.z + threadIdx.z)] = __float2half(x[offset_src + iih * IW + iiw]); } } @@ -5557,32 +5557,22 @@ static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, c } static void im2col_f32_f16_cuda(const float* x, half* dst, - int OC, int OH, - int IW, int IH, + int OH, int IW, int IH, int OW, int IC, - int KH, int KW, int N, + int KH, int KW, int N, int ofs0, int ofs1, int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) { - - int nb11 = IW; - int nb12 = nb11 * IH; - int nb13 = nb12 * IC; - - int CHW = IC * KH * KW; dim3 block_nums(IC, OH, OW); dim3 block_dims(N, KH, KW); - im2col_f32_f16<<>>(x, dst, nb12, nb13, IW, IH, CHW, s0, s1, p0, p1, d0, d1); + im2col_f32_f16<<>>(x, dst, ofs0, ofs1, IW, IH, (IC * KH * KW), s0, s1, p0, p1, d0, d1); } // GEMM -static void gemm_f16_f32_cuda(const half* x,const half* y, float* dst, int OC, int OH, int OW,int IC, int KH, int KW, int N, cudaStream_t 
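/* Editor's note (not part of the patch): in the launchers here, im2col maps
   gridDim to (IC, OH, OW) and blockDim to (N, KH, KW), one thread per copied
   tap, which implicitly requires N * KH * KW to stay within the CUDA limit
   of 1024 threads per block. The batched GEMM below issues one launch per
   batch element with 16x16 thread tiles, each thread accumulating a single
   output element over the shared k dimension. */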
stream) { - int m = OC; - int n = OH * OW; - int k = IC * KH * KW; - for(int i = 0; i < N; i++) { - dim3 block_dims(16, 16); - dim3 block_nums((n + block_dims.x - 1) / block_dims.x, (m + block_dims.y - 1) / block_dims.y); - gemm_f16_f32<<>>(x, y + i * m * k, dst + i * m * n, m, k, n); - } +static void gemm_f16_f32_cuda(const half* x,const half* y, float* dst, int m, int n, int k, int N, cudaStream_t stream) { + for(int i = 0; i < N; i++) { + dim3 block_dims(16, 16); + dim3 block_nums((n + block_dims.x - 1) / block_dims.x, (m + block_dims.y - 1) / block_dims.y); + gemm_f16_f32<<>>(x, y + i * m * k, dst + i * m * n, m, k, n); + } } // buffer pool for cuda @@ -6493,9 +6483,9 @@ inline void ggml_cuda_op_im2col( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F16); const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; @@ -6503,30 +6493,32 @@ inline void ggml_cuda_op_im2col( const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; + const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1; - const int64_t N = src1->ne[3]; - const int64_t IC = src1->ne[2]; - const int64_t IH = src1->ne[1]; + const int64_t N = src1->ne[is_2D ? 3 : 2]; + const int64_t IC = src1->ne[is_2D ? 2 : 1]; + const int64_t IH = is_2D ? src1->ne[1] : 1; const int64_t IW = src1->ne[0]; - const int64_t OC = src0->ne[3]; - - // const int64_t IC = ne02; - const int64_t KH = src0->ne[1]; + const int64_t KH = is_2D ? src0->ne[1] : 1; const int64_t KW = src0->ne[0]; - const int64_t OH = dst->ne[2]; + const int64_t OH = is_2D ? dst->ne[2] : 1; const int64_t OW = dst->ne[1]; + im2col_f32_f16_cuda(src1_dd, (half*) dst_dd, - OC, OH, IW, IH, OW, IC, KH, KW, N, s0, s1, p0, p1, d0, d1, main_stream); + OH, IW, IH, OW, IC, KH, KW, N, + src1->nb[is_2D ? 3 : 2] / 4, // nb is byte offset, src is type float32 + src1->nb[is_2D ? 2 : 1] / 4, // nb is byte offset, src is type float32 + s0, s1, p0, p1, d0, d1, main_stream); (void) src0; (void) src0_dd; } -inline void ggml_cuda_op_conv2d_stage_1( +inline void ggml_cuda_op_mul_mat_gemm_f16( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { @@ -6534,18 +6526,16 @@ inline void ggml_cuda_op_conv2d_stage_1( GGML_ASSERT(src1->type == GGML_TYPE_F16); GGML_ASSERT( dst->type == GGML_TYPE_F32); - const int N = src1->ne[3]; - const int OH = src1->ne[2]; - const int OW = src1->ne[1]; + bool case_conv_2d = (src0->ne[0] * src0->ne[1] * src0->ne[2]) == src1->ne[0]; - const int OC = src0->ne[3]; - const int IC = src0->ne[2]; - const int KH = src0->ne[1]; - const int KW = src0->ne[0]; + int m = src0->ne[case_conv_2d ? 3 : 2]; + int n = (case_conv_2d ? src1->ne[2] : 1) * src1->ne[1]; + int k = (case_conv_2d ? src0->ne[2] : 1) * src0->ne[1] * src0->ne[0]; + int N = src1->ne[case_conv_2d ? 
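/* Editor's note (not part of the patch): case_conv_2d mirrors the heuristic
   used in ggml.c: if the flattened kernel src0->ne[0]*ne[1]*ne[2] matches
   src1->ne[0], the operands came from 2-D im2col; otherwise the 1-D path
   applies and ne[2] does not participate in k. */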
3 : 2]; gemm_f16_f32_cuda( (const half*)src0_dd, (const half*)src1_dd, - dst_dd, OC, OH, OW, IC, KH, KW, N, main_stream); + dst_dd, m, n, k, N, main_stream); } inline void ggml_cuda_op_diag_mask_inf( @@ -7135,7 +7125,9 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 } } - if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { + if(src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul_mat_gemm_f16); + } else if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { ggml_cuda_mul_mat_vec_p021(src0, src1, dst); } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) { ggml_cuda_mul_mat_vec_nc(src0, src1, dst); diff --git a/src/ggml.c b/src/ggml.c index 51057abed..1be9f67e8 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -11944,7 +11944,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( int64_t k = (case_conv_2d ? ne02 : 1) * ne01 * ne00; int64_t N = case_conv_2d ? ne13 : ne12; - // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] + // GEMM for (int i = 0; i < N; i++) { ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k] ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k] @@ -11987,7 +11987,6 @@ static void ggml_compute_forward_mul_mat_f16_f32( for (int j = n0; j < n1; j+=blck_n) { for (int i = m0; i < m1; i+=blck_m) { - // printf("i j k => %d %d %d\n", i, j, K); for (int ii = i; ii < i + blck_m && ii < m1; ii++) { for (int jj = j; jj < j + blck_n && jj < n1; jj++) { ggml_vec_dot_f16(k, diff --git a/tests/test-conv1d.cpp b/tests/test-conv1d.cpp index d692c36eb..63428c5a7 100644 --- a/tests/test-conv1d.cpp +++ b/tests/test-conv1d.cpp @@ -2,7 +2,7 @@ #include "ggml/ggml-alloc.h" #include "ggml/ggml-backend.h" -//#define GGML_USE_CUBLAS +#define GGML_USE_CUBLAS #ifdef GGML_USE_CUBLAS #include "ggml-cuda.h" diff --git a/tests/test-conv2d.cpp b/tests/test-conv2d.cpp index 6382a0397..568962f4c 100644 --- a/tests/test-conv2d.cpp +++ b/tests/test-conv2d.cpp @@ -2,7 +2,7 @@ #include "ggml/ggml-alloc.h" #include "ggml/ggml-backend.h" -//#define GGML_USE_CUBLAS +#define GGML_USE_CUBLAS #ifdef GGML_USE_CUBLAS #include "ggml-cuda.h" @@ -199,7 +199,7 @@ int main(void) ggml_time_init(); test_model model; - load_model(model, false); + load_model(model, true); ggml_backend_buffer_t buf_compute; // for compute struct ggml_allocr * allocr = NULL; From 872cc04b94fdc990798337898a2df317c75b118a Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Tue, 10 Oct 2023 16:09:52 -0400 Subject: [PATCH 08/26] more data test --- tests/test-conv1d.cpp | 44 ++++++++---- tests/test-conv2d.cpp | 158 ++++++++++++++++++++++++++++++++++-------- 2 files changed, 158 insertions(+), 44 deletions(-) diff --git a/tests/test-conv1d.cpp b/tests/test-conv1d.cpp index 63428c5a7..85fddbd3b 100644 --- a/tests/test-conv1d.cpp +++ b/tests/test-conv1d.cpp @@ -2,7 +2,7 @@ #include "ggml/ggml-alloc.h" #include "ggml/ggml-backend.h" -#define GGML_USE_CUBLAS +//#define GGML_USE_CUBLAS #ifdef GGML_USE_CUBLAS #include "ggml-cuda.h" @@ -31,13 +31,13 @@ struct test_model { void load_model(test_model & model, bool use_gpu = false) { // create data - int K = 3, IC = 3, OC = 3; - int IL = 5, N = 1; + int K = 3, IC = 10, OC = 10; + int IL = 8, N = 1; // Initialize adata float* adata = new float[K * IC * OC]; for (size_t i = 0; 
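/* Editor's note (not part of the patch): the fill values in this test, 4.5f
   for the kernel and 2.5f for the input, are exactly representable in fp16,
   so the exact-equality comparisons against the expected outputs below
   remain safe despite the f32 -> f16 round trip. */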
i < K * IC * OC; i++) { - adata[i] = 2.0f; + adata[i] = 4.5f; } // Convert adata to fp16 format @@ -47,7 +47,7 @@ void load_model(test_model & model, bool use_gpu = false) { // Initialize bdata float* bdata = new float[IL * IC * N]; for (size_t i = 0; i < IL * IC * N; i++) { - bdata[i] = 3.0f; + bdata[i] = 2.5f; } size_t buffer_size = 0; @@ -235,20 +235,34 @@ int main(void) ggml_backend_tensor_get(im2col_res, im2col_data, 0, ggml_nbytes(im2col_res)); ggml_backend_tensor_get(conv1d_res, conv2d_data, 0, ggml_nbytes(conv1d_res)); - const int n_conv1d_test = 15; - const int n_im2col_test = 45; + const int n_conv1d_test = 80; + const int n_im2col_test = 240; float expected_conv1d[n_conv1d_test] = { - 36.00f, 54.00f, 54.00f, 54.00f, 36.00f, 36.00f, - 54.00f, 54.00f, 54.00f, 36.00f, 36.00f, 54.00f, - 54.00f, 54.00f, 36.00f + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f }; // first im2col test ggml_fp16_t expected_im2col[n_conv1d_test] = { - 0, 16896, 16896, 0, 16896, 16896, 0, - 16896, 16896, 16896, 16896, 16896, - 16896, 16896, 16896 + 0, 16640, 16640, 0, 16640, 16640, 0, 16640, + 16640, 0, 16640, 16640, 0, 16640, 16640, 0, + 16640, 16640, 0, 16640, 16640, 0, 16640, 16640, + 0, 16640, 16640, 0, 16640, 16640, 16640, 16640, + 16640, 16640, 16640, 16640, 16640, 16640, 16640, 16640, + 16640, 16640, 16640, 16640, 16640, 16640, 16640, 16640, + 16640, 16640, 16640, 16640, 16640, 16640, 16640, 16640, + 16640, 16640, 16640, 16640, 16640, 16640, 16640, 16640, + 16640, 16640, 16640, 16640, 16640, 16640, 16640, 16640, + 16640, 16640, 16640, 16640, 16640, 16640, 16640, 16640 }; printf("\nPerforming test:\n"); @@ -262,7 +276,7 @@ int main(void) } } - printf("ggml_im2col (%i): %s\n", ggml_nelements(im2col_res), passed && (ggml_nelements(im2col_res) == n_im2col_test) ? "PASS" : "FAILED"); + printf("ggml_im2col (%i): %s\n", ggml_nelements(im2col_res), passed && (ggml_nelements(im2col_res) == n_im2col_test) ? "\033[32mPASSED\033[0m" : "\033[31mFAILED\033[0m"); passed = true; for(int i = 0; i < n_conv1d_test; i++) { @@ -272,7 +286,7 @@ int main(void) } } - printf("ggml_conv1d (%i): %s\n", ggml_nelements(conv1d_res), passed && (ggml_nelements(conv1d_res) == n_conv1d_test) ? "PASS" : "FAILED"); + printf("ggml_conv1d (%i): %s\n", ggml_nelements(conv1d_res), passed && (ggml_nelements(conv1d_res) == n_conv1d_test) ? 
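/* Editor's note (not part of the patch): sanity-checking the constants
   above: fp16(2.5f) == 0x4100 == 16640, and with K = 3 and IC = 10 a fully
   overlapped output tap is 4.5f * 2.5f * 3 * 10 = 337.5f, while the zero
   padded edges see only two kernel taps, 4.5f * 2.5f * 2 * 10 = 225.0f,
   matching expected_conv1d. */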
"\033[32mPASSED\033[0m" : "\033[31mFAILED\033[0m"); ggml_free(model.ctx); ggml_backend_buffer_free(model.buffer); diff --git a/tests/test-conv2d.cpp b/tests/test-conv2d.cpp index 568962f4c..f8ae975d5 100644 --- a/tests/test-conv2d.cpp +++ b/tests/test-conv2d.cpp @@ -2,7 +2,7 @@ #include "ggml/ggml-alloc.h" #include "ggml/ggml-backend.h" -#define GGML_USE_CUBLAS +//#define GGML_USE_CUBLAS #ifdef GGML_USE_CUBLAS #include "ggml-cuda.h" @@ -31,13 +31,13 @@ struct test_model { void load_model(test_model & model, bool use_gpu = false) { // create data - int KW = 3, KH = 3, IC = 4, OC = 4; - int IW = 4, IH = 4, /* IC = 640 */ N = 1; + int KW = 3, KH = 3, IC = 10, OC = 10; + int IW = 8, IH = 6, N = 1; // Initialize adata float* adata = new float[KW * KH * IC * OC]; for (size_t i = 0; i < KW * KH * IC * OC; i++) { - adata[i] = 2.0f; + adata[i] = 2.5f; } // Convert adata to fp16 format @@ -47,7 +47,7 @@ void load_model(test_model & model, bool use_gpu = false) { // Initialize bdata float* bdata = new float[IW * IH * IC * N]; for (size_t i = 0; i < IW * IH * IC * N; i++) { - bdata[i] = 1.0f; + bdata[i] = 1.5f; } size_t buffer_size = 0; @@ -238,32 +238,132 @@ int main(void) ggml_backend_tensor_get(im2col_res, im2col_data, 0, ggml_nbytes(im2col_res)); ggml_backend_tensor_get(conv2d_res, conv2d_data, 0, ggml_nbytes(conv2d_res)); - const int n_conv2d_test = 64; - const int n_im2col_test = 576; + const int n_conv2d_test = 480; + const int n_im2col_test = 4320; float expected_conv2d [n_conv2d_test] = { - 32.00f, 48.00f, 48.00f, 32.00f, 48.00f, 72.00f, - 72.00f, 48.00f, 48.00f, 72.00f, 72.00f, 48.00f, - 32.00f, 48.00f, 48.00f, 32.00f, 32.00f, 48.00f, - 48.00f, 32.00f, 48.00f, 72.00f, 72.00f, 48.00f, - 48.00f, 72.00f, 72.00f, 48.00f, 32.00f, 48.00f, - 48.00f, 32.00f, 32.00f, 48.00f, 48.00f, 32.00f, - 48.00f, 72.00f, 72.00f, 48.00f, 48.00f, 72.00f, - 72.00f, 48.00f, 32.00f, 48.00f, 48.00f, 32.00f, - 32.00f, 48.00f, 48.00f, 32.00f, 48.00f, 72.00f, - 72.00f, 48.00f, 48.00f, 72.00f, 72.00f, 48.00f, - 32.00f, 48.00f, 48.00f, 32.00f }; + 150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f, + 150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f, + 150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f, + 150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 
337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f, + 150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f, + 150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f, + 150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f, + 150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f, + 150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f, + 150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, + 150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f }; ggml_fp16_t expected_im2col[n_conv2d_test] = { - 0, 0, 0, 0, 15360, 15360, 0, 15360, - 15360, 0, 0, 0, 0, 15360, 15360, 0, - 15360, 15360, 0, 0, 0, 0, 15360, - 15360, 0, 15360, 15360, 0, 0, 0, 0, - 15360, 15360, 0, 15360, 15360, 0, 0, 0, - 15360, 15360, 15360, 15360, 15360, 15360, - 0, 0, 0, 15360, 15360, 15360, 15360, 15360, - 15360, 0, 0, 0, 15360, 15360, 15360, 15360, - 15360, 15360, 0 + 0, 0, 0, 0, 15872, 15872, 0, 15872, + 15872, 0, 0, 0, 0, 15872, 15872, 0, + 15872, 15872, 0, 0, 0, 0, 15872, 15872, + 0, 15872, 15872, 0, 0, 0, 0, 15872, + 15872, 0, 15872, 15872, 0, 0, 0, 0, + 15872, 15872, 0, 15872, 
15872, 0, 0, 0, + 0, 15872, 15872, 0, 15872, 15872, 0, 0, + 0, 0, 15872, 15872, 0, 15872, 15872, 0, + 0, 0, 0, 15872, 15872, 0, 15872, 15872, + 0, 0, 0, 0, 15872, 15872, 0, 15872, + 15872, 0, 0, 0, 0, 15872, 15872, 0, + 15872, 15872, 0, 0, 0, 15872, 15872, 15872, + 15872, 15872, 15872, 0, 0, 0, 15872, 15872, + 15872, 15872, 15872, 15872, 0, 0, 0, 15872, + 15872, 15872, 15872, 15872, 15872, 0, 0, 0, + 15872, 15872, 15872, 15872, 15872, 15872, 0, 0, + 0, 15872, 15872, 15872, 15872, 15872, 15872, 0, + 0, 0, 15872, 15872, 15872, 15872, 15872, 15872, + 0, 0, 0, 15872, 15872, 15872, 15872, 15872, + 15872, 0, 0, 0, 15872, 15872, 15872, 15872, + 15872, 15872, 0, 0, 0, 15872, 15872, 15872, + 15872, 15872, 15872, 0, 0, 0, 15872, 15872, + 15872, 15872, 15872, 15872, 0, 0, 0, 15872, + 15872, 15872, 15872, 15872, 15872, 0, 0, 0, + 15872, 15872, 15872, 15872, 15872, 15872, 0, 0, + 0, 15872, 15872, 15872, 15872, 15872, 15872, 0, + 0, 0, 15872, 15872, 15872, 15872, 15872, 15872, + 0, 0, 0, 15872, 15872, 15872, 15872, 15872, + 15872, 0, 0, 0, 15872, 15872, 15872, 15872, + 15872, 15872, 0, 0, 0, 15872, 15872, 15872, + 15872, 15872, 15872, 0, 0, 0, 15872, 15872, + 15872, 15872, 15872, 15872, 0, 0, 0, 15872, + 15872, 15872, 15872, 15872, 15872, 0, 0, 0, + 15872, 15872, 15872, 15872, 15872, 15872, 0, 0, + 0, 15872, 15872, 15872, 15872, 15872, 15872, 0, + 0, 0, 15872, 15872, 15872, 15872, 15872, 15872, + 0, 0, 0, 15872, 15872, 15872, 15872, 15872, + 15872, 0, 0, 0, 15872, 15872, 15872, 15872, + 15872, 15872, 0, 0, 0, 15872, 15872, 15872, + 15872, 15872, 15872, 0, 0, 0, 15872, 15872, + 15872, 15872, 15872, 15872, 0, 0, 0, 15872, + 15872, 15872, 15872, 15872, 15872, 0, 0, 0, + 15872, 15872, 15872, 15872, 15872, 15872, 0, 0, + 0, 15872, 15872, 15872, 15872, 15872, 15872, 0, + 0, 0, 15872, 15872, 15872, 15872, 15872, 15872, + 0, 0, 0, 15872, 15872, 15872, 15872, 15872, + 15872, 0, 0, 0, 15872, 15872, 15872, 15872, + 15872, 15872, 0, 0, 0, 15872, 15872, 15872, + 15872, 15872, 15872, 0, 0, 0, 15872, 15872, + 15872, 15872, 15872, 15872, 0, 0, 0, 15872, + 15872, 15872, 15872, 15872, 15872, 0, 0, 0, + 15872, 15872, 15872, 15872, 15872, 15872, 0, 0, + 0, 15872, 15872, 15872, 15872, 15872, 15872, 0, + 0, 0, 15872, 15872, 15872, 15872, 15872, 15872, + 0, 0, 0, 15872, 15872, 15872, 15872, 15872, + 15872, 0, 0, 0, 15872, 15872, 15872, 15872, + 15872, 15872, 0, 0, 0, 15872, 15872, 15872, + 15872, 15872, 15872, 0, 0, 0, 15872, 15872, + 15872, 15872, 15872, 15872, 0, 0, 0, 15872, + 15872, 15872, 15872, 15872, 15872, 0, 0, 0 }; printf("\nPerforming test:\n"); @@ -277,7 +377,7 @@ int main(void) } } - printf("ggml_im2col (%i): %s\n", ggml_nelements(im2col_res), passed && (ggml_nelements(im2col_res) == n_im2col_test) ? "PASS" : "FAILED"); + printf("ggml_im2col (%i): %s\n", ggml_nelements(im2col_res), passed && (ggml_nelements(im2col_res) == n_im2col_test) ? "\033[32mPASSED\033[0m" : "\033[31mFAILED\033[0m"); passed = true; for(int i = 0; i < n_conv2d_test; i++) { @@ -287,7 +387,7 @@ int main(void) } } - printf("ggml_conv2d (%i): %s\n", ggml_nelements(conv2d_res), passed && (ggml_nelements(conv2d_res) == n_conv2d_test) ? "PASS" : "FAILED"); + printf("ggml_conv2d (%i): %s\n", ggml_nelements(conv2d_res), passed && (ggml_nelements(conv2d_res) == n_conv2d_test) ? 
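/* Editor's note (not part of the patch): the same sanity check for the 2-D
   case: fp16(1.5f) == 0x3E00 == 15872, and with a 3x3 kernel over IC = 10
   channels an interior output is 2.5f * 1.5f * 9 * 10 = 337.5f, an edge sees
   six taps (225.0f) and a corner four taps (150.0f), matching
   expected_conv2d above. */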
"\033[32mPASSED\033[0m" : "\033[31mFAILED\033[0m"); ggml_free(model.ctx); ggml_backend_buffer_free(model.buffer); From bb340dc38a51ce3e60c14616fc4c753f152af851 Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Tue, 10 Oct 2023 16:19:53 -0400 Subject: [PATCH 09/26] fix ggml_op_count to 70 --- src/ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ggml.c b/src/ggml.c index 1be9f67e8..0659077f5 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -4172,7 +4172,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 72"); +static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); From 419b4b8c3b2ba8f739c08bfc3ce9501852f2f145 Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Tue, 10 Oct 2023 19:44:41 -0400 Subject: [PATCH 10/26] add temp test - gemm != mul_mat --- src/ggml.c | 16 +- tests/CMakeLists.txt | 10 ++ tests/test-mul-mat.cpp | 397 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 411 insertions(+), 12 deletions(-) create mode 100644 tests/test-mul-mat.cpp diff --git a/src/ggml.c b/src/ggml.c index 0659077f5..019467dcd 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -11696,9 +11696,7 @@ static bool ggml_compute_forward_mul_mat_use_blas( } #endif -// legacy multiplication matrix just float 32 data type - -static void ggml_compute_forward_mul_mat_f32_f32( +static void ggml_compute_forward_mul_mat_x( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -12006,16 +12004,10 @@ static void ggml_compute_forward_mul_mat( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - GGML_ASSERT( - src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 || - src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - if(src0->type == GGML_TYPE_F32) { - // full precision - ggml_compute_forward_mul_mat_f32_f32(params, src0, src1, dst); - } else { - // reduce memory usage + if(src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { ggml_compute_forward_mul_mat_f16_f32(params, src0, src1, dst); + } else { + ggml_compute_forward_mul_mat_x(params, src0, src1, dst); } } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2646d49a5..6ed84893f 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -374,3 +374,13 @@ add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) target_link_libraries(${TEST_TARGET} PRIVATE ggml) add_test(NAME ${TEST_TARGET} COMMAND $) set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + + +# +# test-mul-mat + +set(TEST_TARGET test-mul-mat) +add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") diff --git a/tests/test-mul-mat.cpp b/tests/test-mul-mat.cpp new file mode 100644 index 000000000..73cd74f53 --- /dev/null +++ b/tests/test-mul-mat.cpp @@ -0,0 +1,397 @@ +#include "ggml.h" +#include "ggml/ggml-alloc.h" +#include "ggml/ggml-backend.h" + +//#define GGML_USE_CUBLAS uncomment this to use cuda backend, make sure build ggml lib with GGML_CUBLAS=ON +//#define LONG_MATRIX test a conv2d expected matrix + +#ifdef GGML_USE_CUBLAS +#include "ggml-cuda.h" +#endif + +#ifdef GGML_USE_METAL +#include 
"ggml-metal.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +struct test_model { + struct ggml_tensor * a; + struct ggml_tensor * b; + ggml_backend_t backend = NULL; + ggml_backend_buffer_t buffer; + struct ggml_context * ctx; +}; + +void load_model(test_model & model, float* a, float* b, int M, int N, int K, bool use_gpu = false) { + size_t buffer_size = 0; + { + buffer_size += (M * N) * ggml_type_sizef(GGML_TYPE_F32); // tensor a + buffer_size += (N * K) * ggml_type_sizef(GGML_TYPE_F32); // tensor b + buffer_size += 1024; // overhead + } + + printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); + printf("%s: backend buffer size = %d bytes\n", __func__, (int) buffer_size); + + int num_tensors = 2; + struct ggml_init_params params { + /*.mem_size =*/ ggml_tensor_overhead() * num_tensors, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + // initialize the backend +#ifdef GGML_USE_CUBLAS + if (use_gpu) { + fprintf(stderr, "%s: using CUDA backend\n", __func__); + model.backend = ggml_backend_cuda_init(); + if (!model.backend) { + fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__); + } + } +#endif + +#ifdef GGML_USE_METAL + if (use_gpu) { + fprintf(stderr, "%s: using Metal backend\n", __func__); + ggml_metal_log_set_callback(ggml_log_callback_default, nullptr); + model.backend = ggml_backend_metal_init(); + if (!model.backend) { + fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); + } + } +#endif + + if(!model.backend) { + // fallback to CPU backend + model.backend = ggml_backend_cpu_init(); + } + + model.buffer = ggml_backend_alloc_buffer(model.backend, buffer_size); + + // create context + model.ctx = ggml_init(params); + + // create tensors + model.a = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, M, N); + model.b = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, N, K); + + // create a allocator + ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer); + + // alloc memory + ggml_allocr_alloc(alloc, model.a); + + // load data to buffer + if(ggml_backend_is_cpu(model.backend) +#ifdef GGML_USE_METAL + || ggml_backend_is_metal(model.backend) +#endif + ) { + memcpy(model.a->data, a, ggml_nbytes(model.a)); + } else { + ggml_backend_tensor_set(model.a, a, 0, ggml_nbytes(model.a)); // cuda requires copy the data directly to device + } + + // alloc memory + ggml_allocr_alloc(alloc, model.b); + + if(ggml_backend_is_cpu(model.backend) +#ifdef GGML_USE_METAL + || ggml_backend_is_metal(model.backend) +#endif + ) { + memcpy(model.b->data, b, ggml_nbytes(model.b)); + } else { + ggml_backend_tensor_set(model.b, b, 0, ggml_nbytes(model.b)); // cuda requires copy the data directly to device + } + + ggml_allocr_free(alloc); +} + +struct ggml_cgraph * build_graph(const test_model& model, struct ggml_allocr * allocr) { + static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead(); + static std::vector buf(buf_size); + + struct ggml_init_params params0 = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf.data(), + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() + }; + + // create a temporally context to build the graph + struct ggml_context * ctx0 = ggml_init(params0); + + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor* result = ggml_mul_mat(ctx0, model.a, ggml_reshape_2d(ctx0, model.b, 3, 2)); + + ggml_build_forward_expand(gf, result); + + // delete the temporally context used to build the graph 
+ ggml_free(ctx0); + return gf; +} + +struct ggml_tensor* compute(const test_model & model, struct ggml_allocr * allocr) { + // reset the allocator to free all the memory allocated during the previous inference + ggml_allocr_reset(allocr); + + struct ggml_cgraph * gf = build_graph(model, allocr); + + // allocate tensors + ggml_allocr_alloc_graph(allocr, gf); + int n_threads = 1; + + if (ggml_backend_is_cpu(model.backend)) { + ggml_backend_cpu_set_n_threads(model.backend, n_threads); + } + +#ifdef GGML_USE_METAL + if (ggml_backend_is_metal(model.backend)) { + ggml_backend_metal_set_n_cb(model.backend, n_threads); + } +#endif + + ggml_backend_graph_compute(model.backend, gf); + + //ggml_graph_print(gf); + + // in this case, the output tensor is the last one in the graph + return gf->nodes[gf->n_nodes - 1]; +} + + +static void ggml_vec_dot_f16(const int n, float * s, float * x, float * y) { + float sumf = 0.0; + for (int i = 0; i < n; ++i) { + sumf += x[i] * y[i]; + } + *s = sumf; +} + +static void gemm_f16_out_f32(int m, int n, int k, + float * A, + float * B, + float * C, + const int ith, const int nth) { + // does not seem to make a difference + int m0, m1, n0, n1; + // patches per thread + if (m > n) { + n0 = 0; + n1 = n; + + // total patches in dst + const int np = m; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + m0 = dp*ith; + m1 = std::min(m0 + dp, np); + } else { + m0 = 0; + m1 = m; + + // total patches in dst + const int np = n; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + n0 = dp*ith; + n1 = std::min(n0 + dp, np); + } + + // block-tiling attempt + int64_t blck_n = 16; + int64_t blck_m = 16; + + for (int j = n0; j < n1; j+=blck_n) { + for (int i = m0; i < m1; i+=blck_m) { + // printf("i j k => %d %d %d\n", i, j, K); + for (int ii = i; ii < i + blck_m && ii < m1; ii++) { + for (int jj = j; jj < j + blck_n && jj < n1; jj++) { + ggml_vec_dot_f16(k, + C + ii*n + jj, + A + ii * k, + B + jj * k); + } + } + } + } +} + + +void perform_gemm_test(float* a, float* b, float* expected, int M, int N, int K) { + printf("\nPerforming gemm_f16_out_f32 test:\n"); + // transpose b + float* transposed = new float[K * N]; + for(int i = 0; i < N; i++) { + for(int j = 0; j < K; j++) { + transposed[j * N + i] = b[i * K + j]; + } + } + + for (int i = 0; i < K; i++) { + for (int j = 0; j < N; j++) { + printf("%.1f ", transposed[i * N + j]); + } + printf("\n"); + } + + float* gemm_out = new float[M * K]; + // N, K => transpose => K, N + gemm_f16_out_f32(M, K, N, a, transposed, gemm_out, 0, 1); + + for (int i = 0; i < M; i++) { + for (int j = 0; j < K; j++) { + printf("%.1f ", gemm_out[i * K + j]); + } + printf("\n"); + } + + bool passed = true; + + for(int i = 0; i < M * K; i++) { + if(gemm_out[i] != expected[i]) { + passed = false; + break; + } + } + + printf("gemm_mult: %s\n", passed ? 
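/* Editor's note (not part of the patch): despite the name, inherited from
   ggml.c where A and B are ggml_fp16_t, this reference gemm_f16_out_f32
   takes plain float pointers. The dot product pairs row ii of A with row jj
   of B, so it computes A * B^T, the same contraction convention as
   ggml_mul_mat, which is why perform_gemm_test transposes b before calling
   it. */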
"\033[32mPASSED\033[0m" : "\033[31mFAILED\033[0m"); +} + +int main(void) +{ + ggml_time_init(); +#ifdef LONG_MATRIX + const int M = 4, N = 16, K = 36; // a conv2d expected matrix multiplication +#else + const int M = 3, N = 2, K = 3; // a normal matrix multiplication +#endif + + // matrix A (4 X 16) + float matrixA[M * N] = { +#ifdef LONG_MATRIX + 2.0f, 8.0f, 5.0f, 1.0f, 10.0f, 5.0f, 9.0f, 9.0f, 3.0f, 5.0f, 6.0f, 6.0f, 2.0f, 8.0f, 2.0f, 2.0f, + 6.0f, 3.0f, 8.0f, 7.0f, 2.0f, 5.0f, 3.0f, 4.0f, 3.0f, 3.0f, 2.0f, 7.0f, 9.0f, 6.0f, 8.0f, 7.0f, + 2.0f, 9.0f, 10.0f, 3.0f, 8.0f, 10.0f, 6.0f, 5.0f, 4.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 2.0f, 2.0f, + 4.0f, 9.0f, 8.0f, 5.0f, 3.0f, 8.0f, 8.0f, 10.0f, 4.0f, 2.0f, 10.0f, 9.0f, 7.0f, 6.0f, 1.0f, 3.0f +#else + 2, 8, + 5, 1, + 4, 2 +#endif + }; + + // matrix B (16 X 36) + float matrixB[N * K] = { +#ifdef LONG_MATRIX + 9.0f, 7.0f, 1.0f, 3.0f, 5.0f, 9.0f, 7.0f, 6.0f, 1.0f, 10.0f, 1.0f, 1.0f, 7.0f, 2.0f, 4.0f, 9.0f, 10.0f, 4.0f, 5.0f, 5.0f, 7.0f, 1.0f, 7.0f, 7.0f, 2.0f, 9.0f, 5.0f, 10.0f, 7.0f, 4.0f, 8.0f, 9.0f, 9.0f, 3.0f, 10.0f, 2.0f, + 4.0f, 6.0f, 10.0f, 9.0f, 5.0f, 1.0f, 8.0f, 7.0f, 4.0f, 7.0f, 2.0f, 6.0f, 5.0f, 3.0f, 1.0f, 10.0f, 8.0f, 4.0f, 8.0f, 3.0f, 7.0f, 1.0f, 2.0f, 7.0f, 6.0f, 8.0f, 6.0f, 5.0f, 2.0f, 3.0f, 1.0f, 1.0f, 2.0f, 5.0f, 7.0f, 1.0f, + 8.0f, 2.0f, 8.0f, 8.0f, 8.0f, 8.0f, 4.0f, 4.0f, 6.0f, 10.0f, 10.0f, 9.0f, 2.0f, 9.0f, 3.0f, 7.0f, 7.0f, 1.0f, 4.0f, 9.0f, 1.0f, 2.0f, 3.0f, 6.0f, 1.0f, 10.0f, 5.0f, 8.0f, 9.0f, 4.0f, 6.0f, 2.0f, 3.0f, 1.0f, 2.0f, 7.0f, + 5.0f, 1.0f, 7.0f, 2.0f, 9.0f, 10.0f, 9.0f, 5.0f, 2.0f, 5.0f, 4.0f, 10.0f, 9.0f, 9.0f, 1.0f, 9.0f, 8.0f, 8.0f, 9.0f, 4.0f, 9.0f, 4.0f, 8.0f, 2.0f, 1.0f, 8.0f, 4.0f, 5.0f, 10.0f, 7.0f, 6.0f, 2.0f, 1.0f, 10.0f, 10.0f, 7.0f, + 9.0f, 4.0f, 5.0f, 9.0f, 5.0f, 10.0f, 10.0f, 3.0f, 6.0f, 6.0f, 4.0f, 4.0f, 4.0f, 8.0f, 5.0f, 4.0f, 9.0f, 1.0f, 9.0f, 9.0f, 1.0f, 7.0f, 9.0f, 2.0f, 10.0f, 9.0f, 10.0f, 8.0f, 3.0f, 3.0f, 9.0f, 3.0f, 9.0f, 10.0f, 1.0f, 8.0f, + 9.0f, 2.0f, 6.0f, 9.0f, 7.0f, 2.0f, 3.0f, 5.0f, 3.0f, 6.0f, 9.0f, 7.0f, 3.0f, 7.0f, 6.0f, 4.0f, 10.0f, 3.0f, 5.0f, 7.0f, 2.0f, 9.0f, 3.0f, 2.0f, 2.0f, 10.0f, 8.0f, 7.0f, 3.0f, 10.0f, 6.0f, 3.0f, 1.0f, 1.0f, 4.0f, 10.0f, + 2.0f, 9.0f, 2.0f, 10.0f, 6.0f, 4.0f, 3.0f, 6.0f, 3.0f, 6.0f, 9.0f, 7.0f, 8.0f, 8.0f, 3.0f, 3.0f, 10.0f, 5.0f, 2.0f, 10.0f, 7.0f, 10.0f, 9.0f, 3.0f, 6.0f, 6.0f, 5.0f, 10.0f, 2.0f, 3.0f, 6.0f, 1.0f, 9.0f, 4.0f, 10.0f, 4.0f, + 10.0f, 7.0f, 8.0f, 10.0f, 10.0f, 8.0f, 7.0f, 10.0f, 4.0f, 6.0f, 8.0f, 7.0f, 7.0f, 6.0f, 9.0f, 3.0f, 6.0f, 5.0f, 5.0f, 2.0f, 7.0f, 2.0f, 7.0f, 4.0f, 4.0f, 6.0f, 6.0f, 4.0f, 3.0f, 9.0f, 3.0f, 6.0f, 4.0f, 7.0f, 2.0f, 9.0f, + 7.0f, 3.0f, 2.0f, 5.0f, 7.0f, 3.0f, 10.0f, 2.0f, 6.0f, 1.0f, 4.0f, 7.0f, 5.0f, 10.0f, 3.0f, 10.0f, 4.0f, 5.0f, 5.0f, 1.0f, 6.0f, 10.0f, 7.0f, 4.0f, 5.0f, 3.0f, 9.0f, 9.0f, 8.0f, 6.0f, 9.0f, 2.0f, 3.0f, 6.0f, 8.0f, 5.0f, + 5.0f, 5.0f, 5.0f, 5.0f, 3.0f, 10.0f, 4.0f, 1.0f, 8.0f, 8.0f, 9.0f, 8.0f, 4.0f, 1.0f, 4.0f, 9.0f, 3.0f, 6.0f, 3.0f, 1.0f, 4.0f, 8.0f, 3.0f, 10.0f, 8.0f, 6.0f, 4.0f, 5.0f, 4.0f, 3.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, + 6.0f, 2.0f, 3.0f, 3.0f, 3.0f, 7.0f, 5.0f, 1.0f, 8.0f, 1.0f, 4.0f, 5.0f, 1.0f, 1.0f, 6.0f, 4.0f, 2.0f, 1.0f, 7.0f, 8.0f, 6.0f, 1.0f, 1.0f, 5.0f, 6.0f, 5.0f, 10.0f, 6.0f, 7.0f, 5.0f, 9.0f, 3.0f, 2.0f, 7.0f, 9.0f, 4.0f, + 2.0f, 5.0f, 9.0f, 5.0f, 10.0f, 3.0f, 1.0f, 8.0f, 1.0f, 7.0f, 1.0f, 8.0f, 1.0f, 6.0f, 7.0f, 8.0f, 4.0f, 9.0f, 5.0f, 10.0f, 3.0f, 7.0f, 6.0f, 8.0f, 8.0f, 5.0f, 6.0f, 8.0f, 10.0f, 9.0f, 4.0f, 1.0f, 3.0f, 3.0f, 4.0f, 7.0f, + 8.0f, 2.0f, 6.0f, 6.0f, 5.0f, 
1.0f, 3.0f, 7.0f, 1.0f, 7.0f, 2.0f, 2.0f, 2.0f, 8.0f, 4.0f, 1.0f, 1.0f, 5.0f, 9.0f, 4.0f, 1.0f, 2.0f, 3.0f, 10.0f, 1.0f, 4.0f, 9.0f, 9.0f, 6.0f, 8.0f, 8.0f, 1.0f, 9.0f, 10.0f, 4.0f, 1.0f, + 8.0f, 5.0f, 8.0f, 9.0f, 4.0f, 8.0f, 2.0f, 1.0f, 1.0f, 9.0f, 4.0f, 5.0f, 6.0f, 1.0f, 2.0f, 5.0f, 6.0f, 7.0f, 3.0f, 1.0f, 4.0f, 6.0f, 7.0f, 7.0f, 7.0f, 8.0f, 7.0f, 8.0f, 8.0f, 2.0f, 10.0f, 2.0f, 7.0f, 3.0f, 8.0f, 3.0f, + 8.0f, 7.0f, 6.0f, 2.0f, 4.0f, 10.0f, 10.0f, 6.0f, 10.0f, 3.0f, 7.0f, 6.0f, 4.0f, 3.0f, 5.0f, 5.0f, 5.0f, 3.0f, 8.0f, 10.0f, 3.0f, 4.0f, 8.0f, 4.0f, 2.0f, 6.0f, 8.0f, 9.0f, 6.0f, 9.0f, 4.0f, 3.0f, 5.0f, 2.0f, 2.0f, 6.0f, + 10.0f, 6.0f, 2.0f, 1.0f, 7.0f, 5.0f, 6.0f, 4.0f, 1.0f, 9.0f, 10.0f, 2.0f, 4.0f, 5.0f, 8.0f, 5.0f, 7.0f, 4.0f, 7.0f, 6.0f, 3.0f, 9.0f, 2.0f, 1.0f, 4.0f, 2.0f, 6.0f, 6.0f, 3.0f, 3.0f, 2.0f, 8.0f, 5.0f, 9.0f, 3.0f, 4.0f, +#else + + 10, 9, 5, + 5, 9, 4 +#endif + }; + + // matrix C (4 x 16) + float expected_result[M * K] = { +#ifdef LONG_MATRIX + 548.0f, 413.0f, 494.0f, 613.0f, 506.0f, 509.0f, 452.0f, 399.0f, 348.0f, 530.0f, 467.0f, 505.0f, 378.0f, 440.0f, 377.0f, 465.0f, 548.0f, 353.0f, 455.0f, 480.0f, 368.0f, 443.0f, 462.0f, 421.0f, 467.0f, 575.0f, 571.0f, 594.0f, 415.0f, 420.0f, 484.0f, 226.0f, 415.0f, 424.0f, 448.0f, 448.0f, + 590.0f, 365.0f, 481.0f, 458.0f, 531.0f, 511.0f, 453.0f, 425.0f, 299.0f, 562.0f, 454.0f, 479.0f, 361.0f, 469.0f, 369.0f, 487.0f, 505.0f, 389.0f, 503.0f, 486.0f, 336.0f, 404.0f, 437.0f, 443.0f, 311.0f, 542.0f, 545.0f, 624.0f, 520.0f, 492.0f, 476.0f, 261.0f, 401.0f, 425.0f, 436.0f, 421.0f, + 545.0f, 339.0f, 488.0f, 579.0f, 503.0f, 440.0f, 437.0f, 388.0f, 319.0f, 520.0f, 460.0f, 495.0f, 340.0f, 482.0f, 331.0f, 461.0f, 550.0f, 311.0f, 456.0f, 474.0f, 317.0f, 410.0f, 405.0f, 385.0f, 364.0f, 583.0f, 543.0f, 576.0f, 416.0f, 430.0f, 463.0f, 204.0f, 354.0f, 380.0f, 404.0f, 444.0f, + 634.0f, 429.0f, 583.0f, 652.0f, 631.0f, 526.0f, 493.0f, 506.0f, 359.0f, 608.0f, 516.0f, 596.0f, 417.0f, 542.0f, 443.0f, 552.0f, 602.0f, 432.0f, 553.0f, 560.0f, 451.0f, 452.0f, 478.0f, 512.0f, 440.0f, 650.0f, 657.0f, 690.0f, 552.0f, 560.0f, 564.0f, 269.0f, 422.0f, 499.0f, 561.0f, 508.0f +#else + 60.0f, 90.0f, 42.0f, + 55.0f, 54.0f, 29.0f, + 50.0f, 54.0f, 28.0f +#endif + }; + + bool passed = true; + + perform_gemm_test(matrixA, matrixB, expected_result, M, N, K); + + test_model model; + load_model(model, matrixA, matrixB, M, N, K, true); + + ggml_backend_buffer_t buf_compute; // for compute + struct ggml_allocr * allocr = NULL; + + { + size_t align = ggml_backend_get_alignment(model.backend); + allocr = ggml_allocr_new_measure(align); + + //create the worst case graph for memory usage estimation + struct ggml_cgraph * gf = build_graph(model, allocr); + size_t mem_size = ggml_allocr_alloc_graph(allocr, gf); + ggml_allocr_free(allocr); + + // compute the required memory + buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size); + allocr = ggml_allocr_new_from_buffer(buf_compute); + fprintf(stderr, "%s: compute buffer size: %.4f KB\n", __func__, mem_size/1024.0); + } + + struct ggml_tensor * result = compute(model, allocr); + + float* out_data = new float[ggml_nelements(result)]; + + ggml_backend_tensor_get(result, out_data, 0, ggml_nbytes(result)); + + printf("\nPerforming ggml_mul_mat test:\n"); + + passed = true; + for(int i = 0; i < M * K; i++) { + if(out_data[i] != expected_result[i]) { + passed = false; + break; + } + } + + for (int i = 0; i < M; i++) { + for (int j = 0; j < K; j++) { + printf("%.1f ", out_data[i * K + j]); + } + printf("\n"); + } + + 
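/* Editor's note (not part of the patch): the LONG_MATRIX shape M = 4,
   N = 16, K = 36 mirrors the original conv2d test: OC = 4 output channels,
   OH * OW = 4 * 4 output pixels, and IC * KH * KW = 4 * 3 * 3 = 36 flattened
   kernel taps, so this exercises exactly the GEMM that im2col produces. */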
printf("ggml_mul_mat (%i): %s\n", ggml_nelements(result), passed && (ggml_nelements(result) == M * K) ? "\033[32mPASSED\033[0m" : "\033[31mFAILED\033[0m"); + + // free memory + ggml_free(model.ctx); + + ggml_backend_buffer_free(model.buffer); + ggml_backend_buffer_free(buf_compute); + ggml_backend_free(model.backend); + return 0; +} From af312e43cc255d5ea3f8eb848d0d29d3511dcbd1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Oct 2023 21:50:54 +0300 Subject: [PATCH 11/26] tests : fix test-mul-mat matrix multiplication --- include/ggml/ggml.h | 2 +- src/ggml.c | 5 ++--- tests/test-mul-mat.cpp | 12 +++++++----- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/include/ggml/ggml.h b/include/ggml/ggml.h index d187620e1..5f0864335 100644 --- a/include/ggml/ggml.h +++ b/include/ggml/ggml.h @@ -1383,7 +1383,7 @@ extern "C" { int p1, int d0, int d1, - bool is_2D); + bool is_2D); GGML_API struct ggml_tensor * ggml_conv_1d( struct ggml_context * ctx, diff --git a/src/ggml.c b/src/ggml.c index 019467dcd..4cd3b468d 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -7594,7 +7594,7 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d( // a: [OC,IC, KH, KW] // b: [N, IC, IH, IW] // result: [N, OH, OW, IC*KH*KW] -static struct ggml_tensor * ggml_im2col( +struct ggml_tensor * ggml_im2col( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, @@ -7604,7 +7604,7 @@ static struct ggml_tensor * ggml_im2col( int p1, int d0, int d1, - bool is_2D) { + bool is_2D) { if(is_2D) { GGML_ASSERT(a->ne[2] == b->ne[2]); @@ -7638,7 +7638,6 @@ static struct ggml_tensor * ggml_im2col( result->src[1] = b; return result; - } // a: [OC,IC, KH, KW] diff --git a/tests/test-mul-mat.cpp b/tests/test-mul-mat.cpp index 73cd74f53..4d93d4b65 100644 --- a/tests/test-mul-mat.cpp +++ b/tests/test-mul-mat.cpp @@ -81,8 +81,8 @@ void load_model(test_model & model, float* a, float* b, int M, int N, int K, boo model.ctx = ggml_init(params); // create tensors - model.a = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, M, N); - model.b = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, N, K); + model.a = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, N, M); + model.b = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, K, N); // create a allocator ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer); @@ -130,11 +130,13 @@ struct ggml_cgraph * build_graph(const test_model& model, struct ggml_allocr * a // create a temporally context to build the graph struct ggml_context * ctx0 = ggml_init(params0); - struct ggml_cgraph * gf = ggml_new_graph(ctx0); + struct ggml_cgraph * gf = ggml_new_graph(ctx0); - struct ggml_tensor* result = ggml_mul_mat(ctx0, model.a, ggml_reshape_2d(ctx0, model.b, 3, 2)); + // zT = x @ yT + struct ggml_tensor * result = ggml_mul_mat(ctx0, model.a, ggml_cont(ctx0, ggml_transpose(ctx0, model.b))); - ggml_build_forward_expand(gf, result); + // z = (zT)T + ggml_build_forward_expand(gf, ggml_cont(ctx0, ggml_transpose(ctx0, result))); // delete the temporally context used to build the graph ggml_free(ctx0); From c692f6152e972df1b429c5db93b495cc165bee38 Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Sat, 14 Oct 2023 11:20:43 -0400 Subject: [PATCH 12/26] test-mul-mat match gemm == ggml_mul_mat with conv2d op --- tests/test-conv2d.cpp | 3 ++ tests/test-mul-mat.cpp | 68 +++++++++++++++++------------------------- 2 files changed, 31 insertions(+), 40 deletions(-) diff --git a/tests/test-conv2d.cpp b/tests/test-conv2d.cpp index f8ae975d5..9cc4d3831 100644 --- a/tests/test-conv2d.cpp +++ 
b/tests/test-conv2d.cpp @@ -157,7 +157,10 @@ struct ggml_cgraph * build_graph(const test_model& model, struct ggml_allocr * a // split conv2d in fundamental methods for test unit struct ggml_tensor* im2col_res = ggml_im2col(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1, true); ggml_set_name(im2col_res, "im2col_res"); + printf("MatrixA [%i, %i, %i, %i]\n", model.a->ne[0], model.a->ne[1], model.a->ne[2], model.a->ne[3]); + printf("MatrixB [%i, %i, %i, %i]\n", im2col_res->ne[0], im2col_res->ne[1], im2col_res->ne[2], im2col_res->ne[3]); struct ggml_tensor* conv2d_res = ggml_mul_mat(ctx0, model.a, im2col_res); + printf("MatrixR [%i, %i, %i, %i]\n", conv2d_res->ne[0], conv2d_res->ne[1], conv2d_res->ne[2], conv2d_res->ne[3]); ggml_set_name(conv2d_res, "conv2d_res"); ggml_build_forward_expand(gf, conv2d_res); diff --git a/tests/test-mul-mat.cpp b/tests/test-mul-mat.cpp index 4d93d4b65..671f4fb21 100644 --- a/tests/test-mul-mat.cpp +++ b/tests/test-mul-mat.cpp @@ -3,7 +3,8 @@ #include "ggml/ggml-backend.h" //#define GGML_USE_CUBLAS uncomment this to use cuda backend, make sure build ggml lib with GGML_CUBLAS=ON -//#define LONG_MATRIX test a conv2d expected matrix +// test a conv2d expected matrix +#define LONG_MATRIX #ifdef GGML_USE_CUBLAS #include "ggml-cuda.h" @@ -81,8 +82,10 @@ void load_model(test_model & model, float* a, float* b, int M, int N, int K, boo model.ctx = ggml_init(params); // create tensors - model.a = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, N, M); + model.a = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, K, M); + printf("Matrix A: [%i, %i]\n", K, M); model.b = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, K, N); + printf("Matrix B: [%i, %i]\n", K, N); // create a allocator ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer); @@ -133,7 +136,7 @@ struct ggml_cgraph * build_graph(const test_model& model, struct ggml_allocr * a struct ggml_cgraph * gf = ggml_new_graph(ctx0); // zT = x @ yT - struct ggml_tensor * result = ggml_mul_mat(ctx0, model.a, ggml_cont(ctx0, ggml_transpose(ctx0, model.b))); + struct ggml_tensor * result = ggml_mul_mat(ctx0, model.a, ggml_cont(ctx0, model.b)); // z = (zT)T ggml_build_forward_expand(gf, ggml_cont(ctx0, ggml_transpose(ctx0, result))); @@ -185,6 +188,7 @@ static void gemm_f16_out_f32(int m, int n, int k, float * B, float * C, const int ith, const int nth) { + printf("M: %i, N: %i, K: %i\n", m, n, k); // does not seem to make a difference int m0, m1, n0, n1; // patches per thread @@ -238,42 +242,27 @@ static void gemm_f16_out_f32(int m, int n, int k, void perform_gemm_test(float* a, float* b, float* expected, int M, int N, int K) { printf("\nPerforming gemm_f16_out_f32 test:\n"); - // transpose b - float* transposed = new float[K * N]; - for(int i = 0; i < N; i++) { - for(int j = 0; j < K; j++) { - transposed[j * N + i] = b[i * K + j]; - } - } - - for (int i = 0; i < K; i++) { - for (int j = 0; j < N; j++) { - printf("%.1f ", transposed[i * N + j]); - } - printf("\n"); - } - float* gemm_out = new float[M * K]; - // N, K => transpose => K, N - gemm_f16_out_f32(M, K, N, a, transposed, gemm_out, 0, 1); + float* gemm_out = new float[M * N]; + gemm_f16_out_f32(M, N, K, a, b, gemm_out, 0, 1); for (int i = 0; i < M; i++) { - for (int j = 0; j < K; j++) { - printf("%.1f ", gemm_out[i * K + j]); + for (int j = 0; j < N; j++) { + printf("%.1ff,", gemm_out[i * N + j]); } printf("\n"); } bool passed = true; - for(int i = 0; i < M * K; i++) { + for(int i = 0; i < M * N; i++) { if(gemm_out[i] != expected[i]) { passed = false; break; } } - 
printf("gemm_mult: %s\n", passed ? "\033[32mPASSED\033[0m" : "\033[31mFAILED\033[0m"); + printf("gemm_mult (%i): %s\n", (M * N), passed ? "\033[32mPASSED\033[0m" : "\033[31mFAILED\033[0m"); } int main(void) @@ -285,13 +274,13 @@ int main(void) const int M = 3, N = 2, K = 3; // a normal matrix multiplication #endif - // matrix A (4 X 16) - float matrixA[M * N] = { + // matrix A (4 X 36) + float matrixA[M * K] = { #ifdef LONG_MATRIX - 2.0f, 8.0f, 5.0f, 1.0f, 10.0f, 5.0f, 9.0f, 9.0f, 3.0f, 5.0f, 6.0f, 6.0f, 2.0f, 8.0f, 2.0f, 2.0f, - 6.0f, 3.0f, 8.0f, 7.0f, 2.0f, 5.0f, 3.0f, 4.0f, 3.0f, 3.0f, 2.0f, 7.0f, 9.0f, 6.0f, 8.0f, 7.0f, - 2.0f, 9.0f, 10.0f, 3.0f, 8.0f, 10.0f, 6.0f, 5.0f, 4.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 2.0f, 2.0f, - 4.0f, 9.0f, 8.0f, 5.0f, 3.0f, 8.0f, 8.0f, 10.0f, 4.0f, 2.0f, 10.0f, 9.0f, 7.0f, 6.0f, 1.0f, 3.0f + 2.0f, 9.0f, 2.0f, 10.0f, 6.0f, 4.0f, 3.0f, 6.0f, 3.0f, 6.0f, 9.0f, 7.0f, 8.0f, 8.0f, 3.0f, 3.0f, 10.0f, 5.0f, 2.0f, 10.0f, 7.0f, 10.0f, 9.0f, 3.0f, 6.0f, 6.0f, 5.0f, 10.0f, 2.0f, 3.0f, 6.0f, 1.0f, 9.0f, 4.0f, 10.0f, 4.0f, + 10.0f, 7.0f, 8.0f, 10.0f, 10.0f, 8.0f, 7.0f, 10.0f, 4.0f, 6.0f, 8.0f, 7.0f, 7.0f, 6.0f, 9.0f, 3.0f, 6.0f, 5.0f, 5.0f, 2.0f, 7.0f, 2.0f, 7.0f, 4.0f, 4.0f, 6.0f, 6.0f, 4.0f, 3.0f, 9.0f, 3.0f, 6.0f, 4.0f, 7.0f, 2.0f, 9.0f, + 7.0f, 3.0f, 2.0f, 5.0f, 7.0f, 3.0f, 10.0f, 2.0f, 6.0f, 1.0f, 4.0f, 7.0f, 5.0f, 10.0f, 3.0f, 10.0f, 4.0f, 5.0f, 5.0f, 1.0f, 6.0f, 10.0f, 7.0f, 4.0f, 5.0f, 3.0f, 9.0f, 9.0f, 8.0f, 6.0f, 9.0f, 2.0f, 3.0f, 6.0f, 8.0f, 5.0f, + 5.0f, 5.0f, 5.0f, 5.0f, 3.0f, 10.0f, 4.0f, 1.0f, 8.0f, 8.0f, 9.0f, 8.0f, 4.0f, 1.0f, 4.0f, 9.0f, 3.0f, 6.0f, 3.0f, 1.0f, 4.0f, 8.0f, 3.0f, 10.0f, 8.0f, 6.0f, 4.0f, 5.0f, 4.0f, 3.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, #else 2, 8, 5, 1, @@ -319,19 +308,18 @@ int main(void) 8.0f, 7.0f, 6.0f, 2.0f, 4.0f, 10.0f, 10.0f, 6.0f, 10.0f, 3.0f, 7.0f, 6.0f, 4.0f, 3.0f, 5.0f, 5.0f, 5.0f, 3.0f, 8.0f, 10.0f, 3.0f, 4.0f, 8.0f, 4.0f, 2.0f, 6.0f, 8.0f, 9.0f, 6.0f, 9.0f, 4.0f, 3.0f, 5.0f, 2.0f, 2.0f, 6.0f, 10.0f, 6.0f, 2.0f, 1.0f, 7.0f, 5.0f, 6.0f, 4.0f, 1.0f, 9.0f, 10.0f, 2.0f, 4.0f, 5.0f, 8.0f, 5.0f, 7.0f, 4.0f, 7.0f, 6.0f, 3.0f, 9.0f, 2.0f, 1.0f, 4.0f, 2.0f, 6.0f, 6.0f, 3.0f, 3.0f, 2.0f, 8.0f, 5.0f, 9.0f, 3.0f, 4.0f, #else - 10, 9, 5, 5, 9, 4 #endif }; // matrix C (4 x 16) - float expected_result[M * K] = { + float expected_result[M * N] = { #ifdef LONG_MATRIX - 548.0f, 413.0f, 494.0f, 613.0f, 506.0f, 509.0f, 452.0f, 399.0f, 348.0f, 530.0f, 467.0f, 505.0f, 378.0f, 440.0f, 377.0f, 465.0f, 548.0f, 353.0f, 455.0f, 480.0f, 368.0f, 443.0f, 462.0f, 421.0f, 467.0f, 575.0f, 571.0f, 594.0f, 415.0f, 420.0f, 484.0f, 226.0f, 415.0f, 424.0f, 448.0f, 448.0f, - 590.0f, 365.0f, 481.0f, 458.0f, 531.0f, 511.0f, 453.0f, 425.0f, 299.0f, 562.0f, 454.0f, 479.0f, 361.0f, 469.0f, 369.0f, 487.0f, 505.0f, 389.0f, 503.0f, 486.0f, 336.0f, 404.0f, 437.0f, 443.0f, 311.0f, 542.0f, 545.0f, 624.0f, 520.0f, 492.0f, 476.0f, 261.0f, 401.0f, 425.0f, 436.0f, 421.0f, - 545.0f, 339.0f, 488.0f, 579.0f, 503.0f, 440.0f, 437.0f, 388.0f, 319.0f, 520.0f, 460.0f, 495.0f, 340.0f, 482.0f, 331.0f, 461.0f, 550.0f, 311.0f, 456.0f, 474.0f, 317.0f, 410.0f, 405.0f, 385.0f, 364.0f, 583.0f, 543.0f, 576.0f, 416.0f, 430.0f, 463.0f, 204.0f, 354.0f, 380.0f, 404.0f, 444.0f, - 634.0f, 429.0f, 583.0f, 652.0f, 631.0f, 526.0f, 493.0f, 506.0f, 359.0f, 608.0f, 516.0f, 596.0f, 417.0f, 542.0f, 443.0f, 552.0f, 602.0f, 432.0f, 553.0f, 560.0f, 451.0f, 452.0f, 478.0f, 512.0f, 440.0f, 650.0f, 657.0f, 690.0f, 552.0f, 560.0f, 564.0f, 269.0f, 422.0f, 499.0f, 561.0f, 508.0f + 1224.0f, 1023.0f, 
1158.0f,1259.0f,1359.0f,1194.0f,1535.0f,1247.0f,1185.0f,1029.0f,889.0f,1182.0f,955.0f,1179.0f,1147.0f,1048.0f, + 1216.0f, 1087.0f, 1239.0f,1361.0f,1392.0f,1260.0f,1247.0f,1563.0f,1167.0f,1052.0f,942.0f,1214.0f,1045.0f,1134.0f,1264.0f,1126.0f, + 1125.0f, 966.0f, 1079.0f,1333.0f,1287.0f,1101.0f,1185.0f,1167.0f,1368.0f,990.0f,967.0f,1121.0f,971.0f,1086.0f,1130.0f,980.0f, + 999.0f, 902.0f, 1020.0f,1056.0f,1076.0f,929.0f,1029.0f,1052.0f,990.0f,1108.0f,823.0f,989.0f,759.0f,1041.0f,1003.0f,870.0f #else 60.0f, 90.0f, 42.0f, 55.0f, 54.0f, 29.0f, @@ -373,7 +361,7 @@ int main(void) printf("\nPerforming ggml_mul_mat test:\n"); passed = true; - for(int i = 0; i < M * K; i++) { + for(int i = 0; i < M * N; i++) { if(out_data[i] != expected_result[i]) { passed = false; break; @@ -381,13 +369,13 @@ int main(void) } for (int i = 0; i < M; i++) { - for (int j = 0; j < K; j++) { - printf("%.1f ", out_data[i * K + j]); + for (int j = 0; j < N; j++) { + printf("%.1f ", out_data[i * N + j]); } printf("\n"); } - printf("ggml_mul_mat (%i): %s\n", ggml_nelements(result), passed && (ggml_nelements(result) == M * K) ? "\033[32mPASSED\033[0m" : "\033[31mFAILED\033[0m"); + printf("ggml_mul_mat (%i): %s\n", ggml_nelements(result), passed && (ggml_nelements(result) == M * N) ? "\033[32mPASSED\033[0m" : "\033[31mFAILED\033[0m"); // free memory ggml_free(model.ctx); From 3dad5e698053bfc7a781c1a9b9e7df4a6704a4e7 Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Sat, 14 Oct 2023 13:21:33 -0400 Subject: [PATCH 13/26] replaced gemm by ggml_mul_mat --- src/ggml-cuda.cu | 44 +------------- src/ggml.c | 134 ++++++----------------------------------- tests/test-conv1d.cpp | 7 ++- tests/test-conv2d.cpp | 10 +-- tests/test-mul-mat.cpp | 24 -------- 5 files changed, 30 insertions(+), 189 deletions(-) diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 92caab2b0..1a72047ae 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -4585,17 +4585,6 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale dst[i] = scale * x[i]; } -static __global__ void gemm_f16_f32(const half *x,const half *y, float *dst, int N, int M, int K) { - int row = blockIdx.y * blockDim.y + threadIdx.y; - int col = blockIdx.x * blockDim.x + threadIdx.x; - if (row < N && col < K) { - float sum = 0.0f; - for (int i = 0; i < M; ++i) { - sum += __half2float(x[row * M + i]) * __half2float(y[col * M + i]); - } - dst[row * K + col] = sum; - } -} static __global__ void im2col_f32_f16(const float* x, half* dst, int ofs0, int ofs1, int IW,int IH,int CHW,int s0,int s1,int p0,int p1,int d0,int d1) { int iiw = blockIdx.z * s0 + threadIdx.z * d0 - p0; @@ -5566,15 +5555,6 @@ static void im2col_f32_f16_cuda(const float* x, half* dst, im2col_f32_f16<<>>(x, dst, ofs0, ofs1, IW, IH, (IC * KH * KW), s0, s1, p0, p1, d0, d1); } -// GEMM -static void gemm_f16_f32_cuda(const half* x,const half* y, float* dst, int m, int n, int k, int N, cudaStream_t stream) { - for(int i = 0; i < N; i++) { - dim3 block_dims(16, 16); - dim3 block_nums((n + block_dims.x - 1) / block_dims.x, (m + block_dims.y - 1) / block_dims.y); - gemm_f16_f32<<>>(x, y + i * m * k, dst + i * m * n, m, k, n); - } -} - // buffer pool for cuda #define MAX_CUDA_BUFFERS 256 @@ -6518,26 +6498,6 @@ inline void ggml_cuda_op_im2col( (void) src0_dd; } -inline void ggml_cuda_op_mul_mat_gemm_f16( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { - - GGML_ASSERT(src0->type == 
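// The op removed here mapped convolution onto a plain GEMM; for reference,
// the dimension bookkeeping it used (a sketch, matching the shapes visible
// in this diff):
//   m = OC            one output row per filter
//   n = OH * OW       one output column per spatial position
//   k = IC * KH * KW  the unrolled patch length produced by im2col
// with one GEMM launched per image in the batch.
struct conv2d_gemm_dims { int m, n, k; };
static conv2d_gemm_dims conv2d_as_gemm(int OC, int OH, int OW, int IC, int KH, int KW) {
    return { OC, OH * OW, IC * KH * KW }; // conv2d == GEMM over the im2col'ed input
}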
GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F16); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - bool case_conv_2d = (src0->ne[0] * src0->ne[1] * src0->ne[2]) == src1->ne[0]; - - int m = src0->ne[case_conv_2d ? 3 : 2]; - int n = (case_conv_2d ? src1->ne[2] : 1) * src1->ne[1]; - int k = (case_conv_2d ? src0->ne[2] : 1) * src0->ne[1] * src0->ne[0]; - int N = src1->ne[case_conv_2d ? 3 : 2]; - - gemm_f16_f32_cuda( - (const half*)src0_dd, (const half*)src1_dd, - dst_dd, m, n, k, N, main_stream); -} - inline void ggml_cuda_op_diag_mask_inf( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { @@ -7125,9 +7085,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 } } - if(src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul_mat_gemm_f16); - } else if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { + if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { ggml_cuda_mul_mat_vec_p021(src0, src1, dst); } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) { ggml_cuda_mul_mat_vec_nc(src0, src1, dst); diff --git a/src/ggml.c b/src/ggml.c index 4cd3b468d..d0d00694e 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -6422,11 +6422,8 @@ struct ggml_tensor * ggml_mul_mat( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { - // hack to admit GEMM custom operator - bool mult_mat_conv1d = (a->ne[0] * a->ne[1]) == b->ne[0]; - bool mult_mat_conv2d = (a->ne[0] * a->ne[1] * a->ne[2]) == b->ne[0]; - GGML_ASSERT(ggml_can_mul_mat(a, b) || mult_mat_conv1d || mult_mat_conv2d); + GGML_ASSERT(ggml_can_mul_mat(a, b)); GGML_ASSERT(!ggml_is_transposed(a)); bool is_node = false; @@ -6435,11 +6432,7 @@ struct ggml_tensor * ggml_mul_mat( is_node = true; } - const int64_t ne[4] = { - mult_mat_conv2d || mult_mat_conv1d ? b->ne[1] : a->ne[1], - mult_mat_conv1d ? a->ne[2] : b->ne[mult_mat_conv2d ? 2 : 1], - mult_mat_conv2d ? a->ne[3] : b->ne[2], - mult_mat_conv1d ? 
1 : b->ne[3] }; + const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne); result->op = GGML_OP_MUL_MAT; @@ -7496,7 +7489,12 @@ GGML_API struct ggml_tensor * ggml_conv_1d( int p0, int d0) { struct ggml_tensor * result = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OH, OW, IC * KH * KW] - result = ggml_mul_mat(ctx, a, result); // [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] + result = ggml_reshape_3d(ctx, + ggml_cont(ctx, ggml_transpose(ctx, + ggml_mul_mat(ctx, + ggml_cont(ctx, ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2])), + ggml_cont(ctx, ggml_reshape_2d(ctx, result, result->ne[0], (result->ne[2] * result->ne[1])))))), + result->ne[1], a->ne[2], result->ne[2]); return result; } @@ -7655,7 +7653,12 @@ struct ggml_tensor * ggml_conv_2d( int d1) { struct ggml_tensor * result = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW] - result = ggml_mul_mat(ctx, a, result); // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] + result = ggml_reshape_4d(ctx, + ggml_cont(ctx, ggml_transpose(ctx, + ggml_mul_mat(ctx, + ggml_cont(ctx, ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3])), + ggml_cont(ctx, ggml_reshape_2d(ctx, result, result->ne[0], (result->ne[3] * result->ne[2] * result->ne[1])))))), + result->ne[1], result->ne[2], a->ne[3], result->ne[3]); // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] return result; } @@ -11695,8 +11698,9 @@ static bool ggml_compute_forward_mul_mat_use_blas( } #endif -static void ggml_compute_forward_mul_mat_x( - const struct ggml_compute_params * params, + +static void ggml_compute_forward_mul_mat( + const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { @@ -11906,110 +11910,6 @@ static void ggml_compute_forward_mul_mat_x( } } -// GEMM -// TODO: compare gemm op with the current implementation of mul_mat - -static void ggml_compute_forward_mul_mat_f16_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - GGML_TENSOR_BINARY_OP_LOCALS - - if (params->type == GGML_TASK_INIT) { - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb10 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb0 == sizeof(float)); - - bool case_conv_2d = (ne00 * ne01 * ne02) == ne10; - - const int ith = params->ith; - const int nth = params->nth; - - int64_t m = case_conv_2d ? ne03 : ne02; - int64_t n = (case_conv_2d ? ne12 : 1) * ne11; - int64_t k = (case_conv_2d ? ne02 : 1) * ne01 * ne00; - int64_t N = case_conv_2d ? 
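// For reference, what the im2col tensor that replaces these custom GEMM paths
// contains (a self-contained sketch, not ggml code): each KHxKW window of the
// padded, dilated input is unrolled into one row of length IC*KH*KW, so the
// whole convolution collapses into a single matrix product with the kernel.
static void im2col_reference(const float * src, float * dst,
                             int IC, int IH, int IW, int OH, int OW,
                             int KH, int KW, int s0, int s1,
                             int p0, int p1, int d0, int d1) {
    for (int oh = 0; oh < OH; ++oh) {
        for (int ow = 0; ow < OW; ++ow) {
            float * row = dst + (oh*OW + ow)*(IC*KH*KW);
            for (int ic = 0; ic < IC; ++ic) {
                for (int kh = 0; kh < KH; ++kh) {
                    for (int kw = 0; kw < KW; ++kw) {
                        const int ih = oh*s1 + kh*d1 - p1;
                        const int iw = ow*s0 + kw*d0 - p0;
                        const bool oob = ih < 0 || ih >= IH || iw < 0 || iw >= IW;
                        // padding contributes zero to every dot product
                        row[ic*KH*KW + kh*KW + kw] = oob ? 0.0f : src[ic*IH*IW + ih*IW + iw];
                    }
                }
            }
        }
    }
}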
ne13 : ne12; - - // GEMM - for (int i = 0; i < N; i++) { - ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k] - ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k] - float * C = (float *)dst->data + i * m * n; // [m, n] - - // does not seem to make a difference - int64_t m0, m1, n0, n1; - // patches per thread - if (m > n) { - n0 = 0; - n1 = n; - - // total patches in dst - const int np = m; - - // patches per thread - const int dp = (np + nth - 1)/nth; - - // patch range for this thread - m0 = dp*ith; - m1 = MIN(m0 + dp, np); - } else { - m0 = 0; - m1 = m; - - // total patches in dst - const int np = n; - - // patches per thread - const int dp = (np + nth - 1)/nth; - - // patch range for this thread - n0 = dp*ith; - n1 = MIN(n0 + dp, np); - } - - // block-tiling attempt - int64_t blck_n = 16; - int64_t blck_m = 16; - - for (int j = n0; j < n1; j+=blck_n) { - for (int i = m0; i < m1; i+=blck_m) { - for (int ii = i; ii < i + blck_m && ii < m1; ii++) { - for (int jj = j; jj < j + blck_n && jj < n1; jj++) { - ggml_vec_dot_f16(k, - C + ii*n + jj, - A + ii * k, - B + jj * k); - } - } - } - } - } -} - - -static void ggml_compute_forward_mul_mat( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - if(src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { - ggml_compute_forward_mul_mat_f16_f32(params, src0, src1, dst); - } else { - ggml_compute_forward_mul_mat_x(params, src0, src1, dst); - } -} - // ggml_compute_forward_out_prod static void ggml_compute_forward_out_prod_f32( diff --git a/tests/test-conv1d.cpp b/tests/test-conv1d.cpp index 85fddbd3b..e9c2ae66b 100644 --- a/tests/test-conv1d.cpp +++ b/tests/test-conv1d.cpp @@ -154,7 +154,12 @@ struct ggml_cgraph * build_graph(const test_model& model, struct ggml_allocr * a // split conv1d in fundamental methods for test unit struct ggml_tensor* im2col_res = ggml_im2col(ctx0, model.a, model.b, s0, 0, p0, 0, d0, 0, false); ggml_set_name(im2col_res, "im2col_res"); - struct ggml_tensor* conv1d_res = ggml_mul_mat(ctx0, model.a, im2col_res); + struct ggml_tensor* conv1d_res = ggml_reshape_3d(ctx0, + ggml_cont(ctx0, ggml_transpose(ctx0, + ggml_mul_mat(ctx0, + ggml_cont(ctx0, ggml_reshape_2d(ctx0, model.a, (model.a->ne[0] * model.a->ne[1]), model.a->ne[2])), + ggml_cont(ctx0, ggml_reshape_2d(ctx0, im2col_res, im2col_res->ne[0], (im2col_res->ne[2] * im2col_res->ne[1])))))), + im2col_res->ne[1], model.a->ne[2], im2col_res->ne[2]); ggml_set_name(conv1d_res, "conv1d_res"); ggml_build_forward_expand(gf, conv1d_res); diff --git a/tests/test-conv2d.cpp b/tests/test-conv2d.cpp index 9cc4d3831..61f04c47b 100644 --- a/tests/test-conv2d.cpp +++ b/tests/test-conv2d.cpp @@ -157,10 +157,12 @@ struct ggml_cgraph * build_graph(const test_model& model, struct ggml_allocr * a // split conv2d in fundamental methods for test unit struct ggml_tensor* im2col_res = ggml_im2col(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1, true); ggml_set_name(im2col_res, "im2col_res"); - printf("MatrixA [%i, %i, %i, %i]\n", model.a->ne[0], model.a->ne[1], model.a->ne[2], model.a->ne[3]); - printf("MatrixB [%i, %i, %i, %i]\n", im2col_res->ne[0], im2col_res->ne[1], im2col_res->ne[2], im2col_res->ne[3]); - struct ggml_tensor* conv2d_res = ggml_mul_mat(ctx0, model.a, im2col_res); - printf("MatrixR [%i, %i, %i, %i]\n", conv2d_res->ne[0], conv2d_res->ne[1], conv2d_res->ne[2], conv2d_res->ne[3]); + struct ggml_tensor* conv2d_res = ggml_reshape_4d(ctx0, 
+ ggml_cont(ctx0, ggml_transpose(ctx0, + ggml_mul_mat(ctx0, + ggml_cont(ctx0, ggml_reshape_2d(ctx0, model.a, (model.a->ne[0] * model.a->ne[1] * model.a->ne[2]), model.a->ne[3])), + ggml_cont(ctx0, ggml_reshape_2d(ctx0, im2col_res, im2col_res->ne[0], (im2col_res->ne[3] * im2col_res->ne[2] * im2col_res->ne[1])))))), + im2col_res->ne[1], im2col_res->ne[2], model.a->ne[3], im2col_res->ne[3]); ggml_set_name(conv2d_res, "conv2d_res"); ggml_build_forward_expand(gf, conv2d_res); diff --git a/tests/test-mul-mat.cpp b/tests/test-mul-mat.cpp index 671f4fb21..99bb0a117 100644 --- a/tests/test-mul-mat.cpp +++ b/tests/test-mul-mat.cpp @@ -3,8 +3,6 @@ #include "ggml/ggml-backend.h" //#define GGML_USE_CUBLAS uncomment this to use cuda backend, make sure build ggml lib with GGML_CUBLAS=ON -// test a conv2d expected matrix -#define LONG_MATRIX #ifdef GGML_USE_CUBLAS #include "ggml-cuda.h" @@ -188,7 +186,6 @@ static void gemm_f16_out_f32(int m, int n, int k, float * B, float * C, const int ith, const int nth) { - printf("M: %i, N: %i, K: %i\n", m, n, k); // does not seem to make a difference int m0, m1, n0, n1; // patches per thread @@ -268,29 +265,18 @@ void perform_gemm_test(float* a, float* b, float* expected, int M, int N, int K) int main(void) { ggml_time_init(); -#ifdef LONG_MATRIX const int M = 4, N = 16, K = 36; // a conv2d expected matrix multiplication -#else - const int M = 3, N = 2, K = 3; // a normal matrix multiplication -#endif // matrix A (4 X 36) float matrixA[M * K] = { -#ifdef LONG_MATRIX 2.0f, 9.0f, 2.0f, 10.0f, 6.0f, 4.0f, 3.0f, 6.0f, 3.0f, 6.0f, 9.0f, 7.0f, 8.0f, 8.0f, 3.0f, 3.0f, 10.0f, 5.0f, 2.0f, 10.0f, 7.0f, 10.0f, 9.0f, 3.0f, 6.0f, 6.0f, 5.0f, 10.0f, 2.0f, 3.0f, 6.0f, 1.0f, 9.0f, 4.0f, 10.0f, 4.0f, 10.0f, 7.0f, 8.0f, 10.0f, 10.0f, 8.0f, 7.0f, 10.0f, 4.0f, 6.0f, 8.0f, 7.0f, 7.0f, 6.0f, 9.0f, 3.0f, 6.0f, 5.0f, 5.0f, 2.0f, 7.0f, 2.0f, 7.0f, 4.0f, 4.0f, 6.0f, 6.0f, 4.0f, 3.0f, 9.0f, 3.0f, 6.0f, 4.0f, 7.0f, 2.0f, 9.0f, 7.0f, 3.0f, 2.0f, 5.0f, 7.0f, 3.0f, 10.0f, 2.0f, 6.0f, 1.0f, 4.0f, 7.0f, 5.0f, 10.0f, 3.0f, 10.0f, 4.0f, 5.0f, 5.0f, 1.0f, 6.0f, 10.0f, 7.0f, 4.0f, 5.0f, 3.0f, 9.0f, 9.0f, 8.0f, 6.0f, 9.0f, 2.0f, 3.0f, 6.0f, 8.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 3.0f, 10.0f, 4.0f, 1.0f, 8.0f, 8.0f, 9.0f, 8.0f, 4.0f, 1.0f, 4.0f, 9.0f, 3.0f, 6.0f, 3.0f, 1.0f, 4.0f, 8.0f, 3.0f, 10.0f, 8.0f, 6.0f, 4.0f, 5.0f, 4.0f, 3.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, -#else - 2, 8, - 5, 1, - 4, 2 -#endif }; // matrix B (16 X 36) float matrixB[N * K] = { -#ifdef LONG_MATRIX 9.0f, 7.0f, 1.0f, 3.0f, 5.0f, 9.0f, 7.0f, 6.0f, 1.0f, 10.0f, 1.0f, 1.0f, 7.0f, 2.0f, 4.0f, 9.0f, 10.0f, 4.0f, 5.0f, 5.0f, 7.0f, 1.0f, 7.0f, 7.0f, 2.0f, 9.0f, 5.0f, 10.0f, 7.0f, 4.0f, 8.0f, 9.0f, 9.0f, 3.0f, 10.0f, 2.0f, 4.0f, 6.0f, 10.0f, 9.0f, 5.0f, 1.0f, 8.0f, 7.0f, 4.0f, 7.0f, 2.0f, 6.0f, 5.0f, 3.0f, 1.0f, 10.0f, 8.0f, 4.0f, 8.0f, 3.0f, 7.0f, 1.0f, 2.0f, 7.0f, 6.0f, 8.0f, 6.0f, 5.0f, 2.0f, 3.0f, 1.0f, 1.0f, 2.0f, 5.0f, 7.0f, 1.0f, 8.0f, 2.0f, 8.0f, 8.0f, 8.0f, 8.0f, 4.0f, 4.0f, 6.0f, 10.0f, 10.0f, 9.0f, 2.0f, 9.0f, 3.0f, 7.0f, 7.0f, 1.0f, 4.0f, 9.0f, 1.0f, 2.0f, 3.0f, 6.0f, 1.0f, 10.0f, 5.0f, 8.0f, 9.0f, 4.0f, 6.0f, 2.0f, 3.0f, 1.0f, 2.0f, 7.0f, @@ -307,24 +293,14 @@ int main(void) 8.0f, 5.0f, 8.0f, 9.0f, 4.0f, 8.0f, 2.0f, 1.0f, 1.0f, 9.0f, 4.0f, 5.0f, 6.0f, 1.0f, 2.0f, 5.0f, 6.0f, 7.0f, 3.0f, 1.0f, 4.0f, 6.0f, 7.0f, 7.0f, 7.0f, 8.0f, 7.0f, 8.0f, 8.0f, 2.0f, 10.0f, 2.0f, 7.0f, 3.0f, 8.0f, 3.0f, 8.0f, 7.0f, 6.0f, 2.0f, 4.0f, 10.0f, 10.0f, 6.0f, 10.0f, 3.0f, 7.0f, 6.0f, 4.0f, 3.0f, 5.0f, 5.0f, 5.0f, 3.0f, 8.0f, 10.0f, 3.0f, 4.0f, 8.0f, 
4.0f, 2.0f, 6.0f, 8.0f, 9.0f, 6.0f, 9.0f, 4.0f, 3.0f, 5.0f, 2.0f, 2.0f, 6.0f, 10.0f, 6.0f, 2.0f, 1.0f, 7.0f, 5.0f, 6.0f, 4.0f, 1.0f, 9.0f, 10.0f, 2.0f, 4.0f, 5.0f, 8.0f, 5.0f, 7.0f, 4.0f, 7.0f, 6.0f, 3.0f, 9.0f, 2.0f, 1.0f, 4.0f, 2.0f, 6.0f, 6.0f, 3.0f, 3.0f, 2.0f, 8.0f, 5.0f, 9.0f, 3.0f, 4.0f, -#else - 10, 9, 5, - 5, 9, 4 -#endif }; // matrix C (4 x 16) float expected_result[M * N] = { -#ifdef LONG_MATRIX 1224.0f, 1023.0f, 1158.0f,1259.0f,1359.0f,1194.0f,1535.0f,1247.0f,1185.0f,1029.0f,889.0f,1182.0f,955.0f,1179.0f,1147.0f,1048.0f, 1216.0f, 1087.0f, 1239.0f,1361.0f,1392.0f,1260.0f,1247.0f,1563.0f,1167.0f,1052.0f,942.0f,1214.0f,1045.0f,1134.0f,1264.0f,1126.0f, 1125.0f, 966.0f, 1079.0f,1333.0f,1287.0f,1101.0f,1185.0f,1167.0f,1368.0f,990.0f,967.0f,1121.0f,971.0f,1086.0f,1130.0f,980.0f, 999.0f, 902.0f, 1020.0f,1056.0f,1076.0f,929.0f,1029.0f,1052.0f,990.0f,1108.0f,823.0f,989.0f,759.0f,1041.0f,1003.0f,870.0f -#else - 60.0f, 90.0f, 42.0f, - 55.0f, 54.0f, 29.0f, - 50.0f, 54.0f, 28.0f -#endif }; bool passed = true; From fde8828b202f1e7fc2e67fdfe9f1322c85bffba4 Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Sat, 14 Oct 2023 18:02:44 -0400 Subject: [PATCH 14/26] ggml_mul_mat cpu backend support fp16 src1 --- src/ggml.c | 2 +- tests/test-conv1d.cpp | 5 ++++- tests/test-conv2d.cpp | 12 +++++++++--- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/src/ggml.c b/src/ggml.c index d0d00694e..0e7ac745c 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -11727,7 +11727,7 @@ static void ggml_compute_forward_mul_mat( // we don't support permuted src0 or src1 GGML_ASSERT(nb00 == ggml_type_size(type)); - GGML_ASSERT(nb10 == sizeof(float)); + GGML_ASSERT(nb10 == ggml_type_size(src1->type)); // dst cannot be transposed or permuted GGML_ASSERT(nb0 == sizeof(float)); diff --git a/tests/test-conv1d.cpp b/tests/test-conv1d.cpp index e9c2ae66b..84a2c8d5b 100644 --- a/tests/test-conv1d.cpp +++ b/tests/test-conv1d.cpp @@ -152,8 +152,11 @@ struct ggml_cgraph * build_graph(const test_model& model, struct ggml_allocr * a int d0 = 1; // split conv1d in fundamental methods for test unit + struct ggml_tensor* im2col_0 = ggml_im2col(ctx0, model.a, model.b, s0, 0, p0, 0, d0, 0, false); + ggml_set_name(im2col_0, "im2col_res"); + ggml_build_forward_expand(gf, im2col_0); + struct ggml_tensor* im2col_res = ggml_im2col(ctx0, model.a, model.b, s0, 0, p0, 0, d0, 0, false); - ggml_set_name(im2col_res, "im2col_res"); struct ggml_tensor* conv1d_res = ggml_reshape_3d(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, diff --git a/tests/test-conv2d.cpp b/tests/test-conv2d.cpp index 61f04c47b..10f4f2d92 100644 --- a/tests/test-conv2d.cpp +++ b/tests/test-conv2d.cpp @@ -155,13 +155,17 @@ struct ggml_cgraph * build_graph(const test_model& model, struct ggml_allocr * a int d1 = 1; // split conv2d in fundamental methods for test unit + struct ggml_tensor* im2col_0 = ggml_im2col(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1, true); + ggml_set_name(im2col_0, "im2col_res"); + ggml_build_forward_expand(gf, im2col_0); + + // recalculate for avoid fragmentation struct ggml_tensor* im2col_res = ggml_im2col(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1, true); - ggml_set_name(im2col_res, "im2col_res"); struct ggml_tensor* conv2d_res = ggml_reshape_4d(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, - ggml_cont(ctx0, ggml_reshape_2d(ctx0, model.a, (model.a->ne[0] * model.a->ne[1] * model.a->ne[2]), model.a->ne[3])), - ggml_cont(ctx0, ggml_reshape_2d(ctx0, im2col_res, im2col_res->ne[0], (im2col_res->ne[3] * im2col_res->ne[2] 
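// The relaxed stride assert in this patch -- nb10 == ggml_type_size(src1->type)
// instead of nb10 == sizeof(float) -- is what lets the F16 im2col output flow
// straight into ggml_mul_mat on the CPU; the old check hard-coded F32 rows.
// A sketch of the generalized row-contiguity test (ggml_type_size() is the
// real API; the helper name is illustrative):
static bool rows_are_contiguous(const struct ggml_tensor * t) {
    return t->nb[0] == ggml_type_size(t->type); // innermost stride == one element of t's type
}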
* im2col_res->ne[1])))))), + ggml_reshape_2d(ctx0, model.a, (model.a->ne[0] * model.a->ne[1] * model.a->ne[2]), model.a->ne[3]), + ggml_reshape_2d(ctx0, im2col_res, im2col_res->ne[0], (im2col_res->ne[3] * im2col_res->ne[2] * im2col_res->ne[1]))))), im2col_res->ne[1], im2col_res->ne[2], model.a->ne[3], im2col_res->ne[3]); ggml_set_name(conv2d_res, "conv2d_res"); ggml_build_forward_expand(gf, conv2d_res); @@ -238,6 +242,7 @@ int main(void) } ggml_fp16_t* im2col_data = new ggml_fp16_t[ggml_nelements(im2col_res)]; + printf("Res [%i, %i, %i, %i]\n", im2col_res->ne[0], im2col_res->ne[1], im2col_res->ne[2], im2col_res->ne[3]); float* conv2d_data = new float[ggml_nelements(conv2d_res)]; ggml_backend_tensor_get(im2col_res, im2col_data, 0, ggml_nbytes(im2col_res)); @@ -393,6 +398,7 @@ int main(void) } printf("ggml_conv2d (%i): %s\n", ggml_nelements(conv2d_res), passed && (ggml_nelements(conv2d_res) == n_conv2d_test) ? "\033[32mPASSED\033[0m" : "\033[31mFAILED\033[0m"); + ggml_free(model.ctx); ggml_backend_buffer_free(model.buffer); From 5377678ae875a886566d320fd81d214e8c6657a6 Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Sat, 14 Oct 2023 19:58:40 -0400 Subject: [PATCH 15/26] ggml_mul_mat cuda backend fp16 fixed --- src/ggml-cuda.cu | 22 +++++++++++++++++++++- tests/test-conv2d.cpp | 1 - tests/test-mul-mat.cpp | 2 +- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 1a72047ae..12c56c6f9 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -4379,6 +4379,13 @@ static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) { *dsti = __float2half(*xi); } +static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) { + const half * xi = (const half *) cxi; + half * dsti = (half *) cdsti; + + *dsti = *xi; +} + template static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, @@ -5481,6 +5488,16 @@ static void ggml_cpy_f32_f16_cuda( (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12); } +static void ggml_cpy_f16_f16_cuda( + const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, + const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) { + + const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; + cpy_f32_f16<<>> + (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12); +} + static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE; scale_f32<<>>(x, dst, scale, k); @@ -6301,7 +6318,7 @@ inline void ggml_cuda_op_mul_mat_cublas( src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as); to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream); } - const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16; + const half * src1_ptr = src1->type == GGML_TYPE_F16 ? 
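// The small-looking pointer fix here matters because of ggml-cuda's buffer
// naming: *_ddq_i holds device data in its quantized form and *_ddf_i the
// float view handed to the op. An F16 src1 (such as an im2col result) is
// staged in the float-view buffer, so reading it through src1_ddq_i
// dereferenced the wrong allocation. A sketch of the intended selection
// (assuming the surrounding names; the helper itself is illustrative):
static const half * pick_src1_f16(ggml_type type, const float * src1_ddf_i, const half * src1_as_f16) {
    return type == GGML_TYPE_F16
        ? (const half *) src1_ddf_i // F16 src1 already lives here, half-typed
        : src1_as_f16;              // F32 src1 was converted into a pool buffer
}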
(const half *) src1_ddf_i : src1_as_f16; size_t dst_as = 0; half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as); @@ -7162,6 +7179,9 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) { + ggml_cpy_f16_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, + ne10, ne11, nb10, nb11, nb12, main_stream); } else { fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), ggml_type_name(src1->type)); diff --git a/tests/test-conv2d.cpp b/tests/test-conv2d.cpp index 10f4f2d92..46b3eb77f 100644 --- a/tests/test-conv2d.cpp +++ b/tests/test-conv2d.cpp @@ -242,7 +242,6 @@ int main(void) } ggml_fp16_t* im2col_data = new ggml_fp16_t[ggml_nelements(im2col_res)]; - printf("Res [%i, %i, %i, %i]\n", im2col_res->ne[0], im2col_res->ne[1], im2col_res->ne[2], im2col_res->ne[3]); float* conv2d_data = new float[ggml_nelements(conv2d_res)]; ggml_backend_tensor_get(im2col_res, im2col_data, 0, ggml_nbytes(im2col_res)); diff --git a/tests/test-mul-mat.cpp b/tests/test-mul-mat.cpp index 99bb0a117..fda7bef37 100644 --- a/tests/test-mul-mat.cpp +++ b/tests/test-mul-mat.cpp @@ -2,7 +2,7 @@ #include "ggml/ggml-alloc.h" #include "ggml/ggml-backend.h" -//#define GGML_USE_CUBLAS uncomment this to use cuda backend, make sure build ggml lib with GGML_CUBLAS=ON +//#define GGML_USE_CUBLAS // uncomment this to use cuda backend, make sure build ggml lib with GGML_CUBLAS=ON #ifdef GGML_USE_CUBLAS #include "ggml-cuda.h" From 6b42245178c6b735590b7738d69bee54d5fce80d Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Sun, 15 Oct 2023 15:51:17 -0400 Subject: [PATCH 16/26] remove unnecessary ggml_cont and removed conv1d-2d functions deprecated --- include/ggml/ggml.h | 2 - src/ggml.c | 475 +----------------------------------------- tests/test-conv1d.cpp | 10 +- tests/test-conv2d.cpp | 10 +- 4 files changed, 13 insertions(+), 484 deletions(-) diff --git a/include/ggml/ggml.h b/include/ggml/ggml.h index 5f0864335..93a036edf 100644 --- a/include/ggml/ggml.h +++ b/include/ggml/ggml.h @@ -399,9 +399,7 @@ extern "C" { GGML_OP_ROPE_BACK, GGML_OP_ALIBI, GGML_OP_CLAMP, - GGML_OP_CONV_1D, GGML_OP_CONV_TRANSPOSE_1D, - GGML_OP_CONV_2D, GGML_OP_IM2COL, GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_POOL_1D, diff --git a/src/ggml.c b/src/ggml.c index 0e7ac745c..e3d64cffb 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -4054,9 +4054,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "ROPE_BACK", "ALIBI", "CLAMP", - "CONV_1D", "CONV_TRANSPOSE_1D", - "CONV_2D", "IM2COL", "CONV_TRANSPOSE_2D", "POOL_1D", @@ -4088,7 +4086,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70"); +static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -4138,9 +4136,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "rope_back(x)", "alibi(x)", "clamp(x)", - "conv_1d(x)", "conv_transpose_1d(x)", - "conv_2d(x)", "im2col(x)", "conv_transpose_2d(x)", "pool_1d(x)", @@ -4172,7 +4168,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70"); 
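// The count bump below is ggml's guard for keeping its parallel tables in
// sync with the op enum: removing CONV_1D and CONV_2D shrinks GGML_OP_NAME
// and GGML_OP_SYMBOL by two entries each, and the hard-coded count turns any
// mismatch into a compile-time error instead of an out-of-bounds lookup.
// Minimal illustration of the idiom (names are illustrative):
enum demo_op { DEMO_OP_NONE, DEMO_OP_IM2COL, DEMO_OP_COUNT };
static const char * DEMO_OP_NAME[DEMO_OP_COUNT] = { "none", "im2col" };
static_assert(sizeof(DEMO_OP_NAME)/sizeof(DEMO_OP_NAME[0]) == DEMO_OP_COUNT,
              "DEMO_OP_NAME out of sync with demo_op");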
+static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -4200,9 +4196,7 @@ static void ggml_setup_op_has_task_pass(void) { p[GGML_OP_GET_ROWS_BACK ] = true; p[GGML_OP_DIAG_MASK_INF ] = true; p[GGML_OP_DIAG_MASK_ZERO ] = true; - p[GGML_OP_CONV_1D ] = true; p[GGML_OP_CONV_TRANSPOSE_1D ] = true; - p[GGML_OP_CONV_2D ] = true; p[GGML_OP_IM2COL ] = true; p[GGML_OP_CONV_TRANSPOSE_2D ] = true; p[GGML_OP_FLASH_ATTN_BACK ] = true; @@ -7489,48 +7483,14 @@ GGML_API struct ggml_tensor * ggml_conv_1d( int p0, int d0) { struct ggml_tensor * result = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OH, OW, IC * KH * KW] - result = ggml_reshape_3d(ctx, - ggml_cont(ctx, ggml_transpose(ctx, + result = ggml_reshape_3d(ctx, ggml_cont(ctx, ggml_transpose(ctx, ggml_mul_mat(ctx, - ggml_cont(ctx, ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2])), - ggml_cont(ctx, ggml_reshape_2d(ctx, result, result->ne[0], (result->ne[2] * result->ne[1])))))), + ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2]), + ggml_reshape_2d(ctx, result, result->ne[0], (result->ne[2] * result->ne[1]))))), result->ne[1], a->ne[2], result->ne[2]); return result; } -// GGML_API struct ggml_tensor * ggml_conv_1d( -// struct ggml_context * ctx, -// struct ggml_tensor * a, -// struct ggml_tensor * b, -// int s0, -// int p0, -// int d0) { -// GGML_ASSERT(ggml_is_matrix(b)); -// GGML_ASSERT(a->ne[1] == b->ne[1]); -// bool is_node = false; - -// if (a->grad || b->grad) { -// GGML_ASSERT(false); // TODO: implement backward -// is_node = true; -// } - -// const int64_t ne[4] = { -// ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), -// a->ne[2], 1, 1, -// }; -// struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); - -// int32_t params[] = { s0, p0, d0 }; -// ggml_set_op_params(result, params, sizeof(params)); - -// result->op = GGML_OP_CONV_1D; -// result->grad = is_node ? 
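// Shape walk-through for the rewritten ggml_conv_1d above (a usage sketch;
// tensor contents are illustrative, the ne[] orders follow the comments in
// this hunk):
//   kernel a: ne = [K,    IC, OC]      input b: ne = [IL, IC, N]
//   im2col:   ne = [IC*K, OL, N]       result:  ne = [OL, OC, N]
// where OL follows the usual convolution output-size formula that ggml
// computes internally:
static int conv_output_size(int ins, int ks, int s, int p, int d) {
    // stride s, padding p, dilation d; d*(ks - 1) + 1 is the effective kernel width
    return (ins + 2*p - d*(ks - 1) - 1)/s + 1;
}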
ggml_dup_tensor(ctx, result) : NULL; -// result->src[0] = a; -// result->src[1] = b; - -// return result; -// } - // ggml_conv_1d_ph struct ggml_tensor* ggml_conv_1d_ph( @@ -7651,14 +7611,12 @@ struct ggml_tensor * ggml_conv_2d( int p1, int d0, int d1) { - struct ggml_tensor * result = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW] - result = ggml_reshape_4d(ctx, - ggml_cont(ctx, ggml_transpose(ctx, + result = ggml_reshape_4d(ctx, ggml_cont(ctx, ggml_transpose(ctx, ggml_mul_mat(ctx, - ggml_cont(ctx, ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3])), - ggml_cont(ctx, ggml_reshape_2d(ctx, result, result->ne[0], (result->ne[3] * result->ne[2] * result->ne[1])))))), - result->ne[1], result->ne[2], a->ne[3], result->ne[3]); // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] + ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3]), // [OC,IC, KH, KW] => [OC, IC * KH * KW] + ggml_reshape_2d(ctx, result, result->ne[0], result->ne[3] * result->ne[2] * result->ne[1])))), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW] + result->ne[1], result->ne[2], a->ne[3], result->ne[3]); // [N, OC, OH, OW] return result; } @@ -13748,190 +13706,6 @@ static void ggml_compute_forward_rope_back( } } -// ggml_compute_forward_conv_1d - -static void ggml_compute_forward_conv_1d_f16_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - GGML_TENSOR_BINARY_OP_LOCALS - - const int ith = params->ith; - const int nth = params->nth; - - const int nk = ne00; - - // size of the convolution row - the kernel size unrolled across all input channels - const int ew0 = nk*ne01; - - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; - const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; - - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb10 == sizeof(float)); - - if (params->type == GGML_TASK_INIT) { - memset(params->wdata, 0, params->wsize); - - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - - for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); - ggml_fp16_t * dst_data = wdata; - - for (int64_t i0 = 0; i0 < ne0; i0++) { - for (int64_t ik = 0; ik < nk; ik++) { - const int idx0 = i0*s0 + ik*d0 - p0; - - if(!(idx0 < 0 || idx0 >= ne10)) { - dst_data[i0*ew0 + i11*nk + ik] = GGML_FP32_TO_FP16(src[idx0]); - } - } - } - } - - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - // total rows in dst - const int nr = ne2; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - - for (int i2 = 0; i2 < ne2; i2++) { - for (int i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1); - - for (int i0 = 0; i0 < ne0; i0++) { - ggml_vec_dot_f16(ew0, dst_data + i0, - (ggml_fp16_t *) ((char *) src0->data + i1*nb02), - (ggml_fp16_t *) wdata + i2*nb2 + i0*ew0); - } - } - } -} - -static void ggml_compute_forward_conv_1d_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * 
src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - GGML_TENSOR_BINARY_OP_LOCALS - - const int ith = params->ith; - const int nth = params->nth; - - const int nk = ne00; - - const int ew0 = nk*ne01; - - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; - const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; - - GGML_ASSERT(nb00 == sizeof(float)); - GGML_ASSERT(nb10 == sizeof(float)); - - if (params->type == GGML_TASK_INIT) { - memset(params->wdata, 0, params->wsize); - - float * const wdata = (float *) params->wdata + 0; - - for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); - float * dst_data = wdata; - - for (int64_t i0 = 0; i0 < ne0; i0++) { - for (int64_t ik = 0; ik < nk; ik++) { - const int idx0 = i0*s0 + ik*d0 - p0; - - if(!(idx0 < 0 || idx0 >= ne10)) { - dst_data[i0*ew0 + i11*nk + ik] = src[idx0]; - } - } - } - } - - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - // total rows in dst - const int nr = ne02; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - float * const wdata = (float *) params->wdata + 0; - - for (int i2 = 0; i2 < ne2; i2++) { - for (int i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1); - - for (int i0 = 0; i0 < ne0; i0++) { - ggml_vec_dot_f32(ew0, dst_data + i0, - (float *) ((char *) src0->data + i1*nb02), - (float *) wdata + i2*nb2 + i0*ew0); - } - } - } -} - -static void ggml_compute_forward_conv_1d( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - switch(src0->type) { - case GGML_TYPE_F16: - { - ggml_compute_forward_conv_1d_f16_f32(params, src0, src1, dst); - } break; - case GGML_TYPE_F32: - { - ggml_compute_forward_conv_1d_f32(params, src0, src1, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - // ggml_compute_forward_conv_transpose_1d static void ggml_compute_forward_conv_transpose_1d_f16_f32( @@ -14220,145 +13994,6 @@ static void ggml_compute_forward_im2col_f16( } } -static void ggml_compute_forward_conv_2d_f16_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - GGML_TENSOR_BINARY_OP_LOCALS - - // src1: image [N, IC, IH, IW] - // src0: kernel [OC, IC, KH, KW] - // dst: result [N, OC, OH, OW] - // ne12: IC - // ne0: OW - // ne1: OH - // nk0: KW - // nk1: KH - // ne13: N - - const int N = ne13; - const int IC = ne12; - const int IH = ne11; - const int IW = ne10; - - const int OC = ne03; - // const int IC = ne02; - const int KH = ne01; - const int KW = ne00; - - const int OH = ne1; - const int OW = ne0; - - const int ith = params->ith; - const int nth = params->nth; - - // const int nk0 = ne00; - // const int nk1 = ne01; - - // size of the convolution row - the kernel size unrolled across all channels - // const int ew0 = nk0*nk1*ne02; - // 
ew0: IC*KH*KW - - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; - const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; - const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; - const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; - const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; - - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb10 == sizeof(float)); - - if (params->type == GGML_TASK_INIT) { - memset(params->wdata, 0, params->wsize); - - // prepare source data (src1) - // im2col: [N, IC, IH, IW] => [N*OH*OW, IC*KH*KW] - - { - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - - for (int in = 0; in < N; in++) { - for (int iic = 0; iic < IC; iic++) { - for (int ioh = 0; ioh < OH; ioh++) { - for (int iow = 0; iow < OW; iow++) { - - // micro kernel - ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] - const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW] - - for (int ikh = 0; ikh < KH; ikh++) { - for (int ikw = 0; ikw < KW; ikw++) { - const int iiw = iow*s0 + ikw*d0 - p0; - const int iih = ioh*s1 + ikh*d1 - p1; - - if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { - dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); - } - } - } - } - } - } - } - } - - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - // wdata: [N*OH*OW, IC*KH*KW] - // dst: result [N, OC, OH, OW] - // src0: kernel [OC, IC, KH, KW] - - int64_t m = OC; - int64_t n = OH * OW; - int64_t k = IC * KH * KW; - - // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] - for (int i = 0; i < N; i++) { - ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k] - ggml_fp16_t * B = (ggml_fp16_t *)wdata + i * m * k; // [n, k] - float * C = (float *)dst->data + i * m * n; // [m * k] - - //gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); - } -} - -static void ggml_compute_forward_conv_2d( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F16: - { - ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, dst); - } break; - case GGML_TYPE_F32: - { - //ggml_compute_forward_conv_2d_f32(params, src0, src1, dst); - GGML_ASSERT(false); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - static void ggml_compute_forward_im2col( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -16588,18 +16223,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_clamp(params, tensor->src[0], tensor); } break; - case GGML_OP_CONV_1D: - { - ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor); - } break; case GGML_OP_CONV_TRANSPOSE_1D: { ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor); } break; - case GGML_OP_CONV_2D: - { - ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor); - } break; case GGML_OP_IM2COL: { ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor); @@ -17521,18 +17148,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_CONV_1D: - { - GGML_ASSERT(false); // TODO: not implemented - } 
break; case GGML_OP_CONV_TRANSPOSE_1D: { GGML_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_CONV_2D: - { - GGML_ASSERT(false); // TODO: not implemented - } break; case GGML_OP_IM2COL: { GGML_ASSERT(false); // TODO: not implemented @@ -18370,44 +17989,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { { n_tasks = 1; //TODO } break; - case GGML_OP_CONV_1D: - { - n_tasks = n_threads; - - GGML_ASSERT(node->src[0]->ne[3] == 1); - GGML_ASSERT(node->src[1]->ne[2] == 1); - GGML_ASSERT(node->src[1]->ne[3] == 1); - - const int64_t ne00 = node->src[0]->ne[0]; - const int64_t ne01 = node->src[0]->ne[1]; - const int64_t ne02 = node->src[0]->ne[2]; - - const int64_t ne10 = node->src[1]->ne[0]; - const int64_t ne11 = node->src[1]->ne[1]; - - const int64_t ne0 = node->ne[0]; - const int64_t ne1 = node->ne[1]; - const int64_t nk = ne00; - const int64_t ew0 = nk * ne01; - - UNUSED(ne02); - UNUSED(ne10); - UNUSED(ne11); - - size_t cur = 0; - - if (node->src[0]->type == GGML_TYPE_F16 && - node->src[1]->type == GGML_TYPE_F32) { - cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0); - } else if (node->src[0]->type == GGML_TYPE_F32 && - node->src[1]->type == GGML_TYPE_F32) { - cur = sizeof(float)*(ne0*ne1*ew0); - } else { - GGML_ASSERT(false); - } - - work_size = MAX(work_size, cur); - } break; case GGML_OP_CONV_TRANSPOSE_1D: { n_tasks = n_threads; @@ -18436,44 +18017,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { GGML_ASSERT(false); } - work_size = MAX(work_size, cur); - } break; - case GGML_OP_CONV_2D: - { - n_tasks = n_threads; - - const int64_t ne00 = node->src[0]->ne[0]; // W - const int64_t ne01 = node->src[0]->ne[1]; // H - const int64_t ne02 = node->src[0]->ne[2]; // C - const int64_t ne03 = node->src[0]->ne[3]; // N - - const int64_t ne10 = node->src[1]->ne[0]; // W - const int64_t ne11 = node->src[1]->ne[1]; // H - const int64_t ne12 = node->src[1]->ne[2]; // C - - const int64_t ne0 = node->ne[0]; - const int64_t ne1 = node->ne[1]; - const int64_t ne2 = node->ne[2]; - const int64_t ne3 = node->ne[3]; - const int64_t nk = ne00*ne01; - const int64_t ew0 = nk * ne02; - - UNUSED(ne03); - UNUSED(ne2); - - size_t cur = 0; - - if (node->src[0]->type == GGML_TYPE_F16 && - node->src[1]->type == GGML_TYPE_F32) { - // im2col: [N*OH*OW, IC*KH*KW] - cur = sizeof(ggml_fp16_t)*(ne3*ne0*ne1*ew0); - } else if (node->src[0]->type == GGML_TYPE_F32 && - node->src[1]->type == GGML_TYPE_F32) { - cur = sizeof(float)* (ne10*ne11*ne12); - } else { - GGML_ASSERT(false); - } - work_size = MAX(work_size, cur); } break; case GGML_OP_IM2COL: diff --git a/tests/test-conv1d.cpp b/tests/test-conv1d.cpp index 84a2c8d5b..07a04509a 100644 --- a/tests/test-conv1d.cpp +++ b/tests/test-conv1d.cpp @@ -2,7 +2,7 @@ #include "ggml/ggml-alloc.h" #include "ggml/ggml-backend.h" -//#define GGML_USE_CUBLAS +#define GGML_USE_CUBLAS #ifdef GGML_USE_CUBLAS #include "ggml-cuda.h" @@ -156,13 +156,7 @@ struct ggml_cgraph * build_graph(const test_model& model, struct ggml_allocr * a ggml_set_name(im2col_0, "im2col_res"); ggml_build_forward_expand(gf, im2col_0); - struct ggml_tensor* im2col_res = ggml_im2col(ctx0, model.a, model.b, s0, 0, p0, 0, d0, 0, false); - struct ggml_tensor* conv1d_res = ggml_reshape_3d(ctx0, - ggml_cont(ctx0, ggml_transpose(ctx0, - ggml_mul_mat(ctx0, - ggml_cont(ctx0, ggml_reshape_2d(ctx0, model.a, (model.a->ne[0] * model.a->ne[1]), model.a->ne[2])), - ggml_cont(ctx0, ggml_reshape_2d(ctx0, im2col_res, im2col_res->ne[0], (im2col_res->ne[2] * 
im2col_res->ne[1])))))), - im2col_res->ne[1], model.a->ne[2], im2col_res->ne[2]); + struct ggml_tensor* conv1d_res = ggml_conv_1d(ctx0, model.a, model.b, s0, p0, d0); ggml_set_name(conv1d_res, "conv1d_res"); ggml_build_forward_expand(gf, conv1d_res); diff --git a/tests/test-conv2d.cpp b/tests/test-conv2d.cpp index 46b3eb77f..3ff4da4fc 100644 --- a/tests/test-conv2d.cpp +++ b/tests/test-conv2d.cpp @@ -2,7 +2,7 @@ #include "ggml/ggml-alloc.h" #include "ggml/ggml-backend.h" -//#define GGML_USE_CUBLAS +#define GGML_USE_CUBLAS #ifdef GGML_USE_CUBLAS #include "ggml-cuda.h" @@ -160,13 +160,7 @@ struct ggml_cgraph * build_graph(const test_model& model, struct ggml_allocr * a ggml_build_forward_expand(gf, im2col_0); // recalculate for avoid fragmentation - struct ggml_tensor* im2col_res = ggml_im2col(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1, true); - struct ggml_tensor* conv2d_res = ggml_reshape_4d(ctx0, - ggml_cont(ctx0, ggml_transpose(ctx0, - ggml_mul_mat(ctx0, - ggml_reshape_2d(ctx0, model.a, (model.a->ne[0] * model.a->ne[1] * model.a->ne[2]), model.a->ne[3]), - ggml_reshape_2d(ctx0, im2col_res, im2col_res->ne[0], (im2col_res->ne[3] * im2col_res->ne[2] * im2col_res->ne[1]))))), - im2col_res->ne[1], im2col_res->ne[2], model.a->ne[3], im2col_res->ne[3]); + struct ggml_tensor* conv2d_res = ggml_conv_2d(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1); ggml_set_name(conv2d_res, "conv2d_res"); ggml_build_forward_expand(gf, conv2d_res); From d7340400e9eb8a606f681112715b6aa8fb549b31 Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Sun, 15 Oct 2023 15:59:34 -0400 Subject: [PATCH 17/26] some fixes --- src/ggml.c | 3 +-- tests/test-conv1d.cpp | 2 +- tests/test-conv2d.cpp | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/ggml.c b/src/ggml.c index e3d64cffb..46c077291 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -6416,7 +6416,6 @@ struct ggml_tensor * ggml_mul_mat( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { - GGML_ASSERT(ggml_can_mul_mat(a, b)); GGML_ASSERT(!ggml_is_transposed(a)); @@ -11658,7 +11657,7 @@ static bool ggml_compute_forward_mul_mat_use_blas( static void ggml_compute_forward_mul_mat( - const struct ggml_compute_params * params, + const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { diff --git a/tests/test-conv1d.cpp b/tests/test-conv1d.cpp index 07a04509a..560ac0eb9 100644 --- a/tests/test-conv1d.cpp +++ b/tests/test-conv1d.cpp @@ -2,7 +2,7 @@ #include "ggml/ggml-alloc.h" #include "ggml/ggml-backend.h" -#define GGML_USE_CUBLAS +// #define GGML_USE_CUBLAS #ifdef GGML_USE_CUBLAS #include "ggml-cuda.h" diff --git a/tests/test-conv2d.cpp b/tests/test-conv2d.cpp index 3ff4da4fc..a3edc5cce 100644 --- a/tests/test-conv2d.cpp +++ b/tests/test-conv2d.cpp @@ -2,7 +2,7 @@ #include "ggml/ggml-alloc.h" #include "ggml/ggml-backend.h" -#define GGML_USE_CUBLAS +// #define GGML_USE_CUBLAS #ifdef GGML_USE_CUBLAS #include "ggml-cuda.h" From d8539f367b60100751f07654ba7ca37ccdf957a4 Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Mon, 16 Oct 2023 06:54:45 -0400 Subject: [PATCH 18/26] explain conv1d reshapes --- src/ggml.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ggml.c b/src/ggml.c index 46c077291..16e64ecef 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -7481,12 +7481,12 @@ GGML_API struct ggml_tensor * ggml_conv_1d( int s0, int p0, int d0) { - struct ggml_tensor * result = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OH, OW, IC * 
KH * KW] + struct ggml_tensor * result = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K] result = ggml_reshape_3d(ctx, ggml_cont(ctx, ggml_transpose(ctx, ggml_mul_mat(ctx, - ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2]), - ggml_reshape_2d(ctx, result, result->ne[0], (result->ne[2] * result->ne[1]))))), - result->ne[1], a->ne[2], result->ne[2]); + ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2]), // [OC,IC, K] => [OC, IC * K] + ggml_reshape_2d(ctx, result, result->ne[0], (result->ne[2] * result->ne[1]))))), // [N, OL, IC * K] => [N*OL, IC * K] + result->ne[1], a->ne[2], result->ne[2]); // [N, OC, OL] return result; } From 53f805eaa1d95ffd3fa8efbffa9246ce3bbfb4fb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 16 Oct 2023 23:24:52 +0300 Subject: [PATCH 19/26] ggml : fix tests on Arm + do not use BLAS for F16 data --- src/ggml.c | 2 ++ tests/test-conv1d.cpp | 8 ++++---- tests/test-conv2d.cpp | 8 ++++---- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/ggml.c b/src/ggml.c index 16e64ecef..927f03a69 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -11645,6 +11645,8 @@ static bool ggml_compute_forward_mul_mat_use_blas( // TODO: find the optimal values for these if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && + src0->type == GGML_TYPE_F32 && + src1->type == GGML_TYPE_F32 && (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ diff --git a/tests/test-conv1d.cpp b/tests/test-conv1d.cpp index 560ac0eb9..1e9338c1e 100644 --- a/tests/test-conv1d.cpp +++ b/tests/test-conv1d.cpp @@ -41,8 +41,8 @@ void load_model(test_model & model, bool use_gpu = false) { } // Convert adata to fp16 format - uint16_t* hadata = new uint16_t[K * IC * OC]; - ggml_fp32_to_fp16_row(adata, hadata, K * IC * OC); + std::vector hadata(K * IC * OC); + ggml_fp32_to_fp16_row(adata, hadata.data(), K * IC * OC); // Initialize bdata float* bdata = new float[IL * IC * N]; @@ -111,9 +111,9 @@ void load_model(test_model & model, bool use_gpu = false) { // load data to buffer if(ggml_backend_is_cpu(model.backend)) { - memcpy(model.a->data, hadata, ggml_nbytes(model.a)); + memcpy(model.a->data, hadata.data(), ggml_nbytes(model.a)); } else { - ggml_backend_tensor_set(model.a, hadata, 0, ggml_nbytes(model.a)); + ggml_backend_tensor_set(model.a, hadata.data(), 0, ggml_nbytes(model.a)); } // alloc memory diff --git a/tests/test-conv2d.cpp b/tests/test-conv2d.cpp index a3edc5cce..d6987bd30 100644 --- a/tests/test-conv2d.cpp +++ b/tests/test-conv2d.cpp @@ -41,8 +41,8 @@ void load_model(test_model & model, bool use_gpu = false) { } // Convert adata to fp16 format - uint16_t* hadata = new uint16_t[KW * KH * IC * OC]; - ggml_fp32_to_fp16_row(adata, hadata, KW * KH * IC * OC); + std::vector hadata(KW * KH * IC * OC); + ggml_fp32_to_fp16_row(adata, hadata.data(), KW * KH * IC * OC); // Initialize bdata float* bdata = new float[IW * IH * IC * N]; @@ -111,9 +111,9 @@ void load_model(test_model & model, bool use_gpu = false) { // load data to buffer if(ggml_backend_is_cpu(model.backend)) { - memcpy(model.a->data, hadata, ggml_nbytes(model.a)); + memcpy(model.a->data, hadata.data(), ggml_nbytes(model.a)); } else { - ggml_backend_tensor_set(model.a, hadata, 0, ggml_nbytes(model.a)); + ggml_backend_tensor_set(model.a, hadata.data(), 0, ggml_nbytes(model.a)); } // alloc memory From 3b9022a5a1a1ebaec38ee113a3e5d645d0181405 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 16 Oct 2023 23:34:48 +0300 Subject: [PATCH 20/26] 
tests : fix FP16 handling on Arm --- tests/test-conv1d.cpp | 4 ++-- tests/test-conv2d.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test-conv1d.cpp b/tests/test-conv1d.cpp index 1e9338c1e..da3c7aaea 100644 --- a/tests/test-conv1d.cpp +++ b/tests/test-conv1d.cpp @@ -231,7 +231,7 @@ int main(void) } } - ggml_fp16_t* im2col_data = new ggml_fp16_t[ggml_nelements(im2col_res)]; + uint16_t* im2col_data = new uint16_t[ggml_nelements(im2col_res)]; float* conv2d_data = new float[ggml_nelements(conv1d_res)]; ggml_backend_tensor_get(im2col_res, im2col_data, 0, ggml_nbytes(im2col_res)); @@ -254,7 +254,7 @@ int main(void) }; // first im2col test - ggml_fp16_t expected_im2col[n_conv1d_test] = { + uint16_t expected_im2col[n_conv1d_test] = { 0, 16640, 16640, 0, 16640, 16640, 0, 16640, 16640, 0, 16640, 16640, 0, 16640, 16640, 0, 16640, 16640, 0, 16640, 16640, 0, 16640, 16640, diff --git a/tests/test-conv2d.cpp b/tests/test-conv2d.cpp index d6987bd30..7695c4c70 100644 --- a/tests/test-conv2d.cpp +++ b/tests/test-conv2d.cpp @@ -235,7 +235,7 @@ int main(void) } } - ggml_fp16_t* im2col_data = new ggml_fp16_t[ggml_nelements(im2col_res)]; + uint16_t* im2col_data = new uint16_t[ggml_nelements(im2col_res)]; float* conv2d_data = new float[ggml_nelements(conv2d_res)]; ggml_backend_tensor_get(im2col_res, im2col_data, 0, ggml_nbytes(im2col_res)); @@ -306,7 +306,7 @@ int main(void) 225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f, 150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f }; - ggml_fp16_t expected_im2col[n_conv2d_test] = { + uint16_t expected_im2col[n_conv2d_test] = { 0, 0, 0, 0, 15872, 15872, 0, 15872, 15872, 0, 0, 0, 0, 15872, 15872, 0, 15872, 15872, 0, 0, 0, 0, 15872, 15872, From 7193df2b9152ca93c2f27f09558ad2906188f9af Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 16 Oct 2023 23:37:33 +0300 Subject: [PATCH 21/26] ggml : avoid ggml_cont and ggml_transpose in ggml_conv_xd --- src/ggml.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/ggml.c b/src/ggml.c index 927f03a69..6e8235eb1 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -7482,10 +7482,10 @@ GGML_API struct ggml_tensor * ggml_conv_1d( int p0, int d0) { struct ggml_tensor * result = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K] - result = ggml_reshape_3d(ctx, ggml_cont(ctx, ggml_transpose(ctx, + result = ggml_reshape_3d(ctx, ggml_mul_mat(ctx, - ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2]), // [OC,IC, K] => [OC, IC * K] - ggml_reshape_2d(ctx, result, result->ne[0], (result->ne[2] * result->ne[1]))))), // [N, OL, IC * K] => [N*OL, IC * K] + ggml_reshape_2d(ctx, result, result->ne[0], (result->ne[2] * result->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K] + ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2])), // [OC,IC, K] => [OC, IC * K] result->ne[1], a->ne[2], result->ne[2]); // [N, OC, OL] return result; } @@ -7611,10 +7611,10 @@ struct ggml_tensor * ggml_conv_2d( int d0, int d1) { struct ggml_tensor * result = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW] - result = ggml_reshape_4d(ctx, ggml_cont(ctx, ggml_transpose(ctx, + result = ggml_reshape_4d(ctx, ggml_mul_mat(ctx, - ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3]), // [OC,IC, KH, KW] => [OC, IC * KH * KW] - ggml_reshape_2d(ctx, result, result->ne[0], result->ne[3] * result->ne[2] * result->ne[1])))), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW] + ggml_reshape_2d(ctx, 
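// Why flipping the ggml_mul_mat operand order lets the ggml_cont/ggml_transpose
// pair disappear: the product's shape is [arg0->ne[1], arg1->ne[1], ...], so
//   old: mul_mat(kernel_2d, im2col_2d) -> ne = [OC, N*OH*OW]  (then transpose + cont copy)
//   new: mul_mat(im2col_2d, kernel_2d) -> ne = [N*OH*OW, OC]  (reshape alone suffices)
// Sketch of the shape rule (an illustrative helper over ggml's ne[] convention):
static void mul_mat_out_shape(const int64_t a_ne[2], const int64_t b_ne[2], int64_t out_ne[2]) {
    out_ne[0] = a_ne[1]; // rows of the first operand -> fast output dim
    out_ne[1] = b_ne[1]; // rows of the second operand -> slow output dim
}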
result, result->ne[0], result->ne[3] * result->ne[2] * result->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW] + ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3])), // [OC,IC, KH, KW] => [OC, IC * KH * KW] result->ne[1], result->ne[2], a->ne[3], result->ne[3]); // [N, OC, OH, OW] return result; } From f1879c06562e37906d749ea9525a85b2fbc29020 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 10 Nov 2023 14:46:09 +0200 Subject: [PATCH 22/26] ci : switch back to release --- ci/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/run.sh b/ci/run.sh index 36a2eb543..7afe8304f 100644 --- a/ci/run.sh +++ b/ci/run.sh @@ -109,7 +109,7 @@ function gg_run_ctest_release { set -e - (time cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time cmake -DCMAKE_BUILD_TYPE=Release .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log if [ -z $GG_BUILD_LOW_PERF ]; then From 439a79f9e91e4d9fa767869d65edbb08fea044fd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 10 Nov 2023 15:11:55 +0200 Subject: [PATCH 23/26] cuda : fix wrong pointer usage --- src/ggml-cuda.cu | 2 +- src/ggml.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 1a2b54d79..854bf0c56 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -6515,7 +6515,7 @@ inline void ggml_cuda_op_mul_mat_cublas( src1_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &src1_as, id, stream); to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream); } - const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16; + const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16; size_t dst_f16_as = 0; half * dst_f16 = (half *) ggml_cuda_pool_malloc_async(row_diff*src1_ncols * sizeof(half), &dst_f16_as, id, stream); diff --git a/src/ggml.c b/src/ggml.c index 75f31e4af..52adc589e 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -15814,7 +15814,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } break; default: { - printf("%s: op %s not implemented\n", __func__, ggml_op_name(node->op)); + fprintf(stderr, "%s: op %s not implemented\n", __func__, ggml_op_name(node->op)); GGML_ASSERT(false); } break; } From a729f6b0697e5fe346b617de0ee6f5a525102699 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 11 Nov 2023 10:30:13 +0200 Subject: [PATCH 24/26] ggml : add metal support for im2col and f16xf16 mul mat --- src/ggml-cuda.cu | 53 +++++++++++++-------- src/ggml-metal.m | 82 ++++++++++++++++++++++++++++---- src/ggml-metal.metal | 108 ++++++++++++++++++++++++++++++++++++++++++- src/ggml.c | 71 +++++++++++++++------------- 4 files changed, 251 insertions(+), 63 deletions(-) diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 854bf0c56..309866b36 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -39,6 +39,7 @@ #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess +#define cudaDeviceGetMemPool hipDeviceGetMemPool #define cudaDeviceProp hipDeviceProp_t #define cudaDeviceSynchronize hipDeviceSynchronize #define cudaError_t hipError_t @@ -48,6 +49,7 @@ #define cudaEvent_t hipEvent_t #define cudaEventDestroy hipEventDestroy #define cudaFree hipFree +#define cudaFreeAsync hipFreeAsync #define cudaFreeHost hipHostFree #define cudaGetDevice hipGetDevice 
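// These aliases are how the single ggml-cuda.cu translation unit also builds
// for AMD GPUs: under the HIP toolchain every cuda* symbol the file touches
// is #defined to its hip* equivalent, so the function bodies never branch on
// the platform. The entries added in this patch extend the same table to the
// stream-ordered pool allocator. Trimmed sketch of the scheme (the exact
// guard macro is set by the build; GGML_USE_HIPBLAS is assumed here):
#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
#define cudaFreeAsync           hipFreeAsync
#define cudaMallocFromPoolAsync hipMallocFromPoolAsync
#else
#include <cuda_runtime.h>
#endif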
#define cudaGetDeviceCount hipGetDeviceCount @@ -55,6 +57,7 @@ #define cudaGetErrorString hipGetErrorString #define cudaGetLastError hipGetLastError #define cudaMalloc hipMalloc +#define cudaMallocFromPoolAsync hipMallocFromPoolAsync #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) #define cudaMemcpy hipMemcpy #define cudaMemcpy2DAsync hipMemcpy2DAsync @@ -63,6 +66,9 @@ #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost #define cudaMemcpyHostToDevice hipMemcpyHostToDevice #define cudaMemcpyKind hipMemcpyKind +#define cudaMemPool_t hipMemPool_t +#define cudaMemPoolAttrReleaseThreshold hipMemPoolAttrReleaseThreshold +#define cudaMemPoolSetAttribute hipMemPoolSetAttribute #define cudaMemset hipMemset #define cudaMemsetAsync hipMemsetAsync #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize @@ -4730,14 +4736,22 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min, dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]); } -static __global__ void im2col_f32_f16(const float* x, half* dst, int ofs0, int ofs1, int IW,int IH,int CHW,int s0,int s1,int p0,int p1,int d0,int d1) { - int iiw = blockIdx.z * s0 + threadIdx.z * d0 - p0; - int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1; - __syncthreads(); +static __global__ void im2col_f32_f16( + const float * x, half * dst, + int ofs0, int ofs1, int IW, int IH, int CHW, + int s0, int s1, int p0, int p1, int d0, int d1) { + const int iiw = blockIdx.z * s0 + threadIdx.z * d0 - p0; + const int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1; + + const int offset_dst = + (threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW + + (blockIdx.x * (blockDim.y * blockDim.z) + threadIdx.y * blockDim.z + threadIdx.z); + if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { - int offset_dst = (threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW; - int offset_src = threadIdx.x * ofs0 + blockIdx.x * ofs1; - dst[offset_dst + (blockIdx.x * (blockDim.y * blockDim.z) + threadIdx.y * blockDim.z + threadIdx.z)] = __float2half(x[offset_src + iih * IW + iiw]); + const int offset_src = threadIdx.x * ofs0 + blockIdx.x * ofs1; + dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]); + } else { + dst[offset_dst] = __float2half(0.0f); } } @@ -5723,13 +5737,12 @@ static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, c soft_max_f32<<>>(x, dst, ncols_x); } -static void im2col_f32_f16_cuda(const float* x, half* dst, - int OH, int IW, int IH, - int OW, int IC, - int KH, int KW, int N, int ofs0, int ofs1, - int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) { +static void im2col_f32_f16_cuda(const float * x, half * dst, + int OH, int IW, int IH, int OW, int IC, + int KH, int KW, int N, int ofs0, int ofs1, + int s0, int s1, int p0, int p1, int d0, int d1, cudaStream_t stream) { dim3 block_nums(IC, OH, OW); - dim3 block_dims(N, KH, KW); + dim3 block_dims(N, KH, KW); im2col_f32_f16<<>>(x, dst, ofs0, ofs1, IW, IH, (IC * KH * KW), s0, s1, p0, p1, d0, d1); } @@ -6708,23 +6721,23 @@ inline void ggml_cuda_op_im2col( const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1; - const int64_t N = src1->ne[is_2D ? 3 : 2]; + const int64_t N = src1->ne[is_2D ? 3 : 2]; const int64_t IC = src1->ne[is_2D ? 2 : 1]; const int64_t IH = is_2D ? src1->ne[1] : 1; - const int64_t IW = src1->ne[0]; + const int64_t IW = src1->ne[0]; const int64_t KH = is_2D ? 
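/*
 * Launch geometry for im2col_f32_f16 above: block_nums = (IC, OH, OW) and
 * block_dims = (N, KH, KW), so each thread owns exactly one element of the
 * [N, OH, OW, IC*KH*KW] destination and writes zero when its tap lands in
 * the padding. A CPU reference of the same mapping (a sketch, assuming the
 * element offsets ofs0/ofs1 passed to the kernel):
 *
 *   for (int n  = 0; n  < N;  ++n)
 *   for (int oh = 0; oh < OH; ++oh)
 *   for (int ow = 0; ow < OW; ++ow)
 *   for (int ic = 0; ic < IC; ++ic)
 *   for (int kh = 0; kh < KH; ++kh)
 *   for (int kw = 0; kw < KW; ++kw) {
 *       const int iw = ow*s0 + kw*d0 - p0;
 *       const int ih = oh*s1 + kh*d1 - p1;
 *       const int64_t idst = ((int64_t) n*OH*OW + oh*OW + ow)*(IC*KH*KW)
 *                          + ic*KH*KW + kh*KW + kw;
 *       dst[idst] = (ih < 0 || ih >= IH || iw < 0 || iw >= IW)
 *           ? __float2half(0.0f)
 *           : __float2half(x[n*ofs0 + ic*ofs1 + ih*IW + iw]);
 *   }
 */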
src0->ne[1] : 1; - const int64_t KW = src0->ne[0]; + const int64_t KW = src0->ne[0]; const int64_t OH = is_2D ? dst->ne[2] : 1; - const int64_t OW = dst->ne[1]; + const int64_t OW = dst->ne[1]; + const size_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32 + const size_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32 im2col_f32_f16_cuda(src1_dd, (half*) dst_dd, OH, IW, IH, OW, IC, KH, KW, N, - src1->nb[is_2D ? 3 : 2] / 4, // nb is byte offset, src is type float32 - src1->nb[is_2D ? 2 : 1] / 4, // nb is byte offset, src is type float32 - s0, s1, p0, p1, d0, d1, main_stream); + ofs0, ofs1, s0, s1, p0, p1, d0, d1, main_stream); (void) src0; (void) src0_dd; diff --git a/src/ggml-metal.m b/src/ggml-metal.m index 43d0dff09..148c12b14 100644 --- a/src/ggml-metal.m +++ b/src/ggml-metal.m @@ -86,6 +86,7 @@ GGML_METAL_DECL_KERNEL(rms_norm); GGML_METAL_DECL_KERNEL(norm); GGML_METAL_DECL_KERNEL(mul_mv_f32_f32); + GGML_METAL_DECL_KERNEL(mul_mv_f16_f16); GGML_METAL_DECL_KERNEL(mul_mv_f16_f32); GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_1row); GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_l4); @@ -114,6 +115,7 @@ GGML_METAL_DECL_KERNEL(rope_f32); GGML_METAL_DECL_KERNEL(rope_f16); GGML_METAL_DECL_KERNEL(alibi_f32); + GGML_METAL_DECL_KERNEL(im2col_f16); GGML_METAL_DECL_KERNEL(cpy_f32_f16); GGML_METAL_DECL_KERNEL(cpy_f32_f32); GGML_METAL_DECL_KERNEL(cpy_f16_f16); @@ -287,6 +289,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){ GGML_METAL_ADD_KERNEL(rms_norm); GGML_METAL_ADD_KERNEL(norm); GGML_METAL_ADD_KERNEL(mul_mv_f32_f32); + GGML_METAL_ADD_KERNEL(mul_mv_f16_f16); GGML_METAL_ADD_KERNEL(mul_mv_f16_f32); GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_1row); GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_l4); @@ -317,6 +320,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){ GGML_METAL_ADD_KERNEL(rope_f32); GGML_METAL_ADD_KERNEL(rope_f16); GGML_METAL_ADD_KERNEL(alibi_f32); + GGML_METAL_ADD_KERNEL(im2col_f16); GGML_METAL_ADD_KERNEL(cpy_f32_f16); GGML_METAL_ADD_KERNEL(cpy_f32_f32); GGML_METAL_ADD_KERNEL(cpy_f16_f16); @@ -386,6 +390,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(rms_norm); GGML_METAL_DEL_KERNEL(norm); GGML_METAL_DEL_KERNEL(mul_mv_f32_f32); + GGML_METAL_DEL_KERNEL(mul_mv_f16_f16); GGML_METAL_DEL_KERNEL(mul_mv_f16_f32); GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_1row); GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_l4); @@ -416,6 +421,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(rope_f32); GGML_METAL_DEL_KERNEL(rope_f16); GGML_METAL_DEL_KERNEL(alibi_f32); + GGML_METAL_DEL_KERNEL(im2col_f16); GGML_METAL_DEL_KERNEL(cpy_f32_f16); GGML_METAL_DEL_KERNEL(cpy_f32_f32); GGML_METAL_DEL_KERNEL(cpy_f16_f16); @@ -1030,7 +1036,7 @@ void ggml_metal_graph_compute( [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; - [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0]; + [encoder setThreadgroupMemoryLength:GGML_PAD(nth/32*sizeof(float), 16) atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; @@ -1139,6 +1145,7 @@ void ggml_metal_graph_compute( switch (src0t) { case GGML_TYPE_F32: { + GGML_ASSERT(src1t == GGML_TYPE_F32); [encoder setComputePipelineState:ctx->pipeline_mul_mv_f32_f32]; nrows = 4; } break; @@ -1146,13 +1153,18 @@ void 
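/*
 * Metal requires the length passed to setThreadgroupMemoryLength: to be a
 * multiple of 16 bytes, which is what the GGML_PAD(..., 16) wrappers in this
 * patch guarantee. GGML_PAD rounds x up to the next multiple of n; an
 * equivalent definition (assuming n is a power of two) is:
 *
 *   #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
 *
 * e.g. GGML_PAD(4, 16) == 16, GGML_PAD(32, 16) == 32.
 */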
ggml_metal_graph_compute( { nth0 = 32; nth1 = 1; - if (ne11 * ne12 < 4) { - [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_1row]; - } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { - [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_l4]; - nrows = ne11; + if (src1t == GGML_TYPE_F32) { + if (ne11 * ne12 < 4) { + [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_1row]; + } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { + [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_l4]; + nrows = ne11; + } else { + [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32]; + nrows = 4; + } } else { - [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32]; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f16]; nrows = 4; } } break; @@ -1342,7 +1354,7 @@ void ggml_metal_graph_compute( [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; [encoder setBytes:&eps length:sizeof( float) atIndex:4]; - [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0]; + [encoder setThreadgroupMemoryLength:GGML_PAD(nth/32*sizeof(float), 16) atIndex:0]; const int64_t nrows = ggml_nrows(src0); @@ -1361,7 +1373,7 @@ void ggml_metal_graph_compute( [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; [encoder setBytes:&eps length:sizeof( float) atIndex:4]; - [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0]; + [encoder setThreadgroupMemoryLength:GGML_PAD(nth*sizeof(float), 16) atIndex:0]; const int64_t nrows = ggml_nrows(src0); @@ -1464,6 +1476,58 @@ void ggml_metal_graph_compute( [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; + case GGML_OP_IM2COL: + { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F16); + + const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; + const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; + + const int32_t N = src1->ne[is_2D ? 3 : 2]; + const int32_t IC = src1->ne[is_2D ? 2 : 1]; + const int32_t IH = is_2D ? src1->ne[1] : 1; + const int32_t IW = src1->ne[0]; + + const int32_t KH = is_2D ? src0->ne[1] : 1; + const int32_t KW = src0->ne[0]; + + const int32_t OH = is_2D ? dst->ne[2] : 1; + const int32_t OW = dst->ne[1]; + + const int32_t CHW = IC * KH * KW; + + const int32_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; + const int32_t ofs1 = src1->nb[is_2D ? 
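/*
 * src1->nb[] holds byte strides, so dividing by 4 (sizeof(float) for the f32
 * source) turns them into element offsets, mirroring the CUDA path. For a
 * contiguous [N, IC, IH, IW] f32 input the 2-D case works out to (a sketch):
 *
 *   ofs0 = src1->nb[3]/sizeof(float);  // elements per image:   IC*IH*IW
 *   ofs1 = src1->nb[2]/sizeof(float);  // elements per channel: IH*IW
 */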
2 : 1] / 4; + + switch (src0->type) { + case GGML_TYPE_F32: GGML_ASSERT(false && "not implemented"); break; + case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_im2col_f16]; break; + default: GGML_ASSERT(false); + }; + + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ofs0 length:sizeof( int32_t) atIndex:2]; + [encoder setBytes:&ofs1 length:sizeof( int32_t) atIndex:3]; + [encoder setBytes:&IW length:sizeof( int32_t) atIndex:4]; + [encoder setBytes:&IH length:sizeof( int32_t) atIndex:5]; + [encoder setBytes:&CHW length:sizeof( int32_t) atIndex:6]; + [encoder setBytes:&s0 length:sizeof( int32_t) atIndex:7]; + [encoder setBytes:&s1 length:sizeof( int32_t) atIndex:8]; + [encoder setBytes:&p0 length:sizeof( int32_t) atIndex:9]; + [encoder setBytes:&p1 length:sizeof( int32_t) atIndex:10]; + [encoder setBytes:&d0 length:sizeof( int32_t) atIndex:11]; + [encoder setBytes:&d1 length:sizeof( int32_t) atIndex:12]; + + [encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)]; + } break; case GGML_OP_DUP: case GGML_OP_CPY: case GGML_OP_CONT: diff --git a/src/ggml-metal.metal b/src/ggml-metal.metal index 7c35f23a7..4fdcaac9e 100644 --- a/src/ggml-metal.metal +++ b/src/ggml-metal.metal @@ -792,7 +792,7 @@ kernel void kernel_mul_mv_f32_f32( constant int64_t & ne0, constant int64_t & ne1, uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]]) { + uint tiisg[[thread_index_in_simdgroup]]) { const int64_t r0 = tgpig.x; const int64_t rb = tgpig.y*N_F32_F32; @@ -844,6 +844,79 @@ kernel void kernel_mul_mv_f32_f32( } } +#define N_F16_F16 4 + +kernel void kernel_mul_mv_f16_f16( + device const char * src0, + device const char * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]]) { + + const int64_t r0 = tgpig.x; + const int64_t rb = tgpig.y*N_F16_F16; + const int64_t im = tgpig.z; + + device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); + + if (ne00 < 128) { + for (int row = 0; row < N_F16_F16; ++row) { + int r1 = rb + row; + if (r1 >= ne11) { + break; + } + + device const half * y = (device const half *) (src1 + r1*nb11 + im*nb12); + + float sumf = 0; + for (int i = tiisg; i < ne00; i += 32) { + sumf += (half) x[i] * (half) y[i]; + } + + float all_sum = simd_sum(sumf); + if (tiisg == 0) { + dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum; + } + } + } else { + device const half4 * x4 = (device const half4 *)x; + for (int row = 0; row < N_F16_F16; ++row) { + int r1 = rb + row; + if (r1 >= ne11) { + break; + } + + device const half * y = (device const half *) (src1 + r1*nb11 + im*nb12); + device const half4 * y4 = (device const half4 *) y; + + float sumf = 0; + for (int i = tiisg; i < ne00/4; i += 32) { + for (int k = 0; k < 4; ++k) sumf += (half) x4[i][k] * y4[i][k]; + } + + float all_sum = simd_sum(sumf); + if (tiisg == 0) { + for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (half) x[i] * y[i]; + dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum; + } + } + } +} + kernel void 
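/*
 * In kernel_mul_mv_f16_f16 above, one simdgroup accumulates each output
 * element: lane i of the 32 lanes sums source elements i, i+32, i+64, ...,
 * and simd_sum() folds the 32 partial sums; the ne00 >= 128 path does the
 * same with half4 vector loads. A scalar reference for a single element
 * (a sketch of the math only):
 *
 *   float acc = 0.0f;
 *   for (int i = 0; i < ne00; ++i) {
 *       acc += (float) x[i] * (float) y[i];  // x, y: f16 rows of src0, src1
 *   }
 *   dst[im*ne1*ne0 + r1*ne0 + r0] = acc;
 */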
kernel_mul_mv_f16_f32_1row( device const char * src0, device const char * src1, @@ -1229,6 +1302,39 @@ kernel void kernel_rope( template [[host_name("kernel_rope_f32")]] kernel rope_t kernel_rope; template [[host_name("kernel_rope_f16")]] kernel rope_t kernel_rope; +kernel void kernel_im2col_f16( + device const float * x, + device half * dst, + constant int32_t & ofs0, + constant int32_t & ofs1, + constant int32_t & IW, + constant int32_t & IH, + constant int32_t & CHW, + constant int32_t & s0, + constant int32_t & s1, + constant int32_t & p0, + constant int32_t & p1, + constant int32_t & d0, + constant int32_t & d1, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tgpg[[threadgroups_per_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int32_t iiw = tgpig[2] * s0 + tpitg[2] * d0 - p0; + const int32_t iih = tgpig[1] * s1 + tpitg[1] * d1 - p1; + + const int32_t offset_dst = + (tpitg[0] * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * CHW + + (tgpig[0] * (ntg[1] * ntg[2]) + tpitg[1] * ntg[2] + tpitg[2]); + + if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { + const int32_t offset_src = tpitg[0] * ofs0 + tgpig[0] * ofs1; + dst[offset_dst] = x[offset_src + iih * IW + iiw]; + } else { + dst[offset_dst] = 0.0f; + } +} + kernel void kernel_cpy_f16_f16( device const half * src0, device half * dst, diff --git a/src/ggml.c b/src/ggml.c index 52adc589e..2723c5be2 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -143,12 +143,6 @@ void ggml_print_backtrace(void) { } #endif -#undef MIN -#undef MAX - -#define MIN(a, b) ((a) < (b) ? (a) : (b)) -#define MAX(a, b) ((a) > (b) ? (a) : (b)) - /*#define GGML_PERF*/ #define GGML_DEBUG 0 #define GGML_GELU_FP16 @@ -277,6 +271,12 @@ inline static void * ggml_aligned_malloc(size_t size) { // floating point type used to accumulate sums typedef double ggml_float; +#undef MIN +#undef MAX + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + // // global data // @@ -5131,13 +5131,15 @@ GGML_API struct ggml_tensor * ggml_conv_1d( int s0, int p0, int d0) { - struct ggml_tensor * result = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K] - result = - ggml_reshape_3d(ctx, - ggml_mul_mat(ctx, - ggml_reshape_2d(ctx, result, result->ne[0], (result->ne[2] * result->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K] - ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2])), // [OC,IC, K] => [OC, IC * K] - result->ne[1], a->ne[2], result->ne[2]); // [N, OC, OL] + struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K] + + struct ggml_tensor * result = + ggml_mul_mat(ctx, + ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K] + ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2])); // [OC,IC, K] => [OC, IC * K] + + result = ggml_reshape_3d(ctx, result, im2col->ne[1], a->ne[2], im2col->ne[2]); // [N, OC, OL] + return result; } @@ -5227,13 +5229,13 @@ struct ggml_tensor * ggml_im2col( } const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0; - const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); + const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); const int64_t ne[4] = { is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0], OW, is_2D ? OH : b->ne[2], - is_2D ? b->ne[3] : 1, + is_2D ? 
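/*
 * OH and OW above come from ggml_calc_conv_output_size, the standard
 * convolution output-size formula. A worked example (a sketch): with
 * IW = 32, KW = 3, s0 = 1, p0 = 1, d0 = 1,
 *
 *   OW = (IW + 2*p0 - d0*(KW - 1) - 1)/s0 + 1
 *      = (32 + 2 - 2 - 1)/1 + 1 = 32   // "same" padding keeps the width
 */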
b->ne[3] : 1, }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne); @@ -5252,22 +5254,24 @@ struct ggml_tensor * ggml_im2col( // b: [N, IC, IH, IW] // result: [N, OC, OH, OW] struct ggml_tensor * ggml_conv_2d( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int s0, - int s1, - int p0, - int p1, - int d0, - int d1) { + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { struct ggml_tensor * result = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW] + result = ggml_reshape_4d(ctx, ggml_mul_mat(ctx, ggml_reshape_2d(ctx, result, result->ne[0], result->ne[3] * result->ne[2] * result->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW] ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3])), // [OC,IC, KH, KW] => [OC, IC * KH * KW] result->ne[1], result->ne[2], a->ne[3], result->ne[3]); // [N, OC, OH, OW] + return result; } @@ -11724,17 +11728,18 @@ static void ggml_compute_forward_im2col_f16( GGML_TENSOR_BINARY_OP_LOCALS; - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; - const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; - const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; - const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; - const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; - const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1; + const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; + const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; const int ith = params->ith; const int nth = params->nth; - const int64_t N = is_2D ? ne13 : ne12; + + const int64_t N = is_2D ? ne13 : ne12; const int64_t IC = is_2D ? ne12 : ne11; const int64_t IH = is_2D ? 
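/*
 * Shape bookkeeping for the conv-as-GEMM path above (a sketch in ggml's
 * innermost-first ne[] convention):
 *
 *   im2col_2d : ne = [IC*KH*KW, N*OH*OW]  // one input patch per row
 *   kernel_2d : ne = [IC*KH*KW, OC]       // one filter per row
 *   mm = ggml_mul_mat(ctx, im2col_2d, kernel_2d);      // ne = [N*OH*OW, OC]
 *   result = ggml_reshape_4d(ctx, mm, OW, OH, OC, N);  // logical [N, OC, OH, OW]
 */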
ne11 : 1; const int64_t IW = ne10; @@ -15814,7 +15819,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } break; default: { - fprintf(stderr, "%s: op %s not implemented\n", __func__, ggml_op_name(node->op)); + printf("%s: op %s not implemented\n", __func__, ggml_op_name(node->op)); GGML_ASSERT(false); } break; } From 406cbc1e5c14545dcc36d985fc0e5049b18c4a06 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 11 Nov 2023 10:39:42 +0200 Subject: [PATCH 25/26] ggml : im2col opts --- src/ggml-cuda.cu | 6 +++--- src/ggml-metal.metal | 6 +++--- src/ggml.c | 22 ++++++++++------------ 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 309866b36..34c45f388 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -4747,11 +4747,11 @@ static __global__ void im2col_f32_f16( (threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW + (blockIdx.x * (blockDim.y * blockDim.z) + threadIdx.y * blockDim.z + threadIdx.z); - if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { + if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { + dst[offset_dst] = __float2half(0.0f); + } else { const int offset_src = threadIdx.x * ofs0 + blockIdx.x * ofs1; dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]); - } else { - dst[offset_dst] = __float2half(0.0f); } } diff --git a/src/ggml-metal.metal b/src/ggml-metal.metal index 4fdcaac9e..5d1357cd7 100644 --- a/src/ggml-metal.metal +++ b/src/ggml-metal.metal @@ -1327,11 +1327,11 @@ kernel void kernel_im2col_f16( (tpitg[0] * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * CHW + (tgpig[0] * (ntg[1] * ntg[2]) + tpitg[1] * ntg[2] + tpitg[2]); - if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { + if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { + dst[offset_dst] = 0.0f; + } else { const int32_t offset_src = tpitg[0] * ofs0 + tgpig[0] * ofs1; dst[offset_dst] = x[offset_src + iih * IW + iiw]; - } else { - dst[offset_dst] = 0.0f; } } diff --git a/src/ggml.c b/src/ggml.c index 2723c5be2..584ee4680 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -1777,7 +1777,6 @@ static void ggml_setup_op_has_task_pass(void) { p[GGML_OP_DIAG_MASK_INF ] = true; p[GGML_OP_DIAG_MASK_ZERO ] = true; p[GGML_OP_CONV_TRANSPOSE_1D ] = true; - p[GGML_OP_IM2COL ] = true; p[GGML_OP_CONV_TRANSPOSE_2D ] = true; p[GGML_OP_FLASH_ATTN_BACK ] = true; p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; @@ -5122,8 +5121,6 @@ static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, return (ins + 2 * p - d * (ks - 1) - 1) / s + 1; } -// ggml_conv_1d - GGML_API struct ggml_tensor * ggml_conv_1d( struct ggml_context * ctx, struct ggml_tensor * a, @@ -5263,14 +5260,14 @@ struct ggml_tensor * ggml_conv_2d( int p1, int d0, int d1) { - struct ggml_tensor * result = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW] + struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW] - result = - ggml_reshape_4d(ctx, - ggml_mul_mat(ctx, - ggml_reshape_2d(ctx, result, result->ne[0], result->ne[3] * result->ne[2] * result->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW] - ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3])), // [OC,IC, KH, KW] => [OC, IC * KH * KW] - result->ne[1], result->ne[2], a->ne[3], result->ne[3]); // [N, OC, OH, OW] + struct ggml_tensor * result = + ggml_mul_mat(ctx, + ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), 
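/*
 * With the explicit zero-fill branches introduced in this patch every
 * element of dst is written exactly once, so the whole-tensor
 * memset(dst->data, 0, ggml_nbytes(dst)) that used to run in GGML_TASK_INIT
 * (removed below) is redundant, and GGML_OP_IM2COL is dropped from the
 * task-pass table above. The per-element pattern (a sketch):
 *
 *   dst_data[idx] = out_of_bounds ? 0
 *                                 : GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
 */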
// [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW] + ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3])); // [OC,IC, KH, KW] => [OC, IC * KH * KW] + + result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], a->ne[3], im2col->ne[3]); // [N, OC, OH, OW] return result; } @@ -11757,7 +11754,6 @@ static void ggml_compute_forward_im2col_f16( GGML_ASSERT(nb10 == sizeof(float)); if (params->type == GGML_TASK_INIT) { - memset(dst->data, 0, ggml_nbytes(dst)); return; } @@ -11783,7 +11779,9 @@ static void ggml_compute_forward_im2col_f16( const int64_t iiw = iow*s0 + ikw*d0 - p0; const int64_t iih = ioh*s1 + ikh*d1 - p1; - if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { + if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0; + } else { dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); } } From da25cf0ad9c8bd2a78d5cfb57a9d36a234d47d29 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 11 Nov 2023 16:47:37 +0200 Subject: [PATCH 26/26] Update src/ggml-cuda.cu Co-authored-by: slaren --- src/ggml-cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 34c45f388..ce4feeece 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -4741,7 +4741,7 @@ static __global__ void im2col_f32_f16( int ofs0, int ofs1, int IW, int IH, int CHW, int s0, int s1, int p0, int p1, int d0, int d1) { const int iiw = blockIdx.z * s0 + threadIdx.z * d0 - p0; - const int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1; + const int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1; const int offset_dst = (threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW +
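/*
 * For reference, under the launch geometry gridDim = (IC, OH, OW) and
 * blockDim = (N, KH, KW) this offset expands to
 *
 *   offset_dst = (n*OH*OW + oh*OW + ow)*CHW + (ic*KH*KW + kh*KW + kw)
 *
 * i.e. the first term selects the destination patch row, the second the
 * position of the (ic, kh, kw) tap inside it.
 */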