From 28de59253fe3f070d42c8aacd726f608d76679fc Mon Sep 17 00:00:00 2001
From: xaedes
Date: Mon, 24 Apr 2023 04:30:42 +0200
Subject: [PATCH] implement 5 of 6 missing backward pass operations used by llama

- GGML_OP_DIAG_MASK_INF
- GGML_OP_GET_ROWS
- GGML_OP_RMS_NORM
- GGML_OP_SILU
- GGML_OP_SOFT_MAX

Add the necessary ggml operations GGML_OP_ADD1, GGML_OP_SILU_BACK,
GGML_OP_RMS_NORM_BACK, GGML_OP_DIAG_MASK_ZERO, and GGML_OP_ROPE_BACK.

GGML_OP_ADD1 is necessary to add a scalar value in the backward pass of
GGML_OP_SOFT_MAX. GGML_OP_ADD1 could also be replaced by GGML_OP_ADD and
GGML_OP_REPEAT, but the performance would be worse. Additionally,
GGML_OP_REPEAT returns an unexpected value when the input to
GGML_OP_SOFT_MAX contains only a single scalar: in that case it will not
return the value that should be repeated (src1) but the value whose shape
the result should take (src0). So in this case it cannot replace
GGML_OP_ADD1.

GGML_OP_SILU_BACK, GGML_OP_RMS_NORM_BACK and GGML_OP_ROPE_BACK are
necessary for the backward passes of GGML_OP_SILU, GGML_OP_RMS_NORM and
GGML_OP_ROPE. The backward pass for these functions cannot easily be
composed from existing operations. Since the backward pass itself builds a
computation graph, we need forward-pass implementations of the required
backward operations. Sounds a bit confusing at first, I know...

GGML_OP_DIAG_MASK_ZERO is necessary for the backward pass of
GGML_OP_DIAG_MASK_INF.

Some operations were previously inplace-only; the backward pass needs
non-inplace variants. To stay consistent with other operations that have
both variants, these operations are changed to non-inplace, and functions
with "_inplace" are added for the inplace behavior. In llama we call the
inplace variants so that the forward pass works as before; the llama
backward pass will need the non-inplace variants.
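For illustration, a minimal sketch of how the GGML_OP_SOFT_MAX backward pass
composes the new GGML_OP_ADD1 with existing forward ops, mirroring the graph
built in ggml_compute_backward below; x, y and dy are placeholder tensors for
the softmax input, output and incoming gradient:

    // y = ggml_soft_max(ctx, x); dy = tensor->grad
    // dx = y*(dy - sum(dy*y)), built entirely from forward ops
    struct ggml_tensor * dyy = ggml_mul(ctx, dy, y);        // element-wise dy*y
    struct ggml_tensor * s   = ggml_sum(ctx, dyy);          // scalar: sum(dy*y)
    struct ggml_tensor * dx  = ggml_mul(ctx, y,
            ggml_add1(ctx, dy, ggml_neg(ctx, s)));          // y*(dy - sum(dy*y))
    // ggml_add1 adds the scalar to every element of dy; as explained above,
    // GGML_OP_ADD + GGML_OP_REPEAT would be slower and misbehaves when the
    // softmax input is a single scalar.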
still not completely implemented backward passes for llama: - GGML_OP_ROPE: needs forward pass for GGML_OP_ROPE_BACK - GGML_OP_GET_ROWS: only necessary for tokenizer --- ggml.c | 1447 +++++++++++++++++++++++++++++++++++++++++++++-------- ggml.h | 70 ++- llama.cpp | 14 +- 3 files changed, 1315 insertions(+), 216 deletions(-) diff --git a/ggml.c b/ggml.c index 87572b0ed62d7..960a1291a34d1 100644 --- a/ggml.c +++ b/ggml.c @@ -2275,6 +2275,7 @@ inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } +inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; } inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; } inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; } inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; } @@ -3230,6 +3231,30 @@ inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) { } #endif +inline static float ggml_silu_backward_f32(float x, float dy) { + const float s = 1.0f/(1.0f + expf(-x)); + return dy*s*(1.0f + x*(1.0f - s)); +} + +#ifdef GGML_SILU_FP16 +inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) { + uint16_t t; + for (int i = 0; i < n; ++i) { + // we did not use x[i] to compute forward silu but its f16 equivalent + // take derivative at f16 of x[i]: + ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); + float usedx = GGML_FP16_TO_FP32(fp16); + dx[i] = ggml_silu_backward_f32(usedx, dy[i]); + } +} +#else +inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) { + for (int i = 0; i < n; ++i) { + dx[i] = ggml_silu_backward_f32(x[i], dy[i]); + } +} +#endif + inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) { #ifndef GGML_USE_ACCELERATE ggml_float sum = 0.0; @@ -3349,6 +3374,8 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "DUP", "ADD", + "ADD1", + "ADD_AT", "SUB", "MUL", "DIV", @@ -3364,8 +3391,10 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "RELU", "GELU", "SILU", + "SILU_BACK", "NORM", "RMS_NORM", + "RMS_NORM_BACK", "MUL_MAT", @@ -3378,8 +3407,10 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "TRANSPOSE", "GET_ROWS", "DIAG_MASK_INF", + "DIAG_MASK_ZERO", "SOFT_MAX", "ROPE", + "ROPE_BACK", "CONV_1D_1S", "CONV_1D_2S", @@ -3390,13 +3421,15 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "MAP_BINARY", }; -static_assert(GGML_OP_COUNT == 38, "GGML_OP_COUNT != 38"); +static_assert(GGML_OP_COUNT == 44, "GGML_OP_COUNT != 44"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", "x", "x+y", + "x+y", + "x[offset:]+y", "x-y", "x*y", "x/y", @@ -3412,8 +3445,10 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "relu(x)", "gelu(x)", "silu(x)", + "silu_back(x)", "norm(x)", "rms_norm(x)", + "rms_norm_back(x)", "X*Y", @@ -3426,8 +3461,10 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "transpose(x)", "get_rows(x)", "diag_mask_inf(x)", + "diag_mask_zero(x)", "soft_max(x)", 
"rope(x)", + "rope_back(x)", "conv_1d_1s(x)", "conv_1d_2s(x)", @@ -3438,7 +3475,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "f(x,y)", }; -static_assert(GGML_OP_COUNT == 38, "GGML_OP_COUNT != 38"); +static_assert(GGML_OP_COUNT == 44, "GGML_OP_COUNT != 44"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); @@ -4357,6 +4394,48 @@ struct ggml_tensor * ggml_add_inplace( return ggml_add_impl(ctx, a, b, true); } +// ggml_add1 + +struct ggml_tensor * ggml_add1_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { + GGML_ASSERT(ggml_is_scalar(b)); + GGML_ASSERT(ggml_is_padded_1d(a)); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_ADD1; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + +struct ggml_tensor * ggml_add1( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_add1_impl(ctx, a, b, false); +} + +struct ggml_tensor * ggml_add1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_add1_impl(ctx, a, b, true); +} + +// ggml_add_at + struct ggml_tensor * ggml_add_at_impl( struct ggml_context * ctx, struct ggml_tensor * a, @@ -4902,6 +4981,29 @@ struct ggml_tensor * ggml_silu_inplace( return ggml_silu_impl(ctx, a, true); } +// ggml_silu_back + +struct ggml_tensor * ggml_silu_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + bool is_node = false; + + if (a->grad || b->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_SILU_BACK; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + // ggml_norm struct ggml_tensor * ggml_norm_impl( @@ -4944,7 +5046,6 @@ struct ggml_tensor * ggml_rms_norm_impl( bool is_node = false; if (!inplace && (a->grad)) { - GGML_ASSERT(false); // TODO: implement backward is_node = true; } @@ -4970,6 +5071,28 @@ struct ggml_tensor * ggml_rms_norm_inplace( return ggml_rms_norm_impl(ctx, a, true); } +struct ggml_tensor * ggml_rms_norm_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_RMS_NORM_BACK; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + + // ggml_mul_mat struct ggml_tensor * ggml_mul_mat( @@ -5012,9 +5135,7 @@ struct ggml_tensor * ggml_scale_impl( is_node = true; } - // TODO: when implement backward, fix this: - //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - struct ggml_tensor * result = ggml_view_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); result->op = GGML_OP_SCALE; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; @@ -5420,23 +5541,60 @@ struct ggml_tensor * ggml_get_rows( // ggml_diag_mask_inf +struct ggml_tensor * ggml_diag_mask_inf_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + bool inplace) { + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_tensor * b = ggml_new_i32(ctx, n_past); + + result->op = GGML_OP_DIAG_MASK_INF; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + struct ggml_tensor * ggml_diag_mask_inf( struct ggml_context * ctx, struct ggml_tensor * a, int n_past) { + return ggml_diag_mask_inf_impl(ctx, a, n_past, false); +} + + +struct ggml_tensor * ggml_diag_mask_inf_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past) { + return ggml_diag_mask_inf_impl(ctx, a, n_past, true); +} + +// ggml_diag_mask_zero + +struct ggml_tensor * ggml_diag_mask_zero_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + bool inplace) { bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward is_node = true; } - // TODO: when implement backward, fix this: - //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - struct ggml_tensor * result = ggml_view_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); struct ggml_tensor * b = ggml_new_i32(ctx, n_past); - result->op = GGML_OP_DIAG_MASK_INF; + result->op = GGML_OP_DIAG_MASK_ZERO; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b; @@ -5444,21 +5602,33 @@ struct ggml_tensor * ggml_diag_mask_inf( return result; } +struct ggml_tensor * ggml_diag_mask_zero( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past) { + return ggml_diag_mask_zero_impl(ctx, a, n_past, false); +} + +struct ggml_tensor * ggml_diag_mask_zero_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past) { + return ggml_diag_mask_zero_impl(ctx, a, n_past, true); +} + // ggml_soft_max -struct ggml_tensor * ggml_soft_max( +struct ggml_tensor * ggml_soft_max_impl( struct ggml_context * ctx, - struct ggml_tensor * a) { + struct ggml_tensor * a, + bool inplace) { bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward is_node = true; } - // TODO: when implement backward, fix this: - //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - struct ggml_tensor * result = ggml_view_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); result->op = GGML_OP_SOFT_MAX; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; @@ -5468,14 +5638,27 @@ struct ggml_tensor * ggml_soft_max( return result; } +struct ggml_tensor * ggml_soft_max( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_soft_max_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_soft_max_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_soft_max_impl(ctx, a, true); +} + // ggml_rope -struct ggml_tensor * ggml_rope( +struct ggml_tensor * ggml_rope_impl( struct ggml_context * ctx, struct ggml_tensor * a, int n_past, int n_dims, - int mode) { + int mode, + bool inplace) { GGML_ASSERT(n_past >= 0); bool is_node = false; @@ -5484,9 +5667,7 @@ struct ggml_tensor * ggml_rope( is_node = true; } - // TODO: when implement backward, fix this: - //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - struct ggml_tensor * result = ggml_view_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3); ((int32_t *) b->data)[0] = n_past; @@ -5501,6 +5682,57 @@ struct ggml_tensor * ggml_rope( return result; } +struct ggml_tensor * ggml_rope( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode) { + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, false); +}; + +struct ggml_tensor * ggml_rope_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode) { + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, true); +}; + +// ggml_rope_back + +struct ggml_tensor * ggml_rope_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int n_past, + int n_dims, + int mode) { + GGML_ASSERT(n_past >= 0); + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + + struct ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3); + ((int32_t *) c->data)[0] = n_past; + ((int32_t *) c->data)[1] = n_dims; + ((int32_t *) c->data)[2] = mode; + + result->op = GGML_OP_ROPE_BACK; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + result->opt[0] = c; + + return result; +} + // ggml_conv_1d_1s struct ggml_tensor * ggml_conv_1d_1s( @@ -6631,22 +6863,23 @@ static void ggml_compute_forward_add( } } +// ggml_compute_forward_add1 -// ggml_compute_forward_add_at - -static void ggml_compute_forward_add_at_f32( +static void ggml_compute_forward_add1_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - struct ggml_tensor * dst, - size_t offset) { - // GGML_ASSERT(ggml_are_same_shape(src0, src1)); // TODO: assert that offset+len(src1) <= len(src1) + struct ggml_tensor * dst) { GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } + // scalar to add + const float v = *(float *) src1->data; + const int ith = params->ith; const int nth = params->nth; @@ -6656,56 +6889,42 @@ static void ggml_compute_forward_add_at_f32( const size_t nb00 = src0->nb[0]; const size_t nb01 = src0->nb[1]; - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - const size_t nb0 = dst->nb[0]; const size_t nb1 = dst->nb[1]; GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); - if (nb10 == sizeof(float)) { - for (int j = ith; j < n; j += nth) { + for (int j = ith; j < n; j += nth) { #ifdef GGML_USE_ACCELERATE - vDSP_vadd( - (float *) ((char *) src0->data + j*nb01 + offset), 1, - (float *) ((char *) src1->data + j*nb11), 1, - (float *) ((char *) dst->data + j*nb1 + offset), 1, nc); + vDSP_vadd( + (float *) ((char *) src0->data + j*nb01), 1, + (float *) ((char *) src1->data), 0, + (float *) ((char *) dst->data + j*nb1), 1, nc); #else - ggml_vec_add_f32(nc, - (float *) ((char *) dst->data + j*nb1 + offset), - (float *) ((char *) src0->data + j*nb01 + offset), - (float *) ((char *) src1->data + j*nb11)); + ggml_vec_add1_f32(nc, + (float *) ((char *) dst->data + j*nb1), + (float *) ((char *) src0->data + j*nb01), + v); #endif - } - } else { - // src1 is not contiguous - for (int j = ith; j < n; j += nth) { - float * dst_ptr = (float *) ((char *) dst->data + j*nb1 + offset); - float * src0_ptr = (float *) ((char *) src0->data + j*nb01 + offset); - for (int i = 0; i < nc; i++) { - float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10); - - dst_ptr[i] = src0_ptr[i] + *src1_ptr; - } - } } } -static void ggml_compute_forward_add_at_f16_f32( +static void ggml_compute_forward_add1_f16_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - struct ggml_tensor * dst, - size_t offset) { - // GGML_ASSERT(ggml_are_same_shape(src0, src1)); // TODO: assert that offset+len(src1) <= len(src1) + struct ggml_tensor * dst) { GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } + // scalar to add + const float v = *(float *) src1->data; + const int ith = params->ith; const int nth = params->nth; @@ -6715,9 +6934,6 @@ static void ggml_compute_forward_add_at_f16_f32( const size_t nb00 = src0->nb[0]; const size_t nb01 = src0->nb[1]; - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - const size_t nb0 = dst->nb[0]; const size_t nb1 = dst->nb[1]; @@ -6728,28 +6944,314 @@ static void ggml_compute_forward_add_at_f16_f32( GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb00 == 
sizeof(ggml_fp16_t)); - if (nb10 == sizeof(float)) { - for (int j = ith; j < n; j += nth) { - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1 + offset); - ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01 + offset); - for (int i = 0; i < nc; i++) { - float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10); - dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + *src1_ptr); - } + for (int j = ith; j < n; j += nth) { + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01); + for (int i = 0; i < nc; i++) { + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v); } } - else { - // src1 is not contiguous - GGML_ASSERT(false); - } } -static void ggml_compute_forward_add_at_f16_f16( +static void ggml_compute_forward_add1_f16_f16( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - struct ggml_tensor * dst, - size_t offset) { + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // scalar to add + const float v = GGML_FP16_TO_FP32(*(ggml_fp16_t *) src1->data); + + const int ith = params->ith; + const int nth = params->nth; + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F16); + + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + + for (int j = ith; j < n; j += nth) { + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01); + for (int i = 0; i < nc; i++) { + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v); + } + } +} + +static void ggml_compute_forward_add1_q_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // scalar to add + const float v = *(float *) src1->data; + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + //const int64_t ne10 = src1->ne[0]; + //const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + //const int64_t ne0 = dst->ne[0]; + //const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + + const int nb00 = src0->nb[0]; + const int nb01 = src0->nb[1]; + const int nb02 = src0->nb[2]; + const int nb03 = src0->nb[3]; + + const int nb0 = dst->nb[0]; + const int nb1 = dst->nb[1]; + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const int ith = params->ith; + const int nth = params->nth; + + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne03 == ne13); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + const enum ggml_type type = src0->type; + dequantize_row_q_t const dequantize_row_q = 
quantize_fns[type].dequantize_row_q; + quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q; + + // we don't support permuted src0 + GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[type]); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(ggml_is_quantized(src0->type)); + GGML_ASSERT(dst->type == src0->type); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + // total rows in src0 + const int nr = ne01*ne02*ne03; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + float * wdata = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 indices + const int i03 = ir/(ne02*ne01); + const int i02 = (ir - i03*ne02*ne01)/ne01; + const int i01 = (ir - i03*ne02*ne01 - i02*ne01); + + // dst is same shape as src0 => same indices + const int i3 = i03; + const int i2 = i02; + const int i1 = i01; + + void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); + void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb0)); + + assert(ne00 % 32 == 0); + + // unquantize row from src0 to temp buffer + dequantize_row_q(src0_row, wdata, ne00); + // add src1 + ggml_vec_acc1_f32(ne00, wdata, v); + // quantize row to dst + quantize_row_q(wdata, dst_row, ne00); + } +} + +static void ggml_compute_forward_add1( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_add1_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F16: + { + if (src1->type == GGML_TYPE_F16) { + ggml_compute_forward_add1_f16_f16(params, src0, src1, dst); + } + else if (src1->type == GGML_TYPE_F32) { + ggml_compute_forward_add1_f16_f32(params, src0, src1, dst); + } + else { + GGML_ASSERT(false); + } + } break; + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q4_2: + case GGML_TYPE_Q4_3: + { + ggml_compute_forward_add1_q_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + + +// ggml_compute_forward_add_at + +static void ggml_compute_forward_add_at_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst, + size_t offset) { + // GGML_ASSERT(ggml_are_same_shape(src0, src1)); // TODO: assert that offset+len(src1) <= len(src1) + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + + const size_t nb10 = src1->nb[0]; + const size_t nb11 = src1->nb[1]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + if (nb10 == sizeof(float)) { + for (int j = ith; j < n; j += nth) { +#ifdef GGML_USE_ACCELERATE + vDSP_vadd( + (float *) ((char *) src0->data + j*nb01 + offset), 1, + (float *) ((char *) src1->data + j*nb11), 1, + (float *) ((char *) dst->data + j*nb1 + offset), 1, nc); +#else + ggml_vec_add_f32(nc, + (float *) ((char *) dst->data + j*nb1 + offset), + 
(float *) ((char *) src0->data + j*nb01 + offset), + (float *) ((char *) src1->data + j*nb11)); +#endif + } + } else { + // src1 is not contiguous + for (int j = ith; j < n; j += nth) { + float * dst_ptr = (float *) ((char *) dst->data + j*nb1 + offset); + float * src0_ptr = (float *) ((char *) src0->data + j*nb01 + offset); + for (int i = 0; i < nc; i++) { + float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10); + + dst_ptr[i] = src0_ptr[i] + *src1_ptr; + } + } + } +} + +static void ggml_compute_forward_add_at_f16_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst, + size_t offset) { + // GGML_ASSERT(ggml_are_same_shape(src0, src1)); // TODO: assert that offset+len(src1) <= len(src1) + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + + const size_t nb10 = src1->nb[0]; + const size_t nb11 = src1->nb[1]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F16); + + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + + if (nb10 == sizeof(float)) { + for (int j = ith; j < n; j += nth) { + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1 + offset); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01 + offset); + for (int i = 0; i < nc; i++) { + float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10); + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + *src1_ptr); + } + } + } + else { + // src1 is not contiguous + GGML_ASSERT(false); + } +} + +static void ggml_compute_forward_add_at_f16_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst, + size_t offset) { // GGML_ASSERT(ggml_are_same_shape(src0, src1)); // TODO: assert that offset+len(src1) <= len(src1) GGML_ASSERT(ggml_are_same_shape(src0, dst)); @@ -7677,9 +8179,150 @@ static void ggml_compute_forward_silu( } +// ggml_compute_forward_silu_back + +static void ggml_compute_forward_silu_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * grad, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(grad)); + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_are_same_shape(src0, grad)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_vec_silu_backward_f32(nc, + (float *) ((char *) dst->data + i1*( dst->nb[1])), + (float *) ((char *) src0->data + i1*(src0->nb[1])), + (float *) ((char *) grad->data + i1*(grad->nb[1]))); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + 
const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +static void ggml_compute_forward_silu_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * grad, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_silu_back_f32(params, grad, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_norm -static void ggml_compute_forward_norm_f32( +static void ggml_compute_forward_norm_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; + + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + const float eps = 1e-5f; // TODO: make this a parameter + + // TODO: optimize + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ith; i01 < ne01; i01 += nth) { // i think this must not be threaded, because we need mean over all x + const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + + ggml_float sum = 0.0; + for (int64_t i00 = 0; i00 < ne00; i00++) { + sum += (ggml_float)x[i00]; + } + + float mean = sum/ne00; + + float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); + + ggml_float sum2 = 0.0; + for (int64_t i00 = 0; i00 < ne00; i00++) { + float v = x[i00] - mean; + y[i00] = v; + sum2 += (ggml_float)(v*v); + } + + float variance = sum2/ne00; + const float scale = 1.0f/sqrtf(variance + eps); + + ggml_vec_scale_f32(ne00, y, scale); + } + } + } +} + +static void ggml_compute_forward_norm( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_norm_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +static void ggml_compute_forward_rms_norm_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { @@ -7707,32 +8350,29 @@ static void ggml_compute_forward_norm_f32( const size_t nb2 = dst->nb[2]; const size_t nb3 = dst->nb[3]; - const float eps = 1e-5f; // TODO: make this a parameter + const float eps = 1e-6f; // TODO: make this a parameter // TODO: optimize for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = ith; i01 < ne01; i01 += nth) { + for (int64_t i01 = ith; i01 < ne01; i01 += nth) { // i think this must not be threaded, because we need mean over all x*x const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); ggml_float sum = 0.0; for (int64_t i00 = 0; i00 < ne00; i00++) { - sum += (ggml_float)x[i00]; + sum += (ggml_float)(x[i00] * x[i00]); } float mean = sum/ne00; float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); - ggml_float 
sum2 = 0.0; - for (int64_t i00 = 0; i00 < ne00; i00++) { - float v = x[i00] - mean; - y[i00] = v; - sum2 += (ggml_float)(v*v); - } + memcpy(y, x, ne00 * sizeof(float)); + // for (int i00 = 0; i00 < ne00; i00++) { + // y[i00] = x[i00]; + // } - float variance = sum2/ne00; - const float scale = 1.0f/sqrtf(variance + eps); + const float scale = 1.0f/sqrtf(mean + eps); ggml_vec_scale_f32(ne00, y, scale); } @@ -7740,14 +8380,14 @@ static void ggml_compute_forward_norm_f32( } } -static void ggml_compute_forward_norm( +static void ggml_compute_forward_rms_norm( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_norm_f32(params, src0, dst); + ggml_compute_forward_rms_norm_f32(params, src0, dst); } break; default: { @@ -7756,11 +8396,13 @@ static void ggml_compute_forward_norm( } } -static void ggml_compute_forward_rms_norm_f32( + +static void ggml_compute_forward_rms_norm_back_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, + const struct ggml_tensor * src1, struct ggml_tensor * dst) { - GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; @@ -7780,6 +8422,10 @@ static void ggml_compute_forward_rms_norm_f32( const size_t nb02 = src0->nb[2]; const size_t nb03 = src0->nb[3]; + const size_t nb11 = src1->nb[1]; + const size_t nb12 = src1->nb[2]; + const size_t nb13 = src1->nb[3]; + const size_t nb1 = dst->nb[1]; const size_t nb2 = dst->nb[2]; const size_t nb3 = dst->nb[3]; @@ -7789,39 +8435,54 @@ static void ggml_compute_forward_rms_norm_f32( // TODO: optimize for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = ith; i01 < ne01; i01 += nth) { + for (int64_t i01 = ith; i01 < ne01; i01 += nth) { // i think this must not be threaded, because we need mean over all x*x + // src1 is same shape as src0 => same indices + const auto i11 = i01; + const auto i12 = i02; + const auto i13 = i03; const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + const float * dy = (float *) ((char *) src1->data + /*TODO*/ i11*nb11 + i12*nb12 + i13*nb13); ggml_float sum = 0.0; for (int64_t i00 = 0; i00 < ne00; i00++) { sum += (ggml_float)(x[i00] * x[i00]); } - float mean = sum/ne00; - - float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); - - memcpy(y, x, ne00 * sizeof(float)); - // for (int i00 = 0; i00 < ne00; i00++) { - // y[i00] = x[i00]; - // } - - const float scale = 1.0f/sqrtf(mean + eps); - - ggml_vec_scale_f32(ne00, y, scale); + const float mean = sum/ne00; + const float mean_eps = sum/ne00 + eps; + // we could cache rms from forward pass to improve performance. + // to do this implement ggml_rms and compose ggml_rms_norm using ggml_rms. 
+ const float rms = sqrtf(mean_eps); + const float rrms = 1.0f / sqrtf(mean_eps); + const float scale = -rrms/(ne00 * mean_eps); // -1/(n*rms**3) + + // rms(x) = sqrt(eps + mean(square(x))) ; scalar + // y = rms_norm(x) = x/rms(x) = x/sqrt(eps+mean(square(x))) ; vector + // dx = dy*(1/rms(x) - square(x)/(n*rms(x)**3)) + + float * dx = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); + // square(x) + ggml_vec_mul_f32(ne00, dx, x, x); + // -square(x)/(n*rms**3) + ggml_vec_scale_f32(ne00, dx, scale); + // 1/rms(x) - square(x)/(n*rms(x)**3) + ggml_vec_acc1_f32(ne00, dx, rrms); + // dy*(1/rms(x) - square(x)/(n*rms(x)**3)) + ggml_vec_mul_f32(ne00, dx, dx, dy); } } } } -static void ggml_compute_forward_rms_norm( +static void ggml_compute_forward_rms_norm_back( const struct ggml_compute_params * params, const struct ggml_tensor * src0, + const struct ggml_tensor * src1, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_rms_norm_f32(params, src0, dst); + ggml_compute_forward_rms_norm_back_f32(params, src0, src1, dst); } break; default: { @@ -8624,8 +9285,17 @@ static void ggml_compute_forward_scale_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); + const size_t nb01 = src0->nb[1]; + + const size_t nb1 = dst->nb[1]; + + for (int i1 = ir0; i1 < ir1; i1++) { - ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), v); + if (dst->data != src0->data) { + // src0 is same shape as dst => same indices + memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float)); + } + ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), v); } } @@ -8841,79 +9511,292 @@ static void ggml_compute_forward_get_rows( // ggml_compute_forward_diag_mask_inf -static void ggml_compute_forward_diag_mask_inf_f32( +static void ggml_compute_forward_diag_mask_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - struct ggml_tensor * dst) { + struct ggml_tensor * dst, + const float value) { assert(params->ith == 0); assert(src1->type == GGML_TYPE_I32); assert(ggml_nelements(src1) == 1); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n_past = ((int32_t *) src1->data)[0]; + + // TODO: handle transposed/permuted matrices + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + const int nr = src0->ne[1]; + const int nz = n/nr; + + assert( dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int k = 0; k < nz; k++) { + for (int j = 0; j < nr; j++) { + for (int i = n_past; i < nc; i++) { + if (i > n_past + j) { + *(float *)((char *) dst->data + k*dst->nb[2] + j*dst->nb[1] + i*dst->nb[0]) = value; + } + } + } + } +} + +static void ggml_compute_forward_diag_mask_inf( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_diag_mask_f32(params, src0, src1, dst, -INFINITY); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +static void ggml_compute_forward_diag_mask_zero( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + 
ggml_compute_forward_diag_mask_f32(params, src0, src1, dst, 0); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_soft_max + +static void ggml_compute_forward_soft_max_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // TODO: handle transposed/permuted matrices + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float *sp = (float *)((char *) src0->data + i1*src0->nb[1]); + float *dp = (float *)((char *) dst->data + i1*dst->nb[1]); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + //printf("p[%d] = %f\n", i, p[i]); + assert(!isnan(sp[i])); + } +#endif + + float max = -INFINITY; + ggml_vec_max_f32(nc, &max, sp); + + ggml_float sum = 0.0; + + uint16_t scvt; + for (int i = 0; i < nc; i++) { + if (sp[i] == -INFINITY) { + dp[i] = 0.0f; + } else { + //const float val = (p[i] == -INFINITY) ? 0.0 : exp(p[i] - max); + ggml_fp16_t s = GGML_FP32_TO_FP16(sp[i] - max); + memcpy(&scvt, &s, sizeof(scvt)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); + sum += (ggml_float)val; + dp[i] = val; + } + } + + assert(sum > 0.0); + + sum = 1.0/sum; + ggml_vec_scale_f32(nc, dp, sum); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + assert(!isnan(dp[i])); + assert(!isinf(dp[i])); + } +#endif + } +} + +static void ggml_compute_forward_soft_max( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_soft_max_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_rope + +static void ggml_compute_forward_rope_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + assert(src1->type == GGML_TYPE_I32); + assert(ggml_nelements(src1) == 3); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n_past = ((int32_t *) src1->data)[0]; + const int n_dims = ((int32_t *) src1->data)[1]; + const int mode = ((int32_t *) src1->data)[2]; + + //const int64_t ne0 = src0->ne[0]; + const int64_t ne1 = src0->ne[1]; + const int64_t ne2 = src0->ne[2]; + const int64_t ne3 = src0->ne[3]; + + const int nb0 = src0->nb[0]; + const int nb1 = src0->nb[1]; + const int nb2 = src0->nb[2]; + const int nb3 = src0->nb[3]; + + //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); + //printf("n_past = %d, ne2 = %d\n", n_past, ne2); + + assert(nb0 == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + // row index used to determine which thread to use + int ir = 0; + + const float theta_scale = powf(10000.0, -2.0f/n_dims); + + const bool is_neox = mode & 2; + + for 
(int64_t i3 = 0; i3 < ne3; i3++) { + for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { + const int p = ((mode & 1) == 0 ? n_past + i2 : i2); + for (int64_t i1 = 0; i1 < ne1; i1++) { + if (ir++ < ir0) continue; + if (ir > ir1) break; + + float theta = (float)p; - const int n_past = ((int32_t *) src1->data)[0]; + for (int i0 = 0; i0 < n_dims; i0 += 2) { + const float cos_theta = cosf(theta); + const float sin_theta = sinf(theta); - // TODO: handle transposed/permuted matrices + theta *= theta_scale; - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - const int nr = src0->ne[1]; - const int nz = n/nr; + if (!is_neox) { + const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - assert( dst->nb[0] == sizeof(float)); - assert(src0->nb[0] == sizeof(float)); + const float x0 = src[0]; + const float x1 = src[1]; - for (int k = 0; k < nz; k++) { - for (int j = 0; j < nr; j++) { - for (int i = n_past; i < nc; i++) { - if (i > n_past + j) { - *(float *)((char *) dst->data + k*dst->nb[2] + j*dst->nb[1] + i*dst->nb[0]) = -INFINITY; + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[1] = x0*sin_theta + x1*cos_theta; + } else { + const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0); + + const float x0 = src[0]; + const float x1 = src[n_dims/2]; + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; + } } } } } } -static void ggml_compute_forward_diag_mask_inf( +static void ggml_compute_forward_rope_f16( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_diag_mask_inf_f32(params, src0, src1, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -// ggml_compute_forward_soft_max - -static void ggml_compute_forward_soft_max_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); - GGML_ASSERT(ggml_is_contiguous(dst)); - GGML_ASSERT(ggml_are_same_shape(src0, dst)); + assert(src1->type == GGML_TYPE_I32); + assert(ggml_nelements(src1) == 3); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - // TODO: handle transposed/permuted matrices + const int n_past = ((int32_t *) src1->data)[0]; + const int n_dims = ((int32_t *) src1->data)[1]; + const int mode = ((int32_t *) src1->data)[2]; + + //const int64_t ne0 = src0->ne[0]; + const int64_t ne1 = src0->ne[1]; + const int64_t ne2 = src0->ne[2]; + const int64_t ne3 = src0->ne[3]; + + const int nb0 = src0->nb[0]; + const int nb1 = src0->nb[1]; + const int nb2 = src0->nb[2]; + const int nb3 = src0->nb[3]; + + //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); + //printf("n_past = %d, ne2 = %d\n", n_past, ne2); + + assert(nb0 == sizeof(ggml_fp16_t)); const int ith = params->ith; const int nth = params->nth; - const int nc = src0->ne[0]; const int nr = ggml_nrows(src0); // rows per thread @@ -8923,58 +9806,66 @@ static void ggml_compute_forward_soft_max_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - for (int i1 = ir0; i1 < ir1; i1++) { - float *sp = (float *)((char *) src0->data + 
i1*src0->nb[1]); - float *dp = (float *)((char *) dst->data + i1*dst->nb[1]); + // row index used to determine which thread to use + int ir = 0; -#ifndef NDEBUG - for (int i = 0; i < nc; ++i) { - //printf("p[%d] = %f\n", i, p[i]); - assert(!isnan(sp[i])); - } -#endif + const float theta_scale = powf(10000.0, -2.0f/n_dims); - float max = -INFINITY; - ggml_vec_max_f32(nc, &max, sp); + const bool is_neox = mode & 2; - ggml_float sum = 0.0; + for (int64_t i3 = 0; i3 < ne3; i3++) { + for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { + const int p = ((mode & 1) == 0 ? n_past + i2 : i2); + for (int64_t i1 = 0; i1 < ne1; i1++) { + if (ir++ < ir0) continue; + if (ir > ir1) break; - uint16_t scvt; - for (int i = 0; i < nc; i++) { - if (sp[i] == -INFINITY) { - dp[i] = 0.0f; - } else { - //const float val = (p[i] == -INFINITY) ? 0.0 : exp(p[i] - max); - ggml_fp16_t s = GGML_FP32_TO_FP16(sp[i] - max); - memcpy(&scvt, &s, sizeof(scvt)); - const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); - sum += (ggml_float)val; - dp[i] = val; - } - } + float theta = (float)p; - assert(sum > 0.0); + for (int i0 = 0; i0 < n_dims; i0 += 2) { + const float cos_theta = cosf(theta); + const float sin_theta = sinf(theta); - sum = 1.0/sum; - ggml_vec_scale_f32(nc, dp, sum); + theta *= theta_scale; -#ifndef NDEBUG - for (int i = 0; i < nc; ++i) { - assert(!isnan(dp[i])); - assert(!isinf(dp[i])); + if (!is_neox) { + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = GGML_FP16_TO_FP32(src[0]); + const float x1 = GGML_FP16_TO_FP32(src[1]); + + dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + } else { + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0); + ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0); + + const float x0 = GGML_FP16_TO_FP32(src[0]); + const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]); + + dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + } + } + } } -#endif } } -static void ggml_compute_forward_soft_max( +static void ggml_compute_forward_rope( const struct ggml_compute_params * params, const struct ggml_tensor * src0, + const struct ggml_tensor * src1, struct ggml_tensor * dst) { switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_rope_f16(params, src0, src1, dst); + } break; case GGML_TYPE_F32: { - ggml_compute_forward_soft_max_f32(params, src0, dst); + ggml_compute_forward_rope_f32(params, src0, src1, dst); } break; default: { @@ -8983,23 +9874,28 @@ static void ggml_compute_forward_soft_max( } } -// ggml_compute_forward_rope +// ggml_compute_forward_rope_back -static void ggml_compute_forward_rope_f32( +static void ggml_compute_forward_rope_back_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, + const struct ggml_tensor * opt, struct ggml_tensor * dst) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 3); + assert(opt->type == GGML_TYPE_I32); + assert(ggml_nelements(opt) == 3); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = 
((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; + // y = rope(x, opt) + // dx = rope_back(x, dy) + // src0 is x, src1 is dy + + const int n_past = ((int32_t *) opt->data)[0]; + const int n_dims = ((int32_t *) opt->data)[1]; + const int mode = ((int32_t *) opt->data)[2]; //const int64_t ne0 = src0->ne[0]; const int64_t ne1 = src0->ne[1]; @@ -9035,6 +9931,9 @@ static void ggml_compute_forward_rope_f32( const bool is_neox = mode & 2; + // TODO + GGML_ASSERT(false); + //* for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { const int p = ((mode & 1) == 0 ? n_past + i2 : i2); @@ -9073,23 +9972,25 @@ static void ggml_compute_forward_rope_f32( } } } + //*/ } -static void ggml_compute_forward_rope_f16( +static void ggml_compute_forward_rope_back_f16( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, + const struct ggml_tensor * opt, struct ggml_tensor * dst) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 3); + assert(opt->type == GGML_TYPE_I32); + assert(ggml_nelements(opt) == 3); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; + const int n_past = ((int32_t *) opt->data)[0]; + const int n_dims = ((int32_t *) opt->data)[1]; + const int mode = ((int32_t *) opt->data)[2]; //const int64_t ne0 = src0->ne[0]; const int64_t ne1 = src0->ne[1]; @@ -9125,6 +10026,9 @@ static void ggml_compute_forward_rope_f16( const bool is_neox = mode & 2; + // TODO + GGML_ASSERT(false); + /* for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { const int p = ((mode & 1) == 0 ? 
n_past + i2 : i2); @@ -9163,21 +10067,23 @@ static void ggml_compute_forward_rope_f16( } } } + */ } -static void ggml_compute_forward_rope( +static void ggml_compute_forward_rope_back( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, + const struct ggml_tensor * opt, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_rope_f16(params, src0, src1, dst); + ggml_compute_forward_rope_back_f16(params, src0, src1, opt, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_rope_f32(params, src0, src1, dst); + ggml_compute_forward_rope_back_f32(params, src0, src1, opt, dst); } break; default: { @@ -10505,6 +11411,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_add(params, tensor->src0, tensor->src1, tensor); } break; + case GGML_OP_ADD1: + { + ggml_compute_forward_add1(params, tensor->src0, tensor->src1, tensor); + } break; case GGML_OP_ADD_AT: { ggml_compute_forward_add_at(params, tensor->src0, tensor->src1, tensor); @@ -10569,6 +11479,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_silu(params, tensor->src0, tensor); } break; + case GGML_OP_SILU_BACK: + { + ggml_compute_forward_silu_back(params, tensor->src0, tensor->src1, tensor); + } break; case GGML_OP_NORM: { ggml_compute_forward_norm(params, tensor->src0, tensor); @@ -10577,6 +11491,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_rms_norm(params, tensor->src0, tensor); } break; + case GGML_OP_RMS_NORM_BACK: + { + ggml_compute_forward_rms_norm_back(params, tensor->src0, tensor->src1, tensor); + } break; case GGML_OP_MUL_MAT: { ggml_compute_forward_mul_mat(params, tensor->src0, tensor->src1, tensor); @@ -10617,6 +11535,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_diag_mask_inf(params, tensor->src0, tensor->src1, tensor); } break; + case GGML_OP_DIAG_MASK_ZERO: + { + ggml_compute_forward_diag_mask_zero(params, tensor->src0, tensor->src1, tensor); + } break; case GGML_OP_SOFT_MAX: { ggml_compute_forward_soft_max(params, tensor->src0, tensor); @@ -10625,6 +11547,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_rope(params, tensor->src0, tensor->src1, tensor); } break; + case GGML_OP_ROPE_BACK: + { + ggml_compute_forward_rope_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + } break; case GGML_OP_CONV_1D_1S: { ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor); @@ -10689,6 +11615,18 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src1->grad = ggml_add_impl(ctx, src1->grad, tensor->grad, inplace); } } break; + case GGML_OP_ADD1: + { + if (src0->grad) { + src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + } + if (src1->grad) { + src1->grad = ggml_add_impl(ctx, + src1->grad, + ggml_mean(ctx, tensor->grad), + inplace); + } + } break; case GGML_OP_ADD_AT: { if (src0->grad) { @@ -10774,9 +11712,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src0->grad = ggml_add_impl(ctx, src0->grad, - ggml_div(ctx, - ggml_repeat(ctx, ggml_new_f32(ctx, 0.5f), tensor), - tensor), + ggml_mul(ctx, + tensor->grad, // this was not catched by test_grad because in test_grad tensor->grad is 1 + ggml_div(ctx, + ggml_repeat(ctx, 
ggml_new_f32(ctx, 0.5f), tensor), + tensor)), inplace); } } break; @@ -10852,6 +11792,15 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor case GGML_OP_SILU: { // necessary for llama + if (src0->grad) { + src0->grad = ggml_add_impl(ctx, + src0->grad, + ggml_silu_back(ctx, src0, tensor->grad), + inplace); + } + } break; + case GGML_OP_SILU_BACK: + { GGML_ASSERT(false); // TODO: not implemented } break; case GGML_OP_NORM: @@ -10861,6 +11810,15 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor case GGML_OP_RMS_NORM: { // necessary for llama + if (src0->grad) { + src0->grad = ggml_add_impl(ctx, + src0->grad, + ggml_rms_norm_back(ctx, src0, tensor->grad), + inplace); + } + } break; + case GGML_OP_RMS_NORM_BACK: + { GGML_ASSERT(false); // TODO: not implemented } break; case GGML_OP_MUL_MAT: @@ -11000,22 +11958,88 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_GET_ROWS: { - // necessary for llama + // necessary for llama (only for tokenizer) GGML_ASSERT(false); // TODO: not implemented } break; case GGML_OP_DIAG_MASK_INF: { // necessary for llama - GGML_ASSERT(false); // TODO: not implemented + if (src0->grad) { + assert(src1->type == GGML_TYPE_I32); + assert(ggml_nelements(src1) == 1); + const int n_past = ((int32_t *) src1->data)[0]; + src0->grad = + ggml_add_impl(ctx, src0->grad, + ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), + inplace); + } + if (src1->grad) { + // noop + } + } break; + case GGML_OP_DIAG_MASK_ZERO: + { + // necessary for llama + if (src0->grad) { + assert(src1->type == GGML_TYPE_I32); + assert(ggml_nelements(src1) == 1); + const int n_past = ((int32_t *) src1->data)[0]; + src0->grad = + ggml_add_impl(ctx, src0->grad, + ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), + inplace); + } + if (src1->grad) { + // noop + } } break; case GGML_OP_SOFT_MAX: { // necessary for llama - GGML_ASSERT(false); // TODO: not implemented + if (src0->grad) { + // y = softmax(x) + // dx = dy * y - sum(dy * y) * y + // dx = y * (dy - sum(dy * y)) + src0->grad = + ggml_add_impl(ctx, + src0->grad, + ggml_mul(ctx, + tensor, + ggml_add1(ctx, + tensor->grad, + ggml_neg(ctx, + ggml_sum(ctx, + ggml_mul(ctx, + tensor->grad, + tensor))))), + inplace); + } } break; case GGML_OP_ROPE: { // necessary for llama + if (src0->grad) { + assert(src1->type == GGML_TYPE_I32); + assert(ggml_nelements(src1) == 3); + const int n_past = ((int32_t *) src1->data)[0]; + const int n_dims = ((int32_t *) src1->data)[1]; + const int mode = ((int32_t *) src1->data)[2]; + src0->grad = ggml_sub_impl(ctx, + src0->grad, + ggml_rope_back(ctx, + src0, + tensor->grad, + n_past, + n_dims, + mode), + inplace); + } + if (src1->grad) { + // noop + } + } break; + case GGML_OP_ROPE_BACK: + { GGML_ASSERT(false); // TODO: not implemented } break; case GGML_OP_CONV_1D_1S: @@ -11370,6 +12394,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) work_size = MAX(work_size, cur); } break; case GGML_OP_ADD: + case GGML_OP_ADD1: { node->n_tasks = n_threads; @@ -11417,9 +12442,15 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { node->n_tasks = n_threads; } break; + case GGML_OP_SILU_BACK: + { + node->n_tasks = n_threads; + } break; case GGML_OP_NORM: case GGML_OP_RMS_NORM: + case GGML_OP_RMS_NORM_BACK: { + // i think this must not be threaded, because we need mean over all items, not just the slices each thread sees node->n_tasks = 
n_threads; } break; case GGML_OP_MUL_MAT: @@ -11482,6 +12513,10 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { node->n_tasks = 1; } break; + case GGML_OP_DIAG_MASK_ZERO: + { + node->n_tasks = 1; + } break; case GGML_OP_SOFT_MAX: { node->n_tasks = n_threads; diff --git a/ggml.h b/ggml.h index fa8f90f505e96..1fd3bff11237b 100644 --- a/ggml.h +++ b/ggml.h @@ -219,6 +219,7 @@ enum ggml_op { GGML_OP_DUP, GGML_OP_ADD, + GGML_OP_ADD1, GGML_OP_ADD_AT, GGML_OP_SUB, GGML_OP_MUL, @@ -235,8 +236,10 @@ enum ggml_op { GGML_OP_RELU, GGML_OP_GELU, GGML_OP_SILU, + GGML_OP_SILU_BACK, GGML_OP_NORM, // normalize GGML_OP_RMS_NORM, + GGML_OP_RMS_NORM_BACK, GGML_OP_MUL_MAT, @@ -249,8 +252,10 @@ enum ggml_op { GGML_OP_TRANSPOSE, GGML_OP_GET_ROWS, GGML_OP_DIAG_MASK_INF, + GGML_OP_DIAG_MASK_ZERO, GGML_OP_SOFT_MAX, GGML_OP_ROPE, + GGML_OP_ROPE_BACK, GGML_OP_CONV_1D_1S, GGML_OP_CONV_1D_2S, @@ -441,6 +446,11 @@ struct ggml_tensor * ggml_add_inplace( struct ggml_tensor * a, struct ggml_tensor * b); +struct ggml_tensor * ggml_add1( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + struct ggml_tensor * ggml_add_at( struct ggml_context * ctx, struct ggml_tensor * a, @@ -523,6 +533,11 @@ struct ggml_tensor * ggml_silu( struct ggml_context * ctx, struct ggml_tensor * a); +struct ggml_tensor * ggml_silu_back( + struct ggml_context * ctx, + struct ggml_tensor * x, + struct ggml_tensor * dy); + // normalize along rows // TODO: eps is hardcoded to 1e-5 for now struct ggml_tensor * ggml_norm( @@ -533,6 +548,12 @@ struct ggml_tensor * ggml_rms_norm( struct ggml_context * ctx, struct ggml_tensor * a); + +struct ggml_tensor * ggml_rms_norm_back( + struct ggml_context * ctx, + struct ggml_tensor * x, + struct ggml_tensor * dy); + // A: m rows, n columns // B: p rows, n columns (i.e. 
we transpose it internally) // result is m columns, p rows @@ -545,12 +566,17 @@ struct ggml_tensor * ggml_mul_mat( // operations on tensors without backpropagation // -// in-place, returns view(a) struct ggml_tensor * ggml_scale( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b); +// in-place, returns view(a) +struct ggml_tensor * ggml_scale_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + // a -> b, return view(b) struct ggml_tensor * ggml_cpy( struct ggml_context * ctx, @@ -630,19 +656,39 @@ struct ggml_tensor * ggml_get_rows( struct ggml_tensor * b); // set elements above the diagonal to -INF -// in-place, returns view(a) struct ggml_tensor * ggml_diag_mask_inf( struct ggml_context * ctx, struct ggml_tensor * a, int n_past); // in-place, returns view(a) +struct ggml_tensor * ggml_diag_mask_inf_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + +// set elements above the diagonal to 0 +struct ggml_tensor * ggml_diag_mask_zero( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + +// in-place, returns view(a) +struct ggml_tensor * ggml_diag_mask_zero_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + struct ggml_tensor * ggml_soft_max( struct ggml_context * ctx, struct ggml_tensor * a); -// rotary position embedding // in-place, returns view(a) +struct ggml_tensor * ggml_soft_max_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + +// rotary position embedding // if mode & 1 == 1, skip n_past elements // if mode & 2 == 1, GPT-NeoX style // TODO: avoid creating a new tensor every time @@ -651,6 +697,24 @@ struct ggml_tensor * ggml_rope( struct ggml_tensor * a, int n_past, int n_dims, + int mode, + bool inplace); + +// in-place, returns view(a) +struct ggml_tensor * ggml_rope_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode); + +// rotary position embedding backward, i.e. compute dx +struct ggml_tensor * ggml_rope_back( + struct ggml_context * ctx, + struct ggml_tensor * x, + struct ggml_tensor * dy, + int n_past, + int n_dims, int mode); // padding = 1 diff --git a/llama.cpp b/llama.cpp index e83fc5507b0f3..72eea074da349 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1103,8 +1103,8 @@ static bool llama_eval_internal( // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); - struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); // store key and value to memory { @@ -1138,15 +1138,15 @@ static bool llama_eval_internal( // KQ_scaled = KQ / sqrt(n_embd/n_head) struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, + ggml_scale_inplace(ctx0, KQ, ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); //
KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); // split cached V into n_head heads struct ggml_tensor * V = @@ -1244,7 +1244,7 @@ static bool llama_eval_internal( lctx.use_buf(ctx0, -1); // logits -> probs - //inpL = ggml_soft_max(ctx0, inpL); + //inpL = ggml_soft_max_inplace(ctx0, inpL); // run the computation ggml_build_forward_expand(&gf, inpL); @@ -2022,7 +2022,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * if (scaling != 1.0f) { ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); - BA = ggml_scale(lora_ctx, BA, scale_tensor); + BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor); } ggml_tensor * r;
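
Note on the GGML_OP_SOFT_MAX backward case added above: it relies on the identity dx = y * (dy - sum(dy * y)) for y = softmax(x), which is what the ggml_mul/ggml_add1/ggml_neg/ggml_sum composition builds in the graph. Below is a small standalone C sketch, not part of the patch and independent of the ggml API, that checks this identity against a central finite difference; the helper name, N, and the test values are illustrative only.

// softmax_grad_check.c — verify dx = y * (dy - sum(dy * y)) numerically
// build with: cc softmax_grad_check.c -lm
#include <math.h>
#include <stdio.h>

#define N 4

// numerically stable softmax over n values
static void softmax(const float * x, float * y, int n) {
    float max = x[0];
    for (int i = 1; i < n; ++i) if (x[i] > max) max = x[i];
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) { y[i] = expf(x[i] - max); sum += y[i]; }
    for (int i = 0; i < n; ++i) y[i] /= sum;
}

int main(void) {
    const float x[N]  = { 0.5f, -1.0f,  2.0f, 0.0f }; // arbitrary input
    const float dy[N] = { 1.0f,  0.5f, -0.25f, 2.0f }; // arbitrary upstream gradient
    float y[N];
    softmax(x, y, N);

    // analytic backward: dx = y * (dy - sum(dy * y))
    float dot = 0.0f;
    for (int i = 0; i < N; ++i) dot += dy[i] * y[i];
    float dx[N];
    for (int i = 0; i < N; ++i) dx[i] = y[i] * (dy[i] - dot);

    // central finite difference of sum_j dy[j] * softmax(x)[j] with respect to x[i]
    const float eps = 1e-3f;
    for (int i = 0; i < N; ++i) {
        float xp[N], xm[N], yp[N], ym[N];
        for (int j = 0; j < N; ++j) { xp[j] = x[j]; xm[j] = x[j]; }
        xp[i] += eps; xm[i] -= eps;
        softmax(xp, yp, N);
        softmax(xm, ym, N);
        float num = 0.0f;
        for (int j = 0; j < N; ++j) num += dy[j] * (yp[j] - ym[j]) / (2.0f * eps);
        printf("i=%d analytic=% .6f numeric=% .6f\n", i, dx[i], num);
    }
    return 0;
}

The analytic and numeric columns should agree to a few decimal places; this is the same kind of finite-difference comparison a graph-level gradient test would perform, just done here for the scalar formula rather than through ggml tensors.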