From 580fe2064cc439a588c56b791a2ecbe07d35bcba Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 29 Nov 2023 17:30:19 +0200
Subject: [PATCH] metal : simplify soft_max encoding

ggml-ci
---
 ggml-metal.m | 7 +------
 llama.cpp    | 2 +-
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/ggml-metal.m b/ggml-metal.m
index 0b468bea027a4..58149a487559f 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -1040,12 +1040,7 @@ void ggml_metal_graph_compute(
                         const float scale = ((float *) dst->op_params)[0];
 
                         [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                        if (id_src1) {
-                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                        } else {
-                            [encoder setBuffer:nil offset:0 atIndex:1];
-                        }
-
+                        [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                         [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
                         [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
                         [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
diff --git a/llama.cpp b/llama.cpp
index 2c13aeb5091c5..7b261b73e2210 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3705,8 +3705,8 @@ static struct ggml_tensor * llm_build_kqv(
     struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
     cb(kq, "kq", il);
 
-    // TODO: !!!!!!!!!
     if (max_alibi_bias > 0.0f) {
+        // temporary branch until we figure out how to handle ggml_alibi through ggml_add
        kq = ggml_scale(ctx, kq, kq_scale);
        cb(kq, "kq_scaled", il);
 