
[pull] master from ggerganov:master #165

Closed
wants to merge 72 commits
Changes from 1 commit
72 commits
c0d6f79
SYCL: Use get_multi_ptr instead of deprecated get_pointer in wkv6 (#1…
qnixsynapse Jan 7, 2025
a4dd490
rpc : code cleanup (#11107)
rgerganov Jan 7, 2025
a3d50bc
ggml-backend : only offload from host buffers (#11120)
slaren Jan 7, 2025
017cc5f
ggml-backend : only offload from host buffers (fix) (#11124)
slaren Jan 7, 2025
53ff6b9
GGUF: C++ refactor, backend support, misc fixes (#11030)
JohannesGaessler Jan 7, 2025
bec2183
fix: Vulkan shader gen binary path when Cross-compiling (#11096)
ag2s20150909 Jan 8, 2025
02f0430
Disable GL_KHR_cooperative_matrix Vulkan extension if not available. …
mbaudier Jan 8, 2025
0d52a69
ci : fix cmake option (#11125)
ggerganov Jan 8, 2025
8cef75c
llamafile : ppc64le MMA INT8 implementation (#10912)
amritahs-ibm Jan 8, 2025
a3c1232
arg : option to exclude arguments from specific examples (#11136)
ggerganov Jan 8, 2025
80ccf5d
ci : pin dependency to specific version (#11137)
ngxson Jan 8, 2025
c792dcf
ggml : allow loading backend with env variable (ggml/1059)
rgerganov Jan 5, 2025
99a3755
sync : ggml
ggerganov Jan 8, 2025
c07d437
llama : avoid hardcoded QK_K (#11061)
ggerganov Jan 8, 2025
4d2b3d8
lora : improve compat with `mergekit-extract-lora` (#11131)
ngxson Jan 8, 2025
f7cd133
ci : use actions from ggml-org (#11140)
ngxson Jan 8, 2025
1bf839b
Enhance user input handling for llama-run (#11138)
ericcurtin Jan 8, 2025
8a1d9c2
gguf-py : move scripts directory (#11116)
VJHack Jan 8, 2025
8d59d91
fix: add missing msg in static_assert (#11143)
hydai Jan 8, 2025
d9feae1
llama-chat : add phi 4 template (#11148)
ngxson Jan 9, 2025
be0e950
media : remove old img [no ci]
ggerganov Jan 9, 2025
f8feb4b
model: Add support for PhiMoE arch (#11003)
phymbert Jan 9, 2025
8eceb88
server : add tooltips to settings and themes btn (#11154)
danbev Jan 9, 2025
1204f97
doc: add cuda guide for fedora (#11135)
teihome Jan 9, 2025
c6860cc
SYCL: Refactor ggml_sycl_compute_forward (#11121)
qnixsynapse Jan 10, 2025
ee7136c
llama: add support for QRWKV6 model architecture (#11001)
MollySophia Jan 10, 2025
c3f9d25
Vulkan: Fix float16 use on devices without float16 support + fix subg…
0cc4m Jan 10, 2025
ff3fcab
convert : add --print-supported-models option (#11172)
danbev Jan 10, 2025
ba8a1f9
examples : add README.md to tts example [no ci] (#11155)
danbev Jan 10, 2025
2739a71
convert : sort print supported models [no ci] (#11179)
danbev Jan 11, 2025
c05e8c9
gguf-py: fixed local detection of gguf package (#11180)
VJHack Jan 11, 2025
afa8a9e
llama : add `llama_vocab`, functions -> methods, naming (#11110)
ggerganov Jan 12, 2025
08f10f6
llama : remove notion of CLS token (#11064)
ggerganov Jan 12, 2025
9a48399
llama : fix chat template gguf key (#11201)
ngxson Jan 12, 2025
924518e
Reset color before we exit (#11205)
ericcurtin Jan 12, 2025
1244cdc
ggml : do not define GGML_USE_CUDA when building with GGML_BACKEND_DL…
rgerganov Jan 13, 2025
8f70fc3
llama : remove 'd' from bad special token log (#11212)
danbev Jan 13, 2025
7426a26
contrib : add naming guidelines (#11177)
ggerganov Jan 13, 2025
00b4c3d
common : support tag-based --hf-repo like on ollama (#11195)
ngxson Jan 13, 2025
ca001f6
contrib : add naming guidelines (cont) (#11177)
ggerganov Jan 13, 2025
437e05f
server : (UI) Support for RTL text as models input or output (#11208)
ebraminio Jan 13, 2025
a29f087
contrib : add naming guidelines (cont) (#11177)
ggerganov Jan 13, 2025
39509fb
cuda : CUDA Graph Compute Function Refactor (precursor for performanc…
aendk Jan 13, 2025
84a4481
cli : auto activate conversation mode if chat template is available (…
ngxson Jan 13, 2025
504af20
server : (UI) Improve messages bubble shape in RTL (#11220)
ebraminio Jan 13, 2025
d00a80e
scripts : sync opencl
ggerganov Jan 14, 2025
48e1ae0
scripts : sync gguf
ggerganov Jan 14, 2025
a4f3f5d
scripts : sync gguf (cont)
ggerganov Jan 14, 2025
44d1e79
sync : ggml
ggerganov Jan 14, 2025
091592d
Refactor test-chat-template.cpp (#11224)
ochafik Jan 14, 2025
c5bf0d1
server : Improve code snippets direction between RTL text (#11221)
ebraminio Jan 14, 2025
bbf3e55
vocab : add dummy tokens for "no_vocab" type (#11231)
ggerganov Jan 14, 2025
b4d92a5
ci : add -no-cnv for tests (#11238)
ngxson Jan 14, 2025
f446c2c
SYCL: Add gated linear attention kernel (#11175)
qnixsynapse Jan 15, 2025
0ccd7f3
examples : add embd_to_audio to tts-outetts.py [no ci] (#11235)
danbev Jan 15, 2025
432df2d
RoPE: fix back, CUDA support for back + noncont. (#11240)
JohannesGaessler Jan 15, 2025
1d85043
fix: ggml: fix vulkan-shaders-gen build (#10448)
sparkleholic Jan 15, 2025
f11cfdf
ci : use -no-cnv in gguf-split tests (#11254)
ggerganov Jan 15, 2025
adc5dd9
vulkan: scale caching for k quants + misc fixes (#11081)
netrunnereve Jan 15, 2025
c67cc98
ggml: aarch64: implement SVE kernels for q4_K_q8_K vector dot (#11227)
fj-y-saito Jan 16, 2025
681149c
llama : add `llama_model_load_from_splits` (#11255)
ngxson Jan 16, 2025
9c8dcef
CUDA: backwards pass for misc. ops, add tests (#11257)
JohannesGaessler Jan 16, 2025
4dbc8b9
llama : add internlm3 support (#11233)
RunningLeon Jan 16, 2025
206bc53
vulkan: optimize coopmat2 q2_k dequant function (#11130)
jeffbolznv Jan 16, 2025
466300f
vulkan: optimize coopmat2 q4_k/q5_k dequant functions. (#11206)
jeffbolznv Jan 16, 2025
bd38dde
vulkan: support copy from f32 to q4_0/q4_1/q5_0/q5_1/q8_0/iq4_nl (#11…
jeffbolznv Jan 16, 2025
7a689c4
README : added kalavai to infrastructure list (#11216)
musoles Jan 17, 2025
960ec65
llama : fix deprecation message: vocabable -> vocab (#11269)
dwrensha Jan 17, 2025
a133566
vocab : fix double-eos check (#11273)
ggerganov Jan 17, 2025
667d728
rpc : early register backend devices (#11262)
rgerganov Jan 17, 2025
3edfa7d
llama.android: add field formatChat to control whether to parse speci…
codezjx Jan 17, 2025
44e18ef
vulkan: fix coopmat2 flash attention for non-contiguous inputs (#11281)
jeffbolznv Jan 18, 2025
SYCL: Add gated linear attention kernel (ggml-org#11175)
* SYCL: Add Gated Linear attention kernel

* glahpp: add a space at the end of file

* gla: Put the barrier inside the main logic loop
qnixsynapse authored Jan 15, 2025

commit f446c2cf6a56a750b67c967505e717a996d2f2fd
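
For context: the kernel added below implements the gated linear attention recurrence used by RWKV6-style models. Per head and per time step, with the kernel's `k`, `v`, `r`, `td` vectors and running state matrix `S` (notation mine, not from the patch), the update is roughly:

$$
S_t = \operatorname{diag}(td_t)\,S_{t-1} + k_t\,v_t^{\top},
\qquad
y_t = \text{scale}\cdot S_t^{\top} r_t
$$

Each work-group handles one (batch, head) pair and each work-item owns one column of `S`, which is why `k`, `r` and `td` are staged in local memory while `v` stays a per-work-item scalar.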
1 change: 1 addition & 0 deletions ggml/src/ggml-sycl/backend.hpp
@@ -29,5 +29,6 @@
#include "wkv6.hpp"
#include "outprod.hpp"
#include "element_wise.hpp"
#include "gla.hpp"

#endif // GGML_SYCL_BACKEND_HPP
4 changes: 4 additions & 0 deletions ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -4040,6 +4040,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
case GGML_OP_RWKV_WKV6:
ggml_sycl_op_rwkv_wkv6(ctx, dst);
break;
case GGML_OP_GATED_LINEAR_ATTN:
ggml_sycl_op_gated_linear_attn(ctx, dst);
break;
default:
return false;
}
@@ -4507,6 +4510,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_LEAKY_RELU:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_RWKV_WKV6:
case GGML_OP_GATED_LINEAR_ATTN:
return true;
default:
return false;
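
A note on the wiring above: the new `GGML_OP_GATED_LINEAR_ATTN` value has to appear in both switch statements, the dispatch in `ggml_sycl_compute_forward` and the allow-list in `ggml_backend_sycl_device_supports_op`; the `default: return false;` paths are what keep unlisted ops off this backend. A minimal sketch of that pattern, with hypothetical names rather than the real ggml-sycl API:

```cpp
// Illustrative sketch only (hypothetical enum and functions, not ggml-sycl):
// an op runs on the backend only if it is both dispatched and advertised.
#include <cstdio>

enum class op { rwkv_wkv6, gated_linear_attn, unsupported };

static bool compute_forward(op o) {
    switch (o) {
        case op::rwkv_wkv6:         /* launch wkv6 kernel */ return true;
        case op::gated_linear_attn: /* launch gla kernel  */ return true;
        default:                    return false;  // not handled here
    }
}

static bool supports_op(op o) {
    switch (o) {
        case op::rwkv_wkv6:
        case op::gated_linear_attn:
            return true;   // scheduler may place these nodes on the backend
        default:
            return false;  // node stays on another backend
    }
}

int main() {
    std::printf("gla supported: %d, ran: %d\n",
                supports_op(op::gated_linear_attn),
                compute_forward(op::gated_linear_attn));
}
```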
105 changes: 105 additions & 0 deletions ggml/src/ggml-sycl/gla.cpp
@@ -0,0 +1,105 @@
#include <sycl/sycl.hpp>

#include "common.hpp"

template <u_int HEAD_SIZE>
static void gated_linear_attn_f32_kernel(const dpct::queue_ptr stream, u_int B, u_int T, u_int C, u_int H, float scale,
const float * k, const float * v, const float * r, const float * td,
const float * s, float * dst) {
const u_int head_size = HEAD_SIZE;
const u_int state_size = C * head_size;
const u_int n_seq_tokens = T / B;
sycl::range<1> block_dims((C / H));
sycl::range<1> grid_dims((B * H));
stream->submit([&](sycl::handler & cgh) {
/* local memory accessors*/
auto _k = sycl::local_accessor<float, 1>(sycl::range<1>(head_size), cgh);
auto _r = sycl::local_accessor<float, 1>(sycl::range<1>(head_size), cgh);
auto _td = sycl::local_accessor<float, 1>(sycl::range<1>(head_size), cgh);

cgh.parallel_for(sycl::nd_range<1>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<1> item) {
u_int tid = item.get_local_id(0);
u_int bid = item.get_group(0);

u_int batch_i = bid / H;
u_int head_i = bid % H;

float state[head_size];

#pragma unroll
for (u_int i = 0; i < head_size; i++) {
state[i] = s[batch_i * state_size + head_i * head_size * head_size + i * head_size + tid];
}

for (u_int t = batch_i * n_seq_tokens * C + head_i * head_size + tid;
t < (batch_i + 1) * n_seq_tokens * C + head_i * head_size + tid; t += C) {

item.barrier(sycl::access::fence_space::local_space); //sync threads
_k[tid] = k[t];
_r[tid] = r[t];
_td[tid] = td[t];
item.barrier(sycl::access::fence_space::local_space); //sync threads

const float _v = v[t];
float y = 0;

for (u_int j = 0; j < head_size; j += 4) {
const sycl::float4 & k = (sycl::float4 &) (_k[j]);
const sycl::float4 & r = (sycl::float4 &) (_r[j]);
const sycl::float4 & td = (sycl::float4 &) (_td[j]);
sycl::float4 & s = (sycl::float4 &) (state[j]);
sycl::float4 kv;

kv.x() = k.x() * _v;
kv.y() = k.y() * _v;
kv.z() = k.z() * _v;
kv.w() = k.w() * _v;

s.x() = s.x() * td.x() + kv.x();
s.y() = s.y() * td.y() + kv.y();
s.z() = s.z() * td.z() + kv.z();
s.w() = s.w() * td.w() + kv.w();

y += r.x() * s.x();
y += r.y() * s.y();
y += r.z() * s.z();
y += r.w() * s.w();
}
dst[t] = y * scale;
}
#pragma unroll
for (u_int i = 0; i < head_size; i++) {
dst[T * C + batch_i * state_size + head_i * head_size * head_size + i * head_size + tid] = state[i];
}
});
});
}

void ggml_sycl_op_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
const float * k_d = static_cast<const float *>(dst->src[0]->data);
const float * v_d = static_cast<const float *>(dst->src[1]->data);
const float * r_d = static_cast<const float *>(dst->src[2]->data);
const float * td_d = static_cast<const float *>(dst->src[3]->data);
const float * s_d = static_cast<const float *>(dst->src[4]->data);

const int64_t B = dst->src[4]->ne[1];
const int64_t T = dst->src[0]->ne[2];
const int64_t C = dst->ne[0];
const int64_t H = dst->src[0]->ne[1];

dpct::queue_ptr stream = ctx.stream();
GGML_ASSERT(dst->src[4]->type == GGML_TYPE_F32);
GGML_ASSERT(C % H == 0);
GGML_ASSERT(C / H == 64 || C / H == 128);

float scale;
memcpy(&scale, dst->op_params, sizeof(float));

float * dst_d = (float *) dst->data;

if (C / H == 64) {
gated_linear_attn_f32_kernel<64>(stream, B, T, C, H, scale, k_d, v_d, r_d, td_d, s_d, dst_d);
} else {
gated_linear_attn_f32_kernel<128>(stream, B, T, C, H, scale, k_d, v_d, r_d, td_d, s_d, dst_d);
}
}
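
For anyone wanting to sanity-check the kernel on small inputs, a plain C++ reference of the same recurrence could look like the sketch below. The loops mirror the kernel's indexing and layouts; the helper name and the idea of a standalone CPU reference are mine, not part of this patch.

```cpp
// CPU reference for the gated linear attention recurrence (sketch).
// Layouts follow the SYCL kernel: k, v, r, td are [T, C]; s holds one
// head_size x head_size state per (batch, head); dst gets T*C outputs
// followed by the updated states.
#include <cstddef>
#include <vector>

static void gla_reference_f32(size_t B, size_t T, size_t C, size_t H, float scale,
                              const float * k, const float * v, const float * r,
                              const float * td, const float * s, float * dst) {
    const size_t hs = C / H;            // head size
    const size_t state_size = C * hs;   // per-batch state (H matrices of hs x hs)
    const size_t n_seq_tokens = T / B;

    for (size_t b = 0; b < B; ++b) {
        for (size_t h = 0; h < H; ++h) {
            // copy the running state for this (batch, head): state[j * hs + c]
            const float * s0 = s + b * state_size + h * hs * hs;
            std::vector<float> state(s0, s0 + hs * hs);

            for (size_t tt = 0; tt < n_seq_tokens; ++tt) {
                const size_t base = (b * n_seq_tokens + tt) * C + h * hs;
                for (size_t c = 0; c < hs; ++c) {       // output channel (tid in the kernel)
                    float y = 0.0f;
                    for (size_t j = 0; j < hs; ++j) {   // inner channel
                        float & st = state[j * hs + c];
                        st = st * td[base + j] + k[base + j] * v[base + c];
                        y += r[base + j] * st;
                    }
                    dst[base + c] = y * scale;
                }
            }
            // append the updated state after the T*C outputs, as the kernel does
            for (size_t i = 0; i < hs * hs; ++i) {
                dst[T * C + b * state_size + h * hs * hs + i] = state[i];
            }
        }
    }
}
```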
8 changes: 8 additions & 0 deletions ggml/src/ggml-sycl/gla.hpp
@@ -0,0 +1,8 @@
#ifndef GGML_SYCL_GLA_HPP
#define GGML_SYCL_GLA_HPP

#include "common.hpp"

void ggml_sycl_op_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor * dst);

#endif // GGML_SYCL_GLA_HPP