@@ -2404,13 +2404,30 @@ static struct ggml_cgraph * llm_build_llama(
2404
2404
}
2405
2405
#endif // GGML_USE_CUBLAS
2406
2406
2407
+ // KQ_scale
2407
2408
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
2408
2409
ggml_allocr_alloc(lctx.alloc, KQ_scale);
2409
2410
if (!ggml_allocr_is_measure(lctx.alloc)) {
2410
2411
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
2411
2412
}
2412
2413
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
2413
2414
2415
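+ // KQ_mask: additive causal mask of shape [n_past + N, N, 1]; zero-filled, then set to -INF where key index i > n_past + j for query row j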
2416
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, N, 1);
2417
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
2418
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
2419
+ float * data = (float *) KQ_mask->data;
2420
+ memset(data, 0, ggml_nbytes(KQ_mask));
2421
+
2422
+ for (int h = 0; h < 1; ++h) {
2423
+ for (int j = 0; j < N; ++j) {
2424
+ for (int i = n_past + j + 1; i < n_past + N; ++i) {
2425
+ data[h*(n_past + N)*N + j*(n_past + N) + i] = -INFINITY;
2426
+ }
2427
+ }
2428
+ }
2429
+ }
2430
+
2414
2431
for (int il = 0; il < n_layer; ++il) {
2415
2432
ggml_format_name(inpL, "layer_inp_%d", il);
2416
2433
@@ -2447,11 +2464,11 @@ static struct ggml_cgraph * llm_build_llama(
2447
2464
offload_func_kq(tmpq);
2448
2465
ggml_set_name(tmpq, "tmpq");
2449
2466
2450
- struct ggml_tensor * Kcur = ggml_rope_custom_inplace (ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2467
+ struct ggml_tensor * Kcur = ggml_rope_custom (ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2451
2468
offload_func_kq(Kcur);
2452
2469
ggml_set_name(Kcur, "Kcur");
2453
2470
2454
- struct ggml_tensor * Qcur = ggml_rope_custom_inplace (ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2471
+ struct ggml_tensor * Qcur = ggml_rope_custom (ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2455
2472
offload_func_kq(Qcur);
2456
2473
ggml_set_name(Qcur, "Qcur");
2457
2474
@@ -2502,17 +2519,18 @@ static struct ggml_cgraph * llm_build_llama(
2502
2519
2503
2520
// KQ_scaled = KQ / sqrt(n_embd_head)
2504
2521
// KQ_scaled shape [n_past + N, N, n_head, 1]
2505
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace (ctx0, KQ, KQ_scale);
2522
+ struct ggml_tensor * KQ_scaled = ggml_scale (ctx0, KQ, KQ_scale);
2506
2523
offload_func_kq(KQ_scaled);
2507
2524
ggml_set_name(KQ_scaled, "KQ_scaled");
2508
2525
2509
2526
// KQ_masked = KQ_scaled + KQ_mask (KQ_mask is -INF at future positions, 0 elsewhere)
2510
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2527
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
2528
+ //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2511
2529
offload_func_kq(KQ_masked);
2512
2530
ggml_set_name(KQ_masked, "KQ_masked");
2513
2531
2514
2532
// KQ = soft_max(KQ_masked)
2515
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace (ctx0, KQ_masked);
2533
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max (ctx0, KQ_masked);
2516
2534
offload_func_v(KQ_soft_max);
2517
2535
ggml_set_name(KQ_soft_max, "KQ_soft_max");
2518
2536
@@ -2783,8 +2801,8 @@ static struct ggml_cgraph * llm_build_baichaun(
2783
2801
struct ggml_tensor * Qcur;
2784
2802
switch (model.type) {
2785
2803
case MODEL_7B:
2786
- Kcur = ggml_rope_custom_inplace (ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2787
- Qcur = ggml_rope_custom_inplace (ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2804
+ Kcur = ggml_rope_custom (ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2805
+ Qcur = ggml_rope_custom (ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2788
2806
break;
2789
2807
case MODEL_13B:
2790
2808
Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N);
@@ -2847,7 +2865,7 @@ static struct ggml_cgraph * llm_build_baichaun(
2847
2865
2848
2866
// KQ_scaled = KQ / sqrt(n_embd_head)
2849
2867
// KQ_scaled shape [n_past + N, N, n_head, 1]
2850
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace (ctx0, KQ, KQ_scale);
2868
+ struct ggml_tensor * KQ_scaled = ggml_scale (ctx0, KQ, KQ_scale);
2851
2869
offload_func_kq(KQ_scaled);
2852
2870
ggml_set_name(KQ_scaled, "KQ_scaled");
2853
2871
@@ -2856,7 +2874,7 @@ static struct ggml_cgraph * llm_build_baichaun(
2856
2874
2857
2875
switch (model.type) {
2858
2876
case MODEL_7B:
2859
- KQ_masked = ggml_diag_mask_inf_inplace (ctx0, KQ_scaled, n_past);
2877
+ KQ_masked = ggml_diag_mask_inf (ctx0, KQ_scaled, n_past);
2860
2878
break;
2861
2879
case MODEL_13B:
2862
2880
KQ_scaled_alibi =ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8);
@@ -2867,13 +2885,13 @@ static struct ggml_cgraph * llm_build_baichaun(
2867
2885
GGML_ASSERT(false);
2868
2886
}
2869
2887
// KQ_masked = mask_past(KQ_scaled)
2870
- // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace (ctx0, KQ_scaled, n_past);
2888
+ // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf (ctx0, KQ_scaled, n_past);
2871
2889
// struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
2872
2890
// offload_func_kq(KQ_masked);
2873
2891
// ggml_set_name(KQ_masked, "KQ_masked");
2874
2892
2875
2893
// KQ = soft_max(KQ_masked)
2876
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace (ctx0, KQ_masked);
2894
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max (ctx0, KQ_masked);
2877
2895
offload_func_v(KQ_soft_max);
2878
2896
ggml_set_name(KQ_soft_max, "KQ_soft_max");
2879
2897
@@ -3179,9 +3197,9 @@ static struct ggml_cgraph * llm_build_falcon(
3179
3197
offload_func_v(tmpv);
3180
3198
3181
3199
// using mode = 2 for neox mode
3182
- struct ggml_tensor * Qcur = ggml_rope_custom_inplace (ctx0, tmpq, n_past, n_embd_head, 2, 0, freq_base, freq_scale);
3200
+ struct ggml_tensor * Qcur = ggml_rope_custom (ctx0, tmpq, n_past, n_embd_head, 2, 0, freq_base, freq_scale);
3183
3201
offload_func_kq(Qcur);
3184
- struct ggml_tensor * Kcur = ggml_rope_custom_inplace (ctx0, tmpk, n_past, n_embd_head, 2, 0, freq_base, freq_scale);
3202
+ struct ggml_tensor * Kcur = ggml_rope_custom (ctx0, tmpk, n_past, n_embd_head, 2, 0, freq_base, freq_scale);
3185
3203
offload_func_kq(Kcur);
3186
3204
3187
3205
{
@@ -3220,15 +3238,15 @@ static struct ggml_cgraph * llm_build_falcon(
3220
3238
offload_func_kq(KQ);
3221
3239
ggml_set_name(KQ, "KQ");
3222
3240
3223
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace (ctx0, KQ, KQ_scale);
3241
+ struct ggml_tensor * KQ_scaled = ggml_scale (ctx0, KQ, KQ_scale);
3224
3242
offload_func_kq(KQ_scaled);
3225
3243
ggml_set_name(KQ_scaled, "KQ_scaled");
3226
3244
3227
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace (ctx0, KQ_scaled, n_past);
3245
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf (ctx0, KQ_scaled, n_past);
3228
3246
offload_func_kq(KQ_masked);
3229
3247
ggml_set_name(KQ_masked, "KQ_masked");
3230
3248
3231
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace (ctx0, KQ_masked);
3249
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max (ctx0, KQ_masked);
3232
3250
offload_func_v(KQ_soft_max);
3233
3251
ggml_set_name(KQ_soft_max, "KQ_soft_max");
3234
3252
0 commit comments