@@ -42,35 +42,51 @@ static const size_t MB = 1024*1024;
 // TODO: dynamically determine these sizes
 //       needs modifications in ggml
 
-static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
-    { MODEL_7B,    512ull*MB },
-    { MODEL_13B,   512ull*MB },
-    { MODEL_30B,   512ull*MB },
-    { MODEL_65B,   512ull*MB },
-};
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+{
+    static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
+        { MODEL_7B,    512ull * MB },
+        { MODEL_13B,   512ull * MB },
+        { MODEL_30B,   512ull * MB },
+        { MODEL_65B,   512ull * MB },
+    };
+    return _MEM_REQ_SCRATCH0;
+}
 
-static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
-    { MODEL_7B,    512ull*MB },
-    { MODEL_13B,   512ull*MB },
-    { MODEL_30B,   512ull*MB },
-    { MODEL_65B,   512ull*MB },
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
+{
+    static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
+        { MODEL_7B,    512ull * MB },
+        { MODEL_13B,   512ull * MB },
+        { MODEL_30B,   512ull * MB },
+        { MODEL_65B,   512ull * MB },
+    };
+    return _MEM_REQ_SCRATCH1;
 };
 
 // 2*n_embd*n_ctx*n_layer*sizeof(float16)
-static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
-    { MODEL_7B,   1026ull*MB },
-    { MODEL_13B,  1608ull*MB },
-    { MODEL_30B,  3124ull*MB },
-    { MODEL_65B,  5120ull*MB },
+static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
+{
+    static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
+        { MODEL_7B,   1026ull * MB },
+        { MODEL_13B,  1608ull * MB },
+        { MODEL_30B,  3124ull * MB },
+        { MODEL_65B,  5120ull * MB },
+    };
+    return _MEM_REQ_KV_SELF;
 };
 
 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> MEM_REQ_EVAL = {
-    { MODEL_7B,   768ull*MB },
-    { MODEL_13B, 1024ull*MB },
-    { MODEL_30B, 1280ull*MB },
-    { MODEL_65B, 1536ull*MB },
+static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+{
+    static std::map<e_model, size_t> _MEM_REQ_EVAL = {
+        { MODEL_7B,   768ull * MB },
+        { MODEL_13B, 1024ull * MB },
+        { MODEL_30B, 1280ull * MB },
+        { MODEL_65B, 1536ull * MB },
+    };
+    return _MEM_REQ_EVAL;
 };
 
 // default hparams (LLaMA 7B)
@@ -899,13 +915,13 @@ static void llama_model_load_internal(
     const size_t mem_required =
         ctx_size +
         mmapped_size +
-        MEM_REQ_SCRATCH0.at(model.type) +
-        MEM_REQ_SCRATCH1.at(model.type) +
-        MEM_REQ_EVAL.at(model.type);
+        MEM_REQ_SCRATCH0().at(model.type) +
+        MEM_REQ_SCRATCH1().at(model.type) +
+        MEM_REQ_EVAL().at(model.type);
 
     // this is the memory required by one llama_state
     const size_t mem_required_state =
-        scale*MEM_REQ_KV_SELF.at(model.type);
+        scale*MEM_REQ_KV_SELF().at(model.type);
 
     fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
             mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@@ -1732,10 +1748,10 @@ struct llama_context * llama_init_from_file(
             ctx->embedding.resize(hparams.n_embd);
         }
 
-        ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
+        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
 
-        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
-        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
+        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
+        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
     }
 
     return ctx;
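
The first hunk replaces each global std::map constant with an accessor that returns a reference to a function-local static (the "construct on first use" idiom), and the later hunks update the call sites from MEM_REQ_*.at(...) to MEM_REQ_*().at(...). A function-local static is constructed the first time the accessor runs rather than at program start, which avoids doing work during static initialization and the usual static-initialization-order pitfalls; that appears to be the point of the change. The standalone sketch below illustrates the idiom only: the names (scratch_size_bytes, the string keys) and the values are invented for the example and are not part of llama.cpp.

// Sketch of the "construct on first use" idiom used in the diff above.
// The map is built lazily on the first call to the accessor; later calls
// return the same instance. Names and values are illustrative only.
#include <cstdio>
#include <map>
#include <string>

static const size_t MB = 1024*1024;

static const std::map<std::string, size_t> & scratch_size_bytes()
{
    static std::map<std::string, size_t> sizes = {
        { "7B",  512ull * MB },
        { "13B", 512ull * MB },
    };
    return sizes;
}

int main() {
    // First call constructs the map; the lookup then works as before,
    // just with an extra pair of parentheses at the call site.
    std::printf("7B scratch: %zu MB\n", scratch_size_bytes().at("7B") / MB);
    return 0;
}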