Commit ec2e10c

llama : add llama_init_backend() API (close #1527)
1 parent: d2c59b8

7 files changed: +48 -29 lines

examples/benchmark/benchmark-matmult.cpp (+2 -1)

@@ -1,6 +1,7 @@
-#include <locale.h>
 #include "ggml.h"
 #include "build-info.h"
+
+#include <locale.h>
 #include <assert.h>
 #include <math.h>
 #include <cstring>

examples/embedding/embedding.cpp (+2)

@@ -31,6 +31,8 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }

+    llama_init_backend();
+
     llama_context * ctx;

     // load the model

examples/main/main.cpp (+1 -2)

@@ -96,8 +96,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }

-    // params.prompt = R"(// this function checks if the number n is prime
-    //bool is_prime(int n) {)";
+    llama_init_backend();

     llama_context * ctx;
     g_ctx = &ctx;

examples/perplexity/perplexity.cpp (+2)

@@ -143,6 +143,8 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }

+    llama_init_backend();
+
     llama_context * ctx;

     // load the model and apply lora adapter, if any

examples/quantize/quantize.cpp (+7 -14)

@@ -1,7 +1,7 @@
-#include "ggml.h"
-#include "llama.h"
 #include "build-info.h"

+#include "llama.h"
+
 #include <cstdio>
 #include <map>
 #include <string>
@@ -42,8 +42,6 @@ bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::st
 // ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
 //
 int main(int argc, char ** argv) {
-    ggml_time_init();
-
     if (argc < 3) {
         fprintf(stderr, "usage: %s model-f32.bin [model-quant.bin] type [nthreads]\n", argv[0]);
         for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
@@ -52,12 +50,7 @@ int main(int argc, char ** argv) {
         return 1;
     }

-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = { 0, NULL, false };
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
+    llama_init_backend();

     // parse command line arguments
     const std::string fname_inp = argv[1];
@@ -116,25 +109,25 @@ int main(int argc, char ** argv) {
     }
     fprintf(stderr, "\n");

-    const int64_t t_main_start_us = ggml_time_us();
+    const int64_t t_main_start_us = llama_time_us();

     int64_t t_quantize_us = 0;

     // load the model
     {
-        const int64_t t_start_us = ggml_time_us();
+        const int64_t t_start_us = llama_time_us();

         if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) {
             fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
             return 1;
         }

-        t_quantize_us = ggml_time_us() - t_start_us;
+        t_quantize_us = llama_time_us() - t_start_us;
     }

     // report timing
     {
-        const int64_t t_main_end_us = ggml_time_us();
+        const int64_t t_main_end_us = llama_time_us();

         printf("\n");
         printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0);

llama.cpp (+15)

@@ -839,6 +839,21 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }

+void llama_init_backend() {
+    ggml_time_init();
+
+    // needed to initialize f16 tables
+    {
+        struct ggml_init_params params = { 0, NULL, false };
+        struct ggml_context * ctx = ggml_init(params);
+        ggml_free(ctx);
+    }
+}
+
+int64_t llama_time_us() {
+    return ggml_time_us();
+}
+
 //
 // model loading
 //
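Note (not part of the diff): llama_time_us() is a thin wrapper around ggml_time_us(), so callers can take microsecond timestamps without including ggml.h, provided llama_init_backend() has been called first (it runs ggml_time_init()). A minimal sketch of that timing pattern, mirroring the updated quantize example above; the "work" in the middle is a placeholder:

    #include "llama.h"

    #include <cstdint>
    #include <cstdio>

    int main() {
        llama_init_backend();                        // once, at program start

        const int64_t t_start_us = llama_time_us();  // microsecond timer, backed by ggml_time_us()

        // ... do some work here (e.g. quantize or load a model) ...

        const int64_t t_end_us = llama_time_us();
        printf("elapsed = %8.2f ms\n", (t_end_us - t_start_us)/1000.0);
        return 0;
    }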

llama.h (+19 -12)

@@ -40,9 +40,9 @@ extern "C" {
     typedef int llama_token;

     typedef struct llama_token_data {
-        llama_token id;  // token id
-        float logit; // log-odds of the token
-        float p;     // probability of the token
+        llama_token id; // token id
+        float logit;    // log-odds of the token
+        float p;        // probability of the token
     } llama_token_data;

     typedef struct llama_token_data_array {
@@ -73,23 +73,30 @@ extern "C" {

     // model file types
     enum llama_ftype {
-        LLAMA_FTYPE_ALL_F32     = 0,
-        LLAMA_FTYPE_MOSTLY_F16  = 1,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
+        LLAMA_FTYPE_ALL_F32              = 0,
+        LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        // LLAMA_FTYPE_MOSTLY_Q4_2 = 5,  // support has been removed
-        // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
-        LLAMA_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
+        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5, // support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6, // support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0          = 7, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0          = 8, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1          = 9, // except 1d tensors
     };

     LLAMA_API struct llama_context_params llama_context_default_params();

     LLAMA_API bool llama_mmap_supported();
     LLAMA_API bool llama_mlock_supported();

+    // TODO: not great API - very likely to change
+    // Initialize the llama + ggml backend
+    // Call once at the start of the program
+    LLAMA_API void llama_init_backend();
+
+    LLAMA_API int64_t llama_time_us();
+
     // Various functions for loading a ggml llama model.
     // Allocate (almost) all memory needed for the model.
     // Return NULL on failure
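Taken together, the intended call order for a client program looks like the sketch below; it mirrors the updated examples in this commit. The model path is a placeholder, and llama_init_from_file()/llama_free() are assumed from the rest of this header version, not shown in the diff above:

    #include "llama.h"

    #include <cstdio>

    int main() {
        // initialize the llama + ggml backend once, before any other llama call
        llama_init_backend();

        llama_context_params params = llama_context_default_params();

        // hypothetical model path; loader/free calls assumed from this era's llama.h
        llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
        if (ctx == NULL) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        // ... evaluate tokens, sample, etc. ...

        llama_free(ctx);
        return 0;
    }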
