
Commit 14f4f40

Merge b3565

2 parents: 8bc7a98 + 6e02327

117 files changed: 4783 additions, 1344 deletions


.devops/llama-server.Dockerfile

Lines changed: 2 additions & 2 deletions
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION AS build
 
 RUN apt-get update && \
-    apt-get install -y build-essential git libcurl4-openssl-dev curl
+    apt-get install -y build-essential git libcurl4-openssl-dev
 
 WORKDIR /app
 
@@ -16,7 +16,7 @@ RUN make -j$(nproc) llama-server
 FROM ubuntu:$UBUNTU_VERSION AS runtime
 
 RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev libgomp1
+    apt-get install -y libcurl4-openssl-dev libgomp1 curl
 
 COPY --from=build /app/llama-server /llama-server

.gitignore

Lines changed: 0 additions & 1 deletion
@@ -79,7 +79,6 @@ models-mnt
 !models/ggml-vocab-*.gguf*
 
 # Zig
-
 zig-out/
 zig-cache/

CONTRIBUTING.md

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@
 - Execute [the full CI locally on your machine](ci/README.md) before publishing
 - Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
 - The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience
+- Consider allowing write access to your branch for faster review
 - If your PR becomes stale, don't hesitate to ping the maintainers in the comments
 
 # Pull requests (for collaborators)

Makefile

Lines changed: 23 additions & 15 deletions
@@ -19,6 +19,7 @@ BUILD_TARGETS = \
 	llama-imatrix \
 	llama-infill \
 	llama-llava-cli \
+	llama-minicpmv-cli\
 	llama-lookahead \
 	llama-lookup \
 	llama-lookup-create \
@@ -888,15 +889,16 @@ ggml/src/ggml-metal-embed.o: \
 	ggml/src/ggml-common.h
 	@echo "Embedding Metal library"
 	@sed -e '/#include "ggml-common.h"/r ggml/src/ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml/src/ggml-metal.metal > ggml/src/ggml-metal-embed.metal
-	$(eval TEMP_ASSEMBLY=$(shell mktemp))
-	@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
-	@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
-	@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
-	@echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)
-	@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
-	@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
-	@$(AS) $(TEMP_ASSEMBLY) -o $@
-	@rm -f ${TEMP_ASSEMBLY}
+	$(eval TEMP_ASSEMBLY=$(shell mktemp -d))
+	@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	$(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@
+	@rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s
+	@rmdir ${TEMP_ASSEMBLY}
 endif
 endif # GGML_METAL
 
@@ -1205,6 +1207,7 @@ clean:
 	rm -rvf ggml/*.dll
 	rm -rvf ggml/*.so
 	rm -vrf ggml/src/*.o
+	rm -rvf ggml/src/llamafile/*.o
 	rm -rvf common/build-info.cpp
 	rm -vrf ggml/src/ggml-metal-embed.metal
 	rm -vrf ggml/src/ggml-cuda/*.o
@@ -1451,15 +1454,20 @@ libllava.a: examples/llava/llava.cpp \
 	$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
 
 llama-llava-cli: examples/llava/llava-cli.cpp \
-	examples/llava/clip.h \
-	examples/llava/clip.cpp \
+	examples/llava/llava.cpp \
 	examples/llava/llava.h \
+	examples/llava/clip.cpp \
+	examples/llava/clip.h \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
+
+llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
 	examples/llava/llava.cpp \
+	examples/llava/llava.h \
+	examples/llava/clip.cpp \
+	examples/llava/clip.h \
 	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
-	$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
 
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift

README.md

Lines changed: 9 additions & 0 deletions
@@ -95,8 +95,16 @@ Typically finetunes of the base models below are supported as well.
 - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
 - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
 - [x] [OLMo](https://allenai.org/olmo)
+- [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330)
 - [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
+- [x] [Snowflake-Arctic MoE](https://huggingface.co/collections/Snowflake/arctic-66290090abe542894a5ac520)
+- [x] [Smaug](https://huggingface.co/models?search=Smaug)
+- [x] [Poro 34B](https://huggingface.co/LumiOpen/Poro-34B)
+- [x] [Bitnet b1.58 models](https://huggingface.co/1bitLLM)
+- [x] [Flan T5](https://huggingface.co/models?search=flan-t5)
+- [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
 - [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
+- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
 
 (instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
 
@@ -145,6 +153,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [Faraday](https://faraday.dev/) (proprietary)
 - [LMStudio](https://lmstudio.ai/) (proprietary)
 - [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
+- [ramalama](https://github.com/containers/ramalama) (MIT)
 - [LocalAI](https://github.com/mudler/LocalAI) (MIT)
 - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
 - [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)

common/common.cpp

Lines changed: 66 additions & 27 deletions
@@ -684,14 +684,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     }
     if (arg == "--lora") {
         CHECK_ARG
-        params.lora_adapter.emplace_back(argv[i], 1.0f);
+        params.lora_adapters.push_back({
+            std::string(argv[i]),
+            1.0,
+        });
         return true;
     }
     if (arg == "--lora-scaled") {
         CHECK_ARG
-        const char* lora_adapter = argv[i];
+        std::string lora_adapter = argv[i];
         CHECK_ARG
-        params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
+        params.lora_adapters.push_back({
+            lora_adapter,
+            std::stof(argv[i]),
+        });
+        return true;
+    }
+    if (arg == "--lora-init-without-apply") {
+        params.lora_init_without_apply = true;
         return true;
     }
     if (arg == "--control-vector") {
@@ -1654,6 +1664,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
         "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
     options.push_back({ "server",       "-sps, --slot-prompt-similarity SIMILARITY",
                                         "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });
+    options.push_back({ "server",       "       --lora-init-without-apply", "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"});
 
 #ifndef LOG_DISABLE_LOGS
     options.push_back({ "logging" });
@@ -1766,6 +1777,17 @@ std::string string_get_sortable_timestamp() {
     return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
 }
 
+void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    if (search.empty()) {
+        return; // Avoid infinite loop if 'search' is an empty string
+    }
+    size_t pos = 0;
+    while ((pos = s.find(search, pos)) != std::string::npos) {
+        s.replace(pos, search.length(), replace);
+        pos += replace.length();
+    }
+}
+
 void string_process_escapes(std::string & input) {
     std::size_t input_len = input.length();
     std::size_t output_idx = 0;
@@ -2039,8 +2061,8 @@ std::string fs_get_cache_file(const std::string & filename) {
 //
 // Model utils
 //
-
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
+struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
+    llama_init_result iparams;
     auto mparams = llama_model_params_from_gpt_params(params);
 
     llama_model * model = nullptr;
@@ -2055,7 +2077,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 
     if (model == NULL) {
         fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-        return std::make_tuple(nullptr, nullptr);
+        return iparams;
     }
 
     auto cparams = llama_context_params_from_gpt_params(params);
@@ -2064,7 +2086,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     if (lctx == NULL) {
         fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
         llama_free_model(model);
-        return std::make_tuple(nullptr, nullptr);
+        return iparams;
     }
 
     if (!params.control_vectors.empty()) {
@@ -2075,7 +2097,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         if (cvec.n_embd == -1) {
             llama_free(lctx);
             llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
         }
 
         int err = llama_control_vector_apply(lctx,
@@ -2087,21 +2109,26 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         if (err) {
             llama_free(lctx);
             llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
         }
     }
 
-    for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
-        const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
-        float lora_scale = std::get<1>(params.lora_adapter[i]);
-        auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
-        if (adapter == nullptr) {
-            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+    // load and optionally apply lora adapters
+    for (auto & la : params.lora_adapters) {
+        llama_lora_adapter_container loaded_la;
+        loaded_la.path = la.path;
+        loaded_la.scale = la.scale;
+        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
+        if (loaded_la.adapter == nullptr) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
             llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
         }
-        llama_lora_adapter_set(lctx, adapter, lora_scale);
+        iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+    }
+    if (!params.lora_init_without_apply) {
+        llama_lora_adapters_apply(lctx, iparams.lora_adapters);
     }
 
     if (params.ignore_eos) {
@@ -2129,13 +2156,26 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
             tmp.clear();
             tmp.push_back(decoder_start_token_id);
         }
-        llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+        if (llama_model_has_decoder(model)) {
+            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+        }
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
         llama_reset_timings(lctx);
     }
 
-    return std::make_tuple(model, lctx);
+    iparams.model = model;
+    iparams.context = lctx;
+    return iparams;
+}
+
+void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters) {
+    llama_lora_adapter_clear(ctx);
+    for (auto & la : lora_adapters) {
+        if (la.scale != 0.0f) {
+            llama_lora_adapter_set(ctx, la.adapter, la.scale);
+        }
+    }
 }
 
 struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
@@ -3160,19 +3200,18 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     }
 
     fprintf(stream, "lora:\n");
-    for (std::tuple<std::string, float> la : params.lora_adapter) {
-        if (std::get<1>(la) != 1.0f) {
-            continue;
+    for (auto & la : params.lora_adapters) {
+        if (la.scale == 1.0f) {
+            fprintf(stream, "  - %s\n", la.path.c_str());
         }
-        fprintf(stream, "  - %s\n", std::get<0>(la).c_str());
     }
     fprintf(stream, "lora_scaled:\n");
-    for (std::tuple<std::string, float> la : params.lora_adapter) {
-        if (std::get<1>(la) == 1.0f) {
-            continue;
+    for (auto & la : params.lora_adapters) {
+        if (la.scale != 1.0f) {
+            fprintf(stream, "  - %s: %f\n", la.path.c_str(), la.scale);
         }
-        fprintf(stream, "  - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
     }
+    fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
     fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);

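For reference, the new string_replace_all helper introduced above can be exercised on its own. The snippet below is an illustrative standalone copy (the main driver is not part of this commit); it shows why pos is advanced past the replacement, so that a replacement containing the search string is not matched again:

    #include <cstdio>
    #include <string>

    // standalone copy of the helper added in common/common.cpp above
    void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
        if (search.empty()) {
            return; // avoid an infinite loop when 'search' is empty
        }
        size_t pos = 0;
        while ((pos = s.find(search, pos)) != std::string::npos) {
            s.replace(pos, search.length(), replace);
            pos += replace.length(); // skip past the inserted text so it is not matched again
        }
    }

    int main() {
        std::string s = "a_b_c";
        string_replace_all(s, "_", "__"); // replacement contains the search string
        std::printf("%s\n", s.c_str());   // prints: a__b__c (terminates, no re-matching)
        return 0;
    }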
common/common.h

Lines changed: 23 additions & 4 deletions
@@ -33,6 +33,15 @@
 
 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
+struct llama_lora_adapter_info {
+    std::string path;
+    float scale;
+};
+
+struct llama_lora_adapter_container : llama_lora_adapter_info {
+    struct llama_lora_adapter * adapter;
+};
+
 // build info
 extern int LLAMA_BUILD_NUMBER;
 extern char const * LLAMA_COMMIT;
@@ -126,8 +135,8 @@ struct gpt_params {
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
 
-    // TODO: avoid tuple, use struct
-    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
+    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
+    std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
 
     std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
 
@@ -277,6 +286,8 @@ std::vector<std::string> string_split(std::string input, char separator);
 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();
 
+void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
+
 template<class T>
 static std::vector<T> string_split(const std::string & str, char delim) {
     std::vector<T> values;
@@ -308,15 +319,23 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //
 
-// TODO: avoid tuplue, use struct
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
+struct llama_init_result {
+    struct llama_model * model = nullptr;
+    struct llama_context * context = nullptr;
+    std::vector<llama_lora_adapter_container> lora_adapters;
+};
+
+struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
 
 struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
 
 struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 
+// clear LoRA adapters from context, then apply new list of adapters
+void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
+
 // Batch utils
 
 void llama_batch_clear(struct llama_batch & batch);

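Taken together, the common.h changes above replace the old std::tuple return of llama_init_from_gpt_params with the llama_init_result struct and expose llama_lora_adapters_apply for deferred LoRA application. Below is a hypothetical caller-side sketch (not part of this commit; the adapter path and scale are made up for illustration) of how the new API can be used:

    #include "common.h"
    #include "llama.h"

    bool init_with_deferred_lora(gpt_params & params) {
        // Load the adapter into memory only; do not apply it to the context yet.
        params.lora_init_without_apply = true;
        params.lora_adapters.push_back({ "my-adapter.gguf", 0.5f }); // illustrative path + scale

        llama_init_result iparams = llama_init_from_gpt_params(params);
        if (iparams.model == nullptr || iparams.context == nullptr) {
            return false; // previously: inspect the members of the returned tuple
        }

        // Later (e.g. when a server request toggles adapters), clear and re-apply
        // the loaded adapters in one call.
        llama_lora_adapters_apply(iparams.context, iparams.lora_adapters);

        llama_free(iparams.context);
        llama_free_model(iparams.model);
        return true;
    }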