
Commit 6a9fe3d

Merge branch 'master' into fix-sessions

2 parents b0670db + 9476b01, commit 6a9fe3d

19 files changed: +1267 -100 lines

.dockerignore (+3)

@@ -1,6 +1,9 @@
 *.o
 *.a
 .cache/
+.git/
+.github/
+.gitignore
 .vs/
 .vscode/
 .DS_Store

.gitignore (+1)

@@ -40,6 +40,7 @@ models-mnt
 /embedding
 /gguf
 /gguf-llama-simple
+/infill
 /libllama.so
 /llama-bench
 /main

CMakeLists.txt (+25 -11)

@@ -343,8 +343,9 @@ if (LLAMA_MPI)
 set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
 add_compile_definitions(GGML_USE_MPI)
 add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
-set(cxx_flags ${cxx_flags} -Wno-cast-qual)
-set(c_flags ${c_flags} -Wno-cast-qual)
+if (NOT MSVC)
+add_compile_options(-Wno-cast-qual)
+endif()
 set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
 set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
 # Even if you're only using the C header, C++ programs may bring in MPI

@@ -418,10 +419,11 @@ if (LLAMA_ALL_WARNINGS)
 set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int
 -Werror=implicit-function-declaration)
 set(cxx_flags -Wmissing-declarations -Wmissing-noreturn)
+set(host_cxx_flags "")

 if (CMAKE_C_COMPILER_ID MATCHES "Clang")
 set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return)
-set(cxx_flags ${cxx_flags} -Wmissing-prototypes -Wextra-semi)
+set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi)

 if (
 (CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR

@@ -431,27 +433,38 @@ if (LLAMA_ALL_WARNINGS)
 endif()
 elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU")
 set(c_flags ${c_flags} -Wdouble-promotion)
-set(cxx_flags ${cxx_flags} -Wno-array-bounds)
+set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds)

 if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0)
-set(cxx_flags ${cxx_flags} -Wno-format-truncation)
+set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation)
 endif()
 if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0)
-set(cxx_flags ${cxx_flags} -Wextra-semi)
+set(host_cxx_flags ${host_cxx_flags} -Wextra-semi)
 endif()
 endif()
 else()
 # todo : msvc
 endif()

-add_compile_options(
-${warning_flags}
-"$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
-"$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
-)
+set(c_flags ${c_flags} ${warning_flags})
+set(cxx_flags ${cxx_flags} ${warning_flags})
+add_compile_options("$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
+"$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags} ${host_cxx_flags}>")
+
+endif()

+if (NOT MSVC)
+set(cuda_flags -Wno-pedantic)
+endif()
+set(cuda_flags ${cxx_flags} -use_fast_math ${cuda_flags})
+
+list(JOIN host_cxx_flags " " cuda_host_flags) # pass host compiler flags as a single argument
+if (NOT cuda_host_flags STREQUAL "")
+set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags})
 endif()

+add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>")
+
 if (WIN32)
 add_compile_definitions(_CRT_SECURE_NO_WARNINGS)

@@ -705,6 +718,7 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR}
 set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
 set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
 set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
+get_directory_property(LLAMA_TRANSIENT_DEFINES COMPILE_DEFINITIONS)

 configure_package_config_file(
 ${CMAKE_CURRENT_SOURCE_DIR}/scripts/LlamaConfig.cmake.in
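
Taken out of diff form, the warning-flag refactor above does three things: shared warning flags are folded into the per-language `c_flags`/`cxx_flags` lists, generator expressions route each list to the matching compile language, and host-only C++ warnings are kept in a separate `host_cxx_flags` list so they can be forwarded to `nvcc` through `-Xcompiler`. A minimal standalone sketch of that pattern follows; the project name and the specific flag values are illustrative rather than taken from llama.cpp, and the C++ lists are joined with `;` instead of a space so they stay separate list items.

```cmake
# Minimal sketch of per-language flag routing (illustrative values, not the
# actual llama.cpp build script).
cmake_minimum_required(VERSION 3.13)
project(flag_routing_demo LANGUAGES C CXX)

set(warning_flags  -Wall -Wextra)                 # shared by C and C++
set(c_flags        -Wshadow -Wstrict-prototypes)  # C-only warnings
set(cxx_flags      -Wmissing-declarations)        # C++-only warnings
set(host_cxx_flags -Wextra-semi)                  # host C++ warnings, also forwarded to nvcc

set(c_flags   ${c_flags}   ${warning_flags})
set(cxx_flags ${cxx_flags} ${warning_flags})

# The COMPILE_LANGUAGE generator expressions scope each list to its language,
# so C sources never see C++-only warnings and vice versa.
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
                    "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags};${host_cxx_flags}>")

# nvcc does not accept host-only warnings directly, so they are joined into a
# single string and passed behind -Xcompiler, mirroring the list(JOIN ...) step.
set(cuda_flags -Wno-pedantic)
list(JOIN host_cxx_flags " " cuda_host_flags)
if (NOT cuda_host_flags STREQUAL "")
    set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags})
endif()
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>")
```

Any target defined after these calls picks up only the flags for its own compile language; CUDA sources see the host warnings only through the single `-Xcompiler` argument.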

Makefile (+4 -1)

@@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative benchmark-matmult parallel finetune export-lora tests/test-c.o
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o

 # Binaries only useful for tests
 TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama

@@ -543,6 +543,9 @@ main: examples/main/main.cpp build-info.h ggml.
 @echo '==== Run ./main -h for help. ===='
 @echo

+infill: examples/infill/infill.cpp build-info.h ggml.o llama.o common.o console.o grammar-parser.o $(OBJS)
+$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

README.md (+6 -2)

@@ -11,7 +11,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 ### Hot topics

-- Parallel decoding + continuous batching support incoming: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \
+- ‼️ Breaking change: `rope_freq_base` and `rope_freq_scale` must be set to zero to use the model default values: [#3401](https://github.com/ggerganov/llama.cpp/pull/3401)
+- Parallel decoding + continuous batching support added: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \
 **Devs should become familiar with the new API**
 - Local Falcon 180B inference on Mac Studio

@@ -92,7 +93,8 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
 - [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft))
 - [X] [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B) / [AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B)
-- [X] Mistral AI v0.1
+- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
+- [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)

 **Bindings:**

@@ -662,6 +664,8 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \

 The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).

+For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
+
 ### Instruction mode with Alpaca

 1. First, download the `ggml` Alpaca model into the `./models` folder

common/common.cpp (+2)

@@ -389,6 +389,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 params.interactive_first = true;
 } else if (arg == "-ins" || arg == "--instruct") {
 params.instruct = true;
+} else if (arg == "--infill") {
+params.infill = true;
 } else if (arg == "--multiline-input") {
 params.multiline_input = true;
 } else if (arg == "--simple-io") {

common/common.h (+1)

@@ -120,6 +120,7 @@ struct gpt_params {
 bool use_mlock = false; // use mlock to keep model in memory
 bool numa = false; // attempt optimizations that help on some NUMA systems
 bool verbose_prompt = false; // print prompt tokens before generation
+bool infill = false; // use infill mode
 };

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

examples/finetune/finetune.cpp (+6 -3)

@@ -332,8 +332,8 @@ static void init_model(struct llama_model * input, struct my_llama_model * model

 assert_shape_1d(layer.attention_norm, hparams.n_embd);
 assert_shape_2d(layer.wq, hparams.n_embd, hparams.n_embd);
-assert_shape_2d(layer.wk, hparams.n_embd, hparams.n_embd);
-assert_shape_2d(layer.wv, hparams.n_embd, hparams.n_embd);
+assert_shape_2d(layer.wk, hparams.n_embd, hparams.n_embd_gqa());
+assert_shape_2d(layer.wv, hparams.n_embd, hparams.n_embd_gqa());
 assert_shape_2d(layer.wo, hparams.n_embd, hparams.n_embd);
 assert_shape_1d(layer.ffn_norm, hparams.n_embd);
 assert_shape_2d(layer.w1, hparams.n_embd, hparams.n_ff);

@@ -626,7 +626,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(

 // KQ_pos - contains the positions
 struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
-{
+ggml_allocr_alloc(alloc, KQ_pos);
+if (!ggml_allocr_is_measure(alloc)) {
 int * data = (int *) KQ_pos->data;
 for (int i = 0; i < N; ++i) {
 data[i] = n_past + i;

@@ -786,6 +787,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
 ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one));
 GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
 ggml_allocr_alloc(alloc, t36->grad);
+// KQ_pos
+ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, one));

 // make sure base model tensors data cannot be used in viewable operations
 ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, one));

examples/infill/CMakeLists.txt (+8, new file)

@@ -0,0 +1,8 @@
+set(TARGET infill)
+add_executable(${TARGET} infill.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+add_dependencies(${TARGET} BUILD_INFO)
+endif()
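
This file defines the `infill` target, but it only builds once the examples tree registers the new directory; that wiring is not shown in the files above. A hypothetical one-line excerpt of what it would look like in `examples/CMakeLists.txt`:

```cmake
# Hypothetical excerpt from examples/CMakeLists.txt (not among the files shown
# above): registering the directory makes the infill target part of the build.
add_subdirectory(infill)
```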

examples/infill/README.md (+41, new file)

@@ -0,0 +1,41 @@
+# llama.cpp/example/infill
+
+This example shows how to use infill mode with Code Llama models that support it.
+Currently the 7B and 13B models support infill mode.
+
+Infill supports most of the options available in the main example.
+
+For further information, have a look at the main README.md in llama.cpp/example/main/README.md.
+
+## Common Options
+
+In this section, we cover the most commonly used options for running the `infill` program with the LLaMA models:
+
+- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
+- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
+- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
+- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+
+## Input Prompts
+
+The `infill` program provides several ways to interact with the LLaMA models using input prompts:
+
+- `--in-prefix PROMPT_BEFORE_CURSOR`: Provide the prefix directly as a command-line option.
+- `--in-suffix PROMPT_AFTER_CURSOR`: Provide the suffix directly as a command-line option.
+- `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
+
+## Interaction
+
+The `infill` program offers a seamless way to interact with LLaMA models, allowing users to receive real-time infill suggestions. Interactive mode can be triggered with `--interactive` or `--interactive-first`.
+
+### Interaction Options
+
+- `-i, --interactive`: Run the program in interactive mode, allowing users to get real-time code suggestions from the model.
+- `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
+- `--color`: Enable colorized output to visually distinguish between prompts, user input, and generated text.
+
+### Example
+
+```bash
+./infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
+```
