Commit 94daebe
2 parents e5c4193 + 019ba1d

Merge branch 'master' of github.com:ggerganov/llama.cpp into grammar-example

* 'master' of github.com:ggerganov/llama.cpp: (24 commits)
  convert : fix Baichuan2 models by using vocab size in config.json (ggml-org#3299)
  readme : add project status link
  ggml : fix build after ggml-org#3329
  llm : add Refact model (ggml-org#3329)
  sync : ggml (conv 1d + 2d updates, UB fixes) (ggml-org#3468)
  finetune : readme fix typo (ggml-org#3465)
  ggml : add RISC-V Vector Support for K-Quants and improved the existing intrinsics (ggml-org#3453)
  main : consistent prefix/suffix coloring (ggml-org#3425)
  llama : fix session saving/loading (ggml-org#3400)
  llama : expose model's rope_freq_scale in the API (ggml-org#3418)
  metal : alibi for arbitrary number of heads (ggml-org#3426)
  cmake : make LLAMA_NATIVE flag actually use the instructions supported by the processor (ggml-org#3273)
  Work on the BPE tokenizer (ggml-org#3252)
  convert : fix vocab size when not defined in hparams (ggml-org#3421)
  cmake : increase minimum version for add_link_options (ggml-org#3444)
  CLBlast: Add broadcast support for matrix multiplication (ggml-org#3402)
  gguf : add BERT, MPT, and GPT-J arch info (ggml-org#3408)
  gguf : general usability improvements (ggml-org#3409)
  cmake : make CUDA flags more similar to the Makefile (ggml-org#3420)
  finetune : fix ggml-org#3404 (ggml-org#3437)
  ...


50 files changed, +4692 −937 lines

.dockerignore (+3)

@@ -1,6 +1,9 @@
 *.o
 *.a
 .cache/
+.git/
+.github/
+.gitignore
 .vs/
 .vscode/
 .DS_Store

.github/workflows/build.yml (+8 −8)

@@ -188,7 +188,7 @@ jobs:
 sysctl -a
 mkdir build
 cd build
-cmake -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF ..
+cmake ..
 cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

 - name: Test
@@ -265,17 +265,17 @@ jobs:
 matrix:
 include:
 - build: 'noavx'
-defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
+defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
 - build: 'avx2'
-defines: '-DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
 - build: 'avx'
-defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
+defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
 - build: 'avx512'
-defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
 - build: 'clblast'
-defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
+defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
 - build: 'openblas'
-defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'

 steps:
 - name: Clone
@@ -414,7 +414,7 @@ jobs:
 run: |
 mkdir build
 cd build
-cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
+cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
 cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}

 - name: Determine tag name

.gitignore (+3 −1)

@@ -40,6 +40,7 @@ models-mnt
 /embedding
 /gguf
 /gguf-llama-simple
+/infill
 /libllama.so
 /llama-bench
 /main
@@ -90,4 +91,5 @@ tests/test-quantize-perf
 tests/test-sampling
 tests/test-tokenizer-0-llama
 tests/test-tokenizer-0-falcon
-tests/test-tokenizer-1
+tests/test-tokenizer-1-llama
+tests/test-tokenizer-1-bpe

CMakeLists.txt (+43 −23)

@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason
+cmake_minimum_required(VERSION 3.13) # for add_link_options
 project("llama.cpp" C CXX)

 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -44,7 +44,7 @@ endif()

 # general
 option(LLAMA_STATIC "llama: static link libraries" OFF)
-option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
+option(LLAMA_NATIVE "llama: enable -march=native flag" ON)
 option(LLAMA_LTO "llama: enable link time optimization" OFF)

 # debug
@@ -58,15 +58,21 @@ option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer"
 option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)

 # instruction set specific
-option(LLAMA_AVX "llama: enable AVX" ON)
-option(LLAMA_AVX2 "llama: enable AVX2" ON)
-option(LLAMA_AVX512 "llama: enable AVX512" OFF)
-option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
-option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
-option(LLAMA_FMA "llama: enable FMA" ON)
+if (LLAMA_NATIVE)
+set(INS_ENB OFF)
+else()
+set(INS_ENB ON)
+endif()
+
+option(LLAMA_AVX "llama: enable AVX" ${INS_ENB})
+option(LLAMA_AVX2 "llama: enable AVX2" ${INS_ENB})
+option(LLAMA_AVX512 "llama: enable AVX512" OFF)
+option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
+option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
+option(LLAMA_FMA "llama: enable FMA" ${INS_ENB})
 # in MSVC F16C is implied with AVX2/AVX512
 if (NOT MSVC)
-option(LLAMA_F16C "llama: enable F16C" ON)
+option(LLAMA_F16C "llama: enable F16C" ${INS_ENB})
 endif()

 # 3rd party libs
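The net effect of the hunk above is that native CPU tuning becomes opt-out instead of opt-in. A minimal sketch of the resulting behavior, consolidated from this hunk and the -march=native hunk further down (illustrative, not a verbatim copy of the file):

# When LLAMA_NATIVE is ON (the new default), the per-instruction-set options
# default to OFF and -march=native is added instead, so the compiler enables
# whatever the host CPU supports.
if (LLAMA_NATIVE)
    set(INS_ENB OFF)
else()
    set(INS_ENB ON)
endif()

option(LLAMA_AVX  "llama: enable AVX"  ${INS_ENB})
option(LLAMA_AVX2 "llama: enable AVX2" ${INS_ENB})
option(LLAMA_FMA  "llama: enable FMA"  ${INS_ENB})

if (LLAMA_NATIVE)
    add_compile_options(-march=native)   # applied in the non-MSVC x86 branch below
endif()

This is presumably also why the CI workflow above now passes -DLLAMA_NATIVE=OFF for every Windows build variant: without it, the release binaries would be tuned to the runner's own CPU rather than the generic instruction sets each variant name promises.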
@@ -343,8 +349,9 @@ if (LLAMA_MPI)
 set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
 add_compile_definitions(GGML_USE_MPI)
 add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
-set(cxx_flags ${cxx_flags} -Wno-cast-qual)
-set(c_flags ${c_flags} -Wno-cast-qual)
+if (NOT MSVC)
+add_compile_options(-Wno-cast-qual)
+endif()
 set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
 set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
 # Even if you're only using the C header, C++ programs may bring in MPI
@@ -418,10 +425,11 @@ if (LLAMA_ALL_WARNINGS)
 set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int
 -Werror=implicit-function-declaration)
 set(cxx_flags -Wmissing-declarations -Wmissing-noreturn)
+set(host_cxx_flags "")

 if (CMAKE_C_COMPILER_ID MATCHES "Clang")
 set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return)
-set(cxx_flags ${cxx_flags} -Wmissing-prototypes -Wextra-semi)
+set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi)

 if (
 (CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR
@@ -431,27 +439,38 @@ if (LLAMA_ALL_WARNINGS)
 endif()
 elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU")
 set(c_flags ${c_flags} -Wdouble-promotion)
-set(cxx_flags ${cxx_flags} -Wno-array-bounds)
+set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds)

 if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0)
-set(cxx_flags ${cxx_flags} -Wno-format-truncation)
+set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation)
 endif()
 if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0)
-set(cxx_flags ${cxx_flags} -Wextra-semi)
+set(host_cxx_flags ${host_cxx_flags} -Wextra-semi)
 endif()
 endif()
 else()
 # todo : msvc
 endif()

-add_compile_options(
-${warning_flags}
-"$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
-"$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
-)
+set(c_flags ${c_flags} ${warning_flags})
+set(cxx_flags ${cxx_flags} ${warning_flags})
+add_compile_options("$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
+"$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags} ${host_cxx_flags}>")

 endif()

+if (NOT MSVC)
+set(cuda_flags -Wno-pedantic)
+endif()
+set(cuda_flags ${cxx_flags} -use_fast_math ${cuda_flags})
+
+list(JOIN host_cxx_flags " " cuda_host_flags) # pass host compiler flags as a single argument
+if (NOT cuda_host_flags STREQUAL "")
+set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags})
+endif()
+
+add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>")
+
 if (WIN32)
 add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
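One detail of the CUDA hunk just above that is easy to miss: host-compiler warning flags now live in host_cxx_flags and only reach nvcc behind -Xcompiler. A small sketch of that mechanism with hypothetical flag values (the real lists are assembled earlier in the file):

# Hypothetical flag values, for illustration only.
set(cxx_flags      -Wall -Wextra)
set(host_cxx_flags -Wextra-semi -Wno-array-bounds)

# Base CUDA flags: the shared C++ warnings plus nvcc-side options.
set(cuda_flags ${cxx_flags} -use_fast_math -Wno-pedantic)

# Host-compiler-only warnings are joined into a single string and passed behind
# -Xcompiler, so they are handed to the host compiler rather than to nvcc itself.
list(JOIN host_cxx_flags " " cuda_host_flags)
if (NOT cuda_host_flags STREQUAL "")
    set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags})
endif()

# Apply only when compiling CUDA translation units.
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>")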

@@ -491,9 +510,6 @@ if (NOT MSVC)
 if (LLAMA_GPROF)
 add_compile_options(-pg)
 endif()
-if (LLAMA_NATIVE)
-add_compile_options(-march=native)
-endif()
 endif()

 if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
@@ -548,6 +564,9 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
 add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>)
 endif()
 else()
+if (LLAMA_NATIVE)
+add_compile_options(-march=native)
+endif()
 if (LLAMA_F16C)
 add_compile_options(-mf16c)
 endif()
@@ -705,6 +724,7 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR}
 set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
 set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
 set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
+get_directory_property(LLAMA_TRANSIENT_DEFINES COMPILE_DEFINITIONS)

 configure_package_config_file(
 ${CMAKE_CURRENT_SOURCE_DIR}/scripts/LlamaConfig.cmake.in

Makefile (+11 −3)

@@ -1,8 +1,8 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative benchmark-matmult parallel finetune export-lora tests/test-c.o
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o

 # Binaries only useful for tests
-TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama
+TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -62,9 +62,11 @@ test: $(TEST_TARGETS)
 if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
 ./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
 elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
-continue; \
+./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
 elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
 continue; \
+elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
+continue; \
 else \
 echo "Running test $$test_target..."; \
 ./$$test_target; \
@@ -543,6 +545,9 @@ main: examples/main/main.cpp build-info.h ggml.
 @echo '==== Run ./main -h for help. ===='
 @echo

+infill: examples/infill/infill.cpp build-info.h ggml.o llama.o common.o console.o grammar-parser.o $(OBJS)
+$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

@@ -667,6 +672,9 @@ tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h gg
 tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

+tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

README.md (+1 −1)

@@ -5,7 +5,7 @@
 [![Actions Status](https://github.com/ggerganov/llama.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/llama.cpp/actions)
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)

-[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
+[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)

 Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

common/common.cpp (+3)

@@ -389,6 +389,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 params.interactive_first = true;
 } else if (arg == "-ins" || arg == "--instruct") {
 params.instruct = true;
+} else if (arg == "--infill") {
+params.infill = true;
 } else if (arg == "--multiline-input") {
 params.multiline_input = true;
 } else if (arg == "--simple-io") {
@@ -921,6 +923,7 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
 result += piece;
 }

+// NOTE: the original tokenizer decodes bytes after collecting the pieces.
 return result;
 }

common/common.h (+1)

@@ -120,6 +120,7 @@ struct gpt_params {
 bool use_mlock = false; // use mlock to keep model in memory
 bool numa = false; // attempt optimizations that help on some NUMA systems
 bool verbose_prompt = false; // print prompt tokens before generation
+bool infill = false; // use infill mode
 };

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

convert-baichuan-hf-to-gguf.py (+8 −2)

@@ -11,11 +11,14 @@
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
 import itertools
-import gguf
 import numpy as np
 import torch
 from sentencepiece import SentencePieceProcessor # type: ignore[import]

+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
+

 if TYPE_CHECKING:
 from typing import TypeAlias
@@ -174,8 +177,11 @@ def parse_args() -> argparse.Namespace:
 print("gguf: get sentencepiece tokenizer vocab, scores and token types")

 tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
+vocab_size = hparams.get('vocab_size')
+if vocab_size is None:
+    vocab_size = tokenizer.vocab_size()

-for i in range(tokenizer.vocab_size()):
+for i in range(vocab_size):
 text: bytes
 score: float
