Commit df7346c (committed Jun 22, 2023)

Merge 'origin/master' into hipblas

2 parents: 5dd2fbe + 7487137

5 files changed, +92 -57 lines
‎CMakeLists.txt

+9 -16

@@ -251,6 +251,15 @@ if (LLAMA_CUBLAS)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
     endif()

+    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+        if (LLAMA_CUDA_DMMV_F16)
+            set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics
+        else()
+            set(CMAKE_CUDA_ARCHITECTURES "52") # lowest CUDA 12 standard
+        endif()
+    endif()
+    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
+
 else()
     message(WARNING "cuBLAS not found")
 endif()

@@ -525,22 +534,6 @@ if (BUILD_SHARED_LIBS)
     endif()
 endif()

-if (GGML_SOURCES_CUDA)
-    message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
-    set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES "native")
-    set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
-
-    set_property(TARGET ggml_static PROPERTY CUDA_ARCHITECTURES "native")
-    set_property(TARGET ggml_static PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
-
-    if (BUILD_SHARED_LIBS)
-        set_property(TARGET ggml_shared PROPERTY CUDA_ARCHITECTURES "native")
-        set_property(TARGET ggml_shared PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
-    endif()
-
-    set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES "native")
-endif()
-
 #
 # programs, examples and tests

‎README.md

+3 -7

@@ -9,12 +9,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 **Hot topics:**

+- p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1
 - Roadmap June 2023: https://github.com/ggerganov/llama.cpp/discussions/1729
-- GPU support with Metal (Apple Silicon): https://github.com/ggerganov/llama.cpp/pull/1642
-- High-quality 2,3,4,5,6-bit quantization: https://github.com/ggerganov/llama.cpp/pull/1684
-- Multi-GPU support: https://github.com/ggerganov/llama.cpp/pull/1607
-- Training LLaMA models from scratch: https://github.com/ggerganov/llama.cpp/pull/1652
-- CPU threading improvements: https://github.com/ggerganov/llama.cpp/pull/1632

 <details>
 <summary>Table of Contents</summary>

@@ -344,7 +340,7 @@ Building the program with BLAS support may lead to some performance improvements
 | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
 | LLAMA_CUDA_DMMV_Y | Positive integer | 1 | Block size in y direction for the CUDA dequantization + mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
 | LLAMA_CUDA_DMMV_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. |
-| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value 2 1 can improve performance for slow GPUs. |
+| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |

 - #### CLBlast

@@ -378,7 +374,7 @@ Building the program with BLAS support may lead to some performance improvements
   ```sh
   git clone https://github.com/CNugteren/CLBlast.git
   mkdir CLBlast/build
-  cd CLBLast/build
+  cd CLBlast/build
   cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
   cmake --build . --config Release
   cmake --install . --prefix /some/path

‎convert.py

+69 -22

@@ -130,28 +130,76 @@ def make_tensors_list() -> List[str]:
 TENSORS_SET = set(TENSORS_LIST)


+def find_n_mult(n_ff: int, n_embd: int) -> int:
+    # hardcoded magic range
+    for n_mult in range(256, 1, -1):
+        calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
+        if calc_ff == n_ff:
+            return n_mult
+    return 1
+
 @dataclass
 class Params:
     n_vocab: int
     n_embd: int
     n_mult: int
     n_head: int
     n_layer: int
-    file_type: GGMLFileType

     @staticmethod
-    def guessed(model: 'LazyModel', file_type: GGMLFileType) -> 'Params':
-        n_vocab, n_embd = model["tok_embeddings.weight"].shape
+    def guessed(model: 'LazyModel') -> 'Params':
+        # try transformer naming first
+        n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
+
+        # try transformer naming first
+        if "model.layers.0.self_attn.q_proj.weight" in model:
+            n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+        else:
+            n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
+
+        n_head=n_embd // 128 # guessed

         return Params(
             n_vocab=n_vocab,
             n_embd=n_embd,
             n_mult=256,
-            n_head=n_embd // 128,
-            n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model),
-            file_type=file_type,
+            n_head=n_head,
+            n_layer=n_layer,
         )

+    @staticmethod
+    def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
+        config = json.load(open(config_path))
+
+        n_vocab = config["vocab_size"];
+        n_embd = config["hidden_size"];
+        n_head = config["num_attention_heads"];
+        n_layer = config["num_hidden_layers"];
+        n_ff = config["intermediate_size"];
+
+        n_mult = find_n_mult(n_ff, n_embd);
+
+        return Params(
+            n_vocab=n_vocab,
+            n_embd=n_embd,
+            n_mult=n_mult,
+            n_head=n_head,
+            n_layer=n_layer,
+        )
+
+    @staticmethod
+    def load(model_plus: 'ModelPlus') -> 'Params':
+        orig_config_path = model_plus.paths[0].parent / "params.json"
+        hf_transformer_config_path = model_plus.paths[0].parent / "config.json"
+
+        if hf_transformer_config_path.exists():
+            params = Params.loadHFTransformerJson(model_plus.model, hf_transformer_config_path)
+        else:
+            params = Params.guessed(model_plus.model)
+
+        print(f'params: n_vocab:{params.n_vocab} n_embd:{params.n_embd} n_mult:{params.n_mult} n_head:{params.n_head} n_layer:{params.n_layer}')
+        return params
+

 class SentencePieceVocab:
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:

@@ -595,18 +643,17 @@ def load() -> Tensor:
     return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)


-def convert_transformers_to_orig(model: LazyModel) -> LazyModel:
+def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
     out: LazyModel = {}
     out["tok_embeddings.weight"] = model["model.embed_tokens.weight"]
     out["norm.weight"] = model["model.norm.weight"]
     out["output.weight"] = model["lm_head.weight"]

-    n_head = model["model.layers.0.self_attn.q_proj.weight"].shape[1] // 128
     for i in itertools.count():
         if f"model.layers.{i}.self_attn.q_proj.weight" not in model:
             break
-        out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], n_head)
-        out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], n_head)
+        out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
+        out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
         out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
         out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"]

@@ -920,7 +967,7 @@ class OutputFile:
     def __init__(self, fname_out: Path) -> None:
         self.fout = open(fname_out, "wb")

-    def write_file_header(self, params: Params) -> None:
+    def write_file_header(self, params: Params, file_type: GGMLFileType) -> None:
         self.fout.write(b"ggjt"[::-1]) # magic
         values = [
             1, # file version

@@ -930,7 +977,7 @@ def write_file_header(self, params: Params) -> None:
             params.n_head,
             params.n_layer,
             params.n_embd // params.n_head, # rot (obsolete)
-            params.file_type.value,
+            file_type.value,
         ]
         self.fout.write(struct.pack("i" * len(values), *values))

@@ -958,10 +1005,10 @@ def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
         of.fout.close()

     @staticmethod
-    def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab) -> None:
+    def write_all(fname_out: Path, params: Params, file_type: GGMLFileType, model: LazyModel, vocab: Vocab) -> None:
         check_vocab_size(params, vocab)
         of = OutputFile(fname_out)
-        of.write_file_header(params)
+        of.write_file_header(params, file_type)
         print("Writing vocab...")
         of.write_vocab(vocab)

@@ -997,11 +1044,11 @@ def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFi
     raise Exception(f"Unexpected combination of types: {name_to_type}")


-def do_necessary_conversions(model: LazyModel) -> LazyModel:
+def do_necessary_conversions(model: LazyModel, params: Params) -> LazyModel:
     model = handle_quantization(model)

     if "lm_head.weight" in model:
-        model = convert_transformers_to_orig(model)
+        model = convert_transformers_to_orig(model, params)
     model = filter_and_sort_tensors(model)

     return model

@@ -1107,14 +1154,14 @@ def load_vocab(path: Path) -> SentencePieceVocab:
     return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)


-def default_outfile(model_paths: List[Path], params: Params) -> Path:
+def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
     namestr = {
         GGMLFileType.AllF32: "f32",
         GGMLFileType.MostlyF16: "f16",
         GGMLFileType.MostlyQ4_0: "q4_0",
         GGMLFileType.MostlyQ4_1: "q4_1",
         GGMLFileType.PerLayerIsQ4_1: "q4_1",
-    }[params.file_type]
+    }[file_type]
     ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
     if ret in model_paths:
         sys.stderr.write(

@@ -1164,13 +1211,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
         else:
             vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
             vocab = load_vocab(vocab_dir)
+        params = Params.load(model_plus)
         model = model_plus.model
-        model = do_necessary_conversions(model)
+        model = do_necessary_conversions(model, params)
         output_type = pick_output_type(model, args.outtype)
         model = convert_to_output_type(model, output_type)
-        params = Params.guessed(model, output_type)
-        outfile = args.outfile or default_outfile(model_plus.paths, params)
-        OutputFile.write_all(outfile, params, model, vocab)
+        outfile = args.outfile or default_outfile(model_plus.paths, output_type)
+        OutputFile.write_all(outfile, params, output_type, model, vocab)
         print(f"Wrote {outfile}")
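The new Params.load path prefers a Hugging Face config.json (via loadHFTransformerJson) and only falls back to shape-based guessing when no config is present; find_n_mult then recovers the multiple that the original LLaMA feed-forward sizing rule would have used for the reported intermediate_size. Below is a minimal sketch of that inversion, with the helper copied from the hunk above; the LLaMA-7B numbers (hidden_size 4096, intermediate_size 11008) are illustrative assumptions, not values taken from this commit.

```python
def find_n_mult(n_ff: int, n_embd: int) -> int:
    # same hardcoded magic range as in the patch above
    for n_mult in range(256, 1, -1):
        # round 8*n_embd/3 up to the nearest multiple of n_mult
        calc_ff = (((8 * n_embd) // 3 + n_mult - 1) // n_mult) * n_mult
        if calc_ff == n_ff:
            return n_mult
    return 1

# 8*4096 // 3 = 10922; rounded up to a multiple of 256 this becomes 11008,
# which matches intermediate_size, so the first candidate already fits.
print(find_n_mult(n_ff=11008, n_embd=4096))  # -> 256
```

If no candidate between 256 and 2 reproduces n_ff, the helper returns 1, so conversion can still proceed for models that do not follow the LLaMA sizing convention.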

‎llama.cpp

+3 -3

@@ -925,21 +925,21 @@ static bool kv_cache_init(

 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
+        /*.seed =*/ -1,
         /*.n_ctx =*/ 512,
         /*.n_batch =*/ 512,
         /*.gpu_layers =*/ 0,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ {0},
+        /*.progress_callback =*/ nullptr,
+        /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram =*/ false,
-        /*.seed =*/ -1,
         /*.f16_kv =*/ true,
         /*.logits_all =*/ false,
         /*.vocab_only =*/ false,
         /*.use_mmap =*/ true,
         /*.use_mlock =*/ false,
         /*.embedding =*/ false,
-        /*.progress_callback =*/ nullptr,
-        /*.progress_callback_user_data =*/ nullptr,
     };

     return result;

‎llama.h

+8 -9

@@ -71,28 +71,27 @@ extern "C" {

     typedef void (*llama_progress_callback)(float progress, void *ctx);

-    struct llama_context_params {
+    struct llama_context_params {
+        int seed; // RNG seed, -1 for random
         int n_ctx; // text context
         int n_batch; // prompt processing batch size
         int n_gpu_layers; // number of layers to store in VRAM
         int main_gpu; // the GPU that is used for scratch and small tensors
         float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
-        bool low_vram; // if true, reduce VRAM usage at the cost of performance
-        int seed; // RNG seed, -1 for random
+        // called with a progress value between 0 and 1, pass NULL to disable
+        llama_progress_callback progress_callback;
+        // context pointer passed to the progress callback
+        void * progress_callback_user_data;

+        // Keep the booleans together to avoid misalignment during copy-by-value.
+        bool low_vram; // if true, reduce VRAM usage at the cost of performance
         bool f16_kv; // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap; // use mmap if possible
         bool use_mlock; // force system to keep model in RAM
         bool embedding; // embedding mode only
-
-        // called with a progress value between 0 and 1, pass NULL to disable
-        llama_progress_callback progress_callback;
-        // context pointer passed to the progress callback
-        void * progress_callback_user_data;
     };
-
     // model file types
     enum llama_ftype {
         LLAMA_FTYPE_ALL_F32 = 0,
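This reordering, together with the matching initializer order in llama.cpp above, matters to anything that mirrors the struct layout by hand, since llama_context_params is copied by value across the C ABI (the new comment about keeping the booleans together points at exactly that). The ctypes sketch below is an assumption-laden illustration of the new field order, not part of this commit; the library path and the LLAMA_MAX_DEVICES value are placeholders.

```python
import ctypes

LLAMA_MAX_DEVICES = 1  # placeholder; the real value is defined in llama.h

# mirrors: typedef void (*llama_progress_callback)(float progress, void *ctx);
llama_progress_callback = ctypes.CFUNCTYPE(None, ctypes.c_float, ctypes.c_void_p)

class llama_context_params(ctypes.Structure):
    # Field order mirrors the reordered struct above: ints first,
    # then the progress-callback pair, then the booleans kept together.
    _fields_ = [
        ("seed",                        ctypes.c_int),
        ("n_ctx",                       ctypes.c_int),
        ("n_batch",                     ctypes.c_int),
        ("n_gpu_layers",                ctypes.c_int),
        ("main_gpu",                    ctypes.c_int),
        ("tensor_split",                ctypes.c_float * LLAMA_MAX_DEVICES),
        ("progress_callback",           llama_progress_callback),
        ("progress_callback_user_data", ctypes.c_void_p),
        ("low_vram",                    ctypes.c_bool),
        ("f16_kv",                      ctypes.c_bool),
        ("logits_all",                  ctypes.c_bool),
        ("vocab_only",                  ctypes.c_bool),
        ("use_mmap",                    ctypes.c_bool),
        ("use_mlock",                   ctypes.c_bool),
        ("embedding",                   ctypes.c_bool),
    ]

# Hypothetical usage against a locally built library:
# lib = ctypes.CDLL("./libllama.so")  # placeholder path
# lib.llama_context_default_params.restype = llama_context_params
# params = lib.llama_context_default_params()
# assert params.seed == -1 and params.n_ctx == 512
```

A binding declared against the old layout would silently read seed where n_ctx now lives, so out-of-tree wrappers need to be updated alongside this change.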
