 #include <algorithm>
 #include <sstream>
 #include <unordered_set>
+#include <regex>
 
 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/types.h>
@@ -295,6 +296,40 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
 #endif
+        } else if (arg == "--main-gpu" || arg == "-mg") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+#ifdef GGML_USE_CUBLAS
+            params.main_gpu = std::stoi(argv[i]);
+#else
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
+#endif
+        } else if (arg == "--tensor-split" || arg == "-ts") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+#ifdef GGML_USE_CUBLAS
+            std::string arg_next = argv[i];
+
+            // split string by , and /
+            const std::regex regex{R"([,/]+)"};
+            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
+            std::vector<std::string> split_arg{it, {}};
+            GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+
+            for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
+                if (i < split_arg.size()) {
+                    params.tensor_split[i] = std::stof(split_arg[i]);
+                } else {
+                    params.tensor_split[i] = 0.0f;
+                }
+            }
+#else
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
         } else if (arg == "--mtest") {
@@ -438,6 +473,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
     fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
     fprintf(stderr, "                        number of layers to store in VRAM\n");
+    fprintf(stderr, "  -ts SPLIT, --tensor-split SPLIT\n");
+    fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    fprintf(stderr, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
 #endif
     fprintf(stderr, "  --mtest               compute maximum memory usage\n");
     fprintf(stderr, "  --export              export the computation graph to 'llama.ggml'\n");
@@ -483,7 +521,10 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
     auto lparams = llama_context_default_params();
 
     lparams.n_ctx        = params.n_ctx;
+    lparams.n_batch      = params.n_batch;
     lparams.n_gpu_layers = params.n_gpu_layers;
+    lparams.main_gpu     = params.main_gpu;
+    memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
     lparams.seed         = params.seed;
     lparams.f16_kv       = params.memory_f16;
     lparams.use_mmap     = params.use_mmap;
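For context, here is a minimal, self-contained sketch (not part of the patch) of the splitting behavior the --tensor-split parser above relies on: std::sregex_token_iterator with submatch index -1 yields the substrings between matches of [,/]+, and any remaining device slots are zero-filled, just as the loop over LLAMA_MAX_DEVICES does. The helper name parse_tensor_split and the max_devices parameter are illustrative only.

// Illustrative sketch only -- mirrors the --tensor-split parsing above,
// using hypothetical names (parse_tensor_split, max_devices).
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

static std::vector<float> parse_tensor_split(const std::string & arg, size_t max_devices) {
    // -1 as the submatch index selects the text *between* matches of [,/]+,
    // so "3,1" and "3/1" both split into {"3", "1"}.
    const std::regex delim{R"([,/]+)"};
    std::sregex_token_iterator it{arg.begin(), arg.end(), delim, -1};
    std::vector<std::string> parts{it, {}};

    // Proportions for the listed GPUs; remaining slots stay 0.0f,
    // matching the zero-fill loop in gpt_params_parse.
    std::vector<float> split(max_devices, 0.0f);
    for (size_t i = 0; i < max_devices && i < parts.size(); ++i) {
        split[i] = std::stof(parts[i]);
    }
    return split;
}

int main() {
    for (float f : parse_tensor_split("3,1", 4)) {
        printf("%.1f ", f); // prints: 3.0 1.0 0.0 0.0
    }
    printf("\n");
    return 0;
}

With a typical invocation such as ./main -ngl 40 -ts 3,1 -mg 0 (flags per the usage text above), the parsed proportions land in params.tensor_split and are copied into lparams.tensor_split by the memcpy in llama_init_from_gpt_params.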