
Commit 17366df

Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)
* CUDA multi GPU + scratch
* ggml_cuda_compute_forward
* Tensor parallelism
* ggml_cuda_add
* ggml_cuda_rms_norm
* ggml_cuda_silu
* CUDA scratch buffer
* --main-gpu CLI option
1 parent 44f906e commit 17366df


12 files changed: +1207 / -530 lines


examples/common.cpp (+41)

@@ -9,6 +9,7 @@
 #include <algorithm>
 #include <sstream>
 #include <unordered_set>
+#include <regex>
 
 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/types.h>
@@ -295,6 +296,40 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
 #endif
+        } else if (arg == "--main-gpu" || arg == "-mg") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+#ifdef GGML_USE_CUBLAS
+            params.main_gpu = std::stoi(argv[i]);
+#else
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
+#endif
+        } else if (arg == "--tensor-split" || arg == "-ts") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+#ifdef GGML_USE_CUBLAS
+            std::string arg_next = argv[i];
+
+            // split string by , and /
+            const std::regex regex{R"([,/]+)"};
+            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
+            std::vector<std::string> split_arg{it, {}};
+            GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+
+            for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
+                if (i < split_arg.size()) {
+                    params.tensor_split[i] = std::stof(split_arg[i]);
+                } else {
+                    params.tensor_split[i] = 0.0f;
+                }
+            }
+#else
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
         } else if (arg == "--mtest") {
@@ -438,6 +473,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
     fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
     fprintf(stderr, " number of layers to store in VRAM\n");
+    fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n");
+    fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
 #endif
     fprintf(stderr, " --mtest compute maximum memory usage\n");
     fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
@@ -483,7 +521,10 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
     auto lparams = llama_context_default_params();
 
     lparams.n_ctx = params.n_ctx;
+    lparams.n_batch = params.n_batch;
     lparams.n_gpu_layers = params.n_gpu_layers;
+    lparams.main_gpu = params.main_gpu;
+    memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
     lparams.seed = params.seed;
     lparams.f16_kv = params.memory_f16;
     lparams.use_mmap = params.use_mmap;
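
For quick reference while reading the diff, here is a minimal, self-contained sketch of how the new --tensor-split argument is tokenized and padded. It is an illustration only, not code from this commit: kMaxDevices and assert() stand in for LLAMA_MAX_DEVICES and GGML_ASSERT, and "3,1" stands in for whatever the user passes after -ts.

// Illustrative sketch of the --tensor-split parsing added above (not part of the commit).
#include <cassert>
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

int main() {
    constexpr size_t kMaxDevices = 4;          // stands in for LLAMA_MAX_DEVICES
    float tensor_split[kMaxDevices] = {0};     // same shape as gpt_params::tensor_split

    std::string arg_next = "3,1";              // example value passed after -ts

    // split the argument on ',' and '/', as in the diff above
    const std::regex regex{R"([,/]+)"};
    std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
    std::vector<std::string> split_arg{it, {}};
    assert(split_arg.size() <= kMaxDevices);   // stands in for GGML_ASSERT

    // copy the given proportions and zero-fill the remaining devices
    for (size_t i = 0; i < kMaxDevices; ++i) {
        tensor_split[i] = i < split_arg.size() ? std::stof(split_arg[i]) : 0.0f;
    }

    for (size_t i = 0; i < kMaxDevices; ++i) {
        printf("GPU %zu gets proportion %.1f\n", i, tensor_split[i]);
    }
    return 0;
}

With "3,1" this yields proportions 3.0 and 1.0 for GPUs 0 and 1 and 0.0 for the rest; the values are relative proportions, as described in the README changes below.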

examples/common.h (+9, -7)

@@ -21,13 +21,15 @@
 int32_t get_num_physical_cores();
 
 struct gpt_params {
-    int32_t seed = -1; // RNG seed
-    int32_t n_threads = get_num_physical_cores();
-    int32_t n_predict = -1; // new tokens to predict
-    int32_t n_ctx = 512; // context size
-    int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep = 0; // number of tokens to keep from initial prompt
-    int32_t n_gpu_layers = 0; // number of layers to store in VRAM
+    int32_t seed = -1; // RNG seed
+    int32_t n_threads = get_num_physical_cores();
+    int32_t n_predict = -1; // new tokens to predict
+    int32_t n_ctx = 512; // context size
+    int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_keep = 0; // number of tokens to keep from initial prompt
+    int32_t n_gpu_layers = 0; // number of layers to store in VRAM
+    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+    float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
 
     // sampling parameters
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens

examples/main/README.md (+2)

@@ -286,5 +286,7 @@ These options provide extra functionality and customization when running the LLa
 - `--verbose-prompt`: Print the prompt before generating text.
 - `--mtest`: Test the model's functionality by running a series of tests to ensure it's working properly.
 - `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
+- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
 - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
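
As a side note on the "3,2" example in the -ts description above, the split values are relative proportions. The following minimal sketch shows that normalization; it is purely an illustration of the documented behaviour, not code from this commit.

// Illustration of the README example: --tensor-split "3,2" means
// GPU 0 gets 3/(3+2) = 60% of the data and GPU 1 gets 2/(3+2) = 40%.
#include <cstdio>
#include <vector>

int main() {
    std::vector<float> split = {3.0f, 2.0f};   // proportions given via -ts 3,2
    float sum = 0.0f;
    for (float s : split) {
        sum += s;
    }

    for (size_t i = 0; i < split.size(); ++i) {
        printf("GPU %zu: %.0f%% of the data\n", i, 100.0f * split[i] / sum);
    }
    return 0;
}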

examples/server/README.md (+2)

@@ -287,6 +287,8 @@ Test();
 - `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
 - `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
 - `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
+- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
 - `--embedding`: Enable the embedding mode. **Completion function doesn't work in this mode**.
 - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`;
 - `--port`: Set the port to listen. Default: `8080`.

examples/server/server.cpp (+48)

@@ -401,6 +401,10 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params)
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
     fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
     fprintf(stderr, " number of layers to store in VRAM\n");
+    fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n");
+    fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
 #endif
     fprintf(stderr, " -m FNAME, --model FNAME\n");
     fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
@@ -502,6 +506,50 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para
 #else
         fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
         fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+#endif
+    }
+    else if (arg == "--tensor-split" || arg == "-ts")
+    {
+        if (++i >= argc)
+        {
+            invalid_param = true;
+            break;
+        }
+#ifdef GGML_USE_CUBLAS
+        std::string arg_next = argv[i];
+
+        // split string by , and /
+        const std::regex regex{R"([,/]+)"};
+        std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
+        std::vector<std::string> split_arg{it, {}};
+        GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+
+        for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i)
+        {
+            if (i < split_arg.size())
+            {
+                params.tensor_split[i] = std::stof(split_arg[i]);
+            }
+            else
+            {
+                params.tensor_split[i] = 0.0f;
+            }
+        }
+#else
+        fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
+    }
+    else if (arg == "--main-gpu" || arg == "-mg")
+    {
+        if (++i >= argc)
+        {
+            invalid_param = true;
+            break;
+        }
+#ifdef GGML_USE_CUBLAS
+        params.main_gpu = std::stoi(argv[i]);
+#else
+        fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
 #endif
     }
     else