
Commit b925f1f

cuBLAS: fall back to pageable memory if pinned alloc fails (#1233)
* cuBLAS: fall back to pageable memory if pinned alloc fails
* cuBLAS: do not use pinned memory if env variable GGML_CUDA_NO_PINNED is set
1 parent 90b19bd commit b925f1f

File tree

3 files changed, +51 -8 lines

ggml-cuda.cu

+12 -2

@@ -355,8 +355,18 @@ cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src,
 }
 
 void * ggml_cuda_host_malloc(size_t size) {
-    void * ptr;
-    CUDA_CHECK(cudaMallocHost((void **) &ptr, size));
+    if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
+        return nullptr;
+    }
+
+    void * ptr = nullptr;
+    cudaError_t err = cudaMallocHost((void **) &ptr, size);
+    if (err != cudaSuccess) {
+        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+            size/1024.0/1024.0, cudaGetErrorString(err));
+        return nullptr;
+    }
+
     return ptr;
 }
 
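
As a standalone illustration of the change above (not part of ggml), the sketch below tries cudaMallocHost, honors the GGML_CUDA_NO_PINNED environment variable, and warns instead of aborting when the pinned allocation fails. The helper name try_pinned_alloc and the 64 MB test size are made up for this example; a real caller falls back to pageable memory when it receives nullptr.

// Illustrative sketch only; requires the CUDA toolkit to build.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Hypothetical helper mirroring the ggml_cuda_host_malloc behavior.
static void * try_pinned_alloc(size_t size) {
    // user opted out of pinned memory entirely
    if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
        return nullptr;
    }
    void * ptr = nullptr;
    cudaError_t err = cudaMallocHost(&ptr, size);
    if (err != cudaSuccess) {
        // warn instead of aborting; the caller falls back to pageable memory
        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
                size/1024.0/1024.0, cudaGetErrorString(err));
        return nullptr;
    }
    return ptr;
}

int main() {
    const size_t size = 64u * 1024 * 1024; // arbitrary test size
    void * ptr = try_pinned_alloc(size);
    if (ptr) {
        printf("pinned allocation of %zu bytes succeeded\n", size);
        cudaFreeHost(ptr);
    } else {
        printf("pinned allocation unavailable; a caller would use pageable memory\n");
    }
    return 0;
}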

llama-util.h

+38 -4

@@ -395,6 +395,8 @@ struct llama_buffer {
     uint8_t * addr = NULL;
     size_t size = 0;
 
+    llama_buffer() = default;
+
     void resize(size_t size) {
         delete[] addr;
         addr = new uint8_t[size];
@@ -404,27 +406,59 @@
     ~llama_buffer() {
         delete[] addr;
     }
+
+    // disable copy and move
+    llama_buffer(const llama_buffer&) = delete;
+    llama_buffer(llama_buffer&&) = delete;
+    llama_buffer& operator=(const llama_buffer&) = delete;
+    llama_buffer& operator=(llama_buffer&&) = delete;
 };
 
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 struct llama_ctx_buffer {
     uint8_t * addr = NULL;
+    bool is_cuda;
     size_t size = 0;
 
+    llama_ctx_buffer() = default;
+
     void resize(size_t size) {
+        free();
+
+        addr = (uint8_t *) ggml_cuda_host_malloc(size);
         if (addr) {
-            ggml_cuda_host_free(addr);
+            is_cuda = true;
+        }
+        else {
+            // fall back to pageable memory
+            addr = new uint8_t[size];
+            is_cuda = false;
         }
-        addr = (uint8_t *) ggml_cuda_host_malloc(size);
         this->size = size;
     }
 
-    ~llama_ctx_buffer() {
+    void free() {
         if (addr) {
-            ggml_cuda_host_free(addr);
+            if (is_cuda) {
+                ggml_cuda_host_free(addr);
+            }
+            else {
+                delete[] addr;
+            }
         }
+        addr = NULL;
     }
+
+    ~llama_ctx_buffer() {
+        free();
+    }
+
+    // disable copy and move
+    llama_ctx_buffer(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer(llama_ctx_buffer&&) = delete;
+    llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
 };
 #else
 typedef llama_buffer llama_ctx_buffer;
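
The resize/free pattern in llama_ctx_buffer can be exercised without CUDA by stubbing out the pinned allocator. Below is a minimal sketch under that assumption; fallback_buffer and stub_pinned_malloc are hypothetical names, not part of llama.cpp, and the stub always fails so the pageable path is taken.

// Minimal sketch of the pinned-or-pageable buffer pattern; builds without CUDA.
#include <cstdint>
#include <cstdio>
#include <cstring>

// Stand-in for ggml_cuda_host_malloc; always fails here, as it would with
// GGML_CUDA_NO_PINNED set or when page-locked memory is exhausted.
static void * stub_pinned_malloc(size_t /*size*/) {
    return nullptr;
}

struct fallback_buffer {
    uint8_t * addr = nullptr;
    bool      is_cuda = false;
    size_t    size = 0;

    fallback_buffer() = default;

    void resize(size_t new_size) {
        free();
        addr = (uint8_t *) stub_pinned_malloc(new_size);
        if (addr) {
            is_cuda = true;               // page-locked host memory
        } else {
            addr = new uint8_t[new_size]; // fall back to pageable memory
            is_cuda = false;
        }
        size = new_size;
    }

    void free() {
        if (addr && !is_cuda) {
            delete[] addr;                // pinned memory would use ggml_cuda_host_free
        }
        addr = nullptr;
    }

    ~fallback_buffer() { free(); }

    // non-copyable and non-movable, like llama_ctx_buffer
    fallback_buffer(const fallback_buffer &) = delete;
    fallback_buffer & operator=(const fallback_buffer &) = delete;
};

int main() {
    fallback_buffer buf;
    buf.resize(16 * 1024 * 1024);
    memset(buf.addr, 0, buf.size);
    printf("allocated %zu bytes, pinned: %s\n", buf.size, buf.is_cuda ? "yes" : "no");
    return 0;
}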

llama.cpp

+1 -2

@@ -727,8 +727,7 @@ struct llama_model_loader {
                 LLAMA_ASSERT(offset == lt.size);
             } else if (lt.split_type == SPLIT_BY_COLUMNS) {
                 // Let's load the data into temporary buffers to ensure the OS performs large loads.
-                std::vector<llama_buffer> tmp_bufs;
-                tmp_bufs.resize(lt.shards.size());
+                std::vector<llama_buffer> tmp_bufs(lt.shards.size());
                 for (size_t i = 0; i < lt.shards.size(); i++) {
                     llama_load_tensor_shard & shard = lt.shards.at(i);
                     llama_file & file = file_loaders.at(shard.file_idx)->file;
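
Side note on the llama.cpp change: it follows from deleting llama_buffer's copy and move operations in llama-util.h. std::vector<T>::resize requires a move-insertable T, while the sized constructor only default-constructs elements in place. A small sketch of that constraint, using a hypothetical noncopyable stand-in:

// Sketch of why the loader constructs the vector with a size instead of
// calling resize(). noncopyable mimics llama_buffer after this commit.
#include <vector>

struct noncopyable {
    noncopyable() = default;
    noncopyable(const noncopyable &) = delete;
    noncopyable & operator=(const noncopyable &) = delete;
    noncopyable(noncopyable &&) = delete;
    noncopyable & operator=(noncopyable &&) = delete;
};

int main() {
    std::vector<noncopyable> bufs(4); // OK: elements are default-constructed in place
    // bufs.resize(8);                // would not compile: resize needs a movable/copyable T
    (void) bufs;
    return 0;
}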
