Commit 1b55412

Memory optimizations (ggml-org#44)

* RAM usage reduction and calculations
  - Removed the -b batch limit of 1024 (tested up to -b 8192)
  - Fixed an integer overflow in ggml matmul (happened at around n_batch 3000)
  - Added a dynamic calculation for batched scratch memory consumption
  - Overall reduced RAM buffer sizes by orders of magnitude for normal settings
  - RAM usage scales quadratically with increasing context size
* batch
  - Using a small batch size (or the default of 1) results in a very small memory footprint, even at thousands of tokens processed
  - Tested up to a 13,000-token prompt with an 8k batch; needs more tests on various platforms
* removed debug
* minor
1 parent 8c02206 commit 1b55412

8 files changed: +289 −150 lines
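The matmul overflow named in the commit message is a 32-bit truncation of byte strides. A standalone illustration with hypothetical tensor sizes (roughly an attention-score tensor for a ~3000-token batch on a 128-head model); on typical platforms the int copy silently wraps:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    // ne = { 3000, 3000, 128 }, f32 elements: the outermost byte stride
    const size_t nb3 = (size_t)3000 * 3000 * 128 * sizeof(float); // 4,608,000,000

    const int nb3_truncated = (int)nb3; // pre-fix behavior: stride stored in an int

    printf("size_t stride: %zu bytes\n", nb3);          // 4608000000
    printf("int stride:    %d bytes\n", nb3_truncated); // 313032704 -- silently wrong
    return 0;
}

This is exactly the failure mode addressed by the int-to-size_t stride change in ggml.c below.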

examples/falcon/falcon_main.cpp (+22 −20)

@@ -151,6 +151,7 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+
 #if defined(GGML_USE_CUBLAS)
     // wait for cublas and show device information
     {
@@ -168,14 +169,15 @@
     // determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters
     // uncomment the "used_mem" line in llama.cpp to see the results
     if (params.mem_test) {
+        falcon_prepare_buffers(ctx, params.n_batch, params.n_ctx);
         {
-            const std::vector<llama_token> tmp(params.n_batch, falcon_token_bos());
-            falcon_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads,params.debug_timings);
+            const std::vector<falcon_token> tmp((int)params.n_batch, falcon_token_bos());
+            falcon_eval(ctx, tmp.data(), (int)tmp.size(), 0, params.n_threads,params.debug_timings);
         }
 
         {
-            const std::vector<llama_token> tmp = { 0, };
-            falcon_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads,params.debug_timings);
+            const std::vector<falcon_token> tmp = { 0, };
+            falcon_eval(ctx, tmp.data(), (int)tmp.size(), params.n_predict - 1, params.n_threads,params.debug_timings);
         }
 
         falcon_print_timings(ctx);
@@ -193,7 +195,7 @@ int main(int argc, char ** argv) {
     }
 
     std::string path_session = params.path_prompt_cache;
-    std::vector<llama_token> session_tokens;
+    std::vector<falcon_token> session_tokens;
 
     if (!path_session.empty()) {
         fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
@@ -219,7 +221,7 @@
     }
 
     // tokenize the prompt
-    std::vector<llama_token> embd_inp;
+    std::vector<falcon_token> embd_inp;
 
     if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
         // Falcon does not have a dedicated bos token (bos==eos), so don't inject it here
@@ -239,11 +241,11 @@
         fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
         return 1;
     }
-
+    falcon_prepare_buffers(ctx, params.n_batch, embd_inp.size()+1);
     // debug message about similarity of saved session, if applicable
     size_t n_matching_session_tokens = 0;
     if (session_tokens.size()) {
-        for (llama_token id : session_tokens) {
+        for (falcon_token id : session_tokens) {
             if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
                 break;
             }
@@ -275,8 +277,8 @@
     }
 
     // prefix & suffix for instruct mode
-    std::vector<llama_token> inp_pfx;
-    std::vector<llama_token> inp_sfx;
+    std::vector<falcon_token> inp_pfx;
+    std::vector<falcon_token> inp_sfx;
 
     // in instruct mode, we inject a prefix and a suffix to each input by the user
     if (params.instruct) {
@@ -294,7 +296,7 @@
 
     // determine newline token
     //auto llama_token_newline = ::falcon_tokenize(ctx, "\n", false);
-    auto llama_token_newline = std::vector<llama_token>(193);
+    auto llama_token_newline = std::vector<falcon_token>(193);
 
     if (params.verbose_prompt) {
         fprintf(stderr, "\n");
@@ -359,7 +361,7 @@ fprintf(stderr, "| | %5d | %.3f | %.3f | %.3f | %5d | %.3f | %.3f | %
     fprintf(stderr, "+============+=======+=======+=======+=======+=======+=======+-------+-------+------+------+--------+---------+\n");
 
     fprintf(stderr, "| %10s | %5s | %5s | %5s | %5s | %13s |\n",
-        "Generation", "Ctx", "Batch", "Keep","Prmpt","Seed");
+        "Generation", "Ctx", "Batch", "Keep","Prom.","Seed");
     fprintf(stderr, "+------------+-------+-------+-------+-------+---------------+\n");
     fprintf(stderr, "| | %5d | %5d | %5d | %5zu | %13d |\n",
         n_ctx, params.n_batch, params.n_keep, embd_inp.size(),params.seed);
@@ -372,7 +374,7 @@ fprintf(stderr, "+------------+-------+-------+-------+-------+---------------+\
     fprintf(stderr, "\n\n");
 
     // TODO: replace with ring-buffer
-    std::vector<llama_token> last_n_tokens(n_ctx);
+    std::vector<falcon_token> last_n_tokens(n_ctx);
     std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
 
     if (params.interactive) {
@@ -406,11 +408,11 @@ fprintf(stderr, "+------------+-------+-------+-------+-------+---------------+\
     // the first thing we will do is to output the prompt, so set color accordingly
     console_set_color(con_st, CONSOLE_COLOR_PROMPT);
 
-    std::vector<llama_token> embd;
+    std::vector<falcon_token> embd;
 
     // do one empty run to warm up the model
     {
-        const std::vector<llama_token> tmp = { falcon_token_bos(), };
+        const std::vector<falcon_token> tmp = { falcon_token_bos(), };
         falcon_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads,0);
         llama_reset_timings(ctx);
     }
@@ -521,7 +523,7 @@ fprintf(stderr, "+------------+-------+-------+-------+-------+---------------+\
         llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
     }
 
-    llama_token id = 0;
+    falcon_token id = 0;
 
     {
         auto logits = falcon_get_logits(ctx);
@@ -532,13 +534,13 @@ fprintf(stderr, "+------------+-------+-------+-------+-------+---------------+\
             logits[it->first] += it->second;
         }
 
-        std::vector<llama_token_data> candidates;
+        std::vector<falcon_token_data> candidates;
         candidates.reserve(n_vocab);
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        for (falcon_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(falcon_token_data{token_id, logits[token_id], 0.0f});
        }
 
-        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+        falcon_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
 
         // Apply penalties
        float nl_logit = logits[falcon_token_nl()];
examples/falcon_common.cpp (+4 −4)

@@ -265,7 +265,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             break;
         }
         params.n_batch = std::stoi(argv[i]);
-        params.n_batch = std::min(1024+128, params.n_batch); // appears to work fine with scratch buffer, keep in eye
+        // params.n_batch = std::min(1024+128, params.n_batch); // appears to work fine with scratch buffer, keep in eye
     } else if (arg == "--keep") {
         if (++i >= argc) {
             invalid_param = true;
@@ -428,7 +428,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             break;
         }
         std::stringstream ss(argv[i]);
-        llama_token key;
+        falcon_token key;
         char sign;
         std::string value_str;
         try {
@@ -602,9 +602,9 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
 }
 
 // TODO: not great allocating this every time
-std::vector<llama_token> falcon_tokenize(struct falcon_context * ctx, const std::string & text, bool add_bos) {
+std::vector<falcon_token> falcon_tokenize(struct falcon_context * ctx, const std::string & text, bool add_bos) {
     // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
-    std::vector<llama_token> res(text.size() + (int) add_bos);
+    std::vector<falcon_token> res(text.size() + (int) add_bos);
     const int n = falcon_tokenize(ctx, text.c_str(), res.data(), static_cast<int>(res.size()), add_bos);
     assert(n >= 0);
     res.resize(n);
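For reference, a minimal call-site sketch for the retyped tokenizer wrapper; the helper name is hypothetical and a loaded context is assumed:

#include <cstdio>
#include <string>
#include <vector>
#include "falcon_common.h"

// Hypothetical helper: count the tokens of a prompt via the falcon_token wrapper.
// Assumes `ctx` came from the normal falcon model-loading path.
static size_t count_prompt_tokens(struct falcon_context * ctx, const std::string & prompt) {
    // Falcon has no dedicated bos token (bos == eos), so none is injected here
    const std::vector<falcon_token> tokens = falcon_tokenize(ctx, prompt, /*add_bos=*/false);
    fprintf(stderr, "prompt tokenized to %zu tokens\n", tokens.size());
    return tokens.size();
}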

examples/falcon_common.h (+2 −2)

@@ -36,7 +36,7 @@ struct gpt_params {
     // int mb_reserve_gpu_other = false; // override reserved megabytes of VRAM for secondary GPUs
 
     // sampling parameters
-    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
+    std::unordered_map<falcon_token, float> logit_bias; // logit bias for specific tokens
     int32_t top_k = 40; // <= 0 to use vocab size
     float   top_p = 0.95f; // 1.0 = disabled
     float   tfs_z = 1.00f; // 1.0 = disabled
@@ -93,7 +93,7 @@ std::string gpt_random_prompt(std::mt19937 & rng);
 // Vocab utils
 //
 
-std::vector<llama_token> falcon_tokenize(struct falcon_context * ctx, const std::string & text, bool add_bos);
+std::vector<falcon_token> falcon_tokenize(struct falcon_context * ctx, const std::string & text, bool add_bos);
 
 //
 // Model utils

ggml-cuda.cu (+9 −9)

@@ -1944,30 +1944,30 @@ void ggml_cuda_update_gpu_status(int device_id) {
 }
 void ggml_cuda_print_gpu_status(const GPUStatus *status, bool print_summary) {
     if (status == NULL) {
-        printf("Error: Invalid GPU status pointer.\n");
+        fprintf(stderr,"Error: Invalid GPU status pointer.\n");
         return;
     }
 
     const char *divider = "+----+------------------------------------+------------+-----------+-----------+-----------+-----------+";
-    printf("%s\n", divider);
-    printf("| ID | %-25s %2d found | %10s | %9s | %9s | %9s | %9s |\n", "Device", status->num_devices, "VRAM Total", "VRAM Free", "VRAM Used","Split at ", "Device");
-    printf("%s\n", divider);
+    fprintf(stderr,"%s\n", divider);
+    fprintf(stderr,"| ID | %-25s %2d found | %10s | %9s | %9s | %9s | %9s |\n", "Device", status->num_devices, "VRAM Total", "VRAM Free", "VRAM Used","Split at ", "Device");
+    fprintf(stderr,"%s\n", divider);
 
     for (int i = 0; i < status->num_devices; ++i) {
         const struct cudaDeviceProp *prop = &status->device_props[i];
         size_t vram_used = status->device_vram_total[i] - status->device_vram_free[i];
         float split_at_percentage = g_tensor_split[i] * 100;
-        printf("| %2d | %-34s | %7zu MB | %6zu MB | %6zu MB | %8.1f%% | %9s |\n",
+        fprintf(stderr,"| %2d | %-34s | %7zu MB | %6zu MB | %6zu MB | %8.1f%% | %9s |\n",
             i,prop->name, status->device_vram_total[i] / (1024 * 1024), status->device_vram_free[i] / (1024 * 1024), vram_used / (1024 * 1024),split_at_percentage, (i == status->main_device_id) ? "Primary" : "Secondary");
         // printf("%s\n", divider);
     }
     if (print_summary && status->num_devices > 1)
     {
-        printf("%s\n", divider);
-        printf("| | %-34s | %7zu MB | %6zu MB | %6zu MB | %9s | %9s |\n",
+        fprintf(stderr,"%s\n", divider);
+        fprintf(stderr,"| | %-34s | %7zu MB | %6zu MB | %6zu MB | %9s | %9s |\n",
             "Device summary", status->total_vram / (1024 * 1024), status->total_free_vram / (1024 * 1024), (status->total_vram - status->total_free_vram) / (1024 * 1024), "N/A", "All");
     }
-    printf("%s\n", divider);
+    fprintf(stderr,"%s\n", divider);
 
 }
 
@@ -2090,7 +2090,7 @@ void * ggml_cuda_host_malloc(size_t size) {
         // The allocation error can be bypassed. A null ptr will assigned out of this function.
         // This can fixed the OOM error in WSL.
         cudaGetLastError();
-        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned (CUDA optimized) memory: %s\n",
             size/1024.0/1024.0, cudaGetErrorString(err));
         return nullptr;
     }
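The reworded warning sits inside ggml_cuda_host_malloc's fallback path. A simplified sketch of that pattern, not the verbatim function:

#include <cuda_runtime.h>
#include <stdio.h>

// Try pinned (page-locked) host memory first; on failure, clear the sticky
// CUDA error and return NULL so the caller can fall back to pageable memory.
static void * host_malloc_pinned(size_t size) {
    void * ptr = NULL;
    cudaError_t err = cudaMallocHost(&ptr, size);
    if (err != cudaSuccess) {
        cudaGetLastError(); // reset the error state so later CUDA calls are unaffected
        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned (CUDA optimized) memory: %s\n",
                size/1024.0/1024.0, cudaGetErrorString(err));
        return NULL;
    }
    return ptr;
}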

ggml.c (+51 −20)

@@ -4539,7 +4539,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
         size_needed += GGML_TENSOR_SIZE;
 
         if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
-            GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
+            GGML_PRINT("\n%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
                 __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
             assert(false);
             return NULL;
@@ -4552,14 +4552,14 @@ struct ggml_tensor * ggml_new_tensor_impl(
         };
     } else {
         if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
-            GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
+            GGML_PRINT("\n%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
                 __func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
             assert(false);
             return NULL;
         }
 
         if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
-            GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
+            GGML_PRINT("\n%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
                 __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
             assert(false);
             return NULL;
@@ -10921,7 +10921,7 @@ static void ggml_compute_forward_mul_mat_f32(
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
 
-    const int64_t ne10 = src1->ne[0];
+    // const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
     const int64_t ne12 = src1->ne[2];
     const int64_t ne13 = src1->ne[3];
@@ -10931,20 +10931,20 @@ static void ggml_compute_forward_mul_mat_f32(
     const int64_t ne2 = dst->ne[2];
     const int64_t ne3 = dst->ne[3];
 
-    const int nb00 = src0->nb[0];
-    const int nb01 = src0->nb[1];
-    const int nb02 = src0->nb[2];
-    const int nb03 = src0->nb[3];
+    const size_t nb00 = src0->nb[0];
+    const size_t nb01 = src0->nb[1];
+    const size_t nb02 = src0->nb[2];
+    const size_t nb03 = src0->nb[3];
 
-    const int nb10 = src1->nb[0];
-    const int nb11 = src1->nb[1]; UNUSED(nb11);
-    const int nb12 = src1->nb[2]; UNUSED(nb12);
-    const int nb13 = src1->nb[3]; UNUSED(nb13);
+    const size_t nb10 = src1->nb[0];
+    const size_t nb11 = src1->nb[1]; UNUSED(nb11);
+    const size_t nb12 = src1->nb[2]; UNUSED(nb12);
+    const size_t nb13 = src1->nb[3]; UNUSED(nb13);
 
-    const int nb0 = dst->nb[0];
-    const int nb1 = dst->nb[1];
-    const int nb2 = dst->nb[2];
-    const int nb3 = dst->nb[3];
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+    const size_t nb2 = dst->nb[2];
+    const size_t nb3 = dst->nb[3];
 
     const int ith = params->ith;
     const int nth = params->nth;
@@ -10962,7 +10962,17 @@ static void ggml_compute_forward_mul_mat_f32(
     GGML_ASSERT(nb0 == sizeof(float));
     GGML_ASSERT(nb0 <= nb1);
     GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
+    if (!(nb2 <= nb3) )
+    {
+        fprintf(stderr,"nb2=%zu nb3=%zu Assert condition is %s\n",nb2,nb3,nb2 <= nb3 ? "true" : "false");
+        ggml_tensor_printf(dst,"",0,true,false);
+
+    }
+    GGML_ASSERT(nb2 <= nb3);
+
+
+
+
 
     //GGML_ASSERT(ne0 == ne01);
     //GGML_ASSERT(ne1 == ne11);
@@ -12840,7 +12850,7 @@ static void ggml_compute_forward_rope_f32(
     const size_t nb2 = dst->nb[2];
     const size_t nb3 = dst->nb[3];
 
-    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
+    // printf("%s[%d] ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", dst->name,dst->meta.layer_id,ne0, ne1, ne2, ne3);
     //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
 
     GGML_ASSERT(nb00 == sizeof(float));
@@ -19525,7 +19535,7 @@ void ggml_printTensorSample(char *prefix,const struct ggml_tensor * tensor) {
     printf("%s", sep);
     printf("| Content of %s \"%s\" (%d dim)",prefix,tensor->name,tensor->n_dims);
     printf("\n");
-    const int max_elements = 4;
+    const int max_elements = 40000;
 
     if (tensor->n_dims == 1) {
         printf("| ");
@@ -19595,7 +19605,7 @@ void ggml_tensor_printf(const struct ggml_tensor *tensor, char *prefix, int line
     */
     {
         pos = 0;
-        for (int i = 0; i < tensor->n_dims; i++) {
+        for (int i = 0; i <= tensor->n_dims; i++) {
             pos += snprintf(strides + pos, sizeof(strides) - pos, "%" PRId64, tensor->nb[i]);
             if (i != tensor->n_dims - 1) {
                 pos += snprintf(strides + pos, sizeof(strides) - pos, "x");
@@ -19683,7 +19693,28 @@ void ggml_tensor_printf(const struct ggml_tensor *tensor, char *prefix, int line
     }
     printf("%s\n", sep_border);
 }
+float ggml_get_tensor_index(const struct ggml_tensor* tensor, int ind1, int ind2, int ind3, int ind4) {
+    if (tensor->n_dims < 1 || tensor->n_dims > 4) {
+        printf("Error: Incorrect dimension number %d\n", tensor->n_dims);
+        return -1; // handle error
+    }
+
+    int indices[4] = {ind1, ind2, ind3, ind4};
+    int total_offset = 0;
+
+    for (int i = 0; i < tensor->n_dims; i++) {
+        if (indices[i] > tensor->ne[i] || indices[i] < 0) {
+            printf("Error: Incorrect index for dimension %d\n", i);
+            printf("Index: %d, Dimension size: %ld\n", indices[i], tensor->ne[i]);
+            return -1; // handle error
+        }
+
+        total_offset += indices[i] * tensor->nb[i];
+    }
 
+    // Return the value at the calculated offset
+    return *(float *)((char *) tensor->data + total_offset);
+}
 ////////////////////////////////////////////////////////////////////////////////
 
 int ggml_cpu_has_avx(void) {
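A small usage sketch for the new debugging helper; the toy tensor and fill value are hypothetical, assuming the usual ggml context API:

#include <stdio.h>
#include "ggml.h"

int main(void) {
    // small scratch context for a toy tensor
    struct ggml_init_params params = { 16*1024*1024, NULL, false };
    struct ggml_context * ctx = ggml_init(params);

    // 4 columns x 3 rows of f32, filled with a known value
    struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);
    ggml_set_f32(t, 1.5f);

    // ind1..ind4 are per-dimension element indices; nb[] byte strides are applied internally
    printf("t[2,1] = %f\n", ggml_get_tensor_index(t, 2, 1, 0, 0));

    ggml_free(ctx);
    return 0;
}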

ggml.h (+2)

@@ -1345,6 +1345,8 @@ extern "C" {
 
     // visualize the tensor - extended adds more information - when printing sample content extended will also print src0 and src1 content
     void ggml_tensor_printf(const struct ggml_tensor *tensor, char *prefix, int line, bool extended, bool print_sample);
+    // helper to accessa specific single index value (tested for fp32 only, though nb[] is considered)
+    float ggml_get_tensor_index(const struct ggml_tensor* tensor, int ind1, int ind2, int ind3, int ind4);
 
     //
     // optimization
