From 5010b6ae848d30efa06af2b6d9eff07b8224b1bf Mon Sep 17 00:00:00 2001
From: John
Date: Mon, 10 Apr 2023 05:14:35 +0200
Subject: [PATCH 1/4] Adds _PRELOAD_MMAP_FILE flag to fully preload the model
 even when using mmap()

This brings back consistency, so benchmarking token inference no longer
depends on SSD/disk speed.
---
 llama_util.h | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/llama_util.h b/llama_util.h
index d68f49bd239a4..c7f165654ab8a 100755
--- a/llama_util.h
+++ b/llama_util.h
@@ -156,7 +156,35 @@ static std::string llama_format_win_err(DWORD err) {
 struct llama_mmap {
     void * addr;
     size_t size;
+    void preload_mmap_file(void *addr, size_t length)
+    {
+        #ifndef _PRELOAD_MMAP_FILE
+        return;
+        #endif
+        // Get the page size of the system
+        #if defined(_WIN32)
+        SYSTEM_INFO si;
+        GetSystemInfo(&si);
+        long page_size = si.dwPageSize;
+        #else
+        long page_size = sysconf(_SC_PAGE_SIZE); // on Windows, GetSystemInfo is used above
+        #endif
+        if (page_size == -1)
+        {
+            perror("sysconf");
+            return;
+        }
+
+        // Loop over the mapped file, jumping by page size
+        for (size_t i = 0; i < length; i += page_size)
+        {
+            // Dereference the pointer at each page boundary
+            volatile char c = ((char *)addr)[i];
+            // Force the compiler to not optimize the loop away:
+            (void)c; // use the value of 'c' to avoid compiler warnings and ensure the loop is not optimized away
+        }
+    }
 
     llama_mmap(const llama_mmap &) = delete;
 
 #ifdef _POSIX_MAPPED_FILES
@@ -180,6 +208,8 @@ struct llama_mmap {
             fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
                     strerror(errno));
         }
+        // if _PRELOAD_MMAP_FILE is defined, this will preload the file into the page cache efficiently
+        preload_mmap_file(addr, file->size);
     }
 
     ~llama_mmap() {
@@ -217,6 +247,9 @@ struct llama_mmap {
             fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
                     llama_format_win_err(GetLastError()).c_str());
         }
+
+        // if _PRELOAD_MMAP_FILE is defined, this will preload the file into the page cache efficiently
+        preload_mmap_file(addr, file->size);
     }
 
     ~llama_mmap() {

From 56b6fa5397ed3cf041051e5ad203977585bb2ee2 Mon Sep 17 00:00:00 2001
From: John
Date: Mon, 10 Apr 2023 05:23:53 +0200
Subject: [PATCH 2/4] Linux will need unistd.h

---
 llama_util.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llama_util.h b/llama_util.h
index c7f165654ab8a..b6e68e16017a4 100755
--- a/llama_util.h
+++ b/llama_util.h
@@ -30,6 +30,8 @@
     #include <windows.h>
     #include <io.h>
     #include <stdio.h> // for _fseeki64
+#else
+    #include <unistd.h>
 #endif
 
 #define LLAMA_ASSERT(x) \
From f4c1c6b97a63a373833707b1a806b652dc49e1d1 Mon Sep 17 00:00:00 2001
From: John
Date: Tue, 11 Apr 2023 00:28:04 +0200
Subject: [PATCH 3/4] Updated preloader to use multithreading - currently set
 to 50% of the available threads on the system

Tested on Windows - a small performance hit during loading is not avoidable,
but this is the best possible solution.
On Linux
---
 llama_util.h | 102 ++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 92 insertions(+), 10 deletions(-)

diff --git a/llama_util.h b/llama_util.h
index b6e68e16017a4..2d7448bde3e67 100755
--- a/llama_util.h
+++ b/llama_util.h
@@ -3,7 +3,7 @@
 #ifndef LLAMA_UTIL_H
 #define LLAMA_UTIL_H
-
+#define _PRELOAD_MMAP_FILE 1 // when using mmap, preload the entire file to prevent loading during first token inference
 #include
 #include
 #include
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -30,8 +31,34 @@
     #include <windows.h>
     #include <io.h>
     #include <stdio.h> // for _fseeki64
+    typedef volatile LONG atomic_int;
+    typedef atomic_int atomic_bool;
+
+    typedef HANDLE pthread_t;
+    typedef DWORD thread_ret_t;
+
+    static int pthread_create(pthread_t *out, void *unused, thread_ret_t (*func)(void *), void *arg)
+    {
+        HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, arg, 0, NULL);
+        if (handle == NULL)
+        {
+            return EAGAIN;
+        }
+
+        *out = handle;
+        return 0;
+    }
+
+    static int pthread_join(pthread_t thread, void *unused)
+    {
+        return (int)WaitForSingleObject(thread, INFINITE);
+    }
 #else
     #include <unistd.h>
+    #include <pthread.h>
+    #include <stdatomic.h>
+
+    typedef void *thread_ret_t;
 #endif
 
 #define LLAMA_ASSERT(x) \
@@ -158,7 +185,28 @@ static std::string llama_format_win_err(DWORD err) {
 struct llama_mmap {
     void * addr;
     size_t size;
-    void preload_mmap_file(void *addr, size_t length)
+    typedef struct
+    {
+        size_t start;
+        size_t end;
+        void *addr;
+        int n_threads;
+        int n_thread;
+        int page_size;
+    } thread_data_t;
+    static thread_ret_t worker_preload_memory(void *arg)
+    {
+        thread_data_t *data = (thread_data_t *)arg;
+        volatile char buffer;
+        for (size_t offset = data->start + data->n_thread * data->page_size; offset <= data->end; offset += data->n_threads * data->page_size)
+        {
+            volatile void *buffer_ptr = &buffer;
+            memcpy((void *)buffer_ptr, (char *)data->addr + offset, sizeof(buffer));
+            if (data->n_threads < data->n_thread && buffer == 0) exit(-1); // to avoid compiler optimization - the previous simple access method did not work in thread workers
+        }
+        return NULL;
+    }
+    void preload_mmap_file(void *addr, size_t length, int n_threads)
     {
         #ifndef _PRELOAD_MMAP_FILE
         return;
@@ -177,14 +225,48 @@ struct llama_mmap {
             perror("sysconf");
             return;
         }
-
-        // Loop over the mapped file, jumping by page size
-        for (size_t i = 0; i < length; i += page_size)
+        HANDLE hProcess = GetCurrentProcess();
+        WIN32_MEMORY_RANGE_ENTRY range;
+        range.VirtualAddress = addr;
+        range.NumberOfBytes = length;
+        // if (!VirtualLock(addr, length)) { }; // no benefit. For systems with too little RAM we should lock a part and restrict the preload to that new length
+        if (!PrefetchVirtualMemory(hProcess, 1, &range, 0)) { }; // Prefetches part of the data and signals readahead to the file system
+
+        if (n_threads > 32)
+            n_threads = 32;
+        pthread_t threads[32];
+        thread_data_t thread_data[32];
+
+        // we split the pages between the threads - that was the only reliable solution I could find
+        size_t num_pages_per_thread = (length / page_size) / n_threads;
+        int pages = ceil(length / page_size);
+        for (int page_start = 0; page_start < pages; page_start += n_threads * num_pages_per_thread)
         {
-            // Dereference the pointer at each page boundary
-            volatile char c = ((char *)addr)[i];
-            // Force the compiler to not optimize the loop away:
-            (void)c; // use the value of 'c' to avoid compiler warnings and ensure the loop is not optimized away
+            size_t chunk_start = page_start * page_size;
+            size_t chunk_end = chunk_start + page_size * n_threads * num_pages_per_thread;
+            for (int i = 0; i < n_threads; ++i)
+            {
+                thread_data[i].start = chunk_start;
+                thread_data[i].end = chunk_end;
+                if (thread_data[i].end > length)
+                {
+                    thread_data[i].end = length;
+                }
+                thread_data[i].addr = addr;
+                thread_data[i].page_size = page_size;
+                thread_data[i].n_threads = n_threads;
+                thread_data[i].n_thread = i;
+                pthread_create(&threads[i], NULL, worker_preload_memory, &thread_data[i]);
+                if (thread_data[i].end == length)
+                    break;
+            }
+
+            for (int i = 0; i < n_threads; ++i)
+            {
+                pthread_join(threads[i], NULL);
+            }
+        }
     }
 
     llama_mmap(const llama_mmap &) = delete;
@@ -251,7 +333,7 @@ struct llama_mmap {
         }
 
         // if _PRELOAD_MMAP_FILE is defined, this will preload the file into the page cache efficiently
-        preload_mmap_file(addr, file->size);
+        preload_mmap_file(addr, file->size, std::thread::hardware_concurrency()/2);
     }
 
     ~llama_mmap() {
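Not part of the patches - a rough sketch of the readahead()-based Linux preload that the TODO in the next patch refers to. The fd parameter is an assumption here: llama_mmap only receives the mapped address, so the file descriptor (e.g. the fileno(file->fp) already used for mmap) would have to be passed in alongside addr/length.

    // Sketch only: assumes Linux/glibc and that _GNU_SOURCE is defined before <fcntl.h>
    #include <fcntl.h>  // readahead()
    #include <stdio.h>  // perror()

    static void preload_with_readahead(int fd, size_t length) {
        const size_t chunk = 16u * 1024u * 1024u;               // ask the kernel for 16 MiB per call
        for (size_t offset = 0; offset < length; offset += chunk) {
            size_t count = length - offset < chunk ? length - offset : chunk;
            if (readahead(fd, (off64_t) offset, count) == -1) {
                perror("readahead");                            // non-fatal: the multithreaded page-touch preload still runs
                return;
            }
        }
    }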
From 6e65b8a81716ec86c9ba6e915e5cc4e4f133fbc3 Mon Sep 17 00:00:00 2001
From: John
Date: Tue, 11 Apr 2023 00:36:00 +0200
Subject: [PATCH 4/4] Updated preloader to use multithreading

Tested on Windows - a small performance hit during loading is not avoidable,
but this is the fastest method I found.
On Linux - madvise needs a test to confirm it works; otherwise readahead()
needs to be implemented in the TODO region.
---
 llama_util.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/llama_util.h b/llama_util.h
index 2d7448bde3e67..d791b83f7edb8 100755
--- a/llama_util.h
+++ b/llama_util.h
@@ -225,13 +225,19 @@ struct llama_mmap {
             perror("sysconf");
             return;
         }
+        #ifdef _WIN32
         HANDLE hProcess = GetCurrentProcess();
         WIN32_MEMORY_RANGE_ENTRY range;
         range.VirtualAddress = addr;
         range.NumberOfBytes = length;
-        // if (!VirtualLock(addr, length)) { }; // no benefit. For systems with too little RAM we should lock a part and restrict the preload to that new length
         if (!PrefetchVirtualMemory(hProcess, 1, &range, 0)) { }; // Prefetches part of the data and signals readahead to the file system
+        #else
+        // todo
+        //if (posix_madvise(addr, length, POSIX_MADV_WILLNEED) == -1) { };
+        // readahead() should be the equivalent method for Linux; I don't think madvise will cause a full fetch.
+        // The multi-threaded read below is pseudo-sequential; it also needs a test without OS-level readahead in place (worst case, set threads to 1 on Linux or return).
+        #endif
 
         if (n_threads > 32)
             n_threads = 32;