From 5010b6ae848d30efa06af2b6d9eff07b8224b1bf Mon Sep 17 00:00:00 2001
From: John
Date: Mon, 10 Apr 2023 05:14:35 +0200
Subject: [PATCH 1/4] Adds _PRELOAD_MMAP_FILE flag to fully preload the model
 even when using mmap()

This brings back consistency, so benchmarking token inference no longer
depends on SSD/disk speed.
---
 llama_util.h | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/llama_util.h b/llama_util.h
index d68f49bd239a4..c7f165654ab8a 100755
--- a/llama_util.h
+++ b/llama_util.h
@@ -156,7 +156,35 @@ static std::string llama_format_win_err(DWORD err) {
 struct llama_mmap {
     void * addr;
     size_t size;
+    void preload_mmap_file(void *addr, size_t length)
+    {
+        #ifndef _PRELOAD_MMAP_FILE
+        return;
+        #endif
+        // Get the page size of the system
+        #if defined(_WIN32)
+        SYSTEM_INFO si;
+        GetSystemInfo(&si);
+        long page_size = si.dwPageSize;
+        #else
+        long page_size = sysconf(_SC_PAGE_SIZE); // on Windows, GetSystemInfo is used above
+        #endif
+        if (page_size == -1)
+        {
+            perror("sysconf");
+            return;
+        }
+
+        // Loop over the mapped file, jumping by page size
+        for (size_t i = 0; i < length; i += page_size)
+        {
+            // Dereference the pointer at each page boundary
+            volatile char c = ((char *)addr)[i];
+            // Force the compiler to not optimize the loop away:
+            (void)c; // use the value of 'c' to avoid compiler warnings and ensure the loop is not optimized away
+        }
+    }
 
     llama_mmap(const llama_mmap &) = delete;
 
 #ifdef _POSIX_MAPPED_FILES
@@ -180,6 +208,8 @@ struct llama_mmap {
             fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
                     strerror(errno));
         }
+        // if _PRELOAD_MMAP_FILE is defined, this will preload the file into the page cache efficiently
+        preload_mmap_file(addr, file->size);
     }
 
     ~llama_mmap() {
@@ -217,6 +247,9 @@ struct llama_mmap {
             fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
                     llama_format_win_err(GetLastError()).c_str());
         }
+
+        // if _PRELOAD_MMAP_FILE is defined, this will preload the file into the page cache efficiently
+        preload_mmap_file(addr, file->size);
     }
 
     ~llama_mmap() {

From 56b6fa5397ed3cf041051e5ad203977585bb2ee2 Mon Sep 17 00:00:00 2001
From: John
Date: Mon, 10 Apr 2023 05:23:53 +0200
Subject: [PATCH 2/4] Linux will need unistd.h

---
 llama_util.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llama_util.h b/llama_util.h
index c7f165654ab8a..b6e68e16017a4 100755
--- a/llama_util.h
+++ b/llama_util.h
@@ -30,6 +30,8 @@
     #include <windows.h>
     #include <io.h>
     #include <stdio.h> // for _fseeki64
+#else
+    #include <unistd.h>
 #endif
 
 #define LLAMA_ASSERT(x) \
From f4c1c6b97a63a373833707b1a806b652dc49e1d1 Mon Sep 17 00:00:00 2001
From: John
Date: Tue, 11 Apr 2023 00:28:04 +0200
Subject: [PATCH 3/4] Updated preloader to use multithreading - currently set
 to 50% of the available threads on the system

Tested on Windows - a small performance hit during loading is not avoidable,
but this is the best possible solution.
On Linux
---
 llama_util.h | 102 ++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 92 insertions(+), 10 deletions(-)

diff --git a/llama_util.h b/llama_util.h
index b6e68e16017a4..2d7448bde3e67 100755
--- a/llama_util.h
+++ b/llama_util.h
@@ -3,7 +3,7 @@
 #ifndef LLAMA_UTIL_H
 #define LLAMA_UTIL_H
-
+#define _PRELOAD_MMAP_FILE 1 // when using mmap, preload the entire file to prevent loading during first token inference
 #include
 #include
 #include
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -30,8 +31,34 @@
     #include <windows.h>
     #include <io.h>
     #include <stdio.h> // for _fseeki64
+    typedef volatile LONG atomic_int;
+    typedef atomic_int atomic_bool;
+
+    typedef HANDLE pthread_t;
+    typedef DWORD thread_ret_t;
+
+    static int pthread_create(pthread_t *out, void *unused, thread_ret_t (*func)(void *), void *arg)
+    {
+        HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, arg, 0, NULL);
+        if (handle == NULL)
+        {
+            return EAGAIN;
+        }
+
+        *out = handle;
+        return 0;
+    }
+
+    static int pthread_join(pthread_t thread, void *unused)
+    {
+        return (int)WaitForSingleObject(thread, INFINITE);
+    }
 #else
     #include <unistd.h>
+    #include <pthread.h>
+    #include <stdatomic.h>
+
+    typedef void *thread_ret_t;
 #endif
 
 #define LLAMA_ASSERT(x) \
@@ -158,7 +185,28 @@ static std::string llama_format_win_err(DWORD err) {
 struct llama_mmap {
     void * addr;
     size_t size;
-    void preload_mmap_file(void *addr, size_t length)
+    typedef struct
+    {
+        size_t start;
+        size_t end;
+        void *addr;
+        int n_threads;
+        int n_thread;
+        int page_size;
+    } thread_data_t;
+    static thread_ret_t worker_preload_memory(void *arg)
+    {
+        thread_data_t *data = (thread_data_t *)arg;
+        volatile char buffer;
+        for (size_t offset = data->start + data->n_thread * data->page_size; offset <= data->end; offset += data->n_threads * data->page_size)
+        {
+            volatile void *buffer_ptr = &buffer;
+            memcpy((void *)buffer_ptr, (char *)data->addr + offset, sizeof(buffer));
+            if (data->n_threads < data->n_thread && buffer == 0) exit(-1); // to avoid compiler optimization - the previous simple access method did not work in thread workers
+        }
+        return NULL;
+    }
+    void preload_mmap_file(void *addr, size_t length, int n_threads)
     {
         #ifndef _PRELOAD_MMAP_FILE
         return;
@@ -177,14 +225,48 @@ struct llama_mmap {
             perror("sysconf");
             return;
         }
-
-        // Loop over the mapped file, jumping by page size
-        for (size_t i = 0; i < length; i += page_size)
+        HANDLE hProcess = GetCurrentProcess();
+        WIN32_MEMORY_RANGE_ENTRY range;
+        range.VirtualAddress = addr;
+        range.NumberOfBytes = length;
+        // if (!VirtualLock(addr, length)) { }; // no benefit. For systems with too little RAM we should lock a part and restrict the preload to that new length
+        if (!PrefetchVirtualMemory(hProcess, 1, &range, 0)) { }; // Prefetches part of the data and signals readahead to the file system
+
+        if (n_threads > 32)
+            n_threads = 32;
+        pthread_t threads[32];
+        thread_data_t thread_data[32];
+
+        // we split the pages between the threads - that was the only reliable solution I could find
+        size_t num_pages_per_thread = (length / page_size) / n_threads;
+        int pages = ceil(length / page_size);
+        for (int page_start = 0; page_start < pages; page_start += n_threads * num_pages_per_thread)
         {
-            // Dereference the pointer at each page boundary
-            volatile char c = ((char *)addr)[i];
-            // Force the compiler to not optimize the loop away:
-            (void)c; // use the value of 'c' to avoid compiler warnings and ensure the loop is not optimized away
+            size_t chunk_start = page_start * page_size;
+            size_t chunk_end = chunk_start + page_size * n_threads * num_pages_per_thread;
+            for (int i = 0; i < n_threads; ++i)
+            {
+                thread_data[i].start = chunk_start;
+                thread_data[i].end = chunk_end;
+                if (thread_data[i].end > length)
+                {
+                    thread_data[i].end = length;
+                }
+                thread_data[i].addr = addr;
+                thread_data[i].page_size = page_size;
+                thread_data[i].n_threads = n_threads;
+                thread_data[i].n_thread = i;
+                pthread_create(&threads[i], NULL, worker_preload_memory, &thread_data[i]);
+                if (thread_data[i].end == length)
+                    break;
+            }
+
+            for (int i = 0; i < n_threads; ++i)
+            {
+                pthread_join(threads[i], NULL);
+            }
+        }
     }
 
     llama_mmap(const llama_mmap &) = delete;
@@ -251,7 +333,7 @@ struct llama_mmap {
         }
 
         // if _PRELOAD_MMAP_FILE is defined, this will preload the file into the page cache efficiently
-        preload_mmap_file(addr, file->size);
+        preload_mmap_file(addr, file->size, std::thread::hardware_concurrency()/2);
     }
 
     ~llama_mmap() {
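Not part of the patches - a rough sketch of the readahead()-based Linux preload that the TODO in the next patch refers to. The fd parameter is an assumption here: llama_mmap only receives the mapped address, so the file descriptor (e.g. the fileno(file->fp) already used for mmap) would have to be passed in alongside addr/length.

    // Sketch only: assumes Linux/glibc and that _GNU_SOURCE is defined before <fcntl.h>
    #include <fcntl.h>  // readahead()
    #include <stdio.h>  // perror()

    static void preload_with_readahead(int fd, size_t length) {
        const size_t chunk = 16u * 1024u * 1024u;               // ask the kernel for 16 MiB per call
        for (size_t offset = 0; offset < length; offset += chunk) {
            size_t count = length - offset < chunk ? length - offset : chunk;
            if (readahead(fd, (off64_t) offset, count) == -1) {
                perror("readahead");                            // non-fatal: the multithreaded page-touch preload still runs
                return;
            }
        }
    }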
From 6e65b8a81716ec86c9ba6e915e5cc4e4f133fbc3 Mon Sep 17 00:00:00 2001
From: John
Date: Tue, 11 Apr 2023 00:36:00 +0200
Subject: [PATCH 4/4] Updated preloader to use multithreading

Tested on Windows - a small performance hit during loading is not avoidable,
but this is the fastest method I found.
On Linux - madvise needs a test to confirm it works; otherwise readahead()
needs to be implemented in the TODO region.
---
 llama_util.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/llama_util.h b/llama_util.h
index 2d7448bde3e67..d791b83f7edb8 100755
--- a/llama_util.h
+++ b/llama_util.h
@@ -225,13 +225,19 @@ struct llama_mmap {
             perror("sysconf");
             return;
         }
+        #ifdef _WIN32
         HANDLE hProcess = GetCurrentProcess();
         WIN32_MEMORY_RANGE_ENTRY range;
         range.VirtualAddress = addr;
         range.NumberOfBytes = length;
-        // if (!VirtualLock(addr, length)) { }; // no benefit. For systems with too little RAM we should lock a part and restrict the preload to that new length
         if (!PrefetchVirtualMemory(hProcess, 1, &range, 0)) { }; // Prefetches part of the data and signals readahead to the file system
+        #else
+        // todo
+        //if (posix_madvise(addr, length, POSIX_MADV_WILLNEED) == -1) { };
+        // readahead() should be the equivalent method for Linux; I don't think madvise will cause a full fetch.
+        // The multi-threaded read below is pseudo-sequential; it also needs a test without OS-level readahead in place (worst case, set threads to 1 on Linux or return).
+        #endif
 
         if (n_threads > 32)
             n_threads = 32;