From c1465a510b7f09eee54897a4f0597f014ec406f4 Mon Sep 17 00:00:00 2001
From: Marko Tasic
Date: Thu, 18 Jul 2024 13:41:12 +0200
Subject: [PATCH] llama ctypes cpu/cuda

---
 examples/demo_cffi.py                         |   1 +
 .../{demo_ctypes.py => demo_ctypes_cpu.py}    |   3 +-
 examples/demo_models.py                       |   7 ++
 ..._cli_ctypes.py => llama_cli_ctypes_cpu.py} |   2 +-
 scripts/build.py                              | 112 +++++++++++++++++-
 scripts/clean.py                              |  15 ++-
 6 files changed, 131 insertions(+), 9 deletions(-)
 rename examples/{demo_ctypes.py => demo_ctypes_cpu.py} (87%)
 rename llama/{llama_cli_ctypes.py => llama_cli_ctypes_cpu.py} (98%)

diff --git a/examples/demo_cffi.py b/examples/demo_cffi.py
index e3f7392..ce6c50e 100644
--- a/examples/demo_cffi.py
+++ b/examples/demo_cffi.py
@@ -11,6 +11,7 @@ def demo_model(model: Model, messages: list[dict]):
     options = Options(
         ctx_size=2048,
         predict=-2,
+        log_disable=False,
         model=model,
         prompt=messages,
     )
diff --git a/examples/demo_ctypes.py b/examples/demo_ctypes_cpu.py
similarity index 87%
rename from examples/demo_ctypes.py
rename to examples/demo_ctypes_cpu.py
index 9c7811a..edec6e0 100644
--- a/examples/demo_ctypes.py
+++ b/examples/demo_ctypes_cpu.py
@@ -2,7 +2,7 @@
 # import sys
 # sys.path.append(os.path.abspath('.'))
 
-from llama.llama_cli_ctypes import llama_generate, Model, Options
+from llama.llama_cli_ctypes_cpu import llama_generate, Model, Options
 
 from demo_models import models
 
@@ -11,6 +11,7 @@ def demo_model(model: Model, messages: list[dict]):
     options = Options(
         ctx_size=2048,
         predict=-2,
+        log_disable=False,
         model=model,
         prompt=messages,
     )
diff --git a/examples/demo_models.py b/examples/demo_models.py
index 28c464a..87569fa 100644
--- a/examples/demo_models.py
+++ b/examples/demo_models.py
@@ -20,36 +20,43 @@
     Model(
         'microsoft/Phi-3-mini-128k-instruct',
         'bartowski/Phi-3.1-mini-128k-instruct-GGUF',
+        # 'Phi-3.1-mini-128k-instruct-Q4_K_S.gguf',
         'Phi-3.1-mini-128k-instruct-Q4_K_M.gguf',
     ),
     Model(
         'microsoft/Phi-3-mini-4k-instruct',
         'bartowski/Phi-3.1-mini-4k-instruct-GGUF',
+        # 'Phi-3.1-mini-4k-instruct-Q4_K_S.gguf',
         'Phi-3.1-mini-4k-instruct-Q4_K_M.gguf',
     ),
     Model(
         'microsoft/phi-2',
         'andrijdavid/phi-2-GGUF',
+        # 'ggml-model-Q4_K_S.gguf',
         'ggml-model-Q4_K_M.gguf',
     ),
     Model(
         'IndexTeam/Index-1.9B-Chat',
         'IndexTeam/Index-1.9B-Chat-GGUF',
+        # 'ggml-model-Q4_0.gguf',
         'ggml-model-Q4_K_M.gguf',
     ),
     Model(
         'internlm/internlm2-chat-1_8b',
         'QuantFactory/internlm2-chat-1_8b-GGUF',
+        # 'internlm2-chat-1_8b.Q4_K_S.gguf',
         'internlm2-chat-1_8b.Q4_K_M.gguf',
     ),
     Model(
         'Qwen/Qwen2-1.5B-Instruct',
         'Qwen/Qwen2-1.5B-Instruct-GGUF',
+        # 'qwen2-1_5b-instruct-q4_0.gguf',
         'qwen2-1_5b-instruct-q4_k_m.gguf',
     ),
     Model(
         'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
         'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
+        # 'tinyllama-1.1b-chat-v1.0.Q4_K_S.gguf',
         'tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
     ),
 ]
diff --git a/llama/llama_cli_ctypes.py b/llama/llama_cli_ctypes_cpu.py
similarity index 98%
rename from llama/llama_cli_ctypes.py
rename to llama/llama_cli_ctypes_cpu.py
index ab82ee1..9df4877 100644
--- a/llama/llama_cli_ctypes.py
+++ b/llama/llama_cli_ctypes_cpu.py
@@ -19,7 +19,7 @@
 module_path = os.path.abspath(__file__)
 module_dir = os.path.dirname(module_path)
 
-llama_cli_lib_path = os.path.join(module_dir, 'llama-cli.so')
+llama_cli_lib_path = os.path.join(module_dir, 'llama-cli-cpu.so')
 lib = ctypes.CDLL(llama_cli_lib_path)
 
 _LLAMA_YIELD_TOKEN_T = ctypes.CFUNCTYPE(None, ctypes.c_char_p)
diff --git a/scripts/build.py b/scripts/build.py
index b2aefcf..376d56f 100644
--- a/scripts/build.py
+++ b/scripts/build.py
@@ -5,7 +5,7 @@
 from cffi import FFI
 
-from clean import clean
+from clean import clean_llama, clean_llama_cpp, clean
 
 ffibuilder = FFI()
 
 
@@ -32,14 +32,19 @@
 )
 
 
-def build(*args, **kwargs):
-    # build static and shared library
-    env = os.environ.copy()
-
+def clone_llama_cpp():
     subprocess.run(['git', 'clone', 'https://github.com/ggerganov/llama.cpp.git'], check=True)
     subprocess.run(['patch', 'llama.cpp/examples/main/main.cpp', 'main_3.patch'], check=True)
     subprocess.run(['patch', 'llama.cpp/Makefile', 'Makefile_3.patch'], check=True)
 
+
+def build_cpu(*args, **kwargs):
+    # build static and shared library
+    env = os.environ.copy()
+
+    #
+    # build llama.cpp
+    #
     if 'PYODIDE' in env and env['PYODIDE'] == '1':
         env['CXXFLAGS'] += ' -msimd128 -fno-rtti -DNDEBUG -flto=full -s INITIAL_MEMORY=2GB -s MAXIMUM_MEMORY=4GB -s ALLOW_MEMORY_GROWTH '
         env['UNAME_M'] = 'wasm'
@@ -55,6 +60,8 @@ def build(*args, **kwargs):
         'GGML_NO_LLAMAFILE=1',
         # 'GGML_OPENBLAS=1',
     ], check=True, env=env)
+
+    subprocess.run(['mv', 'llama.cpp/llama-cli.so', 'llama/llama-cli-cpu.so'], check=True)
 
     # cffi
     ffibuilder.compile(tmpdir='build', verbose=True)
@@ -70,6 +77,99 @@ def build(*args, **kwargs):
         shutil.move(file, 'llama/')
 
 
-if __name__ == '__main__':
+def build_cuda_12_5(*args, **kwargs):
+    # build static and shared library
+    env = os.environ.copy()
+
+    #
+    # cuda env
+    #
+    cuda_file = 'cuda_12.5.1_555.42.06_linux.run'
+    cuda_url = f'https://developer.download.nvidia.com/compute/cuda/12.5.1/local_installers/{cuda_file}'
+    cuda_output_dir = os.path.abspath('./cuda-12.5.1')
+
+    env['PATH'] = env['PATH'] + f':{cuda_output_dir}/dist/bin'
+    env['CUDA_PATH'] = f'{cuda_output_dir}/dist'
+
+    # download cuda file
+    subprocess.run(['wget', '-N', cuda_url, '-P', cuda_output_dir], check=True)
+
+    # extract cuda file
+    cmd = ['chmod', '+x', f'{cuda_output_dir}/{cuda_file}']
+    subprocess.run(cmd, check=True)
+
+    cmd = [
+        f'{cuda_output_dir}/{cuda_file}',
+        '--tar',
+        'mxvf',
+        '--wildcards',
+        './builds/cuda_cccl/*',
+        './builds/cuda_cudart/*',
+        './builds/cuda_nvcc/*',
+        './builds/libcublas/*',
+        '-C',
+        cuda_output_dir,
+    ]
+    subprocess.run(cmd, cwd=cuda_output_dir, check=True)
+
+    cmd = ['mkdir', '-p', f'{cuda_output_dir}/dist']
+    subprocess.run(cmd, check=True)
+
+    cmd = f'cp -r {cuda_output_dir}/builds/cuda_cccl/* {cuda_output_dir}/dist'
+    subprocess.run(cmd, shell=True, check=True)
+
+    cmd = f'cp -r {cuda_output_dir}/builds/cuda_cudart/* {cuda_output_dir}/dist'
+    subprocess.run(cmd, shell=True, check=True)
+
+    cmd = f'cp -r {cuda_output_dir}/builds/cuda_nvcc/* {cuda_output_dir}/dist'
+    subprocess.run(cmd, shell=True, check=True)
+
+    cmd = f'cp -r {cuda_output_dir}/builds/libcublas/* {cuda_output_dir}/dist'
+    subprocess.run(cmd, shell=True, check=True)
+
+    #
+    # build llama.cpp
+    #
+    if 'PYODIDE' in env and env['PYODIDE'] == '1':
+        env['CXXFLAGS'] += ' -msimd128 -fno-rtti -DNDEBUG -flto=full -s INITIAL_MEMORY=2GB -s MAXIMUM_MEMORY=4GB -s ALLOW_MEMORY_GROWTH '
+        env['UNAME_M'] = 'wasm'
+
+    subprocess.run([
+        'make',
+        '-C',
+        'llama.cpp',
+        '-j',
+        'llama-cli-shared',
+        'GGML_NO_OPENMP=1',
+        'GGML_NO_LLAMAFILE=1',
+        'GGML_CUDA=1',
+    ], check=True, env=env)
+
+    subprocess.run(['mv', 'llama.cpp/llama-cli.so', 'llama/llama-cli-cuda-12_5.so'], check=True)
+
+    # ctypes
+    for file in glob.glob('build/*.so') + glob.glob('llama.cpp/*.so'):
+        shutil.move(file, 'llama/')
+
+    for file in glob.glob('build/*.dll') + glob.glob('llama.cpp/*.dll'):
+        shutil.move(file, 'llama/')
+
+    for file in glob.glob('build/*.dylib') + glob.glob('llama.cpp/*.dylib'):
+        shutil.move(file, 'llama/')
+
+
+def build(*args, **kwargs):
     clean()
+    clone_llama_cpp()
+
+    # cpu
+    clean_llama_cpp()
+    build_cpu(*args, **kwargs)
+
+    # cuda 12.5
+    clean_llama_cpp()
+    build_cuda_12_5(*args, **kwargs)
+
+
+if __name__ == '__main__':
     build()
diff --git a/scripts/clean.py b/scripts/clean.py
index 8a66cdc..8fbc201 100644
--- a/scripts/clean.py
+++ b/scripts/clean.py
@@ -2,9 +2,22 @@
 import subprocess
 
 
-def clean():
+def clean_llama():
     files = glob.glob('llama/*.so') + glob.glob('llama/*.a') + glob.glob('llama/*.dylib') + glob.glob('llama/*.dll')
     subprocess.run(['rm', '-fv'] + files, check=True)
+
+
+def clean_llama_cpp():
+    subprocess.run([
+        'make',
+        '-C',
+        'llama.cpp',
+        'clean'
+    ], check=True)
+
+
+def clean():
+    clean_llama()
     subprocess.run(['rm', '-fr', 'build'], check=True)
     subprocess.run(['rm', '-fr', 'dist'], check=True)
     subprocess.run(['rm', '-fr', 'llama.cpp'], check=True)
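
Usage sketch (not part of the patch): a minimal example of driving the renamed CPU binding after `python scripts/build.py` has produced llama/llama-cli-cpu.so. The import, Options fields, and Model arguments mirror examples/demo_ctypes_cpu.py and examples/demo_models.py from this diff; that llama_generate(options) yields token strings is an assumption inferred from the _LLAMA_YIELD_TOKEN_T callback type and is not confirmed by the diff itself.

    # hypothetical usage of the renamed CPU ctypes binding
    from llama.llama_cli_ctypes_cpu import llama_generate, Model, Options

    # model entry copied from examples/demo_models.py
    model = Model(
        'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
        'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
        'tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
    )

    messages = [
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {'role': 'user', 'content': 'Describe the GGUF format in one sentence.'},
    ]

    # same fields as the demo scripts in this patch
    options = Options(
        ctx_size=2048,
        predict=-2,
        log_disable=False,
        model=model,
        prompt=messages,
    )

    # assumption: llama_generate streams tokens as they are produced
    for token in llama_generate(options):
        print(token, end='', flush=True)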