diff --git a/examples/demo_cffi.py b/examples/demo_cffi.py
index e3f7392..ce6c50e 100644
--- a/examples/demo_cffi.py
+++ b/examples/demo_cffi.py
@@ -11,6 +11,7 @@ def demo_model(model: Model, messages: list[dict]):
     options = Options(
         ctx_size=2048,
         predict=-2,
+        log_disable=False,
         model=model,
         prompt=messages,
     )
diff --git a/examples/demo_ctypes.py b/examples/demo_ctypes_cpu.py
similarity index 87%
rename from examples/demo_ctypes.py
rename to examples/demo_ctypes_cpu.py
index 9c7811a..edec6e0 100644
--- a/examples/demo_ctypes.py
+++ b/examples/demo_ctypes_cpu.py
@@ -2,7 +2,7 @@
 # import sys
 # sys.path.append(os.path.abspath('.'))
 
-from llama.llama_cli_ctypes import llama_generate, Model, Options
+from llama.llama_cli_ctypes_cpu import llama_generate, Model, Options
 
 from demo_models import models
 
@@ -11,6 +11,7 @@ def demo_model(model: Model, messages: list[dict]):
     options = Options(
         ctx_size=2048,
         predict=-2,
+        log_disable=False,
         model=model,
         prompt=messages,
     )
diff --git a/examples/demo_models.py b/examples/demo_models.py
index 28c464a..87569fa 100644
--- a/examples/demo_models.py
+++ b/examples/demo_models.py
@@ -20,36 +20,43 @@
     Model(
         'microsoft/Phi-3-mini-128k-instruct',
         'bartowski/Phi-3.1-mini-128k-instruct-GGUF',
+        # 'Phi-3.1-mini-128k-instruct-Q4_K_S.gguf',
         'Phi-3.1-mini-128k-instruct-Q4_K_M.gguf',
     ),
     Model(
         'microsoft/Phi-3-mini-4k-instruct',
         'bartowski/Phi-3.1-mini-4k-instruct-GGUF',
+        # 'Phi-3.1-mini-4k-instruct-Q4_K_S.gguf',
         'Phi-3.1-mini-4k-instruct-Q4_K_M.gguf',
     ),
     Model(
         'microsoft/phi-2',
         'andrijdavid/phi-2-GGUF',
+        # 'ggml-model-Q4_K_S.gguf',
         'ggml-model-Q4_K_M.gguf',
     ),
     Model(
         'IndexTeam/Index-1.9B-Chat',
         'IndexTeam/Index-1.9B-Chat-GGUF',
+        # 'ggml-model-Q4_0.gguf',
         'ggml-model-Q4_K_M.gguf',
     ),
     Model(
         'internlm/internlm2-chat-1_8b',
         'QuantFactory/internlm2-chat-1_8b-GGUF',
+        # 'internlm2-chat-1_8b.Q4_K_S.gguf',
         'internlm2-chat-1_8b.Q4_K_M.gguf',
     ),
     Model(
         'Qwen/Qwen2-1.5B-Instruct',
         'Qwen/Qwen2-1.5B-Instruct-GGUF',
+        # 'qwen2-1_5b-instruct-q4_0.gguf',
         'qwen2-1_5b-instruct-q4_k_m.gguf',
     ),
     Model(
         'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
         'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
+        # 'tinyllama-1.1b-chat-v1.0.Q4_K_S.gguf',
         'tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
     ),
 ]
diff --git a/llama/llama_cli_ctypes.py b/llama/llama_cli_ctypes_cpu.py
similarity index 98%
rename from llama/llama_cli_ctypes.py
rename to llama/llama_cli_ctypes_cpu.py
index ab82ee1..9df4877 100644
--- a/llama/llama_cli_ctypes.py
+++ b/llama/llama_cli_ctypes_cpu.py
@@ -19,7 +19,7 @@
 module_path = os.path.abspath(__file__)
 module_dir = os.path.dirname(module_path)
 
-llama_cli_lib_path = os.path.join(module_dir, 'llama-cli.so')
+llama_cli_lib_path = os.path.join(module_dir, 'llama-cli-cpu.so')
 lib = ctypes.CDLL(llama_cli_lib_path)
 
 _LLAMA_YIELD_TOKEN_T = ctypes.CFUNCTYPE(None, ctypes.c_char_p)
diff --git a/scripts/build.py b/scripts/build.py
index b2aefcf..376d56f 100644
--- a/scripts/build.py
+++ b/scripts/build.py
@@ -5,7 +5,7 @@
 
 from cffi import FFI
 
-from clean import clean
+from clean import clean_llama, clean_llama_cpp, clean
 
 
 ffibuilder = FFI()
@@ -32,14 +32,19 @@
 )
 
 
-def build(*args, **kwargs):
-    # build static and shared library
-    env = os.environ.copy()
-
+def clone_llama_cpp():
     subprocess.run(['git', 'clone', 'https://github.com/ggerganov/llama.cpp.git'], check=True)
     subprocess.run(['patch', 'llama.cpp/examples/main/main.cpp', 'main_3.patch'], check=True)
     subprocess.run(['patch', 'llama.cpp/Makefile', 'Makefile_3.patch'], check=True)
 
+
+def build_cpu(*args, **kwargs):
+    # build static and shared library
+    env = os.environ.copy()
+
+    #
+    # build llama.cpp
+    #
     if 'PYODIDE' in env and env['PYODIDE'] == '1':
         env['CXXFLAGS'] += ' -msimd128 -fno-rtti -DNDEBUG -flto=full -s INITIAL_MEMORY=2GB -s MAXIMUM_MEMORY=4GB -s ALLOW_MEMORY_GROWTH '
         env['UNAME_M'] = 'wasm'
@@ -55,6 +60,8 @@ def build(*args, **kwargs):
         'GGML_NO_LLAMAFILE=1',
         # 'GGML_OPENBLAS=1',
     ], check=True, env=env)
+
+    subprocess.run(['mv', 'llama.cpp/llama-cli.so', 'llama/llama-cli-cpu.so'], check=True)
 
     # cffi
     ffibuilder.compile(tmpdir='build', verbose=True)
@@ -70,6 +77,99 @@ def build(*args, **kwargs):
         shutil.move(file, 'llama/')
 
 
-if __name__ == '__main__':
+def build_cuda_12_5(*args, **kwargs):
+    # build static and shared library
+    env = os.environ.copy()
+
+    #
+    # cuda env
+    #
+    cuda_file = 'cuda_12.5.1_555.42.06_linux.run'
+    cuda_url = f'https://developer.download.nvidia.com/compute/cuda/12.5.1/local_installers/{cuda_file}'
+    cuda_output_dir = os.path.abspath('./cuda-12.5.1')
+
+    env['PATH'] = env['PATH'] + f':{cuda_output_dir}/dist/bin'
+    env['CUDA_PATH'] = f'{cuda_output_dir}/dist'
+
+    # download cuda file
+    subprocess.run(['wget', '-N', cuda_url, '-P', cuda_output_dir], check=True)
+
+    # extract cuda file
+    cmd = ['chmod', '+x', f'{cuda_output_dir}/{cuda_file}']
+    subprocess.run(cmd, check=True)
+
+    cmd = [
+        f'{cuda_output_dir}/{cuda_file}',
+        '--tar',
+        'mxvf',
+        '--wildcards',
+        './builds/cuda_cccl/*',
+        './builds/cuda_cudart/*',
+        './builds/cuda_nvcc/*',
+        './builds/libcublas/*',
+        '-C',
+        cuda_output_dir,
+    ]
+    subprocess.run(cmd, cwd=cuda_output_dir, check=True)
+
+    cmd = ['mkdir', '-p', f'{cuda_output_dir}/dist']
+    subprocess.run(cmd, check=True)
+
+    cmd = f'cp -r {cuda_output_dir}/builds/cuda_cccl/* {cuda_output_dir}/dist'
+    subprocess.run(cmd, shell=True, check=True)
+
+    cmd = f'cp -r {cuda_output_dir}/builds/cuda_cudart/* {cuda_output_dir}/dist'
+    subprocess.run(cmd, shell=True, check=True)
+
+    cmd = f'cp -r {cuda_output_dir}/builds/cuda_nvcc/* {cuda_output_dir}/dist'
+    subprocess.run(cmd, shell=True, check=True)
+
+    cmd = f'cp -r {cuda_output_dir}/builds/libcublas/* {cuda_output_dir}/dist'
+    subprocess.run(cmd, shell=True, check=True)
+
+    #
+    # build llama.cpp
+    #
+    if 'PYODIDE' in env and env['PYODIDE'] == '1':
+        env['CXXFLAGS'] += ' -msimd128 -fno-rtti -DNDEBUG -flto=full -s INITIAL_MEMORY=2GB -s MAXIMUM_MEMORY=4GB -s ALLOW_MEMORY_GROWTH '
+        env['UNAME_M'] = 'wasm'
+
+    subprocess.run([
+        'make',
+        '-C',
+        'llama.cpp',
+        '-j',
+        'llama-cli-shared',
+        'GGML_NO_OPENMP=1',
+        'GGML_NO_LLAMAFILE=1',
+        'GGML_CUDA=1',
+    ], check=True, env=env)
+
+    subprocess.run(['mv', 'llama.cpp/llama-cli.so', 'llama/llama-cli-cuda-12_5.so'], check=True)
+
+    # ctypes
+    for file in glob.glob('build/*.so') + glob.glob('llama.cpp/*.so'):
+        shutil.move(file, 'llama/')
+
+    for file in glob.glob('build/*.dll') + glob.glob('llama.cpp/*.dll'):
+        shutil.move(file, 'llama/')
+
+    for file in glob.glob('build/*.dylib') + glob.glob('llama.cpp/*.dylib'):
+        shutil.move(file, 'llama/')
+
+
+def build(*args, **kwargs):
     clean()
+    clone_llama_cpp()
+
+    # cpu
+    clean_llama_cpp()
+    build_cpu(*args, **kwargs)
+
+    # cuda 12.5
+    clean_llama_cpp()
+    build_cuda_12_5(*args, **kwargs)
+
+
+if __name__ == '__main__':
     build()
diff --git a/scripts/clean.py b/scripts/clean.py
index 8a66cdc..8fbc201 100644
--- a/scripts/clean.py
+++ b/scripts/clean.py
@@ -2,9 +2,22 @@
 import subprocess
 
 
-def clean():
+def clean_llama():
     files = glob.glob('llama/*.so') + glob.glob('llama/*.a') + glob.glob('llama/*.dylib') + glob.glob('llama/*.dll')
     subprocess.run(['rm', '-fv'] + files, check=True)
+
+
+def clean_llama_cpp():
+    subprocess.run([
+        'make',
+        '-C',
+        'llama.cpp',
+        'clean'
+    ], check=True)
+
+
+def clean():
+    clean_llama()
     subprocess.run(['rm', '-fr', 'build'], check=True)
     subprocess.run(['rm', '-fr', 'dist'], check=True)
     subprocess.run(['rm', '-fr', 'llama.cpp'], check=True)