llama ctypes cpu/cuda
mtasic85 committed Jul 18, 2024
1 parent 1fc1779 commit c1465a5
Showing 6 changed files with 131 additions and 9 deletions.
1 change: 1 addition & 0 deletions examples/demo_cffi.py
@@ -11,6 +11,7 @@ def demo_model(model: Model, messages: list[dict]):
options = Options(
ctx_size=2048,
predict=-2,
log_disable=False,
model=model,
prompt=messages,
)
3 changes: 2 additions & 1 deletion examples/demo_ctypes.py → examples/demo_ctypes_cpu.py
@@ -2,7 +2,7 @@
# import sys
# sys.path.append(os.path.abspath('.'))

from llama.llama_cli_ctypes import llama_generate, Model, Options
from llama.llama_cli_ctypes_cpu import llama_generate, Model, Options

from demo_models import models

@@ -11,6 +11,7 @@ def demo_model(model: Model, messages: list[dict]):
options = Options(
ctx_size=2048,
predict=-2,
log_disable=False,
model=model,
prompt=messages,
)
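For context, a minimal sketch of how the renamed module is typically driven from these demos. The body of demo_model is not fully shown in this diff, so the call signature and the streaming loop are assumptions: llama_generate(options) is assumed to accept the Options object and yield text chunks as they are produced.

from llama.llama_cli_ctypes_cpu import llama_generate, Model, Options

def run_demo(model: Model, messages: list[dict]):
    # Same options as in the demo above; log_disable=False keeps logging enabled.
    options = Options(
        ctx_size=2048,
        predict=-2,
        log_disable=False,
        model=model,
        prompt=messages,
    )

    # Assumed generator-style API: print chunks as they stream in.
    for chunk in llama_generate(options):
        print(chunk, end='', flush=True)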
7 changes: 7 additions & 0 deletions examples/demo_models.py
@@ -20,36 +20,43 @@
Model(
'microsoft/Phi-3-mini-128k-instruct',
'bartowski/Phi-3.1-mini-128k-instruct-GGUF',
# 'Phi-3.1-mini-128k-instruct-Q4_K_S.gguf',
'Phi-3.1-mini-128k-instruct-Q4_K_M.gguf',
),
Model(
'microsoft/Phi-3-mini-4k-instruct',
'bartowski/Phi-3.1-mini-4k-instruct-GGUF',
# 'Phi-3.1-mini-4k-instruct-Q4_K_S.gguf',
'Phi-3.1-mini-4k-instruct-Q4_K_M.gguf',
),
Model(
'microsoft/phi-2',
'andrijdavid/phi-2-GGUF',
# 'ggml-model-Q4_K_S.gguf',
'ggml-model-Q4_K_M.gguf',
),
Model(
'IndexTeam/Index-1.9B-Chat',
'IndexTeam/Index-1.9B-Chat-GGUF',
# 'ggml-model-Q4_0.gguf',
'ggml-model-Q4_K_M.gguf',
),
Model(
'internlm/internlm2-chat-1_8b',
'QuantFactory/internlm2-chat-1_8b-GGUF',
# 'internlm2-chat-1_8b.Q4_K_S.gguf',
'internlm2-chat-1_8b.Q4_K_M.gguf',
),
Model(
'Qwen/Qwen2-1.5B-Instruct',
'Qwen/Qwen2-1.5B-Instruct-GGUF',
# 'qwen2-1_5b-instruct-q4_0.gguf',
'qwen2-1_5b-instruct-q4_k_m.gguf',
),
Model(
'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
# 'tinyllama-1.1b-chat-v1.0.Q4_K_S.gguf',
'tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
),
]
2 changes: 1 addition & 1 deletion llama/llama_cli_ctypes.py → llama/llama_cli_ctypes_cpu.py
@@ -19,7 +19,7 @@

module_path = os.path.abspath(__file__)
module_dir = os.path.dirname(module_path)
llama_cli_lib_path = os.path.join(module_dir, 'llama-cli.so')
llama_cli_lib_path = os.path.join(module_dir, 'llama-cli-cpu.so')
lib = ctypes.CDLL(llama_cli_lib_path)

_LLAMA_YIELD_TOKEN_T = ctypes.CFUNCTYPE(None, ctypes.c_char_p)
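For orientation on the ctypes binding: _LLAMA_YIELD_TOKEN_T declares a C callback type of the form void (*)(const char *). A minimal sketch of wiring up such a callback follows; the _yield_token handler is illustrative and not part of this diff.

import ctypes

# Callback type matching void (*)(const char *), as declared above.
_LLAMA_YIELD_TOKEN_T = ctypes.CFUNCTYPE(None, ctypes.c_char_p)

def _yield_token(chunk: bytes) -> None:
    # Decode the C string handed over by llama-cli and stream it out.
    print(chunk.decode('utf-8'), end='', flush=True)

# Keep a reference to the wrapper so it is not garbage-collected while
# the native library still holds the function pointer.
yield_token_cb = _LLAMA_YIELD_TOKEN_T(_yield_token)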
112 changes: 106 additions & 6 deletions scripts/build.py
@@ -5,7 +5,7 @@

from cffi import FFI

from clean import clean
from clean import clean_llama, clean_llama_cpp, clean

ffibuilder = FFI()

@@ -32,14 +32,19 @@
)


def build(*args, **kwargs):
# build static and shared library
env = os.environ.copy()

def clone_llama_cpp():
subprocess.run(['git', 'clone', 'https://github.com/ggerganov/llama.cpp.git'], check=True)
subprocess.run(['patch', 'llama.cpp/examples/main/main.cpp', 'main_3.patch'], check=True)
subprocess.run(['patch', 'llama.cpp/Makefile', 'Makefile_3.patch'], check=True)


def build_cpu(*args, **kwargs):
# build static and shared library
env = os.environ.copy()

#
# build llama.cpp
#
if 'PYODIDE' in env and env['PYODIDE'] == '1':
env['CXXFLAGS'] += ' -msimd128 -fno-rtti -DNDEBUG -flto=full -s INITIAL_MEMORY=2GB -s MAXIMUM_MEMORY=4GB -s ALLOW_MEMORY_GROWTH '
env['UNAME_M'] = 'wasm'
@@ -55,6 +60,8 @@ def build(*args, **kwargs):
'GGML_NO_LLAMAFILE=1',
# 'GGML_OPENBLAS=1',
], check=True, env=env)

subprocess.run(['mv', 'llama.cpp/llama-cli.so', 'llama/llama-cli-cpu.so'], check=True)

# cffi
ffibuilder.compile(tmpdir='build', verbose=True)
@@ -70,6 +77,99 @@ def build(*args, **kwargs):
shutil.move(file, 'llama/')


if __name__ == '__main__':
def build_cuda_12_5(*args, **kwargs):
# build static and shared library
env = os.environ.copy()

#
# cuda env
#
cuda_file = 'cuda_12.5.1_555.42.06_linux.run'
cuda_url = f'https://developer.download.nvidia.com/compute/cuda/12.5.1/local_installers/{cuda_file}'
cuda_output_dir = os.path.abspath('./cuda-12.5.1')

env['PATH'] = env['PATH'] + f':{cuda_output_dir}/dist/bin'
env['CUDA_PATH'] = f'{cuda_output_dir}/dist'

# download cuda file
subprocess.run(['wget', '-N', cuda_url, '-P', cuda_output_dir], check=True)

# extract cuda file
cmd = ['chmod', '+x', f'{cuda_output_dir}/{cuda_file}']
subprocess.run(cmd, check=True)

cmd = [
f'{cuda_output_dir}/{cuda_file}',
'--tar',
'mxvf',
'--wildcards',
'./builds/cuda_cccl/*',
'./builds/cuda_cudart/*',
'./builds/cuda_nvcc/*',
'./builds/libcublas/*',
'-C',
cuda_output_dir,
]
subprocess.run(cmd, cwd=cuda_output_dir, check=True)

cmd = ['mkdir', '-p', f'{cuda_output_dir}/dist']
subprocess.run(cmd, check=True)

cmd = f'cp -r {cuda_output_dir}/builds/cuda_cccl/* {cuda_output_dir}/dist'
subprocess.run(cmd, shell=True, check=True)

cmd = f'cp -r {cuda_output_dir}/builds/cuda_cudart/* {cuda_output_dir}/dist'
subprocess.run(cmd, shell=True, check=True)

cmd = f'cp -r {cuda_output_dir}/builds/cuda_nvcc/* {cuda_output_dir}/dist'
subprocess.run(cmd, shell=True, check=True)

cmd = f'cp -r {cuda_output_dir}/builds/libcublas/* {cuda_output_dir}/dist'
subprocess.run(cmd, shell=True, check=True)

#
# build llama.cpp
#
if 'PYODIDE' in env and env['PYODIDE'] == '1':
env['CXXFLAGS'] += ' -msimd128 -fno-rtti -DNDEBUG -flto=full -s INITIAL_MEMORY=2GB -s MAXIMUM_MEMORY=4GB -s ALLOW_MEMORY_GROWTH '
env['UNAME_M'] = 'wasm'

subprocess.run([
'make',
'-C',
'llama.cpp',
'-j',
'llama-cli-shared',
'GGML_NO_OPENMP=1',
'GGML_NO_LLAMAFILE=1',
'GGML_CUDA=1',
], check=True, env=env)

subprocess.run(['mv', 'llama.cpp/llama-cli.so', 'llama/llama-cli-cuda-12_5.so'], check=True)

# ctypes
for file in glob.glob('build/*.so') + glob.glob('llama.cpp/*.so'):
shutil.move(file, 'llama/')

for file in glob.glob('build/*.dll') + glob.glob('llama.cpp/*.dll'):
shutil.move(file, 'llama/')

for file in glob.glob('build/*.dylib') + glob.glob('llama.cpp/*.dylib'):
shutil.move(file, 'llama/')


def build(*args, **kwargs):
clean()
clone_llama_cpp()

# cpu
clean_llama_cpp()
build_cpu(*args, **kwargs)

# cuda 12.5
clean_llama_cpp()
build_cuda_12_5(*args, **kwargs)


if __name__ == '__main__':
build()
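Since the build now produces both llama/llama-cli-cpu.so and llama/llama-cli-cuda-12_5.so, a loader can choose between them at import time. The helper below is a hypothetical sketch, not code from this commit; the function name and fallback order are assumptions.

import ctypes
import os

module_dir = os.path.dirname(os.path.abspath(__file__))

def load_llama_cli(prefer_cuda: bool = True) -> ctypes.CDLL:
    # Try the CUDA 12.5 build first, then fall back to the CPU build.
    names = ['llama-cli-cuda-12_5.so', 'llama-cli-cpu.so'] if prefer_cuda else ['llama-cli-cpu.so']

    for name in names:
        path = os.path.join(module_dir, name)
        if not os.path.exists(path):
            continue
        try:
            return ctypes.CDLL(path)
        except OSError:
            # e.g. the CUDA runtime is not available on this machine
            continue

    raise RuntimeError('no llama-cli shared library found')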
15 changes: 14 additions & 1 deletion scripts/clean.py
@@ -2,9 +2,22 @@
import subprocess


def clean():
def clean_llama():
files = glob.glob('llama/*.so') + glob.glob('llama/*.a') + glob.glob('llama/*.dylib') + glob.glob('llama/*.dll')
subprocess.run(['rm', '-fv'] + files, check=True)


def clean_llama_cpp():
subprocess.run([
'make',
'-C',
'llama.cpp',
'clean'
], check=True)


def clean():
clean_llama()
subprocess.run(['rm', '-fr', 'build'], check=True)
subprocess.run(['rm', '-fr', 'dist'], check=True)
subprocess.run(['rm', '-fr', 'llama.cpp'], check=True)
