@@ -114,7 +114,9 @@ class llama_context_params(Structure):
 LLAMA_FTYPE_MOSTLY_F16 = ctypes.c_int(1)  # except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q4_0 = ctypes.c_int(2)  # except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q4_1 = ctypes.c_int(3)  # except 1d tensors
-LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int(4)  # tok_embeddings.weight and output.weight are F16
+LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int(
+    4
+)  # tok_embeddings.weight and output.weight are F16


 # Functions
@@ -175,6 +177,22 @@ def llama_model_quantize(fname_inp: bytes, fname_out: bytes, itype: c_int) -> c_
 _lib.llama_model_quantize.restype = c_int


+# Apply a LoRA adapter to a loaded model
+# path_base_model is the path to a higher quality model to use as a base for
+# the layers modified by the adapter. Can be NULL to use the current loaded model.
+# The model needs to be reloaded before applying a new adapter, otherwise the adapter
+# will be applied on top of the previous one
+# Returns 0 on success
+def llama_apply_lora_from_file(
+    ctx: llama_context_p, path_lora: bytes, path_base_model: bytes, n_threads: c_int
+) -> c_int:
+    return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads)
+
+
+_lib.llama_apply_lora_from_file.argtypes = [llama_context_p, c_char_p, c_char_p, c_int]
+_lib.llama_apply_lora_from_file.restype = c_int
+
+
 # Returns the KV cache that will contain the context for the
 # ongoing prediction with the model.
 def llama_get_kv_cache(ctx: llama_context_p):
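For context, a minimal usage sketch of the new binding follows; it is not part of this diff. It assumes the module is importable as `llama_cpp` and that `llama_context_default_params` and `llama_init_from_file` are bound elsewhere in this file, as in upstream llama.cpp at this point; the model and adapter paths are placeholders.

```python
# Hypothetical usage sketch for llama_apply_lora_from_file; paths are placeholders.
import llama_cpp

# Load a base model first; per the comment above, the adapter must be applied
# to a freshly loaded model (reload before applying a different adapter).
params = llama_cpp.llama_context_default_params()
ctx = llama_cpp.llama_init_from_file(b"./models/7B/ggml-model-q4_0.bin", params)

# path_base_model may be None (maps to NULL via the c_char_p argtype) to patch
# the currently loaded model in place.
ret = llama_cpp.llama_apply_lora_from_file(
    ctx,
    b"./models/lora/ggml-adapter-model.bin",  # placeholder adapter path
    None,
    4,  # n_threads; ctypes converts the Python int to c_int
)
if ret != 0:
    raise RuntimeError("llama_apply_lora_from_file failed")
```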