From f8e00d537f670906fa4efc8537b9d7e58c2bf997 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 2 Nov 2023 13:40:20 -0400 Subject: [PATCH 1/7] Update llama.cpp --- llama_cpp/llama.py | 58 ++++++++++++++++++++----- llama_cpp/llama_cpp.py | 28 +++++++++++- llama_cpp/server/app.py | 94 ++++++++++++++++++++++++++++++----------- vendor/llama.cpp | 2 +- 4 files changed, 144 insertions(+), 38 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index c9ea90fb4..705a4b217 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -230,8 +230,14 @@ def __init__( n_batch: int = 512, n_threads: Optional[int] = None, n_threads_batch: Optional[int] = None, + rope_scaling_type: Optional[int] = llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED, rope_freq_base: float = 0.0, rope_freq_scale: float = 0.0, + yarn_ext_factor: float = float("nan"), + yarn_attn_factor: float = 1.0, + yarn_beta_fast: float = 32.0, + yarn_beta_slow: float = 1.0, + yarn_orig_ctx: int = 0, mul_mat_q: bool = True, f16_kv: bool = True, logits_all: bool = False, @@ -255,30 +261,30 @@ def __init__( Args: model_path: Path to the model. - seed: Random seed. -1 for random. - n_ctx: Maximum context size. - n_batch: Maximum number of prompt tokens to batch together when calling llama_eval. n_gpu_layers: Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded. - main_gpu: Main GPU to use. - tensor_split: Optional list of floats to split the model across multiple GPUs. If None, the model is not split. + main_gpu: The GPU that is used for scratch and small tensors. + tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split. + vocab_only: Only load the vocabulary no weights. + use_mmap: Use mmap if possible. + use_mlock: Force the system to keep the model in RAM. + seed: Random seed. -1 for random. + n_ctx: Context size. + n_batch: Batch size for prompt processing (must be >= 32 to use BLAS) + n_threads: Number of threads to use. If None, the number of threads is automatically determined. + n_threads_batch: Number of threads to use for batch processing. If None, use n_threads. + rope_scaling_type: Type of rope scaling to use. rope_freq_base: Base frequency for rope sampling. rope_freq_scale: Scale factor for rope sampling. - low_vram: Use low VRAM mode. mul_mat_q: if true, use experimental mul_mat_q kernels f16_kv: Use half-precision for key/value cache. logits_all: Return logits for all tokens, not just the last token. - vocab_only: Only load the vocabulary no weights. - use_mmap: Use mmap if possible. - use_mlock: Force the system to keep the model in RAM. embedding: Embedding mode only. - n_threads: Number of threads to use. If None, the number of threads is automatically determined. last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model. lora_path: Path to a LoRA file to apply to the model. numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init) chat_format: String specifying the chat format to use when calling create_chat_completion. verbose: Print verbose output to stderr. - kwargs: Unused keyword arguments (for additional backwards compatibility). Raises: ValueError: If the model path does not exist. 
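Note on the hunk above: a minimal sketch of how the new RoPE scaling and YaRN arguments added to Llama.__init__ might be used from Python. The model path is a placeholder and the concrete values are illustrative only; per the patch, 0.0 for the rope_freq_* arguments and NaN for yarn_ext_factor defer to whatever the GGUF metadata specifies.

    import llama_cpp
    from llama_cpp import Llama

    llm = Llama(
        model_path="./models/model.gguf",   # placeholder path
        n_ctx=8192,
        rope_scaling_type=llama_cpp.LLAMA_ROPE_SCALING_YARN,
        rope_freq_base=0.0,                 # 0.0 = take the base frequency from the model
        rope_freq_scale=0.0,                # 0.0 = take the scaling factor from the model
        yarn_ext_factor=float("nan"),       # NaN = take the extrapolation mix from the model
        yarn_attn_factor=1.0,               # YaRN magnitude scaling factor
        yarn_beta_fast=32.0,                # YaRN low correction dim
        yarn_beta_slow=1.0,                 # YaRN high correction dim
        yarn_orig_ctx=4096,                 # illustrative original training context
    )
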
@@ -332,12 +338,30 @@ def __init__( self.context_params.n_batch = self.n_batch self.context_params.n_threads = self.n_threads self.context_params.n_threads_batch = self.n_threads_batch + self.context_params.rope_scaling_type = ( + rope_scaling_type if rope_scaling_type is not None else llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED + ) self.context_params.rope_freq_base = ( rope_freq_base if rope_freq_base != 0.0 else 0 ) self.context_params.rope_freq_scale = ( rope_freq_scale if rope_freq_scale != 0.0 else 0 ) + self.context_params.yarn_ext_factor = ( + yarn_ext_factor if yarn_ext_factor != 0.0 else 0 + ) + self.context_params.yarn_attn_factor = ( + yarn_attn_factor if yarn_attn_factor != 0.0 else 0 + ) + self.context_params.yarn_beta_fast = ( + yarn_beta_fast if yarn_beta_fast != 0.0 else 0 + ) + self.context_params.yarn_beta_slow = ( + yarn_beta_slow if yarn_beta_slow != 0.0 else 0 + ) + self.context_params.yarn_orig_ctx = ( + yarn_orig_ctx if yarn_orig_ctx != 0 else 0 + ) self.context_params.mul_mat_q = mul_mat_q self.context_params.f16_kv = f16_kv self.context_params.logits_all = logits_all @@ -1671,8 +1695,14 @@ def __getstate__(self): n_batch=self.n_batch, n_threads=self.context_params.n_threads, n_threads_batch=self.context_params.n_threads_batch, + rope_scaling_type=self.context_params.rope_scaling_type, rope_freq_base=self.context_params.rope_freq_base, rope_freq_scale=self.context_params.rope_freq_scale, + yarn_ext_factor=self.context_params.yarn_ext_factor, + yarn_attn_factor=self.context_params.yarn_attn_factor, + yarn_beta_fast=self.context_params.yarn_beta_fast, + yarn_beta_slow=self.context_params.yarn_beta_slow, + yarn_orig_ctx=self.context_params.yarn_orig_ctx, mul_mat_q=self.context_params.mul_mat_q, f16_kv=self.context_params.f16_kv, logits_all=self.context_params.logits_all, @@ -1709,6 +1739,12 @@ def __setstate__(self, state): n_threads_batch=state["n_threads_batch"], rope_freq_base=state["rope_freq_base"], rope_freq_scale=state["rope_freq_scale"], + rope_scaling_type=state["rope_scaling_type"], + yarn_ext_factor=state["yarn_ext_factor"], + yarn_attn_factor=state["yarn_attn_factor"], + yarn_beta_fast=state["yarn_beta_fast"], + yarn_beta_slow=state["yarn_beta_slow"], + yarn_orig_ctx=state["yarn_orig_ctx"], mul_mat_q=state["mul_mat_q"], f16_kv=state["f16_kv"], logits_all=state["logits_all"], diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index ba4e26b7d..b6216a5d9 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -192,6 +192,18 @@ def _load_shared_library(lib_base_name: str): LLAMA_FTYPE_MOSTLY_Q6_K = 18 LLAMA_FTYPE_GUESSED = 1024 +# enum llama_rope_scaling_type { +# LLAMA_ROPE_SCALING_UNSPECIFIED = -1, +# LLAMA_ROPE_SCALING_NONE = 0, +# LLAMA_ROPE_SCALING_LINEAR = 1, +# LLAMA_ROPE_SCALING_YARN = 2, +# LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN, +# }; +LLAMA_ROPE_SCALING_UNSPECIFIED = -1 +LLAMA_ROPE_SCALING_NONE = 0 +LLAMA_ROPE_SCALING_LINEAR = 1 +LLAMA_ROPE_SCALING_YARN = 2 +LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN # typedef struct llama_token_data { # llama_token id; // token id @@ -308,10 +320,16 @@ class llama_model_params(Structure): # uint32_t n_batch; // prompt processing maximum batch size # uint32_t n_threads; // number of threads to use for generation # uint32_t n_threads_batch; // number of threads to use for batch processing +# int8_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` # // ref: https://github.com/ggerganov/llama.cpp/pull/2054 -# float rope_freq_base; // RoPE base 
frequency, 0 = from model -# float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model +# float rope_freq_base; // RoPE base frequency, 0 = from model +# float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model +# float yarn_ext_factor; // YaRN extrapolation mix factor, NaN = from model +# float yarn_attn_factor; // YaRN magnitude scaling factor +# float yarn_beta_fast; // YaRN low correction dim +# float yarn_beta_slow; // YaRN high correction dim +# uint32_t yarn_orig_ctx; // YaRN original context size # // Keep the booleans together to avoid misalignment during copy-by-value. @@ -327,8 +345,14 @@ class llama_context_params(Structure): ("n_batch", c_uint32), ("n_threads", c_uint32), ("n_threads_batch", c_uint32), + ("rope_scaling_type", c_int8), ("rope_freq_base", c_float), ("rope_freq_scale", c_float), + ("yarn_ext_factor", c_float), + ("yarn_attn_factor", c_float), + ("yarn_beta_fast", c_float), + ("yarn_beta_slow", c_float), + ("yarn_orig_ctx", c_uint32), ("mul_mat_q", c_bool), ("f16_kv", c_bool), ("logits_all", c_bool), diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index f8d8c7658..73b660ae1 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -41,11 +41,7 @@ class Settings(BaseSettings): default=None, description="The alias of the model to use for generating completions.", ) - seed: int = Field(default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random.") - n_ctx: int = Field(default=2048, ge=1, description="The context size.") - n_batch: int = Field( - default=512, ge=1, description="The batch size to use per eval." - ) + # Model Params n_gpu_layers: int = Field( default=0, ge=-1, @@ -60,17 +56,6 @@ class Settings(BaseSettings): default=None, description="Split layers across multiple GPUs in proportion.", ) - rope_freq_base: float = Field( - default=0.0, description="RoPE base frequency" - ) - rope_freq_scale: float = Field( - default=0.0, description="RoPE frequency scaling factor" - ) - mul_mat_q: bool = Field( - default=True, description="if true, use experimental mul_mat_q kernels" - ) - f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.") - logits_all: bool = Field(default=True, description="Whether to return logits.") vocab_only: bool = Field( default=False, description="Whether to only return the vocabulary." ) @@ -82,17 +67,59 @@ class Settings(BaseSettings): default=llama_cpp.llama_mlock_supported(), description="Use mlock.", ) - embedding: bool = Field(default=True, description="Whether to use embeddings.") + # Context Params + seed: int = Field(default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random.") + n_ctx: int = Field(default=2048, ge=1, description="The context size.") + n_batch: int = Field( + default=512, ge=1, description="The batch size to use per eval." 
+ ) n_threads: int = Field( default=max(multiprocessing.cpu_count() // 2, 1), ge=1, description="The number of threads to use.", ) + n_threads_batch: int = Field( + default=max(multiprocessing.cpu_count() // 2, 1), + ge=0, + description="The number of threads to use when batch processing.", + ) + rope_scaling_type: int = Field( + default=llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED + ) + rope_freq_base: float = Field( + default=0.0, description="RoPE base frequency" + ) + rope_freq_scale: float = Field( + default=0.0, description="RoPE frequency scaling factor" + ) + yarn_ext_factor: float = Field( + default=float("nan") + ) + yarn_attn_factor: float = Field( + default=1.0 + ) + yarn_beta_fast: float = Field( + default=32.0 + ) + yarn_beta_slow: float = Field( + default=1.0 + ) + yarn_orig_ctx: int = Field( + default=0 + ) + mul_mat_q: bool = Field( + default=True, description="if true, use experimental mul_mat_q kernels" + ) + f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.") + logits_all: bool = Field(default=True, description="Whether to return logits.") + embedding: bool = Field(default=True, description="Whether to use embeddings.") + # Sampling Params last_n_tokens_size: int = Field( default=64, ge=0, description="Last n tokens to keep for repeat penalty calculation.", ) + # LoRA Params lora_base: Optional[str] = Field( default=None, description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model." @@ -101,14 +128,17 @@ class Settings(BaseSettings): default=None, description="Path to a LoRA file to apply to the model.", ) + # Backend Params numa: bool = Field( default=False, description="Enable NUMA support.", ) + # Chat Format Params chat_format: str = Field( default="llama-2", description="Chat format to use.", ) + # Cache Params cache: bool = Field( default=False, description="Use a cache to reduce processing times for evaluated prompts.", @@ -121,9 +151,11 @@ class Settings(BaseSettings): default=2 << 30, description="The size of the cache in bytes. Only used if cache is True.", ) + # Misc verbose: bool = Field( default=True, description="Whether to print debug information." 
) + # Server Params host: str = Field(default="localhost", description="Listen address") port: int = Field(default=8000, description="Listen port") interrupt_requests: bool = Field( @@ -345,27 +377,41 @@ def create_app(settings: Optional[Settings] = None): global llama llama = llama_cpp.Llama( model_path=settings.model, - seed=settings.seed, - n_ctx=settings.n_ctx, - n_batch=settings.n_batch, + # Model Params n_gpu_layers=settings.n_gpu_layers, main_gpu=settings.main_gpu, tensor_split=settings.tensor_split, + vocab_only=settings.vocab_only, + use_mmap=settings.use_mmap, + use_mlock=settings.use_mlock, + # Context Params + seed=settings.seed, + n_ctx=settings.n_ctx, + n_batch=settings.n_batch, + n_threads=settings.n_threads, + n_threads_batch=settings.n_threads_batch, + rope_scaling_type=settings.rope_scaling_type, rope_freq_base=settings.rope_freq_base, rope_freq_scale=settings.rope_freq_scale, + yarn_ext_factor=settings.yarn_ext_factor, + yarn_attn_factor=settings.yarn_attn_factor, + yarn_beta_fast=settings.yarn_beta_fast, + yarn_beta_slow=settings.yarn_beta_slow, + yarn_orig_ctx=settings.yarn_orig_ctx, mul_mat_q=settings.mul_mat_q, f16_kv=settings.f16_kv, logits_all=settings.logits_all, - vocab_only=settings.vocab_only, - use_mmap=settings.use_mmap, - use_mlock=settings.use_mlock, embedding=settings.embedding, - n_threads=settings.n_threads, + # Sampling Params last_n_tokens_size=settings.last_n_tokens_size, + # LoRA Params lora_base=settings.lora_base, lora_path=settings.lora_path, + # Backend Params numa=settings.numa, + # Chat Format Params chat_format=settings.chat_format, + # Misc verbose=settings.verbose, ) if settings.cache: diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 50337961a..4ff1046d7 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 50337961a678fce4081554b24e56e86b67660163 +Subproject commit 4ff1046d75e64f0e556d8dcd930ea25c23eb8b18 From 9136ce254043520b09d8dff522dc98c8f55e7d77 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 2 Nov 2023 13:45:32 -0400 Subject: [PATCH 2/7] Fix build examples --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index c633c0797..1026120cf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,6 +6,9 @@ option(LLAMA_BUILD "Build llama.cpp shared library and install alongside python if (LLAMA_BUILD) set(BUILD_SHARED_LIBS "On") + set(LLAMA_BUILD_TESTS "Off" CACHE BOOL "" FORCE) + set(LLAMA_BUILD_EXAMPLES "Off" CACHE BOOL "" FORCE) + set(LLAMA_BUILD_SERVER "Off" CACHE BOOL "" FORCE) if (APPLE AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64") # Need to disable these llama.cpp flags on Apple x86_64, # otherwise users may encounter invalid instruction errors From 46bd07a858fe5f80f7d9ff9f0aed755a8ddef42d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 2 Nov 2023 13:52:18 -0400 Subject: [PATCH 3/7] Exclude examples directory --- CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1026120cf..a798cf9fd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,9 +6,6 @@ option(LLAMA_BUILD "Build llama.cpp shared library and install alongside python if (LLAMA_BUILD) set(BUILD_SHARED_LIBS "On") - set(LLAMA_BUILD_TESTS "Off" CACHE BOOL "" FORCE) - set(LLAMA_BUILD_EXAMPLES "Off" CACHE BOOL "" FORCE) - set(LLAMA_BUILD_SERVER "Off" CACHE BOOL "" FORCE) if (APPLE AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64") # Need to disable these llama.cpp flags on Apple x86_64, # otherwise users may 
encounter invalid instruction errors @@ -18,6 +15,7 @@ if (LLAMA_BUILD) set(LLAMA_F16C "Off" CACHE BOOL "llama: enable F16C" FORCE) endif() add_subdirectory(vendor/llama.cpp) + add_subdirectory(vendor/llama.cpp/examples EXCLUDE_FROM_ALL) install( TARGETS llama LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp From bb961bb191291bf80ff845178f6233ab7a41f8e5 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 2 Nov 2023 13:53:57 -0400 Subject: [PATCH 4/7] Revert cmake changes --- CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a798cf9fd..c633c0797 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,7 +15,6 @@ if (LLAMA_BUILD) set(LLAMA_F16C "Off" CACHE BOOL "llama: enable F16C" FORCE) endif() add_subdirectory(vendor/llama.cpp) - add_subdirectory(vendor/llama.cpp/examples EXCLUDE_FROM_ALL) install( TARGETS llama LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp From 57c0ddfa5ca83e13080840c9d823e25057d480b3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 2 Nov 2023 14:12:02 -0400 Subject: [PATCH 5/7] Try actions/checkout@v4 --- .github/workflows/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 24448ec9f..fbe3584b0 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -17,7 +17,7 @@ jobs: python-version: ["3.8", "3.9", "3.10", "3.11"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: "true" - name: Set up Python ${{ matrix.python-version }} From 1815c7b193e05ad2ba258ebbe8bef5d340c41b56 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 2 Nov 2023 14:21:51 -0400 Subject: [PATCH 6/7] Try to update submodules --- .github/workflows/test.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index fbe3584b0..e9f447a9c 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -20,6 +20,8 @@ jobs: - uses: actions/checkout@v4 with: submodules: "true" + - name: Update submodules + run: git submodule update --init --recursive - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: From d645c882915eaf5f5d6428db63bcd3b579757809 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 2 Nov 2023 14:25:03 -0400 Subject: [PATCH 7/7] Revert --- .github/workflows/test.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index e9f447a9c..fbe3584b0 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -20,8 +20,6 @@ jobs: - uses: actions/checkout@v4 with: submodules: "true" - - name: Update submodules - run: git submodule update --init --recursive - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with:
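
For completeness, the new fields are also reachable at the ctypes level through the updated llama_context_params structure in llama_cpp.py. A small sketch, assuming the usual pattern of starting from the llama_context_default_params() binding (not shown in this patch) and overriding individual fields:

    import llama_cpp

    # Start from llama.cpp's defaults and override only the RoPE-related fields.
    cparams = llama_cpp.llama_context_default_params()
    cparams.rope_scaling_type = llama_cpp.LLAMA_ROPE_SCALING_LINEAR
    cparams.rope_freq_scale = 0.5      # e.g. 2x linear context extension
    cparams.yarn_orig_ctx = 0          # 0 = use the model's original context size
    print(cparams.rope_scaling_type, cparams.yarn_ext_factor)
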
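On the server side, the regrouped Settings model keeps the same field names, so the new context parameters can be supplied programmatically when building the app. A sketch, assuming a local GGUF model at a placeholder path and that uvicorn is installed:

    import uvicorn
    import llama_cpp
    from llama_cpp.server.app import Settings, create_app

    settings = Settings(
        model="./models/model.gguf",    # placeholder path
        n_ctx=8192,
        rope_scaling_type=llama_cpp.LLAMA_ROPE_SCALING_YARN,
        yarn_orig_ctx=4096,             # illustrative value
        n_gpu_layers=-1,                # offload all layers when a GPU build is available
    )
    app = create_app(settings=settings)
    uvicorn.run(app, host=settings.host, port=settings.port)
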