Skip to content

Commit 7d0c938

Browse files
authored
feat: Initial Granite support (#1271)
- Add Granite to our tokenizer - Fix pre-processor to load context length correctly - Add strftime_now Jinja function for prompt templates - Update llama.cpp - Handle trtllm errors when not using trtllm Support depends on the engine: - `mistral.rs`, our default engine, doesn't support Granite yet. - `llama.cpp` does and works very well: ``` dynamo-run out=llamacpp ~/llms/granite-3.3-2b-instruct-Q4_K_M.gguf --context-length 16384 ``` - `vllm` also works very well: ``` dynamo-run in=http out=vllm ~/llms/granite-3.3-2b-instruct --context-length 16384 ``` - `sglang` mostly works, but it doesn't catch the stop token, so we catch it in the HTTP ingress and log an error. The Text ingress doesn't catch it because that check was disabled to make the raw echo engine work. A bit of work remains here. Closes: #1245
1 parent d784877 commit 7d0c938

File tree

7 files changed

+33
-12
lines changed

7 files changed

+33
-12
lines changed

Cargo.lock

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

lib/bindings/python/src/dynamo/llm/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16+
import logging
17+
1618
from dynamo._core import AggregatedMetrics as AggregatedMetrics
1719

1820
try:
@@ -45,3 +47,7 @@
4547
)
4648
except ImportError:
4749
pass # TensorRTLLM is not enabled by default
50+
except Exception as e:
51+
# Don't let TensorRTLLM break other engines
52+
logger = logging.getLogger(__name__)
53+
logger.exception(f"Error importing TensorRT-LLM components: {e}")

lib/engines/llamacpp/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,4 +38,4 @@ async-stream = { workspace = true }
3838
tokio = { workspace = true }
3939
tracing = { workspace = true }
4040

41-
llama-cpp-2 = { version = "0.1.103" }
41+
llama-cpp-2 = { version = "0.1.107" }

lib/llm/src/gguf.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ pub enum GGUFArchitecture {
5858
Qwen2,
5959
Qwen3,
6060
Gemma3,
61+
Granite,
6162
}
6263

6364
// Wraps from_str() for some convenience:

lib/llm/src/model_card/create.rs

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -115,15 +115,20 @@ impl ModelDeploymentCard {
115115
}
116116

117117
async fn from_repo(repo_id: &str, model_name: &str) -> anyhow::Result<Self> {
118+
// This is usually the right choice
118119
let context_length = file_json_field(
119-
&Path::join(&PathBuf::from(repo_id), "tokenizer_config.json"),
120-
"model_max_length",
120+
&Path::join(&PathBuf::from(repo_id), "config.json"),
121+
"max_position_embeddings",
121122
)
123+
// But sometimes this is
124+
.or_else(|_| {
125+
file_json_field(
126+
&Path::join(&PathBuf::from(repo_id), "tokenizer_config.json"),
127+
"model_max_length",
128+
)
129+
})
130+
// If neither of those are present let the engine default it
122131
.unwrap_or(0);
123-
tracing::trace!(
124-
context_length,
125-
"Loaded context length (model_max_length) from tokenizer_config.json"
126-
);
127132

128133
Ok(Self {
129134
display_name: model_name.to_string(),

lib/llm/src/preprocessor/prompt/template/formatters.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
use std::sync::Arc;
1717

18-
use super::tokcfg::{raise_exception, tojson, ChatTemplate};
18+
use super::tokcfg::{raise_exception, strftime_now, tojson, ChatTemplate};
1919
use super::{ContextMixins, HfTokenizerConfigJsonFormatter, JinjaEnvironment};
2020
use either::Either;
2121
use minijinja::Environment;
@@ -50,10 +50,11 @@ impl HfTokenizerConfigJsonFormatter {
5050
// todo: should we use this: minijinja_contrib::add_to_environment(&mut env);
5151
env.set_unknown_method_callback(minijinja_contrib::pycompat::unknown_method_callback);
5252

53-
// add custom functions and filters
54-
env.add_function("raise_exception", raise_exception);
5553
env.add_filter("tojson", tojson);
5654

55+
env.add_function("raise_exception", raise_exception);
56+
env.add_function("strftime_now", strftime_now);
57+
5758
let mut supports_add_generation_prompt = None;
5859

5960
match &chat_template.0 {

lib/llm/src/preprocessor/prompt/template/tokcfg.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
use std::{collections::HashMap, fs::File, path::Path};
1919

20+
use chrono::{DateTime, Local};
2021
use either::Either;
2122
use ggus::{GGufMetaKV, GGufReader};
2223
use memmap2::Mmap;
@@ -225,3 +226,10 @@ pub fn tojson(value: Value, kwargs: Kwargs) -> Result<Value, Error> {
225226
Value::from_safe_string(rv)
226227
})
227228
}
229+
230+
pub fn strftime_now(format_str: &str) -> Result<Value, Error> {
231+
let local: DateTime<Local> = Local::now();
232+
Ok(Value::from_safe_string(
233+
local.format(format_str).to_string(),
234+
))
235+
}

0 commit comments

Comments (0)