Skip to content

Commit 7d0c938

Browse files
authored
feat: Initial Granite support (#1271)
- Add Granite to our tokenizer - Fix pre-processor to load context length correctly - Add strftime_now Jinja function for prompt templates - Update llama.cpp - Handle trtllm errors when not using trtllm Support depends on the engine: - `mistral.rs`, our default engine, doesn't support Granite yet. - `llama.cpp` does and works very well: ``` dynamo-run out=llamacpp ~/llms/granite-3.3-2b-instruct-Q4_K_M.gguf --context-length 16384 ``` - `vllm` also works very well: ``` dynamo-run in=http out=vllm ~/llms/granite-3.3-2b-instruct --context-length 16384 ``` - `sglang` mostly works, but it doesn't catch the stop token, so we catch it in the HTTP ingress and log an error. The Text ingress doesn't catch it because that check was disabled to make the raw echo engine work. A bit of work remains here. Closes: #1245
1 parent d784877 commit 7d0c938

File tree

7 files changed

+33
-12
lines changed

7 files changed

+33
-12
lines changed

Cargo.lock

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

lib/bindings/python/src/dynamo/llm/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16+
import logging
17+
1618
from dynamo._core import AggregatedMetrics as AggregatedMetrics
1719

1820
try:
@@ -45,3 +47,7 @@
4547
)
4648
except ImportError:
4749
pass # TensorRTLLM is not enabled by default
50+
except Exception as e:
51+
# Don't let TensorRTLLM break other engines
52+
logger = logging.getLogger(__name__)
53+
logger.exception(f"Error importing TensorRT-LLM components: {e}")

lib/engines/llamacpp/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,4 +38,4 @@ async-stream = { workspace = true }
3838
tokio = { workspace = true }
3939
tracing = { workspace = true }
4040

41-
llama-cpp-2 = { version = "0.1.103" }
41+
llama-cpp-2 = { version = "0.1.107" }

lib/llm/src/gguf.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ pub enum GGUFArchitecture {
5858
Qwen2,
5959
Qwen3,
6060
Gemma3,
61+
Granite,
6162
}
6263

6364
// Wraps from_str() for some convenience:

lib/llm/src/model_card/create.rs

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -115,15 +115,20 @@ impl ModelDeploymentCard {
115115
}
116116

117117
async fn from_repo(repo_id: &str, model_name: &str) -> anyhow::Result<Self> {
118+
// This is usually the right choice
118119
let context_length = file_json_field(
119-
&Path::join(&PathBuf::from(repo_id), "tokenizer_config.json"),
120-
"model_max_length",
120+
&Path::join(&PathBuf::from(repo_id), "config.json"),
121+
"max_position_embeddings",
121122
)
123+
// But sometimes this is
124+
.or_else(|_| {
125+
file_json_field(
126+
&Path::join(&PathBuf::from(repo_id), "tokenizer_config.json"),
127+
"model_max_length",
128+
)
129+
})
130+
// If neither of those are present let the engine default it
122131
.unwrap_or(0);
123-
tracing::trace!(
124-
context_length,
125-
"Loaded context length (model_max_length) from tokenizer_config.json"
126-
);
127132

128133
Ok(Self {
129134
display_name: model_name.to_string(),

lib/llm/src/preprocessor/prompt/template/formatters.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
use std::sync::Arc;
1717

18-
use super::tokcfg::{raise_exception, tojson, ChatTemplate};
18+
use super::tokcfg::{raise_exception, strftime_now, tojson, ChatTemplate};
1919
use super::{ContextMixins, HfTokenizerConfigJsonFormatter, JinjaEnvironment};
2020
use either::Either;
2121
use minijinja::Environment;
@@ -50,10 +50,11 @@ impl HfTokenizerConfigJsonFormatter {
5050
// todo: should we use this: minijinja_contrib::add_to_environment(&mut env);
5151
env.set_unknown_method_callback(minijinja_contrib::pycompat::unknown_method_callback);
5252

53-
// add custom functions and filters
54-
env.add_function("raise_exception", raise_exception);
5553
env.add_filter("tojson", tojson);
5654

55+
env.add_function("raise_exception", raise_exception);
56+
env.add_function("strftime_now", strftime_now);
57+
5758
let mut supports_add_generation_prompt = None;
5859

5960
match &chat_template.0 {

lib/llm/src/preprocessor/prompt/template/tokcfg.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
use std::{collections::HashMap, fs::File, path::Path};
1919

20+
use chrono::{DateTime, Local};
2021
use either::Either;
2122
use ggus::{GGufMetaKV, GGufReader};
2223
use memmap2::Mmap;
@@ -225,3 +226,10 @@ pub fn tojson(value: Value, kwargs: Kwargs) -> Result<Value, Error> {
225226
Value::from_safe_string(rv)
226227
})
227228
}
229+
230+
pub fn strftime_now(format_str: &str) -> Result<Value, Error> {
231+
let local: DateTime<Local> = Local::now();
232+
Ok(Value::from_safe_string(
233+
local.format(format_str).to_string(),
234+
))
235+
}

0 commit comments

Comments (0)