6 changes: 6 additions & 0 deletions common/arg.cpp
@@ -1803,6 +1803,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.mmproj_use_gpu = false;
}
).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
add_opt(common_arg({ "--clip-reduced-vram" },
"offload clip weights to CPU and stream at runtime to device (default: false)",
[](common_params & params) {
params.clip_reduced_vram = true;
}
).set_env("LLAMA_ARG_CLIP_REDUCED_VRAM"));
add_opt(common_arg(
{"--image", "--audio"}, "FILE",
"path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
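The hunk above only records the flag in common_params; how the value reaches the clip/mtmd code is not shown in this excerpt. A minimal plumbing sketch, assuming a matching field were added to mtmd_context_params (the clip_reduced_vram member below is hypothetical and not part of this diff):

// hypothetical plumbing sketch -- forward the new flag when creating the mtmd context
#include "common.h"
#include "mtmd.h"

static mtmd_context * init_vision_context(const common_params & params, const llama_model * model) {
    mtmd_context_params mparams = mtmd_context_params_default();
    mparams.use_gpu           = params.mmproj_use_gpu;
    mparams.clip_reduced_vram = params.clip_reduced_vram; // assumed field, not introduced by this diff
    return mtmd_init_from_file(params.mmproj.path.c_str(), model, mparams);
}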
175 changes: 175 additions & 0 deletions common/common.cpp
@@ -1184,6 +1184,181 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams;
}

// Initialize llama_context and related state using an already-loaded llama_model
// - Does not take ownership of the model passed in; caller manages its lifetime
// - Mirrors common_init_from_params except for model creation/free
struct common_init_result common_init_from_existing_model(common_params & params, struct llama_model * model_in) {
common_init_result iparams;

if (model_in == NULL) {
LOG_ERR("%s: model '%s' is NULL\n", __func__, params.model.path.c_str());
return iparams;
}

common_init_sampler_from_model(model_in, params.sampling);

const llama_vocab * vocab = llama_model_get_vocab(model_in);

auto cparams = common_context_params_to_llama(params);

llama_context * lctx = llama_init_from_model(model_in, cparams);
if (lctx == NULL) {
LOG_ERR("%s: failed to create context with existing model '%s'\n", __func__, params.model.path.c_str());
return iparams;
}

if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
params.ctx_shift = false;
}

if (!params.control_vectors.empty()) {
if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_model_n_layer(model_in);

const auto cvec = common_control_vector_load(params.control_vectors);
if (cvec.n_embd == -1) {
llama_free(lctx);

return iparams;
}

int err = llama_apply_adapter_cvec(
lctx,
cvec.data.data(),
cvec.data.size(),
cvec.n_embd,
params.control_vector_layer_start,
params.control_vector_layer_end);
if (err) {
llama_free(lctx);

return iparams;
}
}

if (llama_pooling_type(lctx) == LLAMA_POOLING_TYPE_RANK) {
bool ok = true;

if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
ok = false;
}

bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
bool has_rerank_prompt = llama_model_chat_template(model_in, "rerank") != NULL;

if (!has_eos && !has_sep && !has_rerank_prompt) {
LOG_WRN("%s: warning: vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n", __func__);
ok = false;
} else if (!has_eos) {
LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
}

if (!ok) {
llama_free(lctx);

return iparams;
}
}

// load and optionally apply lora adapters
for (auto & la : params.lora_adapters) {
llama_adapter_lora_ptr lora;
lora.reset(llama_adapter_lora_init(model_in, la.path.c_str()));
if (lora == nullptr) {
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
llama_free(lctx);
return iparams;
}

char buf[1024];
la.ptr = lora.get();
llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
la.task_name = buf;
llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
la.prompt_prefix = buf;
iparams.lora.emplace_back(std::move(lora)); // add to the list of loaded adapters
}

if (!params.lora_init_without_apply) {
common_set_adapter_lora(lctx, params.lora_adapters);
}

if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
params.sampling.ignore_eos = false;
}

// initialize the EOG logit biases once; they are reused below when --ignore-eos is set
for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
if (llama_vocab_is_eog(vocab, i)) {
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
params.sampling.logit_bias_eog.push_back({i, -INFINITY});
}
}

if (params.sampling.ignore_eos) {
// add EOG biases to the active set of logit biases
params.sampling.logit_bias.insert(
params.sampling.logit_bias.end(),
params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
}

if (params.sampling.penalty_last_n == -1) {
LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
params.sampling.penalty_last_n = llama_n_ctx(lctx);
}

if (params.sampling.dry_penalty_last_n == -1) {
LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
}

if (params.warmup) {
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);

llama_set_warmup(lctx, true);

std::vector<llama_token> tmp;
llama_token bos = llama_vocab_bos(vocab);
llama_token eos = llama_vocab_eos(vocab);

// some models (e.g. T5) don't have a BOS token
if (bos != LLAMA_TOKEN_NULL) {
tmp.push_back(bos);
}
if (eos != LLAMA_TOKEN_NULL) {
tmp.push_back(eos);
}
if (tmp.empty()) {
tmp.push_back(0);
}

if (llama_model_has_encoder(model_in)) {
llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
llama_token decoder_start_token_id = llama_model_decoder_start_token(model_in);
if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
decoder_start_token_id = bos;
}
tmp.clear();
tmp.push_back(decoder_start_token_id);
}
if (llama_model_has_decoder(model_in)) {
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
}
llama_memory_clear(llama_get_memory(lctx), true);
llama_synchronize(lctx);
llama_perf_context_reset(lctx);
llama_set_warmup(lctx, false);
}

iparams.context.reset(lctx);

return iparams;
}

std::string get_model_endpoint() {
const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
// We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
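The new helper mirrors common_init_from_params but never takes ownership of the model it is given. A minimal usage sketch under that assumption (the surrounding function is illustrative, not part of this diff):

// caller keeps ownership of the model and frees it after the context
#include "common.h"
#include "llama.h"

static bool run_with_shared_model(common_params & params) {
    llama_model_params mparams = common_model_params_to_llama(params);
    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
    if (model == nullptr) {
        return false;
    }

    common_init_result ires = common_init_from_existing_model(params, model);
    if (ires.context == nullptr) {
        llama_model_free(model); // the helper did not free the model on failure
        return false;
    }

    // ... use ires.context.get() together with `model` ...

    ires.context.reset();    // release the context before the model it references
    llama_model_free(model); // caller remains responsible for the model
    return true;
}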
4 changes: 4 additions & 0 deletions common/common.h
@@ -537,6 +537,8 @@ struct common_params {
bool has_speculative() const {
return !speculative.model.path.empty() || !speculative.model.hf_repo.empty();
}

bool clip_reduced_vram = false; // keep weights of vision models offloaded to CPU and stream them to the required device at runtime
};

// call once at the start of a program if it uses libcommon
@@ -657,6 +659,8 @@ struct common_init_result {
};

struct common_init_result common_init_from_params(common_params & params);
// Initialize only the context using an already-loaded llama_model*
struct common_init_result common_init_from_existing_model(common_params & params, struct llama_model * model);

struct llama_model_params common_model_params_to_llama ( common_params & params);
struct llama_context_params common_context_params_to_llama(const common_params & params);
1 change: 1 addition & 0 deletions tools/mtmd/clip-impl.h
@@ -490,3 +490,4 @@ static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) {
//

projector_type clip_get_projector_type(const struct clip_ctx * ctx);
int64_t clip_get_image_encode_timing(const struct clip_ctx * ctx);
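The accessor above exposes an image-encode timing from the clip context; its unit and the code that produces it are not part of this excerpt. A hypothetical usage sketch, assuming it reports the duration of the most recent encode:

// hypothetical: log the timing exposed by the new accessor
// (the unit depends on the clip implementation and is not defined in this diff)
static void log_last_image_encode_timing(const struct clip_ctx * ctx) {
    const int64_t t_encode = clip_get_image_encode_timing(ctx);
    LOG_INF("%s: last image encode timing: %lld\n", __func__, (long long) t_encode);
}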