Add support for Qwen2VL #10361
Merged Dec 14, 2024 (35 commits)
Changes from 1 commit

Commits (all by HimariO):
c17546f  Barebone Qwen2VL LLM convertor (Sep 21, 2024)
7c6f793  Add Qwen2VL cli entrypoint (Sep 22, 2024)
b24bd89  [WIP] add qwen2vl arch (Sep 25, 2024)
3541196  Verify m-rope output (Sep 29, 2024)
9d389a0  Add vl-rope/2d-rope support for qwen2vl ViT (Sep 30, 2024)
f661483  update qwen2vl cli tool (Oct 1, 2024)
3c3691e  update 5D tensor op workaround (Oct 2, 2024)
c13edfe  [WIP] qwen2vl vision model (Oct 10, 2024)
7e9fc72  make batch and clip utils compatible with qwen2vl (Oct 18, 2024)
bcd49f5  [WIP] create inference workflow, gguf convert script but fix (Oct 18, 2024)
023f007  correcting vision-rope behavior, add the missing last layer back to ViT (Oct 20, 2024)
3d19dd4  add arg parser to qwen2vl_surgery (Oct 20, 2024)
53480d2  replace variable size array with vector (Oct 21, 2024)
0882f57  cuda-gdb cmake preset (Oct 27, 2024)
3237bb4  add fp32 mrope, vision rope kernel (Oct 28, 2024)
201f704  add fp16 support for qwen2vl and m-rope (Oct 30, 2024)
f1fa60f  add `GGML_ROPE_TYPE_MROPE`, `GGML_ROPE_TYPE_VISION` (Oct 30, 2024)
241bb45  fix rope op mode switching, out dated func args (Nov 4, 2024)
07553cf  update `llama_hparams` (Nov 10, 2024)
fac0345  update to keep up stream changes (Nov 11, 2024)
cbd08b4  resolve linter, test errors (Nov 29, 2024)
6c39aa3  add makefile entry, update speical image padding token (Dec 7, 2024)
ac2089c  add mrope unit test, fix few compiler warnings (Dec 7, 2024)
12f17f7  rename `mrope` related function, params (Dec 7, 2024)
3ba7664  minor updates on debug util, bug fixs (Dec 9, 2024)
b24ab86  add `m-rope` testcase to `test-backend-ops` (Dec 9, 2024)
d7edc55  Merge branch 'master' into qwen2-vl (Dec 11, 2024)
9abb252  Apply suggestions from code review (Dec 13, 2024)
c292bf1  Merge branch 'ggerganov:master' into qwen2-vl (Dec 13, 2024)
e9748e4  fix traililng whitespce (Dec 13, 2024)
ef7f74b  store `llama_hparams.rope_sections` with fixed size array (Dec 13, 2024)
e2e9a6c  update position id tensor size check in GGML_OP_ROPE (Dec 13, 2024)
a02a190  minor updates (Dec 13, 2024)
19aba1d  update `ggml_backend_*_supports_op` of unsupported backends (Dec 13, 2024)
f96909e  remote old `rope_section` compare operator (Dec 14, 2024)
update 5D tensor op workaround
HimariO committed Nov 29, 2024
commit 3c3691e10f0e5342dee99956536f64a5aba28fc6 (signed with GitHub's verified signature; the key has since expired)
121 changes: 116 additions & 5 deletions examples/llava/qwen2vl-cli.cpp
@@ -19,12 +19,21 @@

static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
    int N = (int) tokens.size();
    std::vector<llama_pos> pos;
    for (int i = 0; i < N; i += n_batch) {
        int n_eval = (int) tokens.size() - i;
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }
        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
        auto batch = llama_batch_get_one(&tokens[i], n_eval, *n_past, 0);
        // TODO: add mrope pos ids somewhere else
        pos.resize(batch.n_tokens * 3);
        for (int j = 0; j < batch.n_tokens * 3; j ++) {
            pos[j] = j % batch.n_tokens;
        }
        batch.pos = pos.data();

        if (llama_decode(ctx_llama, batch)) {
            LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
            return false;
        }
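Note on the hunk above (not part of the diff): Qwen2VL's M-RoPE needs more than one position value per token, so the batch's `pos` buffer is sized `n_tokens * 3` and filled section by section, each section repeating the linear positions 0..n_tokens-1. A minimal standalone sketch of that layout, using plain ints instead of llama.cpp types and assuming a text-only batch where every section shares the same linear position:

#include <cstdio>
#include <vector>

int main() {
    // Sketch only: build an M-RoPE style position buffer for a text-only batch,
    // mirroring the workaround above where pos[j] = j % n_tokens.
    const int n_tokens   = 4;
    const int n_sections = 3;                      // as sized in the hunk above
    std::vector<int> pos(n_tokens * n_sections);
    for (int j = 0; j < n_tokens * n_sections; j++) {
        pos[j] = j % n_tokens;                     // [0..N-1, 0..N-1, 0..N-1]
    }
    for (int s = 0; s < n_sections; s++) {
        std::printf("section %d:", s);
        for (int t = 0; t < n_tokens; t++) {
            std::printf(" %d", pos[s * n_tokens + t]);
        }
        std::printf("\n");
    }
    return 0;
}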
@@ -296,9 +305,12 @@ static void tmp_test_conv2d_reshape(struct llava_context * ctx_llava, gpt_params
    ggml_set_input(inp_raw);

    auto image_pixels = batch_size * image_size_width * image_size_height * 3;
    auto one_ch = image_size_width * image_size_height;
    std::vector<float> dummy_img;
    dummy_img.resize(image_pixels);
    std::fill(dummy_img.begin(), dummy_img.end(), 0.1);
    std::fill(dummy_img.begin(), dummy_img.begin() + one_ch, 0.1);
    std::fill(dummy_img.begin() + one_ch, dummy_img.begin() + one_ch * 2, 0.2);
    std::fill(dummy_img.begin() + one_ch * 2, dummy_img.end(), 0.3);
    memcpy(inp_raw->data, dummy_img.data(), image_pixels * ggml_element_size(inp_raw));

    int patch_size = 14;
@@ -343,6 +355,105 @@ static void tmp_test_conv2d_reshape(struct llava_context * ctx_llava, gpt_params
        (float *) ggml_get_data(inp),
        sizeof(float) * num_patches * hidden_size * batch_size);
    ggml_free(ctx0);

    std::ofstream outFile("conv2d.bin", std::ios::binary);
    if (outFile.is_open()) {
        outFile.write(reinterpret_cast<const char*>(embd.data()), embd.size() * sizeof(float));

        outFile.close();
        std::cout << "Data successfully written to conv2d.bin" << std::endl;
    } else {
        std::cerr << "Error opening file!" << std::endl;
    }
}
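The dump above makes it possible to compare the graph output against a reference implementation offline. A minimal read-back sketch (not part of the PR); the file name and float element type are taken from the dump code above:

#include <cstdio>
#include <fstream>
#include <vector>

int main() {
    // Sketch only: load the raw float dump written by tmp_test_conv2d_reshape
    // and print the first few values for comparison against a reference.
    std::ifstream inFile("conv2d.bin", std::ios::binary);
    if (!inFile.is_open()) {
        std::fprintf(stderr, "Error opening conv2d.bin\n");
        return 1;
    }
    inFile.seekg(0, std::ios::end);
    const size_t n_floats = static_cast<size_t>(inFile.tellg()) / sizeof(float);
    inFile.seekg(0, std::ios::beg);

    std::vector<float> embd(n_floats);
    inFile.read(reinterpret_cast<char *>(embd.data()), n_floats * sizeof(float));

    for (size_t i = 0; i < 8 && i < embd.size(); i++) {
        std::printf("embd[%zu] = %f\n", i, embd[i]);
    }
    return 0;
}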


static void tmp_test_4d_reshape(struct llava_context * ctx_llava, gpt_params * params) {
    int image_size_width = 32;
    int image_size_height = 32;
    int batch_size = 1;

    static size_t buf_size = 512u*1024*1024;
    static void * buf = malloc(buf_size);

    struct ggml_init_params init_params = {
        /*.mem_size =*/ buf_size,
        /*.mem_buffer =*/ buf,
        /*.no_alloc =*/ false,
    };

    struct ggml_context * ctx0 = ggml_init(init_params);
    struct ggml_cgraph * gf = ggml_new_graph(ctx0);

    struct ggml_tensor * inp_raw = ggml_new_tensor_4d(
        ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 8, batch_size);
    ggml_set_name(inp_raw, "inp_raw");
    ggml_set_input(inp_raw);

    auto image_pixels = batch_size * image_size_width * image_size_height * 8;
    auto one_ch = image_size_width * image_size_height;
    std::vector<float> dummy_img;
    dummy_img.resize(image_pixels);
    for (int i = 0; i < 8; i++)
    {
        // std::fill(
        //     dummy_img.begin() + one_ch * i,
        //     dummy_img.begin() + one_ch * (i + 1),
        //     0.1 * i
        // );
        for (size_t y = 0; y < image_size_height; y++)
        {
            for (size_t x = 0; x < image_size_width; x++)
            {
                dummy_img[one_ch * i + image_size_width * y + x] = i * (image_size_width * y + x) / (float)(32 * 32);
            }

        }

    }
    memcpy(inp_raw->data, dummy_img.data(), image_pixels * ggml_element_size(inp_raw));

    int patch_size = 1;
    int hidden_size = 8;
    int patch_w = image_size_width / patch_size;
    int patch_h = image_size_height / patch_size;
    int num_patches = (image_size_width / patch_size) * (image_size_height / patch_size);

    // inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
    // inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); // swap axis 0 & 1, ignore axis 3 which is empty in this tensor
    // auto inp = ggml_cont(ctx0, ggml_permute(ctx0, inp_raw, 2, 0, 1, 3)); // [w, h, c, b] -> [c, w, h, b]
    auto inp = ggml_cont(ctx0, ggml_permute(ctx0, inp_raw, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b] [(0-->1), (1-->2), (2-->0), (3-->3)]
    inp = ggml_reshape_4d(
        ctx0, inp,
        hidden_size * 2, patch_w / 2, patch_h, batch_size);
    inp = ggml_reshape_4d(
        ctx0, inp,
        hidden_size * 2, patch_w / 2, 2, batch_size * (patch_h / 2));
    inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
    inp = ggml_reshape_2d(
        ctx0, inp,
        hidden_size * 4, (patch_w / 2) * batch_size * (patch_h / 2));

    ggml_build_forward_expand(gf, inp);
    ggml_graph_compute_with_ctx(ctx0, gf, 2);

    std::vector<float> embd;
    embd.resize(num_patches * hidden_size * batch_size);
    memcpy(
        embd.data(),
        (float *) ggml_get_data(inp),
        sizeof(float) * num_patches * hidden_size * batch_size);
    ggml_free(ctx0);

    std::ofstream outFile("reshape_4d.bin", std::ios::binary);
    if (outFile.is_open()) {
        outFile.write(reinterpret_cast<const char*>(embd.data()), embd.size() * sizeof(float));

        outFile.close();
        std::cout << "Data successfully written to reshape_4d.bin" << std::endl;
    } else {
        std::cerr << "Error opening file!" << std::endl;
    }
}
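ggml tensors have at most four dimensions, so the 2x2 spatial patch merge used by Qwen2VL's vision encoder cannot be expressed as a single 5D view; the reshape/permute chain above is the "5D tensor op workaround" named in the commit title. A plain-loop sketch (an assumption, not the ggml graph itself) of the index mapping this chain is meant to produce, merging each 2x2 block of neighboring patches into one row of width hidden_size * 4; the exact interleaving inside the merged row depends on the permute order:

#include <cstdio>
#include <vector>

int main() {
    // Sketch only: merge each 2x2 block of neighboring patches into one token.
    // Input layout assumed row-major: patches[p_y][p_x][c], grid patch_h x patch_w.
    const int patch_w = 4, patch_h = 4, hidden_size = 8;

    std::vector<float> patches(patch_h * patch_w * hidden_size);
    for (size_t i = 0; i < patches.size(); i++) {
        patches[i] = (float) i;
    }

    // Output: (patch_h/2 * patch_w/2) merged tokens, each hidden_size * 4 wide.
    std::vector<float> merged((patch_h / 2) * (patch_w / 2) * hidden_size * 4);

    for (int py = 0; py < patch_h / 2; py++) {
        for (int px = 0; px < patch_w / 2; px++) {
            float * dst = &merged[(py * (patch_w / 2) + px) * hidden_size * 4];
            int k = 0;
            for (int dy = 0; dy < 2; dy++) {       // walk the 2x2 neighborhood
                for (int dx = 0; dx < 2; dx++) {
                    const float * src = &patches[((py * 2 + dy) * patch_w + (px * 2 + dx)) * hidden_size];
                    for (int c = 0; c < hidden_size; c++) {
                        dst[k++] = src[c];
                    }
                }
            }
        }
    }

    std::printf("merged %d tokens of width %d\n",
                (patch_h / 2) * (patch_w / 2), hidden_size * 4);
    return 0;
}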


@@ -582,11 +693,11 @@ int main(int argc, char ** argv) {
    auto ctx_llava = llava_init_context(&params, model);

    // process the prompt
    // tmp_test_conv2d_reshape(ctx_llava, &params);
    tmp_test_4d_reshape(ctx_llava, &params);
    // tmp_test_rope(ctx_llava, &params);
    // tmp_test_mrope(ctx_llava, &params);
    tmp_test_mrope_2d(ctx_llava, &params);
    process_prompt(ctx_llava, nullptr, &params, params.prompt);
    // tmp_test_mrope_2d(ctx_llava, &params);
    // process_prompt(ctx_llava, nullptr, &params, params.prompt);

    llama_perf_context_print(ctx_llava->ctx_llama);
    ctx_llava->model = NULL;