support MiniCPM-V-2.5 #7599

Merged · 67 commits · Aug 9, 2024

Changes from 1 commit

Commits (67)
7a49a6f  init (tc-mb, May 23, 2024)
c536fa6  rename (tc-mb, May 23, 2024)
2b91903  add run android for termux in readme (tc-mb, May 23, 2024)
0480d5f  add android readme (tc-mb, May 23, 2024)
ec1cea7  add instructions in readme (tc-mb, May 23, 2024)
a491f45  change name in readme (tc-mb, May 23, 2024)
7573b63  Update README.md (iceflame89, May 23, 2024)
94dcaba  fixed line (harvestingmoon, May 23, 2024)
b31f51f  Merge pull request #1 from harvestingmoon/minicpm-v2.5 (tc-mb, May 24, 2024)
629420e  add result in readme (tc-mb, May 24, 2024)
b48708a  random pos_embed (tc-mb, May 26, 2024)
d9fbc1d  add positions index (tc-mb, May 26, 2024)
18fe620  change for ollama (tc-mb, May 26, 2024)
2997a68  change for ollama (tc-mb, May 26, 2024)
8541e99  better pos_embed in clip (tc-mb, May 26, 2024)
d8974b8  support ollama (tc-mb, May 27, 2024)
e73a0c7  updata cmakelist (tc-mb, May 28, 2024)
6366d62  updata cmakelist (tc-mb, May 28, 2024)
056d178  rename wrapper (tc-mb, May 28, 2024)
3c306f1  clear code (tc-mb, May 28, 2024)
9495504  replace and organize code (tc-mb, May 28, 2024)
b37ab0b  add link (tc-mb, May 28, 2024)
8767ce2  Merge branch 'prepare-PR-of-minicpm-v2.5' into prepare-PR (tc-mb, May 28, 2024)
8bd47ce  Merge pull request #7 from OpenBMB/prepare-PR (tc-mb, May 28, 2024)
28d4a7f  Merge pull request #8 from OpenBMB/master (tc-mb, May 28, 2024)
02eb445  sync master (tc-mb, May 28, 2024)
07f48f9  fix warnings (tc-mb, May 28, 2024)
c38d152  fix warnings (tc-mb, May 28, 2024)
88f5e6a  fix bug in bicubic resize when need resize iamge smaller (tc-mb, May 30, 2024)
a913ca4  receive review comments and modify (tc-mb, May 31, 2024)
a95a6d9  receive review comments and modify (tc-mb, Jun 2, 2024)
c390dd4  Merge branch 'ggerganov:master' into prepare-PR-of-minicpm-v2.5 (tc-mb, Jun 4, 2024)
efe4c61  put all code into llava dir (tc-mb, Jun 4, 2024)
ee5b850  Merge pull request #11 from OpenBMB/pr_add_all_in_llava (tc-mb, Jun 4, 2024)
77beb4d  Merge branch 'prepare-PR-of-minicpm-v2.5' into master (tc-mb, Jun 24, 2024)
cb8cfb9  Merge pull request #15 from OpenBMB/master (tc-mb, Jun 24, 2024)
8f03505  fix quality problem in pr code (tc-mb, Jun 25, 2024)
e68c8bc  change n_layer (tc-mb, Jun 25, 2024)
4c67d7c  add space in "-1" (tc-mb, Jun 25, 2024)
977941d  imitate reshape bug of python code (tc-mb, Jul 4, 2024)
3e6348b  fix bug in clip (tc-mb, Jul 7, 2024)
c5b6851  fix issues for merging (tc-mb, Jul 17, 2024)
5959b14  fix llama-minicpmv-cli in cmake file (tc-mb, Jul 19, 2024)
292a469  change pr readme (tc-mb, Jul 20, 2024)
be8b5b2  fix code review (tc-mb, Jul 22, 2024)
4c75583  remove in line 33 directory in the /cmakelists.txt (not in example, i… (tc-mb, Jul 22, 2024)
62fa15b  fix cmakefile (tc-mb, Jul 23, 2024)
dad4abe  add warn (tc-mb, Jul 23, 2024)
3642be9  fix KEY_HAS_MINICPMV_PROJ (tc-mb, Jul 23, 2024)
fcde997  remove load_image_size into clip_ctx (tc-mb, Jul 23, 2024)
6fd0937  remove the extern "C", MINICPMV_API (tc-mb, Jul 23, 2024)
107e1ed  fix uhd code for review comment (tc-mb, Jul 25, 2024)
72b9629  delete minicpmv-wrapper in pr (tc-mb, Jul 25, 2024)
f3d400d  remove uhd_image_embed (tc-mb, Jul 26, 2024)
65f7455  Modify 2 notes (tc-mb, Jul 26, 2024)
6e29913  clip : style changes (ggerganov, Aug 6, 2024)
f33071d  Merge pull request #19 from ggerganov/prepare-PR-of-minicpm-v2.5-gg (tc-mb, Aug 6, 2024)
f04c6e2  del common.h in clip (tc-mb, Aug 6, 2024)
5ec4de7  Merge branch 'master' into prepare-PR-of-minicpm-v2.5 (tc-mb, Aug 6, 2024)
5ab9577  fix Type-Check error (tc-mb, Aug 7, 2024)
28230d0  fix Type-Check error (tc-mb, Aug 7, 2024)
e3eff2a  fix Type-Check error (tc-mb, Aug 7, 2024)
0eb0bfa  fix Type-Check error (tc-mb, Aug 7, 2024)
712fd7c  fix makefile error (tc-mb, Aug 7, 2024)
616f3ea  fix ubuntu-make error (tc-mb, Aug 7, 2024)
2d14c81  try fix clip (tc-mb, Aug 8, 2024)
069631e  try fix 1 (tc-mb, Aug 9, 2024)
clip : style changes
ggerganov authored Aug 6, 2024
commit 6e299132e7f143ec6abb8f26ee14b0040592c983
73 changes: 35 additions & 38 deletions examples/llava/clip.cpp
@@ -563,18 +563,18 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     const auto & model = ctx->vision_model;
     const auto & hparams = model.hparams;
 
-    const int image_size = hparams.image_size;
-    int image_size_width = image_size;
-    int image_size_height = image_size;
+    const int image_size = hparams.image_size;
+    int image_size_width  = image_size;
+    int image_size_height = image_size;
     if (ctx->has_minicpmv_projector) {
-        if(load_image_size==nullptr){
-            load_image_size= clip_image_size_init();
+        if (load_image_size == nullptr) {
+            load_image_size = clip_image_size_init();
         }
         LOG_TEE("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
-        image_size_width = load_image_size->width;
-        image_size_height = load_image_size->height;
-        if (is_inf){
-            image_size_width = imgs->data->nx;
+        image_size_width  = load_image_size->width;
+        image_size_height = load_image_size->height;
+        if (is_inf) {
+            image_size_width  = imgs->data->nx;
             image_size_height = imgs->data->ny;
         }
     }
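(Review note: when the MiniCPM-V projector is active, the hunk above resolves the input resolution in three steps: start from the square hparams.image_size, override it with the size registered on the context, and at inference time (is_inf) use the actual image dimensions. A hypothetical helper expressing that precedence; clip.cpp inlines this logic rather than defining such a function:)

    // Hypothetical sketch, not a function in clip.cpp: the precedence used by
    // clip_image_build_graph when ctx->has_minicpmv_projector is set.
    static void resolve_input_size(int & w, int & h, int default_size,
                                   const clip_image_size * load_image_size,
                                   int img_nx, int img_ny, bool is_inf) {
        w = default_size;                  // square default from hparams.image_size
        h = default_size;
        if (load_image_size != nullptr) {  // size registered via clip_add_load_image_size
            w = load_image_size->width;
            h = load_image_size->height;
        }
        if (is_inf) {                      // at inference, the real image dims win
            w = img_nx;
            h = img_ny;
        }
    }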
@@ -618,7 +618,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     struct ggml_tensor * embeddings = inp;
     struct ggml_tensor * pos_embed;
 
-    if(ctx->has_llava_projector){
+    if (ctx->has_llava_projector) {
         // concat class_embeddings and patch_embeddings
         if (ctx->has_class_embedding) {
             embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
@@ -638,7 +638,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     embeddings =
         ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
 
-    if(ctx->has_minicpmv_projector){
+    if (ctx->has_minicpmv_projector) {
         int pos_w = image_size_width/patch_size;
         int pos_h = image_size_height/patch_size;
         pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
@@ -655,7 +655,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }
 
     // loop over layers
-    if (ctx->has_minicpmv_projector){
+    if (ctx->has_minicpmv_projector) {
         n_layer += 1;
     }
     for (int il = 0; il < n_layer - 1; il++) {
@@ -747,8 +747,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }
 
     // llava projector
-    if(ctx->has_llava_projector)
-    {
+    if (ctx->has_llava_projector) {
         embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
 
         struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
@@ -770,8 +769,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
             embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
 
-        }
-        else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+        } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
             embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
             embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
             // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
@@ -931,19 +929,20 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         }
     }
     // minicpmv projector
-    else if(ctx->has_minicpmv_projector)
+    else if (ctx->has_minicpmv_projector)
     {
         if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
             struct ggml_tensor * q = model.mm_model_query;
             { // layernorm
                 q = ggml_norm(ctx0, q, eps);
                 q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
             }
-            struct ggml_tensor *k, *v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
+            struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
             { // layernorm
                 v = ggml_norm(ctx0, v, eps);
                 v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b);
             }
+            struct ggml_tensor * k;
             { // position
                 // q = ggml_add(ctx0, q, model.mm_model_pos_embed);
                 k = ggml_add(ctx0, v, pos_embed);
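(Review note: this is the one non-cosmetic change in the commit. In a C/C++ comma-separated declaration each declarator carries its own `*` and its own initializer, so the old `struct ggml_tensor *k, *v = ...` initialized only `v` and merely declared `k`; splitting the declarations makes that explicit. A standalone illustration of the pitfall, using plain ints rather than ggml types:)

    #include <cstdio>

    int main() {
        int a = 1;
        int *p, *q = &a;   // one declaration, two declarators: only q is initialized
        p = &a;            // p must be assigned separately before use
        printf("%d %d\n", *p, *q);
        return 0;
    }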
@@ -1467,7 +1466,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     return new_clip;
 }
 
-void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size){
+void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
     ctx_clip->load_image_size = load_image_size;
 }
 
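(For context, a plausible call pattern for this setter, inferred only from the types visible in this diff; the actual caller is not part of this commit:)

    // Sketch: register the source image size with the clip context before
    // building the graph, so clip_image_build_graph can read it back through
    // ctx->load_image_size. `img` is assumed to be a previously loaded clip_image_u8.
    clip_image_size * load_image_size = clip_image_size_init();
    load_image_size->width  = img->nx;
    load_image_size->height = img->ny;
    clip_add_load_image_size(ctx_clip, load_image_size);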
@@ -1839,16 +1838,16 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
     LOG_TEE("%s: multiple %d\n", __func__, multiple);
     images.push_back(std::vector<clip_image_u8 *>());
 
-    if(multiple <= 1){
+    if (multiple <= 1) {
         auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true);
-        clip_image_u8 *source_image = clip_image_u8_init();
+        clip_image_u8 * source_image = clip_image_u8_init();
         bicubic_resize(*img, *source_image, best_size.first, best_size.second);
         // source_image = image.resize(best_size, Image.Resampling.BICUBIC)
         images[images.size()-1].push_back(source_image);
     }
-    else if(multiple > 1){
+    else if (multiple > 1) {
         auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size);
-        clip_image_u8 *source_image = clip_image_u8_init();
+        clip_image_u8 * source_image = clip_image_u8_init();
         bicubic_resize(*img, *source_image, best_size.first, best_size.second);
         // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
         LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
@@ -1858,7 +1857,7 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
         LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
 
         auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
-        clip_image_u8 *refine_image = clip_image_u8_init();
+        clip_image_u8 * refine_image = clip_image_u8_init();
         bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
 
         LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
@@ -1891,7 +1890,7 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
     return images;
 }
 
-int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip){
+int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
     const int max_slice_nums=9;
     const int scale_resolution=448;
     const int original_width = ctx_clip->load_image_size->width;
@@ -1906,16 +1905,15 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
 // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
 // res_imgs memory is being allocated here, previous allocations will be freed if found
 bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
-
-    if(clip_is_minicpmv(ctx)){
+    if (clip_is_minicpmv(ctx)) {
         std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img);
         res_imgs->size = 0;
-        for (size_t i = 0; i < imgs.size(); ++i){
+        for (size_t i = 0; i < imgs.size(); ++i) {
             res_imgs->size += imgs[i].size();
         }
         res_imgs->data = new clip_image_f32[res_imgs->size];
         int idx = 0;
-        for (size_t i = 0; i < imgs.size(); ++i){
+        for (size_t i = 0; i < imgs.size(); ++i) {
             for (size_t j = 0; j < imgs[i].size(); ++j) {
                 LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
                 clip_image_f32 * res = clip_image_f32_init();
@@ -2149,7 +2147,7 @@ int clip_n_patches(const struct clip_ctx * ctx) {
     return n_patches;
 }
 
-static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>>& pos) {
+static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
     assert(embed_dim % 2 == 0);
     int H = pos.size();
     int W = pos[0].size();
@@ -2173,7 +2171,7 @@ static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from
     return emb;
 }
 
-static std::vector<std::vector<std::vector<float>>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector<std::vector<std::vector<float>>>& grid) {
+static std::vector<std::vector<std::vector<float>>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector<std::vector<std::vector<float>>> & grid) {
     assert(embed_dim % 2 == 0);
     std::vector<std::vector<std::vector<float>>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2)
     std::vector<std::vector<std::vector<float>>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2)
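(Background on these helpers: they build the standard 2D sine-cosine positional embedding. Each grid axis gets a D/2-dimensional 1D embedding and the two halves are concatenated along the feature dimension, matching the (H, W, D/2) comments above. A minimal sketch of the 1D building block for a single scalar position; the exact storage layout in clip.cpp, interleaved versus two contiguous halves, follows its Python reference and may differ from this:)

    #include <cmath>
    #include <vector>

    // emb[2i] = sin(p / 10000^(2i/dim)), emb[2i+1] = cos(p / 10000^(2i/dim))
    static std::vector<float> sincos_1d(int dim, float p) {
        std::vector<float> emb(dim);
        for (int i = 0; i < dim / 2; ++i) {
            const float omega = 1.0f / std::pow(10000.0f, (2.0f * i) / dim);
            emb[2 * i]     = std::sin(p * omega);
            emb[2 * i + 1] = std::cos(p * omega);
        }
        return emb;
    }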
@@ -2269,12 +2267,12 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const auto & model = ctx->vision_model;
     const auto & hparams = model.hparams;
 
-    const int image_size = hparams.image_size;
-    int image_size_width = image_size;
-    int image_size_height = image_size;
+    const int image_size = hparams.image_size;
+    int image_size_width  = image_size;
+    int image_size_height = image_size;
     if (ctx->has_minicpmv_projector) {
-        image_size_width = imgs->data[0].nx;;
-        image_size_height = imgs->data[0].ny;
+        image_size_width  = imgs->data[0].nx;
+        image_size_height = imgs->data[0].ny;
     }
     const int patch_size = hparams.patch_size;
     const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
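(To make the num_patches arithmetic concrete: assuming the usual 14-pixel patch size for this family of vision encoders, a value this hunk does not show, together with the 448 scale_resolution used by the UHD slicer earlier in the diff:)

    // (448 / 14) * (448 / 14) = 32 * 32 = 1024 patches for a 448 x 448 slice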
@@ -2343,8 +2341,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                 ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed));
                 free(pos_embed_data);
             }
-        }
-        else{
+        } else {
             {
                 if (ctx->has_class_embedding) {
                     struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
1 change: 1 addition & 0 deletions examples/llava/clip.h
@@ -30,6 +30,7 @@ struct clip_image_size {
     int width;
     int height;
 };
+
 struct clip_image_u8_batch {
     struct clip_image_u8 * data;
     size_t size;