@@ -631,7 +631,7 @@ struct clip_graph {
         }
 
         // arrangement of the [IMG_BREAK] token
-        {
+        if (model.token_embd_img_break) {
             // not efficient, but works
             // the trick is to view the embeddings as a 3D tensor with shape [n_embd, n_patches_per_row, n_rows]
             // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension
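
As a rough sketch of the trick the comments above describe (this is not the exact graph code from the change; cur, ctx0, n_rows and n_patches_per_row are assumed names, and the real block may differ), the per-row concatenation could be built like this:

    // view the flat patch embeddings [n_embd, n_patches] as [n_embd, n_patches_per_row, n_rows]
    ggml_tensor * tmp = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_per_row, n_rows);
    // materialize one copy of the [IMG_BREAK] embedding per row: [n_embd, 1, n_rows]
    ggml_tensor * tok = ggml_new_tensor_3d(ctx0, tmp->type, n_embd, 1, n_rows);
    tok = ggml_scale(ctx0, tok, 0.0f);                      // zero-fill
    tok = ggml_add(ctx0, tok, model.token_embd_img_break);  // broadcast the single embedding
    // append the token at the end of every row, then flatten back to 2D
    tmp = ggml_concat(ctx0, tmp, tok, 1);
    cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmp), n_embd, (n_patches_per_row + 1) * n_rows);
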
@@ -2289,6 +2289,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                 res = graph.build_siglip();
             } break;
         case PROJECTOR_TYPE_PIXTRAL:
+        case PROJECTOR_TYPE_LIGHTONOCR:
             {
                 res = graph.build_pixtral();
             } break;
@@ -2581,6 +2582,7 @@ struct clip_model_loader {
                 get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
             } break;
         case PROJECTOR_TYPE_PIXTRAL:
+        case PROJECTOR_TYPE_LIGHTONOCR:
             {
                 hparams.rope_theta = 10000.0f;
                 hparams.warmup_image_size = hparams.patch_size * 8;
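
For orientation only (the 16-pixel patch size is an assumption about the typical Pixtral-style checkpoint, not something stated in this diff):

    // e.g. hparams.patch_size = 16  ->  warmup_image_size = 16 * 8 = 128
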
@@ -2966,6 +2968,15 @@ struct clip_model_loader {
                 model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
                 model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
             } break;
+        case PROJECTOR_TYPE_LIGHTONOCR:
+            {
+                model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
+                model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
+                model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
+                model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
+                model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
+                model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
+            } break;
         case PROJECTOR_TYPE_ULTRAVOX:
             {
                 model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
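
These are the same tensor slots the existing Pixtral path consumes, so the projector itself is unchanged. As a minimal sketch of how a two-layer adapter of this shape is typically applied (assuming a GELU activation and standard ggml calls; this is not code copied from build_pixtral):

    // optional RMS pre-norm over the vision embeddings
    if (model.mm_input_norm_w) {
        cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w);
    }
    // linear -> activation -> linear; both biases were loaded with required = false above
    cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
    if (model.mm_1_b) { cur = ggml_add(ctx0, cur, model.mm_1_b); }
    cur = ggml_gelu(ctx0, cur);
    cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
    if (model.mm_2_b) { cur = ggml_add(ctx0, cur, model.mm_2_b); }
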
@@ -3881,7 +3892,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         res_imgs->entries.push_back(std::move(img_f32));
         return true;
 
-    } else if (ctx->proj_type() == PROJECTOR_TYPE_PIXTRAL) {
+    } else if (ctx->proj_type() == PROJECTOR_TYPE_PIXTRAL
+            || ctx->proj_type() == PROJECTOR_TYPE_LIGHTONOCR
+            ) {
         clip_image_u8 resized_image;
         auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size);
         image_manipulation::bilinear_resize(*img, resized_image, new_size.width, new_size.height);
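
For illustration only, not the actual implementation of image_manipulation::calc_size_preserved_ratio: a helper with this signature typically scales the image so the longer side fits the configured image_size, then aligns each side to a multiple of patch_size, roughly like the self-contained sketch below.

    #include <algorithm>
    #include <cmath>

    struct img_size { int width, height; };

    // hypothetical stand-in for calc_size_preserved_ratio, for illustration
    static img_size preserved_ratio_sketch(img_size orig, int patch_size, int max_side) {
        const float scale = std::min(1.0f, (float) max_side / (float) std::max(orig.width, orig.height));
        auto align_up = [&](int v) {
            return (int) std::ceil(v * scale / patch_size) * patch_size;
        };
        return { align_up(orig.width), align_up(orig.height) };
    }

    // e.g. a 1500x500 scan with patch_size = 16 and max_side = 1024
    // scales by ~0.68 and lands on 1024x352 (both multiples of 16)
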
@@ -4125,12 +4138,17 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                 n_patches = x_patch * y_patch;
             } break;
         case PROJECTOR_TYPE_PIXTRAL:
+        case PROJECTOR_TYPE_LIGHTONOCR:
             {
                 // dynamic size
                 int n_merge = params.spatial_merge_size;
                 int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1);
                 int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1);
-                n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
+                if (ctx->model.token_embd_img_break) {
+                    n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
+                } else {
+                    n_patches = n_patches_y * n_patches_x;
+                }
             } break;
         case PROJECTOR_TYPE_VOXTRAL:
         case PROJECTOR_TYPE_ULTRAVOX:
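
A quick worked example of the count above (the numbers are illustrative, not taken from the PR):

    // 1024x768 input, patch_size = 16, spatial_merge_size = 2:
    //   n_patches_x = 1024 / 16 / 2 = 32
    //   n_patches_y =  768 / 16 / 2 = 24
    // with an [IMG_BREAK] embedding (Pixtral):  32 * 24 + 24 - 1 = 791 tokens
    // without one (the LightOnOCR case):        32 * 24         = 768 tokens
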
@@ -4508,6 +4526,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             } break;
         case PROJECTOR_TYPE_PIXTRAL:
         case PROJECTOR_TYPE_KIMIVL:
+        case PROJECTOR_TYPE_LIGHTONOCR:
             {
                 // set the 2D positions
                 int n_patches_per_col = image_size_width / patch_size;
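
As a hedged sketch of what "set the 2D positions" amounts to for a row-major patch grid (the buffer names below are illustrative, not the ones used in this function):

    std::vector<int32_t> pos_row(n_patches), pos_col(n_patches);
    for (int i = 0; i < n_patches; i++) {
        pos_row[i] = i / n_patches_per_col; // row index, feeds the height half of the 2D RoPE
        pos_col[i] = i % n_patches_per_col; // column index, feeds the width half
    }
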
@@ -4638,6 +4657,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return ctx->model.mm_model_peg_0_b->ne[0];
         case PROJECTOR_TYPE_MLP:
         case PROJECTOR_TYPE_PIXTRAL:
+        case PROJECTOR_TYPE_LIGHTONOCR:
             return ctx->model.mm_2_w->ne[1];
         case PROJECTOR_TYPE_MLP_NORM:
             return ctx->model.mm_3_b->ne[0];
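
For context on why ne[1] is the right index here, a small reminder of the ggml convention (illustrative, not code from the diff):

    // for y = ggml_mul_mat(ctx, W, x) with W->ne = { n_in, n_out, 1, 1 },
    // the result has ne[0] == n_out, so mm_2_w->ne[1] is the width of the
    // embeddings the projector hands to the language model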