From 10c830cdfe8b18114a66104276218ce656c0cf02 Mon Sep 17 00:00:00 2001 From: John <78893154+cmp-nct@users.noreply.github.com> Date: Thu, 1 Feb 2024 23:15:40 +0100 Subject: [PATCH 01/24] Create llava-survery-v2.py --- examples/llava/llava-survery-v2.py | 138 +++++++++++++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 examples/llava/llava-survery-v2.py diff --git a/examples/llava/llava-survery-v2.py b/examples/llava/llava-survery-v2.py new file mode 100644 index 0000000000000..51f9cb638fd95 --- /dev/null +++ b/examples/llava/llava-survery-v2.py @@ -0,0 +1,138 @@ +import argparse +import glob +import os +import torch +from safetensors.torch import load as safe_load, save as safe_save, safe_open, save_file + + +# Function to determine if file is a SafeTensor file +def is_safetensor_file(file_path): + return file_path.endswith('.safetensors') + + +# Unified loading function +def load_model(file_path): + if is_safetensor_file(file_path): + # return safe_load(file_path,framework="pt", device="cpu"), 'safetensor' + tensors = {} + with safe_open(file_path, framework="pt", device="cpu") as f: + for key in f.keys(): + tensors[key] = f.get_tensor(key).clone() + return tensors, 'safetensor' + else: + return torch.load(file_path, map_location=torch.device('cpu')), 'pytorch' + + +# Unified saving function +def save_model(model, file_path, file_type): + if file_type == 'safetensor': + # safe_save(model, file_path) + save_file(model, file_path) + else: + torch.save(model, file_path) + + +# Adapted function to clean vision tower from checkpoint +def clean_vision_tower_from_checkpoint(checkpoint_path): + checkpoint, file_type = load_model(checkpoint_path) + # file_type = 'pytorch' + model_path = os.path.dirname(checkpoint_path) + print(f"Searching for vision tower tensors in {checkpoint_path}") + clip_tensors = [k for k, v in checkpoint.items() if (k.startswith("model.vision_tower") ) ] + + if len(clip_tensors) > 0: + print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}") + # Adapted for file type + clip_path = os.path.join(model_path, "llava.clip") + + if os.path.exists(clip_path): + existing_clip, _ = load_model(clip_path) + else: + existing_clip = {} + # Update existing_clip with new tensors, avoid duplicates + for name in clip_tensors: + simple_name = name.replace("vision_tower.vision_tower.", "") + print(f"Adding {simple_name} to llava.clip") + if simple_name not in existing_clip: + existing_clip[simple_name] = checkpoint[name] + + # Save the updated clip tensors back to llava.clip + save_model(existing_clip, clip_path, 'pytorch') + + # Remove the tensors from the original checkpoint + for name in clip_tensors: + del checkpoint[name] + + # Save the updated checkpoint + checkpoint_path = checkpoint_path + save_model(checkpoint, checkpoint_path, file_type) + return True + return False + + +# Command-line interface setup +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model", required=True, help="Path to LLaVA v1.5+ model") +ap.add_argument("-C", "--clean-vision-tower", action="store_true", help="Remove any vision tower from the model files") +args = ap.parse_args() + +if args.clean_vision_tower: + # Generalized to handle both PyTorch and SafeTensors models + model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True) + # checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and path.startswith('pytorch')) or (path.endswith('.safetensors') and path.startswith('model'))] + checkpoint_paths = [path for 
path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])] + for last_checkpoint_path in checkpoint_paths: + print(f"Cleaning {last_checkpoint_path}") + if not clean_vision_tower_from_checkpoint(last_checkpoint_path): + print(f"No vision tower found in {last_checkpoint_path}") + # we break once none is found, so far all models append them at the end + break + print("Done! All vision tower tensors are removed from the model files and stored in llava.clip file.") + +# Now we look for the projector in the last checkpoint +model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True) +checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])] +last_checkpoint_path = checkpoint_paths[0] +first_checkpoint_path = checkpoint_paths[-1] + +print(f"Taking projector from {last_checkpoint_path}") + +# Load the checkpoint +first_checkpoint, file_type = load_model(first_checkpoint_path) +last_checkpoint, file_type = load_model(last_checkpoint_path) +mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("model.mm_projector") or k.startswith("vision_proj.")] +first_mm_tensors = [k for k, v in first_checkpoint.items() if k.startswith("model.image_newline")] + + + +if len(mm_tensors) == 0: + for k, v in last_checkpoint.items(): + print(k) + print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint)} tensors.") + print("No tensors found. Is this a LLaVA model?") + exit() + +print(f"Found {len(mm_tensors)} tensors to extract.") +print(f"Found additional {len(first_mm_tensors)} tensors to extract.") +# projector = {name: checkpoint.[name].float() for name in mm_tensors} +projector = {} +for name in mm_tensors: + projector[name] = last_checkpoint[name].float() +for name in first_mm_tensors: + projector[name] = first_checkpoint[name].float() + +save_model(projector, f"{args.model}/llava.projector", 'pytorch') + +for name in mm_tensors: + del last_checkpoint[name] +for name in first_mm_tensors: + del first_checkpoint[name] + +if len(mm_tensors) > 0: + save_model(last_checkpoint, last_checkpoint_path, file_type) +if len(first_mm_tensors) > 0: + save_model(first_checkpoint, first_checkpoint_path, file_type) + +print("Done!") +print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.") +print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") From 97dda1e098bf1908bfd34759d630bfe77ea84fb9 Mon Sep 17 00:00:00 2001 From: John <78893154+cmp-nct@users.noreply.github.com> Date: Thu, 1 Feb 2024 23:16:30 +0100 Subject: [PATCH 02/24] Update convert-image-encoder-to-gguf.py --- .../llava/convert-image-encoder-to-gguf.py | 26 ++++++++++++++++--- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py index f5a3c9b46f9e3..82acfb22595a3 100644 --- a/examples/llava/convert-image-encoder-to-gguf.py +++ b/examples/llava/convert-image-encoder-to-gguf.py @@ -80,12 +80,13 @@ def bytes_to_unicode(): help="Save a vision-only model. 
It can't be used to encode texts") ap.add_argument("--clip_model_is_vision", action="store_true", required=False, help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") +ap.add_argument("--clip_model_is_openclip", action="store_true", required=False, + help="The clip model is from openclip (for ViT-SO400M type))") ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp") -ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values") -ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values") ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None) # Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711 +# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5 default_image_mean = [0.48145466, 0.4578275, 0.40821073] default_image_std = [0.26862954, 0.26130258, 0.27577711] ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) @@ -105,7 +106,7 @@ def bytes_to_unicode(): # output in the same directory as the model if output_dir is None dir_model = args.model_dir -if args.clip_model_is_vision: +if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip: vocab = None tokens = None else: @@ -133,7 +134,7 @@ def bytes_to_unicode(): if args.use_f32: ftype = 0 -if args.clip_model_is_vision: +if args.clip_model_is_vision or args.clip_model_is_openclip: model = CLIPVisionModel.from_pretrained(dir_model) processor = None else: @@ -202,6 +203,23 @@ def bytes_to_unicode(): fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"]) block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"] fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count) + if "image_grid_pinpoints" in v_hparams: + # no nested array - flatten it + image_grid_pinpoints = [] + for pinpoint in v_hparams["image_grid_pinpoints"]: + image_grid_pinpoints.extend(pinpoint) + fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints) + if "image_crop_resolution" in v_hparams: + fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"]) + if "image_aspect_ratio" in v_hparams: + fout.add_string("clip.vision.image_aspect_ratio", v_hparams["image_aspect_ratio"]) + if "image_split_resolution" in v_hparams: + fout.add_uint32("clip.vision.image_split_resolution", v_hparams["image_split_resolution"]) + if "mm_patch_merge_type" in v_hparams: + fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"]) + if "mm_projector_type" in v_hparams: + fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"]) + if processor is not None: image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean From 8ebdaec76169c58472f1a97a71fc3548578eae00 Mon Sep 17 00:00:00 2001 From: John <78893154+cmp-nct@users.noreply.github.com> Date: Fri, 2 Feb 2024 00:25:08 +0100 Subject: [PATCH 03/24] Update convert-image-encoder-to-gguf.py --- 
.../llava/convert-image-encoder-to-gguf.py | 35 ++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py index 82acfb22595a3..115b6b35b4da0 100644 --- a/examples/llava/convert-image-encoder-to-gguf.py +++ b/examples/llava/convert-image-encoder-to-gguf.py @@ -203,8 +203,41 @@ def bytes_to_unicode(): fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"]) block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"] fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count) + # /** + # "image_grid_pinpoints": [ + # [ + # 336, + # 672 + # ], + # [ + # 672, + # 336 + # ], + # [ + # 672, + # 672 + # ], + # [ + # 1008, + # 336 + # ], + # [ + # 336, + # 1008 + # ] + # ], + # Flattened: + # [ + # 336, 672, + # 672, 336, + # 672, 672, + # 1008, 336, + # 336, 1008 + # ] + # * + # */ if "image_grid_pinpoints" in v_hparams: - # no nested array - flatten it + # flatten it image_grid_pinpoints = [] for pinpoint in v_hparams["image_grid_pinpoints"]: image_grid_pinpoints.extend(pinpoint) From 1f9367c1348808bc5e83756dbcdbf9cd59adef84 Mon Sep 17 00:00:00 2001 From: John <78893154+cmp-nct@users.noreply.github.com> Date: Fri, 2 Feb 2024 00:26:05 +0100 Subject: [PATCH 04/24] Rename llava-survery-v2.py to llava-surgery-v2.py --- examples/llava/{llava-survery-v2.py => llava-surgery-v2.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/llava/{llava-survery-v2.py => llava-surgery-v2.py} (100%) diff --git a/examples/llava/llava-survery-v2.py b/examples/llava/llava-surgery-v2.py similarity index 100% rename from examples/llava/llava-survery-v2.py rename to examples/llava/llava-surgery-v2.py From a27b9a45df460fd3a3b4c81858d7f55119126b09 Mon Sep 17 00:00:00 2001 From: John <78893154+cmp-nct@users.noreply.github.com> Date: Fri, 2 Feb 2024 01:48:14 +0100 Subject: [PATCH 05/24] Update convert-image-encoder-to-gguf.py will now search for projector --- .../llava/convert-image-encoder-to-gguf.py | 454 ++++++------------ 1 file changed, 143 insertions(+), 311 deletions(-) diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py index 115b6b35b4da0..a65b05f8a96db 100644 --- a/examples/llava/convert-image-encoder-to-gguf.py +++ b/examples/llava/convert-image-encoder-to-gguf.py @@ -1,327 +1,159 @@ import argparse +import glob import os -import json - import torch -import numpy as np -from gguf import * -from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel - -TEXT = "clip.text" -VISION = "clip.vision" - - -def k(raw_key: str, arch: str) -> str: - return raw_key.format(arch=arch) - - -def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool: - if name in ( - "logit_scale", - "text_model.embeddings.position_ids", - "vision_model.embeddings.position_ids", - ): - return True - - if has_llava and name in ["visual_projection.weight", "vision_model.post_layernorm.weight", "vision_model.post_layernorm.bias"]: - return True - - if name.startswith("v") and not has_vision: - return True - - if name.startswith("t") and not has_text: - return True - - return False - - -def get_tensor_name(name: str) -> str: - if "projection" in name: - return name - - if "mm_projector" in name: - return name.replace("model.mm_projector", "mm") - - return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", 
"blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln") - - -def bytes_to_unicode(): - """ - Returns list of utf-8 byte and a corresponding list of unicode strings. - The reversible bpe codes work on unicode strings. - This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. - When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. - This is a significant percentage of your normal, say, 32K bpe vocab. - To avoid that, we want lookup tables between utf-8 bytes and unicode strings. - And avoids mapping to whitespace/control characters the bpe code barfs on. - """ - bs = ( - list(range(ord("!"), ord("~") + 1)) - + list(range(ord("¡"), ord("¬") + 1)) - + list(range(ord("®"), ord("ÿ") + 1)) - ) - cs = bs[:] - n = 0 - for b in range(2**8): - if b not in bs: - bs.append(b) - cs.append(2**8 + n) - n += 1 - cs = [chr(n) for n in cs] - return dict(zip(bs, cs)) - - -ap = argparse.ArgumentParser(prog="convert_hf_to_gguf.py") -ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) -ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") -ap.add_argument("--text-only", action="store_true", required=False, - help="Save a text-only model. It can't be used to encode images") -ap.add_argument("--vision-only", action="store_true", required=False, - help="Save a vision-only model. It can't be used to encode texts") -ap.add_argument("--clip_model_is_vision", action="store_true", required=False, - help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") -ap.add_argument("--clip_model_is_openclip", action="store_true", required=False, - help="The clip model is from openclip (for ViT-SO400M type))") -ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") -ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp") -ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. 
Default is the original model directory", default=None) -# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711 -# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5 -default_image_mean = [0.48145466, 0.4578275, 0.40821073] -default_image_std = [0.26862954, 0.26130258, 0.27577711] -ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) -ap.add_argument('--image_std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) - -# with proper -args = ap.parse_args() - - -if args.text_only and args.vision_only: - print("--text-only and --image-only arguments cannot be specified at the same time.") - exit(1) - -if args.use_f32: - print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.") - -# output in the same directory as the model if output_dir is None -dir_model = args.model_dir - -if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip: - vocab = None - tokens = None -else: - with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: - vocab = json.load(f) - tokens = [key for key in vocab] - -with open(dir_model + "/config.json", "r", encoding="utf-8") as f: - config = json.load(f) - if args.clip_model_is_vision: - v_hparams = config - t_hparams = None +from safetensors.torch import load as safe_load, save as safe_save, safe_open, save_file + + +# Function to determine if file is a SafeTensor file +def is_safetensor_file(file_path): + return file_path.endswith('.safetensors') + + +# Unified loading function +def load_model(file_path): + if is_safetensor_file(file_path): + # return safe_load(file_path,framework="pt", device="cpu"), 'safetensor' + tensors = {} + with safe_open(file_path, framework="pt", device="cpu") as f: + for key in f.keys(): + tensors[key] = f.get_tensor(key).clone() + return tensors, 'safetensor' else: - v_hparams = config["vision_config"] - t_hparams = config["text_config"] - -# possible data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 -# -# map from ftype to string -ftype_str = ["f32", "f16"] - -ftype = 1 -if args.use_f32: - ftype = 0 - -if args.clip_model_is_vision or args.clip_model_is_openclip: - model = CLIPVisionModel.from_pretrained(dir_model) - processor = None -else: - model = CLIPModel.from_pretrained(dir_model) - processor = CLIPProcessor.from_pretrained(dir_model) - -fname_middle = None -has_text_encoder = True -has_vision_encoder = True -has_llava_projector = False -if args.text_only: - fname_middle = "text-" - has_vision_encoder = False -elif args.llava_projector is not None: - fname_middle = "mmproj-" - has_text_encoder = False - has_llava_projector = True -elif args.vision_only: - fname_middle = "vision-" - has_text_encoder = False -else: - fname_middle = "" - -output_dir = args.output_dir if args.output_dir is not None else dir_model -os.makedirs(output_dir, exist_ok=True) -output_prefix = os.path.basename(output_dir).replace("ggml_", "") -fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf") -fout = GGUFWriter(path=fname_out, arch="clip") - -fout.add_bool("clip.has_text_encoder", has_text_encoder) -fout.add_bool("clip.has_vision_encoder", has_vision_encoder) -fout.add_bool("clip.has_llava_projector", has_llava_projector) -fout.add_file_type(ftype) -model_name = 
config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model) -fout.add_name(model_name) -if args.text_only: - fout.add_description("text-only CLIP model") -elif args.vision_only and not has_llava_projector: - fout.add_description("vision-only CLIP model") -elif has_llava_projector: - fout.add_description("image encoder for LLaVA") - # add projector type - fout.add_string("clip.projector_type", args.projector_type) -else: - fout.add_description("two-tower CLIP model") + return torch.load(file_path, map_location=torch.device('cpu')), 'pytorch' -if has_text_encoder: - # text_model hparams - fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"]) - fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"]) - fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"]) - fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"])) - fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"]) - fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"]) - fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"]) - fout.add_token_list(tokens) -if has_vision_encoder: - # vision_model hparams - fout.add_uint32("clip.vision.image_size", v_hparams["image_size"]) - fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"]) - fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"]) - fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"]) - fout.add_uint32("clip.vision.projection_dim", v_hparams.get("projection_dim", config["projection_dim"])) - fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"]) - fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"]) - block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"] - fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count) - # /** - # "image_grid_pinpoints": [ - # [ - # 336, - # 672 - # ], - # [ - # 672, - # 336 - # ], - # [ - # 672, - # 672 - # ], - # [ - # 1008, - # 336 - # ], - # [ - # 336, - # 1008 - # ] - # ], - # Flattened: - # [ - # 336, 672, - # 672, 336, - # 672, 672, - # 1008, 336, - # 336, 1008 - # ] - # * - # */ - if "image_grid_pinpoints" in v_hparams: - # flatten it - image_grid_pinpoints = [] - for pinpoint in v_hparams["image_grid_pinpoints"]: - image_grid_pinpoints.extend(pinpoint) - fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints) - if "image_crop_resolution" in v_hparams: - fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"]) - if "image_aspect_ratio" in v_hparams: - fout.add_string("clip.vision.image_aspect_ratio", v_hparams["image_aspect_ratio"]) - if "image_split_resolution" in v_hparams: - fout.add_uint32("clip.vision.image_split_resolution", v_hparams["image_split_resolution"]) - if "mm_patch_merge_type" in v_hparams: - fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"]) - if "mm_projector_type" in v_hparams: - fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"]) - - - if processor is not None: - image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean - image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else 
args.image_std +# Unified saving function +def save_model(model, file_path, file_type): + if file_type == 'safetensor': + # safe_save(model, file_path) + save_file(model, file_path) else: - image_mean = args.image_mean if args.image_mean is not None else default_image_mean - image_std = args.image_std if args.image_std is not None else default_image_std - fout.add_array("clip.vision.image_mean", image_mean) - fout.add_array("clip.vision.image_std", image_std) - -use_gelu = v_hparams["hidden_act"] == "gelu" -fout.add_bool("clip.use_gelu", use_gelu) - - -if has_llava_projector: - model.vision_model.encoder.layers.pop(-1) - projector = torch.load(args.llava_projector) - for name, data in projector.items(): - name = get_tensor_name(name) - # pw and dw conv ndim==4 - if data.ndim == 2 or data.ndim == 4: - data = data.squeeze().numpy().astype(np.float16) + torch.save(model, file_path) + + +# Adapted function to clean vision tower from checkpoint +def clean_vision_tower_from_checkpoint(checkpoint_path): + checkpoint, file_type = load_model(checkpoint_path) + # file_type = 'pytorch' + model_path = os.path.dirname(checkpoint_path) + print(f"Searching for vision tower tensors in {checkpoint_path}") + clip_tensors = [k for k, v in checkpoint.items() if (k.startswith("model.vision_tower") ) ] + + if len(clip_tensors) > 0: + print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}") + # Adapted for file type + clip_path = os.path.join(model_path, "llava.clip") + + if os.path.exists(clip_path): + existing_clip, _ = load_model(clip_path) else: - data = data.squeeze().numpy().astype(np.float32) - - fout.add_tensor(name, data) - - print("Projector tensors added\n") + existing_clip = {} + # Update existing_clip with new tensors, avoid duplicates + for name in clip_tensors: + simple_name = name[name.index('vision_model.'):] if 'vision_model.' 
in name else name + print(f"Adding {simple_name} to llava.clip") + if simple_name not in existing_clip: + existing_clip[simple_name] = checkpoint[name] + + # Save the updated clip tensors back to llava.clip + save_model(existing_clip, clip_path, 'pytorch') + + # Remove the tensors from the original checkpoint + for name in clip_tensors: + del checkpoint[name] + + # Save the updated checkpoint + checkpoint_path = checkpoint_path + save_model(checkpoint, checkpoint_path, file_type) + return True + return False -state_dict = model.state_dict() -for name, data in state_dict.items(): - if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector): - # we don't need this - print(f"skipping parameter: {name}") - continue +def find_relevant_checkpoints(checkpoint_paths, newline_criteria, projector): + newline_checkpoint_path = None + projector_checkpoint_path = None - name = get_tensor_name(name) - data = data.squeeze().numpy() + for path in checkpoint_paths: + checkpoint, _ = load_model(path) + if newline_criteria(checkpoint) and newline_checkpoint_path is None: + newline_checkpoint_path = path + if projector(checkpoint): + projector_checkpoint_path = path - n_dims = len(data.shape) + return newline_checkpoint_path, projector_checkpoint_path - # ftype == 0 -> float32, ftype == 1 -> float16 - ftype_cur = 0 - if n_dims == 4: - print(f"tensor {name} is always saved in f16") - data = data.astype(np.float16) - ftype_cur = 1 - elif ftype == 1: - if name[-7:] == ".weight" and n_dims == 2: - print(" Converting to float16") - data = data.astype(np.float16) - ftype_cur = 1 - else: - print(" Converting to float32") - data = data.astype(np.float32) - ftype_cur = 0 - else: - if data.dtype != np.float32: - print(" Converting to float32") - data = data.astype(np.float32) - ftype_cur = 0 +def newline_criteria(checkpoint): + return any(k.startswith("model.image_newline") for k in checkpoint.keys()) - print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") - fout.add_tensor(name, data) +def proj_criteria(checkpoint): + return any(k.startswith("model.mm_projector") or k.startswith("vision_proj.") for k in checkpoint.keys()) -fout.write_header_to_file() -fout.write_kv_data_to_file() -fout.write_tensors_to_file() -fout.close() +# Command-line interface setup +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model", required=True, help="Path to LLaVA v1.5+ model") +ap.add_argument("-C", "--clean-vision-tower", action="store_true", help="Remove any vision tower from the model files") +args = ap.parse_args() -print("Done. Output file: " + fname_out) +if args.clean_vision_tower: + # Generalized to handle both PyTorch and SafeTensors models + model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True) + # checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and path.startswith('pytorch')) or (path.endswith('.safetensors') and path.startswith('model'))] + checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])] + for projector_checkpoint_path in checkpoint_paths: + print(f"Cleaning {projector_checkpoint_path}") + if not clean_vision_tower_from_checkpoint(projector_checkpoint_path): + print(f"No vision tower found in {projector_checkpoint_path}") + # we break once none is found, so far all models append them at the end + break + print("Done! 
All vision tower tensors are removed from the model files and stored in llava.clip file.") + +# Now we look for the projector in the last checkpoint +model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True) +checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])] +# last_checkpoint_path = checkpoint_paths[0] +# first_checkpoint_path = checkpoint_paths[-1] +newline_checkpoint_path, projector_checkpoint_path = find_relevant_checkpoints(checkpoint_paths, newline_criteria, proj_criteria) + +print(f"Taking projector from {projector_checkpoint_path}") +print(f"Taking newline from {newline_checkpoint_path}") + +# Load the checkpoint +first_checkpoint, file_type = load_model(newline_checkpoint_path) +last_checkpoint, file_type = load_model(projector_checkpoint_path) +mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("model.mm_projector") or k.startswith("vision_proj.")] +first_mm_tensors = [k for k, v in first_checkpoint.items() if k.startswith("model.image_newline")] + + + +if len(mm_tensors) == 0: + for k, v in last_checkpoint.items(): + print(k) + print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint)} tensors.") + print("No tensors found. Is this a LLaVA model?") + exit() + +print(f"Found {len(mm_tensors)} tensors to extract.") +print(f"Found additional {len(first_mm_tensors)} tensors to extract.") +# projector = {name: checkpoint.[name].float() for name in mm_tensors} +projector = {} +for name in mm_tensors: + projector[name] = last_checkpoint[name].float() +for name in first_mm_tensors: + projector[name] = first_checkpoint[name].float() + +save_model(projector, f"{args.model}/llava.projector", 'pytorch') + +for name in mm_tensors: + del last_checkpoint[name] +for name in first_mm_tensors: + del first_checkpoint[name] + +if len(mm_tensors) > 0: + save_model(last_checkpoint, projector_checkpoint_path, file_type) +if len(first_mm_tensors) > 0: + save_model(first_checkpoint, newline_checkpoint_path, file_type) + +print("Done!") +print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.") +print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") From 440b2ae2b1cdaa53b5b546c569d658bd8cecfa6a Mon Sep 17 00:00:00 2001 From: John <78893154+cmp-nct@users.noreply.github.com> Date: Fri, 2 Feb 2024 02:07:29 +0100 Subject: [PATCH 06/24] Update convert-image-encoder-to-gguf.py whoops --- .../llava/convert-image-encoder-to-gguf.py | 454 ++++++++++++------ 1 file changed, 311 insertions(+), 143 deletions(-) diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py index a65b05f8a96db..115b6b35b4da0 100644 --- a/examples/llava/convert-image-encoder-to-gguf.py +++ b/examples/llava/convert-image-encoder-to-gguf.py @@ -1,159 +1,327 @@ import argparse -import glob import os +import json + import torch -from safetensors.torch import load as safe_load, save as safe_save, safe_open, save_file - - -# Function to determine if file is a SafeTensor file -def is_safetensor_file(file_path): - return file_path.endswith('.safetensors') - - -# Unified loading function -def load_model(file_path): - if is_safetensor_file(file_path): - # return safe_load(file_path,framework="pt", device="cpu"), 'safetensor' - tensors = {} - with safe_open(file_path, framework="pt", device="cpu") as f: - for key in 
f.keys(): - tensors[key] = f.get_tensor(key).clone() - return tensors, 'safetensor' - else: - return torch.load(file_path, map_location=torch.device('cpu')), 'pytorch' +import numpy as np +from gguf import * +from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel +TEXT = "clip.text" +VISION = "clip.vision" -# Unified saving function -def save_model(model, file_path, file_type): - if file_type == 'safetensor': - # safe_save(model, file_path) - save_file(model, file_path) - else: - torch.save(model, file_path) - - -# Adapted function to clean vision tower from checkpoint -def clean_vision_tower_from_checkpoint(checkpoint_path): - checkpoint, file_type = load_model(checkpoint_path) - # file_type = 'pytorch' - model_path = os.path.dirname(checkpoint_path) - print(f"Searching for vision tower tensors in {checkpoint_path}") - clip_tensors = [k for k, v in checkpoint.items() if (k.startswith("model.vision_tower") ) ] - - if len(clip_tensors) > 0: - print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}") - # Adapted for file type - clip_path = os.path.join(model_path, "llava.clip") - - if os.path.exists(clip_path): - existing_clip, _ = load_model(clip_path) - else: - existing_clip = {} - # Update existing_clip with new tensors, avoid duplicates - for name in clip_tensors: - simple_name = name[name.index('vision_model.'):] if 'vision_model.' in name else name - print(f"Adding {simple_name} to llava.clip") - if simple_name not in existing_clip: - existing_clip[simple_name] = checkpoint[name] - - # Save the updated clip tensors back to llava.clip - save_model(existing_clip, clip_path, 'pytorch') - - # Remove the tensors from the original checkpoint - for name in clip_tensors: - del checkpoint[name] - - # Save the updated checkpoint - checkpoint_path = checkpoint_path - save_model(checkpoint, checkpoint_path, file_type) + +def k(raw_key: str, arch: str) -> str: + return raw_key.format(arch=arch) + + +def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool: + if name in ( + "logit_scale", + "text_model.embeddings.position_ids", + "vision_model.embeddings.position_ids", + ): + return True + + if has_llava and name in ["visual_projection.weight", "vision_model.post_layernorm.weight", "vision_model.post_layernorm.bias"]: + return True + + if name.startswith("v") and not has_vision: + return True + + if name.startswith("t") and not has_text: return True + return False -def find_relevant_checkpoints(checkpoint_paths, newline_criteria, projector): - newline_checkpoint_path = None - projector_checkpoint_path = None - for path in checkpoint_paths: - checkpoint, _ = load_model(path) - if newline_criteria(checkpoint) and newline_checkpoint_path is None: - newline_checkpoint_path = path - if projector(checkpoint): - projector_checkpoint_path = path +def get_tensor_name(name: str) -> str: + if "projection" in name: + return name + + if "mm_projector" in name: + return name.replace("model.mm_projector", "mm") + + return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln") - return newline_checkpoint_path, projector_checkpoint_path -def newline_criteria(checkpoint): - return any(k.startswith("model.image_newline") for k in 
checkpoint.keys()) +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a significant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + + list(range(ord("¡"), ord("¬") + 1)) + + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) -def proj_criteria(checkpoint): - return any(k.startswith("model.mm_projector") or k.startswith("vision_proj.") for k in checkpoint.keys()) +ap = argparse.ArgumentParser(prog="convert_hf_to_gguf.py") +ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) +ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") +ap.add_argument("--text-only", action="store_true", required=False, + help="Save a text-only model. It can't be used to encode images") +ap.add_argument("--vision-only", action="store_true", required=False, + help="Save a vision-only model. It can't be used to encode texts") +ap.add_argument("--clip_model_is_vision", action="store_true", required=False, + help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") +ap.add_argument("--clip_model_is_openclip", action="store_true", required=False, + help="The clip model is from openclip (for ViT-SO400M type))") +ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") +ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp") +ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. 
Default is the original model directory", default=None) +# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711 +# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5 +default_image_mean = [0.48145466, 0.4578275, 0.40821073] +default_image_std = [0.26862954, 0.26130258, 0.27577711] +ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) +ap.add_argument('--image_std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) -# Command-line interface setup -ap = argparse.ArgumentParser() -ap.add_argument("-m", "--model", required=True, help="Path to LLaVA v1.5+ model") -ap.add_argument("-C", "--clean-vision-tower", action="store_true", help="Remove any vision tower from the model files") +# with proper args = ap.parse_args() -if args.clean_vision_tower: - # Generalized to handle both PyTorch and SafeTensors models - model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True) - # checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and path.startswith('pytorch')) or (path.endswith('.safetensors') and path.startswith('model'))] - checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])] - for projector_checkpoint_path in checkpoint_paths: - print(f"Cleaning {projector_checkpoint_path}") - if not clean_vision_tower_from_checkpoint(projector_checkpoint_path): - print(f"No vision tower found in {projector_checkpoint_path}") - # we break once none is found, so far all models append them at the end - break - print("Done! All vision tower tensors are removed from the model files and stored in llava.clip file.") - -# Now we look for the projector in the last checkpoint -model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True) -checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])] -# last_checkpoint_path = checkpoint_paths[0] -# first_checkpoint_path = checkpoint_paths[-1] -newline_checkpoint_path, projector_checkpoint_path = find_relevant_checkpoints(checkpoint_paths, newline_criteria, proj_criteria) - -print(f"Taking projector from {projector_checkpoint_path}") -print(f"Taking newline from {newline_checkpoint_path}") - -# Load the checkpoint -first_checkpoint, file_type = load_model(newline_checkpoint_path) -last_checkpoint, file_type = load_model(projector_checkpoint_path) -mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("model.mm_projector") or k.startswith("vision_proj.")] -first_mm_tensors = [k for k, v in first_checkpoint.items() if k.startswith("model.image_newline")] - - - -if len(mm_tensors) == 0: - for k, v in last_checkpoint.items(): - print(k) - print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint)} tensors.") - print("No tensors found. 
Is this a LLaVA model?") - exit() - -print(f"Found {len(mm_tensors)} tensors to extract.") -print(f"Found additional {len(first_mm_tensors)} tensors to extract.") -# projector = {name: checkpoint.[name].float() for name in mm_tensors} -projector = {} -for name in mm_tensors: - projector[name] = last_checkpoint[name].float() -for name in first_mm_tensors: - projector[name] = first_checkpoint[name].float() - -save_model(projector, f"{args.model}/llava.projector", 'pytorch') - -for name in mm_tensors: - del last_checkpoint[name] -for name in first_mm_tensors: - del first_checkpoint[name] - -if len(mm_tensors) > 0: - save_model(last_checkpoint, projector_checkpoint_path, file_type) -if len(first_mm_tensors) > 0: - save_model(first_checkpoint, newline_checkpoint_path, file_type) - -print("Done!") -print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.") -print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") + +if args.text_only and args.vision_only: + print("--text-only and --image-only arguments cannot be specified at the same time.") + exit(1) + +if args.use_f32: + print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.") + +# output in the same directory as the model if output_dir is None +dir_model = args.model_dir + +if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip: + vocab = None + tokens = None +else: + with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: + vocab = json.load(f) + tokens = [key for key in vocab] + +with open(dir_model + "/config.json", "r", encoding="utf-8") as f: + config = json.load(f) + if args.clip_model_is_vision: + v_hparams = config + t_hparams = None + else: + v_hparams = config["vision_config"] + t_hparams = config["text_config"] + +# possible data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 +# +# map from ftype to string +ftype_str = ["f32", "f16"] + +ftype = 1 +if args.use_f32: + ftype = 0 + +if args.clip_model_is_vision or args.clip_model_is_openclip: + model = CLIPVisionModel.from_pretrained(dir_model) + processor = None +else: + model = CLIPModel.from_pretrained(dir_model) + processor = CLIPProcessor.from_pretrained(dir_model) + +fname_middle = None +has_text_encoder = True +has_vision_encoder = True +has_llava_projector = False +if args.text_only: + fname_middle = "text-" + has_vision_encoder = False +elif args.llava_projector is not None: + fname_middle = "mmproj-" + has_text_encoder = False + has_llava_projector = True +elif args.vision_only: + fname_middle = "vision-" + has_text_encoder = False +else: + fname_middle = "" + +output_dir = args.output_dir if args.output_dir is not None else dir_model +os.makedirs(output_dir, exist_ok=True) +output_prefix = os.path.basename(output_dir).replace("ggml_", "") +fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf") +fout = GGUFWriter(path=fname_out, arch="clip") + +fout.add_bool("clip.has_text_encoder", has_text_encoder) +fout.add_bool("clip.has_vision_encoder", has_vision_encoder) +fout.add_bool("clip.has_llava_projector", has_llava_projector) +fout.add_file_type(ftype) +model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model) +fout.add_name(model_name) +if args.text_only: + fout.add_description("text-only CLIP model") +elif args.vision_only and not has_llava_projector: + fout.add_description("vision-only CLIP model") 
+elif has_llava_projector: + fout.add_description("image encoder for LLaVA") + # add projector type + fout.add_string("clip.projector_type", args.projector_type) +else: + fout.add_description("two-tower CLIP model") + +if has_text_encoder: + # text_model hparams + fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"]) + fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"]) + fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"]) + fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"])) + fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"]) + fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"]) + fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"]) + fout.add_token_list(tokens) + +if has_vision_encoder: + # vision_model hparams + fout.add_uint32("clip.vision.image_size", v_hparams["image_size"]) + fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"]) + fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"]) + fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"]) + fout.add_uint32("clip.vision.projection_dim", v_hparams.get("projection_dim", config["projection_dim"])) + fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"]) + fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"]) + block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"] + fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count) + # /** + # "image_grid_pinpoints": [ + # [ + # 336, + # 672 + # ], + # [ + # 672, + # 336 + # ], + # [ + # 672, + # 672 + # ], + # [ + # 1008, + # 336 + # ], + # [ + # 336, + # 1008 + # ] + # ], + # Flattened: + # [ + # 336, 672, + # 672, 336, + # 672, 672, + # 1008, 336, + # 336, 1008 + # ] + # * + # */ + if "image_grid_pinpoints" in v_hparams: + # flatten it + image_grid_pinpoints = [] + for pinpoint in v_hparams["image_grid_pinpoints"]: + image_grid_pinpoints.extend(pinpoint) + fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints) + if "image_crop_resolution" in v_hparams: + fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"]) + if "image_aspect_ratio" in v_hparams: + fout.add_string("clip.vision.image_aspect_ratio", v_hparams["image_aspect_ratio"]) + if "image_split_resolution" in v_hparams: + fout.add_uint32("clip.vision.image_split_resolution", v_hparams["image_split_resolution"]) + if "mm_patch_merge_type" in v_hparams: + fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"]) + if "mm_projector_type" in v_hparams: + fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"]) + + + if processor is not None: + image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean + image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std + else: + image_mean = args.image_mean if args.image_mean is not None else default_image_mean + image_std = args.image_std if args.image_std is not None else default_image_std + fout.add_array("clip.vision.image_mean", image_mean) + fout.add_array("clip.vision.image_std", image_std) + +use_gelu = v_hparams["hidden_act"] == "gelu" 
+fout.add_bool("clip.use_gelu", use_gelu) + + +if has_llava_projector: + model.vision_model.encoder.layers.pop(-1) + projector = torch.load(args.llava_projector) + for name, data in projector.items(): + name = get_tensor_name(name) + # pw and dw conv ndim==4 + if data.ndim == 2 or data.ndim == 4: + data = data.squeeze().numpy().astype(np.float16) + else: + data = data.squeeze().numpy().astype(np.float32) + + fout.add_tensor(name, data) + + print("Projector tensors added\n") + +state_dict = model.state_dict() +for name, data in state_dict.items(): + if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector): + # we don't need this + print(f"skipping parameter: {name}") + continue + + name = get_tensor_name(name) + data = data.squeeze().numpy() + + n_dims = len(data.shape) + + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype_cur = 0 + if n_dims == 4: + print(f"tensor {name} is always saved in f16") + data = data.astype(np.float16) + ftype_cur = 1 + elif ftype == 1: + if name[-7:] == ".weight" and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + + print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") + fout.add_tensor(name, data) + + +fout.write_header_to_file() +fout.write_kv_data_to_file() +fout.write_tensors_to_file() +fout.close() + +print("Done. Output file: " + fname_out) From 35b7a7a18393986f9052fe147e376352219c75af Mon Sep 17 00:00:00 2001 From: John <78893154+cmp-nct@users.noreply.github.com> Date: Fri, 2 Feb 2024 02:07:42 +0100 Subject: [PATCH 07/24] Update llava-surgery-v2.py --- examples/llava/llava-surgery-v2.py | 47 +++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/examples/llava/llava-surgery-v2.py b/examples/llava/llava-surgery-v2.py index 51f9cb638fd95..a5850b96e77d9 100644 --- a/examples/llava/llava-surgery-v2.py +++ b/examples/llava/llava-surgery-v2.py @@ -51,7 +51,7 @@ def clean_vision_tower_from_checkpoint(checkpoint_path): existing_clip = {} # Update existing_clip with new tensors, avoid duplicates for name in clip_tensors: - simple_name = name.replace("vision_tower.vision_tower.", "") + simple_name = name[name.index('vision_model.'):] if 'vision_model.' 
in name else name print(f"Adding {simple_name} to llava.clip") if simple_name not in existing_clip: existing_clip[simple_name] = checkpoint[name] @@ -69,6 +69,25 @@ def clean_vision_tower_from_checkpoint(checkpoint_path): return True return False +def find_relevant_checkpoints(checkpoint_paths, newline_criteria, projector): + newline_checkpoint_path = None + projector_checkpoint_path = None + + for path in checkpoint_paths: + checkpoint, _ = load_model(path) + if newline_criteria(checkpoint) and newline_checkpoint_path is None: + newline_checkpoint_path = path + if projector(checkpoint): + projector_checkpoint_path = path + + return newline_checkpoint_path, projector_checkpoint_path + +def newline_criteria(checkpoint): + return any(k.startswith("model.image_newline") for k in checkpoint.keys()) + +def proj_criteria(checkpoint): + return any(k.startswith("model.mm_projector") or k.startswith("vision_proj.") for k in checkpoint.keys()) + # Command-line interface setup ap = argparse.ArgumentParser() @@ -81,25 +100,27 @@ def clean_vision_tower_from_checkpoint(checkpoint_path): model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True) # checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and path.startswith('pytorch')) or (path.endswith('.safetensors') and path.startswith('model'))] checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])] - for last_checkpoint_path in checkpoint_paths: - print(f"Cleaning {last_checkpoint_path}") - if not clean_vision_tower_from_checkpoint(last_checkpoint_path): - print(f"No vision tower found in {last_checkpoint_path}") + for projector_checkpoint_path in checkpoint_paths: + print(f"Cleaning {projector_checkpoint_path}") + if not clean_vision_tower_from_checkpoint(projector_checkpoint_path): + print(f"No vision tower found in {projector_checkpoint_path}") # we break once none is found, so far all models append them at the end - break + # break print("Done! 
All vision tower tensors are removed from the model files and stored in llava.clip file.") # Now we look for the projector in the last checkpoint model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True) checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])] -last_checkpoint_path = checkpoint_paths[0] -first_checkpoint_path = checkpoint_paths[-1] +# last_checkpoint_path = checkpoint_paths[0] +# first_checkpoint_path = checkpoint_paths[-1] +newline_checkpoint_path, projector_checkpoint_path = find_relevant_checkpoints(checkpoint_paths, newline_criteria, proj_criteria) -print(f"Taking projector from {last_checkpoint_path}") +print(f"Taking projector from {projector_checkpoint_path}") +print(f"Taking newline from {newline_checkpoint_path}") # Load the checkpoint -first_checkpoint, file_type = load_model(first_checkpoint_path) -last_checkpoint, file_type = load_model(last_checkpoint_path) +first_checkpoint, file_type = load_model(newline_checkpoint_path) +last_checkpoint, file_type = load_model(projector_checkpoint_path) mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("model.mm_projector") or k.startswith("vision_proj.")] first_mm_tensors = [k for k, v in first_checkpoint.items() if k.startswith("model.image_newline")] @@ -129,9 +150,9 @@ def clean_vision_tower_from_checkpoint(checkpoint_path): del first_checkpoint[name] if len(mm_tensors) > 0: - save_model(last_checkpoint, last_checkpoint_path, file_type) + save_model(last_checkpoint, projector_checkpoint_path, file_type) if len(first_mm_tensors) > 0: - save_model(first_checkpoint, first_checkpoint_path, file_type) + save_model(first_checkpoint, newline_checkpoint_path, file_type) print("Done!") print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.") From 37a147ebf9c492af646bba349ee0d26e76bd6035 Mon Sep 17 00:00:00 2001 From: John <78893154+cmp-nct@users.noreply.github.com> Date: Thu, 8 Feb 2024 07:42:49 +0100 Subject: [PATCH 08/24] Clip: Bugfix for normalization (it did not loat the 3 std and mean values) Clip: bicubic resize function Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6) Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final convert-image-encoder: fixed image-grid flattening --- examples/llava/clip.cpp | 581 ++++++++++++++++-- examples/llava/clip.h | 41 +- .../llava/convert-image-encoder-to-gguf.py | 3 +- examples/llava/llava-surgery-v2.py | 5 +- examples/llava/llava.cpp | 260 +++++++- examples/server/server.cpp | 5 +- 6 files changed, 839 insertions(+), 56 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 9129052a223bb..8193945ee43d7 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -71,6 +71,11 @@ static std::string format(const char * fmt, ...) 
{ #define KEY_IMAGE_STD "clip.vision.image_std" #define KEY_PROJ_TYPE "clip.projector_type" +#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" +#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" +#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution" + + // // tensor name constants // @@ -94,6 +99,7 @@ static std::string format(const char * fmt, ...) { #define TN_LLAVA_PROJ "mm.%d.%s" #define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s" #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" +#define TN_IMAGE_NEWLINE "model.image_newline" enum projector_type { @@ -233,26 +239,6 @@ static projector_type clip_projector_type_from_string(const std::string & name) return PROJECTOR_TYPE_UNKNOWN; } -// -// image data -// - -// RGB uint8 image -struct clip_image_u8 { - int nx; - int ny; - - std::vector buf; -}; - -// RGB float32 image (NHWC) -// Memory layout: RGBRGBRGB... -struct clip_image_f32 { - int nx; - int ny; - - std::vector buf; -}; // // clip layers @@ -309,6 +295,7 @@ struct clip_vision_model { struct ggml_tensor * mm_0_b = NULL; struct ggml_tensor * mm_2_w = NULL; struct ggml_tensor * mm_2_b = NULL; + struct ggml_tensor * image_newline = NULL; // Yi type models with mlp+normalization projection struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4 @@ -370,6 +357,10 @@ struct clip_ctx { ggml_allocr * compute_alloc = NULL; }; +const struct clip_vision_hparams clip_get_vision_hparams(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams; +} + static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) { if (!ctx->has_vision_encoder) { printf("This gguf file seems to have no vision encoder\n"); @@ -382,6 +373,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 const int image_size = hparams.image_size; const int patch_size = hparams.patch_size; const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); + const int num_patches_per_side = image_size / patch_size; const int num_positions = num_patches + 1; const int hidden_size = hparams.hidden_size; const int n_head = hparams.n_head; @@ -582,7 +574,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); embeddings = ggml_gelu(ctx0, embeddings); - embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); @@ -966,12 +957,37 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE); hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision")); hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision")); + try { + int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS); + int n = gguf_get_arr_n(ctx, idx); + const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx); + for (int i = 0; i < 32 && pinpoints[i] != 0; ++i) { + hparams.image_grid_pinpoints[i] = pinpoints[i]; + } + hparams.image_grid_pinpoints[n] = 0; + } catch (std::runtime_error & e) { + hparams.image_grid_pinpoints[0]=0; + } + try { + int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE); + strcpy(hparams.mm_patch_merge_type, gguf_get_val_str(ctx, idx)); + } catch (std::runtime_error & e) { + strcpy(hparams.mm_patch_merge_type, "flat"); + } + try { + hparams.image_crop_resolution = get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6 + } + catch(const std::exception& e) { + 
hparams.image_crop_resolution = hparams.image_size; + } int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN); int idx_std = get_key_idx(ctx, KEY_IMAGE_STD); + const float * mean_data = (const float *)gguf_get_arr_data(ctx, idx_mean); + const float * std_data = (const float *)gguf_get_arr_data(ctx, idx_std); for (int i = 0; i < 3; ++i) { - new_clip->image_mean[i] = *((const float *)gguf_get_arr_data(ctx, idx_mean)); - new_clip->image_std[i] = *((const float *)gguf_get_arr_data(ctx, idx_std)); + new_clip->image_mean[i] = mean_data[i]; + new_clip->image_std[i] = std_data[i]; } if (verbosity >= 2) { @@ -983,14 +999,30 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { printf("v_projection_dim %d\n", hparams.projection_dim); printf("v_n_head %d\n", hparams.n_head); printf("v_n_layer %d\n", hparams.n_layer); - } - - vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); - vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD); - vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); - vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); - vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); + printf("v_eps %f\n", hparams.eps); + printf("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]); + printf("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]); + printf("v_image_grid_pinpoints: "); + for (int i = 0; i < 32 & hparams.image_grid_pinpoints[i]!=0; ++i) { + printf("%d ", hparams.image_grid_pinpoints[i]); + } + printf("\n"); + printf("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type); + } + try + { + vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); + vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD); + vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); + vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); + vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); + } + catch(const std::exception& e) + { + fprintf(stderr, "%s: failed to load vision model tensors\n", __func__); + } + // LLaVA projection if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) { vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight")); @@ -1015,6 +1047,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight")); vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias")); } catch (std::runtime_error & e) { } + try { + vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE); + // fprintf(stderr, "%s: image_newline tensor (llava-1.6) found\n", __func__); + } catch (std::runtime_error & e) { } } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { // MobileVLM projection @@ -1134,13 +1170,423 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length return true; } + +void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) { + std::cerr << "Failed to open file for writing: " << filename << std::endl; + 
return; + } + + // PPM header: P6 format, width, height, and max color value + file << "P6\n" << img.nx << " " << img.ny << "\n255\n"; + + // Write pixel data + for (size_t i = 0; i < img.buf.size(); i += 3) { + // PPM expects binary data in RGB format, which matches our image buffer + file.write(reinterpret_cast(&img.buf[i]), 3); + } + + file.close(); +} +void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) { + std::cerr << "Failed to open file for writing: " << filename << std::endl; + return; + } + + int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data + int bytesPerPixel = 3; + int widthInBytes = img.nx * bytesPerPixel; + int paddingAmount = (4 - (widthInBytes % 4)) % 4; + int stride = widthInBytes + paddingAmount; + + // Bitmap file header + unsigned char fileHeader[14] = { + 'B','M', // Signature + 0,0,0,0, // Image file size in bytes + 0,0,0,0, // Reserved + 54,0,0,0 // Start of pixel array + }; + + // Total file size + fileSize = 54 + (stride * img.ny); + fileHeader[2] = (unsigned char)(fileSize); + fileHeader[3] = (unsigned char)(fileSize >> 8); + fileHeader[4] = (unsigned char)(fileSize >> 16); + fileHeader[5] = (unsigned char)(fileSize >> 24); + + // Bitmap information header (BITMAPINFOHEADER) + unsigned char infoHeader[40] = { + 40,0,0,0, // Size of this header (40 bytes) + 0,0,0,0, // Image width + 0,0,0,0, // Image height + 1,0, // Number of color planes + 24,0, // Bits per pixel + 0,0,0,0, // No compression + 0,0,0,0, // Image size (can be 0 for no compression) + 0,0,0,0, // X pixels per meter (not specified) + 0,0,0,0, // Y pixels per meter (not specified) + 0,0,0,0, // Total colors (color table not used) + 0,0,0,0 // Important colors (all are important) + }; + + // Width and height in the information header + infoHeader[4] = (unsigned char)(img.nx); + infoHeader[5] = (unsigned char)(img.nx >> 8); + infoHeader[6] = (unsigned char)(img.nx >> 16); + infoHeader[7] = (unsigned char)(img.nx >> 24); + infoHeader[8] = (unsigned char)(img.ny); + infoHeader[9] = (unsigned char)(img.ny >> 8); + infoHeader[10] = (unsigned char)(img.ny >> 16); + infoHeader[11] = (unsigned char)(img.ny >> 24); + + // Write file headers + file.write(reinterpret_cast(fileHeader), sizeof(fileHeader)); + file.write(reinterpret_cast(infoHeader), sizeof(infoHeader)); + + // Pixel data + std::vector padding(3, 0); // Max padding size to be added to each row + for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top + for (int x = 0; x < img.nx; ++x) { + // Each pixel + size_t pixelIndex = (y * img.nx + x) * 3; + unsigned char pixel[3] = { + img.buf[pixelIndex + 2], // BMP stores pixels in BGR format + img.buf[pixelIndex + 1], + img.buf[pixelIndex] + }; + file.write(reinterpret_cast(pixel), 3); + } + // Write padding for the row + file.write(reinterpret_cast(padding.data()), paddingAmount); + } + + file.close(); +} + +// Linear interpolation between two points +inline float lerp(float s, float e, float t) { + return s + (e - s) * t; +} +// Bilinear resize function +void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) { + dst.nx = target_width; + dst.ny = target_height; + dst.buf.resize(3 * target_width * target_height); + + float x_ratio = static_cast(src.nx - 1) / target_width; + float y_ratio = static_cast(src.ny - 1) / target_height; + + for (int y = 0; y < target_height; y++) { + for (int x = 0; x < 
target_width; x++) { + float px = x_ratio * x; + float py = y_ratio * y; + int x_floor = static_cast(px); + int y_floor = static_cast(py); + float x_lerp = px - x_floor; + float y_lerp = py - y_floor; + + for (int c = 0; c < 3; c++) { + float top = lerp( + static_cast(src.buf[3 * (y_floor * src.nx + x_floor) + c]), + static_cast(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]), + x_lerp + ); + float bottom = lerp( + static_cast(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]), + static_cast(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]), + x_lerp + ); + dst.buf[3 * (y * target_width + x) + c] = static_cast(lerp(top, bottom, y_lerp)); + } + } + } +} + +// for replication purposes `.to(model.device, dtype=torch.float16)` +// converts a float to half precision and back to float +float simulateFloat16Precision(float value) { + // Convert float32 to float16 + uint32_t f32 = *reinterpret_cast(&value); + uint32_t sign = (f32 >> 16) & 0x8000; // Top bit (sign bit) + uint32_t exponent = ((f32 >> 23) & 0xFF) - 112; // Adjust bias (112 is bias of float16, 127 is bias of float32) + uint32_t mantissa = (f32 >> 13) & 0x3FF; // Keep top 10 bits (10 bits of precision in float16, 23 in float32) + + // Handle overflow/underflow + if ((f32 & 0x7FFFFFFF) > 0x477FE000) { // Not representable + exponent = 0x1F; + mantissa = 0; + } else if ((f32 & 0x7FFFFFFF) < 0x38800000) { // Too small for normal half precision + exponent = 0; + mantissa = 0; + } + + uint16_t f16 = sign | (exponent << 10) | mantissa; + + // Convert back to float32 + uint32_t sign32 = (f16 & 0x8000) << 16; + uint32_t exponent32 = ((f16 >> 10) & 0x1F); + uint32_t mantissa32 = (f16 & 0x3FF) << 13; + + // Adjust bias back + exponent32 = exponent32 == 0 ? 0 : exponent32 + 112; + + uint32_t f32Result = sign32 | (exponent32 << 23) | mantissa32; + float result = *reinterpret_cast(&f32Result); + + return result; +} +// Normalize image to float32 - supports float16 replication as in pytorch .to(model.device, dtype=torch.float16) +void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, const float mean[3], const float std[3], bool replicate_float16) { + dst->nx = src->nx; + dst->ny = src->ny; + dst->buf.resize(src->buf.size()); + + for (size_t i = 0; i < src->buf.size(); ++i) { + int c = i % 3; // rgb + dst->buf[i] = (static_cast(src->buf[i]) / 255.0f - mean[c]) / std[c]; + + if (replicate_float16) { + dst->buf[i] = simulateFloat16Precision(dst->buf[i]); + } + } +} +inline float clip(float x, float lower, float upper) +{ + return std::max(lower, std::min(x, upper)); +} +bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_width, int target_height) +{ + const int nx = img.nx; + const int ny = img.ny; + + dst.nx = target_width; + dst.ny = target_height; + dst.buf.resize(3 * target_width * target_height); + + int a, b, c, d, index; + float Ca, Cb, Cc; + float C[5]; + float d0, d2, d3, a0, a1, a2, a3; + int i, j, k, ii, jj; + int x, y; + float dx, dy; + float tx, ty; + + tx = (float)nx / (float)target_width; + ty = (float)ny / (float)target_height; + + float scale = std::max(tx, ty); + + // Bicubic interpolation; adapted from ViT.cpp, inspired from : + // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36 + // -> https://en.wikipedia.org/wiki/Bicubic_interpolation + + for (i = 0; i < target_height; i++) + { + for (j = 0; j < target_width; j++) + { + x = (int)(tx * j); + y = (int)(ty * i); + + dx = tx * j - x; + dy = ty * i - y; + + index = (y * nx 
+ x) * 3; + a = (y * nx + (x + 1)) * 3; + b = ((y + 1) * nx + x) * 3; + c = ((y + 1) * nx + (x + 1)) * 3; + + for (k = 0; k < 3; k++) + { + for (jj = 0; jj <= 3; jj++) + { + d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + + a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx; + + d0 = C[0] - C[1]; + d2 = C[2] - C[1]; + d3 = C[3] - C[1]; + a0 = C[1]; + a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy; + + const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f); + dst.buf[(i * target_width + j) * 3 + k] = float(Cc2); + } + } + } + } + + return true; +} + +// llava-1.6 type of resize_and_pad (black) +void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &image_output, const std::pair& target_resolution) { + int target_width = target_resolution.first; + int target_height = target_resolution.second; + + float scale_w = static_cast(target_width) / image.nx; + float scale_h = static_cast(target_height) / image.ny; + + int new_width, new_height; + + if (scale_w < scale_h) { + new_width = target_width; + new_height = std::min(static_cast(std::ceil(image.ny * scale_w)), target_height); + } else { + new_height = target_height; + new_width = std::min(static_cast(std::ceil(image.nx * scale_h)), target_width); + } + + clip_image_u8 resized_image; + // bilinear_resize(image, resized_image, new_width, new_height); + bicubic_resize(image, resized_image, new_width, new_height); + + clip_image_u8 padded_image; + padded_image.nx = target_width; + padded_image.ny = target_height; + padded_image.buf.resize(3 * target_width * target_height, 0); // Initialize with black + + // Calculate padding offsets + int pad_x = (target_width - new_width) / 2; + int pad_y = (target_height - new_height) / 2; + + // Copy the resized image into the center of the padded buffer + for (int y = 0; y < new_height; ++y) { + for (int x = 0; x < new_width; ++x) { + for (int c = 0; c < 3; ++c) { + padded_image.buf[3 * ((y + pad_y) * target_width + (x + pad_x)) + c] = resized_image.buf[3 * (y * new_width + x) + c]; + } + } + } + + image_output = std::move(padded_image); +} + + +/** + * Selects the best resolution from a list of possible resolutions based on the original size. + * + * @param original_size The original size of the image in the format (width, height). + * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...]. + * @return The best fit resolution in the format (width, height). 
+ */ +static std::pair select_best_resolution(const std::pair& original_size, const std::vector>& possible_resolutions) { + int original_width = original_size.first; + int original_height = original_size.second; + std::pair best_fit; + int max_effective_resolution = 0; + int min_wasted_resolution = std::numeric_limits::max(); + + for (const auto& resolution : possible_resolutions) { + int width = resolution.first; + int height = resolution.second; + float scale = std::min(static_cast(width) / original_width, static_cast(height) / original_height); + int downscaled_width = static_cast(original_width * scale); + int downscaled_height = static_cast(original_height * scale); + int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); + int wasted_resolution = (width * height) - effective_resolution; + // fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); + if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { + max_effective_resolution = effective_resolution; + min_wasted_resolution = wasted_resolution; + best_fit = resolution; + } + } + + return best_fit; +} + + +std::vector divide_to_patches_u8(const clip_image_u8& image, int patch_size) { + std::vector patches; + int width = image.nx; + int height = image.ny; + for (int i = 0; i < height; i += patch_size) { + for (int j = 0; j < width; j += patch_size) { + clip_image_u8 *patch = clip_image_u8_init(); + patch->nx = std::min(patch_size, width - j); + patch->ny = std::min(patch_size, height - i); + patch->buf.resize(3 * patch->nx * patch->ny); + for (int y = 0; y < patch->ny; ++y) { + for (int x = 0; x < patch->nx; ++x) { + for (int c = 0; c < 3; ++c) { + patch->buf[3 * (y * patch->nx + x) + c] = image.buf[3 * ((i + y) * width + (j + x)) + c]; + } + } + } + patches.push_back(patch); + } + } + return patches; +} + + +// debug function to convert f32 to u8 +void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) { + dst.nx = src.nx; + dst.ny = src.ny; + dst.buf.resize(3 * src.nx * src.ny); + for (size_t i = 0; i < src.buf.size(); ++i) { + dst.buf[i] = static_cast(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255)); + } +} + +/** + * @brief Get the anyres image grid shape object + * + * @param image_size + * @param grid_pinpoints + * @param image_patch_size + * @return + */ +struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair& image_size, const std::vector>& grid_pinpoints, int image_patch_size) { + /** + Conversion from gguf flat array to vector: + std::vector> possible_resolutions; + for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) { + possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]}); + } + */ + auto best_resolution = select_best_resolution(image_size, grid_pinpoints); + return {best_resolution.first / image_patch_size, best_resolution.second / image_patch_size}; +} + + // normalize: x = (x - mean) / std // TODO: implement bicubic interpolation instead of linear. 
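As an aside, the grid-selection step introduced above can be exercised on its own. Below is a minimal standalone sketch of the same best-fit scoring followed by the grid-shape division, assuming a hypothetical pinpoint list and a hypothetical 1000x600 input (real values come from clip.vision.image_grid_pinpoints and the 336px CLIP input size); with these assumptions it picks 672x672, i.e. a 2x2 grid of 336px tiles:

#include <algorithm>
#include <cstdio>
#include <limits>
#include <utility>
#include <vector>

// same scoring as select_best_resolution(): maximize effective pixels, then minimize wasted area
static std::pair<int, int> pick_best_resolution(std::pair<int, int> original,
                                                const std::vector<std::pair<int, int>> & candidates) {
    std::pair<int, int> best{0, 0};
    int max_effective = 0;
    int min_wasted = std::numeric_limits<int>::max();
    for (const auto & res : candidates) {
        float scale = std::min((float)res.first  / original.first,
                               (float)res.second / original.second);
        int eff = std::min((int)(original.first * scale) * (int)(original.second * scale),
                           original.first * original.second);
        int wasted = res.first * res.second - eff;
        if (eff > max_effective || (eff == max_effective && wasted < min_wasted)) {
            max_effective = eff;
            min_wasted    = wasted;
            best          = res;
        }
    }
    return best;
}

int main() {
    const int image_size = 336; // CLIP input resolution per tile
    // hypothetical pinpoints and input image, for illustration only
    std::vector<std::pair<int, int>> pinpoints = {{336, 672}, {672, 336}, {672, 672}, {1008, 336}, {336, 1008}};
    std::pair<int, int> img = {1000, 600};
    auto best = pick_best_resolution(img, pinpoints);
    // grid shape as in get_anyres_image_grid_shape(): best resolution divided by the tile size
    printf("best: %dx%d -> grid %dx%d\n", best.first, best.second,
           best.first / image_size, best.second / image_size);
    return 0;
}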
-bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res, const bool pad2square) { +// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patche tensors as a vector +bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std::vector& res_tensor, bool pad2square) { if (!ctx->has_vision_encoder) { printf("This gguf file seems to have no vision encoder\n"); return false; } + auto & params = ctx->vision_model.hparams; + // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing + if (strcmp(params.mm_patch_merge_type, "spatial_unpad") == 0) { + pad2square = false; + } else { + // pad2square = true; // todo: consider automatic decisions on that options for all models + } + // free the previous res_tensor + if (res_tensor.size() > 0) { + for (size_t i = 0; i < res_tensor.size(); i++) { + clip_image_f32_free(res_tensor[i]); + } + res_tensor.clear(); + } // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 @@ -1151,7 +1597,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli temp->nx = longer_side; temp->ny = longer_side; temp->buf.resize(3 * longer_side * longer_side); - const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA + const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA (this is the mean rgb color * 255) // fill with background color for (size_t i = 0; i < temp->buf.size(); i++) { @@ -1169,18 +1615,65 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli } } } else { - temp->nx = img->nx; - temp->ny = img->ny; - temp->buf.resize(img->buf.size()); - memcpy(temp->buf.data(), img->buf.data(), temp->buf.size()); + if (params.image_grid_pinpoints[0] != 0) + { + // "spatial_unpad" with "anyres" processing for llava-1.6 + std::vector> possible_resolutions; + for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) { + possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]}); + } + std::pair best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions); + // fprintf(stderr, "%s - Working with resolution: %d %d\n", __func__, best_resolution.first, best_resolution.second); + // clip_image_save_to_bmp(*img, "input.bmp"); + resize_and_pad_image(*img, *temp, best_resolution); // we do not pad with mean-bg color anymore in llava-1.6 + // clip_image_save_to_bmp(*temp, "resized.bmp"); + // visually verify normalized image: + // normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std); + // { + // clip_image_u8 * temp2 = clip_image_u8_init(); + // clip_image_convert_f32_to_u8(*res, *temp2); + // clip_image_save_to_bmp(*temp2, "resized_normalized_f32.bmp"); + // clip_image_u8_free(temp2); + // } + + std::vector patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6) + // fprintf(stderr, "patches: %d, %d\n", patches.size(), params.image_size); + + clip_image_u8 *image_original_resize = clip_image_u8_init(); + // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is 
"shortest_edge", but all CLIP are square ? + bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square ? + patches.insert(patches.begin(), image_original_resize); + + res_tensor.clear(); + for (auto& patch : patches) { + clip_image_f32 *temp_image_f32 = clip_image_f32_init(); + normalize_image_u8_to_f32(patch, temp_image_f32, ctx->image_mean, ctx->image_std, true); + res_tensor.push_back(temp_image_f32); + } + + for (size_t i = 0; i < patches.size(); i++) { + // printf("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny); + clip_image_u8_free(patches[i]); + } + + clip_image_u8_free(temp); + + return true; + } else { + temp->nx = img->nx; + temp->ny = img->ny; + temp->buf.resize(img->buf.size()); + memcpy(temp->buf.data(), img->buf.data(), temp->buf.size()); + } } const int nx = temp->nx; const int ny = temp->ny; + // clip_image_save_to_bmp(*temp, "resized_vanilla.bmp"); const int nx2 = ctx->vision_model.hparams.image_size; const int ny2 = ctx->vision_model.hparams.image_size; - + clip_image_f32 * res = clip_image_f32_init(); res->nx = nx2; res->ny = ny2; res->buf.resize(3 * nx2 * ny2); @@ -1234,6 +1727,13 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli } clip_image_u8_free(temp); + // { + // clip_image_u8 * temp2 = clip_image_u8_init(); + // clip_image_convert_f32_to_u8(*res, *temp2); + // clip_image_save_to_bmp(*temp2, "resized_normalized_f32_vanilla.bmp"); + // clip_image_u8_free(temp2); + // } + res_tensor.push_back(res); return true; } @@ -1302,6 +1802,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i type = static_cast(itype); auto * ctx_clip = clip_model_load(fname_inp, 2); + const auto & ctx_src = ctx_clip->ctx_gguf; const auto & ctx_data = ctx_clip->ctx_data; @@ -1495,6 +1996,10 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { } } +ggml_tensor *clip_get_newline_tensor(const struct clip_ctx * ctx) { + return ctx->vision_model.image_newline; +} + int clip_n_patches(const struct clip_ctx * ctx) { auto & params = ctx->vision_model.hparams; int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); @@ -1506,4 +2011,4 @@ int clip_n_patches(const struct clip_ctx * ctx) { size_t clip_embd_nbytes(const struct clip_ctx * ctx) { return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float); -} +} \ No newline at end of file diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 458a256a107fe..09346b603b259 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -3,6 +3,8 @@ #include #include +#include +#include #ifdef LLAMA_SHARED # if defined(_WIN32) && !defined(__MINGW32__) @@ -32,10 +34,20 @@ struct clip_vision_hparams { int32_t projection_dim; int32_t n_head; int32_t n_layer; + float eps; + + char mm_patch_merge_type[32]="flat"; // spatial_unpad or flat (default) + int32_t image_grid_pinpoints[32]; + int32_t image_crop_resolution; + }; +struct clip_ctx; +CLIP_API const struct clip_vision_hparams clip_get_vision_hparams(const struct clip_ctx * ctx); + CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity); +CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity); CLIP_API void clip_free(struct clip_ctx * ctx); @@ -44,6 +56,24 @@ CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); CLIP_API int clip_n_patches (const struct clip_ctx * ctx); CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * 
ctx); +// RGB uint8 image +CLIP_API struct clip_image_u8 { + int nx; + int ny; + + std::vector buf; +}; + +// RGB float32 image (NHWC) +// Memory layout: RGBRGBRGB... + CLIP_API struct clip_image_f32 { + int nx; + int ny; + + std::vector buf; +}; + + struct clip_image_u8_batch { struct clip_image_u8 * data; size_t size; @@ -53,6 +83,10 @@ struct clip_image_f32_batch { struct clip_image_f32 * data; size_t size; }; +CLIP_API struct clip_image_grid_shape { + int first; + int second; +}; CLIP_API struct clip_image_u8 * clip_image_u8_init (); CLIP_API struct clip_image_f32 * clip_image_f32_init(); @@ -61,11 +95,16 @@ CLIP_API void clip_image_u8_free (struct clip_image_u8 * img); CLIP_API void clip_image_f32_free(struct clip_image_f32 * img); CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); +CLIP_API void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename); +CLIP_API void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst); /** interpret bytes as an image file with length bytes_length, and use the result to populate img */ CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); +/** preprocess img and store the result in res_tensor, pad2square may be overriden to false depending on model configuration */ +CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std::vector& res_tensor, bool pad2square); +CLIP_API struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair& image_size, const std::vector>& grid_pinpoints, int image_patch_size); +CLIP_API struct ggml_tensor *clip_get_newline_tensor(const struct clip_ctx * ctx); -CLIP_API bool clip_image_preprocess (struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, bool pad2square); CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec); CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec); diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py index 115b6b35b4da0..ea331f2fe9875 100644 --- a/examples/llava/convert-image-encoder-to-gguf.py +++ b/examples/llava/convert-image-encoder-to-gguf.py @@ -240,7 +240,8 @@ def bytes_to_unicode(): # flatten it image_grid_pinpoints = [] for pinpoint in v_hparams["image_grid_pinpoints"]: - image_grid_pinpoints.extend(pinpoint) + for p in pinpoint: + image_grid_pinpoints.append(p) fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints) if "image_crop_resolution" in v_hparams: fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"]) diff --git a/examples/llava/llava-surgery-v2.py b/examples/llava/llava-surgery-v2.py index a5850b96e77d9..6b4fac80d29d7 100644 --- a/examples/llava/llava-surgery-v2.py +++ b/examples/llava/llava-surgery-v2.py @@ -13,11 +13,12 @@ def is_safetensor_file(file_path): # Unified loading function def load_model(file_path): if is_safetensor_file(file_path): - # return safe_load(file_path,framework="pt", device="cpu"), 'safetensor' tensors = {} with safe_open(file_path, framework="pt", device="cpu") as f: for key in f.keys(): tensors[key] = f.get_tensor(key).clone() + # output shape + print(f"{key} : {tensors[key].shape}") return tensors, 'safetensor' else: return torch.load(file_path, map_location=torch.device('cpu')), 'pytorch' @@ 
-156,4 +157,4 @@ def proj_criteria(checkpoint): print("Done!") print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.") -print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") +print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") \ No newline at end of file diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index d42e7582e8c66..3a0c4a8a4a874 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -6,27 +6,261 @@ #include #include #include +#include #include "base64.hpp" +// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out) +static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) { + struct temp_model { + struct ggml_tensor *newline; + struct ggml_context * ctx; + } model; + + auto & vparams = clip_get_vision_hparams(ctx_clip); + auto num_patches_per_side = vparams.image_size / vparams.patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches) + int num_patches_width = grid_shape.first; // grid 1-4 + int num_patches_height = grid_shape.second; // grid 1-4 + + // TODO: size calculation is not calculated - it's only tens of MB + size_t ctx_size = 0; + { + ctx_size += clip_embd_nbytes(ctx_clip) * image_embd_v.size() * 8; // image_features + ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32); // + } + + struct ggml_init_params params { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, // NOTE: this should be false when using the legacy API + }; + + // Python reference for full unpad: + // base_image_feature = image_feature[0] + // image_feature = image_feature[1:] + // image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() + // image_feature = image_feature.flatten(1, 2).flatten(2, 3) + // image_feature = unpad_image(image_feature, image_sizes[image_idx]) + // image_feature = torch.cat(( + // image_feature, + // self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1) + // ), dim=-1) + // image_feature = image_feature.flatten(1, 2).transpose(0, 1) + // image_feature = torch.cat((base_image_feature, image_feature), dim=0) + + // embeddings -> tokens -> 24 x 24 + /** + * We now have two options: unpad or no unpad - unpad removes tokens for faster llm eval + * In terms of result quality it appears to make no difference, so we'll start with the easier approach given 5D tensors are not supported in ggml yet + * Without unpad we have to split the sub-image embeddings into patches of 24 features each and permute them. + * Once all images are processed to prepended the base_image_features without any changes. 
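+     * Worked example of the shapes involved, assuming a 2x2 grid, 24x24 patches per side and a 4096-dim projector:
+     *   - each encoded segment: 576 x 4096 (24*24 patch embeddings)
+     *   - the 4 sub-images stacked (base excluded): 4 x 576 x 4096, reordered into a 48 x 48 spatial patch grid -> 2304 x 4096
+     *   - final output: 576 base embeddings + 2304 grid embeddings + 4 appended newline embeddings = 2884 positions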
+ */ + /** + Pytorch reference simplified, modified for ggml compatibility - confirmed identical output in python (for a 2x2 grid image (676x676 scaling)) + # image_feature = image_feature.view(2, 2, 24, 24, 4096) + # image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous() + # image_feature = image_feature.view(2, 24, 2, 24, 4096) + # image_feature = image_feature.flatten(0, 3) + + # Reshape to 4D tensor by merging the last two dimensions + image_feature = image_feature.view(2, 2, 24, 24*4096) + image_feature = image_feature.permute(0, 2, 1, 3).contiguous() + image_feature = image_feature.view(-1, 4096) + * + */ + model.ctx = ggml_init(params); + + ggml_context *ctx_noalloc = ggml_init({2048, NULL, true}); + // struct ggml_tensor * image_features = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip) * (image_embd_v.size() - 1)); + + ggml_tensor *newline_tmp = clip_get_newline_tensor(ctx_clip); + model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]); + if (newline_tmp->backend != GGML_BACKEND_CPU) { + if (newline_tmp->buffer == NULL) { + printf("newline_tmp tensor buffer is NULL\n"); + } + ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp)); + } else + { + model.newline->data = newline_tmp->data; + if (model.newline->data == NULL) { + printf("newline_tmp tensor data is NULL\n"); + } + } + + struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, image_embd_v.size() - 1, clip_n_patches(ctx_clip), clip_n_mmproj_embd(ctx_clip)); + // fill it with the image embeddings, ignoring the first + for (int i = 1; i < image_embd_v.size(); i++) + { + // printf("Copying image_embd_v[%d] to image_features tensor\n", i); + size_t offset = (i-1) * clip_embd_nbytes(ctx_clip); + + // for debugging we now try and set the entire tensor row to 0.0001f,0.0002f,0.0003f,0.0004f etc: + // float *floatPtr = static_cast(image_embd_v[i]); + // for (int j = 0; j < clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip); j++) + // { + // // floatPtr[j] = (j + 1) / 10000.0f; + // int feature = j % clip_n_mmproj_embd(ctx_clip) + 1; + // floatPtr[j] = i + feature / 10000.0f; + // } + memcpy((uint8_t *)(image_features->data) + offset, image_embd_v[i], clip_embd_nbytes(ctx_clip)); + } + // printf("image_features size = %d\n", clip_embd_nbytes(ctx_clip) * (image_embd_v.size() - 1)); + + struct ggml_cgraph * gf = ggml_new_graph(model.ctx); + // image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1) + size_t size_ele = ggml_type_size(GGML_TYPE_F32); + // struct ggml_tensor *dummy = ggml_new_tensor_4d(ctx_noalloc, GGML_TYPE_F32, num_patches_height, num_patches_width, num_patches_per_side, num_patches_per_side * clip_n_mmproj_embd(ctx_clip)); + + struct ggml_tensor *image_features_view = ggml_view_4d(model.ctx, image_features, + num_patches_height, // nb0 : 4 byte für jedes + num_patches_width, + num_patches_per_side * num_patches_per_side, + clip_n_mmproj_embd(ctx_clip), + + size_ele * num_patches_height, + size_ele * num_patches_height * num_patches_width, + size_ele * num_patches_height * num_patches_width * num_patches_per_side, + 0); + + struct ggml_tensor *image_features_patchview = ggml_view_4d(model.ctx, image_features, + num_patches_height, + num_patches_width, + num_patches_per_side, + num_patches_per_side * clip_n_mmproj_embd(ctx_clip), + + size_ele * num_patches_height, + size_ele * num_patches_height * num_patches_width, + size_ele * 
num_patches_height * num_patches_width * num_patches_per_side, 0); + + struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3)); + permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, permuted_cont, 0, 2, 1, 3)); // permute back to before - todo: fix bug + + struct ggml_tensor *prepared = ggml_view_2d(model.ctx, permuted_cont, num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, clip_n_mmproj_embd(ctx_clip), size_ele * num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, 0); + struct ggml_tensor *prepared_cont = ggml_cont(model.ctx, prepared); // not needed + // struct ggml_tensor *prepared_cont = prepared; // the view only flattens + + ggml_build_forward_expand(gf, prepared_cont); + + ggml_graph_compute_with_ctx(model.ctx, gf, 1); + + struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1]; + // ggml_tensor_printf(image_features,"image_features",__LINE__,false,true); + // ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,true); + // ggml_tensor_printf(prepared_cont,"prepared_cont",__LINE__,false,true); + + memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context + // append without newline tokens: + // memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip) * (image_embd_v.size()-1)); // grid patches + // append with newline tokens: + for (size_t i = 0; i < image_embd_v.size() - 1; ++i) { + // we append with +1 offset (base image is prepended) + memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip) * (i+1) + model.newline->ne[0] * i, + (float*)prepared_cont->data + i * clip_n_mmproj_embd(ctx_clip) * clip_n_patches(ctx_clip), + clip_embd_nbytes(ctx_clip)); + memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip) * (i+2) + model.newline->ne[0] * i , + (float*)model.newline->data, + ggml_nbytes(model.newline)); + } + + size_t newline_tokens = image_embd_v.size()-1; + *n_img_pos_out = prepared_cont->ne[0]+clip_n_patches(ctx_clip) + newline_tokens; + + // Debug: Test single segments + // Current findings: sending base image, sending a segment embedding all works similar to python + // However, permuted embeddings do not work yet (stride issue?) 
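+    // Sizing note, derived from the copies above (using a 2x2 grid with 576 patches per segment as an example):
+    // the caller-provided image_embd_out must hold (grid_w*grid_h + 1) * 576 + grid_w*grid_h embeddings,
+    // e.g. 5*576 + 4 = 2884, which fits within the clip_embd_nbytes(ctx_clip)*6 over-allocation
+    // used further down in llava_image_embed_make_with_clip_img.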
+ // memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as context + // memcpy(image_embd_out, (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip)); // main image as context + // *n_img_pos_out=576; + + ggml_free(model.ctx); + + return true; +} + + static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) { - clip_image_f32 * img_res = clip_image_f32_init(); - if (!clip_image_preprocess(ctx_clip, img, img_res, /*pad2square =*/ true)) { + std::vector img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336 + if (!clip_image_preprocess(ctx_clip, img, img_res_v, /*pad2square =*/ true)) { fprintf(stderr, "%s: unable to preprocess image\n", __func__); - clip_image_f32_free(img_res); + for (auto img_res : img_res_v) { + clip_image_f32_free(img_res); + } return false; } - *n_img_pos = clip_n_patches(ctx_clip); - const int64_t t_img_enc_start_us = ggml_time_us(); - bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); - clip_image_f32_free(img_res); - if (!encoded) { - fprintf(stderr, "Unable to encode image\n"); + auto & vparams = clip_get_vision_hparams(ctx_clip); + // DEBUG print the "shape" and the first 10 rows and 10 cols of img_res_v in exp format + // for (int i = 0; i < img_res_v.size(); i++) + // { + // printf("img_res_v[%d] shape: %d x %d\n", i, img_res_v[i]->nx, img_res_v[i]->ny); + // for (int j = 0; j < 10; j++) + // { + // for (int k = 0; k < 10; k++) + // { + // printf("%e ", img_res_v[i]->buf[j*img_res_v[i]->ny + k]); + // } + // printf("\n"); + // } + // } + + if (strcmp(vparams.mm_patch_merge_type, "spatial_unpad") != 0) + { + // flat / default llava-1.5 type embedding + *n_img_pos = clip_n_patches(ctx_clip); + bool encoded = clip_image_encode(ctx_clip, n_threads, img_res_v[0], image_embd); // image_embd shape is 576 x 4096 + clip_image_f32_free(img_res_v[0]); + if (!encoded) { + fprintf(stderr, "Unable to encode image\n"); + + return false; + } + } else + { + // spatial_unpad llava-1.6 type embedding + // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working + std::vector image_embd_v; + image_embd_v.resize(img_res_v.size()); + for (int i = 0; i < img_res_v.size(); i++) + { + image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184 + bool encoded = clip_image_encode(ctx_clip, n_threads, img_res_v[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside + clip_image_f32_free(img_res_v[i]); + if (!encoded) { + fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", i+1, img_res_v.size()); + return false; + } + } + const int64_t t_img_enc_batch_us = ggml_time_us(); + printf("%s: %d segments encoded in %8.2f ms\n", __func__, img_res_v.size(), (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); + + + std::vector> grid_pinpoints; + for (int i = 0; i < 32 && vparams.image_grid_pinpoints[i] != 0; i+=2) { + grid_pinpoints.push_back({vparams.image_grid_pinpoints[i], vparams.image_grid_pinpoints[i+1]}); + } + img_res_v.clear(); + struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, vparams.image_size); + + int n_img_pos_out; + handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, 
&n_img_pos_out); + *n_img_pos = n_img_pos_out; + + for (int i = 0; i < image_embd_v.size(); i++) + { + free(image_embd_v[i]); + } + image_embd_v.clear(); + + // debug image/segment/normalization content: + // clip_image_u8 * tmp = clip_image_u8_init(); + // clip_image_convert_f32_to_u8(*image_feature, *tmp); + // clip_image_save_to_bmp(*tmp, "image_feature.bmp"); - return false; } + printf("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); + const int64_t t_img_enc_end_us = ggml_time_us(); float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0; @@ -36,6 +270,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli return true; } + + bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) { // make sure that the correct mmproj was used, i.e., compare apples to apples int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama)); @@ -48,7 +284,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * } static bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) { - float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)); + float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model if (!image_embd) { fprintf(stderr, "Unable to allocate memory for image embeddings\n"); free(image_embd); @@ -151,7 +387,7 @@ LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct return NULL; } - auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length); + llava_image_embed *embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length); free(image_bytes); return embed; diff --git a/examples/server/server.cpp b/examples/server/server.cpp index ea77125eac99d..353bd89760819 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -943,13 +943,14 @@ struct llama_server_context { continue; } - clip_image_f32 * img_res = clip_image_f32_init(); - if (!clip_image_preprocess(clp_ctx, img.img_data, img_res, /*pad2square =*/ true)) + std::vector img_res_v; + if (!clip_image_preprocess(clp_ctx, img.img_data, img_res_v, /*pad2square =*/ true)) { LOG_TEE("Error processing the given image"); clip_free(clp_ctx); return false; } + clip_image_f32 * img_res = img_res_v[0]; img.image_tokens = clip_n_patches(clp_ctx); img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx)); if (!img.image_embedding) From 7dcadb4ec3fa233fc13b581acf533a19a3dc7480 Mon Sep 17 00:00:00 2001 From: John Date: Sun, 11 Feb 2024 03:30:36 +0100 Subject: [PATCH 09/24] whitespace corrections --- examples/llava/clip.cpp | 36 +++++++------- examples/llava/clip.h | 2 +- .../llava/convert-image-encoder-to-gguf.py | 2 +- examples/llava/llava-surgery-v2.py | 7 ++- examples/llava/llava.cpp | 49 +++++++++---------- 5 files changed, 46 insertions(+), 50 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 8193945ee43d7..7a7374cd8d64a 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1,7 +1,6 @@ // NOTE: This is modified from clip.cpp only for LLaVA, // so there might be still unnecessary artifacts hanging around // I'll gradually clean and extend it - #include "clip.h" #include "ggml.h" #include "ggml-alloc.h" @@ -965,7 +964,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { 
hparams.image_grid_pinpoints[i] = pinpoints[i]; } hparams.image_grid_pinpoints[n] = 0; - } catch (std::runtime_error & e) { + } catch (std::runtime_error & e) { hparams.image_grid_pinpoints[0]=0; } try { @@ -979,7 +978,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } catch(const std::exception& e) { hparams.image_crop_resolution = hparams.image_size; - } + } int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN); int idx_std = get_key_idx(ctx, KEY_IMAGE_STD); @@ -1022,7 +1021,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { { fprintf(stderr, "%s: failed to load vision model tensors\n", __func__); } - + // LLaVA projection if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) { vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight")); @@ -1270,12 +1269,12 @@ void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filenam inline float lerp(float s, float e, float t) { return s + (e - s) * t; } -// Bilinear resize function +// Bilinear resize function void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) { dst.nx = target_width; dst.ny = target_height; dst.buf.resize(3 * target_width * target_height); - + float x_ratio = static_cast(src.nx - 1) / target_width; float y_ratio = static_cast(src.ny - 1) / target_height; @@ -1343,11 +1342,11 @@ void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, co dst->nx = src->nx; dst->ny = src->ny; dst->buf.resize(src->buf.size()); - + for (size_t i = 0; i < src->buf.size(); ++i) { int c = i % 3; // rgb dst->buf[i] = (static_cast(src->buf[i]) / 255.0f - mean[c]) / std[c]; - + if (replicate_float16) { dst->buf[i] = simulateFloat16Precision(dst->buf[i]); } @@ -1546,15 +1545,15 @@ void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) /** * @brief Get the anyres image grid shape object - * - * @param image_size - * @param grid_pinpoints - * @param image_patch_size - * @return + * + * @param image_size + * @param grid_pinpoints + * @param image_patch_size + * @return */ struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair& image_size, const std::vector>& grid_pinpoints, int image_patch_size) { /** - Conversion from gguf flat array to vector: + Conversion from gguf flat array to vector: std::vector> possible_resolutions; for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) { possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]}); @@ -1628,7 +1627,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std resize_and_pad_image(*img, *temp, best_resolution); // we do not pad with mean-bg color anymore in llava-1.6 // clip_image_save_to_bmp(*temp, "resized.bmp"); // visually verify normalized image: - // normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std); + // normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std); // { // clip_image_u8 * temp2 = clip_image_u8_init(); // clip_image_convert_f32_to_u8(*res, *temp2); @@ -1638,7 +1637,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std std::vector patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6) // fprintf(stderr, "patches: %d, %d\n", patches.size(), params.image_size); - + clip_image_u8 
*image_original_resize = clip_image_u8_init(); // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square ? bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square ? @@ -1655,9 +1654,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std // printf("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny); clip_image_u8_free(patches[i]); } - + clip_image_u8_free(temp); - + return true; } else { temp->nx = img->nx; @@ -1802,7 +1801,6 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i type = static_cast(itype); auto * ctx_clip = clip_model_load(fname_inp, 2); - const auto & ctx_src = ctx_clip->ctx_gguf; const auto & ctx_data = ctx_clip->ctx_data; diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 09346b603b259..c1981bb5d1574 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -38,7 +38,7 @@ struct clip_vision_hparams { float eps; char mm_patch_merge_type[32]="flat"; // spatial_unpad or flat (default) - int32_t image_grid_pinpoints[32]; + int32_t image_grid_pinpoints[32]; int32_t image_crop_resolution; }; diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py index ea331f2fe9875..61a14703702ad 100644 --- a/examples/llava/convert-image-encoder-to-gguf.py +++ b/examples/llava/convert-image-encoder-to-gguf.py @@ -234,7 +234,7 @@ def bytes_to_unicode(): # 1008, 336, # 336, 1008 # ] - # * + # * # */ if "image_grid_pinpoints" in v_hparams: # flatten it diff --git a/examples/llava/llava-surgery-v2.py b/examples/llava/llava-surgery-v2.py index 6b4fac80d29d7..e94d10a55ddbf 100644 --- a/examples/llava/llava-surgery-v2.py +++ b/examples/llava/llava-surgery-v2.py @@ -4,7 +4,6 @@ import torch from safetensors.torch import load as safe_load, save as safe_save, safe_open, save_file - # Function to determine if file is a SafeTensor file def is_safetensor_file(file_path): return file_path.endswith('.safetensors') @@ -40,12 +39,12 @@ def clean_vision_tower_from_checkpoint(checkpoint_path): model_path = os.path.dirname(checkpoint_path) print(f"Searching for vision tower tensors in {checkpoint_path}") clip_tensors = [k for k, v in checkpoint.items() if (k.startswith("model.vision_tower") ) ] - + if len(clip_tensors) > 0: print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}") # Adapted for file type clip_path = os.path.join(model_path, "llava.clip") - + if os.path.exists(clip_path): existing_clip, _ = load_model(clip_path) else: @@ -142,7 +141,7 @@ def proj_criteria(checkpoint): projector[name] = last_checkpoint[name].float() for name in first_mm_tensors: projector[name] = first_checkpoint[name].float() - + save_model(projector, f"{args.model}/llava.projector", 'pytorch') for name in mm_tensors: diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 3a0c4a8a4a874..5ba9d072dfba9 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -14,21 +14,21 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) { struct temp_model { struct ggml_tensor *newline; - struct ggml_context * ctx; + struct ggml_context * ctx; } model; auto & vparams = clip_get_vision_hparams(ctx_clip); auto num_patches_per_side = vparams.image_size / 
vparams.patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches) int num_patches_width = grid_shape.first; // grid 1-4 int num_patches_height = grid_shape.second; // grid 1-4 - + // TODO: size calculation is not calculated - it's only tens of MB size_t ctx_size = 0; { ctx_size += clip_embd_nbytes(ctx_clip) * image_embd_v.size() * 8; // image_features - ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32); // + ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32); } - + struct ggml_init_params params { /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, @@ -47,7 +47,7 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb // ), dim=-1) // image_feature = image_feature.flatten(1, 2).transpose(0, 1) // image_feature = torch.cat((base_image_feature, image_feature), dim=0) - + // embeddings -> tokens -> 24 x 24 /** * We now have two options: unpad or no unpad - unpad removes tokens for faster llm eval @@ -66,13 +66,13 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb image_feature = image_feature.view(2, 2, 24, 24*4096) image_feature = image_feature.permute(0, 2, 1, 3).contiguous() image_feature = image_feature.view(-1, 4096) - * + * */ model.ctx = ggml_init(params); - + ggml_context *ctx_noalloc = ggml_init({2048, NULL, true}); // struct ggml_tensor * image_features = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip) * (image_embd_v.size() - 1)); - + ggml_tensor *newline_tmp = clip_get_newline_tensor(ctx_clip); model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]); if (newline_tmp->backend != GGML_BACKEND_CPU) { @@ -112,28 +112,28 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb size_t size_ele = ggml_type_size(GGML_TYPE_F32); // struct ggml_tensor *dummy = ggml_new_tensor_4d(ctx_noalloc, GGML_TYPE_F32, num_patches_height, num_patches_width, num_patches_per_side, num_patches_per_side * clip_n_mmproj_embd(ctx_clip)); - struct ggml_tensor *image_features_view = ggml_view_4d(model.ctx, image_features, - num_patches_height, // nb0 : 4 byte für jedes - num_patches_width, - num_patches_per_side * num_patches_per_side, - clip_n_mmproj_embd(ctx_clip), + struct ggml_tensor *image_features_view = ggml_view_4d(model.ctx, image_features, + num_patches_height, + num_patches_width, + num_patches_per_side * num_patches_per_side, + clip_n_mmproj_embd(ctx_clip), size_ele * num_patches_height, size_ele * num_patches_height * num_patches_width, size_ele * num_patches_height * num_patches_width * num_patches_per_side, 0); - - struct ggml_tensor *image_features_patchview = ggml_view_4d(model.ctx, image_features, - num_patches_height, - num_patches_width, - num_patches_per_side, + + struct ggml_tensor *image_features_patchview = ggml_view_4d(model.ctx, image_features, + num_patches_height, + num_patches_width, + num_patches_per_side, num_patches_per_side * clip_n_mmproj_embd(ctx_clip), - + size_ele * num_patches_height, size_ele * num_patches_height * num_patches_width, size_ele * num_patches_height * num_patches_width * num_patches_per_side, 0); - struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3)); + struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3)); permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, permuted_cont, 0, 2, 1, 3)); // permute back to before - todo: fix bug struct ggml_tensor 
*prepared = ggml_view_2d(model.ctx, permuted_cont, num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, clip_n_mmproj_embd(ctx_clip), size_ele * num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, 0); @@ -172,9 +172,8 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb // memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as context // memcpy(image_embd_out, (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip)); // main image as context // *n_img_pos_out=576; - - ggml_free(model.ctx); + ggml_free(model.ctx); return true; } @@ -205,7 +204,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli // } // } - if (strcmp(vparams.mm_patch_merge_type, "spatial_unpad") != 0) + if (strcmp(vparams.mm_patch_merge_type, "spatial_unpad") != 0) { // flat / default llava-1.5 type embedding *n_img_pos = clip_n_patches(ctx_clip); @@ -233,7 +232,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli } } const int64_t t_img_enc_batch_us = ggml_time_us(); - printf("%s: %d segments encoded in %8.2f ms\n", __func__, img_res_v.size(), (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); + printf("%s: %d segments encoded in %8.2f ms\n", __func__, img_res_v.size(), (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); std::vector> grid_pinpoints; @@ -260,7 +259,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli } printf("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); - + const int64_t t_img_enc_end_us = ggml_time_us(); float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0; From 7107b9098e3f1375a6647360d9c8ba41125c4973 Mon Sep 17 00:00:00 2001 From: John Date: Sun, 11 Feb 2024 03:44:07 +0100 Subject: [PATCH 10/24] ws --- examples/llava/clip.cpp | 6 +++--- examples/llava/llava-surgery-v2.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 7a7374cd8d64a..56d3fd0af9a27 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1277,7 +1277,7 @@ void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_wi float x_ratio = static_cast(src.nx - 1) / target_width; float y_ratio = static_cast(src.ny - 1) / target_height; - + for (int y = 0; y < target_height; y++) { for (int x = 0; x < target_width; x++) { float px = x_ratio * x; @@ -1654,7 +1654,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std // printf("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny); clip_image_u8_free(patches[i]); } - + clip_image_u8_free(temp); return true; @@ -2009,4 +2009,4 @@ int clip_n_patches(const struct clip_ctx * ctx) { size_t clip_embd_nbytes(const struct clip_ctx * ctx) { return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float); -} \ No newline at end of file +} diff --git a/examples/llava/llava-surgery-v2.py b/examples/llava/llava-surgery-v2.py index e94d10a55ddbf..f0ade4ceb357b 100644 --- a/examples/llava/llava-surgery-v2.py +++ b/examples/llava/llava-surgery-v2.py @@ -156,4 +156,4 @@ def proj_criteria(checkpoint): print("Done!") print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.") -print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") \ No newline at end of file +print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") From 
51e60c996f5cdce71c21eaf53da0f6afee87acd1 Mon Sep 17 00:00:00 2001 From: John Date: Mon, 12 Feb 2024 04:02:54 +0100 Subject: [PATCH 11/24] Tensors are now properly permuted. Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference. --- examples/llava/clip.cpp | 4 +- examples/llava/llava.cpp | 140 +++++++++++++-------------------------- 2 files changed, 48 insertions(+), 96 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 56d3fd0af9a27..60d8e8e802f05 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1,6 +1,7 @@ // NOTE: This is modified from clip.cpp only for LLaVA, // so there might be still unnecessary artifacts hanging around // I'll gradually clean and extend it +// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch #include "clip.h" #include "ggml.h" #include "ggml-alloc.h" @@ -1622,7 +1623,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]}); } std::pair best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions); - // fprintf(stderr, "%s - Working with resolution: %d %d\n", __func__, best_resolution.first, best_resolution.second); // clip_image_save_to_bmp(*img, "input.bmp"); resize_and_pad_image(*img, *temp, best_resolution); // we do not pad with mean-bg color anymore in llava-1.6 // clip_image_save_to_bmp(*temp, "resized.bmp"); @@ -1646,7 +1646,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std res_tensor.clear(); for (auto& patch : patches) { clip_image_f32 *temp_image_f32 = clip_image_f32_init(); - normalize_image_u8_to_f32(patch, temp_image_f32, ctx->image_mean, ctx->image_std, true); + normalize_image_u8_to_f32(patch, temp_image_f32, ctx->image_mean, ctx->image_std, false); // set to true for pytorch fp16 value replication res_tensor.push_back(temp_image_f32); } diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 5ba9d072dfba9..42d00082b8c6b 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -34,44 +34,40 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, // NOTE: this should be false when using the legacy API }; - - // Python reference for full unpad: - // base_image_feature = image_feature[0] - // image_feature = image_feature[1:] - // image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() - // image_feature = image_feature.flatten(1, 2).flatten(2, 3) - // image_feature = unpad_image(image_feature, image_sizes[image_idx]) - // image_feature = torch.cat(( - // image_feature, - // self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1) - // ), dim=-1) - // image_feature = image_feature.flatten(1, 2).transpose(0, 1) - // image_feature = torch.cat((base_image_feature, image_feature), dim=0) - - // embeddings -> tokens -> 24 x 24 - /** - * We now have two options: unpad or no unpad - unpad removes tokens for faster llm eval - * In terms of result quality it appears to make no difference, so we'll start with the easier approach given 5D tensors are not supported in ggml yet - * Without unpad we have to split the sub-image embeddings into patches of 24 features each and permute them. 
- * Once all images are processed to prepended the base_image_features without any changes. - */ - /** - Pytorch reference simplified, modified for ggml compatibility - confirmed identical output in python (for a 2x2 grid image (676x676 scaling)) - # image_feature = image_feature.view(2, 2, 24, 24, 4096) - # image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous() - # image_feature = image_feature.view(2, 24, 2, 24, 4096) - # image_feature = image_feature.flatten(0, 3) - - # Reshape to 4D tensor by merging the last two dimensions + // Python reference code for full unpad: + /* + base_image_feature = image_feature[0] + image_feature = image_feature[1:] + image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() + image_feature = image_feature.flatten(1, 2).flatten(2, 3) + image_feature = unpad_image(image_feature, image_sizes[image_idx]) + image_feature = torch.cat(( + image_feature, + self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1) + ), dim=-1) + image_feature = image_feature.flatten(1, 2).transpose(0, 1) + image_feature = torch.cat((base_image_feature, image_feature), dim=0) + */ + // We now have two options: unpad or no unpad. Unpad removes tokens for faster llm eval. + // In terms of result quality it appears to make no difference, so we'll start with the easier approach given 5D tensors are not supported in ggml yet. + // Without unpad we have to split the sub-image embeddings into patches of 24 features each and permute them. + // Once all images are processed to prepended the base_image_features without any changes. + + // Pytorch reference simplified, modified for ggml compatibility - confirmed identical output in python (for a 2x2 grid image (676x676 scaling)) + /* + image_feature = image_feature.view(2, 2, 24, 24, 4096) + image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous() + image_feature = image_feature.view(2, 24, 2, 24, 4096) + image_feature = image_feature.flatten(0, 3) + + // Reshape to 4D tensor by merging the last two dimensions image_feature = image_feature.view(2, 2, 24, 24*4096) image_feature = image_feature.permute(0, 2, 1, 3).contiguous() image_feature = image_feature.view(-1, 4096) - * - */ - model.ctx = ggml_init(params); + */ + model.ctx = ggml_init(params); ggml_context *ctx_noalloc = ggml_init({2048, NULL, true}); - // struct ggml_tensor * image_features = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip) * (image_embd_v.size() - 1)); ggml_tensor *newline_tmp = clip_get_newline_tensor(ctx_clip); model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]); @@ -88,83 +84,39 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb } } - struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, image_embd_v.size() - 1, clip_n_patches(ctx_clip), clip_n_mmproj_embd(ctx_clip)); - // fill it with the image embeddings, ignoring the first + struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), image_embd_v.size() - 1); // example: 4096 x 576 x 4 + // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false); + // fill it with the image embeddings, ignoring the base for (int i = 1; i < image_embd_v.size(); i++) { - // printf("Copying image_embd_v[%d] to image_features tensor\n", i); size_t offset = (i-1) * clip_embd_nbytes(ctx_clip); - - // for debugging we now try and set the entire tensor row to 
0.0001f,0.0002f,0.0003f,0.0004f etc: - // float *floatPtr = static_cast(image_embd_v[i]); - // for (int j = 0; j < clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip); j++) - // { - // // floatPtr[j] = (j + 1) / 10000.0f; - // int feature = j % clip_n_mmproj_embd(ctx_clip) + 1; - // floatPtr[j] = i + feature / 10000.0f; - // } memcpy((uint8_t *)(image_features->data) + offset, image_embd_v[i], clip_embd_nbytes(ctx_clip)); } - // printf("image_features size = %d\n", clip_embd_nbytes(ctx_clip) * (image_embd_v.size() - 1)); struct ggml_cgraph * gf = ggml_new_graph(model.ctx); - // image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1) size_t size_ele = ggml_type_size(GGML_TYPE_F32); - // struct ggml_tensor *dummy = ggml_new_tensor_4d(ctx_noalloc, GGML_TYPE_F32, num_patches_height, num_patches_width, num_patches_per_side, num_patches_per_side * clip_n_mmproj_embd(ctx_clip)); - - struct ggml_tensor *image_features_view = ggml_view_4d(model.ctx, image_features, - num_patches_height, - num_patches_width, - num_patches_per_side * num_patches_per_side, - clip_n_mmproj_embd(ctx_clip), - - size_ele * num_patches_height, - size_ele * num_patches_height * num_patches_width, - size_ele * num_patches_height * num_patches_width * num_patches_per_side, - 0); struct ggml_tensor *image_features_patchview = ggml_view_4d(model.ctx, image_features, - num_patches_height, - num_patches_width, - num_patches_per_side, num_patches_per_side * clip_n_mmproj_embd(ctx_clip), - - size_ele * num_patches_height, - size_ele * num_patches_height * num_patches_width, - size_ele * num_patches_height * num_patches_width * num_patches_per_side, 0); - + num_patches_per_side, + num_patches_width, + num_patches_height, + size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip), + size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side, + size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side * num_patches_width, 0); + // ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,false); struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3)); - permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, permuted_cont, 0, 2, 1, 3)); // permute back to before - todo: fix bug - - struct ggml_tensor *prepared = ggml_view_2d(model.ctx, permuted_cont, num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, clip_n_mmproj_embd(ctx_clip), size_ele * num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, 0); - struct ggml_tensor *prepared_cont = ggml_cont(model.ctx, prepared); // not needed - // struct ggml_tensor *prepared_cont = prepared; // the view only flattens - - ggml_build_forward_expand(gf, prepared_cont); - + // ggml_tensor_printf(permuted_cont,"permuted_cont",__LINE__,false,false); + struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0); + // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false); + ggml_build_forward_expand(gf, flatten); ggml_graph_compute_with_ctx(model.ctx, gf, 1); - struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1]; - // ggml_tensor_printf(image_features,"image_features",__LINE__,false,true); - // 
ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,true); - // ggml_tensor_printf(prepared_cont,"prepared_cont",__LINE__,false,true); memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context - // append without newline tokens: - // memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip) * (image_embd_v.size()-1)); // grid patches - // append with newline tokens: - for (size_t i = 0; i < image_embd_v.size() - 1; ++i) { - // we append with +1 offset (base image is prepended) - memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip) * (i+1) + model.newline->ne[0] * i, - (float*)prepared_cont->data + i * clip_n_mmproj_embd(ctx_clip) * clip_n_patches(ctx_clip), - clip_embd_nbytes(ctx_clip)); - memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip) * (i+2) + model.newline->ne[0] * i , - (float*)model.newline->data, - ggml_nbytes(model.newline)); - } - - size_t newline_tokens = image_embd_v.size()-1; - *n_img_pos_out = prepared_cont->ne[0]+clip_n_patches(ctx_clip) + newline_tokens; + // append without newline tokens (default behavior in llava_arch when not using unpad ): + memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (image_embd_v.size()-1)); // grid patches + *n_img_pos_out = result->ne[1]+clip_n_patches(ctx_clip); // Debug: Test single segments // Current findings: sending base image, sending a segment embedding all works similar to python From 60c5f46ba734391f62005d2b20ff8d791e1fcdae Mon Sep 17 00:00:00 2001 From: John Date: Mon, 12 Feb 2024 04:04:57 +0100 Subject: [PATCH 12/24] ws --- examples/llava/llava.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 42d00082b8c6b..4ba89eb97cf56 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -101,7 +101,7 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb num_patches_per_side, num_patches_width, num_patches_height, - size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip), + size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip), size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side, size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side * num_patches_width, 0); // ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,false); From 0dd6c9da2a81337ec112f4c96627c01742a08943 Mon Sep 17 00:00:00 2001 From: John Date: Mon, 12 Feb 2024 04:34:51 +0100 Subject: [PATCH 13/24] added verbose_prompt support into cli added stopwords for llava-1.6 into cli --- examples/llava/llava-cli.cpp | 26 ++++++++++++++++++++++++-- examples/llava/llava.cpp | 6 +++--- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 6ac70ba69e281..04fe6bef05ea2 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -167,11 +167,29 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ } printf("system_prompt: %s\n", system_prompt.c_str()); + if (params->verbose_prompt) { + auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true); + for (int i = 0; i < (int) tmp.size(); i++) { + printf("%6d -> '%s'\n", tmp[i], 
llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + } + } printf("user_prompt: %s\n", user_prompt.c_str()); + if (params->verbose_prompt) { + auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); + for (int i = 0; i < (int) tmp.size(); i++) { + printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + } + } } else { // llava-1.5 native mode system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:"; user_prompt = prompt + "\nASSISTANT:"; + if (params->verbose_prompt) { + auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); + for (int i = 0; i < (int) tmp.size(); i++) { + printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + } + } } eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos); @@ -183,13 +201,17 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ fprintf(stderr, "\n"); struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams); - + std::string response = ""; for (int i = 0; i < max_tgt_len; i++) { const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past); + response += tmp; if (strcmp(tmp, "") == 0) break; if (strstr(tmp, "###")) break; // Yi-VL behavior - printf("%s", tmp); + if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works) + if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6 + if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6 + fflush(stdout); } diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 4ba89eb97cf56..ff99a688e8605 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -116,7 +116,7 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context // append without newline tokens (default behavior in llava_arch when not using unpad ): memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (image_embd_v.size()-1)); // grid patches - *n_img_pos_out = result->ne[1]+clip_n_patches(ctx_clip); + *n_img_pos_out = static_cast(result->ne[1]+clip_n_patches(ctx_clip)); // Debug: Test single segments // Current findings: sending base image, sending a segment embedding all works similar to python @@ -179,12 +179,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli bool encoded = clip_image_encode(ctx_clip, n_threads, img_res_v[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside clip_image_f32_free(img_res_v[i]); if (!encoded) { - fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", i+1, img_res_v.size()); + fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", i+1, (int)img_res_v.size()); return false; } } const int64_t t_img_enc_batch_us = ggml_time_us(); - printf("%s: %d segments encoded in %8.2f ms\n", __func__, img_res_v.size(), (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); + printf("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size(), (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); std::vector> 
grid_pinpoints; From 3a722678690952ed922a4dde8693a00882f1890a Mon Sep 17 00:00:00 2001 From: John Date: Tue, 13 Feb 2024 00:29:17 +0100 Subject: [PATCH 14/24] moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed --- examples/llava/clip.cpp | 141 ++++++++++++++----------------------- examples/llava/clip.h | 27 +------ examples/llava/llava.cpp | 136 ++++++++++++++++++++++++++--------- examples/server/server.cpp | 29 ++++++-- 4 files changed, 184 insertions(+), 149 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 60d8e8e802f05..ad12bd8c514fb 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -31,6 +31,25 @@ #include #include +// #define CLIP_DEBUG_FUNCTIONS + +// RGB uint8 image +struct clip_image_u8 { + int nx; + int ny; + + std::vector buf; +}; + +// RGB float32 image (NHWC) +// Memory layout: RGBRGBRGB... +struct clip_image_f32 { + int nx; + int ny; + + std::vector buf; +}; + static std::string format(const char * fmt, ...) { va_list ap; va_list ap2; @@ -961,10 +980,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS); int n = gguf_get_arr_n(ctx, idx); const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx); - for (int i = 0; i < 32 && pinpoints[i] != 0; ++i) { + for (int i = 0; i < 32 && i < n && pinpoints[i] != 0; ++i) { hparams.image_grid_pinpoints[i] = pinpoints[i]; } - hparams.image_grid_pinpoints[n] = 0; + if (n < 32) + hparams.image_grid_pinpoints[n] = 0; } catch (std::runtime_error & e) { hparams.image_grid_pinpoints[0]=0; } @@ -1170,7 +1190,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length return true; } - +#ifdef CLIP_DEBUG_FUNCTIONS void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { std::ofstream file(filename, std::ios::binary); if (!file.is_open()) { @@ -1265,6 +1285,7 @@ void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filenam file.close(); } +#endif // Linear interpolation between two points inline float lerp(float s, float e, float t) { @@ -1305,41 +1326,8 @@ void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_wi } } -// for replication purposes `.to(model.device, dtype=torch.float16)` -// converts a float to half precision and back to float -float simulateFloat16Precision(float value) { - // Convert float32 to float16 - uint32_t f32 = *reinterpret_cast(&value); - uint32_t sign = (f32 >> 16) & 0x8000; // Top bit (sign bit) - uint32_t exponent = ((f32 >> 23) & 0xFF) - 112; // Adjust bias (112 is bias of float16, 127 is bias of float32) - uint32_t mantissa = (f32 >> 13) & 0x3FF; // Keep top 10 bits (10 bits of precision in float16, 23 in float32) - - // Handle overflow/underflow - if ((f32 & 0x7FFFFFFF) > 0x477FE000) { // Not representable - exponent = 0x1F; - mantissa = 0; - } else if ((f32 & 0x7FFFFFFF) < 0x38800000) { // Too small for normal half precision - exponent = 0; - mantissa = 0; - } - - uint16_t f16 = sign | (exponent << 10) | mantissa; - - // Convert back to float32 - uint32_t sign32 = (f16 & 0x8000) << 16; - uint32_t exponent32 = ((f16 >> 10) & 0x1F); - uint32_t mantissa32 = (f16 & 0x3FF) << 13; - - // Adjust bias back - exponent32 = exponent32 == 0 ? 
0 : exponent32 + 112; - - uint32_t f32Result = sign32 | (exponent32 << 23) | mantissa32; - float result = *reinterpret_cast(&f32Result); - - return result; -} -// Normalize image to float32 - supports float16 replication as in pytorch .to(model.device, dtype=torch.float16) -void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, const float mean[3], const float std[3], bool replicate_float16) { +// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not +void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, const float mean[3], const float std[3]) { dst->nx = src->nx; dst->ny = src->ny; dst->buf.resize(src->buf.size()); @@ -1347,12 +1335,9 @@ void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, co for (size_t i = 0; i < src->buf.size(); ++i) { int c = i % 3; // rgb dst->buf[i] = (static_cast(src->buf[i]) / 255.0f - mean[c]) / std[c]; - - if (replicate_float16) { - dst->buf[i] = simulateFloat16Precision(dst->buf[i]); - } } } + inline float clip(float x, float lower, float upper) { return std::max(lower, std::min(x, upper)); @@ -1471,7 +1456,6 @@ void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &image_outpu } } } - image_output = std::move(padded_image); } @@ -1533,7 +1517,7 @@ std::vector divide_to_patches_u8(const clip_image_u8& image, int return patches; } - +#ifdef CLIP_DEBUG_FUNCTIONS // debug function to convert f32 to u8 void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) { dst.nx = src.nx; @@ -1543,32 +1527,12 @@ void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) dst.buf[i] = static_cast(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255)); } } +#endif -/** - * @brief Get the anyres image grid shape object - * - * @param image_size - * @param grid_pinpoints - * @param image_patch_size - * @return - */ -struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair& image_size, const std::vector>& grid_pinpoints, int image_patch_size) { - /** - Conversion from gguf flat array to vector: - std::vector> possible_resolutions; - for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) { - possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]}); - } - */ - auto best_resolution = select_best_resolution(image_size, grid_pinpoints); - return {best_resolution.first / image_patch_size, best_resolution.second / image_patch_size}; -} - - -// normalize: x = (x - mean) / std -// TODO: implement bicubic interpolation instead of linear. 
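// Illustrative aside, not part of the patch: a hypothetical check of the formula that
// normalize_image_u8_to_f32() applies per channel, assuming the usual CLIP normalization
// constants for the red channel (mean 0.48145466, std 0.26862954). The helper name is
// made up for illustration only.
static float example_normalized_red(uint8_t r) {
    // x = (x / 255 - mean) / std; r = 255 maps to roughly 1.93
    return (static_cast<float>(r) / 255.0f - 0.48145466f) / 0.26862954f;
}
// With the float16 round-trip (simulateFloat16Precision) removed above, this plain
// float32 value is what the preprocessing now produces.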
-// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patche tensors as a vector -bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std::vector& res_tensor, bool pad2square) { +// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector +// res_imgs memory is being allocated here, previous allocations will be freed if found +bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch& res_imgs ) { + bool pad_to_square = true; if (!ctx->has_vision_encoder) { printf("This gguf file seems to have no vision encoder\n"); return false; @@ -1576,23 +1540,23 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std auto & params = ctx->vision_model.hparams; // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing if (strcmp(params.mm_patch_merge_type, "spatial_unpad") == 0) { - pad2square = false; - } else { - // pad2square = true; // todo: consider automatic decisions on that options for all models + pad_to_square = false; } - // free the previous res_tensor - if (res_tensor.size() > 0) { - for (size_t i = 0; i < res_tensor.size(); i++) { - clip_image_f32_free(res_tensor[i]); + // free the previous res_imgs if any set + if (res_imgs.size > 0 && res_imgs.size < 100) { + for (size_t i = 0; i < res_imgs.size; i++) { + clip_image_f32_free(&(res_imgs.data[i])); } - res_tensor.clear(); + delete[] res_imgs.data; } + res_imgs.data = nullptr; + res_imgs.size = 0; // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 clip_image_u8 * temp = clip_image_u8_init(); // we will keep the input image data here temporarily - if (pad2square && img->nx != img->ny) { + if (pad_to_square && img->nx != img->ny) { int longer_side = std::max(img->nx, img->ny); temp->nx = longer_side; temp->ny = longer_side; @@ -1636,18 +1600,18 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std // } std::vector patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6) - // fprintf(stderr, "patches: %d, %d\n", patches.size(), params.image_size); clip_image_u8 *image_original_resize = clip_image_u8_init(); - // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square ? - bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square ? 
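// Illustrative aside, not part of the patch: how the number of preprocessed tensors falls
// out of the anyres flow above (select_best_resolution -> resize_and_pad_image ->
// divide_to_patches_u8, plus the resized base image that is prepended just below).
// The helper name is hypothetical.
static size_t anyres_num_tensors(int best_w, int best_h, int image_size) {
    // grid patches of image_size x image_size, plus one prepended base image
    return (size_t)(best_w / image_size) * (best_h / image_size) + 1;
}
// e.g. a best resolution of 672x672 with image_size 336 gives 2*2 + 1 = 5 tensors,
// which is the res_imgs.size that clip_image_preprocess() ends up returning in that case.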
+ // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square + bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square patches.insert(patches.begin(), image_original_resize); - - res_tensor.clear(); + // clip_image_f32_batch_init(patches.size()); + res_imgs.size = patches.size(); + res_imgs.data = new clip_image_f32[res_imgs.size]; + int num=0; for (auto& patch : patches) { - clip_image_f32 *temp_image_f32 = clip_image_f32_init(); - normalize_image_u8_to_f32(patch, temp_image_f32, ctx->image_mean, ctx->image_std, false); // set to true for pytorch fp16 value replication - res_tensor.push_back(temp_image_f32); + normalize_image_u8_to_f32(patch, &res_imgs.data[num], ctx->image_mean, ctx->image_std); + num++; } for (size_t i = 0; i < patches.size(); i++) { @@ -1732,7 +1696,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std // clip_image_save_to_bmp(*temp2, "resized_normalized_f32_vanilla.bmp"); // clip_image_u8_free(temp2); // } - res_tensor.push_back(res); + // res_imgs.push_back(res); + res_imgs.size = 1; + res_imgs.data = new clip_image_f32[res_imgs.size]; + res_imgs.data[0] = std::move(*res); return true; } diff --git a/examples/llava/clip.h b/examples/llava/clip.h index c1981bb5d1574..2d1858bbd4082 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -3,8 +3,6 @@ #include #include -#include -#include #ifdef LLAMA_SHARED # if defined(_WIN32) && !defined(__MINGW32__) @@ -56,24 +54,6 @@ CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); CLIP_API int clip_n_patches (const struct clip_ctx * ctx); CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); -// RGB uint8 image -CLIP_API struct clip_image_u8 { - int nx; - int ny; - - std::vector buf; -}; - -// RGB float32 image (NHWC) -// Memory layout: RGBRGBRGB... 
- CLIP_API struct clip_image_f32 { - int nx; - int ny; - - std::vector buf; -}; - - struct clip_image_u8_batch { struct clip_image_u8 * data; size_t size; @@ -95,14 +75,11 @@ CLIP_API void clip_image_u8_free (struct clip_image_u8 * img); CLIP_API void clip_image_f32_free(struct clip_image_f32 * img); CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); -CLIP_API void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename); -CLIP_API void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst); /** interpret bytes as an image file with length bytes_length, and use the result to populate img */ CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); -/** preprocess img and store the result in res_tensor, pad2square may be overriden to false depending on model configuration */ -CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std::vector& res_tensor, bool pad2square); -CLIP_API struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair& image_size, const std::vector>& grid_pinpoints, int image_patch_size); +/** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */ +CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch& res_imgs ); CLIP_API struct ggml_tensor *clip_get_newline_tensor(const struct clip_ctx * ctx); CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec); diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index ff99a688e8605..699fd256a8ae2 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -10,8 +10,78 @@ #include "base64.hpp" +// RGB uint8 image +struct clip_image_u8 { + int nx; + int ny; + + std::vector buf; +}; + +// RGB float32 image (NHWC) +// Memory layout: RGBRGBRGB... +struct clip_image_f32 { + int nx; + int ny; + + std::vector buf; +}; + +/** + * Selects the best resolution from a list of possible resolutions based on the original size. + * + * @param original_size The original size of the image in the format (width, height). + * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...]. + * @return The best fit resolution in the format (width, height). 
+ */ +static std::pair select_best_resolution(const std::pair& original_size, const std::vector>& possible_resolutions) { + int original_width = original_size.first; + int original_height = original_size.second; + std::pair best_fit; + int max_effective_resolution = 0; + int min_wasted_resolution = std::numeric_limits::max(); + + for (const auto& resolution : possible_resolutions) { + int width = resolution.first; + int height = resolution.second; + float scale = std::min(static_cast(width) / original_width, static_cast(height) / original_height); + int downscaled_width = static_cast(original_width * scale); + int downscaled_height = static_cast(original_height * scale); + int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); + int wasted_resolution = (width * height) - effective_resolution; + // fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); + if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { + max_effective_resolution = effective_resolution; + min_wasted_resolution = wasted_resolution; + best_fit = resolution; + } + } + + return best_fit; +} +/** + * @brief Get the anyres image grid shape object + * + * @param image_size + * @param grid_pinpoints + * @param image_patch_size + * @return + */ +struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair& image_size, const std::vector>& grid_pinpoints, int image_patch_size) { + /** + Conversion from gguf flat array to vector: + std::vector> possible_resolutions; + for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) { + possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]}); + } + */ + auto best_resolution = select_best_resolution(image_size, grid_pinpoints); + return {best_resolution.first / image_patch_size, best_resolution.second / image_patch_size}; +} + + // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out) -static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) { +static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) { struct temp_model { struct ggml_tensor *newline; struct ggml_context * ctx; @@ -21,11 +91,12 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb auto num_patches_per_side = vparams.image_size / vparams.patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches) int num_patches_width = grid_shape.first; // grid 1-4 int num_patches_height = grid_shape.second; // grid 1-4 + const size_t num_images = num_patches_width + num_patches_height + 1; // TODO: size calculation is not calculated - it's only tens of MB size_t ctx_size = 0; { - ctx_size += clip_embd_nbytes(ctx_clip) * image_embd_v.size() * 8; // image_features + ctx_size += clip_embd_nbytes(ctx_clip) * num_images * 8; // image_features ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32); } @@ -84,10 +155,10 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb } } - struct ggml_tensor * image_features = 
ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), image_embd_v.size() - 1); // example: 4096 x 576 x 4 + struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4 // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false); // fill it with the image embeddings, ignoring the base - for (int i = 1; i < image_embd_v.size(); i++) + for (int i = 1; i < num_images; i++) { size_t offset = (i-1) * clip_embd_nbytes(ctx_clip); memcpy((uint8_t *)(image_features->data) + offset, image_embd_v[i], clip_embd_nbytes(ctx_clip)); @@ -106,6 +177,15 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side * num_patches_width, 0); // ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,false); struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3)); + /** + At the end of each row we have to add the row_end embeddings, which are the same as the newline embeddings + image_feature = torch.cat(( + image_feature, + self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device) + ), dim=-1) + * + */ + // ggml_tensor_printf(permuted_cont,"permuted_cont",__LINE__,false,false); struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0); // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false); @@ -115,7 +195,7 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context // append without newline tokens (default behavior in llava_arch when not using unpad ): - memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (image_embd_v.size()-1)); // grid patches + memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches *n_img_pos_out = static_cast(result->ne[1]+clip_n_patches(ctx_clip)); // Debug: Test single segments @@ -131,37 +211,25 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) { - std::vector img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336 - if (!clip_image_preprocess(ctx_clip, img, img_res_v, /*pad2square =*/ true)) { + // std::vector img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336 + clip_image_f32_batch img_res_v; + img_res_v.size = 0; + img_res_v.data = nullptr; + if (!clip_image_preprocess(ctx_clip, img, img_res_v)) { fprintf(stderr, "%s: unable to preprocess image\n", __func__); - for (auto img_res : img_res_v) { - clip_image_f32_free(img_res); - } + delete[] img_res_v.data; return false; } const int64_t t_img_enc_start_us = ggml_time_us(); auto & vparams = 
clip_get_vision_hparams(ctx_clip); - // DEBUG print the "shape" and the first 10 rows and 10 cols of img_res_v in exp format - // for (int i = 0; i < img_res_v.size(); i++) - // { - // printf("img_res_v[%d] shape: %d x %d\n", i, img_res_v[i]->nx, img_res_v[i]->ny); - // for (int j = 0; j < 10; j++) - // { - // for (int k = 0; k < 10; k++) - // { - // printf("%e ", img_res_v[i]->buf[j*img_res_v[i]->ny + k]); - // } - // printf("\n"); - // } - // } if (strcmp(vparams.mm_patch_merge_type, "spatial_unpad") != 0) { // flat / default llava-1.5 type embedding *n_img_pos = clip_n_patches(ctx_clip); - bool encoded = clip_image_encode(ctx_clip, n_threads, img_res_v[0], image_embd); // image_embd shape is 576 x 4096 - clip_image_f32_free(img_res_v[0]); + bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096 + delete[] img_res_v.data; if (!encoded) { fprintf(stderr, "Unable to encode image\n"); @@ -172,30 +240,32 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli // spatial_unpad llava-1.6 type embedding // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working std::vector image_embd_v; - image_embd_v.resize(img_res_v.size()); - for (int i = 0; i < img_res_v.size(); i++) + image_embd_v.resize(img_res_v.size); + for (int i = 0; i < img_res_v.size; i++) { image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184 - bool encoded = clip_image_encode(ctx_clip, n_threads, img_res_v[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside - clip_image_f32_free(img_res_v[i]); + bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside if (!encoded) { - fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", i+1, (int)img_res_v.size()); + fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", i+1, (int)img_res_v.size); return false; } } const int64_t t_img_enc_batch_us = ggml_time_us(); - printf("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size(), (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); + printf("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); std::vector> grid_pinpoints; for (int i = 0; i < 32 && vparams.image_grid_pinpoints[i] != 0; i+=2) { grid_pinpoints.push_back({vparams.image_grid_pinpoints[i], vparams.image_grid_pinpoints[i+1]}); } - img_res_v.clear(); + // free all img_res_v - not needed anymore + delete[] img_res_v.data; + img_res_v.size = 0; + img_res_v.data = nullptr; struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, vparams.image_size); int n_img_pos_out; - handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out); + clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out); *n_img_pos = n_img_pos_out; for (int i = 0; i < image_embd_v.size(); i++) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 353bd89760819..9148f6ca21331 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -31,6 +31,23 @@ using json = nlohmann::json; +// RGB uint8 image +struct clip_image_u8 { + int nx; + int ny; + + 
std::vector buf; +}; + +// RGB float32 image (NHWC) +// Memory layout: RGBRGBRGB... +struct clip_image_f32 { + int nx; + int ny; + + std::vector buf; +}; + struct server_params { std::string hostname = "127.0.0.1"; @@ -943,14 +960,17 @@ struct llama_server_context { continue; } - std::vector img_res_v; - if (!clip_image_preprocess(clp_ctx, img.img_data, img_res_v, /*pad2square =*/ true)) + clip_image_f32_batch img_res_v; + img_res_v.size = 0; + img_res_v.data = nullptr; + if (!clip_image_preprocess(clp_ctx, img.img_data, img_res_v)) { LOG_TEE("Error processing the given image"); clip_free(clp_ctx); + delete[] img_res_v.data; return false; } - clip_image_f32 * img_res = img_res_v[0]; + clip_image_f32 * img_res = &img_res_v.data[0]; img.image_tokens = clip_n_patches(clp_ctx); img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx)); if (!img.image_embedding) @@ -965,7 +985,8 @@ struct llama_server_context LOG_TEE("Unable to encode image\n"); return false; } - clip_image_f32_free(img_res); + // clip_image_f32_free(img_res); + delete[] img_res_v.data; img.request_encode_image = false; } From 07f5cd7beccf93d4f720d2c037cbc5ca86385cd5 Mon Sep 17 00:00:00 2001 From: John Date: Tue, 13 Feb 2024 00:35:31 +0100 Subject: [PATCH 15/24] ws --- examples/llava/llava.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 699fd256a8ae2..9f955e2ae5a62 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -183,7 +183,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector image_feature, self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device) ), dim=-1) - * + * */ // ggml_tensor_printf(permuted_cont,"permuted_cont",__LINE__,false,false); From 6b8d69b451feadb38972174980314d3da9e4f179 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 13 Feb 2024 19:58:44 +0200 Subject: [PATCH 16/24] convert : skip unknown tensors (need for LLaVA) --- convert.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/convert.py b/convert.py index 75c10011846e4..237f8d782570e 100755 --- a/convert.py +++ b/convert.py @@ -1195,7 +1195,9 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel: for name, lazy_tensor in model.items(): tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None) if name_new is None: - raise Exception(f"Unexpected tensor name: {name}") + #raise Exception(f"Unexpected tensor name: {name}") + print(f"Unexpected tensor name: {name} - skipping") + continue if tensor_type in should_skip: print(f"skipping tensor {name_new}") From a2848854a445c18b5339f2a928c59bb4cc8082d2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 13 Feb 2024 19:59:00 +0200 Subject: [PATCH 17/24] llava : update readme --- examples/llava/README.md | 6 +++++- examples/llava/convert-image-encoder-to-gguf.py | 8 ++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/examples/llava/README.md b/examples/llava/README.md index 323c5fdd02835..c1c030951f3dd 100644 --- a/examples/llava/README.md +++ b/examples/llava/README.md @@ -49,8 +49,12 @@ python ./convert.py ../llava-v1.5-7b Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` directory. +## LLaVA 1.6 + +- Use `llava-surgery-v2.py` + ## TODO -- [ ] Support non-CPU backend for the image encoding part. +- [x] Support non-CPU backend for the image encoding part. - [ ] Support different sampling methods. 
- [ ] Support more model variants. diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py index 61a14703702ad..3988da70c9731 100644 --- a/examples/llava/convert-image-encoder-to-gguf.py +++ b/examples/llava/convert-image-encoder-to-gguf.py @@ -78,9 +78,9 @@ def bytes_to_unicode(): help="Save a text-only model. It can't be used to encode images") ap.add_argument("--vision-only", action="store_true", required=False, help="Save a vision-only model. It can't be used to encode texts") -ap.add_argument("--clip_model_is_vision", action="store_true", required=False, +ap.add_argument("--clip-model-is-vision", action="store_true", required=False, help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") -ap.add_argument("--clip_model_is_openclip", action="store_true", required=False, +ap.add_argument("--clip-model-is-openclip", action="store_true", required=False, help="The clip model is from openclip (for ViT-SO400M type))") ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp") @@ -89,8 +89,8 @@ def bytes_to_unicode(): # Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5 default_image_mean = [0.48145466, 0.4578275, 0.40821073] default_image_std = [0.26862954, 0.26130258, 0.27577711] -ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) -ap.add_argument('--image_std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) +ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) +ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) # with proper args = ap.parse_args() From 65ec518d4120bc25425204d5834991ab9bca0639 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 13 Feb 2024 20:22:28 +0200 Subject: [PATCH 18/24] llava : fix compile warnings --- examples/llava/clip.cpp | 109 ++++++++++++++++++++++++--------------- examples/llava/clip.h | 27 ++++------ examples/llava/llava.cpp | 72 +++++++++++++------------- 3 files changed, 112 insertions(+), 96 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index ad12bd8c514fb..2baceda5da387 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -30,6 +30,7 @@ #include #include #include +#include // #define CLIP_DEBUG_FUNCTIONS @@ -242,7 +243,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { } } -static void print_tensor_info(const ggml_tensor* tensor, const char* prefix = "") { +static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") { size_t tensor_size = ggml_nbytes(tensor); printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n", prefix, ggml_n_dims(tensor), tensor->name, tensor_size, @@ -263,6 +264,24 @@ static projector_type clip_projector_type_from_string(const std::string & name) // clip layers // +struct clip_hparams { + int32_t image_size; + int32_t patch_size; + int32_t hidden_size; + int32_t n_intermediate; + int32_t projection_dim; + int32_t n_head; + int32_t n_layer; 
+ + float eps; + + char mm_patch_merge_type[32]="flat"; // spatial_unpad or flat (default) + + int32_t image_grid_pinpoints[32]; + int32_t image_crop_resolution; + +}; + struct clip_layer { // attention struct ggml_tensor * k_w; @@ -292,7 +311,7 @@ struct clip_layer { }; struct clip_vision_model { - struct clip_vision_hparams hparams; + struct clip_hparams hparams; // embeddings struct ggml_tensor * class_embedding; @@ -376,10 +395,6 @@ struct clip_ctx { ggml_allocr * compute_alloc = NULL; }; -const struct clip_vision_hparams clip_get_vision_hparams(const struct clip_ctx * ctx) { - return ctx->vision_model.hparams; -} - static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) { if (!ctx->has_vision_encoder) { printf("This gguf file seems to have no vision encoder\n"); @@ -392,7 +407,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 const int image_size = hparams.image_size; const int patch_size = hparams.patch_size; const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); - const int num_patches_per_side = image_size / patch_size; + const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side); const int num_positions = num_patches + 1; const int hidden_size = hparams.hidden_size; const int n_head = hparams.n_head; @@ -1292,7 +1307,7 @@ inline float lerp(float s, float e, float t) { return s + (e - s) * t; } // Bilinear resize function -void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) { +static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) { dst.nx = target_width; dst.ny = target_height; dst.buf.resize(3 * target_width * target_height); @@ -1327,7 +1342,7 @@ void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_wi } // Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not -void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, const float mean[3], const float std[3]) { +static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, const float mean[3], const float std[3]) { dst->nx = src->nx; dst->ny = src->ny; dst->buf.resize(src->buf.size()); @@ -1338,12 +1353,11 @@ void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, co } } -inline float clip(float x, float lower, float upper) -{ +inline float clip(float x, float lower, float upper) { return std::max(lower, std::min(x, upper)); } -bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_width, int target_height) -{ + +static bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_width, int target_height) { const int nx = img.nx; const int ny = img.ny; @@ -1351,11 +1365,10 @@ bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_wid dst.ny = target_height; dst.buf.resize(3 * target_width * target_height); - int a, b, c, d, index; - float Ca, Cb, Cc; + float Cc; float C[5]; float d0, d2, d3, a0, a1, a2, a3; - int i, j, k, ii, jj; + int i, j, k, jj; int x, y; float dx, dy; float tx, ty; @@ -1363,39 +1376,29 @@ bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_wid tx = (float)nx / (float)target_width; ty = (float)ny / (float)target_height; - float scale = std::max(tx, ty); - // Bicubic interpolation; adapted from ViT.cpp, 
inspired from : // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36 // -> https://en.wikipedia.org/wiki/Bicubic_interpolation - for (i = 0; i < target_height; i++) - { - for (j = 0; j < target_width; j++) - { + for (i = 0; i < target_height; i++) { + for (j = 0; j < target_width; j++) { x = (int)(tx * j); y = (int)(ty * i); dx = tx * j - x; dy = ty * i - y; - index = (y * nx + x) * 3; - a = (y * nx + (x + 1)) * 3; - b = ((y + 1) * nx + x) * 3; - c = ((y + 1) * nx + (x + 1)) * 3; - - for (k = 0; k < 3; k++) - { - for (jj = 0; jj <= 3; jj++) - { + for (k = 0; k < 3; k++) { + for (jj = 0; jj <= 3; jj++) { d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; - a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; - a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx; d0 = C[0] - C[1]; @@ -1403,8 +1406,8 @@ bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_wid d3 = C[3] - C[1]; a0 = C[1]; a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; - a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; - a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy; const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f); @@ -1418,7 +1421,7 @@ bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_wid } // llava-1.6 type of resize_and_pad (black) -void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &image_output, const std::pair& target_resolution) { +static void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &image_output, const std::pair& target_resolution) { int target_width = target_resolution.first; int target_height = target_resolution.second; @@ -1467,7 +1470,7 @@ void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &image_outpu * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...]. * @return The best fit resolution in the format (width, height). 
*/ -static std::pair select_best_resolution(const std::pair& original_size, const std::vector>& possible_resolutions) { +static std::pair select_best_resolution(const std::pair & original_size, const std::vector> & possible_resolutions) { int original_width = original_size.first; int original_height = original_size.second; std::pair best_fit; @@ -1494,7 +1497,7 @@ static std::pair select_best_resolution(const std::pair& ori } -std::vector divide_to_patches_u8(const clip_image_u8& image, int patch_size) { +static std::vector divide_to_patches_u8(const clip_image_u8 & image, int patch_size) { std::vector patches; int width = image.nx; int height = image.ny; @@ -1531,7 +1534,7 @@ void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector // res_imgs memory is being allocated here, previous allocations will be freed if found -bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch& res_imgs ) { +bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs ) { bool pad_to_square = true; if (!ctx->has_vision_encoder) { printf("This gguf file seems to have no vision encoder\n"); @@ -1710,6 +1713,30 @@ void clip_free(clip_ctx * ctx) { delete ctx; } +size_t clip_embd_nbytes(const struct clip_ctx * ctx) { + return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float); +} + +int32_t clip_image_size(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.image_size; +} + +int32_t clip_patch_size(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.patch_size; +} + +int32_t clip_hidden_size(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.hidden_size; +} + +const char * clip_patch_merge_type(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.mm_patch_merge_type; +} + +const int32_t * clip_image_grid(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.image_grid_pinpoints; +} + bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { if (!ctx->has_vision_encoder) { printf("This gguf file seems to have no vision encoder\n"); @@ -1973,7 +2000,3 @@ int clip_n_patches(const struct clip_ctx * ctx) { } return n_patches; } - -size_t clip_embd_nbytes(const struct clip_ctx * ctx) { - return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float); -} diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 2d1858bbd4082..5e0b5c64b57c5 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -24,25 +24,7 @@ struct clip_ctx; extern "C" { #endif -struct clip_vision_hparams { - int32_t image_size; - int32_t patch_size; - int32_t hidden_size; - int32_t n_intermediate; - int32_t projection_dim; - int32_t n_head; - int32_t n_layer; - - float eps; - - char mm_patch_merge_type[32]="flat"; // spatial_unpad or flat (default) - int32_t image_grid_pinpoints[32]; - int32_t image_crop_resolution; - -}; - struct clip_ctx; -CLIP_API const struct clip_vision_hparams clip_get_vision_hparams(const struct clip_ctx * ctx); CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity); CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity); @@ -51,6 +33,15 @@ CLIP_API void clip_free(struct clip_ctx * ctx); CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); 
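// Illustrative aside, not part of the patch: with clip_vision_hparams no longer exported,
// callers query scalar hyperparameters through the new accessors added here, which keeps
// this header free of C++ types and usable from C. A hypothetical caller, mirroring what
// llava.cpp does further down in this patch:
static int example_patches_per_side(const struct clip_ctx * ctx) {
    return clip_image_size(ctx) / clip_patch_size(ctx); // e.g. 336 / 14 = 24
}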
+CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx); +CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx); +CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx); + +// TODO: should be enum, not string +CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx); + +CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx); + CLIP_API int clip_n_patches (const struct clip_ctx * ctx); CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 9f955e2ae5a62..ea956ac005a97 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -2,14 +2,13 @@ #include "common.h" #include "llama.h" #include "llava.h" +#include "base64.hpp" #include #include #include #include -#include "base64.hpp" - // RGB uint8 image struct clip_image_u8 { int nx; @@ -35,8 +34,9 @@ struct clip_image_f32 { * @return The best fit resolution in the format (width, height). */ static std::pair select_best_resolution(const std::pair& original_size, const std::vector>& possible_resolutions) { - int original_width = original_size.first; + int original_width = original_size.first; int original_height = original_size.second; + std::pair best_fit; int max_effective_resolution = 0; int min_wasted_resolution = std::numeric_limits::max(); @@ -45,7 +45,7 @@ static std::pair select_best_resolution(const std::pair& ori int width = resolution.first; int height = resolution.second; float scale = std::min(static_cast(width) / original_width, static_cast(height) / original_height); - int downscaled_width = static_cast(original_width * scale); + int downscaled_width = static_cast(original_width * scale); int downscaled_height = static_cast(original_height * scale); int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); int wasted_resolution = (width * height) - effective_resolution; @@ -59,6 +59,7 @@ static std::pair select_best_resolution(const std::pair& ori return best_fit; } + /** * @brief Get the anyres image grid shape object * @@ -67,7 +68,7 @@ static std::pair select_best_resolution(const std::pair& ori * @param image_patch_size * @return */ -struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair& image_size, const std::vector>& grid_pinpoints, int image_patch_size) { +static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair & image_size, const std::vector> & grid_pinpoints, int image_patch_size) { /** Conversion from gguf flat array to vector: std::vector> possible_resolutions; @@ -79,22 +80,26 @@ struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) { - struct temp_model { - struct ggml_tensor *newline; + struct { + struct ggml_tensor * newline; struct ggml_context * ctx; } model; - auto & vparams = clip_get_vision_hparams(ctx_clip); - auto num_patches_per_side = vparams.image_size / vparams.patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches) - int num_patches_width = grid_shape.first; // grid 1-4 + const int32_t image_size = clip_image_size(ctx_clip); + const int32_t patch_size = clip_patch_size(ctx_clip); + + int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches) + + int num_patches_width = grid_shape.first; // grid 1-4 int num_patches_height = grid_shape.second; // grid 
1-4 + const size_t num_images = num_patches_width + num_patches_height + 1; // TODO: size calculation is not calculated - it's only tens of MB size_t ctx_size = 0; + { ctx_size += clip_embd_nbytes(ctx_clip) * num_images * 8; // image_features ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32); @@ -105,6 +110,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, // NOTE: this should be false when using the legacy API }; + // Python reference code for full unpad: /* base_image_feature = image_feature[0] @@ -138,17 +144,15 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector */ model.ctx = ggml_init(params); - ggml_context *ctx_noalloc = ggml_init({2048, NULL, true}); - ggml_tensor *newline_tmp = clip_get_newline_tensor(ctx_clip); + ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip); model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]); if (newline_tmp->backend != GGML_BACKEND_CPU) { if (newline_tmp->buffer == NULL) { printf("newline_tmp tensor buffer is NULL\n"); } ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp)); - } else - { + } else { model.newline->data = newline_tmp->data; if (model.newline->data == NULL) { printf("newline_tmp tensor data is NULL\n"); @@ -158,8 +162,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4 // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false); // fill it with the image embeddings, ignoring the base - for (int i = 1; i < num_images; i++) - { + for (size_t i = 1; i < num_images; i++) { size_t offset = (i-1) * clip_embd_nbytes(ctx_clip); memcpy((uint8_t *)(image_features->data) + offset, image_embd_v[i], clip_embd_nbytes(ctx_clip)); } @@ -222,10 +225,10 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli } const int64_t t_img_enc_start_us = ggml_time_us(); - auto & vparams = clip_get_vision_hparams(ctx_clip); - if (strcmp(vparams.mm_patch_merge_type, "spatial_unpad") != 0) - { + const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip); + + if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { // flat / default llava-1.5 type embedding *n_img_pos = clip_n_patches(ctx_clip); bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096 @@ -235,41 +238,43 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli return false; } - } else - { + } else { // spatial_unpad llava-1.6 type embedding // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working std::vector image_embd_v; image_embd_v.resize(img_res_v.size); - for (int i = 0; i < img_res_v.size; i++) - { + for (size_t i = 0; i < img_res_v.size; i++) { image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184 - bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside + const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 
inside if (!encoded) { - fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", i+1, (int)img_res_v.size); + fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); return false; } } const int64_t t_img_enc_batch_us = ggml_time_us(); printf("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); + const int32_t * image_grid = clip_image_grid(ctx_clip); std::vector> grid_pinpoints; - for (int i = 0; i < 32 && vparams.image_grid_pinpoints[i] != 0; i+=2) { - grid_pinpoints.push_back({vparams.image_grid_pinpoints[i], vparams.image_grid_pinpoints[i+1]}); + for (int i = 0; i < 32 && image_grid[i] != 0; i += 2) { + grid_pinpoints.push_back({image_grid[i], image_grid[i+1]}); } + // free all img_res_v - not needed anymore delete[] img_res_v.data; img_res_v.size = 0; img_res_v.data = nullptr; - struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, vparams.image_size); + + const int32_t image_size = clip_image_size(ctx_clip); + + struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size); int n_img_pos_out; clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out); *n_img_pos = n_img_pos_out; - for (int i = 0; i < image_embd_v.size(); i++) - { + for (size_t i = 0; i < image_embd_v.size(); i++) { free(image_embd_v[i]); } image_embd_v.clear(); @@ -278,10 +283,9 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli // clip_image_u8 * tmp = clip_image_u8_init(); // clip_image_convert_f32_to_u8(*image_feature, *tmp); // clip_image_save_to_bmp(*tmp, "image_feature.bmp"); - } - printf("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); + printf("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); const int64_t t_img_enc_end_us = ggml_time_us(); float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0; @@ -291,8 +295,6 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli return true; } - - bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) { // make sure that the correct mmproj was used, i.e., compare apples to apples int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama)); From 997dd1fdf7c367dcfc92758d5b4f61de7546125f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 13 Feb 2024 20:40:01 +0200 Subject: [PATCH 19/24] llava : style --- examples/llava/clip.cpp | 232 ++++++++++++++++++++------------------- examples/llava/clip.h | 34 +++--- examples/llava/llava.cpp | 11 +- examples/llava/llava.h | 2 - 4 files changed, 141 insertions(+), 138 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index a7562eb47acfe..1cdb2be74a31d 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -70,29 +70,29 @@ static std::string format(const char * fmt, ...) 
{ // key constants // -#define KEY_FTYPE "general.file_type" -#define KEY_NAME "general.name" -#define KEY_DESCRIPTION "general.description" -#define KEY_HAS_TEXT_ENC "clip.has_text_encoder" -#define KEY_HAS_VIS_ENC "clip.has_vision_encoder" +#define KEY_FTYPE "general.file_type" +#define KEY_NAME "general.name" +#define KEY_DESCRIPTION "general.description" +#define KEY_HAS_TEXT_ENC "clip.has_text_encoder" +#define KEY_HAS_VIS_ENC "clip.has_vision_encoder" #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector" -#define KEY_USE_GELU "clip.use_gelu" -#define KEY_N_EMBD "clip.%s.embedding_length" -#define KEY_N_FF "clip.%s.feed_forward_length" -#define KEY_N_BLOCK "clip.%s.block_count" -#define KEY_N_HEAD "clip.%s.attention.head_count" +#define KEY_USE_GELU "clip.use_gelu" +#define KEY_N_EMBD "clip.%s.embedding_length" +#define KEY_N_FF "clip.%s.feed_forward_length" +#define KEY_N_BLOCK "clip.%s.block_count" +#define KEY_N_HEAD "clip.%s.attention.head_count" #define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" -#define KEY_PROJ_DIM "clip.%s.projection_dim" -#define KEY_TOKENS "tokenizer.ggml.tokens" -#define KEY_N_POSITIONS "clip.text.context_length" -#define KEY_IMAGE_SIZE "clip.vision.image_size" -#define KEY_PATCH_SIZE "clip.vision.patch_size" -#define KEY_IMAGE_MEAN "clip.vision.image_mean" -#define KEY_IMAGE_STD "clip.vision.image_std" -#define KEY_PROJ_TYPE "clip.projector_type" - -#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" -#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" +#define KEY_PROJ_DIM "clip.%s.projection_dim" +#define KEY_TOKENS "tokenizer.ggml.tokens" +#define KEY_N_POSITIONS "clip.text.context_length" +#define KEY_IMAGE_SIZE "clip.vision.image_size" +#define KEY_PATCH_SIZE "clip.vision.patch_size" +#define KEY_IMAGE_MEAN "clip.vision.image_mean" +#define KEY_IMAGE_STD "clip.vision.image_std" +#define KEY_PROJ_TYPE "clip.projector_type" + +#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" +#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" #define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution" @@ -100,26 +100,26 @@ static std::string format(const char * fmt, ...) 
{ // tensor name constants // -#define TN_TOKEN_EMBD "%s.token_embd.weight" -#define TN_POS_EMBD "%s.position_embd.weight" -#define TN_CLASS_EMBD "v.class_embd" -#define TN_PATCH_EMBD "v.patch_embd.weight" -#define TN_ATTN_K "%s.blk.%d.attn_k.%s" -#define TN_ATTN_Q "%s.blk.%d.attn_q.%s" -#define TN_ATTN_V "%s.blk.%d.attn_v.%s" -#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s" -#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s" -#define TN_FFN_UP "%s.blk.%d.ffn_up.%s" -#define TN_LN_1 "%s.blk.%d.ln1.%s" -#define TN_LN_2 "%s.blk.%d.ln2.%s" -#define TN_LN_PRE "%s.pre_ln.%s" -#define TN_LN_POST "%s.post_ln.%s" -#define TN_TEXT_PROJ "text_projection.weight" -#define TN_VIS_PROJ "visual_projection.weight" -#define TN_LLAVA_PROJ "mm.%d.%s" -#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s" +#define TN_TOKEN_EMBD "%s.token_embd.weight" +#define TN_POS_EMBD "%s.position_embd.weight" +#define TN_CLASS_EMBD "v.class_embd" +#define TN_PATCH_EMBD "v.patch_embd.weight" +#define TN_ATTN_K "%s.blk.%d.attn_k.%s" +#define TN_ATTN_Q "%s.blk.%d.attn_q.%s" +#define TN_ATTN_V "%s.blk.%d.attn_v.%s" +#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s" +#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s" +#define TN_FFN_UP "%s.blk.%d.ffn_up.%s" +#define TN_LN_1 "%s.blk.%d.ln1.%s" +#define TN_LN_2 "%s.blk.%d.ln2.%s" +#define TN_LN_PRE "%s.pre_ln.%s" +#define TN_LN_POST "%s.post_ln.%s" +#define TN_TEXT_PROJ "text_projection.weight" +#define TN_VIS_PROJ "visual_projection.weight" +#define TN_LLAVA_PROJ "mm.%d.%s" +#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s" #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" -#define TN_IMAGE_NEWLINE "model.image_newline" +#define TN_IMAGE_NEWLINE "model.image_newline" enum projector_type { @@ -130,8 +130,8 @@ enum projector_type { }; static std::map PROJECTOR_TYPE_NAMES = { - { PROJECTOR_TYPE_MLP, "mlp" }, - { PROJECTOR_TYPE_LDP, "ldp" }, + { PROJECTOR_TYPE_MLP, "mlp" }, + { PROJECTOR_TYPE_LDP, "ldp" }, }; @@ -191,7 +191,6 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int } } - static void replace_all(std::string & s, const std::string & search, const std::string & replace) { std::string result; for (size_t pos = 0; ; pos += search.length()) { @@ -279,7 +278,6 @@ struct clip_hparams { int32_t image_grid_pinpoints[32]; int32_t image_crop_resolution; - }; struct clip_layer { @@ -333,6 +331,7 @@ struct clip_vision_model { struct ggml_tensor * mm_0_b = NULL; struct ggml_tensor * mm_2_w = NULL; struct ggml_tensor * mm_2_b = NULL; + struct ggml_tensor * image_newline = NULL; // Yi type models with mlp+normalization projection @@ -389,9 +388,10 @@ struct clip_ctx { std::vector buf_compute_meta; // memory buffers to evaluate the model - ggml_backend_buffer_t params_buffer = NULL; + ggml_backend_buffer_t params_buffer = NULL; ggml_backend_buffer_t compute_buffer = NULL; - ggml_backend_t backend = NULL; + + ggml_backend_t backend = NULL; ggml_gallocr_t compute_alloc = NULL; }; @@ -404,19 +404,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 const auto & model = ctx->vision_model; const auto & hparams = model.hparams; - const int image_size = hparams.image_size; - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); + const int image_size = hparams.image_size; + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); const int num_patches_per_side = image_size / patch_size; 
GGML_UNUSED(num_patches_per_side); - const int num_positions = num_patches + 1; - const int hidden_size = hparams.hidden_size; - const int n_head = hparams.n_head; - const int d_head = hidden_size / n_head; - const int n_layer = hparams.n_layer; - //const int n_intermediate = hparams.n_intermediate; - //const int projection_dim = hparams.projection_dim; - const float eps = hparams.eps; - int batch_size = imgs->size; + const int num_positions = num_patches + 1; + const int hidden_size = hparams.hidden_size; + const int n_head = hparams.n_head; + const int d_head = hidden_size / n_head; + const int n_layer = hparams.n_layer; + const float eps = hparams.eps; + + const int batch_size = imgs->size; + if (ctx->has_llava_projector) { GGML_ASSERT(batch_size == 1); } @@ -816,10 +816,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { if (idx != -1) { const std::string proj_type = gguf_get_val_str(ctx, idx); new_clip->proj_type = clip_projector_type_from_string(proj_type); - } - else { + } else { new_clip->proj_type = PROJECTOR_TYPE_MLP; } + if (new_clip->proj_type == PROJECTOR_TYPE_MLP) { if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) { new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM; @@ -944,6 +944,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE); hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision")); hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision")); + try { int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS); int n = gguf_get_arr_n(ctx, idx); @@ -956,23 +957,26 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } catch (std::runtime_error & e) { hparams.image_grid_pinpoints[0]=0; } + try { int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE); strcpy(hparams.mm_patch_merge_type, gguf_get_val_str(ctx, idx)); } catch (std::runtime_error & e) { strcpy(hparams.mm_patch_merge_type, "flat"); } + try { hparams.image_crop_resolution = get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6 - } - catch(const std::exception& e) { + } catch(const std::exception& e) { hparams.image_crop_resolution = hparams.image_size; } int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN); int idx_std = get_key_idx(ctx, KEY_IMAGE_STD); + const float * mean_data = (const float *)gguf_get_arr_data(ctx, idx_mean); const float * std_data = (const float *)gguf_get_arr_data(ctx, idx_std); + for (int i = 0; i < 3; ++i) { new_clip->image_mean[i] = mean_data[i]; new_clip->image_std[i] = std_data[i]; @@ -998,16 +1002,14 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { printf("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type); } - try - { + + try { vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD); vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); - } - catch(const std::exception& e) - { + } catch(const std::exception& e) { fprintf(stderr, "%s: failed to load vision model tensors\n", __func__); } @@ -1039,40 +1041,39 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.image_newline = get_tensor(new_clip->ctx_data, 
TN_IMAGE_NEWLINE); // fprintf(stderr, "%s: image_newline tensor (llava-1.6) found\n", __func__); } catch (std::runtime_error & e) { } - } - else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { + } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { // MobileVLM projection - vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight")); - vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias")); - vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight")); - vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias")); - vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight")); - vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight")); - vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias")); + vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight")); + vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias")); + vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight")); + vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias")); + vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight")); + vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight")); + vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias")); vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight")); vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias")); vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight")); vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias")); - vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight")); - vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight")); - vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias")); - vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight")); - vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight")); - vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias")); + vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight")); + vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight")); + vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias")); + vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, 
format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight")); + vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight")); + vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias")); vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight")); vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias")); vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight")); vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias")); - vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight")); - vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight")); - vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias")); - } - else { + vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight")); + vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight")); + vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias")); + } else { std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type]; throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); } vision_model.layers.resize(hparams.n_layer); + for (int il = 0; il < hparams.n_layer; ++il) { auto & layer = vision_model.layers[il]; layer.k_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "weight")); @@ -1412,7 +1413,6 @@ static void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &imag image_output = std::move(padded_image); } - /** * Selects the best resolution from a list of possible resolutions based on the original size. 
* @@ -1446,7 +1446,6 @@ static std::pair select_best_resolution(const std::pair & or return best_fit; } - static std::vector divide_to_patches_u8(const clip_image_u8 & image, int patch_size) { std::vector patches; int width = image.nx; @@ -1472,7 +1471,7 @@ static std::vector divide_to_patches_u8(const clip_image_u8 & im #ifdef CLIP_DEBUG_FUNCTIONS // debug function to convert f32 to u8 -void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) { +static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) { dst.nx = src.nx; dst.ny = src.ny; dst.buf.resize(3 * src.nx * src.ny); @@ -1532,8 +1531,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli } } } else { - if (params.image_grid_pinpoints[0] != 0) - { + if (params.image_grid_pinpoints[0] != 0) { // "spatial_unpad" with "anyres" processing for llava-1.6 std::vector> possible_resolutions; for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) { @@ -1656,6 +1654,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli return true; } +ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) { + return ctx->vision_model.image_newline; +} + void clip_free(clip_ctx * ctx) { ggml_free(ctx->ctx_data); gguf_free(ctx->ctx_gguf); @@ -1687,6 +1689,18 @@ const int32_t * clip_image_grid(const struct clip_ctx * ctx) { return ctx->vision_model.hparams.image_grid_pinpoints; } +int clip_n_patches(const struct clip_ctx * ctx) { + const auto & params = ctx->vision_model.hparams; + + int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); + + if (ctx->proj_type == PROJECTOR_TYPE_LDP) { + n_patches /= 4; + } + + return n_patches; +} + bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { if (!ctx->has_vision_encoder) { printf("This gguf file seems to have no vision encoder\n"); @@ -1706,7 +1720,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } int batch_size = imgs->size; - if(ctx->has_llava_projector) { + if (ctx->has_llava_projector) { GGML_ASSERT(batch_size == 1); // TODO: support multiple images } @@ -1717,9 +1731,10 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // set inputs const auto & model = ctx->vision_model; const auto & hparams = model.hparams; - const int image_size = hparams.image_size; - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); + + const int image_size = hparams.image_size; + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); const int num_positions = num_patches + 1; { @@ -1794,11 +1809,11 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // copy the embeddings to the location passed by the user ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); + return true; } bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) { - ggml_type type = GGML_TYPE_Q4_1; assert(itype < GGML_TYPE_COUNT); @@ -1987,26 +2002,13 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { if (ctx->proj_type == PROJECTOR_TYPE_LDP) { return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]; } - else if (ctx->proj_type == PROJECTOR_TYPE_MLP) { + if (ctx->proj_type == PROJECTOR_TYPE_MLP) { return 
ctx->vision_model.mm_2_b->ne[0]; - } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { - return ctx->vision_model.mm_3_b->ne[0]; } - else { - std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; - throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); + if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { + return ctx->vision_model.mm_3_b->ne[0]; } -} - -ggml_tensor *clip_get_newline_tensor(const struct clip_ctx * ctx) { - return ctx->vision_model.image_newline; -} -int clip_n_patches(const struct clip_ctx * ctx) { - auto & params = ctx->vision_model.hparams; - int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); - if (ctx->proj_type == PROJECTOR_TYPE_LDP) { - n_patches /= 4; - } - return n_patches; + std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; + throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); } diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 5e0b5c64b57c5..cd9a4022f5778 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -26,7 +26,17 @@ extern "C" { struct clip_ctx; -CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity); +struct clip_image_u8_batch { + struct clip_image_u8 * data; + size_t size; +}; + +struct clip_image_f32_batch { + struct clip_image_f32 * data; + size_t size; +}; + +CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity); CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity); CLIP_API void clip_free(struct clip_ctx * ctx); @@ -45,33 +55,21 @@ CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx); CLIP_API int clip_n_patches (const struct clip_ctx * ctx); CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); -struct clip_image_u8_batch { - struct clip_image_u8 * data; - size_t size; -}; - -struct clip_image_f32_batch { - struct clip_image_f32 * data; - size_t size; -}; -CLIP_API struct clip_image_grid_shape { - int first; - int second; -}; - CLIP_API struct clip_image_u8 * clip_image_u8_init (); CLIP_API struct clip_image_f32 * clip_image_f32_init(); -CLIP_API void clip_image_u8_free (struct clip_image_u8 * img); +CLIP_API void clip_image_u8_free (struct clip_image_u8 * img); CLIP_API void clip_image_f32_free(struct clip_image_f32 * img); CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); /** interpret bytes as an image file with length bytes_length, and use the result to populate img */ CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); + /** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */ -CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch& res_imgs ); -CLIP_API struct ggml_tensor *clip_get_newline_tensor(const struct clip_ctx * ctx); +CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs ); + +CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx); CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec); CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec); diff --git 
a/examples/llava/llava.cpp b/examples/llava/llava.cpp index ea956ac005a97..22953417f0975 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -26,6 +26,11 @@ struct clip_image_f32 { std::vector buf; }; +struct clip_image_grid_shape { + int first; + int second; +}; + /** * Selects the best resolution from a list of possible resolutions based on the original size. * @@ -344,7 +349,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ return true; } -LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) { +struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) { clip_image_u8 * img = clip_image_u8_init(); if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) { clip_image_u8_free(img); @@ -401,7 +406,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long return true; } -LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) { +struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) { unsigned char* image_bytes; long image_bytes_length; auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length); @@ -416,7 +421,7 @@ LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct return embed; } -LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed) { +void llava_image_embed_free(struct llava_image_embed * embed) { free(embed->embed); free(embed); } diff --git a/examples/llava/llava.h b/examples/llava/llava.h index e08ce78839dcb..9e9466a5d1726 100644 --- a/examples/llava/llava.h +++ b/examples/llava/llava.h @@ -3,7 +3,6 @@ #include "ggml.h" - #ifdef LLAMA_SHARED # if defined(_WIN32) && !defined(__MINGW32__) # ifdef LLAMA_BUILD @@ -42,7 +41,6 @@ LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed); /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. 
*/ LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past); - #ifdef __cplusplus } #endif From 9d166b0850db18fac234d60af38213faf8dedaf8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 13 Feb 2024 20:43:45 +0200 Subject: [PATCH 20/24] convert : add --skip-unknown CLI arg --- convert.py | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/convert.py b/convert.py index 1fc2d4719efd0..63a0a5d78075b 100755 --- a/convert.py +++ b/convert.py @@ -1173,7 +1173,7 @@ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyM for (name, tensor) in model.items()} -def convert_model_names(model: LazyModel, params: Params) -> LazyModel: +def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> LazyModel: tmap = gguf.TensorNameMap(ARCH, params.n_layer) should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, [])) @@ -1199,9 +1199,11 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel: for name, lazy_tensor in model.items(): tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None) if name_new is None: - #raise Exception(f"Unexpected tensor name: {name}") - print(f"Unexpected tensor name: {name} - skipping") - continue + if skip_unknown: + print(f"Unexpected tensor name: {name} - skipping") + continue + else: + raise Exception(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)") if tensor_type in should_skip: print(f"skipping tensor {name_new}") @@ -1379,19 +1381,20 @@ def main(args_in: list[str] | None = None) -> None: output_choices.append("q8_0") vocab_types = ["spm", "bpe", "hfft"] parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file") - parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None) - parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") - parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") - parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") - parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)") - parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") - parser.add_argument("--vocab-type", choices=vocab_types, help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm") - parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") - parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") - parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY) - parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine") - parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides") + parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", 
default=None) + parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") + parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") + parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") + parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)") + parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") + parser.add_argument("--vocab-type", choices=vocab_types, help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") + parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") + parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY) + parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine") + parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides") + parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing") args = parser.parse_args(args_in) if args.awq_path: @@ -1463,7 +1466,7 @@ def main(args_in: list[str] | None = None) -> None: print(f"Special vocab info: {special_vocab}") model = model_plus.model - model = convert_model_names(model, params) + model = convert_model_names(model, params, args.skip_unknown) ftype = pick_output_type(model, args.outtype) model = convert_to_output_type(model, ftype) outfile = args.outfile or default_outfile(model_plus.paths, ftype) From c92431a0a4643346e3c96a6b971dcadf0d5d4a99 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 13 Feb 2024 20:51:20 +0200 Subject: [PATCH 21/24] server : remove clip structs --- examples/llava/clip.cpp | 4 +++- examples/server/server.cpp | 29 ++++++++--------------------- 2 files changed, 11 insertions(+), 22 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 1cdb2be74a31d..73438e3f5e241 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1483,7 +1483,7 @@ static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector // res_imgs memory is being allocated here, previous allocations will be freed if found -bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs ) { +bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs) { bool pad_to_square = true; if (!ctx->has_vision_encoder) { printf("This gguf file seems to have no vision encoder\n"); @@ -1648,9 +1648,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli // clip_image_u8_free(temp2); // } // res_imgs.push_back(res); + res_imgs.size = 1; res_imgs.data = new clip_image_f32[res_imgs.size]; res_imgs.data[0] = std::move(*res); + return true; } 
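For orientation, here is a minimal sketch of how a client can query a loaded CLIP/mmproj model through the accessors introduced in this series (`clip_image_size`, `clip_patch_size`, `clip_patch_merge_type`, `clip_image_grid`, and friends). The model filename is a placeholder and error handling is kept to the bare minimum; treat it as an illustration of the API shape under those assumptions, not code from the repository. The server changes that follow rely on the same functions.

```cpp
// Sketch only: query a CLIP/mmproj GGUF through the public clip.h accessors.
// The path "mmproj-model-f16.gguf" is a placeholder.
#include "clip.h"

#include <cstdio>

int main() {
    struct clip_ctx * ctx = clip_model_load("mmproj-model-f16.gguf", /*verbosity =*/ 1);
    if (!ctx) {
        fprintf(stderr, "failed to load CLIP model\n");
        return 1;
    }

    printf("image size       : %d\n",  clip_image_size(ctx));
    printf("patch size       : %d\n",  clip_patch_size(ctx));
    printf("hidden size      : %d\n",  clip_hidden_size(ctx));
    printf("n_patches        : %d\n",  clip_n_patches(ctx));
    printf("mmproj embd dim  : %d\n",  clip_n_mmproj_embd(ctx));
    printf("embd bytes       : %zu\n", clip_embd_nbytes(ctx));
    printf("patch merge type : %s\n",  clip_patch_merge_type(ctx));

    // grid pinpoints are stored as (width, height) pairs, zero-terminated,
    // with at most 32 values - this mirrors the loop used in llava.cpp above
    const int32_t * grid = clip_image_grid(ctx);
    for (int i = 0; i < 32 && grid[i] != 0; i += 2) {
        printf("grid pinpoint    : %d x %d\n", grid[i], grid[i + 1]);
    }

    clip_free(ctx);
    return 0;
}
```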
diff --git a/examples/server/server.cpp b/examples/server/server.cpp index a9f71725dc163..6e343403032fc 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -31,23 +31,6 @@ using json = nlohmann::json; -// RGB uint8 image -struct clip_image_u8 { - int nx; - int ny; - - std::vector buf; -}; - -// RGB float32 image (NHWC) -// Memory layout: RGBRGBRGB... -struct clip_image_f32 { - int nx; - int ny; - - std::vector buf; -}; - struct server_params { std::string hostname = "127.0.0.1"; @@ -992,10 +975,13 @@ struct llama_server_context { LOG_TEE("Error processing the given image"); clip_free(clp_ctx); - delete[] img_res_v.data; + clip_image_f32_free(img_res_v.data); return false; } - clip_image_f32 * img_res = &img_res_v.data[0]; + + // note: assumes only one image was returned by clip_image_preprocess + clip_image_f32 * img_res = img_res_v.data; + img.image_tokens = clip_n_patches(clp_ctx); img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx)); if (!img.image_embedding) @@ -1010,8 +996,9 @@ struct llama_server_context LOG_TEE("Unable to encode image\n"); return false; } - // clip_image_f32_free(img_res); - delete[] img_res_v.data; + + clip_image_f32_free(img_res_v.data); + img.request_encode_image = false; } From c9874dd0d65c8e0d42588f287af02d8905999e21 Mon Sep 17 00:00:00 2001 From: John <78893154+cmp-nct@users.noreply.github.com> Date: Wed, 14 Feb 2024 05:05:57 +0100 Subject: [PATCH 22/24] bugfix for non llava-1.6 It should now work with llava-1.5 as well --- examples/llava/llava-surgery-v2.py | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/examples/llava/llava-surgery-v2.py b/examples/llava/llava-surgery-v2.py index f0ade4ceb357b..5bc5bc5137fe0 100644 --- a/examples/llava/llava-surgery-v2.py +++ b/examples/llava/llava-surgery-v2.py @@ -38,7 +38,7 @@ def clean_vision_tower_from_checkpoint(checkpoint_path): # file_type = 'pytorch' model_path = os.path.dirname(checkpoint_path) print(f"Searching for vision tower tensors in {checkpoint_path}") - clip_tensors = [k for k, v in checkpoint.items() if (k.startswith("model.vision_tower") ) ] + clip_tensors = [k for k, v in checkpoint.items() if (k.startswith("model.vision_tower") or k.startswith("vit."))] if len(clip_tensors) > 0: print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}") @@ -46,8 +46,10 @@ def clean_vision_tower_from_checkpoint(checkpoint_path): clip_path = os.path.join(model_path, "llava.clip") if os.path.exists(clip_path): + print(f"Loading existing llava.clip from {clip_path}") existing_clip, _ = load_model(clip_path) else: + print(f"Creating new llava.clip at {clip_path}") existing_clip = {} # Update existing_clip with new tensors, avoid duplicates for name in clip_tensors: @@ -116,19 +118,24 @@ def proj_criteria(checkpoint): newline_checkpoint_path, projector_checkpoint_path = find_relevant_checkpoints(checkpoint_paths, newline_criteria, proj_criteria) print(f"Taking projector from {projector_checkpoint_path}") -print(f"Taking newline from {newline_checkpoint_path}") +first_mm_tensors = [] +first_checkpoint = None +if newline_checkpoint_path is not None: + print(f"Taking newline from {newline_checkpoint_path}") + first_checkpoint, file_type = load_model(newline_checkpoint_path) + first_mm_tensors = [k for k, v in first_checkpoint.items() if k.startswith("model.image_newline")] # Load the checkpoint -first_checkpoint, file_type = load_model(newline_checkpoint_path) -last_checkpoint, file_type = 
load_model(projector_checkpoint_path) -mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("model.mm_projector") or k.startswith("vision_proj.")] -first_mm_tensors = [k for k, v in first_checkpoint.items() if k.startswith("model.image_newline")] - - +mm_tensors = [] +last_checkpoint = None +if projector_checkpoint_path is not None: + last_checkpoint, file_type = load_model(projector_checkpoint_path) + mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("model.mm_projector") or k.startswith("vision_proj.")] if len(mm_tensors) == 0: - for k, v in last_checkpoint.items(): - print(k) + if last_checkpoint is not None: + for k, v in last_checkpoint.items(): + print(k) print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint)} tensors.") print("No tensors found. Is this a LLaVA model?") exit() @@ -142,7 +149,8 @@ def proj_criteria(checkpoint): for name in first_mm_tensors: projector[name] = first_checkpoint[name].float() -save_model(projector, f"{args.model}/llava.projector", 'pytorch') +if len(projector) > 0: + save_model(projector, f"{args.model}/llava.projector", 'pytorch') for name in mm_tensors: del last_checkpoint[name] From 7974ff7f027739b108927acc1eb540076fadfb6d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 14 Feb 2024 09:34:16 +0200 Subject: [PATCH 23/24] clip : minor code rearrange --- examples/llava/clip.cpp | 221 ++++++++++++++++++++-------------------- 1 file changed, 110 insertions(+), 111 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 73438e3f5e241..9c5091e613849 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -32,7 +32,7 @@ #include #include -// #define CLIP_DEBUG_FUNCTIONS +//#define CLIP_DEBUG_FUNCTIONS // RGB uint8 image struct clip_image_u8 { @@ -258,6 +258,114 @@ static projector_type clip_projector_type_from_string(const std::string & name) return PROJECTOR_TYPE_UNKNOWN; } +#ifdef CLIP_DEBUG_FUNCTIONS +static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) { + std::cerr << "Failed to open file for writing: " << filename << std::endl; + return; + } + + // PPM header: P6 format, width, height, and max color value + file << "P6\n" << img.nx << " " << img.ny << "\n255\n"; + + // Write pixel data + for (size_t i = 0; i < img.buf.size(); i += 3) { + // PPM expects binary data in RGB format, which matches our image buffer + file.write(reinterpret_cast(&img.buf[i]), 3); + } + + file.close(); +} + +static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) { + std::cerr << "Failed to open file for writing: " << filename << std::endl; + return; + } + + int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data + int bytesPerPixel = 3; + int widthInBytes = img.nx * bytesPerPixel; + int paddingAmount = (4 - (widthInBytes % 4)) % 4; + int stride = widthInBytes + paddingAmount; + + // Bitmap file header + unsigned char fileHeader[14] = { + 'B','M', // Signature + 0,0,0,0, // Image file size in bytes + 0,0,0,0, // Reserved + 54,0,0,0 // Start of pixel array + }; + + // Total file size + fileSize = 54 + (stride * img.ny); + fileHeader[2] = (unsigned char)(fileSize); + fileHeader[3] = (unsigned char)(fileSize >> 8); + fileHeader[4] = (unsigned char)(fileSize >> 16); + fileHeader[5] = (unsigned char)(fileSize >> 24); + + // 
Bitmap information header (BITMAPINFOHEADER) + unsigned char infoHeader[40] = { + 40,0,0,0, // Size of this header (40 bytes) + 0,0,0,0, // Image width + 0,0,0,0, // Image height + 1,0, // Number of color planes + 24,0, // Bits per pixel + 0,0,0,0, // No compression + 0,0,0,0, // Image size (can be 0 for no compression) + 0,0,0,0, // X pixels per meter (not specified) + 0,0,0,0, // Y pixels per meter (not specified) + 0,0,0,0, // Total colors (color table not used) + 0,0,0,0 // Important colors (all are important) + }; + + // Width and height in the information header + infoHeader[4] = (unsigned char)(img.nx); + infoHeader[5] = (unsigned char)(img.nx >> 8); + infoHeader[6] = (unsigned char)(img.nx >> 16); + infoHeader[7] = (unsigned char)(img.nx >> 24); + infoHeader[8] = (unsigned char)(img.ny); + infoHeader[9] = (unsigned char)(img.ny >> 8); + infoHeader[10] = (unsigned char)(img.ny >> 16); + infoHeader[11] = (unsigned char)(img.ny >> 24); + + // Write file headers + file.write(reinterpret_cast(fileHeader), sizeof(fileHeader)); + file.write(reinterpret_cast(infoHeader), sizeof(infoHeader)); + + // Pixel data + std::vector padding(3, 0); // Max padding size to be added to each row + for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top + for (int x = 0; x < img.nx; ++x) { + // Each pixel + size_t pixelIndex = (y * img.nx + x) * 3; + unsigned char pixel[3] = { + img.buf[pixelIndex + 2], // BMP stores pixels in BGR format + img.buf[pixelIndex + 1], + img.buf[pixelIndex] + }; + file.write(reinterpret_cast(pixel), 3); + } + // Write padding for the row + file.write(reinterpret_cast(padding.data()), paddingAmount); + } + + file.close(); +} + +// debug function to convert f32 to u8 +static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) { + dst.nx = src.nx; + dst.ny = src.ny; + dst.buf.resize(3 * src.nx * src.ny); + for (size_t i = 0; i < src.buf.size(); ++i) { + dst.buf[i] = static_cast(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255)); + } +} +#endif + // // clip layers @@ -274,7 +382,7 @@ struct clip_hparams { float eps; - char mm_patch_merge_type[32]="flat"; // spatial_unpad or flat (default) + char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default) int32_t image_grid_pinpoints[32]; int32_t image_crop_resolution; @@ -1156,103 +1264,6 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length return true; } -#ifdef CLIP_DEBUG_FUNCTIONS -void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { - std::ofstream file(filename, std::ios::binary); - if (!file.is_open()) { - std::cerr << "Failed to open file for writing: " << filename << std::endl; - return; - } - - // PPM header: P6 format, width, height, and max color value - file << "P6\n" << img.nx << " " << img.ny << "\n255\n"; - - // Write pixel data - for (size_t i = 0; i < img.buf.size(); i += 3) { - // PPM expects binary data in RGB format, which matches our image buffer - file.write(reinterpret_cast(&img.buf[i]), 3); - } - - file.close(); -} -void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) { - std::ofstream file(filename, std::ios::binary); - if (!file.is_open()) { - std::cerr << "Failed to open file for writing: " << filename << std::endl; - return; - } - - int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data - int bytesPerPixel = 3; - int widthInBytes = img.nx * bytesPerPixel; - int paddingAmount = (4 - (widthInBytes % 4)) % 4; - int 
stride = widthInBytes + paddingAmount; - - // Bitmap file header - unsigned char fileHeader[14] = { - 'B','M', // Signature - 0,0,0,0, // Image file size in bytes - 0,0,0,0, // Reserved - 54,0,0,0 // Start of pixel array - }; - - // Total file size - fileSize = 54 + (stride * img.ny); - fileHeader[2] = (unsigned char)(fileSize); - fileHeader[3] = (unsigned char)(fileSize >> 8); - fileHeader[4] = (unsigned char)(fileSize >> 16); - fileHeader[5] = (unsigned char)(fileSize >> 24); - - // Bitmap information header (BITMAPINFOHEADER) - unsigned char infoHeader[40] = { - 40,0,0,0, // Size of this header (40 bytes) - 0,0,0,0, // Image width - 0,0,0,0, // Image height - 1,0, // Number of color planes - 24,0, // Bits per pixel - 0,0,0,0, // No compression - 0,0,0,0, // Image size (can be 0 for no compression) - 0,0,0,0, // X pixels per meter (not specified) - 0,0,0,0, // Y pixels per meter (not specified) - 0,0,0,0, // Total colors (color table not used) - 0,0,0,0 // Important colors (all are important) - }; - - // Width and height in the information header - infoHeader[4] = (unsigned char)(img.nx); - infoHeader[5] = (unsigned char)(img.nx >> 8); - infoHeader[6] = (unsigned char)(img.nx >> 16); - infoHeader[7] = (unsigned char)(img.nx >> 24); - infoHeader[8] = (unsigned char)(img.ny); - infoHeader[9] = (unsigned char)(img.ny >> 8); - infoHeader[10] = (unsigned char)(img.ny >> 16); - infoHeader[11] = (unsigned char)(img.ny >> 24); - - // Write file headers - file.write(reinterpret_cast(fileHeader), sizeof(fileHeader)); - file.write(reinterpret_cast(infoHeader), sizeof(infoHeader)); - - // Pixel data - std::vector padding(3, 0); // Max padding size to be added to each row - for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top - for (int x = 0; x < img.nx; ++x) { - // Each pixel - size_t pixelIndex = (y * img.nx + x) * 3; - unsigned char pixel[3] = { - img.buf[pixelIndex + 2], // BMP stores pixels in BGR format - img.buf[pixelIndex + 1], - img.buf[pixelIndex] - }; - file.write(reinterpret_cast(pixel), 3); - } - // Write padding for the row - file.write(reinterpret_cast(padding.data()), paddingAmount); - } - - file.close(); -} -#endif - // Linear interpolation between two points inline float lerp(float s, float e, float t) { return s + (e - s) * t; @@ -1469,18 +1480,6 @@ static std::vector divide_to_patches_u8(const clip_image_u8 & im return patches; } -#ifdef CLIP_DEBUG_FUNCTIONS -// debug function to convert f32 to u8 -static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) { - dst.nx = src.nx; - dst.ny = src.ny; - dst.buf.resize(3 * src.nx * src.ny); - for (size_t i = 0; i < src.buf.size(); ++i) { - dst.buf[i] = static_cast(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255)); - } -} -#endif - // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector // res_imgs memory is being allocated here, previous allocations will be freed if found bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs) { From 6727cfd21ac3d3d14a21de690326552333de9ab8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 14 Feb 2024 09:35:57 +0200 Subject: [PATCH 24/24] llava : update readme a bit --- examples/llava/README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/llava/README.md b/examples/llava/README.md index 4e789da3dd190..e2ef0eff1466c 100644 --- 
a/examples/llava/README.md +++ b/examples/llava/README.md @@ -19,9 +19,9 @@ After building, run: `./llava-cli` to see the usage. For example: **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so. -## Model conversion +## LLaVA 1.5 -- Clone `llava-v15-7b` and `clip-vit-large-patch14-336` locally: +- Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example: ```sh git clone https://huggingface.co/liuhaotian/llava-v1.5-7b @@ -59,6 +59,8 @@ Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` director - Use `llava-surgery-v2.py` +- TODO: add detailed instructions + ## TODO - [x] Support non-CPU backend for the image encoding part.
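The conversion steps described above end at GGUF files. As a rough sketch of how those files are then consumed, the snippet below uses the `llava.h` API touched in this series to embed an image and write it into the language model context, which is essentially what `llava-cli` does. The helper name, the thread count, and the surrounding llama/clip setup (omitted) are assumptions for illustration, not code from the repository.

```cpp
// Sketch only: embed one image file and feed it into an existing llama context.
// Assumes ctx_clip was loaded with clip_model_load() and ctx_llama is an
// initialized llama_context; both setups are omitted here.
#include "clip.h"
#include "llava.h"
#include "llama.h"

#include <cstdio>

static bool eval_one_image(llama_context * ctx_llama, clip_ctx * ctx_clip,
                           const char * image_path, int n_batch, int * n_past) {
    // image loading, preprocessing and CLIP encoding all happen inside this call
    llava_image_embed * embed = llava_image_embed_make_with_filename(ctx_clip, /*n_threads =*/ 4, image_path);
    if (!embed) {
        fprintf(stderr, "failed to create image embedding for %s\n", image_path);
        return false;
    }

    // writes the image tokens into the context and advances *n_past past them
    const bool ok = llava_eval_image_embed(ctx_llama, embed, n_batch, n_past);

    llava_image_embed_free(embed);
    return ok;
}
```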