From 10c830cdfe8b18114a66104276218ce656c0cf02 Mon Sep 17 00:00:00 2001 From: John <78893154+cmp-nct@users.noreply.github.com> Date: Thu, 1 Feb 2024 23:15:40 +0100 Subject: [PATCH 01/24] Create llava-survery-v2.py --- examples/llava/llava-survery-v2.py | 138 +++++++++++++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 examples/llava/llava-survery-v2.py diff --git a/examples/llava/llava-survery-v2.py b/examples/llava/llava-survery-v2.py new file mode 100644 index 0000000000000..51f9cb638fd95 --- /dev/null +++ b/examples/llava/llava-survery-v2.py @@ -0,0 +1,138 @@ +import argparse +import glob +import os +import torch +from safetensors.torch import load as safe_load, save as safe_save, safe_open, save_file + + +# Function to determine if file is a SafeTensor file +def is_safetensor_file(file_path): + return file_path.endswith('.safetensors') + + +# Unified loading function +def load_model(file_path): + if is_safetensor_file(file_path): + # return safe_load(file_path,framework="pt", device="cpu"), 'safetensor' + tensors = {} + with safe_open(file_path, framework="pt", device="cpu") as f: + for key in f.keys(): + tensors[key] = f.get_tensor(key).clone() + return tensors, 'safetensor' + else: + return torch.load(file_path, map_location=torch.device('cpu')), 'pytorch' + + +# Unified saving function +def save_model(model, file_path, file_type): + if file_type == 'safetensor': + # safe_save(model, file_path) + save_file(model, file_path) + else: + torch.save(model, file_path) + + +# Adapted function to clean vision tower from checkpoint +def clean_vision_tower_from_checkpoint(checkpoint_path): + checkpoint, file_type = load_model(checkpoint_path) + # file_type = 'pytorch' + model_path = os.path.dirname(checkpoint_path) + print(f"Searching for vision tower tensors in {checkpoint_path}") + clip_tensors = [k for k, v in checkpoint.items() if (k.startswith("model.vision_tower") ) ] + + if len(clip_tensors) > 0: + print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}") + # Adapted for file type + clip_path = os.path.join(model_path, "llava.clip") + + if os.path.exists(clip_path): + existing_clip, _ = load_model(clip_path) + else: + existing_clip = {} + # Update existing_clip with new tensors, avoid duplicates + for name in clip_tensors: + simple_name = name.replace("vision_tower.vision_tower.", "") + print(f"Adding {simple_name} to llava.clip") + if simple_name not in existing_clip: + existing_clip[simple_name] = checkpoint[name] + + # Save the updated clip tensors back to llava.clip + save_model(existing_clip, clip_path, 'pytorch') + + # Remove the tensors from the original checkpoint + for name in clip_tensors: + del checkpoint[name] + + # Save the updated checkpoint + checkpoint_path = checkpoint_path + save_model(checkpoint, checkpoint_path, file_type) + return True + return False + + +# Command-line interface setup +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model", required=True, help="Path to LLaVA v1.5+ model") +ap.add_argument("-C", "--clean-vision-tower", action="store_true", help="Remove any vision tower from the model files") +args = ap.parse_args() + +if args.clean_vision_tower: + # Generalized to handle both PyTorch and SafeTensors models + model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True) + # checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and path.startswith('pytorch')) or (path.endswith('.safetensors') and path.startswith('model'))] + checkpoint_paths = [path for 
path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])] + for last_checkpoint_path in checkpoint_paths: + print(f"Cleaning {last_checkpoint_path}") + if not clean_vision_tower_from_checkpoint(last_checkpoint_path): + print(f"No vision tower found in {last_checkpoint_path}") + # we break once none is found, so far all models append them at the end + break + print("Done! All vision tower tensors are removed from the model files and stored in llava.clip file.") + +# Now we look for the projector in the last checkpoint +model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True) +checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])] +last_checkpoint_path = checkpoint_paths[0] +first_checkpoint_path = checkpoint_paths[-1] + +print(f"Taking projector from {last_checkpoint_path}") + +# Load the checkpoint +first_checkpoint, file_type = load_model(first_checkpoint_path) +last_checkpoint, file_type = load_model(last_checkpoint_path) +mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("model.mm_projector") or k.startswith("vision_proj.")] +first_mm_tensors = [k for k, v in first_checkpoint.items() if k.startswith("model.image_newline")] + + + +if len(mm_tensors) == 0: + for k, v in last_checkpoint.items(): + print(k) + print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint)} tensors.") + print("No tensors found. Is this a LLaVA model?") + exit() + +print(f"Found {len(mm_tensors)} tensors to extract.") +print(f"Found additional {len(first_mm_tensors)} tensors to extract.") +# projector = {name: checkpoint.[name].float() for name in mm_tensors} +projector = {} +for name in mm_tensors: + projector[name] = last_checkpoint[name].float() +for name in first_mm_tensors: + projector[name] = first_checkpoint[name].float() + +save_model(projector, f"{args.model}/llava.projector", 'pytorch') + +for name in mm_tensors: + del last_checkpoint[name] +for name in first_mm_tensors: + del first_checkpoint[name] + +if len(mm_tensors) > 0: + save_model(last_checkpoint, last_checkpoint_path, file_type) +if len(first_mm_tensors) > 0: + save_model(first_checkpoint, first_checkpoint_path, file_type) + +print("Done!") +print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.") +print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") From 97dda1e098bf1908bfd34759d630bfe77ea84fb9 Mon Sep 17 00:00:00 2001 From: John <78893154+cmp-nct@users.noreply.github.com> Date: Thu, 1 Feb 2024 23:16:30 +0100 Subject: [PATCH 02/24] Update convert-image-encoder-to-gguf.py --- .../llava/convert-image-encoder-to-gguf.py | 26 ++++++++++++++++--- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py index f5a3c9b46f9e3..82acfb22595a3 100644 --- a/examples/llava/convert-image-encoder-to-gguf.py +++ b/examples/llava/convert-image-encoder-to-gguf.py @@ -80,12 +80,13 @@ def bytes_to_unicode(): help="Save a vision-only model. 
It can't be used to encode texts") ap.add_argument("--clip_model_is_vision", action="store_true", required=False, help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") +ap.add_argument("--clip_model_is_openclip", action="store_true", required=False, + help="The clip model is from openclip (for ViT-SO400M type))") ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp") -ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values") -ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values") ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None) # Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711 +# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5 default_image_mean = [0.48145466, 0.4578275, 0.40821073] default_image_std = [0.26862954, 0.26130258, 0.27577711] ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) @@ -105,7 +106,7 @@ def bytes_to_unicode(): # output in the same directory as the model if output_dir is None dir_model = args.model_dir -if args.clip_model_is_vision: +if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip: vocab = None tokens = None else: @@ -133,7 +134,7 @@ def bytes_to_unicode(): if args.use_f32: ftype = 0 -if args.clip_model_is_vision: +if args.clip_model_is_vision or args.clip_model_is_openclip: model = CLIPVisionModel.from_pretrained(dir_model) processor = None else: @@ -202,6 +203,23 @@ def bytes_to_unicode(): fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"]) block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"] fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count) + if "image_grid_pinpoints" in v_hparams: + # no nested array - flatten it + image_grid_pinpoints = [] + for pinpoint in v_hparams["image_grid_pinpoints"]: + image_grid_pinpoints.extend(pinpoint) + fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints) + if "image_crop_resolution" in v_hparams: + fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"]) + if "image_aspect_ratio" in v_hparams: + fout.add_string("clip.vision.image_aspect_ratio", v_hparams["image_aspect_ratio"]) + if "image_split_resolution" in v_hparams: + fout.add_uint32("clip.vision.image_split_resolution", v_hparams["image_split_resolution"]) + if "mm_patch_merge_type" in v_hparams: + fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"]) + if "mm_projector_type" in v_hparams: + fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"]) + if processor is not None: image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean From 8ebdaec76169c58472f1a97a71fc3548578eae00 Mon Sep 17 00:00:00 2001 From: John <78893154+cmp-nct@users.noreply.github.com> Date: Fri, 2 Feb 2024 00:25:08 +0100 Subject: [PATCH 03/24] Update convert-image-encoder-to-gguf.py --- 
.../llava/convert-image-encoder-to-gguf.py | 35 ++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py index 82acfb22595a3..115b6b35b4da0 100644 --- a/examples/llava/convert-image-encoder-to-gguf.py +++ b/examples/llava/convert-image-encoder-to-gguf.py @@ -203,8 +203,41 @@ def bytes_to_unicode(): fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"]) block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"] fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count) + # /** + # "image_grid_pinpoints": [ + # [ + # 336, + # 672 + # ], + # [ + # 672, + # 336 + # ], + # [ + # 672, + # 672 + # ], + # [ + # 1008, + # 336 + # ], + # [ + # 336, + # 1008 + # ] + # ], + # Flattened: + # [ + # 336, 672, + # 672, 336, + # 672, 672, + # 1008, 336, + # 336, 1008 + # ] + # * + # */ if "image_grid_pinpoints" in v_hparams: - # no nested array - flatten it + # flatten it image_grid_pinpoints = [] for pinpoint in v_hparams["image_grid_pinpoints"]: image_grid_pinpoints.extend(pinpoint) From 1f9367c1348808bc5e83756dbcdbf9cd59adef84 Mon Sep 17 00:00:00 2001 From: John <78893154+cmp-nct@users.noreply.github.com> Date: Fri, 2 Feb 2024 00:26:05 +0100 Subject: [PATCH 04/24] Rename llava-survery-v2.py to llava-surgery-v2.py --- examples/llava/{llava-survery-v2.py => llava-surgery-v2.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/llava/{llava-survery-v2.py => llava-surgery-v2.py} (100%) diff --git a/examples/llava/llava-survery-v2.py b/examples/llava/llava-surgery-v2.py similarity index 100% rename from examples/llava/llava-survery-v2.py rename to examples/llava/llava-surgery-v2.py From a27b9a45df460fd3a3b4c81858d7f55119126b09 Mon Sep 17 00:00:00 2001 From: John <78893154+cmp-nct@users.noreply.github.com> Date: Fri, 2 Feb 2024 01:48:14 +0100 Subject: [PATCH 05/24] Update convert-image-encoder-to-gguf.py will now search for projector --- .../llava/convert-image-encoder-to-gguf.py | 454 ++++++------------ 1 file changed, 143 insertions(+), 311 deletions(-) diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py index 115b6b35b4da0..a65b05f8a96db 100644 --- a/examples/llava/convert-image-encoder-to-gguf.py +++ b/examples/llava/convert-image-encoder-to-gguf.py @@ -1,327 +1,159 @@ import argparse +import glob import os -import json - import torch -import numpy as np -from gguf import * -from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel - -TEXT = "clip.text" -VISION = "clip.vision" - - -def k(raw_key: str, arch: str) -> str: - return raw_key.format(arch=arch) - - -def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool: - if name in ( - "logit_scale", - "text_model.embeddings.position_ids", - "vision_model.embeddings.position_ids", - ): - return True - - if has_llava and name in ["visual_projection.weight", "vision_model.post_layernorm.weight", "vision_model.post_layernorm.bias"]: - return True - - if name.startswith("v") and not has_vision: - return True - - if name.startswith("t") and not has_text: - return True - - return False - - -def get_tensor_name(name: str) -> str: - if "projection" in name: - return name - - if "mm_projector" in name: - return name.replace("model.mm_projector", "mm") - - return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", 
"blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln") - - -def bytes_to_unicode(): - """ - Returns list of utf-8 byte and a corresponding list of unicode strings. - The reversible bpe codes work on unicode strings. - This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. - When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. - This is a significant percentage of your normal, say, 32K bpe vocab. - To avoid that, we want lookup tables between utf-8 bytes and unicode strings. - And avoids mapping to whitespace/control characters the bpe code barfs on. - """ - bs = ( - list(range(ord("!"), ord("~") + 1)) - + list(range(ord("¡"), ord("¬") + 1)) - + list(range(ord("®"), ord("ÿ") + 1)) - ) - cs = bs[:] - n = 0 - for b in range(2**8): - if b not in bs: - bs.append(b) - cs.append(2**8 + n) - n += 1 - cs = [chr(n) for n in cs] - return dict(zip(bs, cs)) - - -ap = argparse.ArgumentParser(prog="convert_hf_to_gguf.py") -ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) -ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") -ap.add_argument("--text-only", action="store_true", required=False, - help="Save a text-only model. It can't be used to encode images") -ap.add_argument("--vision-only", action="store_true", required=False, - help="Save a vision-only model. It can't be used to encode texts") -ap.add_argument("--clip_model_is_vision", action="store_true", required=False, - help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") -ap.add_argument("--clip_model_is_openclip", action="store_true", required=False, - help="The clip model is from openclip (for ViT-SO400M type))") -ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") -ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp") -ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. 
Default is the original model directory", default=None) -# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711 -# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5 -default_image_mean = [0.48145466, 0.4578275, 0.40821073] -default_image_std = [0.26862954, 0.26130258, 0.27577711] -ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) -ap.add_argument('--image_std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) - -# with proper -args = ap.parse_args() - - -if args.text_only and args.vision_only: - print("--text-only and --image-only arguments cannot be specified at the same time.") - exit(1) - -if args.use_f32: - print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.") - -# output in the same directory as the model if output_dir is None -dir_model = args.model_dir - -if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip: - vocab = None - tokens = None -else: - with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: - vocab = json.load(f) - tokens = [key for key in vocab] - -with open(dir_model + "/config.json", "r", encoding="utf-8") as f: - config = json.load(f) - if args.clip_model_is_vision: - v_hparams = config - t_hparams = None +from safetensors.torch import load as safe_load, save as safe_save, safe_open, save_file + + +# Function to determine if file is a SafeTensor file +def is_safetensor_file(file_path): + return file_path.endswith('.safetensors') + + +# Unified loading function +def load_model(file_path): + if is_safetensor_file(file_path): + # return safe_load(file_path,framework="pt", device="cpu"), 'safetensor' + tensors = {} + with safe_open(file_path, framework="pt", device="cpu") as f: + for key in f.keys(): + tensors[key] = f.get_tensor(key).clone() + return tensors, 'safetensor' else: - v_hparams = config["vision_config"] - t_hparams = config["text_config"] - -# possible data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 -# -# map from ftype to string -ftype_str = ["f32", "f16"] - -ftype = 1 -if args.use_f32: - ftype = 0 - -if args.clip_model_is_vision or args.clip_model_is_openclip: - model = CLIPVisionModel.from_pretrained(dir_model) - processor = None -else: - model = CLIPModel.from_pretrained(dir_model) - processor = CLIPProcessor.from_pretrained(dir_model) - -fname_middle = None -has_text_encoder = True -has_vision_encoder = True -has_llava_projector = False -if args.text_only: - fname_middle = "text-" - has_vision_encoder = False -elif args.llava_projector is not None: - fname_middle = "mmproj-" - has_text_encoder = False - has_llava_projector = True -elif args.vision_only: - fname_middle = "vision-" - has_text_encoder = False -else: - fname_middle = "" - -output_dir = args.output_dir if args.output_dir is not None else dir_model -os.makedirs(output_dir, exist_ok=True) -output_prefix = os.path.basename(output_dir).replace("ggml_", "") -fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf") -fout = GGUFWriter(path=fname_out, arch="clip") - -fout.add_bool("clip.has_text_encoder", has_text_encoder) -fout.add_bool("clip.has_vision_encoder", has_vision_encoder) -fout.add_bool("clip.has_llava_projector", has_llava_projector) -fout.add_file_type(ftype) -model_name = 
config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model) -fout.add_name(model_name) -if args.text_only: - fout.add_description("text-only CLIP model") -elif args.vision_only and not has_llava_projector: - fout.add_description("vision-only CLIP model") -elif has_llava_projector: - fout.add_description("image encoder for LLaVA") - # add projector type - fout.add_string("clip.projector_type", args.projector_type) -else: - fout.add_description("two-tower CLIP model") + return torch.load(file_path, map_location=torch.device('cpu')), 'pytorch' -if has_text_encoder: - # text_model hparams - fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"]) - fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"]) - fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"]) - fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"])) - fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"]) - fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"]) - fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"]) - fout.add_token_list(tokens) -if has_vision_encoder: - # vision_model hparams - fout.add_uint32("clip.vision.image_size", v_hparams["image_size"]) - fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"]) - fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"]) - fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"]) - fout.add_uint32("clip.vision.projection_dim", v_hparams.get("projection_dim", config["projection_dim"])) - fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"]) - fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"]) - block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"] - fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count) - # /** - # "image_grid_pinpoints": [ - # [ - # 336, - # 672 - # ], - # [ - # 672, - # 336 - # ], - # [ - # 672, - # 672 - # ], - # [ - # 1008, - # 336 - # ], - # [ - # 336, - # 1008 - # ] - # ], - # Flattened: - # [ - # 336, 672, - # 672, 336, - # 672, 672, - # 1008, 336, - # 336, 1008 - # ] - # * - # */ - if "image_grid_pinpoints" in v_hparams: - # flatten it - image_grid_pinpoints = [] - for pinpoint in v_hparams["image_grid_pinpoints"]: - image_grid_pinpoints.extend(pinpoint) - fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints) - if "image_crop_resolution" in v_hparams: - fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"]) - if "image_aspect_ratio" in v_hparams: - fout.add_string("clip.vision.image_aspect_ratio", v_hparams["image_aspect_ratio"]) - if "image_split_resolution" in v_hparams: - fout.add_uint32("clip.vision.image_split_resolution", v_hparams["image_split_resolution"]) - if "mm_patch_merge_type" in v_hparams: - fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"]) - if "mm_projector_type" in v_hparams: - fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"]) - - - if processor is not None: - image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean - image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else 
args.image_std +# Unified saving function +def save_model(model, file_path, file_type): + if file_type == 'safetensor': + # safe_save(model, file_path) + save_file(model, file_path) else: - image_mean = args.image_mean if args.image_mean is not None else default_image_mean - image_std = args.image_std if args.image_std is not None else default_image_std - fout.add_array("clip.vision.image_mean", image_mean) - fout.add_array("clip.vision.image_std", image_std) - -use_gelu = v_hparams["hidden_act"] == "gelu" -fout.add_bool("clip.use_gelu", use_gelu) - - -if has_llava_projector: - model.vision_model.encoder.layers.pop(-1) - projector = torch.load(args.llava_projector) - for name, data in projector.items(): - name = get_tensor_name(name) - # pw and dw conv ndim==4 - if data.ndim == 2 or data.ndim == 4: - data = data.squeeze().numpy().astype(np.float16) + torch.save(model, file_path) + + +# Adapted function to clean vision tower from checkpoint +def clean_vision_tower_from_checkpoint(checkpoint_path): + checkpoint, file_type = load_model(checkpoint_path) + # file_type = 'pytorch' + model_path = os.path.dirname(checkpoint_path) + print(f"Searching for vision tower tensors in {checkpoint_path}") + clip_tensors = [k for k, v in checkpoint.items() if (k.startswith("model.vision_tower") ) ] + + if len(clip_tensors) > 0: + print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}") + # Adapted for file type + clip_path = os.path.join(model_path, "llava.clip") + + if os.path.exists(clip_path): + existing_clip, _ = load_model(clip_path) else: - data = data.squeeze().numpy().astype(np.float32) - - fout.add_tensor(name, data) - - print("Projector tensors added\n") + existing_clip = {} + # Update existing_clip with new tensors, avoid duplicates + for name in clip_tensors: + simple_name = name[name.index('vision_model.'):] if 'vision_model.' 
in name else name + print(f"Adding {simple_name} to llava.clip") + if simple_name not in existing_clip: + existing_clip[simple_name] = checkpoint[name] + + # Save the updated clip tensors back to llava.clip + save_model(existing_clip, clip_path, 'pytorch') + + # Remove the tensors from the original checkpoint + for name in clip_tensors: + del checkpoint[name] + + # Save the updated checkpoint + checkpoint_path = checkpoint_path + save_model(checkpoint, checkpoint_path, file_type) + return True + return False -state_dict = model.state_dict() -for name, data in state_dict.items(): - if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector): - # we don't need this - print(f"skipping parameter: {name}") - continue +def find_relevant_checkpoints(checkpoint_paths, newline_criteria, projector): + newline_checkpoint_path = None + projector_checkpoint_path = None - name = get_tensor_name(name) - data = data.squeeze().numpy() + for path in checkpoint_paths: + checkpoint, _ = load_model(path) + if newline_criteria(checkpoint) and newline_checkpoint_path is None: + newline_checkpoint_path = path + if projector(checkpoint): + projector_checkpoint_path = path - n_dims = len(data.shape) + return newline_checkpoint_path, projector_checkpoint_path - # ftype == 0 -> float32, ftype == 1 -> float16 - ftype_cur = 0 - if n_dims == 4: - print(f"tensor {name} is always saved in f16") - data = data.astype(np.float16) - ftype_cur = 1 - elif ftype == 1: - if name[-7:] == ".weight" and n_dims == 2: - print(" Converting to float16") - data = data.astype(np.float16) - ftype_cur = 1 - else: - print(" Converting to float32") - data = data.astype(np.float32) - ftype_cur = 0 - else: - if data.dtype != np.float32: - print(" Converting to float32") - data = data.astype(np.float32) - ftype_cur = 0 +def newline_criteria(checkpoint): + return any(k.startswith("model.image_newline") for k in checkpoint.keys()) - print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") - fout.add_tensor(name, data) +def proj_criteria(checkpoint): + return any(k.startswith("model.mm_projector") or k.startswith("vision_proj.") for k in checkpoint.keys()) -fout.write_header_to_file() -fout.write_kv_data_to_file() -fout.write_tensors_to_file() -fout.close() +# Command-line interface setup +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model", required=True, help="Path to LLaVA v1.5+ model") +ap.add_argument("-C", "--clean-vision-tower", action="store_true", help="Remove any vision tower from the model files") +args = ap.parse_args() -print("Done. Output file: " + fname_out) +if args.clean_vision_tower: + # Generalized to handle both PyTorch and SafeTensors models + model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True) + # checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and path.startswith('pytorch')) or (path.endswith('.safetensors') and path.startswith('model'))] + checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])] + for projector_checkpoint_path in checkpoint_paths: + print(f"Cleaning {projector_checkpoint_path}") + if not clean_vision_tower_from_checkpoint(projector_checkpoint_path): + print(f"No vision tower found in {projector_checkpoint_path}") + # we break once none is found, so far all models append them at the end + break + print("Done! 
All vision tower tensors are removed from the model files and stored in llava.clip file.") + +# Now we look for the projector in the last checkpoint +model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True) +checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])] +# last_checkpoint_path = checkpoint_paths[0] +# first_checkpoint_path = checkpoint_paths[-1] +newline_checkpoint_path, projector_checkpoint_path = find_relevant_checkpoints(checkpoint_paths, newline_criteria, proj_criteria) + +print(f"Taking projector from {projector_checkpoint_path}") +print(f"Taking newline from {newline_checkpoint_path}") + +# Load the checkpoint +first_checkpoint, file_type = load_model(newline_checkpoint_path) +last_checkpoint, file_type = load_model(projector_checkpoint_path) +mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("model.mm_projector") or k.startswith("vision_proj.")] +first_mm_tensors = [k for k, v in first_checkpoint.items() if k.startswith("model.image_newline")] + + + +if len(mm_tensors) == 0: + for k, v in last_checkpoint.items(): + print(k) + print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint)} tensors.") + print("No tensors found. Is this a LLaVA model?") + exit() + +print(f"Found {len(mm_tensors)} tensors to extract.") +print(f"Found additional {len(first_mm_tensors)} tensors to extract.") +# projector = {name: checkpoint.[name].float() for name in mm_tensors} +projector = {} +for name in mm_tensors: + projector[name] = last_checkpoint[name].float() +for name in first_mm_tensors: + projector[name] = first_checkpoint[name].float() + +save_model(projector, f"{args.model}/llava.projector", 'pytorch') + +for name in mm_tensors: + del last_checkpoint[name] +for name in first_mm_tensors: + del first_checkpoint[name] + +if len(mm_tensors) > 0: + save_model(last_checkpoint, projector_checkpoint_path, file_type) +if len(first_mm_tensors) > 0: + save_model(first_checkpoint, newline_checkpoint_path, file_type) + +print("Done!") +print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.") +print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") From 440b2ae2b1cdaa53b5b546c569d658bd8cecfa6a Mon Sep 17 00:00:00 2001 From: John <78893154+cmp-nct@users.noreply.github.com> Date: Fri, 2 Feb 2024 02:07:29 +0100 Subject: [PATCH 06/24] Update convert-image-encoder-to-gguf.py whoops --- .../llava/convert-image-encoder-to-gguf.py | 454 ++++++++++++------ 1 file changed, 311 insertions(+), 143 deletions(-) diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py index a65b05f8a96db..115b6b35b4da0 100644 --- a/examples/llava/convert-image-encoder-to-gguf.py +++ b/examples/llava/convert-image-encoder-to-gguf.py @@ -1,159 +1,327 @@ import argparse -import glob import os +import json + import torch -from safetensors.torch import load as safe_load, save as safe_save, safe_open, save_file - - -# Function to determine if file is a SafeTensor file -def is_safetensor_file(file_path): - return file_path.endswith('.safetensors') - - -# Unified loading function -def load_model(file_path): - if is_safetensor_file(file_path): - # return safe_load(file_path,framework="pt", device="cpu"), 'safetensor' - tensors = {} - with safe_open(file_path, framework="pt", device="cpu") as f: - for key in 
f.keys(): - tensors[key] = f.get_tensor(key).clone() - return tensors, 'safetensor' - else: - return torch.load(file_path, map_location=torch.device('cpu')), 'pytorch' +import numpy as np +from gguf import * +from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel +TEXT = "clip.text" +VISION = "clip.vision" -# Unified saving function -def save_model(model, file_path, file_type): - if file_type == 'safetensor': - # safe_save(model, file_path) - save_file(model, file_path) - else: - torch.save(model, file_path) - - -# Adapted function to clean vision tower from checkpoint -def clean_vision_tower_from_checkpoint(checkpoint_path): - checkpoint, file_type = load_model(checkpoint_path) - # file_type = 'pytorch' - model_path = os.path.dirname(checkpoint_path) - print(f"Searching for vision tower tensors in {checkpoint_path}") - clip_tensors = [k for k, v in checkpoint.items() if (k.startswith("model.vision_tower") ) ] - - if len(clip_tensors) > 0: - print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}") - # Adapted for file type - clip_path = os.path.join(model_path, "llava.clip") - - if os.path.exists(clip_path): - existing_clip, _ = load_model(clip_path) - else: - existing_clip = {} - # Update existing_clip with new tensors, avoid duplicates - for name in clip_tensors: - simple_name = name[name.index('vision_model.'):] if 'vision_model.' in name else name - print(f"Adding {simple_name} to llava.clip") - if simple_name not in existing_clip: - existing_clip[simple_name] = checkpoint[name] - - # Save the updated clip tensors back to llava.clip - save_model(existing_clip, clip_path, 'pytorch') - - # Remove the tensors from the original checkpoint - for name in clip_tensors: - del checkpoint[name] - - # Save the updated checkpoint - checkpoint_path = checkpoint_path - save_model(checkpoint, checkpoint_path, file_type) + +def k(raw_key: str, arch: str) -> str: + return raw_key.format(arch=arch) + + +def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool: + if name in ( + "logit_scale", + "text_model.embeddings.position_ids", + "vision_model.embeddings.position_ids", + ): + return True + + if has_llava and name in ["visual_projection.weight", "vision_model.post_layernorm.weight", "vision_model.post_layernorm.bias"]: + return True + + if name.startswith("v") and not has_vision: + return True + + if name.startswith("t") and not has_text: return True + return False -def find_relevant_checkpoints(checkpoint_paths, newline_criteria, projector): - newline_checkpoint_path = None - projector_checkpoint_path = None - for path in checkpoint_paths: - checkpoint, _ = load_model(path) - if newline_criteria(checkpoint) and newline_checkpoint_path is None: - newline_checkpoint_path = path - if projector(checkpoint): - projector_checkpoint_path = path +def get_tensor_name(name: str) -> str: + if "projection" in name: + return name + + if "mm_projector" in name: + return name.replace("model.mm_projector", "mm") + + return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln") - return newline_checkpoint_path, projector_checkpoint_path -def newline_criteria(checkpoint): - return any(k.startswith("model.image_newline") for k in 
checkpoint.keys()) +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a significant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + + list(range(ord("¡"), ord("¬") + 1)) + + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) -def proj_criteria(checkpoint): - return any(k.startswith("model.mm_projector") or k.startswith("vision_proj.") for k in checkpoint.keys()) +ap = argparse.ArgumentParser(prog="convert_hf_to_gguf.py") +ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) +ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") +ap.add_argument("--text-only", action="store_true", required=False, + help="Save a text-only model. It can't be used to encode images") +ap.add_argument("--vision-only", action="store_true", required=False, + help="Save a vision-only model. It can't be used to encode texts") +ap.add_argument("--clip_model_is_vision", action="store_true", required=False, + help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") +ap.add_argument("--clip_model_is_openclip", action="store_true", required=False, + help="The clip model is from openclip (for ViT-SO400M type))") +ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") +ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp") +ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. 
Default is the original model directory", default=None) +# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711 +# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5 +default_image_mean = [0.48145466, 0.4578275, 0.40821073] +default_image_std = [0.26862954, 0.26130258, 0.27577711] +ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) +ap.add_argument('--image_std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) -# Command-line interface setup -ap = argparse.ArgumentParser() -ap.add_argument("-m", "--model", required=True, help="Path to LLaVA v1.5+ model") -ap.add_argument("-C", "--clean-vision-tower", action="store_true", help="Remove any vision tower from the model files") +# with proper args = ap.parse_args() -if args.clean_vision_tower: - # Generalized to handle both PyTorch and SafeTensors models - model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True) - # checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and path.startswith('pytorch')) or (path.endswith('.safetensors') and path.startswith('model'))] - checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])] - for projector_checkpoint_path in checkpoint_paths: - print(f"Cleaning {projector_checkpoint_path}") - if not clean_vision_tower_from_checkpoint(projector_checkpoint_path): - print(f"No vision tower found in {projector_checkpoint_path}") - # we break once none is found, so far all models append them at the end - break - print("Done! All vision tower tensors are removed from the model files and stored in llava.clip file.") - -# Now we look for the projector in the last checkpoint -model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True) -checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])] -# last_checkpoint_path = checkpoint_paths[0] -# first_checkpoint_path = checkpoint_paths[-1] -newline_checkpoint_path, projector_checkpoint_path = find_relevant_checkpoints(checkpoint_paths, newline_criteria, proj_criteria) - -print(f"Taking projector from {projector_checkpoint_path}") -print(f"Taking newline from {newline_checkpoint_path}") - -# Load the checkpoint -first_checkpoint, file_type = load_model(newline_checkpoint_path) -last_checkpoint, file_type = load_model(projector_checkpoint_path) -mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("model.mm_projector") or k.startswith("vision_proj.")] -first_mm_tensors = [k for k, v in first_checkpoint.items() if k.startswith("model.image_newline")] - - - -if len(mm_tensors) == 0: - for k, v in last_checkpoint.items(): - print(k) - print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint)} tensors.") - print("No tensors found. 
Is this a LLaVA model?") - exit() - -print(f"Found {len(mm_tensors)} tensors to extract.") -print(f"Found additional {len(first_mm_tensors)} tensors to extract.") -# projector = {name: checkpoint.[name].float() for name in mm_tensors} -projector = {} -for name in mm_tensors: - projector[name] = last_checkpoint[name].float() -for name in first_mm_tensors: - projector[name] = first_checkpoint[name].float() - -save_model(projector, f"{args.model}/llava.projector", 'pytorch') - -for name in mm_tensors: - del last_checkpoint[name] -for name in first_mm_tensors: - del first_checkpoint[name] - -if len(mm_tensors) > 0: - save_model(last_checkpoint, projector_checkpoint_path, file_type) -if len(first_mm_tensors) > 0: - save_model(first_checkpoint, newline_checkpoint_path, file_type) - -print("Done!") -print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.") -print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") + +if args.text_only and args.vision_only: + print("--text-only and --image-only arguments cannot be specified at the same time.") + exit(1) + +if args.use_f32: + print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.") + +# output in the same directory as the model if output_dir is None +dir_model = args.model_dir + +if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip: + vocab = None + tokens = None +else: + with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: + vocab = json.load(f) + tokens = [key for key in vocab] + +with open(dir_model + "/config.json", "r", encoding="utf-8") as f: + config = json.load(f) + if args.clip_model_is_vision: + v_hparams = config + t_hparams = None + else: + v_hparams = config["vision_config"] + t_hparams = config["text_config"] + +# possible data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 +# +# map from ftype to string +ftype_str = ["f32", "f16"] + +ftype = 1 +if args.use_f32: + ftype = 0 + +if args.clip_model_is_vision or args.clip_model_is_openclip: + model = CLIPVisionModel.from_pretrained(dir_model) + processor = None +else: + model = CLIPModel.from_pretrained(dir_model) + processor = CLIPProcessor.from_pretrained(dir_model) + +fname_middle = None +has_text_encoder = True +has_vision_encoder = True +has_llava_projector = False +if args.text_only: + fname_middle = "text-" + has_vision_encoder = False +elif args.llava_projector is not None: + fname_middle = "mmproj-" + has_text_encoder = False + has_llava_projector = True +elif args.vision_only: + fname_middle = "vision-" + has_text_encoder = False +else: + fname_middle = "" + +output_dir = args.output_dir if args.output_dir is not None else dir_model +os.makedirs(output_dir, exist_ok=True) +output_prefix = os.path.basename(output_dir).replace("ggml_", "") +fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf") +fout = GGUFWriter(path=fname_out, arch="clip") + +fout.add_bool("clip.has_text_encoder", has_text_encoder) +fout.add_bool("clip.has_vision_encoder", has_vision_encoder) +fout.add_bool("clip.has_llava_projector", has_llava_projector) +fout.add_file_type(ftype) +model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model) +fout.add_name(model_name) +if args.text_only: + fout.add_description("text-only CLIP model") +elif args.vision_only and not has_llava_projector: + fout.add_description("vision-only CLIP model") 
+elif has_llava_projector: + fout.add_description("image encoder for LLaVA") + # add projector type + fout.add_string("clip.projector_type", args.projector_type) +else: + fout.add_description("two-tower CLIP model") + +if has_text_encoder: + # text_model hparams + fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"]) + fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"]) + fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"]) + fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"])) + fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"]) + fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"]) + fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"]) + fout.add_token_list(tokens) + +if has_vision_encoder: + # vision_model hparams + fout.add_uint32("clip.vision.image_size", v_hparams["image_size"]) + fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"]) + fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"]) + fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"]) + fout.add_uint32("clip.vision.projection_dim", v_hparams.get("projection_dim", config["projection_dim"])) + fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"]) + fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"]) + block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"] + fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count) + # /** + # "image_grid_pinpoints": [ + # [ + # 336, + # 672 + # ], + # [ + # 672, + # 336 + # ], + # [ + # 672, + # 672 + # ], + # [ + # 1008, + # 336 + # ], + # [ + # 336, + # 1008 + # ] + # ], + # Flattened: + # [ + # 336, 672, + # 672, 336, + # 672, 672, + # 1008, 336, + # 336, 1008 + # ] + # * + # */ + if "image_grid_pinpoints" in v_hparams: + # flatten it + image_grid_pinpoints = [] + for pinpoint in v_hparams["image_grid_pinpoints"]: + image_grid_pinpoints.extend(pinpoint) + fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints) + if "image_crop_resolution" in v_hparams: + fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"]) + if "image_aspect_ratio" in v_hparams: + fout.add_string("clip.vision.image_aspect_ratio", v_hparams["image_aspect_ratio"]) + if "image_split_resolution" in v_hparams: + fout.add_uint32("clip.vision.image_split_resolution", v_hparams["image_split_resolution"]) + if "mm_patch_merge_type" in v_hparams: + fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"]) + if "mm_projector_type" in v_hparams: + fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"]) + + + if processor is not None: + image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean + image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std + else: + image_mean = args.image_mean if args.image_mean is not None else default_image_mean + image_std = args.image_std if args.image_std is not None else default_image_std + fout.add_array("clip.vision.image_mean", image_mean) + fout.add_array("clip.vision.image_std", image_std) + +use_gelu = v_hparams["hidden_act"] == "gelu" 
+fout.add_bool("clip.use_gelu", use_gelu) + + +if has_llava_projector: + model.vision_model.encoder.layers.pop(-1) + projector = torch.load(args.llava_projector) + for name, data in projector.items(): + name = get_tensor_name(name) + # pw and dw conv ndim==4 + if data.ndim == 2 or data.ndim == 4: + data = data.squeeze().numpy().astype(np.float16) + else: + data = data.squeeze().numpy().astype(np.float32) + + fout.add_tensor(name, data) + + print("Projector tensors added\n") + +state_dict = model.state_dict() +for name, data in state_dict.items(): + if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector): + # we don't need this + print(f"skipping parameter: {name}") + continue + + name = get_tensor_name(name) + data = data.squeeze().numpy() + + n_dims = len(data.shape) + + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype_cur = 0 + if n_dims == 4: + print(f"tensor {name} is always saved in f16") + data = data.astype(np.float16) + ftype_cur = 1 + elif ftype == 1: + if name[-7:] == ".weight" and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + + print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") + fout.add_tensor(name, data) + + +fout.write_header_to_file() +fout.write_kv_data_to_file() +fout.write_tensors_to_file() +fout.close() + +print("Done. Output file: " + fname_out) From 35b7a7a18393986f9052fe147e376352219c75af Mon Sep 17 00:00:00 2001 From: John <78893154+cmp-nct@users.noreply.github.com> Date: Fri, 2 Feb 2024 02:07:42 +0100 Subject: [PATCH 07/24] Update llava-surgery-v2.py --- examples/llava/llava-surgery-v2.py | 47 +++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/examples/llava/llava-surgery-v2.py b/examples/llava/llava-surgery-v2.py index 51f9cb638fd95..a5850b96e77d9 100644 --- a/examples/llava/llava-surgery-v2.py +++ b/examples/llava/llava-surgery-v2.py @@ -51,7 +51,7 @@ def clean_vision_tower_from_checkpoint(checkpoint_path): existing_clip = {} # Update existing_clip with new tensors, avoid duplicates for name in clip_tensors: - simple_name = name.replace("vision_tower.vision_tower.", "") + simple_name = name[name.index('vision_model.'):] if 'vision_model.' 
in name else name print(f"Adding {simple_name} to llava.clip") if simple_name not in existing_clip: existing_clip[simple_name] = checkpoint[name] @@ -69,6 +69,25 @@ def clean_vision_tower_from_checkpoint(checkpoint_path): return True return False +def find_relevant_checkpoints(checkpoint_paths, newline_criteria, projector): + newline_checkpoint_path = None + projector_checkpoint_path = None + + for path in checkpoint_paths: + checkpoint, _ = load_model(path) + if newline_criteria(checkpoint) and newline_checkpoint_path is None: + newline_checkpoint_path = path + if projector(checkpoint): + projector_checkpoint_path = path + + return newline_checkpoint_path, projector_checkpoint_path + +def newline_criteria(checkpoint): + return any(k.startswith("model.image_newline") for k in checkpoint.keys()) + +def proj_criteria(checkpoint): + return any(k.startswith("model.mm_projector") or k.startswith("vision_proj.") for k in checkpoint.keys()) + # Command-line interface setup ap = argparse.ArgumentParser() @@ -81,25 +100,27 @@ def clean_vision_tower_from_checkpoint(checkpoint_path): model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True) # checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and path.startswith('pytorch')) or (path.endswith('.safetensors') and path.startswith('model'))] checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])] - for last_checkpoint_path in checkpoint_paths: - print(f"Cleaning {last_checkpoint_path}") - if not clean_vision_tower_from_checkpoint(last_checkpoint_path): - print(f"No vision tower found in {last_checkpoint_path}") + for projector_checkpoint_path in checkpoint_paths: + print(f"Cleaning {projector_checkpoint_path}") + if not clean_vision_tower_from_checkpoint(projector_checkpoint_path): + print(f"No vision tower found in {projector_checkpoint_path}") # we break once none is found, so far all models append them at the end - break + # break print("Done! 
All vision tower tensors are removed from the model files and stored in llava.clip file.") # Now we look for the projector in the last checkpoint model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True) checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])] -last_checkpoint_path = checkpoint_paths[0] -first_checkpoint_path = checkpoint_paths[-1] +# last_checkpoint_path = checkpoint_paths[0] +# first_checkpoint_path = checkpoint_paths[-1] +newline_checkpoint_path, projector_checkpoint_path = find_relevant_checkpoints(checkpoint_paths, newline_criteria, proj_criteria) -print(f"Taking projector from {last_checkpoint_path}") +print(f"Taking projector from {projector_checkpoint_path}") +print(f"Taking newline from {newline_checkpoint_path}") # Load the checkpoint -first_checkpoint, file_type = load_model(first_checkpoint_path) -last_checkpoint, file_type = load_model(last_checkpoint_path) +first_checkpoint, file_type = load_model(newline_checkpoint_path) +last_checkpoint, file_type = load_model(projector_checkpoint_path) mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("model.mm_projector") or k.startswith("vision_proj.")] first_mm_tensors = [k for k, v in first_checkpoint.items() if k.startswith("model.image_newline")] @@ -129,9 +150,9 @@ def clean_vision_tower_from_checkpoint(checkpoint_path): del first_checkpoint[name] if len(mm_tensors) > 0: - save_model(last_checkpoint, last_checkpoint_path, file_type) + save_model(last_checkpoint, projector_checkpoint_path, file_type) if len(first_mm_tensors) > 0: - save_model(first_checkpoint, first_checkpoint_path, file_type) + save_model(first_checkpoint, newline_checkpoint_path, file_type) print("Done!") print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.") From 37a147ebf9c492af646bba349ee0d26e76bd6035 Mon Sep 17 00:00:00 2001 From: John <78893154+cmp-nct@users.noreply.github.com> Date: Thu, 8 Feb 2024 07:42:49 +0100 Subject: [PATCH 08/24] Clip: Bugfix for normalization (it did not loat the 3 std and mean values) Clip: bicubic resize function Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6) Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final convert-image-encoder: fixed image-grid flattening --- examples/llava/clip.cpp | 581 ++++++++++++++++-- examples/llava/clip.h | 41 +- .../llava/convert-image-encoder-to-gguf.py | 3 +- examples/llava/llava-surgery-v2.py | 5 +- examples/llava/llava.cpp | 260 +++++++- examples/server/server.cpp | 5 +- 6 files changed, 839 insertions(+), 56 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 9129052a223bb..8193945ee43d7 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -71,6 +71,11 @@ static std::string format(const char * fmt, ...) 
{ #define KEY_IMAGE_STD "clip.vision.image_std" #define KEY_PROJ_TYPE "clip.projector_type" +#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" +#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" +#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution" + + // // tensor name constants // @@ -94,6 +99,7 @@ static std::string format(const char * fmt, ...) { #define TN_LLAVA_PROJ "mm.%d.%s" #define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s" #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" +#define TN_IMAGE_NEWLINE "model.image_newline" enum projector_type { @@ -233,26 +239,6 @@ static projector_type clip_projector_type_from_string(const std::string & name) return PROJECTOR_TYPE_UNKNOWN; } -// -// image data -// - -// RGB uint8 image -struct clip_image_u8 { - int nx; - int ny; - - std::vector buf; -}; - -// RGB float32 image (NHWC) -// Memory layout: RGBRGBRGB... -struct clip_image_f32 { - int nx; - int ny; - - std::vector buf; -}; // // clip layers @@ -309,6 +295,7 @@ struct clip_vision_model { struct ggml_tensor * mm_0_b = NULL; struct ggml_tensor * mm_2_w = NULL; struct ggml_tensor * mm_2_b = NULL; + struct ggml_tensor * image_newline = NULL; // Yi type models with mlp+normalization projection struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4 @@ -370,6 +357,10 @@ struct clip_ctx { ggml_allocr * compute_alloc = NULL; }; +const struct clip_vision_hparams clip_get_vision_hparams(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams; +} + static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) { if (!ctx->has_vision_encoder) { printf("This gguf file seems to have no vision encoder\n"); @@ -382,6 +373,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 const int image_size = hparams.image_size; const int patch_size = hparams.patch_size; const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); + const int num_patches_per_side = image_size / patch_size; const int num_positions = num_patches + 1; const int hidden_size = hparams.hidden_size; const int n_head = hparams.n_head; @@ -582,7 +574,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); embeddings = ggml_gelu(ctx0, embeddings); - embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); @@ -966,12 +957,37 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE); hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision")); hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision")); + try { + int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS); + int n = gguf_get_arr_n(ctx, idx); + const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx); + for (int i = 0; i < 32 && pinpoints[i] != 0; ++i) { + hparams.image_grid_pinpoints[i] = pinpoints[i]; + } + hparams.image_grid_pinpoints[n] = 0; + } catch (std::runtime_error & e) { + hparams.image_grid_pinpoints[0]=0; + } + try { + int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE); + strcpy(hparams.mm_patch_merge_type, gguf_get_val_str(ctx, idx)); + } catch (std::runtime_error & e) { + strcpy(hparams.mm_patch_merge_type, "flat"); + } + try { + hparams.image_crop_resolution = get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6 + } + catch(const std::exception& e) { + 
hparams.image_crop_resolution = hparams.image_size; + } int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN); int idx_std = get_key_idx(ctx, KEY_IMAGE_STD); + const float * mean_data = (const float *)gguf_get_arr_data(ctx, idx_mean); + const float * std_data = (const float *)gguf_get_arr_data(ctx, idx_std); for (int i = 0; i < 3; ++i) { - new_clip->image_mean[i] = *((const float *)gguf_get_arr_data(ctx, idx_mean)); - new_clip->image_std[i] = *((const float *)gguf_get_arr_data(ctx, idx_std)); + new_clip->image_mean[i] = mean_data[i]; + new_clip->image_std[i] = std_data[i]; } if (verbosity >= 2) { @@ -983,14 +999,30 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { printf("v_projection_dim %d\n", hparams.projection_dim); printf("v_n_head %d\n", hparams.n_head); printf("v_n_layer %d\n", hparams.n_layer); - } - - vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); - vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD); - vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); - vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); - vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); + printf("v_eps %f\n", hparams.eps); + printf("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]); + printf("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]); + printf("v_image_grid_pinpoints: "); + for (int i = 0; i < 32 & hparams.image_grid_pinpoints[i]!=0; ++i) { + printf("%d ", hparams.image_grid_pinpoints[i]); + } + printf("\n"); + printf("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type); + } + try + { + vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); + vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD); + vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); + vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); + vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); + } + catch(const std::exception& e) + { + fprintf(stderr, "%s: failed to load vision model tensors\n", __func__); + } + // LLaVA projection if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) { vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight")); @@ -1015,6 +1047,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight")); vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias")); } catch (std::runtime_error & e) { } + try { + vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE); + // fprintf(stderr, "%s: image_newline tensor (llava-1.6) found\n", __func__); + } catch (std::runtime_error & e) { } } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { // MobileVLM projection @@ -1134,13 +1170,423 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length return true; } + +void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) { + std::cerr << "Failed to open file for writing: " << filename << std::endl; + 
return; + } + + // PPM header: P6 format, width, height, and max color value + file << "P6\n" << img.nx << " " << img.ny << "\n255\n"; + + // Write pixel data + for (size_t i = 0; i < img.buf.size(); i += 3) { + // PPM expects binary data in RGB format, which matches our image buffer + file.write(reinterpret_cast(&img.buf[i]), 3); + } + + file.close(); +} +void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) { + std::cerr << "Failed to open file for writing: " << filename << std::endl; + return; + } + + int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data + int bytesPerPixel = 3; + int widthInBytes = img.nx * bytesPerPixel; + int paddingAmount = (4 - (widthInBytes % 4)) % 4; + int stride = widthInBytes + paddingAmount; + + // Bitmap file header + unsigned char fileHeader[14] = { + 'B','M', // Signature + 0,0,0,0, // Image file size in bytes + 0,0,0,0, // Reserved + 54,0,0,0 // Start of pixel array + }; + + // Total file size + fileSize = 54 + (stride * img.ny); + fileHeader[2] = (unsigned char)(fileSize); + fileHeader[3] = (unsigned char)(fileSize >> 8); + fileHeader[4] = (unsigned char)(fileSize >> 16); + fileHeader[5] = (unsigned char)(fileSize >> 24); + + // Bitmap information header (BITMAPINFOHEADER) + unsigned char infoHeader[40] = { + 40,0,0,0, // Size of this header (40 bytes) + 0,0,0,0, // Image width + 0,0,0,0, // Image height + 1,0, // Number of color planes + 24,0, // Bits per pixel + 0,0,0,0, // No compression + 0,0,0,0, // Image size (can be 0 for no compression) + 0,0,0,0, // X pixels per meter (not specified) + 0,0,0,0, // Y pixels per meter (not specified) + 0,0,0,0, // Total colors (color table not used) + 0,0,0,0 // Important colors (all are important) + }; + + // Width and height in the information header + infoHeader[4] = (unsigned char)(img.nx); + infoHeader[5] = (unsigned char)(img.nx >> 8); + infoHeader[6] = (unsigned char)(img.nx >> 16); + infoHeader[7] = (unsigned char)(img.nx >> 24); + infoHeader[8] = (unsigned char)(img.ny); + infoHeader[9] = (unsigned char)(img.ny >> 8); + infoHeader[10] = (unsigned char)(img.ny >> 16); + infoHeader[11] = (unsigned char)(img.ny >> 24); + + // Write file headers + file.write(reinterpret_cast(fileHeader), sizeof(fileHeader)); + file.write(reinterpret_cast(infoHeader), sizeof(infoHeader)); + + // Pixel data + std::vector padding(3, 0); // Max padding size to be added to each row + for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top + for (int x = 0; x < img.nx; ++x) { + // Each pixel + size_t pixelIndex = (y * img.nx + x) * 3; + unsigned char pixel[3] = { + img.buf[pixelIndex + 2], // BMP stores pixels in BGR format + img.buf[pixelIndex + 1], + img.buf[pixelIndex] + }; + file.write(reinterpret_cast(pixel), 3); + } + // Write padding for the row + file.write(reinterpret_cast(padding.data()), paddingAmount); + } + + file.close(); +} + +// Linear interpolation between two points +inline float lerp(float s, float e, float t) { + return s + (e - s) * t; +} +// Bilinear resize function +void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) { + dst.nx = target_width; + dst.ny = target_height; + dst.buf.resize(3 * target_width * target_height); + + float x_ratio = static_cast(src.nx - 1) / target_width; + float y_ratio = static_cast(src.ny - 1) / target_height; + + for (int y = 0; y < target_height; y++) { + for (int x = 0; x < 
target_width; x++) { + float px = x_ratio * x; + float py = y_ratio * y; + int x_floor = static_cast(px); + int y_floor = static_cast(py); + float x_lerp = px - x_floor; + float y_lerp = py - y_floor; + + for (int c = 0; c < 3; c++) { + float top = lerp( + static_cast(src.buf[3 * (y_floor * src.nx + x_floor) + c]), + static_cast(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]), + x_lerp + ); + float bottom = lerp( + static_cast(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]), + static_cast(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]), + x_lerp + ); + dst.buf[3 * (y * target_width + x) + c] = static_cast(lerp(top, bottom, y_lerp)); + } + } + } +} + +// for replication purposes `.to(model.device, dtype=torch.float16)` +// converts a float to half precision and back to float +float simulateFloat16Precision(float value) { + // Convert float32 to float16 + uint32_t f32 = *reinterpret_cast(&value); + uint32_t sign = (f32 >> 16) & 0x8000; // Top bit (sign bit) + uint32_t exponent = ((f32 >> 23) & 0xFF) - 112; // Adjust bias (112 is bias of float16, 127 is bias of float32) + uint32_t mantissa = (f32 >> 13) & 0x3FF; // Keep top 10 bits (10 bits of precision in float16, 23 in float32) + + // Handle overflow/underflow + if ((f32 & 0x7FFFFFFF) > 0x477FE000) { // Not representable + exponent = 0x1F; + mantissa = 0; + } else if ((f32 & 0x7FFFFFFF) < 0x38800000) { // Too small for normal half precision + exponent = 0; + mantissa = 0; + } + + uint16_t f16 = sign | (exponent << 10) | mantissa; + + // Convert back to float32 + uint32_t sign32 = (f16 & 0x8000) << 16; + uint32_t exponent32 = ((f16 >> 10) & 0x1F); + uint32_t mantissa32 = (f16 & 0x3FF) << 13; + + // Adjust bias back + exponent32 = exponent32 == 0 ? 0 : exponent32 + 112; + + uint32_t f32Result = sign32 | (exponent32 << 23) | mantissa32; + float result = *reinterpret_cast(&f32Result); + + return result; +} +// Normalize image to float32 - supports float16 replication as in pytorch .to(model.device, dtype=torch.float16) +void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, const float mean[3], const float std[3], bool replicate_float16) { + dst->nx = src->nx; + dst->ny = src->ny; + dst->buf.resize(src->buf.size()); + + for (size_t i = 0; i < src->buf.size(); ++i) { + int c = i % 3; // rgb + dst->buf[i] = (static_cast(src->buf[i]) / 255.0f - mean[c]) / std[c]; + + if (replicate_float16) { + dst->buf[i] = simulateFloat16Precision(dst->buf[i]); + } + } +} +inline float clip(float x, float lower, float upper) +{ + return std::max(lower, std::min(x, upper)); +} +bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_width, int target_height) +{ + const int nx = img.nx; + const int ny = img.ny; + + dst.nx = target_width; + dst.ny = target_height; + dst.buf.resize(3 * target_width * target_height); + + int a, b, c, d, index; + float Ca, Cb, Cc; + float C[5]; + float d0, d2, d3, a0, a1, a2, a3; + int i, j, k, ii, jj; + int x, y; + float dx, dy; + float tx, ty; + + tx = (float)nx / (float)target_width; + ty = (float)ny / (float)target_height; + + float scale = std::max(tx, ty); + + // Bicubic interpolation; adapted from ViT.cpp, inspired from : + // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36 + // -> https://en.wikipedia.org/wiki/Bicubic_interpolation + + for (i = 0; i < target_height; i++) + { + for (j = 0; j < target_width; j++) + { + x = (int)(tx * j); + y = (int)(ty * i); + + dx = tx * j - x; + dy = ty * i - y; + + index = (y * nx 
+ x) * 3; + a = (y * nx + (x + 1)) * 3; + b = ((y + 1) * nx + x) * 3; + c = ((y + 1) * nx + (x + 1)) * 3; + + for (k = 0; k < 3; k++) + { + for (jj = 0; jj <= 3; jj++) + { + d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + + a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx; + + d0 = C[0] - C[1]; + d2 = C[2] - C[1]; + d3 = C[3] - C[1]; + a0 = C[1]; + a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy; + + const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f); + dst.buf[(i * target_width + j) * 3 + k] = float(Cc2); + } + } + } + } + + return true; +} + +// llava-1.6 type of resize_and_pad (black) +void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &image_output, const std::pair& target_resolution) { + int target_width = target_resolution.first; + int target_height = target_resolution.second; + + float scale_w = static_cast(target_width) / image.nx; + float scale_h = static_cast(target_height) / image.ny; + + int new_width, new_height; + + if (scale_w < scale_h) { + new_width = target_width; + new_height = std::min(static_cast(std::ceil(image.ny * scale_w)), target_height); + } else { + new_height = target_height; + new_width = std::min(static_cast(std::ceil(image.nx * scale_h)), target_width); + } + + clip_image_u8 resized_image; + // bilinear_resize(image, resized_image, new_width, new_height); + bicubic_resize(image, resized_image, new_width, new_height); + + clip_image_u8 padded_image; + padded_image.nx = target_width; + padded_image.ny = target_height; + padded_image.buf.resize(3 * target_width * target_height, 0); // Initialize with black + + // Calculate padding offsets + int pad_x = (target_width - new_width) / 2; + int pad_y = (target_height - new_height) / 2; + + // Copy the resized image into the center of the padded buffer + for (int y = 0; y < new_height; ++y) { + for (int x = 0; x < new_width; ++x) { + for (int c = 0; c < 3; ++c) { + padded_image.buf[3 * ((y + pad_y) * target_width + (x + pad_x)) + c] = resized_image.buf[3 * (y * new_width + x) + c]; + } + } + } + + image_output = std::move(padded_image); +} + + +/** + * Selects the best resolution from a list of possible resolutions based on the original size. + * + * @param original_size The original size of the image in the format (width, height). + * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...]. + * @return The best fit resolution in the format (width, height). 
+ */ +static std::pair select_best_resolution(const std::pair& original_size, const std::vector>& possible_resolutions) { + int original_width = original_size.first; + int original_height = original_size.second; + std::pair best_fit; + int max_effective_resolution = 0; + int min_wasted_resolution = std::numeric_limits::max(); + + for (const auto& resolution : possible_resolutions) { + int width = resolution.first; + int height = resolution.second; + float scale = std::min(static_cast(width) / original_width, static_cast(height) / original_height); + int downscaled_width = static_cast(original_width * scale); + int downscaled_height = static_cast(original_height * scale); + int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); + int wasted_resolution = (width * height) - effective_resolution; + // fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); + if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { + max_effective_resolution = effective_resolution; + min_wasted_resolution = wasted_resolution; + best_fit = resolution; + } + } + + return best_fit; +} + + +std::vector divide_to_patches_u8(const clip_image_u8& image, int patch_size) { + std::vector patches; + int width = image.nx; + int height = image.ny; + for (int i = 0; i < height; i += patch_size) { + for (int j = 0; j < width; j += patch_size) { + clip_image_u8 *patch = clip_image_u8_init(); + patch->nx = std::min(patch_size, width - j); + patch->ny = std::min(patch_size, height - i); + patch->buf.resize(3 * patch->nx * patch->ny); + for (int y = 0; y < patch->ny; ++y) { + for (int x = 0; x < patch->nx; ++x) { + for (int c = 0; c < 3; ++c) { + patch->buf[3 * (y * patch->nx + x) + c] = image.buf[3 * ((i + y) * width + (j + x)) + c]; + } + } + } + patches.push_back(patch); + } + } + return patches; +} + + +// debug function to convert f32 to u8 +void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) { + dst.nx = src.nx; + dst.ny = src.ny; + dst.buf.resize(3 * src.nx * src.ny); + for (size_t i = 0; i < src.buf.size(); ++i) { + dst.buf[i] = static_cast(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255)); + } +} + +/** + * @brief Get the anyres image grid shape object + * + * @param image_size + * @param grid_pinpoints + * @param image_patch_size + * @return + */ +struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair& image_size, const std::vector>& grid_pinpoints, int image_patch_size) { + /** + Conversion from gguf flat array to vector: + std::vector> possible_resolutions; + for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) { + possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]}); + } + */ + auto best_resolution = select_best_resolution(image_size, grid_pinpoints); + return {best_resolution.first / image_patch_size, best_resolution.second / image_patch_size}; +} + + // normalize: x = (x - mean) / std // TODO: implement bicubic interpolation instead of linear. 
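As an aside, the grid-selection step introduced above can be exercised on its own. Below is a minimal standalone sketch of the same best-fit scoring followed by the grid-shape division, assuming a hypothetical pinpoint list and a hypothetical 1000x600 input (real values come from clip.vision.image_grid_pinpoints and the 336px CLIP input size); with these assumptions it picks 672x672, i.e. a 2x2 grid of 336px tiles:

#include <algorithm>
#include <cstdio>
#include <limits>
#include <utility>
#include <vector>

// same scoring as select_best_resolution(): maximize effective pixels, then minimize wasted area
static std::pair<int, int> pick_best_resolution(std::pair<int, int> original,
                                                const std::vector<std::pair<int, int>> & candidates) {
    std::pair<int, int> best{0, 0};
    int max_effective = 0;
    int min_wasted = std::numeric_limits<int>::max();
    for (const auto & res : candidates) {
        float scale = std::min((float)res.first  / original.first,
                               (float)res.second / original.second);
        int eff = std::min((int)(original.first * scale) * (int)(original.second * scale),
                           original.first * original.second);
        int wasted = res.first * res.second - eff;
        if (eff > max_effective || (eff == max_effective && wasted < min_wasted)) {
            max_effective = eff;
            min_wasted    = wasted;
            best          = res;
        }
    }
    return best;
}

int main() {
    const int image_size = 336; // CLIP input resolution per tile
    // hypothetical pinpoints and input image, for illustration only
    std::vector<std::pair<int, int>> pinpoints = {{336, 672}, {672, 336}, {672, 672}, {1008, 336}, {336, 1008}};
    std::pair<int, int> img = {1000, 600};
    auto best = pick_best_resolution(img, pinpoints);
    // grid shape as in get_anyres_image_grid_shape(): best resolution divided by the tile size
    printf("best: %dx%d -> grid %dx%d\n", best.first, best.second,
           best.first / image_size, best.second / image_size);
    return 0;
}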
-bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res, const bool pad2square) { +// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patche tensors as a vector +bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std::vector& res_tensor, bool pad2square) { if (!ctx->has_vision_encoder) { printf("This gguf file seems to have no vision encoder\n"); return false; } + auto & params = ctx->vision_model.hparams; + // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing + if (strcmp(params.mm_patch_merge_type, "spatial_unpad") == 0) { + pad2square = false; + } else { + // pad2square = true; // todo: consider automatic decisions on that options for all models + } + // free the previous res_tensor + if (res_tensor.size() > 0) { + for (size_t i = 0; i < res_tensor.size(); i++) { + clip_image_f32_free(res_tensor[i]); + } + res_tensor.clear(); + } // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 @@ -1151,7 +1597,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli temp->nx = longer_side; temp->ny = longer_side; temp->buf.resize(3 * longer_side * longer_side); - const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA + const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA (this is the mean rgb color * 255) // fill with background color for (size_t i = 0; i < temp->buf.size(); i++) { @@ -1169,18 +1615,65 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli } } } else { - temp->nx = img->nx; - temp->ny = img->ny; - temp->buf.resize(img->buf.size()); - memcpy(temp->buf.data(), img->buf.data(), temp->buf.size()); + if (params.image_grid_pinpoints[0] != 0) + { + // "spatial_unpad" with "anyres" processing for llava-1.6 + std::vector> possible_resolutions; + for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) { + possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]}); + } + std::pair best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions); + // fprintf(stderr, "%s - Working with resolution: %d %d\n", __func__, best_resolution.first, best_resolution.second); + // clip_image_save_to_bmp(*img, "input.bmp"); + resize_and_pad_image(*img, *temp, best_resolution); // we do not pad with mean-bg color anymore in llava-1.6 + // clip_image_save_to_bmp(*temp, "resized.bmp"); + // visually verify normalized image: + // normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std); + // { + // clip_image_u8 * temp2 = clip_image_u8_init(); + // clip_image_convert_f32_to_u8(*res, *temp2); + // clip_image_save_to_bmp(*temp2, "resized_normalized_f32.bmp"); + // clip_image_u8_free(temp2); + // } + + std::vector patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6) + // fprintf(stderr, "patches: %d, %d\n", patches.size(), params.image_size); + + clip_image_u8 *image_original_resize = clip_image_u8_init(); + // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is 
"shortest_edge", but all CLIP are square ? + bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square ? + patches.insert(patches.begin(), image_original_resize); + + res_tensor.clear(); + for (auto& patch : patches) { + clip_image_f32 *temp_image_f32 = clip_image_f32_init(); + normalize_image_u8_to_f32(patch, temp_image_f32, ctx->image_mean, ctx->image_std, true); + res_tensor.push_back(temp_image_f32); + } + + for (size_t i = 0; i < patches.size(); i++) { + // printf("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny); + clip_image_u8_free(patches[i]); + } + + clip_image_u8_free(temp); + + return true; + } else { + temp->nx = img->nx; + temp->ny = img->ny; + temp->buf.resize(img->buf.size()); + memcpy(temp->buf.data(), img->buf.data(), temp->buf.size()); + } } const int nx = temp->nx; const int ny = temp->ny; + // clip_image_save_to_bmp(*temp, "resized_vanilla.bmp"); const int nx2 = ctx->vision_model.hparams.image_size; const int ny2 = ctx->vision_model.hparams.image_size; - + clip_image_f32 * res = clip_image_f32_init(); res->nx = nx2; res->ny = ny2; res->buf.resize(3 * nx2 * ny2); @@ -1234,6 +1727,13 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli } clip_image_u8_free(temp); + // { + // clip_image_u8 * temp2 = clip_image_u8_init(); + // clip_image_convert_f32_to_u8(*res, *temp2); + // clip_image_save_to_bmp(*temp2, "resized_normalized_f32_vanilla.bmp"); + // clip_image_u8_free(temp2); + // } + res_tensor.push_back(res); return true; } @@ -1302,6 +1802,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i type = static_cast(itype); auto * ctx_clip = clip_model_load(fname_inp, 2); + const auto & ctx_src = ctx_clip->ctx_gguf; const auto & ctx_data = ctx_clip->ctx_data; @@ -1495,6 +1996,10 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { } } +ggml_tensor *clip_get_newline_tensor(const struct clip_ctx * ctx) { + return ctx->vision_model.image_newline; +} + int clip_n_patches(const struct clip_ctx * ctx) { auto & params = ctx->vision_model.hparams; int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); @@ -1506,4 +2011,4 @@ int clip_n_patches(const struct clip_ctx * ctx) { size_t clip_embd_nbytes(const struct clip_ctx * ctx) { return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float); -} +} \ No newline at end of file diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 458a256a107fe..09346b603b259 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -3,6 +3,8 @@ #include #include +#include +#include #ifdef LLAMA_SHARED # if defined(_WIN32) && !defined(__MINGW32__) @@ -32,10 +34,20 @@ struct clip_vision_hparams { int32_t projection_dim; int32_t n_head; int32_t n_layer; + float eps; + + char mm_patch_merge_type[32]="flat"; // spatial_unpad or flat (default) + int32_t image_grid_pinpoints[32]; + int32_t image_crop_resolution; + }; +struct clip_ctx; +CLIP_API const struct clip_vision_hparams clip_get_vision_hparams(const struct clip_ctx * ctx); + CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity); +CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity); CLIP_API void clip_free(struct clip_ctx * ctx); @@ -44,6 +56,24 @@ CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); CLIP_API int clip_n_patches (const struct clip_ctx * ctx); CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * 
ctx); +// RGB uint8 image +CLIP_API struct clip_image_u8 { + int nx; + int ny; + + std::vector buf; +}; + +// RGB float32 image (NHWC) +// Memory layout: RGBRGBRGB... + CLIP_API struct clip_image_f32 { + int nx; + int ny; + + std::vector buf; +}; + + struct clip_image_u8_batch { struct clip_image_u8 * data; size_t size; @@ -53,6 +83,10 @@ struct clip_image_f32_batch { struct clip_image_f32 * data; size_t size; }; +CLIP_API struct clip_image_grid_shape { + int first; + int second; +}; CLIP_API struct clip_image_u8 * clip_image_u8_init (); CLIP_API struct clip_image_f32 * clip_image_f32_init(); @@ -61,11 +95,16 @@ CLIP_API void clip_image_u8_free (struct clip_image_u8 * img); CLIP_API void clip_image_f32_free(struct clip_image_f32 * img); CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); +CLIP_API void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename); +CLIP_API void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst); /** interpret bytes as an image file with length bytes_length, and use the result to populate img */ CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); +/** preprocess img and store the result in res_tensor, pad2square may be overriden to false depending on model configuration */ +CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std::vector& res_tensor, bool pad2square); +CLIP_API struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair& image_size, const std::vector>& grid_pinpoints, int image_patch_size); +CLIP_API struct ggml_tensor *clip_get_newline_tensor(const struct clip_ctx * ctx); -CLIP_API bool clip_image_preprocess (struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, bool pad2square); CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec); CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec); diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py index 115b6b35b4da0..ea331f2fe9875 100644 --- a/examples/llava/convert-image-encoder-to-gguf.py +++ b/examples/llava/convert-image-encoder-to-gguf.py @@ -240,7 +240,8 @@ def bytes_to_unicode(): # flatten it image_grid_pinpoints = [] for pinpoint in v_hparams["image_grid_pinpoints"]: - image_grid_pinpoints.extend(pinpoint) + for p in pinpoint: + image_grid_pinpoints.append(p) fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints) if "image_crop_resolution" in v_hparams: fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"]) diff --git a/examples/llava/llava-surgery-v2.py b/examples/llava/llava-surgery-v2.py index a5850b96e77d9..6b4fac80d29d7 100644 --- a/examples/llava/llava-surgery-v2.py +++ b/examples/llava/llava-surgery-v2.py @@ -13,11 +13,12 @@ def is_safetensor_file(file_path): # Unified loading function def load_model(file_path): if is_safetensor_file(file_path): - # return safe_load(file_path,framework="pt", device="cpu"), 'safetensor' tensors = {} with safe_open(file_path, framework="pt", device="cpu") as f: for key in f.keys(): tensors[key] = f.get_tensor(key).clone() + # output shape + print(f"{key} : {tensors[key].shape}") return tensors, 'safetensor' else: return torch.load(file_path, map_location=torch.device('cpu')), 'pytorch' @@ 
-156,4 +157,4 @@ def proj_criteria(checkpoint): print("Done!") print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.") -print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") +print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") \ No newline at end of file diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index d42e7582e8c66..3a0c4a8a4a874 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -6,27 +6,261 @@ #include #include #include +#include #include "base64.hpp" +// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out) +static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) { + struct temp_model { + struct ggml_tensor *newline; + struct ggml_context * ctx; + } model; + + auto & vparams = clip_get_vision_hparams(ctx_clip); + auto num_patches_per_side = vparams.image_size / vparams.patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches) + int num_patches_width = grid_shape.first; // grid 1-4 + int num_patches_height = grid_shape.second; // grid 1-4 + + // TODO: size calculation is not calculated - it's only tens of MB + size_t ctx_size = 0; + { + ctx_size += clip_embd_nbytes(ctx_clip) * image_embd_v.size() * 8; // image_features + ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32); // + } + + struct ggml_init_params params { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, // NOTE: this should be false when using the legacy API + }; + + // Python reference for full unpad: + // base_image_feature = image_feature[0] + // image_feature = image_feature[1:] + // image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() + // image_feature = image_feature.flatten(1, 2).flatten(2, 3) + // image_feature = unpad_image(image_feature, image_sizes[image_idx]) + // image_feature = torch.cat(( + // image_feature, + // self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1) + // ), dim=-1) + // image_feature = image_feature.flatten(1, 2).transpose(0, 1) + // image_feature = torch.cat((base_image_feature, image_feature), dim=0) + + // embeddings -> tokens -> 24 x 24 + /** + * We now have two options: unpad or no unpad - unpad removes tokens for faster llm eval + * In terms of result quality it appears to make no difference, so we'll start with the easier approach given 5D tensors are not supported in ggml yet + * Without unpad we have to split the sub-image embeddings into patches of 24 features each and permute them. + * Once all images are processed to prepended the base_image_features without any changes. 
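+     * Worked example of the shapes involved, assuming a 2x2 grid, 24x24 patches per side and a 4096-dim projector:
+     *   - each encoded segment: 576 x 4096 (24*24 patch embeddings)
+     *   - the 4 sub-images stacked (base excluded): 4 x 576 x 4096, reordered into a 48 x 48 spatial patch grid -> 2304 x 4096
+     *   - final output: 576 base embeddings + 2304 grid embeddings + 4 appended newline embeddings = 2884 positions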
+ */ + /** + Pytorch reference simplified, modified for ggml compatibility - confirmed identical output in python (for a 2x2 grid image (676x676 scaling)) + # image_feature = image_feature.view(2, 2, 24, 24, 4096) + # image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous() + # image_feature = image_feature.view(2, 24, 2, 24, 4096) + # image_feature = image_feature.flatten(0, 3) + + # Reshape to 4D tensor by merging the last two dimensions + image_feature = image_feature.view(2, 2, 24, 24*4096) + image_feature = image_feature.permute(0, 2, 1, 3).contiguous() + image_feature = image_feature.view(-1, 4096) + * + */ + model.ctx = ggml_init(params); + + ggml_context *ctx_noalloc = ggml_init({2048, NULL, true}); + // struct ggml_tensor * image_features = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip) * (image_embd_v.size() - 1)); + + ggml_tensor *newline_tmp = clip_get_newline_tensor(ctx_clip); + model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]); + if (newline_tmp->backend != GGML_BACKEND_CPU) { + if (newline_tmp->buffer == NULL) { + printf("newline_tmp tensor buffer is NULL\n"); + } + ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp)); + } else + { + model.newline->data = newline_tmp->data; + if (model.newline->data == NULL) { + printf("newline_tmp tensor data is NULL\n"); + } + } + + struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, image_embd_v.size() - 1, clip_n_patches(ctx_clip), clip_n_mmproj_embd(ctx_clip)); + // fill it with the image embeddings, ignoring the first + for (int i = 1; i < image_embd_v.size(); i++) + { + // printf("Copying image_embd_v[%d] to image_features tensor\n", i); + size_t offset = (i-1) * clip_embd_nbytes(ctx_clip); + + // for debugging we now try and set the entire tensor row to 0.0001f,0.0002f,0.0003f,0.0004f etc: + // float *floatPtr = static_cast(image_embd_v[i]); + // for (int j = 0; j < clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip); j++) + // { + // // floatPtr[j] = (j + 1) / 10000.0f; + // int feature = j % clip_n_mmproj_embd(ctx_clip) + 1; + // floatPtr[j] = i + feature / 10000.0f; + // } + memcpy((uint8_t *)(image_features->data) + offset, image_embd_v[i], clip_embd_nbytes(ctx_clip)); + } + // printf("image_features size = %d\n", clip_embd_nbytes(ctx_clip) * (image_embd_v.size() - 1)); + + struct ggml_cgraph * gf = ggml_new_graph(model.ctx); + // image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1) + size_t size_ele = ggml_type_size(GGML_TYPE_F32); + // struct ggml_tensor *dummy = ggml_new_tensor_4d(ctx_noalloc, GGML_TYPE_F32, num_patches_height, num_patches_width, num_patches_per_side, num_patches_per_side * clip_n_mmproj_embd(ctx_clip)); + + struct ggml_tensor *image_features_view = ggml_view_4d(model.ctx, image_features, + num_patches_height, // nb0 : 4 byte für jedes + num_patches_width, + num_patches_per_side * num_patches_per_side, + clip_n_mmproj_embd(ctx_clip), + + size_ele * num_patches_height, + size_ele * num_patches_height * num_patches_width, + size_ele * num_patches_height * num_patches_width * num_patches_per_side, + 0); + + struct ggml_tensor *image_features_patchview = ggml_view_4d(model.ctx, image_features, + num_patches_height, + num_patches_width, + num_patches_per_side, + num_patches_per_side * clip_n_mmproj_embd(ctx_clip), + + size_ele * num_patches_height, + size_ele * num_patches_height * num_patches_width, + size_ele * 
num_patches_height * num_patches_width * num_patches_per_side, 0); + + struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3)); + permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, permuted_cont, 0, 2, 1, 3)); // permute back to before - todo: fix bug + + struct ggml_tensor *prepared = ggml_view_2d(model.ctx, permuted_cont, num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, clip_n_mmproj_embd(ctx_clip), size_ele * num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, 0); + struct ggml_tensor *prepared_cont = ggml_cont(model.ctx, prepared); // not needed + // struct ggml_tensor *prepared_cont = prepared; // the view only flattens + + ggml_build_forward_expand(gf, prepared_cont); + + ggml_graph_compute_with_ctx(model.ctx, gf, 1); + + struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1]; + // ggml_tensor_printf(image_features,"image_features",__LINE__,false,true); + // ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,true); + // ggml_tensor_printf(prepared_cont,"prepared_cont",__LINE__,false,true); + + memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context + // append without newline tokens: + // memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip) * (image_embd_v.size()-1)); // grid patches + // append with newline tokens: + for (size_t i = 0; i < image_embd_v.size() - 1; ++i) { + // we append with +1 offset (base image is prepended) + memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip) * (i+1) + model.newline->ne[0] * i, + (float*)prepared_cont->data + i * clip_n_mmproj_embd(ctx_clip) * clip_n_patches(ctx_clip), + clip_embd_nbytes(ctx_clip)); + memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip) * (i+2) + model.newline->ne[0] * i , + (float*)model.newline->data, + ggml_nbytes(model.newline)); + } + + size_t newline_tokens = image_embd_v.size()-1; + *n_img_pos_out = prepared_cont->ne[0]+clip_n_patches(ctx_clip) + newline_tokens; + + // Debug: Test single segments + // Current findings: sending base image, sending a segment embedding all works similar to python + // However, permuted embeddings do not work yet (stride issue?) 
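+    // Sizing note, derived from the copies above (using a 2x2 grid with 576 patches per segment as an example):
+    // the caller-provided image_embd_out must hold (grid_w*grid_h + 1) * 576 + grid_w*grid_h embeddings,
+    // e.g. 5*576 + 4 = 2884, which fits within the clip_embd_nbytes(ctx_clip)*6 over-allocation
+    // used further down in llava_image_embed_make_with_clip_img.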
+ // memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as context + // memcpy(image_embd_out, (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip)); // main image as context + // *n_img_pos_out=576; + + ggml_free(model.ctx); + + return true; +} + + static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) { - clip_image_f32 * img_res = clip_image_f32_init(); - if (!clip_image_preprocess(ctx_clip, img, img_res, /*pad2square =*/ true)) { + std::vector img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336 + if (!clip_image_preprocess(ctx_clip, img, img_res_v, /*pad2square =*/ true)) { fprintf(stderr, "%s: unable to preprocess image\n", __func__); - clip_image_f32_free(img_res); + for (auto img_res : img_res_v) { + clip_image_f32_free(img_res); + } return false; } - *n_img_pos = clip_n_patches(ctx_clip); - const int64_t t_img_enc_start_us = ggml_time_us(); - bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); - clip_image_f32_free(img_res); - if (!encoded) { - fprintf(stderr, "Unable to encode image\n"); + auto & vparams = clip_get_vision_hparams(ctx_clip); + // DEBUG print the "shape" and the first 10 rows and 10 cols of img_res_v in exp format + // for (int i = 0; i < img_res_v.size(); i++) + // { + // printf("img_res_v[%d] shape: %d x %d\n", i, img_res_v[i]->nx, img_res_v[i]->ny); + // for (int j = 0; j < 10; j++) + // { + // for (int k = 0; k < 10; k++) + // { + // printf("%e ", img_res_v[i]->buf[j*img_res_v[i]->ny + k]); + // } + // printf("\n"); + // } + // } + + if (strcmp(vparams.mm_patch_merge_type, "spatial_unpad") != 0) + { + // flat / default llava-1.5 type embedding + *n_img_pos = clip_n_patches(ctx_clip); + bool encoded = clip_image_encode(ctx_clip, n_threads, img_res_v[0], image_embd); // image_embd shape is 576 x 4096 + clip_image_f32_free(img_res_v[0]); + if (!encoded) { + fprintf(stderr, "Unable to encode image\n"); + + return false; + } + } else + { + // spatial_unpad llava-1.6 type embedding + // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working + std::vector image_embd_v; + image_embd_v.resize(img_res_v.size()); + for (int i = 0; i < img_res_v.size(); i++) + { + image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184 + bool encoded = clip_image_encode(ctx_clip, n_threads, img_res_v[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside + clip_image_f32_free(img_res_v[i]); + if (!encoded) { + fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", i+1, img_res_v.size()); + return false; + } + } + const int64_t t_img_enc_batch_us = ggml_time_us(); + printf("%s: %d segments encoded in %8.2f ms\n", __func__, img_res_v.size(), (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); + + + std::vector> grid_pinpoints; + for (int i = 0; i < 32 && vparams.image_grid_pinpoints[i] != 0; i+=2) { + grid_pinpoints.push_back({vparams.image_grid_pinpoints[i], vparams.image_grid_pinpoints[i+1]}); + } + img_res_v.clear(); + struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, vparams.image_size); + + int n_img_pos_out; + handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, 
&n_img_pos_out); + *n_img_pos = n_img_pos_out; + + for (int i = 0; i < image_embd_v.size(); i++) + { + free(image_embd_v[i]); + } + image_embd_v.clear(); + + // debug image/segment/normalization content: + // clip_image_u8 * tmp = clip_image_u8_init(); + // clip_image_convert_f32_to_u8(*image_feature, *tmp); + // clip_image_save_to_bmp(*tmp, "image_feature.bmp"); - return false; } + printf("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); + const int64_t t_img_enc_end_us = ggml_time_us(); float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0; @@ -36,6 +270,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli return true; } + + bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) { // make sure that the correct mmproj was used, i.e., compare apples to apples int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama)); @@ -48,7 +284,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * } static bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) { - float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)); + float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model if (!image_embd) { fprintf(stderr, "Unable to allocate memory for image embeddings\n"); free(image_embd); @@ -151,7 +387,7 @@ LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct return NULL; } - auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length); + llava_image_embed *embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length); free(image_bytes); return embed; diff --git a/examples/server/server.cpp b/examples/server/server.cpp index ea77125eac99d..353bd89760819 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -943,13 +943,14 @@ struct llama_server_context { continue; } - clip_image_f32 * img_res = clip_image_f32_init(); - if (!clip_image_preprocess(clp_ctx, img.img_data, img_res, /*pad2square =*/ true)) + std::vector img_res_v; + if (!clip_image_preprocess(clp_ctx, img.img_data, img_res_v, /*pad2square =*/ true)) { LOG_TEE("Error processing the given image"); clip_free(clp_ctx); return false; } + clip_image_f32 * img_res = img_res_v[0]; img.image_tokens = clip_n_patches(clp_ctx); img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx)); if (!img.image_embedding) From 7dcadb4ec3fa233fc13b581acf533a19a3dc7480 Mon Sep 17 00:00:00 2001 From: John Date: Sun, 11 Feb 2024 03:30:36 +0100 Subject: [PATCH 09/24] whitespace corrections --- examples/llava/clip.cpp | 36 +++++++------- examples/llava/clip.h | 2 +- .../llava/convert-image-encoder-to-gguf.py | 2 +- examples/llava/llava-surgery-v2.py | 7 ++- examples/llava/llava.cpp | 49 +++++++++---------- 5 files changed, 46 insertions(+), 50 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 8193945ee43d7..7a7374cd8d64a 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1,7 +1,6 @@ // NOTE: This is modified from clip.cpp only for LLaVA, // so there might be still unnecessary artifacts hanging around // I'll gradually clean and extend it - #include "clip.h" #include "ggml.h" #include "ggml-alloc.h" @@ -965,7 +964,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { 
hparams.image_grid_pinpoints[i] = pinpoints[i]; } hparams.image_grid_pinpoints[n] = 0; - } catch (std::runtime_error & e) { + } catch (std::runtime_error & e) { hparams.image_grid_pinpoints[0]=0; } try { @@ -979,7 +978,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } catch(const std::exception& e) { hparams.image_crop_resolution = hparams.image_size; - } + } int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN); int idx_std = get_key_idx(ctx, KEY_IMAGE_STD); @@ -1022,7 +1021,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { { fprintf(stderr, "%s: failed to load vision model tensors\n", __func__); } - + // LLaVA projection if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) { vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight")); @@ -1270,12 +1269,12 @@ void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filenam inline float lerp(float s, float e, float t) { return s + (e - s) * t; } -// Bilinear resize function +// Bilinear resize function void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) { dst.nx = target_width; dst.ny = target_height; dst.buf.resize(3 * target_width * target_height); - + float x_ratio = static_cast(src.nx - 1) / target_width; float y_ratio = static_cast(src.ny - 1) / target_height; @@ -1343,11 +1342,11 @@ void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, co dst->nx = src->nx; dst->ny = src->ny; dst->buf.resize(src->buf.size()); - + for (size_t i = 0; i < src->buf.size(); ++i) { int c = i % 3; // rgb dst->buf[i] = (static_cast(src->buf[i]) / 255.0f - mean[c]) / std[c]; - + if (replicate_float16) { dst->buf[i] = simulateFloat16Precision(dst->buf[i]); } @@ -1546,15 +1545,15 @@ void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) /** * @brief Get the anyres image grid shape object - * - * @param image_size - * @param grid_pinpoints - * @param image_patch_size - * @return + * + * @param image_size + * @param grid_pinpoints + * @param image_patch_size + * @return */ struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair& image_size, const std::vector>& grid_pinpoints, int image_patch_size) { /** - Conversion from gguf flat array to vector: + Conversion from gguf flat array to vector: std::vector> possible_resolutions; for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) { possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]}); @@ -1628,7 +1627,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std resize_and_pad_image(*img, *temp, best_resolution); // we do not pad with mean-bg color anymore in llava-1.6 // clip_image_save_to_bmp(*temp, "resized.bmp"); // visually verify normalized image: - // normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std); + // normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std); // { // clip_image_u8 * temp2 = clip_image_u8_init(); // clip_image_convert_f32_to_u8(*res, *temp2); @@ -1638,7 +1637,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std std::vector patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6) // fprintf(stderr, "patches: %d, %d\n", patches.size(), params.image_size); - + clip_image_u8 
*image_original_resize = clip_image_u8_init(); // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square ? bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square ? @@ -1655,9 +1654,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std // printf("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny); clip_image_u8_free(patches[i]); } - + clip_image_u8_free(temp); - + return true; } else { temp->nx = img->nx; @@ -1802,7 +1801,6 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i type = static_cast(itype); auto * ctx_clip = clip_model_load(fname_inp, 2); - const auto & ctx_src = ctx_clip->ctx_gguf; const auto & ctx_data = ctx_clip->ctx_data; diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 09346b603b259..c1981bb5d1574 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -38,7 +38,7 @@ struct clip_vision_hparams { float eps; char mm_patch_merge_type[32]="flat"; // spatial_unpad or flat (default) - int32_t image_grid_pinpoints[32]; + int32_t image_grid_pinpoints[32]; int32_t image_crop_resolution; }; diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py index ea331f2fe9875..61a14703702ad 100644 --- a/examples/llava/convert-image-encoder-to-gguf.py +++ b/examples/llava/convert-image-encoder-to-gguf.py @@ -234,7 +234,7 @@ def bytes_to_unicode(): # 1008, 336, # 336, 1008 # ] - # * + # * # */ if "image_grid_pinpoints" in v_hparams: # flatten it diff --git a/examples/llava/llava-surgery-v2.py b/examples/llava/llava-surgery-v2.py index 6b4fac80d29d7..e94d10a55ddbf 100644 --- a/examples/llava/llava-surgery-v2.py +++ b/examples/llava/llava-surgery-v2.py @@ -4,7 +4,6 @@ import torch from safetensors.torch import load as safe_load, save as safe_save, safe_open, save_file - # Function to determine if file is a SafeTensor file def is_safetensor_file(file_path): return file_path.endswith('.safetensors') @@ -40,12 +39,12 @@ def clean_vision_tower_from_checkpoint(checkpoint_path): model_path = os.path.dirname(checkpoint_path) print(f"Searching for vision tower tensors in {checkpoint_path}") clip_tensors = [k for k, v in checkpoint.items() if (k.startswith("model.vision_tower") ) ] - + if len(clip_tensors) > 0: print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}") # Adapted for file type clip_path = os.path.join(model_path, "llava.clip") - + if os.path.exists(clip_path): existing_clip, _ = load_model(clip_path) else: @@ -142,7 +141,7 @@ def proj_criteria(checkpoint): projector[name] = last_checkpoint[name].float() for name in first_mm_tensors: projector[name] = first_checkpoint[name].float() - + save_model(projector, f"{args.model}/llava.projector", 'pytorch') for name in mm_tensors: diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 3a0c4a8a4a874..5ba9d072dfba9 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -14,21 +14,21 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) { struct temp_model { struct ggml_tensor *newline; - struct ggml_context * ctx; + struct ggml_context * ctx; } model; auto & vparams = clip_get_vision_hparams(ctx_clip); auto num_patches_per_side = vparams.image_size / 
vparams.patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches) int num_patches_width = grid_shape.first; // grid 1-4 int num_patches_height = grid_shape.second; // grid 1-4 - + // TODO: size calculation is not calculated - it's only tens of MB size_t ctx_size = 0; { ctx_size += clip_embd_nbytes(ctx_clip) * image_embd_v.size() * 8; // image_features - ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32); // + ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32); } - + struct ggml_init_params params { /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, @@ -47,7 +47,7 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb // ), dim=-1) // image_feature = image_feature.flatten(1, 2).transpose(0, 1) // image_feature = torch.cat((base_image_feature, image_feature), dim=0) - + // embeddings -> tokens -> 24 x 24 /** * We now have two options: unpad or no unpad - unpad removes tokens for faster llm eval @@ -66,13 +66,13 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb image_feature = image_feature.view(2, 2, 24, 24*4096) image_feature = image_feature.permute(0, 2, 1, 3).contiguous() image_feature = image_feature.view(-1, 4096) - * + * */ model.ctx = ggml_init(params); - + ggml_context *ctx_noalloc = ggml_init({2048, NULL, true}); // struct ggml_tensor * image_features = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip) * (image_embd_v.size() - 1)); - + ggml_tensor *newline_tmp = clip_get_newline_tensor(ctx_clip); model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]); if (newline_tmp->backend != GGML_BACKEND_CPU) { @@ -112,28 +112,28 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb size_t size_ele = ggml_type_size(GGML_TYPE_F32); // struct ggml_tensor *dummy = ggml_new_tensor_4d(ctx_noalloc, GGML_TYPE_F32, num_patches_height, num_patches_width, num_patches_per_side, num_patches_per_side * clip_n_mmproj_embd(ctx_clip)); - struct ggml_tensor *image_features_view = ggml_view_4d(model.ctx, image_features, - num_patches_height, // nb0 : 4 byte für jedes - num_patches_width, - num_patches_per_side * num_patches_per_side, - clip_n_mmproj_embd(ctx_clip), + struct ggml_tensor *image_features_view = ggml_view_4d(model.ctx, image_features, + num_patches_height, + num_patches_width, + num_patches_per_side * num_patches_per_side, + clip_n_mmproj_embd(ctx_clip), size_ele * num_patches_height, size_ele * num_patches_height * num_patches_width, size_ele * num_patches_height * num_patches_width * num_patches_per_side, 0); - - struct ggml_tensor *image_features_patchview = ggml_view_4d(model.ctx, image_features, - num_patches_height, - num_patches_width, - num_patches_per_side, + + struct ggml_tensor *image_features_patchview = ggml_view_4d(model.ctx, image_features, + num_patches_height, + num_patches_width, + num_patches_per_side, num_patches_per_side * clip_n_mmproj_embd(ctx_clip), - + size_ele * num_patches_height, size_ele * num_patches_height * num_patches_width, size_ele * num_patches_height * num_patches_width * num_patches_per_side, 0); - struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3)); + struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3)); permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, permuted_cont, 0, 2, 1, 3)); // permute back to before - todo: fix bug struct ggml_tensor 
*prepared = ggml_view_2d(model.ctx, permuted_cont, num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, clip_n_mmproj_embd(ctx_clip), size_ele * num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, 0); @@ -172,9 +172,8 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb // memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as context // memcpy(image_embd_out, (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip)); // main image as context // *n_img_pos_out=576; - - ggml_free(model.ctx); + ggml_free(model.ctx); return true; } @@ -205,7 +204,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli // } // } - if (strcmp(vparams.mm_patch_merge_type, "spatial_unpad") != 0) + if (strcmp(vparams.mm_patch_merge_type, "spatial_unpad") != 0) { // flat / default llava-1.5 type embedding *n_img_pos = clip_n_patches(ctx_clip); @@ -233,7 +232,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli } } const int64_t t_img_enc_batch_us = ggml_time_us(); - printf("%s: %d segments encoded in %8.2f ms\n", __func__, img_res_v.size(), (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); + printf("%s: %d segments encoded in %8.2f ms\n", __func__, img_res_v.size(), (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); std::vector> grid_pinpoints; @@ -260,7 +259,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli } printf("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); - + const int64_t t_img_enc_end_us = ggml_time_us(); float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0; From 7107b9098e3f1375a6647360d9c8ba41125c4973 Mon Sep 17 00:00:00 2001 From: John Date: Sun, 11 Feb 2024 03:44:07 +0100 Subject: [PATCH 10/24] ws --- examples/llava/clip.cpp | 6 +++--- examples/llava/llava-surgery-v2.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 7a7374cd8d64a..56d3fd0af9a27 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1277,7 +1277,7 @@ void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_wi float x_ratio = static_cast(src.nx - 1) / target_width; float y_ratio = static_cast(src.ny - 1) / target_height; - + for (int y = 0; y < target_height; y++) { for (int x = 0; x < target_width; x++) { float px = x_ratio * x; @@ -1654,7 +1654,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std // printf("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny); clip_image_u8_free(patches[i]); } - + clip_image_u8_free(temp); return true; @@ -2009,4 +2009,4 @@ int clip_n_patches(const struct clip_ctx * ctx) { size_t clip_embd_nbytes(const struct clip_ctx * ctx) { return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float); -} \ No newline at end of file +} diff --git a/examples/llava/llava-surgery-v2.py b/examples/llava/llava-surgery-v2.py index e94d10a55ddbf..f0ade4ceb357b 100644 --- a/examples/llava/llava-surgery-v2.py +++ b/examples/llava/llava-surgery-v2.py @@ -156,4 +156,4 @@ def proj_criteria(checkpoint): print("Done!") print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.") -print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") \ No newline at end of file +print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") From 
51e60c996f5cdce71c21eaf53da0f6afee87acd1 Mon Sep 17 00:00:00 2001 From: John Date: Mon, 12 Feb 2024 04:02:54 +0100 Subject: [PATCH 11/24] Tensors are now properly permuted. Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference. --- examples/llava/clip.cpp | 4 +- examples/llava/llava.cpp | 140 +++++++++++++-------------------------- 2 files changed, 48 insertions(+), 96 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 56d3fd0af9a27..60d8e8e802f05 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1,6 +1,7 @@ // NOTE: This is modified from clip.cpp only for LLaVA, // so there might be still unnecessary artifacts hanging around // I'll gradually clean and extend it +// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch #include "clip.h" #include "ggml.h" #include "ggml-alloc.h" @@ -1622,7 +1623,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]}); } std::pair best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions); - // fprintf(stderr, "%s - Working with resolution: %d %d\n", __func__, best_resolution.first, best_resolution.second); // clip_image_save_to_bmp(*img, "input.bmp"); resize_and_pad_image(*img, *temp, best_resolution); // we do not pad with mean-bg color anymore in llava-1.6 // clip_image_save_to_bmp(*temp, "resized.bmp"); @@ -1646,7 +1646,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std res_tensor.clear(); for (auto& patch : patches) { clip_image_f32 *temp_image_f32 = clip_image_f32_init(); - normalize_image_u8_to_f32(patch, temp_image_f32, ctx->image_mean, ctx->image_std, true); + normalize_image_u8_to_f32(patch, temp_image_f32, ctx->image_mean, ctx->image_std, false); // set to true for pytorch fp16 value replication res_tensor.push_back(temp_image_f32); } diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 5ba9d072dfba9..42d00082b8c6b 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -34,44 +34,40 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, // NOTE: this should be false when using the legacy API }; - - // Python reference for full unpad: - // base_image_feature = image_feature[0] - // image_feature = image_feature[1:] - // image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() - // image_feature = image_feature.flatten(1, 2).flatten(2, 3) - // image_feature = unpad_image(image_feature, image_sizes[image_idx]) - // image_feature = torch.cat(( - // image_feature, - // self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1) - // ), dim=-1) - // image_feature = image_feature.flatten(1, 2).transpose(0, 1) - // image_feature = torch.cat((base_image_feature, image_feature), dim=0) - - // embeddings -> tokens -> 24 x 24 - /** - * We now have two options: unpad or no unpad - unpad removes tokens for faster llm eval - * In terms of result quality it appears to make no difference, so we'll start with the easier approach given 5D tensors are not supported in ggml yet - * Without unpad we have to split the sub-image embeddings into patches of 24 features each and permute them. 
- * Once all images are processed to prepended the base_image_features without any changes. - */ - /** - Pytorch reference simplified, modified for ggml compatibility - confirmed identical output in python (for a 2x2 grid image (676x676 scaling)) - # image_feature = image_feature.view(2, 2, 24, 24, 4096) - # image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous() - # image_feature = image_feature.view(2, 24, 2, 24, 4096) - # image_feature = image_feature.flatten(0, 3) - - # Reshape to 4D tensor by merging the last two dimensions + // Python reference code for full unpad: + /* + base_image_feature = image_feature[0] + image_feature = image_feature[1:] + image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() + image_feature = image_feature.flatten(1, 2).flatten(2, 3) + image_feature = unpad_image(image_feature, image_sizes[image_idx]) + image_feature = torch.cat(( + image_feature, + self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1) + ), dim=-1) + image_feature = image_feature.flatten(1, 2).transpose(0, 1) + image_feature = torch.cat((base_image_feature, image_feature), dim=0) + */ + // We now have two options: unpad or no unpad. Unpad removes tokens for faster llm eval. + // In terms of result quality it appears to make no difference, so we'll start with the easier approach given 5D tensors are not supported in ggml yet. + // Without unpad we have to split the sub-image embeddings into patches of 24 features each and permute them. + // Once all images are processed to prepended the base_image_features without any changes. + + // Pytorch reference simplified, modified for ggml compatibility - confirmed identical output in python (for a 2x2 grid image (676x676 scaling)) + /* + image_feature = image_feature.view(2, 2, 24, 24, 4096) + image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous() + image_feature = image_feature.view(2, 24, 2, 24, 4096) + image_feature = image_feature.flatten(0, 3) + + // Reshape to 4D tensor by merging the last two dimensions image_feature = image_feature.view(2, 2, 24, 24*4096) image_feature = image_feature.permute(0, 2, 1, 3).contiguous() image_feature = image_feature.view(-1, 4096) - * - */ - model.ctx = ggml_init(params); + */ + model.ctx = ggml_init(params); ggml_context *ctx_noalloc = ggml_init({2048, NULL, true}); - // struct ggml_tensor * image_features = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip) * (image_embd_v.size() - 1)); ggml_tensor *newline_tmp = clip_get_newline_tensor(ctx_clip); model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]); @@ -88,83 +84,39 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb } } - struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, image_embd_v.size() - 1, clip_n_patches(ctx_clip), clip_n_mmproj_embd(ctx_clip)); - // fill it with the image embeddings, ignoring the first + struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), image_embd_v.size() - 1); // example: 4096 x 576 x 4 + // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false); + // fill it with the image embeddings, ignoring the base for (int i = 1; i < image_embd_v.size(); i++) { - // printf("Copying image_embd_v[%d] to image_features tensor\n", i); size_t offset = (i-1) * clip_embd_nbytes(ctx_clip); - - // for debugging we now try and set the entire tensor row to 
0.0001f,0.0002f,0.0003f,0.0004f etc: - // float *floatPtr = static_cast(image_embd_v[i]); - // for (int j = 0; j < clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip); j++) - // { - // // floatPtr[j] = (j + 1) / 10000.0f; - // int feature = j % clip_n_mmproj_embd(ctx_clip) + 1; - // floatPtr[j] = i + feature / 10000.0f; - // } memcpy((uint8_t *)(image_features->data) + offset, image_embd_v[i], clip_embd_nbytes(ctx_clip)); } - // printf("image_features size = %d\n", clip_embd_nbytes(ctx_clip) * (image_embd_v.size() - 1)); struct ggml_cgraph * gf = ggml_new_graph(model.ctx); - // image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1) size_t size_ele = ggml_type_size(GGML_TYPE_F32); - // struct ggml_tensor *dummy = ggml_new_tensor_4d(ctx_noalloc, GGML_TYPE_F32, num_patches_height, num_patches_width, num_patches_per_side, num_patches_per_side * clip_n_mmproj_embd(ctx_clip)); - - struct ggml_tensor *image_features_view = ggml_view_4d(model.ctx, image_features, - num_patches_height, - num_patches_width, - num_patches_per_side * num_patches_per_side, - clip_n_mmproj_embd(ctx_clip), - - size_ele * num_patches_height, - size_ele * num_patches_height * num_patches_width, - size_ele * num_patches_height * num_patches_width * num_patches_per_side, - 0); struct ggml_tensor *image_features_patchview = ggml_view_4d(model.ctx, image_features, - num_patches_height, - num_patches_width, - num_patches_per_side, num_patches_per_side * clip_n_mmproj_embd(ctx_clip), - - size_ele * num_patches_height, - size_ele * num_patches_height * num_patches_width, - size_ele * num_patches_height * num_patches_width * num_patches_per_side, 0); - + num_patches_per_side, + num_patches_width, + num_patches_height, + size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip), + size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side, + size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side * num_patches_width, 0); + // ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,false); struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3)); - permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, permuted_cont, 0, 2, 1, 3)); // permute back to before - todo: fix bug - - struct ggml_tensor *prepared = ggml_view_2d(model.ctx, permuted_cont, num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, clip_n_mmproj_embd(ctx_clip), size_ele * num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, 0); - struct ggml_tensor *prepared_cont = ggml_cont(model.ctx, prepared); // not needed - // struct ggml_tensor *prepared_cont = prepared; // the view only flattens - - ggml_build_forward_expand(gf, prepared_cont); - + // ggml_tensor_printf(permuted_cont,"permuted_cont",__LINE__,false,false); + struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0); + // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false); + ggml_build_forward_expand(gf, flatten); ggml_graph_compute_with_ctx(model.ctx, gf, 1); - struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1]; - // ggml_tensor_printf(image_features,"image_features",__LINE__,false,true); - // 
ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,true); - // ggml_tensor_printf(prepared_cont,"prepared_cont",__LINE__,false,true); memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context - // append without newline tokens: - // memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip) * (image_embd_v.size()-1)); // grid patches - // append with newline tokens: - for (size_t i = 0; i < image_embd_v.size() - 1; ++i) { - // we append with +1 offset (base image is prepended) - memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip) * (i+1) + model.newline->ne[0] * i, - (float*)prepared_cont->data + i * clip_n_mmproj_embd(ctx_clip) * clip_n_patches(ctx_clip), - clip_embd_nbytes(ctx_clip)); - memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip) * (i+2) + model.newline->ne[0] * i , - (float*)model.newline->data, - ggml_nbytes(model.newline)); - } - - size_t newline_tokens = image_embd_v.size()-1; - *n_img_pos_out = prepared_cont->ne[0]+clip_n_patches(ctx_clip) + newline_tokens; + // append without newline tokens (default behavior in llava_arch when not using unpad ): + memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (image_embd_v.size()-1)); // grid patches + *n_img_pos_out = result->ne[1]+clip_n_patches(ctx_clip); // Debug: Test single segments // Current findings: sending base image, sending a segment embedding all works similar to python From 60c5f46ba734391f62005d2b20ff8d791e1fcdae Mon Sep 17 00:00:00 2001 From: John Date: Mon, 12 Feb 2024 04:04:57 +0100 Subject: [PATCH 12/24] ws --- examples/llava/llava.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 42d00082b8c6b..4ba89eb97cf56 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -101,7 +101,7 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb num_patches_per_side, num_patches_width, num_patches_height, - size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip), + size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip), size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side, size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side * num_patches_width, 0); // ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,false); From 0dd6c9da2a81337ec112f4c96627c01742a08943 Mon Sep 17 00:00:00 2001 From: John Date: Mon, 12 Feb 2024 04:34:51 +0100 Subject: [PATCH 13/24] added verbose_prompt support into cli added stopwords for llava-1.6 into cli --- examples/llava/llava-cli.cpp | 26 ++++++++++++++++++++++++-- examples/llava/llava.cpp | 6 +++--- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 6ac70ba69e281..04fe6bef05ea2 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -167,11 +167,29 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ } printf("system_prompt: %s\n", system_prompt.c_str()); + if (params->verbose_prompt) { + auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true); + for (int i = 0; i < (int) tmp.size(); i++) { + printf("%6d -> '%s'\n", tmp[i], 
llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + } + } printf("user_prompt: %s\n", user_prompt.c_str()); + if (params->verbose_prompt) { + auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); + for (int i = 0; i < (int) tmp.size(); i++) { + printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + } + } } else { // llava-1.5 native mode system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:"; user_prompt = prompt + "\nASSISTANT:"; + if (params->verbose_prompt) { + auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); + for (int i = 0; i < (int) tmp.size(); i++) { + printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + } + } } eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos); @@ -183,13 +201,17 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ fprintf(stderr, "\n"); struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams); - + std::string response = ""; for (int i = 0; i < max_tgt_len; i++) { const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past); + response += tmp; if (strcmp(tmp, "") == 0) break; if (strstr(tmp, "###")) break; // Yi-VL behavior - printf("%s", tmp); + if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works) + if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6 + if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6 + fflush(stdout); } diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 4ba89eb97cf56..ff99a688e8605 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -116,7 +116,7 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context // append without newline tokens (default behavior in llava_arch when not using unpad ): memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (image_embd_v.size()-1)); // grid patches - *n_img_pos_out = result->ne[1]+clip_n_patches(ctx_clip); + *n_img_pos_out = static_cast(result->ne[1]+clip_n_patches(ctx_clip)); // Debug: Test single segments // Current findings: sending base image, sending a segment embedding all works similar to python @@ -179,12 +179,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli bool encoded = clip_image_encode(ctx_clip, n_threads, img_res_v[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside clip_image_f32_free(img_res_v[i]); if (!encoded) { - fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", i+1, img_res_v.size()); + fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", i+1, (int)img_res_v.size()); return false; } } const int64_t t_img_enc_batch_us = ggml_time_us(); - printf("%s: %d segments encoded in %8.2f ms\n", __func__, img_res_v.size(), (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); + printf("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size(), (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); std::vector> 
grid_pinpoints; From 3a722678690952ed922a4dde8693a00882f1890a Mon Sep 17 00:00:00 2001 From: John Date: Tue, 13 Feb 2024 00:29:17 +0100 Subject: [PATCH 14/24] moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed --- examples/llava/clip.cpp | 141 ++++++++++++++----------------------- examples/llava/clip.h | 27 +------ examples/llava/llava.cpp | 136 ++++++++++++++++++++++++++--------- examples/server/server.cpp | 29 ++++++-- 4 files changed, 184 insertions(+), 149 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 60d8e8e802f05..ad12bd8c514fb 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -31,6 +31,25 @@ #include #include +// #define CLIP_DEBUG_FUNCTIONS + +// RGB uint8 image +struct clip_image_u8 { + int nx; + int ny; + + std::vector buf; +}; + +// RGB float32 image (NHWC) +// Memory layout: RGBRGBRGB... +struct clip_image_f32 { + int nx; + int ny; + + std::vector buf; +}; + static std::string format(const char * fmt, ...) { va_list ap; va_list ap2; @@ -961,10 +980,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS); int n = gguf_get_arr_n(ctx, idx); const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx); - for (int i = 0; i < 32 && pinpoints[i] != 0; ++i) { + for (int i = 0; i < 32 && i < n && pinpoints[i] != 0; ++i) { hparams.image_grid_pinpoints[i] = pinpoints[i]; } - hparams.image_grid_pinpoints[n] = 0; + if (n < 32) + hparams.image_grid_pinpoints[n] = 0; } catch (std::runtime_error & e) { hparams.image_grid_pinpoints[0]=0; } @@ -1170,7 +1190,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length return true; } - +#ifdef CLIP_DEBUG_FUNCTIONS void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { std::ofstream file(filename, std::ios::binary); if (!file.is_open()) { @@ -1265,6 +1285,7 @@ void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filenam file.close(); } +#endif // Linear interpolation between two points inline float lerp(float s, float e, float t) { @@ -1305,41 +1326,8 @@ void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_wi } } -// for replication purposes `.to(model.device, dtype=torch.float16)` -// converts a float to half precision and back to float -float simulateFloat16Precision(float value) { - // Convert float32 to float16 - uint32_t f32 = *reinterpret_cast(&value); - uint32_t sign = (f32 >> 16) & 0x8000; // Top bit (sign bit) - uint32_t exponent = ((f32 >> 23) & 0xFF) - 112; // Adjust bias (112 is bias of float16, 127 is bias of float32) - uint32_t mantissa = (f32 >> 13) & 0x3FF; // Keep top 10 bits (10 bits of precision in float16, 23 in float32) - - // Handle overflow/underflow - if ((f32 & 0x7FFFFFFF) > 0x477FE000) { // Not representable - exponent = 0x1F; - mantissa = 0; - } else if ((f32 & 0x7FFFFFFF) < 0x38800000) { // Too small for normal half precision - exponent = 0; - mantissa = 0; - } - - uint16_t f16 = sign | (exponent << 10) | mantissa; - - // Convert back to float32 - uint32_t sign32 = (f16 & 0x8000) << 16; - uint32_t exponent32 = ((f16 >> 10) & 0x1F); - uint32_t mantissa32 = (f16 & 0x3FF) << 13; - - // Adjust bias back - exponent32 = exponent32 == 0 ? 
0 : exponent32 + 112; - - uint32_t f32Result = sign32 | (exponent32 << 23) | mantissa32; - float result = *reinterpret_cast(&f32Result); - - return result; -} -// Normalize image to float32 - supports float16 replication as in pytorch .to(model.device, dtype=torch.float16) -void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, const float mean[3], const float std[3], bool replicate_float16) { +// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not +void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, const float mean[3], const float std[3]) { dst->nx = src->nx; dst->ny = src->ny; dst->buf.resize(src->buf.size()); @@ -1347,12 +1335,9 @@ void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, co for (size_t i = 0; i < src->buf.size(); ++i) { int c = i % 3; // rgb dst->buf[i] = (static_cast(src->buf[i]) / 255.0f - mean[c]) / std[c]; - - if (replicate_float16) { - dst->buf[i] = simulateFloat16Precision(dst->buf[i]); - } } } + inline float clip(float x, float lower, float upper) { return std::max(lower, std::min(x, upper)); @@ -1471,7 +1456,6 @@ void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &image_outpu } } } - image_output = std::move(padded_image); } @@ -1533,7 +1517,7 @@ std::vector divide_to_patches_u8(const clip_image_u8& image, int return patches; } - +#ifdef CLIP_DEBUG_FUNCTIONS // debug function to convert f32 to u8 void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) { dst.nx = src.nx; @@ -1543,32 +1527,12 @@ void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) dst.buf[i] = static_cast(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255)); } } +#endif -/** - * @brief Get the anyres image grid shape object - * - * @param image_size - * @param grid_pinpoints - * @param image_patch_size - * @return - */ -struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair& image_size, const std::vector>& grid_pinpoints, int image_patch_size) { - /** - Conversion from gguf flat array to vector: - std::vector> possible_resolutions; - for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) { - possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]}); - } - */ - auto best_resolution = select_best_resolution(image_size, grid_pinpoints); - return {best_resolution.first / image_patch_size, best_resolution.second / image_patch_size}; -} - - -// normalize: x = (x - mean) / std -// TODO: implement bicubic interpolation instead of linear. 
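// Illustrative aside, not part of the patch: a hypothetical check of the formula that
// normalize_image_u8_to_f32() applies per channel, assuming the usual CLIP normalization
// constants for the red channel (mean 0.48145466, std 0.26862954). The helper name is
// made up for illustration only.
static float example_normalized_red(uint8_t r) {
    // x = (x / 255 - mean) / std; r = 255 maps to roughly 1.93
    return (static_cast<float>(r) / 255.0f - 0.48145466f) / 0.26862954f;
}
// With the float16 round-trip (simulateFloat16Precision) removed above, this plain
// float32 value is what the preprocessing now produces.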
-// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patche tensors as a vector -bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std::vector& res_tensor, bool pad2square) { +// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector +// res_imgs memory is being allocated here, previous allocations will be freed if found +bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch& res_imgs ) { + bool pad_to_square = true; if (!ctx->has_vision_encoder) { printf("This gguf file seems to have no vision encoder\n"); return false; @@ -1576,23 +1540,23 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std auto & params = ctx->vision_model.hparams; // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing if (strcmp(params.mm_patch_merge_type, "spatial_unpad") == 0) { - pad2square = false; - } else { - // pad2square = true; // todo: consider automatic decisions on that options for all models + pad_to_square = false; } - // free the previous res_tensor - if (res_tensor.size() > 0) { - for (size_t i = 0; i < res_tensor.size(); i++) { - clip_image_f32_free(res_tensor[i]); + // free the previous res_imgs if any set + if (res_imgs.size > 0 && res_imgs.size < 100) { + for (size_t i = 0; i < res_imgs.size; i++) { + clip_image_f32_free(&(res_imgs.data[i])); } - res_tensor.clear(); + delete[] res_imgs.data; } + res_imgs.data = nullptr; + res_imgs.size = 0; // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 clip_image_u8 * temp = clip_image_u8_init(); // we will keep the input image data here temporarily - if (pad2square && img->nx != img->ny) { + if (pad_to_square && img->nx != img->ny) { int longer_side = std::max(img->nx, img->ny); temp->nx = longer_side; temp->ny = longer_side; @@ -1636,18 +1600,18 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std // } std::vector patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6) - // fprintf(stderr, "patches: %d, %d\n", patches.size(), params.image_size); clip_image_u8 *image_original_resize = clip_image_u8_init(); - // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square ? - bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square ? 
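// Illustrative aside, not part of the patch: how the number of preprocessed tensors falls
// out of the anyres flow above (select_best_resolution -> resize_and_pad_image ->
// divide_to_patches_u8, plus the resized base image that is prepended just below).
// The helper name is hypothetical.
static size_t anyres_num_tensors(int best_w, int best_h, int image_size) {
    // grid patches of image_size x image_size, plus one prepended base image
    return (size_t)(best_w / image_size) * (best_h / image_size) + 1;
}
// e.g. a best resolution of 672x672 with image_size 336 gives 2*2 + 1 = 5 tensors,
// which is the res_imgs.size that clip_image_preprocess() ends up returning in that case.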
+ // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square + bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square patches.insert(patches.begin(), image_original_resize); - - res_tensor.clear(); + // clip_image_f32_batch_init(patches.size()); + res_imgs.size = patches.size(); + res_imgs.data = new clip_image_f32[res_imgs.size]; + int num=0; for (auto& patch : patches) { - clip_image_f32 *temp_image_f32 = clip_image_f32_init(); - normalize_image_u8_to_f32(patch, temp_image_f32, ctx->image_mean, ctx->image_std, false); // set to true for pytorch fp16 value replication - res_tensor.push_back(temp_image_f32); + normalize_image_u8_to_f32(patch, &res_imgs.data[num], ctx->image_mean, ctx->image_std); + num++; } for (size_t i = 0; i < patches.size(); i++) { @@ -1732,7 +1696,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std // clip_image_save_to_bmp(*temp2, "resized_normalized_f32_vanilla.bmp"); // clip_image_u8_free(temp2); // } - res_tensor.push_back(res); + // res_imgs.push_back(res); + res_imgs.size = 1; + res_imgs.data = new clip_image_f32[res_imgs.size]; + res_imgs.data[0] = std::move(*res); return true; } diff --git a/examples/llava/clip.h b/examples/llava/clip.h index c1981bb5d1574..2d1858bbd4082 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -3,8 +3,6 @@ #include #include -#include -#include #ifdef LLAMA_SHARED # if defined(_WIN32) && !defined(__MINGW32__) @@ -56,24 +54,6 @@ CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); CLIP_API int clip_n_patches (const struct clip_ctx * ctx); CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); -// RGB uint8 image -CLIP_API struct clip_image_u8 { - int nx; - int ny; - - std::vector buf; -}; - -// RGB float32 image (NHWC) -// Memory layout: RGBRGBRGB... 
- CLIP_API struct clip_image_f32 { - int nx; - int ny; - - std::vector buf; -}; - - struct clip_image_u8_batch { struct clip_image_u8 * data; size_t size; @@ -95,14 +75,11 @@ CLIP_API void clip_image_u8_free (struct clip_image_u8 * img); CLIP_API void clip_image_f32_free(struct clip_image_f32 * img); CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); -CLIP_API void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename); -CLIP_API void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst); /** interpret bytes as an image file with length bytes_length, and use the result to populate img */ CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); -/** preprocess img and store the result in res_tensor, pad2square may be overriden to false depending on model configuration */ -CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std::vector& res_tensor, bool pad2square); -CLIP_API struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair& image_size, const std::vector>& grid_pinpoints, int image_patch_size); +/** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */ +CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch& res_imgs ); CLIP_API struct ggml_tensor *clip_get_newline_tensor(const struct clip_ctx * ctx); CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec); diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index ff99a688e8605..699fd256a8ae2 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -10,8 +10,78 @@ #include "base64.hpp" +// RGB uint8 image +struct clip_image_u8 { + int nx; + int ny; + + std::vector buf; +}; + +// RGB float32 image (NHWC) +// Memory layout: RGBRGBRGB... +struct clip_image_f32 { + int nx; + int ny; + + std::vector buf; +}; + +/** + * Selects the best resolution from a list of possible resolutions based on the original size. + * + * @param original_size The original size of the image in the format (width, height). + * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...]. + * @return The best fit resolution in the format (width, height). 
+ */ +static std::pair select_best_resolution(const std::pair& original_size, const std::vector>& possible_resolutions) { + int original_width = original_size.first; + int original_height = original_size.second; + std::pair best_fit; + int max_effective_resolution = 0; + int min_wasted_resolution = std::numeric_limits::max(); + + for (const auto& resolution : possible_resolutions) { + int width = resolution.first; + int height = resolution.second; + float scale = std::min(static_cast(width) / original_width, static_cast(height) / original_height); + int downscaled_width = static_cast(original_width * scale); + int downscaled_height = static_cast(original_height * scale); + int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); + int wasted_resolution = (width * height) - effective_resolution; + // fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); + if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { + max_effective_resolution = effective_resolution; + min_wasted_resolution = wasted_resolution; + best_fit = resolution; + } + } + + return best_fit; +} +/** + * @brief Get the anyres image grid shape object + * + * @param image_size + * @param grid_pinpoints + * @param image_patch_size + * @return + */ +struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair& image_size, const std::vector>& grid_pinpoints, int image_patch_size) { + /** + Conversion from gguf flat array to vector: + std::vector> possible_resolutions; + for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) { + possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]}); + } + */ + auto best_resolution = select_best_resolution(image_size, grid_pinpoints); + return {best_resolution.first / image_patch_size, best_resolution.second / image_patch_size}; +} + + // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out) -static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) { +static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) { struct temp_model { struct ggml_tensor *newline; struct ggml_context * ctx; @@ -21,11 +91,12 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb auto num_patches_per_side = vparams.image_size / vparams.patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches) int num_patches_width = grid_shape.first; // grid 1-4 int num_patches_height = grid_shape.second; // grid 1-4 + const size_t num_images = num_patches_width + num_patches_height + 1; // TODO: size calculation is not calculated - it's only tens of MB size_t ctx_size = 0; { - ctx_size += clip_embd_nbytes(ctx_clip) * image_embd_v.size() * 8; // image_features + ctx_size += clip_embd_nbytes(ctx_clip) * num_images * 8; // image_features ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32); } @@ -84,10 +155,10 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb } } - struct ggml_tensor * image_features = 
ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), image_embd_v.size() - 1); // example: 4096 x 576 x 4 + struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4 // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false); // fill it with the image embeddings, ignoring the base - for (int i = 1; i < image_embd_v.size(); i++) + for (int i = 1; i < num_images; i++) { size_t offset = (i-1) * clip_embd_nbytes(ctx_clip); memcpy((uint8_t *)(image_features->data) + offset, image_embd_v[i], clip_embd_nbytes(ctx_clip)); @@ -106,6 +177,15 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side * num_patches_width, 0); // ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,false); struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3)); + /** + At the end of each row we have to add the row_end embeddings, which are the same as the newline embeddings + image_feature = torch.cat(( + image_feature, + self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device) + ), dim=-1) + * + */ + // ggml_tensor_printf(permuted_cont,"permuted_cont",__LINE__,false,false); struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0); // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false); @@ -115,7 +195,7 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context // append without newline tokens (default behavior in llava_arch when not using unpad ): - memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (image_embd_v.size()-1)); // grid patches + memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches *n_img_pos_out = static_cast(result->ne[1]+clip_n_patches(ctx_clip)); // Debug: Test single segments @@ -131,37 +211,25 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) { - std::vector img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336 - if (!clip_image_preprocess(ctx_clip, img, img_res_v, /*pad2square =*/ true)) { + // std::vector img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336 + clip_image_f32_batch img_res_v; + img_res_v.size = 0; + img_res_v.data = nullptr; + if (!clip_image_preprocess(ctx_clip, img, img_res_v)) { fprintf(stderr, "%s: unable to preprocess image\n", __func__); - for (auto img_res : img_res_v) { - clip_image_f32_free(img_res); - } + delete[] img_res_v.data; return false; } const int64_t t_img_enc_start_us = ggml_time_us(); auto & vparams = 
clip_get_vision_hparams(ctx_clip); - // DEBUG print the "shape" and the first 10 rows and 10 cols of img_res_v in exp format - // for (int i = 0; i < img_res_v.size(); i++) - // { - // printf("img_res_v[%d] shape: %d x %d\n", i, img_res_v[i]->nx, img_res_v[i]->ny); - // for (int j = 0; j < 10; j++) - // { - // for (int k = 0; k < 10; k++) - // { - // printf("%e ", img_res_v[i]->buf[j*img_res_v[i]->ny + k]); - // } - // printf("\n"); - // } - // } if (strcmp(vparams.mm_patch_merge_type, "spatial_unpad") != 0) { // flat / default llava-1.5 type embedding *n_img_pos = clip_n_patches(ctx_clip); - bool encoded = clip_image_encode(ctx_clip, n_threads, img_res_v[0], image_embd); // image_embd shape is 576 x 4096 - clip_image_f32_free(img_res_v[0]); + bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096 + delete[] img_res_v.data; if (!encoded) { fprintf(stderr, "Unable to encode image\n"); @@ -172,30 +240,32 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli // spatial_unpad llava-1.6 type embedding // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working std::vector image_embd_v; - image_embd_v.resize(img_res_v.size()); - for (int i = 0; i < img_res_v.size(); i++) + image_embd_v.resize(img_res_v.size); + for (int i = 0; i < img_res_v.size; i++) { image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184 - bool encoded = clip_image_encode(ctx_clip, n_threads, img_res_v[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside - clip_image_f32_free(img_res_v[i]); + bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside if (!encoded) { - fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", i+1, (int)img_res_v.size()); + fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", i+1, (int)img_res_v.size); return false; } } const int64_t t_img_enc_batch_us = ggml_time_us(); - printf("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size(), (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); + printf("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); std::vector> grid_pinpoints; for (int i = 0; i < 32 && vparams.image_grid_pinpoints[i] != 0; i+=2) { grid_pinpoints.push_back({vparams.image_grid_pinpoints[i], vparams.image_grid_pinpoints[i+1]}); } - img_res_v.clear(); + // free all img_res_v - not needed anymore + delete[] img_res_v.data; + img_res_v.size = 0; + img_res_v.data = nullptr; struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, vparams.image_size); int n_img_pos_out; - handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out); + clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out); *n_img_pos = n_img_pos_out; for (int i = 0; i < image_embd_v.size(); i++) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 353bd89760819..9148f6ca21331 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -31,6 +31,23 @@ using json = nlohmann::json; +// RGB uint8 image +struct clip_image_u8 { + int nx; + int ny; + + 
std::vector buf; +}; + +// RGB float32 image (NHWC) +// Memory layout: RGBRGBRGB... +struct clip_image_f32 { + int nx; + int ny; + + std::vector buf; +}; + struct server_params { std::string hostname = "127.0.0.1"; @@ -943,14 +960,17 @@ struct llama_server_context { continue; } - std::vector img_res_v; - if (!clip_image_preprocess(clp_ctx, img.img_data, img_res_v, /*pad2square =*/ true)) + clip_image_f32_batch img_res_v; + img_res_v.size = 0; + img_res_v.data = nullptr; + if (!clip_image_preprocess(clp_ctx, img.img_data, img_res_v)) { LOG_TEE("Error processing the given image"); clip_free(clp_ctx); + delete[] img_res_v.data; return false; } - clip_image_f32 * img_res = img_res_v[0]; + clip_image_f32 * img_res = &img_res_v.data[0]; img.image_tokens = clip_n_patches(clp_ctx); img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx)); if (!img.image_embedding) @@ -965,7 +985,8 @@ struct llama_server_context LOG_TEE("Unable to encode image\n"); return false; } - clip_image_f32_free(img_res); + // clip_image_f32_free(img_res); + delete[] img_res_v.data; img.request_encode_image = false; } From 07f5cd7beccf93d4f720d2c037cbc5ca86385cd5 Mon Sep 17 00:00:00 2001 From: John Date: Tue, 13 Feb 2024 00:35:31 +0100 Subject: [PATCH 15/24] ws --- examples/llava/llava.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 699fd256a8ae2..9f955e2ae5a62 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -183,7 +183,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector image_feature, self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device) ), dim=-1) - * + * */ // ggml_tensor_printf(permuted_cont,"permuted_cont",__LINE__,false,false); From 6b8d69b451feadb38972174980314d3da9e4f179 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 13 Feb 2024 19:58:44 +0200 Subject: [PATCH 16/24] convert : skip unknown tensors (need for LLaVA) --- convert.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/convert.py b/convert.py index 75c10011846e4..237f8d782570e 100755 --- a/convert.py +++ b/convert.py @@ -1195,7 +1195,9 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel: for name, lazy_tensor in model.items(): tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None) if name_new is None: - raise Exception(f"Unexpected tensor name: {name}") + #raise Exception(f"Unexpected tensor name: {name}") + print(f"Unexpected tensor name: {name} - skipping") + continue if tensor_type in should_skip: print(f"skipping tensor {name_new}") From a2848854a445c18b5339f2a928c59bb4cc8082d2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 13 Feb 2024 19:59:00 +0200 Subject: [PATCH 17/24] llava : update readme --- examples/llava/README.md | 6 +++++- examples/llava/convert-image-encoder-to-gguf.py | 8 ++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/examples/llava/README.md b/examples/llava/README.md index 323c5fdd02835..c1c030951f3dd 100644 --- a/examples/llava/README.md +++ b/examples/llava/README.md @@ -49,8 +49,12 @@ python ./convert.py ../llava-v1.5-7b Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` directory. +## LLaVA 1.6 + +- Use `llava-surgery-v2.py` + ## TODO -- [ ] Support non-CPU backend for the image encoding part. +- [x] Support non-CPU backend for the image encoding part. - [ ] Support different sampling methods. 
- [ ] Support more model variants. diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py index 61a14703702ad..3988da70c9731 100644 --- a/examples/llava/convert-image-encoder-to-gguf.py +++ b/examples/llava/convert-image-encoder-to-gguf.py @@ -78,9 +78,9 @@ def bytes_to_unicode(): help="Save a text-only model. It can't be used to encode images") ap.add_argument("--vision-only", action="store_true", required=False, help="Save a vision-only model. It can't be used to encode texts") -ap.add_argument("--clip_model_is_vision", action="store_true", required=False, +ap.add_argument("--clip-model-is-vision", action="store_true", required=False, help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") -ap.add_argument("--clip_model_is_openclip", action="store_true", required=False, +ap.add_argument("--clip-model-is-openclip", action="store_true", required=False, help="The clip model is from openclip (for ViT-SO400M type))") ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp") @@ -89,8 +89,8 @@ def bytes_to_unicode(): # Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5 default_image_mean = [0.48145466, 0.4578275, 0.40821073] default_image_std = [0.26862954, 0.26130258, 0.27577711] -ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) -ap.add_argument('--image_std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) +ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) +ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) # with proper args = ap.parse_args() From 65ec518d4120bc25425204d5834991ab9bca0639 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 13 Feb 2024 20:22:28 +0200 Subject: [PATCH 18/24] llava : fix compile warnings --- examples/llava/clip.cpp | 109 ++++++++++++++++++++++++--------------- examples/llava/clip.h | 27 ++++------ examples/llava/llava.cpp | 72 +++++++++++++------------- 3 files changed, 112 insertions(+), 96 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index ad12bd8c514fb..2baceda5da387 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -30,6 +30,7 @@ #include #include #include +#include // #define CLIP_DEBUG_FUNCTIONS @@ -242,7 +243,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { } } -static void print_tensor_info(const ggml_tensor* tensor, const char* prefix = "") { +static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") { size_t tensor_size = ggml_nbytes(tensor); printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n", prefix, ggml_n_dims(tensor), tensor->name, tensor_size, @@ -263,6 +264,24 @@ static projector_type clip_projector_type_from_string(const std::string & name) // clip layers // +struct clip_hparams { + int32_t image_size; + int32_t patch_size; + int32_t hidden_size; + int32_t n_intermediate; + int32_t projection_dim; + int32_t n_head; + int32_t n_layer; 
+ + float eps; + + char mm_patch_merge_type[32]="flat"; // spatial_unpad or flat (default) + + int32_t image_grid_pinpoints[32]; + int32_t image_crop_resolution; + +}; + struct clip_layer { // attention struct ggml_tensor * k_w; @@ -292,7 +311,7 @@ struct clip_layer { }; struct clip_vision_model { - struct clip_vision_hparams hparams; + struct clip_hparams hparams; // embeddings struct ggml_tensor * class_embedding; @@ -376,10 +395,6 @@ struct clip_ctx { ggml_allocr * compute_alloc = NULL; }; -const struct clip_vision_hparams clip_get_vision_hparams(const struct clip_ctx * ctx) { - return ctx->vision_model.hparams; -} - static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) { if (!ctx->has_vision_encoder) { printf("This gguf file seems to have no vision encoder\n"); @@ -392,7 +407,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 const int image_size = hparams.image_size; const int patch_size = hparams.patch_size; const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); - const int num_patches_per_side = image_size / patch_size; + const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side); const int num_positions = num_patches + 1; const int hidden_size = hparams.hidden_size; const int n_head = hparams.n_head; @@ -1292,7 +1307,7 @@ inline float lerp(float s, float e, float t) { return s + (e - s) * t; } // Bilinear resize function -void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) { +static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) { dst.nx = target_width; dst.ny = target_height; dst.buf.resize(3 * target_width * target_height); @@ -1327,7 +1342,7 @@ void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_wi } // Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not -void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, const float mean[3], const float std[3]) { +static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, const float mean[3], const float std[3]) { dst->nx = src->nx; dst->ny = src->ny; dst->buf.resize(src->buf.size()); @@ -1338,12 +1353,11 @@ void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, co } } -inline float clip(float x, float lower, float upper) -{ +inline float clip(float x, float lower, float upper) { return std::max(lower, std::min(x, upper)); } -bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_width, int target_height) -{ + +static bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_width, int target_height) { const int nx = img.nx; const int ny = img.ny; @@ -1351,11 +1365,10 @@ bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_wid dst.ny = target_height; dst.buf.resize(3 * target_width * target_height); - int a, b, c, d, index; - float Ca, Cb, Cc; + float Cc; float C[5]; float d0, d2, d3, a0, a1, a2, a3; - int i, j, k, ii, jj; + int i, j, k, jj; int x, y; float dx, dy; float tx, ty; @@ -1363,39 +1376,29 @@ bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_wid tx = (float)nx / (float)target_width; ty = (float)ny / (float)target_height; - float scale = std::max(tx, ty); - // Bicubic interpolation; adapted from ViT.cpp, 
inspired from : // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36 // -> https://en.wikipedia.org/wiki/Bicubic_interpolation - for (i = 0; i < target_height; i++) - { - for (j = 0; j < target_width; j++) - { + for (i = 0; i < target_height; i++) { + for (j = 0; j < target_width; j++) { x = (int)(tx * j); y = (int)(ty * i); dx = tx * j - x; dy = ty * i - y; - index = (y * nx + x) * 3; - a = (y * nx + (x + 1)) * 3; - b = ((y + 1) * nx + x) * 3; - c = ((y + 1) * nx + (x + 1)) * 3; - - for (k = 0; k < 3; k++) - { - for (jj = 0; jj <= 3; jj++) - { + for (k = 0; k < 3; k++) { + for (jj = 0; jj <= 3; jj++) { d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; - a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; - a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx; d0 = C[0] - C[1]; @@ -1403,8 +1406,8 @@ bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_wid d3 = C[3] - C[1]; a0 = C[1]; a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; - a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; - a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy; const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f); @@ -1418,7 +1421,7 @@ bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_wid } // llava-1.6 type of resize_and_pad (black) -void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &image_output, const std::pair& target_resolution) { +static void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &image_output, const std::pair& target_resolution) { int target_width = target_resolution.first; int target_height = target_resolution.second; @@ -1467,7 +1470,7 @@ void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &image_outpu * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...]. * @return The best fit resolution in the format (width, height). 
*/ -static std::pair select_best_resolution(const std::pair& original_size, const std::vector>& possible_resolutions) { +static std::pair select_best_resolution(const std::pair & original_size, const std::vector> & possible_resolutions) { int original_width = original_size.first; int original_height = original_size.second; std::pair best_fit; @@ -1494,7 +1497,7 @@ static std::pair select_best_resolution(const std::pair& ori } -std::vector divide_to_patches_u8(const clip_image_u8& image, int patch_size) { +static std::vector divide_to_patches_u8(const clip_image_u8 & image, int patch_size) { std::vector patches; int width = image.nx; int height = image.ny; @@ -1531,7 +1534,7 @@ void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector // res_imgs memory is being allocated here, previous allocations will be freed if found -bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch& res_imgs ) { +bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs ) { bool pad_to_square = true; if (!ctx->has_vision_encoder) { printf("This gguf file seems to have no vision encoder\n"); @@ -1710,6 +1713,30 @@ void clip_free(clip_ctx * ctx) { delete ctx; } +size_t clip_embd_nbytes(const struct clip_ctx * ctx) { + return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float); +} + +int32_t clip_image_size(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.image_size; +} + +int32_t clip_patch_size(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.patch_size; +} + +int32_t clip_hidden_size(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.hidden_size; +} + +const char * clip_patch_merge_type(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.mm_patch_merge_type; +} + +const int32_t * clip_image_grid(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.image_grid_pinpoints; +} + bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { if (!ctx->has_vision_encoder) { printf("This gguf file seems to have no vision encoder\n"); @@ -1973,7 +2000,3 @@ int clip_n_patches(const struct clip_ctx * ctx) { } return n_patches; } - -size_t clip_embd_nbytes(const struct clip_ctx * ctx) { - return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float); -} diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 2d1858bbd4082..5e0b5c64b57c5 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -24,25 +24,7 @@ struct clip_ctx; extern "C" { #endif -struct clip_vision_hparams { - int32_t image_size; - int32_t patch_size; - int32_t hidden_size; - int32_t n_intermediate; - int32_t projection_dim; - int32_t n_head; - int32_t n_layer; - - float eps; - - char mm_patch_merge_type[32]="flat"; // spatial_unpad or flat (default) - int32_t image_grid_pinpoints[32]; - int32_t image_crop_resolution; - -}; - struct clip_ctx; -CLIP_API const struct clip_vision_hparams clip_get_vision_hparams(const struct clip_ctx * ctx); CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity); CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity); @@ -51,6 +33,15 @@ CLIP_API void clip_free(struct clip_ctx * ctx); CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); 
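// Illustrative aside, not part of the patch: with clip_vision_hparams no longer exported,
// callers query scalar hyperparameters through the new accessors added here, which keeps
// this header free of C++ types and usable from C. A hypothetical caller, mirroring what
// llava.cpp does further down in this patch:
static int example_patches_per_side(const struct clip_ctx * ctx) {
    return clip_image_size(ctx) / clip_patch_size(ctx); // e.g. 336 / 14 = 24
}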
+CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx); +CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx); +CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx); + +// TODO: should be enum, not string +CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx); + +CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx); + CLIP_API int clip_n_patches (const struct clip_ctx * ctx); CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 9f955e2ae5a62..ea956ac005a97 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -2,14 +2,13 @@ #include "common.h" #include "llama.h" #include "llava.h" +#include "base64.hpp" #include #include #include #include -#include "base64.hpp" - // RGB uint8 image struct clip_image_u8 { int nx; @@ -35,8 +34,9 @@ struct clip_image_f32 { * @return The best fit resolution in the format (width, height). */ static std::pair select_best_resolution(const std::pair& original_size, const std::vector>& possible_resolutions) { - int original_width = original_size.first; + int original_width = original_size.first; int original_height = original_size.second; + std::pair best_fit; int max_effective_resolution = 0; int min_wasted_resolution = std::numeric_limits::max(); @@ -45,7 +45,7 @@ static std::pair select_best_resolution(const std::pair& ori int width = resolution.first; int height = resolution.second; float scale = std::min(static_cast(width) / original_width, static_cast(height) / original_height); - int downscaled_width = static_cast(original_width * scale); + int downscaled_width = static_cast(original_width * scale); int downscaled_height = static_cast(original_height * scale); int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); int wasted_resolution = (width * height) - effective_resolution; @@ -59,6 +59,7 @@ static std::pair select_best_resolution(const std::pair& ori return best_fit; } + /** * @brief Get the anyres image grid shape object * @@ -67,7 +68,7 @@ static std::pair select_best_resolution(const std::pair& ori * @param image_patch_size * @return */ -struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair& image_size, const std::vector>& grid_pinpoints, int image_patch_size) { +static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair & image_size, const std::vector> & grid_pinpoints, int image_patch_size) { /** Conversion from gguf flat array to vector: std::vector> possible_resolutions; @@ -79,22 +80,26 @@ struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) { - struct temp_model { - struct ggml_tensor *newline; + struct { + struct ggml_tensor * newline; struct ggml_context * ctx; } model; - auto & vparams = clip_get_vision_hparams(ctx_clip); - auto num_patches_per_side = vparams.image_size / vparams.patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches) - int num_patches_width = grid_shape.first; // grid 1-4 + const int32_t image_size = clip_image_size(ctx_clip); + const int32_t patch_size = clip_patch_size(ctx_clip); + + int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches) + + int num_patches_width = grid_shape.first; // grid 1-4 int num_patches_height = grid_shape.second; // grid 
1-4 + const size_t num_images = num_patches_width + num_patches_height + 1; // TODO: size calculation is not calculated - it's only tens of MB size_t ctx_size = 0; + { ctx_size += clip_embd_nbytes(ctx_clip) * num_images * 8; // image_features ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32); @@ -105,6 +110,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, // NOTE: this should be false when using the legacy API }; + // Python reference code for full unpad: /* base_image_feature = image_feature[0] @@ -138,17 +144,15 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector */ model.ctx = ggml_init(params); - ggml_context *ctx_noalloc = ggml_init({2048, NULL, true}); - ggml_tensor *newline_tmp = clip_get_newline_tensor(ctx_clip); + ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip); model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]); if (newline_tmp->backend != GGML_BACKEND_CPU) { if (newline_tmp->buffer == NULL) { printf("newline_tmp tensor buffer is NULL\n"); } ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp)); - } else - { + } else { model.newline->data = newline_tmp->data; if (model.newline->data == NULL) { printf("newline_tmp tensor data is NULL\n"); @@ -158,8 +162,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4 // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false); // fill it with the image embeddings, ignoring the base - for (int i = 1; i < num_images; i++) - { + for (size_t i = 1; i < num_images; i++) { size_t offset = (i-1) * clip_embd_nbytes(ctx_clip); memcpy((uint8_t *)(image_features->data) + offset, image_embd_v[i], clip_embd_nbytes(ctx_clip)); } @@ -222,10 +225,10 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli } const int64_t t_img_enc_start_us = ggml_time_us(); - auto & vparams = clip_get_vision_hparams(ctx_clip); - if (strcmp(vparams.mm_patch_merge_type, "spatial_unpad") != 0) - { + const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip); + + if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { // flat / default llava-1.5 type embedding *n_img_pos = clip_n_patches(ctx_clip); bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096 @@ -235,41 +238,43 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli return false; } - } else - { + } else { // spatial_unpad llava-1.6 type embedding // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working std::vector image_embd_v; image_embd_v.resize(img_res_v.size); - for (int i = 0; i < img_res_v.size; i++) - { + for (size_t i = 0; i < img_res_v.size; i++) { image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184 - bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside + const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 
inside if (!encoded) { - fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", i+1, (int)img_res_v.size); + fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); return false; } } const int64_t t_img_enc_batch_us = ggml_time_us(); printf("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); + const int32_t * image_grid = clip_image_grid(ctx_clip); std::vector> grid_pinpoints; - for (int i = 0; i < 32 && vparams.image_grid_pinpoints[i] != 0; i+=2) { - grid_pinpoints.push_back({vparams.image_grid_pinpoints[i], vparams.image_grid_pinpoints[i+1]}); + for (int i = 0; i < 32 && image_grid[i] != 0; i += 2) { + grid_pinpoints.push_back({image_grid[i], image_grid[i+1]}); } + // free all img_res_v - not needed anymore delete[] img_res_v.data; img_res_v.size = 0; img_res_v.data = nullptr; - struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, vparams.image_size); + + const int32_t image_size = clip_image_size(ctx_clip); + + struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size); int n_img_pos_out; clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out); *n_img_pos = n_img_pos_out; - for (int i = 0; i < image_embd_v.size(); i++) - { + for (size_t i = 0; i < image_embd_v.size(); i++) { free(image_embd_v[i]); } image_embd_v.clear(); @@ -278,10 +283,9 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli // clip_image_u8 * tmp = clip_image_u8_init(); // clip_image_convert_f32_to_u8(*image_feature, *tmp); // clip_image_save_to_bmp(*tmp, "image_feature.bmp"); - } - printf("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); + printf("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); const int64_t t_img_enc_end_us = ggml_time_us(); float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0; @@ -291,8 +295,6 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli return true; } - - bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) { // make sure that the correct mmproj was used, i.e., compare apples to apples int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama)); From 997dd1fdf7c367dcfc92758d5b4f61de7546125f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 13 Feb 2024 20:40:01 +0200 Subject: [PATCH 19/24] llava : style --- examples/llava/clip.cpp | 232 ++++++++++++++++++++------------------- examples/llava/clip.h | 34 +++--- examples/llava/llava.cpp | 11 +- examples/llava/llava.h | 2 - 4 files changed, 141 insertions(+), 138 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index a7562eb47acfe..1cdb2be74a31d 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -70,29 +70,29 @@ static std::string format(const char * fmt, ...) 
{ // key constants // -#define KEY_FTYPE "general.file_type" -#define KEY_NAME "general.name" -#define KEY_DESCRIPTION "general.description" -#define KEY_HAS_TEXT_ENC "clip.has_text_encoder" -#define KEY_HAS_VIS_ENC "clip.has_vision_encoder" +#define KEY_FTYPE "general.file_type" +#define KEY_NAME "general.name" +#define KEY_DESCRIPTION "general.description" +#define KEY_HAS_TEXT_ENC "clip.has_text_encoder" +#define KEY_HAS_VIS_ENC "clip.has_vision_encoder" #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector" -#define KEY_USE_GELU "clip.use_gelu" -#define KEY_N_EMBD "clip.%s.embedding_length" -#define KEY_N_FF "clip.%s.feed_forward_length" -#define KEY_N_BLOCK "clip.%s.block_count" -#define KEY_N_HEAD "clip.%s.attention.head_count" +#define KEY_USE_GELU "clip.use_gelu" +#define KEY_N_EMBD "clip.%s.embedding_length" +#define KEY_N_FF "clip.%s.feed_forward_length" +#define KEY_N_BLOCK "clip.%s.block_count" +#define KEY_N_HEAD "clip.%s.attention.head_count" #define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" -#define KEY_PROJ_DIM "clip.%s.projection_dim" -#define KEY_TOKENS "tokenizer.ggml.tokens" -#define KEY_N_POSITIONS "clip.text.context_length" -#define KEY_IMAGE_SIZE "clip.vision.image_size" -#define KEY_PATCH_SIZE "clip.vision.patch_size" -#define KEY_IMAGE_MEAN "clip.vision.image_mean" -#define KEY_IMAGE_STD "clip.vision.image_std" -#define KEY_PROJ_TYPE "clip.projector_type" - -#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" -#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" +#define KEY_PROJ_DIM "clip.%s.projection_dim" +#define KEY_TOKENS "tokenizer.ggml.tokens" +#define KEY_N_POSITIONS "clip.text.context_length" +#define KEY_IMAGE_SIZE "clip.vision.image_size" +#define KEY_PATCH_SIZE "clip.vision.patch_size" +#define KEY_IMAGE_MEAN "clip.vision.image_mean" +#define KEY_IMAGE_STD "clip.vision.image_std" +#define KEY_PROJ_TYPE "clip.projector_type" + +#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" +#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" #define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution" @@ -100,26 +100,26 @@ static std::string format(const char * fmt, ...) 
{ // tensor name constants // -#define TN_TOKEN_EMBD "%s.token_embd.weight" -#define TN_POS_EMBD "%s.position_embd.weight" -#define TN_CLASS_EMBD "v.class_embd" -#define TN_PATCH_EMBD "v.patch_embd.weight" -#define TN_ATTN_K "%s.blk.%d.attn_k.%s" -#define TN_ATTN_Q "%s.blk.%d.attn_q.%s" -#define TN_ATTN_V "%s.blk.%d.attn_v.%s" -#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s" -#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s" -#define TN_FFN_UP "%s.blk.%d.ffn_up.%s" -#define TN_LN_1 "%s.blk.%d.ln1.%s" -#define TN_LN_2 "%s.blk.%d.ln2.%s" -#define TN_LN_PRE "%s.pre_ln.%s" -#define TN_LN_POST "%s.post_ln.%s" -#define TN_TEXT_PROJ "text_projection.weight" -#define TN_VIS_PROJ "visual_projection.weight" -#define TN_LLAVA_PROJ "mm.%d.%s" -#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s" +#define TN_TOKEN_EMBD "%s.token_embd.weight" +#define TN_POS_EMBD "%s.position_embd.weight" +#define TN_CLASS_EMBD "v.class_embd" +#define TN_PATCH_EMBD "v.patch_embd.weight" +#define TN_ATTN_K "%s.blk.%d.attn_k.%s" +#define TN_ATTN_Q "%s.blk.%d.attn_q.%s" +#define TN_ATTN_V "%s.blk.%d.attn_v.%s" +#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s" +#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s" +#define TN_FFN_UP "%s.blk.%d.ffn_up.%s" +#define TN_LN_1 "%s.blk.%d.ln1.%s" +#define TN_LN_2 "%s.blk.%d.ln2.%s" +#define TN_LN_PRE "%s.pre_ln.%s" +#define TN_LN_POST "%s.post_ln.%s" +#define TN_TEXT_PROJ "text_projection.weight" +#define TN_VIS_PROJ "visual_projection.weight" +#define TN_LLAVA_PROJ "mm.%d.%s" +#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s" #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" -#define TN_IMAGE_NEWLINE "model.image_newline" +#define TN_IMAGE_NEWLINE "model.image_newline" enum projector_type { @@ -130,8 +130,8 @@ enum projector_type { }; static std::map PROJECTOR_TYPE_NAMES = { - { PROJECTOR_TYPE_MLP, "mlp" }, - { PROJECTOR_TYPE_LDP, "ldp" }, + { PROJECTOR_TYPE_MLP, "mlp" }, + { PROJECTOR_TYPE_LDP, "ldp" }, }; @@ -191,7 +191,6 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int } } - static void replace_all(std::string & s, const std::string & search, const std::string & replace) { std::string result; for (size_t pos = 0; ; pos += search.length()) { @@ -279,7 +278,6 @@ struct clip_hparams { int32_t image_grid_pinpoints[32]; int32_t image_crop_resolution; - }; struct clip_layer { @@ -333,6 +331,7 @@ struct clip_vision_model { struct ggml_tensor * mm_0_b = NULL; struct ggml_tensor * mm_2_w = NULL; struct ggml_tensor * mm_2_b = NULL; + struct ggml_tensor * image_newline = NULL; // Yi type models with mlp+normalization projection @@ -389,9 +388,10 @@ struct clip_ctx { std::vector buf_compute_meta; // memory buffers to evaluate the model - ggml_backend_buffer_t params_buffer = NULL; + ggml_backend_buffer_t params_buffer = NULL; ggml_backend_buffer_t compute_buffer = NULL; - ggml_backend_t backend = NULL; + + ggml_backend_t backend = NULL; ggml_gallocr_t compute_alloc = NULL; }; @@ -404,19 +404,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 const auto & model = ctx->vision_model; const auto & hparams = model.hparams; - const int image_size = hparams.image_size; - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); + const int image_size = hparams.image_size; + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); const int num_patches_per_side = image_size / patch_size; 
GGML_UNUSED(num_patches_per_side); - const int num_positions = num_patches + 1; - const int hidden_size = hparams.hidden_size; - const int n_head = hparams.n_head; - const int d_head = hidden_size / n_head; - const int n_layer = hparams.n_layer; - //const int n_intermediate = hparams.n_intermediate; - //const int projection_dim = hparams.projection_dim; - const float eps = hparams.eps; - int batch_size = imgs->size; + const int num_positions = num_patches + 1; + const int hidden_size = hparams.hidden_size; + const int n_head = hparams.n_head; + const int d_head = hidden_size / n_head; + const int n_layer = hparams.n_layer; + const float eps = hparams.eps; + + const int batch_size = imgs->size; + if (ctx->has_llava_projector) { GGML_ASSERT(batch_size == 1); } @@ -816,10 +816,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { if (idx != -1) { const std::string proj_type = gguf_get_val_str(ctx, idx); new_clip->proj_type = clip_projector_type_from_string(proj_type); - } - else { + } else { new_clip->proj_type = PROJECTOR_TYPE_MLP; } + if (new_clip->proj_type == PROJECTOR_TYPE_MLP) { if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) { new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM; @@ -944,6 +944,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE); hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision")); hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision")); + try { int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS); int n = gguf_get_arr_n(ctx, idx); @@ -956,23 +957,26 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } catch (std::runtime_error & e) { hparams.image_grid_pinpoints[0]=0; } + try { int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE); strcpy(hparams.mm_patch_merge_type, gguf_get_val_str(ctx, idx)); } catch (std::runtime_error & e) { strcpy(hparams.mm_patch_merge_type, "flat"); } + try { hparams.image_crop_resolution = get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6 - } - catch(const std::exception& e) { + } catch(const std::exception& e) { hparams.image_crop_resolution = hparams.image_size; } int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN); int idx_std = get_key_idx(ctx, KEY_IMAGE_STD); + const float * mean_data = (const float *)gguf_get_arr_data(ctx, idx_mean); const float * std_data = (const float *)gguf_get_arr_data(ctx, idx_std); + for (int i = 0; i < 3; ++i) { new_clip->image_mean[i] = mean_data[i]; new_clip->image_std[i] = std_data[i]; @@ -998,16 +1002,14 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { printf("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type); } - try - { + + try { vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD); vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); - } - catch(const std::exception& e) - { + } catch(const std::exception& e) { fprintf(stderr, "%s: failed to load vision model tensors\n", __func__); } @@ -1039,40 +1041,39 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.image_newline = get_tensor(new_clip->ctx_data, 
TN_IMAGE_NEWLINE); // fprintf(stderr, "%s: image_newline tensor (llava-1.6) found\n", __func__); } catch (std::runtime_error & e) { } - } - else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { + } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { // MobileVLM projection - vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight")); - vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias")); - vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight")); - vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias")); - vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight")); - vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight")); - vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias")); + vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight")); + vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias")); + vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight")); + vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias")); + vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight")); + vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight")); + vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias")); vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight")); vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias")); vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight")); vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias")); - vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight")); - vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight")); - vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias")); - vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight")); - vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight")); - vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias")); + vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight")); + vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight")); + vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias")); + vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, 
format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight")); + vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight")); + vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias")); vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight")); vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias")); vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight")); vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias")); - vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight")); - vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight")); - vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias")); - } - else { + vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight")); + vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight")); + vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias")); + } else { std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type]; throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); } vision_model.layers.resize(hparams.n_layer); + for (int il = 0; il < hparams.n_layer; ++il) { auto & layer = vision_model.layers[il]; layer.k_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "weight")); @@ -1412,7 +1413,6 @@ static void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &imag image_output = std::move(padded_image); } - /** * Selects the best resolution from a list of possible resolutions based on the original size. 
* @@ -1446,7 +1446,6 @@ static std::pair select_best_resolution(const std::pair & or return best_fit; } - static std::vector divide_to_patches_u8(const clip_image_u8 & image, int patch_size) { std::vector patches; int width = image.nx; @@ -1472,7 +1471,7 @@ static std::vector divide_to_patches_u8(const clip_image_u8 & im #ifdef CLIP_DEBUG_FUNCTIONS // debug function to convert f32 to u8 -void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) { +static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) { dst.nx = src.nx; dst.ny = src.ny; dst.buf.resize(3 * src.nx * src.ny); @@ -1532,8 +1531,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli } } } else { - if (params.image_grid_pinpoints[0] != 0) - { + if (params.image_grid_pinpoints[0] != 0) { // "spatial_unpad" with "anyres" processing for llava-1.6 std::vector> possible_resolutions; for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) { @@ -1656,6 +1654,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli return true; } +ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) { + return ctx->vision_model.image_newline; +} + void clip_free(clip_ctx * ctx) { ggml_free(ctx->ctx_data); gguf_free(ctx->ctx_gguf); @@ -1687,6 +1689,18 @@ const int32_t * clip_image_grid(const struct clip_ctx * ctx) { return ctx->vision_model.hparams.image_grid_pinpoints; } +int clip_n_patches(const struct clip_ctx * ctx) { + const auto & params = ctx->vision_model.hparams; + + int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); + + if (ctx->proj_type == PROJECTOR_TYPE_LDP) { + n_patches /= 4; + } + + return n_patches; +} + bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { if (!ctx->has_vision_encoder) { printf("This gguf file seems to have no vision encoder\n"); @@ -1706,7 +1720,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } int batch_size = imgs->size; - if(ctx->has_llava_projector) { + if (ctx->has_llava_projector) { GGML_ASSERT(batch_size == 1); // TODO: support multiple images } @@ -1717,9 +1731,10 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // set inputs const auto & model = ctx->vision_model; const auto & hparams = model.hparams; - const int image_size = hparams.image_size; - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); + + const int image_size = hparams.image_size; + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); const int num_positions = num_patches + 1; { @@ -1794,11 +1809,11 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // copy the embeddings to the location passed by the user ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); + return true; } bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) { - ggml_type type = GGML_TYPE_Q4_1; assert(itype < GGML_TYPE_COUNT); @@ -1987,26 +2002,13 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { if (ctx->proj_type == PROJECTOR_TYPE_LDP) { return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]; } - else if (ctx->proj_type == PROJECTOR_TYPE_MLP) { + if (ctx->proj_type == PROJECTOR_TYPE_MLP) { return 
ctx->vision_model.mm_2_b->ne[0]; - } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { - return ctx->vision_model.mm_3_b->ne[0]; } - else { - std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; - throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); + if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { + return ctx->vision_model.mm_3_b->ne[0]; } -} - -ggml_tensor *clip_get_newline_tensor(const struct clip_ctx * ctx) { - return ctx->vision_model.image_newline; -} -int clip_n_patches(const struct clip_ctx * ctx) { - auto & params = ctx->vision_model.hparams; - int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); - if (ctx->proj_type == PROJECTOR_TYPE_LDP) { - n_patches /= 4; - } - return n_patches; + std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; + throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); } diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 5e0b5c64b57c5..cd9a4022f5778 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -26,7 +26,17 @@ extern "C" { struct clip_ctx; -CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity); +struct clip_image_u8_batch { + struct clip_image_u8 * data; + size_t size; +}; + +struct clip_image_f32_batch { + struct clip_image_f32 * data; + size_t size; +}; + +CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity); CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity); CLIP_API void clip_free(struct clip_ctx * ctx); @@ -45,33 +55,21 @@ CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx); CLIP_API int clip_n_patches (const struct clip_ctx * ctx); CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); -struct clip_image_u8_batch { - struct clip_image_u8 * data; - size_t size; -}; - -struct clip_image_f32_batch { - struct clip_image_f32 * data; - size_t size; -}; -CLIP_API struct clip_image_grid_shape { - int first; - int second; -}; - CLIP_API struct clip_image_u8 * clip_image_u8_init (); CLIP_API struct clip_image_f32 * clip_image_f32_init(); -CLIP_API void clip_image_u8_free (struct clip_image_u8 * img); +CLIP_API void clip_image_u8_free (struct clip_image_u8 * img); CLIP_API void clip_image_f32_free(struct clip_image_f32 * img); CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); /** interpret bytes as an image file with length bytes_length, and use the result to populate img */ CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); + /** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */ -CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch& res_imgs ); -CLIP_API struct ggml_tensor *clip_get_newline_tensor(const struct clip_ctx * ctx); +CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs ); + +CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx); CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec); CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec); diff --git 
a/examples/llava/llava.cpp b/examples/llava/llava.cpp index ea956ac005a97..22953417f0975 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -26,6 +26,11 @@ struct clip_image_f32 { std::vector buf; }; +struct clip_image_grid_shape { + int first; + int second; +}; + /** * Selects the best resolution from a list of possible resolutions based on the original size. * @@ -344,7 +349,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ return true; } -LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) { +struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) { clip_image_u8 * img = clip_image_u8_init(); if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) { clip_image_u8_free(img); @@ -401,7 +406,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long return true; } -LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) { +struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) { unsigned char* image_bytes; long image_bytes_length; auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length); @@ -416,7 +421,7 @@ LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct return embed; } -LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed) { +void llava_image_embed_free(struct llava_image_embed * embed) { free(embed->embed); free(embed); } diff --git a/examples/llava/llava.h b/examples/llava/llava.h index e08ce78839dcb..9e9466a5d1726 100644 --- a/examples/llava/llava.h +++ b/examples/llava/llava.h @@ -3,7 +3,6 @@ #include "ggml.h" - #ifdef LLAMA_SHARED # if defined(_WIN32) && !defined(__MINGW32__) # ifdef LLAMA_BUILD @@ -42,7 +41,6 @@ LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed); /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. 
*/ LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past); - #ifdef __cplusplus } #endif From 9d166b0850db18fac234d60af38213faf8dedaf8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 13 Feb 2024 20:43:45 +0200 Subject: [PATCH 20/24] convert : add --skip-unknown CLI arg --- convert.py | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/convert.py b/convert.py index 1fc2d4719efd0..63a0a5d78075b 100755 --- a/convert.py +++ b/convert.py @@ -1173,7 +1173,7 @@ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyM for (name, tensor) in model.items()} -def convert_model_names(model: LazyModel, params: Params) -> LazyModel: +def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> LazyModel: tmap = gguf.TensorNameMap(ARCH, params.n_layer) should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, [])) @@ -1199,9 +1199,11 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel: for name, lazy_tensor in model.items(): tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None) if name_new is None: - #raise Exception(f"Unexpected tensor name: {name}") - print(f"Unexpected tensor name: {name} - skipping") - continue + if skip_unknown: + print(f"Unexpected tensor name: {name} - skipping") + continue + else: + raise Exception(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)") if tensor_type in should_skip: print(f"skipping tensor {name_new}") @@ -1379,19 +1381,20 @@ def main(args_in: list[str] | None = None) -> None: output_choices.append("q8_0") vocab_types = ["spm", "bpe", "hfft"] parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file") - parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None) - parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") - parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") - parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") - parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)") - parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") - parser.add_argument("--vocab-type", choices=vocab_types, help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm") - parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") - parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") - parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY) - parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine") - parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides") + parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", 
default=None) + parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") + parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") + parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") + parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)") + parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") + parser.add_argument("--vocab-type", choices=vocab_types, help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") + parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") + parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY) + parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine") + parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides") + parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing") args = parser.parse_args(args_in) if args.awq_path: @@ -1463,7 +1466,7 @@ def main(args_in: list[str] | None = None) -> None: print(f"Special vocab info: {special_vocab}") model = model_plus.model - model = convert_model_names(model, params) + model = convert_model_names(model, params, args.skip_unknown) ftype = pick_output_type(model, args.outtype) model = convert_to_output_type(model, ftype) outfile = args.outfile or default_outfile(model_plus.paths, ftype) From c92431a0a4643346e3c96a6b971dcadf0d5d4a99 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 13 Feb 2024 20:51:20 +0200 Subject: [PATCH 21/24] server : remove clip structs --- examples/llava/clip.cpp | 4 +++- examples/server/server.cpp | 29 ++++++++--------------------- 2 files changed, 11 insertions(+), 22 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 1cdb2be74a31d..73438e3f5e241 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1483,7 +1483,7 @@ static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector // res_imgs memory is being allocated here, previous allocations will be freed if found -bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs ) { +bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs) { bool pad_to_square = true; if (!ctx->has_vision_encoder) { printf("This gguf file seems to have no vision encoder\n"); @@ -1648,9 +1648,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli // clip_image_u8_free(temp2); // } // res_imgs.push_back(res); + res_imgs.size = 1; res_imgs.data = new clip_image_f32[res_imgs.size]; res_imgs.data[0] = std::move(*res); + return true; } 
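For orientation, here is a minimal sketch of how a client can query a loaded CLIP/mmproj model through the accessors introduced in this series (`clip_image_size`, `clip_patch_size`, `clip_patch_merge_type`, `clip_image_grid`, and friends). The model filename is a placeholder and error handling is kept to the bare minimum; treat it as an illustration of the API shape under those assumptions, not code from the repository. The server changes that follow rely on the same functions.

```cpp
// Sketch only: query a CLIP/mmproj GGUF through the public clip.h accessors.
// The path "mmproj-model-f16.gguf" is a placeholder.
#include "clip.h"

#include <cstdio>

int main() {
    struct clip_ctx * ctx = clip_model_load("mmproj-model-f16.gguf", /*verbosity =*/ 1);
    if (!ctx) {
        fprintf(stderr, "failed to load CLIP model\n");
        return 1;
    }

    printf("image size       : %d\n",  clip_image_size(ctx));
    printf("patch size       : %d\n",  clip_patch_size(ctx));
    printf("hidden size      : %d\n",  clip_hidden_size(ctx));
    printf("n_patches        : %d\n",  clip_n_patches(ctx));
    printf("mmproj embd dim  : %d\n",  clip_n_mmproj_embd(ctx));
    printf("embd bytes       : %zu\n", clip_embd_nbytes(ctx));
    printf("patch merge type : %s\n",  clip_patch_merge_type(ctx));

    // grid pinpoints are stored as (width, height) pairs, zero-terminated,
    // with at most 32 values - this mirrors the loop used in llava.cpp above
    const int32_t * grid = clip_image_grid(ctx);
    for (int i = 0; i < 32 && grid[i] != 0; i += 2) {
        printf("grid pinpoint    : %d x %d\n", grid[i], grid[i + 1]);
    }

    clip_free(ctx);
    return 0;
}
```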
diff --git a/examples/server/server.cpp b/examples/server/server.cpp index a9f71725dc163..6e343403032fc 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -31,23 +31,6 @@ using json = nlohmann::json; -// RGB uint8 image -struct clip_image_u8 { - int nx; - int ny; - - std::vector buf; -}; - -// RGB float32 image (NHWC) -// Memory layout: RGBRGBRGB... -struct clip_image_f32 { - int nx; - int ny; - - std::vector buf; -}; - struct server_params { std::string hostname = "127.0.0.1"; @@ -992,10 +975,13 @@ struct llama_server_context { LOG_TEE("Error processing the given image"); clip_free(clp_ctx); - delete[] img_res_v.data; + clip_image_f32_free(img_res_v.data); return false; } - clip_image_f32 * img_res = &img_res_v.data[0]; + + // note: assumes only one image was returned by clip_image_preprocess + clip_image_f32 * img_res = img_res_v.data; + img.image_tokens = clip_n_patches(clp_ctx); img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx)); if (!img.image_embedding) @@ -1010,8 +996,9 @@ struct llama_server_context LOG_TEE("Unable to encode image\n"); return false; } - // clip_image_f32_free(img_res); - delete[] img_res_v.data; + + clip_image_f32_free(img_res_v.data); + img.request_encode_image = false; } From c9874dd0d65c8e0d42588f287af02d8905999e21 Mon Sep 17 00:00:00 2001 From: John <78893154+cmp-nct@users.noreply.github.com> Date: Wed, 14 Feb 2024 05:05:57 +0100 Subject: [PATCH 22/24] bugfix for non llava-1.6 It should now work with llava-1.5 as well --- examples/llava/llava-surgery-v2.py | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/examples/llava/llava-surgery-v2.py b/examples/llava/llava-surgery-v2.py index f0ade4ceb357b..5bc5bc5137fe0 100644 --- a/examples/llava/llava-surgery-v2.py +++ b/examples/llava/llava-surgery-v2.py @@ -38,7 +38,7 @@ def clean_vision_tower_from_checkpoint(checkpoint_path): # file_type = 'pytorch' model_path = os.path.dirname(checkpoint_path) print(f"Searching for vision tower tensors in {checkpoint_path}") - clip_tensors = [k for k, v in checkpoint.items() if (k.startswith("model.vision_tower") ) ] + clip_tensors = [k for k, v in checkpoint.items() if (k.startswith("model.vision_tower") or k.startswith("vit."))] if len(clip_tensors) > 0: print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}") @@ -46,8 +46,10 @@ def clean_vision_tower_from_checkpoint(checkpoint_path): clip_path = os.path.join(model_path, "llava.clip") if os.path.exists(clip_path): + print(f"Loading existing llava.clip from {clip_path}") existing_clip, _ = load_model(clip_path) else: + print(f"Creating new llava.clip at {clip_path}") existing_clip = {} # Update existing_clip with new tensors, avoid duplicates for name in clip_tensors: @@ -116,19 +118,24 @@ def proj_criteria(checkpoint): newline_checkpoint_path, projector_checkpoint_path = find_relevant_checkpoints(checkpoint_paths, newline_criteria, proj_criteria) print(f"Taking projector from {projector_checkpoint_path}") -print(f"Taking newline from {newline_checkpoint_path}") +first_mm_tensors = [] +first_checkpoint = None +if newline_checkpoint_path is not None: + print(f"Taking newline from {newline_checkpoint_path}") + first_checkpoint, file_type = load_model(newline_checkpoint_path) + first_mm_tensors = [k for k, v in first_checkpoint.items() if k.startswith("model.image_newline")] # Load the checkpoint -first_checkpoint, file_type = load_model(newline_checkpoint_path) -last_checkpoint, file_type = 
load_model(projector_checkpoint_path) -mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("model.mm_projector") or k.startswith("vision_proj.")] -first_mm_tensors = [k for k, v in first_checkpoint.items() if k.startswith("model.image_newline")] - - +mm_tensors = [] +last_checkpoint = None +if projector_checkpoint_path is not None: + last_checkpoint, file_type = load_model(projector_checkpoint_path) + mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("model.mm_projector") or k.startswith("vision_proj.")] if len(mm_tensors) == 0: - for k, v in last_checkpoint.items(): - print(k) + if last_checkpoint is not None: + for k, v in last_checkpoint.items(): + print(k) print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint)} tensors.") print("No tensors found. Is this a LLaVA model?") exit() @@ -142,7 +149,8 @@ def proj_criteria(checkpoint): for name in first_mm_tensors: projector[name] = first_checkpoint[name].float() -save_model(projector, f"{args.model}/llava.projector", 'pytorch') +if len(projector) > 0: + save_model(projector, f"{args.model}/llava.projector", 'pytorch') for name in mm_tensors: del last_checkpoint[name] From 7974ff7f027739b108927acc1eb540076fadfb6d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 14 Feb 2024 09:34:16 +0200 Subject: [PATCH 23/24] clip : minor code rearrange --- examples/llava/clip.cpp | 221 ++++++++++++++++++++-------------------- 1 file changed, 110 insertions(+), 111 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 73438e3f5e241..9c5091e613849 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -32,7 +32,7 @@ #include #include -// #define CLIP_DEBUG_FUNCTIONS +//#define CLIP_DEBUG_FUNCTIONS // RGB uint8 image struct clip_image_u8 { @@ -258,6 +258,114 @@ static projector_type clip_projector_type_from_string(const std::string & name) return PROJECTOR_TYPE_UNKNOWN; } +#ifdef CLIP_DEBUG_FUNCTIONS +static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) { + std::cerr << "Failed to open file for writing: " << filename << std::endl; + return; + } + + // PPM header: P6 format, width, height, and max color value + file << "P6\n" << img.nx << " " << img.ny << "\n255\n"; + + // Write pixel data + for (size_t i = 0; i < img.buf.size(); i += 3) { + // PPM expects binary data in RGB format, which matches our image buffer + file.write(reinterpret_cast(&img.buf[i]), 3); + } + + file.close(); +} + +static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) { + std::cerr << "Failed to open file for writing: " << filename << std::endl; + return; + } + + int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data + int bytesPerPixel = 3; + int widthInBytes = img.nx * bytesPerPixel; + int paddingAmount = (4 - (widthInBytes % 4)) % 4; + int stride = widthInBytes + paddingAmount; + + // Bitmap file header + unsigned char fileHeader[14] = { + 'B','M', // Signature + 0,0,0,0, // Image file size in bytes + 0,0,0,0, // Reserved + 54,0,0,0 // Start of pixel array + }; + + // Total file size + fileSize = 54 + (stride * img.ny); + fileHeader[2] = (unsigned char)(fileSize); + fileHeader[3] = (unsigned char)(fileSize >> 8); + fileHeader[4] = (unsigned char)(fileSize >> 16); + fileHeader[5] = (unsigned char)(fileSize >> 24); + + // 
Bitmap information header (BITMAPINFOHEADER) + unsigned char infoHeader[40] = { + 40,0,0,0, // Size of this header (40 bytes) + 0,0,0,0, // Image width + 0,0,0,0, // Image height + 1,0, // Number of color planes + 24,0, // Bits per pixel + 0,0,0,0, // No compression + 0,0,0,0, // Image size (can be 0 for no compression) + 0,0,0,0, // X pixels per meter (not specified) + 0,0,0,0, // Y pixels per meter (not specified) + 0,0,0,0, // Total colors (color table not used) + 0,0,0,0 // Important colors (all are important) + }; + + // Width and height in the information header + infoHeader[4] = (unsigned char)(img.nx); + infoHeader[5] = (unsigned char)(img.nx >> 8); + infoHeader[6] = (unsigned char)(img.nx >> 16); + infoHeader[7] = (unsigned char)(img.nx >> 24); + infoHeader[8] = (unsigned char)(img.ny); + infoHeader[9] = (unsigned char)(img.ny >> 8); + infoHeader[10] = (unsigned char)(img.ny >> 16); + infoHeader[11] = (unsigned char)(img.ny >> 24); + + // Write file headers + file.write(reinterpret_cast(fileHeader), sizeof(fileHeader)); + file.write(reinterpret_cast(infoHeader), sizeof(infoHeader)); + + // Pixel data + std::vector padding(3, 0); // Max padding size to be added to each row + for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top + for (int x = 0; x < img.nx; ++x) { + // Each pixel + size_t pixelIndex = (y * img.nx + x) * 3; + unsigned char pixel[3] = { + img.buf[pixelIndex + 2], // BMP stores pixels in BGR format + img.buf[pixelIndex + 1], + img.buf[pixelIndex] + }; + file.write(reinterpret_cast(pixel), 3); + } + // Write padding for the row + file.write(reinterpret_cast(padding.data()), paddingAmount); + } + + file.close(); +} + +// debug function to convert f32 to u8 +static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) { + dst.nx = src.nx; + dst.ny = src.ny; + dst.buf.resize(3 * src.nx * src.ny); + for (size_t i = 0; i < src.buf.size(); ++i) { + dst.buf[i] = static_cast(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255)); + } +} +#endif + // // clip layers @@ -274,7 +382,7 @@ struct clip_hparams { float eps; - char mm_patch_merge_type[32]="flat"; // spatial_unpad or flat (default) + char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default) int32_t image_grid_pinpoints[32]; int32_t image_crop_resolution; @@ -1156,103 +1264,6 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length return true; } -#ifdef CLIP_DEBUG_FUNCTIONS -void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { - std::ofstream file(filename, std::ios::binary); - if (!file.is_open()) { - std::cerr << "Failed to open file for writing: " << filename << std::endl; - return; - } - - // PPM header: P6 format, width, height, and max color value - file << "P6\n" << img.nx << " " << img.ny << "\n255\n"; - - // Write pixel data - for (size_t i = 0; i < img.buf.size(); i += 3) { - // PPM expects binary data in RGB format, which matches our image buffer - file.write(reinterpret_cast(&img.buf[i]), 3); - } - - file.close(); -} -void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) { - std::ofstream file(filename, std::ios::binary); - if (!file.is_open()) { - std::cerr << "Failed to open file for writing: " << filename << std::endl; - return; - } - - int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data - int bytesPerPixel = 3; - int widthInBytes = img.nx * bytesPerPixel; - int paddingAmount = (4 - (widthInBytes % 4)) % 4; - int 
stride = widthInBytes + paddingAmount; - - // Bitmap file header - unsigned char fileHeader[14] = { - 'B','M', // Signature - 0,0,0,0, // Image file size in bytes - 0,0,0,0, // Reserved - 54,0,0,0 // Start of pixel array - }; - - // Total file size - fileSize = 54 + (stride * img.ny); - fileHeader[2] = (unsigned char)(fileSize); - fileHeader[3] = (unsigned char)(fileSize >> 8); - fileHeader[4] = (unsigned char)(fileSize >> 16); - fileHeader[5] = (unsigned char)(fileSize >> 24); - - // Bitmap information header (BITMAPINFOHEADER) - unsigned char infoHeader[40] = { - 40,0,0,0, // Size of this header (40 bytes) - 0,0,0,0, // Image width - 0,0,0,0, // Image height - 1,0, // Number of color planes - 24,0, // Bits per pixel - 0,0,0,0, // No compression - 0,0,0,0, // Image size (can be 0 for no compression) - 0,0,0,0, // X pixels per meter (not specified) - 0,0,0,0, // Y pixels per meter (not specified) - 0,0,0,0, // Total colors (color table not used) - 0,0,0,0 // Important colors (all are important) - }; - - // Width and height in the information header - infoHeader[4] = (unsigned char)(img.nx); - infoHeader[5] = (unsigned char)(img.nx >> 8); - infoHeader[6] = (unsigned char)(img.nx >> 16); - infoHeader[7] = (unsigned char)(img.nx >> 24); - infoHeader[8] = (unsigned char)(img.ny); - infoHeader[9] = (unsigned char)(img.ny >> 8); - infoHeader[10] = (unsigned char)(img.ny >> 16); - infoHeader[11] = (unsigned char)(img.ny >> 24); - - // Write file headers - file.write(reinterpret_cast(fileHeader), sizeof(fileHeader)); - file.write(reinterpret_cast(infoHeader), sizeof(infoHeader)); - - // Pixel data - std::vector padding(3, 0); // Max padding size to be added to each row - for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top - for (int x = 0; x < img.nx; ++x) { - // Each pixel - size_t pixelIndex = (y * img.nx + x) * 3; - unsigned char pixel[3] = { - img.buf[pixelIndex + 2], // BMP stores pixels in BGR format - img.buf[pixelIndex + 1], - img.buf[pixelIndex] - }; - file.write(reinterpret_cast(pixel), 3); - } - // Write padding for the row - file.write(reinterpret_cast(padding.data()), paddingAmount); - } - - file.close(); -} -#endif - // Linear interpolation between two points inline float lerp(float s, float e, float t) { return s + (e - s) * t; @@ -1469,18 +1480,6 @@ static std::vector divide_to_patches_u8(const clip_image_u8 & im return patches; } -#ifdef CLIP_DEBUG_FUNCTIONS -// debug function to convert f32 to u8 -static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) { - dst.nx = src.nx; - dst.ny = src.ny; - dst.buf.resize(3 * src.nx * src.ny); - for (size_t i = 0; i < src.buf.size(); ++i) { - dst.buf[i] = static_cast(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255)); - } -} -#endif - // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector // res_imgs memory is being allocated here, previous allocations will be freed if found bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs) { From 6727cfd21ac3d3d14a21de690326552333de9ab8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 14 Feb 2024 09:35:57 +0200 Subject: [PATCH 24/24] llava : update readme a bit --- examples/llava/README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/llava/README.md b/examples/llava/README.md index 4e789da3dd190..e2ef0eff1466c 100644 --- 
a/examples/llava/README.md +++ b/examples/llava/README.md @@ -19,9 +19,9 @@ After building, run: `./llava-cli` to see the usage. For example: **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so. -## Model conversion +## LLaVA 1.5 -- Clone `llava-v15-7b` and `clip-vit-large-patch14-336` locally: +- Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example: ```sh git clone https://huggingface.co/liuhaotian/llava-v1.5-7b @@ -59,6 +59,8 @@ Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` director - Use `llava-surgery-v2.py` +- TODO: add detailed instructions + ## TODO - [x] Support non-CPU backend for the image encoding part.
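The conversion steps described above end at GGUF files. As a rough sketch of how those files are then consumed, the snippet below uses the `llava.h` API touched in this series to embed an image and write it into the language model context, which is essentially what `llava-cli` does. The helper name, the thread count, and the surrounding llama/clip setup (omitted) are assumptions for illustration, not code from the repository.

```cpp
// Sketch only: embed one image file and feed it into an existing llama context.
// Assumes ctx_clip was loaded with clip_model_load() and ctx_llama is an
// initialized llama_context; both setups are omitted here.
#include "clip.h"
#include "llava.h"
#include "llama.h"

#include <cstdio>

static bool eval_one_image(llama_context * ctx_llama, clip_ctx * ctx_clip,
                           const char * image_path, int n_batch, int * n_past) {
    // image loading, preprocessing and CLIP encoding all happen inside this call
    llava_image_embed * embed = llava_image_embed_make_with_filename(ctx_clip, /*n_threads =*/ 4, image_path);
    if (!embed) {
        fprintf(stderr, "failed to create image embedding for %s\n", image_path);
        return false;
    }

    // writes the image tokens into the context and advances *n_past past them
    const bool ok = llava_eval_image_embed(ctx_llama, embed, n_batch, n_past);

    llava_image_embed_free(embed);
    return ok;
}
```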