diff --git a/common/arg.cpp b/common/arg.cpp index 80c318a0e50d0..1cfd0168d95ae 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -976,14 +976,13 @@ static void common_params_print_completion(common_params_context & ctx_arg) { "llama-gritlm", "llama-imatrix", "llama-infill", - "llama-llava-cli", + "llama-mtmd-cli", "llama-llava-clip-quantize-cli", "llama-lookahead", "llama-lookup", "llama-lookup-create", "llama-lookup-merge", "llama-lookup-stats", - "llama-minicpmv-cli", "llama-parallel", "llama-passkey", "llama-perplexity", diff --git a/examples/llava/MobileVLM-README.md b/docs/multimodal/MobileVLM.md similarity index 96% rename from examples/llava/MobileVLM-README.md rename to docs/multimodal/MobileVLM.md index 4f783f3ce05fb..20ac02f7a8dfc 100644 --- a/examples/llava/MobileVLM-README.md +++ b/docs/multimodal/MobileVLM.md @@ -9,15 +9,15 @@ The implementation is based on llava, and is compatible with llava and mobileVLM Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown. ## Usage -Build with cmake or run `make llama-llava-cli` to build it. -After building, run: `./llama-llava-cli` to see the usage. For example: +Build the `llama-mtmd-cli` binary. + +After building, run: `./llama-mtmd-cli` to see the usage. For example: ```sh -./llama-llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \ +./llama-mtmd-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \ --mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \ - --image path/to/an/image.jpg \ - -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:" + --chat-template deepseek ``` ## Model conversion @@ -82,7 +82,7 @@ refer to `android/adb_run.sh`, modify resources' `name` and `path` ### case 1 **input** ```sh -/data/local/tmp/llama-llava-cli \ +/data/local/tmp/llama-mtmd-cli \ -m /data/local/tmp/ggml-model-q4_k.gguf \ --mmproj /data/local/tmp/mmproj-model-f16.gguf \ -t 4 \ @@ -102,7 +102,7 @@ llama_print_timings: total time = 34731.93 ms ### case 2 **input** ```sh -/data/local/tmp/llama-llava-cli \ +/data/local/tmp/llama-mtmd-cli \ -m /data/local/tmp/ggml-model-q4_k.gguf \ --mmproj /data/local/tmp/mmproj-model-f16.gguf \ -t 4 \ @@ -123,10 +123,10 @@ llama_print_timings: total time = 34570.79 ms ## Some result on Android with `Snapdragon 778G` chip ### MobileVLM-1.7B case -#### llava-cli release-b2005 +#### mtmd-cli release-b2005 **input** ```sh -/data/local/tmp/llama-llava-cli \ +/data/local/tmp/llama-mtmd-cli \ -m /data/local/tmp/ggml-model-q4_k.gguf \ --mmproj /data/local/tmp/mmproj-model-f16.gguf \ -t 4 \ @@ -147,7 +147,7 @@ llama_print_timings: prompt eval time = 8119.49 ms / 191 tokens ( 42.51 m llama_print_timings: eval time = 1005.75 ms / 14 runs ( 71.84 ms per token, 13.92 tokens per second) llama_print_timings: total time = 28038.34 ms / 205 tokens ``` -#### llava-cli latest-version +#### mtmd-cli latest-version **input** Just the same as above. @@ -169,7 +169,7 @@ llama_print_timings: eval time = 43894.02 ms / 13 runs ( 3376.46 m llama_print_timings: total time = 865441.76 ms / 204 tokens ``` ### MobileVLM_V2-1.7B case -#### llava-cli release-2005b +#### mtmd-cli release-2005b **input** Just the same as above. 
@@ -200,7 +200,7 @@ make GGML_CUDA=1 CUDA_DOCKER_ARCH=sm_87 GGML_CUDA_F16=1 -j 32
 ### case 1
 **input**
 ```sh
-./llama-llava-cli \
+./llama-mtmd-cli \
     -m /data/local/tmp/ggml-model-q4_k.gguf \
     --mmproj /data/local/tmp/mmproj-model-f16.gguf \
     --image /data/local/tmp/demo.jpeg \
@@ -224,7 +224,7 @@ llama_print_timings: total time = 1352.63 ms / 252 tokens
 ### case 2
 **input**
 ```sh
-./llama-llava-cli \
+./llama-mtmd-cli \
     -m /data/local/tmp/ggml-model-q4_k.gguf \
     --mmproj /data/local/tmp/mmproj-model-f16.gguf \
     -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWhat is in the image? ASSISTANT:" \
diff --git a/examples/llava/README-gemma3.md b/docs/multimodal/gemma3.md
similarity index 82%
rename from examples/llava/README-gemma3.md
rename to docs/multimodal/gemma3.md
index 3c25ee2583027..8fa077de71985 100644
--- a/examples/llava/README-gemma3.md
+++ b/docs/multimodal/gemma3.md
@@ -26,11 +26,12 @@ llama-gemma3-cli -hf ggml-org/gemma-3-27b-it-GGUF
 
 ## How to get mmproj.gguf?
 
+Simply add `--mmproj` when converting the model via `convert_hf_to_gguf.py`:
+
 ```bash
 cd gemma-3-4b-it
-python ../llama.cpp/examples/llava/gemma3_convert_encoder_to_gguf.py .
-
-# output file is mmproj.gguf
+python ../llama.cpp/convert_hf_to_gguf.py --outfile model.gguf --outtype f16 --mmproj .
+# output file: mmproj-model.gguf
 ```
 
 ## How to run it?
diff --git a/examples/llava/README-glmedge.md b/docs/multimodal/glmedge.md
similarity index 80%
rename from examples/llava/README-glmedge.md
rename to docs/multimodal/glmedge.md
index 603d01474513f..af6b696a8ad27 100644
--- a/examples/llava/README-glmedge.md
+++ b/docs/multimodal/glmedge.md
@@ -3,12 +3,12 @@ Currently this implementation supports [glm-edge-v-2b](https://huggingface.co/THUDM/glm-edge-v-2b) and [glm-edge-v-5b](https://huggingface.co/THUDM/glm-edge-v-5b).
 
 ## Usage
-Build with cmake or run `make llama-llava-cli` to build it.
+Build the `llama-mtmd-cli` binary.
 
-After building, run: `./llama-llava-cli` to see the usage. For example:
+After building, run: `./llama-mtmd-cli` to see the usage. For example:
 
 ```sh
-./llama-llava-cli -m model_path/ggml-model-f16.gguf --mmproj model_path/mmproj-model-f16.gguf --image img_path/image.jpg -p "<|system|>\n system prompt <|user|>\n prompt <|assistant|>\n"
+./llama-mtmd-cli -m model_path/ggml-model-f16.gguf --mmproj model_path/mmproj-model-f16.gguf
 ```
 
 **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
diff --git a/examples/llava/README-granitevision.md b/docs/multimodal/granitevision.md
similarity index 92%
rename from examples/llava/README-granitevision.md
rename to docs/multimodal/granitevision.md
index f08a21cc175b4..3118fe0cdc113 100644
--- a/examples/llava/README-granitevision.md
+++ b/docs/multimodal/granitevision.md
@@ -176,15 +176,11 @@ Note that currently you cannot quantize the visual encoder because granite visio
 
 ### 5. Running the Model in Llama cpp
 
-Build llama cpp normally; you should have a target binary named `llama-llava-cli`, which you can pass two binaries to. As an example, we pass the the llama.cpp banner.
+Build llama.cpp normally; you should have a target binary named `llama-mtmd-cli`, to which you can pass the two GGUF files built above.
 
 ```bash
-$ ./build/bin/llama-llava-cli -m $LLM_GGUF_PATH \
+$ ./build/bin/llama-mtmd-cli -m $LLM_GGUF_PATH \
     --mmproj $VISUAL_GGUF_PATH \
-    --image ./media/llama0-banner.png \
     -c 16384 \
-    -p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n<|user|>\n\\nWhat does the text in this image say?\n<|assistant|>\n" \
     --temp 0
 ```
-
-Sample output: `The text in the image reads "LLAMA C++ Can it run DOOM Llama?"`
diff --git a/docs/multimodal/llava.md b/docs/multimodal/llava.md
new file mode 100644
index 0000000000000..c5bdc82158ede
--- /dev/null
+++ b/docs/multimodal/llava.md
@@ -0,0 +1,143 @@
+# LLaVA
+
+Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants,
+as well as [llava-v1.6](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2) variants.
+
+The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
+and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
+models are available.
+For llava-1.6, a variety of prepared gguf models are available as well: [7b-34b](https://huggingface.co/cmp-nct/llava-1.6-gguf).
+
+After the API is confirmed, more models will be supported / uploaded.
+
+## Usage
+Build the `llama-mtmd-cli` binary.
+
+After building, run: `./llama-mtmd-cli` to see the usage. For example:
+
+```sh
+./llama-mtmd-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf \
+    --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf \
+    --chat-template vicuna
+```
+
+**note**: A lower temperature like 0.1 is recommended for better quality. Add `--temp 0.1` to the command to do so.
+**note**: For GPU offloading, make sure to use the `-ngl` flag as usual.
+
+## LLaVA 1.5
+
+1. Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:
+
+```sh
+git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
+
+git clone https://huggingface.co/openai/clip-vit-large-patch14-336
+```
+
+2. Install the required Python packages:
+
+```sh
+pip install -r examples/llava/requirements.txt
+```
+
+3. Use `llava_surgery.py` to split the LLaVA model into its LLaMA and multimodal projector constituents:
+
+```sh
+python ./examples/llava/llava_surgery.py -m ../llava-v1.5-7b
+```
+
+4. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF:
+
+```sh
+python ./examples/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
+```
+
+5. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
+
+```sh
+python ./examples/convert_legacy_llama.py ../llava-v1.5-7b --skip-unknown
+```
+
+Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
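+
+Optionally, the LLaMA part can be quantized before running it; the projector file is left as f16. This is only a minimal sketch, assuming the standard `llama-quantize` binary from the llama.cpp build, not a required conversion step:
+
+```sh
+# hypothetical example: quantize the language model to Q4_K_M, keep mmproj-model-f16.gguf as-is
+./llama-quantize ../llava-v1.5-7b/ggml-model-f16.gguf ../llava-v1.5-7b/ggml-model-q4_k.gguf Q4_K_M
+```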
+
+## LLaVA 1.6 gguf conversion
+1) First clone a LLaVA 1.6 model:
+```console
+git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
+```
+
+2) Install the required Python packages:
+
+```sh
+pip install -r examples/llava/requirements.txt
+```
+
+3) Use `llava_surgery_v2.py`, which also supports llava-1.5 variants, for both pytorch and safetensor models:
+```console
+python examples/llava/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/
+```
+- you will find a llava.projector and a llava.clip file in your model directory
+
+4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
+```console
+mkdir vit
+cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin
+cp ../llava-v1.6-vicuna-7b/llava.projector vit/
+curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json
+```
+
+5) Create the visual gguf model:
+```console
+python ./examples/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
+```
+- This is similar to llava-1.5; the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
+
+6) Then convert the model to gguf format:
+```console
+python ./examples/convert_legacy_llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
+```
+
+7) And finally we can run `llama-mtmd-cli` using the 1.6 model version:
+```console
+./llama-mtmd-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf
+```
+
+**note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096)
+
+**note** llava-1.6 greatly benefits from batched prompt processing (defaults work)
+
+**note** if the language model in step `6)` is incompatible with the legacy conversion script, the easiest way to handle the LLM conversion is to load the model in transformers and export only the LLM from the LLaVA-NeXT model.
+
+```python
+import os
+import transformers
+
+model_path = ...
+llm_export_path = ...
+
+tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)
+model = transformers.AutoModelForImageTextToText.from_pretrained(model_path)
+
+tokenizer.save_pretrained(llm_export_path)
+model.language_model.save_pretrained(llm_export_path)
+```
+
+Then, you can convert the LLM using the `convert_hf_to_gguf.py` script, which handles more LLM architectures.
+
+## Chat template
+
+For llava-1.5 and llava-1.6, you need to use the `vicuna` chat template. Simply add `--chat-template vicuna` to activate it.
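+
+Putting the notes above together, an invocation for the converted llava-1.6 model might look like this (paths follow the conversion steps above; the flags are the ones recommended earlier in this document):
+
+```console
+./llama-mtmd-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf \
+    --mmproj vit/mmproj-model-f16.gguf \
+    -c 4096 --temp 0.1 --chat-template vicuna
+```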
+
+
+## How to know if you are running in llava-1.5 or llava-1.6 mode
+
+When running the CLI you will see visual information right before the prompt is processed:
+
+**Llava-1.5:**
+`encode_image_with_clip: image embedding created: 576 tokens`
+
+**Llava-1.6 (anything above 576):**
+`encode_image_with_clip: image embedding created: 2880 tokens`
+
+
+Alternatively, just note how many "tokens" have been used for your prompt; it will also show 1000+ tokens for llava-1.6.
diff --git a/examples/llava/README-minicpmo2.6.md b/docs/multimodal/minicpmo2.6.md
similarity index 73%
rename from examples/llava/README-minicpmo2.6.md
rename to docs/multimodal/minicpmo2.6.md
index 48c423238395b..de470d8a82cc6 100644
--- a/examples/llava/README-minicpmo2.6.md
+++ b/docs/multimodal/minicpmo2.6.md
@@ -40,9 +40,9 @@ python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model
 
 Inference on Linux or Mac
 ```bash
-# run f16 version
-./build/bin/llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+# run in single-turn mode
+./build/bin/llama-mtmd-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
 
-# run quantized int4 version
-./build/bin/llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+# run in conversation mode
+./build/bin/llama-mtmd-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf
 ```
diff --git a/examples/llava/README-minicpmv2.5.md b/docs/multimodal/minicpmv2.5.md
similarity index 72%
rename from examples/llava/README-minicpmv2.5.md
rename to docs/multimodal/minicpmv2.5.md
index 6bfe7abd16487..7a6879d3959ca 100644
--- a/examples/llava/README-minicpmv2.5.md
+++ b/docs/multimodal/minicpmv2.5.md
@@ -39,9 +39,9 @@ python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
 
 Inference on Linux or Mac
 ```bash
-# run f16 version
-./build/bin/llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+# run in single-turn mode
+./build/bin/llama-mtmd-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
 
-# run quantized int4 version
-./build/bin/llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+# run in conversation mode +./build/bin/llama-mtmd-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf ``` diff --git a/examples/llava/README-minicpmv2.6.md b/docs/multimodal/minicpmv2.6.md similarity index 71% rename from examples/llava/README-minicpmv2.6.md rename to docs/multimodal/minicpmv2.6.md index 2df39cdbac78a..410a5dd1771e4 100644 --- a/examples/llava/README-minicpmv2.6.md +++ b/docs/multimodal/minicpmv2.6.md @@ -39,9 +39,9 @@ python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model Inference on Linux or Mac ```bash -# run f16 version -./build/bin/llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" +# run in single-turn mode +./build/bin/llama-mtmd-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" -# run quantized int4 version -./build/bin/llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" +# run in conversation mode +./build/bin/llama-mtmd-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf ``` diff --git a/examples/llava/README.md b/examples/llava/README.md index 0e3c32032055b..cadbc53fab0d7 100644 --- a/examples/llava/README.md +++ b/examples/llava/README.md @@ -1,158 +1,47 @@ -# LLaVA +# Multimodal Support in llama.cpp -Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants, -as well as llava-1.6 [llava-v1.6](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2) variants. +This directory provides multimodal capabilities for `llama.cpp`. Initially intended as a showcase for running LLaVA models, its scope has expanded significantly over time to include various other vision-capable models. As a result, LLaVA is no longer the only multimodal architecture supported. -The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b) -and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b) -models are available. -For llava-1.6 a variety of prepared gguf models are available as well [7b-34b](https://huggingface.co/cmp-nct/llava-1.6-gguf) +> [!IMPORTANT] +> +> Multimodal support can be viewed as a sub-project within `llama.cpp`. It is under **very heavy development**, and **breaking changes are expected**. -After API is confirmed, more models will be supported / uploaded. +The naming and structure related to multimodal support have evolved, which might cause some confusion. Here's a brief timeline to clarify: -## Usage -Build with cmake or run `make llama-llava-cli` to build it. +- [#3436](https://github.com/ggml-org/llama.cpp/pull/3436): Initial support for LLaVA 1.5 was added, introducing `llava.cpp` and `clip.cpp`. The `llava-cli` binary was created for model interaction. +- [#4954](https://github.com/ggml-org/llama.cpp/pull/4954): Support for MobileVLM was added, becoming the second vision model supported. This built upon the existing `llava.cpp`, `clip.cpp`, and `llava-cli` infrastructure. 
+- **Expansion & Fragmentation:** Many new models were subsequently added (e.g., [#7599](https://github.com/ggml-org/llama.cpp/pull/7599), [#10361](https://github.com/ggml-org/llama.cpp/pull/10361), [#12344](https://github.com/ggml-org/llama.cpp/pull/12344), and others). However, `llava-cli` lacked support for the increasingly complex chat templates required by these models. This led to the creation of model-specific binaries like `qwen2vl-cli`, `minicpmv-cli`, and `gemma3-cli`. While functional, this proliferation of command-line tools became confusing for users. +- [#12849](https://github.com/ggml-org/llama.cpp/pull/12849): `libmtmd` was introduced as a replacement for `llava.cpp`. Its goals include providing a single, unified command-line interface, improving the user/developer experience (UX/DX), and supporting both audio and image inputs. +- [#13012](https://github.com/ggml-org/llama.cpp/pull/13012): `mtmd-cli` was added, consolidating the various model-specific CLIs into a single tool powered by `libmtmd`. -After building, run: `./llama-llava-cli` to see the usage. For example: +## How it works and what is `mmproj`? -```sh -./llama-llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg -``` +Multimodal support in `llama.cpp` works by encoding images into embeddings using a separate model component, and then feeding these embeddings into the language model. -**note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so. -**note**: For GPU offloading ensure to use the `-ngl` flag just like usual +This approach keeps the multimodal components distinct from the core `libllama` library. Separating these allows for faster, independent development cycles. While many modern vision models are based on Vision Transformers (ViTs), their specific pre-processing and projection steps can vary significantly. Integrating this diverse complexity directly into `libllama` is currently challenging. -## LLaVA 1.5 +Consequently, running a multimodal model typically requires two GGUF files: +1. The standard language model file. +2. A corresponding **multimodal projector (`mmproj`)** file, which handles the image encoding and projection. -1. Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example: +## What is `libmtmd`? -```sh -git clone https://huggingface.co/liuhaotian/llava-v1.5-7b +As outlined in the history, `libmtmd` is the modern library designed to replace the original `llava.cpp` implementation for handling multimodal inputs. -git clone https://huggingface.co/openai/clip-vit-large-patch14-336 -``` +Built upon `clip.cpp` (similar to `llava.cpp`), `libmtmd` offers several advantages: +- **Unified Interface:** Aims to consolidate interaction for various multimodal models. +- **Improved UX/DX:** Features a more intuitive API, inspired by the `Processor` class in the Hugging Face `transformers` library. +- **Flexibility:** Designed to support multiple input types (text, audio, images) while respecting the wide variety of chat templates used by different models. -2. Install the required Python packages: +## How to obtain `mmproj` -```sh -pip install -r examples/llava/requirements.txt -``` +Multimodal projector (`mmproj`) files are specific to each model architecture. Please refer to the relevant guide for instructions on how to obtain or create them: -3. 
Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents: - -```sh -python ./examples/llava/llava_surgery.py -m ../llava-v1.5-7b -``` - -4. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF: - -```sh -python ./examples/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b -``` - -5. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF: - -```sh -python ./examples/convert_legacy_llama.py ../llava-v1.5-7b --skip-unknown -``` - -Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory. - -## LLaVA 1.6 gguf conversion -1) First clone a LLaVA 1.6 model: -```console -git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b -``` - -2) Install the required Python packages: - -```sh -pip install -r examples/llava/requirements.txt -``` - -3) Use `llava_surgery_v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models: -```console -python examples/llava/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/ -``` -- you will find a llava.projector and a llava.clip file in your model directory - -4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory: -```console -mkdir vit -cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin -cp ../llava-v1.6-vicuna-7b/llava.projector vit/ -curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json -``` - -5) Create the visual gguf model: -```console -python ./examples/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision -``` -- This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP - -6) Then convert the model to gguf format: -```console -python ./examples/convert_legacy_llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown -``` - -7) And finally we can run the llava cli using the 1.6 model version: -```console -./llama-llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096 -``` - -**note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096) - -**note** llava-1.6 greatly benefits from batched prompt processing (defaults work) - -**note** if the language model in step `6)` is incompatible with the legacy conversion script, the easiest way handle the LLM model conversion is to load the model in transformers, and export only the LLM from the llava next model. - -```python -import os -import transformers - -model_path = ... -llm_export_path = ... - -tokenizer = transformers.AutoTokenizer.from_pretrained(model_path) -model = transformers.AutoModelForImageTextToText.from_pretrained(model_path) - -tokenizer.save_pretrained(llm_export_path) -model.language_model.save_pretrained(llm_export_path) -``` - -Then, you can convert the LLM using the `convert_hf_to_gguf.py` script, which handles more LLM architectures. 
- -## llava-cli templating and llava-1.6 prompting - -llava-1.5 models all use the same vicuna prompt, here you can just add your image question like `-p "Provide a full description."` -For llava-1.5 models which are not vicuna (mistral and Yi) you need to adapt system prompt as well as user prompt, for this purpose llava-cli has a basic templating system: - -**For Mistral and using llava-cli binary:** -Add this: `-p "\nUSER:\nProvide a full description.\nASSISTANT:\n"` -The mistral template for llava-1.6 seems to be no system print and a USER/ASSISTANT role - -**For the 34B this should work:** -Add this: `-e -p <|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n\nProvide a full description.<|im_end|><|im_start|>assistant\n` - - -## How to know if you are running in llava-1.5 or llava-1.6 mode - -When running llava-cli you will see a visual information right before the prompt is being processed: - -**Llava-1.5:** -`encode_image_with_clip: image embedding created: 576 tokens` - -**Llava-1.6 (anything above 576):** -`encode_image_with_clip: image embedding created: 2880 tokens` - - -Alternatively just pay notice to how many "tokens" have been used for your prompt, it will also show 1000+ tokens for llava-1.6 - - - - -## TODO - -- [x] Support non-CPU backend for the image encoding part. -- [ ] Support different sampling methods. -- [ ] Support more model variants. +- [LLaVA](../../docs/multimodal/llava.md) +- [MobileVLM](../../docs/multimodal/MobileVLM.md) +- [GLM-Edge](../../docs/multimodal/glmedge.md) +- [MiniCPM-V 2.5](../../docs/multimodal/minicpmv2.5.md) +- [MiniCPM-V 2.6](../../docs/multimodal/minicpmv2.6.md) +- [MiniCPM-o 2.6](../../docs/multimodal/minicpmo2.6.md) +- [IBM Granite Vision](../../docs/multimodal/granitevision.md) +- [Google Gemma 3](../../docs/multimodal/gemma3.md) diff --git a/examples/llava/android/adb_run.sh b/examples/llava/android/adb_run.sh index 45ccf8d70d863..a24d6787d9a05 100755 --- a/examples/llava/android/adb_run.sh +++ b/examples/llava/android/adb_run.sh @@ -10,7 +10,7 @@ prompt="A chat between a curious user and an artificial intelligence assistant. # prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWhat is in the image? 
ASSISTANT:" program_dir="build_64/bin" -binName="llama-llava-cli" +binName="llama-mtmd-cli" n_threads=4 diff --git a/examples/llava/gemma3_convert_encoder_to_gguf.py b/examples/llava/gemma3_convert_encoder_to_gguf.py deleted file mode 100644 index 241b526b9ede7..0000000000000 --- a/examples/llava/gemma3_convert_encoder_to_gguf.py +++ /dev/null @@ -1,307 +0,0 @@ -import gguf -import argparse -import logging -import sys -import torch -import json -import os -import numpy as np -from typing import cast, ContextManager, Any, Iterator -from pathlib import Path -from torch import Tensor - -logger = logging.getLogger("gemma3-mmproj") - - -# (copied from convert_hf_to_gguf.py) -# tree of lazy tensors -class LazyTorchTensor(gguf.LazyBase): - _tensor_type = torch.Tensor - # to keep the type-checker happy - dtype: torch.dtype - shape: torch.Size - - # only used when converting a torch.Tensor to a np.ndarray - _dtype_map: dict[torch.dtype, type] = { - torch.float16: np.float16, - torch.float32: np.float32, - } - - # used for safetensors slices - # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046 - # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734 - _dtype_str_map: dict[str, torch.dtype] = { - "F64": torch.float64, - "F32": torch.float32, - "BF16": torch.bfloat16, - "F16": torch.float16, - # "U64": torch.uint64, - "I64": torch.int64, - # "U32": torch.uint32, - "I32": torch.int32, - # "U16": torch.uint16, - "I16": torch.int16, - "U8": torch.uint8, - "I8": torch.int8, - "BOOL": torch.bool, - "F8_E4M3": torch.float8_e4m3fn, - "F8_E5M2": torch.float8_e5m2, - } - - def numpy(self) -> gguf.LazyNumpyTensor: - dtype = self._dtype_map[self.dtype] - return gguf.LazyNumpyTensor( - meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape), - args=(self,), - func=(lambda s: s.numpy()) - ) - - @classmethod - def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor: - return torch.empty(size=shape, dtype=dtype, device="meta") - - @classmethod - def from_safetensors_slice(cls, st_slice: Any) -> Tensor: - dtype = cls._dtype_str_map[st_slice.get_dtype()] - shape: tuple[int, ...] 
= tuple(st_slice.get_shape()) - lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:]) - return cast(torch.Tensor, lazy) - - @classmethod - def __torch_function__(cls, func, types, args=(), kwargs=None): - del types # unused - - if kwargs is None: - kwargs = {} - - if func is torch.Tensor.numpy: - return args[0].numpy() - - return cls._wrap_fn(func)(*args, **kwargs) - - -class Gemma3VisionTower: - hparams: dict - gguf_writer: gguf.GGUFWriter - fname_out: Path - ftype: gguf.LlamaFileType - - @staticmethod - def load_hparams(dir_model: Path): - with open(dir_model / "config.json", "r", encoding="utf-8") as f: - return json.load(f) - - @staticmethod - def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]: - part_names: list[str] = [] - for filename in os.listdir(dir_model): - if filename.startswith(prefix) and filename.endswith(suffix): - part_names.append(filename) - part_names.sort() - return part_names - - def __init__(self, - dir_model: Path, - fname_out: Path, - ftype: gguf.LlamaFileType, - is_big_endian: bool,): - hparams = Gemma3VisionTower.load_hparams(dir_model) - self.hparams = hparams - self.fname_out = fname_out - self.ftype = ftype - endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE - self.gguf_writer = gguf.GGUFWriter(path=None, arch="clip", endianess=endianess) - - text_config = hparams["text_config"] - vision_config = hparams["vision_config"] - - assert hparams["architectures"][0] == "Gemma3ForConditionalGeneration" - assert text_config is not None - assert vision_config is not None - - self.gguf_writer.add_string ("clip.projector_type", "gemma3") - self.gguf_writer.add_bool ("clip.has_text_encoder", False) - self.gguf_writer.add_bool ("clip.has_vision_encoder", True) - self.gguf_writer.add_bool ("clip.has_llava_projector", False) # legacy - self.gguf_writer.add_uint32 ("clip.vision.image_size", vision_config["image_size"]) - self.gguf_writer.add_uint32 ("clip.vision.patch_size", vision_config["patch_size"]) - self.gguf_writer.add_uint32 ("clip.vision.embedding_length", vision_config["hidden_size"]) - self.gguf_writer.add_uint32 ("clip.vision.feed_forward_length", vision_config["intermediate_size"]) - self.gguf_writer.add_uint32 ("clip.vision.projection_dim", text_config["hidden_size"]) - self.gguf_writer.add_uint32 ("clip.vision.block_count", vision_config["num_hidden_layers"]) - self.gguf_writer.add_uint32 ("clip.vision.attention.head_count", vision_config["num_attention_heads"]) - self.gguf_writer.add_float32("clip.vision.attention.layer_norm_epsilon", vision_config.get("layer_norm_eps", 1e-6)) - # default values taken from HF tranformers code - self.gguf_writer.add_array ("clip.vision.image_mean", [0.5, 0.5, 0.5]) - self.gguf_writer.add_array ("clip.vision.image_std", [0.5, 0.5, 0.5]) - self.gguf_writer.add_bool ("clip.use_gelu", True) - - # load tensors - for name, data_torch in self.get_tensors(dir_model): - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - self.add_tensor(name, data_torch) - - def get_tensors(self, dir_model: Path) -> Iterator[tuple[str, Tensor]]: - part_names = Gemma3VisionTower.get_model_part_names(dir_model, "model", ".safetensors") - tensor_names_from_parts: set[str] = set() - for part_name in part_names: - logger.info(f"gguf: loading model part '{part_name}'") - from safetensors import safe_open - ctx = cast(ContextManager[Any], safe_open(dir_model 
/ part_name, framework="pt", device="cpu")) - with ctx as model_part: - tensor_names_from_parts.update(model_part.keys()) - - for name in model_part.keys(): - data = model_part.get_slice(name) - data = LazyTorchTensor.from_safetensors_slice(data) - yield name, data - - def add_tensor(self, name: str, data_torch: Tensor): - is_1d = len(data_torch.shape) == 1 - is_embd = ".embeddings." in name - old_dtype = data_torch.dtype - can_quantize = not is_1d and not is_embd - data_qtype = gguf.GGMLQuantizationType.F32 - - # this is to support old checkpoint - # TODO: remove this when we have the final model - name = name.replace("vision_model.vision_model.", "vision_tower.vision_model.") - name = name.replace("multimodal_projector.", "multi_modal_projector.") - - # filter only vision tensors - if not name.startswith("vision_tower.vision_model.") and not name.startswith("multi_modal_projector."): - return - # prefix - name = name.replace("vision_tower.vision_model.encoder.layers.", "v.blk.") - name = name.replace("vision_tower.vision_model.", "v.") - # projector and input embd - name = name.replace(".embeddings.patch_embedding.", ".patch_embd.") - name = name.replace(".embeddings.position_embedding.", ".position_embd.") - name = name.replace( - "multi_modal_projector.mm_input_projection_weight", - "mm.input_projection.weight" - ) - name = name.replace( - "multi_modal_projector.mm_soft_emb_norm.weight", - "mm.soft_emb_norm.weight" - ) - name = name.replace("post_layernorm.", "post_ln.") - # each block - name = name.replace(".self_attn.k_proj.", ".attn_k.") - name = name.replace(".self_attn.v_proj.", ".attn_v.") - name = name.replace(".self_attn.q_proj.", ".attn_q.") - name = name.replace(".self_attn.out_proj.", ".attn_out.") - name = name.replace(".layer_norm1.", ".ln1.") - name = name.replace(".layer_norm2.", ".ln2.") - name = name.replace(".mlp.fc1.", ".ffn_down.") - name = name.replace(".mlp.fc2.", ".ffn_up.") - - if can_quantize: - if self.ftype == gguf.LlamaFileType.ALL_F32: - data_qtype = gguf.GGMLQuantizationType.F32 - elif self.ftype == gguf.LlamaFileType.MOSTLY_F16: - data_qtype = gguf.GGMLQuantizationType.F16 - elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: - data_qtype = gguf.GGMLQuantizationType.BF16 - elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: - data_qtype = gguf.GGMLQuantizationType.Q8_0 - else: - raise ValueError(f"Unsupported file type: {self.ftype}") - - # corrent norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector - # the other norm values are part of SigLIP model, and they are already correct - # ref code: Gemma3RMSNorm - if "soft_emb_norm.weight" in name: - logger.info(f"Correcting norm value for '{name}'") - data_torch = data_torch + 1 - - data = data_torch.numpy() - - try: - data = gguf.quants.quantize(data, data_qtype) - except Exception as e: - logger.error(f"Error quantizing tensor '{name}': {e}, fallback to F16") - data_qtype = gguf.GGMLQuantizationType.F16 - data = gguf.quants.quantize(data, data_qtype) - - # reverse shape to make it similar to the internal ggml dimension order - shape_str = f"{{{', '.join(str(n) for n in reversed(data_torch.shape))}}}" - logger.info(f"{f'%-32s' % f'{name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") - - self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype) - - def write(self): - self.gguf_writer.write_header_to_file(path=self.fname_out) - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.write_tensors_to_file(progress=True) - self.gguf_writer.close() - -def 
parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - description="Convert Gemma 3 vision tower safetensors to GGUF format",) - parser.add_argument( - "--outfile", type=Path, default="mmproj.gguf", - help="path to write to", - ) - parser.add_argument( - "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0"], default="f16", - help="output format", - ) - parser.add_argument( - "--bigendian", action="store_true", - help="model is executed on big endian machine", - ) - parser.add_argument( - "model", type=Path, - help="directory containing model file", - nargs="?", - ) - parser.add_argument( - "--verbose", action="store_true", - help="increase output verbosity", - ) - - args = parser.parse_args() - if args.model is None: - parser.error("the following arguments are required: model") - return args - - -def main() -> None: - args = parse_args() - - if args.verbose: - logging.basicConfig(level=logging.DEBUG) - else: - logging.basicConfig(level=logging.INFO) - - dir_model = args.model - - if not dir_model.is_dir(): - logger.error(f'Error: {args.model} is not a directory') - sys.exit(1) - - ftype_map: dict[str, gguf.LlamaFileType] = { - "f32": gguf.LlamaFileType.ALL_F32, - "f16": gguf.LlamaFileType.MOSTLY_F16, - "bf16": gguf.LlamaFileType.MOSTLY_BF16, - "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, - } - - logger.info(f"Loading model: {dir_model.name}") - - with torch.inference_mode(): - gemma3_vision_tower = Gemma3VisionTower( - dir_model=dir_model, - fname_out=args.outfile, - ftype=ftype_map[args.outtype], - is_big_endian=args.bigendian, - ) - gemma3_vision_tower.write() - - -if __name__ == '__main__': - main() -