From e074d330317122eca0a9ab1459e56bf60b275830 Mon Sep 17 00:00:00 2001 From: himanshushukla12 Date: Tue, 29 Oct 2024 16:56:49 +0000 Subject: [PATCH 01/13] Added fix for issue 702 and added code for that as well, added instructions in LLM_finetuning_overview.md as well --- .../finetuning/LLM_finetuning_overview.md | 29 +++++ .../finetuning/code-merge-inference.py | 117 ++++++++++++++++++ 2 files changed, 146 insertions(+) create mode 100644 recipes/quickstart/finetuning/code-merge-inference.py diff --git a/recipes/quickstart/finetuning/LLM_finetuning_overview.md b/recipes/quickstart/finetuning/LLM_finetuning_overview.md index ca79bcb96..8133a0faa 100644 --- a/recipes/quickstart/finetuning/LLM_finetuning_overview.md +++ b/recipes/quickstart/finetuning/LLM_finetuning_overview.md @@ -62,3 +62,32 @@ To boost the performance of fine-tuning with FSDP, we can make use a number of f - **Activation Checkpointing** which is a technique to save memory by discarding the intermediate activation in forward pass instead of keeping it in the memory with the cost recomputing them in the backward pass. FSDP Activation checkpointing is shard aware meaning we need to apply it after wrapping the model with FSDP. In our script we are making use of that. - **auto_wrap_policy** Which is the way to specify how FSDP would partition the model, there is default support for transformer wrapping policy. This allows FSDP to form each FSDP unit ( partition of the model ) based on the transformer class in the model. To identify this layer in the model, you need to look at the layer that wraps both the attention layer and MLP. This helps FSDP have more fine-grained units for communication that help with optimizing the communication cost. + +### Inference + +After fine-tuning the model, you can use the `code-merge-inference.py` script to generate text from images. The script supports merging PEFT adapter weights from a specified path. + +#### Usage + +To run the inference script, use the following command: + +```bash +python code-merge-inference.py \ + --image_path "path/to/your/image.png" \ + --prompt_text "Your prompt text here" \ + --temperature 1 \ + --top_p 0.5 \ + --model_name "meta-llama/Llama-3.2-11B-Vision-Instruct" \ + --hf_token "your_hugging_face_token" \ + --finetuning_path "path/to/your/finetuned/model" +``` + +#### Script Details + +The `code-merge-inference.py` script performs the following steps: + +1. **Load Model and Processor**: Loads the pre-trained model and processor, and optionally loads PEFT adapter weights if specified. +2. **Process Image**: Opens and converts the input image. +3. **Generate Text**: Generates text from the image using the model and processor. + +For more details, refer to the `code-merge-inference.py` script. 
\ No newline at end of file diff --git a/recipes/quickstart/finetuning/code-merge-inference.py b/recipes/quickstart/finetuning/code-merge-inference.py new file mode 100644 index 000000000..bd7c11804 --- /dev/null +++ b/recipes/quickstart/finetuning/code-merge-inference.py @@ -0,0 +1,117 @@ +import os +import sys +import argparse +from PIL import Image as PIL_Image +import torch +from transformers import MllamaForConditionalGeneration, MllamaProcessor +from accelerate import Accelerator +from peft import PeftModel # Make sure to install the `peft` library + +accelerator = Accelerator() +device = accelerator.device + +# Constants +DEFAULT_MODEL = "meta-llama/Llama-3.2-11B-Vision-Instruct" + + +def load_model_and_processor(model_name: str, hf_token: str, finetuning_path: str = None): + """ + Load the model and processor, and optionally load adapter weights if specified. + """ + # Load pre-trained model and processor + model = MllamaForConditionalGeneration.from_pretrained( + model_name, + torch_dtype=torch.bfloat16, + use_safetensors=True, + device_map=device, + token=hf_token + ) + processor = MllamaProcessor.from_pretrained( + model_name, + token=hf_token, + use_safetensors=True + ) + + # If a finetuning path is provided, load the adapter model + if finetuning_path and os.path.exists(finetuning_path): + adapter_weights_path = os.path.join(finetuning_path, "adapter_model.safetensors") + adapter_config_path = os.path.join(finetuning_path, "adapter_config.json") + + if os.path.exists(adapter_weights_path) and os.path.exists(adapter_config_path): + print(f"Loading adapter from '{finetuning_path}'...") + # Load the model with adapters using `peft` + model = PeftModel.from_pretrained( + model, + finetuning_path, # This should be the folder containing the adapter files + is_adapter=True, + torch_dtype=torch.bfloat16 + ) + + print("Adapter merged successfully with the pre-trained model.") + else: + print(f"Adapter files not found in '{finetuning_path}'. Using pre-trained model only.") + else: + print(f"No fine-tuned weights or adapters found in '{finetuning_path}'. Using pre-trained model only.") + + # Prepare the model and processor for accelerated training + model, processor = accelerator.prepare(model, processor) + + return model, processor + + +def process_image(image_path: str) -> PIL_Image.Image: + """ + Open and convert an image from the specified path. + """ + if not os.path.exists(image_path): + print(f"The image file '{image_path}' does not exist.") + sys.exit(1) + with open(image_path, "rb") as f: + return PIL_Image.open(f).convert("RGB") + + +def generate_text_from_image(model, processor, image, prompt_text: str, temperature: float, top_p: float): + """ + Generate text from an image using the model and processor. + """ + conversation = [ + {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]} + ] + prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) + inputs = processor(image, prompt, return_tensors="pt").to(device) + output = model.generate(**inputs, temperature=temperature, top_p=top_p, max_new_tokens=2048) + return processor.decode(output[0])[len(prompt):] + + +def main(image_path: str, prompt_text: str, temperature: float, top_p: float, model_name: str, hf_token: str, finetuning_path: str = None): + """ + Call all the functions and optionally merge adapter weights from a specified path. 
+ """ + model, processor = load_model_and_processor(model_name, hf_token, finetuning_path) + image = process_image(image_path) + result = generate_text_from_image(model, processor, image, prompt_text, temperature, top_p) + print("Generated Text: " + result) + + +if __name__ == "__main__": + # Example usage with argparse (optional) + parser = argparse.ArgumentParser(description="Generate text from an image using a fine-tuned model with adapters.") + parser.add_argument("--image_path", type=str, required=True, help="Path to the input image.") + parser.add_argument("--prompt_text", type=str, required=True, help="Prompt text for the image.") + parser.add_argument("--temperature", type=float, default=0.7, help="Sampling temperature.") + parser.add_argument("--top_p", type=float, default=0.9, help="Top-p sampling.") + parser.add_argument("--model_name", type=str, default=DEFAULT_MODEL, help="Pre-trained model name.") + parser.add_argument("--hf_token", type=str, required=True, help="Hugging Face API token.") + parser.add_argument("--finetuning_path", type=str, help="Path to the fine-tuning weights (adapters).") + + args = parser.parse_args() + + main( + image_path=args.image_path, + prompt_text=args.prompt_text, + temperature=args.temperature, + top_p=args.top_p, + model_name=args.model_name, + hf_token=args.hf_token, + finetuning_path=args.finetuning_path + ) From 5250a20684fc424134d2e796b8cfe32f9696a3b6 Mon Sep 17 00:00:00 2001 From: Himanshu Shukla Date: Wed, 30 Oct 2024 18:55:57 +0000 Subject: [PATCH 02/13] Move details of loading lora checkpoints from finetuning/LLM_finetuning_overview.md to local_inference/README.md --- .../finetuning/LLM_finetuning_overview.md | 31 +------------------ .../inference/local_inference/README.md | 29 +++++++++++++++++ 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/recipes/quickstart/finetuning/LLM_finetuning_overview.md b/recipes/quickstart/finetuning/LLM_finetuning_overview.md index 8133a0faa..34e79ff35 100644 --- a/recipes/quickstart/finetuning/LLM_finetuning_overview.md +++ b/recipes/quickstart/finetuning/LLM_finetuning_overview.md @@ -61,33 +61,4 @@ To boost the performance of fine-tuning with FSDP, we can make use a number of f - **Activation Checkpointing** which is a technique to save memory by discarding the intermediate activation in forward pass instead of keeping it in the memory with the cost recomputing them in the backward pass. FSDP Activation checkpointing is shard aware meaning we need to apply it after wrapping the model with FSDP. In our script we are making use of that. -- **auto_wrap_policy** Which is the way to specify how FSDP would partition the model, there is default support for transformer wrapping policy. This allows FSDP to form each FSDP unit ( partition of the model ) based on the transformer class in the model. To identify this layer in the model, you need to look at the layer that wraps both the attention layer and MLP. This helps FSDP have more fine-grained units for communication that help with optimizing the communication cost. - -### Inference - -After fine-tuning the model, you can use the `code-merge-inference.py` script to generate text from images. The script supports merging PEFT adapter weights from a specified path. 
- -#### Usage - -To run the inference script, use the following command: - -```bash -python code-merge-inference.py \ - --image_path "path/to/your/image.png" \ - --prompt_text "Your prompt text here" \ - --temperature 1 \ - --top_p 0.5 \ - --model_name "meta-llama/Llama-3.2-11B-Vision-Instruct" \ - --hf_token "your_hugging_face_token" \ - --finetuning_path "path/to/your/finetuned/model" -``` - -#### Script Details - -The `code-merge-inference.py` script performs the following steps: - -1. **Load Model and Processor**: Loads the pre-trained model and processor, and optionally loads PEFT adapter weights if specified. -2. **Process Image**: Opens and converts the input image. -3. **Generate Text**: Generates text from the image using the model and processor. - -For more details, refer to the `code-merge-inference.py` script. \ No newline at end of file +- **auto_wrap_policy** Which is the way to specify how FSDP would partition the model, there is default support for transformer wrapping policy. This allows FSDP to form each FSDP unit ( partition of the model ) based on the transformer class in the model. To identify this layer in the model, you need to look at the layer that wraps both the attention layer and MLP. This helps FSDP have more fine-grained units for communication that help with optimizing the communication cost. \ No newline at end of file diff --git a/recipes/quickstart/inference/local_inference/README.md b/recipes/quickstart/inference/local_inference/README.md index 0bf2ad9d7..d86035015 100644 --- a/recipes/quickstart/inference/local_inference/README.md +++ b/recipes/quickstart/inference/local_inference/README.md @@ -114,3 +114,32 @@ python inference.py --model_name --prompt_file Date: Wed, 30 Oct 2024 18:59:47 +0000 Subject: [PATCH 03/13] Moved the file code-merge-inference.py from fine-tuning firectory to local_inference --- .../local_inference}/code-merge-inference.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename recipes/quickstart/{finetuning => inference/local_inference}/code-merge-inference.py (100%) diff --git a/recipes/quickstart/finetuning/code-merge-inference.py b/recipes/quickstart/inference/local_inference/code-merge-inference.py similarity index 100% rename from recipes/quickstart/finetuning/code-merge-inference.py rename to recipes/quickstart/inference/local_inference/code-merge-inference.py From 6b1c0d582ba511b1ec8c73beecc4eed8eaec2978 Mon Sep 17 00:00:00 2001 From: Himanshu Shukla Date: Sat, 2 Nov 2024 17:50:14 +0000 Subject: [PATCH 04/13] added working in single file for 1. terminal inferencing, 2. gradio inferencing, 3. 
checkpoint inferencing --- .../local_inference/multi_modal_infer.py | 169 ++++++++++-------- 1 file changed, 96 insertions(+), 73 deletions(-) diff --git a/recipes/quickstart/inference/local_inference/multi_modal_infer.py b/recipes/quickstart/inference/local_inference/multi_modal_infer.py index 27d45b5f1..5d2c69ddd 100644 --- a/recipes/quickstart/inference/local_inference/multi_modal_infer.py +++ b/recipes/quickstart/inference/local_inference/multi_modal_infer.py @@ -1,108 +1,131 @@ import argparse import os import sys - import torch from accelerate import Accelerator from PIL import Image as PIL_Image from transformers import MllamaForConditionalGeneration, MllamaProcessor +from peft import PeftModel +import gradio as gr +# Initialize accelerator accelerator = Accelerator() - device = accelerator.device # Constants DEFAULT_MODEL = "meta-llama/Llama-3.2-11B-Vision-Instruct" +MAX_OUTPUT_TOKENS = 2048 +MAX_IMAGE_SIZE = (1120, 1120) - -def load_model_and_processor(model_name: str): - """ - Load the model and processor based on the 11B or 90B model. - """ +def load_model_and_processor(model_name: str, hf_token: str = None, finetuning_path: str = None): + """Load model and processor with optional LoRA adapter""" model = MllamaForConditionalGeneration.from_pretrained( model_name, torch_dtype=torch.bfloat16, use_safetensors=True, device_map=device, + token=hf_token ) - processor = MllamaProcessor.from_pretrained(model_name, use_safetensors=True) - + processor = MllamaProcessor.from_pretrained(model_name, token=hf_token, use_safetensors=True) + + if finetuning_path and os.path.exists(finetuning_path): + print(f"Loading adapter from '{finetuning_path}'...") + model = PeftModel.from_pretrained( + model, + finetuning_path, + is_adapter=True, + torch_dtype=torch.bfloat16 + ) + print("Adapter merged successfully") + model, processor = accelerator.prepare(model, processor) return model, processor - def process_image(image_path: str) -> PIL_Image.Image: - """ - Open and convert an image from the specified path. - """ + """Process and validate image input""" if not os.path.exists(image_path): - print(f"The image file '{image_path}' does not exist.") + print(f"Image file '{image_path}' does not exist.") sys.exit(1) - with open(image_path, "rb") as f: - return PIL_Image.open(f).convert("RGB") + return PIL_Image.open(image_path).convert("RGB") - -def generate_text_from_image( - model, processor, image, prompt_text: str, temperature: float, top_p: float -): - """ - Generate text from an image using the model and processor. 
- """ +def generate_text_from_image(model, processor, image, prompt_text: str, temperature: float, top_p: float): + """Generate text from image using model""" conversation = [ - { - "role": "user", - "content": [{"type": "image"}, {"type": "text", "text": prompt_text}], - } + {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]} ] - prompt = processor.apply_chat_template( - conversation, add_generation_prompt=True, tokenize=False - ) + prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) inputs = processor(image, prompt, return_tensors="pt").to(device) - output = model.generate( - **inputs, temperature=temperature, top_p=top_p, max_new_tokens=512 + output = model.generate(**inputs, temperature=temperature, top_p=top_p, max_new_tokens=MAX_OUTPUT_TOKENS) + return processor.decode(output[0])[len(prompt):] + +def gradio_interface(model, processor): + """Create Gradio UI""" + def describe_image(image, user_prompt, temperature, top_k, top_p, max_tokens, history): + if image is not None: + image = image.resize(MAX_IMAGE_SIZE) + result = generate_text_from_image(model, processor, image, user_prompt, temperature, top_p) + history.append((user_prompt, result)) + return history + + def clear_chat(): + return [] + + with gr.Blocks() as demo: + gr.HTML("
<h1 style='text-align: center'>Llama Vision Model Interface</h1>
") + + with gr.Row(): + with gr.Column(scale=1): + image_input = gr.Image(label="Image", type="pil", image_mode="RGB", height=512, width=512) + temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, value=0.6, step=0.1) + top_k = gr.Slider(label="Top-k", minimum=1, maximum=100, value=50, step=1) + top_p = gr.Slider(label="Top-p", minimum=0.1, maximum=1.0, value=0.9, step=0.1) + max_tokens = gr.Slider(label="Max Tokens", minimum=50, maximum=MAX_OUTPUT_TOKENS, value=100, step=50) + + with gr.Column(scale=2): + chat_history = gr.Chatbot(label="Chat", height=512) + user_prompt = gr.Textbox(show_label=False, placeholder="Enter your prompt", lines=2) + + with gr.Row(): + generate_button = gr.Button("Generate") + clear_button = gr.Button("Clear") + + generate_button.click( + fn=describe_image, + inputs=[image_input, user_prompt, temperature, top_k, top_p, max_tokens, chat_history], + outputs=[chat_history] + ) + clear_button.click(fn=clear_chat, outputs=[chat_history]) + + return demo + +def main(args): + """Main execution flow""" + model, processor = load_model_and_processor( + args.model_name, + args.hf_token, + args.finetuning_path ) - return processor.decode(output[0])[len(prompt) :] - - -def main( - image_path: str, prompt_text: str, temperature: float, top_p: float, model_name: str -): - """ - Call all the functions. - """ - model, processor = load_model_and_processor(model_name) - image = process_image(image_path) - result = generate_text_from_image( - model, processor, image, prompt_text, temperature, top_p - ) - print("Generated Text: " + result) + if args.gradio_ui: + demo = gradio_interface(model, processor) + demo.launch() + else: + image = process_image(args.image_path) + result = generate_text_from_image( + model, processor, image, args.prompt_text, args.temperature, args.top_p + ) + print("Generated Text:", result) if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Generate text from an image and prompt using the 3.2 MM Llama model." 
- ) - parser.add_argument("--image_path", type=str, help="Path to the image file") - parser.add_argument( - "--prompt_text", type=str, help="Prompt text to describe the image" - ) - parser.add_argument( - "--temperature", - type=float, - default=0.7, - help="Temperature for generation (default: 0.7)", - ) - parser.add_argument( - "--top_p", type=float, default=0.9, help="Top p for generation (default: 0.9)" - ) - parser.add_argument( - "--model_name", - type=str, - default=DEFAULT_MODEL, - help=f"Model name (default: '{DEFAULT_MODEL}')", - ) - + parser = argparse.ArgumentParser(description="Multi-modal inference with optional Gradio UI and LoRA support") + parser.add_argument("--image_path", type=str, help="Path to the input image") + parser.add_argument("--prompt_text", type=str, help="Prompt text for the image") + parser.add_argument("--temperature", type=float, default=0.7, help="Sampling temperature") + parser.add_argument("--top_p", type=float, default=0.9, help="Top-p sampling") + parser.add_argument("--model_name", type=str, default=DEFAULT_MODEL, help="Model name") + parser.add_argument("--hf_token", type=str, help="Hugging Face API token") + parser.add_argument("--finetuning_path", type=str, help="Path to LoRA weights") + parser.add_argument("--gradio_ui", action="store_true", help="Launch Gradio UI") + args = parser.parse_args() - main( - args.image_path, args.prompt_text, args.temperature, args.top_p, args.model_name - ) + main(args) \ No newline at end of file From 0e2703c5bd116b2f8daae4038806e9d286f0967f Mon Sep 17 00:00:00 2001 From: Himanshu Shukla Date: Sat, 2 Nov 2024 17:59:42 +0000 Subject: [PATCH 05/13] Added complete inferencing functionality of 1. terminal inferencing, 2. gradio inferencing, 3. checkpoint inferencing in UI/CLI --- .../inference/local_inference/README.md | 75 ++++----- .../local_inference/multi_modal_infer.py | 131 --------------- .../multi_modal_infer_gradio_UI.py | 157 ------------------ 3 files changed, 33 insertions(+), 330 deletions(-) delete mode 100644 recipes/quickstart/inference/local_inference/multi_modal_infer.py delete mode 100644 recipes/quickstart/inference/local_inference/multi_modal_infer_gradio_UI.py diff --git a/recipes/quickstart/inference/local_inference/README.md b/recipes/quickstart/inference/local_inference/README.md index d86035015..5659077fc 100644 --- a/recipes/quickstart/inference/local_inference/README.md +++ b/recipes/quickstart/inference/local_inference/README.md @@ -3,26 +3,46 @@ ## Hugging face setup **Important Note**: Before running the inference, you'll need your Hugging Face access token, which you can get at your Settings page [here](https://huggingface.co/settings/tokens). Then run `huggingface-cli login` and copy and paste your Hugging Face access token to complete the login to make sure the scripts can download Hugging Face models if needed. -## Multimodal Inference -For Multi-Modal inference we have added [multi_modal_infer.py](multi_modal_infer.py) which uses the transformers library. +## Multimodal Inference and CLI inference with or without PEFT LoRA weights -The way to run this would be: -``` -python multi_modal_infer.py --image_path PATH_TO_IMAGE --prompt_text "Describe this image" --temperature 0.5 --top_p 0.8 --model_name "meta-llama/Llama-3.2-11B-Vision-Instruct" -``` ---- -## Multi-modal Inferencing Using gradio UI for inferencing -For multi-modal inferencing using gradio UI we have added [multi_modal_infer_gradio_UI.py](multi_modal_infer_gradio_UI.py) which used gradio and transformers library. 
+### Model Overview +- Base model: `meta-llama/Llama-3.2-11B-Vision-Instruct` +- Uses PEFT library (v0.13.1) for efficient fine-tuning +- Supports vision-language tasks with instruction capabilities -### Steps to Run +### Features in +`multi_modal_infer.py` -The way to run this would be: -- Ensure having proper access to llama 3.2 vision models, then run the command given below +All functionality has been consolidated into a single file with three main modes: +### Steops to run are given below: +1. **Basic Inference** +```bash +python multi_modal_infer.py \ + --image_path "path/to/image.jpg" \ + --prompt_text "Describe this image" \ + --model_name "meta-llama/Llama-3.2-11B-Vision-Instruct" \ + --hf_token "your_token" +``` +2. **Gradio UI Mode** +```bash +python multi_modal_infer.py \ + --model_name "meta-llama/Llama-3.2-11B-Vision-Instruct" \ + --hf_token "your_token" \ + --gradio_ui ``` -python multi_modal_infer_gradio_UI.py --hf_token + +3. **LoRA Fine-tuning Integration** +```bash +python multi_modal_infer.py \ + --image_path "path/to/image.jpg" \ + --prompt_text "Describe this image" \ + --model_name "meta-llama/Llama-3.2-11B-Vision-Instruct" \ + --hf_token "your_token" \ + --finetuning_path "path/to/lora/weights" ``` + ## Text-only Inference For local inference we have provided an [inference script](inference.py). Depending on the type of finetuning performed during training the [inference script](inference.py) takes different arguments. @@ -114,32 +134,3 @@ python inference.py --model_name --prompt_file PIL_Image.Image: - """Process and validate image input""" - if not os.path.exists(image_path): - print(f"Image file '{image_path}' does not exist.") - sys.exit(1) - return PIL_Image.open(image_path).convert("RGB") - -def generate_text_from_image(model, processor, image, prompt_text: str, temperature: float, top_p: float): - """Generate text from image using model""" - conversation = [ - {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]} - ] - prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) - inputs = processor(image, prompt, return_tensors="pt").to(device) - output = model.generate(**inputs, temperature=temperature, top_p=top_p, max_new_tokens=MAX_OUTPUT_TOKENS) - return processor.decode(output[0])[len(prompt):] - -def gradio_interface(model, processor): - """Create Gradio UI""" - def describe_image(image, user_prompt, temperature, top_k, top_p, max_tokens, history): - if image is not None: - image = image.resize(MAX_IMAGE_SIZE) - result = generate_text_from_image(model, processor, image, user_prompt, temperature, top_p) - history.append((user_prompt, result)) - return history - - def clear_chat(): - return [] - - with gr.Blocks() as demo: - gr.HTML("
<h1 style='text-align: center'>Llama Vision Model Interface</h1>
") - - with gr.Row(): - with gr.Column(scale=1): - image_input = gr.Image(label="Image", type="pil", image_mode="RGB", height=512, width=512) - temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, value=0.6, step=0.1) - top_k = gr.Slider(label="Top-k", minimum=1, maximum=100, value=50, step=1) - top_p = gr.Slider(label="Top-p", minimum=0.1, maximum=1.0, value=0.9, step=0.1) - max_tokens = gr.Slider(label="Max Tokens", minimum=50, maximum=MAX_OUTPUT_TOKENS, value=100, step=50) - - with gr.Column(scale=2): - chat_history = gr.Chatbot(label="Chat", height=512) - user_prompt = gr.Textbox(show_label=False, placeholder="Enter your prompt", lines=2) - - with gr.Row(): - generate_button = gr.Button("Generate") - clear_button = gr.Button("Clear") - - generate_button.click( - fn=describe_image, - inputs=[image_input, user_prompt, temperature, top_k, top_p, max_tokens, chat_history], - outputs=[chat_history] - ) - clear_button.click(fn=clear_chat, outputs=[chat_history]) - - return demo - -def main(args): - """Main execution flow""" - model, processor = load_model_and_processor( - args.model_name, - args.hf_token, - args.finetuning_path - ) - - if args.gradio_ui: - demo = gradio_interface(model, processor) - demo.launch() - else: - image = process_image(args.image_path) - result = generate_text_from_image( - model, processor, image, args.prompt_text, args.temperature, args.top_p - ) - print("Generated Text:", result) - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Multi-modal inference with optional Gradio UI and LoRA support") - parser.add_argument("--image_path", type=str, help="Path to the input image") - parser.add_argument("--prompt_text", type=str, help="Prompt text for the image") - parser.add_argument("--temperature", type=float, default=0.7, help="Sampling temperature") - parser.add_argument("--top_p", type=float, default=0.9, help="Top-p sampling") - parser.add_argument("--model_name", type=str, default=DEFAULT_MODEL, help="Model name") - parser.add_argument("--hf_token", type=str, help="Hugging Face API token") - parser.add_argument("--finetuning_path", type=str, help="Path to LoRA weights") - parser.add_argument("--gradio_ui", action="store_true", help="Launch Gradio UI") - - args = parser.parse_args() - main(args) \ No newline at end of file diff --git a/recipes/quickstart/inference/local_inference/multi_modal_infer_gradio_UI.py b/recipes/quickstart/inference/local_inference/multi_modal_infer_gradio_UI.py deleted file mode 100644 index 5119ac7c3..000000000 --- a/recipes/quickstart/inference/local_inference/multi_modal_infer_gradio_UI.py +++ /dev/null @@ -1,157 +0,0 @@ -import gradio as gr -import torch -import os -from PIL import Image -from accelerate import Accelerator -from transformers import MllamaForConditionalGeneration, AutoProcessor -import argparse # Import argparse - -# Parse the command line arguments -parser = argparse.ArgumentParser(description="Run Gradio app with Hugging Face model") -parser.add_argument("--hf_token", type=str, required=True, help="Hugging Face authentication token") -args = parser.parse_args() - -# Hugging Face token -hf_token = args.hf_token - -# Initialize Accelerator -accelerate = Accelerator() -device = accelerate.device - -# Set memory management for PyTorch -os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128' # or adjust size as needed - -# Model ID -model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct" - -# Load model with the Hugging Face token -model = 
MllamaForConditionalGeneration.from_pretrained( - model_id, - torch_dtype=torch.bfloat16, - device_map=device, - use_auth_token=hf_token # Pass the Hugging Face token here -) - -# Load the processor -processor = AutoProcessor.from_pretrained(model_id, use_auth_token=hf_token) - -# Visual theme -visual_theme = gr.themes.Default() # Default, Soft or Monochrome - -# Constants -MAX_OUTPUT_TOKENS = 2048 -MAX_IMAGE_SIZE = (1120, 1120) - -# Function to process the image and generate a description -def describe_image(image, user_prompt, temperature, top_k, top_p, max_tokens, history): - # Initialize cleaned_output variable - cleaned_output = "" - - if image is not None: - # Resize image if necessary - image = image.resize(MAX_IMAGE_SIZE) - prompt = f"<|image|><|begin_of_text|>{user_prompt} Answer:" - # Preprocess the image and prompt - inputs = processor(image, prompt, return_tensors="pt").to(device) - else: - # Text-only input if no image is provided - prompt = f"<|begin_of_text|>{user_prompt} Answer:" - # Preprocess the prompt only (no image) - inputs = processor(prompt, return_tensors="pt").to(device) - - # Generate output with model - output = model.generate( - **inputs, - max_new_tokens=min(max_tokens, MAX_OUTPUT_TOKENS), - temperature=temperature, - top_k=top_k, - top_p=top_p - ) - - # Decode the raw output - raw_output = processor.decode(output[0]) - - # Clean up the output to remove system tokens - cleaned_output = raw_output.replace("<|image|><|begin_of_text|>", "").strip().replace(" Answer:", "") - - # Ensure the prompt is not repeated in the output - if cleaned_output.startswith(user_prompt): - cleaned_output = cleaned_output[len(user_prompt):].strip() - - # Append the new conversation to the history - history.append((user_prompt, cleaned_output)) - - return history - - -# Function to clear the chat history -def clear_chat(): - return [] - -# Gradio Interface -def gradio_interface(): - with gr.Blocks(visual_theme) as demo: - gr.HTML( - """ -
<h1 style='text-align: center'>
-    meta-llama/Llama-3.2-11B-Vision-Instruct
-    </h1>
- """) - with gr.Row(): - # Left column with image and parameter inputs - with gr.Column(scale=1): - image_input = gr.Image( - label="Image", - type="pil", - image_mode="RGB", - height=512, # Set the height - width=512 # Set the width - ) - - # Parameter sliders - temperature = gr.Slider( - label="Temperature", minimum=0.1, maximum=1.0, value=0.6, step=0.1, interactive=True) - top_k = gr.Slider( - label="Top-k", minimum=1, maximum=100, value=50, step=1, interactive=True) - top_p = gr.Slider( - label="Top-p", minimum=0.1, maximum=1.0, value=0.9, step=0.1, interactive=True) - max_tokens = gr.Slider( - label="Max Tokens", minimum=50, maximum=MAX_OUTPUT_TOKENS, value=100, step=50, interactive=True) - - # Right column with the chat interface - with gr.Column(scale=2): - chat_history = gr.Chatbot(label="Chat", height=512) - - # User input box for prompt - user_prompt = gr.Textbox( - show_label=False, - container=False, - placeholder="Enter your prompt", - lines=2 - ) - - # Generate and Clear buttons - with gr.Row(): - generate_button = gr.Button("Generate") - clear_button = gr.Button("Clear") - - # Define the action for the generate button - generate_button.click( - fn=describe_image, - inputs=[image_input, user_prompt, temperature, top_k, top_p, max_tokens, chat_history], - outputs=[chat_history] - ) - - # Define the action for the clear button - clear_button.click( - fn=clear_chat, - inputs=[], - outputs=[chat_history] - ) - - return demo - -# Launch the interface -demo = gradio_interface() -# demo.launch(server_name="0.0.0.0", server_port=12003) -demo.launch() \ No newline at end of file From 95f42eeed3dc40a8df0e9e2c209cae407373bcde Mon Sep 17 00:00:00 2001 From: Himanshu Shukla Date: Sat, 2 Nov 2024 18:04:47 +0000 Subject: [PATCH 06/13] Fixed spelling mistake --- recipes/quickstart/inference/local_inference/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/quickstart/inference/local_inference/README.md b/recipes/quickstart/inference/local_inference/README.md index 5659077fc..3a1b0590f 100644 --- a/recipes/quickstart/inference/local_inference/README.md +++ b/recipes/quickstart/inference/local_inference/README.md @@ -14,7 +14,7 @@ `multi_modal_infer.py` All functionality has been consolidated into a single file with three main modes: -### Steops to run are given below: +### Steps to run are given below: 1. 
**Basic Inference** ```bash python multi_modal_infer.py \ From 243332539ede165c2c1d22310316dedd9108262d Mon Sep 17 00:00:00 2001 From: Himanshu Shukla Date: Sat, 2 Nov 2024 21:11:10 +0000 Subject: [PATCH 07/13] Renamed file from code-merge-inference.py to multi_modal_infer.py --- .../{code-merge-inference.py => multi_modal_infer.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename recipes/quickstart/inference/local_inference/{code-merge-inference.py => multi_modal_infer.py} (100%) diff --git a/recipes/quickstart/inference/local_inference/code-merge-inference.py b/recipes/quickstart/inference/local_inference/multi_modal_infer.py similarity index 100% rename from recipes/quickstart/inference/local_inference/code-merge-inference.py rename to recipes/quickstart/inference/local_inference/multi_modal_infer.py From 20dd4740569b68a69650807be2ec246ac187a309 Mon Sep 17 00:00:00 2001 From: Himanshu Shukla Date: Sat, 2 Nov 2024 21:51:12 +0000 Subject: [PATCH 08/13] added working code of CLI/gradio UI/ LoRA weights merge --- .../quickstart/inference/local_inference/multi_modal_infer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/quickstart/inference/local_inference/multi_modal_infer.py b/recipes/quickstart/inference/local_inference/multi_modal_infer.py index bd7c11804..5459f2ced 100644 --- a/recipes/quickstart/inference/local_inference/multi_modal_infer.py +++ b/recipes/quickstart/inference/local_inference/multi_modal_infer.py @@ -16,7 +16,7 @@ def load_model_and_processor(model_name: str, hf_token: str, finetuning_path: str = None): """ - Load the model and processor, and optionally load adapter weights if specified. + Load the model and processor, and optionally load adapter weights if specified """ # Load pre-trained model and processor model = MllamaForConditionalGeneration.from_pretrained( From 6b6bb37ecd9f1ef9f877e10d3e1209cd22e2b465 Mon Sep 17 00:00:00 2001 From: Himanshu Shukla Date: Sat, 2 Nov 2024 22:08:00 +0000 Subject: [PATCH 09/13] fixed gradio UI during performing the tests, it is working in this commit --- .../local_inference/multi_modal_infer.py | 242 ++++++++++++------ 1 file changed, 158 insertions(+), 84 deletions(-) diff --git a/recipes/quickstart/inference/local_inference/multi_modal_infer.py b/recipes/quickstart/inference/local_inference/multi_modal_infer.py index 5459f2ced..a92482c3c 100644 --- a/recipes/quickstart/inference/local_inference/multi_modal_infer.py +++ b/recipes/quickstart/inference/local_inference/multi_modal_infer.py @@ -1,117 +1,191 @@ +import argparse import os import sys -import argparse -from PIL import Image as PIL_Image import torch -from transformers import MllamaForConditionalGeneration, MllamaProcessor from accelerate import Accelerator -from peft import PeftModel # Make sure to install the `peft` library +from PIL import Image as PIL_Image +from transformers import MllamaForConditionalGeneration, MllamaProcessor +from peft import PeftModel +import gradio as gr +# Initialize accelerator accelerator = Accelerator() device = accelerator.device # Constants DEFAULT_MODEL = "meta-llama/Llama-3.2-11B-Vision-Instruct" +MAX_OUTPUT_TOKENS = 2048 +MAX_IMAGE_SIZE = (1120, 1120) - -def load_model_and_processor(model_name: str, hf_token: str, finetuning_path: str = None): - """ - Load the model and processor, and optionally load adapter weights if specified - """ - # Load pre-trained model and processor +def load_model_and_processor(model_name: str, hf_token: str = None, finetuning_path: str = None): + """Load model and processor 
with optional LoRA adapter""" + print(f"Loading model: {model_name}") model = MllamaForConditionalGeneration.from_pretrained( - model_name, - torch_dtype=torch.bfloat16, - use_safetensors=True, + model_name, + torch_dtype=torch.bfloat16, + use_safetensors=True, device_map=device, token=hf_token ) - processor = MllamaProcessor.from_pretrained( - model_name, - token=hf_token, - use_safetensors=True - ) + processor = MllamaProcessor.from_pretrained(model_name, token=hf_token, use_safetensors=True) - # If a finetuning path is provided, load the adapter model if finetuning_path and os.path.exists(finetuning_path): - adapter_weights_path = os.path.join(finetuning_path, "adapter_model.safetensors") - adapter_config_path = os.path.join(finetuning_path, "adapter_config.json") - - if os.path.exists(adapter_weights_path) and os.path.exists(adapter_config_path): - print(f"Loading adapter from '{finetuning_path}'...") - # Load the model with adapters using `peft` - model = PeftModel.from_pretrained( - model, - finetuning_path, # This should be the folder containing the adapter files - is_adapter=True, - torch_dtype=torch.bfloat16 - ) - - print("Adapter merged successfully with the pre-trained model.") - else: - print(f"Adapter files not found in '{finetuning_path}'. Using pre-trained model only.") - else: - print(f"No fine-tuned weights or adapters found in '{finetuning_path}'. Using pre-trained model only.") - - # Prepare the model and processor for accelerated training - model, processor = accelerator.prepare(model, processor) + print(f"Loading LoRA adapter from '{finetuning_path}'...") + model = PeftModel.from_pretrained( + model, + finetuning_path, + is_adapter=True, + torch_dtype=torch.bfloat16 + ) + print("LoRA adapter merged successfully") + model, processor = accelerator.prepare(model, processor) return model, processor - -def process_image(image_path: str) -> PIL_Image.Image: - """ - Open and convert an image from the specified path. - """ - if not os.path.exists(image_path): - print(f"The image file '{image_path}' does not exist.") - sys.exit(1) - with open(image_path, "rb") as f: - return PIL_Image.open(f).convert("RGB") - +def process_image(image_path: str = None, image = None) -> PIL_Image.Image: + """Process and validate image input""" + if image is not None: + return image.convert("RGB") + if image_path and os.path.exists(image_path): + return PIL_Image.open(image_path).convert("RGB") + raise ValueError("No valid image provided") def generate_text_from_image(model, processor, image, prompt_text: str, temperature: float, top_p: float): - """ - Generate text from an image using the model and processor. - """ + """Generate text from image using model""" conversation = [ {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]} ] prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) inputs = processor(image, prompt, return_tensors="pt").to(device) - output = model.generate(**inputs, temperature=temperature, top_p=top_p, max_new_tokens=2048) + output = model.generate(**inputs, temperature=temperature, top_p=top_p, max_new_tokens=MAX_OUTPUT_TOKENS) return processor.decode(output[0])[len(prompt):] - -def main(image_path: str, prompt_text: str, temperature: float, top_p: float, model_name: str, hf_token: str, finetuning_path: str = None): - """ - Call all the functions and optionally merge adapter weights from a specified path. 
- """ - model, processor = load_model_and_processor(model_name, hf_token, finetuning_path) - image = process_image(image_path) - result = generate_text_from_image(model, processor, image, prompt_text, temperature, top_p) - print("Generated Text: " + result) - +def gradio_interface(model_name: str, hf_token: str): + """Create Gradio UI with LoRA support""" + # Initialize model state + current_model = {"model": None, "processor": None} + + def load_or_reload_model(enable_lora: bool, lora_path: str = None): + current_model["model"], current_model["processor"] = load_model_and_processor( + model_name, + hf_token, + lora_path if enable_lora else None + ) + return "Model loaded successfully" + (" with LoRA" if enable_lora else "") + + def describe_image(image, user_prompt, temperature, top_k, top_p, max_tokens, history): + if image is not None: + try: + processed_image = process_image(image=image) + result = generate_text_from_image( + current_model["model"], + current_model["processor"], + processed_image, + user_prompt, + temperature, + top_p + ) + history.append((user_prompt, result)) + except Exception as e: + history.append((user_prompt, f"Error: {str(e)}")) + return history + + def clear_chat(): + return [] + + with gr.Blocks() as demo: + gr.HTML("
<h1 style='text-align: center'>Llama Vision Model Interface</h1>
") + + with gr.Row(): + with gr.Column(scale=1): + # Model loading controls + with gr.Group(): + enable_lora = gr.Checkbox(label="Enable LoRA", value=False) + lora_path = gr.Textbox( + label="LoRA Weights Path", + placeholder="Path to LoRA weights folder", + visible=False + ) + load_status = gr.Textbox(label="Load Status", interactive=False) + load_button = gr.Button("Load/Reload Model") + + # Image and parameter controls + image_input = gr.Image(label="Image", type="pil", image_mode="RGB", height=512, width=512) + temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, value=0.6, step=0.1) + top_k = gr.Slider(label="Top-k", minimum=1, maximum=100, value=50, step=1) + top_p = gr.Slider(label="Top-p", minimum=0.1, maximum=1.0, value=0.9, step=0.1) + max_tokens = gr.Slider(label="Max Tokens", minimum=50, maximum=MAX_OUTPUT_TOKENS, value=100, step=50) + + with gr.Column(scale=2): + chat_history = gr.Chatbot(label="Chat", height=512) + user_prompt = gr.Textbox( + show_label=False, + placeholder="Enter your prompt", + lines=2 + ) + + with gr.Row(): + generate_button = gr.Button("Generate") + clear_button = gr.Button("Clear") + + # Event handlers + enable_lora.change( + fn=lambda x: gr.update(visible=x), + inputs=[enable_lora], + outputs=[lora_path] + ) + + load_button.click( + fn=load_or_reload_model, + inputs=[enable_lora, lora_path], + outputs=[load_status] + ) + + generate_button.click( + fn=describe_image, + inputs=[ + image_input, user_prompt, temperature, + top_k, top_p, max_tokens, chat_history + ], + outputs=[chat_history] + ) + + clear_button.click(fn=clear_chat, outputs=[chat_history]) + + # Initial model load + load_or_reload_model(False) + return demo + +def main(args): + """Main execution flow""" + if args.gradio_ui: + demo = gradio_interface(args.model_name, args.hf_token) + demo.launch() + else: + model, processor = load_model_and_processor( + args.model_name, + args.hf_token, + args.finetuning_path + ) + image = process_image(image_path=args.image_path) + result = generate_text_from_image( + model, processor, image, + args.prompt_text, + args.temperature, + args.top_p + ) + print("Generated Text:", result) if __name__ == "__main__": - # Example usage with argparse (optional) - parser = argparse.ArgumentParser(description="Generate text from an image using a fine-tuned model with adapters.") - parser.add_argument("--image_path", type=str, required=True, help="Path to the input image.") - parser.add_argument("--prompt_text", type=str, required=True, help="Prompt text for the image.") - parser.add_argument("--temperature", type=float, default=0.7, help="Sampling temperature.") - parser.add_argument("--top_p", type=float, default=0.9, help="Top-p sampling.") - parser.add_argument("--model_name", type=str, default=DEFAULT_MODEL, help="Pre-trained model name.") - parser.add_argument("--hf_token", type=str, required=True, help="Hugging Face API token.") - parser.add_argument("--finetuning_path", type=str, help="Path to the fine-tuning weights (adapters).") + parser = argparse.ArgumentParser(description="Multi-modal inference with optional Gradio UI and LoRA support") + parser.add_argument("--image_path", type=str, help="Path to the input image") + parser.add_argument("--prompt_text", type=str, help="Prompt text for the image") + parser.add_argument("--temperature", type=float, default=0.7, help="Sampling temperature") + parser.add_argument("--top_p", type=float, default=0.9, help="Top-p sampling") + parser.add_argument("--model_name", type=str, default=DEFAULT_MODEL, 
help="Model name") + parser.add_argument("--hf_token", type=str, help="Hugging Face API token") + parser.add_argument("--finetuning_path", type=str, help="Path to LoRA weights") + parser.add_argument("--gradio_ui", action="store_true", help="Launch Gradio UI") args = parser.parse_args() - - main( - image_path=args.image_path, - prompt_text=args.prompt_text, - temperature=args.temperature, - top_p=args.top_p, - model_name=args.model_name, - hf_token=args.hf_token, - finetuning_path=args.finetuning_path - ) + main(args) \ No newline at end of file From db3216695a41641ee301e1a314b8c57479fcc7d7 Mon Sep 17 00:00:00 2001 From: Himanshu Shukla Date: Tue, 19 Nov 2024 12:16:11 +0530 Subject: [PATCH 10/13] Added the huggingface cli feature --- .../local_inference/multi_modal_infer.py | 27 ++++++++++++++----- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/recipes/quickstart/inference/local_inference/multi_modal_infer.py b/recipes/quickstart/inference/local_inference/multi_modal_infer.py index a92482c3c..1090610b6 100644 --- a/recipes/quickstart/inference/local_inference/multi_modal_infer.py +++ b/recipes/quickstart/inference/local_inference/multi_modal_infer.py @@ -7,6 +7,7 @@ from transformers import MllamaForConditionalGeneration, MllamaProcessor from peft import PeftModel import gradio as gr +from huggingface_hub import login # Initialize accelerator accelerator = Accelerator() @@ -17,9 +18,24 @@ MAX_OUTPUT_TOKENS = 2048 MAX_IMAGE_SIZE = (1120, 1120) -def load_model_and_processor(model_name: str, hf_token: str = None, finetuning_path: str = None): +def get_hf_token(): + """Retrieve Hugging Face token from environment or local auth.""" + token = os.getenv("HUGGINGFACE_TOKEN") + if token: + return token + + # Check if the user is logged in via huggingface-cli + try: + login() # Will use local authentication cache if available + except Exception as e: + print("Unable to authenticate with Hugging Face. 
Ensure you are logged in via `huggingface-cli login`.") + sys.exit(1) + return None + +def load_model_and_processor(model_name: str, finetuning_path: str = None): """Load model and processor with optional LoRA adapter""" print(f"Loading model: {model_name}") + hf_token = get_hf_token() model = MllamaForConditionalGeneration.from_pretrained( model_name, torch_dtype=torch.bfloat16, @@ -60,7 +76,7 @@ def generate_text_from_image(model, processor, image, prompt_text: str, temperat output = model.generate(**inputs, temperature=temperature, top_p=top_p, max_new_tokens=MAX_OUTPUT_TOKENS) return processor.decode(output[0])[len(prompt):] -def gradio_interface(model_name: str, hf_token: str): +def gradio_interface(model_name: str): """Create Gradio UI with LoRA support""" # Initialize model state current_model = {"model": None, "processor": None} @@ -68,7 +84,6 @@ def gradio_interface(model_name: str, hf_token: str): def load_or_reload_model(enable_lora: bool, lora_path: str = None): current_model["model"], current_model["processor"] = load_model_and_processor( model_name, - hf_token, lora_path if enable_lora else None ) return "Model loaded successfully" + (" with LoRA" if enable_lora else "") @@ -159,12 +174,11 @@ def clear_chat(): def main(args): """Main execution flow""" if args.gradio_ui: - demo = gradio_interface(args.model_name, args.hf_token) + demo = gradio_interface(args.model_name) demo.launch() else: model, processor = load_model_and_processor( args.model_name, - args.hf_token, args.finetuning_path ) image = process_image(image_path=args.image_path) @@ -183,9 +197,8 @@ def main(args): parser.add_argument("--temperature", type=float, default=0.7, help="Sampling temperature") parser.add_argument("--top_p", type=float, default=0.9, help="Top-p sampling") parser.add_argument("--model_name", type=str, default=DEFAULT_MODEL, help="Model name") - parser.add_argument("--hf_token", type=str, help="Hugging Face API token") parser.add_argument("--finetuning_path", type=str, help="Path to LoRA weights") parser.add_argument("--gradio_ui", action="store_true", help="Launch Gradio UI") args = parser.parse_args() - main(args) \ No newline at end of file + main(args) From 2df7d1b6a8c6660dcf0be1da35ac437c27c7506d Mon Sep 17 00:00:00 2001 From: Himanshu Shukla Date: Tue, 19 Nov 2024 12:17:01 +0530 Subject: [PATCH 11/13] fixed the issue of everytime login --- .../local_inference/multi_modal_infer.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/recipes/quickstart/inference/local_inference/multi_modal_infer.py b/recipes/quickstart/inference/local_inference/multi_modal_infer.py index 1090610b6..c785edb7d 100644 --- a/recipes/quickstart/inference/local_inference/multi_modal_infer.py +++ b/recipes/quickstart/inference/local_inference/multi_modal_infer.py @@ -18,19 +18,23 @@ MAX_OUTPUT_TOKENS = 2048 MAX_IMAGE_SIZE = (1120, 1120) +from huggingface_hub import HfFolder + def get_hf_token(): - """Retrieve Hugging Face token from environment or local auth.""" + """Retrieve Hugging Face token from the cache or environment.""" + # Check if a token is explicitly set in the environment token = os.getenv("HUGGINGFACE_TOKEN") if token: return token - # Check if the user is logged in via huggingface-cli - try: - login() # Will use local authentication cache if available - except Exception as e: - print("Unable to authenticate with Hugging Face. 
Ensure you are logged in via `huggingface-cli login`.") - sys.exit(1) - return None + # Automatically retrieve the token from the Hugging Face cache (set via huggingface-cli login) + token = HfFolder.get_token() + if token: + return token + + print("Hugging Face token not found. Please login using `huggingface-cli login`.") + sys.exit(1) + def load_model_and_processor(model_name: str, finetuning_path: str = None): """Load model and processor with optional LoRA adapter""" From 3ba4b64e719e183b90942e8a65af49ce614b9903 Mon Sep 17 00:00:00 2001 From: Himanshu Shukla Date: Tue, 19 Nov 2024 12:18:11 +0530 Subject: [PATCH 12/13] shifted the import statement at top --- .../quickstart/inference/local_inference/multi_modal_infer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/recipes/quickstart/inference/local_inference/multi_modal_infer.py b/recipes/quickstart/inference/local_inference/multi_modal_infer.py index c785edb7d..071dc8683 100644 --- a/recipes/quickstart/inference/local_inference/multi_modal_infer.py +++ b/recipes/quickstart/inference/local_inference/multi_modal_infer.py @@ -7,8 +7,7 @@ from transformers import MllamaForConditionalGeneration, MllamaProcessor from peft import PeftModel import gradio as gr -from huggingface_hub import login - +from huggingface_hub import HfFolder # Initialize accelerator accelerator = Accelerator() device = accelerator.device @@ -18,7 +17,6 @@ MAX_OUTPUT_TOKENS = 2048 MAX_IMAGE_SIZE = (1120, 1120) -from huggingface_hub import HfFolder def get_hf_token(): """Retrieve Hugging Face token from the cache or environment.""" From 54269639de7a35ac2f7edceb255af7bacd8bc523 Mon Sep 17 00:00:00 2001 From: Himanshu Shukla Date: Tue, 19 Nov 2024 12:21:00 +0530 Subject: [PATCH 13/13] added inference instructions in README for easy inferencing --- recipes/quickstart/inference/local_inference/README.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/recipes/quickstart/inference/local_inference/README.md b/recipes/quickstart/inference/local_inference/README.md index 3a1b0590f..8e27304a2 100644 --- a/recipes/quickstart/inference/local_inference/README.md +++ b/recipes/quickstart/inference/local_inference/README.md @@ -13,7 +13,7 @@ ### Features in `multi_modal_infer.py` -All functionality has been consolidated into a single file with three main modes: +All functionality has been consolidated into a single file with three main modes, use `huggingface-cli login`: ### Steps to run are given below: 1. **Basic Inference** ```bash @@ -21,14 +21,12 @@ python multi_modal_infer.py \ --image_path "path/to/image.jpg" \ --prompt_text "Describe this image" \ --model_name "meta-llama/Llama-3.2-11B-Vision-Instruct" \ - --hf_token "your_token" ``` 2. **Gradio UI Mode** ```bash python multi_modal_infer.py \ --model_name "meta-llama/Llama-3.2-11B-Vision-Instruct" \ - --hf_token "your_token" \ --gradio_ui ``` @@ -38,7 +36,6 @@ python multi_modal_infer.py \ --image_path "path/to/image.jpg" \ --prompt_text "Describe this image" \ --model_name "meta-llama/Llama-3.2-11B-Vision-Instruct" \ - --hf_token "your_token" \ --finetuning_path "path/to/lora/weights" ```
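
For reference, the helpers in the final consolidated `multi_modal_infer.py` can also be reused outside the CLI and the Gradio UI. The sketch below is a minimal example, assuming the script from this patch series is importable (for example, when run from `recipes/quickstart/inference/local_inference/`) and that `huggingface-cli login` has already been done; the image path and prompt are placeholders.

```python
# Minimal sketch: reuse the script's helpers as a library (assumes
# multi_modal_infer.py from this patch series is on the Python path and
# `huggingface-cli login` has been run; paths and prompts are placeholders).
from multi_modal_infer import (
    load_model_and_processor,
    process_image,
    generate_text_from_image,
)

# Load the base model; pass a LoRA adapter folder via finetuning_path to merge it.
model, processor = load_model_and_processor(
    "meta-llama/Llama-3.2-11B-Vision-Instruct",
    finetuning_path=None,
)

# Run a single image/prompt pair without the CLI or the Gradio UI.
image = process_image(image_path="path/to/image.jpg")
result = generate_text_from_image(
    model,
    processor,
    image,
    prompt_text="Describe this image",
    temperature=0.7,
    top_p=0.9,
)
print(result)
```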