From 5c3fe5b477170ebbc02e01b68412b1f0172b254f Mon Sep 17 00:00:00 2001
From: himanshushukla12
Date: Thu, 3 Oct 2024 17:47:01 +0000
Subject: [PATCH 1/2] [Fixed] RuntimeError: probability tensor contains either
 inf, nan or element < 0

---
 .../quickstart/inference/local_inference/multi_modal_infer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/recipes/quickstart/inference/local_inference/multi_modal_infer.py b/recipes/quickstart/inference/local_inference/multi_modal_infer.py
index 935add51d..ee5161f85 100644
--- a/recipes/quickstart/inference/local_inference/multi_modal_infer.py
+++ b/recipes/quickstart/inference/local_inference/multi_modal_infer.py
@@ -15,6 +15,7 @@ def load_model_and_processor(model_name: str, hf_token: str):
     Load the model and processor based on the 11B or 90B model.
     """
     model = MllamaForConditionalGeneration.from_pretrained(model_name, device_map="auto", torch_dtype=torch.bfloat16, token=hf_token)
+    model = model.bfloat16().cuda()
     processor = MllamaProcessor.from_pretrained(model_name, token=hf_token)
     return model, processor
 

From 625860d3dbfc232baa86392b05fd8fea9e75ba5c Mon Sep 17 00:00:00 2001
From: himanshushukla12
Date: Thu, 3 Oct 2024 19:49:56 +0000
Subject: [PATCH 2/2] Removed single-GPU dependency by using accelerate

---
 .../local_inference/multi_modal_infer.py         | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/recipes/quickstart/inference/local_inference/multi_modal_infer.py b/recipes/quickstart/inference/local_inference/multi_modal_infer.py
index ee5161f85..8c11de8ee 100644
--- a/recipes/quickstart/inference/local_inference/multi_modal_infer.py
+++ b/recipes/quickstart/inference/local_inference/multi_modal_infer.py
@@ -4,7 +4,11 @@
 from PIL import Image as PIL_Image
 import torch
 from transformers import MllamaForConditionalGeneration, MllamaProcessor
+from accelerate import Accelerator
+accelerator = Accelerator()
+
+device = accelerator.device
 
 # Constants
 DEFAULT_MODEL = "meta-llama/Llama-3.2-11B-Vision-Instruct"
 
@@ -14,9 +18,11 @@ def load_model_and_processor(model_name: str, hf_token: str):
     """
     Load the model and processor based on the 11B or 90B model.
""" - model = MllamaForConditionalGeneration.from_pretrained(model_name, device_map="auto", torch_dtype=torch.bfloat16, token=hf_token) - model = model.bfloat16().cuda() - processor = MllamaProcessor.from_pretrained(model_name, token=hf_token) + model = MllamaForConditionalGeneration.from_pretrained(model_name, torch_dtype=torch.bfloat16,use_safetensors=True, device_map=device, + token=hf_token) + processor = MllamaProcessor.from_pretrained(model_name, token=hf_token,use_safetensors=True) + + model, processor=accelerator.prepare(model, processor) return model, processor @@ -39,7 +45,7 @@ def generate_text_from_image(model, processor, image, prompt_text: str, temperat {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]} ] prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) - inputs = processor(image, prompt, return_tensors="pt").to(model.device) + inputs = processor(image, prompt, return_tensors="pt").to(device) output = model.generate(**inputs, temperature=temperature, top_p=top_p, max_new_tokens=512) return processor.decode(output[0])[len(prompt):] @@ -64,4 +70,4 @@ def main(image_path: str, prompt_text: str, temperature: float, top_p: float, mo parser.add_argument("--hf_token", type=str, required=True, help="Hugging Face token for authentication") args = parser.parse_args() - main(args.image_path, args.prompt_text, args.temperature, args.top_p, args.model_name, args.hf_token) + main(args.image_path, args.prompt_text, args.temperature, args.top_p, args.model_name, args.hf_token) \ No newline at end of file