From 5c3fe5b477170ebbc02e01b68412b1f0172b254f Mon Sep 17 00:00:00 2001
From: himanshushukla12
Date: Thu, 3 Oct 2024 17:47:01 +0000
Subject: [PATCH 1/2] [Fixed] RuntimeError: probability tensor contains either
 inf, nan or element < 0

---
 .../quickstart/inference/local_inference/multi_modal_infer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/recipes/quickstart/inference/local_inference/multi_modal_infer.py b/recipes/quickstart/inference/local_inference/multi_modal_infer.py
index 935add51d..ee5161f85 100644
--- a/recipes/quickstart/inference/local_inference/multi_modal_infer.py
+++ b/recipes/quickstart/inference/local_inference/multi_modal_infer.py
@@ -15,6 +15,7 @@ def load_model_and_processor(model_name: str, hf_token: str):
     Load the model and processor based on the 11B or 90B model.
     """
     model = MllamaForConditionalGeneration.from_pretrained(model_name, device_map="auto", torch_dtype=torch.bfloat16, token=hf_token)
+    model = model.bfloat16().cuda()
     processor = MllamaProcessor.from_pretrained(model_name, token=hf_token)
     return model, processor
 

From 625860d3dbfc232baa86392b05fd8fea9e75ba5c Mon Sep 17 00:00:00 2001
From: himanshushukla12
Date: Thu, 3 Oct 2024 19:49:56 +0000
Subject: [PATCH 2/2] Removed single-GPU dependency by using accelerate

---
 .../local_inference/multi_modal_infer.py         | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/recipes/quickstart/inference/local_inference/multi_modal_infer.py b/recipes/quickstart/inference/local_inference/multi_modal_infer.py
index ee5161f85..8c11de8ee 100644
--- a/recipes/quickstart/inference/local_inference/multi_modal_infer.py
+++ b/recipes/quickstart/inference/local_inference/multi_modal_infer.py
@@ -4,7 +4,11 @@
 from PIL import Image as PIL_Image
 import torch
 from transformers import MllamaForConditionalGeneration, MllamaProcessor
+from accelerate import Accelerator
+accelerator = Accelerator()
+
+device = accelerator.device
 
 # Constants
 DEFAULT_MODEL = "meta-llama/Llama-3.2-11B-Vision-Instruct"
 
@@ -14,9 +18,11 @@ def load_model_and_processor(model_name: str, hf_token: str):
     """
     Load the model and processor based on the 11B or 90B model.
""" - model = MllamaForConditionalGeneration.from_pretrained(model_name, device_map="auto", torch_dtype=torch.bfloat16, token=hf_token) - model = model.bfloat16().cuda() - processor = MllamaProcessor.from_pretrained(model_name, token=hf_token) + model = MllamaForConditionalGeneration.from_pretrained(model_name, torch_dtype=torch.bfloat16,use_safetensors=True, device_map=device, + token=hf_token) + processor = MllamaProcessor.from_pretrained(model_name, token=hf_token,use_safetensors=True) + + model, processor=accelerator.prepare(model, processor) return model, processor @@ -39,7 +45,7 @@ def generate_text_from_image(model, processor, image, prompt_text: str, temperat {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]} ] prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) - inputs = processor(image, prompt, return_tensors="pt").to(model.device) + inputs = processor(image, prompt, return_tensors="pt").to(device) output = model.generate(**inputs, temperature=temperature, top_p=top_p, max_new_tokens=512) return processor.decode(output[0])[len(prompt):] @@ -64,4 +70,4 @@ def main(image_path: str, prompt_text: str, temperature: float, top_p: float, mo parser.add_argument("--hf_token", type=str, required=True, help="Hugging Face token for authentication") args = parser.parse_args() - main(args.image_path, args.prompt_text, args.temperature, args.top_p, args.model_name, args.hf_token) + main(args.image_path, args.prompt_text, args.temperature, args.top_p, args.model_name, args.hf_token) \ No newline at end of file