From 4337f2958a7f822b249cd02f9048f1190a6cfb76 Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1"
Date: Fri, 24 Jan 2025 17:28:48 +0800
Subject: [PATCH 1/2] fix cuda UT torch_dtype

Signed-off-by: Zhang, Weiwei1
---
 test_cuda/test_support_vlms.py | 11 ++++++-----
 test_cuda/test_vlms.py         |  7 ++++---
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/test_cuda/test_support_vlms.py b/test_cuda/test_support_vlms.py
index a2627df3..b84836a8 100644
--- a/test_cuda/test_support_vlms.py
+++ b/test_cuda/test_support_vlms.py
@@ -35,7 +35,7 @@ def test_qwen2(self):
         from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
         model = Qwen2VLForConditionalGeneration.from_pretrained(
             quantized_model_path,
-            torch_dtype="auto",
+            torch_dtype="float16",
             device_map=f"cuda:{self.device}",
         )
         processor = AutoProcessor.from_pretrained(quantized_model_path)
@@ -92,7 +92,7 @@ def test_phi3(self):
             quantized_model_path,
             device_map=f"cuda:{self.device}",
             trust_remote_code=True,
-            torch_dtype="auto"
+            torch_dtype="float16",
         )
         processor = AutoProcessor.from_pretrained(quantized_model_path,
                                                   trust_remote_code=True,
@@ -244,7 +244,7 @@ def test_llama(self):
         quantized_model_path = os.path.join(self.save_dir, "Llama-3.2-11B-Vision-Instruct-w4g128-auto_round")
         model = MllamaForConditionalGeneration.from_pretrained(
             quantized_model_path,
-            torch_dtype="auto",
+            torch_dtype="float16",
             device_map=f"cuda:{self.device}",
         )
         processor = AutoProcessor.from_pretrained(quantized_model_path)
@@ -288,7 +288,7 @@ def test_cogvlm(self):
         )
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path,
-            torch_dtype="auto",
+            torch_dtype="float16",
             trust_remote_code=True,
             device_map=DEVICE,
         ).to(DEVICE).eval()
@@ -352,7 +352,7 @@ def test_deepseek_vl2(self):
             quantized_model_path,
             trust_remote_code=True,
             device_map=f"cuda:{self.device}",
-            torch_dtype="auto",
+            torch_dtype="float16",
         )
 
         vl_gpt = vl_gpt.eval()
@@ -399,3 +399,4 @@ def test_deepseek_vl2(self):
 
 if __name__ == "__main__":
     unittest.main()
+
diff --git a/test_cuda/test_vlms.py b/test_cuda/test_vlms.py
index b40322da..1044e363 100644
--- a/test_cuda/test_vlms.py
+++ b/test_cuda/test_vlms.py
@@ -1,4 +1,4 @@
-import copy
+iimport copy
 import shutil
 import sys
 import unittest
@@ -46,7 +46,7 @@ def qwen_inference(self, quantized_model_dir):
         processor = AutoProcessor.from_pretrained(quantized_model_dir, trust_remote_code=True)
         model = Qwen2VLForConditionalGeneration.from_pretrained(
             quantized_model_dir,
-            torch_dtype="auto",
+            torch_dtype="float16",
             device_map="auto",
             ##revision="df7f44c" ##AutoGPTQ format
         )
@@ -120,7 +120,7 @@ def phi3_infernece(self, quantized_model_dir):
             quantized_model_path,
             device_map="auto",
             trust_remote_code=True,
-            torch_dtype="auto"
+            torch_dtype="float16",
         )
         processor = AutoProcessor.from_pretrained(quantized_model_path,
                                                   trust_remote_code=True,
@@ -198,3 +198,4 @@ def test_quant_not_text_fp_layers(self):
 
 if __name__ == "__main__":
     unittest.main()
+

From ead838d3bb86c9b5cae7f20491cabe8c16155daf Mon Sep 17 00:00:00 2001
From: n1ck-guo
Date: Fri, 24 Jan 2025 06:24:00 -0500
Subject: [PATCH 2/2] fix

Signed-off-by: n1ck-guo
---
 test_cuda/test_vlms.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test_cuda/test_vlms.py b/test_cuda/test_vlms.py
index 1044e363..ab0438e0 100644
--- a/test_cuda/test_vlms.py
+++ b/test_cuda/test_vlms.py
@@ -1,4 +1,4 @@
-iimport copy
+import copy
 import shutil
 import sys
 import unittest
@@ -94,7 +94,7 @@ def test_vlm_tune(self):
         ## load the model
         model_name = "/models/Qwen2-VL-2B-Instruct"
         model = Qwen2VLForConditionalGeneration.from_pretrained(
-            model_name, trust_remote_code=True)
+            model_name, trust_remote_code=True, device_map="auto")
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
 
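For reference, a minimal sketch (not part of the patches) of the loading pattern this series converges on: pinning torch_dtype="float16" instead of relying on "auto" when loading an AutoRound-quantized VLM checkpoint. The checkpoint path below is a hypothetical placeholder; the transformers classes are the same ones the tests import.

    # Minimal sketch, assuming a local AutoRound-quantized Qwen2-VL checkpoint.
    # The path below is a placeholder, not a checkpoint produced by these tests.
    from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

    quantized_model_path = "./Qwen2-VL-2B-Instruct-w4g128-auto_round"

    model = Qwen2VLForConditionalGeneration.from_pretrained(
        quantized_model_path,
        torch_dtype="float16",  # explicit fp16, as in the patch, rather than "auto"
        device_map="auto",
    )
    processor = AutoProcessor.from_pretrained(quantized_model_path)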