ModelCloud · Qubitium · Nov 29, 2024 · Nov 28, 2024
diff --git a/tests/models/test_hymba.py b/tests/models/test_hymba.py
@@ -1,33 +1,17 @@
+from gptqmodel import GPTQModel
 from model_test import ModelTest
 
 
 class TestHymba(ModelTest):
     NATIVE_MODEL_ID = "/monster/data/model/Hymba-1.5B-Instruct/"  # "baichuan-inc/Baichuan2-7B-Chat"
-    NATIVE_ARC_CHALLENGE_ACC = 0.4104
-    NATIVE_ARC_CHALLENGE_ACC_NORM = 0.4317
+    NATIVE_ARC_CHALLENGE_ACC = 0.2073
+    NATIVE_ARC_CHALLENGE_ACC_NORM = 0.2713
     MODEL_MAX_LEN = 8192
     TRUST_REMOTE_CODE = True
+    APPLY_CHAT_TEMPLATE = True
     # Hymba currently only supports a batch size of 1.
     # See https://huggingface.co/nvidia/Hymba-1.5B-Instruct
     BATCH_SIZE = 1
 
     def test_hymba(self):
-        model, tokenizer = self.quantModel(self.NATIVE_MODEL_ID, trust_remote_code=self.TRUST_REMOTE_CODE,
-                                           torch_dtype=self.TORCH_DTYPE)
-        model.cuda()
-
-        prompt = "5+5=?"
-
-        messages = [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": prompt}]
-
-        # Apply chat template
-        tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True,
-                                                       return_tensors="pt").to('cuda')
-
-        outputs = model.generate(input_ids=tokenized_chat, max_new_tokens=56)
-        input_length = tokenized_chat.shape[1]
-        response = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)
-
-        print(f"Model response: {response}")
-
-        self.assertTrue("10" in response)
+        self.quant_lm_eval()