From 357aaf067c3a169986ba602531395baa6cc97b4f Mon Sep 17 00:00:00 2001
From: CSY
Date: Wed, 4 Dec 2024 20:19:59 +0800
Subject: [PATCH 1/5] update deci delta

---
 tests/models/test_deci.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/test_deci.py b/tests/models/test_deci.py
index 7ac61262f..cfce2037b 100644
--- a/tests/models/test_deci.py
+++ b/tests/models/test_deci.py
@@ -5,7 +5,7 @@ class TestDeci(ModelTest):
     NATIVE_MODEL_ID = "/monster/data/model/DeciLM-7B-instruct"  # "Deci/DeciLM-7B-instruct"
     NATIVE_ARC_CHALLENGE_ACC = 0.5239
     NATIVE_ARC_CHALLENGE_ACC_NORM = 0.5222
-    QUANT_ARC_MAX_NEGATIVE_DELTA = 0.55
+    QUANT_ARC_MAX_NEGATIVE_DELTA = 0.8
     TRUST_REMOTE_CODE = True
     USE_VLLM = False
     BATCH_SIZE = 6

From eda8706fcc01f94168a554599c2592cd93d40283 Mon Sep 17 00:00:00 2001
From: CSY
Date: Wed, 4 Dec 2024 20:20:36 +0800
Subject: [PATCH 2/5] update cohere delta

---
 tests/models/test_cohere.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/test_cohere.py b/tests/models/test_cohere.py
index d11fc485d..88cb438d4 100644
--- a/tests/models/test_cohere.py
+++ b/tests/models/test_cohere.py
@@ -5,7 +5,7 @@ class TestCohere(ModelTest):
     NATIVE_MODEL_ID = "/monster/data/model/aya-expanse-8b"  # "CohereForAI/aya-expanse-8b"
     NATIVE_ARC_CHALLENGE_ACC = 0.5401
     NATIVE_ARC_CHALLENGE_ACC_NORM = 0.5640
-    QUANT_ARC_MAX_NEGATIVE_DELTA = 0.12
+    QUANT_ARC_MAX_NEGATIVE_DELTA = 0.15
     BATCH_SIZE = 4
 
     def test_cohere(self):

From f5b26f8d16de6a7b8312567c1b725122a023afaa Mon Sep 17 00:00:00 2001
From: CSY
Date: Wed, 4 Dec 2024 20:21:17 +0800
Subject: [PATCH 3/5] update longllama delta

---
 tests/models/test_longllama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/test_longllama.py b/tests/models/test_longllama.py
index 38128c2ce..c54ee051f 100644
--- a/tests/models/test_longllama.py
+++ b/tests/models/test_longllama.py
@@ -6,7 +6,7 @@ class TestLongLlama(ModelTest):
     NATIVE_ARC_CHALLENGE_ACC = 0.3515
     NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3652
     TRUST_REMOTE_CODE = True
-    QUANT_ARC_MAX_NEGATIVE_DELTA = 0.4
+    QUANT_ARC_MAX_NEGATIVE_DELTA = 0.5
     USE_VLLM = False
 
     def test_longllama(self):

From 02a36dd6c7397df70a1d4fa17713484a159206b9 Mon Sep 17 00:00:00 2001
From: CSY
Date: Wed, 4 Dec 2024 20:22:11 +0800
Subject: [PATCH 4/5] update hymba delta

---
 tests/models/test_hymba.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/models/test_hymba.py b/tests/models/test_hymba.py
index 5596452c0..271dd6760 100644
--- a/tests/models/test_hymba.py
+++ b/tests/models/test_hymba.py
@@ -5,6 +5,7 @@ class TestHymba(ModelTest):
     NATIVE_MODEL_ID = "/monster/data/model/Hymba-1.5B-Instruct/"  # "baichuan-inc/Baichuan2-7B-Chat"
     NATIVE_ARC_CHALLENGE_ACC = 0.2073
     NATIVE_ARC_CHALLENGE_ACC_NORM = 0.2713
+    QUANT_ARC_MAX_NEGATIVE_DELTA = 0.75
     MODEL_MAX_LEN = 8192
     TRUST_REMOTE_CODE = True
     APPLY_CHAT_TEMPLATE = True

From 87f921522622e228d1e736a98551d86d2620bfde Mon Sep 17 00:00:00 2001
From: CSY
Date: Wed, 4 Dec 2024 20:50:26 +0800
Subject: [PATCH 5/5] rename

---
 tests/models/model_test.py     | 4 ++--
 tests/models/test_cohere.py    | 2 +-
 tests/models/test_deci.py      | 2 +-
 tests/models/test_falcon.py    | 3 ++-
 tests/models/test_hymba.py     | 2 +-
 tests/models/test_llama3_2.py  | 2 +-
 tests/models/test_longllama.py | 2 +-
 tests/models/test_qwen2_5.py   | 2 +-
 tests/test_asym_gptq_v1.py     | 2 +-
 9 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/tests/models/model_test.py b/tests/models/model_test.py
index 9a0c74990..8c620a3c4 100644
--- a/tests/models/model_test.py
+++ b/tests/models/model_test.py
@@ -22,7 +22,7 @@ class ModelTest(unittest.TestCase):
     TASK_NAME = "arc_challenge"
 
     # sub test can modify
-    QUANT_ARC_MAX_NEGATIVE_DELTA = 0.15  # -15%
+    QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.15  # -15%
     QUANT_ARC_MAX_POSITIVE_DELTA = 0.2  # 20%
     TRUST_REMOTE_CODE = False
     APPLY_CHAT_TEMPLATE = False
@@ -221,7 +221,7 @@ def quant_lm_eval(self):
     def check_results(self, task_results):
         for filter, value in task_results.items():
             diff_pct = self.calculatorPer(filter=filter, value=value)
-            negative_pct = 100 * (1 - self.QUANT_ARC_MAX_NEGATIVE_DELTA)
+            negative_pct = 100 * (1 - self.QUANT_ARC_MAX_DELTA_FLOOR_PERCENT)
             positive_pct = 100 * (1 + self.QUANT_ARC_MAX_POSITIVE_DELTA)
 
             self.assertTrue(negative_pct <= diff_pct <= positive_pct, f"{filter}: {value} diff {diff_pct:.2f}% is out of the expected range [{negative_pct}-{positive_pct}%]")

diff --git a/tests/models/test_cohere.py b/tests/models/test_cohere.py
index 88cb438d4..8ca23dba9 100644
--- a/tests/models/test_cohere.py
+++ b/tests/models/test_cohere.py
@@ -5,7 +5,7 @@ class TestCohere(ModelTest):
     NATIVE_MODEL_ID = "/monster/data/model/aya-expanse-8b"  # "CohereForAI/aya-expanse-8b"
     NATIVE_ARC_CHALLENGE_ACC = 0.5401
     NATIVE_ARC_CHALLENGE_ACC_NORM = 0.5640
-    QUANT_ARC_MAX_NEGATIVE_DELTA = 0.15
+    QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.15
     BATCH_SIZE = 4
 
     def test_cohere(self):

diff --git a/tests/models/test_deci.py b/tests/models/test_deci.py
index cfce2037b..ade5e169f 100644
--- a/tests/models/test_deci.py
+++ b/tests/models/test_deci.py
@@ -5,7 +5,7 @@ class TestDeci(ModelTest):
     NATIVE_MODEL_ID = "/monster/data/model/DeciLM-7B-instruct"  # "Deci/DeciLM-7B-instruct"
     NATIVE_ARC_CHALLENGE_ACC = 0.5239
     NATIVE_ARC_CHALLENGE_ACC_NORM = 0.5222
-    QUANT_ARC_MAX_NEGATIVE_DELTA = 0.8
+    QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.8
     TRUST_REMOTE_CODE = True
     USE_VLLM = False
     BATCH_SIZE = 6

diff --git a/tests/models/test_falcon.py b/tests/models/test_falcon.py
index 091d2c42a..467be5977 100644
--- a/tests/models/test_falcon.py
+++ b/tests/models/test_falcon.py
@@ -1,4 +1,5 @@
 import torch  # noqa: E402from tests.model_test import ModelTest
+
 from model_test import ModelTest
 
 
@@ -9,7 +10,7 @@ class TestFalcon(ModelTest):
     APPLY_CHAT_TEMPLATE = True
     TRUST_REMOTE_CODE = True
     TORCH_DTYPE = torch.float16
-    QUANT_ARC_MAX_NEGATIVE_DELTA = 0.52
+    QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.52
     BATCH_SIZE = 6
     USE_VLLM = False
 

diff --git a/tests/models/test_hymba.py b/tests/models/test_hymba.py
index 271dd6760..33b466f4a 100644
--- a/tests/models/test_hymba.py
+++ b/tests/models/test_hymba.py
@@ -5,7 +5,7 @@ class TestHymba(ModelTest):
     NATIVE_MODEL_ID = "/monster/data/model/Hymba-1.5B-Instruct/"  # "baichuan-inc/Baichuan2-7B-Chat"
     NATIVE_ARC_CHALLENGE_ACC = 0.2073
     NATIVE_ARC_CHALLENGE_ACC_NORM = 0.2713
-    QUANT_ARC_MAX_NEGATIVE_DELTA = 0.75
+    QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.75
     MODEL_MAX_LEN = 8192
     TRUST_REMOTE_CODE = True
     APPLY_CHAT_TEMPLATE = True

diff --git a/tests/models/test_llama3_2.py b/tests/models/test_llama3_2.py
index d94e5f70d..b301d9f18 100644
--- a/tests/models/test_llama3_2.py
+++ b/tests/models/test_llama3_2.py
@@ -5,7 +5,7 @@ class TestLlama3_2(ModelTest):
     NATIVE_MODEL_ID = "/monster/data/model/Llama-3.2-1B-Instruct"  # "meta-llama/Llama-3.2-1B-Instruct"
     NATIVE_ARC_CHALLENGE_ACC = 0.3567
     NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3805
-    QUANT_ARC_MAX_NEGATIVE_DELTA = 0.36
+    QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.36
     APPLY_CHAT_TEMPLATE = True
     TRUST_REMOTE_CODE = True
 

diff --git a/tests/models/test_longllama.py b/tests/models/test_longllama.py
index c54ee051f..7407753c1 100644
--- a/tests/models/test_longllama.py
+++ b/tests/models/test_longllama.py
@@ -6,7 +6,7 @@ class TestLongLlama(ModelTest):
     NATIVE_ARC_CHALLENGE_ACC = 0.3515
     NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3652
     TRUST_REMOTE_CODE = True
-    QUANT_ARC_MAX_NEGATIVE_DELTA = 0.5
+    QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.5
     USE_VLLM = False
 
     def test_longllama(self):

diff --git a/tests/models/test_qwen2_5.py b/tests/models/test_qwen2_5.py
index 5f92ae5c2..a15fdc470 100644
--- a/tests/models/test_qwen2_5.py
+++ b/tests/models/test_qwen2_5.py
@@ -3,7 +3,7 @@
 
 class TestQwen2_5(ModelTest):
     NATIVE_MODEL_ID = "/monster/data/model/Qwen2.5-0.5B-Instruct"
-    QUANT_ARC_MAX_NEGATIVE_DELTA = 0.2
+    QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.2
     NATIVE_ARC_CHALLENGE_ACC = 0.2739
     NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3055
     TRUST_REMOTE_CODE = False

diff --git a/tests/test_asym_gptq_v1.py b/tests/test_asym_gptq_v1.py
index 6eb023a8a..237792604 100644
--- a/tests/test_asym_gptq_v1.py
+++ b/tests/test_asym_gptq_v1.py
@@ -12,7 +12,7 @@ class Test(ModelTest):
     NATIVE_MODEL_ID = "/monster/data/model/Llama-3.2-1B-Instruct"  # "meta-llama/Llama-3.2-1B-Instruct"
     NATIVE_ARC_CHALLENGE_ACC = 0.3567
     NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3805
-    QUANT_ARC_MAX_NEGATIVE_DELTA = 0.36
+    QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.36
     QUANT_FORMAT = FORMAT.GPTQ
     SYM = False
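
For reference, a minimal sketch of the bound that [PATCH 5/5] renames, as applied in ModelTest.check_results. The accuracy values below are hypothetical, and calculatorPer is assumed to return the quantized score as a percentage of the native score; only the two class constants and the range check mirror the diff above.

    # Hedged sketch of the check_results bound; the accuracy values are invented for illustration.
    QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.36  # e.g. the Llama-3.2 setting in the diff
    QUANT_ARC_MAX_POSITIVE_DELTA = 0.2

    native_acc = 0.3567  # hypothetical native arc_challenge acc
    quant_acc = 0.3100   # hypothetical post-quantization acc

    diff_pct = 100 * quant_acc / native_acc                       # ~86.9%, assuming calculatorPer computes quant/native
    negative_pct = 100 * (1 - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT)  # 64.0% floor
    positive_pct = 100 * (1 + QUANT_ARC_MAX_POSITIVE_DELTA)       # 120.0% ceiling

    assert negative_pct <= diff_pct <= positive_pct, (
        f"acc: {quant_acc} diff {diff_pct:.2f}% is out of the expected range [{negative_pct}-{positive_pct}%]"
    )

Under this reading, a larger QUANT_ARC_MAX_DELTA_FLOOR_PERCENT lowers the floor and so tolerates a bigger accuracy drop after quantization (the per-model loosening done in patches 1-4 under the old name), while QUANT_ARC_MAX_POSITIVE_DELTA caps unexpectedly large gains.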