From 2dfbe4b7243f1ed0bc1ef3ed30859978da51b655 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Fri, 9 Aug 2024 14:01:58 +0200 Subject: [PATCH 1/3] TST Potentially Skip 8bit bnb regression test The 8bit bnb LoRA regression test results are dependent on the underlying compute capability. The logits are slightly different depending on the version (up to 0.5 abs diff). Therefore, we now check the compute capability for this test and skip it if it's too low. This check may require updating if the hardware of the CI worker is updated. Note that I have already invalidated the old regression artifacts and created a new one. --- tests/regression/test_regression.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/regression/test_regression.py b/tests/regression/test_regression.py index a876e63f31..f83fe370db 100644 --- a/tests/regression/test_regression.py +++ b/tests/regression/test_regression.py @@ -584,6 +584,9 @@ def load_base_model(self): ) return model + @pytest.mark.skipif( + torch.cuda.get_device_properties(0).major < 8, reason="8bit bnb results vary with compute capability" + ) def test_lora_8bit(self): base_model = self.load_base_model() config = LoraConfig( From e1d7c91d5bb305bfdd3f34b31686a65a782d5e57 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Fri, 9 Aug 2024 15:04:15 +0200 Subject: [PATCH 2/3] Fix pytest skip to work without cuda --- tests/regression/test_regression.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/regression/test_regression.py b/tests/regression/test_regression.py index f83fe370db..2603e7f880 100644 --- a/tests/regression/test_regression.py +++ b/tests/regression/test_regression.py @@ -584,10 +584,12 @@ def load_base_model(self): ) return model - @pytest.mark.skipif( - torch.cuda.get_device_properties(0).major < 8, reason="8bit bnb results vary with compute capability" - ) def test_lora_8bit(self): + # note: we cannot use pytest.mark.skipif, because for the condition to be evaluated, cuda must be available, and + # since this runs on CI without CUDA, that would fail + if torch.cuda.get_device_properties(0).major < 8: + pytest.skip("8bit bnb results vary with compute capability") + base_model = self.load_base_model() config = LoraConfig( r=8, From ff6cc6d12a37d7384440812a598d97267a9046fd Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Mon, 12 Aug 2024 17:35:32 +0200 Subject: [PATCH 3/3] Instead of skipping, add a comment to explain After internal discussion, we think this is the most practical solution for the time being. --- tests/regression/test_regression.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/regression/test_regression.py b/tests/regression/test_regression.py index 2603e7f880..f4a1910a50 100644 --- a/tests/regression/test_regression.py +++ b/tests/regression/test_regression.py @@ -585,11 +585,9 @@ def load_base_model(self): return model def test_lora_8bit(self): - # note: we cannot use pytest.mark.skipif, because for the condition to be evaluated, cuda must be available, and - # since this runs on CI without CUDA, that would fail - if torch.cuda.get_device_properties(0).major < 8: - pytest.skip("8bit bnb results vary with compute capability") - + # Warning: bnb results can vary significantly depending on the GPU. Therefore, if there is a change in GPU used + # in the CI, the test can fail without any code change. In that case, delete the regression artifact and create + # a new one using the new GPU. base_model = self.load_base_model() config = LoraConfig( r=8, @@ -603,6 +601,9 @@ def test_adalora(self): self.skipTest( "Skipping AdaLora for now, getting TypeError: unsupported operand type(s) for +=: 'dict' and 'Tensor'" ) + # Warning: bnb results can vary significantly depending on the GPU. Therefore, if there is a change in GPU used + # in the CI, the test can fail without any code change. In that case, delete the regression artifact and create + # a new one using the new GPU. base_model = self.load_base_model() config = AdaLoraConfig( init_r=6, @@ -646,6 +647,9 @@ def load_base_model(self): return model def test_lora_4bit(self): + # Warning: bnb results can vary significantly depending on the GPU. Therefore, if there is a change in GPU used + # in the CI, the test can fail without any code change. In that case, delete the regression artifact and create + # a new one using the new GPU. base_model = self.load_base_model() config = LoraConfig( r=8, @@ -657,6 +661,9 @@ def test_lora_4bit(self): def test_adalora(self): # TODO self.skipTest("Skipping AdaLora for now because of a bug, see #1113") + # Warning: bnb results can vary significantly depending on the GPU. Therefore, if there is a change in GPU used + # in the CI, the test can fail without any code change. In that case, delete the regression artifact and create + # a new one using the new GPU. base_model = self.load_base_model() config = AdaLoraConfig( init_r=6,