diff --git a/.github/scripts/ci_test_xpu.sh b/.github/scripts/ci_test_xpu.sh
index 05089db7c8..d38705d5b9 100644
--- a/.github/scripts/ci_test_xpu.sh
+++ b/.github/scripts/ci_test_xpu.sh
@@ -12,16 +12,11 @@ cd torchao && pip install . --no-build-isolation && cd ..
 python3 -c "import torch; import torchao; print(f'Torch version: {torch.__version__}')"
 
-pip install pytest expecttest parameterized accelerate hf_transfer 'modelscope!=1.15.0'
-
-pytest -v -s torchao/test/quantization/
-
-pytest -v -s torchao/test/dtypes/
-
-pytest -v -s torchao/test/float8/
-
-pytest -v -s torchao/test/integration/test_integration.py
-
-pytest -v -s torchao/test/prototype/
-
-pytest -v -s torchao/test/test_ao_models.py
+pip install pytest expecttest parameterized accelerate hf_transfer 'modelscope!=1.15.0' transformers tabulate fire
+
+pytest -v -s torchao/test/quantization/ \
+    torchao/test/dtypes/ \
+    torchao/test/float8/ \
+    torchao/test/integration/test_integration.py \
+    torchao/test/prototype/ \
+    torchao/test/test_ao_models.py
 
diff --git a/test/dtypes/test_nf4.py b/test/dtypes/test_nf4.py
index 2a711413f0..221eed8216 100644
--- a/test/dtypes/test_nf4.py
+++ b/test/dtypes/test_nf4.py
@@ -756,6 +756,7 @@ def world_size(self) -> int:
         return 2
 
     @skip_if_lt_x_gpu(2)
+    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     def test_comm(self):
         self.run_subtests(
             {"input_size": [512, 2048]},
diff --git a/test/prototype/test_quantized_training.py b/test/prototype/test_quantized_training.py
index fa0edd694b..ab25a38bb3 100644
--- a/test/prototype/test_quantized_training.py
+++ b/test/prototype/test_quantized_training.py
@@ -296,6 +296,7 @@ def world_size(self) -> int:
         return _FSDP_WORLD_SIZE
 
     @skip_if_lt_x_gpu(_FSDP_WORLD_SIZE)
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
     def test_fsdp2_correctness(self):
         mp_policy = MixedPrecisionPolicy()
 
@@ -386,6 +387,7 @@ def _run_subtest(self, args):
         )
 
     @skip_if_lt_x_gpu(_FSDP_WORLD_SIZE)
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
     def test_precompute_bitnet_scale(self):
         from torchao.prototype.quantized_training.bitnet import (
             get_bitnet_scale,