Fix the implementation of `to` for the int4 weight-only use case (#522)

Summary: Note that we cannot yet do the following: (1) initialize and quantize the model with int4_weight_only quantization on CPU, then (2) move the model to CUDA. We'll enable this in a separate PR. Test Plan: CI Reviewers: Subscribers: Tasks: Tags:
pytorch · Jul 17, 2024 · d36de1b · d36de1b
1 parent 6dd82d8
commit d36de1b
Show file tree

Hide file tree

Showing 2 changed files with 19 additions and 2 deletions.
diff --git a/test/quantization/test_quant_api.py b/test/quantization/test_quant_api.py
@@ -624,7 +624,7 @@ def test_quantized_tensor_subclass_save_load(self):
 
  @unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "Test only enabled for 2.4+")
  @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
- def test_quantized_model_to_device(self):
+ def test_int8wo_quantized_model_to_device(self):
  m = ToyLinearModel().eval().to(torch.bfloat16)
  m_copy = copy.deepcopy(m)
  example_inputs = m.example_inputs(dtype=torch.bfloat16, device="cpu")
@@ -637,6 +637,23 @@ def test_quantized_model_to_device(self):
  cuda_res = m(*example_inputs_cuda)
  self.assertEqual(cuda_res.cpu(), ref)
 
+ @unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "Test only enabled for 2.4+")
+ @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+ @unittest.skipIf(TORCH_VERSION_AFTER_2_5, "Test currently doesn't work for 2.5+")
+ def test_int4wo_quantized_model_to_device(self):
+ # TODO: change initial model to "cpu"
+ m = ToyLinearModel().eval().to(torch.bfloat16).to("cuda")
+ m_copy = copy.deepcopy(m)
+ example_inputs = m.example_inputs(dtype=torch.bfloat16, device="cuda")
+
+ quantize_(m, int4_weight_only())
+ ref = m(*example_inputs)
+
+ example_inputs_cuda = (example_inputs[0].to("cuda"),)
+ m.to(device="cuda")
+ cuda_res = m(*example_inputs_cuda)
+ self.assertEqual(cuda_res.cpu(), ref)
+
  @unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "Test only enabled for 2.4+")
  @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
  def test_quantized_tensor_subclass_save_load_map_location(self):

diff --git a/torchao/dtypes/affine_quantized_tensor.py b/torchao/dtypes/affine_quantized_tensor.py
@@ -544,7 +544,7 @@ def from_plain(
  def to(self, *args, **kwargs):
  kwargs = self._get_to_kwargs(*args, **kwargs)
  device = kwargs["device"]
- if device != "cuda" or (isinstance(device, torch.device) and device.type != "cuda"):
+ if device != "cuda" and (isinstance(device, torch.device) and device.type != "cuda"):
  raise ValueError(f"TensorCoreTiledAQTLayout is only available for cuda device, can't convert to {device}")
  return self.__class__(
  self.packed_weight.to(device),