Skip to content

Commit 460208e

Browse files
committed
Merge branch 'main' into reland_fp8_linear_2
2 parents 4e09ab8 + 9266734 commit 460208e

File tree

27 files changed

+845
-385
lines changed

27 files changed

+845
-385
lines changed

.github/workflows/1xL4_tests.yml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -51,3 +51,4 @@ jobs:
5151
pytest test/dtypes/test_affine_quantized_float.py --verbose -s
5252
./test/float8/test_everything_single_gpu.sh
5353
python test/quantization/quantize_/workflows/float8/test_float8_tensor.py
54+
python test/kernel/test_blockwise_triton.py --verbose -s

README.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -254,7 +254,7 @@ If you believe there's other CUDA kernels we should be taking a closer look at p
254254

255255
TorchAO is integrated into some of the leading open-source libraries including:
256256

257-
* Unsloth for QAT, blog post coming soon!
257+
* Unsloth now supports QAT: [Read blog](https://docs.unsloth.ai/new/quantization-aware-training-qat) and [guide](https://docs.unsloth.ai/new/quantization-aware-training-qat#qat--lora-finetuning).
258258
* HuggingFace transformers with a [builtin inference backend](https://huggingface.co/docs/transformers/main/quantization/torchao) and [low bit optimizers](https://github.com/huggingface/transformers/pull/31865)
259259
* HuggingFace diffusers best practices with `torch.compile` and TorchAO in a standalone repo [diffusers-torchao](https://github.com/huggingface/diffusers/blob/main/docs/source/en/quantization/torchao.md)
260260
* vLLM for LLM serving: [usage](https://docs.vllm.ai/en/latest/features/quantization/torchao.html), [detailed docs](https://docs.pytorch.org/ao/main/torchao_vllm_integration.html)

benchmarks/benchmark_blockwise_scaled_linear_triton.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@
1313
from triton.testing import do_bench
1414

1515
from torchao.float8.float8_utils import compute_error
16-
from torchao.prototype.blockwise_fp8_inference.blockwise_quantization import (
16+
from torchao.kernel.blockwise_quantization import (
1717
blockwise_fp8_gemm,
1818
fp8_blockwise_act_quant,
1919
fp8_blockwise_weight_quant,

test/dtypes/test_affine_quantized_float.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -152,7 +152,7 @@ def test_invalid_granularity(self):
152152
def test_mismatched_granularity(self):
153153
with pytest.raises(
154154
ValueError,
155-
match="Different granularities for activation and weight are not supported",
155+
match="Unsupported granularity types",
156156
):
157157
Float8DynamicActivationFloat8WeightConfig(
158158
granularity=(PerTensor(), PerRow())
@@ -165,7 +165,7 @@ def test_unsupported_granularity(self):
165165
class UnsupportedGranularity:
166166
pass
167167

168-
with pytest.raises(ValueError, match="Invalid granularity types"):
168+
with pytest.raises(ValueError, match="Unsupported granularity types"):
169169
Float8DynamicActivationFloat8WeightConfig(
170170
granularity=(UnsupportedGranularity(), UnsupportedGranularity()),
171171
)

test/prototype/test_blockwise_triton.py renamed to test/kernel/test_blockwise_triton.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111

1212
triton = pytest.importorskip("triton", reason="Triton required to run this test")
1313

14-
from torchao.prototype.blockwise_fp8_inference.blockwise_quantization import (
14+
from torchao.kernel.blockwise_quantization import (
1515
blockwise_fp8_gemm,
1616
fp8_blockwise_act_quant,
1717
fp8_blockwise_weight_dequant,

test/prototype/moe_training/test_fsdp.py

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -46,8 +46,8 @@
4646

4747
# this test requires torchtitan
4848
try:
49-
from torchtitan.distributed.expert_parallel import set_token_group_alignment_size_m
5049
from torchtitan.models.moe import MoE, MoEArgs
50+
from torchtitan.models.moe.utils import set_token_group_alignment_size_m
5151
except ImportError:
5252
pytest.skip(
5353
"torchtitan not installed, skipping MoE tests.", allow_module_level=True
@@ -62,9 +62,6 @@ def device_mesh_1d() -> DeviceMesh:
6262
"""
6363
rank = int(os.environ["RANK"])
6464
world_size = int(os.environ["WORLD_SIZE"])
65-
if not dist.is_initialized():
66-
dist.init_process_group("nccl", rank=rank, world_size=world_size)
67-
6865
device_mesh = init_device_mesh("cuda", (world_size,))
6966
torch.manual_seed(1)
7067
torch.cuda.set_device(rank)

test/prototype/moe_training/test_fsdp_tp.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -65,9 +65,9 @@
6565
ExpertTensorParallel,
6666
NoParallel,
6767
TensorParallel,
68-
set_token_group_alignment_size_m,
6968
)
7069
from torchtitan.models.moe import MoE, MoEArgs
70+
from torchtitan.models.moe.utils import set_token_group_alignment_size_m
7171
except ImportError:
7272
pytest.skip(
7373
"torchtitan not installed, skipping MoE tests.", allow_module_level=True

test/prototype/moe_training/test_tp.py

Lines changed: 2 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -58,14 +58,14 @@
5858

5959
# this test requires torchtitan
6060
try:
61+
from torchtitan.distributed import NoParallel
6162
from torchtitan.distributed.expert_parallel import (
6263
ExpertParallel,
6364
ExpertTensorParallel,
64-
NoParallel,
6565
TensorParallel,
66-
set_token_group_alignment_size_m,
6766
)
6867
from torchtitan.models.moe import MoE, MoEArgs
68+
from torchtitan.models.moe.utils import set_token_group_alignment_size_m
6969
except ImportError:
7070
pytest.skip(
7171
"torchtitan not installed, skipping MoE tests.", allow_module_level=True
@@ -80,9 +80,6 @@ def device_mesh_1d() -> DeviceMesh:
8080
"""
8181
rank = int(os.environ["RANK"])
8282
world_size = int(os.environ["WORLD_SIZE"])
83-
if not dist.is_initialized():
84-
dist.init_process_group("nccl", rank=rank, world_size=world_size)
85-
8683
device_mesh = init_device_mesh("cuda", (world_size,))
8784
torch.manual_seed(1)
8885
torch.cuda.set_device(rank)

test/prototype/moe_training/test_training.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -22,10 +22,10 @@
2222

2323
# this test requires torchtitan
2424
try:
25-
from torchtitan.distributed.expert_parallel import (
25+
from torchtitan.models.moe import MoE, MoEArgs
26+
from torchtitan.models.moe.utils import (
2627
set_token_group_alignment_size_m,
2728
)
28-
from torchtitan.models.moe import MoE, MoEArgs
2929
except ImportError:
3030
pytest.skip(
3131
"torchtitan not installed, skipping MoE tests.", allow_module_level=True

test/prototype/mx_formats/test_mx_tensor.py

Lines changed: 2 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -116,8 +116,6 @@ def test_some_zeros(elem_dtype):
116116
_test_mx(data, elem_dtype, block_size)
117117

118118

119-
# TODO(future PR): fix and reenable this test
120-
@pytest.mark.skip(reason="does not pass on B200 yet")
121119
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
122120
def test_to_mx_rceil():
123121
# nan
@@ -131,11 +129,7 @@ def test_to_mx_rceil():
131129
],
132130
dtype=torch.uint32,
133131
).view(torch.float32)
134-
# fmt: on
135-
ground_truth_scale = torch.tensor([255], dtype=torch.uint8).view(
136-
torch.float8_e8m0fnu
137-
)
138-
# fmt: off
132+
139133
ground_truth_fp8 = torch.tensor(
140134
[
141135
127, 0, 0, 0, 0, 0, 0, 0,
@@ -149,7 +143,7 @@ def test_to_mx_rceil():
149143
data_mx = MXTensor.to_mx(
150144
data_hp, torch.float8_e4m3fn, 32, ScaleCalculationMode.RCEIL
151145
)
152-
torch.testing.assert_close(data_mx.scale, ground_truth_scale)
146+
assert torch.isnan(data_mx.scale)
153147
assert torch.isnan(data_mx.qdata[0])
154148
assert torch.all(data_mx.qdata[1:] == 0)
155149
# fp32 denorm

0 commit comments

Comments
 (0)