
Commit 2e59910

Merge branch 'main' of https://github.com/tile-ai/tilelang into sm89fix
2 parents 88abdf8 + 232782d commit 2e59910

11 files changed: +641 -58 lines changed


CMakeLists.txt

Lines changed: 14 additions & 0 deletions
@@ -4,6 +4,15 @@
 cmake_minimum_required(VERSION 3.18)
 project(TILE_LANG C CXX)

+option(TILE_LANG_STATIC_STDCPP "Statically link libstdc++ for TileLang libraries" ON)
+option(TILE_LANG_INSTALL_STATIC_LIB "Install the static library" ON)
+
+if(TILE_LANG_STATIC_STDCPP)
+  message(STATUS "Enabling static linking of C++ standard library")
+  # Note: We'll apply static linking flags selectively to avoid Python extension conflicts
+  # The flags will be applied per-target below rather than globally
+endif()
+
 # Set default build type to Release if not provided
 if(NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type")

@@ -218,6 +227,11 @@ add_library(tilelang_static STATIC $<TARGET_OBJECTS:tilelang_objs>)
 add_dependencies(tilelang_static tvm_runtime)
 set_target_properties(tilelang_static PROPERTIES OUTPUT_NAME tilelang)

+# Apply static linking flags only to static library to avoid Python extension conflicts
+if(TILE_LANG_STATIC_STDCPP AND CMAKE_CXX_COMPILER_ID MATCHES "GNU")
+  target_link_options(tilelang_static PRIVATE -static-libstdc++ -static-libgcc)
+endif()
+
 # Debug build type-specific definitions
 if(CMAKE_BUILD_TYPE STREQUAL "Debug")
   target_compile_definitions(tilelang PRIVATE "TVM_LOG_DEBUG")

examples/dequantize_gemm/example_dequant_gemm_bf16_mxfp4_hopper.py

Lines changed: 4 additions & 12 deletions
@@ -389,9 +389,7 @@ def ref_program_twiddling(A, qB, Scale, Bias=None):
     """
     dtypeC = "bfloat16"
     B = torch_convert_bit_twiddling(qB)
-    for i in range(B.shape[0]):
-        for j in range(B.shape[1]):
-            B[i][j] = B[i][j] * (2**(Scale[i][j // 32]))
+    B *= 2**(Scale[:, (torch.arange(B.shape[1], device=B.device) // 32)])
     C = torch.matmul(A.to(torch.float), B.T.to(torch.float))
     C = C.to(torch.__getattribute__(dtypeC))
     return C

@@ -414,9 +412,7 @@ def ref_program_twiddling_with_bias(A, qB, Scale, Bias):
     """
     dtypeC = "bfloat16"
     B = torch_convert_bit_twiddling(qB)
-    for i in range(B.shape[0]):
-        for j in range(B.shape[1]):
-            B[i][j] = B[i][j] * (2**(Scale[i][j // 32]))
+    B *= 2**(Scale[:, (torch.arange(B.shape[1], device=B.device) // 32)])
     C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) + Bias
     C = C.to(torch.__getattribute__(dtypeC))
     return C

@@ -440,9 +436,7 @@ def ref_program_simple(A, qB, Scale, Bias=None):
     """
     dtypeC = "bfloat16"
     B = torch_convert(qB)
-    for i in range(B.shape[0]):
-        for j in range(B.shape[1]):
-            B[i][j] = B[i][j] * (2**(Scale[i][j // 32]))
+    B *= 2**(Scale[:, (torch.arange(B.shape[1], device=B.device) // 32)])
     C = torch.matmul(A.to(torch.float), B.T.to(torch.float))
     C = C.to(torch.__getattribute__(dtypeC))
     return C

@@ -470,9 +464,7 @@ def ref_program_simple_with_bias(A, qB, Scale, Bias):
     """
     dtypeC = "bfloat16"
     B = torch_convert(qB)
-    for i in range(B.shape[0]):
-        for j in range(B.shape[1]):
-            B[i][j] = B[i][j] * (2**(Scale[i][j // 32]))
+    B *= 2**(Scale[:, (torch.arange(B.shape[1], device=B.device) // 32)])
     C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) + Bias
     C = C.to(torch.__getattribute__(dtypeC))
     return C
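
All four reference programs above swap the same nested Python loop for one vectorized line. A quick self-contained sanity check of that equivalence (illustrative only; the 32-column scale groups follow the diff, every other name and shape is made up):

import torch

N, K = 4, 128  # K must be a multiple of the 32-column scale group
B_loop = torch.randn(N, K, dtype=torch.float32)
B_vec = B_loop.clone()
Scale = torch.randint(-4, 5, (N, K // 32)).to(torch.float32)

# Original formulation: one scale exponent per 32-column group, applied element-wise
for i in range(N):
    for j in range(K):
        B_loop[i][j] = B_loop[i][j] * (2**(Scale[i][j // 32]))

# Vectorized formulation from the diff
B_vec *= 2**(Scale[:, torch.arange(K, device=B_vec.device) // 32])

assert torch.allclose(B_loop, B_vec)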

examples/dequantize_gemm/example_dequant_gemm_fine_grained.py

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ def matmul(
     threads,
     num_bits=4,
 ):
-    from bitblas.quantization import _tir_packed_to_unsigned_convert
+    from tilelang.quantize import _tir_packed_to_unsigned_convert
     num_elems_per_byte = 8 // num_bits
     storage_dtype = "int8"
     storage_nbit = int("".join(c for c in storage_dtype if c.isdigit()))

examples/dequantize_gemm/example_dequant_groupedgemm_bf16_mxfp4_hopper.py

Lines changed: 511 additions & 0 deletions
Large diffs are not rendered by default.

examples/dequantize_gemm/test_example_dequantize_gemm.py

Lines changed: 8 additions & 0 deletions
@@ -4,6 +4,7 @@
 import example_dequant_gemm_fp4_hopper
 import example_dequant_gemm_bf16_mxfp4_hopper
 import example_dequant_gemm_bf16_mxfp4_hopper_tma
+import example_dequant_groupedgemm_bf16_mxfp4_hopper
 import example_dequant_gemm_w4a8


@@ -31,6 +32,13 @@ def test_example_dequant_gemm_bf16_mxfp4_hopper_tma():


 @tilelang.testing.requires_cuda
+@tilelang.testing.requires_cuda_compute_version_ge(9, 0)
+def test_example_dequant_groupedgemm_bf16_mxfp4_hopper():
+    example_dequant_groupedgemm_bf16_mxfp4_hopper.main()
+
+
+@tilelang.testing.requires_cuda
+@tilelang.testing.requires_cuda_compute_version_ge(9, 0)
 def test_example_dequant_gemm_w4a8():
     example_dequant_gemm_w4a8.main()


examples/dequantize_gemm/utils.py

Lines changed: 76 additions & 32 deletions
@@ -3,8 +3,6 @@

 def torch_convert_bit_twiddling(tensor):
     """
-    Convert a 2-D uint8 tensor into a bfloat16 tensor by decoding pairs of input bytes with a bit-twiddling scheme.
-
     This function expects `tensor` to be a 2-D torch.Tensor of dtype `torch.uint8`. Each output element is produced by combining two input bytes and extracting a bf16-like 16-bit pattern according to one of four positional bit layouts (pos 0..3). The result is scaled by 2**126 to adjust the exponent bias and returned as dtype `torch.bfloat16`.

     Parameters:
@@ -16,38 +14,46 @@ def torch_convert_bit_twiddling(tensor):
     Raises:
         AssertionError: If any byte inputs used for a conversion are not dtype `torch.uint8`.
     """
+    assert tensor.dim() == 2 and tensor.dtype == torch.uint8
+    N, K = tensor.shape
+    assert K % 2 == 0, "Number of columns must be even"

-    def _convert(val0, val1, pos) -> torch.bfloat16:
-        assert val0.dtype == torch.uint8
-        assert val1.dtype == torch.uint8
-        val0 = val0.view(torch.uint8)
-        val1 = val1.view(torch.uint8)
-        val_concat = (val0.item() << 8) | val1.item()
-        mask = 0b1000000111000000
-        if pos == 0:
-            bf16 = val_concat & mask
-        elif pos == 1:
-            bf16 = (val_concat << 3) & mask
-        elif pos == 2:
-            bf16 = (val_concat << 6) & mask
-        elif pos == 3:
-            mask1 = 0b1000000000000000
-            mask2 = 0b0000000110000000
-            mask3 = 0b0000000001000000
-            bf16 = ((val_concat << 1) & mask1) | ((val_concat >> 3) & mask2) | (
-                (val_concat >> 7) & mask3)
-        bf16_new = torch.tensor([bf16], dtype=torch.uint16, device=val0.device).view(torch.bfloat16)
-        # Add bias for change from fp4 to bf16
-        bf16_new = bf16_new.item() * (2**126)
-        return bf16_new
+    # Combine pairs of uint8 values into uint32 for safe bitwise ops on CUDA
+    val0 = tensor[:, 0::2].to(torch.int32)
+    val1 = tensor[:, 1::2].to(torch.int32)
+    val_concat = (val0 << 8) | val1  # (N, K//2), uint32

-    N = tensor.shape[0]
-    K = tensor.shape[1]
-    new_tensor = torch.empty(N, K * 2, dtype=torch.bfloat16, device=tensor.device)
-    for i in range(new_tensor.shape[0]):
-        for j in range(new_tensor.shape[1]):
-            new_tensor[i][j] = _convert(tensor[i][j // 4 * 2], tensor[i][j // 4 * 2 + 1], j % 4)
-    return new_tensor
+    # Expand to match output shape where each pair generates 4 values
+    val_concat_expanded = val_concat.repeat_interleave(4, dim=1)  # (N, K//2*4)
+
+    # Positional encoding for bit-twiddling logic
+    pos = torch.arange(K * 2, device=tensor.device) % 4  # (K*2,)
+
+    # Bit masks for decoding (as uint32 for CUDA compatibility)
+    mask = 0b1000000111000000
+    mask1 = 0b1000000000000000
+    mask2 = 0b0000000110000000
+    mask3 = 0b0000000001000000
+
+    # Calculate results for all 4 positions in parallel
+    res0 = val_concat_expanded & mask
+    res1 = (val_concat_expanded << 3) & mask
+    res2 = (val_concat_expanded << 6) & mask
+    res3 = ((val_concat_expanded << 1) & mask1) | ((val_concat_expanded >> 3) & mask2) | (
+        (val_concat_expanded >> 7) & mask3)
+
+    # Select the correct result based on position
+    bf16 = torch.where(pos == 0, res0, torch.where(pos == 1, res1,
+                                                   torch.where(pos == 2, res2, res3)))
+
+    # Convert to uint16 for .view(torch.bfloat16)
+    bf16_uint16 = (bf16 & 0xFFFF).to(torch.uint16)
+    bf16_bf16 = bf16_uint16.view(torch.bfloat16)
+
+    # Avoid integer overflow by using a float32 multiplier for the exponent scaling
+    bf16_new = bf16_bf16 * (2.0**126)
+
+    return bf16_new


 def torch_convert(tensor, scale_size=None, Scale=None):
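
As a side note on the vectorized rewrite above, the following cross-check (not part of the commit) compares the new torch_convert_bit_twiddling against a scalar reference transcribed from the deleted _convert helper. It assumes it is run from examples/dequantize_gemm/ so that `utils` resolves to the file shown here:

import torch
from utils import torch_convert_bit_twiddling


def _convert_scalar(val0, val1, pos):
    # Scalar reference, mirroring the deleted _convert helper
    val_concat = (val0 << 8) | val1
    mask = 0b1000000111000000
    if pos == 0:
        bf16 = val_concat & mask
    elif pos == 1:
        bf16 = (val_concat << 3) & mask
    elif pos == 2:
        bf16 = (val_concat << 6) & mask
    else:
        bf16 = ((val_concat << 1) & 0b1000000000000000) | \
               ((val_concat >> 3) & 0b0000000110000000) | \
               ((val_concat >> 7) & 0b0000000001000000)
    out = torch.tensor([bf16], dtype=torch.uint16).view(torch.bfloat16)
    return out * (2.0**126)  # fp4 -> bf16 exponent-bias adjustment


qB = torch.randint(0, 256, (2, 8), dtype=torch.uint8)
B = torch_convert_bit_twiddling(qB)  # (2, 16) bfloat16: each byte pair yields 4 values

# Every output element should match the scalar decode of its source byte pair
for i in range(B.shape[0]):
    for j in range(B.shape[1]):
        ref = _convert_scalar(int(qB[i, j // 4 * 2]), int(qB[i, j // 4 * 2 + 1]), j % 4)
        assert torch.equal(B[i, j], ref[0])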
@@ -106,3 +112,41 @@ def print_bit(name, val):
     val_cpu = val.cpu().item()
     binary_repr = f'{val_cpu:032b}'
     print(name, binary_repr)
+
+
+def print_red_warning(message):
+    print(f"\033[31mWARNING: {message}\033[0m")
+
+
+def calc_sim(x, y, name="tensor"):
+    x, y = x.data.double(), y.data.double()
+    denominator = (x * x + y * y).sum()
+    if denominator == 0:
+        print_red_warning(f'{name} all zero')
+        return 1
+    sim = 2 * (x * y).sum() / denominator
+    return sim
+
+
+def assert_similar(x, y, eps=1e-8, name="tensor", data="", raise_assert=True):
+    x_mask = torch.isfinite(x)
+    y_mask = torch.isfinite(y)
+    if not torch.all(x_mask == y_mask):
+        print_red_warning(f'{name} Error: isfinite mask mismatch')
+        if raise_assert:
+            raise AssertionError
+    if not torch.isclose(
+            x.masked_fill(x_mask, 0), y.masked_fill(y_mask, 0), rtol=0, atol=0,
+            equal_nan=True).all():
+        print_red_warning(f'{name} Error: nonfinite value mismatch')
+        if raise_assert:
+            raise AssertionError
+    x = x.masked_fill(~x_mask, 0)
+    y = y.masked_fill(~y_mask, 0)
+    sim = calc_sim(x, y, name)
+    diff = (1. - sim).item()
+    print(f'{diff=}')
+    if not (0 <= diff <= eps):
+        print_red_warning(f'{name} Error: {diff=}')
+        if raise_assert:
+            raise AssertionError
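
The new calc_sim / assert_similar helpers score two tensors by the symmetric similarity 2*sum(x*y) / sum(x*x + y*y) and flag diff = 1 - sim above eps. A minimal usage sketch (assumed to run from examples/dequantize_gemm/ so `utils` is importable; the tensors are made up):

import torch
from utils import assert_similar, calc_sim

ref = torch.randn(128, 256)
out = ref + 1e-4 * torch.randn_like(ref)  # stand-in for a kernel result with small error

sim = calc_sim(out, ref, name="C")
print(f"similarity = {float(sim):.8f}")

# Passes silently when diff = 1 - sim stays within eps; raises AssertionError otherwise
assert_similar(out, ref, eps=1e-6, name="C")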

maint/scripts/pypi.Dockerfile

Lines changed: 21 additions & 5 deletions
@@ -2,24 +2,40 @@ FROM nvidia/cuda:12.1.0-devel-ubuntu18.04

 RUN set -eux; \
     apt-get update; \
-    apt-get install -y wget curl libtinfo-dev zlib1g-dev libssl-dev build-essential libedit-dev libxml2-dev git; \
+    # Install gcc-9 and g++-9
+    apt-get install -y software-properties-common; \
+    add-apt-repository ppa:ubuntu-toolchain-r/test -y; \
+    apt-get update; \
+    apt-get install -y wget curl libtinfo-dev zlib1g-dev libssl-dev build-essential \
+        libedit-dev libxml2-dev git gcc-9 g++-9; \
+    # Switch default gcc/g++ to new version
+    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 100; \
+    update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 100; \
+    update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 100; \
+    update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 100; \
+    gcc --version; g++ --version; \
     curl -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh; \
     bash Miniconda3-latest-Linux-x86_64.sh -b -p /miniconda3; \
-    rm Miniconda3-latest-Linux-x86_64.sh
+    rm Miniconda3-latest-Linux-x86_64.sh;
+
+RUN apt-get update && apt-get install -y ninja-build

 ENV PATH=/miniconda3/bin/:$PATH

+# ✅ Accept Anaconda Terms of Service for both required channels
+RUN conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main && \
+    conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r
+
+# Create environments
 RUN set -eux; \
-    conda create -n py38 python=3.8 -y; \
     conda create -n py39 python=3.9 -y; \
     conda create -n py310 python=3.10 -y; \
     conda create -n py311 python=3.11 -y; \
     conda create -n py312 python=3.12 -y; \
-    ln -s /miniconda3/envs/py38/bin/python3.8 /usr/bin/python3.8; \
     ln -s /miniconda3/envs/py39/bin/python3.9 /usr/bin/python3.9; \
     ln -s /miniconda3/envs/py310/bin/python3.10 /usr/bin/python3.10; \
     ln -s /miniconda3/envs/py311/bin/python3.11 /usr/bin/python3.11; \
     ln -s /miniconda3/envs/py312/bin/python3.12 /usr/bin/python3.12; \
     conda install -y cmake patchelf

-WORKDIR /tilelang
+WORKDIR /tilelang

pyproject.toml

Lines changed: 1 addition & 5 deletions
@@ -4,13 +4,9 @@ requires = [
     "cmake>=3.26",
     "packaging",
     "setuptools>=61",
-    "torch",
     "wheel",
-    "tox",
-    "auditwheel",
     "patchelf",
-    "ninja",
-    "Cython",
+    "Cython>=3.0.0",
 ]
 build-backend = "setuptools.build_meta"


requirements-build.txt

Lines changed: 2 additions & 1 deletion
@@ -1,5 +1,5 @@
 # Should be mirrored in pyproject.toml
-Cython
+Cython>=3.0.0
 build
 cmake>=3.26
 packaging
@@ -9,3 +9,4 @@ wheel
 tox
 auditwheel
 patchelf
+ninja

tilelang/language/builtin.py

Lines changed: 2 additions & 2 deletions
@@ -331,13 +331,13 @@ def shfl_up(value: Union[int, PrimExpr, tir.Call], offset: Union[int, PrimExpr,


 def sync_threads():
-    """Synchronize all threads in a warp.
+    """Synchronize all threads in a block.
     """
     return tir.op.tvm_storage_sync("shared")


 def sync_global():
-    """Synchronize all threads in a block.
+    """Synchronize all threads in the entire grid.
     """
     tx, ty, tz = get_thread_bindings()
     ex, ey, ez = get_block_extents()
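
A minimal sketch of the distinction the corrected docstrings draw (assuming an installed tilelang build so the module above is importable; sync_global() only makes sense inside a kernel, so it is left as a comment):

from tilelang.language.builtin import sync_threads

# sync_threads() builds the block-scope barrier intrinsic tvm_storage_sync("shared"),
# i.e. every thread in the current block waits here (typically after shared-memory writes).
barrier = sync_threads()
print(barrier)

# sync_global(), by contrast, synchronizes the entire grid; it is constructed from
# get_thread_bindings() / get_block_extents(), so it can only be emitted inside a kernel.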
