diff --git a/CMakeLists.txt b/CMakeLists.txt
index 58a61a68a..7137a43e2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,6 +5,13 @@ cmake_minimum_required(VERSION 3.18)
 project(TILE_LANG C CXX)
 
 option(TILE_LANG_STATIC_STDCPP "Statically link libstdc++ for TileLang libraries" ON)
+option(TILE_LANG_INSTALL_STATIC_LIB "Install the static library" ON)
+
+if(TILE_LANG_STATIC_STDCPP)
+  message(STATUS "Enabling static linking of C++ standard library")
+  # Note: the static-linking flags are intentionally not added to the global flags here.
+  # They are attached to individual targets further below to avoid Python extension conflicts.
+endif()
 
 # Set default build type to Release if not provided
 if(NOT CMAKE_BUILD_TYPE)
@@ -63,18 +70,6 @@ if(TILE_LANG_INSTALL_STATIC_LIB)
   set(BUILD_STATIC_RUNTIME ON)
 endif()
 
-if(TILE_LANG_STATIC_STDCPP)
-  message(STATUS "Enabling static linking of C++ standard library")
-  # Set compile flags for static linking of the C++ standard library
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -static-libstdc++")
-  # For some compilers, additional flags may be required
-  if(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
-    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libstdc++")
-    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -static-libstdc++")
-    set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -static-libstdc++")
-  endif()
-endif()
-
 # Enforce CUDA standard
 if(USE_CUDA)
   set(CMAKE_CUDA_STANDARD 17)
@@ -232,6 +227,11 @@ add_library(tilelang_static STATIC $<TARGET_OBJECTS:tilelang_objs>)
 add_dependencies(tilelang_static tvm_runtime)
 set_target_properties(tilelang_static PROPERTIES OUTPUT_NAME tilelang)
 
+# Static archives are not linked, so PRIVATE link options are ignored; INTERFACE forwards them to consumers
+if(TILE_LANG_STATIC_STDCPP AND CMAKE_CXX_COMPILER_ID MATCHES "GNU")
+  target_link_options(tilelang_static INTERFACE -static-libstdc++ -static-libgcc)
+endif()
+
 # Debug build type-specific definitions
 if(CMAKE_BUILD_TYPE STREQUAL "Debug")
   target_compile_definitions(tilelang PRIVATE "TVM_LOG_DEBUG")
diff --git a/maint/scripts/pypi.Dockerfile b/maint/scripts/pypi.Dockerfile
index 6ddf708b0..1ad5f1bc4 100644
--- a/maint/scripts/pypi.Dockerfile
+++ b/maint/scripts/pypi.Dockerfile
@@ -2,24 +2,40 @@ FROM nvidia/cuda:12.1.0-devel-ubuntu18.04
 
 RUN set -eux; \
     apt-get update; \
-    apt-get install -y wget curl libtinfo-dev zlib1g-dev libssl-dev build-essential libedit-dev libxml2-dev git; \
+    # Install gcc-9 and g++-9
+    apt-get install -y software-properties-common; \
+    add-apt-repository ppa:ubuntu-toolchain-r/test -y; \
+    apt-get update; \
+    apt-get install -y wget curl libtinfo-dev zlib1g-dev libssl-dev build-essential \
+                       libedit-dev libxml2-dev git gcc-9 g++-9; \
+    # Switch default gcc/g++ to new version
+    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 100; \
+    update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 100; \
+    update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 100; \
+    update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 100; \
+    gcc --version; g++ --version; \
     curl -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh; \
     bash Miniconda3-latest-Linux-x86_64.sh -b -p /miniconda3; \
-    rm Miniconda3-latest-Linux-x86_64.sh
+    rm Miniconda3-latest-Linux-x86_64.sh;
+
+RUN apt-get update && apt-get install -y ninja-build
 
 ENV PATH=/miniconda3/bin/:$PATH
 
+# Accept the Anaconda Terms of Service for the pkgs/main and pkgs/r channels
+RUN conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main && \
+    conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r
+
+# Create environments
 RUN set -eux; \
-    conda create -n py38 python=3.8 -y; \
     conda create -n py39 python=3.9 -y; \
     conda create -n py310 python=3.10 -y; \
     conda create -n py311 python=3.11 -y; \
     conda create -n py312 python=3.12 -y; \
-    ln -s /miniconda3/envs/py38/bin/python3.8 /usr/bin/python3.8; \
     ln -s /miniconda3/envs/py39/bin/python3.9 /usr/bin/python3.9; \
     ln -s /miniconda3/envs/py310/bin/python3.10 /usr/bin/python3.10; \
     ln -s /miniconda3/envs/py311/bin/python3.11 /usr/bin/python3.11; \
     ln -s /miniconda3/envs/py312/bin/python3.12 /usr/bin/python3.12; \
     conda install -y cmake patchelf
 
-WORKDIR /tilelang
\ No newline at end of file
+WORKDIR /tilelang
diff --git a/pyproject.toml b/pyproject.toml
index 95a894ced..43eecf879 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,13 +4,9 @@ requires = [
     "cmake>=3.26",
     "packaging",
     "setuptools>=61",
-    "torch",
     "wheel",
-    "tox",
-    "auditwheel",
     "patchelf",
-    "ninja",
-    "Cython",
+    "Cython>=3.0.0",
 ]
 build-backend = "setuptools.build_meta"
 
diff --git a/requirements-build.txt b/requirements-build.txt
index 0c18991fd..4280a7173 100644
--- a/requirements-build.txt
+++ b/requirements-build.txt
@@ -1,5 +1,5 @@
 # Should be mirrored in pyproject.toml
-Cython
+Cython>=3.0.0
 build
 cmake>=3.26
 packaging
@@ -9,3 +9,4 @@ wheel
 tox
 auditwheel
 patchelf
+ninja
diff --git a/requirements.txt b/requirements.txt
index f69a5259a..1a44b9a71 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 # runtime requirements
-Cython
+Cython>=3.0.0
 numpy>=1.23.5
 tqdm>=4.62.3
 typing_extensions>=4.10.0
diff --git a/src/tl_templates/cuda/gemm_sm120.h b/src/tl_templates/cuda/gemm_sm120.h
index 1e7be8fc1..122f56642 100644
--- a/src/tl_templates/cuda/gemm_sm120.h
+++ b/src/tl_templates/cuda/gemm_sm120.h
@@ -1,3 +1,9 @@
 #pragma once
 
 #include "gemm_mma.h"
+
+namespace tl {
+using tl_mma::gemm_rs;
+using tl_mma::gemm_sr;
+using tl_mma::gemm_ss;
+} // namespace tl
diff --git a/src/tl_templates/cuda/gemm_sm80.h b/src/tl_templates/cuda/gemm_sm80.h
index 1e7be8fc1..122f56642 100644
--- a/src/tl_templates/cuda/gemm_sm80.h
+++ b/src/tl_templates/cuda/gemm_sm80.h
@@ -1,3 +1,9 @@
 #pragma once
 
 #include "gemm_mma.h"
+
+namespace tl {
+using tl_mma::gemm_rs;
+using tl_mma::gemm_sr;
+using tl_mma::gemm_ss;
+} // namespace tl
diff --git a/src/tl_templates/cuda/gemm_sm89.h b/src/tl_templates/cuda/gemm_sm89.h
index f02ef3e60..d64ae9e2e 100644
--- a/src/tl_templates/cuda/gemm_sm89.h
+++ b/src/tl_templates/cuda/gemm_sm89.h
@@ -5,3 +5,9 @@
 #include "cuda_fp8.h"
 
 #include "gemm_mma.h"
+
+namespace tl {
+using tl_mma::gemm_rs;
+using tl_mma::gemm_sr;
+using tl_mma::gemm_ss;
+} // namespace tl