
Commit 8361eb5

[Examples] Add the support of rocm arch detecting (#661)
zhangnju authored
Co-authored-by: zhangnju <ningzhan@SMC-SC-DI08-33.dh144.dcgpu>
1 parent: d764dca

File tree: 8 files changed, +50 −10 lines

benchmark/matmul/benchmark_matmul.py

Lines changed: 7 additions & 1 deletion
@@ -49,8 +49,14 @@ def get_configs(args, kwargs):
     if with_roller:
         from tilelang.carver.template import MatmulTemplate
         from tilelang.carver.arch import CUDA
+        from tilelang.carver.arch import CDNA
         from tilelang.carver.roller.rasterization import NoRasterization
-        arch = CUDA("cuda")
+        import torch
+
+        if torch.version.hip is not None:
+            arch = CDNA("hip")
+        else:
+            arch = CUDA("cuda")
         topk = 10
 
         carve_template = MatmulTemplate(
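
The same check recurs in every file of this commit: ROCm builds of PyTorch expose torch.version.hip as a version string, while CUDA and CPU-only builds leave it as None, so testing it cleanly separates the two runtimes. A minimal standalone sketch of the pattern (the describe_backend helper is ours, for illustration only, not part of tilelang):

import torch

def describe_backend() -> str:
    # On a ROCm build, torch.version.hip is a version string (e.g. "6.2");
    # on a CUDA or CPU-only build it is None.
    if torch.version.hip is not None:
        return f"ROCm/HIP build, version {torch.version.hip}"
    return "CUDA or CPU-only build (torch.version.hip is None)"

print(describe_backend())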

benchmark/matmul/benchmark_matmul_intrinsic.py

Lines changed: 7 additions & 1 deletion
@@ -183,8 +183,14 @@ def get_configs(args, kwargs):
     if with_roller:
         from tilelang.carver.template import MatmulTemplate
         from tilelang.carver.arch import CUDA
+        from tilelang.carver.arch import CDNA
         from tilelang.carver.roller.rasterization import NoRasterization
-        arch = CUDA("cuda")
+        import torch
+
+        if torch.version.hip is not None:
+            arch = CDNA("hip")
+        else:
+            arch = CUDA("cuda")
         topk = 10
 
         carve_template = MatmulTemplate(

benchmark/matmul_fp8/benchmark_matmul.py

Lines changed: 7 additions & 1 deletion
@@ -50,8 +50,14 @@ def get_configs(args, kwargs):
     if with_roller:
         from tilelang.carver.template import MatmulTemplate
         from tilelang.carver.arch import CUDA
+        from tilelang.carver.arch import CDNA
         from tilelang.carver.roller.rasterization import NoRasterization
-        arch = CUDA("cuda")
+        import torch
+
+        if torch.version.hip is not None:
+            arch = CDNA("hip")
+        else:
+            arch = CUDA("cuda")
         topk = 10
 
         carve_template = MatmulTemplate(

examples/analyze/example_conv_analyze.py

Lines changed: 6 additions & 2 deletions
@@ -1,8 +1,9 @@
 import tilelang.language as T
 from tilelang.tools import Analyzer
 from tilelang.carver.arch import CUDA
+from tilelang.carver.arch import CDNA
 from tilelang.layout import make_swizzled_layout
-
+import torch
 N = 64
 C = 256
 H = 512
@@ -94,7 +95,10 @@ def conv(
 
 def main():
     my_func = kernel(N, C, H, W, F, K, S, D, P, 64, 128, 32, 3, 256)
-    cuda_device = CUDA("cuda")
+    if torch.version.hip is not None:
+        cuda_device = CDNA("hip")
+    else:
+        cuda_device = CUDA("cuda")
     result = Analyzer.analysis(my_func, cuda_device)
     print(result)
     print(f"Analyzed FLOPs: {result.total_flops}")

examples/analyze/example_gemm_analyze.py

Lines changed: 6 additions & 1 deletion
@@ -1,6 +1,8 @@
 import tilelang.language as T
 from tilelang.tools import Analyzer
 from tilelang.carver.arch import CUDA
+from tilelang.carver.arch import CDNA
+import torch
 
 M = N = K = 1024
 
@@ -47,7 +49,10 @@ def matmul(
 def main():
     my_func = kernel(128, 128, 32, 3, 128, True)
 
-    cuda_device = CUDA("cuda")
+    if torch.version.hip is not None:
+        cuda_device = CDNA("hip")
+    else:
+        cuda_device = CUDA("cuda")
     result = Analyzer.analysis(my_func, cuda_device)
 
     print(f"Analyzed FLOPs: {result.total_flops}")

examples/convolution/example_convolution_autotune.py

Lines changed: 5 additions & 1 deletion
@@ -6,6 +6,7 @@
 from tilelang.autotuner import AutoTuner
 from tilelang.carver.template import ConvTemplate
 from tilelang.carver.arch import CUDA
+from tilelang.carver.arch import CDNA
 from tilelang.carver.roller.rasterization import NoRasterization
 
 
@@ -31,7 +32,10 @@ def main(A, B):
 
 def get_configs(N, C, H, W, F, K, S, D, P, with_roller=False, topk=15):
     if with_roller:
-        arch = CUDA("cuda")
+        if torch.version.hip is not None:
+            arch = CDNA("hip")
+        else:
+            arch = CUDA("cuda")
         carve_template = ConvTemplate(
             N=N,
             C=C,

examples/gemm/example_gemm_autotune.py

Lines changed: 5 additions & 1 deletion
@@ -6,6 +6,7 @@
 from tilelang.autotuner import AutoTuner
 from tilelang.carver.template import MatmulTemplate
 from tilelang.carver.arch import CUDA
+from tilelang.carver.arch import CDNA
 from tilelang.carver.roller.rasterization import NoRasterization
 
 
@@ -15,7 +16,10 @@ def ref_program(A, B):
 
 def get_configs(M, N, K, with_roller=False, topk=20):
     if with_roller:
-        arch = CUDA("cuda")
+        if torch.version.hip is not None:
+            arch = CDNA("hip")
+        else:
+            arch = CUDA("cuda")
         carve_template = MatmulTemplate(
             M=M,
             N=N,

tilelang/carver/arch/__init__.py

Lines changed: 7 additions & 2 deletions
@@ -4,7 +4,7 @@
 from .cdna import CDNA
 from typing import Union
 from tvm.target import Target
-
+import torch
 
 def get_arch(target: Union[str, Target] = "cuda") -> TileDevice:
     if isinstance(target, str):
@@ -23,7 +23,12 @@ def get_arch(target: Union[str, Target] = "cuda") -> TileDevice:
 def auto_infer_current_arch() -> TileDevice:
     # TODO(lei): This is a temporary solution to infer the current architecture
     # Can be replaced by a more sophisticated method in the future
-    return get_arch("cuda")
+    if torch.version.hip is not None:
+        return get_arch("hip")
+    if torch.cuda.is_available():
+        return get_arch("cuda")
+    else:
+        return get_arch("llvm")
 
 
 from .cpu import is_cpu_arch  # noqa: F401
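
Note the ordering: the HIP test must come before torch.cuda.is_available(), because ROCm builds of PyTorch also report CUDA as available (HIP is exposed through the CUDA API surface). With this change the inference order is HIP, then CUDA, then an LLVM (CPU) fallback. A short usage sketch, assuming only the public path shown above:

from tilelang.carver.arch import auto_infer_current_arch

arch = auto_infer_current_arch()
# e.g. a CDNA instance on ROCm, a CUDA instance on NVIDIA builds,
# and the LLVM/CPU arch when no GPU runtime is present.
print(type(arch).__name__)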
