From 324066ad57fec8f1e3b807df8ec2633e041c62a4 Mon Sep 17 00:00:00 2001
From: Tristan Konolige
Date: Fri, 7 Aug 2020 12:29:35 -0700
Subject: [PATCH] [TESTS] Refactor tests to run on either the GPU or CPU.

Much of the time spent in testing is duplicated work between CPU and GPU
test nodes. The main reason is that there is no way to control which TVM
devices are enabled at runtime, so tests that use LLVM will run on both
GPU and CPU nodes. This patch adds an environment variable,
TVM_TEST_DEVICES, which controls which TVM devices should be used by
tests. Devices not listed in TVM_TEST_DEVICES may still be present, so
tests must be careful to check that the desired device is enabled with
`tvm.testing.device_enabled` or by enumerating all enabled devices with
`tvm.testing.enabled_devices`. All tests have been retrofitted with these
checks.

This patch also provides the decorator `@tvm.testing.gpu` to mark a test
as possibly using the GPU. Tests that require a GPU can use
`@tvm.testing.requires_gpu`. Tests without these flags will not be run on
GPU nodes.
---
 Jenkinsfile | 8 +-
 python/tvm/_ffi/runtime_ctypes.py | 6 +
 python/tvm/relay/testing/__init__.py | 4 +-
 python/tvm/testing.py | 104 +++++++++
 tests/lint/check_file_type.py | 1 +
 .../config.py => tests/python/conftest.py | 17 +-
 tests/python/contrib/test_cblas.py | 7 +-
 tests/python/contrib/test_cublas.py | 13 +-
 tests/python/contrib/test_cudnn.py | 13 +-
 tests/python/contrib/test_random.py | 29 +--
 tests/python/frontend/caffe2/test_forward.py | 10 +-
 .../frontend/coreml/model_zoo/__init__.py | 43 ----
 tests/python/frontend/coreml/test_forward.py | 63 ++++--
 tests/python/frontend/keras/test_forward.py | 5 +-
 tests/python/frontend/mxnet/test_forward.py | 208 +++++++++++++-----
 tests/python/frontend/onnx/test_forward.py | 195 ++++++++++------
 tests/python/frontend/pytorch/test_forward.py | 130 ++++++++++-
 .../frontend/tensorflow/test_forward.py | 18 +-
 tests/python/integration/test_ewise.py | 13 +-
 tests/python/integration/test_gemm.py | 4 +-
 tests/python/integration/test_reduce.py | 15 +-
 tests/python/integration/test_scan.py | 4 +-
 tests/python/integration/test_tuning.py | 5 +-
 .../integration/test_winograd_nnpack.py | 5 +-
 .../test_quantization_accuracy.py | 2 +
 tests/python/pytest.ini | 22 ++
 .../relay/dyn/test_dynamic_op_level10.py | 9 +-
 .../relay/dyn/test_dynamic_op_level2.py | 4 +-
 .../relay/dyn/test_dynamic_op_level3.py | 11 +-
 .../relay/dyn/test_dynamic_op_level5.py | 6 +-
 .../relay/dyn/test_dynamic_op_level6.py | 5 +-
 .../relay/test_backend_compile_engine.py | 4 +-
 .../relay/test_backend_graph_runtime.py | 5 +-
 tests/python/relay/test_cpp_build_module.py | 9 +-
 tests/python/relay/test_op_grad_level1.py | 9 +-
 tests/python/relay/test_op_grad_level2.py | 15 +-
 tests/python/relay/test_op_grad_level3.py | 6 +-
 tests/python/relay/test_op_level1.py | 27 ++-
 tests/python/relay/test_op_level10.py | 42 ++--
 tests/python/relay/test_op_level2.py | 91 +++++---
 tests/python/relay/test_op_level3.py | 60 +++--
 tests/python/relay/test_op_level4.py | 30 ++-
 tests/python/relay/test_op_level5.py | 54 +++--
 tests/python/relay/test_op_level6.py | 8 +-
 .../python/relay/test_pass_alter_op_layout.py | 6 +-
 tests/python/relay/test_pass_annotation.py | 37 ++--
 .../relay/test_pass_dynamic_to_static.py | 21 +-
 tests/python/relay/test_pass_fuse_ops.py | 4 +-
 tests/python/relay/test_pass_manager.py | 14 +-
 tests/python/relay/test_vm.py | 20 +-
 tests/python/topi/python/common.py | 15 +-
 tests/python/topi/python/test_fifo_buffer.py | 6 +-
 .../topi/python/test_topi_batch_matmul.py |
8 +- .../python/topi/python/test_topi_broadcast.py | 12 +- tests/python/topi/python/test_topi_clip.py | 4 +- tests/python/topi/python/test_topi_conv1d.py | 6 +- .../python/test_topi_conv1d_transpose_ncw.py | 8 +- .../topi/python/test_topi_conv2d_NCHWc.py | 2 +- .../topi/python/test_topi_conv2d_hwcn.py | 4 +- .../topi/python/test_topi_conv2d_int8.py | 13 +- .../topi/python/test_topi_conv2d_nchw.py | 7 +- .../topi/python/test_topi_conv2d_nhwc.py | 5 +- .../test_topi_conv2d_nhwc_tensorcore.py | 5 +- .../python/test_topi_conv2d_nhwc_winograd.py | 15 +- .../python/test_topi_conv2d_transpose_nchw.py | 8 +- .../topi/python/test_topi_conv2d_winograd.py | 4 +- .../topi/python/test_topi_conv3d_ncdhw.py | 9 +- .../topi/python/test_topi_conv3d_ndhwc.py | 9 +- .../test_topi_conv3d_ndhwc_tensorcore.py | 9 +- .../test_topi_conv3d_transpose_ncdhw.py | 9 +- .../topi/python/test_topi_conv3d_winograd.py | 6 +- .../topi/python/test_topi_correlation.py | 10 +- .../python/test_topi_deformable_conv2d.py | 5 +- tests/python/topi/python/test_topi_dense.py | 14 +- .../topi/python/test_topi_dense_tensorcore.py | 12 +- .../topi/python/test_topi_depth_to_space.py | 4 +- .../topi/python/test_topi_depthwise_conv2d.py | 14 +- .../test_topi_depthwise_conv2d_back_input.py | 4 +- .../test_topi_depthwise_conv2d_back_weight.py | 4 +- .../topi/python/test_topi_group_conv2d.py | 10 +- .../test_topi_group_conv2d_NCHWc_int8.py | 2 +- tests/python/topi/python/test_topi_image.py | 12 +- tests/python/topi/python/test_topi_lrn.py | 4 +- tests/python/topi/python/test_topi_math.py | 11 +- tests/python/topi/python/test_topi_pooling.py | 39 ++-- tests/python/topi/python/test_topi_reduce.py | 4 +- tests/python/topi/python/test_topi_relu.py | 10 +- tests/python/topi/python/test_topi_reorg.py | 4 +- tests/python/topi/python/test_topi_softmax.py | 14 +- tests/python/topi/python/test_topi_sort.py | 9 +- .../topi/python/test_topi_space_to_depth.py | 4 +- tests/python/topi/python/test_topi_sparse.py | 24 +- tests/python/topi/python/test_topi_tensor.py | 9 +- .../python/topi/python/test_topi_transform.py | 153 +++++-------- .../topi/python/test_topi_upsampling.py | 6 +- tests/python/topi/python/test_topi_util.py | 2 +- tests/python/topi/python/test_topi_vision.py | 32 +-- .../unittest/test_auto_scheduler_measure.py | 13 +- .../unittest/test_autotvm_index_tuner.py | 2 +- .../unittest/test_hybrid_error_report.py | 2 +- .../test_runtime_module_based_interface.py | 32 +-- .../unittest/test_runtime_module_export.py | 11 +- .../unittest/test_runtime_module_load.py | 12 +- tests/python/unittest/test_runtime_ndarray.py | 20 +- .../unittest/test_target_codegen_blob.py | 9 +- .../unittest/test_target_codegen_bool.py | 8 +- .../unittest/test_target_codegen_cuda.py | 150 ++++++------- .../unittest/test_target_codegen_device.py | 13 +- .../unittest/test_target_codegen_extern.py | 8 +- .../unittest/test_target_codegen_opencl.py | 19 +- .../python/unittest/test_te_hybrid_script.py | 49 ++--- ...hedule_postproc_rewrite_for_tensor_core.py | 18 +- .../unittest/test_te_schedule_tensor_core.py | 18 +- .../unittest/test_te_tensor_overload.py | 13 +- .../test_tir_analysis_verify_gpu_code.py | 22 +- .../test_tir_analysis_verify_memory.py | 43 ++-- tests/python/unittest/test_tir_ir_builder.py | 6 +- .../test_tir_transform_lower_warp_memory.py | 21 +- .../test_tir_transform_thread_sync.py | 2 + tests/scripts/setup-pytest-env.sh | 15 ++ tests/scripts/task_python_integration.sh | 2 +- tests/scripts/task_python_topi.sh | 2 +- tutorials/frontend/deploy_ssd_gluoncv.py | 4 +- 
 123 files changed, 1527 insertions(+), 1008 deletions(-)
 rename python/tvm/relay/testing/config.py => tests/python/conftest.py (62%)
 delete mode 100644 tests/python/frontend/coreml/model_zoo/__init__.py
 create mode 100644 tests/python/pytest.ini

diff --git a/Jenkinsfile b/Jenkinsfile
index 7df0d3fc31af8..02f2e93a1843d 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -202,8 +202,8 @@ stage('Unit Test') {
       unpack_lib('gpu', tvm_multilib)
       timeout(time: max_time, unit: 'MINUTES') {
         sh "${docker_run} ${ci_gpu} ./tests/scripts/task_sphinx_precheck.sh"
-        sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_unittest.sh"
-        sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_integration.sh"
+        sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_unittest.sh gpu"
+        sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_integration.sh gpu"
       }
     }
   }
@@ -214,8 +214,8 @@ stage('Unit Test') {
       init_git()
      unpack_lib('i386', tvm_multilib)
      timeout(time: max_time, unit: 'MINUTES') {
-        sh "${docker_run} ${ci_i386} ./tests/scripts/task_python_unittest.sh"
-        sh "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration.sh"
+        sh "${docker_run} ${ci_i386} ./tests/scripts/task_python_unittest.sh cpu"
+        sh "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration.sh cpu"
         sh "${docker_run} ${ci_i386} ./tests/scripts/task_python_vta_fsim.sh"
       }
     }
diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py
index dcc9528b1ad7e..115b81a6586e0 100644
--- a/python/tvm/_ffi/runtime_ctypes.py
+++ b/python/tvm/_ffi/runtime_ctypes.py
@@ -18,6 +18,7 @@
 # pylint: disable=invalid-name
 import ctypes
 import json
+import os
 import numpy as np
 from .base import _LIB, check_call
@@ -182,6 +183,7 @@ class TVMContext(ctypes.Structure):
         'hexagon': 14,
         'webgpu': 15,
     }
+
     def __init__(self, device_type, device_id):
         super(TVMContext, self).__init__()
         self.device_type = device_type
@@ -197,6 +199,15 @@ def _GetDeviceAttr(self, device_type, device_id, attr_id):
     @property
     def exist(self):
         """Whether this device exist."""
+        # Limit device visibility to the devices named in TVM_TEST_DEVICES,
+        # if it is set. Entries look like "cuda" or "llvm -device=arm_cpu",
+        # so map each target name to its device type code before comparing.
+        allowed_devices = os.environ.get("TVM_TEST_DEVICES")
+        if allowed_devices is not None:
+            allowed_types = {TVMContext.STR2MASK.get(dev.split()[0], -1)
+                             for dev in allowed_devices.split(";") if dev}
+            if self.device_type not in allowed_types:
+                return False
         return self._GetDeviceAttr(
             self.device_type, self.device_id, 0) != 0
diff --git a/python/tvm/relay/testing/__init__.py b/python/tvm/relay/testing/__init__.py
index 904e4d7baf28e..4a9c1cc5e3f46 100644
--- a/python/tvm/relay/testing/__init__.py
+++ b/python/tvm/relay/testing/__init__.py
@@ -25,6 +25,7 @@
 import tvm.relay as relay
 import tvm.relay.op as op
 from tvm.relay import Prelude
+from tvm.testing import enabled_devices

 from . import mlp
 from . import resnet
@@ -41,7 +42,6 @@
 from . import temp_op_attr
 from . import synthetic

-from .config import ctx_list
 from .init import create_workload
 from .nat import add_nat_definitions, count, make_nat_value, make_nat_expr
 from .py_converter import to_python, run_as_python
@@ -125,7 +125,7 @@ def check_grad(func,
     if test_inputs is None:
         test_inputs = inputs

-    for target, ctx in ctx_list():
+    for target, ctx in enabled_devices():
         intrp = relay.create_executor(ctx=ctx, target=target)

         # Get analytic gradients.
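The pattern applied throughout the test conversions below is summarized by
this sketch. It is illustrative only, not part of the diff: the test body is
a made-up example that assumes the `tvm.testing` helpers introduced in
python/tvm/testing.py just below.

    import numpy as np
    import tvm
    import tvm.testing
    from tvm import te

    @tvm.testing.gpu  # may use the GPU, but also runs on CPU-only nodes
    def test_exp():
        n = 128
        A = te.placeholder((n,), name="A")
        B = te.compute((n,), lambda i: te.exp(A[i]), name="B")

        def check_device(target):
            # Skip targets the user has not enabled via TVM_TEST_DEVICES.
            if not tvm.testing.device_enabled(target):
                print("skip because %s is not enabled..." % target)
                return
            ctx = tvm.context(target, 0)
            s = te.create_schedule(B.op)
            if target != "llvm":
                # GPU-style targets need the loop bound to GPU threads.
                bx, tx = s[B].split(B.op.axis[0], factor=64)
                s[B].bind(bx, te.thread_axis("blockIdx.x"))
                s[B].bind(tx, te.thread_axis("threadIdx.x"))
            f = tvm.build(s, [A, B], target)
            a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
            b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
            f(a, b)
            tvm.testing.assert_allclose(b.asnumpy(), np.exp(a.asnumpy()),
                                        rtol=1e-5)

        for target in ["llvm", "cuda"]:
            check_device(target)

Relay-based tests use the same idea by looping over `enabled_devices()`, as
in the `check_grad` hunk above.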
diff --git a/python/tvm/testing.py b/python/tvm/testing.py
index 7483a9fb4cf84..a5a6a45d73b88 100644
--- a/python/tvm/testing.py
+++ b/python/tvm/testing.py
@@ -18,12 +18,15 @@
 # pylint: disable=invalid-name,unnecessary-comprehension
 """ TVM testing utilities """
 import logging
+import os
+import pytest
 import numpy as np
 import tvm
 import tvm.arith
 import tvm.tir
 import tvm.te
 import tvm._ffi
+from tvm.contrib import nvcc


 def assert_allclose(actual, desired, rtol=1e-7, atol=1e-7):
@@ -285,4 +288,105 @@ def _check_forward(constraints1, constraints2, varmap, backvarmap):
         constraints_trans.dst_to_src,
         constraints_trans.src_to_dst)
+
+
+def gpu(f):
+    """Mark to differentiate tests that use the GPU in some capacity. These
+    tests will be run on CPU-only nodes and on nodes with GPUs.
+
+    To mark a test that must have a GPU present to run, use `@requires_gpu`.
+    """
+    return pytest.mark.gpu(f)
+
+
+def requires_gpu(f):
+    """Mark a test as requiring a GPU to run. Tests with this mark will not be
+    run unless a GPU is present.
+    """
+    return pytest.mark.skipif(not tvm.gpu().exist, reason="No GPU present")(gpu(f))
+
+
+def requires_cuda(f):
+    """Mark a test as requiring the CUDA runtime. Because the CUDA runtime
+    needs a GPU, this also applies `@requires_gpu`.
+    """
+    return pytest.mark.cuda(
+        pytest.mark.skipif(
+            not tvm.runtime.enabled("cuda"), reason="CUDA support not enabled"
+        )(requires_gpu(f))
+    )
+
+
+def requires_opencl(f):
+    """Mark a test as requiring the OpenCL runtime. This does not imply that a
+    GPU is present; for that, also use `@requires_gpu`.
+    """
+    return pytest.mark.opencl(
+        pytest.mark.skipif(
+            not tvm.runtime.enabled("opencl"), reason="OpenCL support not enabled"
+        )(f)
+    )
+
+
+def requires_tensorcore(f):
+    """Mark a test as requiring a tensor core to run. Tests with this mark
+    will not be run unless a tensor core is present.
+    """
+    return pytest.mark.tensorcore(
+        pytest.mark.skipif(
+            not tvm.gpu().exist or not nvcc.have_tensorcore(tvm.gpu(0).compute_version),
+            reason="No tensor core present",
+        )(f)
+    )
+
+
+def _get_backends():
+    backend_str = os.environ.get("TVM_TEST_DEVICES", "")
+    if len(backend_str) == 0:
+        backend_str = DEFAULT_TEST_DEVICES
+    backends = {
+        dev
+        for dev in backend_str.split(";")
+        if len(dev) > 0 and tvm.context(dev, 0).exist and tvm.runtime.enabled(dev)
+    }
+    if len(backends) == 0:
+        logging.warning(
+            "None of the following backends are supported by this build of TVM: %s. "
+            "Try setting TVM_TEST_DEVICES to a supported backend. Defaulting to llvm.",
+            backend_str
+        )
+        return {"llvm"}
+    return backends
+
+
+DEFAULT_TEST_DEVICES = (
+    "llvm;cuda;opencl;metal;rocm;vulkan;nvptx;"
+    "llvm -device=arm_cpu;opencl -device=mali,aocl_sw_emu"
+)
+TEST_DEVICES = _get_backends()
+
+
+def device_enabled(device):
+    """Check if a device should be used when testing. This lets the user
+    control which devices tests run against. Use it inside a test to guard
+    running on a device that is an optional part of the test.
+    """
+    return device in TEST_DEVICES
+
+
+def enabled_devices():
+    """Get all enabled devices with associated contexts. Here, enabled means
+    that TVM was built with support for the device and the device name appears
+    in the TVM_TEST_DEVICES environment variable.
+
+    If TVM_TEST_DEVICES is not set, it defaults to the variable
+    DEFAULT_TEST_DEVICES in this module.
+ + Returns + ------- + targets: list + A list of pairs of all enabled devices and the associated context + """ + return [(dev, tvm.context(dev, 0)) for dev in TEST_DEVICES] + + tvm._ffi._init_api("testing", __name__) diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py index f803647d91a19..36b92aa48b776 100644 --- a/tests/lint/check_file_type.py +++ b/tests/lint/check_file_type.py @@ -122,6 +122,7 @@ "docs/_static/css/tvm_theme.css", "docs/_static/img/tvm-logo-small.png", "docs/_static/img/tvm-logo-square.png", + "tests/python/pytest.ini", } diff --git a/python/tvm/relay/testing/config.py b/tests/python/conftest.py similarity index 62% rename from python/tvm/relay/testing/config.py rename to tests/python/conftest.py index 93a08db32d2ce..cce722fbefa4a 100644 --- a/python/tvm/relay/testing/config.py +++ b/tests/python/conftest.py @@ -14,18 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""Configuration about tests""" -from __future__ import absolute_import as _abs +import tvm.testing -import os -import tvm - - -def ctx_list(): - """Get context list for testcases""" - device_list = os.environ.get("RELAY_TEST_TARGETS", "") - device_list = (device_list.split(",") if device_list - else ["llvm", "cuda"]) - device_list = set(device_list) - res = [(device, tvm.context(device, 0)) for device in device_list] - return [x for x in res if x[1].exist] +def pytest_configure(config): + print("Enabled backends:", "; ".join(map(lambda x: x[0], tvm.testing.enabled_devices()))) diff --git a/tests/python/contrib/test_cblas.py b/tests/python/contrib/test_cblas.py index e1c1c71255365..445643e100822 100644 --- a/tests/python/contrib/test_cblas.py +++ b/tests/python/contrib/test_cblas.py @@ -22,6 +22,7 @@ from tvm.contrib import cblas from tvm.contrib import mkl from tvm.contrib import mkldnn +from tvm.testing import device_enabled def verify_matmul_add(m, l, n, lib, transa=False, transb=False, dtype="float32"): bias = te.var('bias', dtype=dtype) @@ -41,7 +42,7 @@ def get_numpy(a, b, bb, transa, transb): return np.dot(a, b) + bb def verify(target="llvm"): - if not tvm.runtime.enabled(target): + if not device_enabled(target): print("skip because %s is not enabled..." % target) return if not tvm.get_global_func(lib.__name__ + ".matmul", True): @@ -107,7 +108,7 @@ def get_numpy(a, b, bb, transa, transb): return np.dot(a, b) + bb def verify(target="llvm"): - if not tvm.runtime.enabled(target): + if not device_enabled(target): print("skip because %s is not enabled..." % target) return if not tvm.get_global_func("tvm.contrib.mkl.matmul_u8s8s32", True): @@ -153,7 +154,7 @@ def get_numpy(a, b, transa, transb): return tvm.topi.testing.batch_matmul(a, b) def verify(target="llvm"): - if not tvm.runtime.enabled(target): + if not device_enabled(target): print("skip because %s is not enabled..." 
% target) return if not tvm.get_global_func(lib.__name__ + ".matmul", True): diff --git a/tests/python/contrib/test_cublas.py b/tests/python/contrib/test_cublas.py index 517e6e1240303..44fa48ea66e01 100644 --- a/tests/python/contrib/test_cublas.py +++ b/tests/python/contrib/test_cublas.py @@ -19,6 +19,7 @@ import numpy as np from tvm.contrib import cublas from tvm.contrib import cublaslt +from tvm.testing import requires_cuda def verify_matmul_add(in_dtype, out_dtype, rtol=1e-5): n = 1024 @@ -30,9 +31,6 @@ def verify_matmul_add(in_dtype, out_dtype, rtol=1e-5): s = te.create_schedule(C.op) def verify(target="cuda"): - if not tvm.runtime.enabled(target): - print("skip because %s is not enabled..." % target) - return if not tvm.get_global_func("tvm.contrib.cublas.matmul", True): print("skip because extern function is not available") return @@ -64,9 +62,6 @@ def verify_matmul_add_igemm(in_dtype, out_dtype, rtol=1e-5): s = te.create_schedule(C.op) def verify(target="cuda"): - if not tvm.runtime.enabled(target): - print("skip because %s is not enabled..." % target) - return if not tvm.get_global_func("tvm.contrib.cublaslt.matmul", True): print("skip because extern function is not available") return @@ -115,9 +110,6 @@ def verify_batch_matmul(in_dtype, out_dtype, rtol=1e-5): s = te.create_schedule(C.op) def verify(target="cuda"): - if not tvm.runtime.enabled(target): - print("skip because %s is not enabled..." % target) - return if not tvm.get_global_func("tvm.contrib.cublas.matmul", True): print("skip because extern function is not available") return @@ -132,15 +124,18 @@ def verify(target="cuda"): b.asnumpy().astype(C.dtype)).astype(C.dtype), rtol=rtol) verify() +@requires_cuda def test_matmul_add(): verify_matmul_add('float', 'float', rtol=1e-3) verify_matmul_add('float16', 'float') verify_matmul_add('float16', 'float16', rtol=1e-2) verify_matmul_add('int8', 'int32') +@requires_cuda def test_matmul_add_igemm(): verify_matmul_add_igemm('int8', 'int32') +@requires_cuda def test_batch_matmul(): verify_batch_matmul('float', 'float') verify_batch_matmul('float16', 'float') diff --git a/tests/python/contrib/test_cudnn.py b/tests/python/contrib/test_cudnn.py index 61822c849a7eb..e0dd0efdf187f 100644 --- a/tests/python/contrib/test_cudnn.py +++ b/tests/python/contrib/test_cudnn.py @@ -20,6 +20,7 @@ from tvm.contrib.nvcc import have_fp16 import numpy as np import tvm.topi.testing +from tvm.testing import requires_gpu def verify_conv2d(data_dtype, conv_dtype, tensor_format=0, groups=1): in_channel = 4 @@ -36,9 +37,6 @@ def verify_conv2d(data_dtype, conv_dtype, tensor_format=0, groups=1): height = 32 width = 32 - if not tvm.runtime.enabled("cuda"): - print("skip because cuda is not enabled...") - return if not tvm.get_global_func("tvm.contrib.cudnn.conv.output_shape", True): print("skip because cudnn is not enabled...") return @@ -87,6 +85,7 @@ def verify_conv2d(data_dtype, conv_dtype, tensor_format=0, groups=1): f(x, w, y) tvm.testing.assert_allclose(y.asnumpy(), c_np, atol=1e-2, rtol=1e-2) +@requires_gpu def test_conv2d(): verify_conv2d("float32", "float32", tensor_format=0) verify_conv2d("float16", "float32", tensor_format=1) @@ -118,9 +117,6 @@ def verify_conv3d(data_dtype, conv_dtype, tensor_format=0, groups=1): height = 32 width = 32 - if not tvm.runtime.enabled("cuda"): - print("skip because cuda is not enabled...") - return if not tvm.get_global_func("tvm.contrib.cudnn.conv.output_shape", True): print("skip because cudnn is not enabled...") return @@ -161,6 +157,7 @@ def verify_conv3d(data_dtype, 
conv_dtype, tensor_format=0, groups=1): f(x, w, y) tvm.testing.assert_allclose(y.asnumpy(), c_np, atol=3e-5, rtol=1e-4) +@requires_gpu def test_conv3d(): verify_conv3d("float32", "float32", tensor_format=0) verify_conv3d("float32", "float32", tensor_format=0, groups=2) @@ -195,10 +192,8 @@ def verify_softmax_4d(shape, dtype="float32"): f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3) +@requires_gpu def test_softmax(): - if not tvm.runtime.enabled("cuda"): - print("skip because cuda is not enabled...") - return if not tvm.get_global_func("tvm.contrib.cudnn.conv.output_shape", True): print("skip because cudnn is not enabled...") return diff --git a/tests/python/contrib/test_random.py b/tests/python/contrib/test_random.py index e61030bd98355..ea59b06f68b52 100644 --- a/tests/python/contrib/test_random.py +++ b/tests/python/contrib/test_random.py @@ -19,21 +19,7 @@ import numpy as np from tvm.contrib import random from tvm import rpc - -def enabled_ctx_list(): - ctx_list = [('cpu', tvm.cpu(0)), - ('gpu', tvm.gpu(0)), - ('cl', tvm.opencl(0)), - ('metal', tvm.metal(0)), - ('rocm', tvm.rocm(0)), - ('vulkan', tvm.vulkan(0)), - ('vpi', tvm.vpi(0))] - for k, v in ctx_list: - assert tvm.context(k, 0) == v - ctx_list = [x[1] for x in ctx_list if x[1].exist] - return ctx_list - -ENABLED_CTX_LIST = enabled_ctx_list() +from tvm.testing import device_enabled, enabled_devices, gpu def test_randint(): m = 10240 @@ -42,7 +28,7 @@ def test_randint(): s = te.create_schedule(A.op) def verify(target="llvm"): - if not tvm.runtime.enabled(target): + if not device_enabled(target): print("skip because %s is not enabled..." % target) return if not tvm.get_global_func("tvm.contrib.random.randint", True): @@ -66,7 +52,7 @@ def test_uniform(): s = te.create_schedule(A.op) def verify(target="llvm"): - if not tvm.runtime.enabled(target): + if not device_enabled(target): print("skip because %s is not enabled..." % target) return if not tvm.get_global_func("tvm.contrib.random.uniform", True): @@ -90,7 +76,7 @@ def test_normal(): s = te.create_schedule(A.op) def verify(target="llvm"): - if not tvm.runtime.enabled(target): + if not device_enabled(target): print("skip because %s is not enabled..." 
% target) return if not tvm.get_global_func("tvm.contrib.random.normal", True): @@ -105,6 +91,7 @@ def verify(target="llvm"): assert abs(np.std(na) - 4) < 1e-2 verify() +@gpu def test_random_fill(): def test_local(ctx, dtype): if not tvm.get_global_func("tvm.contrib.random.random_fill", True): @@ -125,7 +112,7 @@ def test_rpc(dtype): if not tvm.get_global_func("tvm.contrib.random.random_fill", True): print("skip because extern function is not available") return - if not tvm.runtime.enabled("rpc") or not tvm.runtime.enabled("llvm"): + if not device_enabled("rpc") or not tvm.runtime.enabled("llvm"): return np_ones = np.ones((512, 512), dtype=dtype) server = rpc.Server("localhost") @@ -142,7 +129,7 @@ def test_rpc(dtype): for dtype in ["bool", "int8", "uint8", "int16", "uint16", "int32", "int32", "int64", "uint64", "float16", "float32", "float64"]: - for ctx in ENABLED_CTX_LIST: + for _, ctx in enabled_devices(): test_local(ctx, dtype) test_rpc(dtype) @@ -151,3 +138,5 @@ def test_rpc(dtype): test_uniform() test_normal() test_random_fill() + + diff --git a/tests/python/frontend/caffe2/test_forward.py b/tests/python/frontend/caffe2/test_forward.py index 50a878180ac9c..7e13604cb801c 100644 --- a/tests/python/frontend/caffe2/test_forward.py +++ b/tests/python/frontend/caffe2/test_forward.py @@ -18,12 +18,12 @@ import tvm from tvm import te from tvm.contrib import graph_runtime -from tvm.relay.testing.config import ctx_list from tvm import relay from model_zoo import c2_squeezenet, c2_resnet50, c2_vgg19 from caffe2.python import workspace, core from caffe2.proto import caffe2_pb2 from collections import namedtuple +from tvm.testing import enabled_devices, gpu def get_tvm_output(model, @@ -84,19 +84,22 @@ def verify_caffe2_forward_impl(model, data_shape, out_shape): dtype = 'float32' data = np.random.uniform(size=data_shape).astype(dtype) c2_out = get_caffe2_output(model, data, dtype) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output(model, data, target, ctx, out_shape, dtype) tvm.testing.assert_allclose(c2_out, tvm_out, rtol=1e-5, atol=1e-5) +@gpu def test_forward_squeezenet1_1(): verify_caffe2_forward_impl(c2_squeezenet, (1, 3, 224, 224), (1, 1000, 1, 1)) +@gpu def test_forward_resnet50(): verify_caffe2_forward_impl(c2_resnet50, (1, 3, 224, 224), (1, 1000)) +@gpu def test_forward_vgg19(): verify_caffe2_forward_impl(c2_vgg19, (1, 3, 224, 224), (1, 1000)) @@ -104,6 +107,7 @@ def test_forward_vgg19(): Model = namedtuple('Model', ['init_net', 'predict_net']) +@gpu def test_elementwise_add(): data_shape = (1, 16, 9, 9) init_net = caffe2_pb2.NetDef() @@ -142,6 +146,7 @@ def test_elementwise_add(): verify_caffe2_forward_impl(model, data_shape, data_shape) +@gpu def test_elementwise_add_with_broadcast(): data_shape = (1, 16, 9, 9) init_net = caffe2_pb2.NetDef() @@ -181,6 +186,7 @@ def test_elementwise_add_with_broadcast(): verify_caffe2_forward_impl(model, data_shape, data_shape) +@gpu def test_normalize_yuv(): data_shape = (1, 3, 96, 96) init_net = caffe2_pb2.NetDef() diff --git a/tests/python/frontend/coreml/model_zoo/__init__.py b/tests/python/frontend/coreml/model_zoo/__init__.py deleted file mode 100644 index b8cee305b15ac..0000000000000 --- a/tests/python/frontend/coreml/model_zoo/__init__.py +++ /dev/null @@ -1,43 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import os -from PIL import Image -import numpy as np -from tvm.contrib.download import download_testdata - -def get_mobilenet(): - url = 'https://docs-assets.developer.apple.com/coreml/models/MobileNet.mlmodel' - dst = 'mobilenet.mlmodel' - real_dst = download_testdata(url, dst, module='coreml') - return os.path.abspath(real_dst) - -def get_resnet50(): - url = 'https://docs-assets.developer.apple.com/coreml/models/Resnet50.mlmodel' - dst = 'resnet50.mlmodel' - real_dst = download_testdata(url, dst, module='coreml') - return os.path.abspath(real_dst) - -def get_cat_image(): - url = 'https://gist.githubusercontent.com/zhreshold/bcda4716699ac97ea44f791c24310193/raw/fa7ef0e9c9a5daea686d6473a62aacd1a5885849/cat.png' - dst = 'cat.png' - real_dst = download_testdata(url, dst, module='data') - img = Image.open(real_dst).resize((224, 224)) - # CoreML's standard model image format is BGR - img_bgr = np.array(img)[:, :, ::-1] - img = np.transpose(img_bgr, (2, 0, 1))[np.newaxis, :] - return np.asarray(img) \ No newline at end of file diff --git a/tests/python/frontend/coreml/test_forward.py b/tests/python/frontend/coreml/test_forward.py index 5ae7a6cc875f7..8d41ed0efa465 100644 --- a/tests/python/frontend/coreml/test_forward.py +++ b/tests/python/frontend/coreml/test_forward.py @@ -25,11 +25,11 @@ from tvm import topi import tvm.topi.testing from tvm import relay -from tvm.relay.testing.config import ctx_list from tvm.topi.testing import conv2d_nchw_python import coremltools as cm import model_zoo +from tvm.testing import gpu, enabled_devices def get_tvm_output(func, x, params, target, ctx, out_shape=(1, 1000), input_name='image', dtype='float32'): @@ -50,15 +50,17 @@ def run_model_checkonly(model_file, model_name='', input_name='image'): shape_dict = {input_name : x.shape} # Some Relay passes change operators on the fly. Ensuring that we generate # new graph for each target. 
- for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): mod, params = relay.frontend.from_coreml(model, shape_dict) tvm_output = get_tvm_output(mod["main"], x, params, target, ctx) print(target, ctx, model_name, 'prediction id: ', np.argmax(tvm_output.flat)) +@gpu def test_mobilenet_checkonly(): model_file = model_zoo.get_mobilenet() run_model_checkonly(model_file, 'mobilenet') +@gpu def test_resnet50_checkonly(): model_file = model_zoo.get_resnet50() run_model_checkonly(model_file, 'resnet50') @@ -122,10 +124,11 @@ def verify_AddLayerParams(input_dim, alpha=2): output_name='output', mode='ADD') model = cm.models.MLModel(builder.spec) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): out = run_tvm_graph(model, target, ctx, [a_np1, a_np2], ['input1', 'input2'], b_np.shape, dtype) tvm.testing.assert_allclose(out, b_np, rtol=1e-5) +@gpu def test_forward_AddLayerParams(): verify_AddLayerParams((1, 2, 2), 0) verify_AddLayerParams((1, 2, 2), 1) @@ -148,10 +151,11 @@ def verify_MultiplyLayerParams(input_dim, alpha): output_name='output', mode='MULTIPLY') model = cm.models.MLModel(builder.spec) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): out = run_tvm_graph(model, target, ctx, [a_np1, a_np2], ['input1', 'input2'], b_np.shape, dtype) tvm.testing.assert_allclose(out, b_np, rtol=1e-5) +@gpu def test_forward_MultiplyLayerParams(): verify_MultiplyLayerParams((1, 2, 2), 0) verify_MultiplyLayerParams((1, 2, 2), 1) @@ -173,10 +177,11 @@ def verify_ConcatLayerParams(input1_dim, input2_dim): output_name='output', mode='CONCAT') model = cm.models.MLModel(builder.spec) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): out = run_tvm_graph(model, target, ctx, [a_np1, a_np2], ['input1', 'input2'], b_np.shape, dtype) tvm.testing.assert_allclose(out, b_np, rtol=1e-5) +@gpu def test_forward_ConcatLayerParams(): verify_ConcatLayerParams((1, 1, 2, 2), (1, 2, 2, 2)) verify_ConcatLayerParams((1, 2, 4, 4), (1, 3, 4, 4)) @@ -203,10 +208,11 @@ def verify_UpsampleLayerParams(input_dim, scale, mode): output_name='output') model = cm.models.MLModel(builder.spec) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): out = run_tvm_graph(model, target, ctx, a_np, 'input', b_np.shape, dtype) tvm.testing.assert_allclose(out, b_np, rtol=1e-5) +@gpu def test_forward_UpsampleLayerParams(): verify_UpsampleLayerParams((1, 16, 32, 32), 2, 'NN') verify_UpsampleLayerParams((1, 4, 6, 6), 3, 'BILINEAR') @@ -223,10 +229,11 @@ def verify_l2_normalize(input_dim, eps): builder.add_l2_normalize(name='L2', epsilon=eps, input_name='input', output_name='output') model = cm.models.MLModel(builder.spec) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): out = run_tvm_graph(model, target, ctx, a_np, 'input', b_np.shape, dtype) tvm.testing.assert_allclose(out, b_np, rtol=1e-5) +@gpu def test_forward_l2_normalize(): verify_l2_normalize((1, 3, 20, 20), 0.001) @@ -248,10 +255,11 @@ def verify_lrn(input_dim, size, bias, alpha, beta): local_size=size) model = cm.models.MLModel(builder.spec) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): out = run_tvm_graph(model, target, ctx, a_np, 'input', b_np.shape, dtype) tvm.testing.assert_allclose(out, b_np, rtol=1e-5) +@gpu def test_forward_lrn(): verify_lrn((1, 3, 10, 20), 3, 1.0, 1.0, 0.5) @@ -272,10 +280,11 @@ def verify_average(input_dim1, input_dim2, axis=0): output_name='output', mode='AVE') model = cm.models.MLModel(builder.spec) - for target, ctx in 
ctx_list(): + for target, ctx in enabled_devices(): out = run_tvm_graph(model, target, ctx, [a_np1, a_np2], ['input1', 'input2'], b_np.shape, dtype) tvm.testing.assert_allclose(out, b_np, rtol=1e-5) +@gpu def test_forward_average(): verify_average((1, 3, 20, 20), (1, 3, 20, 20)) verify_average((3, 20, 20), (1, 3, 20, 20)) @@ -300,11 +309,12 @@ def verify_max(input_dim): output_name='output', mode='MAX') model = cm.models.MLModel(builder.spec) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): out = run_tvm_graph(model, target, ctx, [a_np1, a_np2, a_np3], ['input1', 'input2', 'input3'], b_np.shape, dtype) tvm.testing.assert_allclose(out, b_np, rtol=1e-5) +@gpu def test_forward_max(): verify_max((1, 3, 20, 20)) verify_max((20, 20)) @@ -328,11 +338,12 @@ def verify_min(input_dim): output_name='output', mode='MIN') model = cm.models.MLModel(builder.spec) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): out = run_tvm_graph(model, target, ctx, [a_np1, a_np2, a_np3], ['input1', 'input2', 'input3'], b_np.shape, dtype) tvm.testing.assert_allclose(out, b_np, rtol=1e-5) +@gpu def test_forward_min(): verify_min((1, 3, 20, 20)) verify_min((20, 20)) @@ -353,7 +364,7 @@ def verify_unary_sqrt(input_dim): mode='sqrt') model = cm.models.MLModel(builder.spec) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): out = run_tvm_graph(model, target, ctx, [a_np], ['input'], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) @@ -375,7 +386,7 @@ def verify_unary_rsqrt(input_dim, epsilon=0): epsilon=epsilon) model = cm.models.MLModel(builder.spec) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): out = run_tvm_graph(model, target, ctx, [a_np], ['input'], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) @@ -397,7 +408,7 @@ def verify_unary_inverse(input_dim, epsilon=0): epsilon=epsilon) model = cm.models.MLModel(builder.spec) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): out = run_tvm_graph(model, target, ctx, [a_np], ['input'], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) @@ -419,7 +430,7 @@ def verify_unary_power(input_dim, alpha): alpha=alpha) model = cm.models.MLModel(builder.spec) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): out = run_tvm_graph(model, target, ctx, [a_np], ['input'], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) @@ -440,7 +451,7 @@ def verify_unary_exp(input_dim): mode='exp') model = cm.models.MLModel(builder.spec) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): out = run_tvm_graph(model, target, ctx, [a_np], ['input'], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) @@ -461,7 +472,7 @@ def verify_unary_log(input_dim): mode='log') model = cm.models.MLModel(builder.spec) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): out = run_tvm_graph(model, target, ctx, [a_np], ['input'], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) @@ -482,7 +493,7 @@ def verify_unary_abs(input_dim): mode='abs') model = cm.models.MLModel(builder.spec) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): out = run_tvm_graph(model, target, ctx, [a_np], ['input'], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) @@ -504,12 +515,13 @@ def verify_unary_threshold(input_dim, alpha): alpha=alpha) model = cm.models.MLModel(builder.spec) 
- for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): out = run_tvm_graph(model, target, ctx, [a_np], ['input'], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) +@gpu def test_forward_unary(): verify_unary_sqrt((1, 3, 20, 20)) verify_unary_rsqrt((1, 3, 20, 20)) @@ -525,6 +537,7 @@ def test_forward_unary(): verify_unary_threshold((1, 3, 20, 20), alpha=5.0) +@gpu def test_forward_reduce(): from enum import Enum class ReduceAxis(Enum): @@ -565,7 +578,7 @@ def _verify_reduce(input_dim, mode, axis, ref_func, dtype='float32'): mode=mode) model = cm.models.MLModel(builder.spec) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): out = run_tvm_graph(model, target, ctx, [a_np], ['input'], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5, atol=1e-5) @@ -602,7 +615,7 @@ def verify_reshape(input_dim, target_shape, mode): mode=mode) model = cm.models.MLModel(builder.spec) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): out = run_tvm_graph(model, target, ctx, [a_np], ['input'], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) @@ -637,7 +650,7 @@ def verify_split(input_dim, nOutputs): output_names=output_names) model = cm.models.MLModel(builder.spec) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): out = run_tvm_graph(model, target, ctx, [a_np], ['input'], output_shapes, [dtype] * len(output_shapes)) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) @@ -673,11 +686,12 @@ def verify_image_scaler(input_dim, blue_bias=0.0, green_bias=0.0, red_bias=0.0, builder.add_elementwise(name='add', input_names=['input1', 'input2'], output_name='output', alpha=0, mode='ADD') model = cm.models.MLModel(builder.spec) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): out = run_tvm_graph(model, target, ctx, [a_np, a_np], ['input1', 'input2'], b_np.shape, dtype) tvm.testing.assert_allclose(out, b_np, rtol=1e-5) +@gpu def test_forward_image_scaler(): verify_image_scaler((3, 224, 224), image_scale=0.17) verify_image_scaler((3, 224, 224), @@ -705,11 +719,12 @@ def verify_convolution(input_dim, filter, padding): input_name='input1', output_name='output') model = cm.models.MLModel(builder.spec) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): out = run_tvm_graph(model, target, ctx, [a_np], ['input1'], output_shape=None) tvm.testing.assert_allclose(out, b_np, rtol=1e-5) +@gpu def test_forward_convolution(): verify_convolution((1, 3, 224, 224), filter=(32, 3, 3, 3), padding='VALID') verify_convolution((1, 3, 224, 224), filter=(32, 3, 3, 3), padding='SAME') diff --git a/tests/python/frontend/keras/test_forward.py b/tests/python/frontend/keras/test_forward.py index 8ddae9655d478..2726a63d2b201 100644 --- a/tests/python/frontend/keras/test_forward.py +++ b/tests/python/frontend/keras/test_forward.py @@ -19,8 +19,8 @@ from tvm import te from tvm import relay from tvm.contrib import graph_runtime -from tvm.relay.testing.config import ctx_list import keras +from tvm.testing import enabled_devices, gpu try: import tensorflow.compat.v1 as tf @@ -104,7 +104,7 @@ def to_channels_last(arr): xs = [np.random.uniform(size=shape, low=-1.0, high=1.0) for shape in in_shapes] keras_out = get_keras_output(xs) keras_out = keras_out if isinstance(keras_out, list) else [keras_out] - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): inputs = [to_channels_first(x) for x in xs] if need_transpose else xs tvm_out = 
get_tvm_output(inputs, target, ctx) for kout, tout in zip(keras_out, tvm_out): @@ -113,6 +113,7 @@ def to_channels_last(arr): tvm.testing.assert_allclose(kout, tout, rtol=1e-5, atol=1e-5) +@gpu class TestKeras: scenarios = [using_classic_keras, using_tensorflow_keras] diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py index 594ffe72faf0b..7de901fc75a81 100644 --- a/tests/python/frontend/mxnet/test_forward.py +++ b/tests/python/frontend/mxnet/test_forward.py @@ -20,15 +20,16 @@ import tvm from tvm import te from tvm.contrib import graph_runtime -from tvm.relay.testing.config import ctx_list from tvm import relay import mxnet as mx from mxnet import gluon from mxnet.gluon.model_zoo import vision -import model_zoo import random import pytest +import model_zoo + +from tvm.testing import gpu, enabled_devices def verify_mxnet_frontend_impl(mx_symbol, data_shape=(1, 3, 224, 224), @@ -82,32 +83,36 @@ def get_tvm_output(symbol, x, args, auxs, target, ctx, dtype='float32'): x = np.random.uniform(size=data_shape) if gluon_impl: gluon_out, gluon_sym = get_gluon_output(name, x) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output(gluon_sym, x, None, None, target, ctx, dtype) tvm.testing.assert_allclose(gluon_out, tvm_out, rtol=1e-5, atol=1e-5) else: mx_out, args, auxs = get_mxnet_output(mx_symbol, x, dtype) assert "data" not in args - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output(mx_symbol, x, args, auxs, target, ctx, dtype) tvm.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5) +@gpu def test_forward_mlp(): mlp = model_zoo.mx_mlp() verify_mxnet_frontend_impl(mlp, data_shape=(1, 1, 28, 28), out_shape=(1, 10)) +@gpu def test_forward_vgg(): for n in [11]: mx_sym = model_zoo.mx_vgg(n) verify_mxnet_frontend_impl(mx_sym) +@gpu def test_forward_resnet(): for n in [18]: mx_sym = model_zoo.mx_resnet(18) verify_mxnet_frontend_impl(mx_sym) +@gpu def test_forward_leaky_relu(): data = mx.sym.var('data') data = mx.sym.concat(data, -data, dim=1) # negative part explicitly @@ -116,36 +121,42 @@ def test_forward_leaky_relu(): mx_sym = mx.sym.LeakyReLU(data, act_type='leaky') verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100)) +@gpu def test_forward_elu(): data = mx.sym.var('data') data = mx.sym.concat(data, -data, dim=1) # negative part explicitly mx_sym = mx.sym.LeakyReLU(data, act_type='elu') verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100)) +@gpu def test_forward_rrelu(): data = mx.sym.var('data') data = mx.sym.concat(data, -data, dim=1) # negative part explicitly mx_sym = mx.sym.LeakyReLU(data, act_type='rrelu', lower_bound=0.3, upper_bound=0.7) verify_mxnet_frontend_impl(mx_sym[0], (1, 3, 100, 100), (1, 6, 100, 100)) +@gpu def test_forward_prelu(): data = mx.sym.var('data') data = mx.sym.concat(data, -data, dim=1) # negative part explicitly mx_sym = mx.sym.LeakyReLU(data, act_type='prelu') verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100)) +@gpu def test_forward_gelu(): data = mx.sym.var('data') data = mx.sym.concat(data, -data, dim=1) # negative part explicitly mx_sym = mx.sym.LeakyReLU(data, act_type='gelu') verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100)) +@gpu def test_forward_softrelu(): data = mx.sym.var('data') data = mx.sym.concat(data, -data, dim=1) # negative part explicitly mx_sym = mx.sym.Activation(data, act_type='softrelu') verify_mxnet_frontend_impl(mx_sym, (1, 
3, 100, 100), (1, 6, 100, 100)) +@gpu def test_forward_fc_flatten(): # test flatten=True option in mxnet 0.11.1 data = mx.sym.var('data') @@ -157,27 +168,32 @@ def test_forward_fc_flatten(): except: pass +@gpu def test_forward_clip(): data = mx.sym.var('data') data = mx.sym.concat(data, -data, dim=1) # negative part explicitly mx_sym = mx.sym.clip(data, a_min=0, a_max=1) verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100)) +@gpu def test_forward_split(): data = mx.sym.var('data') mx_sym = mx.sym.split(data, axis=1, num_outputs=4, squeeze_axis=False) verify_mxnet_frontend_impl(mx_sym, (1, 4, 2, 1), (1, 1, 2, 1)) +@gpu def test_forward_split_squeeze(): data = mx.sym.var('data') mx_sym = mx.sym.split(data, axis=1, num_outputs=4, squeeze_axis=True) verify_mxnet_frontend_impl(mx_sym, (1, 4, 2, 1), (1, 2, 1)) +@gpu def test_forward_expand_dims(): data = mx.sym.var('data') mx_sym = mx.sym.expand_dims(data, axis=1) verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 1, 3, 4)) +@gpu def test_forward_pooling(): data = mx.sym.var('data') mx_sym = mx.sym.Pooling(data, kernel=(3, 3), pad=(1, 1), pool_type='avg') @@ -186,6 +202,7 @@ def test_forward_pooling(): mx_sym = mx.sym.Pooling(data, kernel=(3, 3), pad=(1, 1), pool_type='max') verify_mxnet_frontend_impl(mx_sym, (1, 20, 8, 8), (1, 20, 8, 8)) +@gpu def test_forward_pooling3d(): data = mx.sym.var('data') mx_sym = mx.sym.Pooling(data, kernel=(3, 3, 3), pad=(1, 1, 1), pool_type='avg') @@ -194,6 +211,7 @@ def test_forward_pooling3d(): mx_sym = mx.sym.Pooling(data, kernel=(3, 3, 3), pad=(1, 1, 1), pool_type='max') verify_mxnet_frontend_impl(mx_sym, (1, 20, 8, 8, 8), (1, 20, 8, 8, 8)) +@gpu def test_forward_adaptive_pooling(): data = mx.sym.var('data') mx_sym = mx.sym.contrib.AdaptiveAvgPooling2D(data, output_size=(1,)) @@ -202,49 +220,58 @@ def test_forward_adaptive_pooling(): mx_sym = mx.sym.contrib.AdaptiveAvgPooling2D(data, output_size=(3, 3)) verify_mxnet_frontend_impl(mx_sym, (1, 20, 8, 8), (1, 20, 3, 3)) +@gpu def test_forward_lrn(): data = mx.sym.var('data') mx_sym = mx.sym.LRN(data, alpha=2, beta=2, knorm=1, nsize=5) verify_mxnet_frontend_impl(mx_sym, (1, 10, 24, 24), (1, 10, 24, 24)) +@gpu def test_forward_ones(): data = mx.sym.var('data') ones = mx.sym.ones(shape=(2, 3, 4), dtype='float32') mx_sym = mx.sym.elemwise_add(data, ones) verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4)) +@gpu def test_forward_zeros(): data = mx.sym.var('data') zeros = mx.sym.zeros(shape=(2, 3, 4), dtype='float32') mx_sym = mx.sym.elemwise_add(data, zeros) verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4)) +@gpu def test_forward_ones_like(): data = mx.sym.var('data') mx_sym = mx.sym.ones_like(data, dtype='float32') verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4)) +@gpu def test_forward_make_loss(): data = mx.sym.var('data') ones = mx.sym.ones(shape=(2, 3, 4), dtype='float32') mx_sym = mx.sym.make_loss((data-ones)**2/2, dtype='float32') verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4)) +@gpu def test_forward_zeros_like(): data = mx.sym.var('data') mx_sym = mx.sym.zeros_like(data, dtype='float32') verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4)) +@gpu def test_forward_argmax(): data = mx.sym.var('data') mx_sym = mx.sym.argmax(data, axis=1) verify_mxnet_frontend_impl(mx_sym, (5, 3), (5,)) +@gpu def test_forward_argmin(): data = mx.sym.var('data') mx_sym = mx.sym.argmin(data, axis=0) verify_mxnet_frontend_impl(mx_sym, (5, 4), (4,)) +@gpu def test_forward_slice(): data = mx.sym.var('data') mx_sym = mx.sym.slice(data, 
begin=(0, 1), end=(2, 4)) @@ -252,6 +279,7 @@ def test_forward_slice(): mx_sym = mx.sym.slice(data, begin=(-1, 1), end=(-3, 4), step=(-1, 2)) verify_mxnet_frontend_impl(mx_sym, (3, 4), (2, 2)) +@gpu def test_forward_where(): cond = mx.sym.var('cond') x = mx.sym.var('x') @@ -273,13 +301,14 @@ def test_forward_where(): mx_out = mx.nd.where(mx_cond, mx_x, mx_y).asnumpy() mod, _ = relay.frontend.from_mxnet(mx_sym, shapes, args, auxs) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(np_cond, np_x, np_y) tvm.testing.assert_allclose(op_res.asnumpy(), mx_out) +@gpu def test_forward_arange(): def _mx_symbol(F, start, stop, step): if start is None and step is None: @@ -296,7 +325,7 @@ def verify(start, stop, step): ref_res = _mx_symbol(mx.nd, start, stop, step).asnumpy() mx_sym = _mx_symbol(mx.sym, start, stop, step) mod, _ = relay.frontend.from_mxnet(mx_sym, {}) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()() @@ -315,6 +344,7 @@ def _mx_symbol(F, op_name, inputs): op = getattr(F, op_name) return op(*inputs) +@gpu def test_forward_broadcast_ops(): for op in ["broadcast_add", "broadcast_plus", @@ -349,12 +379,13 @@ def test_forward_broadcast_ops(): ref_res = _mx_symbol(mx.nd, op, [mx.nd.array(a_np), mx.nd.array(b_np)]) shapes = {'a': a_shape, 'b': b_shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(a_np, b_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) +@gpu def test_forward_elemwise_ops(): for op in ["elemwise_add", "elemwise_sub", "elemwise_mul", "elemwise_div", "maximum", "minimum", @@ -372,13 +403,14 @@ def test_forward_elemwise_ops(): ref_res = op(mx.nd.array(a_np), mx.nd.array(b_np)) shapes = {'a': shape, 'b': shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(a_np, b_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) +@gpu def test_forward_softmin(): data = mx.sym.var('data') mx_sym = mx.sym.softmin(data) @@ -388,6 +420,7 @@ def test_forward_softmin(): verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 3, 100, 100)) +@gpu def test_forward_unary_ops(): for op in ["abs", "sqrt", "ceil", "floor", "round", "reciprocal", "trunc", "softsign", "hard_sigmoid", @@ -402,13 +435,14 @@ def test_forward_unary_ops(): ref_res = _mx_symbol(mx.nd, op, [mx.nd.array(a_np)]) shapes = {'a': shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(a_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5, atol=1e-5) +@gpu def test_forward_scalar_ops(): for op in [operator.add, operator.sub, operator.mul, operator.truediv, operator.pow, operator.lt, operator.le, operator.eq, @@ -421,7 +455,7 @@ def test_forward_scalar_ops(): 
ref_res = op(mx.nd.array(a_np), b_scalar) shapes = {'a': a_shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(a_np) @@ -435,19 +469,20 @@ def test_forward_scalar_ops(): ref_res = _mx_symbol(mx.nd, op, [mx.nd.array(a_np), b_scalar]) shapes = {'a': a_shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(a_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) +@gpu def test_forward_slice_axis(): def verify(shape, axis, begin, end): data_np = np.random.uniform(size=shape).astype("float32") ref_res = mx.nd.slice_axis(mx.nd.array(data_np), axis, begin, end) mx_sym = mx.sym.slice_axis(mx.sym.var("data"), axis, begin, end) mod, _ = relay.frontend.from_mxnet(mx_sym, {"data": shape}) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(data_np) @@ -458,6 +493,7 @@ def verify(shape, axis, begin, end): verify((3, 4), 1, -3, -1) verify((3, 4), -1, -3, -1) +@gpu def test_forward_slice_like(): def verify(x_shape, y_shape, axes): x_np = np.random.uniform(size=x_shape).astype("float32") @@ -469,7 +505,7 @@ def verify(x_shape, y_shape, axes): ref_res = mx.nd.slice_like(mx.nd.array(x_np), mx.nd.array(y_np), axes=axes) mx_sym = mx.sym.slice_like(mx.sym.var("x"), mx.sym.var("y"), axes=axes) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": x_shape, "y": y_shape}) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(x_np, y_np) @@ -479,6 +515,7 @@ def verify(x_shape, y_shape, axes): verify((3, 4), (2, 3), (0)) verify((3, 4), (2, 3), (-1)) +@gpu def test_forward_sequence_reverse(): def verify(shape, seq_lengths, use_seq_lengths, seq_axis): data_np = np.random.uniform(size=shape).astype("float32") @@ -500,7 +537,7 @@ def verify(shape, seq_lengths, use_seq_lengths, seq_axis): mx_sym = mx.sym.SequenceReverse(*mx_sym_args) mod, _ = relay.frontend.from_mxnet(mx_sym, *from_mxnet_args) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(*in_data) @@ -512,18 +549,20 @@ def verify(shape, seq_lengths, use_seq_lengths, seq_axis): # MXNet accepts axis value as 0 only # verify((3, 4, 5, 6), None, False, 2) +@gpu def test_forward_l2_normalize(): data = mx.sym.var('data') mx_sym = mx.sym.L2Normalization(data, mode="channel") verify_mxnet_frontend_impl(mx_sym, (2, 3, 4, 5), (2, 3, 4, 5)) +@gpu def test_forward_shape_array(): def verify(shape): x_np = np.random.uniform(size=shape).astype("float32") ref_res = mx.nd.shape_array(mx.nd.array(x_np)) mx_sym = mx.sym.shape_array(mx.sym.var("x")) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": shape}) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(x_np) @@ -532,6 +571,7 @@ def 
verify(shape): verify((3, 4, 5)) verify((3, 4, 5, 6)) +@gpu def test_forward_squeeze(): def verify(shape, axis): x_np = np.random.uniform(size=shape).astype("float32") @@ -542,7 +582,7 @@ def verify(shape, axis): ref_res = mx.nd.squeeze(mx.nd.array(x_np), axis=axis) mx_sym = mx.sym.squeeze(mx.sym.var("x"), axis=axis) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": shape}) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(x_np) @@ -552,6 +592,7 @@ def verify(shape, axis): verify((1, 3, 1), 2) verify((1, 3, 1), (0, 2)) +@gpu def test_forward_broadcast_axis(): def verify(shape, axis, size): x_np = np.random.uniform(size=shape).astype("float32") @@ -560,7 +601,7 @@ def verify(shape, axis, size): mx_sym = _mx_symbol(mx.sym, op, [mx.sym.var('x'),axis,size]) ref_res = _mx_symbol(mx.nd, op, [mx.nd.array(x_np),axis,size]) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": shape}) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(x_np) @@ -570,13 +611,14 @@ def verify(shape, axis, size): verify((1, 2, 1), (0, 2), (2, 3)) +@gpu def test_forward_broadcast_to(): def verify(input_shape, shape): x_np = np.random.uniform(size=input_shape).astype("float32") ref_res = mx.nd.broadcast_to(mx.nd.array(x_np), shape=shape) mx_sym = mx.sym.broadcast_to(mx.sym.var("x"), shape=shape) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": input_shape}) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(x_np) @@ -586,6 +628,7 @@ def verify(input_shape, shape): verify((4, 1, 32, 32), (4, 8, 32, 32)) +@gpu def test_forward_logical_not(): a_shape = (3, 4, 5) dtype = 'float32' @@ -594,20 +637,21 @@ def test_forward_logical_not(): ref_res = mx.nd.logical_not(mx.nd.array(a_np)) shapes = {'a': a_shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(a_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) +@gpu def test_forward_full(): def verify(val, shape, dtype): ctx = mx.cpu() ref_res = mx.nd.full(shape, val, dtype=dtype) mx_sym = mx.sym.full(shape, val, dtype=dtype) mod, _ = relay.frontend.from_mxnet(mx_sym, {}) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): # Skip testing graph runtime because this op will be optimized out # by constant folding. 
for kind in ["debug"]: @@ -618,6 +662,7 @@ def verify(val, shape, dtype): verify(2, (3, 4), "int32") verify(3.5, (1, 3, 4), "float32") +@gpu def test_forward_embedding(): def verify(data_shape, weight_shape): in_dim, out_dim = weight_shape @@ -629,7 +674,7 @@ def verify(data_shape, weight_shape): input_dim=in_dim, output_dim=out_dim) mod, _ = relay.frontend.from_mxnet( mx_sym, {"x": data_shape, "w": weight_shape}) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(x=x_np, w=w_np) @@ -637,6 +682,7 @@ def verify(data_shape, weight_shape): verify((2, 2), (4, 5)) verify((2, 3, 4), (4, 5)) +@gpu def test_forward_smooth_l1(): data = mx.sym.var('data') mx_sym = mx.sym.smooth_l1(data) @@ -644,6 +690,7 @@ def test_forward_smooth_l1(): mx_sym = mx.sym.smooth_l1(data, scalar=1.0) verify_mxnet_frontend_impl(mx_sym, (3, 4), (3, 4)) +@gpu def test_forward_take(): def verify(shape, indices_src, axis, mode="clip"): x_np = np.random.uniform(size=shape).astype("float32") @@ -651,7 +698,7 @@ def verify(shape, indices_src, axis, mode="clip"): ref_res = mx.nd.take(mx.nd.array(x_np), mx.nd.array(indices_np), axis, mode) mx_sym = mx.sym.take(mx.sym.var("x"), mx.sym.var("y"), axis, mode) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": shape, "y": indices_np.shape}) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(x_np, indices_np) @@ -664,13 +711,14 @@ def verify(shape, indices_src, axis, mode="clip"): verify((3,4), [-1, 5], 1) verify((3,4), [-1, 5], 1, mode="wrap") +@gpu def test_forward_gather_nd(): def verify(xshape, yshape, y_data, error=False): x_data = np.random.uniform(size=xshape).astype("float32") ref_res = mx.nd.gather_nd(mx.nd.array(x_data), mx.nd.array(y_data)) mx_sym = mx.sym.gather_nd(mx.sym.var("x_data"), mx.sym.var("y_data")) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x_data": xshape, "y_data": yshape}, {"x_data": "float32", "y_data": "int32"}) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(x_data, y_data) @@ -682,12 +730,14 @@ def verify(xshape, yshape, y_data, error=False): verify((3, 2), (2, 2, 3), [[[0, 1, 2], [2, 0, 1]], [[0, 0, 0], [1, 1, 1]]]) verify((1, 4), (1, 1), [[0]]) +@gpu def test_forward_bilinear_resize(): # add tests including scale_height and scale_width when mxnet is updated to version 1.5 data = mx.sym.var('data') mx_sym = mx.sym.contrib.BilinearResize2D(data, height=5, width=10) verify_mxnet_frontend_impl(mx_sym, (1, 2, 3, 4), (1, 2, 5, 10)) +@gpu def test_forward_grid_generator(): def verify(shape, transform_type, target_shape): x = np.random.uniform(size=shape).astype("float32") @@ -695,7 +745,7 @@ def verify(shape, transform_type, target_shape): mx_sym = mx.sym.GridGenerator(mx.sym.var("x"), transform_type, target_shape) shape_dict = {"x": x.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor( kind, mod=mod, ctx=ctx, target=target) @@ -706,6 +756,7 @@ def verify(shape, transform_type, target_shape): verify((4, 2, 16, 16), 'warp', None) verify((1, 2, 16, 16), 'warp', None) +@gpu def 
test_forward_bilinear_sampler(): def verify(data_shape, grid_shape): data = np.random.uniform(size=data_shape).astype("float32") @@ -714,7 +765,7 @@ def verify(data_shape, grid_shape): mx_sym = mx.sym.BilinearSampler(mx.sym.var("data"), mx.sym.var("grid")) shape_dict = {"data": data.shape, "grid": grid.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor( kind, mod=mod, ctx=ctx, target=target) @@ -724,6 +775,7 @@ def verify(data_shape, grid_shape): verify((4, 4, 16, 32), (4, 2, 8, 8)) verify((4, 4, 16, 32), (4, 2, 32, 32)) +@gpu def test_forward_rnn_layer(): def verify(mode, seq_len, input_size, hidden_size, num_layers, batch=1, init_states=True, bidirectional=False): @@ -768,7 +820,7 @@ def verify(mode, seq_len, input_size, hidden_size, num_layers, mod, params = relay.frontend.from_mxnet( mx_sym, shape=shape_dict, arg_params=mx_params) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): # only test graph runtime because debug runtime is too slow for kind in ["graph"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) @@ -792,6 +844,7 @@ def verify(mode, seq_len, input_size, hidden_size, num_layers, # verify(mode, 10, 64, 64, 3, init_states=False) # verify(mode, 10, 64, 64, 3, batch=2, bidirectional=True, init_states=False) +@gpu def test_forward_Crop(): def verify(xshape, yshape, offset=None): x_data = np.random.uniform(size=xshape).astype("float32") @@ -803,7 +856,7 @@ def verify(xshape, yshape, offset=None): mx_sym = mx.sym.Crop(mx.sym.var("x"), mx.sym.var("y"), offset=offset) ref_res = mx.nd.Crop(mx.nd.array(x_data), mx.nd.array(y_data), offset=offset) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": xshape, "y": yshape}) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) if offset is None or offset == (0, 0): @@ -817,13 +870,14 @@ def verify(xshape, yshape, offset=None): verify((5, 32, 40, 40), (5, 32, 25, 25)) verify((5, 32, 40, 40), (5, 32, 25, 25), (5, 5)) +@gpu def test_forward_argsort(): def verify(shape, axis, is_ascend, dtype="float32"): x_np = np.random.uniform(size=shape).astype("float32") ref_res = mx.nd.argsort(mx.nd.array(x_np), axis=axis, is_ascend=is_ascend, dtype=dtype) mx_sym = mx.sym.argsort(mx.sym.var("x"), axis=axis, is_ascend=is_ascend, dtype=dtype) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": shape}) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(x_np) @@ -832,6 +886,7 @@ def verify(shape, axis, is_ascend, dtype="float32"): verify((1, 4, 6), axis=1, is_ascend=True) verify((3, 5, 6), axis=-3, is_ascend=False, dtype="int32") +@gpu def test_forward_topk(): def verify(shape, k, axis, ret_type, is_ascend=False, dtype="float32"): x_np = np.random.uniform(size=shape).astype("float32") @@ -840,7 +895,7 @@ def verify(shape, k, axis, ret_type, is_ascend=False, dtype="float32"): mx_sym = mx.sym.topk(mx.sym.var("x"), k=k, axis=axis, ret_typ=ret_type, is_ascend=is_ascend, dtype=dtype) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": shape}) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = 
intrp.evaluate()(x_np) @@ -856,6 +911,7 @@ def verify(shape, k, axis, ret_type, is_ascend=False, dtype="float32"): verify((3, 5, 6), k=2, axis=1, ret_type="value", is_ascend=True) verify((3, 5, 6), k=0, axis=2, ret_type="both", dtype="int32") +@gpu def test_forward_sequence_mask(): def verify(shape, use_sequence_length, value, axis, dtype, itype): data_np = np.random.uniform(size=shape).astype(dtype) @@ -885,7 +941,7 @@ def verify(shape, use_sequence_length, value, axis, dtype, itype): value=value, axis=axis) mod, _ = relay.frontend.from_mxnet(mx_sym, {"data": shape}, dtype={"data": dtype}) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ['graph', 'debug']: if use_sequence_length is False and kind == 'graph': # Disable the test for 'graph' when it's identity. @@ -901,13 +957,14 @@ def verify(shape, use_sequence_length, value, axis, dtype, itype): verify((5, 4, 3), False, 1.0, 1, 'float64', 'float64') verify((5, 4, 3, 2), True, 1.0, 0, 'float32', 'float32') +@gpu def test_forward_contrib_div_sqrt_dim(): def verify(shape): x_np = np.random.uniform(size=shape).astype("float32") ref_res = mx.nd.contrib.div_sqrt_dim(mx.nd.array(x_np)) mx_sym = mx.sym.contrib.div_sqrt_dim(mx.sym.var("x")) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": shape}) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(x_np) @@ -915,6 +972,7 @@ def verify(shape): verify((3, 4)) verify((3, 4, 5)) +@gpu def test_forward_batch_norm(): def verify(shape, axis=1, fix_gamma=False): x = np.random.uniform(size=shape).astype("float32") @@ -934,7 +992,7 @@ def verify(shape, axis=1, fix_gamma=False): "mean": moving_mean.shape, "var": moving_var.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) #print(mod) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(x, gamma, beta, moving_mean, moving_var) @@ -945,6 +1003,7 @@ def verify(shape, axis=1, fix_gamma=False): verify((2, 3, 4, 5), fix_gamma=True) +@gpu def test_forward_instance_norm(): def verify(shape, axis=1, epsilon=1e-5): x = np.random.uniform(size=shape).astype("float32") @@ -954,7 +1013,7 @@ def verify(shape, axis=1, epsilon=1e-5): mx_sym = mx.sym.InstanceNorm(mx.sym.var("x"), mx.sym.var("gamma"), mx.sym.var("beta"), epsilon) shape_dict = {"x": x.shape, "gamma": gamma.shape, "beta": beta.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(x, gamma, beta) @@ -965,6 +1024,7 @@ def verify(shape, axis=1, epsilon=1e-5): verify((8, 7, 6, 5, 4)) +@gpu def test_forward_layer_norm(): def verify(shape, axis=-1): x = np.random.uniform(size=shape).astype("float32") @@ -976,7 +1036,7 @@ def verify(shape, axis=-1): mx.sym.var("beta"), axis=axis) shape_dict = {"x": x.shape, "gamma": gamma.shape, "beta": beta.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(x, gamma, beta) @@ -985,6 +1045,7 @@ def verify(shape, axis=-1): verify((2, 5), 
axis=0) verify((2, 5, 6)) +@gpu def test_forward_one_hot(): def verify(indices_shape, depth, on_value, off_value, dtype): x = np.random.randint(0, 5, size=indices_shape) @@ -992,7 +1053,7 @@ def verify(indices_shape, depth, on_value, off_value, dtype): mx_sym = mx.sym.one_hot(mx.sym.var("x"), depth, on_value, off_value, dtype) shape_dict = {"x": x.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(x.astype("float32")) @@ -1004,6 +1065,7 @@ def verify(indices_shape, depth, on_value, off_value, dtype): verify((3, 2, 4, 5), 6, 1, 0, "int32") verify((3, 2, 4, 5), 6, 1.0, 0.0, "float32") +@gpu def test_forward_pad(): def verify(data_shape, out_shape, mode, pad_width, constant_value=0.0): data = mx.sym.var('data') @@ -1028,6 +1090,7 @@ def verify(data_shape, out_shape, mode, pad_width, constant_value=0.0): pad_width=(0,0,0,0,1,2,3,4,5,6)) +@gpu def test_forward_slice(): def verify(data_shape, out_shape, begin, end): data = mx.sym.var('data') @@ -1038,6 +1101,7 @@ def verify(data_shape, out_shape, begin, end): verify(data_shape=(1,1,10), out_shape=(1,1,8), begin=(None, None, 2), end=(None, None, None)) +@gpu def test_forward_convolution(): def verify(data_shape, kernel_size, stride, pad, num_filter, is_depthwise=False): if is_depthwise: @@ -1057,7 +1121,7 @@ def verify(data_shape, kernel_size, stride, pad, num_filter, is_depthwise=False) pad=pad, num_filter=num_filter, num_group=groups) shape_dict = {"x": x.shape, "weight": weight.shape, "bias": bias.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(x, weight, bias) @@ -1078,6 +1142,7 @@ def verify(data_shape, kernel_size, stride, pad, num_filter, is_depthwise=False) verify(data_shape=(1, 8, 16, 16, 16), kernel_size=(3, 3, 3), stride=(2, 2, 2), pad=(1, 1, 1), num_filter=2) verify(data_shape=(20, 8, 16, 16, 16), kernel_size=(3, 3, 3), stride=(1, 1, 1), pad=(1, 1, 1), num_filter=2) +@gpu def test_forward_deconvolution(): def verify(data_shape, kernel_size, stride, pad, num_filter): weight_shape=(data_shape[1], num_filter) + kernel_size @@ -1092,7 +1157,7 @@ def verify(data_shape, kernel_size, stride, pad, num_filter): pad=pad, num_filter=num_filter, no_bias=False) shape_dict = {"x": x.shape, "weight": weight.shape, "bias": bias.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(x, weight, bias) @@ -1107,6 +1172,7 @@ def verify(data_shape, kernel_size, stride, pad, num_filter): verify(data_shape=(1, 8, 32, 32), kernel_size=(3, 3), stride=(1, 1), pad=(1, 1), num_filter=2) verify(data_shape=(20, 8, 32, 32), kernel_size=(3, 3), stride=(1, 1), pad=(1, 1), num_filter=2) +@gpu def test_forward_cond(): def verify(a_np, b_np): a_nd, b_nd = mx.nd.array(a_np), mx.nd.array(b_np) @@ -1123,7 +1189,7 @@ def verify(a_np, b_np): shape_dict = {"a": a_np.shape, "b": b_np.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["debug", "vm"]: intrp = 
relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(a_np, b_np) @@ -1132,6 +1198,7 @@ def verify(a_np, b_np): verify(np.asarray([1.0], 'float32'), np.asarray([2.0],'float32')) verify(np.asarray([4.0], 'float32'), np.asarray([3.0],'float32')) +@gpu def test_forward_amp_cast(): def verify(from_dtype, to_dtype): from_np = np.random.uniform(size=(1,3,18)).astype(from_dtype) @@ -1140,7 +1207,7 @@ def verify(from_dtype, to_dtype): shape_dict = {'x': (1,3,18)} dtype_dict = {'x': from_dtype} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict, dtype_dict) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "vm", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(from_np) @@ -1150,6 +1217,7 @@ def verify(from_dtype, to_dtype): verify('float32', 'float16') verify('float16', 'float32') +@gpu def test_forward_amp_multicast(): def verify(dtypes, cast_narrow, expected_dtype): x_nps = [np.random.uniform(size=(1,3,18)).astype(dtype) for dtype in dtypes] @@ -1162,7 +1230,7 @@ def verify(dtypes, cast_narrow, expected_dtype): shape_dict[str(i)] = (1,3,18) dtype_dict[str(i)] = dtype mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict, dtype_dict) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "vm", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(*x_nps) @@ -1178,6 +1246,7 @@ def verify(dtypes, cast_narrow, expected_dtype): verify(['float16', 'float16'], True, 'float16') +@gpu def test_forward_unravel_index(): def verify(x, shape, dtype): a_np = np.array(x).astype(dtype) @@ -1186,7 +1255,7 @@ def verify(x, shape, dtype): shapes = {'a': a_np.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "vm", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(a_np) @@ -1204,6 +1273,7 @@ def verify(x, shape, dtype): # verify([0, 1, 2, 5], [2, 2], dtype) +@gpu def test_forward_swap_axis(): def _verify_swap_axis(in_shape, out_shape, dim1, dim2): data = mx.sym.var('data') @@ -1216,6 +1286,7 @@ def _verify_swap_axis(in_shape, out_shape, dim1, dim2): # _verify_swap_axis((4, 5), (5, 4), 0, 0) +@gpu def test_forward_depth_to_space(): def verify(shape, blocksize=2): x = np.random.uniform(size=shape).astype("float32") @@ -1223,7 +1294,7 @@ def verify(shape, blocksize=2): mx_sym = mx.sym.depth_to_space(mx.sym.var("x"), blocksize) shape_dict = {"x": x.shape, } mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(x) @@ -1232,6 +1303,7 @@ def verify(shape, blocksize=2): verify((1, 18, 3, 3), 3) +@gpu def test_forward_space_to_depth(): def verify(shape, blocksize=2): x = np.random.uniform(size=shape).astype("float32") @@ -1239,7 +1311,7 @@ def verify(shape, blocksize=2): mx_sym = mx.sym.space_to_depth(mx.sym.var("x"), blocksize) shape_dict = {"x": x.shape, } mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(x) @@ -1248,6 +1320,7 @@ def 
verify(shape, blocksize=2): verify((1, 1, 9, 9), 3) +@gpu def test_forward_correlation(): def verify(data_shape, kernel_size, max_displacement, stride1, stride2, pad_size, is_multiply): @@ -1263,7 +1336,7 @@ def verify(data_shape, kernel_size, max_displacement, stride1, stride2, pad_size is_multiply=is_multiply) shape_dict = {"data1": data1.shape, "data2": data2.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(data1, data2) @@ -1280,6 +1353,7 @@ def verify(data_shape, kernel_size, max_displacement, stride1, stride2, pad_size verify((5, 1, 11, 11), kernel_size = 5, max_displacement = 1, stride1 = 1, stride2 = 1, pad_size = 2, is_multiply = False) +@gpu def test_forward_arange_like(): def verify(data_shape, start=None, step=None, axis=None): attrs = {} @@ -1295,7 +1369,7 @@ def verify(data_shape, start=None, step=None, axis=None): mx_sym = mx.sym.contrib.arange_like(data, **attrs) mod, _ = relay.frontend.from_mxnet(mx_sym, {"data": data_shape}) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()() @@ -1307,6 +1381,7 @@ def verify(data_shape, start=None, step=None, axis=None): verify(data_shape=(3, 4, 5), start=2., step=3., axis=1) +@gpu def test_forward_interleaved_matmul_selfatt_qk(): def verify(batch, seq_length, num_heads, head_dim): data_shape = (seq_length, batch, num_heads * head_dim * 3) @@ -1317,7 +1392,7 @@ def verify(batch, seq_length, num_heads, head_dim): mx_sym = mx.sym.contrib.interleaved_matmul_selfatt_qk(data, heads=num_heads) mod, _ = relay.frontend.from_mxnet(mx_sym, {"data": data_shape}) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(data_np) @@ -1327,6 +1402,7 @@ def verify(batch, seq_length, num_heads, head_dim): verify(3, 10, 6, 8) +@gpu def test_forward_interleaved_matmul_selfatt_valatt(): def verify(batch, seq_length, num_heads, head_dim): data_shape = (seq_length, batch, num_heads * head_dim * 3) @@ -1342,7 +1418,7 @@ def verify(batch, seq_length, num_heads, head_dim): data, weight, heads=num_heads) mod, _ = relay.frontend.from_mxnet( mx_sym, {"data": data_shape, "weight": weight_shape}) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(data=data_np, weight=weight_np) @@ -1352,6 +1428,7 @@ def verify(batch, seq_length, num_heads, head_dim): verify(3, 10, 6, 8) +@gpu def test_forward_box_decode(): def verify(data_shape, anchor_shape, stds=[1, 1, 1, 1], clip=-1, in_format="corner"): dtype = "float32" @@ -1361,7 +1438,7 @@ def verify(data_shape, anchor_shape, stds=[1, 1, 1, 1], clip=-1, in_format="corn mx_sym = mx.sym.contrib.box_decode(mx.sym.var("data"), mx.sym.var("anchors"), stds[0], stds[1], stds[2], stds[3], clip, in_format) shape_dict = {"data": data_shape, "anchors": anchor_shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(data, 
anchors) @@ -1374,6 +1451,7 @@ def verify(data_shape, anchor_shape, stds=[1, 1, 1, 1], clip=-1, in_format="corn verify((1, 10, 4), (1, 10, 4), in_format="center") +@gpu def test_forward_softmax(): def verify(data_shape, axis, use_length, length): dtype = "float32" @@ -1394,7 +1472,7 @@ def verify(data_shape, axis, use_length, length): shape_dict = {"data": data_shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) if use_length: @@ -1411,6 +1489,7 @@ def verify(data_shape, axis, use_length, length): verify((2, 3, 4), 2, True, np.array([[3, 4, 2], [1, 2, 1]]).astype('int32')) +@gpu @pytest.mark.skipif(not hasattr(mx.sym.np, 'pad'), reason="mx.sym.np.pad hasn't been publish yet") @pytest.mark.parametrize( "data_shape, pad_width", @@ -1419,7 +1498,7 @@ def verify(data_shape, axis, use_length, length): @pytest.mark.parametrize("mode", ["constant", "edge", "reflect"]) @pytest.mark.parametrize("dtype", ['float64', 'float32', 'int64', 'int32']) @pytest.mark.parametrize("constant_value", [0.0, 3.0]) -@pytest.mark.parametrize("target, ctx", ctx_list()) +@pytest.mark.parametrize("target, ctx", enabled_devices()) @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) def test_forward_npi_pad(data_shape, pad_width, mode, dtype, constant_value,target, ctx, kind): data_np = np.random.uniform(size=data_shape).astype(dtype) @@ -1435,12 +1514,13 @@ def test_forward_npi_pad(data_shape, pad_width, mode, dtype, constant_value,targ op_res = intrp.evaluate()(data_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) - + +@gpu @pytest.mark.skipif(not hasattr(mx.sym.np, 'pad'), reason="test'll abort with Mxnet 1.x, skip for now") @pytest.mark.parametrize("data_shape", [(2,2,2),(2,7,2)]) @pytest.mark.parametrize("dtype", ['float64', 'float32', 'int64', 'int32', 'bool']) @pytest.mark.parametrize("axes", [(1,0,2),None]) -@pytest.mark.parametrize("target, ctx", ctx_list()) +@pytest.mark.parametrize("target, ctx", enabled_devices()) @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) def test_forward_npi_transpose(data_shape, axes, dtype,target, ctx, kind): data_np = np.random.uniform(size=data_shape).astype(dtype) @@ -1453,12 +1533,13 @@ def test_forward_npi_transpose(data_shape, axes, dtype,target, ctx, kind): tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) +@gpu @pytest.mark.parametrize( "data_shape1, data_shape2, axis", [((2,2),(2,2),1),((2,4),(2,3),1),((1,3,2),(1,3,5),2),((1,3,3),(1,3,3),1),((1,3),(1,3),0)] ) @pytest.mark.parametrize("dtype", ['float64', 'float32', 'int64', 'int32']) -@pytest.mark.parametrize("target, ctx", ctx_list()) +@pytest.mark.parametrize("target, ctx", enabled_devices()) @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) def test_forward_npi_concatenate(data_shape1, data_shape2, axis, dtype,target, ctx, kind): data_np1 = np.random.uniform(size=data_shape1).astype(dtype) @@ -1473,9 +1554,10 @@ def test_forward_npi_concatenate(data_shape1, data_shape2, axis, dtype,target, c tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) +@gpu @pytest.mark.parametrize("data_shape", [(2,2,2),(2,7,2),(2,2,2,1,2,3,1),(1,8)]) @pytest.mark.parametrize("dtype", ['float64', 'float32', 'int64', 'int32', 'bool']) -@pytest.mark.parametrize("target, ctx", ctx_list()) +@pytest.mark.parametrize("target, ctx", enabled_devices()) 
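The numpy-operator tests push the same gating into pytest parametrization: enabled_devices() runs at collection time, so each enabled (target, ctx) pair becomes a separate test case that can be filtered and reported individually. A sketch of that shape, using a hypothetical relay.negative test rather than one of the mx.np tests above:

    import numpy as np
    import pytest
    import tvm
    from tvm import relay
    from tvm.testing import gpu, enabled_devices

    @gpu
    @pytest.mark.parametrize("data_shape", [(2, 2), (1, 8)])
    @pytest.mark.parametrize("dtype", ["float64", "float32"])
    @pytest.mark.parametrize("target, ctx", enabled_devices())
    @pytest.mark.parametrize("kind", ["graph", "vm", "debug"])
    def test_negative_sketch(data_shape, dtype, target, ctx, kind):
        data_np = np.random.uniform(size=data_shape).astype(dtype)
        ref_res = -data_np  # numpy reference
        data = relay.var("data", shape=data_shape, dtype=dtype)
        func = relay.Function([data], relay.negative(data))
        intrp = relay.create_executor(kind, ctx=ctx, target=target)
        op_res = intrp.evaluate(func)(data_np)
        tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)

One difference from the loop form is worth noting: the parametrize list is fixed when pytest collects the module, whereas a loop over enabled_devices() queries the environment each time the test body runs.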
@pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) def test_forward_np_copy(data_shape,dtype,target, ctx, kind): data_np = np.random.uniform(size=data_shape).astype(dtype) @@ -1488,8 +1570,9 @@ def test_forward_np_copy(data_shape,dtype,target, ctx, kind): tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) +@gpu @pytest.mark.parametrize("dtype", ['float64', 'float32', 'int64', 'int32', 'bool']) -@pytest.mark.parametrize("target, ctx", ctx_list()) +@pytest.mark.parametrize("target, ctx", enabled_devices()) @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) @pytest.mark.parametrize("data_shape,out_shape,reverse", [((2, 3, 8),(-2, -2, 2, -1),False), @@ -1508,9 +1591,10 @@ def test_forward_npx_reshape(data_shape,out_shape,dtype,target,reverse, ctx, kin tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) +@gpu @pytest.mark.parametrize("data_shape", [(2,2,2),(2,7,2),(2,2,2,1,2,3,1),(1,8),(2,2),(1,3)]) @pytest.mark.parametrize("dtype", ['float64', 'float32', 'int64', 'int32']) -@pytest.mark.parametrize("target, ctx", ctx_list()) +@pytest.mark.parametrize("target, ctx", enabled_devices()) @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) def test_forward_npi_binary(data_shape,dtype,target, ctx, kind): ref_ops = [mx.np.power, mx.np.multiply, mx.np.add, mx.np.less] @@ -1533,9 +1617,10 @@ def test_forward_npi_binary(data_shape,dtype,target, ctx, kind): tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) +@gpu @pytest.mark.parametrize("data_shape", [(2,2,2),(2,7,2),(2,2,2,1,2,3,1),(1,8),(2,2),(1,3)]) @pytest.mark.parametrize("dtype", ['float64', 'float32', 'int64', 'int32']) -@pytest.mark.parametrize("target, ctx", ctx_list()) +@pytest.mark.parametrize("target, ctx", enabled_devices()) @pytest.mark.parametrize("scalar", [1.0,2.0,3.0,4.0]) @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) def test_forward_npi_binary_scalar(data_shape,dtype,scalar,target, ctx, kind): @@ -1557,9 +1642,10 @@ def test_forward_npi_binary_scalar(data_shape,dtype,scalar,target, ctx, kind): tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) +@gpu @pytest.mark.parametrize("data_shape", [(2,2,2),(2,7,2),(2,2,2,1,2,3,1),(1,8),(2,2),(1,3)]) @pytest.mark.parametrize("dtype", ['float64', 'float32']) -@pytest.mark.parametrize("target, ctx", ctx_list()) +@pytest.mark.parametrize("target, ctx", enabled_devices()) @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) def test_forward_npi_tanh(data_shape,dtype,target, ctx, kind): data_np1 = np.random.uniform(size=data_shape).astype(dtype) @@ -1572,12 +1658,13 @@ def test_forward_npi_tanh(data_shape,dtype,target, ctx, kind): tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) +@gpu @pytest.mark.skipif(not hasattr(mx.np, 'where'), reason="mx.np.where hasn't been publish yet") @pytest.mark.parametrize("data_shape", [(2,2,2),(2,7,2),(1,8),(2,2),(1,3)]) @pytest.mark.parametrize("data_dtype", ['float64', 'float32', 'int64', 'int32', 'bool']) @pytest.mark.parametrize("cond_dtype", ['float64', 'float32', 'int64', 'int32', 'bool']) @pytest.mark.parametrize("scalar", [1.0,2.0]) -@pytest.mark.parametrize("target, ctx", ctx_list()) +@pytest.mark.parametrize("target, ctx", enabled_devices()) @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) def test_forward_npi_where_rscalar(data_shape,cond_dtype,data_dtype,scalar,target, ctx, kind): if data_dtype == 'bool': @@ -1599,8 +1686,9 @@ def 
test_forward_npi_where_rscalar(data_shape,cond_dtype,data_dtype,scalar,targe tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) +@gpu @pytest.mark.parametrize("dtype", ['float64', 'float32', 'int64', 'int32', 'bool']) -@pytest.mark.parametrize("target, ctx", ctx_list()) +@pytest.mark.parametrize("target, ctx", enabled_devices()) @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) @pytest.mark.parametrize("data_shape, axis, indices_or_sections, squeeze_axis", [((3,2,1),1,2,False),((3,2,1),0,3,False),((3,2,1),0,3,True),((3,2,1),0,(1,2),False)]) diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index c09580e573015..d5e0fd7e520ae 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -26,8 +26,8 @@ from tvm import te from tvm import relay from tvm.contrib import graph_runtime -from tvm.relay.testing.config import ctx_list import scipy +from tvm.testing import gpu, enabled_devices def get_input_data_shape_dict(graph_def, input_data): @@ -117,11 +117,12 @@ def verify_onnx_forward_impl(graph_file, data_shape, out_shape): x = np.random.uniform(size=data_shape) model = onnx.load_model(graph_file) c2_out = get_onnxruntime_output(model, x, dtype) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output(model, x, target, ctx, out_shape, dtype) tvm.testing.assert_allclose(c2_out, tvm_out, rtol=1e-5, atol=1e-5) +@gpu def test_reshape(): in_shape = (4, 3, 3, 4) ref_shape = (6, 2, 4, 3) @@ -145,13 +146,14 @@ def test_reshape(): model = helper.make_model(graph, producer_name='reshape_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): x = np.random.uniform(size=in_shape).astype('int32') tvm_out = get_tvm_output(model, x, target, ctx, ref_shape, 'float32') tvm.testing.assert_allclose(ref_shape, tvm_out.shape) +@gpu def test_expand(): def _test_expand(name, data, shape, ref_data): @@ -174,7 +176,7 @@ def _test_expand(name, data, shape, ref_data): model = helper.make_model(graph, producer_name=name) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output(model, data, target, ctx, ref_data.shape, 'float32') tvm.testing.assert_allclose(ref_data, tvm_out) @@ -205,13 +207,14 @@ def verify_depth_to_space(inshape, outshape, mode, blockSize): model = helper.make_model(graph, producer_name='depth_to_space_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): x = np.random.uniform(size=inshape).astype('float32') tvm_out = get_tvm_output(model, x, target, ctx, outshape, 'float32') onnx_out = get_onnxruntime_output(model, x, 'float32') tvm.testing.assert_allclose(onnx_out, tvm_out) +@gpu def test_depth_to_space(): # current onnx.checker use OpSet-1 version of DepthToSpace, which doesn't have a mode argument. 
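In the ONNX file the pattern is the same but the reference comes from onnxruntime: each test builds a small model inline with onnx.helper, runs it through TVM on every enabled device, and compares against the runtime's output. A condensed sketch of that recipe, assuming this file's get_tvm_output and get_onnxruntime_output helpers and using Neg as a stand-in op:

    import numpy as np
    import tvm
    from onnx import helper, TensorProto
    from tvm.testing import gpu, enabled_devices

    @gpu
    def test_neg_sketch():
        shape = (1, 3, 4, 4)
        node = helper.make_node("Neg", inputs=["in"], outputs=["out"])
        graph = helper.make_graph(
            [node], "neg_test",
            inputs=[helper.make_tensor_value_info("in", TensorProto.FLOAT, list(shape))],
            outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(shape))])
        model = helper.make_model(graph, producer_name="neg_test")
        x = np.random.uniform(size=shape).astype("float32")
        onnx_out = get_onnxruntime_output(model, x, "float32")  # reference
        for target, ctx in enabled_devices():
            tvm_out = get_tvm_output(model, x, target, ctx, shape, "float32")
            tvm.testing.assert_allclose(onnx_out, tvm_out, rtol=1e-5, atol=1e-5)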
# TO-DO, we can add mode arguement to test CRD mode and DCR mode @@ -232,17 +235,19 @@ def verify_space_to_depth(inshape, outshape, blockSize): model = helper.make_model(graph, producer_name='space_to_depth_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): x = np.random.uniform(size=inshape).astype('float32') tvm_out = get_tvm_output(model, x, target, ctx, outshape, 'float32') onnx_out = get_onnxruntime_output(model, x, 'float32') tvm.testing.assert_allclose(onnx_out, tvm_out) +@gpu def test_space_to_depth(): verify_space_to_depth((1, 1, 4, 6), (1, 4, 2, 3), 2) +@gpu def test_shape(): in_shape = (4, 3, 3, 4) ref_shape = (6, 2, 4, 3) @@ -268,7 +273,7 @@ def test_shape(): model = helper.make_model(graph, producer_name='shape_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): x = np.random.uniform(size=in_shape).astype('int32') tvm_out = get_tvm_output(model, x, target, ctx, ref_shape, 'int32') @@ -297,17 +302,19 @@ def _test_power_iteration(x_shape, y_shape): model = helper.make_model(graph, producer_name='power_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output(model, [x, y], target, ctx, np_res.shape) tvm.testing.assert_allclose(np_res, tvm_out, rtol=1e-5, atol=1e-5) +@gpu def test_power(): _test_power_iteration((1, 3), (1)) _test_power_iteration((2, 3), (2, 3)) _test_power_iteration((2, 3), (1, 3)) +@gpu def test_squeeze(): in_shape = (1, 3, 1, 3, 1, 1) out_shape = (3, 3) @@ -322,13 +329,14 @@ def test_squeeze(): model = helper.make_model(graph, producer_name='squeeze_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): x = np.random.uniform(size=in_shape).astype('float32') tvm_out = get_tvm_output(model, x, target, ctx, out_shape, 'float32') tvm.testing.assert_allclose(out_shape, tvm_out.shape) +@gpu def test_flatten(): in_shape = (1, 3, 4, 4) @@ -346,13 +354,14 @@ def test_flatten(): model = helper.make_model(graph, producer_name='flatten_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): x = np.random.uniform(size=in_shape).astype('int32') tvm_out = get_tvm_output(model, x, target, ctx, ref_shape, 'float32') tvm.testing.assert_allclose(ref_shape, tvm_out.shape) +@gpu def test_unsqueeze(): in_shape = (3, 3) axis = (0, 3, 4) @@ -368,7 +377,7 @@ def test_unsqueeze(): model = helper.make_model(graph, producer_name='squeeze_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): x = np.random.uniform(size=in_shape).astype('float32') tvm_out = get_tvm_output(model, x, target, ctx, out_shape, 'float32') @@ -392,12 +401,13 @@ def verify_gather(in_shape, indices, axis, dtype): TensorProto.FLOAT, list(out_np.shape))]) model = helper.make_model(graph, producer_name='gather_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output( model, [x, indices], target, ctx, out_np.shape) tvm.testing.assert_allclose(out_np, tvm_out) +@gpu def test_gather(): verify_gather((4,), [1], 0, 'int32') verify_gather((1, 4), [0], 0, 'int32') @@ -427,12 +437,13 @@ def verify_scatter(in_shape, indices, axis): model = helper.make_model(graph, producer_name='scatter_test') onnx_out = get_onnxruntime_output(model, [x, indices, updates]) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output( model, [x, indices, updates], target, ctx, onnx_out[0].shape) tvm.testing.assert_allclose(onnx_out[0], tvm_out) +@gpu def test_scatter(): 
verify_scatter((4,), [1], 0) verify_scatter((1, 4), [[0]], 0) @@ -459,7 +470,7 @@ def _test_slice_iteration_v1(indata, outdata, starts, ends, axes=None): model = helper.make_model(graph, producer_name='slice_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output( model, indata, target, ctx, outdata.shape, 'float32', opset=1) @@ -547,7 +558,7 @@ def add_noop_to_input_attr(attr_name, attr): initializer=initializer) model = helper.make_model(graph, producer_name='slice_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output(model, indata, target, @@ -559,6 +570,7 @@ def add_noop_to_input_attr(attr_name, attr): tvm.testing.assert_allclose(outdata, tvm_out) +@gpu def test_slice(): x = np.random.randn(20, 10, 5).astype(np.float32) _test_slice_iteration_v1(x, x[0:3, 0:10], starts=(0, 0), ends=(3, 10), axes=(0, 1)) @@ -595,22 +607,25 @@ def _test_onnx_op_elementwise(inshape, outfunc, npargs, dtype, opname, kwargs): model = helper.make_model(graph, producer_name=opname+'_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output( model, indata, target, ctx, outdata.shape, dtype) tvm.testing.assert_allclose(outdata, tvm_out) +@gpu def test_floor(): _test_onnx_op_elementwise((2, 4, 5, 6), np.floor, {}, 'float32', 'Floor', {}) +@gpu def test_ceil(): _test_onnx_op_elementwise((2, 4, 5, 6), np.ceil, {}, 'float32', 'Ceil', {}) +@gpu def test_clip(): _test_onnx_op_elementwise((2, 4, 5, 6), np.clip, @@ -620,7 +635,7 @@ def test_clip(): {'min': -1.0, 'max': 1.0}) - +@gpu def test_round(): _test_onnx_op_elementwise((2, 4, 5, 6), np.round, {}, 'float32', 'Round', {}) @@ -640,17 +655,19 @@ def _test_finite_ops(inshape, outfunc, npargs, dtype, opname, kwargs): model = helper.make_model(graph, producer_name=opname+'_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output( model, indata, target, ctx, outdata.shape, dtype) tvm.testing.assert_allclose(outdata, tvm_out) +@gpu def test_isinf(): _test_finite_ops((2, 4, 5, 6), np.isinf, {}, 'float32', 'IsInf', {}) +@gpu def test_isnan(): _test_finite_ops((2, 4, 5, 6), np.isnan, {}, 'float32', 'IsNaN', {}) @@ -672,18 +689,20 @@ def verify_gather_nd(in_shape, indices, dtype): TensorProto.FLOAT, list(out_np.shape))]) model = helper.make_model(graph, producer_name='gather_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output( model, [x, indices], target, ctx, out_np.shape) tvm.testing.assert_allclose(out_np, tvm_out) +@gpu def test_gather_nd(): verify_gather_nd((2, 2), [[0,0],[1,1]], 'int32') verify_gather_nd((3, 3, 3), [[0,1],[1,0]] , 'float32') verify_gather_nd((4, 3, 5, 6), [[2, 1, 0, 0]], 'float32') +@gpu def test_onehot(): indices_shape = [10] indices_array = np.random.randint( @@ -709,12 +728,13 @@ def test_onehot(): model = helper.make_model(graph, producer_name="onehot_test") - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output( model, [indices_array], target, ctx, out_np.shape) tvm.testing.assert_allclose(out_np, tvm_out, rtol=1e-5, atol=1e-5) +@gpu def test_matmul(): a_shape = (4, 3) b_shape = (3, 4) @@ -736,7 +756,7 @@ def test_matmul(): model = helper.make_model(graph, producer_name='matmul_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output( model, [a_array, b_array], target, ctx, out_np.shape) 
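The get_tvm_output helper these hunks keep calling is where target and ctx actually take effect: the ONNX model is imported into relay, built for the given target, and executed with the graph runtime on the given context. A rough single-input sketch of such a helper (the real one in this file also handles multiple inputs, output lists, and an opset argument):

    import tvm
    from tvm import relay
    from tvm.contrib import graph_runtime

    def get_tvm_output_sketch(model, x, target, ctx, output_shape, output_dtype="float32"):
        # Import the ONNX graph into relay using the model's declared input name.
        input_name = model.graph.input[0].name
        mod, params = relay.frontend.from_onnx(model, {input_name: x.shape})
        # Build for the requested target, then run on the matching context.
        with tvm.transform.PassContext(opt_level=1):
            graph, lib, params = relay.build(mod, target, params=params)
        m = graph_runtime.create(graph, lib, ctx)
        m.set_input(input_name, tvm.nd.array(x))
        m.set_input(**params)
        m.run()
        return m.get_output(0, tvm.nd.empty(output_shape, output_dtype)).asnumpy()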
tvm.testing.assert_allclose(out_np, tvm_out, rtol=1e-5, atol=1e-5) @@ -759,11 +779,12 @@ def verify_batch_matmul(a_shape, b_shape): model = helper.make_model(graph, producer_name='matmul_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output( model, [a_array, b_array], target, ctx, out_np.shape) tvm.testing.assert_allclose(out_np, tvm_out, rtol=1e-5, atol=1e-5) +@gpu def test_batch_matmul(): verify_batch_matmul((2, 3, 4, 3), (2, 3, 3, 4)) verify_batch_matmul((2, 4, 3), (3, 4)) @@ -800,7 +821,7 @@ def _get_python_lrn(): py_out = in_array / ((bias + (alpha / nsize) * square_sum) ** beta) return py_out - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): input_name = model.graph.input[0].name py_out = _get_python_lrn() tvm_out = get_tvm_output( @@ -808,6 +829,7 @@ def _get_python_lrn(): tvm.testing.assert_allclose(py_out, tvm_out, rtol=1e-5, atol=1e-5) +@gpu def test_lrn(): verify_lrn((5, 5, 5, 5), 3, 'float32') verify_lrn((5, 5, 5, 5), 3, 'float32', alpha=0.0002, beta=0.5, bias=2.0) @@ -845,12 +867,13 @@ def _get_python_instance_norm(x, gamma, beta, epsilon=1e-5): helper.make_tensor_value_info("beta", TensorProto.FLOAT, (shape[1],))], outputs=[helper.make_tensor_value_info("y", TensorProto.FLOAT, list(shape))]) model = helper.make_model(graph, producer_name='instance_norm_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output( model, [x, gamma, beta], target, ctx, shape, 'float32') tvm.testing.assert_allclose(y, tvm_out, rtol=1e-5, atol=1e-5) +@gpu def test_instance_norm(): verify_instance_norm((2, 3, 4, 5)) verify_instance_norm((32, 64, 80, 64)) @@ -877,7 +900,7 @@ def _test_upsample_nearest(): model = helper.make_model(graph, producer_name='upsample_nearest_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output( model, in_array, target, ctx, out_shape, 'float32') tvm.testing.assert_allclose(out_array, tvm_out) @@ -902,7 +925,7 @@ def _test_upsample3d_nearest(): model = helper.make_model(graph, producer_name='upsample_nearest_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output( model, in_array, target, ctx, out_shape, 'float32') tvm.testing.assert_allclose(out_array, tvm_out) @@ -926,7 +949,7 @@ def _test_upsample_bilinear(): model = helper.make_model(graph, producer_name='upsample_bilinear_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output( model, in_array, target, ctx, out_shape, 'float32') tvm.testing.assert_allclose(out_array, tvm_out, rtol=1e-5, atol=1e-5) @@ -961,7 +984,7 @@ def _test_upsample_bilinear_opset9(): model = helper.make_model( graph, producer_name='upsample_bilinear_opset9_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output( model, in_array, target, ctx, out_shape, 'float32') tvm.testing.assert_allclose(out_array, tvm_out, rtol=1e-5, atol=1e-5) @@ -995,11 +1018,12 @@ def _test_upsample3d_trilinear(): model = helper.make_model( graph, producer_name='upsample_trilinear_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output( model, in_array, target, ctx, out_shape, 'float32') tvm.testing.assert_allclose(out_array, tvm_out, rtol=1e-5, atol=1e-5) +@gpu def test_upsample(): _test_upsample_nearest() _test_upsample_bilinear() @@ -1026,12 +1050,13 @@ def _test_softmax(inshape, axis): model = 
helper.make_model(graph, producer_name=opname+'_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output( model, indata, target, ctx, outshape, 'float32') tvm.testing.assert_allclose(outdata, tvm_out, rtol=1e-5, atol=1e-5) +@gpu def test_softmax(): _test_softmax((1, 10), None) _test_softmax((1, 10), 1) @@ -1061,12 +1086,13 @@ def verify_min(input_dim): model = helper.make_model(graph, producer_name='Min_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output( model, [a_np1, a_np2, a_np3], target, ctx, b_np.shape) tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5) +@gpu def test_forward_min(): verify_min((1, 3, 20, 20)) verify_min((20, 20)) @@ -1096,12 +1122,13 @@ def verify_max(input_dim): model = helper.make_model(graph, producer_name='Max_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output( model, [a_np1, a_np2, a_np3], target, ctx, b_np.shape) tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5) +@gpu def test_forward_max(): verify_max((1, 3, 20, 20)) verify_max((20, 20)) @@ -1131,12 +1158,13 @@ def verify_mean(input_dim): model = helper.make_model(graph, producer_name='Mean_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output( model, [a_np1, a_np2, a_np3], target, ctx, b_np.shape) tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5) +@gpu def test_forward_mean(): verify_mean((1, 3, 20, 20)) verify_mean((20, 20)) @@ -1161,11 +1189,12 @@ def verify_hardsigmoid(input_dim, alpha, beta): model = helper.make_model(graph, producer_name='HardSigmoid_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output(model, [a_np1], target, ctx, b_np.shape) tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5) +@gpu def test_forward_hardsigmoid(): verify_hardsigmoid((1, 3, 20, 20), 0.5, 0.6) verify_hardsigmoid((20, 20), 0.3, 0.4) @@ -1212,7 +1241,7 @@ def _argmin_numpy(data, axis=0, keepdims=True): model = helper.make_model(graph, producer_name='argmin_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output( model, [a_np1], target, ctx, b_np.shape, b_np.dtype) tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5) @@ -1260,12 +1289,13 @@ def _argmax_numpy(data, axis=0, keepdims=True): model = helper.make_model(graph, producer_name='argmax_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output( model, [a_np1], target, ctx, b_np.shape, b_np.dtype) tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5) +@gpu def test_forward_arg_min_max(): '''Verify argmin and argmax''' verify_argmin([3, 4, 4]) @@ -1309,12 +1339,13 @@ def verify_constantofshape(input_dim, value, dtype): model = helper.make_model(graph, producer_name='fill_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output(model, [], target, ctx, out.shape) tvm.testing.assert_allclose(out, tvm_out, rtol=1e-5, atol=1e-5) +@gpu def test_constantofshape(): verify_constantofshape((2, 3, 4, 5), 10, 'float32') verify_constantofshape((3, 3), 0, 'int32') @@ -1355,7 +1386,7 @@ def verify_pad(indata, pads, mode='constant', value=0.0): TensorProto.FLOAT, list(outdata.shape))]) model = helper.make_model(graph, producer_name='pad_test') # tvm result - for target, ctx in ctx_list(): + for target, ctx in 
enabled_devices(): tvm_out = get_tvm_output( model, indata, target, ctx, outdata.shape, 'float32', opset=2) tvm.testing.assert_allclose(outdata, tvm_out, rtol=1e-5, atol=1e-5) @@ -1411,12 +1442,13 @@ def verify_pad_v11(indata, pads, mode='constant', value=0.0): TensorProto.FLOAT, list(outdata.shape))]) model = helper.make_model(graph, producer_name='pad_test') # tvm result - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output( model, inputs, target, ctx, outdata.shape, 'float32', opset=11) tvm.testing.assert_allclose(outdata, tvm_out, rtol=1e-5, atol=1e-5) +@gpu def test_pad(): verify_pad(np.random.randn(2, 2).astype( np.float32), [0, 1, 0, 0], 'constant', 0.0) @@ -1465,10 +1497,11 @@ def verify_reduce_func(func, data, axis, keepdims): model = helper.make_model(graph, producer_name='reduce_test') onnx_out = get_onnxruntime_output(model, data, 'float32') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output(model, data, target, ctx, outshape, 'float32') tvm.testing.assert_allclose(onnx_out, tvm_out, rtol=1e-5, atol=1e-5) +@gpu def test_all_reduce_funcs(): funcs = ["ReduceMax", "ReduceMean", @@ -1532,7 +1565,7 @@ def verify_split(indata, outdatas, split, axis=0): ]) model = helper.make_model(graph, producer_name='split_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): output_shape = [o.shape for o in outdatas] output_type = ['float32', 'float32', 'float32'] tvm_out = get_tvm_output( @@ -1541,6 +1574,7 @@ def verify_split(indata, outdatas, split, axis=0): tvm.testing.assert_allclose(o, t) +@gpu def test_split(): # 1D verify_split([1., 2., 3., 4., 5., 6.], [ @@ -1554,6 +1588,7 @@ def test_split(): verify_split([1, 2, 3], [[1], [2], [3]], False) +@gpu def test_binary_ops(): in_shape = (1, 2, 3, 3) dtype = "float32" @@ -1573,7 +1608,7 @@ def verify_binary_ops(op, x, y, out_np, x_name='in1', y_name='in2', broadcast=No outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))]) model = helper.make_model(graph, producer_name='_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output(model, [x, y], target, ctx) tvm.testing.assert_allclose(out_np, tvm_out, rtol=1e-5, atol=1e-5) @@ -1595,6 +1630,7 @@ def verify_binary_ops(op, x, y, out_np, x_name='in1', y_name='in2', broadcast=No verify_binary_ops("Equal", x, y, x == y, broadcast=True) +@gpu def test_single_ops(): in_shape = (1, 2, 3, 3) dtype = "float32" @@ -1609,7 +1645,7 @@ def verify_single_ops(op, x, out_np, rtol=1e-5, atol=1e-5): outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))]) model = helper.make_model(graph, producer_name='_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output(model, [x], target, ctx) tvm.testing.assert_allclose(out_np, tvm_out, rtol=rtol, atol=atol) @@ -1639,6 +1675,7 @@ def verify_single_ops(op, x, out_np, rtol=1e-5, atol=1e-5): verify_single_ops("SoftPlus", x, np.log(1 + np.exp(x))) +@gpu def test_leaky_relu(): def leaky_relu_x(x, alpha): return np.where(x >= 0, x, x * alpha) @@ -1650,6 +1687,7 @@ def leaky_relu_x(x, alpha): {'alpha': 0.25}) +@gpu def test_elu(): def elu_x(x, alpha): return np.where(x > 0, x, alpha * (np.exp(x) - 1.0)) @@ -1661,6 +1699,7 @@ def elu_x(x, alpha): {'alpha': 0.25}) +@gpu def test_selu(): def selu_x(x, alpha, gamma): return gamma * np.where(x > 0, x, alpha * (np.exp(x) - 1.0)) @@ -1672,6 +1711,7 @@ def selu_x(x, alpha, 
gamma): {'alpha': 0.25, 'gamma': 0.3}) +@gpu def test_prelu(): def verify_prelu(x_shape, a_shape): node = helper.make_node('PRelu', @@ -1700,6 +1740,7 @@ def verify_prelu(x_shape, a_shape): verify_prelu([2,12,16,16], [1, 12, 1, 1]) +@gpu def test_ThresholdedRelu(): def ThresholdedRelu_x(x, alpha): out_np = np.clip(x, alpha, np.inf) @@ -1713,6 +1754,7 @@ def ThresholdedRelu_x(x, alpha): {'alpha': 0.25}) +@gpu def test_ScaledTanh(): def ScaledTanh_x(x, alpha, beta): return alpha * np.tanh(beta * x) @@ -1724,6 +1766,7 @@ def ScaledTanh_x(x, alpha, beta): {'alpha': 0.25, 'beta': 0.3}) +@gpu def test_ParametricSoftplus(): def ParametricSoftplus_x(x, alpha, beta): return alpha * np.log(np.exp(beta * x) + 1) @@ -1735,6 +1778,7 @@ def ParametricSoftplus_x(x, alpha, beta): {'alpha': 0.25, 'beta': 0.3}) +@gpu def test_Scale(): def Scale_x(x, scale): return scale * x @@ -1746,6 +1790,7 @@ def Scale_x(x, scale): {'scale': 0.25}) +@gpu def test_LogSoftmax(): _test_onnx_op_elementwise((1, 4), tvm.topi.testing.log_softmax_python, @@ -1762,13 +1807,14 @@ def check_torch_conversion(model, input_size): torch.onnx.export(model(), dummy_input, file_name, export_params=True, verbose=False) onnx_model = onnx.load(file_name) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): input_data = np.random.uniform(size=input_size).astype('int32') c2_out = get_onnxruntime_output(onnx_model, input_data) tvm_out = get_tvm_output(onnx_model, input_data, target, ctx) tvm.testing.assert_allclose(c2_out, tvm_out) +@gpu def test_resnet(): check_torch_conversion(torchvision.models.resnet18, (1, 3, 224, 224)) # check_torch_conversion(torchvision.models.resnet101, (1,3,224,224)) @@ -1787,10 +1833,12 @@ def test_resnet(): # check_torch_conversion(torchvision.models.squeezenet1_0, (1,3,224,224)) +@gpu def test_densenet(): check_torch_conversion(torchvision.models.densenet161, (1, 3, 224, 224)) +@gpu def test_inception(): check_torch_conversion(torchvision.models.inception_v3, (1, 3, 224, 224)) @@ -1803,6 +1851,7 @@ def test_inception(): # check_torch_conversion(torchvision.models.shufflenetv2, (1,3,224,224)) +@gpu def test_sign(): def Sign_x(x): return np.sign(x) @@ -1828,11 +1877,12 @@ def verify_not(indata, dtype): model = helper.make_model(graph, producer_name='not_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output(model, [x], target, ctx, outdata.shape) tvm.testing.assert_allclose(outdata, tvm_out) +@gpu def test_not(): # 2d verify_not(indata=(np.random.randn(3, 4) > 0), dtype=bool) @@ -1857,11 +1907,12 @@ def verify_and(indata, dtype): model = helper.make_model(graph, producer_name='and_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output(model, [x, y], target, ctx, outdata.shape) tvm.testing.assert_allclose(outdata, tvm_out) +@gpu def test_and(): # 2d x = (np.random.randn(3, 4) > 0) @@ -1899,7 +1950,7 @@ def verify_tile_v1(indata, outdata, **kwargs): model = helper.make_model(graph, producer_name='tile_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output( model, [indata], target, ctx, outdata.shape, opset=1) tvm.testing.assert_allclose(outdata, tvm_out) @@ -1929,7 +1980,7 @@ def verify_tile_v6(indata, repeats, outdata): model = helper.make_model(graph, producer_name='tile_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output(model, [indata], target, ctx, @@ -1938,6 +1989,7 @@ def 
verify_tile_v6(indata, repeats, outdata): tvm.testing.assert_allclose(outdata, tvm_out) +@gpu def test_tile(): x = np.random.rand(2, 3, 4, 5).astype(np.float32) repeats = np.random.randint( @@ -1956,11 +2008,12 @@ def verify_erf(indata, outdata): outputs=[helper.make_tensor_value_info('out', TensorProto.FLOAT, list(outdata.shape))]) model = helper.make_model(graph, producer_name='erf_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output(model, [indata], target, ctx, outdata.shape) tvm.testing.assert_allclose(outdata, tvm_out) +@gpu def test_erf(): x = np.random.rand(2, 3, 4, 6).astype(np.float32) z = scipy.special.erf(x) @@ -1977,11 +2030,12 @@ def verify_where(condition, x, y, dtype, outdata): outputs=[helper.make_tensor_value_info('out', dtype, list(outdata.shape))]) model = helper.make_model(graph, producer_name='where_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output(model, [condition, x, y], target, ctx, outdata.shape) tvm.testing.assert_allclose(outdata, tvm_out) +@gpu def test_where(): condition = np.array([[1, 0], [1, 1]], dtype=np.bool) x = np.array([[1, 2], [3, 4]], dtype=np.int64) @@ -2031,11 +2085,12 @@ def verify_or(indata, dtype): model = helper.make_model(graph, producer_name='or_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output(model, [x, y], target, ctx, outdata.shape) tvm.testing.assert_allclose(outdata, tvm_out) +@gpu def test_or(): # 2d x = (np.random.randn(3, 4) > 0) @@ -2063,6 +2118,7 @@ def test_or(): verify_or(indata=[x, y], dtype=bool) +@gpu def test_batch_norm(): def verify_batch_norm(in_shape): batchnorm = onnx.helper.make_node('BatchNormalization', @@ -2087,7 +2143,7 @@ def verify_batch_norm(in_shape): model = helper.make_model(graph, producer_name='batchnorm_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): x = np.random.uniform(size=in_shape).astype('float32') scale = np.random.uniform(size=in_shape[1]).astype('float32') b = np.random.uniform(size=in_shape[1]).astype('float32') @@ -2104,6 +2160,7 @@ def verify_batch_norm(in_shape): verify_batch_norm([16, 16, 10, 10]) +@gpu def test_batch_norm_dynamic_subgraph(): def verify_batch_norm_dynamic_subgraph(in_shape, o_shape): batchnorm = onnx.helper.make_node('BatchNormalization', @@ -2132,7 +2189,7 @@ def verify_batch_norm_dynamic_subgraph(in_shape, o_shape): model = helper.make_model(graph, producer_name='batchnorm_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): x = np.random.uniform(size=in_shape).astype('float32') inp = np.random.uniform(size=o_shape).astype('float32') scale = np.random.uniform(size=in_shape[1]).astype('float32') @@ -2186,7 +2243,7 @@ def verify_conv(x_shape, w_shape, y_shape, padding, kernel_shape, strides, dilat model = helper.make_model(graph, producer_name='conv_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): x = np.random.uniform(size=x_shape).astype('float32') W = np.random.uniform(size=w_shape).astype('float32') tvm_out = get_tvm_output(model, [x, W], target, ctx, y_shape) @@ -2194,6 +2251,7 @@ def verify_conv(x_shape, w_shape, y_shape, padding, kernel_shape, strides, dilat tvm.testing.assert_allclose(onnx_out, tvm_out, rtol=1e-5, atol=1e-5) +@gpu def test_conv(): def repeat(N, D): return tuple([N for _ in range(D)]) @@ -2276,7 +2334,7 @@ def verify_convtranspose(x_shape, w_shape, y_shape, p): model = helper.make_model(graph, 
producer_name='convtranspose_trest') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): x = np.random.uniform(size=x_shape).astype('float32') W = np.random.uniform(size=w_shape).astype('float32') tvm_out = get_tvm_output(model, [x, W], target, ctx, y_shape) @@ -2284,6 +2342,7 @@ def verify_convtranspose(x_shape, w_shape, y_shape, p): tvm.testing.assert_allclose(onnx_out, tvm_out, rtol=1e-5, atol=1e-5) +@gpu def test_convtranspose(): # Convolution Transpose with padding # (1, 1, 3, 3) input tensor @@ -2293,6 +2352,7 @@ def test_convtranspose(): verify_convtranspose((1, 1, 3, 3), (1, 2, 3, 3), (1, 2, 7, 3), [1, 2, 1, 2]) +@gpu def test_unsqueeze_constant(): from torch.nn import Linear, Sequential, Module class Flatten(Module): @@ -2343,13 +2403,14 @@ def verify_pooling(x_shape, kernel_shape, strides, pads, out_shape, mode, auto_p model = helper.make_model(graph, producer_name='pooling_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): onnx_out = get_onnxruntime_output(model, x_np, 'float32') tvm_out = get_tvm_output( model, [x_np], target, ctx, out_shape) tvm.testing.assert_allclose(onnx_out, tvm_out, rtol=1e-5, atol=1e-5) +@gpu def test_pooling(): for mode in ['max', 'average']: # Pool1D @@ -2440,12 +2501,13 @@ def verify_mod(x_shape, y_shape, fmod, out_shape, dtype='float32'): onnx_out = get_onnxruntime_output(model, [x_np, y_np], dtype)[0] - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output( model, [x_np, y_np], target, ctx, out_shape) tvm.testing.assert_allclose(onnx_out, tvm_out, rtol=1e-5, atol=1e-5) +@gpu def test_mod(): # Mod verify_mod(x_shape=[1, 32, 32], y_shape=[1, 1, 32], fmod=0, out_shape=(1, 32, 32), dtype="int32") @@ -2481,12 +2543,13 @@ def verify_xor(x_shape, y_shape): onnx_dtype, list(out_shape))]) model = helper.make_model(graph, producer_name='xor_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output( model, [x_np, y_np], target, ctx, out_shape) tvm.testing.assert_allclose(np_out, tvm_out, rtol=1e-5, atol=1e-5) +@gpu def test_xor(): # XOR verify_xor(x_shape=[1, 32, 32], y_shape=[1, 32, 32]) @@ -2523,12 +2586,13 @@ def verify_max_roi_pool(x_shape, rois_shape, pooled_shape, spatial_scale, out_sh model = helper.make_model(graph, producer_name='pool_test') onnx_out = get_onnxruntime_output(model, [x_np, rois_np], 'float32')[0] - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): tvm_out = get_tvm_output( model, [x_np, rois_np], target, ctx, out_shape) tvm.testing.assert_allclose(onnx_out, tvm_out, rtol=1e-5, atol=1e-5) +@gpu def test_max_roi_pool(): verify_max_roi_pool(x_shape=[1, 3, 6, 6], rois_shape=[3, 5], @@ -2572,13 +2636,14 @@ def verify_lppool(x_shape, kernel_shape, p, strides, pads, out_shape, auto_pad=" model = helper.make_model(graph, producer_name='lppool_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): onnx_out = get_onnxruntime_output(model, x_np, 'float32') tvm_out = get_tvm_output( model, [x_np], target, ctx, out_shape) tvm.testing.assert_allclose(onnx_out, tvm_out, rtol=1e-5, atol=1e-5) +@gpu def test_lppool(): # Pool1D verify_lppool(x_shape=[1, 1, 32], kernel_shape=[3], p=2, strides=[1], pads=[1, 1], @@ -2728,7 +2793,7 @@ def verify_rnn(seq_length, model = helper.make_model(graph, producer_name='rnn_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): onnx_out = get_onnxruntime_output(model, input_values, 'float32') tvm_out = 
get_tvm_output( model, @@ -2741,6 +2806,7 @@ def verify_rnn(seq_length, tvm.testing.assert_allclose(o_out, t_out, rtol=5e-3, atol=5e-3) +@gpu def test_lstm(): # No bias. verify_rnn( @@ -2845,6 +2911,7 @@ def test_lstm(): rnn_type='LSTM') +@gpu def test_gru(): # No bias. verify_rnn( @@ -2940,6 +3007,7 @@ def test_gru(): rnn_type='GRU') +@gpu def test_resize(): def make_constant_node(name, data_type, dims, vals): return helper.make_node('Constant', @@ -2977,7 +3045,7 @@ def verify(ishape, oshape, scales, mode, coord_trans): model = helper.make_model(graph, producer_name='resize_test') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): x = np.random.uniform(size=ishape).astype('float32') onnx_out = get_onnxruntime_output(model, x, 'float32') tvm_out = get_tvm_output(model, x, target, ctx, oshape, 'float32', opset=11) @@ -2997,6 +3065,7 @@ def verify(ishape, oshape, scales, mode, coord_trans): verify([1, 16, 32, 32], [], [1, 1, 0.5, 0.5], "linear", "half_pixel") +@gpu def test_nonzero(): def verify_nonzero(indata, outdata, dtype): @@ -3025,6 +3094,7 @@ def verify_nonzero(indata, outdata, dtype): result = np.array((np.nonzero(input_data))) # expected output [[0, 1, 2, 2], [0, 1, 0, 1]] verify_nonzero(input_data, result, dtype=np.int64) +@gpu def test_topk(): def verify_topk(input_dims, K, axis=-1): output_dims = list(input_dims) @@ -3063,6 +3133,7 @@ def verify_topk(input_dims, K, axis=-1): verify_topk([n, n, n], 5, 2) +@gpu def test_roi_align(): def verify_roi_align(input_dims, num_roi, output_height, output_width, sampling_ratio=0, spatial_scale=1.0): output_dims = [num_roi, input_dims[1], output_height, output_width] diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index 946712df50861..c83f34b94c706 100644 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -28,7 +28,7 @@ from tvm import relay from tvm.contrib import graph_runtime from tvm.contrib.nvcc import have_fp16 -from tvm.relay.testing.config import ctx_list +from tvm.testing import gpu sys.setrecursionlimit(10000) @@ -152,7 +152,6 @@ def measure_latency(model, input_shapes, output_shapes, thresh, dryruns=40): def verify_model(model_name, input_data=[], custom_convert_map={}, - ctx_list=ctx_list(), rtol=1e-5, atol=1e-5): """Assert that the output of a compiled model matches with that of its baseline.""" @@ -198,7 +197,7 @@ def verify_model(model_name, input_data=[], [inp.cpu().numpy() for inp in baseline_input])) with tvm.transform.PassContext(opt_level=3): - for target, ctx in ctx_list: + for target, ctx in enabled_devices(): relay_graph, relay_lib, relay_params = relay.build(mod, target=target, params=params) relay_model = graph_runtime.create(relay_graph, relay_lib, ctx) relay_model.set_input(**relay_params) @@ -218,6 +217,7 @@ def verify_model(model_name, input_data=[], torch.cuda.empty_cache() # Single operator tests +@gpu def test_forward_add(): torch.set_grad_enabled(False) input_shape = [10] @@ -250,6 +250,7 @@ def forward(self, *args): verify_model(Add3().float().eval(), input_data=input_data) verify_model(Add4().float().eval(), input_data=input_data) +@gpu def test_forward_subtract(): torch.set_grad_enabled(False) input_shape = [10] @@ -282,6 +283,7 @@ def forward(self, *args): verify_model(Subtract3().float().eval(), input_data=input_data) verify_model(Subtract4().float().eval(), input_data=input_data) +@gpu def test_forward_multiply(): torch.set_grad_enabled(False) input_shape = [10] @@ 
-314,6 +316,7 @@ def forward(self, *args): verify_model(Multiply3().float().eval(), input_data=input_data) verify_model(Multiply4().float().eval(), input_data=input_data) +@gpu def test_forward_reciprocal(): torch.set_grad_enabled(False) input_shape = [2, 1, 10, 1, 10] @@ -324,6 +327,7 @@ def forward(self, *args): input_data = torch.rand(input_shape).float() verify_model(Reciprocal1().float().eval(), input_data=input_data) +@gpu def test_forward_repeat(): torch.set_grad_enabled(False) input_shape = [1, 3] @@ -344,6 +348,7 @@ def forward(self, *args): verify_model(Repeat2().float().eval(), input_data=input_data) verify_model(Repeat3().float().eval(), input_data=input_data) +@gpu def test_forward_repeat_interleave(): torch.set_grad_enabled(False) input_shape = [2, 2, 3] @@ -369,6 +374,7 @@ def forward(self, *args): verify_model(RepeatInterleave3().float().eval(), input_data=input_data) verify_model(RepeatInterleave4().float().eval(), input_data=input_data) +@gpu def test_forward_unsqueeze(): torch.set_grad_enabled(False) input_shape = [10, 10] @@ -380,6 +386,7 @@ def forward(self, *args): input_data = torch.rand(input_shape).float() verify_model(Unsqueeze1().float().eval(), input_data=input_data) +@gpu def test_forward_squeeze(): torch.set_grad_enabled(False) input_shape = [2, 1, 10, 1, 10] @@ -396,6 +403,7 @@ def forward(self, *args): verify_model(Squeeze1().float().eval(), input_data=input_data) verify_model(Squeeze2().float().eval(), input_data=input_data) +@gpu def test_forward_arange(): torch.set_grad_enabled(False) @@ -470,6 +478,7 @@ def forward(self, *args): verify_model(Arange11().float().eval()) verify_model(Arange12().float().eval()) +@gpu def test_forward_mesh_grid(): torch.set_grad_enabled(False) @@ -490,6 +499,7 @@ def forward(self, *args): verify_model(MeshGrid1().float().eval()) verify_model(MeshGrid2().float().eval()) +@gpu def test_forward_abs(): torch.set_grad_enabled(False) input_shape = [2, 1, 10, 1, 10] @@ -501,6 +511,7 @@ def forward(self, *args): input_data = torch.rand(input_shape).float() verify_model(Abs1().float().eval(), input_data=input_data) +@gpu def test_forward_concatenate(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -520,18 +531,21 @@ def forward(self, *args): verify_model(Concatenate1().float().eval(), input_data=input_data) verify_model(Concatenate2().float().eval(), input_data=input_data) +@gpu def test_forward_relu(): torch.set_grad_enabled(False) input_shape = [10, 10] input_data = torch.rand(input_shape).float() verify_model(torch.nn.ReLU().eval(), input_data=input_data) +@gpu def test_forward_prelu(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] input_data = torch.rand(input_shape).float() verify_model(torch.nn.PReLU(num_parameters=3).eval(), input_data=input_data) +@gpu def test_forward_leakyrelu(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -541,6 +555,7 @@ def test_forward_leakyrelu(): verify_model(torch.nn.LeakyReLU(negative_slope=1.0).eval(), input_data=input_data) verify_model(torch.nn.LeakyReLU(negative_slope=1.25).eval(), input_data=input_data) +@gpu def test_forward_elu(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -550,6 +565,7 @@ def test_forward_elu(): verify_model(torch.nn.ELU(alpha=1.0).eval(), input_data=input_data) verify_model(torch.nn.ELU(alpha=1.3).eval(), input_data=input_data) +@gpu def test_forward_celu(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -559,18 +575,21 @@ def test_forward_celu(): verify_model(torch.nn.CELU(alpha=1.0).eval(), 
input_data=input_data) verify_model(torch.nn.CELU(alpha=1.3).eval(), input_data=input_data) +@gpu def test_forward_gelu(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] input_data = torch.rand(input_shape).float() verify_model(torch.nn.GELU().eval(), input_data=input_data) +@gpu def test_forward_selu(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] input_data = torch.rand(input_shape).float() verify_model(torch.nn.SELU().eval(), input_data=input_data) +@gpu def test_forward_softplus(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -579,18 +598,21 @@ def test_forward_softplus(): verify_model(torch.nn.Softplus(beta=1.5, threshold=20).eval(), input_data=input_data) verify_model(torch.nn.Softplus(beta=5, threshold=10).eval(), input_data=input_data) +@gpu def test_forward_softsign(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] input_data = torch.rand(input_shape).float() verify_model(torch.nn.Softsign().eval(), input_data=input_data) +@gpu def test_forward_log_sigmoid(): torch.set_grad_enabled(False) input_shape = [10, 10] input_data = torch.rand(input_shape).float() verify_model(torch.nn.LogSigmoid().eval(), input_data=input_data) +@gpu def test_forward_adaptiveavgpool(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -598,6 +620,7 @@ def test_forward_adaptiveavgpool(): verify_model(torch.nn.AdaptiveAvgPool2d([1, 1]).eval(), input_data=input_data) verify_model(torch.nn.AdaptiveAvgPool2d([10, 10]).eval(), input_data=input_data) +@gpu def test_forward_maxpool2d(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -623,6 +646,7 @@ def forward(self, *args): verify_model(MaxPool2DWithIndices().float().eval(), input_data=input_data) +@gpu def test_forward_maxpool1d(): torch.set_grad_enabled(False) input_shape = [1, 3, 10] @@ -637,6 +661,7 @@ def test_forward_maxpool1d(): stride=2).eval(), input_data) +@gpu def test_forward_maxpool3d(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10, 10] @@ -651,6 +676,7 @@ def test_forward_maxpool3d(): stride=2).eval(), input_data) +@gpu def test_forward_split(): torch.set_grad_enabled(False) input_shape = [4, 10] @@ -674,6 +700,7 @@ def forward(self, *args): verify_model(Split([2, 3, 5], 1).float().eval(), input_data=input_data) +@gpu def test_forward_avgpool(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -686,6 +713,7 @@ def forward(self, *args): verify_model(torch.nn.AvgPool2d(kernel_size=[10, 10]).eval(), input_data=input_data) verify_model(AvgPool2D2().float().eval(), input_data=input_data) +@gpu def test_forward_avgpool3d(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10, 10] @@ -698,12 +726,14 @@ def forward(self, *args): verify_model(torch.nn.AvgPool3d(kernel_size=[10, 10, 10]).eval(), input_data=input_data) verify_model(AvgPool3D1().float().eval(), input_data=input_data) +@gpu def test_forward_hardtanh(): torch.set_grad_enabled(False) input_shape = [10] input_data = torch.rand(input_shape).float() verify_model(torch.nn.Hardtanh().eval(), input_data=input_data) +@gpu def test_forward_conv(): torch.set_grad_enabled(False) conv1d_input_shape = [1, 3, 10] @@ -778,6 +808,7 @@ def forward(self, *args): verify_model(Conv1D2().float().eval(), input_data=conv1d_input_data) verify_model(Conv1D3().float().eval(), input_data=conv1d_input_data) +@gpu def test_forward_conv_transpose(): torch.set_grad_enabled(False) conv2d_input_shape = [1, 3, 10, 10] @@ -791,12 +822,14 @@ def test_forward_conv_transpose(): verify_model(torch.nn.ConvTranspose1d(3, 
12, 3, bias=False), input_data=conv1d_input_data) +@gpu def test_forward_threshold(): torch.set_grad_enabled(False) input_shape = [1, 3] input_data = torch.rand(input_shape).float() verify_model(torch.nn.Threshold(0, 0).float().eval(), input_data=input_data) +@gpu def test_forward_contiguous(): torch.set_grad_enabled(False) input_shape = [10] @@ -809,6 +842,7 @@ def forward(self, *args): verify_model(Contiguous1().float().eval(), input_data=input_data) +@gpu def test_forward_batchnorm(): def init_weight(m): torch.nn.init.normal_(m.weight, 0, 0.01) @@ -823,6 +857,7 @@ def init_weight(m): verify_model(bn.eval(), input_data=inp) +@gpu def test_forward_instancenorm(): inp_2d = torch.rand((1, 16, 10, 10)) inp_3d = torch.rand((1, 16, 10, 10, 10)) @@ -831,6 +866,7 @@ def test_forward_instancenorm(): (torch.nn.InstanceNorm3d(16), inp_3d)]: verify_model(ins_norm.eval(), input_data=inp) +@gpu def test_forward_layernorm(): def init_weight(m): torch.nn.init.normal_(m.weight, 0, 0.01) @@ -844,6 +880,7 @@ def init_weight(m): verify_model(ln.eval(), input_data=inp) +@gpu def test_forward_groupnorm(): input_shape = [10, 6, 5, 5] input_data = torch.rand(input_shape).float() @@ -865,6 +902,7 @@ def test_forward_groupnorm(): verify_model(torch.nn.GroupNorm(10, 10).eval(), input_data=input_data) +@gpu def test_forward_reshape(): torch.set_grad_enabled(False) input_shape = [2, 1, 10, 1, 10] @@ -881,6 +919,7 @@ def forward(self, *args): verify_model(Reshape1().float().eval(), input_data=input_data) verify_model(Reshape2().float().eval(), input_data=input_data) +@gpu def test_forward_transpose(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -902,6 +941,7 @@ def forward(self, *args): verify_model(Transpose2().float().eval(), input_data=input_data) verify_model(Transpose3().float().eval(), input_data=input_data) +@gpu def test_forward_size(): torch.set_grad_enabled(False) input_shape = [1, 3] @@ -914,6 +954,7 @@ def forward(self, *args): verify_model(Size1().float().eval(), input_data=input_data) +@gpu def test_type_as(): torch.set_grad_enabled(False) input_shape = [1, 3] @@ -951,6 +992,7 @@ def forward(self, *args): verify_model(_create_module(torch.float16), input_data=input_data) +@gpu def test_forward_view(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -973,7 +1015,7 @@ def forward(self, *args): verify_model(View2().float().eval(), input_data=input_data) verify_model(View3().float().eval(), input_data=input_data) - +@gpu def test_forward_select(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -1002,6 +1044,7 @@ def forward(self, index): verify_model(IndexedSelect(x, 1).eval(), input_data=indices) +@gpu def test_forward_clone(): torch.set_grad_enabled(False) input_shape = [10] @@ -1014,6 +1057,7 @@ def forward(self, *args): verify_model(Clone1().float().eval(), input_data=input_data) +@gpu def test_forward_gather(): torch.set_grad_enabled(False) @@ -1052,6 +1096,7 @@ def forward(self, *args): verify_model(Gather3().float().eval(), input_data=[input_data, index]) +@gpu def test_forward_logsoftmax(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -1064,6 +1109,7 @@ def forward(self, *args): verify_model(LogSoftmax1().float().eval(), input_data=input_data) +@gpu def test_forward_norm(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -1121,6 +1167,7 @@ def forward(self, *args): verify_model(Norm10().float().eval(), input_data=input_data) +@gpu def test_forward_frobenius_norm(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] 
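The hunks above and below repeat one mechanical pattern across this file: every operator test is marked as possibly using the GPU, and device iteration goes through the TVM_TEST_DEVICES-aware helper instead of the removed ctx_list(). A minimal sketch of the resulting test shape, with a hypothetical test name, assuming only the tvm.testing helpers this patch introduces:

    from tvm.testing import gpu, enabled_devices

    @gpu  # test may use the gpu, so GPU nodes will collect it
    def test_forward_example():
        input_data = torch.rand([1, 3, 10, 10]).float()
        # verify_model compares TVM output against the PyTorch baseline
        # on every device returned by enabled_devices()
        verify_model(torch.nn.ReLU().eval(), input_data=input_data)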
@@ -1148,12 +1195,14 @@ def forward(self, *args): verify_model(FroNorm4().float().eval(), input_data=input_data) +@gpu def test_forward_sigmoid(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] input_data = torch.rand(input_shape).float() verify_model(torch.nn.Sigmoid().eval(), input_data=input_data) +@gpu def test_forward_dense(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -1183,6 +1232,7 @@ def forward(self, *args): ) assert not any([op.name == "multiply" for op in list_ops(mod['main'])]) +@gpu def test_forward_dropout(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -1192,6 +1242,7 @@ def test_forward_dropout(): verify_model(torch.nn.Dropout3d(p=0.5).eval(), input_data=input_data) verify_model(torch.nn.AlphaDropout(p=0.5).eval(), input_data=input_data[0, 0]) +@gpu def test_forward_slice(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -1216,6 +1267,7 @@ def forward(self, *args): verify_model(Slice3().float().eval(), input_data=input_data) +@gpu def test_forward_mean(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -1227,6 +1279,7 @@ def forward(self, *args): input_data = torch.rand(input_shape).float() verify_model(Mean1().float().eval(), input_data=input_data) +@gpu def test_forward_expand(): torch.set_grad_enabled(False) @@ -1247,6 +1300,7 @@ def forward(self, *args): verify_model(Expand2().float().eval(), input_data=input_data) +@gpu def test_forward_pow(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -1258,6 +1312,7 @@ def forward(self, *args): input_data = torch.rand(input_shape).float() verify_model(Pow1().float().eval(), input_data=input_data) +@gpu def test_forward_chunk(): torch.set_grad_enabled(False) input_shape = [1, 3, 14, 14] @@ -1270,6 +1325,7 @@ def forward(self, *args): input_data = torch.rand(input_shape).float() verify_model(Chunk1().float().eval(), input_data=input_data) +@gpu def test_upsample(): class Upsample(Module): def __init__(self, size=None, scale=None, @@ -1293,6 +1349,7 @@ def forward(self, x): verify_model(Upsample(scale=2, mode="bilinear", align_corners=True), inp) verify_model(Upsample(size=(50, 50), mode="bilinear", align_corners=True), inp) +@gpu def test_to(): """ test for aten::to(...) 
""" class ToCPU(Module): @@ -1319,6 +1376,7 @@ def forward(self, x): verify_model(ToLong().eval(), torch.tensor(0.8)) +@gpu def test_adaptive_pool3d(): for ishape in [(1, 32, 16, 16, 16), (1, 32, 9, 15, 15), @@ -1332,6 +1390,7 @@ def test_adaptive_pool3d(): verify_model(torch.nn.AdaptiveMaxPool3d((7, 8, 9)).eval(), inp) +@gpu def test_forward_functional_pad(): torch.set_grad_enabled(False) pad = (0, 0) @@ -1350,12 +1409,14 @@ def forward(self, *args): verify_model(Pad1().float().eval(), input_data=input_data) +@gpu def test_forward_zero_pad2d(): inp = torch.rand((1, 1, 3, 3)) verify_model(torch.nn.ZeroPad2d(2).eval(), inp) verify_model(torch.nn.ZeroPad2d((1, 1, 2, 0)).eval(), inp) +@gpu def test_forward_constant_pad1d(): inp = torch.rand((1, 2, 4)) verify_model(torch.nn.ConstantPad2d(2, 3.5).eval(), inp) @@ -1364,18 +1425,21 @@ def test_forward_constant_pad1d(): verify_model(torch.nn.ConstantPad2d((3, 1), 3.5).eval(), inp) +@gpu def test_forward_constant_pad2d(): inp = torch.rand((1, 2, 2, 2)) verify_model(torch.nn.ConstantPad2d(2, 3.5).eval(), inp) verify_model(torch.nn.ConstantPad2d((3, 0, 2, 1), 3.5).eval(), inp) +@gpu def test_forward_constant_pad3d(): inp = torch.rand((1, 3, 2, 2, 2)) verify_model(torch.nn.ConstantPad3d(3, 3.5).eval(), inp) verify_model(torch.nn.ConstantPad3d((3, 4, 5, 6, 0, 1), 3.5).eval(), inp) +@gpu def test_forward_reflection_pad1d(): inp = torch.rand((1, 2, 4)) verify_model(torch.nn.ReflectionPad1d(2).eval(), inp) @@ -1385,6 +1449,7 @@ def test_forward_reflection_pad1d(): verify_model(torch.nn.ReflectionPad1d((2, 3)).eval(), inp) +@gpu def test_forward_reflection_pad2d(): inp = torch.rand((1, 1, 3, 3)) verify_model(torch.nn.ReflectionPad2d(2).eval(), inp) @@ -1394,6 +1459,7 @@ def test_forward_reflection_pad2d(): verify_model(torch.nn.ReflectionPad2d((1, 3, 2, 4)).eval(), inp) +@gpu def test_forward_replication_pad1d(): inp = torch.rand((1, 2, 4)) verify_model(torch.nn.ReplicationPad1d(2).eval(), inp) @@ -1403,6 +1469,7 @@ def test_forward_replication_pad1d(): verify_model(torch.nn.ReplicationPad1d((2, 3)).eval(), inp) +@gpu def test_forward_replication_pad2d(): inp = torch.rand((1, 1, 3, 3)) verify_model(torch.nn.ReplicationPad2d(2).eval(), inp) @@ -1412,6 +1479,7 @@ def test_forward_replication_pad2d(): verify_model(torch.nn.ReplicationPad2d((1, 3, 2, 4)).eval(), inp) +@gpu def test_forward_replication_pad3d(): inp = torch.rand((1, 1, 3, 3, 3)) verify_model(torch.nn.ReplicationPad3d(3).eval(), inp) @@ -1421,6 +1489,7 @@ def test_forward_replication_pad3d(): verify_model(torch.nn.ReplicationPad3d((2, 3, 2, 5, 1, 4)).eval(), inp) +@gpu def test_forward_upsample3d(): inp = torch.arange(1, 9, dtype=torch.float32).view(1, 1, 2, 2, 2) verify_model(torch.nn.Upsample(scale_factor=2, mode='nearest').eval(), inp) @@ -1453,6 +1522,7 @@ def _gen_rand_inputs(num_boxes): verify_trace_model(NonMaxSupression(iou_thres), [in_boxes, in_scores]) +@gpu def test_conv3d(): for ishape in [(1, 32, 16, 16, 16), (1, 32, 9, 15, 15), @@ -1471,6 +1541,7 @@ def test_conv3d(): inp) +@gpu def test_conv3d_transpose(): for ishape in [(1, 8, 10, 5, 10), (1, 8, 5, 8, 8), @@ -1499,53 +1570,65 @@ def test_conv3d_transpose(): # Model tests +@gpu def test_resnet18(): torch.set_grad_enabled(False) verify_model("resnet18", atol=1e-4, rtol=1e-4) +@gpu def test_squeezenet1_0(): torch.set_grad_enabled(False) verify_model("squeezenet1_0", atol=1e-4, rtol=1e-4) +@gpu def test_squeezenet1_1(): torch.set_grad_enabled(False) verify_model("squeezenet1_1", atol=1e-4, rtol=1e-4) +@gpu def test_densenet121(): 
torch.set_grad_enabled(False) verify_model("densenet121", atol=1e-4, rtol=1e-4) +@gpu def test_inception_v3(): torch.set_grad_enabled(False) verify_model("inception_v3", atol=1e-4, rtol=1e-4) +@gpu def test_googlenet(): torch.set_grad_enabled(False) verify_model("googlenet", atol=1e-4, rtol=1e-4) +@gpu def test_mnasnet0_5(): torch.set_grad_enabled(False) verify_model("mnasnet0_5", atol=1e-4, rtol=1e-4) +@gpu def test_mobilenet_v2(): torch.set_grad_enabled(False) verify_model("mobilenet_v2", atol=1e-4, rtol=1e-4) """ #TODO: Fix VGG and AlexNet issues (probably due to pooling) +@gpu def test_alexnet(): torch.set_grad_enabled(False) verify_model("alexnet") +@gpu def test_vgg11(): torch.set_grad_enabled(False) verify_model("vgg11") +@gpu def test_vgg11_bn(): torch.set_grad_enabled(False) verify_model("vgg11_bn") """ +@gpu def test_custom_conversion_map(): def get_roi_align(): pool_size = 5 @@ -1575,6 +1658,7 @@ def _impl(inputs, input_types): verify_model(model, inputs, custom_map) +@gpu def test_segmentaton_models(): class SegmentationModelWrapper(Module): def __init__(self, model): @@ -1594,6 +1678,7 @@ def forward(self, inp): verify_model(SegmentationModelWrapper(deeplab.eval()), inp, atol=1e-4, rtol=1e-4) +@gpu def test_3d_models(): input_shape = (1, 3, 4, 56, 56) resnet3d = torchvision.models.video.r3d_18(pretrained=True).eval() @@ -1642,6 +1727,7 @@ def verify_model_vm(imodel, ishapes, idtype=torch.float, idata=None): rtol=1e-5, atol=1e-5) +@gpu def test_control_flow(): class SimpleIf(torch.nn.Module): def __init__(self, N, M): @@ -1755,6 +1841,7 @@ def forward(self, inp): verify_script_model(pt_model.eval(), [(10, 20)]) +@gpu def test_simple_rnn(): # The mixed tracing and scripting example from # https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html#mixing-scripting-and-tracing @@ -1792,6 +1879,7 @@ def forward(self, xs): verify_script_model(RNNLoop().eval(), [(10, 10, 4)]) +@gpu def test_forward_reduce_sum(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -1824,6 +1912,7 @@ def forward(self, *args): verify_model(ReduceSum5().float().eval(), input_data=input_data) +@gpu def test_forward_reduce_prod(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -1846,6 +1935,7 @@ def forward(self, *args): verify_model(ReduceProd3().float().eval(), input_data=input_data) +@gpu def test_forward_argmin(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -1868,6 +1958,7 @@ def forward(self, *args): verify_model(ArgMin3().float().eval(), input_data=input_data) +@gpu def test_forward_argmax(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -1890,6 +1981,7 @@ def forward(self, *args): verify_model(ArgMax3().float().eval(), input_data=input_data) +@gpu def test_forward_std(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -1942,6 +2034,7 @@ def forward(self, *args): verify_model(Std9().float().eval(), input_data=input_data) +@gpu def test_forward_variance(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -1994,6 +2087,7 @@ def forward(self, *args): verify_model(Variance9().float().eval(), input_data=input_data) +@gpu def test_forward_rsub(): torch.set_grad_enabled(False) @@ -2014,6 +2108,7 @@ def forward(self, *args): verify_model(Rsub2().float().eval(), input_data=[d1, d3]) +@gpu def test_forward_embedding(): torch.set_grad_enabled(False) @@ -2027,6 +2122,7 @@ def test_forward_embedding(): verify_model(torch.nn.Embedding(4, 5, sparse=True).float().eval(), input_data=input_data) +@gpu def 
test_forward_onehot(): torch.set_grad_enabled(False) @@ -2045,6 +2141,7 @@ def forward(self, *args): verify_model(OneHot2().float().eval(), input_data=input_data) +@gpu def test_forward_isfinite(): torch.set_grad_enabled(False) @@ -2056,6 +2153,7 @@ def forward(self, *args): verify_model(IsFinite1().float().eval(), input_data=input_data) +@gpu def test_forward_isnan(): torch.set_grad_enabled(False) @@ -2067,6 +2165,7 @@ def forward(self, *args): verify_model(IsNan1().float().eval(), input_data=input_data) +@gpu def test_forward_isinf(): torch.set_grad_enabled(False) @@ -2078,6 +2177,7 @@ def forward(self, *args): verify_model(IsInf1().float().eval(), input_data=input_data) +@gpu def test_forward_clamp(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -2100,6 +2200,7 @@ def forward(self, *args): verify_model(Clamp3().float().eval(), input_data=input_data) +@gpu def test_forward_ones(): torch.set_grad_enabled(False) @@ -2110,6 +2211,7 @@ def forward(self, *args): verify_model(Ones1().float().eval(), input_data=[]) +@gpu def test_forward_ones_like(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -2132,6 +2234,7 @@ def forward(self, *args): verify_model(OnesLike3().float().eval(), input_data=input_data) +@gpu def test_forward_zeros(): torch.set_grad_enabled(False) @@ -2142,6 +2245,7 @@ def forward(self, *args): verify_model(Zeros1().float().eval(), input_data=[]) +@gpu def test_forward_zeros_like(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -2164,6 +2268,7 @@ def forward(self, *args): verify_model(ZerosLike3().float().eval(), input_data=input_data) +@gpu def test_forward_full(): torch.set_grad_enabled(False) @@ -2179,6 +2284,7 @@ def forward(self, *args): verify_model(Full2().float().eval(), input_data=[]) +@gpu def test_forward_full_like(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -2200,6 +2306,7 @@ def forward(self, *args): verify_model(FullLike2().float().eval(), input_data=input_data) verify_model(FullLike3().float().eval(), input_data=input_data) +@gpu def test_forward_linspace(): torch.set_grad_enabled(False) @@ -2238,6 +2345,7 @@ def forward(self, *args): verify_model(Linspace8().float().eval()) +@gpu def test_forward_take(): torch.set_grad_enabled(False) class Take1(Module): @@ -2257,6 +2365,7 @@ def forward(self, *args): verify_model(Take2().float().eval(), input_data=[input_data, indices]) +@gpu def test_forward_topk(): torch.set_grad_enabled(False) class Topk1(Module): @@ -2293,6 +2402,7 @@ def forward(self, *args): verify_model(Topk6().float().eval(), input_data=input_data) +@gpu def test_forward_logical_not(): torch.set_grad_enabled(False) @@ -2313,6 +2423,7 @@ def forward(self, *args): verify_model(LogicalNot1().float().eval(), input_data=input_data) +@gpu def test_forward_bitwise_not(): torch.set_grad_enabled(False) @@ -2330,6 +2441,7 @@ def forward(self, *args): verify_model(BitwiseNot1().float().eval(), input_data=input_data) +@gpu def test_forward_bitwise_xor(): torch.set_grad_enabled(False) @@ -2356,6 +2468,7 @@ def forward(self, *args): verify_model(BitwiseXor2().float().eval(), input_data=[lhs]) +@gpu def test_forward_logical_xor(): torch.set_grad_enabled(False) @@ -2382,6 +2495,7 @@ def forward(self, *args): verify_model(LogicalXor2().float().eval(), input_data=[lhs]) +@gpu def test_forward_unary(): torch.set_grad_enabled(False) @@ -2504,6 +2618,7 @@ def forward(self, *args): verify_model(Neg1().float().eval(), input_data=input_data) +@gpu def test_forward_where(): torch.set_grad_enabled(False) @@ 
-2524,6 +2639,7 @@ def forward(self, *args): verify_model(Where2().float().eval(), input_data=[x, y]) +@gpu def test_forward_addcdiv(): torch.set_grad_enabled(False) @@ -2547,6 +2663,7 @@ def forward(self, *args): verify_model(Addcdiv2().float().eval(), input_data=[input_data, t1, t2]) +@gpu def test_forward_addcmul(): torch.set_grad_enabled(False) @@ -2569,6 +2686,7 @@ def forward(self, *args): t2 = torch.rand([1, 3]).float() verify_model(Addcmul2().float().eval(), input_data=[input_data, t1, t2]) +@gpu def test_forward_traced_function(): def fn(t1, t2): return t1 + t2 @@ -2577,6 +2695,7 @@ def fn(t1, t2): tensor2 = torch.randn(3, 4) verify_model(fn, input_data=[tensor1, tensor2]) +@gpu def test_forward_dtypes(): def fn(t1, t2): return 2.5 * t1 + t2 @@ -2600,12 +2719,14 @@ def forward(self, x): verify_model(ModuleWithIntParameters(param), input_data=inp) +@gpu def test_weight_names(): tm = torch.jit.trace(torch.nn.Linear(3, 4), [torch.randn(2, 3)]) mod, params = relay.frontend.from_pytorch(tm, [('input', (2, 3))]) assert set(params.keys()) == set(n for n, p in tm.named_parameters()) +@gpu def test_duplicate_weight_use(): # The test cases doesn't make any sense as a neural network, # the issue popped up in shared input/output embeddings of bert, @@ -2623,6 +2744,7 @@ def forward(self, x): verify_model(Test(), input_data=[torch.randn(5, 5)]) +@gpu def test_forward_matmul(): torch.set_grad_enabled(False) diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 799d9c20058aa..ef5a4d0f02279 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -49,6 +49,8 @@ from tvm.runtime.vm import VirtualMachine from packaging import version as package_version +from tvm.testing import device_enabled, requires_gpu, gpu + ####################################################################### # Generic run functions for TVM & tensorflow # ------------------------------------------ @@ -198,7 +200,7 @@ def name_without_num(name): for device in ["llvm", "cuda"]: ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) continue if no_gpu and device == 'cuda': @@ -262,6 +264,7 @@ def _test_pooling(input_shape, **kwargs): _test_pooling_iteration(input_shape, **kwargs) +@gpu def test_forward_pooling(): """ Pooling """ # TensorFlow only supports NDHWC for max_pool3d on CPU @@ -408,6 +411,7 @@ def _test_convolution(opname, tensor_in_sizes, filter_in_sizes, 'Placeholder:0', 'DepthwiseConv2dNative:0') +@gpu def test_forward_convolution(): if is_gpu_available(): _test_convolution('conv', [4, 176, 8, 8], [1, 1, 176, 32], [1, 1], [1, 1], 'SAME', 'NCHW') @@ -526,6 +530,7 @@ def _test_convolution3d(opname, tensor_in_sizes, filter_in_sizes, compare_tf_with_tvm(np.reshape(data_array, tensor_in_sizes).astype('float32'), 'Placeholder:0', 'Conv3D:0', cuda_layout="NCDHW") +@gpu def test_forward_convolution3d(): if is_gpu_available(): _test_convolution3d('conv', [4, 176, 8, 8, 8], [1, 1, 1, 176, 32], [1, 1, 1], [1, 1, 1], 'SAME', 'NCDHW') @@ -569,6 +574,7 @@ def _test_convolution3d_transpose(data_shape, filter_shape, strides, compare_tf_with_tvm(data_array, 'Placeholder:0', 'conv3d_transpose:0', cuda_layout="NDHWC") +@gpu def test_forward_convolution3d_transpose(): if is_gpu_available(): _test_convolution3d_transpose(data_shape=[1, 10, 8, 8, 8], @@ -655,6 +661,7 @@ def _test_biasadd(tensor_in_sizes, data_format): 'Placeholder:0', 
'BiasAdd:0') +@gpu def test_forward_biasadd(): if is_gpu_available(): _test_biasadd([4, 176, 8, 8], 'NCHW') @@ -1272,7 +1279,7 @@ def test_read_variable_op(): for device in ["llvm", "cuda"]: ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) continue @@ -2382,6 +2389,7 @@ def test_forward_mobilenet(): # -------- +@requires_gpu def test_forward_resnetv2(): '''test resnet model''' if is_gpu_available(): @@ -2399,7 +2407,7 @@ def test_forward_resnetv2(): sess, data, 'input_tensor:0', out_node + ':0') for device in ["llvm", "cuda"]: ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) continue tvm_output = run_tvm_graph(graph_def, data, 'input_tensor', len(tf_output), @@ -2431,7 +2439,7 @@ def _test_ssd_impl(): # TODO(kevinthesun): enable gpu test when VM heterogeneous execution is ready. for device in ["llvm"]: ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) continue tvm_output = run_tvm_graph(graph_def, data, in_node, len(out_node), @@ -3754,7 +3762,7 @@ def test_forward_dynamic_input_shape(): # TODO(kevinthesun): enable gpu test when VM heterogeneous execution is ready. for device in ["llvm"]: ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) continue tvm_output = run_tvm_graph(graph_def, np_data, ["data"], 1, diff --git a/tests/python/integration/test_ewise.py b/tests/python/integration/test_ewise.py index dfa247e5a09ad..41e5e02cbbda5 100644 --- a/tests/python/integration/test_ewise.py +++ b/tests/python/integration/test_ewise.py @@ -19,7 +19,9 @@ from tvm.contrib import nvcc import numpy as np import time +from tvm.testing import requires_gpu, device_enabled, gpu +@requires_gpu def test_exp(): # graph n = tvm.runtime.convert(1024) @@ -34,7 +36,7 @@ def test_exp(): # one line to build the function. def check_device(device, host="stackvm"): - if not tvm.runtime.enabled(host): + if not device_enabled(host): return ctx = tvm.context(device, 0) if not ctx.exist: @@ -55,6 +57,7 @@ def check_device(device, host="stackvm"): check_device("cuda", "llvm") check_device("vulkan") +@requires_gpu def test_fmod(): # graph def run(dtype): @@ -102,6 +105,7 @@ def check_device(device): run("float32") +@requires_gpu def test_multiple_cache_write(): # graph n = tvm.runtime.convert(1024) @@ -123,7 +127,7 @@ def test_multiple_cache_write(): s[C].bind(tx, te.thread_axis("threadIdx.x")) # one line to build the function. def check_device(device, host="stackvm"): - if not tvm.runtime.enabled(host): + if not device_enabled(host): return ctx = tvm.context(device, 0) if not ctx.exist: @@ -155,7 +159,7 @@ def test_log_pow_llvm(): # create iter var and assign them tags. bx, tx = s[B].split(B.op.axis[0], factor=32) # one line to build the function. 
- if not tvm.runtime.enabled("llvm"): + if not device_enabled("llvm"): return flog = tvm.build(s, [A, B], @@ -173,6 +177,7 @@ def test_log_pow_llvm(): b.asnumpy(), np.power(np.log(a.asnumpy()), 2.0), rtol=1e-5) +@gpu def test_popcount(): def run(dtype): # graph @@ -212,6 +217,7 @@ def check_device(device): run('uint64') +@requires_gpu def test_add(): def run(dtype): # graph @@ -264,6 +270,7 @@ def check_device(device): run("uint64") +@requires_gpu def try_warp_memory(): """skip this in default test because it require higher arch""" m = 128 diff --git a/tests/python/integration/test_gemm.py b/tests/python/integration/test_gemm.py index 12026da61394e..d805594533b8f 100644 --- a/tests/python/integration/test_gemm.py +++ b/tests/python/integration/test_gemm.py @@ -18,8 +18,10 @@ from tvm import te import numpy as np import time +from tvm.testing import requires_gpu, device_enabled +@requires_gpu def test_gemm(): # graph nn = 1024 @@ -82,7 +84,7 @@ def test_gemm(): # one line to build the function. def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("skip because %s is not enabled.." % device) return diff --git a/tests/python/integration/test_reduce.py b/tests/python/integration/test_reduce.py index c5d9d0875c3e5..8f7db09948c6c 100644 --- a/tests/python/integration/test_reduce.py +++ b/tests/python/integration/test_reduce.py @@ -17,8 +17,10 @@ import tvm from tvm import te import numpy as np +from tvm.testing import requires_gpu, device_enabled +@requires_gpu def test_reduce_prims(): def test_prim(reducer, np_reducer): # graph @@ -40,7 +42,7 @@ def test_prim(reducer, np_reducer): # one line to build the function. def check_device(device, host="llvm"): ctx = tvm.context(device, 0) - if not tvm.runtime.enabled(host): + if not device_enabled(host): return if not ctx.exist: print("skip because %s is not enabled.." % device) @@ -83,7 +85,7 @@ def test_rfactor(): s[BF].parallel(BF.op.axis[0]) # one line to build the function. def check_target(target="llvm"): - if not tvm.runtime.enabled(target): + if not device_enabled(target): return ctx = tvm.cpu(0) fapi = tvm.lower(s, args=[A, B]) @@ -113,7 +115,7 @@ def test_rfactor_factor_axis(): s[BF].parallel(BF.op.axis[0]) # one line to build the function. def check_target(target="llvm"): - if not tvm.runtime.enabled(target): + if not device_enabled(target): return ctx = tvm.cpu(0) fapi = tvm.lower(s, args=[A, B]) @@ -132,6 +134,7 @@ def check_target(target="llvm"): check_target() +@requires_gpu def test_rfactor_threads(): nn = 1027 mm = 10 @@ -182,6 +185,7 @@ def check_target(device, host="stackvm"): check_target("opencl") check_target("rocm") +@requires_gpu def test_rfactor_elemwise_threads(): n = 1025 m = 10 @@ -255,7 +259,7 @@ def fidentity(t0, t1): def check_target(): device = 'cpu' - if not tvm.runtime.enabled(device): + if not device_enabled(device): print("skip because %s is not enabled.." % device) return ctx = tvm.context(device, 0) @@ -280,6 +284,7 @@ def check_target(): check_target() +@requires_gpu def test_rfactor_argmax(): def fcombine(x, y): lhs = tvm.tir.Select((x[1] >= y[1]), x[0], y[0]) @@ -341,6 +346,7 @@ def check_target(device): check_target("vulkan") check_target("rocm") +@requires_gpu def test_warp_reduction1(): nthx = 32 nthy = 4 @@ -387,6 +393,7 @@ def check_target(device, m, n): # This is a bug in normal reduction. 
# check_target("cuda", m=10, n=37) +@requires_gpu def test_warp_reduction2(): def fcombine(x, y): return x[0] + y[0], x[1] * y[1] diff --git a/tests/python/integration/test_scan.py b/tests/python/integration/test_scan.py index 99553c3579d50..b876144312b3b 100644 --- a/tests/python/integration/test_scan.py +++ b/tests/python/integration/test_scan.py @@ -17,7 +17,9 @@ import tvm from tvm import te import numpy as np +from tvm.testing import device_enabled, requires_gpu +@requires_gpu def test_scan(): m = te.size_var("m") n = te.size_var("n") @@ -47,7 +49,7 @@ def test_scan(): # one line to build the function. def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("skip because %s is not enabled.." % device) return fscan = tvm.build(s, [X, res], diff --git a/tests/python/integration/test_tuning.py b/tests/python/integration/test_tuning.py index 95b94f693b39a..3364fb9bcb359 100644 --- a/tests/python/integration/test_tuning.py +++ b/tests/python/integration/test_tuning.py @@ -26,6 +26,8 @@ from tvm import autotvm from tvm.autotvm.tuner import RandomTuner +from tvm.testing import requires_gpu, device_enabled + @autotvm.template("testing/conv2d_no_batching") def conv2d_no_batching(N, H, W, CI, CO, KH, KW): """An example template for testing""" @@ -120,10 +122,11 @@ def get_sample_task(target=tvm.target.cuda(), target_host=None): target=target, target_host=target_host) return task, target +@requires_gpu def test_tuning(): def check(target, target_host): ctx = tvm.context(target, 0) - if not ctx.exist: + if not device_enabled(target): logging.info("Skip test because %s is not available" % target) return diff --git a/tests/python/integration/test_winograd_nnpack.py b/tests/python/integration/test_winograd_nnpack.py index 994a047df7424..c636305c15d80 100644 --- a/tests/python/integration/test_winograd_nnpack.py +++ b/tests/python/integration/test_winograd_nnpack.py @@ -25,6 +25,7 @@ import tvm.topi.testing from tvm.topi.util import get_const_tuple from pytest import skip +from tvm.testing import device_enabled def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False, @@ -60,8 +61,9 @@ def get_ref_data(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - skip("s is not enabled" % device) + if not device_enabled(device): + print("Skipping %s because it is not enabled" % device) + return print("Running on target: %s" % device) with tvm.target.create(device): C = topi.nn.conv2d(A, W, stride, padding, dilation, layout='NCHW', out_dtype=dtype) diff --git a/tests/python/nightly/quantization/test_quantization_accuracy.py b/tests/python/nightly/quantization/test_quantization_accuracy.py index d4b55f14100b0..ff1a1f8f0e46e 100644 --- a/tests/python/nightly/quantization/test_quantization_accuracy.py +++ b/tests/python/nightly/quantization/test_quantization_accuracy.py @@ -23,6 +23,7 @@ from mxnet import gluon import logging import os +from tvm.testing import requires_gpu logging.basicConfig(level=logging.INFO) @@ -112,6 +113,7 @@ def eval_acc(model, dataset, batch_fn, target=tvm.target.cuda(), ctx=tvm.gpu(), logging.info('[final] validation: acc-top1=%f acc-top5=%f', top1, top5) return top1 +@requires_gpu def test_quantize_acc(cfg, rec_val): qconfig = qtz.qconfig(skip_conv_layers=[0], nbit_input=cfg.nbit_input,
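The integration-test hunks above all make the same substitution: ad-hoc runtime probes (tvm.runtime.enabled(...), ctx.exist) become checks against the TVM_TEST_DEVICES allowlist, so excluded devices are skipped even when the hardware is physically present. A minimal sketch of the resulting idiom, with the build-and-run body elided, assuming only tvm.testing.device_enabled from this patch:

    import tvm
    from tvm.testing import device_enabled

    def check_device(device):
        # Honor TVM_TEST_DEVICES rather than probing the physical context.
        if not device_enabled(device):
            print("skip because %s is not enabled.." % device)
            return
        ctx = tvm.context(device, 0)
        # ... build the schedule and verify results on ctx ...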
diff --git a/tests/python/pytest.ini b/tests/python/pytest.ini new file mode 100644 index 0000000000000..b620ac4c4cbd0 --- /dev/null +++ b/tests/python/pytest.ini @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +[pytest] +markers = + gpu: mark a test as possibly using the gpu + tpu: mark a test as requiring a tpu + cuda: mark a test as requiring cuda + opencl: mark a test as requiring opencl
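The markers registered above are what the tvm.testing decorators attach to tests, and they are how CI nodes partition work. A hedged sketch of the intended usage; the pytest invocation in the final comment is an assumption for illustration, not something this patch adds:

    import tvm.testing

    @tvm.testing.gpu            # may use the gpu, but also runs elsewhere
    def test_may_use_gpu():
        ...

    @tvm.testing.requires_gpu   # skipped entirely when no gpu is available
    def test_needs_gpu():
        ...

    # A CPU-only node can then deselect gpu-marked tests, e.g.:
    #   pytest tests/python -m "not gpu"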
diff --git a/tests/python/relay/dyn/test_dynamic_op_level10.py b/tests/python/relay/dyn/test_dynamic_op_level10.py index 95a030f5b8aeb..742d8c0e5dd6c 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level10.py +++ b/tests/python/relay/dyn/test_dynamic_op_level10.py @@ -22,11 +22,13 @@ import numpy as np import tvm from tvm import relay -from tvm.relay.testing import ctx_list, run_infer_type +from tvm.relay.testing import run_infer_type import tvm.topi.testing import random +from tvm.testing import enabled_devices, gpu +@gpu def test_dyn_broadcast_to(): dtype = 'uint8' rank = 3 @@ -44,7 +46,7 @@ def test_dyn_broadcast_to(): x = np.random.uniform(size=x_shape).astype(dtype) dyn_shape = (1, ) * rank ref_res = np.broadcast_to(x, dyn_shape) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): if (target != 'cuda'): #skip cuda because we don't have dynamic support for GPU for kind in ["vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func) @@ -53,6 +55,7 @@ tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) +@gpu def test_dyn_one_hot(): def _get_oshape(indices_shape, depth, axis): oshape = [] @@ -77,7 +80,7 @@ def _verify(indices_shape, depth, on_value, off_value, axis, dtype): func = relay.Function([indices, depth_var], out) indices_np = np.random.randint(0, depth, size=indices_shape).astype("int32") out_np = tvm.topi.testing.one_hot(indices_np, on_value, off_value, depth, axis, dtype) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): if (target != 'cuda'): #skip cuda because we don't have dynamic support for GPU for kind in ["vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func) diff --git a/tests/python/relay/dyn/test_dynamic_op_level2.py b/tests/python/relay/dyn/test_dynamic_op_level2.py index e1a0d284d9bfc..c73bfbe59847b 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level2.py +++ b/tests/python/relay/dyn/test_dynamic_op_level2.py @@ -21,7 +21,7 @@ import tvm from tvm import relay from tvm import te -from tvm.relay.testing import ctx_list +from tvm.testing import enabled_devices import random from test_dynamic_op_level3 import verify_func import tvm.topi.testing @@ -51,7 +51,7 @@ def verify_upsampling(dshape, scale_h, scale_w, layout, method, align_corners=Fa zz = run_infer_type(z) func = relay.Function([x, scale_h_var, scale_w_var], z) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): if "llvm" not in target: continue for kind in ["vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func) diff --git a/tests/python/relay/dyn/test_dynamic_op_level3.py b/tests/python/relay/dyn/test_dynamic_op_level3.py index 91e9cc77fe99f..a8a87517493f1 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level3.py +++ b/tests/python/relay/dyn/test_dynamic_op_level3.py @@ -22,11 +22,12 @@ from tvm import te from tvm import relay from tvm.relay import create_executor, transform -from tvm.relay.testing import ctx_list, check_grad, run_infer_type +from tvm.relay.testing import check_grad, run_infer_type +from tvm.testing import enabled_devices, gpu def verify_func(func, data, ref_res): assert isinstance(data, list) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): #TODO(mbrookhart): enable Cuda tests onces the VM supports dynamic shapes if "llvm" not in target: continue for kind in ["vm", "debug"]: @@ -36,6 +37,7 @@ def verify_func(func, data, ref_res): tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) relay.backend.compile_engine.get().clear() +@gpu def test_dyn_reshape(): def verify_reshape(shape, newshape, oshape): x = relay.var("x", relay.TensorType(shape, "float32")) @@ -60,6 +63,7 @@ def verify_reshape(shape, newshape, oshape): verify_reshape((2, 3, 4, 5), (-3, -3), (6, 20)) verify_reshape((2, 3, 4), (0, -3), (2, 12)) +@gpu def test_dyn_shape_reshape(): def verify_reshape(shape, newshape, oshape): x = relay.var("x", relay.TensorType(shape, "float32")) @@ -76,6 +80,7 @@ def verify_reshape(shape, newshape, oshape): verify_reshape((2, 3, 4), (8, 3), (8, 3)) verify_reshape((4, 7), (2, 7, 2), (2, 7, 2)) +@gpu def test_dyn_tile(): def verify_tile(dshape, reps): x = relay.var("x", relay.TensorType(dshape, "float32")) @@ -92,6 +97,7 @@ def verify_tile(dshape, reps): verify_tile((2, 3), (3, 2, 1)) +@gpu def test_dyn_zeros_ones(): def verify_zeros_ones(shape, dtype): for op, ref in [(relay.zeros, np.zeros), (relay.ones, np.ones)]: @@ -107,6 +113,7 @@ def verify_zeros_ones(shape, dtype): verify_zeros_ones((1, 3), 'int64') verify_zeros_ones((8, 9, 1, 2), 'float32') +@gpu def test_dyn_full(): def verify_full(fill_value, src_shape, dtype): x = relay.var("x", relay.scalar_type(dtype)) diff --git a/tests/python/relay/dyn/test_dynamic_op_level5.py b/tests/python/relay/dyn/test_dynamic_op_level5.py index 8dcfd1fd5778a..1d3c974de21d3 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level5.py +++ b/tests/python/relay/dyn/test_dynamic_op_level5.py @@ -22,8 +22,9 @@ from tvm import te from tvm import relay from tvm.relay import transform -from tvm.relay.testing import ctx_list, run_infer_type +from tvm.relay.testing import run_infer_type import tvm.topi.testing +from tvm.testing import enabled_devices, gpu def test_resize_infer_type(): @@ -35,6 +36,7 @@ assert zz.checked_type == relay.TensorType((n, c, relay.Any(), relay.Any()), "int8") +@gpu def test_resize(): def verify_resize(dshape, scale, method, layout): if layout == "NHWC": size = (dshape[1] * scale, dshape[2] * scale) else: size = (dshape[2] * scale, dshape[3] * scale) size = np.array(size).astype("int64") x_data = np.random.uniform(size=dshape).astype("float32") size_var = relay.var("size", relay.TensorType((2,), "int64")) coord_trans = "asymmetric" if method == "nearest_neighbor" else "align_corners" z = relay.image.resize(x, size_var, layout, method, coord_trans) zz = run_infer_type(z) func = relay.Function([x, size_var], z) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): if "llvm" not in target: continue for kind in ["vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func) diff --git a/tests/python/relay/dyn/test_dynamic_op_level6.py b/tests/python/relay/dyn/test_dynamic_op_level6.py index ddfab552ed83d..9019dad5563fe 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level6.py +++ b/tests/python/relay/dyn/test_dynamic_op_level6.py @@ -21,8 +21,9
@@ import tvm from tvm import te from tvm import relay -from tvm.relay.testing import ctx_list +from tvm.testing import enabled_devices, gpu +@gpu def test_dynamic_topk(): def verify_topk(k, axis, ret_type, is_ascend, dtype): shape = (20, 100) @@ -51,7 +52,7 @@ def verify_topk(k, axis, ret_type, is_ascend, dtype): np_values[i, :] = np_data[i, np_indices[i, :]] np_indices = np_indices.astype(dtype) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): if "llvm" not in target: continue for kind in ["vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func) diff --git a/tests/python/relay/test_backend_compile_engine.py b/tests/python/relay/test_backend_compile_engine.py index 6bc170d2b5af1..a16e4668b0602 100644 --- a/tests/python/relay/test_backend_compile_engine.py +++ b/tests/python/relay/test_backend_compile_engine.py @@ -23,6 +23,7 @@ from tvm import topi from tvm.relay.testing import run_infer_type from tvm.relay.testing.temp_op_attr import TempOpAttr +from tvm.testing import gpu, device_enabled @autotvm.register_topi_compute("test/conv2d_1") @@ -146,6 +147,7 @@ def _select_impl(dshape, wshape, use_autotvm=False): impl, _ = _select_impl((1, 16, 7, 7), (32, 16, 3, 3), True) assert impl.name == "conv2d_1" +@gpu def test_compile_engine(): engine = relay.backend.compile_engine.get() def get_func(shape): @@ -161,7 +163,7 @@ def get_func(shape): z3 = engine.lower(get_func(()), "llvm") assert z1.same_as(z2) assert not z3.same_as(z1) - if tvm.context("cuda").exist: + if device_enabled("cuda"): z4 = engine.lower(get_func(()), "cuda") assert not z3.same_as(z4) diff --git a/tests/python/relay/test_backend_graph_runtime.py b/tests/python/relay/test_backend_graph_runtime.py index f0785bcf1c097..4e7a2bdfc5f25 100644 --- a/tests/python/relay/test_backend_graph_runtime.py +++ b/tests/python/relay/test_backend_graph_runtime.py @@ -20,7 +20,7 @@ from tvm import relay from tvm.contrib import graph_runtime from tvm.relay.op import add -from tvm.relay.testing.config import ctx_list +from tvm.testing import enabled_devices, gpu # @tq, @jr should we put this in testing ns? 
def check_rts(expr, args, expected_result, mod=None): @@ -141,6 +141,7 @@ def test_plan_memory(): assert len(device_types) == 1 +@gpu def test_gru_like(): def unit(rnn_dim): X = relay.var("X", shape=(1, rnn_dim)) @@ -165,7 +166,7 @@ def unit_numpy(X, W): out_shape = (1, rnn_dim) z = unit(rnn_dim) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): with tvm.transform.PassContext(opt_level=2): graph, lib, params = relay.build(tvm.IRModule.from_expr(z), target) m = graph_runtime.create(graph, lib, ctx) diff --git a/tests/python/relay/test_cpp_build_module.py b/tests/python/relay/test_cpp_build_module.py index fa56eb0eef295..c6ca57fff3235 100644 --- a/tests/python/relay/test_cpp_build_module.py +++ b/tests/python/relay/test_cpp_build_module.py @@ -20,6 +20,7 @@ from tvm import te from tvm import relay from tvm.contrib.nvcc import have_fp16 +from tvm.testing import gpu, requires_cuda, device_enabled def test_basic_build(): @@ -64,13 +65,10 @@ def test_basic_build(): atol=1e-5, rtol=1e-5) +@requires_cuda def test_fp16_build(): dtype = "float16" - if not tvm.runtime.enabled("cuda") or not tvm.gpu(0).exist: - print("skip because cuda is not enabled.") - return ctx = tvm.gpu(0) if dtype == "float16" and not have_fp16(ctx.compute_version): print("skip because gpu does not support fp16") return @@ -100,12 +98,13 @@ def test_fp16_build(): atol=1e-5, rtol=1e-5) +@gpu def test_fp16_conversion(): def check_conversion(tgt, ctx): if not tvm.runtime.enabled(tgt): print("skip because {} is not enabled.".format(tgt)) return - elif tgt == "cuda" and ctx.exist and not have_fp16(ctx.compute_version): + elif tgt == "cuda" and device_enabled(tgt) and not have_fp16(ctx.compute_version): print("skip because gpu does not support fp16") return diff --git a/tests/python/relay/test_op_grad_level1.py b/tests/python/relay/test_op_grad_level1.py index 437901ee95fcc..467d4b1784f0a 100644 --- a/tests/python/relay/test_op_grad_level1.py +++ b/tests/python/relay/test_op_grad_level1.py @@ -20,8 +20,9 @@ import tvm from tvm import te from tvm import relay -from tvm.relay.testing import check_grad, ctx_list, run_infer_type +from tvm.relay.testing import check_grad, run_infer_type from tvm.relay.transform import gradient +from tvm.testing import gpu, enabled_devices def sigmoid(x): @@ -35,6 +36,7 @@ def relu(x): return x_copy +@gpu def test_unary_op(): def check_single_op(opfunc, ref, dtype): shape = (10, 4) @@ -49,7 +51,7 @@ def check_single_op(opfunc, ref, dtype): fwd_func = run_infer_type(fwd_func) bwd_func = run_infer_type(gradient(fwd_func)) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp = relay.create_executor(ctx=ctx, target=target) op_res, (op_grad, ) = intrp.evaluate(bwd_func)(data) np.testing.assert_allclose(op_grad.asnumpy(), ref_grad, rtol=0.01) @@ -79,6 +81,7 @@ def check_single_op(opfunc, ref, dtype): check_single_op(opfunc, ref, dtype) +@gpu def test_binary_op(): def inst(vars, sh): return [vars.get(s, s) for s in sh] @@ -97,7 +100,7 @@ def check_binary_op(opfunc, ref, dtype): fwd_func = run_infer_type(fwd_func) bwd_func = run_infer_type(gradient(fwd_func)) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp = relay.create_executor(ctx=ctx, target=target) op_res, (op_grad0, op_grad1) = intrp.evaluate(bwd_func)(x_data, y_data) np.testing.assert_allclose(op_grad0.asnumpy(), ref_grad0, rtol=0.01) diff --git a/tests/python/relay/test_op_grad_level2.py b/tests/python/relay/test_op_grad_level2.py index 50e358564fbb8..0479608f56b75 ---
a/tests/python/relay/test_op_grad_level2.py +++ b/tests/python/relay/test_op_grad_level2.py @@ -21,8 +21,9 @@ import tvm from tvm import te from tvm import relay -from tvm.relay.testing import check_grad, ctx_list, run_infer_type +from tvm.relay.testing import check_grad, run_infer_type from tvm.relay.transform import gradient +from tvm.testing import enabled_devices, gpu def verify_max_pool2d_grad(x_shape, pool_size, strides, padding, ceil_mode): @@ -43,12 +44,13 @@ def verify_max_pool2d_grad(x_shape, pool_size, strides, padding, ceil_mode): padding=[ph, pw, ph, pw], pool_type='max', ceil_mode=ceil_mode) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp = relay.create_executor(ctx=ctx, target=target) op_res, (op_grad, ) = intrp.evaluate(bwd_func)(data) np.testing.assert_allclose(op_grad.asnumpy(), ref_grad, rtol=0.01) +@gpu def test_max_pool2d_grad(): verify_max_pool2d_grad((1, 4, 16, 16), pool_size=(2, 2), strides=(2, 2), padding=(0, 0), ceil_mode=False) verify_max_pool2d_grad((1, 4, 16, 16), pool_size=(1, 1), strides=(1, 1), padding=(1, 1), ceil_mode=False) @@ -72,11 +74,12 @@ def verify_avg_pool2d_grad(x_shape, pool_size, strides, padding, ceil_mode, coun padding=[ph, pw, ph, pw], pool_type='avg', ceil_mode=ceil_mode) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp = relay.create_executor(ctx=ctx, target=target) op_res, (op_grad, ) = intrp.evaluate(bwd_func)(data) np.testing.assert_allclose(op_grad.asnumpy(), ref_grad, rtol=0.01) +@gpu def test_avg_pool2d_grad(): verify_avg_pool2d_grad((1, 4, 16, 16), pool_size=(2, 2), strides=(2, 2), padding=(0, 0), ceil_mode=False, count_include_pad=True) @@ -100,11 +103,12 @@ def verify_global_avg_pool2d_grad(x_shape): strides=(1, 1), padding=[0, 0, 0, 0], pool_type='avg', ceil_mode=False) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp = relay.create_executor(ctx=ctx, target=target) op_res, (op_grad, ) = intrp.evaluate(bwd_func)(data) np.testing.assert_allclose(op_grad.asnumpy(), ref_grad, rtol=0.01) +@gpu def test_global_avg_pool2d_grad(): verify_global_avg_pool2d_grad((1, 4, 16, 16)) verify_global_avg_pool2d_grad((1, 8, 8, 24)) @@ -139,7 +143,7 @@ def verify_conv2d_grad(dshape, wshape, strides, padding, dilation, groups=1, mod .detach().numpy() - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): data = tvm.nd.array(data_pt.detach().numpy(), ctx) weight = tvm.nd.array(weight_pt.detach().numpy(), ctx) intrp = relay.create_executor(ctx=ctx, target=target) @@ -148,6 +152,7 @@ def verify_conv2d_grad(dshape, wshape, strides, padding, dilation, groups=1, mod np.testing.assert_allclose(grad_weight.asnumpy(), grad_weight_pt, rtol=1e-4, atol=1e-4) +@gpu def test_conv2d_grad(): verify_conv2d_grad((1, 4, 16, 16), (16, 4, 3, 3), [1, 1], [1, 1], [1, 1]) verify_conv2d_grad((1, 4, 16, 16), (16, 4, 1, 1), [1, 1], [0, 0], [1, 1]) diff --git a/tests/python/relay/test_op_grad_level3.py b/tests/python/relay/test_op_grad_level3.py index 8ca1eaea5ed2b..3e69252743fe3 100644 --- a/tests/python/relay/test_op_grad_level3.py +++ b/tests/python/relay/test_op_grad_level3.py @@ -20,10 +20,12 @@ import tvm from tvm import te from tvm import relay -from tvm.relay.testing import check_grad, ctx_list, run_infer_type +from tvm.relay.testing import check_grad, run_infer_type from tvm.relay.transform import gradient +from tvm.testing import enabled_devices, gpu +@gpu def test_clip(): for dtype in ('float32', 'float64'): ref = (lambda x: np.where(x > 10.0, np.zeros_like(x), 
@@ -37,7 +39,7 @@ def test_clip(): fwd_func = run_infer_type(fwd_func) bwd_func = run_infer_type(gradient(fwd_func)) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp = relay.create_executor(ctx=ctx, target=target) op_res, (op_grad, ) = intrp.evaluate(bwd_func)(data) np.testing.assert_allclose(op_grad.asnumpy(), ref_grad, rtol=0.01) diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index 4616a14dcdde9..40d8388d5832a 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -21,9 +21,10 @@ import scipy from tvm import relay from tvm.relay import transform -from tvm.relay.testing import ctx_list, run_infer_type +from tvm.relay.testing import run_infer_type import tvm.topi.testing from tvm.contrib.nvcc import have_fp16 +from tvm.testing import enabled_devices, gpu def sigmoid(x): @@ -39,6 +40,7 @@ def rsqrt(x): one = np.ones_like(x) return one / np.sqrt(x) +@gpu def test_unary_op(): def check_single_op(opfunc, ref, dtype): shape = (10, 4) @@ -56,7 +58,7 @@ def check_single_op(opfunc, ref, dtype): data = np.random.rand(*shape).astype(dtype) ref_res = ref(data) func = relay.Function([x], y) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): # use graph by execuor default for testing, as we need # create function explicitly to avoid constant-folding. if dtype == 'float16' and target == 'cuda' and not have_fp16(tvm.gpu(0).compute_version): @@ -82,6 +84,7 @@ def check_single_op(opfunc, ref, dtype): check_single_op(opfunc, ref, dtype) +@gpu def test_binary_op(): def inst(vars, sh): return [vars.get(s, s) for s in sh] @@ -112,7 +115,7 @@ def check_binary_op(opfunc, ref, dtype): ref_res = ref(x_data, y_data) func = relay.Function([x, y], z) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): # use graph by execuor default for testing, as we need # create function explicitly to avoid constant-folding. 
if dtype == 'float16' and target == 'cuda' and not have_fp16(tvm.gpu(0).compute_version): @@ -131,12 +134,13 @@ def check_binary_op(opfunc, ref, dtype): check_binary_op(opfunc, ref, dtype) +@gpu def test_expand_dims(): # based on topi test def verify_expand_dims(dshape, dtype, oshape, axis, num_newaxis): x = relay.Var("x", relay.TensorType(dshape, dtype)) func = relay.Function([x], relay.expand_dims(x, axis, num_newaxis)) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): if dtype == 'float16' and target == 'cuda' and not have_fp16(tvm.gpu(0).compute_version): continue data = np.random.uniform(size=dshape).astype(dtype) @@ -149,6 +153,7 @@ def verify_expand_dims(dshape, dtype, oshape, axis, num_newaxis): verify_expand_dims((3, 10), dtype, (1, 3, 10), -3, 1) +@gpu def test_bias_add(): for dtype in ['float16', 'float32']: xshape=(10, 2, 3, 4) @@ -165,7 +170,7 @@ def test_bias_add(): x_data = np.random.uniform(size=xshape).astype(dtype) y_data = np.random.uniform(size=bshape).astype(dtype) ref_res = x_data + y_data.reshape((2, 1, 1)) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): if dtype == 'float16' and target == 'cuda' and not have_fp16(tvm.gpu(0).compute_version): continue intrp = relay.create_executor("graph", ctx=ctx, target=target) @@ -183,6 +188,7 @@ def test_expand_dims_infer_type(): assert yy.checked_type == relay.TensorType((n, t, 1, 100), dtype) +@gpu def test_softmax(): for dtype in ['float16', 'float32']: # Softmax accuracy for float16 is poor @@ -197,12 +203,13 @@ def test_softmax(): func = relay.Function([x], y) x_data = np.random.uniform(size=shape).astype(dtype) ref_res = tvm.topi.testing.softmax_python(x_data) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp = relay.create_executor("graph", ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data) np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) +@gpu def test_log_softmax(): for dtype in ['float16', 'float32']: # Softmax accuracy for float16 is poor @@ -217,12 +224,13 @@ def test_log_softmax(): func = relay.Function([x], y) x_data = np.random.uniform(size=shape).astype(dtype) ref_res = tvm.topi.testing.log_softmax_python(x_data) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp = relay.create_executor("graph", ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data) np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) +@gpu def test_concatenate(): for dtype in ['float16', 'float32']: n, t, d = te.size_var("n"), te.size_var("t"), 100 @@ -266,7 +274,7 @@ def test_concatenate(): t_data = np.random.uniform(size=()).astype(dtype) ref_res = np.concatenate((x_data, y_data), axis=1) + t_data - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): if dtype == 'float16' and target == 'cuda' and not have_fp16(tvm.gpu(0).compute_version): continue intrp1 = relay.create_executor("graph", ctx=ctx, target=target) @@ -345,6 +353,7 @@ def test_dense_type_check(): y = relay.nn.dense(x, w) yy = run_infer_type(y) +@gpu def test_dense(): for dtype in ['float16', 'float32']: # Dense accuracy for float16 is poor @@ -383,7 +392,7 @@ def test_dense(): w_data = np.random.rand(2, 5).astype(dtype) ref_res = np.dot(x_data, w_data.T) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) intrp2 = relay.create_executor("debug", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(x_data, w_data) diff --git 
diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py index c0a990ba9d2e8..498a764f60299 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -22,11 +22,13 @@ import tvm.topi.testing from tvm import relay from tvm.relay import transform -from tvm.relay.testing import ctx_list, run_infer_type +from tvm.relay.testing import run_infer_type from tvm import topi import tvm.topi.testing +from tvm.testing import enabled_devices, gpu +@gpu def test_checkpoint(): dtype = "float32" xs = [relay.var("x{}".format(i), dtype) for i in range(4)] @@ -38,7 +40,7 @@ def test_checkpoint(): assert f.checked_type == f_checkpoint.checked_type inputs = [np.random.uniform() for _ in range(len(xs))] - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) f_res = intrp.evaluate(f)(*inputs) @@ -148,6 +150,7 @@ def test_checkpoint_alpha_equal_tuple(): tvm.ir.assert_structural_equal(df, df_parsed) +@gpu def test_collapse_sum_like(): shape = (3, 4, 5, 6) shape_like = (4, 5, 6) @@ -162,13 +165,14 @@ def test_collapse_sum_like(): x = np.random.uniform(size=shape).astype(dtype) y = np.random.uniform(size=shape_like).astype(dtype) ref_res = np.sum(x, 0) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(x, y) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) +@gpu def test_collapse_sum_to(): shape = (3, 4, 5, 6) shape_to = (4, 5, 6) @@ -181,13 +185,14 @@ def test_collapse_sum_to(): func = relay.Function([x], z) x = np.random.uniform(size=shape).astype(dtype) ref_res = np.sum(x, 0) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(x) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) +@gpu def test_broadcast_to(): shape = (4, 1, 6) shape_like = (3, 4, 5, 6) @@ -200,12 +205,13 @@ def test_broadcast_to(): func = relay.Function([x], z) x = np.random.uniform(size=shape).astype(dtype) ref_res = np.broadcast_to(x, shape_like) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(x) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) +@gpu def test_broadcast_to_like(): shape = (4, 1, 6) shape_like = (3, 4, 5, 6) @@ -222,7 +228,7 @@ def test_broadcast_to_like(): y = np.random.uniform(size=shape_like).astype(dtype) ref_res = np.broadcast_to(x, shape_like) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(x, y) @@ -266,12 +272,13 @@ def verify_slice_like(data, slice_like, axes, output, dtype="float32"): y_data = np.random.uniform(size=slice_like).astype(dtype) ref_res = np_slice_like(x_data, y_data, axes) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data, y_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) +@gpu def test_slice_like(): d1, d2, d3, d4 = te.var("d1"), te.var("d2"),
te.var("d3"), te.var("d4") verify_slice_like(data=(d1, d2, d3), slice_like=(1, 2, 3), axes=None, output=(1, 2, 3)) @@ -286,6 +293,7 @@ def test_slice_like(): axes=(2, 3), output=(1, 3, 112, 112)) +@gpu def test_reverse_reshape(): def verify_reverse_reshape(shape, newshape, oshape): x = relay.var("x", relay.TensorType(shape, "float32")) @@ -297,7 +305,7 @@ def verify_reverse_reshape(shape, newshape, oshape): func = relay.Function([x], z) x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32") ref_res = np.reshape(x_data, oshape) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data) @@ -320,12 +328,13 @@ def verify_batch_matmul(x_shape, y_shape, out_shape, dtype="float32"): y_np = np.random.uniform(size=y_shape).astype(dtype) z_np = tvm.topi.testing.batch_matmul(x_np, y_np) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) z = intrp.evaluate(func)(x_np, y_np) tvm.testing.assert_allclose(z.asnumpy(), z_np, rtol=1e-5) +@gpu def test_batch_matmul(): b, m, n, k = te.size_var("b"), te.size_var("m"), te.size_var("n"), te.size_var("k") x = relay.var("x", relay.TensorType((b, m, k), "float32")) @@ -339,13 +348,14 @@ def test_batch_matmul(): verify_batch_matmul((5, 16, 32), (5, 20, 32), (5, 16, 20)) verify_batch_matmul((30, 16, 32), (30, 20, 32), (30, 16, 20)) +@gpu def test_shape_of(): shape = (10, 5, 12) x = relay.var("x", shape=shape) func = relay.Function([x], relay.op.shape_of(x)) func = run_infer_type(func) x_data = np.random.rand(*shape).astype('float32') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): # Because using graph executor, this op will be optimized after # constant folding pass, here we only test with interpreter for kind in ["debug"]: @@ -354,6 +364,7 @@ def test_shape_of(): tvm.testing.assert_allclose(op_res.asnumpy(), np.array(shape).astype('int32')) +@gpu def test_ndarray_size(): def verify_ndarray_size(shape): x = relay.var("x", shape=shape) @@ -362,7 +373,7 @@ def verify_ndarray_size(shape): x_data = np.random.uniform(size=shape).astype("float32") ref_res = np.size(x_data) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data) @@ -380,7 +391,7 @@ def verify_adaptive_pool(dshape, out_size, pool_type, layout, dtype, opfunc): np_data = np.random.uniform(low=0, high=255, size=dshape).astype(dtype) np_out = tvm.topi.testing.adaptive_pool(np_data, out_size, pool_type, layout) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) relay_out = intrp1.evaluate(func)(np_data) tvm.testing.assert_allclose(relay_out.asnumpy(), np_out, rtol=1e-5, atol=1e-5) @@ -396,6 +407,7 @@ def verify_adaptive_pool3d(dshape, out_size, pool_type, layout="NCHW", dtype="fl verify_adaptive_pool(dshape, out_size, pool_type, layout, dtype, opfunc) +@gpu def test_adaptive_pool(): verify_adaptive_pool2d((1, 9, 224, 224), (1, 1), "max") verify_adaptive_pool2d((1, 3, 224, 224), (2, 3), "avg") @@ -409,6 +421,7 @@ def test_adaptive_pool(): verify_adaptive_pool3d((1, 16, 32, 32, 32), (2, 4, 4), "max", layout="NDHWC") +@gpu def test_sequence_mask(): def _verify(data_shape, mask_value, axis, 
dtype, itype): max_length = data_shape[axis] @@ -423,7 +436,7 @@ def _verify(data_shape, mask_value, axis, dtype, itype): valid_length_np = np.random.randint(0, max_length, size=nbatch).astype(itype) gt_out_np = tvm.topi.testing.sequence_mask(data_np, valid_length_np, mask_value, axis) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) out_relay = intrp.evaluate(func)(data_np, valid_length_np) @@ -432,6 +445,7 @@ def _verify(data_shape, mask_value, axis, dtype, itype): _verify((2, 3, 5, 3), 0.0, 0, 'float32', 'int64') _verify((5, 8, 3), 0.1, 1, 'float64', 'float32') +@gpu def test_one_hot(): def _get_oshape(indices_shape, depth, axis): oshape = [] @@ -458,7 +472,7 @@ def _verify(indices_shape, depth, on_value, off_value, axis, dtype): indices_np = np.random.randint(0, depth, size=indices_shape).astype("int32") out_np = tvm.topi.testing.one_hot(indices_np, on_value, off_value, depth, axis, dtype) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) out_relay = intrp.evaluate(func)(indices_np) diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index 6258d8c9aaf81..7c8f6c49d1981 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -22,12 +22,14 @@ from tvm import autotvm from tvm import relay from tvm.relay import transform -from tvm.relay.testing import ctx_list, run_infer_type +from tvm.relay.testing import run_infer_type from tvm.contrib import util import tvm.topi.testing from tvm.topi.cuda.conv3d_winograd import _infer_tile_size +from tvm.testing import enabled_devices, gpu +@gpu def test_conv1d_infer_type(): # symbolic in batch dimension n, c, w = te.var("n"), 10, 224 @@ -78,6 +80,7 @@ def test_conv1d_infer_type(): (n, w, 16), "int32") +@gpu def test_conv1d_run(): def run_test_conv1d(dtype, out_dtype, scale, dshape, kshape, padding=(1, 1), @@ -100,9 +103,10 @@ def run_test_conv1d(dtype, out_dtype, scale, dshape, kshape, ref_res = tvm.topi.testing.conv1d_ncw_python( data.astype(out_dtype), kernel.astype(out_dtype), 1, padding, dilation) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): if target in except_targets: continue + ctx = tvm.context(target, 0) intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(data, kernel) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) @@ -122,6 +126,7 @@ def run_test_conv1d(dtype, out_dtype, scale, dshape, kshape, padding=(1, 1), channels=10, kernel_size=3, dilation=3) +@gpu def test_conv2d_infer_type(): # symbolic in batch dimension n, c, h, w = te.size_var("n"), 10, 224, 224 @@ -189,6 +194,7 @@ def test_conv2d_infer_type(): (n, h, w, 16), "int32") +@gpu def test_conv2d_run(): def run_test_conv2d(dtype, out_dtype, scale, dshape, kshape, padding=(1, 1), @@ -219,9 +225,10 @@ def run_test_conv2d(dtype, out_dtype, scale, dshape, kshape, ref_res = fref(data.astype(out_dtype), dkernel.astype(out_dtype)) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): if target in except_targets: continue + ctx = tvm.context(target, 0) intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(data, kernel) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-4, atol=1e-4) @@ -314,6 +321,7 @@ def 
compile_test_conv2d_arm_cpu(dtype, out_dtype, scale, dshape, kshape, run_test_conv2d("float32", "float32", 1, dshape, kshape, padding=(1, 1), channels=10, kernel_size=(3 ,3), dilation=(3, 3)) +@gpu def test_conv2d_winograd(): class WinogradFallback(autotvm.FallbackContext): def _query_inside(self, target, workload): @@ -357,9 +365,10 @@ def run_test_conv2d_cuda(dtype, out_dtype, scale, dshape, kshape, groups=groups) with WinogradFallback(), tvm.transform.PassContext(opt_level=3): - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): if target != 'cuda': continue + ctx = tvm.context(target, 0) params = {'w': tvm.nd.array(kernel)} graph, lib, params = relay.build_module.build(mod, target=target, params=params) module = tvm.contrib.graph_runtime.create(graph, lib, ctx) @@ -385,6 +394,7 @@ def run_test_conv2d_cuda(dtype, out_dtype, scale, dshape, kshape, padding=(2, 2), channels=192, kernel_size=(7, 7)) +@gpu def test_conv3d_infer_type(): # symbolic in batch dimension n, c, d, h, w = te.size_var("n"), 10, 224, 224, 224 @@ -435,6 +445,7 @@ def test_conv3d_infer_type(): (n, d, h, w, 16), "int32") +@gpu def test_conv3d_run(): def run_test_conv3d(dtype, out_dtype, scale, dshape, kshape, padding=(1, 1, 1), @@ -465,9 +476,10 @@ def run_test_conv3d(dtype, out_dtype, scale, dshape, kshape, ref_res = fref(data.astype(out_dtype), dkernel.astype(out_dtype)) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): if target in except_targets: continue + ctx = tvm.context(target, 0) intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(data, kernel) @@ -479,6 +491,7 @@ def run_test_conv3d(dtype, out_dtype, scale, dshape, kshape, run_test_conv3d("float32", "float32", 1, dshape, kshape, padding=(1, 1, 1), channels=10, kernel_size=(3, 3 ,3)) +@gpu def test_conv3d_ndhwc_run(): def run_test_conv3d(dtype, out_dtype, scale, dshape, kshape, padding=(1, 1, 1), @@ -509,9 +522,10 @@ def run_test_conv3d(dtype, out_dtype, scale, dshape, kshape, ref_res = fref(data.astype(out_dtype), dkernel.astype(out_dtype)) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): if target in except_targets: continue + ctx = tvm.context(target, 0) intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(data, kernel) @@ -523,6 +537,7 @@ def run_test_conv3d(dtype, out_dtype, scale, dshape, kshape, run_test_conv3d("float32", "float32", 1, dshape, kshape, padding=(1, 1, 1), channels=10, kernel_size=(3, 3 ,3), except_targets=["cuda"]) +@gpu def test_conv3d_winograd(): class WinogradFallback(autotvm.FallbackContext): def _query_inside(self, target, workload): @@ -579,9 +594,10 @@ def run_test_conv3d_cuda(dtype, out_dtype, scale, dshape, kshape, groups=groups) with WinogradFallback(), tvm.transform.PassContext(opt_level=3): - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): if target != 'cuda': continue + ctx = tvm.context(target, 0) params = {'w': tvm.nd.array(kernel)} graph, lib, params = relay.build_module.build(mod, target=target, params=params) module = tvm.contrib.graph_runtime.create(graph, lib, ctx) @@ -612,6 +628,7 @@ def run_test_conv3d_cuda(dtype, out_dtype, scale, dshape, kshape, padding=(0, 2, 2), channels=120, kernel_size=(1, 5, 5)) +@gpu def test_conv3d_transpose_infer_type(): # symbolic in batch dimension n, c, d, h, w = te.size_var("n"), 10, 224, 224, 224 @@ -649,6 +666,7 @@ def test_conv3d_transpose_infer_type(): (n, 12, 226, 226, 226), "int32") +@gpu def 
test_conv3d_transpose_ncdhw_run(): dshape = (1, 3, 24, 24, 24) kshape = (3, 4, 2, 2, 2) @@ -665,12 +683,13 @@ def test_conv3d_transpose_ncdhw_run(): kernel = np.random.uniform(size=kshape).astype(dtype) ref_res = tvm.topi.testing.conv3d_transpose_ncdhw_python(data, kernel, 1, 1, 0) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(data, kernel) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) +@gpu def test_conv2d_transpose_infer_type(): # symbolic in batch dimension n, c, h, w = te.size_var("n"), 10, 10, 12 @@ -700,6 +719,7 @@ def test_conv2d_transpose_infer_type(): (n, 15, 15, 11), "float32") +@gpu def test_conv2d_transpose_nchw_run(): dshape = (1, 3, 18, 18) kshape = (3, 10, 3, 3) @@ -716,12 +736,13 @@ def test_conv2d_transpose_nchw_run(): ref_res = tvm.topi.testing.conv2d_transpose_nchw_python( data, kernel, 2, 1, (1, 1)) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(data, kernel) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) +@gpu def test_conv2d_transpose_nhwc_run(): dshape_nhwc = (1, 18, 18, 3) kshape_hwoi = (3, 3, 10, 3) @@ -743,12 +764,13 @@ def test_conv2d_transpose_nhwc_run(): ref_res = tvm.topi.testing.conv2d_transpose_nhwc_python(data, kernel, 'HWOI', 2, 1, output_padding=(1, 1)) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(data, kernel) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) +@gpu def test_conv1d_transpose_ncw_run(): dshape = (1, 3, 18) kshape = (3, 10, 3) @@ -765,12 +787,13 @@ def test_conv1d_transpose_ncw_run(): ref_res = tvm.topi.testing.conv1d_transpose_ncw_python( data, kernel, 2, 1, output_padding=(1,)) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(data, kernel) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) +@gpu def test_upsampling_infer_type(): n, c , h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w") scale = tvm.tir.const(2.0, "float64") @@ -787,6 +810,7 @@ def test_upsampling_infer_type(): yy = run_infer_type(y) assert yy.checked_type == relay.TensorType((n, c, 200, 400), "float32") +@gpu def test_upsampling3d_infer_type(): n, c, d, h, w = te.size_var("n"), te.size_var("c"),\ te.size_var("d"), te.size_var("h"), te.size_var("w") @@ -820,7 +844,7 @@ def _test_pool2d(opfunc, reffunc, pool_size=(2, 2), strides=(2, 2), padding=(0, func = relay.Function([x], y) data = np.random.uniform(size=dshape).astype(dtype) ref_res = reffunc(data.reshape(1, 3, 14, 2, 14, 2), axis=(3, 5)) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) @@ -840,7 +864,7 @@ def _test_pool2d_int(opfunc, reffunc, dtype): func = relay.Function([x], y) data = np.random.randint(low=-128, high=128, size=dshape) ref_res = reffunc(data.reshape(1,3,14,2,14,2), axis=(3,5)).astype(dtype) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp1 = 
relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) @@ -865,12 +889,13 @@ def _test_global_pool2d(opfunc, reffunc): func = relay.Function([x], y) data = np.random.uniform(size=dshape).astype(dtype) ref_res = reffunc(data, axis=(2,3), keepdims=True) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) +@gpu def test_pool2d(): _test_pool2d(relay.nn.max_pool2d, np.max) _test_pool2d(relay.nn.max_pool2d, np.max, pool_size=2, strides=2, padding=0) @@ -882,6 +907,7 @@ def test_pool2d(): _test_global_pool2d(relay.nn.global_avg_pool2d, np.mean) +@gpu def test_pool1d(): def _test_pool1d(opfunc, pool_size=(2,), strides=(2,), padding=(0, 0)): @@ -901,7 +927,7 @@ def _test_pool1d(opfunc, pool_size=(2,), strides=(2,), padding=(0, 0)): data = np.random.uniform(size=dshape).astype(dtype) ref_res = tvm.topi.testing.pool1d_ncw_python(data, (2,), (2,), (0, 0), (1, 3, 16), pool_type, False) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) @@ -912,6 +938,7 @@ def _test_pool1d(opfunc, pool_size=(2,), strides=(2,), padding=(0, 0)): _test_pool1d(relay.nn.avg_pool1d, pool_size=2, strides=2, padding=0) +@gpu def test_pool3d(): def _test_pool3d(opfunc, @@ -939,7 +966,7 @@ def _test_pool3d(opfunc, data = np.random.uniform(size=dshape).astype(dtype) ref_res = tvm.topi.testing.pool3d_ncdhw_python(data, pool_size, strides, padding, out_shape, pool_type, False) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) @@ -956,6 +983,7 @@ def _test_pool3d(opfunc, _test_pool3d(relay.nn.avg_pool3d, pool_size=2, padding=0, strides=2) +@gpu def test_avg_pool2d_no_count_pad(): kh, kw = (4, 4) sh, sw = (2, 2) @@ -985,11 +1013,12 @@ def test_avg_pool2d_no_count_pad(): ref_res = np.maximum(b_np, 0.0) data = a_np - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) +@gpu def test_flatten_infer_type(): d1, d2, d3, d4 = te.size_var("d1"), te.size_var("d2"), te.size_var("d3"), te.size_var("d4") x = relay.var("x", relay.TensorType((d1, d2, d3, d4), "float32")) @@ -1018,7 +1047,7 @@ def test_flatten_infer_type(): x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype) ref_res = x_data.flatten().reshape(o_shape) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) intrp2 = relay.create_executor("debug", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(x_data) @@ -1026,6 +1055,7 @@ def test_flatten_infer_type(): op_res2 = intrp2.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) +@gpu def test_pad_infer_type(): # entirely concrete case n, c, h, w = 1, 2, 3, 4 @@ -1042,6 +1072,7 @@ def 
test_pad_infer_type(): yy = run_infer_type(y) assert yy.checked_type == relay.TensorType((n + 2, 6, 9, w + 8), "float32") +@gpu def test_pad_run(): def _test_run(dtype): dshape = (4, 10, 7, 7) @@ -1050,7 +1081,7 @@ def _test_run(dtype): func = relay.Function([x], y) data = np.random.uniform(size=dshape).astype(dtype) ref_res = np.pad(data, ((1, 1), (2, 2), (3, 3), (4, 4)), 'constant') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) @@ -1058,6 +1089,7 @@ def _test_run(dtype): _test_run('float32') _test_run('int32') +@gpu def test_lrn(): n, c , h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w") x = relay.var("x", shape=(n, c , h, w)) @@ -1081,7 +1113,7 @@ def test_lrn(): x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype) ref_res = tvm.topi.testing.lrn_python(x_data, size, axis, bias, alpha, beta) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) intrp2 = relay.create_executor("debug", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(x_data) @@ -1089,6 +1121,7 @@ def test_lrn(): op_res2 = intrp2.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) +@gpu def test_l2_normalize(): n, c , h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w") x = relay.var("x", shape=(n, c , h, w)) @@ -1109,7 +1142,7 @@ def test_l2_normalize(): x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype) ref_res = tvm.topi.testing.l2_normalize_python(x_data, eps, axis) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) intrp2 = relay.create_executor("debug", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(x_data) @@ -1126,6 +1159,7 @@ def batch_flatten(data): return np.reshape(data, (shape[0], target_dim)) +@gpu def test_batch_flatten(): t1 = relay.TensorType((5, 10, 5)) x = relay.Var("x", t1) @@ -1133,7 +1167,7 @@ def test_batch_flatten(): data = np.random.rand(5, 10, 5).astype(t1.dtype) ref_res = batch_flatten(data) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp = relay.create_executor("graph", ctx=ctx, target=target) op_res = intrp.evaluate(func)(data) np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01) @@ -1166,12 +1200,13 @@ def get_shape(): else: ref = tvm.topi.testing.bilinear_resize_python(data, (int(round(h*scale_h)), int(round(w*scale_w))), layout) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): executor = relay.create_executor("graph", ctx=ctx, target=target) out = executor.evaluate(func)(data) tvm.testing.assert_allclose(out.asnumpy(), ref, rtol=1e-5, atol=1e-5) +@gpu def test_upsampling(): _test_upsampling("NCHW", "nearest_neighbor") _test_upsampling("NCHW", "bilinear", True) @@ -1212,17 +1247,19 @@ def get_shape(): ref = tvm.topi.testing.trilinear_resize3d_python(data, (int(round(d*scale_d)),\ int(round(h*scale_h)),\ int(round(w*scale_w))), layout) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): executor = relay.create_executor("graph", ctx=ctx, target=target) out = executor.evaluate(func)(data) tvm.testing.assert_allclose(out.asnumpy(), ref, rtol=1e-5, atol=1e-5) +@gpu def test_upsampling3d(): 
_test_upsampling3d("NCDHW", "nearest_neighbor") _test_upsampling3d("NCDHW", "trilinear", "align_corners") _test_upsampling3d("NDHWC", "nearest_neighbor") _test_upsampling3d("NDHWC", "trilinear", "align_corners") +@gpu def test_conv2d_int8_intrinsics(): def _compile(ic, oc, target, data_layout, kernel_layout, dtypes): input_dtype, weight_dtype, output_dtype = dtypes @@ -1347,6 +1384,7 @@ def _has_fast_int8_instructions(asm, target): assert "vpmulld" in asm and "vpadd" in asm +@gpu def test_depthwise_conv2d_int8(): input_dtype = 'uint8' weight_dtype = 'int8' @@ -1376,6 +1414,7 @@ def test_depthwise_conv2d_int8(): graph, lib, params = relay.build(func, target, params=parameters) +@gpu def test_bitserial_conv2d_infer_type(): # Basic shape test with ambiguous batch. n, c, h, w = te.size_var("n"), 32, 224, 224 @@ -1388,6 +1427,7 @@ def test_bitserial_conv2d_infer_type(): (n, 32, 222, 222), "int16") +@gpu def test_bitpack_infer_type(): # Test axis packing shape inference. o, i, h, w = 32, 32, 128, 128 @@ -1400,6 +1440,7 @@ def test_bitpack_infer_type(): # TODO(@jwfromm): Need to add bitserial_conv2d & bitpack run test cases +@gpu def test_correlation(): def _test_correlation(data_shape, kernel_size, max_displacement, stride1, stride2, padding, is_multiply, dtype='float32'): data1 = relay.var("data1", relay.ty.TensorType(data_shape, dtype)) @@ -1422,7 +1463,7 @@ def _test_correlation(data_shape, kernel_size, max_displacement, stride1, stride data2_np = np.random.uniform(size=data_shape).astype(dtype) ref_res = tvm.topi.testing.correlation_nchw_python(data1_np, data2_np, kernel_size, max_displacement, stride1, stride2, padding, is_multiply) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(data1_np, data2_np) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py index 745130da02968..05dd67678ac7b 100644 --- a/tests/python/relay/test_op_level3.py +++ b/tests/python/relay/test_op_level3.py @@ -23,7 +23,8 @@ from tvm import relay from tvm.error import TVMError from tvm.relay import create_executor, transform -from tvm.relay.testing import ctx_list, check_grad, run_infer_type +from tvm.relay.testing import check_grad, run_infer_type +from tvm.testing import enabled_devices, gpu def test_zeros_ones(): @@ -199,6 +200,7 @@ def test_transpose_infer_type(): (100, t, n), "float32") +@gpu def test_transpose(): def verify_transpose(dshape, axes): x = relay.var("x", relay.TensorType(dshape, "float32")) @@ -208,7 +210,7 @@ def verify_transpose(dshape, axes): x_data = np.random.uniform(low=-1, high=1, size=dshape).astype("float32") ref_res = np.transpose(x_data, axes=axes) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data) @@ -250,6 +252,7 @@ def test_reshape_infer_type(): assert yy.checked_type == relay.TensorType( (n, t, 2000), "float32") +@gpu def test_reshape(): def verify_reshape(shape, newshape, oshape): x = relay.var("x", relay.TensorType(shape, "float32")) @@ -262,7 +265,7 @@ def verify_reshape(shape, newshape, oshape): check_grad(func) x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32") ref_res = np.reshape(x_data, oshape) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in 
["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data) @@ -307,6 +310,7 @@ def test_reshape_like_infer_type(): assert zz.checked_type == relay.TensorType((1, 8, 8), "float32") +@gpu def test_reshape_like(): def verify_reshape_like(shape, oshape): x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32") @@ -321,7 +325,7 @@ def verify_reshape_like(shape, oshape): func = relay.Function([x, y], z) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data, y_data) @@ -347,6 +351,7 @@ def verify_take(dshape, indices_shape, oshape, axis=None): verify_take((d1, d2), (d3, d4, d5), (d1, d3, d4, d5), 1) verify_take((d1, d2, d3, d4), (d5, d6), (d1, d2, d5, d6, d4), -2) +@gpu def test_take(): def verify_take(src_shape, indices_src, axis=None, mode="clip"): src_dtype = "float32" @@ -361,7 +366,7 @@ def verify_take(src_shape, indices_src, axis=None, mode="clip"): np_mode = "raise" if mode == "fast" else mode ref_res = np.take(x_data, indices=indices_src, axis=axis, mode=np_mode) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data, indices_src) @@ -448,13 +453,14 @@ def test_full_infer_type(): assert yy.checked_type == relay.TensorType((1, 2), "int8") +@gpu def test_full(): def verify_full(fill_value, src_shape, dtype): x = relay.var("x", relay.scalar_type(dtype)) z = relay.full(x, src_shape, dtype) func = relay.Function([x], z) ref_res = np.full(src_shape, fill_value) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(np.array(fill_value, dtype)) @@ -481,6 +487,7 @@ def test_full_like_infer_type(): assert yy.checked_type == relay.TensorType((n, c, h, w), "float32") +@gpu def test_full_like(): def verify_full_like(base, fill_value, dtype): x_data = np.random.uniform(low=-1, high=1, size=base).astype(dtype) @@ -491,7 +498,7 @@ def verify_full_like(base, fill_value, dtype): func = relay.Function([x, y], z) ref_res = np.full_like(x_data, fill_value) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data, np.array(fill_value, dtype)) @@ -500,6 +507,7 @@ def verify_full_like(base, fill_value, dtype): verify_full_like((1, 1), 44.0, "float32") +@gpu def test_infer_type_leaky_relu(): n, c , h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w") x = relay.var("x", relay.TensorType((n, c, h, w), "float32")) @@ -519,7 +527,7 @@ def test_infer_type_leaky_relu(): x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype) ref_res = np.where(x_data > 0, x_data, x_data * 0.1) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) intrp2 = relay.create_executor("debug", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(x_data) @@ -555,7 +563,7 @@ def verify_infer_type_prelu(data, alpha, axis, output, dtype="float32"): else: ref_res = (x_data < 0) * (x_data * a_data.reshape(1, 1, 3)) + (x_data>=0) * x_data - for target, ctx in ctx_list(): + for target, 
ctx in enabled_devices(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) intrp2 = relay.create_executor("debug", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(x_data, a_data) @@ -564,6 +572,7 @@ def verify_infer_type_prelu(data, alpha, axis, output, dtype="float32"): tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) +@gpu def test_infer_type_prelu(): n, c , h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w") verify_infer_type_prelu((n, c, h, w), (c,), 1, (n, c, h, w)) @@ -576,6 +585,7 @@ def test_infer_type_prelu(): verify_infer_type_prelu((1, 2, 2, 3), None, 3, (1, 2, 2, 3)) +@gpu def test_arange(): def verify_arange(start, stop, step): dtype = "float32" @@ -596,7 +606,7 @@ def verify_arange(start, stop, step): ref_res = np.arange(start, stop, step).astype(dtype) func = relay.Function([], x) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)() @@ -613,6 +623,7 @@ def verify_arange(start, stop, step): # arange doesn't support floating point right now, see type relation # verify_arange(20, 1, -1.5) +@gpu def test_meshgrid(): def verify_meshgrid(lengths, indexing="ij"): input_vars = [] @@ -632,7 +643,7 @@ def verify_meshgrid(lengths, indexing="ij"): # Get ref ref_res = np.meshgrid(*input_data, indexing=indexing) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(*input_data) @@ -646,6 +657,7 @@ def verify_meshgrid(lengths, indexing="ij"): # Length 0 signifies scalar. verify_meshgrid([3, 5, 0]) +@gpu def test_tile(): def verify_tile(dshape, reps): x = relay.var("x", relay.TensorType(dshape, "float32")) @@ -655,7 +667,7 @@ def verify_tile(dshape, reps): x_data = np.random.uniform(low=-1, high=1, size=dshape).astype("float32") ref_res = np.tile(x_data, reps=reps) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data) @@ -664,13 +676,14 @@ def verify_tile(dshape, reps): verify_tile((2, 3, 4), (1, 2)) verify_tile((2, 3), (3, 2, 1)) +@gpu def test_repeat(): def verify_repeat(dshape, repeats, axis): x = relay.Var("x", relay.TensorType(dshape, "float32")) func = relay.Function([x], relay.repeat(x, repeats, axis)) data = np.random.uniform(size=dshape).astype("float32") ref_res = np.repeat(data, repeats, axis) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(data) @@ -679,6 +692,7 @@ def verify_repeat(dshape, repeats, axis): verify_repeat((3, 10), 2, -1) verify_repeat((3, 2, 4), 3, 1) +@gpu def test_stack(): def verify_stack(dshapes, axis): y = [] @@ -691,7 +705,7 @@ def verify_stack(dshapes, axis): x_data = [np.random.normal(size=shape).astype("float32") for shape in dshapes] ref_res = np.stack(x_data, axis=axis) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(*x_data) @@ -702,6 +716,7 @@ def verify_stack(dshapes, axis): verify_stack([(2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4)], -1) +@gpu def
test_reverse(): def verify_reverse(dshape, axis): x = relay.var("x", relay.TensorType(dshape, "float32")) @@ -711,7 +726,7 @@ def verify_reverse(dshape, axis): func = relay.Function([x], z) x_data = np.random.uniform(low=-1, high=1, size=dshape).astype("float32") ref_res = np.flip(x_data, axis) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data) @@ -721,6 +736,7 @@ def verify_reverse(dshape, axis): verify_reverse((2, 3, 4), -1) +@gpu def test_reverse_sequence(): def verify_reverse_sequence(x_data, seq_lengths, batch_axis, seq_axis, ref_res): seq_lengths_data = np.array(seq_lengths).astype("int32") @@ -730,7 +746,7 @@ def verify_reverse_sequence(x_data, seq_lengths, batch_axis, seq_axis, ref_res): assert zz.checked_type == x.type_annotation func = relay.Function([x], z) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data) @@ -881,6 +897,7 @@ def verify_scatter_add(dshape, ishape, axis=0): verify_scatter_add((16, 16, 4, 5), (16, 16, 4, 5), 3) +@gpu def test_gather(): def verify_gather(data, axis, indices, ref_res): data = np.asarray(data, dtype='float32') @@ -893,7 +910,7 @@ def verify_gather(data, axis, indices, ref_res): func = relay.Function([d, i], z) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(data, indices) @@ -933,6 +950,7 @@ def verify_gather(data, axis, indices, ref_res): [-0.5700, 0.1558, -0.5700, 0.1558]]]) +@gpu def test_gather_nd(): def verify_gather_nd(xshape, yshape, y_data): x = relay.var("x", relay.TensorType(xshape, "float32")) @@ -943,7 +961,7 @@ def verify_gather_nd(xshape, yshape, y_data): x_data = np.random.uniform(size=xshape).astype("float32") ref_res = x_data[tuple(y_data)] - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data, y_data) @@ -981,6 +999,7 @@ def test_isinf(): _verify_infiniteness_ops(relay.isinf, np.isinf) +@gpu def test_unravel_index(): def verify_unravel_index(indices, shape, dtype): x_data = np.array(indices).astype(dtype) @@ -999,7 +1018,7 @@ def verify_unravel_index(indices, shape, dtype): func = relay.Function([x, y], z) ref_res = np.unravel_index(x_data, y_data) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data, y_data) @@ -1017,6 +1036,7 @@ def verify_unravel_index(indices, shape, dtype): # output which is in line with TensorFlow # verify_unravel_index([0, 1, 2, 5], [2, 2], dtype) +@gpu def test_sparse_to_dense(): def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_shape, xpected): sparse_indices_data = np.array(sparse_indices) @@ -1037,7 +1057,7 @@ def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_ assert zz.checked_type == relay.ty.TensorType(output_shape, str(sparse_values_data.dtype)) func = relay.Function(args, d) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp =
relay.create_executor(kind, ctx=ctx, target=target) if default_value is None: diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py index 8e01fa2a89cd6..beb2bcb676c76 100644 --- a/tests/python/relay/test_op_level4.py +++ b/tests/python/relay/test_op_level4.py @@ -19,10 +19,12 @@ import numpy as np from tvm import relay from tvm.relay import transform -from tvm.relay.testing import ctx_list, run_infer_type +from tvm.relay.testing import run_infer_type import tvm.topi.testing +from tvm.testing import enabled_devices, gpu +@gpu def test_binary_op(): def check_binary_op(opfunc, ref): n = te.size_var("n") @@ -47,7 +49,7 @@ def check_binary_op(opfunc, ref): ref_res = ref(x_data, y_data) func = relay.Function([x, y], z) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp = relay.create_executor("graph", ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data, y_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) @@ -56,6 +58,7 @@ def check_binary_op(opfunc, ref): check_binary_op(opfunc, ref) +@gpu def test_cmp_type(): for op, ref in ((relay.greater, np.greater), (relay.greater_equal, np.greater_equal), @@ -82,12 +85,13 @@ def test_cmp_type(): ref_res = ref(x_data, y_data) func = relay.Function([x, y], z) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp = relay.create_executor("graph", ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data, y_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) +@gpu def test_binary_int_broadcast_1(): for op, ref in [(relay.right_shift, np.right_shift), (relay.left_shift, np.left_shift)]: @@ -107,11 +111,12 @@ def test_binary_int_broadcast_1(): func = relay.Function([x, y], z) ref_res = ref(x_data, y_data) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp = relay.create_executor("graph", ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data, y_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) +@gpu def test_binary_int_broadcast_2(): for op, ref in [(relay.maximum, np.maximum), (relay.minimum, np.minimum), @@ -132,11 +137,12 @@ def test_binary_int_broadcast_2(): func = relay.Function([x, y], z) ref_res = ref(x_data, y_data) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp = relay.create_executor("graph", ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data, y_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) +@gpu def test_where(): shape = (3, 4) dtype = "float32" @@ -152,7 +158,7 @@ def test_where(): x = np.random.uniform(size=shape).astype(dtype) y = np.random.uniform(size=shape).astype(dtype) ref_res = np.where(condition, x, y) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(condition, x, y) @@ -195,7 +201,7 @@ def verify_reduce(funcs, data, axis, keepdims, exclude, output, dtype="float32") return ref_res = ref_func(x_data + 0, axis=axis, keepdims=keepdims) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) intrp2 = relay.create_executor("debug", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(x_data) @@ -203,6 +209,7 @@ def verify_reduce(funcs, data, axis, keepdims, exclude, output, dtype="float32") op_res2 = intrp2.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) +@gpu 
def test_reduce_functions(): def _with_keepdims(func): def _wrapper(data, axis=None, keepdims=False): @@ -282,7 +289,7 @@ def verify_mean_var_std(funcs, shape, axis, keepdims): ref_mean = np.mean(x_data, axis=axis, dtype=dtype, keepdims=keepdims) ref_res = ref_func(x_data, axis=axis, dtype=dtype, keepdims=keepdims) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) intrp2 = relay.create_executor("debug", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(x_data) @@ -292,6 +299,7 @@ def verify_mean_var_std(funcs, shape, axis, keepdims): tvm.testing.assert_allclose(op_res2[0].asnumpy(), ref_mean, rtol=1e-5) tvm.testing.assert_allclose(op_res2[1].asnumpy(), ref_res, rtol=1e-5) +@gpu def test_mean_var_std(): for func in [[relay.mean_variance, np.var], [relay.mean_std, np.std]]: @@ -307,6 +315,7 @@ def test_mean_var_std(): verify_mean_var_std(func, (128, 24, 128), (0, 2), True) +@gpu def test_strided_slice(): def verify(dshape, begin, end, strides, output, slice_mode="end", attr_const=True, test_ref=True, dtype="int32"): @@ -349,7 +358,7 @@ def verify(dshape, begin, end, strides, output, slice_mode="end", if not test_ref: return - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp = relay.create_executor("graph", ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) @@ -371,6 +380,7 @@ def verify(dshape, begin, end, strides, output, slice_mode="end", verify((3, 4, 3), [1, 0, 0], [-1, 2, 3], [1, 1, 1], (2, 2, 3), slice_mode="size", test_ref=True) +@gpu def test_strided_set(): def verify(dshape, begin, end, strides, vshape, test_ref=True): x = relay.var("x", relay.TensorType(dshape, "float32")) @@ -394,7 +404,7 @@ def verify(dshape, begin, end, strides, vshape, test_ref=True): v_data = np.random.uniform(size=vshape).astype("float32") ref_res = tvm.topi.testing.strided_set_python( x_data, v_data, begin, end, strides) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp = relay.create_executor("graph", ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data, v_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index 254bab5e1692e..789e365703ca9 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -22,8 +22,9 @@ from tvm import te from tvm import relay from tvm.relay import transform -from tvm.relay.testing import ctx_list, run_infer_type +from tvm.relay.testing import run_infer_type import tvm.topi.testing +from tvm.testing import enabled_devices, device_enabled, gpu def test_resize_infer_type(): @@ -40,6 +41,7 @@ def test_resize_infer_type(): zz = run_infer_type(z) assert zz.checked_type == relay.TensorType((n, c, 100, 200), "int8") +@gpu def test_resize(): def verify_resize(dshape, scale, method, layout, coord_trans): if layout == "NHWC": @@ -61,7 +63,7 @@ def verify_resize(dshape, scale, method, layout, coord_trans): assert zz.checked_type == relay.TensorType(ref_res.shape, "float32") func = relay.Function([x], z) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data) @@ -87,6 +89,7 @@ def test_resize3d_infer_type(): zz = run_infer_type(z) assert zz.checked_type == relay.TensorType((n, c, 10, 10, 20), 
"int8") +@gpu def test_resize3d(): def verify_resize(dshape, scale, method, layout): if layout == "NDHWC": @@ -106,7 +109,7 @@ def verify_resize(dshape, scale, method, layout): assert zz.checked_type == relay.TensorType(ref_res.shape, "float32") func = relay.Function([x], z) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data) @@ -115,6 +118,7 @@ def verify_resize(dshape, scale, method, layout): for layout in ["NDHWC", "NCDHW"]: verify_resize((1, 4, 4, 4, 4), 2, method, layout) +@gpu def test_crop_and_resize(): def verify_crop_and_resize(img_shape, boxes, box_indices, crop_size, layout, method, extrapolation_value=0.0): @@ -138,7 +142,7 @@ def verify_crop_and_resize(img_shape, boxes, box_indices, crop_size, assert zz.checked_type == relay.TensorType(ref_res.shape, "float32") func = relay.Function([img, bx, bx_idx], z) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(image_data, boxes, box_indices) @@ -157,6 +161,7 @@ def verify_crop_and_resize(img_shape, boxes, box_indices, crop_size, verify_crop_and_resize((5, 3, 255, 255), boxes_nchw, indices_nchw, size_nchw, 'NCHW', method, 0.1) +@gpu def test_multibox_prior(): def get_ref_result(dshape, sizes=(1.0,), ratios=(1.0,), steps=(-1.0, -1.0), @@ -213,7 +218,7 @@ def verify_multibox_prior(x, dshape, ref_res, sizes=(1.0,), data = np.random.uniform(low=-1, high=1, size=dshape).astype("float32") func = relay.Function([x], z) func = run_infer_type(func) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) @@ -242,6 +247,7 @@ def verify_multibox_prior(x, dshape, ref_res, sizes=(1.0,), verify_multibox_prior(x, dshape, ref_res, clip=False, check_type_only=True) +@gpu def test_get_valid_counts(): def verify_get_valid_counts(dshape, score_threshold, id_index, score_index): dtype = "float32" @@ -271,7 +277,7 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index): assert "score_threshold" in z.astext() func = relay.Function([x], z.astuple()) func = run_infer_type(func) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp = relay.create_executor("debug", ctx=ctx, target=target) out = intrp.evaluate(func)(np_data) tvm.testing.assert_allclose(out[0].asnumpy(), np_out1, rtol=1e-3, atol=1e-04) @@ -287,6 +293,7 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index): verify_get_valid_counts((16, 500, 5), 0.95, -1, 0) +@gpu def test_non_max_suppression(): def verify_nms(x0_data, x1_data, x2_data, x3_data, dshape, ref_res, ref_indices_res, iou_threshold=0.5, force_suppress=False, @@ -319,7 +326,7 @@ def verify_nms(x0_data, x1_data, x2_data, x3_data, dshape, ref_res, func = run_infer_type(func) func_indices = relay.Function([x0, x1, x2, x3], z_indices) func_indices = run_infer_type(func_indices) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(x0_data, x1_data, x2_data, x3_data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) @@ -366,6 +373,7 @@ def verify_nms(x0_data, x1_data, 
x2_data, x3_data, dshape, ref_res, np_indices_result, top_k=2) +@gpu def test_multibox_transform_loc(): def test_default_value(): num_anchors = 3 @@ -408,7 +416,7 @@ def test_default_value(): nms = relay.vision.non_max_suppression(mtl[0], mtl[1], mtl[0], return_indices=False) func = relay.Function([cls_prob, loc_pred, anchors], nms) func = run_infer_type(func) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(np_cls_prob, np_loc_preds, np_anchors) @@ -450,6 +458,7 @@ def test_threshold(): test_threshold() +@gpu def test_roi_align(): def verify_roi_align(data_shape, rois_shape, pooled_size, spatial_scale, sample_ratio): data = relay.var("data", relay.ty.TensorType(data_shape, "float32")) @@ -471,7 +480,7 @@ def verify_roi_align(data_shape, rois_shape, pooled_size, spatial_scale, sample_ ref_res = tvm.topi.testing.roi_align_nchw_python(np_data, np_rois, pooled_size=pooled_size, spatial_scale=spatial_scale, sample_ratio=sample_ratio) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(np_data, np_rois) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-4) @@ -483,6 +492,7 @@ def verify_roi_align(data_shape, rois_shape, pooled_size, spatial_scale, sample_ verify_roi_align((4, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=0.5, sample_ratio=2) +@gpu def test_roi_pool(): def verify_roi_pool(data_shape, rois_shape, pooled_size, spatial_scale): data = relay.var("data", relay.ty.TensorType(data_shape, "float32")) @@ -502,7 +512,7 @@ def verify_roi_pool(data_shape, rois_shape, pooled_size, spatial_scale): np_rois[:, 0] = np.random.randint(low = 0, high = batch, size = num_roi).astype('float32') ref_res = tvm.topi.testing.roi_pool_nchw_python(np_data, np_rois, pooled_size=pooled_size, spatial_scale=spatial_scale) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(np_data, np_rois) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-4) @@ -514,6 +524,7 @@ def verify_roi_pool(data_shape, rois_shape, pooled_size, spatial_scale): verify_roi_pool((4, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=0.5) +@gpu def test_proposal(): def verify_proposal(np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs): cls_prob = relay.var("cls_prob", relay.ty.TensorType(np_cls_prob.shape, "float32")) @@ -526,7 +537,7 @@ def verify_proposal(np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs): func = relay.Function([cls_prob, bbox_pred, im_info], z) func = run_infer_type(func) for target in ['llvm', 'cuda']: - if not tvm.runtime.enabled(target): + if not device_enabled(target): print("Skip test because %s is not enabled." 
% target) continue ctx = tvm.context(target, 0) @@ -592,6 +603,7 @@ def verify_yolo_reorg(shape, stride, out_shape): verify_yolo_reorg((n, c, 20, 20), 10, (n, c*10*10, 2, 2)) verify_yolo_reorg((n, c, h, w), 2, (n, c*2*2, idxd(h, 2), idxd(w, 2))) +@gpu def test_yolo_reorg(): def verify_yolo_reorg(shape, stride): x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32") @@ -605,7 +617,7 @@ def verify_yolo_reorg(shape, stride): func = relay.Function([x], z) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data) @@ -615,6 +627,7 @@ def verify_yolo_reorg(shape, stride): verify_yolo_reorg((1, 4, 6, 6), 2) +@gpu def test_deformable_conv2d(): def test_infer_type(batch, in_channel, size, out_channel, deformable_groups, groups): data_shape = (batch, in_channel, size, size) @@ -665,7 +678,7 @@ def test_run(batch, in_channel, size, out_channel, deformable_groups, groups): kernel = np.random.uniform(size=kernel_shape).astype(dtype) ref_res = tvm.topi.testing.deformable_conv2d_nchw_python(data, offset, kernel, stride=(1, 1), padding=(1, 1), dilation=(1, 1), deformable_groups=deformable_groups, groups=groups) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp1 = relay.create_executor(kind, ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(data, offset, kernel) @@ -674,6 +687,7 @@ def test_run(batch, in_channel, size, out_channel, deformable_groups, groups): test_run(2, 4, 16, 4, 4, 1) +@gpu def test_depth_to_space(): def verify_depth_to_space(dshape, block_size, layout, mode): if layout == "NHWC": @@ -696,7 +710,7 @@ def verify_depth_to_space(dshape, block_size, layout, mode): assert zz.checked_type == relay.TensorType(ref_res.shape, "float32") func = relay.Function([x], z) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data) @@ -706,6 +720,7 @@ def verify_depth_to_space(dshape, block_size, layout, mode): verify_depth_to_space((1, 4, 4, 4), 2, layout, mode) +@gpu def test_space_to_depth(): def verify_space_to_depth(dshape, block_size, layout): if layout == "NHWC": @@ -728,7 +743,7 @@ def verify_space_to_depth(dshape, block_size, layout): assert zz.checked_type == relay.TensorType(ref_res.shape, "float32") func = relay.Function([x], z) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data) @@ -753,6 +768,7 @@ def test_dilation2d_infer_type(): (n, 10, 217, 217), "float32") +@gpu def test_dilation2d_run(): def run_test_dilation2d(indata, kernel, out, dtype='float32', @@ -777,7 +793,7 @@ def run_test_dilation2d(indata, kernel, out, **attrs) func = relay.Function([x, w], y) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): if target in except_targets: continue intrp = relay.create_executor("graph", ctx=ctx, target=target) @@ -844,6 +860,7 @@ def _convert_data(indata, kernel, out, layout=None): data_layout='NHWC', kernel_layout='HWI') +@gpu def test_affine_grid(): def verify_affine_grid(num_batch, target_shape): dtype = 'float32' @@ -857,7 +874,7 @@ def verify_affine_grid(num_batch, target_shape): data_np = np.random.uniform(size=data_shape).astype(dtype) ref_res = 
tvm.topi.testing.affine_grid_python(data_np, target_shape) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp1 = relay.create_executor(kind, ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(data_np) @@ -867,6 +884,7 @@ def verify_affine_grid(num_batch, target_shape): verify_affine_grid(4, (16, 32)) +@gpu def test_grid_sample(): def verify_grid_sample(data_shape, grid_shape): dtype = 'float32' @@ -883,7 +901,7 @@ def verify_grid_sample(data_shape, grid_shape): grid_np = np.random.uniform(size=grid_shape, low=-1.5, high=1.5).astype(dtype) ref_res = tvm.topi.testing.grid_sample_nchw_python(data_np, grid_np, method='bilinear') - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp1 = relay.create_executor(kind, ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(data_np, grid_np) diff --git a/tests/python/relay/test_op_level6.py b/tests/python/relay/test_op_level6.py index 287e80a0fab76..760ca1c516b1e 100644 --- a/tests/python/relay/test_op_level6.py +++ b/tests/python/relay/test_op_level6.py @@ -20,8 +20,9 @@ import tvm from tvm import te from tvm import relay -from tvm.relay.testing import ctx_list +from tvm.testing import gpu, enabled_devices +@gpu def test_argsort(): def verify_argsort(shape, axis, is_ascend, dtype): x = relay.var("x", relay.TensorType(shape, "float32")) @@ -33,7 +34,7 @@ def verify_argsort(shape, axis, is_ascend, dtype): else: ref_res = np.argsort(-x_data, axis=axis) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data) @@ -44,6 +45,7 @@ def verify_argsort(shape, axis, is_ascend, dtype): verify_argsort((3, 5, 6), axis=-1, is_ascend=False, dtype=dtype) +@gpu def test_topk(): def verify_topk(k, axis, ret_type, is_ascend, dtype): shape = (20, 100) @@ -70,7 +72,7 @@ def verify_topk(k, axis, ret_type, is_ascend, dtype): np_values[i, :] = np_data[i, np_indices[i, :]] np_indices = np_indices.astype(dtype) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(np_data) diff --git a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py index 85dd2edb9185b..5e42a3ed58b10 100644 --- a/tests/python/relay/test_pass_alter_op_layout.py +++ b/tests/python/relay/test_pass_alter_op_layout.py @@ -21,8 +21,9 @@ from tvm import relay from tvm.relay import transform, analysis from tvm.relay.testing.temp_op_attr import TempOpAttr -from tvm.relay.testing import ctx_list, run_infer_type +from tvm.relay.testing import run_infer_type import numpy as np +from tvm.testing import gpu, enabled_devices def run_opt_pass(expr, passes): passes = passes if isinstance(passes, list) else [passes] @@ -615,6 +616,7 @@ def expected(): assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) +@gpu def test_alter_layout_strided_slice(): """Test rewriting strided_slice during alter_op_layout""" def before(): @@ -661,7 +663,7 @@ def expected(): mod_before['main'] = a mod_new['main'] = b with relay.build_config(opt_level=3): - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "debug", "vm"]: ex_before = relay.create_executor(kind, mod=mod_before, ctx=ctx, target=target) ex_new = relay.create_executor(kind, mod=mod_new,
ctx=ctx, target=target) diff --git a/tests/python/relay/test_pass_annotation.py b/tests/python/relay/test_pass_annotation.py index e95708007b955..80f186961b844 100644 --- a/tests/python/relay/test_pass_annotation.py +++ b/tests/python/relay/test_pass_annotation.py @@ -23,7 +23,7 @@ from tvm.contrib import graph_runtime from tvm.relay.expr_functor import ExprMutator from tvm.relay import transform - +from tvm.testing import requires_cuda, requires_opencl def run_opt_pass(expr, passes): passes = passes if isinstance(passes, list) else [passes] @@ -624,22 +624,33 @@ def expected(): tvm.testing.assert_allclose(res, ref_res, rtol=1e-5, atol=1e-5) -def test_check_run(): - for dev, tgt in [("opencl", "opencl"), ("cuda", "cuda"), - ("opencl", str(tvm.target.intel_graphics()))]: - if not tvm.runtime.enabled(dev): - print("Skip test because %s is not enabled." % dev) - continue - run_fusible_network(dev, tgt) - run_unpropagatable_graph(dev, tgt) +@requires_opencl +def test_check_run_opencl(): + dev = "opencl" + tgt = "opencl" + run_fusible_network(dev, tgt) + run_unpropagatable_graph(dev, tgt) -def test_tuple_get_item(): +@requires_opencl +def test_check_run_opencl_intel(): + dev = "opencl" + tgt = str(tvm.target.intel_graphics()) + run_fusible_network(dev, tgt) + run_unpropagatable_graph(dev, tgt) + + +@requires_cuda +def test_check_run_cuda(): dev = "cuda" - if not tvm.runtime.enabled(dev): - print("Skip test because %s is not enabled." % dev) - return + tgt = "cuda" + run_fusible_network(dev, tgt) + run_unpropagatable_graph(dev, tgt) + +@requires_cuda +def test_tuple_get_item(): + dev = "cuda" cpu_ctx = tvm.cpu(0) gpu_ctx = tvm.context(dev) diff --git a/tests/python/relay/test_pass_dynamic_to_static.py b/tests/python/relay/test_pass_dynamic_to_static.py index 6b422ca0d5947..42cd86126d028 100644 --- a/tests/python/relay/test_pass_dynamic_to_static.py +++ b/tests/python/relay/test_pass_dynamic_to_static.py @@ -20,8 +20,9 @@ from tvm import relay from tvm.relay import transform from tvm.relay.build_module import bind_params_by_name -from tvm.relay.testing import run_infer_type, create_workload, ctx_list +from tvm.relay.testing import run_infer_type, create_workload import tvm.topi.testing +from tvm.testing import enabled_devices, gpu def run_opt_pass(expr, opt_pass): assert isinstance(opt_pass, tvm.transform.Pass) @@ -34,7 +35,7 @@ def run_opt_pass(expr, opt_pass): def verify_func(func, data, ref_res, rtol=1e-5, atol=1e-7): assert isinstance(data, list) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): for kind in ["graph", "vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func) intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) @@ -42,6 +43,7 @@ def verify_func(func, data, ref_res, rtol=1e-5, atol=1e-7): tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=rtol, atol=atol) +@gpu def test_dynamic_to_static_reshape(): def verify_reshape(shape, newshape, oshape): x = relay.var("x", relay.TensorType(shape, "float32")) @@ -66,6 +68,7 @@ def verify_reshape(shape, newshape, oshape): verify_reshape((4, 7), (2, 7, 2), (2, 7, 2)) +@gpu def test_dynamic_to_static_double_reshape(): def verify_reshape(shape, newshape): x = relay.var("x", relay.TensorType(shape, "float32")) @@ -90,6 +93,7 @@ def verify_reshape(shape, newshape): verify_reshape((4, 7), (2, 7, 2)) +@gpu def test_dynamic_to_static_quad_reshape(): def verify_reshape(shape, newshape): x = relay.var("x", relay.TensorType(shape, "float32")) @@ -116,6 +120,7 @@ def verify_reshape(shape, newshape): 
verify_reshape((4, 7), (2, 7, 2)) +@gpu def test_dynamic_to_static_tile(): def verify_tile(shape, reps, oshape): x = relay.var("x", relay.TensorType(shape, "float32")) @@ -139,6 +144,7 @@ def verify_tile(shape, reps, oshape): verify_tile((4, 7), (4, 2), (16, 14)) +@gpu def test_dynamic_to_static_topk(): def verify_topk(k, axis, ret_type, is_ascend, dtype): shape = (20, 100) @@ -173,7 +179,7 @@ def verify_topk(k, axis, ret_type, is_ascend, dtype): assert isinstance(zz, relay.Call) assert zz.op == relay.op.get("topk") - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): if "llvm" not in target: continue for kind in ["graph", "vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func2) @@ -195,6 +201,7 @@ def verify_topk(k, axis, ret_type, is_ascend, dtype): verify_topk(k, axis, ret_type, False, "float32") +@gpu def test_dynamic_to_static_broadcast_to(): def verify_broadcast_to(shape, broadcast_shape): x = relay.var("x", relay.TensorType(shape, "float32")) @@ -219,6 +226,7 @@ def verify_broadcast_to(shape, broadcast_shape): verify_broadcast_to((3, 1), (3, 3)) +@gpu def test_dynamic_to_static_zeros_ones(): def verify_ones_zeros(shape, dtype): for op, ref in [(relay.zeros, np.zeros), (relay.ones, np.ones)]: @@ -241,6 +249,7 @@ def verify_ones_zeros(shape, dtype): verify_ones_zeros((9, 8, 3, 4), 'float32') +@gpu def test_dynamic_to_static_resize(): def verify_resize(shape, scale, method, layout): if layout == "NHWC": @@ -275,6 +284,7 @@ def verify_resize(shape, scale, method, layout): verify_resize((1, 4, 4, 4), 2, method, layout) +@gpu def test_dynamic_to_static_one_hot(): def _verify(indices_shape, depth, on_value, off_value, axis, dtype): indices = relay.var("indices", relay.TensorType(indices_shape, "int32")) @@ -302,6 +312,7 @@ def _verify(indices_shape, depth, on_value, off_value, axis, dtype): _verify((3, 2, 4, 5), 6, 1, 0, 1, "int32") _verify((3, 2, 4, 5), 6, 1.0, 0.0, 0, "float32") +@gpu def test_dynamic_to_static_full(): def verify_full(fill_value, fill_shape, dtype): x = relay.var("x", relay.scalar_type(dtype)) @@ -310,7 +321,7 @@ def verify_full(fill_value, fill_shape, dtype): func = run_infer_type(relay.Function([x, y], z)) func2 = run_opt_pass(run_opt_pass(func, transform.DynamicToStatic()), transform.InferType()) - + zz = func2.body assert isinstance(zz, relay.Call) assert zz.op == relay.op.get("full") @@ -318,7 +329,7 @@ def verify_full(fill_value, fill_shape, dtype): ref_res = np.full(fill_shape, fill_value).astype(dtype) y_data = np.random.uniform(low=-1, high=1, size=fill_shape).astype('int64') verify_func(func2, [fill_value, y_data], ref_res) - + verify_full(4, (1, 2, 3, 4), 'int32') verify_full(4.0, (1, 2, 8, 10), 'float32') diff --git a/tests/python/relay/test_pass_fuse_ops.py b/tests/python/relay/test_pass_fuse_ops.py index 1727429e74de8..b4d3007cf3ea6 100644 --- a/tests/python/relay/test_pass_fuse_ops.py +++ b/tests/python/relay/test_pass_fuse_ops.py @@ -19,6 +19,7 @@ from tvm import relay from tvm.relay import transform from tvm.relay.testing import run_opt_pass +from tvm.testing import enabled_devices, gpu def test_fuse_simple(): @@ -694,6 +695,7 @@ def expected(): assert tvm.ir.structural_equal(m["main"], after) +@gpu def test_fuse_bcast_reduce_scalar(): """Test fusion case with broadcast and reduction involving scalar""" @@ -716,7 +718,7 @@ def expected(): orig = before() m = fuse2(tvm.IRModule.from_expr(orig)) - for tgt, _ in tvm.relay.testing.config.ctx_list(): + for tgt, ctx in enabled_devices(): relay.build(m, tgt) after = run_opt_pass(expected(), 
transform.InferType()) assert tvm.ir.structural_equal(m["main"], after) diff --git a/tests/python/relay/test_pass_manager.py b/tests/python/relay/test_pass_manager.py index 25299caae30be..854daa4487a4b 100644 --- a/tests/python/relay/test_pass_manager.py +++ b/tests/python/relay/test_pass_manager.py @@ -25,7 +25,8 @@ from tvm.relay import Function, Call from tvm.relay import analysis from tvm.relay import transform as _transform -from tvm.relay.testing import ctx_list, run_infer_type +from tvm.relay.testing import run_infer_type +from tvm.testing import enabled_devices, gpu def get_var_func(): @@ -114,6 +115,7 @@ def check_func(func, ref_func): assert tvm.ir.structural_equal(func, ref_func) +@gpu def test_module_pass(): shape = (5, 10) dtype = 'float32' @@ -178,7 +180,7 @@ def test_pass_run(): x_nd = get_rand(shape, dtype) y_nd = get_rand(shape, dtype) ref_res = x_nd.asnumpy() + y_nd.asnumpy() - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): exe1 = relay.create_executor("graph", ctx=ctx, target=target) exe2 = relay.create_executor("debug", ctx=ctx, target=target) res1 = exe1.evaluate(new_add)(x_nd, y_nd) @@ -214,6 +216,7 @@ def transform_function(self, func, mod, ctx): assert tvm.ir.structural_equal(mod["main"], mod2["main"]) +@gpu def test_function_pass(): shape = (10, ) dtype = 'float32' @@ -271,7 +274,7 @@ def test_pass_run(): # Execute the add function. x_nd = get_rand(shape, dtype) ref_res = np.log(x_nd.asnumpy() * 2) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): exe1 = relay.create_executor("graph", ctx=ctx, target=target) exe2 = relay.create_executor("debug", ctx=ctx, target=target) res1 = exe1.evaluate(new_log)(x_nd) @@ -314,6 +317,7 @@ def test_pass_info(): assert info.name == "xyz" +@gpu def test_sequential_pass(): shape = (10, ) dtype = 'float32' @@ -433,7 +437,7 @@ def test_multiple_passes(): x_nd = get_rand(shape, dtype) y_nd = get_rand(shape, dtype) ref_res = np.subtract(x_nd.asnumpy() * 2, y_nd.asnumpy() * 2) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): exe1 = relay.create_executor("graph", ctx=ctx, target=target) exe2 = relay.create_executor("debug", ctx=ctx, target=target) res1 = exe1.evaluate(new_sub)(x_nd, y_nd) @@ -444,7 +448,7 @@ def test_multiple_passes(): # Execute the updated abs function. 
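
For reference, the executor loop that the ctx_list() -> enabled_devices() rewrites above and below converge on looks like this sketch (illustrative only, not part of the patch; it assumes enabled_devices() yields (target, ctx) pairs filtered by TVM_TEST_DEVICES, and func, args, expected are placeholder names):

    # Illustrative sketch of the shared test pattern; enabled_devices() is
    # assumed to yield (target, ctx) pairs for every device that is both
    # compiled into TVM and listed in TVM_TEST_DEVICES.
    import tvm
    import tvm.testing
    from tvm import relay

    def check_on_enabled_devices(func, args, expected):
        for target, ctx in tvm.testing.enabled_devices():
            for kind in ["graph", "debug"]:
                intrp = relay.create_executor(kind, ctx=ctx, target=target)
                op_res = intrp.evaluate(func)(*args)
                tvm.testing.assert_allclose(op_res.asnumpy(), expected, rtol=1e-5)
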
x_nd = get_rand((5, 10), dtype) ref_res = np.abs(x_nd.asnumpy() * 2) - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): exe1 = relay.create_executor("graph", ctx=ctx, target=target) exe2 = relay.create_executor("debug", ctx=ctx, target=target) res1 = exe1.evaluate(new_abs)(x_nd) diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py index e96d36258c678..44281424c087f 100644 --- a/tests/python/relay/test_vm.py +++ b/tests/python/relay/test_vm.py @@ -21,10 +21,10 @@ from tvm import runtime from tvm import relay from tvm.relay.scope_builder import ScopeBuilder -from tvm.relay.testing.config import ctx_list from tvm.relay.prelude import Prelude from tvm.relay.loops import while_loop from tvm.relay import testing +from tvm.testing import enabled_devices, gpu def check_result(args, expected_result, mod=None): """ @@ -41,7 +41,7 @@ def check_result(args, expected_result, mod=None): """ # TODO(@zhiics, @icemelon9): Disable the gpu test for now until the heterogeneous support # is ready - for target, ctx in ctx_list(): + for target, ctx in enabled_devices(): if "cuda" in target: continue vm = relay.create_executor('vm', ctx=ctx, target=target, mod=mod) @@ -91,6 +91,7 @@ def test_split_no_fuse(): res = veval(f, x_data) tvm.testing.assert_allclose(res.asnumpy(), np.split(x_data, 3, axis=0)[0]) +@gpu def test_id(): x = relay.var('x', shape=(10, 10), dtype='float64') f = relay.Function([x], x) @@ -99,6 +100,7 @@ def test_id(): mod["main"] = f check_result([x_data], x_data, mod=mod) +@gpu def test_op(): x = relay.var('x', shape=(10, 10)) f = relay.Function([x], x + x) @@ -111,6 +113,7 @@ def any(x): x = relay.op.nn.batch_flatten(x) return relay.op.min(x, axis=[0, 1]) +@gpu def test_cond(): x = relay.var('x', shape=(10, 10)) y = relay.var('y', shape=(10, 10)) @@ -127,6 +130,7 @@ def test_cond(): # diff check_result([x_data, y_data], False, mod=mod) +@gpu def test_simple_if(): x = relay.var('x', shape=(10, 10)) y = relay.var('y', shape=(10, 10)) @@ -162,6 +166,7 @@ def test_multiple_ifs(): res = vmobj_to_list(vm.evaluate()(False)) assert(res == [1, 0]) +@gpu def test_simple_call(): mod = tvm.IRModule({}) sum_up = relay.GlobalVar('sum_up') @@ -175,6 +180,7 @@ def test_simple_call(): mod["main"] = relay.Function([iarg], sum_up(iarg)) check_result([i_data], i_data, mod=mod) +@gpu def test_count_loop(): mod = tvm.IRModule({}) sum_up = relay.GlobalVar('sum_up') @@ -195,6 +201,7 @@ def test_count_loop(): tvm.testing.assert_allclose(result.asnumpy(), i_data) check_result([i_data], i_data, mod=mod) +@gpu def test_sum_loop(): mod = tvm.IRModule({}) sum_up = relay.GlobalVar('sum_up') @@ -217,6 +224,7 @@ def test_sum_loop(): mod["main"] = relay.Function([iarg, aarg], sum_up(iarg, aarg)) check_result([i_data, accum_data], sum(range(1, loop_bound + 1)), mod=mod) +@gpu def test_tuple_fst(): ttype = relay.TupleType([relay.TensorType((1,)), relay.TensorType((10,))]) tup = relay.var('tup', type_annotation=ttype) @@ -227,6 +235,7 @@ def test_tuple_fst(): mod["main"] = f check_result([(i_data, j_data)], i_data, mod=mod) +@gpu def test_tuple_second(): ttype = relay.TupleType([relay.TensorType((1,)), relay.TensorType((10,))]) tup = relay.var('tup', type_annotation=ttype) @@ -259,6 +268,7 @@ def test_list_constructor(): obj = vmobj_to_list(result) tvm.testing.assert_allclose(obj, np.array([3,2,1])) +@gpu def test_let_tensor(): sb = relay.ScopeBuilder() shape = (1,) @@ -277,6 +287,7 @@ def test_let_tensor(): mod["main"] = f check_result([x_data], x_data + 42.0, mod=mod) +@gpu def 
test_let_scalar(): sb = relay.ScopeBuilder() @@ -545,6 +556,7 @@ def test_closure(): res = veval(main) tvm.testing.assert_allclose(res.asnumpy(), 3.0) +@gpu def test_add_op_scalar(): """ test_add_op_scalar: @@ -561,6 +573,7 @@ def test_add_op_scalar(): mod["main"] = func check_result([x_data, y_data], x_data + y_data, mod=mod) +@gpu def test_add_op_tensor(): """ test_add_op_tensor: @@ -577,6 +590,7 @@ def test_add_op_tensor(): mod["main"] = func check_result([x_data, y_data], x_data + y_data, mod=mod) +@gpu def test_add_op_broadcast(): """ test_add_op_broadcast: @@ -598,6 +612,7 @@ def test_vm_optimize(): comp = relay.vm.VMCompiler() opt_mod, _ = comp.optimize(mod, "llvm", params) +@gpu def test_loop_free_var(): x = relay.var('x', shape=(), dtype='int32') i = relay.var('i', shape=(), dtype='int32') @@ -624,6 +639,7 @@ def body_with_free_var(i, acc): mod["main"] = relay.Function(relay.analysis.free_vars(ret), ret) check_result(args, expected, mod=mod) +@gpu def test_vm_reshape_tensor(): x_np = np.random.uniform(size=(8, 16)).astype("float32") x = relay.var("x", shape=(8, 16), dtype="float32") diff --git a/tests/python/topi/python/common.py b/tests/python/topi/python/common.py index 735072c1ca4d8..e9aa3fb0da66a 100644 --- a/tests/python/topi/python/common.py +++ b/tests/python/topi/python/common.py @@ -16,22 +16,9 @@ # under the License. """Common utility for topi test""" -import tvm -from tvm import te from tvm import autotvm from tvm.autotvm.task.space import FallbackConfigEntity -from tvm import topi - -def get_all_backend(): - """return all supported target - - Returns - ------- - targets: list - A list of all supported targets - """ - return ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx', - 'llvm -device=arm_cpu', 'opencl -device=mali', 'aocl_sw_emu'] +from tvm.testing import device_enabled, enabled_devices class Int8Fallback(autotvm.FallbackContext): def _query_inside(self, target, workload): diff --git a/tests/python/topi/python/test_fifo_buffer.py b/tests/python/topi/python/test_fifo_buffer.py index 9af30f9dc779e..38d121ceaa30a 100644 --- a/tests/python/topi/python/test_fifo_buffer.py +++ b/tests/python/topi/python/test_fifo_buffer.py @@ -23,7 +23,7 @@ import numpy as np from tvm.contrib.pickle_memoize import memoize -from common import get_all_backend +from common import enabled_devices def verify_fifo_buffer(buffer_shape, data_shape, axis, dtype='float32'): buffer = te.placeholder(buffer_shape, name='buffer', dtype=dtype) @@ -64,7 +64,7 @@ def check_device(device): f(data_tvm, buffer_tvm, out_tvm) tvm.testing.assert_allclose(out_tvm.asnumpy(), out_np) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) def verify_conv1d_integration(): @@ -184,7 +184,7 @@ def check_device(device): tvm.testing.assert_allclose(output_window_tvm.asnumpy(), output_window_ref_tvm.asnumpy()) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) def test_fifo_buffer(): diff --git a/tests/python/topi/python/test_topi_batch_matmul.py b/tests/python/topi/python/test_topi_batch_matmul.py index c8cddb661dca4..d4d5621884d8f 100644 --- a/tests/python/topi/python/test_topi_batch_matmul.py +++ b/tests/python/topi/python/test_topi_batch_matmul.py @@ -23,7 +23,7 @@ from tvm.topi.util import get_const_tuple from tvm.contrib.pickle_memoize import memoize -from common import get_all_backend +from tvm.testing import enabled_devices, gpu _batch_matmul_implement = { "generic": (topi.nn.batch_matmul, 
topi.generic.schedule_batch_matmul), @@ -48,9 +48,6 @@ def get_ref_data(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) with tvm.target.create(device): fcompute, fschedule = tvm.topi.testing.dispatch(device, _batch_matmul_implement) @@ -63,9 +60,10 @@ def check_device(device): f(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) +@gpu def test_batch_matmul(): verify_batch_matmul(1, 16, 16, 32) verify_batch_matmul(5, 16, 16, 32) diff --git a/tests/python/topi/python/test_topi_broadcast.py b/tests/python/topi/python/test_topi_broadcast.py index 4ac985e057b9c..e1255309884ec 100644 --- a/tests/python/topi/python/test_topi_broadcast.py +++ b/tests/python/topi/python/test_topi_broadcast.py @@ -20,7 +20,7 @@ from tvm import te from tvm import topi import tvm.topi.testing -from common import get_all_backend +from common import enabled_devices def verify_broadcast_to_ele(in_shape, out_shape, fbcast): @@ -44,7 +44,7 @@ def check_device(device): foo(data_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for target in get_all_backend(): + for target, ctx in enabled_devices(): check_device(target) check_device("sdaccel") @@ -94,7 +94,7 @@ def check_device(device): foo(lhs_nd, rhs_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4) - for target in get_all_backend(): + for target, ctx in enabled_devices(): check_device(target) check_device("sdaccel") @@ -256,7 +256,7 @@ def check_device(device): foo(data_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) test_apply(topi.logical_not, "logical_not", np.logical_not, np.array([True, False, 0, 1])) @@ -297,7 +297,7 @@ def check_device(device): foo(data_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) test_apply(topi.bitwise_not, "bitwise_not", np.bitwise_not, ()) @@ -339,7 +339,7 @@ def check_device(device): foo(lhs_nd, rhs_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) test_apply(topi.logical_and, "logical_and", np.logical_and, True, False) diff --git a/tests/python/topi/python/test_topi_clip.py b/tests/python/topi/python/test_topi_clip.py index b3d95dd2e07a8..a7202fcae72c1 100644 --- a/tests/python/topi/python/test_topi_clip.py +++ b/tests/python/topi/python/test_topi_clip.py @@ -23,7 +23,7 @@ from tvm.topi.util import get_const_tuple from tvm.contrib.pickle_memoize import memoize -from common import get_all_backend +from common import enabled_devices def verify_clip(N, a_min, a_max, dtype): A = te.placeholder((N, N), dtype=dtype, name='A') @@ -53,7 +53,7 @@ def check_device(device): f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) def test_clip(): diff --git a/tests/python/topi/python/test_topi_conv1d.py b/tests/python/topi/python/test_topi_conv1d.py index 49f2cd1125a3b..ad563481568f9 100644 --- a/tests/python/topi/python/test_topi_conv1d.py +++ b/tests/python/topi/python/test_topi_conv1d.py @@ -23,7 +23,9 @@ import 
tvm.topi.testing from tvm.contrib.pickle_memoize import memoize from tvm.topi.util import get_const_tuple -from common import get_all_backend +from common import enabled_devices +import pytest +pytestmark = pytest.mark.gpu _conv1d_ncw_implement = { @@ -95,7 +97,7 @@ def check_device(device): func(a, w, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) diff --git a/tests/python/topi/python/test_topi_conv1d_transpose_ncw.py b/tests/python/topi/python/test_topi_conv1d_transpose_ncw.py index 7efa96d807b63..702bdffcc2f60 100644 --- a/tests/python/topi/python/test_topi_conv1d_transpose_ncw.py +++ b/tests/python/topi/python/test_topi_conv1d_transpose_ncw.py @@ -23,7 +23,7 @@ import tvm.topi.testing from tvm.contrib.pickle_memoize import memoize from tvm.topi.util import get_const_tuple -from common import get_all_backend +from tvm.testing import enabled_devices, gpu _conv1d_transpose_ncw_implement = { "generic": (topi.nn.conv1d_transpose_ncw, topi.generic.schedule_conv1d_transpose_ncw), @@ -51,9 +51,6 @@ def get_ref_data(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return with tvm.target.create(device): fcompute, fschedule = tvm.topi.testing.dispatch(device, _conv1d_transpose_ncw_implement) B = fcompute(A, W, stride, padding, A.dtype, output_padding) @@ -72,10 +69,11 @@ def check_device(device): tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) +@gpu def test_conv1d_transpose_ncw(): verify_conv1d_transpose_ncw(1, 3, 224, 32, 5, 1, 0, (0,)) verify_conv1d_transpose_ncw(1, 3, 224, 32, 7, 1, 2, (0,)) diff --git a/tests/python/topi/python/test_topi_conv2d_NCHWc.py b/tests/python/topi/python/test_topi_conv2d_NCHWc.py index 95d5633bc1f8a..85bc50bb95757 100644 --- a/tests/python/topi/python/test_topi_conv2d_NCHWc.py +++ b/tests/python/topi/python/test_topi_conv2d_NCHWc.py @@ -26,7 +26,7 @@ from tvm.topi.nn.util import get_pad_tuple from tvm.topi.util import get_const_tuple -from common import get_all_backend +from common import enabled_devices def _transform_data(data, bn): # NCHW -> NCHW[x]c diff --git a/tests/python/topi/python/test_topi_conv2d_hwcn.py b/tests/python/topi/python/test_topi_conv2d_hwcn.py index 20b1b4dfa8e5d..030809e4ca078 100644 --- a/tests/python/topi/python/test_topi_conv2d_hwcn.py +++ b/tests/python/topi/python/test_topi_conv2d_hwcn.py @@ -23,6 +23,7 @@ import tvm.topi.testing from tvm.contrib.pickle_memoize import memoize from tvm.topi.util import get_const_tuple +from tvm.testing import gpu, device_enabled, requires_gpu _conv2d_hwcn_implement = { @@ -58,7 +59,7 @@ def get_ref_data(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) @@ -94,6 +95,7 @@ def check_device(device): check_device(device) +@requires_gpu def test_conv2d_hwcn(): verify_conv2d_hwcn(1, 256, 32, 256, 3, 1, "SAME") verify_conv2d_hwcn(1, 256, 32, 256, 3, 1, "SAME") diff --git a/tests/python/topi/python/test_topi_conv2d_int8.py b/tests/python/topi/python/test_topi_conv2d_int8.py index 615dc515b1f42..4ab5138cf783b 100644 --- a/tests/python/topi/python/test_topi_conv2d_int8.py +++ 
b/tests/python/topi/python/test_topi_conv2d_int8.py @@ -28,7 +28,8 @@ from tvm.topi.util import get_const_tuple from tvm.topi.arm_cpu.conv2d_gemm import is_aarch64_arm -from common import get_all_backend, Int8Fallback +from common import enabled_devices, Int8Fallback +from tvm.testing import device_enabled, requires_gpu, requires_cuda def compile_conv2d_NHWC_gemm_int8_arm(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False): @@ -45,7 +46,7 @@ def compile_conv2d_NHWC_gemm_int8_arm(batch, in_channel, in_size, num_filter, ke device = "llvm --device arm_cpu --mtriple aarch64-linux-gnu" ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return print("Compiling on arm AArch64 target: %s" % device) @@ -128,7 +129,7 @@ def get_ref_data(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) @@ -223,7 +224,7 @@ def get_ref_data(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return if device == "cuda" and not tvm.contrib.nvcc.have_int8(ctx.compute_version): @@ -293,7 +294,7 @@ def get_ref_data(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return if device == "cuda" and not tvm.contrib.nvcc.have_int8(ctx.compute_version): @@ -327,6 +328,8 @@ def check_device(device): check_device(device) +@requires_gpu +@requires_cuda def test_conv2d_nchw(): with Int8Fallback(): # ResNet18 workloads where channels in / out are multiple of oc_block_factor diff --git a/tests/python/topi/python/test_topi_conv2d_nchw.py b/tests/python/topi/python/test_topi_conv2d_nchw.py index dcdf0a776099e..3caaa0931e33d 100644 --- a/tests/python/topi/python/test_topi_conv2d_nchw.py +++ b/tests/python/topi/python/test_topi_conv2d_nchw.py @@ -26,7 +26,7 @@ from tvm.topi.nn.util import get_pad_tuple from tvm.topi.util import get_const_tuple -from common import get_all_backend +from tvm.testing import enabled_devices, gpu, device_enabled def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False,\ use_cudnn=False): @@ -63,7 +63,7 @@ def get_ref_data(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) @@ -97,7 +97,7 @@ def check_device(device): func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-4) - for device in get_all_backend(): + for device in enabled_devices(): with autotvm.tophub.context(device): # load tophub pre-tuned parameters check_device(device) @@ -105,6 +105,7 @@ def check_device(device): check_device("cuda -model=unknown -libs=cudnn") +@gpu def test_conv2d_nchw(): # ResNet18 workloads verify_conv2d_nchw(1, 3, 224, 64, 7, 2, 3) diff --git a/tests/python/topi/python/test_topi_conv2d_nhwc.py b/tests/python/topi/python/test_topi_conv2d_nhwc.py index 7750f235c6c51..a99abd360ff43 100644 --- a/tests/python/topi/python/test_topi_conv2d_nhwc.py +++ b/tests/python/topi/python/test_topi_conv2d_nhwc.py @@ -23,7 +23,7 @@ import tvm.topi.testing from tvm.contrib.pickle_memoize import memoize 
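
The guard that replaces the old ctx.exist check takes the same shape in every topi test touched below; a minimal sketch, assuming device_enabled() accepts a target string and honors TVM_TEST_DEVICES (the schedule-building body is elided):

    import tvm
    from tvm.testing import device_enabled

    def check_device(device):
        # Skip quietly when the target is not in TVM_TEST_DEVICES or is not
        # compiled into this TVM build.
        if not device_enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        ctx = tvm.context(device, 0)
        print("Running on target: %s" % device)
        # ... build the schedule under tvm.target.create(device) and run on ctx ...
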
from tvm.topi.util import get_const_tuple - +from tvm.testing import gpu, device_enabled _conv2d_nhwc_implement = { @@ -56,7 +56,7 @@ def get_ref_data(): a_np, w_np, b_np = get_ref_data() def check_device(device): - if not tvm.runtime.enabled(device): + if not device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) @@ -76,6 +76,7 @@ def check_device(device): check_device(device) +@gpu def test_conv2d_nhwc(): verify_conv2d_nhwc(1, 256, 32, 256, 3, 1, "SAME") verify_conv2d_nhwc(4, 128, 16, 128, 5, 2, "SAME") diff --git a/tests/python/topi/python/test_topi_conv2d_nhwc_tensorcore.py b/tests/python/topi/python/test_topi_conv2d_nhwc_tensorcore.py index 8375df34323cf..8812338b1ac19 100644 --- a/tests/python/topi/python/test_topi_conv2d_nhwc_tensorcore.py +++ b/tests/python/topi/python/test_topi_conv2d_nhwc_tensorcore.py @@ -26,6 +26,7 @@ from tvm.contrib import nvcc from tvm.topi.nn.util import get_pad_tuple from tvm.topi.util import get_const_tuple +from tvm.testing import device_enabled, requires_cuda, requires_gpu _conv2d_nhwc_tensorcore_implement = { @@ -70,7 +71,7 @@ def get_ref_data(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return if not nvcc.have_tensorcore(ctx.compute_version): @@ -105,6 +106,8 @@ def check_device(device): check_device(devices) +@requires_cuda +@requires_gpu def test_conv2d_nhwc_tensorcore(): """Test the conv2d with tensorcore for nhwc layout""" verify_conv2d_nhwc(16, 16, 14, 16, 3, 1, 1) diff --git a/tests/python/topi/python/test_topi_conv2d_nhwc_winograd.py b/tests/python/topi/python/test_topi_conv2d_nhwc_winograd.py index 00b40bfbe826e..c73f4a53e9c74 100644 --- a/tests/python/topi/python/test_topi_conv2d_nhwc_winograd.py +++ b/tests/python/topi/python/test_topi_conv2d_nhwc_winograd.py @@ -24,9 +24,9 @@ import tvm.topi.testing from tvm import te from tvm.contrib.pickle_memoize import memoize -from tvm.contrib import nvcc from tvm.topi.nn.util import get_pad_tuple from tvm.topi.util import get_const_tuple +from tvm.testing import requires_gpu, requires_cuda, requires_tensorcore _conv2d_nhwc_winograd_tensorcore = { @@ -78,9 +78,6 @@ def get_ref_data(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) with tvm.target.create(device): if bgemm == "direct": @@ -114,6 +111,8 @@ def check_device(device): check_device(devices) +@requires_cuda +@requires_gpu def test_conv2d_nhwc_winograd_direct(): """Test the conv2d with winograd for nhwc layout""" # resnet 18 workloads @@ -135,13 +134,11 @@ def test_conv2d_nhwc_winograd_direct(): verify_conv2d_nhwc(2, 48, 56, 48, 3, 1, "SAME", add_relu=True, add_bias=True) verify_conv2d_nhwc(1, 48, 35, 48, 5, 1, "VALID") + +@requires_cuda +@requires_tensorcore def test_conv2d_nhwc_winograd_tensorcore(): """Test the conv2d with winograd for nhwc layout""" - if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"): - print("skip because cuda is not enabled..") - return - if not nvcc.have_tensorcore(tvm.gpu(0).compute_version): - return verify_conv2d_nhwc(8, 64, 56, 64, 3, 1, 1, bgemm="tensorcore") verify_conv2d_nhwc(8, 128, 28, 128, 3, 1, 1, bgemm="tensorcore") verify_conv2d_nhwc(8, 256, 14, 256, 3, 1, 1, bgemm="tensorcore") diff --git a/tests/python/topi/python/test_topi_conv2d_transpose_nchw.py b/tests/python/topi/python/test_topi_conv2d_transpose_nchw.py
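
The decorators attached in these hunks follow the split described in the commit message; a sketch of the intended semantics (test bodies are placeholders, and the exact skip behavior is an assumption based on that description):

    import tvm.testing

    @tvm.testing.gpu
    def test_may_use_gpu():
        # Marked as possibly using the GPU: collected on GPU nodes, but its
        # LLVM targets still run on CPU-only nodes.
        pass

    @tvm.testing.requires_cuda
    def test_needs_cuda():
        # Skipped entirely when CUDA is unavailable.
        pass
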
index 6c43b2d980cf1..284c24a35747c 100644 --- a/tests/python/topi/python/test_topi_conv2d_transpose_nchw.py +++ b/tests/python/topi/python/test_topi_conv2d_transpose_nchw.py @@ -23,7 +23,7 @@ from tvm.contrib.pickle_memoize import memoize from tvm.topi.util import get_const_tuple -from common import get_all_backend +from tvm.testing import enabled_devices, gpu _conv2d_transpose_nchw_implement = { @@ -59,9 +59,6 @@ def get_ref_data(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) with tvm.target.create(device): fcompute, fschedule = tvm.topi.testing.dispatch(device, _conv2d_transpose_nchw_implement) @@ -83,10 +80,11 @@ def check_device(device): func2(a, w, c) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) +@gpu def test_conv2d_transpose_nchw(): verify_conv2d_transpose_nchw(1, 3, (224, 224), 1, (1, 1), (1, 1), (0, 0, 0, 0), (0, 0)) verify_conv2d_transpose_nchw(1, 3, (224, 224), 32, (3, 3), (1, 1), (0, 0, 0, 0), (0, 0)) diff --git a/tests/python/topi/python/test_topi_conv2d_winograd.py b/tests/python/topi/python/test_topi_conv2d_winograd.py index 800aaea5363a9..c6ca4bae7d4c3 100644 --- a/tests/python/topi/python/test_topi_conv2d_winograd.py +++ b/tests/python/topi/python/test_topi_conv2d_winograd.py @@ -26,6 +26,7 @@ from tvm.contrib.pickle_memoize import memoize from tvm.topi.nn.util import get_pad_tuple from tvm.topi.util import get_const_tuple +from tvm.testing import device_enabled, gpu _conv2d_nchw_winograd_implement = { @@ -70,7 +71,7 @@ def get_ref_data(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) @@ -102,6 +103,7 @@ def check_device(device): check_device(device) +@gpu def test_conv2d_nchw(): # inception v3 workloads verify_conv2d_nchw(1, 128, 17, 192, 7, 1, 3, devices=['cuda']) diff --git a/tests/python/topi/python/test_topi_conv3d_ncdhw.py b/tests/python/topi/python/test_topi_conv3d_ncdhw.py index ad2b93ce00ce2..e1bd30b8ae1b1 100644 --- a/tests/python/topi/python/test_topi_conv3d_ncdhw.py +++ b/tests/python/topi/python/test_topi_conv3d_ncdhw.py @@ -26,7 +26,8 @@ from tvm.topi.nn.util import get_pad_tuple3d from tvm.topi.util import get_const_tuple -from common import get_all_backend +from common import enabled_devices +from tvm.testing import gpu _conv3d_ncdhw_implement = { "generic": (topi.nn.conv3d_ncdhw, topi.generic.schedule_conv3d_ncdhw), @@ -68,9 +69,6 @@ def get_ref_data(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) fcompute, fschedule = tvm.topi.testing.dispatch(device, _conv3d_ncdhw_implement) with tvm.target.create(device): @@ -94,10 +92,11 @@ def check_device(device): func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-4) - for device in get_all_backend(): + for device in enabled_devices(): with autotvm.tophub.context(device): # load tophub pre-tuned parameters check_device(device) +@gpu def test_conv3d_ncdhw(): #3DCNN workloads verify_conv3d_ncdhw(1, 32, 32, 5, 1, 1, 0) diff --git a/tests/python/topi/python/test_topi_conv3d_ndhwc.py 
b/tests/python/topi/python/test_topi_conv3d_ndhwc.py index b80f96bfb26d2..ec0ce1ba7c360 100644 --- a/tests/python/topi/python/test_topi_conv3d_ndhwc.py +++ b/tests/python/topi/python/test_topi_conv3d_ndhwc.py @@ -24,7 +24,8 @@ from tvm.contrib.pickle_memoize import memoize from tvm.topi.util import get_const_tuple -from common import get_all_backend +from common import enabled_devices +from tvm.testing import gpu _conv3d_ndhwc_implement = { "generic": (topi.nn.conv3d_ndhwc, topi.generic.schedule_conv3d_ndhwc), @@ -60,9 +61,6 @@ def get_ref_data(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) fcompute, fschedule = tvm.topi.testing.dispatch(device, _conv3d_ndhwc_implement) with tvm.target.create(device): @@ -76,10 +74,11 @@ def check_device(device): func(a, w, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) +@gpu def test_conv3d_ndhwc(): verify_conv3d_ndhwc(1, 16, 32, 16, 3, 1, "SAME") verify_conv3d_ndhwc(4, 32, 16, 32, 5, 2, "SAME") diff --git a/tests/python/topi/python/test_topi_conv3d_ndhwc_tensorcore.py b/tests/python/topi/python/test_topi_conv3d_ndhwc_tensorcore.py index 2adc34864c131..43aaee72fed6d 100644 --- a/tests/python/topi/python/test_topi_conv3d_ndhwc_tensorcore.py +++ b/tests/python/topi/python/test_topi_conv3d_ndhwc_tensorcore.py @@ -26,6 +26,7 @@ from tvm.contrib import nvcc from tvm.topi.nn.util import get_pad_tuple3d from tvm.topi.util import get_const_tuple +from tvm.testing import requires_tensorcore, requires_cuda _conv3d_ndhwc_tensorcore_implement = { @@ -71,12 +72,6 @@ def get_ref_data(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return - if not nvcc.have_tensorcore(ctx.compute_version): - print("skip because gpu does not support Tensor Cores") - return print("Running on target: %s" % device) with tvm.target.create(device): fcompute, fschedule = tvm.topi.testing.dispatch(device, _conv3d_ndhwc_tensorcore_implement) @@ -106,6 +101,8 @@ def check_device(device): check_device(devices) +@requires_tensorcore +@requires_cuda def test_conv3d_ndhwc_tensorcore(): """Test the conv3d with tensorcore for ndhwc layout""" verify_conv3d_ndhwc(16, 16, 14, 16, 3, 1, 1) diff --git a/tests/python/topi/python/test_topi_conv3d_transpose_ncdhw.py b/tests/python/topi/python/test_topi_conv3d_transpose_ncdhw.py index 8e9812043ce93..debd46b425344 100644 --- a/tests/python/topi/python/test_topi_conv3d_transpose_ncdhw.py +++ b/tests/python/topi/python/test_topi_conv3d_transpose_ncdhw.py @@ -23,7 +23,8 @@ from tvm.contrib.pickle_memoize import memoize from tvm.topi.util import get_const_tuple -from common import get_all_backend +from common import enabled_devices +from tvm.testing import gpu _conv3d_transpose_ncdhw_implement = { @@ -57,9 +58,6 @@ def get_ref_data(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) with tvm.target.create(device): fcompute, fschedule = tvm.topi.testing.dispatch(device, _conv3d_transpose_ncdhw_implement) @@ -81,10 +79,11 @@ def check_device(device): func2(a, w, c) tvm.testing.assert_allclose(b.asnumpy(), b_np, atol=1e-4, rtol=1e-4) tvm.testing.assert_allclose(c.asnumpy(), c_np, atol=1e-4, rtol=1e-4) - for device in get_all_backend(): + for
device in enabled_devices(): check_device(device) +@gpu def test_conv3d_transpose_ncdhw(): verify_conv3d_transpose_ncdhw(1, 3, (24, 24, 24), 1, (1, 1, 1), (1, 1, 1), (0, 0, 0, 0, 0, 0), (0, 0, 0)) verify_conv3d_transpose_ncdhw(1, 3, (24, 24, 24), 2, (3, 3, 3), (1, 1, 1), (0, 0, 0, 0, 0, 0), (0, 0, 0)) diff --git a/tests/python/topi/python/test_topi_conv3d_winograd.py b/tests/python/topi/python/test_topi_conv3d_winograd.py index 6e261305b9a4b..b5828c8cb979d 100644 --- a/tests/python/topi/python/test_topi_conv3d_winograd.py +++ b/tests/python/topi/python/test_topi_conv3d_winograd.py @@ -26,7 +26,8 @@ from tvm.topi.nn.util import get_pad_tuple3d from tvm.topi.util import get_const_tuple -from common import get_all_backend +from common import enabled_devices +from tvm.testing import requires_gpu, device_enabled _conv3d_ncdhw_implement = { "gpu": (topi.cuda.conv3d_ncdhw_winograd, topi.cuda.schedule_conv3d_ncdhw_winograd), @@ -78,7 +79,7 @@ def get_ref_data(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) @@ -117,6 +118,7 @@ def check_device(device): check_device(device) +@requires_gpu def test_conv3d_ncdhw(): # Try without depth transformation #3DCNN workloads diff --git a/tests/python/topi/python/test_topi_correlation.py b/tests/python/topi/python/test_topi_correlation.py index f5eb51c8a6aff..7408944d297e8 100644 --- a/tests/python/topi/python/test_topi_correlation.py +++ b/tests/python/topi/python/test_topi_correlation.py @@ -24,7 +24,10 @@ from tvm.contrib.pickle_memoize import memoize from tvm.topi.util import get_const_tuple -from common import get_all_backend +from common import enabled_devices +from tvm.testing import device_enabled +import pytest +pytestmark = pytest.mark.gpu _correlation_implement = { @@ -54,9 +57,6 @@ def get_ref_data(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) fcompute, fschedule = tvm.topi.testing.dispatch( device, _correlation_implement) @@ -72,7 +72,7 @@ def check_device(device): func(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) diff --git a/tests/python/topi/python/test_topi_deformable_conv2d.py b/tests/python/topi/python/test_topi_deformable_conv2d.py index a2a01fc7ea1fb..4fb4a763d8338 100644 --- a/tests/python/topi/python/test_topi_deformable_conv2d.py +++ b/tests/python/topi/python/test_topi_deformable_conv2d.py @@ -23,7 +23,7 @@ from tvm.contrib.pickle_memoize import memoize from tvm.topi.util import get_const_tuple -from common import get_all_backend +from tvm.testing import device_enabled, gpu _deformable_conv2d_implement = { @@ -62,7 +62,7 @@ def get_ref_data(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) @@ -85,6 +85,7 @@ def check_device(device): check_device(device) +@gpu def test_deformable_conv2d_nchw(): verify_deformable_conv2d_nchw(1, 16, 7, 16, 1, 1, 0, deformable_groups=4) verify_deformable_conv2d_nchw(1, 16, 7, 16, 3, 1, 1, dilation=2, deformable_groups=4) diff --git a/tests/python/topi/python/test_topi_dense.py b/tests/python/topi/python/test_topi_dense.py index 517cb4d3ecc64..f050bab3b1052 
100644 --- a/tests/python/topi/python/test_topi_dense.py +++ b/tests/python/topi/python/test_topi_dense.py @@ -23,7 +23,8 @@ from tvm.topi.util import get_const_tuple from tvm.contrib.pickle_memoize import memoize -from common import get_all_backend, Int8Fallback +from common import Int8Fallback +from tvm.testing import gpu, enabled_devices, requires_cuda, requires_gpu _dense_implement = { "generic": [(topi.nn.dense, topi.generic.schedule_dense)], @@ -59,9 +60,6 @@ def get_ref_data(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) for fcompute, fschedule in tvm.topi.testing.dispatch(device, _dense_implement): with tvm.target.create(device): @@ -76,7 +74,7 @@ def check_device(device): f(a, b, c, d) tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) @@ -104,9 +102,6 @@ def get_ref_data(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return if device == "cuda" and not tvm.contrib.nvcc.have_int8(ctx.compute_version): print("Skip because int8 intrinsics are not available") return @@ -128,6 +123,7 @@ def check_device(device): check_device(device) +@gpu def test_dense(): verify_dense(1, 1024, 1000, use_bias=True) verify_dense(1, 1024, 1000, use_bias=False) @@ -136,6 +132,8 @@ def test_dense(): verify_dense(128, 1024, 1000, use_bias=True) +@requires_cuda +@requires_gpu def test_dense_int8(): with Int8Fallback(): verify_dense_int8(2, 1024, 1000, use_bias=True) diff --git a/tests/python/topi/python/test_topi_dense_tensorcore.py b/tests/python/topi/python/test_topi_dense_tensorcore.py index 8a645e6b45ca1..3aff7337b96ab 100644 --- a/tests/python/topi/python/test_topi_dense_tensorcore.py +++ b/tests/python/topi/python/test_topi_dense_tensorcore.py @@ -23,7 +23,7 @@ from tvm.topi.util import get_const_tuple from tvm import te from tvm.contrib.pickle_memoize import memoize -from tvm.contrib import nvcc +from tvm.testing import requires_tensorcore _dense_implement = { @@ -53,12 +53,6 @@ def get_ref_data(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return - if not nvcc.have_tensorcore(ctx.compute_version): - print("skip because gpu does not support Tensor Cores") - return print("Running on target: %s" % device) for fcompute, fschedule in tvm.topi.testing.dispatch(device, _dense_implement): with tvm.target.create(device): @@ -74,10 +68,10 @@ def check_device(device): tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-3) - for device in ['cuda']: - check_device(device) + check_device('cuda') +@requires_tensorcore def test_dense_tensorcore(): """Test cases""" verify_dense(8, 16, 32, use_bias=True) diff --git a/tests/python/topi/python/test_topi_depth_to_space.py b/tests/python/topi/python/test_topi_depth_to_space.py index 380f656bf5998..f5e23b8d6ed56 100644 --- a/tests/python/topi/python/test_topi_depth_to_space.py +++ b/tests/python/topi/python/test_topi_depth_to_space.py @@ -21,7 +21,7 @@ from tvm import topi import tvm.topi.testing -from common import get_all_backend +from common import enabled_devices def verify_depth_to_space(block_size, batch, in_channel, in_height, in_width, layout='NCHW', mode='DCR'): @@ -64,7 +64,7 @@ def check_device(device): f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3) -
for device in get_all_backend(): + for device in enabled_devices(): check_device(device) diff --git a/tests/python/topi/python/test_topi_depthwise_conv2d.py b/tests/python/topi/python/test_topi_depthwise_conv2d.py index 5497e1124e704..d5f781b317b5c 100644 --- a/tests/python/topi/python/test_topi_depthwise_conv2d.py +++ b/tests/python/topi/python/test_topi_depthwise_conv2d.py @@ -24,7 +24,8 @@ from tvm.topi.nn.util import get_pad_tuple from tvm.contrib.pickle_memoize import memoize -from common import get_all_backend +from common import enabled_devices +from tvm.testing import device_enabled, gpu _depthwise_conv2d_nchw_implement = { "generic": [(topi.nn.depthwise_conv2d_nchw, topi.generic.schedule_depthwise_conv2d_nchw)], @@ -69,7 +70,7 @@ def depthwise_conv2d_with_workload_nchw(batch, in_channel, in_height, channel_mu def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) @@ -143,7 +144,7 @@ def get_ref_data(): tvm.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5) tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5) - for device in get_all_backend(): + for device in enabled_devices(): with autotvm.tophub.context(device): # load tophub pre-tuned parameters check_device(device) @@ -172,7 +173,7 @@ def depthwise_conv2d_with_workload_nhwc(batch, in_channel, in_height, channel_mu def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) @@ -243,7 +244,7 @@ def get_ref_data(): tvm.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5) tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5) - for device in get_all_backend(): + for device in enabled_devices(): with autotvm.tophub.context(device): # load tophub pre-tuned parameters check_device(device) @@ -298,7 +299,7 @@ def depthwise_conv2d_with_workload_NCHWc(batch, in_channel, in_height, channel_m def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) @@ -360,6 +361,7 @@ def get_ref_data(): check_device(device) +@gpu def test_depthwise_conv2d(): # mobilenet workloads depthwise_conv2d_with_workload_nchw(1, 32, 112, 1, 3, 1, "SAME") diff --git a/tests/python/topi/python/test_topi_depthwise_conv2d_back_input.py b/tests/python/topi/python/test_topi_depthwise_conv2d_back_input.py index ba8bfcc72a4e2..495e3d5c27525 100644 --- a/tests/python/topi/python/test_topi_depthwise_conv2d_back_input.py +++ b/tests/python/topi/python/test_topi_depthwise_conv2d_back_input.py @@ -24,6 +24,7 @@ from tvm.topi.nn.util import get_pad_tuple import tvm.topi.testing from tvm.topi.cuda.depthwise_conv2d import schedule_depthwise_conv2d_backward_input_nhwc +from tvm.testing import device_enabled, requires_gpu def verify_depthwise_conv2d_back_input(batch, in_channel, in_h, channel_multiplier, filter_h, stride_h, padding_h): @@ -51,7 +52,7 @@ def verify_depthwise_conv2d_back_input(batch, in_channel, in_h, channel_multipli def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) @@ -106,6 +107,7 @@ def 
get_ref_data(): check_device("vulkan") check_device("nvptx") +@requires_gpu def test_topi_depthwise_conv2d_backward_input_nhwc(): verify_depthwise_conv2d_back_input(16, 256, 56, 1, 3, 1, 1) verify_depthwise_conv2d_back_input(16, 256, 56, 2, 3, 1, 1) diff --git a/tests/python/topi/python/test_topi_depthwise_conv2d_back_weight.py b/tests/python/topi/python/test_topi_depthwise_conv2d_back_weight.py index 599225d0a6670..60c0875c557e1 100644 --- a/tests/python/topi/python/test_topi_depthwise_conv2d_back_weight.py +++ b/tests/python/topi/python/test_topi_depthwise_conv2d_back_weight.py @@ -24,6 +24,7 @@ from tvm.topi.util import get_const_tuple from tvm.topi.nn.util import get_pad_tuple from tvm.topi.cuda.depthwise_conv2d import schedule_depthwise_conv2d_backward_weight_nhwc +from tvm.testing import device_enabled, requires_gpu def verify_depthwise_conv2d_back_weight(batch, in_channel, in_h, channel_multiplier, filter_h, stride_h, padding_h): @@ -51,7 +52,7 @@ def verify_depthwise_conv2d_back_weight(batch, in_channel, in_h, channel_multipl def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) @@ -99,6 +100,7 @@ def get_ref_data(): check_device("vulkan") check_device("nvptx") +@requires_gpu def test_topi_depthwise_conv2d_backward_weight_nhwc(): verify_depthwise_conv2d_back_weight(16, 256, 56, 1, 3, 1, 1) verify_depthwise_conv2d_back_weight(16, 256, 56, 2, 3, 1, 1) diff --git a/tests/python/topi/python/test_topi_group_conv2d.py b/tests/python/topi/python/test_topi_group_conv2d.py index 6050d452140cf..027b37ae7fcb1 100644 --- a/tests/python/topi/python/test_topi_group_conv2d.py +++ b/tests/python/topi/python/test_topi_group_conv2d.py @@ -26,7 +26,8 @@ from tvm.contrib.pickle_memoize import memoize from tvm.topi.util import get_const_tuple -from common import get_all_backend, Int8Fallback +from common import enabled_devices, Int8Fallback +from tvm.testing import gpu, device_enabled, requires_gpu, requires_cuda _group_conv2d_nchw_implement = { @@ -71,7 +72,7 @@ def get_ref_data(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return @@ -148,7 +149,7 @@ def get_ref_data(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return if device == "cuda" and not tvm.contrib.nvcc.have_int8(ctx.compute_version): @@ -182,6 +183,7 @@ def check_device(device): check_device(device) +@gpu def test_group_conv2d_nchw(): # ResNeXt-50 workload verify_group_conv2d_nchw(1, 128, 56, 128, 3, 1, 1, 1, 32) @@ -207,6 +209,8 @@ def test_group_conv2d_nchw(): +@requires_gpu +@requires_cuda def test_group_conv2d_NCHWc_int8(): with Int8Fallback(): # ResNeXt-50 workload diff --git a/tests/python/topi/python/test_topi_group_conv2d_NCHWc_int8.py b/tests/python/topi/python/test_topi_group_conv2d_NCHWc_int8.py index 6afe44e51466c..6930d334f2213 100644 --- a/tests/python/topi/python/test_topi_group_conv2d_NCHWc_int8.py +++ b/tests/python/topi/python/test_topi_group_conv2d_NCHWc_int8.py @@ -27,7 +27,7 @@ from tvm.topi.util import get_const_tuple import pytest -from common import get_all_backend +from common import enabled_devices def _transform_data(data, bn): # NCHW -> NCHW[x]c diff --git a/tests/python/topi/python/test_topi_image.py 
b/tests/python/topi/python/test_topi_image.py index 7fce69d2300c2..8b1fa8ad00eb8 100644 --- a/tests/python/topi/python/test_topi_image.py +++ b/tests/python/topi/python/test_topi_image.py @@ -22,7 +22,7 @@ import tvm.topi.testing from tvm.contrib.pickle_memoize import memoize -from common import get_all_backend +from common import enabled_devices def verify_resize(batch, in_channel, in_height, in_width, out_height, out_width, layout='NCHW', coord_trans="align_corners", method="bilinear"): @@ -62,7 +62,7 @@ def check_device(device): tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) @@ -129,7 +129,7 @@ def check_device(device): tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) @@ -191,7 +191,7 @@ def check_device(device): tvm.testing.assert_allclose(tvm_out.asnumpy(), baseline_np, rtol=1e-3, atol=1e-3) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) boxes_1 = np.array([[.2, .3, .7, .9]], dtype="float32") @@ -240,7 +240,7 @@ def check_device(device): tvm.testing.assert_allclose( tvm_out.asnumpy(), out_np, rtol=1e-5, atol=1e-5) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) verify_affine_grid(1, (16, 32)) @@ -281,7 +281,7 @@ def check_device(device): tvm.testing.assert_allclose( tvm_out.asnumpy(), out_np, rtol=1e-5, atol=1e-5) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) verify_grid_sample((4, 4, 16, 32), (4, 2, 8, 8)) diff --git a/tests/python/topi/python/test_topi_lrn.py b/tests/python/topi/python/test_topi_lrn.py index 2d57d078407c3..6502e8b024f9e 100644 --- a/tests/python/topi/python/test_topi_lrn.py +++ b/tests/python/topi/python/test_topi_lrn.py @@ -21,6 +21,7 @@ from tvm import topi from tvm.topi.util import get_const_tuple import tvm.topi.testing +from tvm.testing import device_enabled, gpu _lrn_schedule = { "generic": topi.generic.schedule_lrn, @@ -41,7 +42,7 @@ def verify_lrn(shape, size, axis, bias, alpha, beta): b_np = tvm.topi.testing.lrn_python(a_np, size, axis, bias, alpha, beta) def check_device(device): - if not tvm.runtime.enabled(device): + if not device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) @@ -58,6 +59,7 @@ def check_device(device): for device in ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']: check_device(device) +@gpu def test_lrn(): verify_lrn((1, 3, 5, 5), 3, 1, 1.0, 1.0, 0.5) verify_lrn((1, 3, 5, 5), 3, 3, 1.0, 1.0, 0.5) diff --git a/tests/python/topi/python/test_topi_math.py b/tests/python/topi/python/test_topi_math.py index 8a9754ed6f969..bff37697e1ed4 100644 --- a/tests/python/topi/python/test_topi_math.py +++ b/tests/python/topi/python/test_topi_math.py @@ -22,7 +22,7 @@ from tvm import topi import tvm.topi.testing from tvm.topi import util -from common import get_all_backend +from common import enabled_devices def test_util(): @@ -71,7 +71,7 @@ def check_device(device): foo(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5) - for target in get_all_backend(): + for target, ctx in enabled_devices(): check_device(target) def test_isnan( @@ -111,7 +111,7 @@ def check_device(device): foo(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5) - for target in get_all_backend(): + for target, 
ctx in enabled_devices(): check_device(target) def test_infiniteness_ops(topi_op, ref_op, name): @@ -141,7 +141,7 @@ def check_device(device): foo(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5) - for target in get_all_backend(): + for target, ctx in enabled_devices(): check_device(target) test_apply(topi.floor, "floor", np.floor, -100, 100) @@ -181,8 +181,4 @@ def verify(from_dtype, to_dtype, low=-100, high=100): a_np = a_np - a_np[2, 3] b_np = a_np.astype(to_dtype) - for device in get_all_backend(): - ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - continue + for device, ctx in enabled_devices(): diff --git a/tests/python/topi/python/test_topi_pooling.py b/tests/python/topi/python/test_topi_pooling.py index b24dd85927b14..31a127e418100 100644 --- a/tests/python/topi/python/test_topi_pooling.py +++ b/tests/python/topi/python/test_topi_pooling.py @@ -23,7 +23,8 @@ from tvm import topi import tvm.topi.testing from tvm.topi.util import get_const_tuple -from common import get_all_backend +from common import enabled_devices +from tvm.testing import gpu _pool_schedule = { "generic": topi.generic.schedule_pool, @@ -93,9 +94,6 @@ def verify_pool(n, ic, ih, kh, sh, padding, pool_type, ceil_mode, count_include_ def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) with tvm.target.create(device): s_func = tvm.topi.testing.dispatch(device, _pool_schedule) @@ -107,7 +105,7 @@ def check_device(device): f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=2e-5, atol=1e-5) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) def verify_pool_grad(n, ic, ih, kh, sh, padding, pool_type, ceil_mode, count_include_pad=True, @@ -149,9 +147,6 @@ def verify_pool_grad(n, ic, ih, kh, sh, padding, pool_type, ceil_mode, count_inc def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) with tvm.target.create(device): s_func = tvm.topi.testing.dispatch(device, _pool_grad_schedule) @@ -164,9 +159,10 @@ def check_device(device): f(a, out_grad, pool_grad) tvm.testing.assert_allclose(pool_grad.asnumpy(), pool_grad_np, rtol=1e-5) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) +@gpu def test_pool(): """test cases of pool""" verify_pool(1, 256, 32, 2, 2, [0, 0, 0, 0], 'avg', False, True) @@ -183,6 +179,7 @@ def test_pool(): verify_pool(1, 256, 31, 3, 3, [1, 0, 3, 2], 'max', False) verify_pool(1, 256, 31, 3, 3, [3, 2, 1, 0], 'max', True) +@gpu def test_pool_grad(): """test cases of pool_grad""" verify_pool_grad(1, 256, 32, 3, 2, [1, 1, 1, 1], 'avg', False, False) @@ -224,9 +221,6 @@ def verify_global_pool(dshape, pool_type, layout='NCHW'): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) with tvm.target.create(device): s_func = tvm.topi.testing.dispatch(device, _adaptive_pool_schedule) @@ -240,9 +234,10 @@ def check_device(device): f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) +@gpu def test_global_pool(): """test cases of global_pool""" verify_global_pool((1, 1024, 7, 7), 'avg') @@ -270,9 +265,6 @@ def
verify_adaptive_pool(dshape, out_size, pool_type, layout="NCHW", dtype="floa def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) with tvm.target.create(device): s_func = tvm.topi.testing.dispatch(device, _adaptive_pool_schedule) @@ -286,10 +278,11 @@ def check_device(device): f(a, b) tvm.testing.assert_allclose(b.asnumpy(), np_out, rtol=4e-5, atol=1e-6) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) +@gpu def test_adaptive_pool(): """test cases of adaptive_pool""" verify_adaptive_pool((1, 3, 224, 224), (1, 1), "max") @@ -331,9 +324,6 @@ def verify_pool3d(n, ic, ih, kh, sh, padding, pool_type, def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) with tvm.target.create(device): s_func = tvm.topi.testing.dispatch(device, _pool_schedule) @@ -345,10 +335,11 @@ def check_device(device): f(a, b) tvm.testing.assert_allclose(b.asnumpy(), ref_np, rtol=1e-5) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) +@gpu def test_pool3d(): """test cases of pool3d""" verify_pool3d(1, 256, 32, 2, 2, [0, 0, 0, 0, 0, 0], 'avg', False, True) @@ -386,9 +377,6 @@ def verify_pool1d(n, ic, iw, kw, sw, padding, pool_type, def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) with tvm.target.create(device): s_func = tvm.topi.testing.dispatch(device, _pool_schedule) @@ -400,10 +388,11 @@ def check_device(device): f(a, b) tvm.testing.assert_allclose(b.asnumpy(), ref_np, rtol=1e-5) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) +@gpu def test_pool1d(): """test cases of pool1d""" verify_pool1d(1, 256, 32, 2, 2, [0, 0], 'avg', False, True) diff --git a/tests/python/topi/python/test_topi_reduce.py b/tests/python/topi/python/test_topi_reduce.py index d84182f21ffdd..65ffe511e1554 100644 --- a/tests/python/topi/python/test_topi_reduce.py +++ b/tests/python/topi/python/test_topi_reduce.py @@ -22,7 +22,7 @@ from tvm import topi import tvm.topi.testing -from common import get_all_backend +from common import enabled_devices def _my_npy_argmax(arr, axis, keepdims): if not keepdims: @@ -122,7 +122,7 @@ def check_device(device): tvm.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis), 1E-3, 1E-3) else: tvm.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) diff --git a/tests/python/topi/python/test_topi_relu.py b/tests/python/topi/python/test_topi_relu.py index 1114b3fa3c8ce..eb60df5c499df 100644 --- a/tests/python/topi/python/test_topi_relu.py +++ b/tests/python/topi/python/test_topi_relu.py @@ -24,7 +24,8 @@ from tvm.topi.util import get_const_tuple from tvm.contrib.nvcc import have_fp16 -from common import get_all_backend +from common import enabled_devices +from tvm.testing import gpu def verify_relu(m, n, dtype="float32"): A = te.placeholder((m, n), name='A', dtype=dtype) @@ -35,9 +36,6 @@ def verify_relu(m, n, dtype="float32"): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return if dtype == "float16" and device == "cuda" and not 
have_fp16(tvm.gpu(0).compute_version): print("Skip because %s does not have fp16 support" % device) return @@ -51,7 +49,7 @@ def check_device(device): foo(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) @@ -92,10 +90,12 @@ def _prelu_numpy(x, W): out_np = _prelu_numpy(x_np, w_np) tvm.testing.assert_allclose(b.asnumpy(), out_np, rtol=1e-5) +@gpu def test_relu(): verify_relu(10, 128, "float32") verify_relu(128, 64, "float16") +@gpu def test_schedule_big_array(): verify_relu(1024 * 100 , 512) diff --git a/tests/python/topi/python/test_topi_reorg.py b/tests/python/topi/python/test_topi_reorg.py index e5a19474029af..4a36c456b7a27 100644 --- a/tests/python/topi/python/test_topi_reorg.py +++ b/tests/python/topi/python/test_topi_reorg.py @@ -21,6 +21,7 @@ import tvm from tvm import te import tvm.topi.testing +from tvm.testing import gpu, device_enabled _reorg_schedule = { "generic": topi.generic.schedule_reorg, @@ -47,7 +48,7 @@ def get_ref_data_reorg(): def check_device(device): '''Cheching devices is enabled or not''' ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) @@ -63,6 +64,7 @@ def check_device(device): for device in ['llvm', 'cuda']: check_device(device) +@gpu def test_reorg(): verify_reorg(1, 20, 8, 2) diff --git a/tests/python/topi/python/test_topi_softmax.py b/tests/python/topi/python/test_topi_softmax.py index 1ff69be7bc872..8ee1f76e9bc1c 100644 --- a/tests/python/topi/python/test_topi_softmax.py +++ b/tests/python/topi/python/test_topi_softmax.py @@ -24,7 +24,8 @@ import logging from tvm.topi.util import get_const_tuple -from common import get_all_backend +from common import enabled_devices +from tvm.testing import gpu _softmax_schedule = { "generic": topi.generic.schedule_softmax, @@ -35,9 +36,6 @@ def check_device(A, B, a_np, b_np, device, name): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) with tvm.target.create(device): s_func = tvm.topi.testing.dispatch(device, _softmax_schedule) @@ -59,7 +57,7 @@ def verify_softmax(m, n, dtype="float32"): a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) b_np = tvm.topi.testing.softmax_python(a_np) - for device in get_all_backend(): + for device in enabled_devices(): check_device(A, B, a_np, b_np, device, "softmax") def verify_softmax_4d(shape, dtype="float32"): @@ -71,9 +69,10 @@ def verify_softmax_4d(shape, dtype="float32"): b_np = tvm.topi.testing.softmax_python(a_np.transpose(0, 2, 3, 1).reshape(h*w, c)) b_np = b_np.reshape(1, h, w, c).transpose(0, 3, 1, 2) - for device in get_all_backend(): + for device in enabled_devices(): check_device(A, B, a_np, b_np, device, "softmax") +@gpu def test_softmax(): verify_softmax(32, 10) verify_softmax(3, 4) @@ -89,10 +88,11 @@ def verify_log_softmax(m, n, dtype="float32"): a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) b_np = tvm.topi.testing.log_softmax_python(a_np) - for device in get_all_backend(): + for device in enabled_devices(): check_device(A, B, a_np, b_np, device, "log_softmax") +@gpu def test_log_softmax(): verify_log_softmax(32, 10) verify_log_softmax(3, 4) diff --git a/tests/python/topi/python/test_topi_sort.py b/tests/python/topi/python/test_topi_sort.py index 7abfe586a4e03..66cfd424e05ea 100644 --- 
a/tests/python/topi/python/test_topi_sort.py +++ b/tests/python/topi/python/test_topi_sort.py @@ -21,6 +21,7 @@ from tvm import te from tvm import topi import tvm.topi.testing +from tvm.testing import gpu, device_enabled _argsort_implement = { "generic": (topi.argsort, topi.generic.schedule_argsort), @@ -52,10 +53,10 @@ def verify_argsort(axis, is_ascend): np_indices = np_indices[:, :dshape[axis]] def check_device(device): - ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return + ctx = tvm.context(device, 0) print("Running on target: %s" % device) with tvm.target.create(device): fcompute, fschedule = tvm.topi.testing.dispatch(device, _argsort_implement) @@ -97,7 +98,7 @@ def verify_topk(k, axis, ret_type, is_ascend, dtype): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) @@ -124,6 +125,7 @@ def check_device(device): check_device(device) +@gpu def test_argsort(): np.random.seed(0) for axis in [0, -1, 1]: @@ -131,6 +133,7 @@ def test_argsort(): verify_argsort(axis, False) +@gpu def test_topk(): np.random.seed(0) for k in [0, 1, 5]: diff --git a/tests/python/topi/python/test_topi_space_to_depth.py b/tests/python/topi/python/test_topi_space_to_depth.py index f659c33d3739b..af2f5e8d53d52 100644 --- a/tests/python/topi/python/test_topi_space_to_depth.py +++ b/tests/python/topi/python/test_topi_space_to_depth.py @@ -21,7 +21,7 @@ from tvm import topi import tvm.topi.testing -from common import get_all_backend +from common import enabled_devices def verify_space_to_depth(block_size, batch, in_channel, in_height, in_width, layout='NCHW'): @@ -64,7 +64,7 @@ def check_device(device): f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) diff --git a/tests/python/topi/python/test_topi_sparse.py b/tests/python/topi/python/test_topi_sparse.py index e5fd0e9e66845..cf5ee6b2ec15a 100644 --- a/tests/python/topi/python/test_topi_sparse.py +++ b/tests/python/topi/python/test_topi_sparse.py @@ -25,6 +25,7 @@ from collections import namedtuple import time import scipy.sparse as sp +from tvm.testing import gpu, device_enabled _sparse_dense_implement = { "generic": (topi.nn.sparse_dense, topi.generic.schedule_sparse_dense), @@ -56,7 +57,7 @@ def get_ref_data(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) @@ -100,7 +101,7 @@ def get_ref_data(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) @@ -141,7 +142,7 @@ def get_ref_data(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) @@ -178,7 +179,7 @@ def get_ref_data(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) @@ -303,7 +304,7 @@ def verify_sparse_dense_bsr(M, N, K, BS_R, BS_C, density, use_relu): 
def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) @@ -325,11 +326,13 @@ def check_device(device): for device in ['llvm', 'cuda']: check_device(device) +@gpu def test_sparse_dense_bsr(): M, N, K, BS_R, BS_C, density = 1, 64, 128, 8, 16, 0.9 verify_sparse_dense_bsr(M, N, K, BS_R, BS_C, density, use_relu=True) verify_sparse_dense_bsr(M, N, K, BS_R, BS_C, density, use_relu=False) +@gpu def test_sparse_dense_bsr_randomized(): for _ in range(20): BS_R = np.random.randint(1, 16) @@ -351,7 +354,7 @@ def test_sparse_dense_bsr_randomized(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) @@ -372,14 +375,11 @@ def check_device(device): check_device(device) -def test_sparse_dense(): - test_sparse_dense_csr() - test_sparse_dense_bsr() - test_sparse_dense_bsr_randomized() - if __name__ == "__main__": test_csrmv() test_csrmm() test_dense() - test_sparse_dense() + test_sparse_dense_csr() + test_sparse_dense_bsr() + test_sparse_dense_bsr_randomized() test_sparse_transpose_csr() diff --git a/tests/python/topi/python/test_topi_tensor.py b/tests/python/topi/python/test_topi_tensor.py index 34442845a8693..0755b152730ec 100644 --- a/tests/python/topi/python/test_topi_tensor.py +++ b/tests/python/topi/python/test_topi_tensor.py @@ -22,6 +22,7 @@ import tvm.topi.testing from tvm.contrib.pickle_memoize import memoize from tvm.contrib.nvcc import have_fp16 +from tvm.testing import device_enabled, requires_cuda, requires_gpu def verify_elemwise_sum(num_args, dtype): shape = (3,5,4) @@ -41,7 +42,7 @@ def get_ref_data(): np_nd = get_ref_data() def check_device(device): - if not tvm.runtime.enabled(device): + if not device_enabled(device): print("Skip because %s is not enabled" % device) return @@ -70,7 +71,7 @@ def get_ref_data(): np_nd = get_ref_data() def check_device(device): - if not tvm.runtime.enabled(device): + if not device_enabled(device): print("Skip because %s is not enabled" % device) return @@ -89,7 +90,7 @@ def check_device(device): def verify_vectorization(n, m, dtype): def check_device(device): - if not tvm.runtime.enabled(device): + if not device_enabled(device): print("Skip because %s is not enabled" % device) return if dtype == "float16" and device == "cuda" and not have_fp16(tvm.gpu(0).compute_version): @@ -112,6 +113,8 @@ def check_device(device): for device in ["cuda"]: check_device(device) +@requires_gpu +@requires_cuda def test_vectorization(): verify_vectorization(128, 64, "float16") diff --git a/tests/python/topi/python/test_topi_transform.py b/tests/python/topi/python/test_topi_transform.py index 13d24d59aab0f..e927e166139e4 100644 --- a/tests/python/topi/python/test_topi_transform.py +++ b/tests/python/topi/python/test_topi_transform.py @@ -23,16 +23,13 @@ import tvm.topi.testing from tvm.contrib.nvcc import have_fp16 -from common import get_all_backend +from tvm.testing import gpu, device_enabled, enabled_devices def verify_expand_dims(in_shape, out_shape, axis, num_newaxis): A = te.placeholder(shape=in_shape, name="A") B = topi.expand_dims(A, axis, num_newaxis) def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) with tvm.target.create(device): s = 
tvm.topi.testing.get_broadcast_schedule(device)(B) @@ -44,7 +41,7 @@ def check_device(device): foo(data_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) @@ -53,9 +50,6 @@ def verify_reinterpret(in_shape, in_dtype, out_dtype, generator): B = topi.reinterpret(A, out_dtype) def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return if in_dtype == "float16" and device == 'cuda' and not have_fp16(ctx.compute_version): print("Skip because %s does not have fp16 support" % device) return @@ -70,7 +64,7 @@ def check_device(device): foo(data_nd, out_nd) np.testing.assert_equal(out_nd.asnumpy(), out_npy) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) @@ -79,9 +73,6 @@ def verify_transpose(in_shape, axes): B = topi.transpose(A, axes) def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) with tvm.target.create(device): s = tvm.topi.testing.get_injective_schedule(device)(B) @@ -93,7 +84,7 @@ def check_device(device): foo(data_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) @@ -102,9 +93,6 @@ def verify_reshape(src_shape, dst_shape): B = topi.reshape(A, dst_shape) def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) with tvm.target.create(device): s = tvm.topi.testing.get_injective_schedule(device)(B) @@ -116,7 +104,7 @@ def check_device(device): foo(data_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) @@ -125,9 +113,6 @@ def verify_squeeze(src_shape, axis): B = topi.squeeze(A, axis=axis) def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) with tvm.target.create(device): s = tvm.topi.testing.get_injective_schedule(device)(B) @@ -141,7 +126,7 @@ def check_device(device): foo(data_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) def verify_concatenate(shapes, axis): @@ -164,9 +149,6 @@ def get_concat_schedule(target): out_tensor = topi.concatenate(a_tuple=tensor_l, axis=axis) def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) with tvm.target.create(device): s = get_concat_schedule(device)(out_tensor) @@ -179,7 +161,7 @@ def check_device(device): foo(*(data_nds + [out_nd])) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) def verify_stack(shapes, axis): @@ -189,9 +171,6 @@ def verify_stack(shapes, axis): out_tensor = topi.stack(tensor_l, axis) def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) with tvm.target.create(device): s = 
tvm.topi.testing.get_broadcast_schedule(device)(out_tensor) @@ -204,7 +183,7 @@ def check_device(device): foo(*(data_nds + [out_nd])) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) @@ -213,9 +192,6 @@ def verify_split(src_shape, indices_or_sections, axis): tensor_l = topi.split(A, indices_or_sections, axis=axis) def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) with tvm.target.create(device): s = tvm.topi.testing.get_injective_schedule(device)(tensor_l) @@ -229,7 +205,7 @@ def check_device(device): for out_nd, out_npy in zip(out_nds, out_npys): tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) @@ -272,7 +248,7 @@ def verify_flip(in_shape, axis): B = topi.flip(A, axis) + 1 def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) @@ -291,6 +267,7 @@ def check_device(device): check_device(device) +@gpu def test_reverse_sequence(): def verify_reverse_sequence(in_data, seq_lengths, batch_axis, seq_axis, ref_res): seq_lengths = np.array(seq_lengths).astype("int32") @@ -300,9 +277,6 @@ def verify_reverse_sequence(in_data, seq_lengths, batch_axis, seq_axis, ref_res) def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) with tvm.target.create(device): s = tvm.topi.testing.get_injective_schedule(device)(C) @@ -315,7 +289,7 @@ def check_device(device): foo(data_nd, seq_lengths_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), ref_res) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) indata = np.array(np.arange(0, 16)).reshape([4, 4]).astype("int32") @@ -382,7 +356,7 @@ def verify_take(src_shape, indices_src, axis=None, mode="clip"): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) @@ -417,7 +391,7 @@ def verify_strided_slice(in_shape, begin, end, strides=None): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) @@ -449,7 +423,7 @@ def verify_strided_set(in_shape, v_shape, begin, end, strides=None): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) @@ -492,9 +466,6 @@ def verify_gather(data, axis, indices): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) with tvm.target.create(device): s = tvm.topi.testing.get_injective_schedule(device)(out_tensor) @@ -508,7 +479,7 @@ def check_device(device): func(data_nd, indices_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npys) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) def 
verify_gather_nd(src_shape, indices_src, indices_dtype): @@ -520,9 +491,6 @@ def verify_gather_nd(src_shape, indices_src, indices_dtype): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) with tvm.target.create(device): s = tvm.topi.testing.get_injective_schedule(device)(out_tensor) @@ -540,7 +508,7 @@ def check_device(device): func(data_nd, indices_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npys) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) def verify_arange(start, stop, step): @@ -559,9 +527,6 @@ def verify_arange(start, stop, step): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) with tvm.target.create(device): s = tvm.topi.testing.get_injective_schedule(device)(A) @@ -570,7 +535,7 @@ def check_device(device): f(a_nd) tvm.testing.assert_allclose(a_nd.asnumpy(), a_np) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) def verify_repeat(in_shape, repeats, axis): @@ -578,9 +543,6 @@ def verify_repeat(in_shape, repeats, axis): B = topi.repeat(A, repeats, axis) def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) with tvm.target.create(device): s = tvm.topi.testing.get_broadcast_schedule(device)(B) @@ -592,7 +554,7 @@ def check_device(device): foo(data_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) def verify_tile(in_shape, reps): @@ -600,9 +562,6 @@ def verify_tile(in_shape, reps): B = topi.tile(A, reps) def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) with tvm.target.create(device): s = tvm.topi.testing.get_broadcast_schedule(device)(B) @@ -614,7 +573,7 @@ def check_device(device): foo(data_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) def verify_where(in_shape): @@ -625,9 +584,6 @@ def verify_where(in_shape): C = topi.where(Cond, A, B) def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) with tvm.target.create(device): s = tvm.topi.testing.get_broadcast_schedule(device)(C) @@ -643,7 +599,7 @@ def check_device(device): f(cond_nd, x_nd, y_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) def verify_one_hot(indices_shape, depth, on_value, off_value, axis, dtype): @@ -653,9 +609,6 @@ def verify_one_hot(indices_shape, depth, on_value, off_value, axis, dtype): one_hot_result = topi.transform.one_hot(indices, on_value_const, off_value_const, depth, axis, dtype) def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) with tvm.target.create(device): s = tvm.topi.testing.get_injective_schedule(device)(one_hot_result) @@ -668,7 +621,7 @@ def 
check_device(device):
         out_topi = out_nd.asnumpy()
         tvm.testing.assert_allclose(out_topi, out_npy)
 
-    for device in get_all_backend():
+    for device in enabled_devices():
         check_device(device)
 
@@ -686,9 +639,6 @@ def verify_unravel_index(indices, shape, dtype):
 
     def check_device(device):
         ctx = tvm.context(device, 0)
-        if not ctx.exist:
-            print("Skip because %s is not enabled" % device)
-            return
         print("Running on target: %s" % device)
         with tvm.target.create(device):
             s = tvm.topi.testing.get_injective_schedule(device)(Z)
@@ -701,7 +651,7 @@ def check_device(device):
         foo(datax_nd, datay_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in get_all_backend():
+    for device in enabled_devices():
         check_device(device)
 
 def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_shape, xpected):
@@ -722,9 +672,6 @@ def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_
 
     def check_device(device):
         ctx = tvm.context(device, 0)
-        if not ctx.exist:
-            print("Skip because %s is not enabled" % device)
-            return
         print("Running on target: %s" % device)
         with tvm.target.create(device):
             s = tvm.topi.testing.get_injective_schedule(device)(D)
@@ -743,9 +690,10 @@ def check_device(device):
 
         tvm.testing.assert_allclose(out_nd.asnumpy(), np.array(xpected))
 
-    for device in get_all_backend():
+    for device in enabled_devices():
         check_device(device)
 
+@gpu
 def test_strided_slice():
     verify_strided_slice((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2])
     verify_strided_slice((3, 4, 3), [1, 1, 0], [4, 4, 3], [2, 1, 1])
@@ -755,6 +703,7 @@ def test_strided_slice():
     verify_strided_slice((3, 4, 3), [1, 1, 0], [4, 4, 3])
     verify_strided_slice((3, 4, 3), [0, 2, 0], [1, 2, 3])
 
+@gpu
 def test_strided_set():
     verify_strided_set((3, 4, 3), (3, 2, 2), [0, 3, 0], [4, 1, 4], [1, -1, 2])
     verify_strided_set((3, 4, 3), (3, 1, 2), [0, 0, 0], [4, -5, 4], [1, -1, 2])
@@ -766,11 +715,13 @@ def test_strided_set():
     verify_strided_set((3, 4, 3), (2, 3, 3), [1, 1, 0], [4, 4, 3])
     verify_strided_set((3, 4, 3), (2, 3, 3), [1, 1], [4, 4, 3])
 
+@gpu
 def test_expand_dims():
     verify_expand_dims((3, 10), (3, 10, 1, 1), 2, 2)
     verify_expand_dims((3, 10), (1, 3, 10), -3, 1)
 
 
+@gpu
 def test_reinterpret():
     verify_reinterpret((1000,), "float32", "int32",
                        lambda shape: np.random.randn(*shape) * 1000)
@@ -784,12 +736,14 @@ def test_reinterpret():
                        lambda shape: np.random.randint(0, 2 ** 32 - 1, size=shape))
 
 
+@gpu
 def test_transpose():
     verify_transpose((3, 10, 2), (1, 0, 2))
     verify_transpose((3, 10, 5), (2, 0, 1))
     verify_transpose((3, 10), None)
 
 
+@gpu
 def test_reshape():
     verify_reshape((1, 2, 3, 4), (2, 3, 4))
     verify_reshape((4, 2, 3, 4), (2, 4, 12))
@@ -798,10 +752,12 @@ def test_reshape():
     verify_reshape((4, 0), (2, 0, 2))
 
 
+@gpu
 def test_where():
     verify_where((1, 2, 3, 4))
 
 
+@gpu
 def test_squeeze():
     verify_squeeze((1, 2, 3, 4), 0)
     verify_squeeze((1, 2, 1, 4), None)
@@ -824,6 +780,7 @@ def test_squeeze():
         assert c.asnumpy()[0] == 2
 
 
+@gpu
 def test_concatenate():
     verify_concatenate([(2,), (2,), (2,)], -1)
     verify_concatenate([(2, 3, 4), (2, 2, 4), (2, 5, 4)], 1)
@@ -836,6 +793,7 @@ def test_concatenate():
     verify_concatenate([(1, 14400), (1, 2400), (1, 640), (1, 240)], 1)
 
 
+@gpu
 def test_stack():
     verify_stack([(2,), (2,), (2,)], -1)
     verify_stack([(2,), (2,), (2,)], 1)
@@ -844,11 +802,13 @@ def test_stack():
     verify_stack([(2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4)], -1)
 
 
+@gpu
 def test_split():
     verify_split((2, 12, 3), 3, 1)
     verify_split((2, 12, 3), [2, 4], 1)
     verify_split((10, 12, 24), [5, 7, 9],
-1) +@gpu def test_flip(): verify_flip((3, 4, 3), 1) verify_flip((3, 4, 3), 0) @@ -863,6 +823,7 @@ def test_expand_like(): verify_expand_like((3, 4), (3, 5, 4), [1]) verify_expand_like((5, 7), (5, 6, 7, 8), [1, 3]) +@gpu def test_take(): verify_take((4,), [1]) verify_take((4,), [[0,1,2,3]]) @@ -882,6 +843,7 @@ def test_take(): verify_take((3,4), [0, 2], axis=0, mode="fast") verify_take((3,4), [0, 2], axis=1, mode="fast") +@gpu def test_gather(): verify_gather([[1, 2], [3, 4]], 1, [[0, 0], [1, 0]]) verify_gather(np.random.randn(4, 7, 5), 0, np.random.randint(low=0, high=4, size=(1, 7, 5))) @@ -891,6 +853,7 @@ def test_gather(): verify_gather(np.random.randn(4, 7, 5), 2, np.random.randint(low=0, high=5, size=(4, 7, 2))) verify_gather(np.random.randn(4, 7, 5), 2, np.random.randint(low=0, high=5, size=(4, 7, 10))) +@gpu def test_gather_nd(): for indices_dtype in ['int32', 'float32']: verify_gather_nd((4,), [[1.8]], indices_dtype) @@ -906,6 +869,7 @@ def test_gather_nd(): verify_gather_nd((2, 3, 4, 5), [[1, 0], [2, 1], [3, 2], [4, 2]], indices_dtype) +@gpu def test_arange(): verify_arange(None, 20, None) verify_arange(None, 20, 2) @@ -917,18 +881,21 @@ def test_arange(): verify_arange(20, 1, -1) verify_arange(20, 1, -1.5) +@gpu def test_repeat(): verify_repeat((2,), 1, 0) verify_repeat((3, 2), 2, 0) verify_repeat((3, 2, 4), 3, 1) verify_repeat((1, 3, 2, 4), 4, -1) +@gpu def test_tile(): verify_tile((3, 2), (2, 3)) verify_tile((3, 2, 5), (2,)) verify_tile((3, ), (2, 3, 3)) verify_tile((4, 0), (5,)) +@gpu def test_layout_transform(): in_shape = (1, 32, 8, 8) A = te.placeholder(shape=in_shape, dtype="float32", name="A") @@ -941,9 +908,6 @@ def test_layout_transform(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return tvm_input = tvm.nd.array(input, ctx) tvm_output = tvm.nd.empty(output.shape, ctx=ctx, dtype=B.dtype) print("Running on target: %s" % device) @@ -953,10 +917,11 @@ def check_device(device): f(tvm_input, tvm_output) tvm.testing.assert_allclose(tvm_output.asnumpy(), output) - for backend in get_all_backend(): + for backend in enabled_devices(): check_device(backend) +@gpu def test_shape(): in_shape = (8, 7, 13) dtype = "int32" @@ -968,9 +933,6 @@ def test_shape(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return tvm_input = tvm.nd.array(input, ctx) tvm_output = tvm.nd.empty(output.shape, ctx=ctx, dtype=dtype) print("Running on target: %s" % device) @@ -980,10 +942,11 @@ def check_device(device): f(tvm_input, tvm_output) tvm.testing.assert_allclose(tvm_output.asnumpy(), output) - for backend in get_all_backend(): + for backend in enabled_devices(): check_device(backend) +@gpu def test_sequence_mask(): for in_shape in (5, 10), (3, 4, 5, 4): for axis in [0, 1]: @@ -999,9 +962,6 @@ def test_sequence_mask(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return tvm_A = tvm.nd.array(A_data, ctx) tvm_B = tvm.nd.array(B_data, ctx) tvm_C = tvm.nd.empty(in_shape, ctx=ctx, dtype="float32") @@ -1011,9 +971,10 @@ def check_device(device): f = tvm.build(s, [A, B, C], device, name="SequenceMask") f(tvm_A, tvm_B, tvm_C) tvm.testing.assert_allclose(tvm_C.asnumpy(), C_gt_data) - for backend in get_all_backend(): + for backend in enabled_devices(): check_device(backend) +@gpu def test_ndarray_size(): in_shape = (5, 11, 7) dtype = "int32" @@ -1025,9 +986,6 @@ 
def test_ndarray_size(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return tvm_input = tvm.nd.array(input, ctx=ctx) tvm_output = tvm.nd.empty((), ctx=ctx, dtype=B.dtype) print("Running on target: %s" % device) @@ -1037,18 +995,16 @@ def check_device(device): f(tvm_input, tvm_output) tvm.testing.assert_allclose(tvm_output.asnumpy(), output) - for backend in get_all_backend(): + for backend in enabled_devices(): check_device(backend) +@gpu def test_where_fusion(): """integration test that where and zeros should be properly inlined""" def check_device(device): with tvm.target.create(device): ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return print("Running on target: %s" % device) conv2d_compute, conv2d_schedule = tvm.topi.testing.get_conv2d_nchw_implement(device) data = te.placeholder((2, 1, 2, 4), 'int8', 'data') @@ -1064,9 +1020,10 @@ def check_device(device): s = conv2d_schedule(outs) tvm.build(s, [data, w, add], target=backend) - for backend in get_all_backend(): + for backend in enabled_devices(): check_device(backend) +@gpu def test_one_hot(): verify_one_hot((3,), 3, 1, 0, -1, "int32") verify_one_hot((3,), 3, 1.0, 0.0, -1, "float32") @@ -1076,6 +1033,7 @@ def test_one_hot(): verify_one_hot((3, 2, 4, 5), 6, 1.0, 0.0, 0, "float32") +@gpu def test_unravel_index(): for dtype in ["int32", "int64"]: verify_unravel_index([0, 1, 2, 3], [2, 2], dtype) @@ -1083,6 +1041,7 @@ def test_unravel_index(): verify_unravel_index(144, [5, 5, 5, 2], dtype) verify_unravel_index([100, 13, 5], [5, 5, 5, 2], dtype) +@gpu def test_sparse_to_dense(): verify_sparse_to_dense(1, 3, 0, [5], [0, 3, 0, 0, 0]) #scalar verify_sparse_to_dense([0, 1, 4], [3, 3, 3], 0, [5], [3, 3, 0, 0, 3]) #vector diff --git a/tests/python/topi/python/test_topi_upsampling.py b/tests/python/topi/python/test_topi_upsampling.py index 04cc310924026..8e9765ac70568 100644 --- a/tests/python/topi/python/test_topi_upsampling.py +++ b/tests/python/topi/python/test_topi_upsampling.py @@ -23,7 +23,7 @@ import math from tvm.topi.util import nchw_pack_layout -from common import get_all_backend +from common import enabled_devices def verify_upsampling(batch, in_channel, in_height, in_width, scale_h, scale_w, layout='NCHW', method="nearest_neighbor", @@ -73,7 +73,7 @@ def check_device(device): tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) def test_upsampling(): @@ -156,7 +156,7 @@ def check_device(device): tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5) - for device in get_all_backend(): + for device in enabled_devices(): check_device(device) def test_upsampling3d(): diff --git a/tests/python/topi/python/test_topi_util.py b/tests/python/topi/python/test_topi_util.py index 345e7f9baf1a1..a6287b1375188 100644 --- a/tests/python/topi/python/test_topi_util.py +++ b/tests/python/topi/python/test_topi_util.py @@ -32,4 +32,4 @@ def test_get_shape(): verify_get_shape((2, 3, 32, 32, 16, 8), "OIHW16i8o", "HWO8oI16i", (32, 32, 2, 8, 3, 16)) if __name__ == "__main__": - test_get_shape() \ No newline at end of file + test_get_shape() diff --git a/tests/python/topi/python/test_topi_vision.py b/tests/python/topi/python/test_topi_vision.py index e0e2205ba0bfc..96642ec12bb51 100644 --- a/tests/python/topi/python/test_topi_vision.py +++ b/tests/python/topi/python/test_topi_vision.py @@ -26,6 +26,8 
@@
 from tvm.contrib.pickle_memoize import memoize
 from tvm.topi.util import get_const_tuple
 from tvm.topi.vision import ssd, non_max_suppression, get_valid_counts
+import pytest
+from tvm.testing import gpu, device_enabled
 
 _get_valid_counts_implement = {
     "generic": (topi.vision.get_valid_counts, topi.generic.schedule_get_valid_counts),
@@ -88,7 +90,7 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index):
 
     def check_device(device):
         ctx = tvm.context(device, 0)
-        if not ctx.exist:
+        if not device_enabled(device):
             print("Skip because %s is not enabled" % device)
             return
         print("Running on target: %s" % device)
@@ -114,16 +116,13 @@ def check_device(device):
             tvm.testing.assert_allclose(tvm_out1.asnumpy(), np_out1, rtol=1e-3)
             tvm.testing.assert_allclose(tvm_out2.asnumpy(), np_out2, rtol=1e-3)
 
-    """ Skip this test as it is intermittent
-    see https://github.com/apache/incubator-tvm/pull/4901#issuecomment-595040094
     for device in ['llvm', 'cuda', 'opencl']:
-        # Disable gpu test for now
-        if device != "llvm":
-            continue
         check_device(device)
-    """
 
+@gpu
+@pytest.mark.skip("Skip this test as it is intermittent. "
+                  "See https://github.com/apache/incubator-tvm/pull/4901#issuecomment-595040094")
 def test_get_valid_counts():
     verify_get_valid_counts((1, 1000, 5), 0.5, -1, 0)
     verify_get_valid_counts((1, 2500, 6), 0, 0, 1)
@@ -143,7 +142,7 @@ def verify_non_max_suppression(np_data, np_valid_count, np_indices, np_result, n
 
     def check_device(device):
         ctx = tvm.context(device, 0)
-        if not ctx.exist:
+        if not device_enabled(device):
             print("Skip because %s is not enabled" % device)
             return
         print("Running on target: %s" % device)
@@ -179,7 +178,7 @@ def check_device(device):
     for device in ['llvm', 'cuda', 'opencl']:
         check_device(device)
 
-
+@gpu
 def test_non_max_suppression():
     np_data = np.array([[[0, 0.8, 1, 20, 25, 45], [1, 0.7, 30, 60, 50, 80],
                          [0, 0.4, 4, 21, 19, 40], [2, 0.9, 35, 61, 52, 79],
@@ -247,7 +246,7 @@ def verify_multibox_prior(dshape, sizes=(1,), ratios=(1,), steps=(-1, -1), offse
 
     def check_device(device):
         ctx = tvm.context(device, 0)
-        if not ctx.exist:
+        if not device_enabled(device):
             print("Skip because %s is not enabled" % device)
             return
         print("Running on target: %s" % device)
@@ -267,12 +266,14 @@ def check_device(device):
         check_device(device)
 
 
+@gpu
 def test_multibox_prior():
     verify_multibox_prior((1, 3, 50, 50))
     verify_multibox_prior((1, 3, 224, 224), sizes=(0.5, 0.25, 0.1), ratios=(1, 2, 0.5))
     verify_multibox_prior((1, 32, 32, 32), sizes=(0.5, 0.25), ratios=(1, 2), steps=(2, 2), clip=True)
 
 
+@gpu
 def test_multibox_detection():
     batch_size = 1
     num_anchors = 3
@@ -292,7 +293,7 @@ def test_multibox_detection():
 
     def check_device(device):
         ctx = tvm.context(device, 0)
-        if not ctx.exist:
+        if not device_enabled(device):
             print("Skip because %s is not enabled" % device)
             return
         print("Running on target: %s" % device)
@@ -336,7 +337,7 @@ def get_ref_data():
 
     def check_device(device):
         ctx = tvm.context(device, 0)
-        if not ctx.exist:
+        if not device_enabled(device):
             print("Skip because %s is not enabled" % device)
             return
         print("Running on target: %s" % device)
@@ -359,6 +360,7 @@ def check_device(device):
         check_device(device)
 
 
+@gpu
 def test_roi_align():
     verify_roi_align(1, 16, 32, 64, 7, 1.0, -1)
     verify_roi_align(4, 16, 32, 64, 7, 0.5, 2)
@@ -387,7 +389,7 @@ def get_ref_data():
 
     def check_device(device):
         ctx = tvm.context(device, 0)
-        if not ctx.exist:
+        if not device_enabled(device):
             print("Skip because %s is not enabled" % device)
             return
         print("Running on target: %s" % device)
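The commented-out loop above becomes a real loop again, with the flakiness recorded as an explicit pytest marker rather than a docstring. Sketched in isolation, the idiom looks like the following; the test body here is illustrative, only `gpu` and the marker pattern come from this patch:

import pytest
from tvm.testing import gpu

@gpu
@pytest.mark.skip("Skip this test as it is intermittent. "
                  "See the tracking issue before re-enabling.")
def test_flaky_op():
    # The body stays syntax-checked and easy to re-enable: delete the
    # mark instead of un-commenting a stale docstring.
    ...

Because the mark carries a reason string, the skip shows up in pytest reports instead of the test silently disappearing.

@@ -409,6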
+411,7 @@ def check_device(device): check_device(device) +@gpu def test_roi_pool(): verify_roi_pool(1, 4, 16, 32, 7, 1.0) verify_roi_pool(4, 4, 16, 32, 7, 0.5) @@ -421,7 +424,7 @@ def verify_proposal(np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) @@ -441,6 +444,7 @@ def check_device(device): check_device(device) +@gpu def test_proposal(): attrs = {'scales': (0.5,),'ratios': (0.5,), 'feature_stride': 16, diff --git a/tests/python/unittest/test_auto_scheduler_measure.py b/tests/python/unittest/test_auto_scheduler_measure.py index 9282667c025a8..2dbcad00de07d 100644 --- a/tests/python/unittest/test_auto_scheduler_measure.py +++ b/tests/python/unittest/test_auto_scheduler_measure.py @@ -21,6 +21,7 @@ from tvm import topi from tvm import te, auto_scheduler import tempfile +from tvm.testing import device_enabled from test_auto_scheduler_common import matmul_auto_scheduler_test, get_tiled_matmul @@ -46,7 +47,7 @@ def record_common(dag, s): def test_record_split_reorder_fuse_annotation(): - if not tvm.runtime.enabled("llvm"): + if not device_enabled("llvm"): return A = te.placeholder((512, 512), name='A') @@ -80,7 +81,7 @@ def test_record_split_reorder_fuse_annotation(): def test_record_compute_at_root_inline_cache_read_write(): - if not tvm.runtime.enabled("llvm"): + if not device_enabled("llvm"): return A = te.placeholder((512, 512), name='A') @@ -108,7 +109,7 @@ def test_record_compute_at_root_inline_cache_read_write(): def test_record_follow_split_follow_fused_split(): - if not tvm.runtime.enabled("llvm"): + if not device_enabled("llvm"): return A = te.placeholder((512, 512), name='A') @@ -142,7 +143,7 @@ def test_record_follow_split_follow_fused_split(): def test_record_pragma_storage_align_rfactor(): - if not tvm.runtime.enabled("llvm"): + if not device_enabled("llvm"): return A = te.placeholder((512, 512), name='A') @@ -165,7 +166,7 @@ def test_record_pragma_storage_align_rfactor(): def test_measure_local_builder_runner(): - if not tvm.runtime.enabled("llvm"): + if not device_enabled("llvm"): return dag, s0 = get_tiled_matmul() @@ -183,7 +184,7 @@ def test_measure_local_builder_runner(): def test_measure_local_builder_rpc_runner(): - if not tvm.runtime.enabled("llvm"): + if not device_enabled("llvm"): return dag, s0 = get_tiled_matmul() diff --git a/tests/python/unittest/test_autotvm_index_tuner.py b/tests/python/unittest/test_autotvm_index_tuner.py index c7fa2ea364b5b..2875fd78ba3c8 100644 --- a/tests/python/unittest/test_autotvm_index_tuner.py +++ b/tests/python/unittest/test_autotvm_index_tuner.py @@ -65,4 +65,4 @@ def test_random_tuner(): if __name__ == '__main__': test_gridsearch_tuner() - test_random_tuner() \ No newline at end of file + test_random_tuner() diff --git a/tests/python/unittest/test_hybrid_error_report.py b/tests/python/unittest/test_hybrid_error_report.py index dd5d708409433..0dfdbbd0eec05 100644 --- a/tests/python/unittest/test_hybrid_error_report.py +++ b/tests/python/unittest/test_hybrid_error_report.py @@ -102,4 +102,4 @@ def wrap_error(module, lineno): wrap_error(Module4, 60) wrap_error(Module5, 70) wrap_error(Module6, 77) - wrap_error(Module7, 84) \ No newline at end of file + wrap_error(Module7, 84) diff --git a/tests/python/unittest/test_runtime_module_based_interface.py b/tests/python/unittest/test_runtime_module_based_interface.py index 
56ae250925104..4667a0dbf9d24 100644 --- a/tests/python/unittest/test_runtime_module_based_interface.py +++ b/tests/python/unittest/test_runtime_module_based_interface.py @@ -20,12 +20,13 @@ import tvm from tvm.contrib import graph_runtime from tvm.contrib.debugger import debug_runtime +from tvm.testing import requires_gpu, requires_cuda, gpu, device_enabled def input_shape(mod): return [int(x) for x in mod["main"].checked_type.arg_types[0].shape] def verify(data): - if not tvm.runtime.enabled("llvm"): + if not device_enabled("llvm"): print("Skip because llvm is not enabled") return mod, params = relay.testing.synthetic.get_workload() @@ -42,7 +43,7 @@ def verify(data): return out def test_legacy_compatibility(): - if not tvm.runtime.enabled("llvm"): + if not device_enabled("llvm"): print("Skip because llvm is not enabled") return mod, params = relay.testing.synthetic.get_workload() @@ -58,7 +59,7 @@ def test_legacy_compatibility(): tvm.testing.assert_allclose(out, verify(data), atol=1e-5) def test_cpu(): - if not tvm.runtime.enabled("llvm"): + if not device_enabled("llvm"): print("Skip because llvm is not enabled") return mod, params = relay.testing.synthetic.get_workload() @@ -83,10 +84,9 @@ def test_cpu(): out = gmod.get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) +@requires_cuda +@requires_gpu def test_gpu(): - if not tvm.runtime.enabled("cuda"): - print("Skip because cuda is not enabled") - return mod, params = relay.testing.synthetic.get_workload() with relay.build_config(opt_level=3): complied_graph_lib = relay.build_module.build(mod, "cuda", params=params) @@ -110,9 +110,10 @@ def test_gpu(): out = gmod.get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) +@gpu def test_mod_export(): def verify_cpu_export(obj_format): - if not tvm.runtime.enabled("llvm"): + if not device_enabled("llvm"): print("Skip because llvm is not enabled") return mod, params = relay.testing.synthetic.get_workload() @@ -150,7 +151,7 @@ def verify_cpu_export(obj_format): tvm.testing.assert_allclose(out, verify(data), atol=1e-5) def verify_gpu_export(obj_format): - if not tvm.runtime.enabled("cuda"): + if not device_enabled("cuda"): print("Skip because cuda is not enabled") return mod, params = relay.testing.synthetic.get_workload() @@ -188,7 +189,7 @@ def verify_gpu_export(obj_format): tvm.testing.assert_allclose(out, verify(data), atol=1e-5) def verify_rpc_cpu_export(obj_format): - if not tvm.runtime.enabled("llvm"): + if not device_enabled("llvm"): print("Skip because llvm is not enabled") return mod, params = relay.testing.synthetic.get_workload() @@ -230,7 +231,7 @@ def verify_rpc_cpu_export(obj_format): tvm.testing.assert_allclose(out, verify(data), atol=1e-5) def verify_rpc_gpu_export(obj_format): - if not tvm.runtime.enabled("cuda"): + if not device_enabled("cuda"): print("Skip because cuda is not enabled") return mod, params = relay.testing.synthetic.get_workload() @@ -278,9 +279,10 @@ def verify_rpc_gpu_export(obj_format): verify_rpc_cpu_export(obj_format) verify_rpc_gpu_export(obj_format) +@gpu def test_remove_package_params(): def verify_cpu_remove_package_params(obj_format): - if not tvm.runtime.enabled("llvm"): + if not device_enabled("llvm"): print("Skip because llvm is not enabled") return mod, params = relay.testing.synthetic.get_workload() @@ -326,7 +328,7 @@ def verify_cpu_remove_package_params(obj_format): tvm.testing.assert_allclose(out, verify(data), atol=1e-5) def verify_gpu_remove_package_params(obj_format): - if not 
tvm.runtime.enabled("cuda"): + if not device_enabled("cuda"): print("Skip because cuda is not enabled") return mod, params = relay.testing.synthetic.get_workload() @@ -372,7 +374,7 @@ def verify_gpu_remove_package_params(obj_format): tvm.testing.assert_allclose(out, verify(data), atol=1e-5) def verify_rpc_cpu_remove_package_params(obj_format): - if not tvm.runtime.enabled("llvm"): + if not device_enabled("llvm"): print("Skip because llvm is not enabled") return mod, params = relay.testing.synthetic.get_workload() @@ -423,7 +425,7 @@ def verify_rpc_cpu_remove_package_params(obj_format): tvm.testing.assert_allclose(out, verify(data), atol=1e-5) def verify_rpc_gpu_remove_package_params(obj_format): - if not tvm.runtime.enabled("cuda"): + if not device_enabled("cuda"): print("Skip because cuda is not enabled") return mod, params = relay.testing.synthetic.get_workload() @@ -480,7 +482,7 @@ def verify_rpc_gpu_remove_package_params(obj_format): verify_rpc_gpu_remove_package_params(obj_format) def test_debug_graph_runtime(): - if not tvm.runtime.enabled("llvm"): + if not device_enabled("llvm"): print("Skip because llvm is not enabled") return mod, params = relay.testing.synthetic.get_workload() diff --git a/tests/python/unittest/test_runtime_module_export.py b/tests/python/unittest/test_runtime_module_export.py index 9a859da39ae21..a6e10d4ec0ce0 100644 --- a/tests/python/unittest/test_runtime_module_export.py +++ b/tests/python/unittest/test_runtime_module_export.py @@ -19,6 +19,8 @@ import tvm from tvm import te +from tvm.testing import device_enabled, gpu + from tvm.contrib import util header_file_dir_path = util.tempdir() @@ -59,10 +61,11 @@ def generate_engine_module(): return csource_module +@gpu def test_mod_export(): def verify_gpu_mod_export(obj_format): for device in ["llvm", "cuda"]: - if not tvm.runtime.enabled(device): + if not device_enabled(device): print("skip because %s is not enabled..." % device) return @@ -89,7 +92,7 @@ def verify_gpu_mod_export(obj_format): def verify_multi_dso_mod_export(obj_format): for device in ["llvm"]: - if not tvm.runtime.enabled(device): + if not device_enabled(device): print("skip because %s is not enabled..." % device) return @@ -117,7 +120,7 @@ def verify_multi_dso_mod_export(obj_format): def verify_json_import_dso(obj_format): for device in ["llvm"]: - if not tvm.runtime.enabled(device): + if not device_enabled(device): print("skip because %s is not enabled..." % device) return @@ -173,7 +176,7 @@ def verify_multi_c_mod_export(): print("Skip test because gcc is not available.") for device in ["llvm"]: - if not tvm.runtime.enabled(device): + if not device_enabled(device): print("skip because %s is not enabled..." 
% device) return diff --git a/tests/python/unittest/test_runtime_module_load.py b/tests/python/unittest/test_runtime_module_load.py index c7a5544f4a306..611b7ae763b21 100644 --- a/tests/python/unittest/test_runtime_module_load.py +++ b/tests/python/unittest/test_runtime_module_load.py @@ -22,6 +22,7 @@ import sys import numpy as np import subprocess +from tvm.testing import requires_gpu, device_enabled runtime_py = """ import os @@ -42,7 +43,7 @@ """ def test_dso_module_load(): - if not tvm.runtime.enabled("llvm"): + if not device_enabled("llvm"): return dtype = 'int64' temp = util.tempdir() @@ -90,6 +91,7 @@ def save_object(names): shell=True) +@requires_gpu def test_device_module_dump(): # graph n = tvm.runtime.convert(1024) @@ -104,7 +106,7 @@ def test_device_module_dump(): def check_device(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return temp = util.tempdir() @@ -132,7 +134,7 @@ def check_device(device): def check_stackvm(device): ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): print("Skip because %s is not enabled" % device) return temp = util.tempdir() @@ -161,7 +163,7 @@ def test_combine_module_llvm(): def check_llvm(): ctx = tvm.cpu(0) - if not tvm.runtime.enabled("llvm"): + if not device_enabled("llvm"): print("Skip because llvm is not enabled" ) return temp = util.tempdir() @@ -186,7 +188,7 @@ def check_llvm(): def check_system_lib(): ctx = tvm.cpu(0) - if not tvm.runtime.enabled("llvm"): + if not device_enabled("llvm"): print("Skip because llvm is not enabled" ) return temp = util.tempdir() diff --git a/tests/python/unittest/test_runtime_ndarray.py b/tests/python/unittest/test_runtime_ndarray.py index 36312959da3d4..84cdc1f1bbf2c 100644 --- a/tests/python/unittest/test_runtime_ndarray.py +++ b/tests/python/unittest/test_runtime_ndarray.py @@ -17,26 +17,12 @@ import tvm from tvm import te import numpy as np - -def enabled_ctx_list(): - ctx_list = [('cpu', tvm.cpu(0)), - ('gpu', tvm.gpu(0)), - ('cl', tvm.opencl(0)), - ('metal', tvm.metal(0)), - ('rocm', tvm.rocm(0)), - ('vulkan', tvm.vulkan(0)), - ('vpi', tvm.vpi(0))] - for k, v in ctx_list: - assert tvm.context(k, 0) == v - ctx_list = [x[1] for x in ctx_list if x[1].exist] - return ctx_list - -ENABLED_CTX_LIST = enabled_ctx_list() -print("Testing using contexts:", ENABLED_CTX_LIST) +from tvm.testing import enabled_devices, gpu +@gpu def test_nd_create(): - for ctx in ENABLED_CTX_LIST: + for target, ctx in enabled_devices(): for dtype in ["uint8", "int8", "uint16", "int16", "uint32", "int32", "float32"]: x = np.random.randint(0, 10, size=(3, 4)) diff --git a/tests/python/unittest/test_target_codegen_blob.py b/tests/python/unittest/test_target_codegen_blob.py index 0059083ebdcc5..8d617fa790235 100644 --- a/tests/python/unittest/test_target_codegen_blob.py +++ b/tests/python/unittest/test_target_codegen_blob.py @@ -22,10 +22,12 @@ import tvm from tvm import te import ctypes +from tvm.testing import gpu, device_enabled +@gpu def test_synthetic(): for device in ["llvm", "cuda"]: - if not tvm.runtime.enabled(device): + if not device_enabled(device): print("skip because %s is not enabled..." % device) return @@ -70,10 +72,11 @@ def verify(data): tvm.testing.assert_allclose(out, verify(data), atol=1e-5) +@gpu def test_cuda_lib(): ctx = tvm.gpu(0) for device in ["llvm", "cuda"]: - if not tvm.runtime.enabled(device): + if not device_enabled(device): print("skip because %s is not enabled..." 
% device) return nn = 12 @@ -99,4 +102,4 @@ def test_cuda_lib(): if __name__ == "__main__": test_synthetic() - #test_system_lib() + test_cuda_lib() diff --git a/tests/python/unittest/test_target_codegen_bool.py b/tests/python/unittest/test_target_codegen_bool.py index cdb343f3530b0..49c0301ccd132 100644 --- a/tests/python/unittest/test_target_codegen_bool.py +++ b/tests/python/unittest/test_target_codegen_bool.py @@ -19,7 +19,9 @@ import tvm from tvm import te import numpy as np +from tvm.testing import device_enabled, gpu +@gpu def test_cmp_load_store(): n = 32 A = te.placeholder((n,), name='A') @@ -30,7 +32,7 @@ def test_cmp_load_store(): def check_llvm(): - if not tvm.runtime.enabled("llvm"): + if not device_enabled("llvm"): return s = te.create_schedule(D.op) xo, xi = s[C].split(C.op.axis[0], factor=4) @@ -48,9 +50,9 @@ def check_llvm(): d.asnumpy(), np.logical_and(a.asnumpy() > b.asnumpy(), a.asnumpy() > 1).astype('float32')) def check_device(device): - ctx = tvm.context(device, 0) - if not ctx.exist: + if not device_enabled(device): return + ctx = tvm.context(device, 0) s = te.create_schedule(D.op) for stage in [C, D]: xo, xi = s[stage].split(stage.op.axis[0], factor=4) diff --git a/tests/python/unittest/test_target_codegen_cuda.py b/tests/python/unittest/test_target_codegen_cuda.py index 7fdd2592ee5f8..f55c55cff2798 100644 --- a/tests/python/unittest/test_target_codegen_cuda.py +++ b/tests/python/unittest/test_target_codegen_cuda.py @@ -22,16 +22,16 @@ import unittest from tvm.contrib.nvcc import have_fp16, have_int8 from tvm.contrib import nvcc +from tvm.testing import requires_cuda, requires_gpu, device_enabled tx = te.thread_axis("threadIdx.x") bx = te.thread_axis("blockIdx.x") +@requires_gpu +@requires_cuda def test_cuda_vectorize_add(): num_thread = 8 def check_cuda(dtype, n, lanes): - if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"): - print("skip because cuda is not enabled..") - return if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version): print("Skip because gpu does not have fp16 support") return @@ -66,12 +66,11 @@ def check_cuda(dtype, n, lanes): check_cuda("float16", 64, 6) check_cuda("float16", 64, 8) +@requires_gpu +@requires_cuda def test_cuda_multiply_add(): num_thread = 8 def check_cuda(dtype, n, lanes): - if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"): - print("skip because cuda is not enabled..") - return if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version): print("skip because gpu does not support int8") return @@ -98,12 +97,11 @@ def check_cuda(dtype, n, lanes): tvm.testing.assert_allclose(d.asnumpy(), np_d) check_cuda("int8", 64, 4) +@requires_gpu +@requires_cuda def test_cuda_vectorize_load(): num_thread = 8 def check_cuda(dtype, n, lanes): - if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"): - print("skip because cuda is not enabled..") - return ctx = tvm.gpu(0) A = te.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes)) B = te.compute((n,), lambda i: A[i], name='B') @@ -123,11 +121,10 @@ def check_cuda(dtype, n, lanes): check_cuda("int8", 64, 8) check_cuda("int8", 64, 16) +@requires_gpu +@requires_cuda def test_cuda_make_int8(): def check_cuda(n, value, lanes): - if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"): - print("skip because cuda is not enabled..") - return dtype = 'int8' ctx = tvm.gpu(0) A = te.compute((n, lanes), lambda i,j: tvm.tir.const(value, dtype=dtype)) @@ -151,6 +148,8 @@ def check_cuda(n, value, lanes): check_cuda(64, -3, 2) +@requires_gpu +@requires_cuda def 
test_cuda_inf_nan():
     target = 'cuda'
     def check_inf_nan(ctx, n, value, dtype):
@@ -165,10 +164,6 @@ def check_inf_nan(ctx, n, value, dtype):
         # Only need to test compiling here
         fun(a, c)
 
-    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
-        print("skip because cuda is not enabled..")
-        return
-
     ctx = tvm.context(target, 0)
 
     check_inf_nan(ctx, 1, -float('inf'), 'float32')
@@ -179,11 +174,9 @@ def check_inf_nan(ctx, n, value, dtype):
     check_inf_nan(ctx, 1, float('nan'), 'float64')
 
 
+@requires_gpu
+@requires_cuda
 def test_cuda_shuffle():
-    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
-        print("skip because cuda is not enabled..")
-        return
-
     idxm = tvm.tir.indexmod
     a = te.placeholder((64, ), 'int32')
     b = te.placeholder((64, ), 'int32')
@@ -227,12 +220,11 @@ def _transform(f, *_):
     module(nda, ndb, ndc)
     tvm.testing.assert_allclose(ndc.asnumpy(), ref)
 
+@requires_gpu
+@requires_cuda
 def test_crossthread_reduction1():
     def check(device):
         ctx = tvm.context(device, 0)
-        if not ctx.exist or not tvm.runtime.enabled(device):
-            print("skip because", device, "is not enabled..")
-            return
         n = te.var("n")
         m = te.var("m")
         A = te.placeholder((n, m), name='A')
@@ -268,10 +260,12 @@ def verify(nthd):
     check("rocm")
 
 
+@requires_gpu
+@requires_cuda
 def test_crossthread_reduction2():
     def check(device):
         ctx = tvm.context(device, 0)
-        if not ctx.exist or not tvm.runtime.enabled(device):
+        if not device_enabled(device):
             print("skip because", device, "is not enabled..")
             return
 
@@ -315,11 +309,9 @@ def verify(nthdx, nthdy):
     check("cuda")
     check("rocm")
 
+@requires_gpu
+@requires_cuda
 def test_cuda_reduction_binding():
-    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
-        print("skip because cuda is not enabled..")
-        return
-
     k = te.reduce_axis((0, 32), 'k')
     A = te.placeholder((96, 32), name='A')
     B = te.compute( (96,), lambda m:
@@ -334,10 +326,11 @@ def test_cuda_reduction_binding():
 
     fcuda = tvm.build(s, [A, B], "cuda")
 
+@requires_gpu
+@requires_cuda
 def test_rfactor_predicates():
     def check(device):
-        ctx = tvm.context(device, 0)
-        if not ctx.exist or not tvm.runtime.enabled(device):
+        if not device_enabled(device):
             print("skip because", device, "is not enabled..")
             return
 
@@ -373,7 +366,8 @@ def check(device):
     check("cuda")
     check("rocm")
 
-@unittest.skipIf(not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"), "skip because cuda is not enabled..")
+@requires_gpu
+@requires_cuda
 def test_cuda_const_float_to_half():
     # This import is required to use nvcc to perform code gen;
     # otherwise it is found that the code gen is done by nvrtc.
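Throughout this file the conversion is uniform: the in-body guard becomes a pair of decorators, so the skip decision moves from runtime prints to test collection. As a rough sketch of what such decorators can look like; this is illustrative only, not the actual tvm.testing implementation:

import pytest
import tvm

# Hypothetical stand-ins for tvm.testing.requires_gpu / requires_cuda:
# plain pytest skipif marks, evaluated once at collection time.
requires_gpu = pytest.mark.skipif(
    not tvm.gpu(0).exist, reason="no GPU device is present")
requires_cuda = pytest.mark.skipif(
    not tvm.runtime.enabled("cuda"), reason="CUDA runtime is not enabled")

@requires_gpu
@requires_cuda
def test_example():
    ...  # body may assume tvm.gpu(0) exists and CUDA is compiled in

One design consequence worth noting: marked tests are reported as skipped with a reason, whereas the old guards returned early and counted as passes.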
@@ -398,16 +392,11 @@ def test_cuda_const_float_to_half():
     func(a, c)
     np.testing.assert_equal(c.asnumpy(), a_np > b.value)
 
+@requires_gpu
+@requires_cuda
 def test_cuda_reduction():
     def check(device, dtype, m=32, n=32):
         ctx = tvm.context(device, 0)
-        if not ctx.exist or not tvm.runtime.enabled(device):
+        if not device_enabled(device):
             print("skip because", device, "is not enabled..")
             return
         if dtype == "float16" and not have_fp16(ctx.compute_version):
             print("Skip because gpu does not have fp16 support")
             return
-
         a = te.placeholder((m, n), name="a", dtype=dtype)
         b = te.placeholder((m, n), name="b", dtype=dtype)
         c = a + b
@@ -430,12 +419,11 @@ def check(device, dtype, m=32, n=32):
     check("rocm", "float32")
     check("cuda", "float16")
 
+@requires_gpu
+@requires_cuda
 def test_cuda_mix_threaded_and_normal_reduction():
     def check(device, dtype, m=32, n=32):
         ctx = tvm.context(device, 0)
-        if not ctx.exist or not tvm.runtime.enabled(device):
+        if not device_enabled(device):
             print("skip because", device, "is not enabled..")
             return
         if dtype == "float16" and not have_fp16(ctx.compute_version):
             print("Skip because gpu does not have fp16 support")
             return
@@ -458,11 +446,9 @@ def check(device, dtype, m=32, n=32):
     check("rocm", "float32")
     check("cuda", "float16")
 
+@requires_gpu
+@requires_cuda
 def test_cuda_floordiv_with_vectorization():
-    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
-        print("skip because cuda is not enabled..")
-        return
-
     with tvm.target.cuda():
         # B[i] = A[floordiv(i, k)]
         n = 256
@@ -485,11 +471,9 @@ def test_cuda_floordiv_with_vectorization():
         func(a_nd, b_nd)
         tvm.testing.assert_allclose(b_nd.asnumpy(), b_np, rtol=1e-3)
 
+@requires_gpu
+@requires_cuda
 def test_cuda_floormod_with_vectorization():
-    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
-        print("skip because cuda is not enabled..")
-        return
-
     with tvm.target.cuda():
         # B[i] = A[floormod(i, k)]
         n = 256
@@ -512,11 +496,9 @@ def test_cuda_floormod_with_vectorization():
         func(a_nd, b_nd)
         tvm.testing.assert_allclose(b_nd.asnumpy(), b_np, rtol=1e-3)
 
+@requires_gpu
+@requires_cuda
 def test_vectorized_casts():
-    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
-        print("skip because cuda is not enabled..")
-        return
-
     def check(t0, t1):
         if (t0 == "float16" or t1 == "float16") and not have_fp16(tvm.gpu(0).compute_version):
             print("Skip because gpu does not have fp16 support")
@@ -571,6 +553,8 @@ def sched(B):
         s[B].bind(iio, tx)
         return s
 
+@requires_gpu
+@requires_cuda
 def test_vectorized_intrin1():
     test_funcs = [
         (tvm.tir.floor, lambda x : np.floor(x)),
@@ -594,9 +578,6 @@ def test_vectorized_intrin1():
         (tvm.tir.sqrt, lambda x : np.sqrt(x)),
     ]
     def run_test(tvm_intrin, np_func, dtype):
-        if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
-            print("skip because cuda is not enabled..")
-            return
         if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
             print("Skip because gpu does not have fp16 support")
             return
@@ -627,6 +608,8 @@ def run_test(tvm_intrin, np_func, dtype):
         run_test(*func, "float32")
         run_test(*func, "float16")
 
+@requires_gpu
+@requires_cuda
 def test_vectorized_intrin2(dtype="float32"):
     c2 = tvm.tir.const(2, dtype=dtype)
     test_funcs = [
@@ -634,10 +617,6 @@ def test_vectorized_intrin2(dtype="float32"):
         (tvm.tir.fmod, lambda x : np.fmod(x, 2.0))
     ]
     def run_test(tvm_intrin, np_func):
-        if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
-            print("skip because cuda is not enabled..")
-            return
-
         n = 128
         A = te.placeholder((n,), dtype=dtype, name='A')
         B = te.compute((n,), lambda i: tvm_intrin(A[i], c2), name='B')
@@ -652,6 +631,8 @@ def run_test(tvm_intrin, np_func):
     for func in test_funcs:
         run_test(*func)
 
+@requires_gpu
+@requires_cuda
 def test_vectorized_popcount():
     def ref_popcount(x):
         cnt = 0
@@ -661,10 +642,6 @@ def ref_popcount(x):
         return cnt
 
     def run_test(dtype):
-        if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
-            print("skip because cuda is not enabled..")
-            return
-
         n = 128
         A = te.placeholder((n,), dtype=dtype, name='A')
         B = te.compute((n,), lambda i: tvm.tir.popcount(A[i]), name='B')
@@ -680,11 +657,10 @@ def run_test(dtype):
     run_test("uint32")
     run_test("uint64")
 
+@requires_gpu
+@requires_cuda
 def test_cuda_vectorize_load_permute_pad():
     def check_cuda(dtype, n, l, padding, lanes):
-        if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
-            print("skip because cuda is not enabled..")
-            return
         if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
             print("Skip because gpu does not have fp16 support")
             return
@@ -755,23 +731,21 @@ def post_visit(stmt):
 
     tvm.tir.stmt_functor.ir_transform(stmt['main'].body, pre_visit, post_visit)
 
-    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
-        print("CUDA device not found, skip the verification.")
-        return
-    else:
-        tgt = tvm.target.cuda()
-        mod = tvm.build(s, args, tgt)
-        # To check if every vectorize loop transforms to correct instruction
-        # print(mod.imported_modules[0].get_source())
-
-        ctx = tvm.context("cuda", 0)
-        a = tvm.nd.array(np.random.uniform(size=(512, 512)).astype("float32"), ctx)
-        b = tvm.nd.array(np.random.uniform(size=(512, 512)).astype("float32"), ctx)
-        c = tvm.nd.array(np.zeros((512, 512), dtype="float32"), ctx)
-        mod(a, b, c)
-        tvm.testing.assert_allclose(c.asnumpy(), np.dot(
-            a.asnumpy(), b.asnumpy()), rtol=1e-5)
-
+    tgt = tvm.target.cuda()
+    mod = tvm.build(s, args, tgt)
+    # To check if every vectorize loop transforms to correct instruction
+    # print(mod.imported_modules[0].get_source())
+
+    ctx = tvm.context("cuda", 0)
+    a = tvm.nd.array(np.random.uniform(size=(512, 512)).astype("float32"), ctx)
+    b = tvm.nd.array(np.random.uniform(size=(512, 512)).astype("float32"), ctx)
+    c = tvm.nd.array(np.zeros((512, 512), dtype="float32"), ctx)
+    mod(a, b, c)
+    tvm.testing.assert_allclose(c.asnumpy(), np.dot(
+        a.asnumpy(), b.asnumpy()), rtol=1e-5)
+
+@requires_gpu
+@requires_cuda
 def test_vectorized_cooperative_fetching_x():
     N = 512
     A = te.placeholder((N, N), name='A', dtype='float32')
@@ -821,6 +795,8 @@ def test_vectorized_cooperative_fetching_x():
     vcf_check_common(s, [A, B, C])
 
 
+@requires_gpu
+@requires_cuda
 def test_vectorized_cooperative_fetching_xy():
     N = 512
     A = te.placeholder((N, N), name='A')
@@ -874,11 +850,9 @@ def test_vectorized_cooperative_fetching_xy():
     vcf_check_common(s, [A, B, C])
 
 
+@requires_gpu
+@requires_cuda
 def test_unrolled_vectorization():
-    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
-        print("skip because cuda is not enabled..")
-        return
-
     dtype = 'float32'
     target = 'cuda'
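Note: `@requires_gpu` and `@requires_cuda`, applied throughout the hunks above, are defined in python/tvm/testing.py and are not part of this excerpt. As a hedged illustration of the mechanism only -- the real decorators may differ -- they can be read as a pytest marker plus a skip condition; `requires_cuda_sketch` below is a hypothetical stand-in, not the tvm.testing implementation:

    # Illustrative sketch only; the real decorator lives in tvm.testing.
    import pytest
    import tvm

    def requires_cuda_sketch(func):
        """Hypothetical stand-in for tvm.testing.requires_cuda."""
        marks = [
            pytest.mark.gpu,  # selected by `pytest -m gpu` on GPU test nodes
            pytest.mark.skipif(not tvm.runtime.enabled("cuda"),
                               reason="CUDA support not enabled"),
        ]
        for mark in marks:
            func = mark(func)  # pytest marks compose by stacking
        return func

With this shape, `pytest -m gpu` selects only marked tests, and a marked test self-skips when the CUDA runtime was not compiled in.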
diff --git a/tests/python/unittest/test_target_codegen_device.py b/tests/python/unittest/test_target_codegen_device.py
index ddb35f31fe1df..0d8a27e009188 100644
--- a/tests/python/unittest/test_target_codegen_device.py
+++ b/tests/python/unittest/test_target_codegen_device.py
@@ -18,7 +18,9 @@
 from tvm import te
 from tvm.contrib import util
 import numpy as np
+from tvm.testing import requires_gpu, device_enabled
 
+@requires_gpu
 def test_large_uint_imm():
     value = (1 << 63) + 123
     other = tvm.tir.const(3, "uint64")
@@ -32,9 +34,9 @@ def test_large_uint_imm():
     s[A].bind(xo, te.thread_axis("blockIdx.x"))
 
     def check_target(device):
-        ctx = tvm.context(device, 0)
-        if not ctx.exist:
+        if not device_enabled(device):
             return
+        ctx = tvm.context(device, 0)
         f = tvm.build(s, [A], device)
         # launch the kernel.
         a = tvm.nd.empty((n, ), dtype=A.dtype, ctx=ctx)
@@ -45,6 +47,7 @@ def check_target(device):
 
     check_target("vulkan")
 
+@requires_gpu
 def test_add_pipeline():
     n = te.size_var('n')
     A = te.placeholder((n,), name='A')
@@ -64,11 +67,9 @@ def test_add_pipeline():
     s[D].bind(xo, te.thread_axis("blockIdx.x"))
 
     def check_target(device, host="stackvm"):
-        ctx = tvm.context(device, 0)
-        if not ctx.exist:
-            return
-        if not tvm.runtime.enabled(host):
+        if not device_enabled(device) or not device_enabled(host):
             return
+        ctx = tvm.context(device, 0)
         mhost = tvm.driver.build(s, [A, B, D], target=device, target_host=host)
         f = mhost.entry_func
         # launch the kernel.
diff --git a/tests/python/unittest/test_target_codegen_extern.py b/tests/python/unittest/test_target_codegen_extern.py
index 4104af8644397..b7f1c28f04195 100644
--- a/tests/python/unittest/test_target_codegen_extern.py
+++ b/tests/python/unittest/test_target_codegen_extern.py
@@ -17,7 +17,9 @@
 import tvm
 from tvm import te
 import numpy as np
+from tvm.testing import gpu, device_enabled
 
+@gpu
 def test_add_pipeline():
     nn = 64
     max_threads = 4
@@ -51,7 +53,7 @@ def extern_generator_gpu(ins, outs):
     print(tvm.lower(s_gpu, [A, C_gpu], simple_mode=True))
 
     def check_target(target):
-        if not tvm.runtime.enabled(target):
+        if not device_enabled(target):
             return
         s = s_gpu if target in ['opencl', 'cuda'] else s_cpu
         C = C_gpu if target in ['opencl', 'cuda'] else C_cpu
@@ -86,7 +88,7 @@ def my_extern_array_func1(aa, bb):
 
     def check_target(target):
-        if not tvm.runtime.enabled(target):
+        if not device_enabled(target):
             return
         # build and invoke the kernel.
         f = tvm.build(s, [A, C], target)
@@ -116,7 +118,7 @@ def extern_generator(ins, outs):
     s = te.create_schedule(C.op)
 
     def check_target(target):
-        if not tvm.runtime.enabled(target):
+        if not device_enabled(target):
             return
         # build and invoke the kernel.
         f = tvm.build(s, [A, C], target)
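Note: the recurring edit in these files swaps the eager `tvm.context(...)` / `ctx.exist` probe for a `device_enabled(target)` guard, and only creates the context once the target is known to be testable. A minimal sketch of the pattern for a new test, assuming `device_enabled` keeps the string-target signature used above (run it with, e.g., check_device("llvm")):

    # Sketch of the per-device guard pattern used by these tests.
    import numpy as np
    import tvm
    from tvm import te
    from tvm.testing import device_enabled

    def check_device(device):
        if not device_enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        ctx = tvm.context(device, 0)  # safe: device was requested and enabled
        n = 16
        A = te.placeholder((n,), name="A")
        B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
        s = te.create_schedule(B.op)
        f = tvm.build(s, [A, B], device)
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
        f(a, b)
        tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1.0)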
diff --git a/tests/python/unittest/test_target_codegen_opencl.py b/tests/python/unittest/test_target_codegen_opencl.py
index e403589dff1d2..a2254e1a89cd5 100644
--- a/tests/python/unittest/test_target_codegen_opencl.py
+++ b/tests/python/unittest/test_target_codegen_opencl.py
@@ -16,9 +16,12 @@
 # under the License.
 import tvm
 from tvm import te
+from tvm.testing import requires_opencl, requires_gpu
 
 target = 'opencl'
 
+@requires_gpu
+@requires_opencl
 def test_opencl_ternary_expression():
     def check_if_then_else(ctx, n, dtype):
         A = te.placeholder((n,), name='A', dtype=dtype)
@@ -52,10 +55,6 @@ def check_select(ctx, n, dtype):
         # Only need to test compiling here
         fun(a, c)
 
-    if not tvm.runtime.enabled(target):
-        print("skip because opencl is not enabled..")
-        return
-
     ctx = tvm.context(target, 0)
 
     check_if_then_else(ctx, 1, 'int8')
@@ -67,6 +66,8 @@ def check_select(ctx, n, dtype):
     check_select(ctx, 1, 'int16')
     check_select(ctx, 1, 'uint16')
 
+@requires_gpu
+@requires_opencl
 def test_opencl_inf_nan():
     def check_inf_nan(ctx, n, value, dtype):
         A = te.placeholder((n,), name='A', dtype=dtype)
@@ -80,10 +81,6 @@ def check_inf_nan(ctx, n, value, dtype):
         # Only need to test compiling here
         fun(a, c)
 
-    if not tvm.runtime.enabled(target):
-        print("skip because opencl is not enabled..")
-        return
-
     ctx = tvm.context(target, 0)
 
     check_inf_nan(ctx, 1, -float('inf'), 'float32')
@@ -94,6 +91,8 @@ def check_inf_nan(ctx, n, value, dtype):
     check_inf_nan(ctx, 1, float('nan'), 'float64')
 
+@requires_gpu
+@requires_opencl
 def test_opencl_max():
     def check_max(ctx, n, dtype):
         A = te.placeholder((n,), name='A', dtype=dtype)
@@ -109,10 +108,6 @@ def check_max(ctx, n, dtype):
         # Only need to test compiling here
         fun(a, c)
 
-    if not tvm.runtime.enabled(target):
-        print("skip because opencl is not enabled..")
-        return
-
     ctx = tvm.context(target, 0)
 
     check_max(ctx, 1, 'int8')
diff --git a/tests/python/unittest/test_te_hybrid_script.py b/tests/python/unittest/test_te_hybrid_script.py
index 8ab65f129cc5f..57fe9ca0607a5 100644
--- a/tests/python/unittest/test_te_hybrid_script.py
+++ b/tests/python/unittest/test_te_hybrid_script.py
@@ -21,6 +21,8 @@
 from tvm.te.hybrid import script
 from tvm.te.hybrid.runtime import HYBRID_GLOBALS
 
+from tvm.testing import requires_gpu, requires_cuda
+
 @pytest.mark.skip
 def run_and_check(func, args, var_dict={}, target='llvm', sch=None, outs=None):
     def tvm_val_2_py_val(val):
@@ -316,11 +318,9 @@ def if_and(a):
     run_and_check(func, ins, outs=outs)
 
+@requires_gpu
+@requires_cuda
 def test_bind():
-    if not tvm.gpu(0).exist:
-        print('[Warning] No GPU found! Skip bind test!')
-        return
-
     @script
     def vec_add(a, b):
         c = output_tensor((1000, ), 'float32')
@@ -463,6 +463,8 @@ def triangle(a, b):
     func, ins, outs = run_and_check(triangle, [a, b])
     run_and_check(func, ins, outs=outs)
 
+@requires_gpu
+@requires_cuda
 def test_allocate():
     @te.hybrid.script
     def blur2d(a):
@@ -482,27 +484,24 @@ def blur2d(a):
     func, ins, outs = run_and_check(blur2d, [a])
     run_and_check(func, ins, outs=outs)
 
-    if tvm.gpu().exist:
-        @te.hybrid.script
-        def share_vec_add(a, b):
-            c = output_tensor((256, ), 'float32')
-            shared = allocate((256, ), 'float32', 'shared')
-            for i in bind("threadIdx.x", 256):
-                shared[i] = a[i]
-            local = allocate((256, ), 'float32', 'local')
-            for i in bind("threadIdx.x", 256):
-                local[i] = b[i]
-            for i in bind("threadIdx.x", 256):
-                c[i] = shared[i] + local[i]
-            return c
-
-        a = te.placeholder((256, ), dtype='float32', name='a')
-        b = te.placeholder((256, ), dtype='float32', name='b')
-        c = share_vec_add(a, b)
-        func, ins, outs = run_and_check(share_vec_add, [a, b], target='cuda')
-        run_and_check(func, ins, outs=outs, target='cuda')
-    else:
-        print('[Warning] No GPU found! Skip shared mem test!')
+    @te.hybrid.script
+    def share_vec_add(a, b):
+        c = output_tensor((256, ), 'float32')
+        shared = allocate((256, ), 'float32', 'shared')
+        for i in bind("threadIdx.x", 256):
+            shared[i] = a[i]
+        local = allocate((256, ), 'float32', 'local')
+        for i in bind("threadIdx.x", 256):
+            local[i] = b[i]
+        for i in bind("threadIdx.x", 256):
+            c[i] = shared[i] + local[i]
+        return c
+
+    a = te.placeholder((256, ), dtype='float32', name='a')
+    b = te.placeholder((256, ), dtype='float32', name='b')
+    c = share_vec_add(a, b)
+    func, ins, outs = run_and_check(share_vec_add, [a, b], target='cuda')
+    run_and_check(func, ins, outs=outs, target='cuda')
 
 def test_upstream():
     @te.hybrid.script
diff --git a/tests/python/unittest/test_te_schedule_postproc_rewrite_for_tensor_core.py b/tests/python/unittest/test_te_schedule_postproc_rewrite_for_tensor_core.py
index 1f1791447ab16..7dda4d1a168d9 100644
--- a/tests/python/unittest/test_te_schedule_postproc_rewrite_for_tensor_core.py
+++ b/tests/python/unittest/test_te_schedule_postproc_rewrite_for_tensor_core.py
@@ -18,7 +18,7 @@
 from tvm import te
 from tvm import topi
 import numpy as np
-from tvm.contrib import nvcc
+from tvm.testing import requires_tensorcore
 
 def tensor_core_matmul(warp_tile_m=16, m=64, n=32, l=96):
     A = te.placeholder((n, l), name='A', dtype='float16')
@@ -204,26 +204,14 @@ def tensor_core_batch_matmul(warp_tile_m=16, m=64, n=32, l=96, batch=2):
         c_np[bs, :, :] = np.dot(a_np[bs, :, :], b_np[bs, :, :])
     np.testing.assert_allclose(c_np, c.asnumpy(), rtol=1e-3)
 
+@requires_tensorcore
 def test_tensor_core_matmul():
-    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
-        print("skip because cuda is not enabled..")
-        return
-    if not nvcc.have_tensorcore(tvm.gpu(0).compute_version):
-        print("skip because gpu does not support tensor core")
-        return
-
     tensor_core_matmul(16) #test with warp_tile 16x16x16
     tensor_core_matmul(8) #test with warp_tile 8x32x16
     tensor_core_matmul(32) #test with warp_tile 32x8x16
 
+@requires_tensorcore
 def test_tensor_core_batch_matmul():
-    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
-        print("skip because cuda is not enabled..")
-        return
-    if not nvcc.have_tensorcore(tvm.gpu(0).compute_version):
-        print("skip because gpu does not support tensor core")
-        return
-
     tensor_core_batch_matmul()
 
 if __name__ == '__main__':
diff --git a/tests/python/unittest/test_te_schedule_tensor_core.py b/tests/python/unittest/test_te_schedule_tensor_core.py
index aa87665455dfc..13770f4eed07a 100644
--- a/tests/python/unittest/test_te_schedule_tensor_core.py
+++ b/tests/python/unittest/test_te_schedule_tensor_core.py
@@ -18,7 +18,7 @@
 from tvm import te
 import numpy as np
 from tvm.topi.testing import conv2d_nhwc_python
-from tvm.contrib import nvcc
+from tvm.testing import requires_tensorcore
 
 VERIFY = True
 
@@ -103,14 +103,8 @@ def intrin_func(ins, outs):
     return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC})
 
+@requires_tensorcore
 def test_tensor_core_batch_matmal():
-    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
-        print("skip because cuda is not enabled..")
-        return
-    if not nvcc.have_tensorcore(tvm.gpu(0).compute_version):
-        print("skip because gpu does not support tensor core")
-        return
-
     batch_size = 4
     n = 512
     m, l = n, n
@@ -216,14 +210,8 @@ def test_tensor_core_batch_matmal():
 
 
+@requires_tensorcore
 def test_tensor_core_batch_conv():
-    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
-        print("skip because cuda is not enabled..")
-        return
-    if not nvcc.have_tensorcore(tvm.gpu(0).compute_version):
-        print("skip because gpu does not support tensor core")
-        return
-
     # The sizes of inputs and filters
     batch_size = 32
     height = 14
diff --git a/tests/python/unittest/test_te_tensor_overload.py b/tests/python/unittest/test_te_tensor_overload.py
index 97143681891cf..80688d6a31d9c 100644
--- a/tests/python/unittest/test_te_tensor_overload.py
+++ b/tests/python/unittest/test_te_tensor_overload.py
@@ -20,6 +20,7 @@
 from tvm import topi
 import tvm.topi.testing
 from tvm.topi.util import get_const_tuple
+from tvm.testing import gpu, device_enabled
 
 
 def test_operator_type_and_tags():
@@ -103,10 +104,10 @@ def verify_tensor_scalar_bop(shape, typ="add"):
             raise NotImplementedError()
 
     def check_device(device):
-        ctx = tvm.context(device, 0)
-        if not ctx.exist:
+        if not device_enabled(device):
             print("Skip because %s is not enabled" % device)
             return
+        ctx = tvm.context(device, 0)
         print("Running on target: %s" % device)
         with tvm.target.create(device):
             s = tvm.topi.testing.get_elemwise_schedule(device)(B)
@@ -150,7 +151,7 @@ def verify_broadcast_bop(lhs_shape, rhs_shape, typ="add"):
 
     def check_device(device):
         ctx = tvm.context(device, 0)
-        if not ctx.exist:
+        if not device_enabled(device):
             print("Skip because %s is not enabled" % device)
             return
         print("Running on target: %s" % device)
@@ -183,10 +184,11 @@ def check_device(device):
         check_device(device)
 
 
+@gpu
 def verify_conv2d_scalar_bop(batch, in_size, in_channel, num_filter, kernel, stride, padding, typ="add"):
     def check_device(device):
         ctx = tvm.context(device, 0)
-        if not ctx.exist:
+        if not device_enabled(device):
             print("Skip because %s is not enabled" % device)
             return
         print("Running on target: %s" % device)
@@ -239,6 +241,7 @@ def check_device(device):
         check_device(device)
 
 
+@gpu
 def test_tensor_scalar_bop():
     verify_tensor_scalar_bop((1,), typ="add")
     verify_tensor_scalar_bop((3, 5), typ="sub")
@@ -246,6 +249,7 @@ def test_tensor_scalar_bop():
     verify_tensor_scalar_bop((2, 3, 1, 32), typ="div")
 
 
+@gpu
 def test_broadcast_bop():
     verify_broadcast_bop((2, 3), (), typ="add")
     verify_broadcast_bop((5, 2, 3), (1,), typ="add")
@@ -254,6 +258,7 @@ def test_broadcast_bop():
     verify_broadcast_bop((2, 3, 1, 32), (64, 32), typ="div")
 
 
+@gpu
 def test_conv2d_scalar_bop():
     verify_conv2d_scalar_bop(1, 16, 4, 4, 3, 1, 1, typ="add")
     verify_conv2d_scalar_bop(1, 32, 2, 1, 3, 1, 1, typ="sub")
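Note: the per-target loops in the next file stay safe on single-device CI nodes only because `device_enabled` filters each candidate first. Its implementation lives in python/tvm/testing.py; the sketch below is only an approximation of its semantics, inferred from how tests/scripts/setup-pytest-env.sh (further down in this patch) assembles the semicolon-separated TVM_TEST_DEVICES list. Note the argument is a target string, never a TVMContext:

    # Approximate semantics only; not the actual tvm.testing.device_enabled.
    import os
    import tvm

    def device_enabled_sketch(target):
        assert isinstance(target, str), "expects a target string, not a context"
        requested = [t for t in
                     os.environ.get("TVM_TEST_DEVICES", "").split(";") if t]
        # Compare bare device names so "llvm -device=arm_cpu" also counts as "llvm".
        names = {t.split()[0] for t in requested}
        name = target.split()[0]
        return name in names and tvm.runtime.enabled(name)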
diff --git a/tests/python/unittest/test_tir_analysis_verify_gpu_code.py b/tests/python/unittest/test_tir_analysis_verify_gpu_code.py
index 2e37de49f2435..019e989809b8f 100644
--- a/tests/python/unittest/test_tir_analysis_verify_gpu_code.py
+++ b/tests/python/unittest/test_tir_analysis_verify_gpu_code.py
@@ -17,6 +17,7 @@
 """Test gpu code verifier"""
 import tvm
 from tvm import te
+from tvm.testing import requires_gpu, device_enabled
 
 def get_verify_pass(valid, **kwargs):
     def _fverify(f, *_):
@@ -25,6 +26,7 @@ def _fverify(f, *_):
     return tvm.tir.transform.prim_func_pass(_fverify, opt_level=0)
 
 
+@requires_gpu
 def test_shared_memory():
     def check_shared_memory(dtype):
         N = 1024
@@ -47,7 +49,7 @@ def check_shared_memory(dtype):
         # thread usage: M
 
         for target in ['opencl', 'cuda']:
-            if not tvm.context(target).exist:
+            if not device_enabled(target):
                 continue
             valid = [None]
             with tvm.transform.PassContext(config={"tir.add_lower_pass": [
@@ -66,6 +68,7 @@ def check_shared_memory(dtype):
     check_shared_memory('float32')
     check_shared_memory('int8x4')
 
+@requires_gpu
 def test_local_memory():
     N = 1024
     M = 128
@@ -83,7 +86,7 @@ def test_local_memory():
     # thread usage: M
 
     for target in ['opencl', 'cuda']:
-        if not tvm.context(target).exist:
+        if not device_enabled(target):
            continue
 
        valid = [None]
@@ -101,6 +104,7 @@ def test_local_memory():
         tvm.build(s, [A, B], target)
     assert valid[0]
 
+@requires_gpu
 def test_num_thread():
     N = 1024
     M = 128
@@ -118,7 +122,7 @@ def test_num_thread():
     # thread usage: N
 
     for target in ['opencl', 'cuda']:
-        if not tvm.context(target).exist:
+        if not device_enabled(target):
             continue
 
         valid = [None]
@@ -152,6 +156,7 @@ def test_num_thread():
         tvm.build(s, [A, B], target)
     assert valid[0]
 
+@requires_gpu
 def test_multiple_kernels():
     N = 1024
 
@@ -168,7 +173,7 @@ def test_multiple_kernels():
     # thread usage: N
 
     for target in ['opencl', 'cuda']:
-        if not tvm.context(target).exist:
+        if not device_enabled(target):
             continue
 
         valid = [None]
@@ -186,6 +191,7 @@ def test_multiple_kernels():
         tvm.build(s, [A, C], target)
     assert valid[0]
 
+@requires_gpu
 def test_wrong_bind():
     N = 1024
 
@@ -199,7 +205,7 @@ def test_wrong_bind():
     s[B].bind(s[B].op.axis[1], te.thread_axis("threadIdx.x"))
 
     for target in ['opencl', 'cuda']:
-        if not tvm.context(target).exist:
+        if not device_enabled(target):
             continue
 
         valid = [None]
@@ -208,6 +214,7 @@ def test_wrong_bind():
         tvm.build(s, [A, B], target)
     assert not valid[0]
 
+@requires_gpu
 def test_vectorize():
     N = 1024
 
@@ -224,7 +231,7 @@ def test_vectorize():
     s[B].vectorize(ji)
 
     for target in ['opencl', 'cuda']:
-        if not tvm.context(target).exist:
+        if not device_enabled(target):
             continue
 
         valid = [None]
@@ -233,6 +240,7 @@ def test_vectorize():
         tvm.lower(s, [A, B])
     assert not valid[0]
 
+@requires_gpu
 def test_vthread():
     N = 1024
 
@@ -245,7 +253,7 @@ def test_vthread():
     s[B].bind(s[B].op.axis[1], te.thread_axis("vthread"))
 
     for target in ['opencl', 'cuda']:
-        if not tvm.context(target).exist:
+        if not device_enabled(target):
             continue
 
         valid = [None]
diff --git a/tests/python/unittest/test_tir_analysis_verify_memory.py b/tests/python/unittest/test_tir_analysis_verify_memory.py
index 386fceb150e38..26d1d53fc50ba 100644
--- a/tests/python/unittest/test_tir_analysis_verify_memory.py
+++ b/tests/python/unittest/test_tir_analysis_verify_memory.py
@@ -17,6 +17,7 @@
 import tvm
 import pytest
 from tvm import te
+from tvm.testing import device_enabled, gpu
 
 # The following DLDeviceType/TVMDeviceExtType values
 # are originally defined in dlpack.h and c_runtime_api.h.
@@ -27,6 +28,7 @@
 # All computations are bound.
 # So VerifyMemory pass is expected to succeed.
 #
+@gpu
 def test_verify_memory_all_bind():
     n = te.var("n")
     A = te.placeholder((n,), name='A')
@@ -41,15 +43,17 @@ def test_verify_memory_all_bind():
     mod = tvm.lower(s, [A, B])
 
     for dev_type in gpu_devices + other_devices:
-        binded_mod = tvm.tir.transform.Apply(
-            lambda f: f.with_attr("target", tvm.target.create(dev_type)))(mod)
-        tvm.tir.transform.VerifyMemory()(binded_mod)
+        if device_enabled(dev_type):
+            binded_mod = tvm.tir.transform.Apply(
+                lambda f: f.with_attr("target", tvm.target.create(dev_type)))(mod)
+            tvm.tir.transform.VerifyMemory()(binded_mod)
 
 
 # Computations are not bound.
 # So VerifyMemory pass fails when device type is GPU.
 #
+@gpu
 def test_verify_memory_not_bind():
     n = te.var("n")
     A = te.placeholder((n,), name='A')
@@ -61,20 +65,23 @@ def test_verify_memory_not_bind():
     mod = tvm.lower(s, [A, B])
 
     for dev_type in gpu_devices:
-        binded_mod = tvm.tir.transform.Apply(
-            lambda f: f.with_attr("target", tvm.target.create(dev_type)))(mod)
-        with pytest.raises(RuntimeError):
-            tvm.tir.transform.VerifyMemory()(binded_mod)
+        if device_enabled(dev_type):
+            binded_mod = tvm.tir.transform.Apply(
+                lambda f: f.with_attr("target", tvm.target.create(dev_type)))(mod)
+            with pytest.raises(RuntimeError):
+                tvm.tir.transform.VerifyMemory()(binded_mod)
 
     for dev_type in other_devices:
-        binded_mod = tvm.tir.transform.Apply(
-            lambda f: f.with_attr("target", tvm.target.create(dev_type)))(mod)
-        tvm.tir.transform.VerifyMemory()(binded_mod)
+        if device_enabled(dev_type):
+            binded_mod = tvm.tir.transform.Apply(
+                lambda f: f.with_attr("target", tvm.target.create(dev_type)))(mod)
+            tvm.tir.transform.VerifyMemory()(binded_mod)
 
 
 # Computations are partially bound.
 # So VerifyMemory pass fails when device type is GPU.
 #
+@gpu
 def test_verify_memory_partially_bind():
     n = te.var("n")
     A = te.placeholder((n,), name='A')
@@ -91,15 +98,17 @@ def test_verify_memory_partially_bind():
     mod = tvm.lower(s, [A, B, C, D])
 
     for dev_type in gpu_devices:
-        binded_mod = tvm.tir.transform.Apply(
-            lambda f: f.with_attr("target", tvm.target.create(dev_type)))(mod)
-        with pytest.raises(RuntimeError):
-            tvm.tir.transform.VerifyMemory()(binded_mod)
+        if device_enabled(dev_type):
+            binded_mod = tvm.tir.transform.Apply(
+                lambda f: f.with_attr("target", tvm.target.create(dev_type)))(mod)
+            with pytest.raises(RuntimeError):
+                tvm.tir.transform.VerifyMemory()(binded_mod)
 
     for dev_type in other_devices:
-        binded_mod = tvm.tir.transform.Apply(
-            lambda f: f.with_attr("target", tvm.target.create(dev_type)))(mod)
-        tvm.tir.transform.VerifyMemory()(binded_mod)
+        if device_enabled(dev_type):
+            binded_mod = tvm.tir.transform.Apply(
+                lambda f: f.with_attr("target", tvm.target.create(dev_type)))(mod)
+            tvm.tir.transform.VerifyMemory()(binded_mod)
diff --git a/tests/python/unittest/test_tir_ir_builder.py b/tests/python/unittest/test_tir_ir_builder.py
index 95047f5344c76..c8b28e178c022 100644
--- a/tests/python/unittest/test_tir_ir_builder.py
+++ b/tests/python/unittest/test_tir_ir_builder.py
@@ -17,6 +17,7 @@
 import tvm
 from tvm import te
 import numpy as np
+from tvm.testing import requires_gpu, device_enabled
 
 def test_for():
     ib = tvm.tir.ir_builder.create()
@@ -90,7 +91,7 @@ def test_device_ir(A, B, C):
                           name="vector_add", dtype=dtype)
     s = te.create_schedule(C.op)
     def check_target(target):
-        if not tvm.runtime.enabled(target):
+        if not device_enabled(target):
             return
         # build and invoke the kernel.
         fadd = tvm.build(s, [A, B, C], target)
@@ -103,6 +104,7 @@ def check_target(target):
         tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
     check_target("llvm")
 
+@requires_gpu
 def test_gpu():
     n = te.size_var('n')
     dtype = "float32"
@@ -133,7 +135,7 @@ def test_device_ir(A, B, C):
     stmt = tvm.te.schedule.ScheduleOps(s, bounds)
     def check_target(target):
         n = 1024
-        if not tvm.runtime.enabled(target):
+        if not device_enabled(target):
             return
         # build and invoke the kernel.
         fadd = tvm.build(s, [A, B, C], target)
diff --git a/tests/python/unittest/test_tir_transform_lower_warp_memory.py b/tests/python/unittest/test_tir_transform_lower_warp_memory.py
index 5801200c15daa..3b78efacda940 100644
--- a/tests/python/unittest/test_tir_transform_lower_warp_memory.py
+++ b/tests/python/unittest/test_tir_transform_lower_warp_memory.py
@@ -19,7 +19,9 @@
 from tvm.contrib.nvcc import have_fp16
 
 import numpy as np
+from tvm.testing import requires_gpu, requires_cuda, device_enabled
 
+@requires_cuda
 def test_lower_warp_memory_local_scope():
     m = 128
     A = te.placeholder((m,), name='A')
@@ -47,6 +49,7 @@ def test_lower_warp_memory_local_scope():
     assert(fdevice.body.body.value.value == "local")
     assert(fdevice.body.body.body.extents[0].value == 2)
 
+@requires_cuda
 def test_lower_warp_memory_correct_indices():
     n = 32
     A = te.placeholder((2, n, n), name='A', dtype="float32")
@@ -83,11 +86,10 @@ def test_lower_warp_memory_correct_indices():
     assert "threadIdx.x" in idx_names
     assert "threadIdx.y" not in idx_names
 
+@requires_gpu
+@requires_cuda
 def test_lower_warp_memory_cuda_end_to_end():
     def check_cuda(dtype):
-        if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
-            print("skip because cuda is not enabled..")
-            return
         if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
             print("Skip because gpu does not have fp16 support")
             return
@@ -127,11 +129,10 @@ def check_cuda(dtype):
     check_cuda("float32")
     check_cuda("float16")
 
+@requires_gpu
+@requires_cuda
 def test_lower_warp_memory_cuda_half_a_warp():
     def check_cuda(dtype):
-        if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
-            print("skip because cuda is not enabled..")
-            return
         if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
             print("Skip because gpu does not have fp16 support")
             return
@@ -170,11 +171,10 @@ def check_cuda(dtype):
     check_cuda("float32")
     check_cuda("float16")
 
+@requires_gpu
+@requires_cuda
 def test_lower_warp_memory_cuda_2_buffers():
     def check_cuda(dtype):
-        if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
-            print("skip because cuda is not enabled..")
-            return
         if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
             print("Skip because gpu does not have fp16 support")
             return
@@ -218,6 +218,7 @@ def check_cuda(dtype):
     check_cuda("float32")
     check_cuda("float16")
 
+@requires_gpu
 def test_lower_warp_memory_roundup():
     def check(device, m):
         A = te.placeholder((m,), name='A')
@@ -246,7 +247,7 @@ def check(device, m):
         tvm.testing.assert_allclose(B_nd.asnumpy(), B_np)
 
     for device in ['cuda', 'rocm']:
-        if not tvm.context(device, 0).exist or not tvm.runtime.enabled(device):
+        if not device_enabled(device):
             print("skip because", device,"is not enabled..")
             continue
         check(device, m=31)
diff --git a/tests/python/unittest/test_tir_transform_thread_sync.py b/tests/python/unittest/test_tir_transform_thread_sync.py
index 3ff6804cf7e0c..61c14c5bfb399 100644
--- a/tests/python/unittest/test_tir_transform_thread_sync.py
+++ b/tests/python/unittest/test_tir_transform_thread_sync.py
@@ -16,7 +16,9 @@
 # under the License.
 import tvm
 from tvm import te
+from tvm.testing import requires_cuda
 
+@requires_cuda
 def test_thread_storage_sync():
     m = te.size_var('m')
     l = te.size_var('l')
diff --git a/tests/scripts/setup-pytest-env.sh b/tests/scripts/setup-pytest-env.sh
index 61c079aa4744a..e7429e1ce9a2a 100755
--- a/tests/scripts/setup-pytest-env.sh
+++ b/tests/scripts/setup-pytest-env.sh
@@ -26,5 +26,20 @@ else
 fi
 set -u
 
+export TVM_TEST_DEVICES=""
+while test $# -gt 0
+do
+    case "$1" in
+        cpu) export TVM_TEST_DEVICES="llvm;llvm -device=arm_cpu;$TVM_TEST_DEVICES"
+            ;;
+        gpu) export TVM_TEST_DEVICES="cuda;opencl;metal;rocm;vulkan;nvptx;opencl -device=mali,aocl_sw_emu;$TVM_TEST_DEVICES"
+            export PYTEST_ADDOPTS="-m gpu $PYTEST_ADDOPTS"
+            ;;
+        *) export TVM_TEST_DEVICES="$1;$TVM_TEST_DEVICES"
+            ;;
+    esac
+    shift
+done
+
 export TVM_PATH=`pwd`
 export PYTHONPATH=${TVM_PATH}/python
diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh
index d61895c459737..c9e6712ae30e5 100755
--- a/tests/scripts/task_python_integration.sh
+++ b/tests/scripts/task_python_integration.sh
@@ -63,7 +63,7 @@ TVM_FFI=ctypes python3 -m pytest apps/dso_plugin_module
 TVM_FFI=ctypes python3 -m pytest tests/python/integration
 TVM_FFI=ctypes python3 -m pytest tests/python/contrib
 
-TVM_FFI=ctypes python3 -m pytest tests/python/relay
+TVM_TEST_DEVICES="llvm;cuda" TVM_FFI=ctypes python3 -m pytest tests/python/relay
 
 # Do not enable OpenGL
 # TVM_FFI=cython python -m pytest tests/webgl
diff --git a/tests/scripts/task_python_topi.sh b/tests/scripts/task_python_topi.sh
index 3bc3caf825cf8..ce24887a64062 100755
--- a/tests/scripts/task_python_topi.sh
+++ b/tests/scripts/task_python_topi.sh
@@ -31,4 +31,4 @@ make cython3
 # cleanup pycache
 find . -type f -path "*.pyc" | xargs rm -f
 
-python3 -m pytest tests/python/topi/
+python3 -m pytest tests/python/topi/ -m "${TVM_TEST_DEVICE-}"
diff --git a/tutorials/frontend/deploy_ssd_gluoncv.py b/tutorials/frontend/deploy_ssd_gluoncv.py
index 46162e116496e..c32cfb4171442 100644
--- a/tutorials/frontend/deploy_ssd_gluoncv.py
+++ b/tutorials/frontend/deploy_ssd_gluoncv.py
@@ -27,7 +27,7 @@
 from tvm import te
 
 from matplotlib import pyplot as plt
-from tvm.relay.testing.config import ctx_list
+from tvm.relay.testing.config import enabled_devices
 from tvm import relay
 from tvm.contrib import graph_runtime
 from tvm.contrib.download import download_testdata
@@ -70,7 +70,7 @@
 model_name = supported_model[0]
 dshape = (1, 3, 512, 512)
 
-target_list = ctx_list()
+target_list = enabled_devices()
 
 ######################################################################
 # Download and pre-process demo image
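Note: the tutorial edit above relies on the `ctx_list()` -> `enabled_devices()` rename in tvm.relay.testing.config. Since `target_list` is iterated exactly as before, the helper presumably keeps the old contract of returning (target, ctx) pairs; `enabled_devices_sketch` below is a hypothetical illustration under that assumption, not the shipped implementation:

    # Sketch only; the real helper lives in tvm.relay.testing.config.
    import os
    import tvm

    def enabled_devices_sketch():
        """Hypothetical: (target, ctx) pairs for each requested, present device."""
        requested = os.environ.get("TVM_TEST_DEVICES", "llvm").split(";")
        pairs = []
        for target in filter(None, requested):  # skip the trailing empty entry
            # the context needs the bare device name, without target options
            ctx = tvm.context(target.split()[0], 0)
            if ctx.exist:
                pairs.append((target, ctx))
        return pairs

    # Usage, mirroring the tutorial:
    # for target, ctx in enabled_devices_sketch():
    #     ...compile and run the model once per enabled device...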