diff --git a/src/target/spirv/spirv_support.cc b/src/target/spirv/spirv_support.cc
index 0f1207f3e9a8..1ef56198df7f 100644
--- a/src/target/spirv/spirv_support.cc
+++ b/src/target/spirv/spirv_support.cc
@@ -72,6 +72,9 @@ SPIRVSupport::SPIRVSupport(tvm::Target target) {
   if (target->GetAttr<Bool>("supports_float16")) {
     supports_float16 = target->GetAttr<Bool>("supports_float16").value();
   }
+  if (target->GetAttr<Bool>("supports_float64")) {
+    supports_float64 = target->GetAttr<Bool>("supports_float64").value();
+  }
   if (target->GetAttr<Bool>("supports_int8")) {
     supports_int8 = target->GetAttr<Bool>("supports_int8").value();
   }
diff --git a/tests/python/topi/python/test_topi_conv1d_transpose_ncw.py b/tests/python/topi/python/test_topi_conv1d_transpose_ncw.py
index 81d3b3fd7f3f..93cfecf4239d 100644
--- a/tests/python/topi/python/test_topi_conv1d_transpose_ncw.py
+++ b/tests/python/topi/python/test_topi_conv1d_transpose_ncw.py
@@ -15,15 +15,20 @@
 # specific language governing permissions and limitations
 # under the License.
 """Test code for transposed convolution."""
-import numpy as np
+
 import itertools
+import os
+import sys
+
+import numpy as np
+import pytest
+
 import tvm
-from tvm import te
-from tvm import topi
+import tvm.testing
 import tvm.topi.testing
-from tvm.contrib.pickle_memoize import memoize
+
+from tvm import te, topi
 from tvm.topi.utils import get_const_tuple
-import tvm.testing
 
 _conv1d_transpose_ncw_implement = {
     "generic": (topi.nn.conv1d_transpose_ncw, topi.generic.schedule_conv1d_transpose_ncw),
@@ -31,74 +36,88 @@
 }
 
 
-def verify_conv1d_transpose_ncw(
-    batch, in_channel, in_size, num_filter, kernel, stride, padding, output_padding
+(
+    batch,
+    in_channel,
+    in_size,
+    num_filter,
+    kernel,
+    stride,
+    padding,
+    output_padding,
+) = tvm.testing.parameters(
+    (1, 3, 224, 32, 5, 1, 0, (0,)),
+    (1, 3, 224, 32, 7, 1, 2, (0,)),
+    (1, 3, 224, 32, 5, 2, 1, (0,)),
+    (1, 3, 224, 32, 5, 2, 1, (1,)),
+    (1, 3, 224, 32, 5, 2, 0, (0,)),
+    (1, 32, 32, 128, 5, 1, 0, (0,)),
+    (1, 32, 32, 128, 5, 2, 1, (0,)),
+    (1, 1, 1024, 1, 512, 1, 256, (0,)),
+    (1, 1, 1024, 1, 512, 2, 256, (0,)),
+    (1, 1, 1024, 1, 512, 5, 256, (0,)),
+    (1, 1, 1024, 1, 512, 5, 256, (3,)),
+    (1, 2, 1024, 1, 128, 128, 0, (0,)),
+    (1, 1, 1024, 2, 128, 128, 0, (0,)),
+    (1, 1, 1024, 2, 2, 2, 0, (0,)),
+    (1, 1, 10, 1, 5, 1, (0, 3), (0,)),
+    (1, 1, 10, 1, 5, 1, (1, 3), (0,)),
+    (1, 1, 10, 1, 5, 1, (2, 3), (0,)),
+    (1, 257, 128, 1, 512, 128, 256, (0,)),
+)
+
+dtype = tvm.testing.parameter("float32")
+
+
+@tvm.testing.fixture(cache_return_value=True)
+def ref_data(
+    dtype, batch, in_channel, in_size, num_filter, kernel, stride, padding, output_padding
+):
+    dtype = "float32"
+    a_shape = (batch, in_channel, in_size)
+    w_shape = (in_channel, num_filter, kernel)
+
+    a_np = np.random.uniform(size=a_shape).astype(dtype)
+    w_np = np.random.uniform(size=w_shape).astype(dtype)
+    b_np = tvm.topi.testing.conv1d_transpose_ncw_python(a_np, w_np, stride, padding, output_padding)
+    c_np = np.maximum(b_np, 0)
+    return a_np, w_np, b_np, c_np
+
+
+@tvm.testing.known_failing_targets("vulkan")
+def test_conv1d_transpose_ncw(
+    target,
+    dev,
+    ref_data,
+    dtype,
+    stride,
+    padding,
+    output_padding,
 ):
-    in_width = in_size
-    A = te.placeholder((batch, in_channel, in_width), name="A")
-    W = te.placeholder((in_channel, num_filter, kernel), name="W")
-
-    a_shape = get_const_tuple(A.shape)
-    w_shape = get_const_tuple(W.shape)
-    dtype = A.dtype
-
-    @memoize("topi.tests.test_topi_conv1d_transpose.verify_conv1d_transpose_ncw")
-    def get_ref_data():
-        a_np = np.random.uniform(size=a_shape).astype(dtype)
-        w_np =
np.random.uniform(size=w_shape).astype(dtype) - b_np = tvm.topi.testing.conv1d_transpose_ncw_python( - a_np, w_np, stride, padding, output_padding - ) - c_np = np.maximum(b_np, 0) - return a_np, w_np, b_np, c_np - - a_np, w_np, b_np, c_np = get_ref_data() - - def check_target(target, dev): - dev = tvm.device(target, 0) - with tvm.target.Target(target): - fcompute, fschedule = tvm.topi.testing.dispatch(target, _conv1d_transpose_ncw_implement) - B = fcompute(A, W, stride, padding, A.dtype, output_padding) - C = topi.nn.relu(B) - s1 = fschedule([B]) - s2 = fschedule([C]) - a = tvm.nd.array(a_np, dev) - w = tvm.nd.array(w_np, dev) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) - - func1 = tvm.build(s1, [A, W, B], target) - func2 = tvm.build(s2, [A, W, C], target) - func1(a, w, b) - func2(a, w, c) - tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5) - tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) - - for target, dev in tvm.testing.enabled_targets(): - check_target(target, dev) - - -@tvm.testing.uses_gpu -def test_conv1d_transpose_ncw(): - verify_conv1d_transpose_ncw(1, 3, 224, 32, 5, 1, 0, (0,)) - verify_conv1d_transpose_ncw(1, 3, 224, 32, 7, 1, 2, (0,)) - verify_conv1d_transpose_ncw(1, 3, 224, 32, 5, 2, 1, (0,)) - verify_conv1d_transpose_ncw(1, 3, 224, 32, 5, 2, 1, (1,)) - verify_conv1d_transpose_ncw(1, 3, 224, 32, 5, 2, 0, (0,)) - verify_conv1d_transpose_ncw(1, 32, 32, 128, 5, 1, 0, (0,)) - verify_conv1d_transpose_ncw(1, 32, 32, 128, 5, 2, 1, (0,)) - verify_conv1d_transpose_ncw(1, 1, 1024, 1, 512, 1, 256, (0,)) - verify_conv1d_transpose_ncw(1, 1, 1024, 1, 512, 2, 256, (0,)) - verify_conv1d_transpose_ncw(1, 1, 1024, 1, 512, 5, 256, (0,)) - verify_conv1d_transpose_ncw(1, 1, 1024, 1, 512, 5, 256, (3,)) - verify_conv1d_transpose_ncw(1, 2, 1024, 1, 128, 128, 0, (0,)) - verify_conv1d_transpose_ncw(1, 1, 1024, 2, 128, 128, 0, (0,)) - verify_conv1d_transpose_ncw(1, 1, 1024, 2, 2, 2, 0, (0,)) - verify_conv1d_transpose_ncw(1, 1, 10, 1, 5, 1, (0, 3), (0,)) - verify_conv1d_transpose_ncw(1, 1, 10, 1, 5, 1, (1, 3), (0,)) - verify_conv1d_transpose_ncw(1, 1, 10, 1, 5, 1, (2, 3), (0,)) - verify_conv1d_transpose_ncw(1, 257, 128, 1, 512, 128, 256, (0,)) + + a_np, w_np, b_np, c_np = ref_data + + A = te.placeholder(a_np.shape, name="A", dtype=dtype) + W = te.placeholder(w_np.shape, name="W", dtype=dtype) + + with tvm.target.Target(target): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _conv1d_transpose_ncw_implement) + B = fcompute(A, W, stride, padding, A.dtype, output_padding) + C = topi.nn.relu(B) + s1 = fschedule([B]) + s2 = fschedule([C]) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) + + func1 = tvm.build(s1, [A, W, B], target) + func2 = tvm.build(s2, [A, W, C], target) + func1(a, w, b) + func2(a, w, c) + tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5) + tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) if __name__ == "__main__": - test_conv1d_transpose_ncw() + sys.exit(pytest.main(sys.argv)) diff --git a/tests/python/topi/python/test_topi_conv2d_nhwc.py b/tests/python/topi/python/test_topi_conv2d_nhwc.py index eb4c5a343b58..96359860f569 100644 --- a/tests/python/topi/python/test_topi_conv2d_nhwc.py +++ b/tests/python/topi/python/test_topi_conv2d_nhwc.py @@ -27,8 +27,8 @@ _conv2d_nhwc_implement = { - 
"llvm": (topi.nn.conv2d_nhwc, topi.generic.schedule_conv2d_nhwc), - "cuda": (topi.cuda.conv2d_nhwc, topi.cuda.schedule_conv2d_nhwc), + "generic": (topi.nn.conv2d_nhwc, topi.generic.schedule_conv2d_nhwc), + "gpu": (topi.cuda.conv2d_nhwc, topi.cuda.schedule_conv2d_nhwc), "cpu": (topi.nn.conv2d_nhwc, topi.x86.schedule_conv2d_nhwc), "arm_cpu": ( topi.arm_cpu.conv2d_nhwc_spatial_pack, @@ -45,61 +45,55 @@ "hls": (topi.nn.conv2d_nhwc, topi.hls.schedule_conv2d_nhwc), } +dtype = tvm.testing.parameter("float32") -def verify_conv2d_nhwc(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1): - in_height = in_width = in_size - - A = te.placeholder((batch, in_height, in_width, in_channel), name="A") - W = te.placeholder((kernel, kernel, in_channel, num_filter), name="W") +batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation = tvm.testing.parameters( + (1, 256, 32, 256, 3, 1, "SAME", 1), + (4, 128, 16, 128, 5, 2, "SAME", 1), + (4, 128, 16, 256, 5, 2, "SAME", 1), + (1, 256, 32, 256, 3, 1, "VALID", 1), + (1, 256, 32, 256, 3, 1, "VALID", 1), + (4, 128, 16, 128, 5, 2, "VALID", 1), + (4, 128, 16, 256, 5, 2, "VALID", 1), + (1, 128, 16, 256, 3, 2, (0, 0, 1, 1), 1), + (1, 128, 16, 256, 3, 2, (1, 1, 2, 2), 1), + (1, 128, 16, 128, 5, 2, (3, 3, 2, 2), 1), + (1, 128, 16, 256, 3, 2, (0, 1, 2, 3), 1), + (1, 256, 32, 256, 3, 1, "SAME", 2), + (1, 256, 32, 256, 3, 1, (1, 1, 2, 2), 2), +) - a_shape = get_const_tuple(A.shape) - w_shape = get_const_tuple(W.shape) - dtype = A.dtype - @memoize("topi.tests.test_topi_conv2d_nhwc.verify_nhwc.v2") - def get_ref_data(): - a_np = np.random.uniform(size=a_shape).astype(dtype) - w_np = np.random.uniform(size=w_shape).astype(dtype) - dw_np = tvm.topi.testing.dilate_python(w_np, (dilation, dilation, 1, 1)) - b_np = tvm.topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding) - return a_np, w_np, b_np +@tvm.testing.fixture(cache_return_value=True) +def ref_data(dtype, batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation): + in_height = in_width = in_size + a_shape = (batch, in_height, in_width, in_channel) + w_shape = (kernel, kernel, in_channel, num_filter) - a_np, w_np, b_np = get_ref_data() + a_np = np.random.uniform(size=a_shape).astype(dtype) + w_np = np.random.uniform(size=w_shape).astype(dtype) + dw_np = tvm.topi.testing.dilate_python(w_np, (dilation, dilation, 1, 1)) + b_np = tvm.topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding) + return a_np, w_np, b_np - def check_device(target, dev): - print("Running on target: %s" % target) - with tvm.target.Target(target): - fcompute, fschedule = tvm.topi.testing.dispatch(target, _conv2d_nhwc_implement) - B = fcompute(A, W, stride, padding, dilation, dtype) - s = fschedule([B]) - a = tvm.nd.array(a_np, dev) - w = tvm.nd.array(w_np, dev) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) - func = tvm.build(s, [A, W, B], target) - func(a, w, b) - tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5) - for target, dev in tvm.testing.enabled_targets(): - check_device(target, dev) +def test_conv2d_nhwc(target, dev, ref_data, dtype, stride, padding, dilation): + a_np, w_np, b_np = ref_data + A = te.placeholder(a_np.shape, name="A", dtype=dtype) + W = te.placeholder(w_np.shape, name="W", dtype=dtype) -@tvm.testing.uses_gpu -def test_conv2d_nhwc(): - verify_conv2d_nhwc(1, 256, 32, 256, 3, 1, "SAME") - verify_conv2d_nhwc(4, 128, 16, 128, 5, 2, "SAME") - verify_conv2d_nhwc(4, 128, 16, 256, 5, 2, "SAME") - verify_conv2d_nhwc(1, 256, 32, 256, 3, 
1, "VALID") - verify_conv2d_nhwc(1, 256, 32, 256, 3, 1, "VALID") - verify_conv2d_nhwc(4, 128, 16, 128, 5, 2, "VALID") - verify_conv2d_nhwc(4, 128, 16, 256, 5, 2, "VALID") - verify_conv2d_nhwc(1, 128, 16, 256, 3, 2, (0, 0, 1, 1)) - verify_conv2d_nhwc(1, 128, 16, 256, 3, 2, (1, 1, 2, 2)) - verify_conv2d_nhwc(1, 128, 16, 128, 5, 2, (3, 3, 2, 2)) - verify_conv2d_nhwc(1, 128, 16, 256, 3, 2, (0, 1, 2, 3)) - # dilation = 2 - verify_conv2d_nhwc(1, 256, 32, 256, 3, 1, "SAME", dilation=2) - verify_conv2d_nhwc(1, 256, 32, 256, 3, 1, (1, 1, 2, 2), dilation=2) + with tvm.target.Target(target): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _conv2d_nhwc_implement) + B = fcompute(A, W, stride, padding, dilation, dtype) + s = fschedule([B]) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) + func = tvm.build(s, [A, W, B], target) + func(a, w, b) + tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5) if __name__ == "__main__": - test_conv2d_nhwc() + sys.exit(pytest.main(sys.argv)) diff --git a/tests/python/topi/python/test_topi_correlation.py b/tests/python/topi/python/test_topi_correlation.py index e6323065d9be..3dff54dfa694 100644 --- a/tests/python/topi/python/test_topi_correlation.py +++ b/tests/python/topi/python/test_topi_correlation.py @@ -15,125 +15,82 @@ # specific language governing permissions and limitations # under the License """test of correlation operator in NCHW layout""" +import sys + import numpy as np +import pytest + import tvm -from tvm import te -from tvm import autotvm -from tvm import topi -import tvm.testing import tvm.topi.testing -from tvm.contrib.pickle_memoize import memoize -from tvm.topi.utils import get_const_tuple + +from tvm import autotvm, te, topi _correlation_implement = { "generic": (topi.nn.correlation_nchw, topi.generic.schedule_correlation_nchw), - "cuda": (topi.cuda.correlation_nchw, topi.cuda.schedule_correlation_nchw), + "gpu": (topi.cuda.correlation_nchw, topi.cuda.schedule_correlation_nchw), } +( + data_shape, + kernel_size, + max_displacement, + stride1, + stride2, + pad_size, + is_multiply, +) = tvm.testing.parameters( + ((1, 3, 10, 10), 1, 4, 1, 1, 4, True), + ((1, 3, 10, 10), 1, 5, 1, 1, 5, True), + ((5, 1, 4, 4), 3, 1, 2, 1, 2, True), + ((5, 1, 6, 4), 3, 1, 2, 2, 2, False), + ((5, 1, 11, 11), 5, 1, 1, 1, 2, False), +) + +dtype = tvm.testing.parameter("float32") -def verify_correlation_nchw( - data_shape, kernel_size, max_displacement, stride1, stride2, pad_size, is_multiply + +@tvm.testing.fixture(cache_return_value=True) +def ref_data( + dtype, data_shape, kernel_size, max_displacement, stride1, stride2, pad_size, is_multiply ): - print( - "Workload: (%d, %d, %d, %d, %d, %d, %d, %d, %d, %d)" - % ( - data_shape[0], - data_shape[1], - data_shape[2], - data_shape[3], - kernel_size, - max_displacement, - stride1, - stride2, - pad_size, - is_multiply, - ) + a_np = np.random.uniform(size=data_shape).astype(dtype) + b_np = np.random.uniform(size=data_shape).astype(dtype) + c_np = tvm.topi.testing.correlation_nchw_python( + a_np, b_np, kernel_size, max_displacement, stride1, stride2, pad_size, is_multiply ) + return a_np, b_np, c_np - A = te.placeholder(data_shape, name="data1") - B = te.placeholder(data_shape, name="data2") - dtype = A.dtype - - @memoize("topi.tests.test_topi_correlation_nchw.verify_correlation_nchw") - def get_ref_data(): - a_np = np.random.uniform(size=data_shape).astype(dtype) - b_np = np.random.uniform(size=data_shape).astype(dtype) - c_np = 
tvm.topi.testing.correlation_nchw_python( - a_np, b_np, kernel_size, max_displacement, stride1, stride2, pad_size, is_multiply - ) - return a_np, b_np, c_np - - a_np, b_np, c_np = get_ref_data() - - def check_device(target, dev): - print("Running on target: %s" % target) - fcompute, fschedule = tvm.topi.testing.dispatch(target, _correlation_implement) - with tvm.target.Target(target): - C = fcompute( - A, B, kernel_size, max_displacement, stride1, stride2, pad_size, is_multiply - ) - s = fschedule([C]) - - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.empty(c_np.shape, dtype=dtype, device=dev) - - func = tvm.build(s, [A, B, C], target) - func(a, b, c) - tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) - - for target, dev in tvm.testing.enabled_targets(): - check_device(target, dev) - - -@tvm.testing.uses_gpu -def test_correlation_nchw(): - verify_correlation_nchw( - (1, 3, 10, 10), - kernel_size=1, - max_displacement=4, - stride1=1, - stride2=1, - pad_size=4, - is_multiply=True, - ) - verify_correlation_nchw( - (1, 3, 10, 10), - kernel_size=1, - max_displacement=5, - stride1=1, - stride2=1, - pad_size=5, - is_multiply=True, - ) - verify_correlation_nchw( - (5, 1, 4, 4), - kernel_size=3, - max_displacement=1, - stride1=2, - stride2=1, - pad_size=2, - is_multiply=True, - ) - verify_correlation_nchw( - (5, 1, 6, 4), - kernel_size=3, - max_displacement=1, - stride1=2, - stride2=2, - pad_size=2, - is_multiply=False, - ) - verify_correlation_nchw( - (5, 1, 11, 11), - kernel_size=5, - max_displacement=1, - stride1=1, - stride2=1, - pad_size=2, - is_multiply=False, - ) + +def test_correlation_nchw( + target, + dev, + ref_data, + dtype, + kernel_size, + max_displacement, + stride1, + stride2, + pad_size, + is_multiply, +): + a_np, b_np, c_np = ref_data + + A = te.placeholder(a_np.shape, name="data1", dtype=dtype) + B = te.placeholder(b_np.shape, name="data2", dtype=dtype) + + fcompute, fschedule = tvm.topi.testing.dispatch(target, _correlation_implement) + with tvm.target.Target(target): + C = fcompute(A, B, kernel_size, max_displacement, stride1, stride2, pad_size, is_multiply) + s = fschedule([C]) + + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.empty(c_np.shape, dtype=dtype, device=dev) + + func = tvm.build(s, [A, B, C], target) + func(a, b, c) + tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) if __name__ == "__main__": - test_correlation_nchw() + sys.exit(pytest.main(sys.argv)) diff --git a/tests/python/topi/python/test_topi_loss.py b/tests/python/topi/python/test_topi_loss.py index bb7655b192f5..c1b61e5b49cd 100644 --- a/tests/python/topi/python/test_topi_loss.py +++ b/tests/python/topi/python/test_topi_loss.py @@ -25,9 +25,17 @@ import tvm.testing -def verify_nll_loss( - dev, target, prediction_shape, reduction="mean", ignore_index=-100, dtype="float32" -): +prediction_shape, reduction, ignore_index, dtype = tvm.testing.parameters( + ((10, 5), "mean", -100, "float32"), + ((10, 5, 2, 2), "mean", -100, "float32"), + ((10, 5), "sum", -100, "float32"), + ((10, 5), "none", -100, "float32"), + ((10, 5), "mean", 3, "float32"), + ((10, 5), "mean", -100, "float64"), +) + + +def test_nll_loss(target, dev, prediction_shape, reduction, ignore_index, dtype): C = prediction_shape[1] target_shape = prediction_shape[:1] + prediction_shape[2:] predictions = te.placeholder(shape=prediction_shape, name="predictions", dtype=dtype) @@ -56,15 +64,5 @@ def verify_nll_loss( tvm.testing.assert_allclose(out_topi, out_npy, rtol=1e-4, atol=1e-5) 
-@tvm.testing.parametrize_targets -def test_nll_loss(dev, target): - verify_nll_loss(dev, target, (10, 5)) - verify_nll_loss(dev, target, (10, 5, 2, 2)) - verify_nll_loss(dev, target, (10, 5), reduction="sum") - verify_nll_loss(dev, target, (10, 5), reduction="none") - verify_nll_loss(dev, target, (10, 5), ignore_index=3) - verify_nll_loss(dev, target, (10, 5), dtype="float64") - - if __name__ == "__main__": - test_nll_loss(tvm.device("cpu"), tvm.target.Target("llvm")) + sys.exit(pytest.main(sys.argv)) diff --git a/tests/python/topi/python/test_topi_math.py b/tests/python/topi/python/test_topi_math.py index c7f80033bdf3..5ee049fa379a 100644 --- a/tests/python/topi/python/test_topi_math.py +++ b/tests/python/topi/python/test_topi_math.py @@ -14,14 +14,19 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. + +import sys + import numpy as np +import pytest import scipy from scipy import special + import tvm -from tvm import te -from tvm import topi import tvm.testing import tvm.topi.testing + +from tvm import te, topi from tvm.topi import utils @@ -31,211 +36,205 @@ def test_util(): assert utils.get_const_tuple((x, x)) == (100, 100) -@tvm.testing.uses_gpu -def test_ewise(): - def test_apply( - func, - name, - f_numpy, - low, - high, - shape=(20, 3), - dtype="float32", - check_round=False, - skip_name_check=False, - ): - m = te.var("m") - l = te.var("l") - A = te.placeholder((m, l), dtype=dtype, name="A") - - B = func(A) - assert tuple(B.shape) == tuple(A.shape) - if not skip_name_check: - assert B.op.body[0].op.name == "tir." + name - a_np = np.random.uniform(low=low, high=high, size=shape).astype(A.dtype) * 10 - # avoid round check too close to boundary - if check_round: - a_np += ((np.abs(np.fmod(a_np, 1)) - 0.5) < 1e-6) * 1e-4 - b_np = f_numpy(a_np) - - def check_target(target, dev): - print("Running on target: %s" % target) - with tvm.target.Target(target): - s = tvm.topi.testing.get_injective_schedule(target)(B) - foo = tvm.build(s, [A, B], target, name=name) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(np.zeros_like(b_np), dev) - foo(a, b) - tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5, atol=1e-5) - - for target, dev in tvm.testing.enabled_targets(): - check_target(target, dev) - - def test_isnan( - low, - high, - shape=(20, 3), - dtype="float32", - check_round=False, - skip_name_check=False, - ): - m = te.var("m") - l = te.var("l") - A = te.placeholder((m, l), dtype=dtype, name="A") - - B = topi.isnan(A) - assert tuple(B.shape) == tuple(A.shape) - if not skip_name_check: - assert B.op.body[0].op.name == "tir.isnan" - a_np = np.random.uniform(low=low, high=high, size=shape).astype(A.dtype) * 10 - a_np.ravel()[np.random.choice(a_np.size, int(a_np.size * 0.5), replace=False)] = np.nan - # avoid round check too close to boundary - if check_round: - a_np += ((np.abs(np.fmod(a_np, 1)) - 0.5) < 1e-6) * 1e-5 - b_np = np.isnan(a_np) - - def check_target(target, dev): - print("Running on target: %s" % target) - with tvm.target.Target(target): - s = tvm.topi.testing.get_injective_schedule(target)(B) - foo = tvm.build(s, [A, B], target, name="isnan") - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(np.zeros_like(b_np), dev) - foo(a, b) - tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5, atol=1e-5) - - for target, dev in tvm.testing.enabled_targets(): - check_target(target, dev) - - def test_infiniteness_ops(topi_op, ref_op, name): - for dtype in ["float32", "float64", "int32", "int16"]: - m 
= te.var("m") - l = te.var("l") - A = te.placeholder((m, l), dtype=dtype, name="A") - B = topi_op(A) - assert tuple(B.shape) == tuple(A.shape) - - a_np = np.random.uniform(size=(8, 8)).astype(A.dtype) * 10 - if dtype.startswith("float"): - a_np.ravel()[ - np.random.choice(a_np.size, int(a_np.size * 0.5), replace=False) - ] = np.infty - a_np.ravel()[ - np.random.choice(a_np.size, int(a_np.size * 0.5), replace=False) - ] = np.nan - b_np = ref_op(a_np) - - def check_target(target, dev): - with tvm.target.Target(target): - s = tvm.topi.testing.get_injective_schedule(target)(B) - foo = tvm.build(s, [A, B], target, name=name) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(np.zeros_like(b_np), dev) - foo(a, b) - tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5, atol=1e-5) - - for target, dev in tvm.testing.enabled_targets(): - check_target(target, dev) - - test_apply(topi.floor, "floor", np.floor, -100, 100) - test_apply(topi.ceil, "ceil", np.ceil, -100, 100) - test_apply(topi.sign, "sign", np.sign, -100, 100, skip_name_check=True) - test_apply(topi.trunc, "trunc", np.trunc, -100, 100) - test_apply(topi.abs, "fabs", np.abs, -100, 100) - test_apply(topi.round, "round", np.round, -100, 100, check_round=True) - test_apply(topi.exp, "exp", np.exp, -1, 1) - test_apply(topi.tanh, "tanh", np.tanh, -10, 10, shape=(128, 128)) - test_apply(topi.tanh, "tanh", np.tanh, -10, 10, shape=(128, 128), dtype="float64") - test_apply(topi.sigmoid, "sigmoid", lambda x: 1 / (1 + np.exp(-x)), -1, 1) - test_apply(topi.log, "log", np.log, 0, 100) - test_apply(topi.sqrt, "sqrt", np.sqrt, 0, 100) - test_apply( - topi.rsqrt, "rsqrt", lambda x: np.ones_like(x) / np.sqrt(x), 0, 100, skip_name_check=True - ) - test_apply(topi.cos, "cos", np.cos, -2.0 * np.pi, 2.0 * np.pi) - test_apply(topi.tan, "tan", np.tan, -2.0 * np.pi, 2.0 * np.pi, dtype="float32") - test_apply(topi.tan, "tan", np.tan, -2.0 * np.pi, 2.0 * np.pi, dtype="float64") - test_apply(topi.sin, "sin", np.sin, -2.0 * np.pi, 2.0 * np.pi) - test_apply(topi.erf, "erf", scipy.special.erf, -0.1, 0.1, dtype="float32") - test_isnan(-100, 100) - test_infiniteness_ops(topi.isfinite, np.isfinite, "isifinite") - test_infiniteness_ops(topi.isinf, np.isinf, "isinf") - - -@tvm.testing.uses_gpu -def test_cast(): - def verify(from_dtype, to_dtype, low=-100, high=100): - shape = (5, 4) - A = te.placeholder(shape, dtype=from_dtype, name="A") - B = topi.cast(A, to_dtype) - - if from_dtype == "bool": - a_np = np.random.choice([True, False], size=shape) - else: - a_np = np.random.uniform(low, high, size=shape).astype(from_dtype) - if to_dtype == "bool": - a_np = a_np - a_np[2, 3] - b_np = a_np.astype(to_dtype) - - for target, dev in tvm.testing.enabled_targets(): - print("Running on target: %s" % target) - with tvm.target.Target(target): - s = tvm.topi.testing.get_injective_schedule(target)(B) - foo = tvm.build(s, [A, B], target) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.empty(shape=shape, dtype=to_dtype, device=dev) - foo(a, b) - tvm.testing.assert_allclose(b.numpy(), b_np) - - verify("int32", "float32") - verify("int32", "float64") - verify("int32", "bool") - verify("float32", "int32") - verify("float32", "float64") - verify("float32", "bool") - verify("bool", "float32") - verify("bool", "int32") - - -def test_fastmath(): - def test_apply(func, name, f_numpy, low, high, step, dtype="float32"): - a_np = np.arange(low, high, step).astype(dtype).reshape((1, -1)) - b_np = f_numpy(a_np) - A = te.placeholder(a_np.shape, dtype=dtype, name="A") - B = func(A) - assert tuple(B.shape) == 
tuple(A.shape)
-
-        def check_target(target):
-            dev = tvm.device(target, 0)
-            if not tvm.testing.device_enabled(target):
-                print("Skip because %s is not enabled" % target)
-                return
-            with tvm.target.Target(target):
-                s = topi.generic.schedule_injective(B)
-                func = tvm.build(s, [A, B], target, name=name)
-            a = tvm.nd.array(a_np, dev)
-            b = tvm.nd.array(np.zeros_like(b_np), dev)
-            func(a, b)
-            tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5, atol=1e-5)
-
-        check_target("llvm")
-        check_target("llvm -device=arm-cpu")
-
-    test_apply(topi.fast_exp, "fast_exp", np.exp, low=-88, high=88, step=0.01)
-    test_apply(topi.fast_erf, "fast_erf", scipy.special.erf, low=-10, high=10, step=0.01)
-    test_apply(topi.fast_tanh, "fast_tanh", np.tanh, low=-10, high=10, step=0.01)
-    test_apply(
-        topi.nn.fast_softmax,
-        "fast_softmax",
-        tvm.topi.testing.softmax_python,
-        low=-10,
-        high=10,
-        step=0.01,
-    )
+ewise_operations = {
+    "floor": {"topi": topi.floor, "ref": np.floor, "input_range": (-100, 100)},
+    "ceil": {"topi": topi.ceil, "ref": np.ceil, "input_range": (-100, 100)},
+    "sign": {
+        "topi": topi.sign,
+        "ref": np.sign,
+        "input_range": (-100, 100),
+        "skip_name_check": True,
+    },
+    "trunc": {"topi": topi.trunc, "ref": np.trunc, "input_range": (-100, 100)},
+    "fabs": {"topi": topi.abs, "ref": np.fabs, "input_range": (-100, 100)},
+    "round": {"topi": topi.round, "ref": np.round, "input_range": (-100, 100), "check_round": True},
+    "exp": {"topi": topi.exp, "ref": np.exp, "input_range": (-1, 1)},
+    "tanh": {
+        "topi": topi.tanh,
+        "ref": np.tanh,
+        "input_range": (-10, 10),
+        "shape": (128, 128),
+        "dtypes": ["float32", "float64"],
+    },
+    "sigmoid": {
+        "topi": topi.sigmoid,
+        "ref": lambda x: 1 / (1 + np.exp(-x)),
+        "input_range": (-1, 1),
+    },
+    "log": {"topi": topi.log, "ref": np.log, "input_range": (0, 100)},
+    "sqrt": {"topi": topi.sqrt, "ref": np.sqrt, "input_range": (0, 100)},
+    "rsqrt": {
+        "topi": topi.rsqrt,
+        "ref": lambda x: np.ones_like(x) / np.sqrt(x),
+        "input_range": (0, 100),
+        "skip_name_check": True,
+    },
+    "cos": {"topi": topi.cos, "ref": np.cos, "input_range": (-2.0 * np.pi, 2.0 * np.pi)},
+    "tan": {
+        "topi": topi.tan,
+        "ref": np.tan,
+        "input_range": (-2.0 * np.pi, 2.0 * np.pi),
+        "dtypes": ["float32", "float64"],
+    },
+    "sin": {"topi": topi.sin, "ref": np.sin, "input_range": (-2.0 * np.pi, 2.0 * np.pi)},
+    "erf": {"topi": topi.erf, "ref": scipy.special.erf, "input_range": (-0.1, 0.1)},
+    "isnan": {
+        "topi": topi.isnan,
+        "ref": np.isnan,
+        "input_range": (-1, 1),
+        "replace_with_nan": True,
+    },
+    "isfinite": {
+        "topi": topi.isfinite,
+        "ref": np.isfinite,
+        "input_range": (0, 1),
+        "shape": (8, 8),
+        "skip_name_check": True,
+        "replace_with_nan": True,
+        "replace_with_inf": True,
+        "dtypes": ["float32", "float64", "int32", "int16"],
+    },
+    "isinf": {
+        "topi": topi.isinf,
+        "ref": np.isinf,
+        "input_range": (0, 1),
+        "shape": (8, 8),
+        "skip_name_check": True,
+        "replace_with_nan": True,
+        "replace_with_inf": True,
+        "dtypes": ["float32", "float64", "int32", "int16"],
+    },
+    "fast_exp": {
+        "topi": topi.fast_exp,
+        "ref": np.exp,
+        "skip_name_check": True,
+        "input_range": (-88, 88),
+        "step": 0.01,
+    },
+    "fast_erf": {
+        "topi": topi.fast_erf,
+        "ref": scipy.special.erf,
+        "skip_name_check": True,
+        "input_range": (-10, 10),
+        "step": 0.01,
+    },
+    "fast_tanh": {
+        "topi": topi.fast_tanh,
+        "ref": np.tanh,
+        "skip_name_check": True,
+        "input_range": (-10, 10),
+        "step": 0.01,
+    },
+}
+
+topi_name, dtype = tvm.testing.parameters(
+    *[
+        (name, dtype)
+        for name, config in
ewise_operations.items() + for dtype in config.get("dtypes", ["float32"]) + ] +) + + +@tvm.testing.fixture(cache_return_value=True) +def ewise_ref_data(topi_name, dtype): + config = ewise_operations[topi_name] + + input_range = config["input_range"] + shape = config.get("shape", (20, 3)) + + a_np = np.random.uniform(*input_range, size=shape).astype(dtype) + + if dtype.startswith("float"): + if config.get("replace_with_nan", False): + a_np.ravel()[np.random.choice(a_np.size, int(a_np.size * 0.5), replace=False)] = np.nan + if config.get("replace_with_inf", False): + a_np.ravel()[ + np.random.choice(a_np.size, int(a_np.size * 0.5), replace=False) + ] = np.infty + + # avoid round check too close to boundary + if topi_name == "round": + a_np += ((np.abs(np.fmod(a_np, 1)) - 0.5) < 1e-6) * 1e-4 + + b_np = config["ref"](a_np) + + return a_np, b_np + + +def test_ewise(target, dev, topi_name, dtype, ewise_ref_data): + target = tvm.target.Target(target) + if target.kind.name == "vulkan" and topi_name in ["tan", "erf", "isnan", "isfinite", "isinf"]: + pytest.xfail(f"Vulkan runtime doesn't support {topi_name} yet") + + topi_op = ewise_operations[topi_name]["topi"] + skip_name_check = ewise_operations[topi_name].get("skip_name_check", False) + + m = te.var("m") + l = te.var("l") + A = te.placeholder((m, l), dtype=dtype, name="A") + + B = topi_op(A) + assert tuple(B.shape) == tuple(A.shape) + if not skip_name_check: + assert B.op.body[0].op.name == "tir." + topi_name + + a_np, b_np = ewise_ref_data + + with tvm.target.Target(target): + s = tvm.topi.testing.get_injective_schedule(target)(B) + foo = tvm.build(s, [A, B], target, name=topi_name) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros_like(b_np), dev) + foo(a, b) + tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5, atol=1e-5) + + +from_dtype, to_dtype = tvm.testing.parameters( + ("int32", "float32"), + ("int32", "float64"), + ("int32", "bool"), + ("float32", "int32"), + ("float32", "float64"), + ("float32", "bool"), + ("bool", "float32"), + ("bool", "int32"), +) + + +@tvm.testing.fixture(cache_return_value=True) +def cast_ref_data(from_dtype, to_dtype): + shape = (5, 4) + input_range = (-100, 100) + + if from_dtype == "bool": + a_np = np.random.choice([True, False], size=shape) + else: + a_np = np.random.uniform(*input_range, size=shape).astype(from_dtype) + + if to_dtype == "bool": + a_np = a_np - a_np[2, 3] + b_np = a_np.astype(to_dtype) + + return a_np, b_np + + +def test_cast(target, dev, cast_ref_data, from_dtype, to_dtype): + m = te.var("m") + l = te.var("l") + A = te.placeholder((m, l), dtype=from_dtype, name="A") + B = topi.cast(A, to_dtype) + + a_np, b_np = cast_ref_data + + with tvm.target.Target(target): + s = tvm.topi.testing.get_injective_schedule(target)(B) + foo = tvm.build(s, [A, B], target) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.empty(b_np.shape, dtype=to_dtype, device=dev) + foo(a, b) + tvm.testing.assert_allclose(b.numpy(), b_np) if __name__ == "__main__": - test_util() - test_ewise() - test_cast() - test_fastmath() + sys.exit(pytest.main(sys.argv)) diff --git a/tests/python/topi/python/test_topi_reduce.py b/tests/python/topi/python/test_topi_reduce.py index 07656032f878..23d762c5002a 100644 --- a/tests/python/topi/python/test_topi_reduce.py +++ b/tests/python/topi/python/test_topi_reduce.py @@ -16,23 +16,76 @@ # under the License. 
"""Test code for reduce.""" import os +import sys + import numpy as np +import pytest + import tvm -from tvm import te -from tvm import topi import tvm.testing import tvm.topi.testing +from tvm import te, topi + +in_shape, axis, keepdims, reduce_type, dtype = tvm.testing.parameters( + ((32,), 0, False, "argmax", "float32"), + ((128, 24, 128, 24), (1, 2, 3), True, "sum", "float32"), + ((2, 3), None, True, "all", "bool"), + ((128, 24 * 128 * 24), (1,), False, "max", "float32"), + ((32, 128, 24), None, True, "sum", "float32"), + ((32, 128, 24), None, True, "all", "bool"), + ((128, 24, 128, 24), (0, 2), False, "min", "float32"), + ((32, 128), 1, True, "argmax", "float32"), + ((32, 24, 32, 24), 2, False, "argmin", "float32"), + ((31, 21, 15), None, True, "argmax", "float32"), + ((31, 21, 15), None, False, "sum", "float32"), + ((128, 24, 128, 24), (1, 2, 3), True, "sum", "float64"), + ((2, 3), None, True, "any", "bool"), + ((32, 128, 24), None, True, "any", "bool"), + ((1, 4, 7), 1, True, "any", "bool"), + ((128, 24, 128, 24), 2, False, "any", "bool"), +) + + +@tvm.testing.fixture(cache_return_value=True) +def ref_data(in_shape, axis, keepdims, reduce_type, dtype): + # Test + if dtype == "bool": + in_npy_map = in_npy = np.random.choice([True, False], size=in_shape) + else: + in_npy = np.random.uniform(-1, 1, size=in_shape).astype(dtype) + in_npy_map = np.sqrt(np.exp(in_npy)).astype(dtype) + + if reduce_type == "sum": + out_npy = in_npy_map.sum(axis=axis, keepdims=keepdims) + elif reduce_type == "all" and dtype == "bool": + out_npy = in_npy_map.all(axis=axis, keepdims=keepdims) + elif reduce_type == "any" and dtype == "bool": + out_npy = in_npy_map.any(axis=axis, keepdims=keepdims) + elif reduce_type == "max": + out_npy = in_npy_map.max(axis=axis, keepdims=keepdims) + elif reduce_type == "min": + out_npy = in_npy_map.min(axis=axis, keepdims=keepdims) + elif reduce_type == "argmax": + out_npy = _my_npy_argmax(in_npy_map, axis=axis, keepdims=keepdims) + elif reduce_type == "argmin": + out_npy = _my_npy_argmin(in_npy_map, axis=axis, keepdims=keepdims) + else: + raise NotImplementedError + + return in_npy, in_npy_map, out_npy + def _my_npy_argmax(arr, axis, keepdims): if not keepdims: return arr.argmax(axis=axis) else: - if axis is not None: + if axis is None: + out_shape = [1 for _ in arr.shape] + else: out_shape = list(arr.shape) out_shape[axis] = 1 - else: - out_shape = [1 for _ in range(len(arr.shape))] + return arr.argmax(axis=axis).reshape(out_shape) @@ -40,120 +93,72 @@ def _my_npy_argmin(arr, axis, keepdims): if not keepdims: return arr.argmin(axis=axis) else: - out_shape = list(arr.shape) - out_shape[axis] = 1 + if axis is None: + out_shape = [1 for _ in arr.shape] + else: + out_shape = list(arr.shape) + out_shape[axis] = 1 return arr.argmin(axis=axis).reshape(out_shape) -def verify_reduce_map_ele(in_shape, axis, keepdims, type="sum", dtype="float32"): +def test_reduce_map(target, dev, ref_data, in_shape, axis, keepdims, reduce_type, dtype): + target = tvm.target.Target(target) + if target.kind.name == "vulkan" and reduce_type in ["sum", "any", "all"]: + pytest.xfail(f"Vulkan backend has known errors on {reduce_type}") + + in_npy, in_npy_map, out_npy = ref_data + # Build the logic and compile the function A = te.placeholder(shape=in_shape, name="A", dtype=dtype) A1 = topi.sqrt(topi.exp(A)) out_dtype = dtype - if type == "sum": + if reduce_type == "sum": B = topi.sum(A1, axis=axis, keepdims=keepdims) - elif type == "all": + elif reduce_type == "all": B = topi.all(A, axis=axis, 
keepdims=keepdims) - elif type == "any": + elif reduce_type == "any": B = topi.any(A, axis=axis, keepdims=keepdims) - elif type == "max": + elif reduce_type == "max": B = topi.max(A1, axis=axis, keepdims=keepdims) - elif type == "min": + elif reduce_type == "min": B = topi.min(A1, axis=axis, keepdims=keepdims) - elif type == "argmax": + elif reduce_type == "argmax": B = topi.argmax(A1, axis=axis, keepdims=keepdims) out_dtype = "int32" - elif type == "argmin": + elif reduce_type == "argmin": B = topi.argmin(A1, axis=axis, keepdims=keepdims) out_dtype = "int32" else: raise NotImplementedError - def check_device(device, dev): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_reduce_schedule(device)(B) + with tvm.target.Target(target): + s = tvm.topi.testing.get_reduce_schedule(target)(B) - foo = tvm.build(s, [A, B], device, name=type) - # Test - if dtype == "bool": - in_npy_map = in_npy = np.random.choice([True, False], size=in_shape) - else: - in_npy = np.random.uniform(-1, 1, size=in_shape).astype(dtype) - in_npy_map = np.sqrt(np.exp(in_npy)).astype(dtype) - - if type == "sum": - out_npy = in_npy_map.sum(axis=axis, keepdims=keepdims) - elif type == "all" and dtype == "bool": - out_npy = in_npy_map.all(axis=axis, keepdims=keepdims) - elif type == "any" and dtype == "bool": - out_npy = in_npy_map.any(axis=axis, keepdims=keepdims) - elif type == "max": - out_npy = in_npy_map.max(axis=axis, keepdims=keepdims) - elif type == "min": - out_npy = in_npy_map.min(axis=axis, keepdims=keepdims) - elif type == "argmax": - out_npy = _my_npy_argmax(in_npy_map, axis=axis, keepdims=keepdims) - elif type == "argmin": - out_npy = _my_npy_argmin(in_npy_map, axis=axis, keepdims=keepdims) - else: - raise NotImplementedError - data_tvm = tvm.nd.array(in_npy, device=dev) - out_tvm = tvm.nd.empty(shape=out_npy.shape, device=dev, dtype=out_dtype) - for _ in range(1): - foo(data_tvm, out_tvm) - if type == "argmax" or type == "argmin": - out_tvm_indices = out_tvm.numpy() - if keepdims: - out_tvm_indices = np.take(out_tvm_indices, indices=0, axis=axis) - if axis is None: - out_tvm_val = in_npy_map.ravel()[out_tvm_indices] - else: - other_indices = tuple(np.indices(in_shape[0:axis] + in_shape[(axis + 1) :])) - sel_indices = other_indices[0:axis] + (out_tvm_indices,) + other_indices[axis:] - out_tvm_val = in_npy_map[sel_indices] - if type == "argmax": - tvm.testing.assert_allclose(out_tvm_val, in_npy_map.max(axis=axis), 1e-3, 1e-3) - elif type == "argmin": - tvm.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis), 1e-3, 1e-3) + foo = tvm.build(s, [A, B], target, name=reduce_type) + + data_tvm = tvm.nd.array(in_npy, device=dev) + out_tvm = tvm.nd.empty(shape=out_npy.shape, device=dev, dtype=out_dtype) + foo(data_tvm, out_tvm) + + if reduce_type == "argmax" or reduce_type == "argmin": + out_tvm_indices = out_tvm.numpy() + if keepdims: + out_tvm_indices = np.take(out_tvm_indices, indices=0, axis=axis) + if axis is None: + out_tvm_val = in_npy_map.ravel()[out_tvm_indices] else: - tvm.testing.assert_allclose(out_tvm.numpy(), out_npy, 1e-3, 1e-3) - - for device, dev in tvm.testing.enabled_targets(): - check_device(device, dev) - - -@tvm.testing.uses_gpu -def test_reduce_map(): - - verify_reduce_map_ele(in_shape=(32,), axis=0, keepdims=False, type="argmax") - verify_reduce_map_ele(in_shape=(128, 24, 128, 24), axis=(1, 2, 3), keepdims=True, type="sum") - verify_reduce_map_ele(in_shape=(2, 3), axis=None, keepdims=True, type="all", dtype="bool") - 
verify_reduce_map_ele(in_shape=(128, 24 * 128 * 24), axis=(1,), keepdims=False, type="max") - verify_reduce_map_ele(in_shape=(32, 128, 24), axis=None, keepdims=True, type="sum") - verify_reduce_map_ele( - in_shape=(32, 128, 24), axis=None, keepdims=True, dtype="bool", type="all" - ) - verify_reduce_map_ele(in_shape=(128, 24, 128, 24), axis=(0, 2), keepdims=False, type="min") - verify_reduce_map_ele(in_shape=(32, 128), axis=1, keepdims=True, type="argmax") - verify_reduce_map_ele(in_shape=(32, 24, 32, 24), axis=2, keepdims=False, type="argmin") - verify_reduce_map_ele(in_shape=(31, 21, 15), axis=None, keepdims=True, type="argmax") - verify_reduce_map_ele(in_shape=(31, 21, 15), axis=None, keepdims=False, type="sum") - verify_reduce_map_ele( - in_shape=(128, 24, 128, 24), axis=(1, 2, 3), keepdims=True, type="sum", dtype="float64" - ) - verify_reduce_map_ele(in_shape=(2, 3), axis=None, keepdims=True, type="any", dtype="bool") - verify_reduce_map_ele( - in_shape=(32, 128, 24), axis=None, keepdims=True, type="any", dtype="bool" - ) - verify_reduce_map_ele(in_shape=(1, 4, 7), axis=1, keepdims=True, type="any", dtype="bool") - verify_reduce_map_ele( - in_shape=(128, 24, 128, 24), axis=2, keepdims=False, type="any", dtype="bool" - ) - - -@tvm.testing.uses_gpu -def test_complex_reduce(): + other_indices = tuple(np.indices(in_shape[0:axis] + in_shape[(axis + 1) :])) + sel_indices = other_indices[0:axis] + (out_tvm_indices,) + other_indices[axis:] + out_tvm_val = in_npy_map[sel_indices] + if reduce_type == "argmax": + tvm.testing.assert_allclose(out_tvm_val, in_npy_map.max(axis=axis), 1e-3, 1e-3) + elif reduce_type == "argmin": + tvm.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis), 1e-3, 1e-3) + else: + tvm.testing.assert_allclose(out_tvm.numpy(), out_npy, 1e-3, 1e-3) + + +def test_complex_reduce(target, dev): in_shape = (2, 3) dtype = "float32" axis = 0 @@ -163,20 +168,20 @@ def test_complex_reduce(): C = topi.add(B, B) D = topi.multiply(B, B) E = topi.add(C, D) - for device, dev in tvm.testing.enabled_targets(): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_reduce_schedule(device)(E) - foo = tvm.build(s, [A, E], device, name="sum") - in_npy = np.random.uniform(-1, 1, size=in_shape).astype(dtype) - sum_npy = in_npy.sum(axis=axis, keepdims=keepdims) - out_npy = sum_npy * 2 + sum_npy * sum_npy - data_tvm = tvm.nd.array(in_npy, device=dev) - out_tvm = tvm.nd.empty(shape=out_npy.shape, device=dev, dtype=dtype) - foo(data_tvm, out_tvm) - tvm.testing.assert_allclose(out_tvm.numpy(), out_npy, 1e-3, 1e-3) + + with tvm.target.Target(target): + s = tvm.topi.testing.get_reduce_schedule(target)(E) + foo = tvm.build(s, [A, E], target, name="sum") + + in_npy = np.random.uniform(-1, 1, size=in_shape).astype(dtype) + sum_npy = in_npy.sum(axis=axis, keepdims=keepdims) + out_npy = sum_npy * 2 + sum_npy * sum_npy + + data_tvm = tvm.nd.array(in_npy, device=dev) + out_tvm = tvm.nd.empty(shape=out_npy.shape, device=dev, dtype=dtype) + foo(data_tvm, out_tvm) + tvm.testing.assert_allclose(out_tvm.numpy(), out_npy, 1e-3, 1e-3) if __name__ == "__main__": - test_reduce_map() - test_complex_reduce() + sys.exit(pytest.main(sys.argv)) diff --git a/tests/python/topi/python/test_topi_softmax.py b/tests/python/topi/python/test_topi_softmax.py index 8af038a1f7ce..97fbedcc288e 100644 --- a/tests/python/topi/python/test_topi_softmax.py +++ b/tests/python/topi/python/test_topi_softmax.py @@ -15,14 +15,17 @@ # specific language governing permissions and limitations # 
under the License. """Test code for softmax""" +import logging import os +import sys + import numpy as np +import pytest + import tvm -from tvm import te -from tvm import topi import tvm.testing import tvm.topi.testing -import logging +from tvm import te, topi from tvm.topi.utils import get_const_tuple @@ -34,75 +37,72 @@ } -def check_target(A, B, a_np, b_np, target, dev, name): - print("Running on target: %s" % target) - with tvm.target.Target(target): - s_func = tvm.topi.testing.dispatch(target, _softmax_schedule) - s = s_func(B) - - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) - f = tvm.build(s, [A, B], target, name=name) - f(a, b) - tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5) +dtype = tvm.testing.parameter("float32", "float64") -def verify_softmax(m, n, dtype="float32"): - A = te.placeholder((m, n), dtype=dtype, name="A") - B = topi.nn.softmax(A) - # confirm lower works - s = te.create_schedule([B.op]) - tvm.lower(s, [A, B], simple_mode=True) - - a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) - b_np = tvm.topi.testing.softmax_python(a_np) +configs = { + "softmax": { + "topi": topi.nn.softmax, + "ref": tvm.topi.testing.softmax_python, + "dimensions": [2, 4], + }, + "log_softmax": { + "topi": topi.nn.log_softmax, + "ref": tvm.topi.testing.log_softmax_python, + "dimensions": [2], + }, +} +shapes = [(32, 10), (3, 4), (1, 16, 256, 256)] +softmax_operation, shape = tvm.testing.parameters( + *[ + (name, shape) + for name, config in configs.items() + for shape in shapes + if len(shape) in config["dimensions"] + ] +) - for target, dev in tvm.testing.enabled_targets(): - check_target(A, B, a_np, b_np, target, dev, "softmax") +@tvm.testing.fixture(cache_return_value=True) +def ref_data(shape, dtype, softmax_operation): + ref_func = configs[softmax_operation]["ref"] -def verify_softmax_4d(shape, dtype="float32"): - A = te.placeholder(shape, dtype=dtype, name="A") - B = topi.nn.softmax(A, axis=1) + a_np = np.random.uniform(size=shape).astype(dtype) - _, c, h, w = shape - a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) - b_np = tvm.topi.testing.softmax_python(a_np.transpose(0, 2, 3, 1).reshape(h * w, c)) - b_np = b_np.reshape(1, h, w, c).transpose(0, 3, 1, 2) + if len(shape) == 2: + b_np = ref_func(a_np) + elif len(shape) == 4: + _, c, h, w = a_np.shape + a_np_2d = a_np.transpose(0, 2, 3, 1).reshape(h * w, c) + b_np_2d = tvm.topi.testing.softmax_python(a_np_2d) + b_np = b_np_2d.reshape(1, h, w, c).transpose(0, 3, 1, 2) - for target, dev in tvm.testing.enabled_targets(): - check_target(A, B, a_np, b_np, target, dev, "softmax") + return a_np, b_np -@tvm.testing.uses_gpu -def test_softmax(): - verify_softmax(32, 10) - verify_softmax(3, 4) - verify_softmax(32, 10, "float64") - verify_softmax_4d((1, 16, 256, 256)) +def test_softmax(target, dev, shape, dtype, ref_data, softmax_operation): + target = tvm.target.Target(target) + if target.kind.name == "vulkan" and dtype == "float64": + # https://www.khronos.org/registry/SPIR-V/specs/1.0/GLSL.std.450.html + pytest.xfail("Vulkan GLSL.std.450 does not support 64-bit floats") + A = te.placeholder(shape, dtype=dtype, name="A") -def verify_log_softmax(m, n, dtype="float32"): - A = te.placeholder((m, n), dtype=dtype, name="A") - B = topi.nn.log_softmax(A) - # confirm lower works - s = te.create_schedule([B.op]) - tvm.lower(s, [A, B], simple_mode=True) - a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) - b_np = 
tvm.topi.testing.log_softmax_python(a_np) + topi_op = configs[softmax_operation]["topi"] + B = topi_op(A, axis=1) - for target, dev in tvm.testing.enabled_targets(): - check_target(A, B, a_np, b_np, target, dev, "log_softmax") + with tvm.target.Target(target): + fschedule = tvm.topi.testing.dispatch(target, _softmax_schedule) + s = fschedule(B) + a_np, b_np = ref_data -@tvm.testing.uses_gpu -def test_log_softmax(): - verify_log_softmax(32, 10) - verify_log_softmax(3, 4) - verify_log_softmax(32, 10, "float64") + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) + f = tvm.build(s, [A, B], target) + f(a, b) + tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5) if __name__ == "__main__": - logging.basicConfig(level=logging.DEBUG) - test_softmax() - test_log_softmax() + sys.exit(pytest.main(sys.argv)) diff --git a/tests/python/topi/python/test_topi_sort.py b/tests/python/topi/python/test_topi_sort.py index 65b2ae590308..43c6ce88be76 100644 --- a/tests/python/topi/python/test_topi_sort.py +++ b/tests/python/topi/python/test_topi_sort.py @@ -15,13 +15,16 @@ # specific language governing permissions and limitations # under the License. """Test code for vision package""" -from __future__ import print_function +import sys + import numpy as np +import pytest + import tvm -from tvm import te -from tvm import topi -import tvm.topi.testing import tvm.testing +import tvm.topi.testing + +from tvm import te, topi _sort_implement = { "generic": (topi.sort, topi.generic.schedule_sort), @@ -38,8 +41,17 @@ "gpu": (topi.cuda.topk, topi.cuda.schedule_topk), } +axis = tvm.testing.parameter(0, -1, 1) +is_ascend = tvm.testing.parameter(True, False, ids=["is_ascend", "not_ascend"]) +dtype = tvm.testing.parameter("int64", "float32") + +topk = tvm.testing.parameter(0, 1, 5) +topk_ret_type = tvm.testing.parameter("values", "indices", "both") + + +def test_sort(target, dev, axis, is_ascend): + np.random.seed(0) -def verify_sort(axis, is_ascend): dshape = (20, 100) data_dtype = "float32" data = te.placeholder(dshape, name="data", dtype=data_dtype) @@ -58,28 +70,19 @@ def verify_sort(axis, is_ascend): else: np_sort = np_sort[:, : dshape[axis]] - def check_target(target): - if not tvm.testing.device_enabled(target): - print("Skip because %s is not enabled" % target) - return - dev = tvm.device(target, 0) - print("Running on target: %s" % target) - with tvm.target.Target(target): - fcompute, fschedule = tvm.topi.testing.dispatch(target, _sort_implement) - out = fcompute(data, axis=axis, is_ascend=is_ascend) - s = fschedule(out) - - tvm_data = tvm.nd.array(np_data, dev) - tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data_dtype), dev) - f = tvm.build(s, [data, out], target) - f(tvm_data, tvm_out) - tvm.testing.assert_allclose(tvm_out.numpy(), np_sort, rtol=1e0) + with tvm.target.Target(target): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _sort_implement) + out = fcompute(data, axis=axis, is_ascend=is_ascend) + s = fschedule(out) - for target in ["llvm", "cuda", "opencl", "vulkan", "nvptx"]: - check_target(target) + tvm_data = tvm.nd.array(np_data, dev) + tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data_dtype), dev) + f = tvm.build(s, [data, out], target) + f(tvm_data, tvm_out) + tvm.testing.assert_allclose(tvm_out.numpy(), np_sort, rtol=1e0) -def verify_argsort(axis, is_ascend): +def test_argsort(target, dev, axis, is_ascend): dshape = (20, 100) data_dtype = "float32" data = te.placeholder(dshape, name="data", dtype=data_dtype) @@ -98,28 +101,21 @@ def 
verify_argsort(axis, is_ascend): else: np_indices = np_indices[:, : dshape[axis]] - def check_target(target): - if not tvm.testing.device_enabled(target): - print("Skip because %s is not enabled" % target) - return - dev = tvm.device(target, 0) - print("Running on target: %s" % target) - with tvm.target.Target(target): - fcompute, fschedule = tvm.topi.testing.dispatch(target, _argsort_implement) - out = fcompute(data, axis=axis, is_ascend=is_ascend) - s = fschedule(out) + with tvm.target.Target(target): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _argsort_implement) + out = fcompute(data, axis=axis, is_ascend=is_ascend) + s = fschedule(out) - tvm_data = tvm.nd.array(np_data, dev) - tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data_dtype), dev) - f = tvm.build(s, [data, out], target) - f(tvm_data, tvm_out) - tvm.testing.assert_allclose(tvm_out.numpy(), np_indices.astype(data_dtype), rtol=1e0) + tvm_data = tvm.nd.array(np_data, dev) + tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data_dtype), dev) + f = tvm.build(s, [data, out], target) + f(tvm_data, tvm_out) + tvm.testing.assert_allclose(tvm_out.numpy(), np_indices.astype(data_dtype), rtol=1e0) - for target in ["llvm", "cuda", "opencl", "vulkan", "nvptx"]: - check_target(target) +def test_topk(target, dev, topk, axis, topk_ret_type, is_ascend, dtype): + np.random.seed(0) -def verify_topk(k, axis, ret_type, is_ascend, dtype): shape = (20, 100) data_dtype = "float32" data = te.placeholder(shape, name="data", dtype=data_dtype) @@ -129,7 +125,7 @@ def verify_topk(k, axis, ret_type, is_ascend, dtype): np_indices = np.argsort(np_data, axis=axis) else: np_indices = np.argsort(-np_data, axis=axis) - kk = k if k >= 1 else shape[axis] + kk = topk if topk >= 1 else shape[axis] if axis == 0: np_indices = np_indices[:kk, :] np_values = np.zeros(np_indices.shape).astype(data_dtype) @@ -142,61 +138,25 @@ def verify_topk(k, axis, ret_type, is_ascend, dtype): np_values[i, :] = np_data[i, np_indices[i, :]] np_indices = np_indices.astype(dtype) - def check_target(target): - dev = tvm.device(target, 0) - if not tvm.testing.device_enabled(target): - print("Skip because %s is not enabled" % target) - return - print("Running on target: %s" % target) - with tvm.target.Target(target): - fcompute, fschedule = tvm.topi.testing.dispatch(target, _topk_implement) - outs = fcompute(data, k, axis, ret_type, is_ascend, dtype) - outs = outs if isinstance(outs, list) else [outs] - s = fschedule(outs) - tvm_data = tvm.nd.array(np_data, dev) - tvm_res = [] - for t in outs: - tvm_res.append(tvm.nd.empty(t.shape, dtype=t.dtype, device=dev)) - f = tvm.build(s, [data] + outs, target) - f(tvm_data, *tvm_res) - if ret_type == "both": - tvm.testing.assert_allclose(tvm_res[0].numpy(), np_values) - tvm.testing.assert_allclose(tvm_res[1].numpy(), np_indices) - elif ret_type == "values": - tvm.testing.assert_allclose(tvm_res[0].numpy(), np_values) - else: - tvm.testing.assert_allclose(tvm_res[0].numpy(), np_indices) - - for target in ["llvm", "cuda", "opencl", "vulkan", "nvptx"]: - check_target(target) - - -@tvm.testing.uses_gpu -def test_sort(): - np.random.seed(0) - for axis in [0, -1, 1]: - verify_sort(axis, True) - verify_sort(axis, False) - - -@tvm.testing.uses_gpu -def test_argsort(): - np.random.seed(0) - for axis in [0, -1, 1]: - verify_argsort(axis, True) - verify_argsort(axis, False) - - -@tvm.testing.uses_gpu -def test_topk(): - np.random.seed(0) - for k in [0, 1, 5]: - for axis in [0, -1, 1]: - for ret_type in ["both", "values", "indices"]: - verify_topk(k, 
axis, ret_type, True, "int64") - verify_topk(k, axis, ret_type, False, "float32") + with tvm.target.Target(target): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _topk_implement) + outs = fcompute(data, topk, axis, topk_ret_type, is_ascend, dtype) + outs = outs if isinstance(outs, list) else [outs] + s = fschedule(outs) + tvm_data = tvm.nd.array(np_data, dev) + tvm_res = [] + for t in outs: + tvm_res.append(tvm.nd.empty(t.shape, dtype=t.dtype, device=dev)) + f = tvm.build(s, [data] + outs, target) + f(tvm_data, *tvm_res) + if topk_ret_type == "both": + tvm.testing.assert_allclose(tvm_res[0].numpy(), np_values) + tvm.testing.assert_allclose(tvm_res[1].numpy(), np_indices) + elif topk_ret_type == "values": + tvm.testing.assert_allclose(tvm_res[0].numpy(), np_values) + else: + tvm.testing.assert_allclose(tvm_res[0].numpy(), np_indices) if __name__ == "__main__": - test_argsort() - test_topk() + sys.exit(pytest.main(sys.argv)) diff --git a/tests/python/topi/python/test_topi_unique.py b/tests/python/topi/python/test_topi_unique.py index 3e26241cea94..4dd9b193ad57 100644 --- a/tests/python/topi/python/test_topi_unique.py +++ b/tests/python/topi/python/test_topi_unique.py @@ -20,9 +20,14 @@ from tvm import topi import tvm.topi.testing +in_dtype = tvm.testing.parameter("int32", "int64") +is_sorted = tvm.testing.parameter(True, False, ids=["sorted", "unsorted"]) +with_counts = tvm.testing.parameter(True, False, ids=["with_counts", "no_counts"]) +arr_size, maxval = tvm.testing.parameters((1, 100), (10, 10), (10000, 100)) + @tvm.testing.parametrize_targets -def test_unique(dev, target): +def test_unique(dev, target, in_dtype, is_sorted, with_counts, arr_size, maxval): def calc_numpy_unique(data, is_sorted=False): uniq, index, inverse, counts = np.unique( data, return_index=True, return_inverse=True, return_counts=True @@ -43,82 +48,67 @@ def calc_numpy_unique(data, is_sorted=False): num_uniq, ] - def check_unique(data, is_sorted=False, with_counts=False): - # numpy reference - np_unique, np_indices, np_inverse_indices, np_counts, np_num_unique = calc_numpy_unique( - data, is_sorted - ) - num_unique = np_num_unique[0] + data = np.random.randint(0, maxval, size=(arr_size)).astype(in_dtype) - implementations = { - "generic": ( - lambda x, return_counts: topi.unique(x, is_sorted, return_counts), - topi.generic.schedule_unique, - ), - "cuda": ( - lambda x, return_counts: topi.cuda.unique(x, is_sorted, return_counts), - topi.cuda.schedule_scan, - ), - "nvptx": ( - lambda x, return_counts: topi.cuda.unique(x, is_sorted, return_counts), - topi.cuda.schedule_scan, - ), - } - fcompute, fschedule = tvm.topi.testing.dispatch(target, implementations) - tvm_data = tvm.nd.array(data, device=dev) - tvm_unique = tvm.nd.array(np.zeros(data.shape).astype(data.dtype), device=dev) - tvm_indices = tvm.nd.array(np.zeros(data.shape).astype("int32"), device=dev) - tvm_inverse_indices = tvm.nd.array(np.zeros(data.shape).astype("int32"), device=dev) - tvm_num_unique = tvm.nd.array(np.zeros([1]).astype("int32"), device=dev) + # numpy reference + np_unique, np_indices, np_inverse_indices, np_counts, np_num_unique = calc_numpy_unique( + data, is_sorted + ) + num_unique = np_num_unique[0] - with tvm.target.Target(target): - te_input = tvm.te.placeholder(shape=data.shape, dtype=str(data.dtype)) - outs = fcompute(te_input, with_counts) - s = fschedule(outs) - func = tvm.build(s, [te_input, *outs]) + implementations = { + "generic": ( + lambda x, return_counts: topi.unique(x, is_sorted, return_counts), + 
topi.generic.schedule_unique, + ), + "gpu": ( + lambda x, return_counts: topi.cuda.unique(x, is_sorted, return_counts), + topi.cuda.schedule_scan, + ), + "nvptx": ( + lambda x, return_counts: topi.cuda.unique(x, is_sorted, return_counts), + topi.cuda.schedule_scan, + ), + } + fcompute, fschedule = tvm.topi.testing.dispatch(target, implementations) + tvm_data = tvm.nd.array(data, device=dev) + tvm_unique = tvm.nd.array(np.zeros(data.shape).astype(data.dtype), device=dev) + tvm_indices = tvm.nd.array(np.zeros(data.shape).astype("int32"), device=dev) + tvm_inverse_indices = tvm.nd.array(np.zeros(data.shape).astype("int32"), device=dev) + tvm_num_unique = tvm.nd.array(np.zeros([1]).astype("int32"), device=dev) - if with_counts: - tvm_counts = tvm.nd.array(np.zeros(data.shape).astype("int32"), device=dev) - func( - tvm_data, - tvm_unique, - tvm_indices, - tvm_inverse_indices, - tvm_num_unique, - tvm_counts, - ) - else: - func(tvm_data, tvm_unique, tvm_indices, tvm_inverse_indices, tvm_num_unique) + with tvm.target.Target(target): + te_input = tvm.te.placeholder(shape=data.shape, dtype=str(data.dtype)) + outs = fcompute(te_input, with_counts) + s = fschedule(outs) + func = tvm.build(s, [te_input, *outs]) - num_unique = np_num_unique[0] - assert tvm_num_unique.numpy()[0] == np_num_unique + if with_counts: + tvm_counts = tvm.nd.array(np.zeros(data.shape).astype("int32"), device=dev) + func( + tvm_data, + tvm_unique, + tvm_indices, + tvm_inverse_indices, + tvm_num_unique, + tvm_counts, + ) + else: + func(tvm_data, tvm_unique, tvm_indices, tvm_inverse_indices, tvm_num_unique) - np.testing.assert_allclose(tvm_unique.numpy()[:num_unique], np_unique, atol=1e-5, rtol=1e-5) - np.testing.assert_allclose( - tvm_indices.numpy()[:num_unique], np_indices, atol=1e-5, rtol=1e-5 - ) + num_unique = np_num_unique[0] + assert tvm_num_unique.numpy()[0] == np_num_unique - np.testing.assert_allclose( - tvm_inverse_indices.numpy(), np_inverse_indices, atol=1e-5, rtol=1e-5 - ) + np.testing.assert_allclose(tvm_unique.numpy()[:num_unique], np_unique, atol=1e-5, rtol=1e-5) + np.testing.assert_allclose(tvm_indices.numpy()[:num_unique], np_indices, atol=1e-5, rtol=1e-5) - if with_counts: - np.testing.assert_allclose( - tvm_counts.numpy()[:num_unique], np_counts, atol=1e-5, rtol=1e-5 - ) + np.testing.assert_allclose( + tvm_inverse_indices.numpy(), np_inverse_indices, atol=1e-5, rtol=1e-5 + ) - for in_dtype in ["int32", "int64"]: - for is_sorted in [True, False]: - for with_counts in [True, False]: - data = np.random.randint(0, 100, size=(1)).astype(in_dtype) - check_unique(data, is_sorted, with_counts) - data = np.random.randint(0, 10, size=(10)).astype(in_dtype) - check_unique(data, is_sorted, with_counts) - data = np.random.randint(0, 100, size=(10000)).astype(in_dtype) - check_unique(data, is_sorted, with_counts) + if with_counts: + np.testing.assert_allclose(tvm_counts.numpy()[:num_unique], np_counts, atol=1e-5, rtol=1e-5) if __name__ == "__main__": - test_unique(tvm.device("cpu"), tvm.target.Target("llvm")) - test_unique(tvm.device("cuda"), tvm.target.Target("cuda")) - test_unique(tvm.device("nvptx"), tvm.target.Target("nvptx")) + sys.exit(pytest.main(sys.argv)) diff --git a/tests/python/topi/python/test_topi_vision.py b/tests/python/topi/python/test_topi_vision.py index 234107d6686e..6ddb86f4027f 100644 --- a/tests/python/topi/python/test_topi_vision.py +++ b/tests/python/topi/python/test_topi_vision.py @@ -15,19 +15,18 @@ # specific language governing permissions and limitations # under the License. 
"""Test code for vision package""" -from __future__ import print_function import math +import sys + import numpy as np +import pytest + import tvm -from tvm import te -from tvm import topi +import tvm.testing import tvm.topi.testing -from tvm.contrib.pickle_memoize import memoize -from tvm.topi.utils import get_const_tuple +from tvm import te, topi from tvm.topi.vision import ssd, non_max_suppression, get_valid_counts -import pytest -import tvm.testing _get_valid_counts_implement = { "generic": (topi.vision.get_valid_counts, topi.generic.schedule_get_valid_counts), @@ -71,35 +70,46 @@ } -def verify_get_valid_counts(dshape, score_threshold, id_index, score_index): - dtype = "float32" - batch_size, num_anchor, elem_length = dshape - np_data = np.random.uniform(low=-2, high=2, size=dshape).astype(dtype) - np_out1 = np.zeros(shape=(batch_size,)) - np_out2 = np.zeros(shape=dshape).astype(dtype) - np_out3 = np.zeros(shape=(batch_size, num_anchor)) - for i in range(batch_size): - np_out1[i] = 0 - inter_idx = 0 - for j in range(num_anchor): - score = np_data[i, j, score_index] - if score > score_threshold and (id_index < 0 or np_data[i, j, id_index] >= 0): - for k in range(elem_length): - np_out2[i, inter_idx, k] = np_data[i, j, k] - np_out1[i] += 1 - np_out3[i, inter_idx] = j - inter_idx += 1 - if j >= np_out1[i]: - for k in range(elem_length): - np_out2[i, j, k] = -1.0 - np_out3[i, j] = -1 - - def check_device(target): - dev = tvm.device(target, 0) - if not tvm.testing.device_enabled(target): - print("Skip because %s is not enabled" % target) - return - print("Running on target: %s" % target) +class TestValidCounts: + dshape, score_threshold, id_index, score_index = tvm.testing.parameters( + ((1, 1000, 5), 0.5, -1, 0), + ((1, 2500, 6), 0, 0, 1), + ((1, 2500, 5), -1, -1, 0), + ((3, 1000, 6), 0.55, 1, 0), + ((16, 500, 5), 0.95, -1, 1), + ) + dtype = tvm.testing.parameter("float32") + + @tvm.testing.fixture(cache_return_value=True) + def ref_data(self, dtype, dshape, score_threshold, id_index, score_index): + batch_size, num_anchor, elem_length = dshape + np_data = np.random.uniform(low=-2, high=2, size=dshape).astype(dtype) + np_out1 = np.zeros(shape=(batch_size,)) + np_out2 = np.zeros(shape=dshape).astype(dtype) + np_out3 = np.zeros(shape=(batch_size, num_anchor)) + for i in range(batch_size): + np_out1[i] = 0 + inter_idx = 0 + for j in range(num_anchor): + score = np_data[i, j, score_index] + if score > score_threshold and (id_index < 0 or np_data[i, j, id_index] >= 0): + for k in range(elem_length): + np_out2[i, inter_idx, k] = np_data[i, j, k] + np_out1[i] += 1 + np_out3[i, inter_idx] = j + inter_idx += 1 + if j >= np_out1[i]: + for k in range(elem_length): + np_out2[i, j, k] = -1.0 + np_out3[i, j] = -1 + + return np_data, np_out1, np_out2, np_out3 + + def test_get_valid_counts( + self, target, dev, ref_data, dtype, dshape, score_threshold, id_index, score_index + ): + np_data, np_out1, np_out2, np_out3 = ref_data + with tvm.target.Target(target): fcompute, fschedule = tvm.topi.testing.dispatch(target, _get_valid_counts_implement) data = te.placeholder(dshape, name="data", dtype=dtype) @@ -117,20 +127,10 @@ def check_device(target): tvm.testing.assert_allclose(tvm_out2.numpy(), np_out2, rtol=1e-3) tvm.testing.assert_allclose(tvm_out3.numpy(), np_out3, rtol=1e-3) - for target in ["llvm", "cuda", "opencl", "vulkan"]: - check_device(target) - - -@tvm.testing.uses_gpu -def test_get_valid_counts(): - verify_get_valid_counts((1, 1000, 5), 0.5, -1, 0) - verify_get_valid_counts((1, 2500, 6), 0, 0, 1) 
- verify_get_valid_counts((1, 2500, 5), -1, -1, 0) - verify_get_valid_counts((3, 1000, 6), 0.55, 1, 0) - verify_get_valid_counts((16, 500, 5), 0.95, -1, 1) - def verify_non_max_suppression( + target, + dev, np_data, np_valid_count, np_indices, @@ -151,63 +151,53 @@ def verify_non_max_suppression( valid_count = te.placeholder((batch,), dtype="int32", name="valid_count") indices = te.placeholder((batch, num_anchors), dtype="int32", name="indices") - def check_device(target): - dev = tvm.device(target, 0) - if not tvm.testing.device_enabled(target): - print("Skip because %s is not enabled" % target) - return - print("Running on target: %s" % target) - with tvm.target.Target(target): - fcompute, fschedule = tvm.topi.testing.dispatch(target, _nms_implement) - out = fcompute( - data, - valid_count, - indices, - max_output_size, - iou_threshold, - force_suppress, - top_k, - coord_start=coord_start, - score_index=score_index, - id_index=id_index, - return_indices=False, - ) - indices_out = fcompute( - data, - valid_count, - indices, - max_output_size, - iou_threshold, - force_suppress, - top_k, - coord_start=coord_start, - score_index=score_index, - id_index=id_index, - return_indices=True, - ) - s = fschedule(out) - indices_s = fschedule(indices_out) - - tvm_data = tvm.nd.array(np_data, dev) - tvm_valid_count = tvm.nd.array(np_valid_count, dev) - tvm_indices = tvm.nd.array(np_indices, dev) + with tvm.target.Target(target): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _nms_implement) + out = fcompute( + data, + valid_count, + indices, + max_output_size, + iou_threshold, + force_suppress, + top_k, + coord_start=coord_start, + score_index=score_index, + id_index=id_index, + return_indices=False, + ) + indices_out = fcompute( + data, + valid_count, + indices, + max_output_size, + iou_threshold, + force_suppress, + top_k, + coord_start=coord_start, + score_index=score_index, + id_index=id_index, + return_indices=True, + ) + s = fschedule(out) + indices_s = fschedule(indices_out) - tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), dev) - f = tvm.build(s, [data, valid_count, indices, out], target) - f(tvm_data, tvm_valid_count, tvm_indices, tvm_out) - tvm.testing.assert_allclose(tvm_out.numpy(), np_result, rtol=1e-4) + tvm_data = tvm.nd.array(np_data, dev) + tvm_valid_count = tvm.nd.array(np_valid_count, dev) + tvm_indices = tvm.nd.array(np_indices, dev) - tvm_indices_out = tvm.nd.array(np.zeros(indices_dshape, dtype="int32"), dev) - f = tvm.build(indices_s, [data, valid_count, indices, indices_out[0]], target) - f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out) - tvm.testing.assert_allclose(tvm_indices_out.numpy(), np_indices_result, rtol=1e-4) + tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), dev) + f = tvm.build(s, [data, valid_count, indices, out], target) + f(tvm_data, tvm_valid_count, tvm_indices, tvm_out) + tvm.testing.assert_allclose(tvm_out.numpy(), np_result, rtol=1e-4) - for target in ["llvm", "cuda", "opencl", "nvptx"]: - check_device(target) + tvm_indices_out = tvm.nd.array(np.zeros(indices_dshape, dtype="int32"), dev) + f = tvm.build(indices_s, [data, valid_count, indices, indices_out[0]], target) + f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out) + tvm.testing.assert_allclose(tvm_indices_out.numpy(), np_indices_result, rtol=1e-4) -@tvm.testing.uses_gpu -def test_non_max_suppression(): +def test_non_max_suppression(target, dev): np_data = np.array( [ [ @@ -236,6 +226,8 @@ def test_non_max_suppression(): np_indices_result = np.array([[3, 
0, -1, -1, -1]]) verify_non_max_suppression( + target, + dev, np_data, np_valid_count, np_indices, @@ -277,6 +269,8 @@ def test_non_max_suppression(): ) np_indices_result = np.array([[3, 0, -1, -1, -1]]) verify_non_max_suppression( + target, + dev, np_data, np_valid_count, np_indices, @@ -292,91 +286,85 @@ def test_non_max_suppression(): ) -def verify_multibox_prior( - dshape, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, 0.5), clip=False -): - data = te.placeholder(dshape, name="data") +class TestMultiboxPrior: + dshape, sizes, ratios, steps, offsets, clip = tvm.testing.parameters( + ((1, 3, 50, 50), (1,), (1,), (-1, -1), (0.5, 0.5), False), + ((1, 3, 224, 224), (0.5, 0.25, 0.1), (1, 2, 0.5), (-1, -1), (0.5, 0.5), False), + ((1, 32, 32, 32), (0.5, 0.25), (1, 2), (2, 2), (0.5, 0.5), True), + ) - dtype = data.dtype - input_data = np.random.uniform(size=dshape).astype(dtype) - - in_height = data.shape[2].value - in_width = data.shape[3].value - num_sizes = len(sizes) - num_ratios = len(ratios) - size_ratio_concat = sizes + ratios - steps_h = steps[0] if steps[0] > 0 else 1.0 / in_height - steps_w = steps[1] if steps[1] > 0 else 1.0 / in_width - offset_h = offsets[0] - offset_w = offsets[1] - - oshape = (1, in_height * in_width * (num_sizes + num_ratios - 1), 4) - np_out = np.zeros(oshape).astype(dtype) - - for i in range(in_height): - center_h = (i + offset_h) * steps_h - for j in range(in_width): - center_w = (j + offset_w) * steps_w - for k in range(num_sizes + num_ratios - 1): - w = ( - size_ratio_concat[k] * in_height / in_width / 2.0 - if k < num_sizes - else size_ratio_concat[0] - * in_height - / in_width - * math.sqrt(size_ratio_concat[k + 1]) - / 2.0 - ) - h = ( - size_ratio_concat[k] / 2.0 - if k < num_sizes - else size_ratio_concat[0] / math.sqrt(size_ratio_concat[k + 1]) / 2.0 - ) - count = ( - i * in_width * (num_sizes + num_ratios - 1) - + j * (num_sizes + num_ratios - 1) - + k - ) - np_out[0][count][0] = center_w - w - np_out[0][count][1] = center_h - h - np_out[0][count][2] = center_w + w - np_out[0][count][3] = center_h + h - if clip: - np_out = np.clip(np_out, 0, 1) - - def check_device(target): - dev = tvm.device(target, 0) - if not tvm.testing.device_enabled(target): - print("Skip because %s is not enabled" % target) - return - print("Running on target: %s" % target) + dtype = tvm.testing.parameter("float32") + + @tvm.testing.fixture(cache_return_value=True) + def ref_data(self, dtype, dshape, sizes, ratios, offsets, steps, clip): + in_height = dshape[2] + in_width = dshape[3] + num_sizes = len(sizes) + num_ratios = len(ratios) + size_ratio_concat = sizes + ratios + steps_h = steps[0] if steps[0] > 0 else 1.0 / in_height + steps_w = steps[1] if steps[1] > 0 else 1.0 / in_width + offset_h = offsets[0] + offset_w = offsets[1] + + out_shape = (1, in_height * in_width * (num_sizes + num_ratios - 1), 4) + + np_in = np.random.uniform(size=dshape).astype(dtype) + np_out = np.zeros(out_shape).astype(dtype) + + for i in range(in_height): + center_h = (i + offset_h) * steps_h + for j in range(in_width): + center_w = (j + offset_w) * steps_w + for k in range(num_sizes + num_ratios - 1): + w = ( + size_ratio_concat[k] * in_height / in_width / 2.0 + if k < num_sizes + else size_ratio_concat[0] + * in_height + / in_width + * math.sqrt(size_ratio_concat[k + 1]) + / 2.0 + ) + h = ( + size_ratio_concat[k] / 2.0 + if k < num_sizes + else size_ratio_concat[0] / math.sqrt(size_ratio_concat[k + 1]) / 2.0 + ) + count = ( + i * in_width * (num_sizes + num_ratios - 1) + + j * (num_sizes 
+ num_ratios - 1) + + k + ) + np_out[0][count][0] = center_w - w + np_out[0][count][1] = center_h - h + np_out[0][count][2] = center_w + w + np_out[0][count][3] = center_h + h + if clip: + np_out = np.clip(np_out, 0, 1) + + return np_in, np_out + + def test_multibox_prior( + self, target, dev, dtype, dshape, ref_data, sizes, ratios, steps, offsets, clip + ): + np_in, np_out = ref_data + + data = te.placeholder(dshape, name="data", dtype=dtype) fcompute, fschedule = tvm.topi.testing.dispatch(target, _multibox_prior_implement) with tvm.target.Target(target): out = fcompute(data, sizes, ratios, steps, offsets, clip) s = fschedule(out) - tvm_input_data = tvm.nd.array(input_data, dev) - tvm_out = tvm.nd.array(np.zeros(oshape, dtype=dtype), dev) + tvm_input_data = tvm.nd.array(np_in, dev) + tvm_out = tvm.nd.array(np.zeros(np_out.shape, dtype=dtype), dev) f = tvm.build(s, [data, out], target) f(tvm_input_data, tvm_out) tvm.testing.assert_allclose(tvm_out.numpy(), np_out, rtol=1e-3) - for target in ["llvm", "opencl", "cuda"]: - check_device(target) - - -@tvm.testing.uses_gpu -def test_multibox_prior(): - verify_multibox_prior((1, 3, 50, 50)) - verify_multibox_prior((1, 3, 224, 224), sizes=(0.5, 0.25, 0.1), ratios=(1, 2, 0.5)) - verify_multibox_prior( - (1, 32, 32, 32), sizes=(0.5, 0.25), ratios=(1, 2), steps=(2, 2), clip=True - ) - -@tvm.testing.uses_gpu -def test_multibox_detection(): +def test_multibox_detection(target, dev): batch_size = 1 num_anchors = 3 num_classes = 3 @@ -399,41 +387,56 @@ def test_multibox_detection(): ] ) - def check_device(target): - dev = tvm.device(target, 0) - if not tvm.testing.device_enabled(target): - print("Skip because %s is not enabled" % target) - return - print("Running on target: %s" % target) - - fcompute, fschedule = tvm.topi.testing.dispatch(target, _multibox_detection_implement) - with tvm.target.Target(target): - out = fcompute(cls_prob, loc_preds, anchors) - s = fschedule(out) - - tvm_cls_prob = tvm.nd.array(np_cls_prob.astype(cls_prob.dtype), dev) - tvm_loc_preds = tvm.nd.array(np_loc_preds.astype(loc_preds.dtype), dev) - tvm_anchors = tvm.nd.array(np_anchors.astype(anchors.dtype), dev) - tvm_out = tvm.nd.array(np.zeros((batch_size, num_anchors, 6)).astype(out.dtype), dev) - f = tvm.build(s, [cls_prob, loc_preds, anchors, out], target) - f(tvm_cls_prob, tvm_loc_preds, tvm_anchors, tvm_out) - tvm.testing.assert_allclose(tvm_out.numpy(), expected_np_out, rtol=1e-4) - - for target in ["llvm", "opencl", "cuda"]: - check_device(target) - - -def verify_roi_align( - batch, in_channel, in_size, num_roi, pooled_size, spatial_scale, sample_ratio, mode -): # For mode, 0 = avg, 1 = max - a_shape = (batch, in_channel, in_size, in_size) - rois_shape = (num_roi, 5) + fcompute, fschedule = tvm.topi.testing.dispatch(target, _multibox_detection_implement) + with tvm.target.Target(target): + out = fcompute(cls_prob, loc_preds, anchors) + s = fschedule(out) + + tvm_cls_prob = tvm.nd.array(np_cls_prob.astype(cls_prob.dtype), dev) + tvm_loc_preds = tvm.nd.array(np_loc_preds.astype(loc_preds.dtype), dev) + tvm_anchors = tvm.nd.array(np_anchors.astype(anchors.dtype), dev) + tvm_out = tvm.nd.array(np.zeros((batch_size, num_anchors, 6)).astype(out.dtype), dev) + f = tvm.build(s, [cls_prob, loc_preds, anchors, out], target) + f(tvm_cls_prob, tvm_loc_preds, tvm_anchors, tvm_out) + tvm.testing.assert_allclose(tvm_out.numpy(), expected_np_out, rtol=1e-4) + + +class TestRoiAlign: + ( + batch, + in_channel, + in_size, + num_roi, + pooled_size, + spatial_scale, + sample_ratio, + mode, 
+ ) = tvm.testing.parameters( + (1, 16, 32, 64, 7, 1.0, -1, 0), + (4, 16, 32, 64, 7, 0.5, 2, 0), + (1, 32, 32, 80, 8, 0.0625, 2, 0), + (1, 32, 500, 80, 8, 0.0625, 2, 0), + (1, 16, 32, 64, 7, 1.0, -1, 1), + (4, 16, 32, 64, 7, 0.5, 2, 1), + (1, 32, 32, 80, 8, 0.0625, 2, 1), + (1, 32, 500, 80, 8, 0.0625, 2, 1), + ) - a = te.placeholder(a_shape) - rois = te.placeholder(rois_shape) + @tvm.testing.fixture(cache_return_value=True) + def ref_data( + self, + batch, + in_channel, + in_size, + num_roi, + pooled_size, + spatial_scale, + sample_ratio, + mode, + ): + a_shape = (batch, in_channel, in_size, in_size) + rois_shape = (num_roi, 5) - @memoize("topi.tests.test_topi_vision.verify_roi_align") - def get_ref_data(): a_np = np.random.uniform(-1, 1, size=a_shape).astype("float32") rois_np = np.random.uniform(-1, 1, size=rois_shape).astype("float32") * in_size rois_np[:, 0] = np.random.randint(low=0, high=batch, size=num_roi) @@ -448,13 +451,22 @@ def get_ref_data(): return a_np, rois_np, b_np - a_np, rois_np, b_np = get_ref_data() + def test_roi_align( + self, + target, + dev, + ref_data, + pooled_size, + spatial_scale, + sample_ratio, + mode, + ): + # For mode, 0 = avg, 1 = max + a_np, rois_np, b_np = ref_data + + a = te.placeholder(a_np.shape) + rois = te.placeholder(rois_np.shape) - def check_device(target): - dev = tvm.device(target, 0) - if not tvm.testing.device_enabled(target): - print("Skip because %s is not enabled" % target) - return with tvm.target.Target(target): fcompute, fschedule = tvm.topi.testing.dispatch(target, _roi_align_implement) b = fcompute( @@ -469,37 +481,24 @@ def check_device(target): tvm_a = tvm.nd.array(a_np, dev) tvm_rois = tvm.nd.array(rois_np, dev) - tvm_b = tvm.nd.array(np.zeros(get_const_tuple(b.shape), dtype=b.dtype), device=dev) + tvm_b = tvm.nd.array(np.zeros(b_np.shape, dtype=b.dtype), device=dev) f = tvm.build(s, [a, rois, b], target) f(tvm_a, tvm_rois, tvm_b) tvm_val = tvm_b.numpy() tvm.testing.assert_allclose(tvm_val, b_np, rtol=1e-3, atol=1e-4) - for target in ["llvm", "cuda", "opencl"]: - check_device(target) - -@tvm.testing.uses_gpu -def test_roi_align(): - verify_roi_align(1, 16, 32, 64, 7, 1.0, -1, 0) - verify_roi_align(4, 16, 32, 64, 7, 0.5, 2, 0) - verify_roi_align(1, 32, 32, 80, 8, 0.0625, 2, 0) - verify_roi_align(1, 32, 500, 80, 8, 0.0625, 2, 0) - verify_roi_align(1, 16, 32, 64, 7, 1.0, -1, 1) - verify_roi_align(4, 16, 32, 64, 7, 0.5, 2, 1) - verify_roi_align(1, 32, 32, 80, 8, 0.0625, 2, 1) - verify_roi_align(1, 32, 500, 80, 8, 0.0625, 2, 1) - - -def verify_roi_pool(batch, in_channel, in_size, num_roi, pooled_size, spatial_scale): - a_shape = (batch, in_channel, in_size, in_size) - rois_shape = (num_roi, 5) +class TestRoiPool: + batch, in_channel, in_size, num_roi, pooled_size, spatial_scale = tvm.testing.parameters( + (1, 4, 16, 32, 7, 1.0), + (4, 4, 16, 32, 7, 0.5), + ) - a = te.placeholder(a_shape) - rois = te.placeholder(rois_shape) + @tvm.testing.fixture(cache_return_value=True) + def ref_data(self, batch, in_channel, in_size, num_roi, pooled_size, spatial_scale): + a_shape = (batch, in_channel, in_size, in_size) + rois_shape = (num_roi, 5) - @memoize("topi.tests.test_topi_vision.verify_roi_pool") - def get_ref_data(): a_np = np.random.uniform(size=a_shape).astype("float32") rois_np = np.random.uniform(size=rois_shape).astype("float32") * in_size rois_np[:, 0] = np.random.randint(low=0, high=batch, size=num_roi).astype("float32") @@ -509,14 +508,11 @@ def get_ref_data(): ) return a_np, rois_np, b_np - a_np, rois_np, b_np = get_ref_data() + def 
test_roi_pool(self, target, dev, ref_data, pooled_size, spatial_scale): + a_np, rois_np, b_np = ref_data - def check_device(target): - dev = tvm.device(target, 0) - if not tvm.testing.device_enabled(target): - print("Skip because %s is not enabled" % target) - return - print("Running on target: %s" % target) + a = te.placeholder(a_np.shape) + rois = te.placeholder(rois_np.shape) with tvm.target.Target(target): b = topi.vision.rcnn.roi_pool_nchw( @@ -527,50 +523,32 @@ def check_device(target): tvm_a = tvm.nd.array(a_np, dev) tvm_rois = tvm.nd.array(rois_np, dev) - tvm_b = tvm.nd.array(np.zeros(get_const_tuple(b.shape), dtype=b.dtype), device=dev) + tvm_b = tvm.nd.array(np.zeros(b_np.shape, dtype=b.dtype), device=dev) f = tvm.build(s, [a, rois, b], target) f(tvm_a, tvm_rois, tvm_b) tvm.testing.assert_allclose(tvm_b.numpy(), b_np, rtol=1e-4) - for target in ["cuda", "llvm"]: - check_device(target) - -@tvm.testing.uses_gpu -def test_roi_pool(): - verify_roi_pool(1, 4, 16, 32, 7, 1.0) - verify_roi_pool(4, 4, 16, 32, 7, 0.5) - - -def verify_proposal(np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs): +def verify_proposal(target, dev, np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs): cls_prob = te.placeholder(np_cls_prob.shape) bbox_pred = te.placeholder(np_bbox_pred.shape) im_info = te.placeholder(np_im_info.shape) - def check_device(target): - dev = tvm.device(target, 0) - if not tvm.testing.device_enabled(target): - print("Skip because %s is not enabled" % target) - return - print("Running on target: %s" % target) - with tvm.target.Target(target): - fcompute, fschedule = tvm.topi.testing.dispatch(target, _proposal_implement) - out = fcompute(cls_prob, bbox_pred, im_info, **attrs) - s = fschedule(out) - f = tvm.build(s, [cls_prob, bbox_pred, im_info, out], target) - tvm_cls_prob = tvm.nd.array(np_cls_prob, device=dev) - tvm_bbox_pred = tvm.nd.array(np_bbox_pred, device=dev) - tvm_im_info = tvm.nd.array(np_im_info, device=dev) - tvm_out = tvm.nd.empty(device=dev, shape=out.shape, dtype=out.dtype) - f(tvm_cls_prob, tvm_bbox_pred, tvm_im_info, tvm_out) - tvm.testing.assert_allclose(tvm_out.numpy(), np_out, rtol=1e-4) - - for target in ["llvm", "cuda"]: - check_device(target) - - -@tvm.testing.uses_gpu -def test_proposal(): + with tvm.target.Target(target): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _proposal_implement) + out = fcompute(cls_prob, bbox_pred, im_info, **attrs) + s = fschedule(out) + f = tvm.build(s, [cls_prob, bbox_pred, im_info, out], target) + tvm_cls_prob = tvm.nd.array(np_cls_prob, device=dev) + tvm_bbox_pred = tvm.nd.array(np_bbox_pred, device=dev) + tvm_im_info = tvm.nd.array(np_im_info, device=dev) + tvm_out = tvm.nd.empty(device=dev, shape=out.shape, dtype=out.dtype) + f(tvm_cls_prob, tvm_bbox_pred, tvm_im_info, tvm_out) + tvm.testing.assert_allclose(tvm_out.numpy(), np_out, rtol=1e-4) + + +@tvm.testing.known_failing_targets("vulkan") +def test_proposal(target, dev): attrs = { "scales": (0.5,), "ratios": (0.5,), @@ -612,7 +590,7 @@ def test_proposal(): dtype="float32", ) - verify_proposal(np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs) + verify_proposal(target, dev, np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs) np_out = np.array( [ @@ -625,10 +603,12 @@ def test_proposal(): ) attrs["iou_loss"] = True - verify_proposal(np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs) + verify_proposal(target, dev, np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs) def verify_all_class_non_max_suppression( + target, + dev, boxes_np, scores_np, 
max_output_boxes_per_class, @@ -642,36 +622,24 @@ def verify_all_class_non_max_suppression( boxes = te.placeholder(dshape, name="boxes") scores = te.placeholder(scores_np.shape, dtype="float32", name="scores") - def check_device(target): - dev = tvm.device(target, 0) - if not tvm.testing.device_enabled(target): - print("Skip because %s is not enabled" % target) - return - print("Running on target: %s" % target) - with tvm.target.Target(target): - fcompute, fschedule = tvm.topi.testing.dispatch(target, _all_class_nms_implement) - out = fcompute( - boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold - ) - s = fschedule(out) - - tvm_boxes = tvm.nd.array(boxes_np, dev) - tvm_scores = tvm.nd.array(scores_np, dev) - selected_indices = tvm.nd.array(np.zeros((batch * num_class * num_boxes, 3), "int64"), dev) - num_detections = tvm.nd.array(np.zeros((1,), "int64"), dev) + with tvm.target.Target(target): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _all_class_nms_implement) + out = fcompute(boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold) + s = fschedule(out) - f = tvm.build(s, [boxes, scores, out[0], out[1]], target) - f(tvm_boxes, tvm_scores, selected_indices, num_detections) + tvm_boxes = tvm.nd.array(boxes_np, dev) + tvm_scores = tvm.nd.array(scores_np, dev) + selected_indices = tvm.nd.array(np.zeros((batch * num_class * num_boxes, 3), "int64"), dev) + num_detections = tvm.nd.array(np.zeros((1,), "int64"), dev) - tvm_res = selected_indices.numpy()[: num_detections.numpy()[0]] - np.testing.assert_equal(tvm_res, expected_indices) + f = tvm.build(s, [boxes, scores, out[0], out[1]], target) + f(tvm_boxes, tvm_scores, selected_indices, num_detections) - for target in ["llvm", "cuda", "opencl", "vulkan"]: - check_device(target) + tvm_res = selected_indices.numpy()[: num_detections.numpy()[0]] + np.testing.assert_equal(tvm_res, expected_indices) -@tvm.testing.uses_gpu -def test_all_class_non_max_suppression(): +def test_all_class_non_max_suppression(target, dev): boxes = np.array( [ [ @@ -707,7 +675,14 @@ def test_all_class_non_max_suppression(): ) verify_all_class_non_max_suppression( - boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold, expected + target, + dev, + boxes, + scores, + max_output_boxes_per_class, + iou_threshold, + score_threshold, + expected, ) boxes = np.array( @@ -730,16 +705,16 @@ def test_all_class_non_max_suppression(): expected = np.array([[0, 0, 3], [0, 0, 0]]) verify_all_class_non_max_suppression( - boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold, expected + target, + dev, + boxes, + scores, + max_output_boxes_per_class, + iou_threshold, + score_threshold, + expected, ) if __name__ == "__main__": - test_get_valid_counts() - test_multibox_prior() - test_multibox_detection() - test_roi_align() - test_roi_pool() - test_proposal() - test_non_max_suppression() - test_all_class_non_max_suppression() + sys.exit(pytest.main(sys.argv)) diff --git a/tests/python/unittest/test_tir_intrin.py b/tests/python/unittest/test_tir_intrin.py index a8d57b3da780..ecc30199c1a7 100644 --- a/tests/python/unittest/test_tir_intrin.py +++ b/tests/python/unittest/test_tir_intrin.py @@ -139,7 +139,19 @@ def test_ldexp(): tvm.testing.assert_allclose(c.numpy(), np.ldexp(a.numpy(), b.numpy()), atol=1e-5, rtol=1e-5) -def test_clz(): +dtype = tvm.testing.parameter("int32", "int64") + + +@tvm.testing.parametrize_targets("llvm", "vulkan -from_device=0") +def test_clz(target, dev, dtype): + target = 
tvm.target.Target(target) + if ( + target.kind.name == "vulkan" + and dtype == "int64" + and not target.attrs.get("supports_int64", False) + ): + pytest.xfail("Vulkan target does not support Int64 types") + def clz_np(x, dtype): ceil_log2 = np.ceil(np.log2(x)).astype(dtype) bits = int(dtype[-2:]) @@ -147,38 +159,32 @@ def clz_np(x, dtype): clz[np.bitwise_and(x, x - 1) == 0] -= 1 return clz - for target in ["llvm", "vulkan"]: - if not tvm.testing.device_enabled("vulkan"): - continue + m = te.var("m") + A = te.placeholder((m,), name="A", dtype=dtype) + B = te.compute((m,), lambda *i: tvm.tir.clz(A(*i)), name="B") + s = te.create_schedule(B.op) - for dtype in ["int32", "int64"]: - m = te.var("m") - A = te.placeholder((m,), name="A", dtype=dtype) - B = te.compute((m,), lambda *i: tvm.tir.clz(A(*i)), name="B") - s = te.create_schedule(B.op) + if target.kind.name == "vulkan": + bx, tx = s[B].split(B.op.axis[0], factor=64) - if target == "vulkan": - bx, tx = s[B].split(B.op.axis[0], factor=64) + s[B].bind(bx, te.thread_axis("blockIdx.x")) + s[B].bind(tx, te.thread_axis("threadIdx.x")) - s[B].bind(bx, te.thread_axis("blockIdx.x")) - s[B].bind(tx, te.thread_axis("threadIdx.x")) - - f = tvm.build(s, [A, B], target) - dev = tvm.device(target, 0) - n = 10 + f = tvm.build(s, [A, B], target) + n = 10 - highs = [10, 100, 1000, 10000, 100000, 1000000] + highs = [10, 100, 1000, 10000, 100000, 1000000] - if dtype == "int64": - highs.append((1 << 63) - 1) + if dtype == "int64": + highs.append((1 << 63) - 1) - for high in highs: - a_np = np.random.randint(1, high=high, size=(n,)).astype(dtype) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(np.zeros((n,)).astype("int32"), dev) - f(a, b) - ref = clz_np(a_np, dtype) - np.testing.assert_equal(b.numpy(), ref) + for high in highs: + a_np = np.random.randint(1, high=high, size=(n,), dtype=dtype) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros((n,)).astype("int32"), dev) + f(a, b) + ref = clz_np(a_np, dtype) + np.testing.assert_equal(b.numpy(), ref) @tvm.script.tir diff --git a/tests/python/unittest/test_tvm_testing_features.py b/tests/python/unittest/test_tvm_testing_features.py index 4c9c5d91901a..f396eeeee5fb 100644 --- a/tests/python/unittest/test_tvm_testing_features.py +++ b/tests/python/unittest/test_tvm_testing_features.py @@ -57,7 +57,10 @@ def test_explicit_list(self, target): self.targets_with_explicit_list.append(target) def test_no_repeats_in_explicit_list(self): - assert self.targets_with_explicit_list == ["llvm"] + if tvm.testing.device_enabled("llvm"): + assert self.targets_with_explicit_list == ["llvm"] + else: + assert self.targets_with_explicit_list == [] targets_with_exclusion = []