diff --git a/apps/android_rpc/app/src/main/jni/tvm_runtime.h b/apps/android_rpc/app/src/main/jni/tvm_runtime.h index 94fc6422891f..b0cb033e8812 100644 --- a/apps/android_rpc/app/src/main/jni/tvm_runtime.h +++ b/apps/android_rpc/app/src/main/jni/tvm_runtime.h @@ -43,8 +43,8 @@ #include "../ffi/src/ffi/extra/module.cc" #include "../ffi/src/ffi/extra/testing.cc" #include "../ffi/src/ffi/function.cc" -#include "../ffi/src/ffi/ndarray.cc" #include "../ffi/src/ffi/object.cc" +#include "../ffi/src/ffi/tensor.cc" #include "../ffi/src/ffi/traceback.cc" #include "../src/runtime/cpu_device_api.cc" #include "../src/runtime/device_api.cc" @@ -52,7 +52,6 @@ #include "../src/runtime/logging.cc" #include "../src/runtime/memory/memory_manager.cc" #include "../src/runtime/minrpc/minrpc_logger.cc" -#include "../src/runtime/ndarray.cc" #include "../src/runtime/profiling.cc" #include "../src/runtime/registry.cc" #include "../src/runtime/rpc/rpc_channel.cc" @@ -63,6 +62,7 @@ #include "../src/runtime/rpc/rpc_server_env.cc" #include "../src/runtime/rpc/rpc_session.cc" #include "../src/runtime/rpc/rpc_socket_impl.cc" +#include "../src/runtime/tensor.cc" #include "../src/runtime/thread_pool.cc" #include "../src/runtime/threading_backend.cc" #include "../src/runtime/workspace_pool.cc" diff --git a/apps/android_rpc/tests/android_rpc_test.py b/apps/android_rpc/tests/android_rpc_test.py index b9c6995729d0..b1548df3e177 100644 --- a/apps/android_rpc/tests/android_rpc_test.py +++ b/apps/android_rpc/tests/android_rpc_test.py @@ -72,8 +72,8 @@ def test_rpc_module(): dev = remote.cl(0) remote.upload(path_dso_cl) f1 = remote.load_module("dev_lib_cl.so") - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) + a = tvm.runtime.tensor(a_np, dev) + b = tvm.runtime.tensor(np.zeros(1024, dtype=A.dtype), dev) time_f = f1.time_evaluator(f1.entry_name, dev, number=10) cost = time_f(a, b).mean print("%g secs/op\n" % cost) diff --git a/apps/hexagon_launcher/launcher_core.h b/apps/hexagon_launcher/launcher_core.h index 5e62774607ba..be5a4ee94da9 100644 --- a/apps/hexagon_launcher/launcher_core.h +++ b/apps/hexagon_launcher/launcher_core.h @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include diff --git a/apps/hexagon_launcher/launcher_hexagon.cc b/apps/hexagon_launcher/launcher_hexagon.cc index bd1df4aa62ad..64b795d8f45c 100644 --- a/apps/hexagon_launcher/launcher_hexagon.cc +++ b/apps/hexagon_launcher/launcher_hexagon.cc @@ -137,7 +137,7 @@ AEEResult __QAIC_HEADER(launcher_rpc_set_input)(remote_handle64 handle, int inpu }; DLManagedTensor managed{tensor, /*manager_ctx*/ nullptr, /*deleter*/ nullptr}; - auto input = tvm::runtime::NDArray::FromDLPack(&managed); + auto input = tvm::runtime::Tensor::FromDLPack(&managed); tvm::ffi::Function set_input = get_module_func(TheModel->model_executor, "set_input"); set_input(input_idx, input); @@ -172,17 +172,17 @@ AEEResult __QAIC_HEADER(launcher_rpc_get_output)(remote_handle64 handle, int out } tvm::ffi::Function get_output = get_module_func(TheModel->model_executor, "get_output"); - tvm::runtime::NDArray output = get_output(output_idx); + tvm::runtime::Tensor output = get_output(output_idx); std::vector shape_vec{output->shape, output->shape + output->ndim}; - auto* container = new tvm::runtime::NDArray::Container( - static_cast(output_value), shape_vec, output->dtype, Model::external()); + auto* container = new tvm::runtime::Tensor::Container(static_cast(output_value), shape_vec, + output->dtype, Model::external()); 
container->SetDeleter([](tvm::Object* container) { - delete static_cast(container); + delete static_cast(container); }); - tvm::runtime::NDArray host_output(tvm::runtime::GetObjectPtr(container)); + tvm::runtime::Tensor host_output(tvm::runtime::GetObjectPtr(container)); if (meta_size != 0) { auto* meta = reinterpret_cast(output_meta); diff --git a/apps/ios_rpc/tests/ios_rpc_test.py b/apps/ios_rpc/tests/ios_rpc_test.py index 0e563ee1b688..67b9cd22aeba 100644 --- a/apps/ios_rpc/tests/ios_rpc_test.py +++ b/apps/ios_rpc/tests/ios_rpc_test.py @@ -72,8 +72,8 @@ def test_rpc_module(host, port, key, mode): dev = remote.metal(0) f1 = remote.load_module("dev_lib.dylib") a_np = np.random.uniform(size=1024).astype(A.dtype) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) + a = tvm.runtime.tensor(a_np, dev) + b = tvm.runtime.tensor(np.zeros(1024, dtype=A.dtype), dev) time_f = f1.time_evaluator(f1.entry_name, dev, number=10) cost = time_f(a, b).mean print("Metal: %g secs/op" % cost) diff --git a/docs/arch/index.rst b/docs/arch/index.rst index 1acd38fb04c7..4985e91c0b7d 100644 --- a/docs/arch/index.rst +++ b/docs/arch/index.rst @@ -133,7 +133,7 @@ The main goal of TVM's runtime is to provide a minimal API for loading and execu import tvm # Example runtime execution program in python, with type annotated mod: tvm.runtime.Module = tvm.runtime.load_module("compiled_artifact.so") - arr: tvm.runtime.NDArray = tvm.nd.array([1, 2, 3], device=tvm.cuda(0)) + arr: tvm.runtime.Tensor = tvm.runtime.tensor([1, 2, 3], device=tvm.cuda(0)) fun: tvm.runtime.PackedFunc = mod["addone"] fun(arr) print(arr.numpy()) @@ -142,7 +142,7 @@ The main goal of TVM's runtime is to provide a minimal API for loading and execu :py:class:`tvm.runtime.Module` encapsulates the result of compilation. A runtime.Module contains a GetFunction method to obtain PackedFuncs by name. :py:class:`tvm.runtime.PackedFunc` is a type-erased function interface for both the generated functions. A runtime.PackedFunc can take arguments and return values with the -following types: POD types(int, float), string, runtime.PackedFunc, runtime.Module, runtime.NDArray, and other sub-classes of runtime.Object. +following types: POD types(int, float), string, runtime.PackedFunc, runtime.Module, runtime.Tensor, and other sub-classes of runtime.Object. :py:class:`tvm.runtime.Module` and :py:class:`tvm.runtime.PackedFunc` are powerful mechanisms to modularize the runtime. For example, to get the above `addone` function on CUDA, we can use LLVM to generate the host-side code to compute the launching parameters(e.g. size of the thread groups) and then call into another PackedFunc from a CUDAModule that is backed by the CUDA driver API. The same mechanism can be used for OpenCL kernels. @@ -155,7 +155,7 @@ The above example only deals with a simple `addone` function. 
The code snippet b factory: tvm.runtime.Module = tvm.runtime.load_module("resnet18.so") # Create a stateful graph execution module for resnet18 on cuda(0) gmod: tvm.runtime.Module = factory["resnet18"](tvm.cuda(0)) - data: tvm.runtime.NDArray = get_input_data() + data: tvm.runtime.Tensor = get_input_data() # set input gmod["set_input"](0, data) # execute the model diff --git a/docs/deep_dive/tensor_ir/tutorials/tir_creation.py b/docs/deep_dive/tensor_ir/tutorials/tir_creation.py index 3d07f6227b96..74b4406061b9 100644 --- a/docs/deep_dive/tensor_ir/tutorials/tir_creation.py +++ b/docs/deep_dive/tensor_ir/tutorials/tir_creation.py @@ -204,9 +204,9 @@ def mm_relu(a: T.handle, b: T.handle, c: T.handle): def evaluate_dynamic_shape(lib: tvm.runtime.Module, m: int, n: int, k: int): - A = tvm.nd.array(np.random.uniform(size=(m, k)).astype("float32")) - B = tvm.nd.array(np.random.uniform(size=(k, n)).astype("float32")) - C = tvm.nd.array(np.zeros((m, n), dtype="float32")) + A = tvm.runtime.tensor(np.random.uniform(size=(m, k)).astype("float32")) + B = tvm.runtime.tensor(np.random.uniform(size=(k, n)).astype("float32")) + C = tvm.runtime.tensor(np.zeros((m, n), dtype="float32")) lib(A, B, C) return C.numpy() diff --git a/docs/deep_dive/tensor_ir/tutorials/tir_transformation.py b/docs/deep_dive/tensor_ir/tutorials/tir_transformation.py index 702b53011b48..eb1b2eb02029 100644 --- a/docs/deep_dive/tensor_ir/tutorials/tir_transformation.py +++ b/docs/deep_dive/tensor_ir/tutorials/tir_transformation.py @@ -72,9 +72,9 @@ def main( b_np = np.random.uniform(size=(128, 128)).astype("float32") c_np = a_np @ b_np -a_nd = tvm.nd.array(a_np) -b_nd = tvm.nd.array(b_np) -c_nd = tvm.nd.array(np.zeros((128, 128), dtype="float32")) +a_nd = tvm.runtime.tensor(a_np) +b_nd = tvm.runtime.tensor(b_np) +c_nd = tvm.runtime.tensor(np.zeros((128, 128), dtype="float32")) def evaluate(mod: tvm.IRModule): diff --git a/docs/get_started/tutorials/ir_module.py b/docs/get_started/tutorials/ir_module.py index c53d0ca5ef74..8bb8fb77a445 100644 --- a/docs/get_started/tutorials/ir_module.py +++ b/docs/get_started/tutorials/ir_module.py @@ -237,7 +237,7 @@ def main( vm = relax.VirtualMachine(exec, dev) raw_data = np.random.rand(1, 784).astype("float32") -data = tvm.nd.array(raw_data, dev) +data = tvm.runtime.tensor(raw_data, dev) cpu_out = vm["main"](data, *params_from_torch["main"]).numpy() print(cpu_out) @@ -267,8 +267,8 @@ def main( dev = tvm.device("cuda", 0) vm = relax.VirtualMachine(exec, dev) # Need to allocate data and params on GPU device -data = tvm.nd.array(raw_data, dev) -gpu_params = [tvm.nd.array(p, dev) for p in params_from_torch["main"]] +data = tvm.runtime.tensor(raw_data, dev) +gpu_params = [tvm.runtime.tensor(p, dev) for p in params_from_torch["main"]] gpu_out = vm["main"](data, *gpu_params).numpy() print(gpu_out) diff --git a/docs/get_started/tutorials/quick_start.py b/docs/get_started/tutorials/quick_start.py index 1153108c9632..753acbf0a475 100644 --- a/docs/get_started/tutorials/quick_start.py +++ b/docs/get_started/tutorials/quick_start.py @@ -141,9 +141,9 @@ def forward(self, x): device = tvm.cpu() vm = relax.VirtualMachine(ex, device) data = np.random.rand(1, 784).astype("float32") -tvm_data = tvm.nd.array(data, device=device) +tvm_data = tvm.runtime.tensor(data, device=device) params = [np.random.rand(*param.shape).astype("float32") for _, param in param_spec] -params = [tvm.nd.array(param, device=device) for param in params] +params = [tvm.runtime.tensor(param, device=device) for param in params] 
print(vm["forward"](tvm_data, *params).numpy()) ################################################################################ @@ -158,14 +158,14 @@ def forward(self, x): # prefill_logits = vm["prefill"](inputs, weight, kv_cache) # decoded_logits = vm["decode"](inputs, weight, kv_cache) # -# - TVM runtime comes with native data structures, such as NDArray, can also have zero +# - TVM runtime comes with native data structures, such as Tensor, can also have zero # copy exchange with existing ecosystem (DLPack exchange with PyTorch) # # .. code-block:: Python # -# # Convert PyTorch tensor to TVM NDArray -# x_tvm = tvm.nd.from_dlpack(x_torch.to_dlpack()) -# # Convert TVM NDArray to PyTorch tensor +# # Convert PyTorch tensor to TVM Tensor +# x_tvm = tvm.runtime.from_dlpack(x_torch.to_dlpack()) +# # Convert TVM Tensor to PyTorch tensor # x_torch = torch.from_dlpack(x_tvm.to_dlpack()) # # - TVM runtime works in non-python environments, so it works on settings such as mobile @@ -175,14 +175,14 @@ def forward(self, x): # // C++ snippet # runtime::Module vm = ex.GetFunction("load_executable")(); # vm.GetFunction("init")(...); -# NDArray out = vm.GetFunction("prefill")(data, weight, kv_cache); +# Tensor out = vm.GetFunction("prefill")(data, weight, kv_cache); # # .. code-block:: Java # # // Java snippet # Module vm = ex.getFunction("load_executable").invoke(); # vm.getFunction("init").pushArg(...).invoke; -# NDArray out = vm.getFunction("prefill").pushArg(data).pushArg(weight).pushArg(kv_cache).invoke(); +# Tensor out = vm.getFunction("prefill").pushArg(data).pushArg(weight).pushArg(kv_cache).invoke(); # ################################################################################ diff --git a/docs/how_to/tutorials/cross_compilation_and_rpc.py b/docs/how_to/tutorials/cross_compilation_and_rpc.py index a6b7206b3efa..b142eaa54956 100644 --- a/docs/how_to/tutorials/cross_compilation_and_rpc.py +++ b/docs/how_to/tutorials/cross_compilation_and_rpc.py @@ -182,8 +182,8 @@ # create arrays on the remote device dev = remote.cpu() -a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev) -b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) +a = tvm.runtime.tensor(np.random.uniform(size=1024).astype(A.dtype), dev) +b = tvm.runtime.tensor(np.zeros(1024, dtype=A.dtype), dev) # the function will run on the remote device func(a, b) np.testing.assert_equal(b.numpy(), a.numpy() + 1) @@ -249,8 +249,8 @@ def run_opencl(): # run dev = remote.cl() - a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(size=1024).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.zeros(1024, dtype=A.dtype), dev) func(a, b) np.testing.assert_equal(b.numpy(), a.numpy() + 1) print("OpenCL test passed!") diff --git a/docs/how_to/tutorials/customize_opt.py b/docs/how_to/tutorials/customize_opt.py index d215654019f0..2e2747d61fc5 100644 --- a/docs/how_to/tutorials/customize_opt.py +++ b/docs/how_to/tutorials/customize_opt.py @@ -209,8 +209,8 @@ def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassContext) -> IR dev = tvm.device("cuda", 0) vm = relax.VirtualMachine(ex, dev) # Need to allocate data and params on GPU device -data = tvm.nd.array(np.random.rand(*input_shape).astype("float32"), dev) -gpu_params = [tvm.nd.array(np.random.rand(*p.shape).astype(p.dtype), dev) for _, p in params] +data = tvm.runtime.tensor(np.random.rand(*input_shape).astype("float32"), dev) +gpu_params = 
[tvm.runtime.tensor(np.random.rand(*p.shape).astype(p.dtype), dev) for _, p in params] gpu_out = vm["forward"](data, *gpu_params).numpy() print(gpu_out) diff --git a/docs/how_to/tutorials/e2e_opt_model.py b/docs/how_to/tutorials/e2e_opt_model.py index 88cc86bfa800..9f89e744a362 100644 --- a/docs/how_to/tutorials/e2e_opt_model.py +++ b/docs/how_to/tutorials/e2e_opt_model.py @@ -117,8 +117,8 @@ dev = tvm.device("cuda", 0) vm = relax.VirtualMachine(ex, dev) # Need to allocate data and params on GPU device - gpu_data = tvm.nd.array(np.random.rand(1, 3, 224, 224).astype("float32"), dev) - gpu_params = [tvm.nd.array(p, dev) for p in params["main"]] + gpu_data = tvm.runtime.tensor(np.random.rand(1, 3, 224, 224).astype("float32"), dev) + gpu_params = [tvm.runtime.tensor(p, dev) for p in params["main"]] gpu_out = vm["main"](gpu_data, *gpu_params).numpy() print(gpu_out.shape) diff --git a/docs/how_to/tutorials/optimize_llm.py b/docs/how_to/tutorials/optimize_llm.py index 8cc674920da1..0e82b055592f 100644 --- a/docs/how_to/tutorials/optimize_llm.py +++ b/docs/how_to/tutorials/optimize_llm.py @@ -489,7 +489,7 @@ def _pipeline(mod: tvm.ir.IRModule, _ctx: tvm.transform.PassContext) -> tvm.ir.I # Convert params into ndarray params = [ - tvm.nd.array(param_dict[k].astype("float16"), device=dev) for k in named_params.keys() + tvm.runtime.tensor(param_dict[k].astype("float16"), device=dev) for k in named_params.keys() ] @@ -523,7 +523,7 @@ def _pipeline(mod: tvm.ir.IRModule, _ctx: tvm.transform.PassContext) -> tvm.ir.I input_len = len(prompt) # Load prompt tokens into TVM ndarray on the target device - tokens = tvm.nd.array(np.array(prompt).astype("int32"), device=dev) + tokens = tvm.runtime.tensor(np.array(prompt).astype("int32"), device=dev) ###################################################################### # Create the KVCache @@ -609,7 +609,7 @@ def sample_token(logits): print("The generated token:") while last_token != tokenizer.eos_token_id: - tokens = tvm.nd.array(np.array([last_token]).astype("int32"), device=dev) + tokens = tvm.runtime.tensor(np.array([last_token]).astype("int32"), device=dev) hidden_states = embed(tokens, params) begin_forward_func(kv_cache, ShapeTuple([seq_id]), ShapeTuple([1])) logits, kv_cache = vm["decode"](hidden_states, kv_cache, params) diff --git a/docs/reference/api/python/index.rst b/docs/reference/api/python/index.rst index a233c69a0173..c63784781cb9 100644 --- a/docs/reference/api/python/index.rst +++ b/docs/reference/api/python/index.rst @@ -34,7 +34,6 @@ Python API :caption: tvm.runtime runtime/runtime - runtime/ndarray runtime/vm runtime/disco runtime/profiling diff --git a/docs/reference/api/python/runtime/ndarray.rst b/docs/reference/api/python/runtime/ndarray.rst deleted file mode 100644 index 8c794f04b193..000000000000 --- a/docs/reference/api/python/runtime/ndarray.rst +++ /dev/null @@ -1,21 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. 
Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -tvm.runtime.ndarray -------------------- -.. automodule:: tvm.runtime.ndarray - :members: diff --git a/docs/reference/api/python/runtime/runtime.rst b/docs/reference/api/python/runtime/runtime.rst index 4dd9d9653369..ae373080aeac 100644 --- a/docs/reference/api/python/runtime/runtime.rst +++ b/docs/reference/api/python/runtime/runtime.rst @@ -19,4 +19,3 @@ tvm.runtime ----------- .. automodule:: tvm.runtime :members: - :exclude-members: NDArray diff --git a/ffi/CMakeLists.txt b/ffi/CMakeLists.txt index 90f1f89cbb92..94395d234352 100644 --- a/ffi/CMakeLists.txt +++ b/ffi/CMakeLists.txt @@ -57,7 +57,7 @@ set(tvm_ffi_objs_sources "${CMAKE_CURRENT_SOURCE_DIR}/src/ffi/object.cc" "${CMAKE_CURRENT_SOURCE_DIR}/src/ffi/error.cc" "${CMAKE_CURRENT_SOURCE_DIR}/src/ffi/function.cc" - "${CMAKE_CURRENT_SOURCE_DIR}/src/ffi/ndarray.cc" + "${CMAKE_CURRENT_SOURCE_DIR}/src/ffi/tensor.cc" "${CMAKE_CURRENT_SOURCE_DIR}/src/ffi/dtype.cc" "${CMAKE_CURRENT_SOURCE_DIR}/src/ffi/container.cc" ) @@ -189,7 +189,7 @@ if (TVM_FFI_BUILD_PYTHON_MODULE) ${CMAKE_CURRENT_SOURCE_DIR}/python/tvm_ffi/cython/dtype.pxi ${CMAKE_CURRENT_SOURCE_DIR}/python/tvm_ffi/cython/error.pxi ${CMAKE_CURRENT_SOURCE_DIR}/python/tvm_ffi/cython/function.pxi - ${CMAKE_CURRENT_SOURCE_DIR}/python/tvm_ffi/cython/ndarray.pxi + ${CMAKE_CURRENT_SOURCE_DIR}/python/tvm_ffi/cython/tensor.pxi ${CMAKE_CURRENT_SOURCE_DIR}/python/tvm_ffi/cython/object.pxi ${CMAKE_CURRENT_SOURCE_DIR}/python/tvm_ffi/cython/string.pxi ) diff --git a/ffi/docs/.gitignore b/ffi/docs/.gitignore index e35d8850c968..0b4a3621d9c3 100644 --- a/ffi/docs/.gitignore +++ b/ffi/docs/.gitignore @@ -1 +1,2 @@ _build +**/generated/*.rst diff --git a/ffi/docs/Makefile b/ffi/docs/Makefile index f589272b1845..ff28cb0cbc81 100644 --- a/ffi/docs/Makefile +++ b/ffi/docs/Makefile @@ -25,7 +25,7 @@ BUILDDIR = _build help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -.PHONY: help Makefile livehtml +.PHONY: help Makefile livehtml clean # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). @@ -34,3 +34,7 @@ help: livehtml: @sphinx-autobuild "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +clean: + rm -rf $(BUILDDIR) + rm -rf reference/python/generated diff --git a/ffi/docs/concepts/abi_overview.md b/ffi/docs/concepts/abi_overview.md index 6d2fd100744c..118257896424 100644 --- a/ffi/docs/concepts/abi_overview.md +++ b/ffi/docs/concepts/abi_overview.md @@ -219,17 +219,17 @@ typedef struct TVMFFIObject { - `deleter` ensures that objects allocated from one language/runtime can be safely deleted in another. The object format provides a unified way to manage object life-cycle and dynamic type casting -for heap-allocated objects, including Shape, NDArray, +for heap-allocated objects, including Shape, Tensor, Function, Array, Map and other custom objects. -### DLPack Compatible NDArray +### DLPack Compatible Tensor -We provide first-class support for DLPack raw unmanaged pointer support as well as a managed NDArray object that -directly adopts the DLPack DLTensor layout. 
The overall layout of the NDArray object is as follows: +We provide first-class support for DLPack raw unmanaged pointer support as well as a managed Tensor object that +directly adopts the DLPack DLTensor layout. The overall layout of the Tensor object is as follows: ```c++ -struct NDArrayObj: public ffi::Object, public DLTensor { +struct TensorObj: public ffi::Object, public DLTensor { }; ``` @@ -241,7 +241,7 @@ DLTensor* ReadDLTensorPtr(const TVMFFIAny *value) { if (value->type_index == kTVMFFIDLTensorPtr) { return static_cast(value->v_ptr); } - assert(value->type_index == kTVMFFINDArray); + assert(value->type_index == kTVMFFITensor); return reinterpret_cast( reinterpret_cast(value->v_obj) + sizeof(TVMFFIObject)); } diff --git a/ffi/docs/conf.py b/ffi/docs/conf.py index 317b58d3f60c..b97ed78ef8c1 100644 --- a/ffi/docs/conf.py +++ b/ffi/docs/conf.py @@ -20,6 +20,9 @@ import tomli + +os.environ["TVM_FFI_BUILD_DOCS"] = "1" + # -- General configuration ------------------------------------------------ # Load version from pyproject.toml diff --git a/ffi/docs/get_started/quick_start.md b/ffi/docs/get_started/quick_start.md index 1f6b25ef6d28..7eb3b97727b1 100644 --- a/ffi/docs/get_started/quick_start.md +++ b/ffi/docs/get_started/quick_start.md @@ -194,7 +194,7 @@ and can be loaded in other language environments, such as c++. The following cod shows how to run the example exported function in C++. ```cpp -#include +#include #include void CallAddOne(DLTensor* x, DLTensor *y) { diff --git a/ffi/docs/guides/cpp_guide.md b/ffi/docs/guides/cpp_guide.md index 84b6fd8dc9af..fdbd7f7d7ba2 100644 --- a/ffi/docs/guides/cpp_guide.md +++ b/ffi/docs/guides/cpp_guide.md @@ -342,18 +342,18 @@ Error type. Similarly, when we call a Python callback from C++, the error will b into the right error kind and message. -## NDArray +## Tensor For many use cases, we do not need to manage the nd-array/Tensor memory. In such cases, `DLTensor*` can be used as the function arguments. There can be cases for a managed container for multi-dimensional arrays. -`ffi::NDArray` is a minimal container to provide such support. +`ffi::Tensor` is a minimal container to provide such support. Notably, specific logic of device allocations and array operations are non-goals -of the FFI. Instead, we provide minimal generic API `ffi::NDArray::FromNDAlloc` -to enable flexible customization of NDArray allocation. +of the FFI. Instead, we provide minimal generic API `ffi::Tensor::FromNDAlloc` +to enable flexible customization of Tensor allocation. ```cpp -#include +#include #include struct CPUNDAlloc { @@ -363,19 +363,19 @@ struct CPUNDAlloc { void FreeData(DLTensor* tensor) { free(tensor->data); } }; -void ExampleNDArray() { +void ExampleTensor() { namespace ffi = tvm::ffi; ffi::Shape shape = {1, 2, 3}; DLDataType dtype = {kDLFloat, 32, 1}; DLDevice device = {kDLCPU, 0}; - ffi::NDArray nd = ffi::NDArray::FromNDAlloc(CPUNDAlloc(), shape, dtype, device); - // now nd is a managed ndarray + ffi::Tensor tensor = ffi::Tensor::FromNDAlloc(CPUNDAlloc(), shape, dtype, device); + // now tensor is a managed tensor } ``` The above example shows how we define `CPUNDAlloc` that customizes `AllocData` -and `FreeData` behavior. The CPUNDAlloc struct will be kept alive with the NDArray object. -This pattern allows us to implement various NDArray allocations using the same API: +and `FreeData` behavior. The CPUNDAlloc struct will be kept alive with the Tensor object. 
+This pattern allows us to implement various Tensor allocations using the same API: - For CUDA allocation, we can change malloc to cudaMalloc - For memory-pool based allocation, we can update `CPUNDAlloc` to keep a strong reference to the pool, @@ -387,27 +387,27 @@ of managed shapes and we provide quick conversions from standard vector types. ### DLPack Conversion -We provide first-class DLPack support to the `ffi::NDArray` that enables efficient exchange +We provide first-class DLPack support to the `ffi::Tensor` that enables efficient exchange through the DLPack Protocol. ```cpp -#include +#include -void ExampleNDArrayDLPack() { +void ExampleTensorDLPack() { namespace ffi = tvm::ffi; ffi::Shape shape = {1, 2, 3}; DLDataType dtype = {kDLFloat, 32, 1}; DLDevice device = {kDLCPU, 0}; - ffi::NDArray nd = ffi::NDArray::FromNDAlloc(CPUNDAlloc(), shape, dtype, device); + ffi::Tensor tensor = ffi::Tensor::FromNDAlloc(CPUNDAlloc(), shape, dtype, device); // convert to DLManagedTensorVersioned DLManagedTensorVersioned* dlpack = nd.ToDLPackVersioned(); // load back from DLManagedTensorVersioned - ffi::NDArray nd2 = ffi::NDArray::FromDLPackVersioned(dlpack); + ffi::Tensor tensor2 = ffi::Tensor::FromDLPackVersioned(dlpack); } ``` These APIs are also available through the C APIs -`TVMFFINDArrayFromDLPackVersioned` and `TVMFFINDArrayToDLPackVersioned`. +`TVMFFITensorFromDLPackVersioned` and `TVMFFITensorToDLPackVersioned`. ## String and Bytes diff --git a/ffi/docs/guides/python_guide.md b/ffi/docs/guides/python_guide.md index 2d588049ae70..5ac7f318be25 100644 --- a/ffi/docs/guides/python_guide.md +++ b/ffi/docs/guides/python_guide.md @@ -50,9 +50,9 @@ mod.add_one_cpu(x, y) In this case, `tvm_ffi.load_module` will return a `tvm_ffi.Module` class that contains the exported functions. You can access the functions by their names. -## NDArray +## Tensor -`tvm_ffi` provides a managed DLPack-compatible NDArray. +`tvm_ffi` provides a managed DLPack-compatible Tensor. ```python import numpy as np @@ -65,9 +65,9 @@ tvm_array = tvm_ffi.from_dlpack(np_data) np_result = np.from_dlpack(tvm_array) ``` -In most cases, however, you do not have to explicitly create NDArrays. +In most cases, however, you do not have to explicitly create Tensors. The Python interface can take in `torch.Tensor` and `numpy.ndarray` objects -and automatically convert them to `tvm_ffi.NDArray`. +and automatically convert them to `tvm_ffi.Tensor`. 
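As an aside for readers of this hunk: a minimal sketch of what that automatic conversion looks like in practice, mirroring `run_example.py` later in this diff (the module path and the exported `add_one_cpu` symbol are illustrative, not part of this change):

```python
import numpy as np
import tvm_ffi

# Load a compiled module; the path below is illustrative.
mod = tvm_ffi.load_module("build/add_one_cpu.so")

x = np.array([1, 2, 3, 4, 5], dtype=np.float32)
y = np.empty_like(x)
# numpy.ndarray (and torch.Tensor) arguments are converted to
# tvm_ffi.Tensor automatically via the DLPack protocol.
mod.add_one_cpu(x, y)
print(y)  # [2. 3. 4. 5. 6.]
```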
## Functions and Callbacks diff --git a/ffi/examples/quick_start/run_example.py b/ffi/examples/quick_start/run_example.py index cdd60916b91b..456e58ce91b9 100644 --- a/ffi/examples/quick_start/run_example.py +++ b/ffi/examples/quick_start/run_example.py @@ -32,7 +32,7 @@ def run_add_one_cpu(): x = numpy.array([1, 2, 3, 4, 5], dtype=numpy.float32) y = numpy.empty_like(x) # tvm-ffi automatically handles DLPack compatible tensors - # torch tensors can be viewed as ffi::NDArray or DLTensor* + # torch tensors can be viewed as ffi::Tensor or DLTensor* # in the background mod.add_one_cpu(x, y) print("numpy.result after add_one(x, y)") @@ -44,7 +44,7 @@ def run_add_one_cpu(): x = torch.tensor([1, 2, 3, 4, 5], dtype=torch.float32) y = torch.empty_like(x) # tvm-ffi automatically handles DLPack compatible tensors - # torch tensors can be viewed as ffi::NDArray or DLTensor* + # torch tensors can be viewed as ffi::Tensor or DLTensor* # in the background mod.add_one_cpu(x, y) print("torch.result after add_one(x, y)") diff --git a/ffi/examples/quick_start/src/run_example.cc b/ffi/examples/quick_start/src/run_example.cc index e9993b034f18..90e61d170baa 100644 --- a/ffi/examples/quick_start/src/run_example.cc +++ b/ffi/examples/quick_start/src/run_example.cc @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -#include +#include #include // This file shows how to load the same compiled module and interact with it in C++ @@ -27,16 +27,16 @@ struct CPUNDAlloc { void FreeData(DLTensor* tensor) { free(tensor->data); } }; -inline ffi::NDArray Empty(ffi::Shape shape, DLDataType dtype, DLDevice device) { - return ffi::NDArray::FromNDAlloc(CPUNDAlloc(), shape, dtype, device); +inline ffi::Tensor Empty(ffi::Shape shape, DLDataType dtype, DLDevice device) { + return ffi::Tensor::FromNDAlloc(CPUNDAlloc(), shape, dtype, device); } int main() { // load the module ffi::Module mod = ffi::Module::LoadFromFile("build/add_one_cpu.so"); - // create an NDArray, alternatively, one can directly pass in a DLTensor* - ffi::NDArray x = Empty({5}, DLDataType({kDLFloat, 32, 1}), DLDevice({kDLCPU, 0})); + // create an Tensor, alternatively, one can directly pass in a DLTensor* + ffi::Tensor x = Empty({5}, DLDataType({kDLFloat, 32, 1}), DLDevice({kDLCPU, 0})); for (int i = 0; i < 5; ++i) { reinterpret_cast(x->data)[i] = static_cast(i); } diff --git a/ffi/include/tvm/ffi/c_api.h b/ffi/include/tvm/ffi/c_api.h index 4df2daffeb61..2a694fc4adc3 100644 --- a/ffi/include/tvm/ffi/c_api.h +++ b/ffi/include/tvm/ffi/c_api.h @@ -131,9 +131,9 @@ typedef enum { */ kTVMFFIShape = 69, /*! - * \brief NDArray object, layout = { TVMFFIObject, DLTensor, ... } + * \brief Tensor object, layout = { TVMFFIObject, DLTensor, ... } */ - kTVMFFINDArray = 70, + kTVMFFITensor = 70, /*! \brief Array object. */ kTVMFFIArray = 71, //---------------------------------------------------------------- @@ -497,15 +497,15 @@ TVM_FFI_DLL TVMFFIObjectHandle TVMFFIErrorCreate(const TVMFFIByteArray* kind, // Section: DLPack support APIs //------------------------------------------------------------ /*! - * \brief Produce a managed NDArray from a DLPack tensor. + * \brief Produce a managed Tensor from a DLPack tensor. * \param from The source DLPack tensor. * \param require_alignment The minimum alignment required of the data + byte_offset. * \param require_contiguous Boolean flag indicating if we need to check for contiguity. - * \param out The output NDArray handle. + * \param out The output Tensor handle. 
* \return 0 on success, nonzero on failure. */ -TVM_FFI_DLL int TVMFFINDArrayFromDLPack(DLManagedTensor* from, int32_t require_alignment, - int32_t require_contiguous, TVMFFIObjectHandle* out); +TVM_FFI_DLL int TVMFFITensorFromDLPack(DLManagedTensor* from, int32_t require_alignment, + int32_t require_contiguous, TVMFFIObjectHandle* out); /*! * \brief Produce a DLManagedTensor from the array that shares data memory with the array. @@ -513,20 +513,20 @@ TVM_FFI_DLL int TVMFFINDArrayFromDLPack(DLManagedTensor* from, int32_t require_a * \param out The DLManagedTensor handle. * \return 0 on success, nonzero on failure. */ -TVM_FFI_DLL int TVMFFINDArrayToDLPack(TVMFFIObjectHandle from, DLManagedTensor** out); +TVM_FFI_DLL int TVMFFITensorToDLPack(TVMFFIObjectHandle from, DLManagedTensor** out); /*! - * \brief Produce a managed NDArray from a DLPack tensor. + * \brief Produce a managed Tensor from a DLPack tensor. * \param from The source DLPack tensor. * \param require_alignment The minimum alignment required of the data + byte_offset. * \param require_contiguous Boolean flag indicating if we need to check for contiguity. - * \param out The output NDArray handle. + * \param out The output Tensor handle. * \return 0 on success, nonzero on failure. */ -TVM_FFI_DLL int TVMFFINDArrayFromDLPackVersioned(DLManagedTensorVersioned* from, - int32_t require_alignment, - int32_t require_contiguous, - TVMFFIObjectHandle* out); +TVM_FFI_DLL int TVMFFITensorFromDLPackVersioned(DLManagedTensorVersioned* from, + int32_t require_alignment, + int32_t require_contiguous, + TVMFFIObjectHandle* out); /*! * \brief Produce a DLManagedTensor from the array that shares data memory with the array. @@ -534,8 +534,8 @@ TVM_FFI_DLL int TVMFFINDArrayFromDLPackVersioned(DLManagedTensorVersioned* from, * \param out The DLManagedTensor handle. * \return 0 on success, nonzero on failure. */ -TVM_FFI_DLL int TVMFFINDArrayToDLPackVersioned(TVMFFIObjectHandle from, - DLManagedTensorVersioned** out); +TVM_FFI_DLL int TVMFFITensorToDLPackVersioned(TVMFFIObjectHandle from, + DLManagedTensorVersioned** out); //--------------------------------------------------------------- // Section: dtype string support APIs. @@ -1028,11 +1028,11 @@ inline TVMFFIShapeCell* TVMFFIShapeGetCellPtr(TVMFFIObjectHandle obj) { } /*! - * \brief Get the DLTensor pointer from an NDArray object. + * \brief Get the DLTensor pointer from an Tensor object. * \param obj The object handle. * \return The DLTensor pointer. */ -inline DLTensor* TVMFFINDArrayGetDLTensorPtr(TVMFFIObjectHandle obj) { +inline DLTensor* TVMFFITensorGetDLTensorPtr(TVMFFIObjectHandle obj) { return reinterpret_cast(reinterpret_cast(obj) + sizeof(TVMFFIObject)); } diff --git a/ffi/include/tvm/ffi/container/shape.h b/ffi/include/tvm/ffi/container/shape.h index 6360fcd1e398..28f4961c999c 100644 --- a/ffi/include/tvm/ffi/container/shape.h +++ b/ffi/include/tvm/ffi/container/shape.h @@ -19,7 +19,7 @@ /*! * \file tvm/ffi/shape.h - * \brief Container to store shape of an NDArray. + * \brief Container to store shape of an Tensor. */ #ifndef TVM_FFI_CONTAINER_SHAPE_H_ #define TVM_FFI_CONTAINER_SHAPE_H_ diff --git a/ffi/include/tvm/ffi/container/ndarray.h b/ffi/include/tvm/ffi/container/tensor.h similarity index 73% rename from ffi/include/tvm/ffi/container/ndarray.h rename to ffi/include/tvm/ffi/container/tensor.h index f65e386c0619..93526e5c2a5d 100644 --- a/ffi/include/tvm/ffi/container/ndarray.h +++ b/ffi/include/tvm/ffi/container/tensor.h @@ -19,11 +19,11 @@ */ /*! 
- * \file tvm/ffi/ndarray.h - * \brief Container to store an NDArray. + * \file tvm/ffi/tensor.h + * \brief Container to store an Tensor. */ -#ifndef TVM_FFI_CONTAINER_NDARRAY_H_ -#define TVM_FFI_CONTAINER_NDARRAY_H_ +#ifndef TVM_FFI_CONTAINER_TENSOR_H_ +#define TVM_FFI_CONTAINER_TENSOR_H_ #include #include @@ -110,20 +110,20 @@ inline size_t GetDataSize(const DLTensor& arr) { return GetDataSize(size, arr.dtype); } -/*! \brief An object representing an NDArray. */ -class NDArrayObj : public Object, public DLTensor { +/*! \brief An object representing an Tensor. */ +class TensorObj : public Object, public DLTensor { public: - static constexpr const uint32_t _type_index = TypeIndex::kTVMFFINDArray; - static constexpr const char* _type_key = StaticTypeKey::kTVMFFINDArray; - TVM_FFI_DECLARE_STATIC_OBJECT_INFO(NDArrayObj, Object); + static constexpr const uint32_t _type_index = TypeIndex::kTVMFFITensor; + static constexpr const char* _type_key = StaticTypeKey::kTVMFFITensor; + TVM_FFI_DECLARE_STATIC_OBJECT_INFO(TensorObj, Object); /*! - * \brief Move NDArray to a DLPack managed tensor. + * \brief Move Tensor to a DLPack managed tensor. * \return The converted DLPack managed tensor. */ DLManagedTensor* ToDLPack() const { DLManagedTensor* ret = new DLManagedTensor(); - NDArrayObj* from = const_cast(this); + TensorObj* from = const_cast(this); ret->dl_tensor = *static_cast(from); ret->manager_ctx = from; ret->deleter = DLManagedTensorDeleter; @@ -132,12 +132,12 @@ class NDArrayObj : public Object, public DLTensor { } /*! - * \brief Move NDArray to a DLPack managed tensor. + * \brief Move Tensor to a DLPack managed tensor. * \return The converted DLPack managed tensor. */ DLManagedTensorVersioned* ToDLPackVersioned() const { DLManagedTensorVersioned* ret = new DLManagedTensorVersioned(); - NDArrayObj* from = const_cast(this); + TensorObj* from = const_cast(this); ret->version.major = DLPACK_MAJOR_VERSION; ret->version.minor = DLPACK_MINOR_VERSION; ret->dl_tensor = *static_cast(from); @@ -149,37 +149,37 @@ class NDArrayObj : public Object, public DLTensor { } protected: - // backs up the shape of the NDArray + // backs up the shape/strides Optional shape_data_; Optional stride_data_; static void DLManagedTensorDeleter(DLManagedTensor* tensor) { - NDArrayObj* obj = static_cast(tensor->manager_ctx); + TensorObj* obj = static_cast(tensor->manager_ctx); details::ObjectUnsafe::DecRefObjectHandle(obj); delete tensor; } static void DLManagedTensorVersionedDeleter(DLManagedTensorVersioned* tensor) { - NDArrayObj* obj = static_cast(tensor->manager_ctx); + TensorObj* obj = static_cast(tensor->manager_ctx); details::ObjectUnsafe::DecRefObjectHandle(obj); delete tensor; } - friend class NDArray; + friend class Tensor; }; namespace details { /*! - *\brief Helper class to create an NDArrayObj from an NDAllocator + *\brief Helper class to create an TensorObj from an NDAllocator * * The underlying allocator needs to be implemented by user. */ template -class NDArrayObjFromNDAlloc : public NDArrayObj { +class TensorObjFromNDAlloc : public TensorObj { public: template - NDArrayObjFromNDAlloc(TNDAlloc alloc, ffi::Shape shape, DLDataType dtype, DLDevice device, - ExtraArgs&&... extra_args) + TensorObjFromNDAlloc(TNDAlloc alloc, ffi::Shape shape, DLDataType dtype, DLDevice device, + ExtraArgs&&... 
extra_args) : alloc_(alloc) { this->device = device; this->ndim = static_cast(shape.size()); @@ -193,7 +193,7 @@ class NDArrayObjFromNDAlloc : public NDArrayObj { alloc_.AllocData(static_cast(this), std::forward(extra_args)...); } - ~NDArrayObjFromNDAlloc() { alloc_.FreeData(static_cast(this)); } + ~TensorObjFromNDAlloc() { alloc_.FreeData(static_cast(this)); } private: TNDAlloc alloc_; @@ -201,9 +201,9 @@ class NDArrayObjFromNDAlloc : public NDArrayObj { /*! \brief helper class to import from DLPack legacy DLManagedTensor */ template -class NDArrayObjFromDLPack : public NDArrayObj { +class TensorObjFromDLPack : public TensorObj { public: - explicit NDArrayObjFromDLPack(TDLPackManagedTensor* tensor) : tensor_(tensor) { + explicit TensorObjFromDLPack(TDLPackManagedTensor* tensor) : tensor_(tensor) { *static_cast(this) = tensor_->dl_tensor; if (tensor_->dl_tensor.strides == nullptr) { Shape strides = Shape(details::MakeStridesFromShape(ndim, shape)); @@ -212,7 +212,7 @@ class NDArrayObjFromDLPack : public NDArrayObj { } } - ~NDArrayObjFromDLPack() { + ~TensorObjFromDLPack() { // run DLPack deleter if needed. if (tensor_->deleter != nullptr) { (*tensor_->deleter)(tensor_); @@ -225,62 +225,62 @@ class NDArrayObjFromDLPack : public NDArrayObj { } // namespace details /*! - * \brief Managed NDArray. - * The array is backed by reference counted blocks. + * \brief Managed Tensor (n-dimensional array). + * The tensor is backed by reference counted blocks. * * \note This class can be subclassed to implement downstream customized - * NDArray types that are backed by the same NDArrayObj storage type. + * Tensor types that are backed by the same TensorObj storage type. */ -class NDArray : public ObjectRef { +class Tensor : public ObjectRef { public: /*! - * \brief Get the shape of the NDArray. - * \return The shape of the NDArray. + * \brief Get the shape of the Tensor. + * \return The shape of the Tensor. */ tvm::ffi::Shape shape() const { - NDArrayObj* obj = get_mutable(); + TensorObj* obj = get_mutable(); if (!obj->shape_data_.has_value()) { obj->shape_data_ = tvm::ffi::Shape(obj->shape, obj->shape + obj->ndim); } return *(obj->shape_data_); } /*! - * \brief Get the data type of the NDArray. - * \return The data type of the NDArray. + * \brief Get the data type of the Tensor. + * \return The data type of the Tensor. */ DLDataType dtype() const { return (*this)->dtype; } /*! - * \brief Check if the NDArray is contiguous. - * \return True if the NDArray is contiguous, false otherwise. + * \brief Check if the Tensor is contiguous. + * \return True if the Tensor is contiguous, false otherwise. */ bool IsContiguous() const { return tvm::ffi::IsContiguous(*get()); } /*! - * \brief Create a NDArray from a NDAllocator. + * \brief Create a Tensor from a NDAllocator. * \param alloc The NDAllocator. - * \param shape The shape of the NDArray. - * \param dtype The data type of the NDArray. - * \param device The device of the NDArray. - * \return The created NDArray. + * \param shape The shape of the Tensor. + * \param dtype The data type of the Tensor. + * \param device The device of the Tensor. + * \return The created Tensor. * \tparam TNDAlloc The type of the NDAllocator, impelments Alloc and Free. * \tparam ExtraArgs Extra arguments to be passed to Alloc. */ template - static NDArray FromNDAlloc(TNDAlloc alloc, ffi::Shape shape, DLDataType dtype, DLDevice device, - ExtraArgs&&... 
extra_args) { - return NDArray(make_object>( + static Tensor FromNDAlloc(TNDAlloc alloc, ffi::Shape shape, DLDataType dtype, DLDevice device, + ExtraArgs&&... extra_args) { + return Tensor(make_object>( alloc, shape, dtype, device, std::forward(extra_args)...)); } /*! - * \brief Create a NDArray from a DLPack managed tensor, pre v1.0 API. + * \brief Create a Tensor from a DLPack managed tensor, pre v1.0 API. * \param tensor The input DLPack managed tensor. * \param require_alignment The minimum alignment requored of the data + byte_offset. * \param require_contiguous Boolean flag indicating if we need to check for contiguity. * \note This function will not run any checks on flags. - * \return The created NDArray. + * \return The created Tensor. */ - static NDArray FromDLPack(DLManagedTensor* tensor, size_t require_alignment = 0, - bool require_contiguous = false) { + static Tensor FromDLPack(DLManagedTensor* tensor, size_t require_alignment = 0, + bool require_contiguous = false) { if (require_alignment != 0 && !ffi::IsAligned(tensor->dl_tensor, require_alignment)) { TVM_FFI_THROW(RuntimeError) << "FromDLPack: Data is not aligned to " << require_alignment << " bytes."; @@ -288,18 +288,18 @@ class NDArray : public ObjectRef { if (require_contiguous && !ffi::IsContiguous(tensor->dl_tensor)) { TVM_FFI_THROW(RuntimeError) << "FromDLPack: Tensor is not contiguous."; } - return NDArray(make_object>(tensor)); + return Tensor(make_object>(tensor)); } /*! - * \brief Create a NDArray from a DLPack managed tensor, post v1.0 API. + * \brief Create a Tensor from a DLPack managed tensor, post v1.0 API. * \param tensor The input DLPack managed tensor. * \param require_alignment The minimum alignment requored of the data + byte_offset. * \param require_contiguous Boolean flag indicating if we need to check for contiguity. - * \return The created NDArray. + * \return The created Tensor. */ - static NDArray FromDLPackVersioned(DLManagedTensorVersioned* tensor, size_t require_alignment = 0, - bool require_contiguous = false) { + static Tensor FromDLPackVersioned(DLManagedTensorVersioned* tensor, size_t require_alignment = 0, + bool require_contiguous = false) { if (require_alignment != 0 && !ffi::IsAligned(tensor->dl_tensor, require_alignment)) { TVM_FFI_THROW(RuntimeError) << "FromDLPack: Data is not aligned to " << require_alignment << " bytes."; @@ -310,32 +310,32 @@ class NDArray : public ObjectRef { if (tensor->flags & DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED) { TVM_FFI_THROW(RuntimeError) << "Subbyte type padded is not yet supported"; } - return NDArray(make_object>(tensor)); + return Tensor(make_object>(tensor)); } /*! - * \brief Convert the NDArray to a DLPack managed tensor. + * \brief Convert the Tensor to a DLPack managed tensor. * \return The converted DLPack managed tensor. */ DLManagedTensor* ToDLPack() const { return get_mutable()->ToDLPack(); } /*! - * \brief Convert the NDArray to a DLPack managed tensor. + * \brief Convert the Tensor to a DLPack managed tensor. * \return The converted DLPack managed tensor. */ DLManagedTensorVersioned* ToDLPackVersioned() const { return get_mutable()->ToDLPackVersioned(); } - TVM_FFI_DEFINE_OBJECT_REF_METHODS(NDArray, ObjectRef, NDArrayObj); + TVM_FFI_DEFINE_OBJECT_REF_METHODS(Tensor, ObjectRef, TensorObj); protected: /*! * \brief Get mutable internal container pointer. * \return a mutable container pointer. 
*/ - NDArrayObj* get_mutable() const { return const_cast(get()); } + TensorObj* get_mutable() const { return const_cast(get()); } }; } // namespace ffi } // namespace tvm -#endif // TVM_FFI_CONTAINER_NDARRAY_H_ +#endif // TVM_FFI_CONTAINER_TENSOR_H_ diff --git a/ffi/include/tvm/ffi/extra/structural_equal.h b/ffi/include/tvm/ffi/extra/structural_equal.h index 9727940297ed..8eb5da7f67df 100644 --- a/ffi/include/tvm/ffi/extra/structural_equal.h +++ b/ffi/include/tvm/ffi/extra/structural_equal.h @@ -40,13 +40,13 @@ class StructuralEqual { * \param lhs The left hand side Any object. * \param rhs The right hand side Any object. * \param map_free_vars Whether to map free variables. - * \param skip_ndarray_content Whether to skip comparingn darray data content, + * \param skip_tensor_content Whether to skip comparingn darray data content, * useful for cases where we don't care about parameters content * \return True if the two Any values are structurally equal, false otherwise. */ TVM_FFI_EXTRA_CXX_API static bool Equal(const Any& lhs, const Any& rhs, bool map_free_vars = false, - bool skip_ndarray_content = false); + bool skip_tensor_content = false); /** * \brief Get the first mismatch AccessPath pair when running * structural equal comparison between two Any values. @@ -54,14 +54,13 @@ class StructuralEqual { * \param lhs The left hand side Any object. * \param rhs The right hand side Any object. * \param map_free_vars Whether to map free variables. - * \param skip_ndarray_content Whether to skip comparing ndarray data content, + * \param skip_tensor_content Whether to skip comparing tensor data content, * useful for cases where we don't care about parameters content * \return If comparison fails, return the first mismatch AccessPath pair, * otherwise return std::nullopt. */ TVM_FFI_EXTRA_CXX_API static Optional GetFirstMismatch( - const Any& lhs, const Any& rhs, bool map_free_vars = false, - bool skip_ndarray_content = false); + const Any& lhs, const Any& rhs, bool map_free_vars = false, bool skip_tensor_content = false); /* * \brief Compare two Any values for structural equality. diff --git a/ffi/include/tvm/ffi/extra/structural_hash.h b/ffi/include/tvm/ffi/extra/structural_hash.h index 9cb08a1c0fc8..1d7ba2613e90 100644 --- a/ffi/include/tvm/ffi/extra/structural_hash.h +++ b/ffi/include/tvm/ffi/extra/structural_hash.h @@ -38,12 +38,12 @@ class StructuralHash { * \brief Hash an Any value. * \param value The Any value to hash. * \param map_free_vars Whether to map free variables. - * \param skip_ndarray_content Whether to skip comparingn darray data content, + * \param skip_tensor_content Whether to skip comparingn darray data content, * useful for cases where we don't care about parameters content. * \return The hash value. */ TVM_FFI_EXTRA_CXX_API static uint64_t Hash(const Any& value, bool map_free_vars = false, - bool skip_ndarray_content = false); + bool skip_tensor_content = false); /*! * \brief Hash an Any value. * \param value The Any value to hash. 
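Following the two structural-comparison header hunks above, a minimal usage sketch of the renamed `skip_tensor_content` flag, using the signatures declared in `structural_equal.h` and `structural_hash.h`; it assumes these classes live in the `tvm::ffi` namespace like the rest of the FFI headers:

```cpp
#include <tvm/ffi/extra/structural_equal.h>
#include <tvm/ffi/extra/structural_hash.h>

namespace ffi = tvm::ffi;

// Structural comparison that treats tensors as equal when shape and dtype
// match, without reading the tensor contents (useful for parameter blobs).
bool SameStructureIgnoringTensorData(const ffi::Any& lhs, const ffi::Any& rhs) {
  return ffi::StructuralEqual::Equal(lhs, rhs, /*map_free_vars=*/false,
                                     /*skip_tensor_content=*/true);
}

// Matching hash so that values equal under the flag also hash equally.
uint64_t HashIgnoringTensorData(const ffi::Any& value) {
  return ffi::StructuralHash::Hash(value, /*map_free_vars=*/false,
                                   /*skip_tensor_content=*/true);
}
```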
diff --git a/ffi/include/tvm/ffi/object.h b/ffi/include/tvm/ffi/object.h index cc5ee8d94585..ab0e424551e9 100644 --- a/ffi/include/tvm/ffi/object.h +++ b/ffi/include/tvm/ffi/object.h @@ -57,7 +57,7 @@ struct StaticTypeKey { static constexpr const char* kTVMFFIBytes = "ffi.Bytes"; static constexpr const char* kTVMFFIStr = "ffi.String"; static constexpr const char* kTVMFFIShape = "ffi.Shape"; - static constexpr const char* kTVMFFINDArray = "ffi.NDArray"; + static constexpr const char* kTVMFFITensor = "ffi.Tensor"; static constexpr const char* kTVMFFIObject = "ffi.Object"; static constexpr const char* kTVMFFIFunction = "ffi.Function"; static constexpr const char* kTVMFFIArray = "ffi.Array"; diff --git a/ffi/include/tvm/ffi/type_traits.h b/ffi/include/tvm/ffi/type_traits.h index 9cdb2b933894..b972f5835926 100644 --- a/ffi/include/tvm/ffi/type_traits.h +++ b/ffi/include/tvm/ffi/type_traits.h @@ -463,15 +463,15 @@ struct TypeTraits : public TypeTraitsBase { TVM_FFI_INLINE static void MoveToAny(DLTensor*, TVMFFIAny*) { TVM_FFI_THROW(RuntimeError) - << "DLTensor* cannot be held in Any as it does not retain ownership, use NDArray instead"; + << "DLTensor* cannot be held in Any as it does not retain ownership, use Tensor instead"; } TVM_FFI_INLINE static std::optional TryCastFromAnyView(const TVMFFIAny* src) { if (src->type_index == TypeIndex::kTVMFFIDLTensorPtr) { return static_cast(src->v_ptr); - } else if (src->type_index == TypeIndex::kTVMFFINDArray) { - // Conversion from NDArray pointer to DLTensor - // based on the assumption that NDArray always follows the TVMFFIObject header + } else if (src->type_index == TypeIndex::kTVMFFITensor) { + // Conversion from Tensor pointer to DLTensor + // based on the assumption that Tensor always follows the TVMFFIObject header static_assert(sizeof(TVMFFIObject) == 24); return reinterpret_cast(reinterpret_cast(src->v_obj) + sizeof(TVMFFIObject)); diff --git a/ffi/pyproject.toml b/ffi/pyproject.toml index ab2a7f84dfc3..430b47c33b8b 100644 --- a/ffi/pyproject.toml +++ b/ffi/pyproject.toml @@ -17,7 +17,7 @@ [project] name = "apache-tvm-ffi" -version = "0.1.0a7" +version = "0.1.0a8" description = "tvm ffi" authors = [{ name = "TVM FFI team" }] diff --git a/ffi/python/tvm_ffi/__init__.py b/ffi/python/tvm_ffi/__init__.py index 7f702a7b09fc..807dc56a9181 100644 --- a/ffi/python/tvm_ffi/__init__.py +++ b/ffi/python/tvm_ffi/__init__.py @@ -26,9 +26,9 @@ from .core import Object, ObjectGeneric, Function from .convert import convert from .error import register_error -from .ndarray import Device, device -from .ndarray import cpu, cuda, rocm, opencl, metal, vpi, vulkan, ext_dev, hexagon, webgpu -from .ndarray import from_dlpack, NDArray, Shape +from .tensor import Device, device +from .tensor import cpu, cuda, rocm, opencl, metal, vpi, vulkan, ext_dev, hexagon, webgpu +from .tensor import from_dlpack, Tensor, Shape from .container import Array, Map from .module import Module, ModulePropertyMask, system_lib, load_module from . 
import serialization @@ -65,7 +65,7 @@ "hexagon", "webgpu", "from_dlpack", - "NDArray", + "Tensor", "Shape", "Array", "Map", diff --git a/ffi/python/tvm_ffi/cython/base.pxi b/ffi/python/tvm_ffi/cython/base.pxi index 4acf5f0a1717..f1cd77bc47e8 100644 --- a/ffi/python/tvm_ffi/cython/base.pxi +++ b/ffi/python/tvm_ffi/cython/base.pxi @@ -49,7 +49,7 @@ cdef extern from "tvm/ffi/c_api.h": kTVMFFIError = 67 kTVMFFIFunction = 68 kTVMFFIShape = 69 - kTVMFFINDArray = 70 + kTVMFFITensor = 70 kTVMFFIArray = 71 kTVMFFIMap = 72 kTVMFFIModule = 73 @@ -196,14 +196,14 @@ cdef extern from "tvm/ffi/c_api.h": int TVMFFIDataTypeToString(const DLDataType* dtype, TVMFFIAny* out) nogil const TVMFFIByteArray* TVMFFITraceback( const char* filename, int lineno, const char* func, int cross_ffi_boundary) nogil; - int TVMFFINDArrayFromDLPack(DLManagedTensor* src, int32_t require_alignment, + int TVMFFITensorFromDLPack(DLManagedTensor* src, int32_t require_alignment, int32_t require_contiguous, TVMFFIObjectHandle* out) nogil - int TVMFFINDArrayFromDLPackVersioned(DLManagedTensorVersioned* src, + int TVMFFITensorFromDLPackVersioned(DLManagedTensorVersioned* src, int32_t require_alignment, int32_t require_contiguous, TVMFFIObjectHandle* out) nogil - int TVMFFINDArrayToDLPack(TVMFFIObjectHandle src, DLManagedTensor** out) nogil - int TVMFFINDArrayToDLPackVersioned(TVMFFIObjectHandle src, + int TVMFFITensorToDLPack(TVMFFIObjectHandle src, DLManagedTensor** out) nogil + int TVMFFITensorToDLPackVersioned(TVMFFIObjectHandle src, DLManagedTensorVersioned** out) nogil const TVMFFITypeInfo* TVMFFIGetTypeInfo(int32_t type_index) nogil TVMFFIByteArray TVMFFISmallBytesGetContentByteArray(const TVMFFIAny* value) nogil @@ -211,7 +211,7 @@ cdef extern from "tvm/ffi/c_api.h": TVMFFIErrorCell* TVMFFIErrorGetCellPtr(TVMFFIObjectHandle obj) nogil TVMFFIOpaqueObjectCell* TVMFFIOpaqueObjectGetCellPtr(TVMFFIObjectHandle obj) nogil TVMFFIShapeCell* TVMFFIShapeGetCellPtr(TVMFFIObjectHandle obj) nogil - DLTensor* TVMFFINDArrayGetDLTensorPtr(TVMFFIObjectHandle obj) nogil + DLTensor* TVMFFITensorGetDLTensorPtr(TVMFFIObjectHandle obj) nogil DLDevice TVMFFIDLDeviceFromIntPair(int32_t device_type, int32_t device_id) nogil cdef extern from "tvm/ffi/extra/c_env_api.h": diff --git a/ffi/python/tvm_ffi/cython/core.pyx b/ffi/python/tvm_ffi/cython/core.pyx index 010341187ce6..b24a83da7c1d 100644 --- a/ffi/python/tvm_ffi/cython/core.pyx +++ b/ffi/python/tvm_ffi/cython/core.pyx @@ -22,5 +22,5 @@ include "./device.pxi" include "./object.pxi" include "./error.pxi" include "./string.pxi" -include "./ndarray.pxi" +include "./tensor.pxi" include "./function.pxi" diff --git a/ffi/python/tvm_ffi/cython/function.pxi b/ffi/python/tvm_ffi/cython/function.pxi index fc273b5cee0f..ea10356077da 100644 --- a/ffi/python/tvm_ffi/cython/function.pxi +++ b/ffi/python/tvm_ffi/cython/function.pxi @@ -15,12 +15,17 @@ # specific language governing permissions and limitations # under the License. 
import ctypes +import os from numbers import Real, Integral -try: - # optionally import torch and setup torch related utils - import torch -except ImportError: + +if os.environ.get("TVM_FFI_BUILD_DOCS", "0") == "0": + try: + # optionally import torch and setup torch related utils + import torch + except ImportError: + torch = None +else: torch = None @@ -43,9 +48,9 @@ cdef inline object make_ret(TVMFFIAny result): # TODO: Implement cdef int32_t type_index type_index = result.type_index - if type_index == kTVMFFINDArray: - # specially handle NDArray as it needs a special dltensor field - return make_ndarray_from_any(result) + if type_index == kTVMFFITensor: + # specially handle Tensor as it needs a special dltensor field + return make_tensor_from_any(result) elif type_index == kTVMFFIOpaquePyObject: return make_ret_opaque_object(result) elif type_index >= kTVMFFIStaticObjectBegin: @@ -92,13 +97,13 @@ cdef inline int make_args(tuple py_args, TVMFFIAny* out, list temp_args, out[i].v_int64 = 0 out[i].zero_padding = 0 - if isinstance(arg, NDArray): + if isinstance(arg, Tensor): if (arg).chandle != NULL: - out[i].type_index = kTVMFFINDArray - out[i].v_ptr = (arg).chandle + out[i].type_index = kTVMFFITensor + out[i].v_ptr = (arg).chandle else: out[i].type_index = kTVMFFIDLTensorPtr - out[i].v_ptr = (arg).cdltensor + out[i].v_ptr = (arg).cdltensor elif isinstance(arg, Object): out[i].type_index = TVMFFIObjectGetTypeIndex((arg).chandle) out[i].v_ptr = (arg).chandle @@ -106,9 +111,9 @@ cdef inline int make_args(tuple py_args, TVMFFIAny* out, list temp_args, is_cuda = arg.is_cuda arg = from_dlpack(torch.utils.dlpack.to_dlpack(arg), required_alignment=__dlpack_auto_import_required_alignment__) - out[i].type_index = kTVMFFINDArray - out[i].v_ptr = (arg).chandle - temp_dltensor = TVMFFINDArrayGetDLTensorPtr((arg).chandle) + out[i].type_index = kTVMFFITensor + out[i].v_ptr = (arg).chandle + temp_dltensor = TVMFFITensorGetDLTensorPtr((arg).chandle) # record the stream and device for torch context if is_cuda and ctx_dev_type != NULL and ctx_dev_type[0] == -1: ctx_dev_type[0] = temp_dltensor.device.device_type @@ -119,8 +124,8 @@ cdef inline int make_args(tuple py_args, TVMFFIAny* out, list temp_args, temp_args.append(arg) elif hasattr(arg, "__dlpack__"): arg = from_dlpack(arg, required_alignment=__dlpack_auto_import_required_alignment__) - out[i].type_index = kTVMFFINDArray - out[i].v_ptr = (arg).chandle + out[i].type_index = kTVMFFITensor + out[i].v_ptr = (arg).chandle temp_args.append(arg) elif isinstance(arg, PyNativeObject) and arg.__tvm_ffi_object__ is not None: arg = arg.__tvm_ffi_object__ diff --git a/ffi/python/tvm_ffi/cython/ndarray.pxi b/ffi/python/tvm_ffi/cython/tensor.pxi similarity index 89% rename from ffi/python/tvm_ffi/cython/ndarray.pxi rename to ffi/python/tvm_ffi/cython/tensor.pxi index 9dfe1222dc7e..5544359c9e02 100644 --- a/ffi/python/tvm_ffi/cython/ndarray.pxi +++ b/ffi/python/tvm_ffi/cython/tensor.pxi @@ -17,12 +17,12 @@ __dlpack_version__ = (1, 1) __dlpack_auto_import_required_alignment__ = 8 -_CLASS_NDARRAY = None +_CLASS_TENSOR = None -def _set_class_ndarray(cls): - global _CLASS_NDARRAY - _CLASS_NDARRAY = cls +def _set_class_tensor(cls): + global _CLASS_TENSOR + _CLASS_TENSOR = cls cdef const char* _c_str_dltensor = "dltensor" @@ -55,7 +55,7 @@ cdef inline int _from_dlpack( if pycapsule.PyCapsule_IsValid(dltensor, _c_str_dltensor): ptr = pycapsule.PyCapsule_GetPointer(dltensor, _c_str_dltensor) with nogil: - c_api_ret_code = TVMFFINDArrayFromDLPack( + c_api_ret_code = 
TVMFFITensorFromDLPack( ptr, c_req_alignment, c_req_contiguous, out) CHECK_CALL(c_api_ret_code) # set name and destructor to be empty @@ -77,7 +77,7 @@ cdef inline int _from_dlpack_versioned( ptr = pycapsule.PyCapsule_GetPointer( dltensor, _c_str_dltensor_versioned) with nogil: - c_api_ret_code = TVMFFINDArrayFromDLPackVersioned( + c_api_ret_code = TVMFFITensorFromDLPackVersioned( ptr, c_req_alignment, c_req_contiguous, out) CHECK_CALL(c_api_ret_code) # set name and destructor to be empty @@ -89,7 +89,7 @@ cdef inline int _from_dlpack_versioned( def from_dlpack(ext_tensor, *, required_alignment=8, required_contiguous=True): """ - Convert an external tensor to an NDArray. + Convert an external tensor to an Tensor. Parameters ---------- @@ -147,7 +147,7 @@ def from_dlpack(ext_tensor, *, required_alignment=8, required_contiguous=True): ) else: raise TypeError("Expect from_dlpack to take either a compatible tensor or PyCapsule") - return make_ndarray_from_chandle(chandle) + return make_tensor_from_chandle(chandle) # helper class for shape handling @@ -156,7 +156,7 @@ def _shape_obj_get_py_tuple(obj): return tuple(shape.data[i] for i in range(shape.size)) -cdef class NDArray(Object): +cdef class Tensor(Object): """N-dimensional array that is compatible with DLPack. """ cdef DLTensor* cdltensor @@ -199,7 +199,7 @@ cdef class NDArray(Object): cdef int c_api_ret_code with nogil: - c_api_ret_code = TVMFFINDArrayToDLPack(self.chandle, &dltensor) + c_api_ret_code = TVMFFITensorToDLPack(self.chandle, &dltensor) CHECK_CALL(c_api_ret_code) return pycapsule.PyCapsule_New(dltensor, _c_str_dltensor, _c_dlpack_deleter) @@ -208,7 +208,7 @@ cdef class NDArray(Object): cdef int c_api_ret_code with nogil: - c_api_ret_code = TVMFFINDArrayToDLPackVersioned(self.chandle, &dltensor) + c_api_ret_code = TVMFFITensorToDLPackVersioned(self.chandle, &dltensor) CHECK_CALL(c_api_ret_code) return pycapsule.PyCapsule_New( dltensor, _c_str_dltensor_versioned, _c_dlpack_versioned_deleter) @@ -266,27 +266,27 @@ cdef class NDArray(Object): raise BufferError(f"Unsupported max_version {max_version}") -_set_class_ndarray(NDArray) -_register_object_by_index(kTVMFFINDArray, NDArray) +_set_class_tensor(Tensor) +_register_object_by_index(kTVMFFITensor, Tensor) cdef inline object make_ret_dltensor(TVMFFIAny result): cdef DLTensor* dltensor dltensor = result.v_ptr - ndarray = _CLASS_NDARRAY.__new__(_CLASS_NDARRAY) - (ndarray).chandle = NULL - (ndarray).cdltensor = dltensor - return ndarray + tensor = _CLASS_TENSOR.__new__(_CLASS_TENSOR) + (tensor).chandle = NULL + (tensor).cdltensor = dltensor + return tensor -cdef inline object make_ndarray_from_chandle(TVMFFIObjectHandle chandle): +cdef inline object make_tensor_from_chandle(TVMFFIObjectHandle chandle): # TODO: Implement - cdef NDArray ndarray - ndarray = _CLASS_NDARRAY.__new__(_CLASS_NDARRAY) - (ndarray).chandle = chandle - (ndarray).cdltensor = TVMFFINDArrayGetDLTensorPtr(chandle) - return ndarray + cdef Tensor tensor + tensor = _CLASS_TENSOR.__new__(_CLASS_TENSOR) + (tensor).chandle = chandle + (tensor).cdltensor = TVMFFITensorGetDLTensorPtr(chandle) + return tensor -cdef inline object make_ndarray_from_any(TVMFFIAny any): - return make_ndarray_from_chandle(any.v_ptr) +cdef inline object make_tensor_from_any(TVMFFIAny any): + return make_tensor_from_chandle(any.v_ptr) diff --git a/ffi/python/tvm_ffi/module.py b/ffi/python/tvm_ffi/module.py index c3c1d089c612..684018416e62 100644 --- a/ffi/python/tvm_ffi/module.py +++ b/ffi/python/tvm_ffi/module.py @@ -38,25 +38,8 @@ class 
ModulePropertyMask(IntEnum): class Module(core.Object): """Runtime Module.""" - def __new__(cls): - instance = super(Module, cls).__new__(cls) # pylint: disable=no-value-for-parameter - instance.entry_name = "main" - instance._entry = None - return instance - - @property - def entry_func(self): - """Get the entry function - - Returns - ------- - f : tvm_ffi.Function - The entry function if exist - """ - if self._entry: - return self._entry - self._entry = self.get_function("main") - return self._entry + # constant for entry function name + entry_name = "main" @property def kind(self): @@ -142,10 +125,8 @@ def __getitem__(self, name): return self.get_function(name) def __call__(self, *args): - if self._entry: - return self._entry(*args) # pylint: disable=not-callable - return self.entry_func(*args) + return self.main(*args) def inspect_source(self, fmt=""): """Get source code from module, if available. diff --git a/ffi/python/tvm_ffi/ndarray.py b/ffi/python/tvm_ffi/tensor.py similarity index 98% rename from ffi/python/tvm_ffi/ndarray.py rename to ffi/python/tvm_ffi/tensor.py index d65b8fb36176..97240c6a499f 100644 --- a/ffi/python/tvm_ffi/ndarray.py +++ b/ffi/python/tvm_ffi/tensor.py @@ -14,11 +14,11 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""NDArray related objects and functions.""" +"""Tensor related objects and functions.""" from numbers import Integral from . import core -from .core import Device, NDArray, from_dlpack +from .core import Device, Tensor, from_dlpack from . import registry from . import _ffi_api @@ -240,7 +240,7 @@ def webgpu(dev_id=0): __all__ = [ "from_dlpack", - "NDArray", + "Tensor", "device", "cpu", "cuda", diff --git a/ffi/src/ffi/extra/structural_equal.cc b/ffi/src/ffi/extra/structural_equal.cc index 171fa2f750a0..976ba4ecf4d8 100644 --- a/ffi/src/ffi/extra/structural_equal.cc +++ b/ffi/src/ffi/extra/structural_equal.cc @@ -23,8 +23,8 @@ */ #include #include -#include #include +#include #include #include #include @@ -111,9 +111,9 @@ class StructEqualHandler { return CompareShape(AnyUnsafe::MoveFromAnyAfterCheck(std::move(lhs)), AnyUnsafe::MoveFromAnyAfterCheck(std::move(rhs))); } - case TypeIndex::kTVMFFINDArray: { - return CompareNDArray(AnyUnsafe::MoveFromAnyAfterCheck(std::move(lhs)), - AnyUnsafe::MoveFromAnyAfterCheck(std::move(rhs))); + case TypeIndex::kTVMFFITensor: { + return CompareTensor(AnyUnsafe::MoveFromAnyAfterCheck(std::move(lhs)), + AnyUnsafe::MoveFromAnyAfterCheck(std::move(rhs))); } default: { return CompareObject(AnyUnsafe::MoveFromAnyAfterCheck(std::move(lhs)), @@ -341,14 +341,14 @@ class StructEqualHandler { return true; } - bool CompareNDArray(NDArray lhs, NDArray rhs) { + bool CompareTensor(Tensor lhs, Tensor rhs) { if (lhs.same_as(rhs)) return true; if (lhs->ndim != rhs->ndim) return false; for (int i = 0; i < lhs->ndim; ++i) { if (lhs->shape[i] != rhs->shape[i]) return false; } if (lhs->dtype != rhs->dtype) return false; - if (!skip_ndarray_content_) { + if (!skip_tensor_content_) { TVM_FFI_ICHECK_EQ(lhs->device.device_type, kDLCPU) << "can only compare CPU tensor"; TVM_FFI_ICHECK_EQ(rhs->device.device_type, kDLCPU) << "can only compare CPU tensor"; TVM_FFI_ICHECK(lhs.IsContiguous()) << "Can only compare contiguous tensor"; @@ -385,8 +385,8 @@ class StructEqualHandler { } // whether we map free variables that are not defined bool map_free_vars_{false}; - // whether we compare ndarray data - bool skip_ndarray_content_{false}; + // whether we 
compare tensor data + bool skip_tensor_content_{false}; // the root lhs for result printing std::vector* mismatch_lhs_reverse_path_ = nullptr; std::vector* mismatch_rhs_reverse_path_ = nullptr; @@ -399,20 +399,20 @@ class StructEqualHandler { }; bool StructuralEqual::Equal(const Any& lhs, const Any& rhs, bool map_free_vars, - bool skip_ndarray_content) { + bool skip_tensor_content) { StructEqualHandler handler; handler.map_free_vars_ = map_free_vars; - handler.skip_ndarray_content_ = skip_ndarray_content; + handler.skip_tensor_content_ = skip_tensor_content; return handler.CompareAny(lhs, rhs); } Optional StructuralEqual::GetFirstMismatch(const Any& lhs, const Any& rhs, bool map_free_vars, - bool skip_ndarray_content) { + bool skip_tensor_content) { StructEqualHandler handler; handler.map_free_vars_ = map_free_vars; - handler.skip_ndarray_content_ = skip_ndarray_content; + handler.skip_tensor_content_ = skip_tensor_content; std::vector lhs_reverse_path; std::vector rhs_reverse_path; handler.mismatch_lhs_reverse_path_ = &lhs_reverse_path; diff --git a/ffi/src/ffi/extra/structural_hash.cc b/ffi/src/ffi/extra/structural_hash.cc index 9f245c1d174d..2eb9843fed4f 100644 --- a/ffi/src/ffi/extra/structural_hash.cc +++ b/ffi/src/ffi/extra/structural_hash.cc @@ -23,8 +23,8 @@ */ #include #include -#include #include +#include #include #include #include @@ -84,8 +84,8 @@ class StructuralHashHandler { case TypeIndex::kTVMFFIShape: { return HashShape(AnyUnsafe::MoveFromAnyAfterCheck(std::move(src))); } - case TypeIndex::kTVMFFINDArray: { - return HashNDArray(AnyUnsafe::MoveFromAnyAfterCheck(std::move(src))); + case TypeIndex::kTVMFFITensor: { + return HashTensor(AnyUnsafe::MoveFromAnyAfterCheck(std::move(src))); } default: { return HashObject(AnyUnsafe::MoveFromAnyAfterCheck(std::move(src))); @@ -267,29 +267,29 @@ class StructuralHashHandler { return hash_value; } - uint64_t HashNDArray(NDArray ndarray) { - uint64_t hash_value = details::StableHashCombine(ndarray->GetTypeKeyHash(), ndarray->ndim); - for (int i = 0; i < ndarray->ndim; ++i) { - hash_value = details::StableHashCombine(hash_value, ndarray->shape[i]); + uint64_t HashTensor(Tensor tensor) { + uint64_t hash_value = details::StableHashCombine(tensor->GetTypeKeyHash(), tensor->ndim); + for (int i = 0; i < tensor->ndim; ++i) { + hash_value = details::StableHashCombine(hash_value, tensor->shape[i]); } TVMFFIAny temp; temp.v_uint64 = 0; - temp.v_dtype = ndarray->dtype; + temp.v_dtype = tensor->dtype; hash_value = details::StableHashCombine(hash_value, temp.v_int64); - if (!skip_ndarray_content_) { - TVM_FFI_ICHECK_EQ(ndarray->device.device_type, kDLCPU) << "can only hash CPU tensor"; - TVM_FFI_ICHECK(ndarray.IsContiguous()) << "Can only hash contiguous tensor"; - size_t data_size = GetDataSize(*(ndarray.operator->())); + if (!skip_tensor_content_) { + TVM_FFI_ICHECK_EQ(tensor->device.device_type, kDLCPU) << "can only hash CPU tensor"; + TVM_FFI_ICHECK(tensor.IsContiguous()) << "Can only hash contiguous tensor"; + size_t data_size = GetDataSize(*(tensor.operator->())); uint64_t data_hash = - details::StableHashBytes(static_cast(ndarray->data), data_size); + details::StableHashBytes(static_cast(tensor->data), data_size); hash_value = details::StableHashCombine(hash_value, data_hash); } return hash_value; } bool map_free_vars_{false}; - bool skip_ndarray_content_{false}; + bool skip_tensor_content_{false}; // free var counter. uint32_t free_var_counter_{0}; // graph node counter. 
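
Editor's note: the structural_equal.cc and structural_hash.cc hunks above rename the `skip_ndarray_content` knob to `skip_tensor_content` on `StructuralEqual::Equal`, `StructuralEqual::GetFirstMismatch`, and `StructuralHash::Hash`. A minimal caller-side sketch of the renamed flag follows; the header paths and the `tvm::ffi` namespace are assumptions for illustration and are not taken from this diff.

```cpp
// Illustrative only: header locations and namespace are assumed.
#include <tvm/ffi/any.h>                     // assumed
#include <tvm/ffi/container/tensor.h>        // assumed
#include <tvm/ffi/extra/structural_equal.h>  // assumed

namespace ffi = tvm::ffi;

bool SameTensorContent(const ffi::Tensor& lhs, const ffi::Tensor& rhs) {
  ffi::Any a = lhs;
  ffi::Any b = rhs;
  // skip_tensor_content = false compares raw CPU bytes via CompareTensor() above;
  // passing true compares only shape and dtype, matching the old
  // skip_ndarray_content behaviour under its new name.
  return ffi::StructuralEqual::Equal(a, b,
                                     /*map_free_vars=*/false,
                                     /*skip_tensor_content=*/false);
}
```
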
@@ -300,10 +300,10 @@ class StructuralHashHandler { std::unordered_map hash_memo_; }; -uint64_t StructuralHash::Hash(const Any& value, bool map_free_vars, bool skip_ndarray_content) { +uint64_t StructuralHash::Hash(const Any& value, bool map_free_vars, bool skip_tensor_content) { StructuralHashHandler handler; handler.map_free_vars_ = map_free_vars; - handler.skip_ndarray_content_ = skip_ndarray_content; + handler.skip_tensor_content_ = skip_tensor_content; return handler.HashAny(value); } diff --git a/ffi/src/ffi/ndarray.cc b/ffi/src/ffi/tensor.cc similarity index 71% rename from ffi/src/ffi/ndarray.cc rename to ffi/src/ffi/tensor.cc index 41d4273b597c..7b44e4586b4b 100644 --- a/ffi/src/ffi/ndarray.cc +++ b/ffi/src/ffi/tensor.cc @@ -17,11 +17,11 @@ * under the License. */ /* - * \file src/ffi/ndarray.cc - * \brief NDArray C API implementation + * \file src/ffi/tensor.cc + * \brief Tensor C API implementation */ #include -#include +#include #include #include @@ -47,35 +47,35 @@ TVM_FFI_STATIC_INIT_BLOCK({ } // namespace ffi } // namespace tvm -int TVMFFINDArrayFromDLPack(DLManagedTensor* from, int32_t min_alignment, - int32_t require_contiguous, TVMFFIObjectHandle* out) { +int TVMFFITensorFromDLPack(DLManagedTensor* from, int32_t min_alignment, int32_t require_contiguous, + TVMFFIObjectHandle* out) { TVM_FFI_SAFE_CALL_BEGIN(); - tvm::ffi::NDArray nd = - tvm::ffi::NDArray::FromDLPack(from, static_cast(min_alignment), require_contiguous); - *out = tvm::ffi::details::ObjectUnsafe::MoveObjectRefToTVMFFIObjectPtr(std::move(nd)); + tvm::ffi::Tensor tensor = + tvm::ffi::Tensor::FromDLPack(from, static_cast(min_alignment), require_contiguous); + *out = tvm::ffi::details::ObjectUnsafe::MoveObjectRefToTVMFFIObjectPtr(std::move(tensor)); TVM_FFI_SAFE_CALL_END(); } -int TVMFFINDArrayFromDLPackVersioned(DLManagedTensorVersioned* from, int32_t min_alignment, - int32_t require_contiguous, TVMFFIObjectHandle* out) { +int TVMFFITensorFromDLPackVersioned(DLManagedTensorVersioned* from, int32_t min_alignment, + int32_t require_contiguous, TVMFFIObjectHandle* out) { TVM_FFI_SAFE_CALL_BEGIN(); - tvm::ffi::NDArray nd = tvm::ffi::NDArray::FromDLPackVersioned( + tvm::ffi::Tensor tensor = tvm::ffi::Tensor::FromDLPackVersioned( from, static_cast(min_alignment), require_contiguous); - *out = tvm::ffi::details::ObjectUnsafe::MoveObjectRefToTVMFFIObjectPtr(std::move(nd)); + *out = tvm::ffi::details::ObjectUnsafe::MoveObjectRefToTVMFFIObjectPtr(std::move(tensor)); TVM_FFI_SAFE_CALL_END(); } -int TVMFFINDArrayToDLPack(TVMFFIObjectHandle from, DLManagedTensor** out) { +int TVMFFITensorToDLPack(TVMFFIObjectHandle from, DLManagedTensor** out) { TVM_FFI_SAFE_CALL_BEGIN(); - *out = tvm::ffi::details::ObjectUnsafe::RawObjectPtrFromUnowned( + *out = tvm::ffi::details::ObjectUnsafe::RawObjectPtrFromUnowned( static_cast(from)) ->ToDLPack(); TVM_FFI_SAFE_CALL_END(); } -int TVMFFINDArrayToDLPackVersioned(TVMFFIObjectHandle from, DLManagedTensorVersioned** out) { +int TVMFFITensorToDLPackVersioned(TVMFFIObjectHandle from, DLManagedTensorVersioned** out) { TVM_FFI_SAFE_CALL_BEGIN(); - *out = tvm::ffi::details::ObjectUnsafe::RawObjectPtrFromUnowned( + *out = tvm::ffi::details::ObjectUnsafe::RawObjectPtrFromUnowned( static_cast(from)) ->ToDLPackVersioned(); TVM_FFI_SAFE_CALL_END(); diff --git a/ffi/tests/cpp/test_example.cc b/ffi/tests/cpp/test_example.cc index 68e529821953..9808be68da65 100644 --- a/ffi/tests/cpp/test_example.cc +++ b/ffi/tests/cpp/test_example.cc @@ -20,7 +20,7 @@ #include #include #include -#include +#include 
#include #include #include @@ -127,29 +127,29 @@ struct CPUNDAlloc { void FreeData(DLTensor* tensor) { free(tensor->data); } }; -void ExampleNDArray() { +void ExampleTensor() { namespace ffi = tvm::ffi; ffi::Shape shape = {1, 2, 3}; DLDataType dtype = {kDLFloat, 32, 1}; DLDevice device = {kDLCPU, 0}; - ffi::NDArray nd = ffi::NDArray::FromNDAlloc(CPUNDAlloc(), shape, dtype, device); + ffi::Tensor tensor = ffi::Tensor::FromNDAlloc(CPUNDAlloc(), shape, dtype, device); } -void ExampleNDArrayDLPack() { +void ExampleTensorDLPack() { namespace ffi = tvm::ffi; ffi::Shape shape = {1, 2, 3}; DLDataType dtype = {kDLFloat, 32, 1}; DLDevice device = {kDLCPU, 0}; - ffi::NDArray nd = ffi::NDArray::FromNDAlloc(CPUNDAlloc(), shape, dtype, device); + ffi::Tensor tensor = ffi::Tensor::FromNDAlloc(CPUNDAlloc(), shape, dtype, device); // convert to DLManagedTensorVersioned - DLManagedTensorVersioned* dlpack = nd.ToDLPackVersioned(); + DLManagedTensorVersioned* dlpack = tensor.ToDLPackVersioned(); // load back from DLManagedTensorVersioned - ffi::NDArray nd2 = ffi::NDArray::FromDLPackVersioned(dlpack); + ffi::Tensor tensor2 = ffi::Tensor::FromDLPackVersioned(dlpack); } -TEST(Example, NDArray) { - ExampleNDArray(); - ExampleNDArrayDLPack(); +TEST(Example, Tensor) { + ExampleTensor(); + ExampleTensorDLPack(); } void ExampleString() { diff --git a/ffi/tests/cpp/test_ndarray.cc b/ffi/tests/cpp/test_tensor.cc similarity index 70% rename from ffi/tests/cpp/test_ndarray.cc rename to ffi/tests/cpp/test_tensor.cc index 0196bfc4fb25..17a6427af35c 100644 --- a/ffi/tests/cpp/test_ndarray.cc +++ b/ffi/tests/cpp/test_tensor.cc @@ -17,7 +17,7 @@ * under the License. */ #include -#include +#include namespace { @@ -28,12 +28,12 @@ struct CPUNDAlloc { void FreeData(DLTensor* tensor) { free(tensor->data); } }; -inline NDArray Empty(Shape shape, DLDataType dtype, DLDevice device) { - return NDArray::FromNDAlloc(CPUNDAlloc(), shape, dtype, device); +inline Tensor Empty(Shape shape, DLDataType dtype, DLDevice device) { + return Tensor::FromNDAlloc(CPUNDAlloc(), shape, dtype, device); } -TEST(NDArray, Basic) { - NDArray nd = Empty(Shape({1, 2, 3}), DLDataType({kDLFloat, 32, 1}), DLDevice({kDLCPU, 0})); +TEST(Tensor, Basic) { + Tensor nd = Empty(Shape({1, 2, 3}), DLDataType({kDLFloat, 32, 1}), DLDevice({kDLCPU, 0})); Shape shape = nd.shape(); EXPECT_EQ(shape.size(), 3); EXPECT_EQ(shape[0], 1); @@ -45,7 +45,7 @@ TEST(NDArray, Basic) { } Any any0 = nd; - NDArray nd2 = any0.as().value(); + Tensor nd2 = any0.as().value(); EXPECT_EQ(nd2.shape(), shape); EXPECT_EQ(nd2.dtype(), DLDataType({kDLFloat, 32, 1})); for (int64_t i = 0; i < shape.Product(); ++i) { @@ -56,9 +56,9 @@ TEST(NDArray, Basic) { EXPECT_EQ(nd2.use_count(), 3); } -TEST(NDArray, DLPack) { - NDArray nd = Empty({1, 2, 3}, DLDataType({kDLInt, 16, 1}), DLDevice({kDLCPU, 0})); - DLManagedTensor* dlpack = nd.ToDLPack(); +TEST(Tensor, DLPack) { + Tensor tensor = Empty({1, 2, 3}, DLDataType({kDLInt, 16, 1}), DLDevice({kDLCPU, 0})); + DLManagedTensor* dlpack = tensor.ToDLPack(); EXPECT_EQ(dlpack->dl_tensor.ndim, 3); EXPECT_EQ(dlpack->dl_tensor.shape[0], 1); EXPECT_EQ(dlpack->dl_tensor.shape[1], 2); @@ -72,22 +72,22 @@ TEST(NDArray, DLPack) { EXPECT_EQ(dlpack->dl_tensor.strides[0], 6); EXPECT_EQ(dlpack->dl_tensor.strides[1], 3); EXPECT_EQ(dlpack->dl_tensor.strides[2], 1); - EXPECT_EQ(nd.use_count(), 2); + EXPECT_EQ(tensor.use_count(), 2); { - NDArray nd2 = NDArray::FromDLPack(dlpack); - EXPECT_EQ(nd2.use_count(), 1); - EXPECT_EQ(nd2->data, nd->data); - EXPECT_EQ(nd.use_count(), 2); - 
EXPECT_EQ(nd2.use_count(), 1); + Tensor tensor2 = Tensor::FromDLPack(dlpack); + EXPECT_EQ(tensor2.use_count(), 1); + EXPECT_EQ(tensor2->data, tensor->data); + EXPECT_EQ(tensor.use_count(), 2); + EXPECT_EQ(tensor2.use_count(), 1); } - EXPECT_EQ(nd.use_count(), 1); + EXPECT_EQ(tensor.use_count(), 1); } -TEST(NDArray, DLPackVersioned) { +TEST(Tensor, DLPackVersioned) { DLDataType dtype = DLDataType({kDLFloat4_e2m1fn, 4, 1}); EXPECT_EQ(GetDataSize(2, dtype), 2 * 4 / 8); - NDArray nd = Empty({2}, dtype, DLDevice({kDLCPU, 0})); - DLManagedTensorVersioned* dlpack = nd.ToDLPackVersioned(); + Tensor tensor = Empty({2}, dtype, DLDevice({kDLCPU, 0})); + DLManagedTensorVersioned* dlpack = tensor.ToDLPackVersioned(); EXPECT_EQ(dlpack->version.major, DLPACK_MAJOR_VERSION); EXPECT_EQ(dlpack->version.minor, DLPACK_MINOR_VERSION); EXPECT_EQ(dlpack->dl_tensor.ndim, 1); @@ -100,14 +100,14 @@ TEST(NDArray, DLPackVersioned) { EXPECT_EQ(dlpack->dl_tensor.byte_offset, 0); EXPECT_EQ(dlpack->dl_tensor.strides[0], 1); - EXPECT_EQ(nd.use_count(), 2); + EXPECT_EQ(tensor.use_count(), 2); { - NDArray nd2 = NDArray::FromDLPackVersioned(dlpack); - EXPECT_EQ(nd2.use_count(), 1); - EXPECT_EQ(nd2->data, nd->data); - EXPECT_EQ(nd.use_count(), 2); - EXPECT_EQ(nd2.use_count(), 1); + Tensor tensor2 = Tensor::FromDLPackVersioned(dlpack); + EXPECT_EQ(tensor2.use_count(), 1); + EXPECT_EQ(tensor2->data, tensor->data); + EXPECT_EQ(tensor.use_count(), 2); + EXPECT_EQ(tensor2.use_count(), 1); } - EXPECT_EQ(nd.use_count(), 1); + EXPECT_EQ(tensor.use_count(), 1); } } // namespace diff --git a/ffi/tests/python/test_function.py b/ffi/tests/python/test_function.py index 4b0db45b4bd3..0b45fe5583b3 100644 --- a/ffi/tests/python/test_function.py +++ b/ffi/tests/python/test_function.py @@ -74,21 +74,21 @@ def test_echo(): assert fadd1(1, 2) == 3 assert fadd1.same_as(fadd) - def check_ndarray(): + def check_tensor(): np_data = np.arange(10, dtype="int32") if not hasattr(np_data, "__dlpack__"): return - # test NDArray + # test Tensor x = tvm_ffi.from_dlpack(np_data) - assert isinstance(x, tvm_ffi.NDArray) - nd_result = fecho(x) - assert isinstance(nd_result, tvm_ffi.NDArray) - assert nd_result.shape == (10,) - assert nd_result.dtype == tvm_ffi.dtype("int32") - assert nd_result.device.device_type == tvm_ffi.Device.kDLCPU - assert nd_result.device.device_id == 0 - - check_ndarray() + assert isinstance(x, tvm_ffi.Tensor) + tensor_result = fecho(x) + assert isinstance(tensor_result, tvm_ffi.Tensor) + assert tensor_result.shape == (10,) + assert tensor_result.dtype == tvm_ffi.dtype("int32") + assert tensor_result.device.device_type == tvm_ffi.Device.kDLCPU + assert tensor_result.device.device_id == 0 + + check_tensor() def test_return_raw_str_bytes(): diff --git a/ffi/tests/python/test_ndarray.py b/ffi/tests/python/test_tensor.py similarity index 93% rename from ffi/tests/python/test_ndarray.py rename to ffi/tests/python/test_tensor.py index f0ce0d193c8f..2e2a99940017 100644 --- a/ffi/tests/python/test_ndarray.py +++ b/ffi/tests/python/test_tensor.py @@ -25,12 +25,12 @@ import numpy as np -def test_ndarray_attributes(): +def test_tensor_attributes(): data = np.zeros((10, 8, 4, 2), dtype="int16") if not hasattr(data, "__dlpack__"): return x = tvm_ffi.from_dlpack(data) - assert isinstance(x, tvm_ffi.NDArray) + assert isinstance(x, tvm_ffi.Tensor) assert x.shape == (10, 8, 4, 2) assert x.dtype == tvm_ffi.dtype("int16") assert x.device.device_type == tvm_ffi.Device.kDLCPU @@ -56,9 +56,9 @@ def test_shape_object(): @pytest.mark.skipif(torch is None, 
reason="Torch is not installed") -def test_ndarray_auto_dlpack(): +def test_tensor_auto_dlpack(): def check(x, y): - assert isinstance(y, tvm_ffi.NDArray) + assert isinstance(y, tvm_ffi.Tensor) assert y.shape == (128,) assert y.dtype == tvm_ffi.dtype("int64") assert y.device.device_type == tvm_ffi.Device.kDLCPU diff --git a/include/tvm/ir/module.h b/include/tvm/ir/module.h index 6f7d6d2d130d..f04a6cfe6d53 100644 --- a/include/tvm/ir/module.h +++ b/include/tvm/ir/module.h @@ -314,11 +314,11 @@ namespace attr { constexpr const char* kModuleName = "mod_name"; /* - * \brief All the runtime::NDArrays extracted from PrimFunc tir::AllocateConst nodes. The + * \brief All the runtime::Tensors extracted from PrimFunc tir::AllocateConst nodes. The * node will record the index into this array. See also kConstNameToConstant below, which is * the analog for Realy Functions. * - * Type: Array + * Type: Array */ constexpr const char* kConstants = "constants"; @@ -360,12 +360,12 @@ constexpr const char* kExternalMods = "external_mods"; constexpr const char* kSystemLibPrefix = "system_lib_prefix"; /*! - * \brief All the named runtime::NDArrays accumulated during compilation by external codegen. + * \brief All the named runtime::Tensors accumulated during compilation by external codegen. * Generally the associated runtime::Module will indicate it requires bindings for these names, * and during module initialization these bindings will be recovered from a ConstLoaderModule. * See also kConstantsArray above, which is the analog for PrimFuncs. * - * Type: Map + * Type: Map */ constexpr const char* kConstNameToConstant = "const_name_to_constant"; diff --git a/include/tvm/meta_schedule/builder.h b/include/tvm/meta_schedule/builder.h index 7e0be7de8265..a5c3fe5f2c5f 100644 --- a/include/tvm/meta_schedule/builder.h +++ b/include/tvm/meta_schedule/builder.h @@ -26,8 +26,8 @@ #include #include #include -#include #include +#include #include namespace tvm { @@ -41,7 +41,7 @@ class BuilderInputNode : public runtime::Object { /*! \brief The target to be built for. */ Target target; /*! \brief Parameters for Relax build module. */ - Optional> params; + Optional> params; static void RegisterReflection() { namespace refl = tvm::ffi::reflection; @@ -68,7 +68,7 @@ class BuilderInput : public runtime::ObjectRef { * \param params Parameters for Relax build module. */ TVM_DLL explicit BuilderInput(IRModule mod, Target target, - Optional> params = std::nullopt); + Optional> params = std::nullopt); TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(BuilderInput, runtime::ObjectRef, BuilderInputNode); }; diff --git a/include/tvm/meta_schedule/database.h b/include/tvm/meta_schedule/database.h index 29bc030c5b25..6c631a9eca74 100644 --- a/include/tvm/meta_schedule/database.h +++ b/include/tvm/meta_schedule/database.h @@ -192,10 +192,10 @@ class DatabaseNode : public runtime::Object { * \param mod_eq_name A string to specify the module equality testing and hashing method. * It must be one of the followings: * - "structural": Use StructuralEqual/Hash - * - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during + * - "ignore-tensor": Same as "structural", but ignore tensor raw data during * equality testing and hashing. * - "anchor-block": Apply equality testing and hashing on the anchor block extracted from a - * given module. The "ignore-ndarray" varint is used for the extracted blocks + * given module. The "ignore-tensor" varint is used for the extracted blocks * or in case no anchor block is found. 
* For the definition of the anchor block, see tvm/tir/analysis.h. */ @@ -291,10 +291,10 @@ class PyDatabaseNode : public DatabaseNode { * \param mod_eq_name A string to specify the module equality testing and hashing method. * It must be one of the followings: * - "structural": Use StructuralEqual/Hash - * - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during + * - "ignore-tensor": Same as "structural", but ignore tensor raw data during * equality testing and hashing. * - "anchor-block": Apply equality testing and hashing on the anchor block extracted from a - * given module. The "ignore-ndarray" varint is used for the extracted blocks + * given module. The "ignore-tensor" varint is used for the extracted blocks * or in case no anchor block is found. * For the definition of the anchor block, see tvm/tir/analysis.h. */ diff --git a/include/tvm/meta_schedule/feature_extractor.h b/include/tvm/meta_schedule/feature_extractor.h index 88bf056ebb6f..88fcf9ac618d 100644 --- a/include/tvm/meta_schedule/feature_extractor.h +++ b/include/tvm/meta_schedule/feature_extractor.h @@ -25,8 +25,8 @@ #include #include #include -#include #include +#include namespace tvm { namespace meta_schedule { @@ -47,10 +47,10 @@ class FeatureExtractorNode : public runtime::Object { * \brief Extract features from the given measure candidate. * \param context The tuning context for feature extraction. * \param candidates The measure candidates to extract features from. - * \return The feature ndarray extracted. + * \return The feature tensor extracted. */ - virtual Array ExtractFrom(const TuneContext& context, - const Array& candidates) = 0; + virtual Array ExtractFrom(const TuneContext& context, + const Array& candidates) = 0; static constexpr const char* _type_key = "meta_schedule.FeatureExtractor"; TVM_DECLARE_BASE_OBJECT_INFO(FeatureExtractorNode, Object); @@ -63,9 +63,9 @@ class PyFeatureExtractorNode : public FeatureExtractorNode { * \brief Extract features from the given measure candidate. * \param context The tuning context for feature extraction. * \param candidates The measure candidates to extract features from. - * \return The feature ndarray extracted. + * \return The feature tensor extracted. */ - using FExtractFrom = ffi::TypedFunction( + using FExtractFrom = ffi::TypedFunction( const TuneContext& context, const Array& candidates)>; /*! * \brief Get the feature extractor as string with name. @@ -83,8 +83,8 @@ class PyFeatureExtractorNode : public FeatureExtractorNode { // `f_as_string` is not registered } - Array ExtractFrom(const TuneContext& context, - const Array& candidates) final; + Array ExtractFrom(const TuneContext& context, + const Array& candidates) final; static constexpr const char* _type_key = "meta_schedule.PyFeatureExtractor"; TVM_DECLARE_FINAL_OBJECT_INFO(PyFeatureExtractorNode, FeatureExtractorNode); diff --git a/include/tvm/node/structural_hash.h b/include/tvm/node/structural_hash.h index 0aca92d0e28a..2c0c54db4121 100644 --- a/include/tvm/node/structural_hash.h +++ b/include/tvm/node/structural_hash.h @@ -25,7 +25,7 @@ #include #include -#include +#include #include #include diff --git a/include/tvm/relax/expr.h b/include/tvm/relax/expr.h index 22cda9e06635..e7198fcf2237 100644 --- a/include/tvm/relax/expr.h +++ b/include/tvm/relax/expr.h @@ -432,7 +432,7 @@ class DataflowVar : public Var { class ConstantNode : public LeafExprNode { public: /*! \brief The data of the tensor */ - runtime::NDArray data; + runtime::Tensor data; /*! 
\return The corresponding tensor type of the data */ TensorType tensor_type() const; @@ -458,7 +458,7 @@ class Constant : public LeafExpr { * If not specified, infer it from data. * \param span The source span of the expression. */ - TVM_DLL explicit Constant(runtime::NDArray data, + TVM_DLL explicit Constant(runtime::Tensor data, Optional struct_info_annotation = std::nullopt, Span span = Span()); diff --git a/include/tvm/runtime/disco/builtin.h b/include/tvm/runtime/disco/builtin.h index bc0faf2413e5..acd4a214ff7b 100644 --- a/include/tvm/runtime/disco/builtin.h +++ b/include/tvm/runtime/disco/builtin.h @@ -21,7 +21,7 @@ #include #include -#include +#include #include @@ -64,13 +64,13 @@ inline std::string ReduceKind2String(ReduceKind kind) { */ TVM_DLL ffi::Module LoadVMModule(std::string path, Optional device); /*! - * \brief Create an uninitialized empty NDArray - * \param shape The shape of the NDArray - * \param dtype The dtype of the NDArray - * \param device The device the NDArray is created on. If None, use the thread local default device - * \return The NDArray created + * \brief Create an uninitialized empty Tensor + * \param shape The shape of the Tensor + * \param dtype The dtype of the Tensor + * \param device The device the Tensor is created on. If None, use the thread local default device + * \return The Tensor created */ -TVM_DLL NDArray DiscoEmptyNDArray(ffi::Shape shape, DataType dtype, Optional device); +TVM_DLL Tensor DiscoEmptyTensor(ffi::Shape shape, DataType dtype, Optional device); /*! * \brief Perform an allreduce operation using the underlying communication library * \param send The array send to perform allreduce on @@ -78,21 +78,21 @@ TVM_DLL NDArray DiscoEmptyNDArray(ffi::Shape shape, DataType dtype, Optional send, bool in_group, NDArray recv); +TVM_DLL void ScatterFromWorker0(Optional send, bool in_group, Tensor recv); /*! * \brief Perform a gather operation to worker-0. * \param send The sending buffer, which must not be None. @@ -108,36 +108,36 @@ TVM_DLL void ScatterFromWorker0(Optional send, bool in_group, NDArray r * \param recv For worker-0, it must be provided, and otherwise, the buffer must be None. The * receiving buffer will be divided into equal parts and receive from each worker accordingly. */ -TVM_DLL void GatherToWorker0(NDArray send, bool in_group, Optional recv); +TVM_DLL void GatherToWorker0(Tensor send, bool in_group, Optional recv); /*! * \brief Receive a buffer from worker-0. No-op if the current worker is worker-0. * \param buffer The buffer to be received */ -TVM_DLL void RecvFromWorker0(NDArray buffer); +TVM_DLL void RecvFromWorker0(Tensor buffer); /*! * \brief Send a buffer to the corresponding worker in the next group. * An error is thrown if the worker is already in the last group. * \param buffer The sending buffer. */ -TVM_DLL void SendToNextGroup(NDArray buffer); +TVM_DLL void SendToNextGroup(Tensor buffer); /*! * \brief Receive a buffer from the corresponding worker in the previous group. * An error is thrown if the worker is already in the first group. * \param buffer The receiving buffer. */ -TVM_DLL void RecvFromPrevGroup(NDArray buffer); +TVM_DLL void RecvFromPrevGroup(Tensor buffer); /*! * \brief Send a buffer to the target receiver worker (globally across all groups). * \param buffer The sending buffer. * \param receiver_id The global receiver worker id. */ -TVM_DLL void SendToWorker(NDArray buffer, int receiver_id); +TVM_DLL void SendToWorker(Tensor buffer, int receiver_id); /*! 
* \brief Receive a buffer from the target sender worker (globally across all groups). * \param buffer The receiving buffer. * \param sender_id The global sender worker id. */ -TVM_DLL void RecvFromWorker(NDArray buffer, int sender_id); +TVM_DLL void RecvFromWorker(Tensor buffer, int sender_id); /*! \brief Get the local worker id */ TVM_DLL int WorkerId(); /*! diff --git a/include/tvm/runtime/disco/session.h b/include/tvm/runtime/disco/session.h index 4fe0e72e79c1..72ac577d52d4 100644 --- a/include/tvm/runtime/disco/session.h +++ b/include/tvm/runtime/disco/session.h @@ -46,7 +46,7 @@ * It is assumed that the controler can synchronize with and access the registers of worker-0. * The Disco session provides multiple APIs to interact specifically with the worker-0. * To shared data with other workers, a common paradigm in Disco is to copy data from the - * controler-side NDArray to the worker-0, and then copy it to other workers using primitives on + * controler-side Tensor to the worker-0, and then copy it to other workers using primitives on * the data plane, for example, `broadcast` and `send`. * * **Control plane.** The controler broadcasts commands to all the workers as control signals. @@ -74,8 +74,8 @@ #include #include -#include #include +#include #include #include @@ -143,9 +143,9 @@ class DRefObj : public Object { */ inline ffi::Any DebugGetFromRemote(int worker_id); /*! - * \brief Copy from the NDArray provided to a remote worker. + * \brief Copy from the Tensor provided to a remote worker. * \param worker_id The id of the worker to be copied to. - * \param source The NDArray to be copied. + * \param source The Tensor to be copied. */ inline void DebugCopyFrom(int worker_id, ffi::AnyView source); @@ -189,7 +189,7 @@ class SessionObj : public Object { * - std::string; * - DRef. * Examples of unsupported types: - * - NDArray, DLTensor; + * - Tensor, DLTensor; * - TVM Objects, including ffi::Function, Module and String; * \param func The function to be called. * \param args The variadic arguments. @@ -209,17 +209,17 @@ class SessionObj : public Object { /*! \brief Get a global functions on workers. */ TVM_DLL virtual DRef GetGlobalFunc(const std::string& name) = 0; /*! - * \brief Copy an NDArray from worker-0 to the controler-side NDArray + * \brief Copy an Tensor from worker-0 to the controler-side Tensor * \param host_array The array to be copied to worker-0 - * \param remote_array The NDArray on worker-0 + * \param remote_array The Tensor on worker-0 */ - TVM_DLL virtual void CopyFromWorker0(const NDArray& host_array, const DRef& remote_array) = 0; + TVM_DLL virtual void CopyFromWorker0(const Tensor& host_array, const DRef& remote_array) = 0; /*! - * \brief Copy the controler-side NDArray to worker-0 + * \brief Copy the controler-side Tensor to worker-0 * \param host_array The array to be copied to worker-0 - * \param remote_array The NDArray on worker-0 + * \param remote_array The Tensor on worker-0 */ - TVM_DLL virtual void CopyToWorker0(const NDArray& host_array, const DRef& remote_array) = 0; + TVM_DLL virtual void CopyToWorker0(const Tensor& host_array, const DRef& remote_array) = 0; /*! * \brief Synchrnoize the controler with a worker, and it will wait until worker finishes * executing this instruction. @@ -319,7 +319,7 @@ class WorkerZeroData { * \brief The host-side arrays to passed to worker-0 for special uses, for example, * copy-to-worker0 and copy-from-worker0 */ - std::queue host_arrays; + std::queue host_arrays; /*! 
\brief The mutex that guards `host_arrays` */ std::mutex queue_mutex_; }; diff --git a/include/tvm/runtime/memory/memory_manager.h b/include/tvm/runtime/memory/memory_manager.h index f103c6f30ac8..a10bc6b36e04 100644 --- a/include/tvm/runtime/memory/memory_manager.h +++ b/include/tvm/runtime/memory/memory_manager.h @@ -25,8 +25,8 @@ #define TVM_RUNTIME_MEMORY_MEMORY_MANAGER_H_ #include -#include #include +#include #include #include @@ -59,15 +59,15 @@ class Allocator { public: explicit Allocator(AllocatorType type) : type_(type) {} virtual ~Allocator() = default; - /*! \brief Allocate an empty NDArray using from the allocator. - * \param shape The shape of the NDArray. - * \param dtype The datatype of the NDArray. + /*! \brief Allocate an empty Tensor using from the allocator. + * \param shape The shape of the Tensor. + * \param dtype The datatype of the Tensor. * \param dev The device where the array is allocated. * \param mem_scope The device memory scope hint. - * \return The empty NDArray. + * \return The empty Tensor. */ - TVM_DLL NDArray Empty(ffi::Shape shape, DLDataType dtype, Device dev, - Optional mem_scope = std::nullopt); + TVM_DLL Tensor Empty(ffi::Shape shape, DLDataType dtype, Device dev, + Optional mem_scope = std::nullopt); /*! \brief Return the allocator type. */ inline AllocatorType type() const { return type_; } /*! \brief Allocate a buffer given a size, alignment and type. @@ -163,12 +163,12 @@ class StorageObj : public Object { /*! \brief The allocator where the storage buffer is allocated from. */ Allocator* allocator = nullptr; - /*! \brief Allocate an NDArray from a given piece of storage. */ - TVM_DLL NDArray AllocNDArray(int64_t offset, ffi::Shape shape, DLDataType dtype); + /*! \brief Allocate an Tensor from a given piece of storage. */ + TVM_DLL Tensor AllocTensor(int64_t offset, ffi::Shape shape, DLDataType dtype); - /*! \brief Allocate an NDArray with memory scope from a given piece of storage. */ - TVM_DLL NDArray AllocNDArrayScoped(int64_t offset, ffi::Shape shape, DLDataType dtype, - String scope = "global"); + /*! \brief Allocate an Tensor with memory scope from a given piece of storage. */ + TVM_DLL Tensor AllocTensorScoped(int64_t offset, ffi::Shape shape, DLDataType dtype, + String scope = "global"); ~StorageObj() { if (allocator) { diff --git a/include/tvm/runtime/object.h b/include/tvm/runtime/object.h index 302b161b6fd7..9da9467e8ff2 100644 --- a/include/tvm/runtime/object.h +++ b/include/tvm/runtime/object.h @@ -52,8 +52,8 @@ enum TypeIndex : int32_t { // Frontends can take benefit of these constants. /*! \brief runtime::Module. */ kRuntimeModule = TVMFFITypeIndex::kTVMFFIModule, - /*! \brief runtime::NDArray. */ - kRuntimeNDArray = TVMFFITypeIndex::kTVMFFINDArray, + /*! \brief runtime::Tensor. */ + kRuntimeTensor = TVMFFITypeIndex::kTVMFFITensor, /*! \brief runtime::Shape. */ kRuntimeShape = TVMFFITypeIndex::kTVMFFIShape, // Extra builtin static index here diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h index 9f25b6775c13..88a22c981652 100644 --- a/include/tvm/runtime/profiling.h +++ b/include/tvm/runtime/profiling.h @@ -30,8 +30,8 @@ #include #include #include -#include #include +#include #include #include @@ -490,17 +490,17 @@ class RatioNode : public Object { TVM_DECLARE_FINAL_OBJECT_INFO(RatioNode, Object); }; -/*! \brief String representation of an array of NDArray shapes - * \param shapes Array of NDArrays to get the shapes of. +/*! 
\brief String representation of an array of Tensor shapes + * \param shapes Array of Tensors to get the shapes of. * \return A textual representation of the shapes. For example: `float32[2], int64[1, 2]`. */ -String ShapeString(const std::vector& shapes); -/*! \brief String representation of shape encoded as an NDArray - * \param shape NDArray containing the shape. +String ShapeString(const std::vector& shapes); +/*! \brief String representation of shape encoded as an Tensor + * \param shape Tensor containing the shape. * \param dtype The dtype of the shape. * \return A textual representation of the shape. For example: `float32[2]`. */ -String ShapeString(NDArray shape, DLDataType dtype); +String ShapeString(Tensor shape, DLDataType dtype); /*! \brief String representation of a shape encoded as a vector * \param shape Shape as a vector of integers. * \param dtype The dtype of the shape. diff --git a/include/tvm/runtime/serializer.h b/include/tvm/runtime/serializer.h index 2cfd1de44dde..c8e9d3c435f0 100644 --- a/include/tvm/runtime/serializer.h +++ b/include/tvm/runtime/serializer.h @@ -28,7 +28,7 @@ #include #include #include -#include +#include namespace dmlc { namespace serializer { diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/tensor.h similarity index 78% rename from include/tvm/runtime/ndarray.h rename to include/tvm/runtime/tensor.h index 9a295e491e82..9536dd2005c5 100644 --- a/include/tvm/runtime/ndarray.h +++ b/include/tvm/runtime/tensor.h @@ -18,14 +18,14 @@ */ /*! - * \file tvm/runtime/ndarray.h - * \brief A device-independent managed NDArray abstraction. + * \file tvm/runtime/tensor.h + * \brief A device-independent managed Tensor abstraction. */ -#ifndef TVM_RUNTIME_NDARRAY_H_ -#define TVM_RUNTIME_NDARRAY_H_ +#ifndef TVM_RUNTIME_TENSOR_H_ +#define TVM_RUNTIME_TENSOR_H_ -#include #include +#include #include #include #include @@ -47,31 +47,31 @@ using ffi::IsAligned; using ffi::IsContiguous; /*! - * \brief Managed NDArray. + * \brief Managed Tensor. * The array is backed by reference counted blocks. */ -class NDArray : public tvm::ffi::NDArray { +class Tensor : public tvm::ffi::Tensor { public: - using Container = ffi::NDArrayObj; - NDArray() = default; + using Container = ffi::TensorObj; + Tensor() = default; /*! * \brief constructor. * \param data ObjectPtr to the data container. */ - explicit NDArray(ObjectPtr data) : tvm::ffi::NDArray(data) {} - NDArray(ffi::NDArray&& other) : tvm::ffi::NDArray(std::move(other)) {} // NOLINT(*) - NDArray(const ffi::NDArray& other) : tvm::ffi::NDArray(other) {} // NOLINT(*) + explicit Tensor(ObjectPtr data) : tvm::ffi::Tensor(data) {} + Tensor(ffi::Tensor&& other) : tvm::ffi::Tensor(std::move(other)) {} // NOLINT(*) + Tensor(const ffi::Tensor& other) : tvm::ffi::Tensor(other) {} // NOLINT(*) ffi::Shape Shape() const { return this->shape(); } runtime::DataType DataType() const { return runtime::DataType(this->dtype()); } // DLPack handling - static NDArray FromDLPack(DLManagedTensor* tensor) { - return tvm::ffi::NDArray::FromDLPack(tensor, kAllocAlignment, true); + static Tensor FromDLPack(DLManagedTensor* tensor) { + return tvm::ffi::Tensor::FromDLPack(tensor, kAllocAlignment, true); } - static NDArray FromDLPackVersioned(DLManagedTensorVersioned* tensor) { - return tvm::ffi::NDArray::FromDLPackVersioned(tensor, kAllocAlignment, true); + static Tensor FromDLPackVersioned(DLManagedTensorVersioned* tensor) { + return tvm::ffi::Tensor::FromDLPackVersioned(tensor, kAllocAlignment, true); } /*! 
* \brief Copy data content from another array. @@ -80,12 +80,12 @@ class NDArray : public tvm::ffi::NDArray { * TVMSynchronize is necessary. */ inline void CopyFrom(const DLTensor* other); - inline void CopyFrom(const NDArray& other); + inline void CopyFrom(const Tensor& other); /*! * \brief Copy data content from a byte buffer. * \param data The source bytes to be copied from. * \param nbytes The size of the buffer in bytes - * Must be equal to the size of the NDArray. + * Must be equal to the size of the Tensor. * \note The copy always triggers a TVMSynchronize. */ TVM_DLL void CopyFromBytes(const void* data, size_t nbytes); @@ -96,12 +96,12 @@ class NDArray : public tvm::ffi::NDArray { * TVMSynchronize is necessary. */ inline void CopyTo(DLTensor* other) const; - inline void CopyTo(const NDArray& other) const; + inline void CopyTo(const Tensor& other) const; /*! * \brief Copy data content into another array. * \param data The source bytes to be copied from. * \param nbytes The size of the data buffer. - * Must be equal to the size of the NDArray. + * Must be equal to the size of the Tensor. * \note The copy always triggers a TVMSynchronize. */ TVM_DLL void CopyToBytes(void* data, size_t nbytes) const; @@ -112,27 +112,27 @@ class NDArray : public tvm::ffi::NDArray { * \return The array under another device. * \note The copy always triggers a TVMSynchronize. */ - TVM_DLL NDArray CopyTo(const Device& dev, Optional mem_scope = std::nullopt) const; + TVM_DLL Tensor CopyTo(const Device& dev, Optional mem_scope = std::nullopt) const; /*! - * \brief Load NDArray from stream + * \brief Load Tensor from stream * \param stream The input data stream * \return Whether load is successful */ inline bool Load(dmlc::Stream* stream); /*! - * \brief Save NDArray to stream + * \brief Save Tensor to stream * \param stream The output data stream */ inline void Save(dmlc::Stream* stream) const; /*! - * \brief Create a NDArray that shares the data memory with the current one. + * \brief Create a Tensor that shares the data memory with the current one. * * \param shape The shape of the new array. * * \param dtype The data type of the new array. * - * \param relative_byte_offset The offset of the output NDArray, + * \param relative_byte_offset The offset of the output Tensor, * relative to the current byte offset. * * By default, the offset of the view is the same as the offset @@ -145,18 +145,18 @@ class NDArray : public tvm::ffi::NDArray { * outside the bounds of the current array, this function will * raise an exception. */ - TVM_DLL NDArray CreateView(ffi::Shape shape, DLDataType dtype, - uint64_t relative_byte_offset = 0) const; + TVM_DLL Tensor CreateView(ffi::Shape shape, DLDataType dtype, + uint64_t relative_byte_offset = 0) const; /*! - * \brief Create an empty NDArray. + * \brief Create an empty Tensor. * \param shape The shape of the new array. * \param dtype The data type of the new array. * \param dev The device of the array. * \param mem_scope The memory scope of the array. * \return The created Array */ - TVM_DLL static NDArray Empty(ffi::Shape shape, DLDataType dtype, Device dev, - Optional mem_scope = std::nullopt); + TVM_DLL static Tensor Empty(ffi::Shape shape, DLDataType dtype, Device dev, + Optional mem_scope = std::nullopt); /*! * \brief Function to copy data from one array to another. * \param from The source array. 
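
Editor's note: the hunk above declares the renamed `runtime::Tensor` wrapper, with `Empty`, `CopyFromBytes`, and the DLPack helpers keeping their previous NDArray semantics. A minimal host-side sketch is below; the DLPack enums and `ffi::Shape` construction are assumptions drawn from the surrounding hunks, not a snippet from the TVM test suite.

```cpp
// Minimal sketch, assuming the header path introduced by this diff.
#include <tvm/runtime/tensor.h>

#include <vector>

void FillSmallTensor() {
  using tvm::runtime::Tensor;
  DLDataType f32{kDLFloat, 32, 1};
  DLDevice cpu{kDLCPU, 0};
  // Empty() and CopyFromBytes() keep the semantics of the old NDArray
  // methods; only the class name changes in this rename.
  Tensor t = Tensor::Empty(tvm::ffi::Shape({2, 3}), f32, cpu);
  std::vector<float> host(6, 1.0f);
  t.CopyFromBytes(host.data(), host.size() * sizeof(float));
}
```
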
@@ -184,33 +184,33 @@ class NDArray : public tvm::ffi::NDArray { */ inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor); -inline void NDArray::CopyFrom(const DLTensor* other) { +inline void Tensor::CopyFrom(const DLTensor* other) { ICHECK(data_ != nullptr); CopyFromTo(other, get_mutable()); } -inline void NDArray::CopyFrom(const NDArray& other) { +inline void Tensor::CopyFrom(const Tensor& other) { ICHECK(data_ != nullptr); ICHECK(other.data_ != nullptr); CopyFromTo(other.get_mutable(), get_mutable()); } -inline void NDArray::CopyTo(DLTensor* other) const { +inline void Tensor::CopyTo(DLTensor* other) const { ICHECK(data_ != nullptr); CopyFromTo(get_mutable(), other); } -inline void NDArray::CopyTo(const NDArray& other) const { +inline void Tensor::CopyTo(const Tensor& other) const { ICHECK(data_ != nullptr); ICHECK(other.data_ != nullptr); CopyFromTo(get_mutable(), other.get_mutable()); } -/*! \brief Magic number for NDArray file */ -constexpr uint64_t kTVMNDArrayMagic = 0xDD5E40F096B4A13F; +/*! \brief Magic number for Tensor file */ +constexpr uint64_t kTVMTensorMagic = 0xDD5E40F096B4A13F; inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor) { - uint64_t header = kTVMNDArrayMagic, reserved = 0; + uint64_t header = kTVMTensorMagic, reserved = 0; strm->Write(header); strm->Write(reserved); // Always save data as CPU context @@ -244,7 +244,7 @@ inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor) { strm->Write(tensor->data, data_byte_size); } else { std::vector bytes(data_byte_size); - NDArray::CopyToBytes(const_cast(tensor), dmlc::BeginPtr(bytes), data_byte_size); + Tensor::CopyToBytes(const_cast(tensor), dmlc::BeginPtr(bytes), data_byte_size); if (!DMLC_IO_NO_ENDIAN_SWAP) { dmlc::ByteSwap(dmlc::BeginPtr(bytes), type_bytes, num_elems); } @@ -253,13 +253,13 @@ inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor) { return true; } -inline void NDArray::Save(dmlc::Stream* strm) const { SaveDLTensor(strm, operator->()); } +inline void Tensor::Save(dmlc::Stream* strm) const { SaveDLTensor(strm, operator->()); } -inline bool NDArray::Load(dmlc::Stream* strm) { +inline bool Tensor::Load(dmlc::Stream* strm) { uint64_t header, reserved; ICHECK(strm->Read(&header)) << "Invalid DLTensor file format"; ICHECK(strm->Read(&reserved)) << "Invalid DLTensor file format"; - ICHECK(header == kTVMNDArrayMagic) << "Invalid DLTensor file format"; + ICHECK(header == kTVMTensorMagic) << "Invalid DLTensor file format"; Device dev; int ndim; DLDataType dtype; @@ -271,7 +271,7 @@ inline bool NDArray::Load(dmlc::Stream* strm) { if (ndim != 0) { ICHECK(strm->ReadArray(&shape[0], ndim)) << "Invalid DLTensor file format"; } - NDArray ret = NDArray::Empty(ffi::Shape(shape), dtype, dev); + Tensor ret = Tensor::Empty(ffi::Shape(shape), dtype, dev); int64_t num_elems = 1; int elem_bytes = (ret->dtype.bits + 7) / 8; for (int i = 0; i < ret->ndim; ++i) { @@ -328,4 +328,4 @@ struct equal_to { }; } // namespace std -#endif // TVM_RUNTIME_NDARRAY_H_ +#endif // TVM_RUNTIME_TENSOR_H_ diff --git a/include/tvm/runtime/vm/ndarray_cache_support.h b/include/tvm/runtime/vm/tensor_cache_support.h similarity index 68% rename from include/tvm/runtime/vm/ndarray_cache_support.h rename to include/tvm/runtime/vm/tensor_cache_support.h index 3ab08df04389..d2112cc83f4e 100644 --- a/include/tvm/runtime/vm/ndarray_cache_support.h +++ b/include/tvm/runtime/vm/tensor_cache_support.h @@ -16,12 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ -#ifndef TVM_RUNTIME_VM_NDARRAY_CACHE_SUPPORT_H_ -#define TVM_RUNTIME_VM_NDARRAY_CACHE_SUPPORT_H_ +#ifndef TVM_RUNTIME_VM_TENSOR_CACHE_SUPPORT_H_ +#define TVM_RUNTIME_VM_TENSOR_CACHE_SUPPORT_H_ #include #include -#include +#include #include #include @@ -32,10 +32,10 @@ namespace runtime { namespace vm { /*! - * \brief Metadata for NDArray cache, which by default, is named as "ndarray-cache.json". + * \brief Metadata for Tensor cache, which by default, is named as "tensor-cache.json". */ -struct NDArrayCacheMetadata { - /*! \brief Each shard of NDArray cache, which by default, is named as "params_shard_x.bin". */ +struct TensorCacheMetadata { + /*! \brief Each shard of Tensor cache, which by default, is named as "params_shard_x.bin". */ struct FileRecord { /*! \brief Metadata of each parameter */ struct ParamRecord { @@ -46,8 +46,8 @@ struct NDArrayCacheMetadata { * \param staging_buffer The buffer to be used to avoid extra OpenCL copies. Pass in a nullptr * in other cases */ - TVM_DLL NDArray Load(Device device, const std::string* raw_data, - Optional* staging_buffer = nullptr) const; + TVM_DLL Tensor Load(Device device, const std::string* raw_data, + Optional* staging_buffer = nullptr) const; /*! \brief Name of the parameter */ std::string name; @@ -64,10 +64,10 @@ struct NDArrayCacheMetadata { }; /*! \brief Load a FileRecord into memory */ - TVM_DLL Array Load(Device device, // - const std::string& path_prefix, // - std::string* raw_data_buffer, // - Optional* staging_buffer = nullptr) const; + TVM_DLL Array Load(Device device, // + const std::string& path_prefix, // + std::string* raw_data_buffer, // + Optional* staging_buffer = nullptr) const; /*! \brief Relative path to the bin file */ std::string data_path; @@ -78,19 +78,19 @@ struct NDArrayCacheMetadata { /*! \brief The parameters in the file */ std::vector records; }; - /*! \brief The files in the NDArray cache */ + /*! \brief The files in the Tensor cache */ std::vector records; - /*! \brief The path to the `ndarray-cache.json` file */ + /*! \brief The path to the `tensor-cache.json` file */ std::string path; /*! \brief Load the metadata from a specific directory */ - TVM_DLL static NDArrayCacheMetadata Load(const std::string& path); + TVM_DLL static TensorCacheMetadata Load(const std::string& path); /*! \brief Load the metadata from a given JSON string */ - static NDArrayCacheMetadata LoadFromStr(const std::string& json_str, const std::string& path); + static TensorCacheMetadata LoadFromStr(const std::string& json_str, const std::string& path); }; } // namespace vm } // namespace runtime } // namespace tvm -#endif // TVM_RUNTIME_VM_NDARRAY_CACHE_SUPPORT_H_ +#endif // TVM_RUNTIME_VM_TENSOR_CACHE_SUPPORT_H_ diff --git a/include/tvm/script/ir_builder/tir/frame.h b/include/tvm/script/ir_builder/tir/frame.h index 1e205edc43f3..52173a8d8a4f 100644 --- a/include/tvm/script/ir_builder/tir/frame.h +++ b/include/tvm/script/ir_builder/tir/frame.h @@ -510,7 +510,7 @@ class AllocateConstFrameNode : public TIRFrameNode { /*! \brief The extents of the allocate. */ Array extents; /*! \brief The data associated with the constant. */ - tvm::runtime::NDArray data; + tvm::runtime::Tensor data; /*! \brief The buffer var */ tvm::tir::Var buffer_var; /*! \brief Additional annotations about the allocation. 
*/ diff --git a/include/tvm/script/ir_builder/tir/ir.h b/include/tvm/script/ir_builder/tir/ir.h index 30b5bb3382f4..6894bfa1fb58 100644 --- a/include/tvm/script/ir_builder/tir/ir.h +++ b/include/tvm/script/ir_builder/tir/ir.h @@ -28,7 +28,7 @@ namespace script { namespace ir_builder { namespace tir { -using tvm::runtime::NDArray; +using tvm::runtime::Tensor; using tvm::tir::Buffer; using tvm::tir::Var; @@ -323,7 +323,7 @@ AllocateFrame Allocate(Array extents, DataType dtype, String storage_s * \param annotations Additional annotation hints. * \return The created AllocateConstFrame. */ -AllocateConstFrame AllocateConst(NDArray data, DataType dtype, Array extents, +AllocateConstFrame AllocateConst(Tensor data, DataType dtype, Array extents, Optional> annotations = std::nullopt); /*! diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h index d3573c925daf..a48a8909c4d3 100644 --- a/include/tvm/tir/builtin.h +++ b/include/tvm/tir/builtin.h @@ -337,7 +337,7 @@ TVM_DLL const Op& tvm_stack_alloca(); TVM_DLL const Op& tvm_stack_make_shape(); /*! - * \brief Allocate a NDArray(DLTensor) on stack, return the handle. + * \brief Allocate a Tensor(DLTensor) on stack, return the handle. * * Type tvm_stack_make_array(Expr data, * Expr shape, diff --git a/include/tvm/tir/function.h b/include/tvm/tir/function.h index 6ea50e9ae0f0..21a97f986d4f 100644 --- a/include/tvm/tir/function.h +++ b/include/tvm/tir/function.h @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/tvm/tir/index_map.h b/include/tvm/tir/index_map.h index 518d7602f562..7c8c9c30c7b5 100644 --- a/include/tvm/tir/index_map.h +++ b/include/tvm/tir/index_map.h @@ -135,13 +135,13 @@ class IndexMapNode : public Object { */ Array MapShape(const Array& shape, arith::Analyzer* analyzer) const; - /* \brief Map an NDArray according to this index map + /* \brief Map an Tensor according to this index map * - * \param arr_src The NDArray whose layout is transformed by this index map. + * \param arr_src The Tensor whose layout is transformed by this index map. * - * \returns The transformed NDArray. + * \returns The transformed Tensor. */ - runtime::NDArray MapNDArray(runtime::NDArray arr_src) const; + runtime::Tensor MapTensor(runtime::Tensor arr_src) const; /*! * \brief Convert to string representation in Python. diff --git a/include/tvm/tir/stmt.h b/include/tvm/tir/stmt.h index bbdb7c272ed8..b8c7ea594abe 100644 --- a/include/tvm/tir/stmt.h +++ b/include/tvm/tir/stmt.h @@ -363,10 +363,10 @@ class AllocateConstNode : public StmtNode { Var buffer_var; /*! \brief The optional data associated to the constant. */ - Optional data; + Optional data; /*! * \brief If the PrimFunc containing the Stmt is added to IRModule, this is an optional index - * to indicate the index within "constants" attribute, that is a Array of IRModule. + * to indicate the index within "constants" attribute, that is a Array of IRModule. */ Optional irmod_storage_idx; /*! \brief The type of the buffer. */ diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h index eb64d87f9518..bd6a5d537239 100644 --- a/include/tvm/tir/transform.h +++ b/include/tvm/tir/transform.h @@ -676,7 +676,7 @@ TVM_DLL Pass UnifiedStaticMemoryPlanner(); */ TVM_DLL Pass InjectSoftwarePipeline(); -TVM_DLL Pass BindParams(const Array& constants); +TVM_DLL Pass BindParams(const Array& constants); /*! * \brief Pass to collect tir non-scalar constants into module's 'Constants' attribute. 
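
Editor's note: the tir/transform.h hunk above changes the element type accepted by `BindParams` from NDArray to Tensor (the template arguments are stripped in this diff rendering, so the exact spelling is inferred). A hedged caller sketch follows; the `tvm::Array` spelling, namespaces, and headers are assumptions.

```cpp
// Hedged sketch: element type and namespaces are inferred, since the
// template arguments are stripped in this rendering.
#include <tvm/runtime/tensor.h>
#include <tvm/tir/transform.h>

tvm::transform::Pass BindSingleWeight(const tvm::runtime::Tensor& weight) {
  tvm::Array<tvm::runtime::Tensor> constants;  // previously held runtime::NDArray
  constants.push_back(weight);
  return tvm::tir::transform::BindParams(constants);
}
```
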
@@ -729,17 +729,17 @@ TVM_DLL Pass InjectPTXLDG32(bool enable_ptx_ldg32 = true); /*! * \brief Remove the weight layout rewrite block - * \param skip_ndarray_rewrite If True, exact rewrite of NDArray, according to the given index map, - * will be skipped. Only the shape of the NDArray is transformed correctly, and the content of + * \param skip_tensor_rewrite If True, exact rewrite of Tensor, according to the given index map, + * will be skipped. Only the shape of the Tensor is transformed correctly, and the content of * the destination array will be filled with random values. * - * When this pass is called many times during MetaSchedule tuning, the raw data of NDArray, - * before and after rewrite, does not matter. Since NDArray layout rewrite, using IndexMap's - * MapNDArray, is currently slow, skipping the exact rewrite is sometimes necessary. + * When this pass is called many times during MetaSchedule tuning, the raw data of Tensor, + * before and after rewrite, does not matter. Since Tensor layout rewrite, using IndexMap's + * MapTensor, is currently slow, skipping the exact rewrite is sometimes necessary. * * \return The pass. */ -TVM_DLL Pass RemoveWeightLayoutRewriteBlock(bool skip_ndarray_rewrite = false); +TVM_DLL Pass RemoveWeightLayoutRewriteBlock(bool skip_tensor_rewrite = false); /*! * \brief Add the explicit local stage for the shared memory access on GPU. diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h index df637f6f5862..71b1bd3b8d25 100644 --- a/include/tvm/topi/transform.h +++ b/include/tvm/topi/transform.h @@ -706,8 +706,8 @@ inline PrimExpr GetLength(PrimExpr begin, PrimExpr end, PrimExpr stride, PrimExp * * \return A Tensor whose op member is the dynamic_strided_slice operation */ -inline Tensor dynamic_strided_slice_with_axes( - const Tensor& x, const Array& begin, const Array& end, +inline te::Tensor dynamic_strided_slice_with_axes( + const te::Tensor& x, const Array& begin, const Array& end, const Array& strides, const Array& axes, bool assume_inbound = true, std::string name = "T_dynamic_strided_slice_with_axes", std::string tag = kInjective) { const size_t src_tensor_dim = x->shape.size(); @@ -1967,13 +1967,13 @@ inline Tensor shape(const Tensor& src, DataType dtype, const std::string name = * \param tag output tensor tag. * \return Tensor of input shape. */ -inline Tensor ndarray_size(const Tensor& src, const DataType& dtype, - const std::string& name = "ndarray_size", - const std::string& tag = kInjective) { +inline te::Tensor tensor_size(const te::Tensor& src, const DataType& dtype, + const std::string& name = "tensor_size", + const std::string& tag = kInjective) { int ndim = static_cast(src->shape.size()); - Array out_ndarray_size = {}; + Array out_tensor_size = {}; return compute( - out_ndarray_size, + out_tensor_size, [&](const Array& indices) { PrimExpr ret = 1; for (int i = 0; i < ndim; ++i) { diff --git a/jvm/README.md b/jvm/README.md index 71c737a4d00a..355a17a7b266 100644 --- a/jvm/README.md +++ b/jvm/README.md @@ -19,7 +19,7 @@ This folder contains the Java interface for TVM runtime. It brings TVM runtime to Java virtual machine. -- It enables you to construct NDArray from Java native array and vice versa. +- It enables you to construct Tensor from Java native array and vice versa. - You can register and convert Java native functions to TVM functions. - It enables you to load shared libraries created by Python and C++. - It provides a simple interface for RPC server and client. 
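
Editor's note: referring back to the tir/transform.h hunk above, `RemoveWeightLayoutRewriteBlock` keeps its behaviour and only renames its flag from `skip_ndarray_rewrite` to `skip_tensor_rewrite`. A small hedged sketch of constructing the pass follows; the namespace is assumed from context.

```cpp
// Hedged sketch: only the parameter name changes in this diff.
#include <tvm/tir/transform.h>

tvm::transform::Pass TuningFriendlyLayoutRewriteRemoval() {
  // Passing true skips the exact data rewrite (shapes are still transformed),
  // the MetaSchedule-tuning shortcut described in the comment above.
  return tvm::tir::transform::RemoveWeightLayoutRewriteBlock(
      /*skip_tensor_rewrite=*/true);
}
```
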
@@ -95,7 +95,7 @@ The following code snippet demonstrate how to load generated shared library (add ```java import org.apache.tvm.Module; -import org.apache.tvm.NDArray; +import org.apache.tvm.Tensor; import org.apache.tvm.Device; import java.io.File; @@ -109,9 +109,9 @@ public class LoadAddFunc { Device dev = Device.cpu(); long[] shape = new long[]{2}; - NDArray arr = NDArray.empty(shape, dev); + Tensor arr = Tensor.empty(shape, dev); arr.copyFrom(new float[]{3f, 4f}); - NDArray res = NDArray.empty(shape, dev); + Tensor res = Tensor.empty(shape, dev); fadd.entryFunc().pushArg(arr).pushArg(arr).pushArg(res).invoke(); System.out.println(Arrays.toString(res.asFloatArray())); diff --git a/jvm/core/src/main/java/org/apache/tvm/Function.java b/jvm/core/src/main/java/org/apache/tvm/Function.java index ee6b8e8cf5c5..29e105dee9f5 100644 --- a/jvm/core/src/main/java/org/apache/tvm/Function.java +++ b/jvm/core/src/main/java/org/apache/tvm/Function.java @@ -138,12 +138,12 @@ public Function pushArg(String arg) { /** * Push argument to the function. - * @param arg NDArray. + * @param arg Tensor. * @return this */ - public Function pushArg(NDArrayBase arg) { - if (arg instanceof NDArray) { - Base._LIB.tvmFFIFunctionPushArgHandle(((NDArray) arg).handle, TypeIndex.kTVMFFINDArray); + public Function pushArg(TensorBase arg) { + if (arg instanceof Tensor) { + Base._LIB.tvmFFIFunctionPushArgHandle(((Tensor) arg).handle, TypeIndex.kTVMFFITensor); } else { Base._LIB.tvmFFIFunctionPushArgHandle(arg.dltensorHandle, TypeIndex.kTVMFFIDLTensorPtr); } @@ -192,7 +192,7 @@ public Function pushArg(Device arg) { /** * Invoke function with arguments. - * @param args Can be Integer, Long, Float, Double, String, NDArray. + * @param args Can be Integer, Long, Float, Double, String, Tensor. * @return the result. */ public TVMValue call(Object... args) { @@ -203,10 +203,10 @@ public TVMValue call(Object... 
args) { } private static void pushArgToStack(Object arg) { - if (arg instanceof NDArrayBase) { - NDArrayBase nd = (NDArrayBase) arg; - if (nd instanceof NDArray) { - Base._LIB.tvmFFIFunctionPushArgHandle(((NDArray) nd).handle, TypeIndex.kTVMFFINDArray); + if (arg instanceof TensorBase) { + TensorBase nd = (TensorBase) arg; + if (nd instanceof Tensor) { + Base._LIB.tvmFFIFunctionPushArgHandle(((Tensor) nd).handle, TypeIndex.kTVMFFITensor); } else { Base._LIB.tvmFFIFunctionPushArgHandle(nd.dltensorHandle, TypeIndex.kTVMFFIDLTensorPtr); } diff --git a/jvm/core/src/main/java/org/apache/tvm/LibInfo.java b/jvm/core/src/main/java/org/apache/tvm/LibInfo.java index f471883ca5bc..a1e15a873a60 100644 --- a/jvm/core/src/main/java/org/apache/tvm/LibInfo.java +++ b/jvm/core/src/main/java/org/apache/tvm/LibInfo.java @@ -52,7 +52,7 @@ class LibInfo { native int tvmFFIFunctionCreateFromCallback(Function.Callback function, Base.RefLong handle); - // NDArray + // Tensor native int tvmFFIDLTensorGetShape(long handle, List shape); native int tvmFFIDLTensorCopyFromTo(long from, long to); @@ -67,7 +67,7 @@ class LibInfo { // Device native int tvmSynchronize(int deviceType, int deviceId); - native int tvmNDArrayEmpty(long[] shape, int dtypeCode, int dtypeBits, + native int tvmTensorEmpty(long[] shape, int dtypeCode, int dtypeBits, int dtypeLanes, int deviceType, int deviceId, Base.RefLong handle); } diff --git a/jvm/core/src/main/java/org/apache/tvm/TVMType.java b/jvm/core/src/main/java/org/apache/tvm/TVMType.java index 1c2719eeca90..658fdaedc1e5 100644 --- a/jvm/core/src/main/java/org/apache/tvm/TVMType.java +++ b/jvm/core/src/main/java/org/apache/tvm/TVMType.java @@ -31,7 +31,7 @@ public class TVMType { /** * TVMType constructor. * @param typeStr type name, e.g., "float32", "float64", "uint8", etc. - * @param lanes NDArray lanes. + * @param lanes Tensor lanes. */ public TVMType(String typeStr, int lanes) { this.lanes = lanes; diff --git a/jvm/core/src/main/java/org/apache/tvm/TVMValue.java b/jvm/core/src/main/java/org/apache/tvm/TVMValue.java index 45aef808f44c..532490a91367 100644 --- a/jvm/core/src/main/java/org/apache/tvm/TVMValue.java +++ b/jvm/core/src/main/java/org/apache/tvm/TVMValue.java @@ -45,7 +45,7 @@ public Function asFunction() { throw new UnsupportedOperationException(); } - public NDArrayBase asNDArray() { + public TensorBase asTensor() { throw new UnsupportedOperationException(); } diff --git a/jvm/core/src/main/java/org/apache/tvm/NDArray.java b/jvm/core/src/main/java/org/apache/tvm/Tensor.java similarity index 90% rename from jvm/core/src/main/java/org/apache/tvm/NDArray.java rename to jvm/core/src/main/java/org/apache/tvm/Tensor.java index 6b151d7bf9d2..7b44049f9372 100644 --- a/jvm/core/src/main/java/org/apache/tvm/NDArray.java +++ b/jvm/core/src/main/java/org/apache/tvm/Tensor.java @@ -23,13 +23,13 @@ import java.util.List; /** - * Lightweight NDArray class of TVM runtime. + * Lightweight Tensor class of TVM runtime. */ -public class NDArray extends NDArrayBase { +public class Tensor extends TensorBase { private final TVMType dtype; private final Device device; - NDArray(long handle, boolean isView, TVMType dtype, Device dev) { + Tensor(long handle, boolean isView, TVMType dtype, Device dev) { super(handle, isView); this.dtype = dtype; this.device = dev; @@ -37,7 +37,7 @@ public class NDArray extends NDArrayBase { /** * Copy from a native array. 
- * The NDArray type must by float64 + * The Tensor type must by float64 * @param sourceArray the source data */ public void copyFrom(double[] sourceArray) { @@ -54,7 +54,7 @@ public void copyFrom(double[] sourceArray) { /** * Copy from a native array. - * The NDArray type must by float32 + * The Tensor type must by float32 * @param sourceArray the source data */ public void copyFrom(float[] sourceArray) { @@ -71,7 +71,7 @@ public void copyFrom(float[] sourceArray) { /** * Copy from a native array. - * The NDArray type must by int64 + * The Tensor type must by int64 * @param sourceArray the source data */ public void copyFrom(long[] sourceArray) { @@ -88,7 +88,7 @@ public void copyFrom(long[] sourceArray) { /** * Copy from a native array. - * The NDArray type must by float32 + * The Tensor type must by float32 * @param sourceArray the source data */ public void copyFrom(int[] sourceArray) { @@ -105,7 +105,7 @@ public void copyFrom(int[] sourceArray) { /** * Copy from a native array. - * The NDArray type must by int16 + * The Tensor type must by int16 * @param sourceArray the source data */ public void copyFrom(short[] sourceArray) { @@ -122,7 +122,7 @@ public void copyFrom(short[] sourceArray) { /** * Copy from a native array. - * The NDArray type must by int8 + * The Tensor type must by int8 * @param sourceArray the source data */ public void copyFrom(byte[] sourceArray) { @@ -135,7 +135,7 @@ public void copyFrom(byte[] sourceArray) { /** * Copy from a native array. - * The NDArray type must by uint16 + * The Tensor type must by uint16 * @param sourceArray the source data */ public void copyFrom(char[] sourceArray) { @@ -167,8 +167,8 @@ public void copyFromRaw(byte[] sourceArray) { } /** - * Get shape of current NDArray. - * @return an array representing shape of current ndarray + * Get shape of current Tensor. + * @return an array representing shape of current tensor */ public long[] shape() { List data = new ArrayList(); @@ -181,8 +181,8 @@ public long[] shape() { } /** - * Get total size of current NDArray. - * @return size of current NDArray. + * Get total size of current Tensor. + * @return size of current Tensor. */ public long size() { long product = 1L; @@ -195,7 +195,7 @@ public long size() { /** * Return a copied flat java array of current array (row-major). - * The NDArray dtype must be float64 + * The Tensor dtype must be float64 * @return A copy of array content. */ public double[] asDoubleArray() { @@ -213,7 +213,7 @@ public double[] asDoubleArray() { /** * Return a copied flat java array of current array (row-major). - * The NDArray dtype must be float32 + * The Tensor dtype must be float32 * @return A copy of array content. */ public float[] asFloatArray() { @@ -231,7 +231,7 @@ public float[] asFloatArray() { /** * Return a copied flat java array of current array (row-major). - * The NDArray dtype must be int64 + * The Tensor dtype must be int64 * @return A copy of array content. */ public long[] asLongArray() { @@ -249,7 +249,7 @@ public long[] asLongArray() { /** * Return a copied flat java array of current array (row-major). - * The NDArray dtype must be int32 + * The Tensor dtype must be int32 * @return A copy of array content. */ public int[] asIntArray() { @@ -267,7 +267,7 @@ public int[] asIntArray() { /** * Return a copied flat java array of current array (row-major). - * The NDArray dtype must be int16 + * The Tensor dtype must be int16 * @return A copy of array content. 
*/ public short[] asShortArray() { @@ -285,7 +285,7 @@ public short[] asShortArray() { /** * Return a copied flat java array of current array (row-major). - * The NDArray dtype must be uint16 + * The Tensor dtype must be uint16 * @return A copy of array content. */ public char[] asCharArray() { @@ -303,7 +303,7 @@ public char[] asCharArray() { /** * Return a copied flat java array of current array (row-major). - * The NDArray dtype must be int8 + * The Tensor dtype must be int8 * @return A copy of array content. */ public byte[] asByteArray() { @@ -319,7 +319,7 @@ public byte[] asByteArray() { * @return A copy of array content. */ public byte[] internal() { - NDArray tmp = NDArray.empty(shape(), dtype); + Tensor tmp = Tensor.empty(shape(), dtype); copyTo(tmp); int arrLength = dtype.numOfBytes * (int) size(); @@ -359,12 +359,12 @@ public Device device() { * @param dev The device of the array. * @return The array tvm supported. */ - public static NDArray empty(long[] shape, TVMType dtype, Device dev) { + public static Tensor empty(long[] shape, TVMType dtype, Device dev) { Base.RefLong refHandle = new Base.RefLong(); - Base.checkCall(Base._LIB.tvmNDArrayEmpty( + Base.checkCall(Base._LIB.tvmTensorEmpty( shape, dtype.typeCode, dtype.bits, dtype.lanes, dev.deviceType, dev.deviceId, refHandle)); - return new NDArray(refHandle.value, false, dtype, dev); + return new Tensor(refHandle.value, false, dtype, dev); } /** @@ -373,7 +373,7 @@ public static NDArray empty(long[] shape, TVMType dtype, Device dev) { * @param dtype The data type of the array. * @return The array tvm supported. */ - public static NDArray empty(long[] shape, TVMType dtype) { + public static Tensor empty(long[] shape, TVMType dtype) { return empty(shape, dtype, Device.cpu(0)); } @@ -382,7 +382,7 @@ public static NDArray empty(long[] shape, TVMType dtype) { * @param shape The shape of the array. * @return The array tvm supported. */ - public static NDArray empty(long[] shape) { + public static Tensor empty(long[] shape) { return empty(shape, new TVMType("float32", 1), Device.cpu(0)); } @@ -392,7 +392,7 @@ public static NDArray empty(long[] shape) { * @param dev The device of the array. * @return The array tvm supported. */ - public static NDArray empty(long[] shape, Device dev) { + public static Tensor empty(long[] shape, Device dev) { return empty(shape, new TVMType("float32", 1), dev); } diff --git a/jvm/core/src/main/java/org/apache/tvm/NDArrayBase.java b/jvm/core/src/main/java/org/apache/tvm/TensorBase.java similarity index 86% rename from jvm/core/src/main/java/org/apache/tvm/NDArrayBase.java rename to jvm/core/src/main/java/org/apache/tvm/TensorBase.java index 534dcb38d4a9..b150d65807ee 100644 --- a/jvm/core/src/main/java/org/apache/tvm/NDArrayBase.java +++ b/jvm/core/src/main/java/org/apache/tvm/TensorBase.java @@ -18,26 +18,26 @@ package org.apache.tvm; /** - * Base class of NDArray. To handle callback array. + * Base class of Tensor. To handle callback array. * Only deep-copy supported. */ -public class NDArrayBase extends TVMValue { +public class TensorBase extends TVMValue { protected long handle; public final boolean isView; protected final long dltensorHandle; - NDArrayBase(long handle, boolean isView) { + TensorBase(long handle, boolean isView) { this.dltensorHandle = isView ? handle : handle + 8 * 2; this.handle = isView ? 0 : handle; this.isView = isView; } - @Override public NDArrayBase asNDArray() { + @Override public TensorBase asTensor() { return this; } /** - * Release the NDArray. + * Release the Tensor. 
*/ public void release() { if (this.handle != 0) { @@ -56,7 +56,7 @@ public void release() { * @param target The target array to be copied, must have same shape as this array. * @return target */ - public NDArrayBase copyTo(NDArrayBase target) { + public TensorBase copyTo(TensorBase target) { Base.checkCall(Base._LIB.tvmFFIDLTensorCopyFromTo(this.dltensorHandle, target.dltensorHandle)); return target; } diff --git a/jvm/core/src/main/java/org/apache/tvm/TypeIndex.java b/jvm/core/src/main/java/org/apache/tvm/TypeIndex.java index 7689cc58ed63..e29bae51828c 100644 --- a/jvm/core/src/main/java/org/apache/tvm/TypeIndex.java +++ b/jvm/core/src/main/java/org/apache/tvm/TypeIndex.java @@ -37,7 +37,7 @@ public class TypeIndex { public static final int kTVMFFIError = 67; public static final int kTVMFFIFunction = 68; public static final int kTVMFFIShape = 70; - public static final int kTVMFFINDArray = 71; + public static final int kTVMFFITensor = 71; public static final int kTVMFFIArray = 72; public static final int kTVMFFIMap = 73; public static final int kTVMFFIModule = 73; diff --git a/jvm/core/src/test/java/org/apache/tvm/FunctionTest.java b/jvm/core/src/test/java/org/apache/tvm/FunctionTest.java index c2a1f78fa432..56e9a21a2b83 100644 --- a/jvm/core/src/test/java/org/apache/tvm/FunctionTest.java +++ b/jvm/core/src/test/java/org/apache/tvm/FunctionTest.java @@ -78,14 +78,14 @@ public void test_sum_first_byte() { } @Test - public void test_sum_ndarray() { + public void test_sum_tensor() { final long[] shape = new long[]{2, 1}; Function func = Function.convertFunc(new Function.Callback() { @Override public Object invoke(TVMValue... args) { double sum = 0.0; for (TVMValue arg : args) { - NDArray arr = NDArray.empty(shape, new TVMType("float32")); - arg.asNDArray().copyTo(arr); + Tensor arr = Tensor.empty(shape, new TVMType("float32")); + arg.asTensor().copyTo(arr); float[] nativeArr = arr.asFloatArray(); for (int i = 0; i < nativeArr.length; ++i) { sum += nativeArr[i]; @@ -95,7 +95,7 @@ public void test_sum_ndarray() { return sum; } }); - NDArray arr = NDArray.empty(shape, new TVMType("float32")); + Tensor arr = Tensor.empty(shape, new TVMType("float32")); arr.copyFrom(new float[]{2f, 3f}); TVMValue res = func.pushArg(arr).pushArg(arr).invoke(); assertEquals(10.0, res.asDouble(), 1e-3); diff --git a/jvm/core/src/test/java/org/apache/tvm/ModuleTest.java b/jvm/core/src/test/java/org/apache/tvm/ModuleTest.java index 888cd18923be..5c692eecc3f6 100644 --- a/jvm/core/src/test/java/org/apache/tvm/ModuleTest.java +++ b/jvm/core/src/test/java/org/apache/tvm/ModuleTest.java @@ -42,11 +42,11 @@ public void test_load_add_func_cpu() { Device dev = new Device("cpu", 0); long[] shape = new long[]{2}; - NDArray arr = NDArray.empty(shape, dev); + Tensor arr = Tensor.empty(shape, dev); arr.copyFrom(new float[]{3f, 4f}); - NDArray res = NDArray.empty(shape, dev); + Tensor res = Tensor.empty(shape, dev); fadd.entryFunc().pushArg(arr).pushArg(arr).pushArg(res).invoke(); assertArrayEquals(new float[]{6f, 8f}, res.asFloatArray(), 1e-3f); @@ -74,7 +74,7 @@ public void test_load_add_func_cuda() { final int dim = 100; long[] shape = new long[]{dim}; - NDArray arr = NDArray.empty(shape, dev); + Tensor arr = Tensor.empty(shape, dev); float[] data = new float[dim]; float[] dataX2 = new float[dim]; @@ -84,7 +84,7 @@ public void test_load_add_func_cuda() { } arr.copyFrom(data); - NDArray res = NDArray.empty(shape, dev); + Tensor res = Tensor.empty(shape, dev); fadd.entryFunc().pushArg(arr).pushArg(arr).pushArg(res).invoke(); 
assertArrayEquals(dataX2, res.asFloatArray(), 1e-3f); diff --git a/jvm/core/src/test/java/org/apache/tvm/NDArrayTest.java b/jvm/core/src/test/java/org/apache/tvm/NDArrayTest.java deleted file mode 100644 index c4c34360f740..000000000000 --- a/jvm/core/src/test/java/org/apache/tvm/NDArrayTest.java +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tvm; - -import org.junit.Test; - -import static org.junit.Assert.*; - -public class NDArrayTest { - @Test - public void test_from_float32() { - NDArray ndarray = NDArray.empty(new long[]{2, 2}, new TVMType("float32")); - ndarray.copyFrom(new float[]{1, 2, 3, 4}); - assertArrayEquals(new float[]{1f, 2f, 3f, 4f}, ndarray.asFloatArray(), 1e-3f); - ndarray.release(); - } - - @Test - public void test_from_float64() { - NDArray ndarray = NDArray.empty(new long[]{2, 2}, new TVMType("float64")); - ndarray.copyFrom(new double[]{1, 2, 3, 4}); - assertArrayEquals(new double[]{1.0, 2.0, 3.0, 4.0}, ndarray.asDoubleArray(), 1e-3); - ndarray.release(); - } - - @Test - public void test_from_int8() { - NDArray ndarray = NDArray.empty(new long[]{2, 2}, new TVMType("int8")); - ndarray.copyFrom(new byte[]{1, 2, 3, 4}); - assertArrayEquals(new byte[]{1, 2, 3, 4}, ndarray.asByteArray()); - ndarray.release(); - } - - @Test - public void test_from_int16() { - NDArray ndarray = NDArray.empty(new long[]{2, 2}, new TVMType("int16")); - ndarray.copyFrom(new short[]{1, 2, 3, 4}); - assertArrayEquals(new short[]{1, 2, 3, 4}, ndarray.asShortArray()); - ndarray.release(); - } - - @Test - public void test_from_int32() { - NDArray ndarray = NDArray.empty(new long[]{2, 2}, new TVMType("int32")); - ndarray.copyFrom(new int[]{1, 2, 3, 4}); - assertArrayEquals(new int[]{1, 2, 3, 4}, ndarray.asIntArray()); - ndarray.release(); - } - - @Test - public void test_from_int64() { - NDArray ndarray = NDArray.empty(new long[]{2, 2}, new TVMType("int64")); - ndarray.copyFrom(new long[]{1, 2, 3, 4}); - assertArrayEquals(new long[]{1, 2, 3, 4}, ndarray.asLongArray()); - ndarray.release(); - } - - @Test - public void test_from_uint16() { - NDArray ndarray = NDArray.empty(new long[]{2, 2}, new TVMType("uint16")); - ndarray.copyFrom(new char[]{65535, 2, 3, 4}); - assertArrayEquals(new char[]{65535, 2, 3, 4}, ndarray.asCharArray()); - ndarray.release(); - } -} diff --git a/jvm/core/src/test/java/org/apache/tvm/TensorTest.java b/jvm/core/src/test/java/org/apache/tvm/TensorTest.java new file mode 100644 index 000000000000..546bf661e400 --- /dev/null +++ b/jvm/core/src/test/java/org/apache/tvm/TensorTest.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tvm; + +import org.junit.Test; + +import static org.junit.Assert.*; + +public class TensorTest { + @Test + public void test_from_float32() { + Tensor tensor = Tensor.empty(new long[]{2, 2}, new TVMType("float32")); + tensor.copyFrom(new float[]{1, 2, 3, 4}); + assertArrayEquals(new float[]{1f, 2f, 3f, 4f}, tensor.asFloatArray(), 1e-3f); + tensor.release(); + } + + @Test + public void test_from_float64() { + Tensor tensor = Tensor.empty(new long[]{2, 2}, new TVMType("float64")); + tensor.copyFrom(new double[]{1, 2, 3, 4}); + assertArrayEquals(new double[]{1.0, 2.0, 3.0, 4.0}, tensor.asDoubleArray(), 1e-3); + tensor.release(); + } + + @Test + public void test_from_int8() { + Tensor tensor = Tensor.empty(new long[]{2, 2}, new TVMType("int8")); + tensor.copyFrom(new byte[]{1, 2, 3, 4}); + assertArrayEquals(new byte[]{1, 2, 3, 4}, tensor.asByteArray()); + tensor.release(); + } + + @Test + public void test_from_int16() { + Tensor tensor = Tensor.empty(new long[]{2, 2}, new TVMType("int16")); + tensor.copyFrom(new short[]{1, 2, 3, 4}); + assertArrayEquals(new short[]{1, 2, 3, 4}, tensor.asShortArray()); + tensor.release(); + } + + @Test + public void test_from_int32() { + Tensor tensor = Tensor.empty(new long[]{2, 2}, new TVMType("int32")); + tensor.copyFrom(new int[]{1, 2, 3, 4}); + assertArrayEquals(new int[]{1, 2, 3, 4}, tensor.asIntArray()); + tensor.release(); + } + + @Test + public void test_from_int64() { + Tensor tensor = Tensor.empty(new long[]{2, 2}, new TVMType("int64")); + tensor.copyFrom(new long[]{1, 2, 3, 4}); + assertArrayEquals(new long[]{1, 2, 3, 4}, tensor.asLongArray()); + tensor.release(); + } + + @Test + public void test_from_uint16() { + Tensor tensor = Tensor.empty(new long[]{2, 2}, new TVMType("uint16")); + tensor.copyFrom(new char[]{65535, 2, 3, 4}); + assertArrayEquals(new char[]{65535, 2, 3, 4}, tensor.asCharArray()); + tensor.release(); + } +} diff --git a/jvm/native/src/main/native/jni_helper_func.h b/jvm/native/src/main/native/jni_helper_func.h index 9b50fb6a4914..659c6e4f2943 100644 --- a/jvm/native/src/main/native/jni_helper_func.h +++ b/jvm/native/src/main/native/jni_helper_func.h @@ -151,8 +151,8 @@ jobject newFunction(JNIEnv* env, jlong value) { return object; } -jobject newNDArray(JNIEnv* env, jlong handle, jboolean isview) { - jclass cls = env->FindClass("org/apache/tvm/NDArrayBase"); +jobject newTensor(JNIEnv* env, jlong handle, jboolean isview) { + jclass cls = env->FindClass("org/apache/tvm/TensorBase"); jmethodID constructor = env->GetMethodID(cls, "", "(JZ)V"); jobject object = env->NewObject(cls, constructor, handle, isview); env->DeleteLocalRef(cls); @@ -218,10 +218,10 @@ jobject tvmRetValueToJava(JNIEnv* env, TVMFFIAny value) { return newFunction(env, reinterpret_cast(value.v_obj)); } case TypeIndex::kTVMFFIDLTensorPtr: { - return 
newNDArray(env, reinterpret_cast(value.v_ptr), true); + return newTensor(env, reinterpret_cast(value.v_ptr), true); } - case TypeIndex::kTVMFFINDArray: { - return newNDArray(env, reinterpret_cast(value.v_obj), false); + case TypeIndex::kTVMFFITensor: { + return newTensor(env, reinterpret_cast(value.v_obj), false); } case TypeIndex::kTVMFFISmallStr: { TVMFFIByteArray arr = TVMFFISmallBytesGetContentByteArray(&value); diff --git a/jvm/native/src/main/native/org_apache_tvm_native_c_api.cc b/jvm/native/src/main/native/org_apache_tvm_native_c_api.cc index b512ec8775bd..e18d1171df1f 100644 --- a/jvm/native/src/main/native/org_apache_tvm_native_c_api.cc +++ b/jvm/native/src/main/native/org_apache_tvm_native_c_api.cc @@ -26,8 +26,8 @@ #else #include #include -#include #include +#include #include #endif #include @@ -325,7 +325,7 @@ JNIEXPORT jint JNICALL Java_org_apache_tvm_LibInfo_tvmFFIObjectFree(JNIEnv* env, return TVMFFIObjectDecRef(reinterpret_cast(jhandle)); } -// NDArray +// Tensor JNIEXPORT jint JNICALL Java_org_apache_tvm_LibInfo_tvmFFIDLTensorGetShape(JNIEnv* env, jobject obj, jlong jhandle, @@ -356,7 +356,7 @@ JNIEXPORT jint JNICALL Java_org_apache_tvm_LibInfo_tvmFFIDLTensorCopyFromTo(JNIE jlong jfrom, jlong jto) { TVM_FFI_SAFE_CALL_BEGIN(); - static auto fcopy_from_to = tvm::ffi::Function::GetGlobalRequired("runtime.TVMArrayCopyFromTo"); + static auto fcopy_from_to = tvm::ffi::Function::GetGlobalRequired("runtime.TVMTensorCopyFromTo"); fcopy_from_to(reinterpret_cast(jfrom), reinterpret_cast(jto)); TVM_FFI_SAFE_CALL_END(); } @@ -370,7 +370,7 @@ JNIEXPORT jint JNICALL Java_org_apache_tvm_LibInfo_tvmFFIDLTensorCopyFromJArray( DLTensor* to = reinterpret_cast(jto); size_t size = tvm::ffi::GetDataSize(*to); static auto fcopy_from_bytes = - tvm::ffi::Function::GetGlobalRequired("runtime.TVMArrayCopyFromBytes"); + tvm::ffi::Function::GetGlobalRequired("runtime.TVMTensorCopyFromBytes"); fcopy_from_bytes(to, static_cast(pdata), size); env->ReleaseByteArrayElements(jarr, pdata, 0); TVM_FFI_SAFE_CALL_END(); @@ -384,7 +384,8 @@ JNIEXPORT jint JNICALL Java_org_apache_tvm_LibInfo_tvmFFIDLTensorCopyToJArray(JN DLTensor* from = reinterpret_cast(jfrom); size_t size = tvm::ffi::GetDataSize(*from); jbyte* pdata = env->GetByteArrayElements(jarr, NULL); - static auto fcopy_to_bytes = tvm::ffi::Function::GetGlobalRequired("runtime.TVMArrayCopyToBytes"); + static auto fcopy_to_bytes = + tvm::ffi::Function::GetGlobalRequired("runtime.TVMTensorCopyToBytes"); fcopy_to_bytes(from, static_cast(pdata), size); env->ReleaseByteArrayElements(jarr, static_cast(pdata), 0); // copy back to java array automatically @@ -401,7 +402,7 @@ JNIEXPORT jint JNICALL Java_org_apache_tvm_LibInfo_tvmSynchronize(JNIEnv* env, j TVM_FFI_SAFE_CALL_END(); } -JNIEXPORT jint JNICALL Java_org_apache_tvm_LibInfo_tvmNDArrayEmpty( +JNIEXPORT jint JNICALL Java_org_apache_tvm_LibInfo_tvmTensorEmpty( JNIEnv* env, jobject obj, jlongArray jshape, jint jdtypeCode, jint jdtypeBits, jint jdtypeLanes, jint jdeviceType, jint jdeviceId, jobject jret) { TVM_FFI_SAFE_CALL_BEGIN(); @@ -414,8 +415,8 @@ JNIEXPORT jint JNICALL Java_org_apache_tvm_LibInfo_tvmNDArrayEmpty( dtype.lanes = static_cast(jdtypeLanes); DLDevice device{static_cast(jdeviceType), jdeviceId}; env->ReleaseLongArrayElements(jshape, shapeArray, 0); - static auto fempty = tvm::ffi::Function::GetGlobalRequired("runtime.TVMArrayAllocWithScope"); - tvm::ffi::NDArray out = fempty(shape, dtype, device, nullptr).cast(); + static auto fempty = 
tvm::ffi::Function::GetGlobalRequired("runtime.TVMTensorAllocWithScope"); + tvm::ffi::Tensor out = fempty(shape, dtype, device, nullptr).cast(); void* handle = tvm::ffi::details::ObjectUnsafe::MoveObjectRefToTVMFFIObjectPtr(std::move(out)); setLongField(env, jret, reinterpret_cast(handle)); TVM_FFI_SAFE_CALL_END(); diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py index 59d8e0566654..c3c8c559c84f 100644 --- a/python/tvm/__init__.py +++ b/python/tvm/__init__.py @@ -29,9 +29,9 @@ # top-level alias # tvm.runtime from .runtime.object import Object -from .runtime.ndarray import device, cpu, cuda, opencl, vulkan, metal -from .runtime.ndarray import vpi, rocm, ext_dev, hexagon -from .runtime import ndarray as nd, DataType, DataTypeCode +from .runtime._tensor import device, cpu, cuda, opencl, vulkan, metal +from .runtime._tensor import vpi, rocm, ext_dev, hexagon +from .runtime import DataType, DataTypeCode # tvm.error from . import error diff --git a/python/tvm/contrib/cudnn.py b/python/tvm/contrib/cudnn.py index 4d39dfd1c645..b69bc4f84ee5 100644 --- a/python/tvm/contrib/cudnn.py +++ b/python/tvm/contrib/cudnn.py @@ -123,7 +123,7 @@ def _get_np_int32_array_handle(arr): Parameters ---------- - arr: numpy.NDArray + arr: numpy.Tensor source numpy array Returns diff --git a/python/tvm/contrib/dlpack.py b/python/tvm/contrib/dlpack.py index 75b37cef6199..e6214ed3a259 100644 --- a/python/tvm/contrib/dlpack.py +++ b/python/tvm/contrib/dlpack.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. """Wrapping functions to bridge frameworks with DLPack support to TVM""" -from tvm.runtime import ndarray +import tvm.runtime def convert_func(tvm_func, tensor_type, to_dlpack_func): @@ -37,7 +37,7 @@ def convert_func(tvm_func, tensor_type, to_dlpack_func): def _wrapper(*args): args = tuple( - ndarray.from_dlpack(to_dlpack_func(arg)) if isinstance(arg, tensor_type) else arg + tvm.runtime.from_dlpack(to_dlpack_func(arg)) if isinstance(arg, tensor_type) else arg for arg in args ) return tvm_func(*args) diff --git a/python/tvm/contrib/hexagon/generate_take_op.py b/python/tvm/contrib/hexagon/generate_take_op.py index b70eb451a1a5..080a7d6a1953 100644 --- a/python/tvm/contrib/hexagon/generate_take_op.py +++ b/python/tvm/contrib/hexagon/generate_take_op.py @@ -84,7 +84,7 @@ def visit_call_(self, call_node: relax.Call) -> relax.Call: take_node = relax.call_tir( take_func_gv, relax.expr.Tuple( - [call_node.args[1][0], relax.expr.Constant(tvm.nd.array(LUT))] + [call_node.args[1][0], relax.expr.Constant(tvm.runtime.tensor(LUT))] ), call_node.struct_info, ) diff --git a/python/tvm/contrib/hexagon/meta_schedule.py b/python/tvm/contrib/hexagon/meta_schedule.py index 92298c011d4a..7c4ccdd5b20f 100644 --- a/python/tvm/contrib/hexagon/meta_schedule.py +++ b/python/tvm/contrib/hexagon/meta_schedule.py @@ -21,7 +21,7 @@ import tvm from tvm.ir.module import IRModule -from tvm.runtime import Module, NDArray +from tvm.runtime import Module, Tensor from tvm.target import Target from tvm.driver import build as tvm_build from tvm.tir.transform import RemoveWeightLayoutRewriteBlock @@ -140,10 +140,10 @@ def export_func(mod): return str(binary_path) def default_build_with_context( - mod: IRModule, target: Target, _params: Optional[Dict[str, NDArray]] + mod: IRModule, target: Target, _params: Optional[Dict[str, Tensor]] ) -> Module: with pass_context: - mod = RemoveWeightLayoutRewriteBlock(skip_ndarray_rewrite=True)(mod) + mod = 
RemoveWeightLayoutRewriteBlock(skip_tensor_rewrite=True)(mod) return tvm_build(mod, target=target) if pass_context is not None: diff --git a/python/tvm/contrib/hexagon/tools.py b/python/tvm/contrib/hexagon/tools.py index a26822bc5fb8..d84c18aaf73e 100644 --- a/python/tvm/contrib/hexagon/tools.py +++ b/python/tvm/contrib/hexagon/tools.py @@ -336,7 +336,7 @@ def pack_imports( """ path_bin = os.path.join(workspace_dir, "imports.bin") - pack_to_bin_f_name = "runtime.ModulePackImportsToNDArray" + pack_to_bin_f_name = "runtime.ModulePackImportsToTensor" fpack_to_bin = tvm.get_global_func(pack_to_bin_f_name) assert fpack_to_bin, f"Expecting {pack_to_bin_f_name} in registry" @@ -438,7 +438,7 @@ def allocate_hexagon_array( for dim_i, dim_f in zip(boundaries[:-1], boundaries[1:]) ] - arr = tvm.nd.empty(physical_shape, dtype=dtype, device=dev, mem_scope=mem_scope) + arr = tvm.runtime.empty(physical_shape, dtype=dtype, device=dev, mem_scope=mem_scope) if data is not None: arr.copyfrom(data.reshape(physical_shape)) diff --git a/python/tvm/contrib/miopen.py b/python/tvm/contrib/miopen.py index 3aa885f5454a..6ec2cd78e4d3 100644 --- a/python/tvm/contrib/miopen.py +++ b/python/tvm/contrib/miopen.py @@ -29,7 +29,7 @@ def _get_np_int32_array_handle(arr): Parameters ---------- - arr: numpy.NDArray + arr: numpy.Tensor source numpy array Returns diff --git a/python/tvm/contrib/msc/core/codegen/codegen.py b/python/tvm/contrib/msc/core/codegen/codegen.py index 96c9c23dfd9d..b2b97fc8b593 100644 --- a/python/tvm/contrib/msc/core/codegen/codegen.py +++ b/python/tvm/contrib/msc/core/codegen/codegen.py @@ -129,7 +129,7 @@ def load( def to_relax( graph: MSCGraph, - weights: Optional[Dict[str, tvm.nd.array]] = None, + weights: Optional[Dict[str, tvm.runtime.Tensor]] = None, codegen_config: Optional[Dict[str, str]] = None, print_config: Optional[Dict[str, str]] = None, build_folder: msc_utils.MSCDirectory = None, diff --git a/python/tvm/contrib/msc/core/frontend/translate.py b/python/tvm/contrib/msc/core/frontend/translate.py index 687d770c93a6..24825c99d485 100644 --- a/python/tvm/contrib/msc/core/frontend/translate.py +++ b/python/tvm/contrib/msc/core/frontend/translate.py @@ -67,13 +67,13 @@ def _normalize(info): def normalize_weights( - t_weights: Dict[MSCTensor, tvm.nd.array], graph: MSCGraph -) -> Dict[str, tvm.nd.array]: + t_weights: Dict[MSCTensor, tvm.runtime.Tensor], graph: MSCGraph +) -> Dict[str, tvm.runtime.Tensor]: """Normalize the weghts. Parameters ---------- - t_weights: dict of + t_weights: dict of The weights extracted from IRModule. graph: tvm.contrib.msc.core.ir.MSCGraph The translated graph. 
@@ -88,7 +88,7 @@ def _to_data(ref_t, data): weight_t = graph.find_tensor(ref_t.name) if weight_t.ndim == 1: if ref_t.ndim != weight_t.ndim: - return tvm.nd.array(data.numpy().reshape(weight_t.get_shape())) + return tvm.runtime.tensor(data.numpy().reshape(weight_t.get_shape())) return data if ref_t.layout and weight_t.layout: ref_layout, weight_layout = ref_t.layout.name, weight_t.layout.name @@ -97,7 +97,7 @@ def _to_data(ref_t, data): l in ref_layout for l in weight_layout ), "layout mismatch {} compare to {}".format(ref_t, weight_t) permute = [ref_layout.index(l) for l in weight_layout] - return tvm.nd.array(data.numpy().transpose(*permute)) + return tvm.runtime.tensor(data.numpy().transpose(*permute)) return data weights = {t.name: _to_data(t, d) for t, d in t_weights.items() if graph.has_tensor(t.name)} @@ -111,11 +111,11 @@ def _to_data(ref_t, data): def from_relax( mod: tvm.IRModule, - params: Optional[Dict[str, tvm.nd.array]] = None, + params: Optional[Dict[str, tvm.runtime.Tensor]] = None, trans_config: Optional[Dict[str, str]] = None, build_config: Optional[Dict[str, str]] = None, opt_config: Optional[Dict[str, str]] = None, -) -> Tuple[MSCGraph, Dict[str, tvm.nd.array]]: +) -> Tuple[MSCGraph, Dict[str, tvm.runtime.Tensor]]: """Change IRModule to MSCGraph. Parameters @@ -195,10 +195,10 @@ def visit_var_binding_(self, binding) -> None: def byoc_partition( target: str, mod: tvm.IRModule, - params: Optional[Dict[str, tvm.nd.array]] = None, + params: Optional[Dict[str, tvm.runtime.Tensor]] = None, trans_config: Optional[Dict[str, str]] = None, build_config: Optional[Dict[str, str]] = None, -) -> Tuple[tvm.IRModule, List[Tuple[MSCGraph, Dict[str, tvm.nd.array]]]]: +) -> Tuple[tvm.IRModule, List[Tuple[MSCGraph, Dict[str, tvm.runtime.Tensor]]]]: """Partition module to target sub functions. Parameters diff --git a/python/tvm/contrib/msc/core/runtime/hook.py b/python/tvm/contrib/msc/core/runtime/hook.py index e129d9771b02..f87b2d3d06a0 100644 --- a/python/tvm/contrib/msc/core/runtime/hook.py +++ b/python/tvm/contrib/msc/core/runtime/hook.py @@ -136,9 +136,9 @@ def _apply( self, runner: object, graphs: List[MSCGraph], - weights: Dict[str, tvm.nd.array], + weights: Dict[str, tvm.runtime.Tensor], weights_path: str, - ) -> Tuple[List[MSCGraph], Dict[str, tvm.nd.array]]: + ) -> Tuple[List[MSCGraph], Dict[str, tvm.runtime.Tensor]]: """Apply the default funcion Parameters @@ -147,7 +147,7 @@ def _apply( The runner context. graphs: list The translated graphs - weights: dict + weights: dict The translated weights. weights_path: str The weights path. @@ -156,7 +156,7 @@ def _apply( ------- graphs: list The updated graphs - weights: dict + weights: dict The updated weights. 
""" diff --git a/python/tvm/contrib/msc/core/runtime/runner.py b/python/tvm/contrib/msc/core/runtime/runner.py index 074c7048c5e9..bd9cc01d76f2 100644 --- a/python/tvm/contrib/msc/core/runtime/runner.py +++ b/python/tvm/contrib/msc/core/runtime/runner.py @@ -340,7 +340,9 @@ def save_cache( title = self.runner_mark("SAVE_CACHE") self._logger.debug(msc_utils.msg_block(title, {"folder": cache_dir, "info": cache_info})) - def translate(self, apply_hooks: bool = True) -> Tuple[List[MSCGraph], Dict[str, tvm.nd.array]]: + def translate( + self, apply_hooks: bool = True + ) -> Tuple[List[MSCGraph], Dict[str, tvm.runtime.Tensor]]: """Translate IRModule to MSCgraphs Parameters @@ -352,7 +354,7 @@ def translate(self, apply_hooks: bool = True) -> Tuple[List[MSCGraph], Dict[str, ------- graphs: list The translated graphs - weights: dict + weights: dict The translated weights. """ @@ -366,7 +368,7 @@ def translate(self, apply_hooks: bool = True) -> Tuple[List[MSCGraph], Dict[str, graphs, weights = self._apply_hook("after translate", hook, graphs, weights) return graphs, weights - def _translate(self, mod: tvm.IRModule) -> Tuple[List[MSCGraph], Dict[str, tvm.nd.array]]: + def _translate(self, mod: tvm.IRModule) -> Tuple[List[MSCGraph], Dict[str, tvm.runtime.Tensor]]: """Translate IRModule to MSCgraphs Parameters @@ -378,7 +380,7 @@ def _translate(self, mod: tvm.IRModule) -> Tuple[List[MSCGraph], Dict[str, tvm.n ------- graphs: list The translated graphs - weights: dict + weights: dict The translated weights. """ @@ -387,7 +389,7 @@ def _translate(self, mod: tvm.IRModule) -> Tuple[List[MSCGraph], Dict[str, tvm.n def reset_tools( self, graphs: List[MSCGraph] = None, - weights: List[Dict[str, tvm.nd.array]] = None, + weights: List[Dict[str, tvm.runtime.Tensor]] = None, tools: List[BaseTool] = None, cache_dir: msc_utils.MSCDirectory = None, ): @@ -397,7 +399,7 @@ def reset_tools( ------- graphs: list The msc graphs. - weights: list> + weights: list> The weights. tools: list The tools. @@ -408,7 +410,7 @@ def reset_tools( ------- graphs: list The msc graphs. - weights: list> + weights: list> The weights. """ @@ -444,14 +446,16 @@ def generate_model(self, apply_hooks: bool = True) -> Any: model = self._apply_hook("after generate", hook, model) return model - def _generate_model(self, graphs: List[MSCGraph], weights: Dict[str, tvm.nd.array]) -> Any: + def _generate_model( + self, graphs: List[MSCGraph], weights: Dict[str, tvm.runtime.Tensor] + ) -> Any: """Codegen the model according to framework Parameters ------- graphs: list The msc graphs. - weights: dict + weights: dict The weights. Returns @@ -763,7 +767,9 @@ def get_outputs(self) -> List[Dict[str, str]]: return self._model_info["outputs"] - def get_weights(self, framework: str = None, device: str = None) -> Iterable[tvm.nd.array]: + def get_weights( + self, framework: str = None, device: str = None + ) -> Iterable[tvm.runtime.Tensor]: """Get the weights from graphs Parameters @@ -775,7 +781,7 @@ def get_weights(self, framework: str = None, device: str = None) -> Iterable[tvm Returns ------- - weights: generator + weights: generator The generator of weight datas. """ @@ -787,23 +793,23 @@ def get_weights(self, framework: str = None, device: str = None) -> Iterable[tvm data = msc_utils.cast_array(data, framework, device) yield data - def get_runtime_params(self) -> Dict[str, tvm.nd.array]: + def get_runtime_params(self) -> Dict[str, tvm.runtime.Tensor]: """Get the runtime parameters Returns ------- - params: dict + params: dict The parameters from runtime. 
""" return self._get_runtime_params() - def _get_runtime_params(self) -> Dict[str, tvm.nd.array]: + def _get_runtime_params(self) -> Dict[str, tvm.runtime.Tensor]: """Get the runtime parameters Returns ------- - params: dict + params: dict The parameters from runtime. """ @@ -1146,7 +1152,7 @@ def support_device(cls, device: str) -> bool: class ModelRunner(BaseRunner): """Model runner of MSC""" - def _translate(self, mod: tvm.IRModule) -> Tuple[List[MSCGraph], Dict[str, tvm.nd.array]]: + def _translate(self, mod: tvm.IRModule) -> Tuple[List[MSCGraph], Dict[str, tvm.runtime.Tensor]]: """Translate IRModule to MSCgraphs Parameters @@ -1158,7 +1164,7 @@ def _translate(self, mod: tvm.IRModule) -> Tuple[List[MSCGraph], Dict[str, tvm.n ------- graphs: list The translated graphs - weights: dict + weights: dict The translated weights. """ @@ -1210,14 +1216,16 @@ def _save_graphs(self, cache_dir: msc_utils.MSCDirectory) -> dict: f_graph.write(self._graphs[0].to_json()) return {"main": main_info} - def _generate_model(self, graphs: List[MSCGraph], weights: Dict[str, tvm.nd.array]) -> Any: + def _generate_model( + self, graphs: List[MSCGraph], weights: Dict[str, tvm.runtime.Tensor] + ) -> Any: """Codegen the model according to framework Parameters ------- graphs: list The msc graphs. - weights: dict + weights: dict The weights. Returns @@ -1319,7 +1327,7 @@ def visualize(self, visual_dir: msc_utils.MSCDirectory, export_graph: bool = Fal with open(visual_dir.relpath(self._byoc_graph.name + "_graph.json"), "w") as f_graph: f_graph.write(self._byoc_graph.to_json()) - def _translate(self, mod: tvm.IRModule) -> Tuple[List[MSCGraph], Dict[str, tvm.nd.array]]: + def _translate(self, mod: tvm.IRModule) -> Tuple[List[MSCGraph], Dict[str, tvm.runtime.Tensor]]: """Translate IRModule to MSCgraphs Parameters @@ -1331,7 +1339,7 @@ def _translate(self, mod: tvm.IRModule) -> Tuple[List[MSCGraph], Dict[str, tvm.n ------- graphs: list The translated graphs - weights: dict + weights: dict The translated weights. """ @@ -1405,14 +1413,16 @@ def _save_graphs(self, cache_dir: msc_utils.MSCDirectory) -> dict: "byoc_mod": "byoc_module.json", } - def _generate_model(self, graphs: List[MSCGraph], weights: Dict[str, tvm.nd.array]) -> Any: + def _generate_model( + self, graphs: List[MSCGraph], weights: Dict[str, tvm.runtime.Tensor] + ) -> Any: """Codegen the model according to framework Parameters ------- graphs: list The msc graphs. - weights: dict + weights: dict The weights. Returns diff --git a/python/tvm/contrib/msc/core/tools/distill/distiller.py b/python/tvm/contrib/msc/core/tools/distill/distiller.py index 55b7947a6e20..7812627ebc75 100644 --- a/python/tvm/contrib/msc/core/tools/distill/distiller.py +++ b/python/tvm/contrib/msc/core/tools/distill/distiller.py @@ -48,22 +48,22 @@ def setup(self) -> dict: return super().setup() def _reset( - self, graphs: List[MSCGraph], weights: Dict[str, tvm.nd.array] - ) -> Tuple[List[MSCGraph], Dict[str, tvm.nd.array]]: + self, graphs: List[MSCGraph], weights: Dict[str, tvm.runtime.Tensor] + ) -> Tuple[List[MSCGraph], Dict[str, tvm.runtime.Tensor]]: """Reset the tool Parameters ---------- graphs: list The msc graphs. - weights: dict + weights: dict The weights. Returns ------- graphs: list The msc graphs. - weights: dict + weights: dict The weights. """ @@ -164,7 +164,7 @@ def _save_weights(self, weights: Dict[str, Any]): The distilled weights. 
""" - weights = {n: tvm.nd.array(msc_utils.cast_array(d)) for n, d in weights.items()} + weights = {n: tvm.runtime.tensor(msc_utils.cast_array(d)) for n, d in weights.items()} weights_path = self._weights_folder.relpath("distill_{}.bin".format(self._current_iter)) with open(weights_path, "wb") as f_params: f_params.write(tvm.runtime.save_param_dict(weights)) diff --git a/python/tvm/contrib/msc/core/tools/prune/pruner.py b/python/tvm/contrib/msc/core/tools/prune/pruner.py index 38f855d0ebce..95024e1abb41 100644 --- a/python/tvm/contrib/msc/core/tools/prune/pruner.py +++ b/python/tvm/contrib/msc/core/tools/prune/pruner.py @@ -104,22 +104,22 @@ def _update_stages(strategy): return super()._parse_strategys([_update_stages(s) for s in strategy_list]) def _reset( - self, graphs: List[MSCGraph], weights: Dict[str, tvm.nd.array] - ) -> Tuple[List[MSCGraph], Dict[str, tvm.nd.array]]: + self, graphs: List[MSCGraph], weights: Dict[str, tvm.runtime.Tensor] + ) -> Tuple[List[MSCGraph], Dict[str, tvm.runtime.Tensor]]: """Reset the tool Parameters ---------- graphs: list The msc graphs. - weights: dict + weights: dict The weights. Returns ------- graphs: list The msc graphs. - weights: dict + weights: dict The weights. """ @@ -315,22 +315,22 @@ def _prunable(w_node: WeightJoint) -> bool: self._plan[w_node.name]["out_indices"] = [] def prune_graphs( - self, graphs: List[MSCGraph], weights: Dict[str, tvm.nd.array] - ) -> Tuple[List[MSCGraph], Dict[str, tvm.nd.array]]: + self, graphs: List[MSCGraph], weights: Dict[str, tvm.runtime.Tensor] + ) -> Tuple[List[MSCGraph], Dict[str, tvm.runtime.Tensor]]: """Reset the tool Parameters ---------- graphs: list The msc graphs. - weights: dict + weights: dict The weights. Returns ------- graphs: list The msc graphs. - weights: dict + weights: dict The weights. """ @@ -375,7 +375,7 @@ def _prune_by_channel(tensor: MSCTensor, dim, channel_axis: int = None): if w_config["out_indices"]: data = PruneMethod.prune_axis(data, out_axis, w_config["out_indices"]) pruned_tensors[w_name] = _prune_by_shape(weight, data.shape) - pruned_weights[w_name] = tvm.nd.array(data) + pruned_weights[w_name] = tvm.runtime.tensor(data) w_node.set_attr( "pruned_shape", ",".join([str(i) for i in pruned_tensors[w_name].get_shape()]), diff --git a/python/tvm/contrib/msc/core/tools/tool.py b/python/tvm/contrib/msc/core/tools/tool.py index 06a16f2bbe49..cb860729f792 100644 --- a/python/tvm/contrib/msc/core/tools/tool.py +++ b/python/tvm/contrib/msc/core/tools/tool.py @@ -372,16 +372,16 @@ def setup(self) -> dict: def reset( self, graphs: List[MSCGraph], - weights: Dict[str, tvm.nd.array], + weights: Dict[str, tvm.runtime.Tensor], cache_dir: msc_utils.MSCDirectory = None, - ) -> Tuple[List[MSCGraph], Dict[str, tvm.nd.array]]: + ) -> Tuple[List[MSCGraph], Dict[str, tvm.runtime.Tensor]]: """Reset the tool with graphs and weights Parameters ---------- graphs: list The msc graphs. - weights: dict + weights: dict The weights. cache_dir: MSCDirectory cache path for save/load info. @@ -390,7 +390,7 @@ def reset( ------- graphs: list The msc graphs. - weights: dict + weights: dict The weights. """ @@ -411,22 +411,22 @@ def reset( return self._graphs, self._weights def _reset( - self, graphs: List[MSCGraph], weights: Dict[str, tvm.nd.array] - ) -> Tuple[List[MSCGraph], Dict[str, tvm.nd.array]]: + self, graphs: List[MSCGraph], weights: Dict[str, tvm.runtime.Tensor] + ) -> Tuple[List[MSCGraph], Dict[str, tvm.runtime.Tensor]]: """Reset the tool Parameters ---------- graphs: list The msc graphs. 
- weights: dict + weights: dict The weights. Returns ------- graphs: list The msc graphs. - weights: dict + weights: dict The weights. """ @@ -1440,22 +1440,22 @@ def setup(self) -> dict: return super().setup() def _reset( - self, graphs: List[MSCGraph], weights: Dict[str, tvm.nd.array] - ) -> Tuple[List[MSCGraph], Dict[str, tvm.nd.array]]: + self, graphs: List[MSCGraph], weights: Dict[str, tvm.runtime.Tensor] + ) -> Tuple[List[MSCGraph], Dict[str, tvm.runtime.Tensor]]: """Reset the tool Parameters ---------- graphs: list The msc graphs. - weights: dict + weights: dict The weights. Returns ------- graphs: list The msc graphs. - weights: dict + weights: dict The weights. """ diff --git a/python/tvm/contrib/msc/core/transform/transform.py b/python/tvm/contrib/msc/core/transform/transform.py index 47ea21266eb0..19f5b5a03236 100644 --- a/python/tvm/contrib/msc/core/transform/transform.py +++ b/python/tvm/contrib/msc/core/transform/transform.py @@ -122,7 +122,7 @@ def SetBYOCAttrs(target, entry_name: str = "main") -> tvm.ir.transform.Pass: def BindNamedParams( func_name: str, - params: Dict[str, tvm.runtime.NDArray], + params: Dict[str, tvm.runtime.Tensor], ) -> tvm.ir.transform.Pass: """Bind params of function of the module to constant tensors with span names. @@ -130,7 +130,7 @@ def BindNamedParams( ---------- func_name: str The function name to be bound - params: dict + params: dict The map from parameter or parameter name to constant tensors. diff --git a/python/tvm/contrib/msc/core/utils/info.py b/python/tvm/contrib/msc/core/utils/info.py index 189dd3ebbb37..b4301beeb53e 100644 --- a/python/tvm/contrib/msc/core/utils/info.py +++ b/python/tvm/contrib/msc/core/utils/info.py @@ -46,7 +46,7 @@ def _analysis(self, data: Any) -> Tuple[str, str, np.ndarray]: return MSCFramework.MSC, "list", "cpu" if isinstance(data, np.ndarray): return MSCFramework.MSC, "tensor", "cpu" - if isinstance(data, tvm.runtime.NDArray): + if isinstance(data, tvm.runtime.Tensor): device = tvm.runtime.Device.DEVICE_TYPE_TO_NAME[data.device.device_type] if data.device.device_id: device += ":{}".format(data.device.device_id) @@ -71,7 +71,7 @@ def _analysis(self, data: Any) -> Tuple[str, str, np.ndarray]: def abstract(self) -> str: """Get abstract describe of the data""" - data = self._to_ndarray() + data = self._to_tensor() prefix = "[{},{}]".format(";".join([str(s) for s in data.shape]), data.dtype.name) if data.size < 10: return "{} {}".format(prefix, ",".join([str(i) for i in data.flatten()])) @@ -79,7 +79,7 @@ def abstract(self) -> str: prefix, data.max(), data.min(), data.sum() / data.size ) - def _to_ndarray(self) -> np.ndarray: + def _to_tensor(self) -> np.ndarray: """Cast array like object to np.ndarray Returns @@ -120,7 +120,7 @@ def _to_device(self, device: str) -> Any: if self._framework == MSCFramework.TORCH: return self._meta_data.to(self.get_device(device)) if self._framework == MSCFramework.TVM: - return tvm.nd.array(self._cast_data(), device=self.get_device(device)) + return tvm.runtime.tensor(self._cast_data(), device=self.get_device(device)) return self._meta_data def cast(self, framework: str, device: str = "cpu") -> Any: @@ -144,13 +144,13 @@ def cast(self, framework: str, device: str = "cpu") -> Any: return self._meta_data if framework == self._framework: return self._to_device(device) - data = self._to_ndarray() + data = self._to_tensor() if framework == MSCFramework.TORCH: import torch # pylint: disable=import-outside-toplevel return torch.from_numpy(data).to(self.get_device(device, framework)) if 
framework == MSCFramework.TVM: - return tvm.nd.array(data, device=self.get_device(device, framework)) + return tvm.runtime.tensor(data, device=self.get_device(device, framework)) return data def get_device(self, device: str, framework: str = None) -> Any: @@ -198,7 +198,7 @@ def is_array(cls, data: Any) -> bool: Whether the data is array like. """ - normal_types = (np.ndarray, tvm.runtime.NDArray, tvm.relax.Var) + normal_types = (np.ndarray, tvm.runtime.Tensor, tvm.relax.Var) if isinstance(data, normal_types): return True if isinstance(data, (list, tuple)) and all(isinstance(d, (int, float)) for d in data): diff --git a/python/tvm/contrib/msc/framework/tensorflow/codegen/codegen.py b/python/tvm/contrib/msc/framework/tensorflow/codegen/codegen.py index f24150efcd6c..b9728b8f63cc 100644 --- a/python/tvm/contrib/msc/framework/tensorflow/codegen/codegen.py +++ b/python/tvm/contrib/msc/framework/tensorflow/codegen/codegen.py @@ -28,7 +28,7 @@ def to_tensorflow( graph: MSCGraph, - weights: Optional[Dict[str, tvm.nd.array]] = None, + weights: Optional[Dict[str, tvm.runtime.Tensor]] = None, codegen_config: Optional[Dict[str, str]] = None, print_config: Optional[Dict[str, str]] = None, build_folder: msc_utils.MSCDirectory = None, diff --git a/python/tvm/contrib/msc/framework/tensorflow/frontend/translate.py b/python/tvm/contrib/msc/framework/tensorflow/frontend/translate.py index 1accaba8595a..36e4e75491fa 100644 --- a/python/tvm/contrib/msc/framework/tensorflow/frontend/translate.py +++ b/python/tvm/contrib/msc/framework/tensorflow/frontend/translate.py @@ -34,7 +34,7 @@ def from_tensorflow( build_config: Optional[Dict[str, str]] = None, opt_config: Optional[Dict[str, str]] = None, as_msc: bool = True, -) -> Tuple[Union[MSCGraph, tvm.IRModule], Dict[str, tvm.nd.array]]: +) -> Tuple[Union[MSCGraph, tvm.IRModule], Dict[str, tvm.runtime.Tensor]]: """Change tensorflow GraphDef to MSCGraph. Parameters diff --git a/python/tvm/contrib/msc/framework/tensorflow/runtime/runner.py b/python/tvm/contrib/msc/framework/tensorflow/runtime/runner.py index 2297b3e82523..eeee4635ab4e 100644 --- a/python/tvm/contrib/msc/framework/tensorflow/runtime/runner.py +++ b/python/tvm/contrib/msc/framework/tensorflow/runtime/runner.py @@ -88,7 +88,7 @@ def destory(self): super().destory() def _generate_model( - self, graphs: List[MSCGraph], weights: Dict[str, tvm.nd.array] + self, graphs: List[MSCGraph], weights: Dict[str, tvm.runtime.Tensor] ) -> tf_v1.Graph: """Codegen the model according to framework @@ -96,7 +96,7 @@ def _generate_model( ------- graphs: list The msc graphs. - weights: dict + weights: dict The weights. 
Returns diff --git a/python/tvm/contrib/msc/framework/tensorrt/codegen/codegen.py b/python/tvm/contrib/msc/framework/tensorrt/codegen/codegen.py index 4643d49c1e83..a3cd7224953c 100644 --- a/python/tvm/contrib/msc/framework/tensorrt/codegen/codegen.py +++ b/python/tvm/contrib/msc/framework/tensorrt/codegen/codegen.py @@ -33,7 +33,7 @@ def to_sub_tensorrt( graph: MSCGraph, - weights: Dict[str, tvm.nd.array], + weights: Dict[str, tvm.runtime.Tensor], codegen_config: Optional[Dict[str, str]] = None, print_config: Optional[Dict[str, str]] = None, build_folder: msc_utils.MSCDirectory = None, @@ -145,7 +145,7 @@ def _build_engine(engine_name: str, folder: msc_utils.MSCDirectory) -> str: def to_tensorrt( mod: tvm.IRModule, graphs: List[MSCGraph], - weights: Dict[str, tvm.nd.array], + weights: Dict[str, tvm.runtime.Tensor], codegen_configs: Optional[Union[Dict[str, str], List[Dict[str, str]]]] = None, print_configs: Optional[Union[Dict[str, str], List[Dict[str, str]]]] = None, extra_options: Optional[Union[Dict[str, str], List[Dict[str, str]]]] = None, diff --git a/python/tvm/contrib/msc/framework/tensorrt/frontend/translate.py b/python/tvm/contrib/msc/framework/tensorrt/frontend/translate.py index 4a02b02728de..59095aff4563 100644 --- a/python/tvm/contrib/msc/framework/tensorrt/frontend/translate.py +++ b/python/tvm/contrib/msc/framework/tensorrt/frontend/translate.py @@ -60,10 +60,10 @@ def transform_for_tensorrt( def partition_for_tensorrt( mod: tvm.IRModule, - params: Optional[Dict[str, tvm.nd.array]] = None, + params: Optional[Dict[str, tvm.runtime.Tensor]] = None, trans_config: Optional[Dict[str, str]] = None, build_config: Optional[Dict[str, str]] = None, -) -> Tuple[tvm.IRModule, List[Tuple[MSCGraph, Dict[str, tvm.nd.array]]]]: +) -> Tuple[tvm.IRModule, List[Tuple[MSCGraph, Dict[str, tvm.runtime.Tensor]]]]: """Partition module to tensorrt sub functions. Parameters diff --git a/python/tvm/contrib/msc/framework/tensorrt/runtime/runner.py b/python/tvm/contrib/msc/framework/tensorrt/runtime/runner.py index 3dd392c7d8ac..43b9d096bd9e 100644 --- a/python/tvm/contrib/msc/framework/tensorrt/runtime/runner.py +++ b/python/tvm/contrib/msc/framework/tensorrt/runtime/runner.py @@ -79,14 +79,16 @@ def make_plan(self, tool_type: str, data_loader: Any = None) -> dict: assert quantizer.calibrated, "Failed to calibrate the tenosrrt quantizer" return super().make_plan(tool_type, data_loader) - def _generate_model(self, graphs: List[MSCGraph], weights: Dict[str, tvm.nd.array]) -> Any: + def _generate_model( + self, graphs: List[MSCGraph], weights: Dict[str, tvm.runtime.Tensor] + ) -> Any: """Codegen the model according to framework Parameters ------- graphs: list The msc graphs. - weights: dict + weights: dict The weights. Returns diff --git a/python/tvm/contrib/msc/framework/tensorrt/tools/quantize/quantizer.py b/python/tvm/contrib/msc/framework/tensorrt/tools/quantize/quantizer.py index 88cc55a65e1f..259085454f18 100644 --- a/python/tvm/contrib/msc/framework/tensorrt/tools/quantize/quantizer.py +++ b/python/tvm/contrib/msc/framework/tensorrt/tools/quantize/quantizer.py @@ -67,22 +67,22 @@ def setup(self) -> dict: return super().setup() def _reset( - self, graphs: List[MSCGraph], weights: List[Dict[str, tvm.nd.array]] - ) -> Tuple[List[MSCGraph], List[Dict[str, tvm.nd.array]]]: + self, graphs: List[MSCGraph], weights: List[Dict[str, tvm.runtime.Tensor]] + ) -> Tuple[List[MSCGraph], List[Dict[str, tvm.runtime.Tensor]]]: """Reset the tool Parameters ---------- graphs: list The msc graphs. 
- weights: list> + weights: list> The weights Returns ------- graphs: list The msc graphs. - weights: list> + weights: list> The weights """ diff --git a/python/tvm/contrib/msc/framework/torch/codegen/codegen.py b/python/tvm/contrib/msc/framework/torch/codegen/codegen.py index 5ca5de400634..cac575f9e2c7 100644 --- a/python/tvm/contrib/msc/framework/torch/codegen/codegen.py +++ b/python/tvm/contrib/msc/framework/torch/codegen/codegen.py @@ -28,7 +28,7 @@ def to_torch( graph: MSCGraph, - weights: Optional[Dict[str, tvm.nd.array]] = None, + weights: Optional[Dict[str, tvm.runtime.Tensor]] = None, codegen_config: Optional[Dict[str, str]] = None, print_config: Optional[Dict[str, str]] = None, build_folder: msc_utils.MSCDirectory = None, diff --git a/python/tvm/contrib/msc/framework/torch/frontend/translate.py b/python/tvm/contrib/msc/framework/torch/frontend/translate.py index b11051376014..eb6e8b5e56b0 100644 --- a/python/tvm/contrib/msc/framework/torch/frontend/translate.py +++ b/python/tvm/contrib/msc/framework/torch/frontend/translate.py @@ -66,7 +66,7 @@ def from_torch( build_config: Optional[Dict[str, str]] = None, as_msc: bool = True, custom_convert_map: dict = None, -) -> Tuple[Union[MSCGraph, tvm.IRModule], Dict[str, tvm.nd.array]]: +) -> Tuple[Union[MSCGraph, tvm.IRModule], Dict[str, tvm.runtime.Tensor]]: """Change torch nn.Module to MSCGraph. Parameters diff --git a/python/tvm/contrib/msc/framework/torch/runtime/runner.py b/python/tvm/contrib/msc/framework/torch/runtime/runner.py index a4d37d08f521..de1356f08d06 100644 --- a/python/tvm/contrib/msc/framework/torch/runtime/runner.py +++ b/python/tvm/contrib/msc/framework/torch/runtime/runner.py @@ -37,7 +37,7 @@ class TorchRunner(ModelRunner): """Runner of Torch""" - def _translate(self, mod: tvm.IRModule) -> Tuple[List[MSCGraph], Dict[str, tvm.nd.array]]: + def _translate(self, mod: tvm.IRModule) -> Tuple[List[MSCGraph], Dict[str, tvm.runtime.Tensor]]: """Translate IRModule to MSCgraphs Parameters @@ -49,7 +49,7 @@ def _translate(self, mod: tvm.IRModule) -> Tuple[List[MSCGraph], Dict[str, tvm.n ------- graph_list: list The translated graphs - weights_list: list> + weights_list: list> The translated weights """ graphs, weights = super()._translate(mod) @@ -107,12 +107,12 @@ def _call_runnable( ] return runnable(*torch_inputs) - def _get_runtime_params(self) -> Dict[str, tvm.nd.array]: + def _get_runtime_params(self) -> Dict[str, tvm.runtime.Tensor]: """Get the runtime parameters Returns ------- - params: dict + params: dict The parameters from runtime. 
""" diff --git a/python/tvm/contrib/msc/framework/tvm/codegen/codegen.py b/python/tvm/contrib/msc/framework/tvm/codegen/codegen.py index 3c964464043a..31c2cc619ea8 100644 --- a/python/tvm/contrib/msc/framework/tvm/codegen/codegen.py +++ b/python/tvm/contrib/msc/framework/tvm/codegen/codegen.py @@ -26,7 +26,7 @@ def to_relax( graph: MSCGraph, - weights: Optional[Dict[str, tvm.nd.array]] = None, + weights: Optional[Dict[str, tvm.runtime.Tensor]] = None, codegen_config: Optional[Dict[str, str]] = None, print_config: Optional[Dict[str, str]] = None, build_folder: msc_utils.MSCDirectory = None, diff --git a/python/tvm/contrib/msc/framework/tvm/runtime/runner.py b/python/tvm/contrib/msc/framework/tvm/runtime/runner.py index c6ae512a64e6..a27200d7b6a5 100644 --- a/python/tvm/contrib/msc/framework/tvm/runtime/runner.py +++ b/python/tvm/contrib/msc/framework/tvm/runtime/runner.py @@ -49,7 +49,7 @@ def __init__(self, runnable: tvm.relax.VirtualMachine, entry: str = "main"): self._runnable = runnable self._entry = entry - def __call__(self, *inputs) -> List[tvm.nd.array]: + def __call__(self, *inputs) -> List[tvm.runtime.Tensor]: execute_step("before_forward", *inputs) output = self._runnable[self._entry](*inputs) return execute_step("after_forward", output) @@ -250,13 +250,13 @@ def run_native( with tvm.transform.PassContext(opt_level=3): relax_exec = tvm.compile(model, target) runnable = tvm.relax.VirtualMachine(relax_exec, tvm.cuda()) - tvm_inputs = [tvm.nd.array(inputs[i], device=tvm.cuda()) for i in input_names] + tvm_inputs = [tvm.runtime.tensor(inputs[i], device=tvm.cuda()) for i in input_names] else: target = tvm.target.Target("llvm") with tvm.transform.PassContext(opt_level=3): relax_exec = tvm.compile(model, target) runnable = tvm.relax.VirtualMachine(relax_exec, tvm.cpu()) - tvm_inputs = [tvm.nd.array(inputs[i]) for i in input_names] + tvm_inputs = [tvm.runtime.tensor(inputs[i]) for i in input_names] def _run_once(): return runnable["main"](*tvm_inputs) @@ -271,7 +271,7 @@ def _run_once(): else: outputs = _run_once() avg_time = -1 - if isinstance(outputs, tvm.runtime.NDArray): + if isinstance(outputs, tvm.runtime.Tensor): outputs = [outputs] assert len(output_names) == len(outputs), "Outputs mismatch, {} with {}".format( output_names, len(outputs) diff --git a/python/tvm/contrib/msc/framework/tvm/tools/quantize/method.py b/python/tvm/contrib/msc/framework/tvm/tools/quantize/method.py index d56193d9f7c1..cc9e7e818355 100644 --- a/python/tvm/contrib/msc/framework/tvm/tools/quantize/method.py +++ b/python/tvm/contrib/msc/framework/tvm/tools/quantize/method.py @@ -81,9 +81,9 @@ def get_quantize_cache( scale_tensor = scale_tensor.astype(quantizer.find_tensor(name).dtype_name) zero_point = np.zeros_like(scale_tensor).astype("int8") scale_span = _ffi_api.SpanCreateWithAttr("name", name_prefix + "_scale") - scale_tensor = tvm.relax.Constant(tvm.nd.array(scale_tensor), span=scale_span) + scale_tensor = tvm.relax.Constant(tvm.runtime.tensor(scale_tensor), span=scale_span) zp_span = _ffi_api.SpanCreateWithAttr("name", name_prefix + "_zero_point") - zero_point = tvm.relax.Constant(tvm.nd.array(zero_point), span=zp_span) + zero_point = tvm.relax.Constant(tvm.runtime.tensor(zero_point), span=zp_span) quantizer._save_tensor_cache(name, consumer, "scale_tensor", scale_tensor) quantizer._save_tensor_cache(name, consumer, "zero_point", zero_point) return scale_tensor, zero_point diff --git a/python/tvm/contrib/msc/framework/tvm/tools/quantize/quantizer.py 
b/python/tvm/contrib/msc/framework/tvm/tools/quantize/quantizer.py index 173dc7c3d9e8..58fbd96c3741 100644 --- a/python/tvm/contrib/msc/framework/tvm/tools/quantize/quantizer.py +++ b/python/tvm/contrib/msc/framework/tvm/tools/quantize/quantizer.py @@ -85,8 +85,8 @@ def _execute_after_build( return super()._execute_after_build(output + gather_tensors) def _execute_after_forward( - self, outputs: List[tvm.runtime.NDArray] - ) -> Union[tvm.runtime.NDArray, List[tvm.runtime.NDArray]]: + self, outputs: List[tvm.runtime.Tensor] + ) -> Union[tvm.runtime.Tensor, List[tvm.runtime.Tensor]]: """Execute after model forward Parameters diff --git a/python/tvm/contrib/msc/framework/tvm/tools/track/tracker.py b/python/tvm/contrib/msc/framework/tvm/tools/track/tracker.py index 2bb0de02be22..39b8e4034b56 100644 --- a/python/tvm/contrib/msc/framework/tvm/tools/track/tracker.py +++ b/python/tvm/contrib/msc/framework/tvm/tools/track/tracker.py @@ -83,8 +83,8 @@ def _execute_after_build( return super()._execute_after_build(output + track_tensors) def _execute_after_forward( - self, outputs: List[tvm.runtime.NDArray] - ) -> Union[tvm.runtime.NDArray, List[tvm.runtime.NDArray]]: + self, outputs: List[tvm.runtime.Tensor] + ) -> Union[tvm.runtime.Tensor, List[tvm.runtime.Tensor]]: """Execute after model forward Parameters diff --git a/python/tvm/contrib/tflite_runtime.py b/python/tvm/contrib/tflite_runtime.py index 81c43861c47a..076946214678 100644 --- a/python/tvm/contrib/tflite_runtime.py +++ b/python/tvm/contrib/tflite_runtime.py @@ -86,7 +86,7 @@ def set_input(self, index, value): value : the input value. The input key - params : dict of str to NDArray + params : dict of str to Tensor Additonal arguments """ self._set_input(index, value) @@ -96,7 +96,7 @@ def invoke(self): Parameters ---------- - input_dict: dict of str to NDArray + input_dict: dict of str to Tensor List of input values to be feed to """ self._invoke() diff --git a/python/tvm/contrib/tvmjs.py b/python/tvm/contrib/tvmjs.py index e24b88a3f8c3..a72eafd2bf75 100644 --- a/python/tvm/contrib/tvmjs.py +++ b/python/tvm/contrib/tvmjs.py @@ -71,7 +71,7 @@ def _calculate_md5(filename): return hash_md5.hexdigest() -class NDArrayCacheShardingManager: +class TensorCacheShardingManager: """Internal helper to shard ndarrays.""" def __init__( @@ -198,10 +198,10 @@ def pending_nbytes(self): return len(self.curr_data) -def dump_ndarray_cache( +def dump_tensor_cache( params: Union[ - Mapping[str, Union[np.ndarray, tvm.runtime.NDArray]], - Iterator[Tuple[str, Union[np.ndarray, tvm.runtime.NDArray]]], + Mapping[str, Union[np.ndarray, tvm.runtime.Tensor]], + Iterator[Tuple[str, Union[np.ndarray, tvm.runtime.Tensor]]], ], cache_dir: str, encode_format="f32-to-bf16", @@ -210,13 +210,13 @@ def dump_ndarray_cache( show_progress: bool = True, update_if_exists: bool = False, ): - """Dump parameters to NDArray cache. + """Dump parameters to Tensor cache. 
Parameters ---------- params: Union[ - Mapping[str, Union[np.ndarray, tvm.runtime.NDArray]], - Iterator[Tuple[str, Union[np.ndarray, tvm.runtime.NDArray]]], + Mapping[str, Union[np.ndarray, tvm.runtime.Tensor]], + Iterator[Tuple[str, Union[np.ndarray, tvm.runtime.Tensor]]], ] The parameter dictionary or generator @@ -257,7 +257,7 @@ def dump_ndarray_cache( print("Start storing to cache %s" % cache_dir) shard_cap_nbytes = shard_cap_mb * (1 << 20) - nd_cache_json = os.path.join(cache_dir, "ndarray-cache.json") + nd_cache_json = os.path.join(cache_dir, "tensor-cache.json") if update_if_exists and os.path.exists(nd_cache_json): with open(nd_cache_json, "r") as infile: old_data = json.load(infile) @@ -265,7 +265,7 @@ def dump_ndarray_cache( meta_data = old_data["metadata"] records = old_data["records"] - shard_manager = NDArrayCacheShardingManager( + shard_manager = TensorCacheShardingManager( cache_dir, "params_shard", shard_cap_nbytes, initial_shard_records=records ) @@ -277,7 +277,7 @@ def dump_ndarray_cache( v = v.numpy() # prefer to preserve original dtype, especially if the format was bfloat16 - dtype = origin_v.dtype if isinstance(origin_v, tvm.nd.NDArray) else v.dtype + dtype = origin_v.dtype if isinstance(origin_v, tvm.runtime.Tensor) else v.dtype if dtype in DataType.NUMPY_DTYPE_TO_STR: dtype = DataType.NUMPY_DTYPE_TO_STR[dtype] @@ -325,15 +325,15 @@ def dump_ndarray_cache( if item["dtype"] == "float32": item["format"] = "raw" item["dtype"] = "bfloat16" - b16_nd_cache_json = os.path.join(cache_dir, "ndarray-cache-b16.json") + b16_nd_cache_json = os.path.join(cache_dir, "tensor-cache-b16.json") # also dump a file that contains bf16 with open(b16_nd_cache_json, "w") as outfile: json.dump({"metadata": meta_data, "records": records}, outfile, indent=4) print("Also saved a bf16 record to %s" % b16_nd_cache_json) -def load_ndarray_cache(cachepath: str, device: tvm.runtime.Device): - """Load the ndarray cache from the directory or json. +def load_tensor_cache(cachepath: str, device: tvm.runtime.Device): + """Load the tensor cache from the directory or json. Parameters @@ -345,7 +345,7 @@ def load_ndarray_cache(cachepath: str, device: tvm.runtime.Device): The device we would like to load the data from. 
""" if not cachepath.endswith(".json"): - cachepath = os.path.join(cachepath, "ndarray-cache.json") + cachepath = os.path.join(cachepath, "tensor-cache.json") cachedir = os.path.dirname(cachepath) json_info = json.loads(open(cachepath, "r").read()) @@ -366,7 +366,7 @@ def load_ndarray_cache(cachepath: str, device: tvm.runtime.Device): offset = rec["byteOffset"] nbytes = rec["nbytes"] - arr = tvm.nd.empty(shape, dtype, device=device) + arr = tvm.runtime.empty(shape, dtype, device=device) assert offset + nbytes <= len(raw_data) buffer_source = raw_data[offset : offset + nbytes] if dtype == "float8_e4m3fn": diff --git a/python/tvm/dlight/benchmark/bench.py b/python/tvm/dlight/benchmark/bench.py index 7ab50d412575..ea9f4299b24f 100644 --- a/python/tvm/dlight/benchmark/bench.py +++ b/python/tvm/dlight/benchmark/bench.py @@ -106,7 +106,7 @@ def benchmark( input_infos = populuate_input_shape(args, dym_var_sample) # generate input tensors, including scalars # scalars are appended to the end of the list due to parsing order - input_tensors: List[Union[tvm.nd.NDArray, int]] = [] + input_tensors: List[Union[tvm.runtime.Tensor, int]] = [] scalar_input_tensors: List[int] = [] for input_shape, input_dtype in input_infos: if input_dtype == "scalar": @@ -116,7 +116,7 @@ def benchmark( else: # normal case like [1, n, 128], generate random tensor input_tensors.append( - tvm.nd.array(generate_input_data(list(input_shape), input_dtype), device=dev) + tvm.runtime.tensor(generate_input_data(list(input_shape), input_dtype), device=dev) ) # append scalar input tensors for rotary embedding input_tensors.extend(scalar_input_tensors) @@ -144,7 +144,7 @@ def benchmark( _, profile_result = rpc_run( rt_mod, device_type=dev.DEVICE_TYPE_TO_NAME[dev.device_type], - args=[w.numpy() if isinstance(w, tvm.nd.NDArray) else w for w in input_tensors], + args=[w.numpy() if isinstance(w, tvm.runtime.Tensor) else w for w in input_tensors], rpc_config=rpc_config, evaluator_config=evaluator_config, ) diff --git a/python/tvm/exec/disco_worker.py b/python/tvm/exec/disco_worker.py index fc22a50d9bf4..9c47627548ab 100644 --- a/python/tvm/exec/disco_worker.py +++ b/python/tvm/exec/disco_worker.py @@ -23,8 +23,8 @@ import tvm from tvm_ffi import get_global_func, register_func -from tvm.runtime import NDArray, ShapeTuple, String -from tvm.runtime.ndarray import array +from tvm.runtime import Tensor, ShapeTuple, String +from tvm.runtime.tensor import tensor @register_func("tests.disco.add_one", override=True) @@ -37,9 +37,9 @@ def _add_one_float(x: float): return x + 0.5 -@register_func("tests.disco.add_one_ndarray", override=True) -def _add_one_ndarray(x: NDArray) -> NDArray: - return array(x.numpy() + 1) +@register_func("tests.disco.add_one_tensor", override=True) +def _add_one_tensor(x: Tensor) -> Tensor: + return tensor(x.numpy() + 1) @register_func("tests.disco.str", override=True) @@ -60,7 +60,7 @@ def _shape_tuple_func(x: ShapeTuple): @register_func("tests.disco.test_callback", override=True) -def _make_callback(device: tvm.runtime.Device) -> Callable[[str, int], NDArray]: +def _make_callback(device: tvm.runtime.Device) -> Callable[[str, int], Tensor]: """For use in tests/python/disco/test_callback.py This function simulates a callback to be used for lazy parameter @@ -75,7 +75,7 @@ def _make_callback(device: tvm.runtime.Device) -> Callable[[str, int], NDArray]: Returns ------- - fget_item: Callable[[str,int], NDArray] + fget_item: Callable[[str,int], Tensor] A callback function that accepts a parameter's name and index, and returns 
the specified parameter. @@ -83,7 +83,7 @@ def _make_callback(device: tvm.runtime.Device) -> Callable[[str, int], NDArray]: """ import numpy as np # pylint: disable=import-outside-toplevel - def fget_item(param_name: str, param_index: int) -> NDArray: + def fget_item(param_name: str, param_index: int) -> Tensor: if param_index == 0: assert param_name == "A" arr = np.arange(16).reshape([4, 4]).astype("int32") @@ -92,7 +92,7 @@ def fget_item(param_name: str, param_index: int) -> NDArray: arr = np.arange(4).reshape([2, 2]).astype("float32") else: raise ValueError(f"Unexpected index {param_index}") - return tvm.nd.array(arr, device=device) + return tvm.runtime.tensor(arr, device=device) return fget_item diff --git a/python/tvm/exec/rpc_proxy.py b/python/tvm/exec/rpc_proxy.py index fd3ec55ba655..f8b4507f8e2f 100644 --- a/python/tvm/exec/rpc_proxy.py +++ b/python/tvm/exec/rpc_proxy.py @@ -40,7 +40,7 @@ def find_example_resource(): # recursively apend things in www, up to two levels resource_bases = [ os.path.join(base_path, "web", "dist", "www"), - os.path.join(base_path, "web", ".ndarray_cache"), + os.path.join(base_path, "web", ".tensor_cache"), ] for base in resource_bases: if not os.path.isdir(base): diff --git a/python/tvm/ir/base.py b/python/tvm/ir/base.py index 5e7996cf94e2..651ab392039c 100644 --- a/python/tvm/ir/base.py +++ b/python/tvm/ir/base.py @@ -195,7 +195,7 @@ def structural_equal(lhs, rhs, map_free_vars=False): return bool(_ffi_node_api.StructuralEqual(lhs, rhs, False, map_free_vars)) # type: ignore # pylint: disable=no-member -def get_first_structural_mismatch(lhs, rhs, map_free_vars=False, skip_ndarray_content=False): +def get_first_structural_mismatch(lhs, rhs, map_free_vars=False, skip_tensor_content=False): """Like structural_equal(), but returns the AccessPath pair of the first detected mismatch. Parameters @@ -210,7 +210,7 @@ def get_first_structural_mismatch(lhs, rhs, map_free_vars=False, skip_ndarray_co Whether free variables (i.e. variables without a definition site) should be mapped as equal to each other. - skip_ndarray_content : bool + skip_tensor_content : bool Whether to skip the content of ndarray. Returns @@ -221,7 +221,7 @@ def get_first_structural_mismatch(lhs, rhs, map_free_vars=False, skip_ndarray_co """ lhs = tvm.runtime.convert(lhs) rhs = tvm.runtime.convert(rhs) - return _ffi_node_api.GetFirstStructuralMismatch(lhs, rhs, map_free_vars, skip_ndarray_content) # type: ignore # pylint: disable=no-member + return _ffi_node_api.GetFirstStructuralMismatch(lhs, rhs, map_free_vars, skip_tensor_content) # type: ignore # pylint: disable=no-member def assert_structural_equal(lhs, rhs, map_free_vars=False): diff --git a/python/tvm/meta_schedule/builder/builder.py b/python/tvm/meta_schedule/builder/builder.py index 3383ef55ada0..39493781404a 100644 --- a/python/tvm/meta_schedule/builder/builder.py +++ b/python/tvm/meta_schedule/builder/builder.py @@ -23,7 +23,7 @@ # isort: on from tvm_ffi import register_object from tvm.ir import IRModule -from tvm.runtime import NDArray, Object +from tvm.runtime import Tensor, Object from tvm.target import Target from .. import _ffi_api @@ -39,19 +39,19 @@ class BuilderInput(Object): The IRModule to be built. target : Target The target to be built for. 
- params: Optional[Dict[str, NDArray]] + params: Optional[Dict[str, Tensor]] The parameters for Relax build module """ mod: IRModule target: Target - params: Optional[Dict[str, NDArray]] + params: Optional[Dict[str, Tensor]] def __init__( self, mod: IRModule, target: Target, - params: Optional[Dict[str, NDArray]] = None, + params: Optional[Dict[str, Tensor]] = None, ) -> None: """Constructor. @@ -61,7 +61,7 @@ def __init__( The IRModule to be built. target : Target The target to be built for. - params: Optional[Dict[str, NDArray]] + params: Optional[Dict[str, Tensor]] The parameters for Relax build module """ self.__init_handle_by_constructor__( diff --git a/python/tvm/meta_schedule/builder/local_builder.py b/python/tvm/meta_schedule/builder/local_builder.py index 297d6cb61028..cda8d21838cb 100644 --- a/python/tvm/meta_schedule/builder/local_builder.py +++ b/python/tvm/meta_schedule/builder/local_builder.py @@ -21,7 +21,7 @@ from tvm_ffi import register_func from tvm.ir import IRModule -from tvm.runtime import Module, NDArray, load_param_dict, save_param_dict +from tvm.runtime import Module, Tensor, load_param_dict, save_param_dict from tvm.target import Target from ...contrib.popen_pool import MapResult, PopenPoolExecutor, StatusKind @@ -33,18 +33,18 @@ T_BUILD = Callable[ # pylint: disable=invalid-name - [IRModule, Target, Optional[Dict[str, NDArray]]], Module + [IRModule, Target, Optional[Dict[str, Tensor]]], Module ] T_EXPORT = Callable[[Module], str] # pylint: disable=invalid-name -def _serialize_params(params: Optional[Dict[str, NDArray]]) -> Optional[bytearray]: +def _serialize_params(params: Optional[Dict[str, Tensor]]) -> Optional[bytearray]: if params is None: return None return save_param_dict(params) -def _deserialize_params(params: Optional[bytearray]) -> Optional[Dict[str, NDArray]]: +def _deserialize_params(params: Optional[bytearray]) -> Optional[Dict[str, Tensor]]: if params is None: return None return load_param_dict(params) @@ -81,7 +81,7 @@ class LocalBuilder(PyBuilder): def default_build( mod: IRModule, target: Target, - params: Optional[Dict[str, NDArray]] + params: Optional[Dict[str, Tensor]] ) -> Module: ... @@ -235,7 +235,7 @@ def _worker_func( @register_func("meta_schedule.builder.default_build") -def default_build(mod: IRModule, target: Target, _params: Optional[Dict[str, NDArray]]) -> Module: +def default_build(mod: IRModule, target: Target, _params: Optional[Dict[str, Tensor]]) -> Module: """Default build function. Parameters @@ -244,7 +244,7 @@ def default_build(mod: IRModule, target: Target, _params: Optional[Dict[str, NDA The IRModule to be built. target : Target The target to be built. - _params : Optional[Dict[str, NDArray]] + _params : Optional[Dict[str, Tensor]] The parameters to be used for the build. Must be None. 
Returns @@ -257,7 +257,7 @@ def default_build(mod: IRModule, target: Target, _params: Optional[Dict[str, NDA from tvm.tir.transform import RemoveWeightLayoutRewriteBlock # pylint: enable=import-outside-toplevel - mod = RemoveWeightLayoutRewriteBlock(skip_ndarray_rewrite=True)(mod) + mod = RemoveWeightLayoutRewriteBlock(skip_tensor_rewrite=True)(mod) return tvm_build(mod, target=target) diff --git a/python/tvm/meta_schedule/cost_model/mlp_model.py b/python/tvm/meta_schedule/cost_model/mlp_model.py index 9191eee6a68f..ef846a6c7c5f 100644 --- a/python/tvm/meta_schedule/cost_model/mlp_model.py +++ b/python/tvm/meta_schedule/cost_model/mlp_model.py @@ -32,7 +32,7 @@ import tvm from ...contrib.tar import tar, untar -from ...runtime import NDArray +from ...runtime import Tensor from ...target import Target from ..cost_model import PyCostModel from ..database import JSONDatabase @@ -441,7 +441,7 @@ def extract_features( """ extractor = extractor or PerStoreFeature(extract_workload=True) - def _feature(feature: NDArray) -> np.ndarray: + def _feature(feature: Tensor) -> np.ndarray: return feature.numpy().astype("float32") def _mean_cost(res: RunnerResult) -> float: diff --git a/python/tvm/meta_schedule/cost_model/xgb_model.py b/python/tvm/meta_schedule/cost_model/xgb_model.py index 5806454cdddb..a14dceef379f 100644 --- a/python/tvm/meta_schedule/cost_model/xgb_model.py +++ b/python/tvm/meta_schedule/cost_model/xgb_model.py @@ -26,7 +26,7 @@ import numpy as np # type: ignore from ...contrib.tar import tar, untar -from ...runtime import NDArray +from ...runtime import Tensor from ..cost_model import PyCostModel from ..feature_extractor import FeatureExtractor from ..logging import get_logger @@ -484,7 +484,7 @@ def update( group = self.data.get(new_group_hash, None) # Step 2. Extract features - def _feature(x: NDArray) -> np.ndarray: + def _feature(x: Tensor) -> np.ndarray: return x.numpy().astype("float32") def _mean_cost(x: RunnerResult) -> float: diff --git a/python/tvm/meta_schedule/database/json_database.py b/python/tvm/meta_schedule/database/json_database.py index cdf08c6e0335..7c6f7459cacc 100644 --- a/python/tvm/meta_schedule/database/json_database.py +++ b/python/tvm/meta_schedule/database/json_database.py @@ -38,10 +38,10 @@ class JSONDatabase(Database): A string to specify the module equality testing and hashing method. It must be one of the followings: - "structural": Use StructuralEqual/Hash - - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during + - "ignore-tensor": Same as "structural", but ignore tensor raw data during equality testing and hashing. - "anchor-block": Apply equality testing and hashing on the anchor block extracted from a - given module. The "ignore-ndarray" varint is used for the extracted + given module. The "ignore-tensor" varint is used for the extracted blocks or in case no anchor block is found. For the definition of the anchor block, see tir/analysis/analysis.py. """ diff --git a/python/tvm/meta_schedule/database/memory_database.py b/python/tvm/meta_schedule/database/memory_database.py index 69b129ec215f..1d6d4121231c 100644 --- a/python/tvm/meta_schedule/database/memory_database.py +++ b/python/tvm/meta_schedule/database/memory_database.py @@ -31,10 +31,10 @@ class MemoryDatabase(Database): A string to specify the module equality testing and hashing method. 
It must be one of the followings: - "structural": Use StructuralEqual/Hash - - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during + - "ignore-tensor": Same as "structural", but ignore tensor raw data during equality testing and hashing. - "anchor-block": Apply equality testing and hashing on the anchor block extracted from a - given module. The "ignore-ndarray" varint is used for the extracted + given module. The "ignore-tensor" varint is used for the extracted blocks or in case no anchor block is found. For the definition of the anchor block, see tir/analysis/analysis.py. """ diff --git a/python/tvm/meta_schedule/database/schedule_fn_database.py b/python/tvm/meta_schedule/database/schedule_fn_database.py index 477c5664fdf3..74b2a6eb60da 100644 --- a/python/tvm/meta_schedule/database/schedule_fn_database.py +++ b/python/tvm/meta_schedule/database/schedule_fn_database.py @@ -37,10 +37,10 @@ class ScheduleFnDatabase(Database): A string to specify the module equality testing and hashing method. It must be one of the followings: - "structural": Use StructuralEqual/Hash - - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during + - "ignore-tensor": Same as "structural", but ignore tensor raw data during equality testing and hashing. - "anchor-block": Apply equality testing and hashing on the anchor block extracted from a - given module. The "ignore-ndarray" varint is used for the extracted + given module. The "ignore-tensor" varint is used for the extracted blocks or in case no anchor block is found. For the definition of the anchor block, see tir/analysis/analysis.py. """ diff --git a/python/tvm/meta_schedule/feature_extractor/feature_extractor.py b/python/tvm/meta_schedule/feature_extractor/feature_extractor.py index d4c68fcb93e0..b50a22142943 100644 --- a/python/tvm/meta_schedule/feature_extractor/feature_extractor.py +++ b/python/tvm/meta_schedule/feature_extractor/feature_extractor.py @@ -24,7 +24,7 @@ from tvm_ffi import register_object from tvm.runtime import Object -from tvm.runtime.ndarray import NDArray +from tvm.runtime._tensor import Tensor from .. import _ffi_api from ..search_strategy import MeasureCandidate @@ -40,7 +40,7 @@ class FeatureExtractor(Object): def extract_from( self, context: TuneContext, candidates: List[MeasureCandidate] - ) -> List[NDArray]: + ) -> List[Tensor]: """Extract features from the given measure candidate. Parameters @@ -52,7 +52,7 @@ def extract_from( Returns ------- - features : List[NDArray] + features : List[Tensor] The feature tvm ndarray extracted. """ result = _ffi_api.FeatureExtractorExtractFrom( # type: ignore # pylint: disable=no-member @@ -108,7 +108,7 @@ class PyFeatureExtractor: def extract_from( self, context: TuneContext, candidates: List[MeasureCandidate] - ) -> List[NDArray]: + ) -> List[Tensor]: """Extract features from the given measure candidate. Parameters @@ -120,7 +120,7 @@ def extract_from( Returns ------- - features : List[NDArray] + features : List[Tensor] The feature tvm ndarray extracted. 
""" raise NotImplementedError diff --git a/python/tvm/meta_schedule/feature_extractor/random_feature_extractor.py b/python/tvm/meta_schedule/feature_extractor/random_feature_extractor.py index 18b84c364ad4..908dde400ec8 100644 --- a/python/tvm/meta_schedule/feature_extractor/random_feature_extractor.py +++ b/python/tvm/meta_schedule/feature_extractor/random_feature_extractor.py @@ -18,7 +18,7 @@ from typing import List, Tuple, Union import numpy as np # type: ignore -from tvm.runtime.ndarray import NDArray, array +import tvm.runtime from ..feature_extractor import PyFeatureExtractor from ..search_strategy import MeasureCandidate @@ -54,11 +54,11 @@ def __init__(self, *, feature_size: int = 30, max_block_num: int = 5, seed=0): def extract_from( self, context: TuneContext, candidates: List[MeasureCandidate] - ) -> List[NDArray]: + ) -> List[tvm.runtime.Tensor]: np.random.set_state(self.random_state) result = [ np.random.rand(np.random.randint(1, self.max_block_num + 1), self.feature_size) for candidate in candidates ] self.random_state = np.random.get_state() - return [array(x) for x in result] + return [tvm.runtime.tensor(x) for x in result] diff --git a/python/tvm/meta_schedule/relax_integration.py b/python/tvm/meta_schedule/relax_integration.py index 8d041b6caaf2..92e0e24a4cc3 100644 --- a/python/tvm/meta_schedule/relax_integration.py +++ b/python/tvm/meta_schedule/relax_integration.py @@ -26,7 +26,7 @@ from tvm_ffi import get_global_func, register_func from tvm.ir import IRModule from tvm.ir.transform import PassContext -from tvm.runtime import NDArray +from tvm.runtime import Tensor from tvm.target import Target from tvm.tir.expr import IntImm @@ -56,7 +56,7 @@ def extract_tasks( mod: Union[IRModule, "relax.Function"], target: Target, - params: Optional[Dict[str, NDArray]] = None, + params: Optional[Dict[str, Tensor]] = None, module_equality: str = "structural", ) -> List[ExtractedTask]: """Extract tuning tasks from a relax program. @@ -67,16 +67,16 @@ def extract_tasks( The module or function to tune target : tvm.target.Target The compilation target - params : Optional[Dict[str, tvm.runtime.NDArray]] + params : Optional[Dict[str, tvm.runtime.Tensor]] The associated parameters of the program module_equality : Optional[str] A string to specify the module equality testing and hashing method. It must be one of the followings: - "structural": Use StructuralEqual/Hash - - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during + - "ignore-tensor": Same as "structural", but ignore tensor raw data during equality testing and hashing. - "anchor-block": Apply equality testing and hashing on the anchor block extracted from a - given module. The "ignore-ndarray" varint is used for the extracted + given module. The "ignore-tensor" varint is used for the extracted blocks or in case no anchor block is found. For the definition of the anchor block, see tir/analysis/analysis.py. 
@@ -159,7 +159,7 @@ def extracted_tasks_to_tune_contexts( def tune_relax( mod: Union[IRModule, "relax.Function"], - params: Dict[str, NDArray], + params: Dict[str, Tensor], target: Union[str, Target], work_dir: str, max_trials_global: int, @@ -184,7 +184,7 @@ def tune_relax( ---------- mod : Union[IRModule, relax.Function] The module or function to tune - params : Optional[Dict[str, tvm.runtime.NDArray]] + params : Optional[Dict[str, tvm.runtime.Tensor]] The associated parameters of the program target : Union[Target, str] The compilation target @@ -221,10 +221,10 @@ def tune_relax( A string to specify the module equality testing and hashing method. It must be one of the followings: - "structural": Use StructuralEqual/Hash - - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during + - "ignore-tensor": Same as "structural", but ignore tensor raw data during equality testing and hashing. - "anchor-block": Apply equality testing and hashing on the anchor block extracted from a - given module. The "ignore-ndarray" variant is used for the extracted + given module. The "ignore-tensor" variant is used for the extracted blocks or in case no anchor block is found. For the definition of the anchor block, see tir/analysis/analysis.py. @@ -272,7 +272,7 @@ def tune_relax( @register_func("tvm.meta_schedule.tune_relax") def _tune_relax( mod: Union[IRModule, "relax.Function"], - params: Dict[str, NDArray], + params: Dict[str, Tensor], target: Union[str, Target], work_dir: str, max_trials_global: int, @@ -297,7 +297,7 @@ def _tune_relax( ---------- mod : Union[IRModule, relax.Function] The module or function to tune - params : Optional[Dict[str, tvm.runtime.NDArray]] + params : Optional[Dict[str, tvm.runtime.Tensor]] The associated parameters of the program target : Union[Target, str] The compilation target @@ -334,10 +334,10 @@ def _tune_relax( A string to specify the module equality testing and hashing method. It must be one of the followings: - "structural": Use StructuralEqual/Hash - - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during + - "ignore-tensor": Same as "structural", but ignore tensor raw data during equality testing and hashing. - "anchor-block": Apply equality testing and hashing on the anchor block extracted from a - given module. The "ignore-ndarray" varint is used for the extracted + given module. The "ignore-tensor" varint is used for the extracted blocks or in case no anchor block is found. For the definition of the anchor block, see tir/analysis/analysis.py. @@ -380,7 +380,7 @@ def compile_relax( database: Database, mod: IRModule, target: Union[Target, str], - params: Optional[Dict[str, NDArray]], + params: Optional[Dict[str, Tensor]], enable_warning: bool = False, ) -> "relax.VMExecutable": """Compile a relax program with a MetaSchedule database. 
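Continuing the sketch above (illustrative only, with `mod`, `params`, and the work directory as placeholders), the tune-then-compile flow that `tune_relax` and `compile_relax` describe would look roughly like this under the renamed API:

import tvm
from tvm import meta_schedule as ms

target = tvm.target.Target("llvm")
database = ms.relax_integration.tune_relax(
    mod,                    # assumed Relax IRModule
    params,                 # assumed Dict[str, tvm.runtime.Tensor]
    target=target,
    work_dir="./tune_tmp",  # placeholder directory
    max_trials_global=64,
)
executable = ms.relax_integration.compile_relax(database, mod, target, params)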
@@ -393,7 +393,7 @@ def compile_relax( The Relax program to be compiled target : tvm.target.Target The compilation target - params : Optional[Dict[str, tvm.runtime.NDArray]] + params : Optional[Dict[str, tvm.runtime.Tensor]] The associated parameters of the program enable_warning : bool A boolean value indicating if to print warnings for TIR functions not diff --git a/python/tvm/meta_schedule/runner/utils.py b/python/tvm/meta_schedule/runner/utils.py index ef0d4b5f98f7..d4af6726cee0 100644 --- a/python/tvm/meta_schedule/runner/utils.py +++ b/python/tvm/meta_schedule/runner/utils.py @@ -17,8 +17,9 @@ """Runner utility functions""" import itertools from typing import Any, Callable, Dict, List +import tvm.runtime -from ...runtime import Device, Module, ndarray +from ...runtime import Device, Module from .config import EvaluatorConfig T_ARG_INFO_JSON_OBJ = List[Any] # pylint: disable=invalid-name @@ -52,8 +53,8 @@ def alloc_argument_common( The allocation args """ - def alloc_tensor(_, dtype, shape) -> ndarray.NDArray: - arg = ndarray.empty(shape=shape, dtype=dtype, device=device) + def alloc_tensor(_, dtype, shape) -> tvm.runtime.Tensor: + arg = tvm.runtime.empty(shape=shape, dtype=dtype, device=device) f_random_fill(arg) return arg diff --git a/python/tvm/meta_schedule/testing/tune_utils.py b/python/tvm/meta_schedule/testing/tune_utils.py index 08618a289d52..4b1155b2a235 100644 --- a/python/tvm/meta_schedule/testing/tune_utils.py +++ b/python/tvm/meta_schedule/testing/tune_utils.py @@ -19,7 +19,7 @@ import numpy as np # type: ignore import tvm -from tvm.runtime import NDArray +from tvm.runtime import Tensor def generate_input_data( @@ -81,8 +81,8 @@ def create_calculator(backend: str) -> Callable: def f_calculator( rt_mod: tvm.runtime.Module, dev: tvm.runtime.Device, # pylint: disable=unused-argument - input_data: Dict[str, NDArray], - ) -> List[NDArray]: + input_data: Dict[str, Tensor], + ) -> List[Tensor]: """Fetch the result of running the given runtime module. Parameters diff --git a/python/tvm/meta_schedule/testing/validate_database.py b/python/tvm/meta_schedule/testing/validate_database.py index 4478792c5b22..e356e6c75358 100644 --- a/python/tvm/meta_schedule/testing/validate_database.py +++ b/python/tvm/meta_schedule/testing/validate_database.py @@ -205,22 +205,22 @@ def initializer() -> None: @register_func("tvm.meta_schedule.testing.default_check_metric") def default_check_metric( # pylint: disable=unused-variable,unreachable-code - lhs: List[tvm.nd.NDArray], rhs: List[tvm.nd.NDArray] + lhs: List[tvm.runtime.Tensor], rhs: List[tvm.runtime.Tensor] ) -> bool: """Check if the outputs are equal Parameters ---------- - lhs : List[tvm.nd.NDArray] - The first list of NDArrays to compare. + lhs : List[tvm.runtime.Tensor] + The first list of Tensors to compare. - rhs : List[tvm.nd.NDArray] - The second list of NDArrays to compare. + rhs : List[tvm.runtime.Tensor] + The second list of Tensors to compare. Returns ------- is_equal : bool - Whether the two lists of NDArrays are equal. + Whether the two lists of Tensors are equal. 
""" assert len(lhs) == len(rhs), "Different number of outputs from two modules" for i in range(len(lhs)): # pylint: disable=consider-using-enumerate @@ -232,7 +232,7 @@ def default_check_metric( # pylint: disable=unused-variable,unreachable-code @register_func("tvm.meta_schedule.testing.default_input_generator") def default_input_generator( # pylint: disable=unused-variable mod: IRModule, -) -> List[tvm.nd.NDArray]: +) -> List[tvm.runtime.Tensor]: """Default input generator function Parameters @@ -242,25 +242,27 @@ def default_input_generator( # pylint: disable=unused-variable Returns ------- - inputs : List[tvm.nd.NDArray] + inputs : List[tvm.runtime.Tensor] The generated input data. """ args_info = ms.arg_info.TensorInfo.from_prim_func(mod["main"]) inputs = [ - tvm.nd.array(generate_input_data(input_shape=arg_info.shape, input_dtype=arg_info.dtype)) + tvm.runtime.tensor( + generate_input_data(input_shape=arg_info.shape, input_dtype=arg_info.dtype) + ) for arg_info in args_info ] return inputs -def to_numpy(a: List[tvm.nd.NDArray]) -> List[np.ndarray]: - """Convert a list of TVM NDArray to a list of numpy array +def to_numpy(a: List[tvm.runtime.Tensor]) -> List[np.ndarray]: + """Convert a list of TVM Tensor to a list of numpy array Parameters ---------- - a : List[tvm.nd.NDArray] - The list of TVM NDArray to be converted + a : List[tvm.runtime.Tensor] + The list of TVM Tensor to be converted Returns ------- @@ -271,8 +273,8 @@ def to_numpy(a: List[tvm.nd.NDArray]) -> List[np.ndarray]: return [x.numpy() for x in a] -def to_tvm_ndarray(a: List[np.ndarray]) -> List[tvm.nd.NDArray]: - """Convert a list of numpy array to a list of TVM NDArray +def to_tvm_tensor(a: List[np.ndarray]) -> List[tvm.runtime.Tensor]: + """Convert a list of numpy array to a list of TVM Tensor Parameters ---------- @@ -281,11 +283,11 @@ def to_tvm_ndarray(a: List[np.ndarray]) -> List[tvm.nd.NDArray]: Returns ------- - b : List[tvm.nd.NDArray] - The list of TVM NDArray. + b : List[tvm.runtime.Tensor] + The list of TVM Tensor. """ - assert a is not None, "Empty result cannot be converted to TVM NDArray" - return [tvm.nd.array(x) for x in a] + assert a is not None, "Empty result cannot be converted to TVM Tensor" + return [tvm.runtime.tensor(x) for x in a] def is_failed_record(record: ms.database.TuningRecord) -> bool: @@ -436,7 +438,9 @@ def f_with_args_alloc_argument_common( args_list : List[T_ARGUMENT_LIST] The list of argument lists. 
""" - return [[tvm.nd.array(arg, device=device) for arg in inputs] for _ in range(alloc_repeat)] + return [ + [tvm.runtime.tensor(arg, device=device) for arg in inputs] for _ in range(alloc_repeat) + ] def f_with_args_run_evaluator_common( rt_mod: tvm.runtime.Module, @@ -487,8 +491,8 @@ def f_with_args_run_evaluator_common( # fetch comparison function passed = check_and_run( ARGS.check_metric_func, - to_tvm_ndarray(original_res), - to_tvm_ndarray(scheduled_res), + to_tvm_tensor(original_res), + to_tvm_tensor(scheduled_res), ) print_result( @@ -556,7 +560,7 @@ def local_build_and_run( """ # potential memory leak https://github.com/apache/tvm/issues/11096 lib = tvm.compile(mod, target=target) - tvm_inputs = [tvm.nd.array(inp, device=device) for inp in inputs] + tvm_inputs = [tvm.runtime.tensor(inp, device=device) for inp in inputs] device.sync() func = lib.time_evaluator(lib.entry_name, dev=device, number=ARGS.number, repeat=ARGS.repeat) benchmark_res = func(*tvm_inputs) diff --git a/python/tvm/meta_schedule/tune.py b/python/tvm/meta_schedule/tune.py index 78c05fed533e..2cda77ba0978 100644 --- a/python/tvm/meta_schedule/tune.py +++ b/python/tvm/meta_schedule/tune.py @@ -77,10 +77,10 @@ def tune_tasks( It must be one of the followings: - "structural": Use StructuralEqual/Hash - - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during equality + - "ignore-tensor": Same as "structural", but ignore tensor raw data during equality testing and hashing. - "anchor-block": Apply equality testing and hashing on the anchor block extracted from - a given module. The "ignore-ndarray" varint is used for the extracted blocks or in + a given module. The "ignore-tensor" varint is used for the extracted blocks or in case no anchor block is found. For the definition of the anchor block, see tir/analysis/analysis.py. post_optimization : Optional[Bool] diff --git a/python/tvm/relax/base_py_module.py b/python/tvm/relax/base_py_module.py index f463a84fc692..688dc962f23f 100644 --- a/python/tvm/relax/base_py_module.py +++ b/python/tvm/relax/base_py_module.py @@ -24,7 +24,7 @@ import tvm from tvm import relax, tir from tvm.ir import IRModule -from tvm.runtime import Device, NDArray, PackedFunc +from tvm.runtime import Device, Tensor, PackedFunc from tvm.target import Target try: @@ -38,7 +38,7 @@ class BasePyModule: This class provides the infrastructure for: 1. JIT compilation of TIR and Relax functions. - 2. DLPack-based conversion between PyTorch tensors and TVM NDArrays. + 2. DLPack-based conversion between PyTorch tensors and TVM Tensors. 3. Wrapping Relax functions for easy Python calling. 4. Cross-function calls between Python, TIR, and Relax functions. 
@@ -208,7 +208,7 @@ def call_tir(self, tir_func, args, out_sinfo): return result[0] if len(result) == 1 else result def call_dps_packed(self, func_name: str, args, out_sinfo): - """Call a packed function with PyTorch tensors, converting TVM NDArrays via DLPack.""" + """Call a packed function with PyTorch tensors, converting TVM Tensors via DLPack.""" if hasattr(self, func_name) and callable(getattr(self, func_name)): return getattr(self, func_name)(*args) @@ -269,8 +269,8 @@ def _convert_tvm_dtype_to_torch(self, tvm_dtype: str) -> "torch.dtype": def _convert_pytorch_to_tvm( self, tensors: Union[Any, List[Any], Tuple[Any, ...]] - ) -> Union[NDArray, List[NDArray]]: - """Convert PyTorch tensors to TVM NDArrays using DLPack.""" + ) -> Union[Tensor, List[Tensor]]: + """Convert PyTorch tensors to TVM Tensors using DLPack.""" # pylint: disable=import-outside-toplevel import torch @@ -278,25 +278,25 @@ def _convert_pytorch_to_tvm( return [self._convert_single_pytorch_to_tvm(t) for t in tensors] return self._convert_single_pytorch_to_tvm(tensors) - def _convert_single_pytorch_to_tvm(self, tensor: Any) -> NDArray: - """Convert a single PyTorch tensor to TVM NDArray with robust fallbacks.""" + def _convert_single_pytorch_to_tvm(self, tensor: Any) -> Tensor: + """Convert a single PyTorch tensor to TVM Tensor with robust fallbacks.""" # pylint: disable=import-outside-toplevel import torch - if isinstance(tensor, NDArray): + if isinstance(tensor, Tensor): return tensor if isinstance(tensor, torch.Tensor): # 1. Try modern `torch.to_dlpack` (preferred for PyTorch >= 1.7) try: dlpack = torch.to_dlpack(tensor) - return tvm.nd.from_dlpack(dlpack) + return tvm.runtime.from_dlpack(dlpack) except (AttributeError, ValueError): pass # Fall through to the next method # 2. Try legacy `torch.utils.dlpack.to_dlpack` if to_dlpack_legacy: try: dlpack = to_dlpack_legacy(tensor) - return tvm.nd.from_dlpack(dlpack) + return tvm.runtime.from_dlpack(dlpack) except (AttributeError, ValueError) as error_legacy: print( f"Warning: Legacy DLPack conversion failed ({error_legacy}), " @@ -304,33 +304,33 @@ def _convert_single_pytorch_to_tvm(self, tensor: Any) -> NDArray: ) # 3. 
If all DLPack methods fail, use numpy fallback numpy_array = tensor.detach().cpu().numpy() - return tvm.nd.array(numpy_array, device=self.device) + return tvm.runtime.tensor(numpy_array, device=self.device) # For other types (like scalars, lists), convert to numpy first try: numpy_array = np.array(tensor, dtype=np.float32) - return tvm.nd.array(numpy_array, device=self.device) + return tvm.runtime.tensor(numpy_array, device=self.device) except (TypeError, ValueError) as error: raise TypeError( - f"Unsupported type for conversion to TVM NDArray: {type(tensor)}" + f"Unsupported type for conversion to TVM Tensor: {type(tensor)}" ) from error def _convert_tvm_to_pytorch( self, tvm_arrays: Union[Any, List[Any]] ) -> Union["torch.Tensor", List["torch.Tensor"]]: - """Convert TVM NDArrays to PyTorch tensors using DLPack.""" + """Convert TVM Tensors to PyTorch tensors using DLPack.""" if isinstance(tvm_arrays, (list, tuple)): return [self._convert_single_tvm_to_pytorch(arr) for arr in tvm_arrays] return self._convert_single_tvm_to_pytorch(tvm_arrays) def _convert_single_tvm_to_pytorch(self, tvm_array: Any) -> "torch.Tensor": - """Convert a single TVM NDArray to PyTorch tensor using DLPack.""" + """Convert a single TVM Tensor to PyTorch tensor using DLPack.""" # pylint: disable=import-outside-toplevel import torch if isinstance(tvm_array, torch.Tensor): return tvm_array - if not isinstance(tvm_array, NDArray): + if not isinstance(tvm_array, Tensor): return torch.tensor(tvm_array) try: dlpack = tvm_array.to_dlpack() diff --git a/python/tvm/relax/exec_builder.py b/python/tvm/relax/exec_builder.py index 43f9a2e693b1..50d6c0679eca 100644 --- a/python/tvm/relax/exec_builder.py +++ b/python/tvm/relax/exec_builder.py @@ -106,7 +106,7 @@ def convert_constant(self, const: object) -> int: def emit_call( self, name: str, - args: Optional[List[Union[tvm.nd.NDArray, tvm.DataType]]] = None, + args: Optional[List[Union[tvm.runtime.Tensor, tvm.DataType]]] = None, dst: int = None, ) -> None: """emit a call instruction which calls a packed function.""" @@ -120,7 +120,7 @@ def emit_call( shape_tuple = ShapeTuple(arg) new_arg = self.convert_constant(shape_tuple) args_.append(new_arg) - elif isinstance(arg, (tvm.nd.NDArray, tvm.DataType, ShapeTuple)): + elif isinstance(arg, (tvm.runtime.Tensor, tvm.DataType, ShapeTuple)): new_arg = self.convert_constant(arg) args_.append(new_arg) else: diff --git a/python/tvm/relax/expr.py b/python/tvm/relax/expr.py index 051e49f81c83..2b78996f2974 100644 --- a/python/tvm/relax/expr.py +++ b/python/tvm/relax/expr.py @@ -25,8 +25,8 @@ import tvm.ir import tvm.relax from tvm import DataType +import tvm.runtime from tvm.runtime import Object -from tvm.runtime import ndarray as _nd from ..ir import BaseFunc, Node, Span from ..runtime import Scriptable, String @@ -713,7 +713,7 @@ class Constant(ExprWithOp): Parameters ---------- - data: tvm.nd.NDArray + data: tvm.runtime.Tensor The data of the constant tensor. struct_info: Optional[StructInfo] @@ -727,12 +727,12 @@ class Constant(ExprWithOp): Scalar constants are represented by ndim-0 constant tensors. 
""" - data: tvm.nd.NDArray + data: tvm.runtime.Tensor span: Optional[Span] def __init__( self, - data: tvm.nd.NDArray, + data: tvm.runtime.Tensor, struct_info: Optional[StructInfo] = None, span: Optional[Span] = None, ) -> None: @@ -1056,7 +1056,7 @@ def bind_params( self, binding_map: Mapping[ Union[str, Var], - Union[int, float, PrimExpr, tvm.runtime.NDArray, _np.ndarray, Expr], + Union[int, float, PrimExpr, tvm.runtime.Tensor, _np.ndarray, Expr], ], ) -> "Function": """Return a new function with updated symbolic variable @@ -1065,7 +1065,7 @@ def bind_params( ---------- binding_map: Mapping[ Union[str, Var], - Union[int, float, PrimExpr, tvm.runtime.NDArray, _np.ndarray, Expr], + Union[int, float, PrimExpr, tvm.runtime.Tensor, _np.ndarray, Expr], ] The mapping of values to be replaced. @@ -1093,7 +1093,7 @@ def _normalize_value(value): # Relax uses int64 for symbolic variables, but the FFI # converts python integers into int32. return tvm.tir.const(value, "int64") - elif isinstance(value, (_np.ndarray, tvm.nd.NDArray)): + elif isinstance(value, (_np.ndarray, tvm.runtime.Tensor)): return tvm.relax.const(value) else: return value @@ -1132,13 +1132,13 @@ def extern(name: str, struct_info: Optional[StructInfo] = None, span: Optional[S def const( - value: Union[bool, int, float, _np.ndarray, tvm.nd.NDArray], dtype: Optional[str] = None + value: Union[bool, int, float, _np.ndarray, tvm.runtime.Tensor], dtype: Optional[str] = None ) -> Constant: """Create a constant value. Parameters ---------- - value: Union[bool, int, float, numpy.ndarray, tvm.nd.NDArray] + value: Union[bool, int, float, numpy.ndarray, tvm.runtime.Tensor] The constant value. dtype: Optional[str] @@ -1168,10 +1168,10 @@ def const( if isinstance(value, (_np.ndarray, _np.generic)): if dtype is not None: value = value.astype(dtype) - value = _nd.array(value) + value = tvm.runtime.tensor(value) - if not isinstance(value, _nd.NDArray): - raise ValueError("value has to be scalar or NDArray") + if not isinstance(value, tvm.runtime.Tensor): + raise ValueError("value has to be scalar or Tensor") return Constant(value) diff --git a/python/tvm/relax/frontend/common.py b/python/tvm/relax/frontend/common.py index ba2960c159fc..c1e9296ca3a5 100644 --- a/python/tvm/relax/frontend/common.py +++ b/python/tvm/relax/frontend/common.py @@ -23,7 +23,7 @@ from tvm import topi -def detach_params(mod: tvm.IRModule) -> Tuple[tvm.IRModule, Dict[str, List[tvm.nd.NDArray]]]: +def detach_params(mod: tvm.IRModule) -> Tuple[tvm.IRModule, Dict[str, List[tvm.runtime.Tensor]]]: """Detach the attribute "params" in the functions of the input IRModule as separate dictionary of params. @@ -37,7 +37,7 @@ def detach_params(mod: tvm.IRModule) -> Tuple[tvm.IRModule, Dict[str, List[tvm.n detached_mod : tvm.IRModule The IRModule after the detachment. - params_dict : Dict[str, List[tvm.nd.NDArray]] + params_dict : Dict[str, List[tvm.runtime.Tensor]] The detached params. The dict keys corresponds to the names of the functions in the input IRModule that have attribute "params". """ @@ -46,10 +46,8 @@ def detach_params(mod: tvm.IRModule) -> Tuple[tvm.IRModule, Dict[str, List[tvm.n for gv, func in mod.functions_items(): if "params" in func.attrs: params = list(func.attrs["params"]) - if not all([isinstance(param, tvm.nd.NDArray) for param in params]): - raise ValueError( - 'The value "params" attribute is expected to be a list of NDArray.' 
- ) + if not all([isinstance(param, tvm.runtime.Tensor) for param in params]): + raise ValueError('The value "params" attribute is expected to be a list of Tensor.') params_dict[gv.name_hint] = params detached_mod[gv] = func.without_attr("params") else: diff --git a/python/tvm/relax/frontend/nn/core.py b/python/tvm/relax/frontend/nn/core.py index 068b2090db5b..b2904fe2a9be 100644 --- a/python/tvm/relax/frontend/nn/core.py +++ b/python/tvm/relax/frontend/nn/core.py @@ -42,9 +42,9 @@ from tvm import tir from tvm.ir import IRModule from tvm.ir.transform import Pass -from tvm.runtime import Device, NDArray +import tvm.runtime +from tvm.runtime import Device from tvm.runtime import device as as_device -from tvm.runtime import ndarray from tvm.runtime.vm import VirtualMachine from tvm.target import Target @@ -225,7 +225,7 @@ class Parameter(Tensor): it is called a bound parameter, otherwise it is called an unbound parameter. """ - _data: Optional[NDArray] + _data: Optional[Tensor] attrs: Dict[str, Any] def __init__( @@ -251,16 +251,16 @@ def __init__( self.attrs = OrderedDict() @property - def data(self) -> Optional[NDArray]: + def data(self) -> Optional[Tensor]: """Returns the concrete value of the parameter if it is bound to a concrete value, - otherwise returns None. The returned value is a tvm.runtime.NDArray.""" + otherwise returns None. The returned value is a tvm.runtime.Tensor.""" return self._data @data.setter - def data(self, data: Union[None, NDArray, np.ndarray, "torch.Tensor"]) -> None: + def data(self, data: Union[None, tvm.runtime.Tensor, np.ndarray, "torch.Tensor"]) -> None: """Set the concrete value of the parameter. The data should be one of the following: - None: unbind the parameter to concrete values - - tvm.runtime.NDArray + - tvm.runtime.Tensor - numpy.ndarray - torch.Tensor and any other DLPack-compliant tensors """ @@ -268,10 +268,10 @@ def data(self, data: Union[None, NDArray, np.ndarray, "torch.Tensor"]) -> None: self._data = data return # Try to do zero-copy if possible - if isinstance(data, NDArray): + if isinstance(data, tvm.runtime.Tensor): pass elif isinstance(data, np.ndarray): - data = ndarray.array(data) + data = tvm.runtime.tensor(data) elif hasattr(data, "__dlpack__"): data = _from_dlpack(data) else: @@ -526,7 +526,7 @@ def _compile(spec, device, pipeline, debug): ), device, ) - params = _param_to_ndarray(params, device) + params = _param_to_tensor(params, device) return spec, vm, params device = as_device(device) @@ -628,15 +628,15 @@ def _attribute_finder(root: Module, prefix: str, condition_yield: Callable[[Any] ) -def _from_dlpack(tensor) -> NDArray: +def _from_dlpack(tensor) -> tvm.runtime.Tensor: try: - return ndarray.from_dlpack(tensor) + return tvm.runtime.from_dlpack(tensor) except RuntimeError: pass # special logic for PyTorch device_type = tensor.device.type device_id = tensor.device.index or 0 - return ndarray.array( + return tvm.runtime.tensor( tensor.numpy(), device=Device( Device.DEVICE_NAME_TO_TYPE[device_type], @@ -645,7 +645,9 @@ def _from_dlpack(tensor) -> NDArray: ) -def _param_to_ndarray(params: List[Tuple[str, Parameter]], device: Device) -> List[NDArray]: +def _param_to_tensor( + params: List[Tuple[str, Parameter]], device: Device +) -> List[tvm.runtime.Tensor]: results = [] missing = [] for name, param in params: diff --git a/python/tvm/relax/frontend/nn/modules.py b/python/tvm/relax/frontend/nn/modules.py index b61656a2e6bd..5ca5f72787b7 100644 --- a/python/tvm/relax/frontend/nn/modules.py +++ b/python/tvm/relax/frontend/nn/modules.py 
@@ -27,7 +27,7 @@ class IOEffect(Effect): """ - Modeling IO side effect, for example, printing the content of NDArrays on screen, inserting + Modeling IO side effect, for example, printing the content of Tensors on screen, inserting debug breakpoints, etc. """ diff --git a/python/tvm/relax/frontend/nn/torch.py b/python/tvm/relax/frontend/nn/torch.py index ae98868dae09..183cb11731e3 100644 --- a/python/tvm/relax/frontend/nn/torch.py +++ b/python/tvm/relax/frontend/nn/torch.py @@ -21,7 +21,7 @@ import torch from tvm.ir import Array -from tvm.runtime import NDArray, ShapeTuple, ndarray +from tvm.runtime import Tensor, ShapeTuple, _tensor from tvm.runtime.vm import VirtualMachine from . import core @@ -34,14 +34,14 @@ class TorchModule: # pylint: disable=too-few-public-methods spec: _spec.ModuleSpec vm: VirtualMachine # pylint: disable=invalid-name - params: List[NDArray] + params: List[Tensor] effects: List[Any] def __init__( # pylint: disable=invalid-name self, spec: _spec.ModuleSpec, vm: VirtualMachine, - params: List[NDArray], + params: List[Tensor], ): try: self.effects = vm["_initialize_effect"]() @@ -87,7 +87,7 @@ def _closure(*args): def _tvm_to_torch(arg): if isinstance(arg, (list, tuple, Array)): return [_tvm_to_torch(i) for i in arg] - if isinstance(arg, ndarray.NDArray): + if isinstance(arg, _tensor.Tensor): return torch.utils.dlpack.from_dlpack(arg) if isinstance(arg, ShapeTuple): return list(arg) diff --git a/python/tvm/relax/frontend/onnx/onnx_frontend.py b/python/tvm/relax/frontend/onnx/onnx_frontend.py index 05e4534acae3..5470c911d30b 100644 --- a/python/tvm/relax/frontend/onnx/onnx_frontend.py +++ b/python/tvm/relax/frontend/onnx/onnx_frontend.py @@ -3830,9 +3830,9 @@ def _parse_value_proto(self, value_proto: onnx.onnx_ml_pb2.GraphProto): name = value_proto return name - def _parse_array(self, tensor_proto: onnx.onnx_ml_pb2.TensorProto) -> tvm.nd.array: + def _parse_array(self, tensor_proto: onnx.onnx_ml_pb2.TensorProto) -> tvm.runtime.tensor: np_array = get_numpy(tensor_proto).reshape(tuple(tensor_proto.dims)) - return tvm.nd.array(np_array) + return tvm.runtime.tensor(np_array) def _parse_attr(self, attr_proto: onnx.onnx_ml_pb2.AttributeProto) -> Dict[str, Any]: """Convert a list of AttributeProto to a dict, with names as keys.""" diff --git a/python/tvm/relax/frontend/torch/dynamo.py b/python/tvm/relax/frontend/torch/dynamo.py index c10019454015..8837d9683511 100644 --- a/python/tvm/relax/frontend/torch/dynamo.py +++ b/python/tvm/relax/frontend/torch/dynamo.py @@ -55,8 +55,8 @@ def _relax_backend(graph_module, example_inputs): assert isinstance(graph_module, torch.fx.GraphModule) def to_torch_tensor(nd_tensor): - """A helper function to transfer a NDArray to torch.tensor.""" - if isinstance(nd_tensor, tvm.nd.NDArray): + """A helper function to transfer a Tensor to torch.tensor.""" + if isinstance(nd_tensor, tvm.runtime.Tensor): return torch.from_numpy(nd_tensor.numpy()) elif isinstance(nd_tensor, tvm.ir.Array): return tuple(to_torch_tensor(x) for x in nd_tensor) @@ -64,12 +64,12 @@ def to_torch_tensor(nd_tensor): raise ValueError(f"Unsupported type {type(nd_tensor)}") def to_tvm_tensor(torch_tensor): - """A helper function to transfer a torch.tensor to NDArray.""" + """A helper function to transfer a torch.tensor to Tensor.""" if not isinstance(torch_tensor, torch._subclasses.fake_tensor.FakeTensor): - return tvm.nd.array(torch_tensor.numpy()) + return tvm.runtime.tensor(torch_tensor.numpy()) # Fake Tensor real_tensor = torch.randn(torch_tensor.shape, 
dtype=torch_tensor.dtype) - return tvm.nd.array(real_tensor.numpy()) + return tvm.runtime.tensor(real_tensor.numpy()) graph_module.graph.eliminate_dead_code() diff --git a/python/tvm/relax/frontend/torch/exported_program_translator.py b/python/tvm/relax/frontend/torch/exported_program_translator.py index 1a53a0cbdc72..b489f3e79496 100644 --- a/python/tvm/relax/frontend/torch/exported_program_translator.py +++ b/python/tvm/relax/frontend/torch/exported_program_translator.py @@ -715,14 +715,14 @@ def from_exported_program( if tensor_name == spec.target: bind_name = spec.arg.name break - binding[bind_name] = tvm.nd.from_dlpack(tensor_value.detach()) + binding[bind_name] = tvm.runtime.from_dlpack(tensor_value.detach()) mod = self.block_builder.get() mod = relax.transform.BindParams("main", binding)(mod) if keep_params_as_input: parameters = dict(exported_program.named_parameters()) - params = [tvm.nd.from_dlpack(p.detach()) for p in parameters.values()] + params = [tvm.runtime.from_dlpack(p.detach()) for p in parameters.values()] mod["main"] = mod["main"].with_attr("params", params) return mod diff --git a/python/tvm/relax/frontend/torch/fx_translator.py b/python/tvm/relax/frontend/torch/fx_translator.py index 754129ffdeb8..0d2e240be641 100644 --- a/python/tvm/relax/frontend/torch/fx_translator.py +++ b/python/tvm/relax/frontend/torch/fx_translator.py @@ -1042,7 +1042,7 @@ def from_fx( dtype = self._convert_data_type(str(param.data.dtype)) inputs.append(relax.Var(name, relax.TensorStructInfo(shape, dtype))) self.params[param] = inputs[-1] - params.append(tvm.nd.array(param.data.cpu().numpy())) + params.append(tvm.runtime.tensor(param.data.cpu().numpy())) else: func_attrs = None diff --git a/python/tvm/relax/op/base.py b/python/tvm/relax/op/base.py index b0570344e5a0..4663e47020e0 100644 --- a/python/tvm/relax/op/base.py +++ b/python/tvm/relax/op/base.py @@ -414,7 +414,7 @@ def render_object(val: tvm.Object) -> str: ret: str A string representing the value, ideally human-readable """ - if isinstance(val, tvm.nd.NDArray): + if isinstance(val, tvm.runtime.Tensor): return str(val) if isinstance(val, tvm.ir.Array): fields = ", ".join([render_object(val[i]) for i in range(len(val))]) @@ -423,16 +423,16 @@ def render_object(val: tvm.Object) -> str: @tvm.register_func("relax.run.shape_to_tensor") -def relax_shape_to_tensor(shape_tuple: tvm.runtime.ShapeTuple) -> tvm.nd.NDArray: +def relax_shape_to_tensor(shape_tuple: tvm.runtime.ShapeTuple) -> tvm.runtime.Tensor: """ - Takes a ShapeTuple and convert it to NDArray. + Takes a ShapeTuple and convert it to Tensor. 
Parameters ---------- shape_tuple: tvm.runtime.ShapeTuple - Shape tuple that we want to convert to NDArray at runtime + Shape tuple that we want to convert to Tensor at runtime """ - return tvm.nd.array([int(v) for v in shape_tuple]) + return tvm.runtime.tensor([int(v) for v in shape_tuple]) @tvm.register_func("relax.run.print") @@ -514,7 +514,7 @@ def relax_assert_op(condition: tvm.Object, format_str: str, *format_args: tvm.Ob if isinstance(condition, (bool, int)): val = condition - elif isinstance(condition, tvm.nd.NDArray): + elif isinstance(condition, tvm.runtime.Tensor): # may happen if the original program had unknown shape or dtype for the tensor's type dtype = condition.dtype if dtype != "bool": @@ -528,7 +528,7 @@ def relax_assert_op(condition: tvm.Object, format_str: str, *format_args: tvm.Ob else: # should be guaranteed by the type system raise ValueError( - f"The condition for relax assert must be a bool, int, or NDArray, " + f"The condition for relax assert must be a bool, int, or Tensor, " f"but received a {type(condition)}." ) diff --git a/python/tvm/relax/op/memory/view.py b/python/tvm/relax/op/memory/view.py index 95adc782092f..a7f6f91e182a 100644 --- a/python/tvm/relax/op/memory/view.py +++ b/python/tvm/relax/op/memory/view.py @@ -70,7 +70,7 @@ def view( relative_byte_offset: Optional[Expr] - The offset of the output NDArray, relative to the byte offset + The offset of the output Tensor, relative to the byte offset of `data`. If `None`, the offset of the view is the same as the offset of `data`. diff --git a/python/tvm/relax/op/set.py b/python/tvm/relax/op/set.py index ed4b2e2ff928..4d0fd3dd420f 100644 --- a/python/tvm/relax/op/set.py +++ b/python/tvm/relax/op/set.py @@ -86,13 +86,13 @@ def unique( @tvm.register_func("relax.run.unique") def numpy_unique( - x: tvm.nd.array, + x: tvm.runtime.tensor, sorted: int, return_index: int, return_inverse: int, return_counts: int, axis: Optional[int] = None, -) -> tvm.nd.array: +) -> tvm.runtime.tensor: """Returns the unique elements of the input tensor. Uses numpy.unique to compute unique elements. 
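In the same spirit as the `relax.run.unique` and `relax.run.nonzero` helpers, a numpy-backed packed function registered under a made-up name would now take and return `tvm.runtime.Tensor` values; this sketch is illustrative and not part of the change set:

import numpy as np
import tvm

@tvm.register_func("example.run.negate")  # hypothetical registration name
def numpy_negate(x: tvm.runtime.Tensor) -> tvm.runtime.Tensor:
    # Round-trip through numpy, then wrap the result back into a TVM tensor.
    return tvm.runtime.tensor(-x.numpy())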
@@ -107,9 +107,9 @@ def numpy_unique( output_sorted_numpy, indices = np.unique(x_numpy, return_index=True, axis=axis) if sorted: - return tvm.nd.array(output_sorted_numpy) + return tvm.runtime.tensor(output_sorted_numpy) output_numpy = np.take(x_numpy, builtins.sorted(indices), axis=axis) - return tvm.nd.array(output_numpy) + return tvm.runtime.tensor(output_numpy) def nonzero(x: Expr) -> Expr: @@ -144,6 +144,6 @@ def nonzero(x: Expr) -> Expr: @tvm.register_func("relax.run.nonzero") -def numpy_nonzero(x: tvm.nd.array) -> tvm.nd.array: +def numpy_nonzero(x: tvm.runtime.tensor) -> tvm.runtime.tensor: np_result = np.atleast_1d(x.numpy()).nonzero() - return tvm.nd.array(np.stack(np_result, axis=0)) + return tvm.runtime.tensor(np.stack(np_result, axis=0)) diff --git a/python/tvm/relax/pipeline.py b/python/tvm/relax/pipeline.py index 37ef6156e4e7..a5850267a8c4 100644 --- a/python/tvm/relax/pipeline.py +++ b/python/tvm/relax/pipeline.py @@ -151,7 +151,7 @@ def static_shape_tuning_pipeline( # the name should be f"{func_name}_transform_params" params = vm["main_transform_params"](params["main"]) - input_data = tvm.nd.array(np.random.randn(1, 3, 224, 224).astype("float32")) + input_data = tvm.runtime.tensor(np.random.randn(1, 3, 224, 224).astype("float32")) out = vm["main"](input_data, *params).numpy() """ diff --git a/python/tvm/relax/testing/lib_comparator.py b/python/tvm/relax/testing/lib_comparator.py index b15698c8db74..48930f062357 100644 --- a/python/tvm/relax/testing/lib_comparator.py +++ b/python/tvm/relax/testing/lib_comparator.py @@ -63,8 +63,8 @@ def __init__(self, mod, device, verbose=True, rtol=1e-5, atol=1e-5): def compare( self, name: str, - ref_args: Union[List[tvm.nd.NDArray], Tuple[tvm.nd.NDArray, ...]], - new_args: Union[List[tvm.nd.NDArray], Tuple[tvm.nd.NDArray, ...]], + ref_args: Union[List[tvm.runtime.Tensor], Tuple[tvm.runtime.Tensor, ...]], + new_args: Union[List[tvm.runtime.Tensor], Tuple[tvm.runtime.Tensor, ...]], ret_indices: Iterable[int], ): """Comparison function, can be overloaded. 
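The allocate-and-copy pattern that the library comparator relies on can be sketched as follows, under the assumption that `tvm.runtime.empty` and `tvm.runtime.tensor` are the renamed equivalents of the old `tvm.nd` helpers:

import numpy as np
import tvm

host = tvm.runtime.tensor(np.arange(12, dtype="float32").reshape(3, 4))
scratch = tvm.runtime.empty(host.shape, host.dtype, device=tvm.cpu(0))  # uninitialized buffer
host.copyto(scratch)                     # fill the buffer from the source tensor
roundtrip = scratch.copyto(tvm.cpu(0))   # copy out again, e.g. for comparison
np.testing.assert_allclose(roundtrip.numpy(), host.numpy())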
@@ -103,7 +103,7 @@ def __call__(self, func, name, before_run, ret_val, *args): return if name.startswith("vm.builtin."): return - if any(not isinstance(x, tvm.nd.NDArray) for x in args): + if any(not isinstance(x, tvm.runtime.Tensor) for x in args): return try: self.mod.get_function(name, query_imports=True) @@ -120,7 +120,7 @@ def __call__(self, func, name, before_run, ret_val, *args): ret_indices = (len(args) - 1,) temp_args = [] for i, arg in enumerate(args): - arr = tvm.nd.empty(arg.shape, arg.dtype, device=self.device) + arr = tvm.runtime.empty(arg.shape, arg.dtype, device=self.device) # copy from cpu since we look at different device if i not in ret_indices: temp_cpu = arg.copyto(tvm.cpu()) diff --git a/python/tvm/relax/testing/nn.py b/python/tvm/relax/testing/nn.py index 6e7e3d4d197b..fb3564c6f1a1 100644 --- a/python/tvm/relax/testing/nn.py +++ b/python/tvm/relax/testing/nn.py @@ -281,7 +281,7 @@ def _unpack_params(value: object) -> List[relax.Var]: return [] -def init_params(mod: tvm.IRModule) -> List[tvm.nd.array]: +def init_params(mod: tvm.IRModule) -> List[tvm.runtime.Tensor]: """Utility function to initialize model's parameters.""" shape_dict = {v.name_hint: v.struct_info.shape for v in mod["main"].params} params = [] @@ -295,7 +295,7 @@ def init_params(mod: tvm.IRModule) -> List[tvm.nd.array]: shape.append(int(i)) else: raise TypeError("cannot initialize for unknown-shape parameters.") - params.append(tvm.nd.array(np.zeros(shape).astype(np.float32))) + params.append(tvm.runtime.tensor(np.zeros(shape).astype(np.float32))) else: raise TypeError("cannot initialize for unknown-shape parameters.") return params diff --git a/python/tvm/relax/testing/vm.py b/python/tvm/relax/testing/vm.py index 37bcf870a5df..737de13fc7f6 100644 --- a/python/tvm/relax/testing/vm.py +++ b/python/tvm/relax/testing/vm.py @@ -32,35 +32,35 @@ def move(src): @tvm.register_func("test.vm.add") def add(a, b): ret = a.numpy() + b.numpy() - return tvm.nd.array(ret) + return tvm.runtime.tensor(ret) @tvm.register_func("test.vm.mul") def mul(a, b): ret = a.numpy() * b.numpy() - return tvm.nd.array(ret) + return tvm.runtime.tensor(ret) @tvm.register_func("test.vm.equal_zero") def equal_zero(a): ret = np.all((a.numpy() == 0)) - return tvm.nd.array(ret) + return tvm.runtime.tensor(ret) @tvm.register_func("test.vm.subtract_one") def subtract_one(a): ret = np.subtract(a.numpy(), 1) - return tvm.nd.array(ret) + return tvm.runtime.tensor(ret) @tvm.register_func("test.vm.identity") def identity_packed(a, b): - b[:] = tvm.nd.array(a.numpy()) + b[:] = tvm.runtime.tensor(a.numpy()) @tvm.register_func("test.vm.tile") def tile_packed(a, b): - b[:] = tvm.nd.array(np.tile(a.numpy(), (1, 2))) + b[:] = tvm.runtime.tensor(np.tile(a.numpy(), (1, 2))) @tvm.register_func("test.vm.add_scalar") diff --git a/python/tvm/relax/training/optimizer.py b/python/tvm/relax/training/optimizer.py index d6f503de0564..16a215f87dc3 100644 --- a/python/tvm/relax/training/optimizer.py +++ b/python/tvm/relax/training/optimizer.py @@ -291,7 +291,7 @@ def init(self, params: Union[Var, List[Var]]) -> "SGD": self._set_params_and_dtype(params) self.state = ( # num_steps = 0 - tvm.nd.array(np.zeros((), "int64")), + tvm.runtime.tensor(np.zeros((), "int64")), ) return self @@ -433,10 +433,10 @@ def init(self, params: Union[Var, List[Var]]) -> "MomentumSGD": self._set_params_and_dtype(params) self.state = ( # num_steps = 0 - tvm.nd.array(np.zeros((), "int64")), + tvm.runtime.tensor(np.zeros((), "int64")), # v_{param} is initialized to all zeros *( - 
tvm.nd.array(np.zeros(_get_shape_as_int_list(p), p.struct_info.dtype)) + tvm.runtime.tensor(np.zeros(_get_shape_as_int_list(p), p.struct_info.dtype)) for p in self.param_list ), ) @@ -604,17 +604,17 @@ def init(self, params: Union[Var, List[Var]]) -> "Adam": self._set_params_and_dtype(params) self.state = ( # num_steps, beta_0_prod, beta_1_prod - tvm.nd.array(np.zeros((), "int64")), - tvm.nd.array(np.ones((), self.dtype)), - tvm.nd.array(np.ones((), self.dtype)), + tvm.runtime.tensor(np.zeros((), "int64")), + tvm.runtime.tensor(np.ones((), self.dtype)), + tvm.runtime.tensor(np.ones((), self.dtype)), # first_momentum *( - tvm.nd.array(np.zeros(_get_shape_as_int_list(p), p.struct_info.dtype)) + tvm.runtime.tensor(np.zeros(_get_shape_as_int_list(p), p.struct_info.dtype)) for p in self.param_list ), # second_momentum *( - tvm.nd.array(np.zeros(_get_shape_as_int_list(p), p.struct_info.dtype)) + tvm.runtime.tensor(np.zeros(_get_shape_as_int_list(p), p.struct_info.dtype)) for p in self.param_list ), ) diff --git a/python/tvm/relax/training/trainer.py b/python/tvm/relax/training/trainer.py index fbf48fece9f6..aaaa14dd2812 100644 --- a/python/tvm/relax/training/trainer.py +++ b/python/tvm/relax/training/trainer.py @@ -22,7 +22,7 @@ import tvm from tvm import relax, TVMError from tvm.ir.module import IRModule -from tvm.runtime.ndarray import NDArray +from tvm.runtime._tensor import Tensor class Trainer: @@ -100,12 +100,12 @@ def __init__( ) ] - self._params: List[Optional[NDArray]] = [None] * self._param_num + self._params: List[Optional[Tensor]] = [None] * self._param_num self._param_name_to_pos: Dict[str, int] = { p.name_hint: i for i, p in enumerate(self._param_vars) } - self._states: List[Optional[NDArray]] = [None] * self._state_num + self._states: List[Optional[Tensor]] = [None] * self._state_num self._state_name_to_pos: Dict[str, int] = { s.name_hint: i for i, s in enumerate(self._state_vars) } @@ -129,7 +129,7 @@ def xaiver_uniform_init_params(self): for p in self._param_vars: shape, dtype = self._get_shape_list(p), p.struct_info.dtype self._params.append( - tvm.nd.array( + tvm.runtime.tensor( (np.sqrt(6.0 / np.sum(shape)) * np.random.uniform(-1.0, 1.0, shape)).astype( dtype ), @@ -140,27 +140,27 @@ def xaiver_uniform_init_params(self): def zero_init_params(self): """Zero initialize all parameters. Requires all parameters have static shapes.""" self._params = [ - tvm.nd.array(np.zeros(self._get_shape_list(p), p.struct_info.dtype), self.device) + tvm.runtime.tensor(np.zeros(self._get_shape_list(p), p.struct_info.dtype), self.device) for p in self._param_vars ] def zero_init_states(self): """Zero initialize all states. Requires all states have static shapes.""" self._states = [ - tvm.nd.array(np.zeros(self._get_shape_list(s), s.struct_info.dtype), self.device) + tvm.runtime.tensor(np.zeros(self._get_shape_list(s), s.struct_info.dtype), self.device) for s in self._state_vars ] def load_params( self, - params: Union[List[Union[np.ndarray, NDArray]], Dict[str, Union[np.ndarray, NDArray]]], + params: Union[List[Union[np.ndarray, Tensor]], Dict[str, Union[np.ndarray, Tensor]]], ): - """Load parameters from a dict or a list. Will convert parameters into tvm.runtime.NDArray + """Load parameters from a dict or a list. Will convert parameters into tvm.runtime.Tensor in self.device. Parameters ---------- - params : List[Union[np.ndarray, NDArray]], Dict[str, Union[np.ndarray, NDArray]] + params : List[Union[np.ndarray, Tensor]], Dict[str, Union[np.ndarray, Tensor]] The numerical value of the parameters. 
If params is a list, its length should be param_num. The value of parameters at the @@ -176,25 +176,25 @@ def load_params( f"The length of extern parameters is {len(params)}, which does not " f"match the number of parameters {self._param_num}" ) - self._params = [tvm.nd.array(v, self.device) for v in params] + self._params = [tvm.runtime.tensor(v, self.device) for v in params] elif isinstance(params, dict): for key, val in params.items(): if key not in self._param_name_to_pos: raise ValueError(f"Parameter {key} is not found in the model") - self._params[self._param_name_to_pos[key]] = tvm.nd.array(val, self.device) + self._params[self._param_name_to_pos[key]] = tvm.runtime.tensor(val, self.device) else: raise ValueError("The type of extern_params should be either list or dict") def load_states( self, - states: Union[List[Union[np.ndarray, NDArray]], Dict[str, Union[np.ndarray, NDArray]]], + states: Union[List[Union[np.ndarray, Tensor]], Dict[str, Union[np.ndarray, Tensor]]], ): - """Load model states from a dict or a list. Will convert states into tvm.runtime.NDArray + """Load model states from a dict or a list. Will convert states into tvm.runtime.Tensor in self.device. Parameters ---------- - states : List[Union[np.ndarray, NDArray]], Dict[str, Union[np.ndarray, NDArray]] + states : List[Union[np.ndarray, Tensor]], Dict[str, Union[np.ndarray, Tensor]] The numerical value of the model states. If states is a list, its length should be state_num. The value of states at the @@ -210,31 +210,31 @@ def load_states( f"The length of extern states is {len(states)}, which does not match " f"the number of model states {self._state_num}" ) - self._states = [tvm.nd.array(v, self.device) for v in states] + self._states = [tvm.runtime.tensor(v, self.device) for v in states] elif isinstance(states, dict): for key, val in states.items(): if key not in self._param_name_to_pos: raise ValueError(f"Parameter {key} is not found in the model") - self._states[self._param_name_to_pos[key]] = tvm.nd.array(val, self.device) + self._states[self._param_name_to_pos[key]] = tvm.runtime.tensor(val, self.device) else: raise ValueError("The type of extern_states should be either list or dict") - def export_params(self) -> Dict[str, NDArray]: - """Export parameters to a dict (parameter name -> NDArray). + def export_params(self) -> Dict[str, Tensor]: + """Export parameters to a dict (parameter name -> Tensor). Returns ------- - exported_dict : Dict[str, NDArray] + exported_dict : Dict[str, Tensor] The exported dictionary of parameters. """ return {key: self._params[pos] for key, pos in self._param_name_to_pos.items()} - def export_states(self) -> Dict[str, NDArray]: - """Export model states to a dict (parameter name -> NDArray). + def export_states(self) -> Dict[str, Tensor]: + """Export model states to a dict (parameter name -> Tensor). Returns ------- - exported_dict : Dict[str, NDArray] + exported_dict : Dict[str, Tensor] The exported dictionary of model states. """ return {key: self._states[pos] for key, pos in self._state_name_to_pos.items()} @@ -255,26 +255,28 @@ def _check_inited(self): "inference." ) - def predict(self, *input_instances: Union[np.ndarray, NDArray]) -> NDArray: + def predict(self, *input_instances: Union[np.ndarray, Tensor]) -> Tensor: """Call the `backbone` function and return the prediction result of the backbone. 
Parameters ---------- - *input_instances : Union[np.ndarray, NDArray] + *input_instances : Union[np.ndarray, Tensor] The values corresponding to the input_instances part of the backbone function. Parameters and model states are not needed to provide. Returns ------- - output : NDArray + output : Tensor The result of the backbone function. If the backbone contains model states, the updated states WILL NOT be returned. """ self._check_inited() if len(input_instances) != self._input_num: raise ValueError("The length of the input does not match the backbone") - all_inputs: List[NDArray] = ( - [tvm.nd.array(i, self.device) for i in input_instances] + self._params + self._states + all_inputs: List[Tensor] = ( + [tvm.runtime.tensor(i, self.device) for i in input_instances] + + self._params + + self._states ) res = self.vm[self.BACKBONE_FUNC](*all_inputs) @@ -287,9 +289,9 @@ def predict(self, *input_instances: Union[np.ndarray, NDArray]) -> NDArray: def update( self, - input_instances: Union[np.ndarray, NDArray, List[Union[np.ndarray, NDArray]]], - targets: Union[np.ndarray, NDArray, List[Union[np.ndarray, NDArray]]], - ) -> NDArray: + input_instances: Union[np.ndarray, Tensor, List[Union[np.ndarray, Tensor]]], + targets: Union[np.ndarray, Tensor, List[Union[np.ndarray, Tensor]]], + ) -> Tensor: """Update parameters and model states. It will calculate the gradients of parameters and update them using the `optimizer` function. @@ -298,21 +300,21 @@ def update( Parameters ---------- - input_instances : Union[np.ndarray, NDArray, List[Union[np.ndarray, NDArray]]] + input_instances : Union[np.ndarray, Tensor, List[Union[np.ndarray, Tensor]]] The values corresponding to the input_instances part of the backbone function. Parameters and model states are not needed to provide. If there are more than one input instances, you can provide a list. - targets : Union[np.ndarray, NDArray, List[Union[np.ndarray, NDArray]]] + targets : Union[np.ndarray, Tensor, List[Union[np.ndarray, Tensor]]] The values corresponding to the targets part of the backbone function. If there are more than one targets, you can provide a list. Returns ------- - loss : NDArray - The loss stored in tvm.runtime.NDArray. + loss : Tensor + The loss stored in tvm.runtime.Tensor. """ self._check_inited() @@ -325,11 +327,11 @@ def update( if len(input_instances) != self._input_num: raise ValueError("The length of the input does not match the backbone") - all_inputs: List[NDArray] = ( - [tvm.nd.array(i, self.device) for i in input_instances] + all_inputs: List[Tensor] = ( + [tvm.runtime.tensor(i, self.device) for i in input_instances] + self._params + self._states - + [tvm.nd.array(i, self.device) for i in targets] + + [tvm.runtime.tensor(i, self.device) for i in targets] ) ret, grads = self.vm[self.ADJOINT_FUNC](*all_inputs) @@ -348,21 +350,21 @@ def update( def profile_adjoint( self, - input_instances: List[Union[np.ndarray, NDArray]], - targets: List[Union[np.ndarray, NDArray]], + input_instances: List[Union[np.ndarray, Tensor]], + targets: List[Union[np.ndarray, Tensor]], ) -> tvm.runtime.profiling.Report: """Profile the adjoint function. It requires the VM to be constructed with `profile=True`, and runs `tvm.relax.VirtualMachine.profile()` internally. Parameters ---------- - input_instances : Union[np.ndarray, NDArray, List[Union[np.ndarray, NDArray]]] + input_instances : Union[np.ndarray, Tensor, List[Union[np.ndarray, Tensor]]] The values corresponding to the input_instances part of the backbone function. 
Parameters and model states are not needed to provide. If there are more than one input instances, you can provide a list. - targets : Union[np.ndarray, NDArray, List[Union[np.ndarray, NDArray]]] + targets : Union[np.ndarray, Tensor, List[Union[np.ndarray, Tensor]]] The values corresponding to the targets part of the backbone function. If there are more than one targets, you can provide a list. @@ -383,11 +385,11 @@ def profile_adjoint( if len(input_instances) != self._input_num: raise ValueError("The length of the input does not match the backbone") - all_inputs: List[NDArray] = ( - [tvm.nd.array(i) for i in input_instances] + all_inputs: List[Tensor] = ( + [tvm.runtime.tensor(i) for i in input_instances] + self._params + self._states - + [tvm.nd.array(i) for i in targets] + + [tvm.runtime.tensor(i) for i in targets] ) all_inputs = [i.copyto(self.device) for i in all_inputs] return self.vm.profile(self.ADJOINT_FUNC, *all_inputs) diff --git a/python/tvm/relax/transform/transform.py b/python/tvm/relax/transform/transform.py index bf813b3dd612..c945732a6dfc 100644 --- a/python/tvm/relax/transform/transform.py +++ b/python/tvm/relax/transform/transform.py @@ -28,7 +28,7 @@ from tvm.ir.container import Array from tvm.relax import Expr, Var, StructInfo from tvm.relax.dpl import DFPattern -from tvm.runtime import NDArray, Object +from tvm.runtime import Tensor, Object from tvm.tir import IndexMap, PrimFunc from . import _ffi_api @@ -638,7 +638,7 @@ def AttachGlobalSymbol() -> tvm.ir.transform.Pass: def BindParams( func_name: str, - params: Dict[Union[str, Var], Union[tvm.runtime.NDArray, np.ndarray]], + params: Dict[Union[str, Var], Union[tvm.runtime.Tensor, np.ndarray]], ) -> tvm.ir.transform.Pass: """Bind params of function of the module to constant tensors. @@ -647,7 +647,7 @@ def BindParams( func_name: str The function name to be bound - params: Dict[Union[str,relax.Var], Union[tvm.runtime.NDArray, np.ndarray]] + params: Dict[Union[str,relax.Var], Union[tvm.runtime.Tensor, np.ndarray]] The map from parameter or parameter name to constant tensors. Returns @@ -657,9 +657,9 @@ def BindParams( tvm_params = {} for k, v in params.items(): if isinstance(v, np.ndarray): - v = tvm.nd.array(v) - assert isinstance(v, (tvm.runtime.NDArray, tvm.relax.Constant)), ( - f"param values are expected to be TVM.NDArray," + v = tvm.runtime.tensor(v) + assert isinstance(v, (tvm.runtime.Tensor, tvm.relax.Constant)), ( + f"param values are expected to be TVM.Tensor," f"numpy.ndarray or tvm.relax.Constant, but got {type(v)}" ) tvm_params[k] = v @@ -1223,7 +1223,7 @@ def MetaScheduleTuneTIR( def MetaScheduleTuneIRMod( - params: Dict[str, NDArray], + params: Dict[str, Tensor], work_dir: str, max_trials_global: int, max_trials_per_task: Optional[int] = None, @@ -1233,7 +1233,7 @@ def MetaScheduleTuneIRMod( Parameters ---------- - params: Dict[str, NDArray] + params: Dict[str, Tensor] model params work_dir: str work directory diff --git a/python/tvm/rpc/client.py b/python/tvm/rpc/client.py index 90267c05263a..37bc6b311745 100644 --- a/python/tvm/rpc/client.py +++ b/python/tvm/rpc/client.py @@ -23,9 +23,9 @@ import time import tvm_ffi +import tvm.runtime from tvm.base import TVMError from tvm.contrib import utils -from tvm.runtime import ndarray as nd from tvm.runtime import Device from . import _ffi_api, base, server @@ -86,9 +86,9 @@ def device(self, dev_type, dev_id=0): dev: Device The corresponding encoded remote device. 
""" - dev = nd.device(dev_type, dev_id) + dev = tvm.runtime.device(dev_type, dev_id) encode = (self._tbl_index + 1) * base.RPC_SESS_MASK - dev = nd.device(dev.device_type + encode, dev.device_id) + dev = tvm.runtime.device(dev.device_type + encode, dev.device_id) dev._rpc_sess = self return dev diff --git a/python/tvm/rpc/testing.py b/python/tvm/rpc/testing.py index ba88c2048443..d27485413814 100644 --- a/python/tvm/rpc/testing.py +++ b/python/tvm/rpc/testing.py @@ -42,8 +42,8 @@ def _strcat(x, y): return x + y -@tvm.register_func("rpc.test.remote_array_func") -def _remote_array_func(y): +@tvm.register_func("rpc.test.remote_tensor_func") +def _remote_tensor_func(y): x = np.ones((3, 4)) np.testing.assert_equal(y.numpy(), x) @@ -56,7 +56,7 @@ def _add_to_lhs(x): @tvm.register_func("rpc.test.remote_return_nd") def _my_module(name): # Use closure to check the ref counter correctness - nd = tvm.nd.array(np.zeros(10).astype("float32")) + nd = tvm.runtime.tensor(np.zeros(10).astype("float32")) if name == "get_arr": return lambda: nd diff --git a/python/tvm/runtime/__init__.py b/python/tvm/runtime/__init__.py index 5b7dea83679e..57546dcff48b 100644 --- a/python/tvm/runtime/__init__.py +++ b/python/tvm/runtime/__init__.py @@ -24,14 +24,14 @@ from .script_printer import Scriptable from .object_generic import ObjectGeneric from .device import Device -from .ndarray import NDArray +from ._tensor import Tensor, tensor, empty from .module import Module from .profiling import Report from .executable import Executable # function exposures -from .ndarray import device, cpu, cuda, opencl, vulkan, metal -from .ndarray import vpi, rocm, ext_dev +from ._tensor import device, cpu, cuda, opencl, vulkan, metal +from ._tensor import vpi, rocm, ext_dev, from_dlpack from .module import load_module, enabled, system_lib, load_static_library, num_threads from .container import String, ShapeTuple from .object_generic import const diff --git a/python/tvm/runtime/ndarray.py b/python/tvm/runtime/_tensor.py similarity index 90% rename from python/tvm/runtime/ndarray.py rename to python/tvm/runtime/_tensor.py index 39ff8fb1bffb..1d413272b2a3 100644 --- a/python/tvm/runtime/ndarray.py +++ b/python/tvm/runtime/_tensor.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=invalid-name, unused-import, redefined-outer-name -"""Runtime NDArray API""" +"""Runtime Tensor API""" import ctypes import warnings from typing import Optional @@ -49,7 +49,7 @@ def from_dlpack(ext_tensor): """ - Convert an external tensor to an NDArray. + Convert an external tensor to an Tensor. Parameters ---------- @@ -69,9 +69,9 @@ def from_dlpack(ext_tensor): ) -@tvm_ffi.register_object("ffi.NDArray") -class NDArray(tvm_ffi.core.NDArray): - """Lightweight NDArray class of TVM runtime. +@tvm_ffi.register_object("ffi.Tensor") +class Tensor(tvm_ffi.core.Tensor): + """Lightweight Tensor class of TVM runtime. Strictly this is only an Array Container (a buffer object) No arthimetic operations are defined. @@ -90,7 +90,7 @@ def __setitem__(self, in_slice, value): or in_slice.stop is not None ): raise ValueError("Array only support set from numpy array") - if isinstance(value, NDArray): + if isinstance(value, Tensor): if not value.same_as(self): value.copyto(self) elif isinstance(value, (np.ndarray, np.generic)): @@ -108,10 +108,10 @@ def copyfrom(self, source_array): Returns ------- - arr : NDArray + arr : Tensor Reference to self. 
""" - if isinstance(source_array, NDArray): + if isinstance(source_array, Tensor): source_array.copyto(self) return self @@ -132,7 +132,7 @@ def copyfrom(self, source_array): if source_array.shape != shape: raise ValueError( - f"array shape do not match the shape of NDArray {source_array.shape} vs {shape}" + f"array shape do not match the shape of Tensor {source_array.shape} vs {shape}" ) numpy_str_map = tvm_ffi.dtype.NUMPY_DTYPE_TO_STR np_dtype_str = ( @@ -159,14 +159,14 @@ def copyfrom(self, source_array): assert source_array.flags["C_CONTIGUOUS"] data = source_array.ctypes.data_as(ctypes.c_void_p) nbytes = source_array.size * source_array.dtype.itemsize - _ffi_api.TVMArrayCopyFromBytes(self, data, nbytes) + _ffi_api.TVMTensorCopyFromBytes(self, data, nbytes) return self def __repr__(self): # exception safety handling for chandle=None if self.__chandle__() == 0: return type(self).__name__ + "(chandle=None)" - res = f"\n" + res = f"\n" res += self.numpy().__repr__() return res @@ -218,7 +218,7 @@ def numpy(self): # TODO(kathy): revisit and get a mirrored function of ffi::GetDataSize # in Python to replace line below nbytes = np_arr.size if dtype == "bool" else (np_arr.size * old_dtype.bits + 7) // 8 - _ffi_api.TVMArrayCopyToBytes(self, data, nbytes) + _ffi_api.TVMTensorCopyToBytes(self, data, nbytes) if old_dtype == "int4" or old_dtype.startswith("float4_e2m1fn"): length = np_arr.size @@ -238,13 +238,13 @@ def copyto(self, target, mem_scope=None): Parameters ---------- - target : NDArray + target : Tensor The target array to be copied, must have same shape as this array. mem_scope : Optional[str] The memory scope of the array. """ - if isinstance(target, NDArray): + if isinstance(target, Tensor): return self._copyto(target) if isinstance(target, tvm_ffi.core.Device): res = empty(self.shape, self.dtype, target, mem_scope) @@ -253,7 +253,7 @@ def copyto(self, target, mem_scope=None): def _copyto(self, target_nd): """Internal function that implements copy to target ndarray.""" - _ffi_api.TVMArrayCopyFromTo(self, target_nd) + _ffi_api.TVMTensorCopyFromTo(self, target_nd) return target_nd def _create_view(self, shape, dtype: Optional[str] = None, relative_byte_offset: int = 0): @@ -301,7 +301,7 @@ def _create_view(self, shape, dtype: Optional[str] = None, relative_byte_offset: if dtype is None: dtype = self.dtype - return _ffi_api.TVMArrayCreateView(self, shape, dtype, relative_byte_offset) + return _ffi_api.TVMTensorCreateView(self, shape, dtype, relative_byte_offset) def empty(shape, dtype="float32", device=None, mem_scope=None): @@ -323,19 +323,19 @@ def empty(shape, dtype="float32", device=None, mem_scope=None): Returns ------- - arr : tvm.nd.NDArray + arr : tvm.runtime.Tensor The array tvm supported. """ device = device or cpu() if not isinstance(shape, tvm.runtime.ShapeTuple): shape = tvm.runtime.ShapeTuple([int(dim) for dim in shape]) dtype = tvm_ffi.dtype(dtype) - arr = _ffi_api.TVMArrayAllocWithScope(shape, dtype, device, mem_scope) + arr = _ffi_api.TVMTensorAllocWithScope(shape, dtype, device, mem_scope) return arr -def array(arr, device=None, mem_scope=None): - """Create an array from source arr. +def tensor(arr, device=None, mem_scope=None): + """Create an tensor from source arr. 
Parameters ---------- @@ -350,15 +350,15 @@ def array(arr, device=None, mem_scope=None): Returns ------- - ret : NDArray + ret : Tensor The created array """ device = device or cpu() - if not isinstance(arr, (np.ndarray, NDArray)): + if not isinstance(arr, (np.ndarray, Tensor)): arr = np.array(arr) return empty(arr.shape, arr.dtype, device, mem_scope).copyfrom(arr) # Register back to FFI -tvm_ffi.core._set_class_ndarray(NDArray) +tvm_ffi.core._set_class_tensor(Tensor) diff --git a/python/tvm/runtime/disco/session.py b/python/tvm/runtime/disco/session.py index 4e4d030a6260..ed4ce06a3766 100644 --- a/python/tvm/runtime/disco/session.py +++ b/python/tvm/runtime/disco/session.py @@ -28,8 +28,8 @@ from tvm_ffi import get_global_func, register_func, register_object from ..device import Device from ..container import ShapeTuple -from ..ndarray import NDArray -from ..ndarray import array as _as_NDArray +from .._tensor import Tensor +from .._tensor import tensor as _as_Tensor from ..object import Object from . import _ffi_api, process_pool # pylint: disable=unused-import @@ -58,20 +58,20 @@ def debug_get_from_remote(self, worker_id: int) -> Any: def debug_copy_from( self, worker_id: int, - value: Union[np.ndarray, NDArray], + value: Union[np.ndarray, Tensor], ) -> None: - """Copy an NDArray value to remote for debugging purposes. + """Copy an Tensor value to remote for debugging purposes. Parameters ---------- worker_id : int The id of the worker to be copied to. - value : Union[numpy.ndarray, NDArray] + value : Union[numpy.ndarray, Tensor] The value to be copied. """ - if not isinstance(value, NDArray): - value = _as_NDArray(value) + if not isinstance(value, Tensor): + value = _as_Tensor(value) return _ffi_api.DRefDebugCopyFrom(self, worker_id, value) # type: ignore # pylint: disable=no-member @@ -122,18 +122,18 @@ def empty( worker0_only: bool = False, in_group: bool = True, ) -> DRef: - """Create an empty NDArray on all workers and attach them to a DRef. + """Create an empty Tensor on all workers and attach them to a DRef. Parameters ---------- shape : tuple of int - The shape of the NDArray. + The shape of the Tensor. dtype : str - The data type of the NDArray. + The data type of the Tensor. device : Optional[Device] = None - The device of the NDArray. + The device of the Tensor. worker0_only: bool If False (default), allocate an array on each worker. If @@ -147,7 +147,7 @@ def empty( Returns ------- array : DRef - The created NDArray. + The created Tensor. """ func = self._get_cached_method("runtime.disco.empty") @@ -217,7 +217,7 @@ def call_packed(self, func: DRef, *args) -> DRef: Notes ----- Examples of unsupported types: - - NDArray, DLTensor,; + - Tensor, DLTensor,; - TVM Objects, including PackedFunc, Module and String. """ return _ffi_api.SessionCallPacked(self, 0, 0, func, *args) # type: ignore # pylint: disable=no-member @@ -246,29 +246,29 @@ def sync_worker_0(self) -> None: executing all the existing instructions.""" return self._sync_worker(0) - def copy_from_worker_0(self, host_array: NDArray, remote_array: DRef) -> None: - """Copy an NDArray from worker-0 to the controller-side NDArray. + def copy_from_worker_0(self, host_array: Tensor, remote_array: DRef) -> None: + """Copy an Tensor from worker-0 to the controller-side Tensor. Parameters ---------- host_array : numpy.ndarray The array to be copied to worker-0. - remote_array : NDArray - The NDArray on worker-0. + remote_array : Tensor + The Tensor on worker-0. 
""" return _ffi_api.SessionCopyFromWorker0(self, host_array, remote_array) # type: ignore # pylint: disable=no-member - def copy_to_worker_0(self, host_array: NDArray, remote_array: Optional[DRef] = None) -> DRef: - """Copy the controller-side NDArray to worker-0. + def copy_to_worker_0(self, host_array: Tensor, remote_array: Optional[DRef] = None) -> DRef: + """Copy the controller-side Tensor to worker-0. Parameters ---------- - host_array : NDArray + host_array : Tensor The array to be copied to worker-0. remote_array : Optiona[DRef] - The destination NDArray on worker-0. + The destination Tensor on worker-0. Returns ------- @@ -329,7 +329,7 @@ def init_ccl(self, ccl: str, *device_ids): def broadcast( self, - src: Union[np.ndarray, NDArray], + src: Union[np.ndarray, Tensor], dst: Optional[DRef] = None, in_group: bool = True, ) -> DRef: @@ -337,7 +337,7 @@ def broadcast( Parameters ---------- - src: Union[np.ndarray, NDArray] + src: Union[np.ndarray, Tensor] The array to be broadcasted. dst: Optional[DRef] @@ -356,8 +356,8 @@ def broadcast( `dst`. Otherwise, it is the newly allocated space. """ - if not isinstance(src, NDArray): - src = _as_NDArray(src) + if not isinstance(src, Tensor): + src = _as_Tensor(src) if dst is None: dst = self.empty(src.shape, src.dtype) @@ -372,7 +372,7 @@ def broadcast_from_worker0(self, src: DRef, dst: DRef, in_group: bool = True) -> Parameters ---------- - src: Union[np.ndarray, NDArray] + src: Union[np.ndarray, Tensor] The array to be broadcasted. dst: Optional[DRef] @@ -387,7 +387,7 @@ def broadcast_from_worker0(self, src: DRef, dst: DRef, in_group: bool = True) -> def scatter( self, - src: Union[np.ndarray, NDArray], + src: Union[np.ndarray, Tensor], dst: Optional[DRef] = None, in_group: bool = True, ) -> DRef: @@ -395,7 +395,7 @@ def scatter( Parameters ---------- - src: Union[np.ndarray, NDArray] + src: Union[np.ndarray, Tensor] The array to be scattered. The first dimension of this array, `src.shape[0]`, must be equal to the number of workers. @@ -419,8 +419,8 @@ def scatter( """ assert src.shape[0] == self.num_workers - if not isinstance(src, NDArray): - src = _as_NDArray(src) + if not isinstance(src, Tensor): + src = _as_Tensor(src) if dst is None: dst = self.empty(src.shape[1:], src.dtype) @@ -435,7 +435,7 @@ def scatter_from_worker0(self, from_array: DRef, to_array: DRef, in_group: bool Parameters ---------- - src: Union[np.ndarray, NDArray] + src: Union[np.ndarray, Tensor] The array to be scattered. The first dimension of this array, `src.shape[0]`, must be equal to the number of workers. diff --git a/python/tvm/runtime/executable.py b/python/tvm/runtime/executable.py index 47c46959be28..a57c1b623183 100644 --- a/python/tvm/runtime/executable.py +++ b/python/tvm/runtime/executable.py @@ -39,7 +39,7 @@ def __getitem__(self, name: str) -> PackedFunc: def __call__(self, *args, **kwargs) -> Any: """Call the executable.""" - return self.jit().entry_func(*args, **kwargs) + return self.jit().main(*args, **kwargs) def jit( self, diff --git a/python/tvm/runtime/params.py b/python/tvm/runtime/params.py index af0b4a26173a..f1ea7bda242d 100644 --- a/python/tvm/runtime/params.py +++ b/python/tvm/runtime/params.py @@ -16,15 +16,15 @@ # under the License. # pylint: disable=invalid-name """Helper utility to save and load parameter dicts.""" -from . import _ffi_api, ndarray, NDArray +from . 
import _ffi_api, tensor, Tensor -def _to_ndarray(params): +def _to_tensor(params): transformed = {} for k, v in params.items(): - if not isinstance(v, NDArray): - transformed[k] = ndarray.array(v) + if not isinstance(v, Tensor): + transformed[k] = tensor(v) else: transformed[k] = v @@ -39,7 +39,7 @@ def save_param_dict(params): Parameters ---------- - params : dict of str to NDArray + params : dict of str to Tensor The parameter dictionary. Returns @@ -59,7 +59,7 @@ def save_param_dict(params): # Pass in byte array to module to directly set parameters tvm.runtime.load_param_dict(param_bytes) """ - return _ffi_api.SaveParams(_to_ndarray(params)) + return _ffi_api.SaveParams(_to_tensor(params)) def save_param_dict_to_file(params, path): @@ -67,13 +67,13 @@ def save_param_dict_to_file(params, path): Parameters ---------- - params : dict of str to NDArray + params : dict of str to Tensor The parameter dictionary. path: str The path to the parameter file. """ - return _ffi_api.SaveParamsToFile(_to_ndarray(params), path) + return _ffi_api.SaveParamsToFile(_to_tensor(params), path) def load_param_dict(param_bytes): @@ -86,7 +86,7 @@ def load_param_dict(param_bytes): Returns ------- - params : dict of str to NDArray + params : dict of str to Tensor The parameter dictionary. """ if isinstance(param_bytes, (bytes, str)): @@ -104,7 +104,7 @@ def load_param_dict_from_file(path): Returns ------- - params : dict of str to NDArray + params : dict of str to Tensor The parameter dictionary. """ return _ffi_api.LoadParamsFromFile(path) diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py index a955835573fd..72fb13378896 100644 --- a/python/tvm/runtime/vm.py +++ b/python/tvm/runtime/vm.py @@ -134,7 +134,7 @@ def invoke_closure(self, closure: Object, *args: Any) -> Object: closure : Object The VMClosure Object. - args : list[tvm.runtime.NDArray] or list[np.ndarray] + args : list[tvm.runtime.Tensor] or list[np.ndarray] The arguments to the closure. Returns @@ -206,9 +206,9 @@ def _gettype(arg): if isinstance(arg, Object): cargs.append(arg) elif isinstance(arg, np.ndarray): - nd_arr = tvm.nd.array(arg, device=tvm.cpu(0)) + nd_arr = tvm.runtime.tensor(arg, device=tvm.cpu(0)) cargs.append(nd_arr) - elif isinstance(arg, tvm.runtime.NDArray): + elif isinstance(arg, tvm.runtime.Tensor): cargs.append(arg) elif isinstance(arg, (tuple, list)): field_args: List[Any] = [] @@ -217,7 +217,7 @@ def _gettype(arg): cargs.append(tuple(field_args)) elif isinstance(arg, (Number, bool)): dtype = _gettype(arg) - value = tvm.nd.array(np.array(arg, dtype=dtype), device=tvm.cpu(0)) + value = tvm.runtime.tensor(np.array(arg, dtype=dtype), device=tvm.cpu(0)) cargs.append(value) elif isinstance(arg, str): cargs.append(arg) @@ -252,7 +252,7 @@ def _convert_func_named_args(self, func_name: str, args: Any, **kwargs: Any) -> def set_input(self, func_name: str, *args: Any, **kwargs: Any) -> None: """Set the inputs to a function. - This interface works when using VM over RPC by internally converting NDArray in + This interface works when using VM over RPC by internally converting Tensor in the arguments to DLTensor, which is supported in RPC where remote could only have a minimal C runtime. @@ -263,9 +263,9 @@ def set_input(self, func_name: str, *args: Any, **kwargs: Any) -> None: ---------- func_name : str The name of the function. - args: List[tvm.runtime.NDArray] or List[np.ndarray] + args: List[tvm.runtime.Tensor] or List[np.ndarray] The arguments to the function. 
- kwargs: dict of str to tvm.runtime.NDArray or np.ndarray + kwargs: dict of str to tvm.runtime.Tensor or np.ndarray Named arguments to the function. """ cargs: List[Any] = [] @@ -482,7 +482,7 @@ def profile(self, func_name: str, *args): func_name : str The name of the function. - args: List of NDArray or other objects supported by PackedFunc. + args: List of Tensor or other objects supported by PackedFunc. The arguments to the function. Returns diff --git a/python/tvm/script/ir_builder/relax/distributed/ir.py b/python/tvm/script/ir_builder/relax/distributed/ir.py index 159ad5aea169..465cf6313eb1 100644 --- a/python/tvm/script/ir_builder/relax/distributed/ir.py +++ b/python/tvm/script/ir_builder/relax/distributed/ir.py @@ -29,7 +29,7 @@ from tvm.relax.distributed import DTensorStructInfo from tvm.relax.utils import args_converter from tvm import base as _base -from tvm.runtime import ndarray as _nd +from tvm.runtime import _tensor from tvm.relax.op.distributed import ( redistribute as _redistribute, annotate_sharding as _annotate_sharding, @@ -89,14 +89,14 @@ def call_tir( def const( - value: Union[bool, int, float, _np.ndarray, tvm.nd.NDArray], + value: Union[bool, int, float, _np.ndarray, tvm.runtime.Tensor], struct_info: DTensorStructInfo, ) -> Constant: """Create a constant value. Parameters ---------- - value: Union[bool, int, float, numpy.ndarray, tvm.nd.NDArray] + value: Union[bool, int, float, numpy.ndarray, tvm.runtime.Tensor] The constant value. dtype: Optional[str] @@ -121,10 +121,10 @@ def const( if isinstance(value, (_np.ndarray, _np.generic)): if dtype is not None: value = value.astype(dtype) - value = _nd.array(value) + value = _tensor.tensor(value) - if not isinstance(value, _nd.NDArray): - raise ValueError("value has to be scalar or NDArray") + if not isinstance(value, _tensor.Tensor): + raise ValueError("value has to be scalar or Tensor") return Constant(value, struct_info) diff --git a/python/tvm/script/ir_builder/relax/ir.py b/python/tvm/script/ir_builder/relax/ir.py index e61e563b706b..f045508bfcec 100644 --- a/python/tvm/script/ir_builder/relax/ir.py +++ b/python/tvm/script/ir_builder/relax/ir.py @@ -192,7 +192,7 @@ from tvm.relax.utils import args_converter, gen_call_tir_inputs from tvm.runtime import Object as tvm_Object from tvm.runtime import ObjectGeneric -from tvm.runtime.ndarray import ( +from tvm.runtime._tensor import ( cpu, cuda, device, diff --git a/python/tvm/script/ir_builder/tir/ir.py b/python/tvm/script/ir_builder/tir/ir.py index c6549ad104c3..ed41ac9bfb56 100644 --- a/python/tvm/script/ir_builder/tir/ir.py +++ b/python/tvm/script/ir_builder/tir/ir.py @@ -32,7 +32,7 @@ from tvm import ir, tir from tvm.ir import Type from tvm.ir.base import deprecated -from tvm.runtime import String, convert, ndarray +from tvm.runtime import String, convert, tensor from tvm.target import Target # pylint: disable=unused-import @@ -1054,7 +1054,7 @@ def allocate_const( np_data = np_data.reshape(extents) return _ffi_api.AllocateConst( # type: ignore[attr-defined] # pylint: disable=no-member - ndarray.array(np_data), dtype, extents, annotations + tensor(np_data), dtype, extents, annotations ) diff --git a/python/tvm/target/detect_target.py b/python/tvm/target/detect_target.py index 689825cbe174..808c63cef16a 100644 --- a/python/tvm/target/detect_target.py +++ b/python/tvm/target/detect_target.py @@ -18,8 +18,7 @@ from typing import Union from tvm_ffi import get_global_func -from ..runtime import Device -from ..runtime.ndarray import device +from ..runtime import Device, 
device from . import Target diff --git a/python/tvm/te/operation.py b/python/tvm/te/operation.py index c3634d3b0acc..91d3e2b81cc9 100644 --- a/python/tvm/te/operation.py +++ b/python/tvm/te/operation.py @@ -452,7 +452,7 @@ def const(value, dtype="int32", span=None): Parameters ---------- - value : Union[bool, int, float, numpy.ndarray, tvm.nd.NDArray] + value : Union[bool, int, float, numpy.ndarray, tvm.runtime.Tensor] The constant value. dtype : str diff --git a/python/tvm/testing/runner.py b/python/tvm/testing/runner.py index a4615f7a465f..f2625b28f972 100644 --- a/python/tvm/testing/runner.py +++ b/python/tvm/testing/runner.py @@ -24,7 +24,7 @@ import numpy as np from tvm.meta_schedule.runner import EvaluatorConfig, RPCConfig - from tvm.runtime import Device, Module, NDArray + from tvm.runtime import Device, Module, Tensor # pylint: disable=import-outside-toplevel,protected-access @@ -32,11 +32,11 @@ def _args_to_device(args, device): import numpy as np - from tvm.runtime.ndarray import NDArray, empty + from tvm.runtime.tensor import Tensor, empty uploaded_args = [] for arg in args: - if isinstance(arg, (np.ndarray, NDArray)): + if isinstance(arg, (np.ndarray, Tensor)): uploaded_args.append(empty(arg.shape, dtype=arg.dtype, device=device).copyfrom(arg)) elif isinstance(arg, (int, float)): uploaded_args.append(arg) @@ -46,11 +46,11 @@ def _args_to_device(args, device): def _args_to_numpy(args): - from tvm.runtime.ndarray import NDArray + from tvm.runtime.tensor import Tensor downloaded_args = [] for arg in args: - if isinstance(arg, NDArray): + if isinstance(arg, Tensor): downloaded_args.append(arg.numpy()) else: downloaded_args.append(arg) @@ -80,7 +80,7 @@ def export_with(func): def local_run( # pylint: disable=too-many-arguments,too-many-locals mod: "Module", device_type: str, - args: List[Union["np.ndarray", "NDArray", int, float]], + args: List[Union["np.ndarray", "Tensor", int, float]], evaluator_config: Optional["EvaluatorConfig"] = None, export_func: Union[Callable[["Module", str], None], Literal["tar", "ndk"]] = "tar", output_format: Optional[str] = None, @@ -93,7 +93,7 @@ def local_run( # pylint: disable=too-many-arguments,too-many-locals The TVM module to run. device_type : str The device type to run the module on. - args : List[Union[np.ndarray, NDArray, int, float]] + args : List[Union[np.ndarray, Tensor, int, float]] The arguments to be fed to the module. evaluator_config : Optional[EvaluatorConfig] The evaluator configuration to use. @@ -109,7 +109,7 @@ def local_run( # pylint: disable=too-many-arguments,too-many-locals Returns ------- - args : List[Union[np.ndarray, NDArray, int, float]] + args : List[Union[np.ndarray, Tensor, int, float]] The results of running the module. profile_result : tvm.runtime.BenchmarkResult The profiling result of running the module. @@ -152,7 +152,7 @@ def local_run( # pylint: disable=too-many-arguments,too-many-locals def rpc_run( # pylint: disable=too-many-arguments,too-many-locals mod: "Module", device_type: str, - args: List[Union["np.ndarray", "NDArray", int, float]], + args: List[Union["np.ndarray", "Tensor", int, float]], evaluator_config: Optional["EvaluatorConfig"] = None, rpc_config: Optional["RPCConfig"] = None, export_func: Union[Callable[["Module", str], None], Literal["tar", "ndk"]] = "tar", @@ -166,7 +166,7 @@ def rpc_run( # pylint: disable=too-many-arguments,too-many-locals The TVM module to run. device_type : str The device type to run the module on. 
- args : List[Union[np.ndarray, NDArray, int, float]] + args : List[Union[np.ndarray, Tensor, int, float]] The arguments to be fed to the module. evaluator_config : Optional[EvaluatorConfig] The evaluator configuration to use. @@ -189,7 +189,7 @@ def rpc_run( # pylint: disable=too-many-arguments,too-many-locals Returns ------- - args : List[Union[np.ndarray, NDArray, int, float]] + args : List[Union[np.ndarray, Tensor, int, float]] The results of running the module. profile_result : tvm.runtime.BenchmarkResult The profiling result of running the module. diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py index fcc452b6b4d4..da22cf77466f 100644 --- a/python/tvm/testing/utils.py +++ b/python/tvm/testing/utils.py @@ -324,7 +324,7 @@ def _compute_body(*us): return tvm.tir.stmt_functor.substitute(expr, vmap) A = tvm.te.compute([r.extent.value for v, r in vranges.items()], _compute_body) - args = [tvm.nd.empty(A.shape, A.dtype)] + args = [tvm.runtime.empty(A.shape, A.dtype)] mod = tvm.compile(tvm.IRModule.from_expr(tvm.te.create_prim_func([A]))) mod(*args) return args[0].numpy() diff --git a/python/tvm/tir/build.py b/python/tvm/tir/build.py index 98e549cc9c32..beccb65b6359 100644 --- a/python/tvm/tir/build.py +++ b/python/tvm/tir/build.py @@ -22,7 +22,6 @@ import tvm from tvm import ir from tvm.ir.module import IRModule -from tvm.runtime import ndarray from tvm.target import Target from tvm.tir import PrimFunc @@ -206,7 +205,7 @@ def build( if target is not None: if target.host is not None: target_host = target.host - elif ndarray.device(target.kind.name, 0).device_type == ndarray.cpu(0).device_type: + elif tvm.device(target.kind.name, 0).device_type == tvm.cpu(0).device_type: target_host = target target_host = Target.canon_target(target_host) target_to_bind = target_to_bind.with_host(target_host) diff --git a/python/tvm/tir/function.py b/python/tvm/tir/function.py index 750a9118abd6..5b365e124cfc 100644 --- a/python/tvm/tir/function.py +++ b/python/tvm/tir/function.py @@ -28,7 +28,7 @@ from tvm.ir import BaseFunc, Range from tvm.runtime import Object, Scriptable -from ..runtime.ndarray import NDArray +from ..runtime._tensor import Tensor from . import _ffi_api from .buffer import Buffer from .expr import PrimExpr, Var @@ -490,20 +490,20 @@ def map_shape(self, shape: List[PrimExpr]) -> List[PrimExpr]: """ return _ffi_api.IndexMapMapShape(self, shape) - def map_ndarray(self, arr_src: NDArray) -> NDArray: - """Apply thie index map to transform the layout of the input NDArray + def map_tensor(self, arr_src: Tensor) -> Tensor: + """Apply thie index map to transform the layout of the input Tensor Parameters ---------- - arr_src : runtime.NDArray - The NDArray to be transformed + arr_src : runtime.Tensor + The Tensor to be transformed Returns ------- - arr_dst : runtime.NDArray - The transformed NDArray + arr_dst : runtime.Tensor + The transformed Tensor """ - return _ffi_api.IndexMapMapNDArray(self, arr_src) + return _ffi_api.IndexMapMapTensor(self, arr_src) def inverse(self, shape: List[Union[Range, PrimExpr]]) -> "IndexMap": """Return the inverse of the map diff --git a/python/tvm/tir/op.py b/python/tvm/tir/op.py index ffd9aeff886d..d706a1a15023 100644 --- a/python/tvm/tir/op.py +++ b/python/tvm/tir/op.py @@ -51,7 +51,7 @@ def call_packed_lowered(*args, span=None): The argument is the corresponding POD type when Expr is presented. When the argument is Buffer, the corresponding PackedFunc will recieve an TVMArrayHandle whose content is valid during the callback period. 
- If the PackedFunc is a python callback, then the corresponding argument is NDArray. + If the PackedFunc is a python callback, then the corresponding argument is Tensor. Parameters ---------- @@ -108,7 +108,7 @@ def call_packed(*args, span=None): When the argument is Buffer, the corresponding PackedFunc will receive an TVMArrayHandle whose content is valid during the callback period. - If the PackedFunc is a python callback, then the corresponding argument is NDArray. + If the PackedFunc is a python callback, then the corresponding argument is Tensor. Parameters ---------- @@ -356,7 +356,7 @@ def tvm_stack_make_shape(*args): def tvm_stack_make_array(data, shape, strides, ndim, arr_dtype, elem_offset): - """Allocate a NDArray(DLTensor) on stack, return the handle + """Allocate a Tensor(DLTensor) on stack, return the handle Parameters ---------- diff --git a/python/tvm/tir/stmt.py b/python/tvm/tir/stmt.py index ed934183a5ce..bd90d5257495 100644 --- a/python/tvm/tir/stmt.py +++ b/python/tvm/tir/stmt.py @@ -31,7 +31,7 @@ import tvm_ffi from tvm.ir import PrimExpr, Range, Span -from tvm.runtime import Object, Scriptable, const, NDArray +from tvm.runtime import Object, Scriptable, const, Tensor from . import _ffi_api from .buffer import Buffer @@ -368,8 +368,8 @@ class AllocateConst(Stmt): extents : list of Expr The extents of the allocate - data_or_idx : Union[NDArray, int] - If an NDArray, this is the const data associated with the + data_or_idx : Union[Tensor, int] + If an Tensor, this is the const data associated with the constant. If an integer, this is the index into the "constants" attribute of the `IRModule` that contains the `AllocateConst`. @@ -387,7 +387,7 @@ class AllocateConst(Stmt): buffer_var: Var dtype: str extents: List[PrimExpr] - data: Optional[NDArray] + data: Optional[Tensor] irmod_storage_idx: Optional[int] body: Stmt annotations: Mapping[str, Object] @@ -398,7 +398,7 @@ def __init__( buffer_var: Var, dtype: str, extents: List[PrimExpr], - data_or_idx: Union[NDArray, int], + data_or_idx: Union[Tensor, int], body: Stmt, annotations: Optional[Mapping[str, Object]] = None, span: Optional[Span] = None, diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py index 93a182ca3bc2..bf02529194e3 100644 --- a/python/tvm/tir/transform/transform.py +++ b/python/tvm/tir/transform/transform.py @@ -373,7 +373,7 @@ def MakePackedAPI(): For static shapes, the `BufferNode::shape`, `BufferNode::strides`, and `BufferNode::elem_offset` member variables are used to generate runtime checks on the corresponding member variables in - the user-provided `DLTensor*` or `tvm.nd.array` argument. (e.g. A + the user-provided `DLTensor*` or `tvm.runtime.tensor` argument. (e.g. A PrimFunc that accepts a buffer of shape `[16,32]` validates that the `DLTensor::shape` array is `[16,32]`.) @@ -1052,26 +1052,26 @@ def InjectPTXAsyncCopy(): return _ffi_api.InjectPTXAsyncCopy() # type: ignore -def RemoveWeightLayoutRewriteBlock(skip_ndarray_rewrite=False): +def RemoveWeightLayoutRewriteBlock(skip_tensor_rewrite=False): """Remove weight layout rewrite block before benchmarking during tuning stage. Parameters ---------- - skip_ndarray_rewrite : bool - If True, exact rewrite of NDArray, according to the given index map, will be skipped. - Only the shape of the NDArray is transformed correctly, and the content of the destination + skip_tensor_rewrite : bool + If True, exact rewrite of Tensor, according to the given index map, will be skipped. 
+ Only the shape of the Tensor is transformed correctly, and the content of the destination array will be filled with random values. - When this pass is called many times during MetaSchedule tuning, the raw data of NDArray, - before and after rewrite, does not matter. Since NDArray layout rewrite, using IndexMap's - MapNDArray, is currently slow, skipping the exact rewrite is sometimes necessary. + When this pass is called many times during MetaSchedule tuning, the raw data of Tensor, + before and after rewrite, does not matter. Since Tensor layout rewrite, using IndexMap's + MapTensor, is currently slow, skipping the exact rewrite is sometimes necessary. Returns ------- fpass : tvm.transform.Pass The result pass """ - return _ffi_api.RemoveWeightLayoutRewriteBlock(skip_ndarray_rewrite) # type: ignore + return _ffi_api.RemoveWeightLayoutRewriteBlock(skip_tensor_rewrite) # type: ignore def ManifestSharedMemoryLocalStage(): diff --git a/python/tvm/topi/sort.py b/python/tvm/topi/sort.py index f75e5db4b9b1..1ee2964ae9b5 100644 --- a/python/tvm/topi/sort.py +++ b/python/tvm/topi/sort.py @@ -105,8 +105,8 @@ def argsort(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"): s = topi.generic.schedule_argsort(out) f = tvm.compile(s, [data, out], "llvm") dev = tvm.cpu() - tvm_data = tvm.nd.array(np_data, dev) - tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), dev) + tvm_data = tvm.runtime.tensor(np_data, dev) + tvm_out = tvm.runtime.tensor(np.zeros(dshape, dtype=data.dtype), dev) f(tvm_data, tvm_out) """ data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) diff --git a/python/tvm/topi/transform.py b/python/tvm/topi/transform.py index 98cec99a09b7..db09aed05a3c 100644 --- a/python/tvm/topi/transform.py +++ b/python/tvm/topi/transform.py @@ -736,7 +736,7 @@ def sequence_mask(data, valid_length, mask_value=0, axis=0): return cpp.sequence_mask(data, valid_length, mask_value, axis) -def ndarray_size(array, dtype="int32"): +def tensor_size(array, dtype="int32"): """Get the number of elements of input array Parameters @@ -752,7 +752,7 @@ def ndarray_size(array, dtype="int32"): result : tvm.te.Tensor The resulting tensor. 
""" - return cpp.ndarray_size(array, dtype) + return cpp.tensor_size(array, dtype) def where(condition, x, y): diff --git a/src/contrib/msc/core/ir/graph_builder.cc b/src/contrib/msc/core/ir/graph_builder.cc index 7f84978105ea..00176fb2ca0f 100644 --- a/src/contrib/msc/core/ir/graph_builder.cc +++ b/src/contrib/msc/core/ir/graph_builder.cc @@ -34,7 +34,7 @@ namespace msc { using namespace tvm::relax; -const std::string GetScalarStr(const runtime::NDArray& data, int float_precision) { +const std::string GetScalarStr(const runtime::Tensor& data, int float_precision) { std::string scalar_str; if (data->dtype.code == kDLFloat) { const float val = ExprUtils::GetScalar(data); @@ -809,7 +809,7 @@ Array GraphBuilder::GetPluginInputs(const Expr& expr) { return Downcast(call->args[1])->fields; } -Map WeightsExtractor::GetWeights(const Function& func) { +Map WeightsExtractor::GetWeights(const Function& func) { VisitExpr(func); return weights_; } @@ -849,7 +849,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ return builder.Build(func); }) .def("msc.core.GetRelaxWeights", - [](const IRModule& module, const String& entry_name) -> Map { + [](const IRModule& module, const String& entry_name) -> Map { const auto& func = Downcast(module->Lookup(entry_name)); return WeightsExtractor(module).GetWeights(func); }); diff --git a/src/contrib/msc/core/ir/graph_builder.h b/src/contrib/msc/core/ir/graph_builder.h index 401c452d95cb..79c4048304cf 100644 --- a/src/contrib/msc/core/ir/graph_builder.h +++ b/src/contrib/msc/core/ir/graph_builder.h @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include @@ -50,7 +50,7 @@ namespace msc { using namespace tvm::relax; using Expr = tvm::RelaxExpr; -using tvm::runtime::NDArray; +using tvm::runtime::Tensor; /*! * \brief Config for building MSCGraph. @@ -358,14 +358,14 @@ class WeightsExtractor : public ExprVisitor { } /*! 
\brief Visit the constant and save weights */ - Map GetWeights(const Function& func); + Map GetWeights(const Function& func); void VisitExpr_(const ConstantNode* op) final; void VisitExpr_(const CallNode* op) final; private: - Map weights_; + Map weights_; Map local_funcs_; IRModule ref_module_; }; diff --git a/src/contrib/msc/core/transform/bind_named_params.cc b/src/contrib/msc/core/transform/bind_named_params.cc index df534f4cfae6..dec4616f5e38 100644 --- a/src/contrib/msc/core/transform/bind_named_params.cc +++ b/src/contrib/msc/core/transform/bind_named_params.cc @@ -83,7 +83,7 @@ std::tuple, Map> NormalizeNamedBindings( auto normalize_value = [&](Var key, ffi::Any obj) -> relax::Expr { if (auto opt = obj.as()) { return opt.value(); - } else if (auto opt = obj.as()) { + } else if (auto opt = obj.as()) { const auto& span = SpanUtils::CreateWithAttr(msc_attr::kName, key->name_hint()); return Constant(opt.value(), StructInfo(), span); } else { diff --git a/src/contrib/msc/core/transform/rewrite_utils.cc b/src/contrib/msc/core/transform/rewrite_utils.cc index 9cbc7c1a8c51..c88cad3e64f7 100644 --- a/src/contrib/msc/core/transform/rewrite_utils.cc +++ b/src/contrib/msc/core/transform/rewrite_utils.cc @@ -42,7 +42,7 @@ Var RewriteUtils::MakeCall(BlockBuilder builder, const String& name, Expr op, Ar Expr RewriteUtils::MakeConstant(BlockBuilder builder, const String& name, double value, const DataType& dtype, size_t ndim) { - const auto& data = support::FloatImmToNDArray(FloatImm(dtype, value)); + const auto& data = support::FloatImmToTensor(FloatImm(dtype, value)); Span span = SpanUtils::CreateWithAttr(msc_attr::kName, name); const auto& constant = Constant(data, std::nullopt, span); if (ndim == 0) { diff --git a/src/contrib/msc/core/utils.h b/src/contrib/msc/core/utils.h index aeb7f9eb88fd..19ad0020e5ca 100644 --- a/src/contrib/msc/core/utils.h +++ b/src/contrib/msc/core/utils.h @@ -325,7 +325,7 @@ class ExprUtils { * \return The scalar value. 
*/ template - TVM_DLL static const T GetScalar(const runtime::NDArray& array, size_t i = 0) { + TVM_DLL static const T GetScalar(const runtime::Tensor& array, size_t i = 0) { if (array->dtype.code == kDLInt) { if (array->dtype.bits == 8) { return T(reinterpret_cast(array->data)[i]); diff --git a/src/contrib/msc/framework/tensorflow/codegen.cc b/src/contrib/msc/framework/tensorflow/codegen.cc index 1a5bdfeacb33..6a77440b7204 100644 --- a/src/contrib/msc/framework/tensorflow/codegen.cc +++ b/src/contrib/msc/framework/tensorflow/codegen.cc @@ -40,7 +40,7 @@ void TensorflowCodeGen::CodeGenHelper() { .func_arg("name", "str") .func_arg("shape", "List[int]") .func_arg("dtype", "str") - .func_arg("weights", "Dict[str, tvm.nd.array]") + .func_arg("weights", "Dict[str, tvm.runtime.Tensor]") .func_start() .cond_if("name in weights") .func_call("tf_v1.get_variable", "var") @@ -63,7 +63,7 @@ void TensorflowCodeGen::CodeGenGraph() { const auto& pair = graph()->FindProducerAndIdx(i); stack_.func_arg(IdxOutputBase(pair.first, pair.second), "tf_v1.Tensor"); } - stack_.func_arg("weights", "Dict[str, tvm.nd.array]").func_start(); + stack_.func_arg("weights", "Dict[str, tvm.runtime.Tensor]").func_start(); // define weights stack_.comment("Define the weights"); for (const auto& n : graph()->node_names) { diff --git a/src/meta_schedule/arg_info.cc b/src/meta_schedule/arg_info.cc index 9c2ba084ad41..12c6e29eb295 100644 --- a/src/meta_schedule/arg_info.cc +++ b/src/meta_schedule/arg_info.cc @@ -105,7 +105,7 @@ Array ArgInfo::FromPrimFunc(const tir::PrimFunc& func) { Array ArgInfo::FromEntryFunc(const IRModule& mod, bool remove_preproc) { if (remove_preproc) { IRModule new_mod = - tir::transform::RemoveWeightLayoutRewriteBlock(/*skip_ndarray_rewrite*/ true)(mod); + tir::transform::RemoveWeightLayoutRewriteBlock(/*skip_tensor_rewrite*/ true)(mod); return ArgInfo::FromPrimFunc(FindEntryFunc(new_mod)); } return ArgInfo::FromPrimFunc(FindEntryFunc(mod)); diff --git a/src/meta_schedule/builder/builder.cc b/src/meta_schedule/builder/builder.cc index 062e32e58e83..5657a362acce 100644 --- a/src/meta_schedule/builder/builder.cc +++ b/src/meta_schedule/builder/builder.cc @@ -26,7 +26,7 @@ namespace meta_schedule { /******** Constructors ********/ BuilderInput::BuilderInput(IRModule mod, Target target, - Optional> params) { + Optional> params) { ObjectPtr n = make_object(); n->mod = std::move(mod); n->target = std::move(target); @@ -59,7 +59,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ namespace refl = tvm::ffi::reflection; refl::GlobalDef() .def("meta_schedule.BuilderInput", - [](IRModule mod, Target target, Optional> params) + [](IRModule mod, Target target, Optional> params) -> BuilderInput { return BuilderInput(mod, target, params); }) .def("meta_schedule.BuilderResult", [](Optional artifact_path, Optional error_msg) -> BuilderResult { diff --git a/src/meta_schedule/feature_extractor/feature_extractor.cc b/src/meta_schedule/feature_extractor/feature_extractor.cc index 1f0668a84922..e2fa1fc176b4 100644 --- a/src/meta_schedule/feature_extractor/feature_extractor.cc +++ b/src/meta_schedule/feature_extractor/feature_extractor.cc @@ -23,7 +23,7 @@ namespace tvm { namespace meta_schedule { -Array PyFeatureExtractorNode::ExtractFrom( +Array PyFeatureExtractorNode::ExtractFrom( const TuneContext& context, const Array& candidates) { ICHECK(f_extract_from != nullptr) << "PyFeatureExtractor's ExtractFrom method not implemented!"; return f_extract_from(context, candidates); diff --git a/src/meta_schedule/feature_extractor/per_store_feature.cc 
b/src/meta_schedule/feature_extractor/per_store_feature.cc index d99fe6cc7847..7c9a809e7178 100644 --- a/src/meta_schedule/feature_extractor/per_store_feature.cc +++ b/src/meta_schedule/feature_extractor/per_store_feature.cc @@ -216,18 +216,18 @@ int64_t GetVarStride(const std::vector& multi_indices, const IntVec& } /*! - * \brief Converts a 2-dimensional STL vector to a TVM NDArray + * \brief Converts a 2-dimensional STL vector to a TVM Tensor * \param src The source 2-dimensional STL vector * \param second_dim_size The length of the second dimension. When the first dim of src is 0, - * second_dim_size must be specified, and in such case the shape of the result NDArray is + * second_dim_size must be specified, and in such case the shape of the result Tensor is * (0, second_dim_size). - * \return The converted TVM NDArray + * \return The converted TVM Tensor */ -runtime::NDArray AsNDArray(const std::vector>& src, int second_dim_size = -1) { +runtime::Tensor AsTensor(const std::vector>& src, int second_dim_size = -1) { int n = src.size(); ICHECK(!src.empty() || second_dim_size != -1); int m = src.empty() ? second_dim_size : src[0].size(); - runtime::NDArray tgt = runtime::NDArray::Empty( + runtime::Tensor tgt = runtime::Tensor::Empty( /*shape=*/{n, m}, /*dtype=*/DLDataType{kDLFloat, 64, 1}, /*ctx=*/DLDevice{kDLCPU, 0}); @@ -308,7 +308,7 @@ Pass SimplifyForFeatureExtraction() { */ Sequential PassListForPerStoreFeature() { return Sequential({ - tir::transform::RemoveWeightLayoutRewriteBlock(/*skip_ndarray_rewrite*/ true), + tir::transform::RemoveWeightLayoutRewriteBlock(/*skip_tensor_rewrite*/ true), tir::transform::SimplifyForFeatureExtraction(), tir::transform::LowerCrossThreadReduction(), tir::transform::LowerInitBlock(), @@ -1398,11 +1398,11 @@ class PerStoreFeatureNode : public FeatureExtractorNode { } } - Array ExtractFrom(const TuneContext& tune_context, - const Array& candidates) { + Array ExtractFrom(const TuneContext& tune_context, + const Array& candidates) { auto& target_keys = tune_context->target.value()->keys; bool is_gpu = std::find(target_keys.begin(), target_keys.end(), "gpu") != target_keys.end(); - std::vector results; + std::vector results; results.resize(candidates.size()); std::unique_ptr feature_group6 = nullptr; if (extract_workload) { @@ -1417,7 +1417,7 @@ class PerStoreFeatureNode : public FeatureExtractorNode { feature_group6->Export(&feature); } } - results[task_id] = tir::utils::AsNDArray(features, this->feature_vector_length); + results[task_id] = tir::utils::AsTensor(features, this->feature_vector_length); }; support::parallel_for_dynamic(0, candidates.size(), tune_context->num_threads, f); return results; diff --git a/src/meta_schedule/module_equality.cc b/src/meta_schedule/module_equality.cc index df8c45b5e697..c3b38cf341d9 100644 --- a/src/meta_schedule/module_equality.cc +++ b/src/meta_schedule/module_equality.cc @@ -37,20 +37,20 @@ class ModuleEqualityStructural : public ModuleEquality { String GetName() const { return "structural"; } }; -class ModuleEqualityIgnoreNDArray : public ModuleEquality { +class ModuleEqualityIgnoreTensor : public ModuleEquality { public: size_t Hash(IRModule mod) const { return tvm::ffi::StructuralHash::Hash(mod, /*map_free_vars=*/false, - /*skip_ndarray_content=*/true); + /*skip_tensor_content=*/true); } bool Equal(IRModule lhs, IRModule rhs) const { return tvm::ffi::StructuralEqual::Equal(lhs, rhs, /*map_free_vars=*/false, - /*skip_ndarray_content=*/true); + /*skip_tensor_content=*/true); } - String GetName() const { return 
"ignore-ndarray"; } + String GetName() const { return "ignore-tensor"; } }; -// The NDArray-ignoring variant of structural equal / hash is used for the module equality +// The Tensor-ignoring variant of structural equal / hash is used for the module equality // on the extracted anchor blocks. class ModuleEqualityAnchorBlock : public ModuleEquality { size_t Hash(IRModule mod) const { @@ -58,9 +58,9 @@ class ModuleEqualityAnchorBlock : public ModuleEquality { if (anchor_block) { return ffi::StructuralHash::Hash(GetRef(anchor_block), /*map_free_vars=*/false, - /*skip_ndarray_content=*/true); + /*skip_tensor_content=*/true); } - return ModuleEqualityIgnoreNDArray().Hash(mod); + return ModuleEqualityIgnoreTensor().Hash(mod); } bool Equal(IRModule lhs, IRModule rhs) const { auto anchor_block_lhs = tir::FindAnchorBlock(lhs); @@ -69,9 +69,9 @@ class ModuleEqualityAnchorBlock : public ModuleEquality { return tvm::ffi::StructuralEqual::Equal(GetRef(anchor_block_lhs), GetRef(anchor_block_rhs), /*map_free_vars=*/false, - /*skip_ndarray_content=*/true); + /*skip_tensor_content=*/true); } - return ModuleEqualityIgnoreNDArray().Equal(lhs, rhs); + return ModuleEqualityIgnoreTensor().Equal(lhs, rhs); } String GetName() const { return "anchor-block"; } }; @@ -79,8 +79,8 @@ class ModuleEqualityAnchorBlock : public ModuleEquality { std::unique_ptr ModuleEquality::Create(const std::string& mod_eq_name) { if (mod_eq_name == "structural") { return std::make_unique(); - } else if (mod_eq_name == "ignore-ndarray") { - return std::make_unique(); + } else if (mod_eq_name == "ignore-tensor") { + return std::make_unique(); } else if (mod_eq_name == "anchor-block") { return std::make_unique(); } diff --git a/src/meta_schedule/module_equality.h b/src/meta_schedule/module_equality.h index 7aa3944a4048..cd337c6d7ede 100644 --- a/src/meta_schedule/module_equality.h +++ b/src/meta_schedule/module_equality.h @@ -41,10 +41,10 @@ class ModuleEquality { * \param mod_eq_name A string to specify the module equality testing and hashing method. * It must be one of the followings: * - "structural": Use StructuralEqual/Hash - * - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during + * - "ignore-tensor": Same as "structural", but ignore tensor raw data during * equality testing and hashing. * - "anchor-block": Apply equality testing and hashing on the anchor block extracted from a - * given module. The "ignore-ndarray" varint is used for the extracted blocks + * given module. The "ignore-tensor" varint is used for the extracted blocks * or in case no anchor block is found. * For the definition of the anchor block, see tvm/tir/analysis.h. 
* \return An owning pointer to the created instance diff --git a/src/node/structural_hash.cc b/src/node/structural_hash.cc index 41a22e4d39d8..1810efa1bf2e 100644 --- a/src/node/structural_hash.cc +++ b/src/node/structural_hash.cc @@ -60,9 +60,9 @@ TVM_FFI_STATIC_INIT_BLOCK({ return rtmod; }); - refl::TypeAttrDef() + refl::TypeAttrDef() .def("__data_to_json__", - [](const runtime::NDArray::Container* node) { + [](const runtime::Tensor::Container* node) { std::string blob; dmlc::MemoryStringStream mstrm(&blob); support::Base64OutStream b64strm(&mstrm); @@ -74,7 +74,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ dmlc::MemoryStringStream mstrm(const_cast(&blob)); support::Base64InStream b64strm(&mstrm); b64strm.InitPosition(); - runtime::NDArray temp; + runtime::Tensor temp; ICHECK(temp.Load(&b64strm)); return temp; }); diff --git a/src/relax/backend/contrib/codegen_c/codegen_c.h b/src/relax/backend/contrib/codegen_c/codegen_c.h index 7f04091fc178..611e63de8954 100644 --- a/src/relax/backend/contrib/codegen_c/codegen_c.h +++ b/src/relax/backend/contrib/codegen_c/codegen_c.h @@ -115,7 +115,7 @@ class CodegenCBase { * * \code * - * Array foo_consts; + * Array foo_consts; * * // An example code for the generated C function. * int foo_wrapper_(DLTensor* arg0, @@ -129,7 +129,7 @@ class CodegenCBase { * * TVM_FFI_DLL_EXPORT_TYPED_FUNC(foo, foo_wrapper_); * - * int foo_init_wrapper_(Array arr) { + * int foo_init_wrapper_(Array arr) { * foo_consts = arr; * return 0; * } @@ -220,7 +220,7 @@ class CodegenCBase { // codegen. Moreover, in microTVM we dont expect this part to be generated. code_stream_ << "#ifdef __cplusplus\n"; code_stream_ << "int " << func_name - << "_init_wrapper_(tvm::Array arr) {\n"; + << "_init_wrapper_(tvm::Array arr) {\n"; EnterScope(); PrintIndents(); code_stream_ << func_name << "_consts = arr;\n"; @@ -369,7 +369,7 @@ class CodegenCBase { } /*! - * \brief Creates a checker to check if the NDArray pool is initialized + * \brief Creates a checker to check if the Tensor pool is initialized * * \param symobl The Symbol of the current function * @@ -389,8 +389,8 @@ class CodegenCBase { * * \return The created declaration */ - std::string CreateNDArrayPool(const std::string& symbol) const { - return "tvm::Array " + symbol + "_consts;"; + std::string CreateTensorPool(const std::string& symbol) const { + return "tvm::Array " + symbol + "_consts;"; } /*! diff --git a/src/relax/backend/contrib/codegen_json/codegen_json.h b/src/relax/backend/contrib/codegen_json/codegen_json.h index 3e0b6ea5e8c6..1ea03a63c0dc 100644 --- a/src/relax/backend/contrib/codegen_json/codegen_json.h +++ b/src/relax/backend/contrib/codegen_json/codegen_json.h @@ -174,7 +174,7 @@ class OpAttrExtractor { this->Visit(field_info->name.data, &value); break; } - case ffi::TypeIndex::kTVMFFINDArray: { + case ffi::TypeIndex::kTVMFFITensor: { this->Visit(field_info->name.data, &field_value); break; } diff --git a/src/relax/backend/vm/codegen_vm.cc b/src/relax/backend/vm/codegen_vm.cc index 1f9e8c0378a7..c26c043e7483 100644 --- a/src/relax/backend/vm/codegen_vm.cc +++ b/src/relax/backend/vm/codegen_vm.cc @@ -440,7 +440,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ * module(s). * \return The created module. */ -void LinkModules(ObjectPtr exec, const Map& params, +void LinkModules(ObjectPtr exec, const Map& params, const tvm::ffi::Module& lib, const Array& ext_libs) { // query if we need const loader for ext_modules // Wrap all submodules in the initialization wrapper. 
@@ -461,12 +461,12 @@ void LinkModules(ObjectPtr exec, const Map const_var_ndarray; + std::unordered_map const_var_tensor; for (const auto& [name, param] : params) { - const_var_ndarray[name] = param; + const_var_tensor[name] = param; } ffi::Module const_loader_mod = - runtime::ConstLoaderModuleCreate(const_var_ndarray, const_vars_by_symbol); + runtime::ConstLoaderModuleCreate(const_var_tensor, const_vars_by_symbol); const_loader_mod->ImportModule(lib); for (const auto& it : ext_libs) { const_loader_mod->ImportModule(it); @@ -485,7 +485,7 @@ void LinkModules(ObjectPtr exec, const Map lib, - Array ext_libs, Map params) { + Array ext_libs, Map params) { ObjectPtr executable = builder->Get(); if (!lib.defined()) { lib = codegen::CSourceModuleCreate(";", "c", Array{}); diff --git a/src/relax/ir/block_builder.cc b/src/relax/ir/block_builder.cc index 3cf24d8a8c1a..1a725db904b0 100644 --- a/src/relax/ir/block_builder.cc +++ b/src/relax/ir/block_builder.cc @@ -427,12 +427,12 @@ class BlockBuilderImpl : public BlockBuilderNode { return name_supply_->FreshName(prefix, /*add_prefix*/ false, /*add_underscore*/ false); } - /*! \brief A custom structural hashing that ignores NDArray raw data. */ + /*! \brief A custom structural hashing that ignores Tensor raw data. */ class StructuralHashIgnoreNDarray { public: uint64_t operator()(const ObjectRef& key) const { return ffi::StructuralHash::Hash(key, /*map_free_vars=*/false, - /*skip_ndarray_content=*/true); + /*skip_tensor_content=*/true); } }; diff --git a/src/relax/ir/expr.cc b/src/relax/ir/expr.cc index 8fbe05e891ee..844fd890e1fd 100644 --- a/src/relax/ir/expr.cc +++ b/src/relax/ir/expr.cc @@ -331,7 +331,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ }); }); -Constant::Constant(runtime::NDArray data, Optional struct_info_annotation, Span span) { +Constant::Constant(runtime::Tensor data, Optional struct_info_annotation, Span span) { ObjectPtr n = make_object(); n->data = std::move(data); n->span = std::move(span); @@ -356,7 +356,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ namespace refl = tvm::ffi::reflection; refl::GlobalDef().def( "relax.Constant", - [](runtime::NDArray data, Optional struct_info_annotation = std::nullopt, + [](runtime::Tensor data, Optional struct_info_annotation = std::nullopt, Span span = Span()) { return Constant(data, struct_info_annotation, span); }); }); diff --git a/src/relax/op/memory/view.cc b/src/relax/op/memory/view.cc index 1af12b475136..87f6864824ae 100644 --- a/src/relax/op/memory/view.cc +++ b/src/relax/op/memory/view.cc @@ -346,7 +346,7 @@ Expr LowerBuiltinView(const BlockBuilder& bb, const Call& call) { infer_sinfo_env_func = EnvFunc::Get("tvm.relax.struct_info.infer_view_sinfo"); auto runtime_view_sinfo = FuncStructInfo::OpaqueFunc(infer_sinfo_env_func, true); - ExternFunc runtime_view_func("runtime.TVMArrayCreateView", runtime_view_sinfo); + ExternFunc runtime_view_func("runtime.TVMTensorCreateView", runtime_view_sinfo); return Call(runtime_view_func, {data, shape, dtype, relative_byte_offset}); } diff --git a/src/relax/transform/bind_params.cc b/src/relax/transform/bind_params.cc index 13b138ecce55..1940a7a24d64 100644 --- a/src/relax/transform/bind_params.cc +++ b/src/relax/transform/bind_params.cc @@ -131,7 +131,7 @@ std::tuple, Map> NormalizeBindings( auto normalize_value = [&](ffi::Any obj) -> relax::Expr { if (auto opt = obj.as()) { return opt.value(); - } else if (auto opt = obj.as()) { + } else if (auto opt = obj.as()) { return Constant(opt.value()); } else { LOG(FATAL) << "Cannot coerce object of type " << obj.GetTypeKey() << " 
into relax expression"; diff --git a/src/relax/transform/fold_constant.cc b/src/relax/transform/fold_constant.cc index 33e077d72641..93b77387d550 100644 --- a/src/relax/transform/fold_constant.cc +++ b/src/relax/transform/fold_constant.cc @@ -73,8 +73,8 @@ class ConstantFolder : public ExprMutator { * \brief Pattern match op to constant array arguments. * \return The constant array arguments, or nullopt if match fails. */ - static Optional> MatchConstArrayArgs(const Array& args) { - Array res; + static Optional> MatchConstArrayArgs(const Array& args) { + Array res; for (auto arg : args) { auto* ptr = arg.as(); if (!ptr) return std::nullopt; @@ -144,7 +144,7 @@ class ConstantFolder : public ExprMutator { // Try constant evaluate the function call // if failed return std::nullopt - Optional ConstEvaluateCallTIR(tir::PrimFunc tir_func, Array arr_args, + Optional ConstEvaluateCallTIR(tir::PrimFunc tir_func, Array arr_args, ffi::Shape shape, DataType ret_type) { // obtain function from the cache. Optional func = GetCachedBuild(tir_func); @@ -154,11 +154,11 @@ class ConstantFolder : public ExprMutator { std::vector packed_args(arr_args.size() + 1); DLDevice cpu_dev = {DLDeviceType::kDLCPU, 0}; - runtime::NDArray ret_tensor = runtime::NDArray::Empty(shape, ret_type, cpu_dev); + runtime::Tensor ret_tensor = runtime::Tensor::Empty(shape, ret_type, cpu_dev); // avoid set rvalue ref which get de-allocated later, store args in a vector // where temp_args[i] are lvalue ref that is stable - std::vector temp_args(arr_args.begin(), arr_args.end()); + std::vector temp_args(arr_args.begin(), arr_args.end()); size_t arg_offset = 0; for (; arg_offset < arr_args.size(); ++arg_offset) { @@ -179,7 +179,7 @@ class ConstantFolder : public ExprMutator { ICHECK_GE(call->args.size(), 2); Optional func = MatchPrimFunc(call->args[0]); ICHECK(call->args[1].as()) << "call_tir.args[1] must be Tuple"; - Optional> arr_args = + Optional> arr_args = MatchConstArrayArgs(call->args[1].as()->fields); ICHECK_EQ(call->sinfo_args.size(), 1) << "call_tir should have exactly one sinfo arg"; Optional shape = MatchConstShape(call->sinfo_args[0]); @@ -268,7 +268,7 @@ class ConstantFolder : public ExprMutator { Expr arg = post_call->args[0]; if (arg->IsInstance()) { Constant constant = Downcast(arg); - runtime::NDArray ndarray = constant->data; + runtime::Tensor ndarray = constant->data; ICHECK_EQ(ndarray->device.device_type, kDLCPU); ICHECK(ffi::IsContiguous(*ndarray.get())); ICHECK_EQ(ndarray->byte_offset, 0); @@ -296,7 +296,7 @@ class ConstantFolder : public ExprMutator { } if (is_known) { const auto func = tvm::ffi::Function::GetGlobalRequired("relax.run.shape_to_tensor"); - runtime::NDArray vals = func(arr).cast(); + runtime::Tensor vals = func(arr).cast(); return Constant(vals); } } diff --git a/src/relax/transform/meta_schedule.cc b/src/relax/transform/meta_schedule.cc index acad7d154402..5bb8d2d3e305 100644 --- a/src/relax/transform/meta_schedule.cc +++ b/src/relax/transform/meta_schedule.cc @@ -37,7 +37,7 @@ class MetaScheduleTuner { public: explicit MetaScheduleTuner(Target target, String work_dir, Integer max_trials_global, Integer max_trials_per_task, Optional> op_names, - Map params = {}) + Map params = {}) : target_(target), work_dir_(work_dir), max_trials_global_(max_trials_global), @@ -68,7 +68,7 @@ class MetaScheduleTuner { Integer max_trials_global_; Integer max_trials_per_task_; Optional> op_names_; - Map params_; + Map params_; tvm::ffi::Function normalize_mod_func_; }; @@ -93,7 +93,7 @@ Pass 
MetaScheduleApplyDatabase(Optional work_dir, bool enable_warning = } Map result; - auto mod_eq_structural = meta_schedule::ModuleEquality::Create("ignore-ndarray"); + auto mod_eq_structural = meta_schedule::ModuleEquality::Create("ignore-tensor"); for (const auto& iter : mod->functions) { GlobalVar gv = iter.first; BaseFunc base_func = iter.second; @@ -146,7 +146,7 @@ Pass MetaScheduleApplyDatabase(Optional work_dir, bool enable_warning = return CreateModulePass(pass_func, 0, "MetaScheduleApplyDatabase", {}); } -Pass MetaScheduleTuneIRMod(Map params, String work_dir, +Pass MetaScheduleTuneIRMod(Map params, String work_dir, Integer max_trials_global, Optional max_trials_per_task = std::nullopt, Optional> op_names = std::nullopt) { diff --git a/src/relax/transform/run_codegen.cc b/src/relax/transform/run_codegen.cc index 0cc0a070aac5..af02225361f3 100644 --- a/src/relax/transform/run_codegen.cc +++ b/src/relax/transform/run_codegen.cc @@ -89,7 +89,7 @@ class CodeGenRunner : ExprMutator { if (constant_names.size()) { // Some backends (e.g. TensorRT) expect constants to be passed when they are instantiated - Map constants; + Map constants; for (const auto& [constant, name] : constant_names) { ICHECK(!constants.count(name)) << "More than one constant with the name " << name; constants.Set(name, constant->data); diff --git a/src/relax/transform/utils.h b/src/relax/transform/utils.h index 009d00260781..e4fe449ed65e 100644 --- a/src/relax/transform/utils.h +++ b/src/relax/transform/utils.h @@ -319,7 +319,7 @@ class FunctionCopier : public SymbolicVarRenewMutator { */ template inline Constant MakeConstantScalar(T value, DataType dtype) { - runtime::NDArray arr = runtime::NDArray::Empty({}, dtype, {kDLCPU, 0}); + runtime::Tensor arr = runtime::Tensor::Empty({}, dtype, {kDLCPU, 0}); if (dtype == DataType::Float(32)) { *static_cast(arr->data) = static_cast(value); } else if (dtype == DataType::Float(64)) { diff --git a/src/runtime/const_loader_module.cc b/src/runtime/const_loader_module.cc index 2c02fb556c73..6f07e10f62d7 100644 --- a/src/runtime/const_loader_module.cc +++ b/src/runtime/const_loader_module.cc @@ -19,7 +19,7 @@ /*! * \file src/runtime/const_loader_module.cc - * \brief A wrapper for initializing imported modules using constant NDArray. This + * \brief A wrapper for initializing imported modules using constant Tensor. This * module is intended to be used by various runtime in the TVM stack, i.e. * graph executor, relax VM, AOT runtime, and various user defined runtimes. It * paves the way to separate the code and metedata, which makes compilation @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include @@ -48,9 +48,9 @@ namespace runtime { class ConstLoaderModuleObj : public ffi::ModuleObj { public: ConstLoaderModuleObj( - const std::unordered_map& const_var_ndarray, + const std::unordered_map& const_var_tensor, const std::unordered_map>& const_vars_by_symbol) - : const_var_ndarray_(const_var_ndarray), const_vars_by_symbol_(const_vars_by_symbol) { + : const_var_tensor_(const_var_tensor), const_vars_by_symbol_(const_vars_by_symbol) { VLOG(1) << "Creating ConstLoaderModule"; // Only the related submodules are cached to reduce the number of runtime // symbol lookup for initialization. 
Otherwise, symbols/primitives in the @@ -59,7 +59,7 @@ class ConstLoaderModuleObj : public ffi::ModuleObj { for (const auto& var : kv.second) { VLOG(1) << "ConstLoaderModuleNode has constant '" << var << "' for function '" << kv.first << "'"; - ICHECK_GT(const_var_ndarray_.count(var), 0) + ICHECK_GT(const_var_tensor_.count(var), 0) << "ConstLoaderModuleNode is missing entry for constant '" << var << "' for function '" << kv.first << "'"; } @@ -78,10 +78,10 @@ class ConstLoaderModuleObj : public ffi::ModuleObj { } ObjectRef _self = ffi::GetRef(this); - if (name == "get_const_var_ndarray") { + if (name == "get_const_var_tensor") { return ffi::Function([_self, this](ffi::PackedArgs args, ffi::Any* rv) { Map ret_map; - for (const auto& kv : const_var_ndarray_) { + for (const auto& kv : const_var_tensor_) { ret_map.Set(kv.first, kv.second); } *rv = ret_map; @@ -107,17 +107,17 @@ class ConstLoaderModuleObj : public ffi::ModuleObj { /*! * \brief Get the list of constants that is required by the given module. * \param symbol The symbol that is being queried. - * \return The list of needed NDArray. + * \return The list of needed Tensor. */ - Array GetRequiredConstants(const std::string& symbol) { - Array ret; + Array GetRequiredConstants(const std::string& symbol) { + Array ret; ICHECK_GT(const_vars_by_symbol_.count(symbol), 0U) << "No constants known for function '" << symbol << "'"; std::vector vars = const_vars_by_symbol_[symbol]; for (const auto& var : vars) { - ICHECK_GT(const_var_ndarray_.count(var), 0U) + ICHECK_GT(const_var_tensor_.count(var), 0U) << "No such constant variable '" << var << "' for function '" << symbol << "'"; - ret.push_back(const_var_ndarray_[var]); + ret.push_back(const_var_tensor_[var]); } return ret; } @@ -157,20 +157,20 @@ class ConstLoaderModuleObj : public ffi::ModuleObj { dmlc::Stream* stream = &ms; std::vector variables; - std::vector const_var_ndarray; - for (const auto& it : const_var_ndarray_) { + std::vector const_var_tensor; + for (const auto& it : const_var_tensor_) { String var_name = it.first; variables.push_back(var_name); - const_var_ndarray.push_back(it.second); + const_var_tensor.push_back(it.second); } // Save all variables in the function. stream->Write(variables); // Save all constant data. - uint64_t sz = static_cast(const_var_ndarray.size()); + uint64_t sz = static_cast(const_var_tensor.size()); stream->Write(sz); for (uint64_t i = 0; i < sz; i++) { - const_var_ndarray[i].Save(stream); + const_var_tensor[i].Save(stream); } // Save the symbol to list of required constant variables mapping @@ -202,17 +202,17 @@ class ConstLoaderModuleObj : public ffi::ModuleObj { ICHECK_EQ(static_cast(sz), variables.size()) << "The number of variables and ndarray counts must match"; // Load the list of ndarray. 
- std::vector arrays; + std::vector arrays; for (uint64_t i = 0; i < sz; i++) { - NDArray temp; + Tensor temp; temp.Load(stream); arrays.push_back(temp); } - std::unordered_map const_var_ndarray; + std::unordered_map const_var_tensor; for (uint64_t i = 0; i < sz; i++) { - ICHECK_EQ(const_var_ndarray.count(variables[i]), 0U); - const_var_ndarray[variables[i]] = arrays[i]; + ICHECK_EQ(const_var_tensor.count(variables[i]), 0U); + const_var_tensor[variables[i]] = arrays[i]; } // Load the symbol to list of required constant variables mapping @@ -232,7 +232,7 @@ class ConstLoaderModuleObj : public ffi::ModuleObj { const_vars_by_symbol[symbols[i]] = const_vars[i]; } - auto n = make_object(const_var_ndarray, const_vars_by_symbol); + auto n = make_object(const_var_tensor, const_vars_by_symbol); return ffi::Module(n); } @@ -242,16 +242,16 @@ class ConstLoaderModuleObj : public ffi::ModuleObj { * modules using execution engine. */ std::unordered_map initialized_; - /*! \brief Variable name to NDArray mapping. */ - std::unordered_map const_var_ndarray_; + /*! \brief Variable name to Tensor mapping. */ + std::unordered_map const_var_tensor_; /*! \brief Symbol name to required constant variables mapping. */ std::unordered_map> const_vars_by_symbol_; }; ffi::Module ConstLoaderModuleCreate( - const std::unordered_map& const_var_ndarray, + const std::unordered_map& const_var_tensor, const std::unordered_map>& const_vars_by_symbol) { - auto n = make_object(const_var_ndarray, const_vars_by_symbol); + auto n = make_object(const_var_tensor, const_vars_by_symbol); return ffi::Module(n); } diff --git a/src/runtime/const_loader_module.h b/src/runtime/const_loader_module.h index c093818763d8..30bddc7b377a 100644 --- a/src/runtime/const_loader_module.h +++ b/src/runtime/const_loader_module.h @@ -25,7 +25,7 @@ #ifndef TVM_RUNTIME_CONST_LOADER_MODULE_H_ #define TVM_RUNTIME_CONST_LOADER_MODULE_H_ -#include +#include #include #include @@ -37,14 +37,14 @@ namespace runtime { /*! * \brief Create a ConstLoader module object. * - * \param const_var_ndarray Maps consts var name to NDArray containing data for the var. + * \param const_var_tensor Maps consts var name to Tensor containing data for the var. * \param const_vars_by_symbol Maps the name of a module init function to a list of names of * const vars whose data will be passed to that init function. * * \return The created ConstLoaderModule. */ ffi::Module ConstLoaderModuleCreate( - const std::unordered_map& const_var_ndarray, + const std::unordered_map& const_var_tensor, const std::unordered_map>& const_vars_by_symbol); } // namespace runtime diff --git a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc index 3de9e85a57c5..92e4bd06e254 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc +++ b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc @@ -24,7 +24,7 @@ #include #include -#include +#include #include "../json/json_node.h" #include "../json/json_runtime.h" @@ -77,7 +77,7 @@ class ACLRuntime : public JSONRuntimeBase { * * \param consts The constant params from compiled model. 
*/ - void Init(const Array& consts) override { + void Init(const Array& consts) override { ICHECK_EQ(consts.size(), const_idx_.size()) << "The number of input constants must match the number of required."; SetupConstants(consts); diff --git a/src/runtime/contrib/bnns/bnns_json_runtime.cc b/src/runtime/contrib/bnns/bnns_json_runtime.cc index 9080eeb9bb34..0386bde3783b 100644 --- a/src/runtime/contrib/bnns/bnns_json_runtime.cc +++ b/src/runtime/contrib/bnns/bnns_json_runtime.cc @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include @@ -93,7 +93,7 @@ class BNNSJSONRuntime : public JSONRuntimeBase { const char* kind() const override { return "bnns_json"; } - void Init(const Array& consts) override { + void Init(const Array& consts) override { ICHECK_EQ(consts.size(), const_idx_.size()) << "The number of input constants must match the number of required."; @@ -367,7 +367,7 @@ class BNNSJSONRuntime : public JSONRuntimeBase { dst_view.get_bnns_view()}; // BNNS limitation: MatMul use reverse dims values. However strides are calculated correctly - // based on BNNSNDArrayDescriptor::layout value. + // based on BNNSTensorDescriptor::layout value. std::reverse(layerParameters.iA_desc.size, layerParameters.iA_desc.size + 3); std::reverse(layerParameters.iB_desc.size, layerParameters.iB_desc.size + 3); std::reverse(layerParameters.o_desc.size, layerParameters.o_desc.size + 3); diff --git a/src/runtime/contrib/bnns/bnns_wrp.h b/src/runtime/contrib/bnns/bnns_wrp.h index f395561a7f6c..1997e0a84d71 100644 --- a/src/runtime/contrib/bnns/bnns_wrp.h +++ b/src/runtime/contrib/bnns/bnns_wrp.h @@ -62,7 +62,7 @@ class Tensor { auto rank = shape.size(); ICHECK(rank < BNNS_MAX_TENSOR_DIMENSION); - desc_ = {BNNSNDArrayFlags(0), + desc_ = {BNNSTensorFlags(0), getPlainLayout(rank), {}, // shape {}, // strides @@ -107,7 +107,7 @@ class Tensor { is_external_data = true; } - const BNNSNDArrayDescriptor& get_desc() const { return desc_; } + const BNNSTensorDescriptor& get_desc() const { return desc_; } static BNNSDataLayout getPlainLayout(size_t rank) { ICHECK(rank <= BNNS_MAX_TENSOR_DIMENSION); @@ -116,9 +116,9 @@ class Tensor { static size_t getRank(BNNSDataLayout layout) { return (layout & 0xF0000) >> 16; } - static size_t getRank(BNNSNDArrayDescriptor desc) { return getRank(desc.layout); } + static size_t getRank(BNNSTensorDescriptor desc) { return getRank(desc.layout); } - static size_t getSize(BNNSNDArrayDescriptor desc) { + static size_t getSize(BNNSTensorDescriptor desc) { auto rank = getRank(desc); return std::accumulate(desc.size, desc.size + rank, 1, std::multiplies()); } @@ -127,13 +127,13 @@ class Tensor { static size_t getElementSize(Dtype dtype) { return (dtype & 0xFFFF) / 8; } /** return size of element in bytes */ - static size_t getElementSize(const BNNSNDArrayDescriptor& desc) { + static size_t getElementSize(const BNNSTensorDescriptor& desc) { return getElementSize(desc.data_type); } private: bool is_external_data = false; - BNNSNDArrayDescriptor desc_; + BNNSTensorDescriptor desc_; }; using TensorPtr = std::shared_ptr; @@ -291,14 +291,14 @@ class TView { operator bool() const { return origin_ != nullptr; } /** Get BNNS descriptor for particular View. Batch and Party attributed are ignored. 
*/ - const BNNSNDArrayDescriptor& get_bnns_view() const { return view_desc_; } + const BNNSTensorDescriptor& get_bnns_view() const { return view_desc_; } private: /** Original tensor object to view on */ TensorPtr origin_; /** Batched view parameters */ - BNNSNDArrayDescriptor view_desc_ = {}; + BNNSTensorDescriptor view_desc_ = {}; size_t batch_size_ = 1; size_t batch_stride_ = 0; diff --git a/src/runtime/contrib/clml/clml_runtime.cc b/src/runtime/contrib/clml/clml_runtime.cc index 9d13e427b24a..39e38aa8725d 100644 --- a/src/runtime/contrib/clml/clml_runtime.cc +++ b/src/runtime/contrib/clml/clml_runtime.cc @@ -201,7 +201,7 @@ class CLMLRuntime : public JSONRuntimeBase { * * \param consts The constant params from compiled model. */ - void Init(const Array& consts) override { + void Init(const Array& consts) override { ICHECK_EQ(consts.size(), const_idx_.size()) << "The number of input constants must match the number of required."; SetupConstants(consts); @@ -270,7 +270,7 @@ class CLMLRuntime : public JSONRuntimeBase { "same by exporting CLML_DISABLE_RECORDABLE_QUEUE at runtime."; } cl_command_queue queue = CLML_QUEUE; - Map dump_tensors; + Map dump_tensors; std::ostringstream os; dmlc::JSONWriter writer(&os); writer.BeginObject(); @@ -293,7 +293,7 @@ class CLMLRuntime : public JSONRuntimeBase { // Dump tensor to CPU std::vector shape = node.GetOpShape()[0]; DLDataType tvm_dtype = node.GetOpDataType()[0]; - NDArray narr = NDArray::Empty(ffi::Shape(shape), tvm_dtype, {kDLCPU, 0}); + Tensor narr = Tensor::Empty(ffi::Shape(shape), tvm_dtype, {kDLCPU, 0}); CopyDataFromCLMLTensor(clml_desc, narr.operator->()->data); // Naming convention @@ -466,8 +466,8 @@ class CLMLRuntime : public JSONRuntimeBase { cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype); int dtype_size = cl_dtype == CL_FLOAT ? 
4 : 2; void* tmpptr = reinterpret_cast(malloc(isize * dtype_size)); - TVMArrayCopyToBytes(const_cast(data_entry_[eid]), const_cast(tmpptr), - isize * dtype_size); + TVMTensorCopyToBytes(const_cast(data_entry_[eid]), const_cast(tmpptr), + isize * dtype_size); CopyDataToCLMLTensor(layer_.inputs[nid], tmpptr); free(tmpptr); } @@ -553,8 +553,8 @@ class CLMLRuntime : public JSONRuntimeBase { void* tmpptr = reinterpret_cast(malloc(osize * dtype_size)); CopyDataFromCLMLTensor(layer_.outputs[0], tmpptr); - TVMArrayCopyFromBytes(const_cast(data_entry_[eid]), const_cast(tmpptr), - osize * dtype_size); + TVMTensorCopyFromBytes(const_cast(data_entry_[eid]), const_cast(tmpptr), + osize * dtype_size); free(tmpptr); } } diff --git a/src/runtime/contrib/clml/clml_runtime.h b/src/runtime/contrib/clml/clml_runtime.h index 4431b63cafcc..716ea4665ea4 100644 --- a/src/runtime/contrib/clml/clml_runtime.h +++ b/src/runtime/contrib/clml/clml_runtime.h @@ -33,8 +33,8 @@ #include #include #include -#include #include +#include #include #include @@ -253,11 +253,11 @@ struct CachedLayer { std::map> op_node_map; /* The input tensor map */ std::map> inputs; - /* A place holder Tensor representing TVM NDArray as CLML Tensor */ + /* A place holder Tensor representing TVM Tensor as CLML Tensor */ std::map> in_placeholder; /* The Output tensor map */ std::vector> outputs; - /* A place holder Tensor representing TVM NDArray as CLML Tensor */ + /* A place holder Tensor representing TVM Tensor as CLML Tensor */ std::vector> out_placeholder; /* Tensor shape exception list while returning from CLML Subgraph */ std::map> out_shapes; diff --git a/src/runtime/contrib/coreml/coreml_runtime.h b/src/runtime/contrib/coreml/coreml_runtime.h index 257b624bbf2b..3f7db78bfc31 100644 --- a/src/runtime/contrib/coreml/coreml_runtime.h +++ b/src/runtime/contrib/coreml/coreml_runtime.h @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include @@ -67,12 +67,12 @@ class CoreMLModel { */ void SetInput(const std::string& key, DLTensor* data_in); /*! - * \brief Return NDArray for given output index. + * \brief Return Tensor for given output index. * \param index The output index. * - * \return NDArray corresponding to given output node index. + * \return Tensor corresponding to given output node index. */ - NDArray GetOutput(int index) const; + Tensor GetOutput(int index) const; /*! * \brief Return the number of outputs * diff --git a/src/runtime/contrib/coreml/coreml_runtime.mm b/src/runtime/contrib/coreml/coreml_runtime.mm index fb5faa8621b2..5926fb32d62c 100644 --- a/src/runtime/contrib/coreml/coreml_runtime.mm +++ b/src/runtime/contrib/coreml/coreml_runtime.mm @@ -67,7 +67,7 @@ [input_dict_ setObject:dest forKey:nsKey]; } -NDArray CoreMLModel::GetOutput(int index) const { +Tensor CoreMLModel::GetOutput(int index) const { MLModelDescription* model_desc = model_.modelDescription; NSString* metadata = [model_desc metadata][MLModelDescriptionKey]; NSData* data = [metadata dataUsingEncoding:NSUTF8StringEncoding]; @@ -103,7 +103,7 @@ .device_type = kDLCPU, .device_id = 0, }; - NDArray ret = NDArray::Empty(shape, dtype, cpu_dev); + Tensor ret = Tensor::Empty(shape, dtype, cpu_dev); ret.CopyFromBytes(src.dataPointer, size); return ret; @@ -157,10 +157,9 @@ // Copy input tensors to corresponding data entries. 
for (auto i = 0; i < args.size() - 1; ++i) { - ICHECK(args[i].type_code() == kTVMDLTensorHandle || - args[i].type_code() == kTVMNDArrayHandle) - << "Expect NDArray or DLTensor as inputs\n"; - if (args[i].type_code() == kTVMDLTensorHandle || args[i].type_code() == kTVMNDArrayHandle) { + ICHECK(args[i].type_code() == kTVMDLTensorHandle || args[i].type_code() == kTVMTensorHandle) + << "Expect Tensor or DLTensor as inputs\n"; + if (args[i].type_code() == kTVMDLTensorHandle || args[i].type_code() == kTVMTensorHandle) { model_->SetInput([input_names[i] UTF8String], args[i]); } else { LOG(FATAL) << "Not implemented"; @@ -171,12 +170,12 @@ model_->Invoke(); // TODO: Support multiple outputs. - NDArray out = model_->GetOutput(0); + Tensor out = model_->GetOutput(0); if (args[args.size() - 1].type_code() == kTVMDLTensorHandle) { DLTensor* arg = args[args.size() - 1]; out.CopyTo(arg); } else { - NDArray arg = args[args.size() - 1]; + Tensor arg = args[args.size() - 1]; out.CopyTo(arg); } *rv = out; diff --git a/src/runtime/contrib/cublas/cublas_json_runtime.cc b/src/runtime/contrib/cublas/cublas_json_runtime.cc index 0416391303ad..99eda5cc89f8 100644 --- a/src/runtime/contrib/cublas/cublas_json_runtime.cc +++ b/src/runtime/contrib/cublas/cublas_json_runtime.cc @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include @@ -49,7 +49,7 @@ class CublasJSONRuntime : public JSONRuntimeBase { const Array const_names) : JSONRuntimeBase(symbol_name, graph_json, const_names) {} - void Init(const Array& consts) override {} + void Init(const Array& consts) override {} ffi::Optional GetFunction(const String& name) override { // JSONRuntimeBase::SetInputOutputBuffers(...) is not thread safe. Since CublasJSONRuntime @@ -76,8 +76,8 @@ class CublasJSONRuntime : public JSONRuntimeBase { : EntryID(outputs_[i - input_var_eid_.size()]); const DLTensor* arg; - if (auto opt_nd = args[i].as()) { - NDArray arr = opt_nd.value(); + if (auto opt_nd = args[i].as()) { + Tensor arr = opt_nd.value(); arg = arr.operator->(); } else { arg = args[i].cast(); diff --git a/src/runtime/contrib/cudnn/cudnn_json_runtime.cc b/src/runtime/contrib/cudnn/cudnn_json_runtime.cc index 3888bca3df04..1e17cf2ecfd4 100644 --- a/src/runtime/contrib/cudnn/cudnn_json_runtime.cc +++ b/src/runtime/contrib/cudnn/cudnn_json_runtime.cc @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include @@ -52,7 +52,7 @@ class cuDNNJSONRuntime : public JSONRuntimeBase { const Array const_names) : JSONRuntimeBase(symbol_name, graph_json, const_names) {} - void Init(const Array& consts) override { + void Init(const Array& consts) override { op_execs_.resize(nodes_.size()); // get some config from the graph for (size_t i = 0; i < nodes_.size(); ++i) { diff --git a/src/runtime/contrib/cutlass/fp16_group_gemm.cuh b/src/runtime/contrib/cutlass/fp16_group_gemm.cuh index cb26a0796d53..ffc05893cad6 100644 --- a/src/runtime/contrib/cutlass/fp16_group_gemm.cuh +++ b/src/runtime/contrib/cutlass/fp16_group_gemm.cuh @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include "cutlass/bfloat16.h" #include "cutlass/half.h" @@ -33,8 +33,8 @@ template struct CutlassGroupGemm; template -void tvm_cutlass_group_gemm_impl(NDArray x, NDArray weight, NDArray indptr, NDArray workspace, - NDArray out) { +void tvm_cutlass_group_gemm_impl(Tensor x, Tensor weight, Tensor indptr, Tensor workspace, + Tensor out) { // Workspace is used for storing device-side group gemm arguments and cutlass internal workspace. // Recommened size is 4MB. 
cudaStream_t stream = diff --git a/src/runtime/contrib/cutlass/fp16_group_gemm_sm100.cu b/src/runtime/contrib/cutlass/fp16_group_gemm_sm100.cu index 90802969c53e..ef72c0008034 100644 --- a/src/runtime/contrib/cutlass/fp16_group_gemm_sm100.cu +++ b/src/runtime/contrib/cutlass/fp16_group_gemm_sm100.cu @@ -21,8 +21,8 @@ #include #include #include -#include #include +#include #include "fp16_group_gemm.cuh" #include "fp16_group_gemm_runner_sm100.cuh" @@ -42,8 +42,8 @@ struct CutlassGroupGemm<100, ElementA, ElementB, ElementC> { } }; -void tvm_cutlass_group_gemm_sm100(NDArray x, NDArray weight, NDArray indptr, NDArray workspace, - NDArray out) { +void tvm_cutlass_group_gemm_sm100(Tensor x, Tensor weight, Tensor indptr, Tensor workspace, + Tensor out) { tvm_cutlass_group_gemm_impl<100>(x, weight, indptr, workspace, out); } diff --git a/src/runtime/contrib/cutlass/fp16_group_gemm_sm90.cu b/src/runtime/contrib/cutlass/fp16_group_gemm_sm90.cu index 0b240b85a4f4..508bc77f9205 100644 --- a/src/runtime/contrib/cutlass/fp16_group_gemm_sm90.cu +++ b/src/runtime/contrib/cutlass/fp16_group_gemm_sm90.cu @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include "fp16_group_gemm.cuh" #include "fp16_group_gemm_runner_sm90.cuh" @@ -41,8 +41,8 @@ struct CutlassGroupGemm<90, ElementA, ElementB, ElementC> { } }; -void tvm_cutlass_group_gemm_sm90(NDArray x, NDArray weight, NDArray indptr, NDArray workspace, - NDArray out) { +void tvm_cutlass_group_gemm_sm90(Tensor x, Tensor weight, Tensor indptr, Tensor workspace, + Tensor out) { tvm_cutlass_group_gemm_impl<90>(x, weight, indptr, workspace, out); } diff --git a/src/runtime/contrib/cutlass/fp8_gemm.cu b/src/runtime/contrib/cutlass/fp8_gemm.cu index 5cabd0ca7af2..2be8c09da2dc 100644 --- a/src/runtime/contrib/cutlass/fp8_gemm.cu +++ b/src/runtime/contrib/cutlass/fp8_gemm.cu @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include "../cublas/cublas_utils.h" #include "gemm_runner.cuh" @@ -39,8 +39,7 @@ namespace tvm { namespace runtime { template -void tvm_cutlass_fp8_gemm(NDArray x, NDArray weight, NDArray workspace, NDArray alpha, - NDArray out) { +void tvm_cutlass_fp8_gemm(Tensor x, Tensor weight, Tensor workspace, Tensor alpha, Tensor out) { // Workspace is used for storing device-side gemm arguments and cutlass internal workspace. // Recommened size is 4MB. cudaStream_t stream = diff --git a/src/runtime/contrib/cutlass/fp8_group_gemm_sm90.cu b/src/runtime/contrib/cutlass/fp8_group_gemm_sm90.cu index 150485b86822..48e68cb804f6 100644 --- a/src/runtime/contrib/cutlass/fp8_group_gemm_sm90.cu +++ b/src/runtime/contrib/cutlass/fp8_group_gemm_sm90.cu @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include "fp16_group_gemm_runner_sm90.cuh" @@ -42,8 +42,8 @@ namespace tvm { namespace runtime { template -void tvm_cutlass_fp8_group_gemm(NDArray x, NDArray weight, NDArray indptr, NDArray workspace, - NDArray alpha, NDArray out) { +void tvm_cutlass_fp8_group_gemm(Tensor x, Tensor weight, Tensor indptr, Tensor workspace, + Tensor alpha, Tensor out) { // Workspace is used for storing device-side group gemm arguments and cutlass internal workspace. // Recommened size is 4MB. 
cudaStream_t stream = diff --git a/src/runtime/contrib/cutlass/fp8_groupwise_scaled_gemm.cuh b/src/runtime/contrib/cutlass/fp8_groupwise_scaled_gemm.cuh index 0f688616d55e..e03366a03860 100644 --- a/src/runtime/contrib/cutlass/fp8_groupwise_scaled_gemm.cuh +++ b/src/runtime/contrib/cutlass/fp8_groupwise_scaled_gemm.cuh @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include "cutlass/bfloat16.h" #include "cutlass/half.h" @@ -34,10 +34,10 @@ template -void tvm_cutlass_fp8_groupwise_scaled_gemm_impl(NDArray a, NDArray b, NDArray scales_a, - NDArray scales_b, NDArray workspace, +void tvm_cutlass_fp8_groupwise_scaled_gemm_impl(Tensor a, Tensor b, Tensor scales_a, + Tensor scales_b, Tensor workspace, int64_t block_size_0, int64_t block_size_1, - NDArray out) { + Tensor out) { // Workspace is used for storing device-side gemm arguments and cutlass internal workspace. // Recommened size is 4MB. cudaStream_t stream = static_cast(TVMFFIEnvGetCurrentStream(kDLCUDA, a->device.device_id)); @@ -100,10 +100,10 @@ void tvm_cutlass_fp8_groupwise_scaled_gemm_impl(NDArray a, NDArray b, NDArray sc } template -void tvm_cutlass_fp8_groupwise_scaled_bmm_impl(NDArray a, NDArray b, NDArray scales_a, - NDArray scales_b, NDArray workspace, +void tvm_cutlass_fp8_groupwise_scaled_bmm_impl(Tensor a, Tensor b, Tensor scales_a, + Tensor scales_b, Tensor workspace, int64_t block_size_0, int64_t block_size_1, - NDArray out) { + Tensor out) { // Workspace is used for storing device-side gemm arguments and cutlass internal workspace. // Recommened size is 4MB. cudaStream_t stream = static_cast(TVMFFIEnvGetCurrentStream(kDLCUDA, a->device.device_id)); diff --git a/src/runtime/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm100.cuh b/src/runtime/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm100.cuh index 95fc578fd43f..87cd8108f9ee 100644 --- a/src/runtime/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm100.cuh +++ b/src/runtime/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm100.cuh @@ -53,7 +53,7 @@ } using namespace cute; -using tvm::runtime::NDArray; +using tvm::runtime::Tensor; template struct CutlassFP8ScaledGroupwiseGemmRunnerSM100 { diff --git a/src/runtime/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm90.cuh b/src/runtime/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm90.cuh index 5ec9ed083916..d5321d157c74 100644 --- a/src/runtime/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm90.cuh +++ b/src/runtime/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm90.cuh @@ -54,7 +54,7 @@ using namespace cute; using ProblemShape = Shape; -using tvm::runtime::NDArray; +using tvm::runtime::Tensor; template diff --git a/src/runtime/contrib/cutlass/fp8_groupwise_scaled_gemm_sm100.cu b/src/runtime/contrib/cutlass/fp8_groupwise_scaled_gemm_sm100.cu index 7201604a7c85..bd2d2aa04fb4 100644 --- a/src/runtime/contrib/cutlass/fp8_groupwise_scaled_gemm_sm100.cu +++ b/src/runtime/contrib/cutlass/fp8_groupwise_scaled_gemm_sm100.cu @@ -21,8 +21,8 @@ #include #include #include -#include #include +#include #include "../cublas/cublas_utils.h" #include "fp8_groupwise_scaled_gemm.cuh" @@ -47,20 +47,20 @@ struct CutlassFP8GroupwiseGemm<100, TileShape, ClusterShape, ElementA, ElementB, } }; -void tvm_cutlass_fp8_groupwise_scaled_gemm_sm100(NDArray a, NDArray b, NDArray scales_a, - NDArray scales_b, NDArray workspace, +void tvm_cutlass_fp8_groupwise_scaled_gemm_sm100(Tensor a, Tensor b, Tensor scales_a, + Tensor scales_b, Tensor workspace, int64_t block_size_0, int64_t block_size_1, - NDArray out) { + Tensor 
out) { using TileShape = Shape<_128, _128, _128>; using ClusterShape = Shape<_1, _1, _1>; tvm_cutlass_fp8_groupwise_scaled_gemm_impl<100, TileShape, ClusterShape>( a, b, scales_a, scales_b, workspace, block_size_0, block_size_1, out); } -void tvm_cutlass_fp8_groupwise_scaled_bmm_sm100(NDArray a, NDArray b, NDArray scales_a, - NDArray scales_b, NDArray workspace, +void tvm_cutlass_fp8_groupwise_scaled_bmm_sm100(Tensor a, Tensor b, Tensor scales_a, + Tensor scales_b, Tensor workspace, int64_t block_size_0, int64_t block_size_1, - NDArray out) { + Tensor out) { using TileShape = Shape<_128, _128, _128>; using ClusterShape = Shape<_1, _1, _1>; tvm_cutlass_fp8_groupwise_scaled_bmm_impl<100, TileShape, ClusterShape>( diff --git a/src/runtime/contrib/cutlass/fp8_groupwise_scaled_gemm_sm90.cu b/src/runtime/contrib/cutlass/fp8_groupwise_scaled_gemm_sm90.cu index 8099d91419e5..dc067038c7a9 100644 --- a/src/runtime/contrib/cutlass/fp8_groupwise_scaled_gemm_sm90.cu +++ b/src/runtime/contrib/cutlass/fp8_groupwise_scaled_gemm_sm90.cu @@ -21,8 +21,8 @@ #include #include #include -#include #include +#include #include "../cublas/cublas_utils.h" #include "fp8_groupwise_scaled_gemm.cuh" @@ -47,20 +47,19 @@ struct CutlassFP8GroupwiseGemm<90, TileShape, ClusterShape, ElementA, ElementB, } }; -void tvm_cutlass_fp8_groupwise_scaled_gemm_sm90(NDArray a, NDArray b, NDArray scales_a, - NDArray scales_b, NDArray workspace, +void tvm_cutlass_fp8_groupwise_scaled_gemm_sm90(Tensor a, Tensor b, Tensor scales_a, + Tensor scales_b, Tensor workspace, int64_t block_size_0, int64_t block_size_1, - NDArray out) { + Tensor out) { using TileShape = Shape<_128, _128, _128>; using ClusterShape = Shape<_1, _1, _1>; tvm_cutlass_fp8_groupwise_scaled_gemm_impl<90, TileShape, ClusterShape>( a, b, scales_a, scales_b, workspace, block_size_0, block_size_1, out); } -void tvm_cutlass_fp8_groupwise_scaled_bmm_sm90(NDArray a, NDArray b, NDArray scales_a, - NDArray scales_b, NDArray workspace, - int64_t block_size_0, int64_t block_size_1, - NDArray out) { +void tvm_cutlass_fp8_groupwise_scaled_bmm_sm90(Tensor a, Tensor b, Tensor scales_a, Tensor scales_b, + Tensor workspace, int64_t block_size_0, + int64_t block_size_1, Tensor out) { using TileShape = Shape<_128, _128, _128>; using ClusterShape = Shape<_1, _1, _1>; tvm_cutlass_fp8_groupwise_scaled_bmm_impl<90, TileShape, ClusterShape>( diff --git a/src/runtime/contrib/cutlass/fp8_groupwise_scaled_group_gemm_sm100.cu b/src/runtime/contrib/cutlass/fp8_groupwise_scaled_group_gemm_sm100.cu index b9be378a9aff..420f93d4f2f3 100644 --- a/src/runtime/contrib/cutlass/fp8_groupwise_scaled_group_gemm_sm100.cu +++ b/src/runtime/contrib/cutlass/fp8_groupwise_scaled_group_gemm_sm100.cu @@ -22,8 +22,8 @@ #include #include #include -#include #include +#include #include "fp8_groupwise_scaled_group_gemm_runner_sm100.cuh" @@ -32,10 +32,10 @@ namespace tvm { namespace runtime { -void tvm_fp8_groupwise_scaled_group_gemm_sm100(NDArray a, NDArray b, NDArray scales_a, - NDArray scales_b, NDArray indptr, NDArray workspace, +void tvm_fp8_groupwise_scaled_group_gemm_sm100(Tensor a, Tensor b, Tensor scales_a, Tensor scales_b, + Tensor indptr, Tensor workspace, int64_t block_size_0, int64_t block_size_1, - NDArray out) { + Tensor out) { // Workspace is used for storing device-side group gemm arguments and cutlass internal workspace. // Recommended size is 4MB. 
cudaStream_t stream = diff --git a/src/runtime/contrib/cutlass/weight_preprocess.cc b/src/runtime/contrib/cutlass/weight_preprocess.cc index c403039c586a..32c30450cf48 100644 --- a/src/runtime/contrib/cutlass/weight_preprocess.cc +++ b/src/runtime/contrib/cutlass/weight_preprocess.cc @@ -19,7 +19,7 @@ #include #include -#include +#include #include "cutlass_kernels/cutlass_preprocessors.h" @@ -37,7 +37,7 @@ namespace runtime { // The preprocessing functions are defined in C++, so we need to copy the input weight to CPU. TVM_FFI_STATIC_INIT_BLOCK({ namespace refl = tvm::ffi::reflection; - refl::GlobalDef().def("cutlass.ft_preprocess_weight", [](NDArray packed_weight, int sm, + refl::GlobalDef().def("cutlass.ft_preprocess_weight", [](Tensor packed_weight, int sm, bool is_int4) { bool is_2d = packed_weight->ndim == 2; int num_experts = is_2d ? 1 : packed_weight->shape[0]; @@ -54,7 +54,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ } fastertransformer::preprocess_weights(output_cpu.data(), input_cpu.data(), num_experts, rows, cols, is_int4, sm); - auto out = NDArray::Empty(packed_weight.Shape(), packed_weight->dtype, packed_weight->device); + auto out = Tensor::Empty(packed_weight.Shape(), packed_weight->dtype, packed_weight->device); out.CopyFromBytes(output_cpu.data(), output_cpu.size()); return out; }); diff --git a/src/runtime/contrib/dnnl/dnnl_json_runtime.cc b/src/runtime/contrib/dnnl/dnnl_json_runtime.cc index 59b162e76503..eccfb913d177 100644 --- a/src/runtime/contrib/dnnl/dnnl_json_runtime.cc +++ b/src/runtime/contrib/dnnl/dnnl_json_runtime.cc @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include @@ -60,7 +60,7 @@ class DNNLJSONRuntime : public JSONRuntimeBase { const char* kind() const override { return "dnnl_json"; } - void Init(const Array& consts) override { + void Init(const Array& consts) override { ICHECK_EQ(consts.size(), const_idx_.size()) << "The number of input constants must match the number of required."; diff --git a/src/runtime/contrib/hipblas/hipblas_json_runtime.cc b/src/runtime/contrib/hipblas/hipblas_json_runtime.cc index 08866fc1088a..046c1c14b30b 100644 --- a/src/runtime/contrib/hipblas/hipblas_json_runtime.cc +++ b/src/runtime/contrib/hipblas/hipblas_json_runtime.cc @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include @@ -47,7 +47,7 @@ class HipblasJSONRuntime : public JSONRuntimeBase { const Array const_names) : JSONRuntimeBase(symbol_name, graph_json, const_names) {} - void Init(const Array& consts) override {} + void Init(const Array& consts) override {} ffi::Optional GetFunction(const String& name) override { // JSONRuntimeBase::SetInputOutputBuffers(...) is not thread safe. Since HipblasJSONRuntime @@ -75,8 +75,8 @@ class HipblasJSONRuntime : public JSONRuntimeBase { : EntryID(outputs_[i - input_var_eid_.size()]); const DLTensor* arg; - if (auto opt_nd = args[i].as()) { - NDArray arr = opt_nd.value(); + if (auto opt_nd = args[i].as()) { + Tensor arr = opt_nd.value(); arg = arr.operator->(); } else { arg = args[i].cast(); diff --git a/src/runtime/contrib/json/json_runtime.h b/src/runtime/contrib/json/json_runtime.h index d9e5af60f299..ea32f7f1f24a 100644 --- a/src/runtime/contrib/json/json_runtime.h +++ b/src/runtime/contrib/json/json_runtime.h @@ -26,8 +26,8 @@ #define TVM_RUNTIME_CONTRIB_JSON_JSON_RUNTIME_H_ #include -#include #include +#include #include #include @@ -63,7 +63,7 @@ class JSONRuntimeBase : public ffi::ModuleObj { } /*! \brief Initialize a specific json runtime. 
*/ - virtual void Init(const Array& consts) = 0; + virtual void Init(const Array& consts) = 0; /*! \brief Invoke the execution engine to inteprete a specific json runtime. */ virtual void Run() = 0; @@ -141,7 +141,7 @@ class JSONRuntimeBase : public ffi::ModuleObj { ICHECK_EQ(args.size(), 1U); std::lock_guard guard(this->initialize_mutex_); if (!this->initialized_) { - this->Init(args[0].cast>()); + this->Init(args[0].cast>()); this->initialized_ = true; } *rv = 0; @@ -212,14 +212,14 @@ class JSONRuntimeBase : public ffi::ModuleObj { : EntryID(outputs_[i - input_var_eid_.size()]); const DLTensor* arg; - if (auto opt_nd = args[i].as()) { - NDArray arr = opt_nd.value(); + if (auto opt_nd = args[i].as()) { + Tensor arr = opt_nd.value(); arg = arr.operator->(); } else { arg = args[i].cast(); } - // Assign input/output the NDArray pointers to data entry so that we can directly + // Assign input/output the Tensor pointers to data entry so that we can directly // read/write host buffers. data_entry_[eid] = arg; } @@ -268,9 +268,9 @@ class JSONRuntimeBase : public ffi::ModuleObj { * \brief Set up the constants/weights for inference by binding their DLTensor pointer to * the corresponding data entry. * - * \param consts A list of constant NDArray to be used. + * \param consts A list of constant Tensor to be used. */ - void SetupConstants(const Array& consts) { + void SetupConstants(const Array& consts) { for (size_t i = 0; i < consts.size(); ++i) { data_entry_[EntryID(const_idx_[i], 0)] = consts[i].operator->(); } diff --git a/src/runtime/contrib/mrvl/mrvl_hw_runtime.cc b/src/runtime/contrib/mrvl/mrvl_hw_runtime.cc index bc1eb77ea18c..f9769d79099a 100644 --- a/src/runtime/contrib/mrvl/mrvl_hw_runtime.cc +++ b/src/runtime/contrib/mrvl/mrvl_hw_runtime.cc @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include @@ -309,8 +309,8 @@ class MarvellHardwareModuleNode : public ffi::ModuleObj { i_d_buf_float = reinterpret_cast(i_d_buf); for (int in = 0; in < num_inputs_; in++) { - if (args[in].IsObjectRef()) { - NDArray arr = args[in]; + if (args[in].IsObjectRef()) { + Tensor arr = args[in]; tensor = arr.operator->(); } else { tensor = args[in].operator DLTensor*(); @@ -345,8 +345,8 @@ class MarvellHardwareModuleNode : public ffi::ModuleObj { int out = num_inputs_; if (num_outputs_ == 1) { - if (args[out].IsObjectRef()) { - NDArray arr = args[out]; + if (args[out].IsObjectRef()) { + Tensor arr = args[out]; outTensor = arr.operator->(); } else { outTensor = args[out].operator DLTensor*(); @@ -361,8 +361,8 @@ class MarvellHardwareModuleNode : public ffi::ModuleObj { for (out = num_inputs_; out < args.size(); out++) { int out_tot_dim = 1; - if (args[out].IsObjectRef()) { - NDArray arr = args[out]; + if (args[out].IsObjectRef()) { + Tensor arr = args[out]; outTensor = arr.operator->(); } else { outTensor = args[out].operator DLTensor*(); @@ -382,8 +382,8 @@ class MarvellHardwareModuleNode : public ffi::ModuleObj { const DLTensor* tensor[64]; for (int in = 0; in < num_inputs_; in++) { - if (args[in].IsObjectRef()) { - NDArray arr = args[in]; + if (args[in].IsObjectRef()) { + Tensor arr = args[in]; tensor[in] = arr.operator->(); } else { tensor[in] = args[in].operator DLTensor*(); @@ -398,8 +398,8 @@ class MarvellHardwareModuleNode : public ffi::ModuleObj { int i = 0; for (int out = num_inputs_; out < args.size(); out++) { - if (args[out].IsObjectRef()) { - NDArray arr = args[out]; + if (args[out].IsObjectRef()) { + Tensor arr = args[out]; tensor[i] = arr.operator->(); } else { tensor[i] 
= args[out].operator DLTensor*(); diff --git a/src/runtime/contrib/mrvl/mrvl_runtime.cc b/src/runtime/contrib/mrvl/mrvl_runtime.cc index 974ca4a69a1f..af384035c96b 100644 --- a/src/runtime/contrib/mrvl/mrvl_runtime.cc +++ b/src/runtime/contrib/mrvl/mrvl_runtime.cc @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/runtime/contrib/mrvl/mrvl_sw_runtime_lib.cc b/src/runtime/contrib/mrvl/mrvl_sw_runtime_lib.cc index c63bafcd0089..8e68cf7e6963 100644 --- a/src/runtime/contrib/mrvl/mrvl_sw_runtime_lib.cc +++ b/src/runtime/contrib/mrvl/mrvl_sw_runtime_lib.cc @@ -26,7 +26,7 @@ #include #include -#include +#include #include #include @@ -36,7 +36,7 @@ using namespace tvm::runtime; template -static void NDArrayToFile(const tvm::runtime::NDArray& arr, std::ostream& os) { +static void TensorToFile(const tvm::runtime::Tensor& arr, std::ostream& os) { int ndim = arr->ndim; int tot_dim = 1; for (int i = 0; i < ndim; i++) { @@ -70,8 +70,8 @@ static void ReadInputsAndGenerateInputBin(ffi::PackedArgs args, const std::strin file_out << R"( "inputs": [)" << std::endl; for (size_t i = 0; i < num_inputs; ++i) { const DLTensor* tensor; - if (args[i].IsObjectRef()) { - NDArray arr = args[i]; + if (args[i].IsObjectRef()) { + Tensor arr = args[i]; tensor = arr.operator->(); } else { tensor = args[i].cast(); @@ -80,9 +80,9 @@ static void ReadInputsAndGenerateInputBin(ffi::PackedArgs args, const std::strin for (int64_t i = 0; i < tensor->ndim; i++) { shape.push_back(tensor->shape[i]); } - NDArray arr = NDArray::Empty(shape, tensor->dtype, tensor->device); + Tensor arr = Tensor::Empty(shape, tensor->dtype, tensor->device); arr.CopyFrom(tensor); - NDArrayToFile(arr, file_out); + TensorToFile(arr, file_out); if (i != num_inputs - 1) { file_out << std::endl << "\t," << std::endl; } @@ -108,8 +108,8 @@ static void ReadOutputsAndUpdateRuntime(ffi::PackedArgs args, size_t num_inputs, const std::string& out_bin_prefix) { for (int out = num_inputs; out < args.size(); out++) { const DLTensor* outTensor; - if (args[out].IsObjectRef()) { - NDArray arr = args[out]; + if (args[out].IsObjectRef()) { + Tensor arr = args[out]; outTensor = arr.operator->(); } else { outTensor = args[out].operator DLTensor*(); @@ -118,7 +118,7 @@ static void ReadOutputsAndUpdateRuntime(ffi::PackedArgs args, size_t num_inputs, for (int64_t i = 0; i < outTensor->ndim; i++) { shape.push_back(outTensor->shape[i]); } - NDArray arr = NDArray::Empty(shape, outTensor->dtype, outTensor->device); + Tensor arr = Tensor::Empty(shape, outTensor->dtype, outTensor->device); int ndim = arr->ndim; int tot_dim = 1; for (int i = 0; i < ndim; i++) { diff --git a/src/runtime/contrib/msc/tensorrt_runtime.cc b/src/runtime/contrib/msc/tensorrt_runtime.cc index 37ae9f254895..3a5f7c02def6 100644 --- a/src/runtime/contrib/msc/tensorrt_runtime.cc +++ b/src/runtime/contrib/msc/tensorrt_runtime.cc @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include @@ -87,7 +87,7 @@ class MSCTensorRTRuntime : public JSONRuntimeBase { * * \param consts The constant params from compiled model. 
*/ - void Init(const Array& consts) override { + void Init(const Array& consts) override { ICHECK_EQ(consts.size(), const_idx_.size()) << "The number of input constants must match the number of required."; LoadGlobalOptions(); @@ -122,14 +122,14 @@ class MSCTensorRTRuntime : public JSONRuntimeBase { if (tool_tag_.size() > 0) { const auto pf = tvm::ffi::Function::GetGlobal("msc_tool.callback_step"); ICHECK(pf.has_value()) << "Cannot find msc_tool.callback_step func."; - Map input_datas; + Map input_datas; int device_id = 0; for (const auto& pair : input_bindings_) { const auto& tensor_name = engine_->getBindingName(pair.first); input_datas.Set(tensor_name, device_buffers_[pair.first]); device_id = data_entry_[pair.first]->device.device_id; } - Map> context; + Map> context; context.Set("datas", input_datas); (*pf)(context, "before_forward", graph_name_, tool_tag_); } @@ -155,7 +155,7 @@ class MSCTensorRTRuntime : public JSONRuntimeBase { if (tool_tag_.size() > 0) { const auto pf = tvm::ffi::Function::GetGlobal("msc_tool.callback_step"); ICHECK(pf.has_value()) << "Cannot find msc_tool.callback_step func."; - Map output_datas; + Map output_datas; for (int bid = 0; bid < engine_->getNbBindings(); bid++) { if (input_bindings_.count(bid)) { continue; @@ -163,7 +163,7 @@ class MSCTensorRTRuntime : public JSONRuntimeBase { const auto& tensor_name = engine_->getBindingName(bid); output_datas.Set(tensor_name, device_buffers_[bid]); } - Map> context; + Map> context; context.Set("datas", output_datas); (*pf)(context, "after_forward", graph_name_, tool_tag_); } @@ -289,14 +289,14 @@ class MSCTensorRTRuntime : public JSONRuntimeBase { const auto& pair = tensor_ids_[tensor_name]; auto shape = nodes_[pair.first].GetOpShape()[pair.second]; auto dtype = nodes_[pair.first].GetOpDataType()[pair.second]; - device_buffers_[bid] = runtime::NDArray::Empty(shape, dtype, {kDLCUDA, 0}); + device_buffers_[bid] = runtime::Tensor::Empty(shape, dtype, {kDLCUDA, 0}); } bindings_[bid] = device_buffers_[bid]->data; binded.insert(bid); } } - NDArray GetOrAllocateDeviceBuffer(int entry_id, int binding_index) { + Tensor GetOrAllocateDeviceBuffer(int entry_id, int binding_index) { std::vector shape(data_entry_[entry_id]->shape, data_entry_[entry_id]->shape + data_entry_[entry_id]->ndim); if (device_buffers_.count(binding_index)) { @@ -304,7 +304,7 @@ class MSCTensorRTRuntime : public JSONRuntimeBase { if (shape[0] > device_buffers_[binding_index]->shape[0]) { // Buffer is too small. Need to allocate bigger buffer. device_buffers_[binding_index] = - runtime::NDArray::Empty(shape, data_entry_[entry_id]->dtype, {kDLCUDA, 0}); + runtime::Tensor::Empty(shape, data_entry_[entry_id]->dtype, {kDLCUDA, 0}); } else if (shape[0] < device_buffers_[binding_index]->shape[0]) { // Buffer is too large. Create view. return device_buffers_[binding_index].CreateView(shape, data_entry_[entry_id]->dtype); @@ -312,7 +312,7 @@ class MSCTensorRTRuntime : public JSONRuntimeBase { } else { // Buffer not initialized yet. 
device_buffers_[binding_index] = - runtime::NDArray::Empty(shape, data_entry_[entry_id]->dtype, {kDLCUDA, 0}); + runtime::Tensor::Empty(shape, data_entry_[entry_id]->dtype, {kDLCUDA, 0}); } return device_buffers_.at(binding_index); } @@ -341,7 +341,7 @@ class MSCTensorRTRuntime : public JSONRuntimeBase { std::unordered_map output_bindings_; std::vector bindings_; std::vector binding_sizes_; - std::unordered_map device_buffers_; + std::unordered_map device_buffers_; #endif }; diff --git a/src/runtime/contrib/mscclpp/allreduce.cu b/src/runtime/contrib/mscclpp/allreduce.cu index 2b009c062585..147c306bf452 100644 --- a/src/runtime/contrib/mscclpp/allreduce.cu +++ b/src/runtime/contrib/mscclpp/allreduce.cu @@ -18,7 +18,7 @@ */ #include -#include +#include #include "msccl.cuh" diff --git a/src/runtime/contrib/nnapi/nnapi_runtime.cc b/src/runtime/contrib/nnapi/nnapi_runtime.cc index 51047d90fd73..a1f3b3f132f5 100644 --- a/src/runtime/contrib/nnapi/nnapi_runtime.cc +++ b/src/runtime/contrib/nnapi/nnapi_runtime.cc @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include @@ -70,7 +70,7 @@ class NNAPIRuntime : public JSONRuntimeBase { std::optional compiled_model_; - void Init(const Array& consts) final { + void Init(const Array& consts) final { ICHECK_EQ(consts.size(), const_idx_.size()) << "The number of input constants must match the number of required constants."; SetupConstants(consts); @@ -225,7 +225,7 @@ class NNAPIRuntime : public JSONRuntimeBase { std::unordered_map node_output_map_; #else // ifdef TVM_GRAPH_EXECUTOR_NNAPI - void Init(const Array& consts) final { + void Init(const Array& consts) final { LOG(FATAL) << "NNAPI runtime is not enabled. Build with USE_NNAPI_RUNTIME to enable it."; } diff --git a/src/runtime/contrib/nvshmem/memory_allocator.cc b/src/runtime/contrib/nvshmem/memory_allocator.cc index 6ac7aa04f7bb..0c816669be9a 100644 --- a/src/runtime/contrib/nvshmem/memory_allocator.cc +++ b/src/runtime/contrib/nvshmem/memory_allocator.cc @@ -57,7 +57,7 @@ class NVSHMEMAllocator final : public PooledAllocator { return allocator; } - NDArray Empty(ffi::Shape shape, DataType dtype, Device device) { + Tensor Empty(ffi::Shape shape, DataType dtype, Device device) { class NVSHMEMAlloc { public: explicit NVSHMEMAlloc(Buffer buffer) : buffer_(buffer) {} @@ -69,7 +69,7 @@ class NVSHMEMAllocator final : public PooledAllocator { }; Buffer buffer = PooledAllocator::Alloc(device, shape, dtype, String("nvshmem")); - return NDArray::FromNDAlloc(NVSHMEMAlloc(buffer), shape, dtype, device); + return Tensor::FromNDAlloc(NVSHMEMAlloc(buffer), shape, dtype, device); } private: @@ -86,7 +86,7 @@ class NVSHMEMAllocator final : public PooledAllocator { void DeviceFreeDataSpace(Device dev, void* ptr) final { nvshmem_free(ptr); } }; -NDArray NVSHMEMEmpty(ffi::Shape shape, DataType dtype, Device device) { +Tensor NVSHMEMEmpty(ffi::Shape shape, DataType dtype, Device device) { return NVSHMEMAllocator::Global()->Empty(shape, dtype, UseDefaultDeviceIfNone(device)); } diff --git a/src/runtime/contrib/random/mt_random_engine.cc b/src/runtime/contrib/random/mt_random_engine.cc index 3ab0309630cf..ce9b959a53cc 100644 --- a/src/runtime/contrib/random/mt_random_engine.cc +++ b/src/runtime/contrib/random/mt_random_engine.cc @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include @@ -122,11 +122,11 @@ class RandomEngine { if (data->device.device_type == kDLCPU) { FillData(data); } else { - runtime::NDArray local = runtime::NDArray::Empty( + runtime::Tensor local = 
runtime::Tensor::Empty( std::vector{data->shape, data->shape + data->ndim}, data->dtype, {kDLCPU, 0}); - DLTensor* tensor = const_cast(local.operator->()); + DLTensor* tensor = const_cast(local.operator->()); FillData(tensor); - runtime::NDArray::CopyFromTo(tensor, data); + runtime::Tensor::CopyFromTo(tensor, data); } } @@ -134,11 +134,11 @@ class RandomEngine { if (data->device.device_type == kDLCPU) { FillDataForMeasure(data); } else { - runtime::NDArray local = runtime::NDArray::Empty( + runtime::Tensor local = runtime::Tensor::Empty( std::vector{data->shape, data->shape + data->ndim}, data->dtype, {kDLCPU, 0}); - DLTensor* tensor = const_cast(local.operator->()); + DLTensor* tensor = const_cast(local.operator->()); FillDataForMeasure(tensor); - runtime::NDArray::CopyFromTo(tensor, data); + runtime::Tensor::CopyFromTo(tensor, data); } } diff --git a/src/runtime/contrib/tensorrt/tensorrt_builder.cc b/src/runtime/contrib/tensorrt/tensorrt_builder.cc index 9bf793bd3e49..179e75a669fa 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_builder.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_builder.cc @@ -24,7 +24,7 @@ #include "tensorrt_builder.h" -#include +#include #include #include @@ -233,8 +233,8 @@ nvinfer1::Weights TensorRTBuilder::GetDLTensorAsWeights(const DLTensor* dptr, } weight.count = count; weight.values = new float[count]; - ICHECK_EQ(TVMArrayCopyToBytes(const_cast(dptr), const_cast(weight.values), - weight_bytes), + ICHECK_EQ(TVMTensorCopyToBytes(const_cast(dptr), const_cast(weight.values), + weight_bytes), 0) << TVMGetLastError(); trt_weights_.push_back(weight); diff --git a/src/runtime/contrib/tensorrt/tensorrt_builder.h b/src/runtime/contrib/tensorrt/tensorrt_builder.h index 9bccc1ea4848..96905598737c 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_builder.h +++ b/src/runtime/contrib/tensorrt/tensorrt_builder.h @@ -25,7 +25,7 @@ #ifndef TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_BUILDER_H_ #define TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_BUILDER_H_ -#include +#include #include #include diff --git a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc index ff565444e2b5..d66b1a1c46e1 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include @@ -109,7 +109,7 @@ class TensorRTRuntime : public JSONRuntimeBase { * * \param consts The constant params from compiled model. */ - void Init(const Array& consts) override { + void Init(const Array& consts) override { ICHECK_EQ(consts.size(), const_idx_.size()) << "The number of input constants must match the number of required."; LoadGlobalAttributes(); @@ -433,7 +433,7 @@ class TensorRTRuntime : public JSONRuntimeBase { } /*! \brief Retreive a GPU buffer for input or output or allocate if needed. */ - NDArray GetOrAllocateDeviceBuffer(int entry_id, int binding_index) { + Tensor GetOrAllocateDeviceBuffer(int entry_id, int binding_index) { std::vector shape(data_entry_[entry_id]->shape, data_entry_[entry_id]->shape + data_entry_[entry_id]->ndim); if (device_buffers_.count(binding_index)) { @@ -441,7 +441,7 @@ class TensorRTRuntime : public JSONRuntimeBase { if (shape[0] > device_buffers_[binding_index]->shape[0]) { // Buffer is too small. Need to allocate bigger buffer. 
device_buffers_[binding_index] = - runtime::NDArray::Empty(shape, data_entry_[entry_id]->dtype, {kDLCUDA, 0}); + runtime::Tensor::Empty(shape, data_entry_[entry_id]->dtype, {kDLCUDA, 0}); } else if (shape[0] < device_buffers_[binding_index]->shape[0]) { // Buffer is too large. Create view. return device_buffers_[binding_index].CreateView(shape, data_entry_[entry_id]->dtype); @@ -449,7 +449,7 @@ class TensorRTRuntime : public JSONRuntimeBase { } else { // Buffer not initialized yet. device_buffers_[binding_index] = - runtime::NDArray::Empty(shape, data_entry_[entry_id]->dtype, {kDLCUDA, 0}); + runtime::Tensor::Empty(shape, data_entry_[entry_id]->dtype, {kDLCUDA, 0}); } return device_buffers_.at(binding_index); } @@ -476,7 +476,7 @@ class TensorRTRuntime : public JSONRuntimeBase { * is not "cuda". Since TensorRT execution can only read data from GPU, we need to copy data from * the runtime device to these buffers first. These will be allocated for the highest batch size * used by all engines. */ - std::unordered_map device_buffers_; + std::unordered_map device_buffers_; /*! \brief TensorRT logger. */ TensorRTLogger logger_; diff --git a/src/runtime/contrib/tflite/tflite_runtime.cc b/src/runtime/contrib/tflite/tflite_runtime.cc index d65f2ad65b63..b51b8084cb91 100644 --- a/src/runtime/contrib/tflite/tflite_runtime.cc +++ b/src/runtime/contrib/tflite/tflite_runtime.cc @@ -131,7 +131,7 @@ void TFLiteRuntime::SetInput(int index, DLTensor* data_in) { void TFLiteRuntime::SetNumThreads(int num_threads) { interpreter_->SetNumThreads(num_threads); } -NDArray TFLiteRuntime::GetOutput(int index) const { +Tensor TFLiteRuntime::GetOutput(int index) const { TfLiteTensor* output = interpreter_->tensor(interpreter_->outputs()[index]); DataType dtype = TfLiteDType2TVMDType(output->type); TfLiteIntArray* dims = output->dims; @@ -141,7 +141,7 @@ NDArray TFLiteRuntime::GetOutput(int index) const { shape.push_back(dims->data[i]); size *= dims->data[i]; } - NDArray ret = NDArray::Empty(shape, dtype, device_); + Tensor ret = Tensor::Empty(shape, dtype, device_); TVM_DTYPE_DISPATCH(dtype, DType, { DType* dest = static_cast(ret->data); DType* src = interpreter_->typed_output_tensor(index); diff --git a/src/runtime/contrib/tflite/tflite_runtime.h b/src/runtime/contrib/tflite/tflite_runtime.h index 396bd01104d5..590ee4df6f7b 100644 --- a/src/runtime/contrib/tflite/tflite_runtime.h +++ b/src/runtime/contrib/tflite/tflite_runtime.h @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include @@ -84,19 +84,19 @@ class TFLiteRuntime : public ffi::ModuleObj { */ void SetInput(int index, DLTensor* data_in); /*! - * \brief Return NDArray for given input index. + * \brief Return Tensor for given input index. * \param index The input index. * - * \return NDArray corresponding to given input node index. + * \return Tensor corresponding to given input node index. */ - NDArray GetInput(int index) const; + Tensor GetInput(int index) const; /*! - * \brief Return NDArray for given output index. + * \brief Return Tensor for given output index. * \param index The output index. * - * \return NDArray corresponding to given output node index. + * \return Tensor corresponding to given output node index. */ - NDArray GetOutput(int index) const; + Tensor GetOutput(int index) const; /*! * \brief Set the number of threads available to the interpreter. * \param num_threads The number of threads to be set. 
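
Aside: the MSC, TensorRT, and TFLite hunks above are a mechanical type rename; the buffer-caching logic itself is unchanged. As a rough sketch of the renamed runtime API (not part of this patch; the include path, helper name, and key type are illustrative assumptions), a contrib runtime that keeps per-binding device buffers might be written as:

    // Sketch only: reuse a cached CUDA buffer when it is large enough, otherwise
    // reallocate, mirroring the GetOrAllocateDeviceBuffer pattern in the hunks above.
    #include <tvm/runtime/tensor.h>  // assumed new location of the Tensor header

    #include <unordered_map>
    #include <vector>

    using tvm::runtime::Tensor;

    Tensor GetOrAllocBuffer(std::unordered_map<int, Tensor>* buffers, int key,
                            const DLTensor* entry) {
      std::vector<int64_t> shape(entry->shape, entry->shape + entry->ndim);
      auto it = buffers->find(key);
      if (it == buffers->end() || shape[0] > it->second->shape[0]) {
        // Missing or too small: allocate a fresh buffer on the GPU.
        (*buffers)[key] = Tensor::Empty(shape, entry->dtype, {kDLCUDA, 0});
      } else if (shape[0] < it->second->shape[0]) {
        // Existing buffer is larger than needed: hand out a view instead.
        return it->second.CreateView(shape, entry->dtype);
      }
      return buffers->at(key);
    }

Only the class and method names change in the patch (NDArray::Empty becomes Tensor::Empty, and so on); allocation, view creation, and copy semantics are untouched.
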
diff --git a/src/runtime/contrib/vllm/attention_kernels.cu b/src/runtime/contrib/vllm/attention_kernels.cu index e5e45735fb55..ce3205383215 100644 --- a/src/runtime/contrib/vllm/attention_kernels.cu +++ b/src/runtime/contrib/vllm/attention_kernels.cu @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/runtime/contrib/vllm/cache_alloc.cc b/src/runtime/contrib/vllm/cache_alloc.cc index d616923ad78e..673f83e2e0c1 100644 --- a/src/runtime/contrib/vllm/cache_alloc.cc +++ b/src/runtime/contrib/vllm/cache_alloc.cc @@ -19,15 +19,15 @@ #include #include #include -#include +#include namespace tvm { namespace runtime { namespace vllm { -Array AllocateKVCache(int head_size, int num_layers, int num_heads, int block_size, - int num_blocks) { - Array cache; +Array AllocateKVCache(int head_size, int num_layers, int num_heads, int block_size, + int num_blocks) { + Array cache; int element_size = 2; int vec_size = 16 / element_size; @@ -37,11 +37,11 @@ Array AllocateKVCache(int head_size, int num_layers, int num_heads, int DLDevice dev{DLDeviceType::kDLCUDA, device_id}; for (int i = 0; i < num_layers; ++i) { - NDArray key_blocks = - NDArray::Empty({num_blocks, num_heads, head_size / vec_size, block_size, vec_size}, - runtime::DataType::Float(16), dev); - NDArray value_blocks = NDArray::Empty({num_blocks, num_heads, head_size, block_size}, - runtime::DataType::Float(16), dev); + Tensor key_blocks = + Tensor::Empty({num_blocks, num_heads, head_size / vec_size, block_size, vec_size}, + runtime::DataType::Float(16), dev); + Tensor value_blocks = Tensor::Empty({num_blocks, num_heads, head_size, block_size}, + runtime::DataType::Float(16), dev); cache.push_back(key_blocks); cache.push_back(value_blocks); } diff --git a/src/runtime/contrib/vllm/cache_kernels.cu b/src/runtime/contrib/vllm/cache_kernels.cu index c7f91aa42fce..a68fd66d6269 100644 --- a/src/runtime/contrib/vllm/cache_kernels.cu +++ b/src/runtime/contrib/vllm/cache_kernels.cu @@ -18,7 +18,7 @@ */ #include #include -#include +#include #include #include @@ -134,8 +134,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ namespace refl = tvm::ffi::reflection; refl::GlobalDef() .def("tvm.contrib.vllm.reshape_and_cache", - [](NDArray key, NDArray value, NDArray key_cache, NDArray value_cache, - NDArray slot_mapping) { + [](Tensor key, Tensor value, Tensor key_cache, Tensor value_cache, Tensor slot_mapping) { int num_tokens = key->shape[0]; int num_heads = key->shape[1]; int head_size = key->shape[2]; @@ -158,7 +157,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ return Array{key_cache, value_cache}; }) .def("tvm.contrib.vllm.reconstruct_from_cache", - [](NDArray key_cache, NDArray value_cache, NDArray slot_mapping) { + [](Tensor key_cache, Tensor value_cache, Tensor slot_mapping) { int num_tokens = slot_mapping->shape[0]; int num_heads = value_cache->shape[1]; int head_size = value_cache->shape[2]; @@ -166,8 +165,8 @@ TVM_FFI_STATIC_INIT_BLOCK({ int vec_size = key_cache->shape[4]; DLDevice dev = key_cache->device; - auto key = NDArray::Empty({num_tokens, num_heads, head_size}, key_cache->dtype, dev); - auto value = NDArray::Empty({num_tokens, num_heads, head_size}, key_cache->dtype, dev); + auto key = Tensor::Empty({num_tokens, num_heads, head_size}, key_cache->dtype, dev); + auto value = Tensor::Empty({num_tokens, num_heads, head_size}, key_cache->dtype, dev); int key_stride = key->shape[1] * key->shape[2]; int value_stride = value->shape[1] * value->shape[2]; @@ -185,8 +184,8 @@ TVM_FFI_STATIC_INIT_BLOCK({ return Array{key, value}; }) - 
.def("tvm.contrib.vllm.copy_blocks", [](Array key_value_caches, - NDArray block_mapping) { + .def("tvm.contrib.vllm.copy_blocks", [](Array key_value_caches, + Tensor block_mapping) { auto num_layers = key_value_caches.size() / 2; auto num_pairs = block_mapping->shape[0] / 2; @@ -203,20 +202,20 @@ TVM_FFI_STATIC_INIT_BLOCK({ reinterpret_cast(key_value_caches[2 * layer_idx + 1]->data); } - NDArray key_cache = key_value_caches[1]; // [num_blocks, num_heads, head_size, block_size] + Tensor key_cache = key_value_caches[1]; // [num_blocks, num_heads, head_size, block_size] DLDevice dev = key_cache->device; - NDArray key_cache_ptrs_gpu = - NDArray::Empty({static_cast(num_layers)}, runtime::DataType::Int(64), dev); - NDArray value_cache_ptrs_gpu = - NDArray::Empty({static_cast(num_layers)}, runtime::DataType::Int(64), dev); + Tensor key_cache_ptrs_gpu = + Tensor::Empty({static_cast(num_layers)}, runtime::DataType::Int(64), dev); + Tensor value_cache_ptrs_gpu = + Tensor::Empty({static_cast(num_layers)}, runtime::DataType::Int(64), dev); key_cache_ptrs_gpu.CopyFromBytes(key_cache_ptrs.data(), sizeof(int64_t) * key_cache_ptrs.size()); value_cache_ptrs_gpu.CopyFromBytes(value_cache_ptrs.data(), sizeof(int64_t) * value_cache_ptrs.size()); - NDArray block_mapping_gpu = - NDArray::Empty(block_mapping.Shape(), runtime::DataType::Int(64), dev); + Tensor block_mapping_gpu = + Tensor::Empty(block_mapping.Shape(), runtime::DataType::Int(64), dev); block_mapping_gpu.CopyFromBytes(block_mapping->data, sizeof(int64_t) * block_mapping->shape[0]); diff --git a/src/runtime/device_api.cc b/src/runtime/device_api.cc index 28dc313ba3e6..16fd3c7b7761 100644 --- a/src/runtime/device_api.cc +++ b/src/runtime/device_api.cc @@ -21,7 +21,7 @@ * \file device_api.cc * \brief Device specific implementations */ -#include +#include #include #include #include diff --git a/src/runtime/disco/bcast_session.cc b/src/runtime/disco/bcast_session.cc index 46ecb49f50fc..f4964b12d709 100644 --- a/src/runtime/disco/bcast_session.cc +++ b/src/runtime/disco/bcast_session.cc @@ -51,14 +51,14 @@ DRef BcastSessionObj::GetGlobalFunc(const std::string& name) { return BcastSessionObj::Internal::MakeDRef(reg_id, GetRef(this)); } -void BcastSessionObj::CopyFromWorker0(const NDArray& host_array, const DRef& remote_array) { - this->AppendHostNDArray(host_array); +void BcastSessionObj::CopyFromWorker0(const Tensor& host_array, const DRef& remote_array) { + this->AppendHostTensor(host_array); BcastSessionObj::Internal::BroadcastUnpacked(this, DiscoAction::kCopyFromWorker0, remote_array->reg_id); } -void BcastSessionObj::CopyToWorker0(const NDArray& host_array, const DRef& remote_array) { - this->AppendHostNDArray(host_array); +void BcastSessionObj::CopyToWorker0(const Tensor& host_array, const DRef& remote_array) { + this->AppendHostTensor(host_array); BcastSessionObj::Internal::BroadcastUnpacked(this, DiscoAction::kCopyToWorker0, remote_array->reg_id); } @@ -114,7 +114,7 @@ int BcastSessionObj::AllocateReg() { return reg_id; } -void BcastSessionObj::AppendHostNDArray(const NDArray& host_array) { +void BcastSessionObj::AppendHostTensor(const Tensor& host_array) { std::lock_guard lock(worker_zero_data_.queue_mutex_); worker_zero_data_.host_arrays.push(host_array); } diff --git a/src/runtime/disco/bcast_session.h b/src/runtime/disco/bcast_session.h index f92369d85337..e4ee3bb8a1cb 100644 --- a/src/runtime/disco/bcast_session.h +++ b/src/runtime/disco/bcast_session.h @@ -37,8 +37,8 @@ class BcastSessionObj : public SessionObj { virtual ~BcastSessionObj() = 
default; DRef GetGlobalFunc(const std::string& name) override; - void CopyFromWorker0(const NDArray& host_array, const DRef& remote_array) override; - void CopyToWorker0(const NDArray& host_array, const DRef& remote_array) override; + void CopyFromWorker0(const Tensor& host_array, const DRef& remote_array) override; + void CopyToWorker0(const Tensor& host_array, const DRef& remote_array) override; void SyncWorker(int worker_id) override; void Shutdown() override; void InitCCL(String ccl, IntTuple device_ids) override; @@ -53,11 +53,11 @@ class BcastSessionObj : public SessionObj { /*! \brief Allocate a register id, either from `free_regs_` or by incrementing `reg_count_` */ virtual int AllocateReg(); /*! - * \brief Append an controler-side NDArray to a special queue used to communicate with + * \brief Append an controler-side Tensor to a special queue used to communicate with worker-0. * \param host_array The array to be appended to worker-0 */ - virtual void AppendHostNDArray(const NDArray& host_array); + virtual void AppendHostTensor(const Tensor& host_array); /*! * \brief Broadcast a command to all workers via TVM's ffi::Function calling convention. * As part of the calling convention, The first argument in the packed sequence must be diff --git a/src/runtime/disco/builtin.cc b/src/runtime/disco/builtin.cc index b650b143e401..2cfd91dfde83 100644 --- a/src/runtime/disco/builtin.cc +++ b/src/runtime/disco/builtin.cc @@ -70,8 +70,8 @@ ffi::Module LoadVMModule(std::string path, Optional device) { return mod; } -NDArray DiscoEmptyNDArray(ffi::Shape shape, DataType dtype, Optional device) { - return NDArray::Empty(shape, dtype, UseDefaultDeviceIfNone(device)); +Tensor DiscoEmptyTensor(ffi::Shape shape, DataType dtype, Optional device) { + return Tensor::Empty(shape, dtype, UseDefaultDeviceIfNone(device)); } ffi::Function GetCCLFunc(const char* name) { @@ -83,37 +83,37 @@ ffi::Function GetCCLFunc(const char* name) { return *pf; } -void AllReduce(NDArray send, ReduceKind reduce_kind, bool in_group, NDArray recv) { +void AllReduce(Tensor send, ReduceKind reduce_kind, bool in_group, Tensor recv) { GetCCLFunc("allreduce")(send, static_cast(reduce_kind), in_group, recv); } -void AllGather(NDArray send, bool in_group, NDArray recv) { +void AllGather(Tensor send, bool in_group, Tensor recv) { GetCCLFunc("allgather")(send, in_group, recv); } -TVM_DLL void BroadcastFromWorker0(NDArray send, bool in_group, NDArray recv) { +TVM_DLL void BroadcastFromWorker0(Tensor send, bool in_group, Tensor recv) { GetCCLFunc("broadcast_from_worker0")(send, in_group, recv); } -TVM_DLL void ScatterFromWorker0(Optional send, bool in_group, NDArray recv) { +TVM_DLL void ScatterFromWorker0(Optional send, bool in_group, Tensor recv) { GetCCLFunc("scatter_from_worker0")(send, in_group, recv); } -void GatherToWorker0(NDArray send, bool in_group, Optional recv) { +void GatherToWorker0(Tensor send, bool in_group, Optional recv) { GetCCLFunc("gather_to_worker0")(send, in_group, recv); } -void RecvFromWorker0(NDArray buffer) { GetCCLFunc("recv_from_worker0")(buffer); } +void RecvFromWorker0(Tensor buffer) { GetCCLFunc("recv_from_worker0")(buffer); } -void SendToNextGroup(NDArray buffer) { GetCCLFunc("send_to_next_group")(buffer); } +void SendToNextGroup(Tensor buffer) { GetCCLFunc("send_to_next_group")(buffer); } -void RecvFromPrevGroup(NDArray buffer) { GetCCLFunc("recv_from_prev_group")(buffer); } +void RecvFromPrevGroup(Tensor buffer) { GetCCLFunc("recv_from_prev_group")(buffer); } -void SendToWorker(NDArray buffer, int 
receiver_id) { +void SendToWorker(Tensor buffer, int receiver_id) { GetCCLFunc("send_to_worker")(buffer, receiver_id); } -void RecvFromWorker(NDArray buffer, int sender_id) { +void RecvFromWorker(Tensor buffer, int sender_id) { GetCCLFunc("recv_from_worker")(buffer, sender_id); } @@ -131,7 +131,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ .def("runtime.disco.load_vm_module", LoadVMModule) .def("runtime.disco.empty", [](ffi::Shape shape, DataType dtype, Optional device, bool worker0_only, - bool in_group) -> Optional { + bool in_group) -> Optional { int worker_id = WorkerId(); int group_size = DiscoWorker::ThreadLocal()->num_workers / DiscoWorker::ThreadLocal()->num_groups; @@ -140,11 +140,11 @@ TVM_FFI_STATIC_INIT_BLOCK({ if (worker0_only && !is_worker0) { return std::nullopt; } else { - return DiscoEmptyNDArray(shape, dtype, device); + return DiscoEmptyTensor(shape, dtype, device); } }) .def("runtime.disco.allreduce", - [](NDArray send, ffi::Shape reduce_kind, bool in_group, NDArray recv) { + [](Tensor send, ffi::Shape reduce_kind, bool in_group, Tensor recv) { int kind = IntegerFromShape(reduce_kind); CHECK(0 <= kind && kind <= 4) << "ValueError: Unknown ReduceKind: " << kind; AllReduce(send, static_cast(kind), in_group, recv); diff --git a/src/runtime/disco/disco_worker.cc b/src/runtime/disco/disco_worker.cc index 8e63355283a8..d9865ca2bec4 100644 --- a/src/runtime/disco/disco_worker.cc +++ b/src/runtime/disco/disco_worker.cc @@ -36,10 +36,10 @@ TVM_DLL DiscoWorker* DiscoWorker::ThreadLocal() { void DiscoWorker::SetRegister(int reg_id, ffi::AnyView value) { ICHECK(0 <= reg_id && reg_id < static_cast(register_file.size())); ffi::Any& rv = register_file.at(reg_id); - if (rv.type_index() == ffi::TypeIndex::kTVMFFINDArray && - value.type_index() == ffi::TypeIndex::kTVMFFINDArray) { - NDArray dst = rv.cast(); - NDArray src = value.cast(); + if (rv.type_index() == ffi::TypeIndex::kTVMFFITensor && + value.type_index() == ffi::TypeIndex::kTVMFFITensor) { + Tensor dst = rv.cast(); + Tensor src = value.cast(); dst.CopyFrom(src); } else { rv = value; @@ -112,25 +112,25 @@ struct DiscoWorker::Impl { } } - static NDArray GetNDArrayFromHost(DiscoWorker* self) { + static Tensor GetTensorFromHost(DiscoWorker* self) { std::lock_guard lock(self->worker_zero_data->queue_mutex_); - NDArray array = self->worker_zero_data->host_arrays.front(); + Tensor array = self->worker_zero_data->host_arrays.front(); self->worker_zero_data->host_arrays.pop(); return array; } static void CopyFromWorker0(DiscoWorker* self, int reg_id) { if (self->worker_id == 0) { - NDArray tgt = GetNDArrayFromHost(self); - NDArray src = GetReg(self, reg_id).cast(); + Tensor tgt = GetTensorFromHost(self); + Tensor src = GetReg(self, reg_id).cast(); tgt.CopyFrom(src); } } static void CopyToWorker0(DiscoWorker* self, int reg_id) { if (self->worker_id == 0) { - NDArray src = GetNDArrayFromHost(self); - NDArray tgt = GetReg(self, reg_id).cast(); + Tensor src = GetTensorFromHost(self); + Tensor tgt = GetReg(self, reg_id).cast(); tgt.CopyFrom(src); } } diff --git a/src/runtime/disco/distributed/socket_session.cc b/src/runtime/disco/distributed/socket_session.cc index b4933aa303ef..8e576fff227d 100644 --- a/src/runtime/disco/distributed/socket_session.cc +++ b/src/runtime/disco/distributed/socket_session.cc @@ -173,8 +173,8 @@ class SocketSessionObj : public BcastSessionObj { return remote_channels_[node_id - 1]->Recv(); } - void AppendHostNDArray(const NDArray& host_array) final { - local_session_->AppendHostNDArray(host_array); + void 
AppendHostTensor(const Tensor& host_array) final { + local_session_->AppendHostTensor(host_array); } void Shutdown() final { diff --git a/src/runtime/disco/loader.cc b/src/runtime/disco/loader.cc index 97af8bc9d3de..fec50cd71118 100644 --- a/src/runtime/disco/loader.cc +++ b/src/runtime/disco/loader.cc @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include @@ -39,9 +39,9 @@ namespace tvm { namespace runtime { -using vm::NDArrayCacheMetadata; -using FileRecord = NDArrayCacheMetadata::FileRecord; -using ParamRecord = NDArrayCacheMetadata::FileRecord::ParamRecord; +using vm::TensorCacheMetadata; +using FileRecord = TensorCacheMetadata::FileRecord; +using ParamRecord = TensorCacheMetadata::FileRecord::ParamRecord; struct ShardInfo { struct TensorInfo { @@ -119,23 +119,23 @@ class ShardLoaderObj : public Object { static ObjectRef Create(const std::string& path_to_metadata, const std::string& metadata, std::string shard_info, Optional mod); /*! \brief Load the i-th parameter */ - NDArray Load(int weight_index) const; + Tensor Load(int weight_index) const; - NDArray LoadParamOnWorker0(int weight_index) const; + Tensor LoadParamOnWorker0(int weight_index) const; /*! \brief Load all the parameters */ - Array LoadAll() const; + Array LoadAll() const; - NDArray ApplyShardFunc(const ShardInfo::ShardFunc& shard_func, const NDArray& param) const; + Tensor ApplyShardFunc(const ShardInfo::ShardFunc& shard_func, const Tensor& param) const; /*! \brief Load all the pre-sharded parameters */ - Array LoadAllPresharded() const; + Array LoadAllPresharded() const; /*! \brief Load the i-th parameter from presharded binaries */ - NDArray LoadPresharded(int weight_index) const; + Tensor LoadPresharded(int weight_index) const; /*! \brief Slice the given tensor at a specific dimension */ - NDArray Shard(NDArray source, int dim, int num_slices) const; + Tensor Shard(Tensor source, int dim, int num_slices) const; static constexpr const char* _type_key = "runtime.disco.ShardLoader"; TVM_DECLARE_FINAL_OBJECT_INFO(ShardLoaderObj, Object); @@ -149,8 +149,8 @@ class ShardLoaderObj : public Object { }; /*! \brief The ffi::Functions being used during sharding */ std::unordered_map shard_funcs_; - /*! \brief The metadata loaded from `ndarray-cache.json` */ - NDArrayCacheMetadata metadata_; + /*! \brief The metadata loaded from `tensor-cache.json` */ + TensorCacheMetadata metadata_; /*! \brief Sharding information for each weight */ std::vector param_info_; /*! \brief Maps the name of a shard to its index */ @@ -167,11 +167,11 @@ class ShardLoaderObj : public Object { * check for post-processing that may be required. Instead, the * public function `Load` or `LoadPresharded` should be called. 
* - * \param weight_index The index of NDArray tensor to load + * \param weight_index The index of Tensor tensor to load * * \returns The full tensor at the specified index */ - NDArray LoadDirect(int weight_index) const; + Tensor LoadDirect(int weight_index) const; }; ObjectRef ShardLoaderObj::Create(const std::string& path_to_metadata, const std::string& metadata, @@ -182,7 +182,7 @@ ObjectRef ShardLoaderObj::Create(const std::string& path_to_metadata, const std: } } ObjectPtr n = make_object(); - n->metadata_ = NDArrayCacheMetadata::LoadFromStr(metadata, path_to_metadata); + n->metadata_ = TensorCacheMetadata::LoadFromStr(metadata, path_to_metadata); n->current_file_ = nullptr; n->param_info_.clear(); std::unordered_map shards = LoadShardInfoFromStr(shard_info); @@ -209,10 +209,10 @@ ObjectRef ShardLoaderObj::Create(const std::string& path_to_metadata, const std: return ObjectRef(std::move(n)); } -NDArray ShardLoaderObj::ApplyShardFunc(const ShardInfo::ShardFunc& shard_func, - const NDArray& param) const { +Tensor ShardLoaderObj::ApplyShardFunc(const ShardInfo::ShardFunc& shard_func, + const Tensor& param) const { Device device = param->device; - NDArray o = NDArray::Empty(shard_func.output_info.shape, shard_func.output_info.dtype, device); + Tensor o = Tensor::Empty(shard_func.output_info.shape, shard_func.output_info.dtype, device); ffi::Function f = this->shard_funcs_.at(shard_func.name); int n = static_cast(shard_func.params.size()); std::vector packed_args(n + 2); @@ -236,7 +236,7 @@ std::string GetSiblingPath(const std::string& path, const std::string& filename) LOG(FATAL) << "ValueError: Cannot find the parent directory: " << path; } -NDArray ShardLoaderObj::LoadParamOnWorker0(int weight_index) const { +Tensor ShardLoaderObj::LoadParamOnWorker0(int weight_index) const { DiscoWorker* worker = DiscoWorker::ThreadLocal(); int worker_id = worker->worker_id; Device device = worker->default_device; @@ -255,10 +255,10 @@ NDArray ShardLoaderObj::LoadParamOnWorker0(int weight_index) const { }; if (worker_id == 0) { - NDArray w = load(); + Tensor w = load(); return w; } else { - NDArray w = NDArray::Empty(param->shape, param->dtype, device); + Tensor w = Tensor::Empty(param->shape, param->dtype, device); return w; } } @@ -285,7 +285,7 @@ std::tuple ParseParamShardingInfo(const ParamRecord* param) { return {num_shards, worker_id}; } -NDArray ShardLoaderObj::LoadDirect(int weight_index) const { +Tensor ShardLoaderObj::LoadDirect(int weight_index) const { const ParamInfo& param_info = param_info_.at(weight_index); const ParamRecord* param = param_info.param; const FileRecord* file = param_info.file; @@ -301,7 +301,7 @@ NDArray ShardLoaderObj::LoadDirect(int weight_index) const { return param->Load(device, &this->current_file_stream_); } -NDArray ShardLoaderObj::Load(int weight_index) const { +Tensor ShardLoaderObj::Load(int weight_index) const { DiscoWorker* worker = DiscoWorker::ThreadLocal(); int worker_id = worker->worker_id; int num_shards = worker->num_workers; @@ -317,9 +317,9 @@ NDArray ShardLoaderObj::Load(int weight_index) const { << "ValueError: The first dimension of the " << "output shape must be equal to the " << "number of shards, but got: " << shape << " and num_shards = " << num_shards; - NDArray recv = NDArray::Empty(ffi::Shape(shape.begin() + 1, shape.end()), dtype, device); + Tensor recv = Tensor::Empty(ffi::Shape(shape.begin() + 1, shape.end()), dtype, device); if (worker_id == 0) { - NDArray w = LoadDirect(weight_index); + Tensor w = LoadDirect(weight_index); for (const 
ShardInfo::ShardFunc& shard_func : param_info.shard_info.funcs) { w = this->ApplyShardFunc(shard_func, w); } @@ -330,20 +330,20 @@ NDArray ShardLoaderObj::Load(int weight_index) const { return recv; } else { if (worker_id == 0) { - NDArray w = LoadDirect(weight_index); + Tensor w = LoadDirect(weight_index); BroadcastFromWorker0(w, /*in_group=*/false, w); return w; } else { - NDArray w = NDArray::Empty(param->shape, param->dtype, device); + Tensor w = Tensor::Empty(param->shape, param->dtype, device); BroadcastFromWorker0(w, /*in_group=*/false, w); return w; } } } -Array ShardLoaderObj::LoadAll() const { +Array ShardLoaderObj::LoadAll() const { int n = static_cast(param_info_.size()); - Array shards; + Array shards; shards.reserve(n); for (int i = 0; i < n; ++i) { std::string param_name = "param_" + std::to_string(i); @@ -354,7 +354,7 @@ Array ShardLoaderObj::LoadAll() const { return shards; } -NDArray ShardLoaderObj::LoadPresharded(int weight_index) const { +Tensor ShardLoaderObj::LoadPresharded(int weight_index) const { DiscoWorker* worker = DiscoWorker::ThreadLocal(); int worker_id = worker->worker_id; int num_shards = worker->num_workers; @@ -380,13 +380,13 @@ NDArray ShardLoaderObj::LoadPresharded(int weight_index) const { return LoadDirect(index); } -Array ShardLoaderObj::LoadAllPresharded() const { +Array ShardLoaderObj::LoadAllPresharded() const { DiscoWorker* worker = DiscoWorker::ThreadLocal(); size_t worker_id = static_cast(worker->worker_id); size_t num_workers = static_cast(worker->num_workers); size_t num_params = param_info_.size() / num_workers; - Array params; + Array params; params.reserve(num_params); for (size_t i_param = 0; i_param < num_params; ++i_param) { std::string param_name = static_cast( diff --git a/src/runtime/disco/nccl/nccl.cc b/src/runtime/disco/nccl/nccl.cc index 32a194072653..86950eedad45 100644 --- a/src/runtime/disco/nccl/nccl.cc +++ b/src/runtime/disco/nccl/nccl.cc @@ -116,7 +116,7 @@ void InitCCLPerWorker(ffi::Shape device_ids, std::string unique_id_bytes) { } } -void AllReduce(NDArray send, ReduceKind reduce_kind, bool in_group, NDArray recv) { +void AllReduce(Tensor send, ReduceKind reduce_kind, bool in_group, Tensor recv) { CCLThreadLocalContext* ctx = CCLThreadLocalContext::Get(); ffi::Shape shape = send.Shape(); int64_t numel = shape->Product(); @@ -131,7 +131,7 @@ void AllReduce(NDArray send, ReduceKind reduce_kind, bool in_group, NDArray recv in_group ? ctx->group_comm : ctx->global_comm, stream)); } -void AllGather(NDArray send, bool in_group, NDArray recv) { +void AllGather(Tensor send, bool in_group, Tensor recv) { CCLThreadLocalContext* ctx = CCLThreadLocalContext::Get(); ffi::Shape shape = send.Shape(); int64_t numel = shape->Product(); @@ -141,7 +141,7 @@ void AllGather(NDArray send, bool in_group, NDArray recv) { in_group ? ctx->group_comm : ctx->global_comm, stream)); } -void BroadcastFromWorker0(Optional send, bool in_group, NDArray recv) { +void BroadcastFromWorker0(Optional send, bool in_group, Tensor recv) { CCLThreadLocalContext* ctx = CCLThreadLocalContext::Get(); int worker_id = ctx->worker->worker_id; int group_size = ctx->worker->num_workers / ctx->worker->num_groups; @@ -164,7 +164,7 @@ void BroadcastFromWorker0(Optional send, bool in_group, NDArray recv) { /*root=*/0, in_group ? 
ctx->group_comm : ctx->global_comm, stream)); } -void ScatterFromWorker0(Optional send, bool in_group, NDArray recv) { +void ScatterFromWorker0(Optional send, bool in_group, Tensor recv) { CHECK(recv.defined()) << "ValueError: buffer `recv` must not be None"; CCLThreadLocalContext* ctx = CCLThreadLocalContext::Get(); int worker_id = ctx->worker->worker_id; @@ -175,7 +175,7 @@ void ScatterFromWorker0(Optional send, bool in_group, NDArray recv) { deviceStream_t stream = ctx->GetDefaultStream(); if (is_sender) { CHECK(send.defined()) << "ValueError: buffer `send` must be provided when worker_id == 0."; - NDArray buffer = send.value(); + Tensor buffer = send.value(); int64_t numel = buffer.Shape()->Product(); CHECK_EQ(numel % num_receiver, 0) << "ValueError: Scattering evenly requires that the number " "of elements in the buffer to be " @@ -211,7 +211,7 @@ void ScatterFromWorker0(Optional send, bool in_group, NDArray recv) { NCCL_CALL(ncclGroupEnd()); } -void GatherToWorker0(NDArray send, bool in_group, Optional recv) { +void GatherToWorker0(Tensor send, bool in_group, Optional recv) { CHECK(send.defined()) << "ValueError: buffer `send` must not be None"; CCLThreadLocalContext* ctx = CCLThreadLocalContext::Get(); int worker_id = ctx->worker->worker_id; @@ -222,7 +222,7 @@ void GatherToWorker0(NDArray send, bool in_group, Optional recv) { deviceStream_t stream = ctx->GetDefaultStream(); if (is_sender) { CHECK(recv.defined()) << "ValueError: buffer `recv` must be provided when worker_id == 0."; - NDArray buffer = recv.value(); + Tensor buffer = recv.value(); int64_t numel = buffer.Shape()->Product(); CHECK_EQ(numel % num_receiver, 0) << "ValueError: Gathering evenly requires that the number " "of elements in the buffer to be " @@ -258,7 +258,7 @@ void GatherToWorker0(NDArray send, bool in_group, Optional recv) { NCCL_CALL(ncclGroupEnd()); } -void RecvFromWorker0(NDArray buffer) { +void RecvFromWorker0(Tensor buffer) { CCLThreadLocalContext* ctx = CCLThreadLocalContext::Get(); deviceStream_t stream = ctx->GetDefaultStream(); CHECK_NE(ctx->worker->worker_id, 0) @@ -269,7 +269,7 @@ void RecvFromWorker0(NDArray buffer) { NCCL_CALL(ncclGroupEnd()); } -void SendToNextGroup(NDArray buffer) { +void SendToNextGroup(Tensor buffer) { CCLThreadLocalContext* ctx = CCLThreadLocalContext::Get(); deviceStream_t stream = ctx->GetDefaultStream(); int worker_id = ctx->worker->worker_id; @@ -283,7 +283,7 @@ void SendToNextGroup(NDArray buffer) { NCCL_CALL(ncclGroupEnd()); } -void RecvFromPrevGroup(NDArray buffer) { +void RecvFromPrevGroup(Tensor buffer) { CCLThreadLocalContext* ctx = CCLThreadLocalContext::Get(); deviceStream_t stream = ctx->GetDefaultStream(); int worker_id = ctx->worker->worker_id; @@ -297,7 +297,7 @@ void RecvFromPrevGroup(NDArray buffer) { NCCL_CALL(ncclGroupEnd()); } -void SendToWorker(NDArray buffer, int receiver_id) { +void SendToWorker(Tensor buffer, int receiver_id) { CCLThreadLocalContext* ctx = CCLThreadLocalContext::Get(); deviceStream_t stream = ctx->GetDefaultStream(); int worker_id = ctx->worker->worker_id; @@ -309,7 +309,7 @@ void SendToWorker(NDArray buffer, int receiver_id) { receiver_id, ctx->global_comm, stream)); } -void RecvFromWorker(NDArray buffer, int sender_id) { +void RecvFromWorker(Tensor buffer, int sender_id) { CCLThreadLocalContext* ctx = CCLThreadLocalContext::Get(); deviceStream_t stream = ctx->GetDefaultStream(); int worker_id = ctx->worker->worker_id; @@ -334,12 +334,12 @@ TVM_FFI_STATIC_INIT_BLOCK({ .def("runtime.disco." 
TVM_DISCO_CCL_NAME ".init_ccl", InitCCL) .def("runtime.disco." TVM_DISCO_CCL_NAME ".init_ccl_per_worker", InitCCLPerWorker) .def("runtime.disco." TVM_DISCO_CCL_NAME ".allreduce", - [](NDArray send, int kind, bool in_group, NDArray recv) { + [](Tensor send, int kind, bool in_group, Tensor recv) { CHECK(0 <= kind && kind <= 4) << "ValueError: Unknown ReduceKind: " << kind; nccl::AllReduce(send, static_cast(kind), in_group, recv); }) .def("runtime.disco." TVM_DISCO_CCL_NAME ".allgather", - [](NDArray send, bool in_group, NDArray recv) { nccl::AllGather(send, in_group, recv); }) + [](Tensor send, bool in_group, Tensor recv) { nccl::AllGather(send, in_group, recv); }) .def("runtime.disco." TVM_DISCO_CCL_NAME ".broadcast_from_worker0", BroadcastFromWorker0) .def("runtime.disco." TVM_DISCO_CCL_NAME ".scatter_from_worker0", ScatterFromWorker0) .def("runtime.disco." TVM_DISCO_CCL_NAME ".gather_to_worker0", GatherToWorker0) @@ -350,7 +350,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ .def("runtime.disco." TVM_DISCO_CCL_NAME ".recv_from_worker", RecvFromWorker) .def("runtime.disco." TVM_DISCO_CCL_NAME ".sync_worker", SyncWorker) .def("runtime.disco." TVM_DISCO_CCL_NAME ".test_send_to_next_group_recv_from_prev_group", - [](NDArray buffer) { + [](Tensor buffer) { CCLThreadLocalContext* ctx = CCLThreadLocalContext::Get(); CHECK_EQ(ctx->worker->num_workers, 4) << "The test requires the world size to be 4."; CHECK_EQ(ctx->worker->num_groups, 2) << "The test requires the group size to be 2."; @@ -362,17 +362,16 @@ TVM_FFI_STATIC_INIT_BLOCK({ tvm::runtime::nccl::RecvFromPrevGroup(buffer); } }) - .def("runtime.disco." TVM_DISCO_CCL_NAME ".test_worker2_sends_to_worker0", - [](NDArray buffer) { - CCLThreadLocalContext* ctx = CCLThreadLocalContext::Get(); - CHECK_EQ(ctx->worker->num_workers, 4) << "The test requires the world size to be 4."; - CHECK_EQ(ctx->worker->num_groups, 2) << "The test requires the group size to be 2."; - if (ctx->worker->worker_id == 2) { - tvm::runtime::nccl::SendToWorker(buffer, 0); - } else if (ctx->worker->worker_id == 0) { - tvm::runtime::nccl::RecvFromWorker(buffer, 2); - } - }); + .def("runtime.disco." TVM_DISCO_CCL_NAME ".test_worker2_sends_to_worker0", [](Tensor buffer) { + CCLThreadLocalContext* ctx = CCLThreadLocalContext::Get(); + CHECK_EQ(ctx->worker->num_workers, 4) << "The test requires the world size to be 4."; + CHECK_EQ(ctx->worker->num_groups, 2) << "The test requires the group size to be 2."; + if (ctx->worker->worker_id == 2) { + tvm::runtime::nccl::SendToWorker(buffer, 0); + } else if (ctx->worker->worker_id == 0) { + tvm::runtime::nccl::RecvFromWorker(buffer, 2); + } + }); }); } // namespace nccl diff --git a/src/runtime/disco/protocol.h b/src/runtime/disco/protocol.h index ee6d5bf32ccc..3c3193d31147 100644 --- a/src/runtime/disco/protocol.h +++ b/src/runtime/disco/protocol.h @@ -87,21 +87,21 @@ struct DiscoProtocol { /*! * \brief The debug extension of the communication protocol that allows serialization and - * deserialization of NDArrays and reflection-capable TVM objects. + * deserialization of Tensors and reflection-capable TVM objects. */ struct DiscoDebugObject : public Object { public: /*! \brief The data to be serialized */ ffi::Any data; - /*! \brief Wrap an NDArray or reflection-capable TVM object into the debug extension. */ + /*! \brief Wrap an Tensor or reflection-capable TVM object into the debug extension. */ static ObjectRef Wrap(const ffi::Any& data) { ObjectPtr n = make_object(); n->data = data; return ObjectRef(n); } - /*! 
\brief Wrap an NDArray or reflection-capable TVM object into the debug extension. */ + /*! \brief Wrap an Tensor or reflection-capable TVM object into the debug extension. */ static ObjectRef Wrap(const ffi::AnyView& data) { ffi::Any rv; rv = data; @@ -219,8 +219,8 @@ inline void DiscoProtocol::ReadFFIAny(TVMFFIAny* out) { } inline std::string DiscoDebugObject::SaveToStr() const { - if (auto opt_nd = this->data.as()) { - NDArray array = opt_nd.value(); + if (auto opt_nd = this->data.as()) { + Tensor array = opt_nd.value(); std::string result; { dmlc::MemoryStringStream mstrm(&result); @@ -256,7 +256,7 @@ inline ObjectPtr DiscoDebugObject::LoadFromStr(std::string jso dmlc::MemoryStringStream mstrm(&json_str); support::Base64InStream b64strm(&mstrm); b64strm.InitPosition(); - runtime::NDArray array; + runtime::Tensor array; ICHECK(array.Load(&b64strm)); result->data = std::move(array); } else { diff --git a/src/runtime/file_utils.cc b/src/runtime/file_utils.cc index 4564d72e5eed..4a0a8044fd8e 100644 --- a/src/runtime/file_utils.cc +++ b/src/runtime/file_utils.cc @@ -196,15 +196,15 @@ void CopyFile(const std::string& src_file_name, const std::string& dest_file_nam << " dest='" << dest_file_name << "'"; } -Map LoadParams(const std::string& param_blob) { +Map LoadParams(const std::string& param_blob) { dmlc::MemoryStringStream strm(const_cast(¶m_blob)); return LoadParams(&strm); } -Map LoadParams(dmlc::Stream* strm) { - Map params; +Map LoadParams(dmlc::Stream* strm) { + Map params; uint64_t header, reserved; ICHECK(strm->Read(&header)) << "Invalid parameters file format"; - ICHECK(header == kTVMNDArrayListMagic) << "Invalid parameters file format"; + ICHECK(header == kTVMTensorListMagic) << "Invalid parameters file format"; ICHECK(strm->Read(&reserved)) << "Invalid parameters file format"; std::vector names; @@ -214,15 +214,15 @@ Map LoadParams(dmlc::Stream* strm) { size_t size = static_cast(sz); ICHECK(size == names.size()) << "Invalid parameters file format"; for (size_t i = 0; i < size; ++i) { - // The data_entry is allocated on device, NDArray.load always load the array into CPU. - NDArray temp; + // The data_entry is allocated on device, Tensor.load always load the array into CPU. 
+ Tensor temp; temp.Load(strm); params.Set(names[i], temp); } return params; } -void SaveParams(dmlc::Stream* strm, const Map& params) { +void SaveParams(dmlc::Stream* strm, const Map& params) { std::vector names; std::vector arrays; for (auto& p : params) { @@ -230,7 +230,7 @@ void SaveParams(dmlc::Stream* strm, const Map& params) { arrays.push_back(p.second.operator->()); } - uint64_t header = kTVMNDArrayListMagic, reserved = 0; + uint64_t header = kTVMTensorListMagic, reserved = 0; strm->Write(header); strm->Write(reserved); strm->Write(names); @@ -243,7 +243,7 @@ void SaveParams(dmlc::Stream* strm, const Map& params) { } } -std::string SaveParams(const Map& params) { +std::string SaveParams(const Map& params) { std::string bytes; dmlc::MemoryStringStream strm(&bytes); dmlc::Stream* fo = &strm; @@ -255,12 +255,12 @@ TVM_FFI_STATIC_INIT_BLOCK({ namespace refl = tvm::ffi::reflection; refl::GlobalDef() .def("runtime.SaveParams", - [](const Map& params) { + [](const Map& params) { std::string s = ::tvm::runtime::SaveParams(params); return ffi::Bytes(std::move(s)); }) .def("runtime.SaveParamsToFile", - [](const Map& params, const String& path) { + [](const Map& params, const String& path) { tvm::runtime::SimpleBinaryFileStream strm(path, "wb"); SaveParams(&strm, params); }) diff --git a/src/runtime/file_utils.h b/src/runtime/file_utils.h index b4da7adea813..43f4a8455f41 100644 --- a/src/runtime/file_utils.h +++ b/src/runtime/file_utils.h @@ -104,31 +104,31 @@ void CopyFile(const std::string& src_file_name, const std::string& dest_file_nam */ void RemoveFile(const std::string& file_name); -constexpr uint64_t kTVMNDArrayListMagic = 0xF7E58D4F05049CB7; +constexpr uint64_t kTVMTensorListMagic = 0xF7E58D4F05049CB7; /*! * \brief Load parameters from a string. * \param param_blob Serialized string of parameters. * \return Map of parameter name to parameter value. */ -Map LoadParams(const std::string& param_blob); +Map LoadParams(const std::string& param_blob); /*! * \brief Load parameters from a stream. * \param strm Stream to load parameters from. * \return Map of parameter name to parameter value. */ -Map LoadParams(dmlc::Stream* strm); +Map LoadParams(dmlc::Stream* strm); /*! * \brief Serialize parameters to a byte array. * \param params Parameters to save. * \return String containing binary parameter data. */ -std::string SaveParams(const Map& params); +std::string SaveParams(const Map& params); /*! * \brief Serialize parameters to a stream. * \param strm Stream to write to. * \param params Parameters to save. */ -void SaveParams(dmlc::Stream* strm, const Map& params); +void SaveParams(dmlc::Stream* strm, const Map& params); /*! * \brief A dmlc stream which wraps standard file operations. 
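
Aside: the file_utils changes above rename the parameter-dictionary helpers to use Tensor while keeping the on-disk format, since kTVMTensorListMagic retains the old magic value. A minimal round-trip sketch (illustration only; the include paths and namespace directives are assumptions) could look like:

    // Sketch only: save and reload a parameter map with the renamed helpers
    // declared in src/runtime/file_utils.h.
    #include <tvm/runtime/logging.h>
    #include <tvm/runtime/tensor.h>  // assumed new location of the Tensor header

    #include <string>

    #include "file_utils.h"  // internal TVM header declaring SaveParams/LoadParams

    using namespace tvm;
    using namespace tvm::runtime;

    void RoundTripParams() {
      Map<String, Tensor> params;
      // A single CPU-resident parameter; any shape and dtype works.
      params.Set("weight", Tensor::Empty({2, 2}, DataType::Float(32), {kDLCPU, 0}));

      // Serialize: the blob starts with kTVMTensorListMagic, then names, then tensors.
      std::string blob = SaveParams(params);

      // Deserialize: Tensor::Load always places the data back on the CPU.
      Map<String, Tensor> restored = LoadParams(blob);
      ICHECK_EQ(restored.size(), params.size());
    }
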
diff --git a/src/runtime/hexagon/hexagon_buffer.h b/src/runtime/hexagon/hexagon_buffer.h index 986d6b6e5ec6..b1bec270d4fe 100644 --- a/src/runtime/hexagon/hexagon_buffer.h +++ b/src/runtime/hexagon/hexagon_buffer.h @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc index a26f113f1e9b..ec58946b64b1 100644 --- a/src/runtime/hexagon/hexagon_device_api.cc +++ b/src/runtime/hexagon/hexagon_device_api.cc @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/runtime/hexagon/hexagon_vtcm_pool.h b/src/runtime/hexagon/hexagon_vtcm_pool.h index ece8454b859a..d9918a873aa9 100644 --- a/src/runtime/hexagon/hexagon_vtcm_pool.h +++ b/src/runtime/hexagon/hexagon_vtcm_pool.h @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/runtime/memory/memory_manager.cc b/src/runtime/memory/memory_manager.cc index cef445ee91c0..4f810011e8aa 100644 --- a/src/runtime/memory/memory_manager.cc +++ b/src/runtime/memory/memory_manager.cc @@ -60,10 +60,10 @@ inline size_t GetDataAlignment(const DLDataType& dtype) { return align; } -NDArray StorageObj::AllocNDArrayScoped(int64_t offset, ffi::Shape shape, DLDataType dtype, - String scope) { +Tensor StorageObj::AllocTensorScoped(int64_t offset, ffi::Shape shape, DLDataType dtype, + String scope) { if (scope == "global" || scope.empty()) { - return AllocNDArray(offset, shape, dtype); + return AllocTensor(offset, shape, dtype); } VerifyDataType(dtype); @@ -87,11 +87,11 @@ NDArray StorageObj::AllocNDArrayScoped(int64_t offset, ffi::Shape shape, DLDataT << "storage allocation failure, attempted to allocate " << needed_size << " at offset " << offset << " in region that is " << this->buffer.size << "bytes"; - return NDArray::FromNDAlloc(StorageScopedAlloc(GetRef(this)), shape, dtype, - this->buffer.device, shape, scope, offset); + return Tensor::FromNDAlloc(StorageScopedAlloc(GetRef(this)), shape, dtype, + this->buffer.device, shape, scope, offset); } -NDArray StorageObj::AllocNDArray(int64_t offset, ffi::Shape shape, DLDataType dtype) { +Tensor StorageObj::AllocTensor(int64_t offset, ffi::Shape shape, DLDataType dtype) { VerifyDataType(dtype); size_t needed_size = ffi::GetDataSize(shape.Product(), dtype); @@ -120,8 +120,8 @@ NDArray StorageObj::AllocNDArray(int64_t offset, ffi::Shape shape, DLDataType dt Storage storage_; }; - return NDArray::FromNDAlloc(StorageAlloc(GetRef(this)), shape, dtype, - this->buffer.device, offset); + return Tensor::FromNDAlloc(StorageAlloc(GetRef(this)), shape, dtype, this->buffer.device, + offset); } MemoryManager* MemoryManager::Global() { @@ -213,8 +213,8 @@ void MemoryManager::Clear() { } } -NDArray Allocator::Empty(ffi::Shape shape, DLDataType dtype, DLDevice dev, - Optional mem_scope) { +Tensor Allocator::Empty(ffi::Shape shape, DLDataType dtype, DLDevice dev, + Optional mem_scope) { VerifyDataType(dtype); class BufferAlloc { @@ -239,7 +239,7 @@ NDArray Allocator::Empty(ffi::Shape shape, DLDataType dtype, DLDevice dev, } else { buffer = this->Alloc(dev, shape, dtype, *mem_scope); } - return NDArray::FromNDAlloc(BufferAlloc(buffer), shape, dtype, dev); + return Tensor::FromNDAlloc(BufferAlloc(buffer), shape, dtype, dev); } bool Allocator::AllowMemoryScope(const std::string& mem_scope) const { diff --git a/src/runtime/meta_data.h b/src/runtime/meta_data.h index aa629aef50a7..bc88529ae19e 100644 --- a/src/runtime/meta_data.h +++ 
b/src/runtime/meta_data.h @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/runtime/minrpc/rpc_reference.h b/src/runtime/minrpc/rpc_reference.h index dfca27c8c3ed..ee08ad12c736 100644 --- a/src/runtime/minrpc/rpc_reference.h +++ b/src/runtime/minrpc/rpc_reference.h @@ -24,7 +24,7 @@ #ifndef TVM_RUNTIME_MINRPC_RPC_REFERENCE_H_ #define TVM_RUNTIME_MINRPC_RPC_REFERENCE_H_ -#include +#include namespace tvm { namespace ffi { @@ -74,7 +74,7 @@ enum class RPCCode : int { enum class RPCServerStatus : int { kSuccess = 0, kInvalidTypeCodeObject, - kInvalidTypeCodeNDArray, + kInvalidTypeCodeTensor, kInvalidDLTensorFieldStride, kInvalidDLTensorFieldByteOffset, kUnknownTypeIndex, @@ -146,8 +146,8 @@ inline const char* RPCServerStatusToString(RPCServerStatus status) { return "kSuccess"; case RPCServerStatus::kInvalidTypeCodeObject: return "kInvalidTypeCodeObject"; - case RPCServerStatus::kInvalidTypeCodeNDArray: - return "kInvalidTypeCodeNDArray"; + case RPCServerStatus::kInvalidTypeCodeTensor: + return "kInvalidTypeCodeTensor"; case RPCServerStatus::kInvalidDLTensorFieldStride: return "kInvalidDLTensorFieldStride"; case RPCServerStatus::kInvalidDLTensorFieldByteOffset: { @@ -247,7 +247,7 @@ struct RPCReference { static void SendDLTensor(TChannelPtr channel, DLTensor* arr) { DLDevice dev; uint64_t data; - // When we return NDArray, we directly return + // When we return Tensor, we directly return // the space and the context // The client will be further wrapping dev = arr->device; @@ -351,8 +351,8 @@ struct RPCReference { break; } - case ffi::TypeIndex::kTVMFFINDArray: { - channel->ThrowError(RPCServerStatus::kInvalidTypeCodeNDArray); + case ffi::TypeIndex::kTVMFFITensor: { + channel->ThrowError(RPCServerStatus::kInvalidTypeCodeTensor); break; } case ffi::TypeIndex::kTVMFFIDLTensorPtr: { diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index 3e0981146afc..021dad3ca35a 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -29,8 +29,8 @@ #include #include #include -#include #include +#include /* There are many OpenCL platforms that do not yet support OpenCL 2.0, * hence we use 1.2 APIs, some of which are now deprecated. 
In order @@ -353,8 +353,8 @@ class OpenCLWorkspace : public DeviceAPI { Optional mem_scope = std::nullopt) final; void* AllocDataSpace(Device dev, size_t width, size_t height, DLDataType type_hint, Optional mem_scope = std::nullopt); - void* GetNativePtr(const tvm::runtime::NDArray& narr); - void SetNativePtr(const tvm::runtime::NDArray& narr, void* host_ptr, size_t buf_size); + void* GetNativePtr(const tvm::runtime::Tensor& narr); + void SetNativePtr(const tvm::runtime::Tensor& narr, void* host_ptr, size_t buf_size); void SetPerfHint(Device dev, cl_uint perf_hint); void FreeDataSpace(Device dev, void* ptr) final; void StreamSync(Device dev, TVMStreamHandle stream) final; diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index afa4dd0b8403..1cc4e7936013 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -434,12 +434,12 @@ void OpenCLWorkspace::FreeDataSpaceView(Device dev, void* ptr) { } } -void* OpenCLWorkspace::GetNativePtr(const tvm::runtime::NDArray& narr) { +void* OpenCLWorkspace::GetNativePtr(const tvm::runtime::Tensor& narr) { cl::BufferDescriptor* desc = static_cast(narr.operator->()->data); return desc->host_ptr; } -void OpenCLWorkspace::SetNativePtr(const tvm::runtime::NDArray& narr, void* host_ptr, +void OpenCLWorkspace::SetNativePtr(const tvm::runtime::Tensor& narr, void* host_ptr, size_t buf_size) { cl::BufferDescriptor* desc = static_cast(narr.operator->()->data); diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc index 9d4c01d62366..d5ac8b9de06f 100644 --- a/src/runtime/profiling.cc +++ b/src/runtime/profiling.cc @@ -182,7 +182,7 @@ void Profiler::Stop() { } } -std::vector ToShape(NDArray shape_tensor) { +std::vector ToShape(Tensor shape_tensor) { std::vector shape; auto rank = shape_tensor.Shape().size(); auto dtype = shape_tensor.DataType(); @@ -212,7 +212,7 @@ std::vector ToShape(NDArray shape_tensor) { return shape; } -String ShapeString(NDArray shape, DLDataType dtype) { return ShapeString(ToShape(shape), dtype); } +String ShapeString(Tensor shape, DLDataType dtype) { return ShapeString(ToShape(shape), dtype); } String ShapeString(const std::vector& shape, DLDataType dtype) { std::stringstream sizes; @@ -227,9 +227,9 @@ String ShapeString(const std::vector& shape, DLDataType dtype) { return String(sizes.str()); } -String ShapeString(const std::vector& shapes) { +String ShapeString(const std::vector& shapes) { std::stringstream sizes; - for (const NDArray& ary : shapes) { + for (const Tensor& ary : shapes) { if (sizes.tellp() > 0) { sizes << ", "; } @@ -871,10 +871,10 @@ ffi::Function WrapTimeEvaluator(ffi::Function pf, Device dev, int number, int re pf.CallPacked(args, num_args, &temp); // allocate two large arrays to flush L2 cache - NDArray arr1, arr2; + Tensor arr1, arr2; if (cache_flush_bytes > 0) { - arr1 = NDArray::Empty({cache_flush_bytes / 4}, {kDLInt, 32, 1}, dev); - arr2 = NDArray::Empty({cache_flush_bytes / 4}, {kDLInt, 32, 1}, dev); + arr1 = Tensor::Empty({cache_flush_bytes / 4}, {kDLInt, 32, 1}, dev); + arr2 = Tensor::Empty({cache_flush_bytes / 4}, {kDLInt, 32, 1}, dev); } DeviceAPI::Get(dev)->StreamSync(dev, nullptr); diff --git a/src/runtime/rpc/rpc_endpoint.h b/src/runtime/rpc/rpc_endpoint.h index 195adef053bd..9438470cb215 100644 --- a/src/runtime/rpc/rpc_endpoint.h +++ b/src/runtime/rpc/rpc_endpoint.h @@ -78,8 +78,8 @@ class RPCEndpoint { * Shutdown has no effect if the connection has already been shut down. 
* Shutdown will wait for all output currently queued from the RPC connection (i.e. The user * doesn't need to wait for completion before calling Shutdown.) Any further use of objects that - * depended on the endpoint (e.g. A tvm.nd.array allocated on the remote RPC session) may throw an - * exception when used. + * depended on the endpoint (e.g. A tvm.runtime.tensor allocated on the remote RPC session) may + * throw an exception when used. */ void Shutdown(); diff --git a/src/runtime/rpc/rpc_local_session.cc b/src/runtime/rpc/rpc_local_session.cc index 3d4928f8b43a..b000e3c01956 100644 --- a/src/runtime/rpc/rpc_local_session.cc +++ b/src/runtime/rpc/rpc_local_session.cc @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include @@ -54,13 +54,13 @@ void LocalSession::EncodeReturn(ffi::Any rv, const FEncodeReturn& encode_return) if (rv == nullptr) { packed_args[1] = rv; encode_return(ffi::PackedArgs(packed_args, 2)); - } else if (rv.as()) { - // We follow a special protocol to return NDArray to client side - // The first pack value is the NDArray handle as DLTensor - // The second pack value is a customized deleter that deletes the NDArray. + } else if (rv.as()) { + // We follow a special protocol to return Tensor to client side + // The first pack value is the Tensor handle as DLTensor + // The second pack value is a customized deleter that deletes the Tensor. TVMFFIAny ret_any = ffi::details::AnyUnsafe::MoveAnyToTVMFFIAny(std::move(rv)); void* opaque_handle = ret_any.v_obj; - packed_args[1] = TVMFFINDArrayGetDLTensorPtr(opaque_handle); + packed_args[1] = TVMFFITensorGetDLTensorPtr(opaque_handle); packed_args[2] = opaque_handle; encode_return(ffi::PackedArgs(packed_args, 3)); } else if (const auto opt_bytes = rv.as()) { diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index b8c723a402f7..97b90c25ac25 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -41,18 +41,18 @@ namespace tvm { namespace runtime { /*! - * \brief Build a local NDArray with remote backing storage. + * \brief Build a local Tensor with remote backing storage. * \param sess the RPCSession which owns the given handle. * \param handle A pointer valid on the remote end which should form the `data` field of the * underlying DLTensor. * \param template_tensor An empty DLTensor whose shape and dtype fields are used to fill the newly * created array. Needed because it's difficult to pass a shape vector as a ffi::Function arg. * \param dev Remote device used with this tensor. Must have non-zero RPCSessMask. - * \param remote_ndarray_handle The handle returned by RPC server to identify the NDArray. + * \param remote_tensor_handle The handle returned by RPC server to identify the Tensor. 
*/ -NDArray NDArrayFromRemoteOpaqueHandle(std::shared_ptr sess, void* handle, - DLTensor* template_tensor, Device dev, - void* remote_ndarray_handle) { +Tensor TensorFromRemoteOpaqueHandle(std::shared_ptr sess, void* handle, + DLTensor* template_tensor, Device dev, + void* remote_tensor_handle) { ICHECK_EQ(sess->table_index(), GetRPCSessionIndex(dev)) << "The Device given does not belong to the given session"; class RemoteSpaceAlloc { @@ -71,7 +71,7 @@ NDArray NDArrayFromRemoteOpaqueHandle(std::shared_ptr sess, void* ha space.sess = sess; space.data = handle; ffi::Shape shape(template_tensor->shape, template_tensor->shape + template_tensor->ndim); - return NDArray::FromNDAlloc(RemoteSpaceAlloc(space), shape, template_tensor->dtype, dev); + return Tensor::FromNDAlloc(RemoteSpaceAlloc(space), shape, template_tensor->dtype, dev); } /*! @@ -104,9 +104,9 @@ class RPCWrappedFunc : public Object { // run a remote translation to translate RPC related objects to // their remote counterparts. switch (args[i].type_index()) { - case ffi::TypeIndex::kTVMFFINDArray: { - // Pass NDArray as DLTensor - auto dptr = std::make_unique(*args[i].cast().operator->()); + case ffi::TypeIndex::kTVMFFITensor: { + // Pass Tensor as DLTensor + auto dptr = std::make_unique(*args[i].cast().operator->()); dptr->device = RemoveSessMask(dptr->device); dptr->data = static_cast(dptr->data)->data; packed_args[i] = dptr.get(); @@ -305,14 +305,14 @@ void RPCWrappedFunc::WrapRemoteReturnToValue(ffi::PackedArgs args, ffi::Any* rv) void* handle = args[1].cast(); auto n = make_object(handle, sess_); *rv = ffi::Module(n); - } else if (type_index == ffi::TypeIndex::kTVMFFINDArray || + } else if (type_index == ffi::TypeIndex::kTVMFFITensor || type_index == ffi::TypeIndex::kTVMFFIDLTensorPtr) { ICHECK_EQ(args.size(), 3); auto tensor = args[1].cast(); void* nd_handle = args[2].cast(); - *rv = NDArrayFromRemoteOpaqueHandle(sess_, tensor->data, tensor, - AddRPCSessionMask(tensor->device, sess_->table_index()), - nd_handle); + *rv = TensorFromRemoteOpaqueHandle(sess_, tensor->data, tensor, + AddRPCSessionMask(tensor->device, sess_->table_index()), + nd_handle); } else if (type_index == ffi::TypeIndex::kTVMFFIBytes || type_index == ffi::TypeIndex::kTVMFFIStr || type_index == ffi::TypeIndex::kTVMFFISmallStr || @@ -480,11 +480,11 @@ TVM_FFI_STATIC_INIT_BLOCK({ ICHECK_EQ(tkey, "rpc"); *rv = static_cast(m.operator->())->sess()->table_index(); }) - .def("tvm.rpc.NDArrayFromRemoteOpaqueHandle", + .def("tvm.rpc.TensorFromRemoteOpaqueHandle", [](ffi::Module mod, void* remote_array, DLTensor* template_tensor, Device dev, - void* ndarray_handle) -> NDArray { - return NDArrayFromRemoteOpaqueHandle(RPCModuleGetSession(mod), remote_array, - template_tensor, dev, ndarray_handle); + void* tensor_handle) -> Tensor { + return TensorFromRemoteOpaqueHandle(RPCModuleGetSession(mod), remote_array, + template_tensor, dev, tensor_handle); }); }); diff --git a/src/runtime/rpc/rpc_session.h b/src/runtime/rpc/rpc_session.h index c0e09ec004ba..265c58f4af63 100644 --- a/src/runtime/rpc/rpc_session.h +++ b/src/runtime/rpc/rpc_session.h @@ -55,8 +55,8 @@ class RPCSession { /*! \brief Module handle in the remote. */ using ModuleHandle = void*; - /*! \brief NDArray handle in the remote. */ - using NDArrayHandle = void*; + /*! \brief Tensor handle in the remote. */ + using TensorHandle = void*; /*! * \brief Callback to send an encoded return values via encode_args. 
@@ -66,7 +66,7 @@ class RPCSession { * Encoding convention (as list of arguments): * - str/float/int/byte: [tcode: int, value: TVMValue] value follows ffi::Function convention. * - ffi::Function/Module: [tcode: int, handle: void*] - * - NDArray: [tcode: int, meta: DLTensor*, nd_handle: void*] + * - Tensor: [tcode: int, meta: DLTensor*, nd_handle: void*] * DLTensor* contains the meta-data as well as handle into the remote data. * nd_handle can be used for deletion. */ @@ -98,7 +98,7 @@ class RPCSession { * - type_code is follows the ffi::Function convention. * - int/float/string/bytes follows the ffi::Function convention, all data are local. * - ffi::Function/Module and future remote objects: pass remote handle instead. - * - NDArray/DLTensor: pass a DLTensor pointer, the data field of DLTensor + * - Tensor/DLTensor: pass a DLTensor pointer, the data field of DLTensor * points to a remote data handle returned by the Device API. * The meta-data of the DLTensor sits on local. * @@ -109,8 +109,8 @@ * * The callee need to store the return value into ret_value. * - ffi::Function/Module are stored as void* - * - NDArray is stored as local NDArray, whose data field is a remote handle. - * Notably the NDArray's deleter won't delete remote handle. + * - Tensor is stored as local Tensor, whose data field is a remote handle. + * Notably the Tensor's deleter won't delete remote handle. * It is up to the user of the RPCSession to such wrapping. * - In short, remote handles are "moved" as return values * and the callee needs to explicitly manage them by calling diff --git a/src/runtime/ndarray.cc b/src/runtime/tensor.cc similarity index 74% rename from src/runtime/ndarray.cc rename to src/runtime/tensor.cc index 115d55c8f4e7..2e418304fa82 100644 --- a/src/runtime/ndarray.cc +++ b/src/runtime/tensor.cc @@ -18,15 +18,15 @@ */ /*! - * \file ndarray.cc - * \brief NDArray container infratructure. + * \file tensor.cc + * \brief Tensor container infrastructure. 
*/ #include #include #include #include #include -#include +#include #include "tvm/runtime/data_type.h" @@ -59,10 +59,10 @@ inline void VerifyDataType(DLDataType dtype) { ICHECK_EQ(dtype.bits & (dtype.bits - 1), 0); } -void ArrayCopyFromBytes(DLTensor* handle, const void* data, size_t nbytes) { +void TensorCopyFromBytes(DLTensor* handle, const void* data, size_t nbytes) { size_t arr_size = GetDataSize(*handle); - ICHECK_EQ(arr_size, nbytes) << "ArrayCopyFromBytes: size mismatch"; - ICHECK(IsContiguous(*handle)) << "ArrayCopyFromBytes only support contiguous array for now"; + ICHECK_EQ(arr_size, nbytes) << "TensorCopyFromBytes: size mismatch"; + ICHECK(IsContiguous(*handle)) << "TensorCopyFromBytes only support contiguous array for now"; DLTensor from; from.data = const_cast(data); @@ -77,8 +77,8 @@ void ArrayCopyFromBytes(DLTensor* handle, const void* data, size_t nbytes) { DeviceAPI::Get(handle->device)->StreamSync(handle->device, nullptr); } -void NDArray::CopyToBytes(const DLTensor* handle, void* data, size_t nbytes, - TVMStreamHandle stream) { +void Tensor::CopyToBytes(const DLTensor* handle, void* data, size_t nbytes, + TVMStreamHandle stream) { size_t arr_size = GetDataSize(*handle); ICHECK_EQ(arr_size, nbytes) << "ArrayCopyToBytes: size mismatch"; ICHECK(ffi::IsContiguous(*handle)) << "ArrayCopyToBytes only support contiguous array for now"; @@ -97,7 +97,7 @@ void NDArray::CopyToBytes(const DLTensor* handle, void* data, size_t nbytes, DeviceAPI::Get(handle->device)->StreamSync(handle->device, stream); } -NDArray NDArray::Empty(ffi::Shape shape, DLDataType dtype, Device dev, Optional mem_scope) { +Tensor Tensor::Empty(ffi::Shape shape, DLDataType dtype, Device dev, Optional mem_scope) { struct DeviceAPIAlloc { void AllocData(DLTensor* tensor, ffi::Optional mem_scope) { tensor->data = DeviceAPI::Get(tensor->device) @@ -108,11 +108,10 @@ NDArray NDArray::Empty(ffi::Shape shape, DLDataType dtype, Device dev, Optional< DeviceAPI::Get(tensor->device)->FreeDataSpace(tensor->device, tensor->data); } }; - return ffi::NDArray::FromNDAlloc(DeviceAPIAlloc(), shape, dtype, dev, mem_scope); + return ffi::Tensor::FromNDAlloc(DeviceAPIAlloc(), shape, dtype, dev, mem_scope); } -NDArray NDArray::CreateView(ffi::Shape shape, DLDataType dtype, - uint64_t relative_byte_offset) const { +Tensor Tensor::CreateView(ffi::Shape shape, DLDataType dtype, uint64_t relative_byte_offset) const { ICHECK(data_ != nullptr); const DLTensor& orig = *get_mutable(); @@ -145,14 +144,14 @@ NDArray NDArray::CreateView(ffi::Shape shape, DLDataType dtype, << view_size << " bytes. " << "This would occupy bytes " << relative_byte_offset << " <= i_byte < " << (relative_byte_offset + view_size) << " within the backing array. 
" - << "However, the NDArray being viewed only contains " << curr_size << " bytes (shape = " + << "However, the Tensor being viewed only contains " << curr_size << " bytes (shape = " << ffi::Shape(curr_dl_tensor.shape, curr_dl_tensor.shape + curr_dl_tensor.ndim) << ", dtype= " << curr_dl_tensor.dtype << ")."; - // helper allocator class that retains ref count of original NDArray + // helper allocator class that retains ref count of original Tensor class ViewBasedAlloc { public: - explicit ViewBasedAlloc(NDArray source) : source_(source) {} + explicit ViewBasedAlloc(Tensor source) : source_(source) {} void AllocData(DLTensor* tensor, int64_t byte_offset) { tensor->data = source_.get_mutable()->data; tensor->byte_offset = byte_offset; @@ -161,30 +160,30 @@ NDArray NDArray::CreateView(ffi::Shape shape, DLDataType dtype, void FreeData(DLTensor* tensor) {} private: - NDArray source_; + Tensor source_; }; - NDArray ret = NDArray::FromNDAlloc(ViewBasedAlloc(NDArray(*this)), shape, dtype, (*this)->device, - curr_dl_tensor.byte_offset + relative_byte_offset); + Tensor ret = Tensor::FromNDAlloc(ViewBasedAlloc(Tensor(*this)), shape, dtype, (*this)->device, + curr_dl_tensor.byte_offset + relative_byte_offset); return ret; } -void NDArray::CopyToBytes(void* data, size_t nbytes) const { +void Tensor::CopyToBytes(void* data, size_t nbytes) const { ICHECK(data != nullptr); ICHECK(data_ != nullptr); - NDArray::CopyToBytes(get_mutable(), data, nbytes); + Tensor::CopyToBytes(get_mutable(), data, nbytes); } -void NDArray::CopyFromBytes(const void* data, size_t nbytes) { +void Tensor::CopyFromBytes(const void* data, size_t nbytes) { ICHECK(data != nullptr); ICHECK(data_ != nullptr); - ArrayCopyFromBytes(get_mutable(), data, nbytes); + TensorCopyFromBytes(get_mutable(), data, nbytes); } -NDArray NDArray::CopyTo(const Device& dev, Optional mem_scope) const { +Tensor Tensor::CopyTo(const Device& dev, Optional mem_scope) const { ICHECK(data_ != nullptr); const DLTensor* dptr = operator->(); - NDArray ret = + Tensor ret = Empty(ffi::Shape(dptr->shape, dptr->shape + dptr->ndim), dptr->dtype, dev, mem_scope); this->CopyTo(ret); Device copy_gpu_dev = dptr->device.device_type != kDLCPU ? 
dptr->device : dev; @@ -192,10 +191,10 @@ NDArray NDArray::CopyTo(const Device& dev, Optional mem_scope) const { return ret; } -void NDArray::CopyFromTo(const DLTensor* from, DLTensor* to, TVMStreamHandle stream) { +void Tensor::CopyFromTo(const DLTensor* from, DLTensor* to, TVMStreamHandle stream) { size_t from_size = GetDataSize(*from); size_t to_size = GetDataSize(*to); - ICHECK_EQ(from_size, to_size) << "TVMArrayCopyFromTo: The size in bytes must exactly match."; + ICHECK_EQ(from_size, to_size) << "TVMTensorCopyFromTo: The size in bytes must exactly match."; ICHECK(from->device.device_type == to->device.device_type || from->device.device_type == kDLCPU || to->device.device_type == kDLCPU || from->device.device_type == kDLCUDAHost || @@ -219,13 +218,12 @@ using namespace tvm::runtime; TVM_FFI_STATIC_INIT_BLOCK({ namespace refl = tvm::ffi::reflection; refl::GlobalDef() - .def("runtime.TVMArrayAllocWithScope", NDArray::Empty) - .def_method("runtime.TVMArrayCreateView", &NDArray::CreateView) - .def("runtime.TVMArrayCopyFromBytes", - [](DLTensor* arr, void* data, size_t nbytes) { ArrayCopyFromBytes(arr, data, nbytes); }) - .def( - "runtime.TVMArrayCopyToBytes", - [](DLTensor* arr, void* data, size_t nbytes) { NDArray::CopyToBytes(arr, data, nbytes); }) - .def("runtime.TVMArrayCopyFromTo", - [](DLTensor* from, DLTensor* to) { NDArray::CopyFromTo(from, to); }); + .def("runtime.TVMTensorAllocWithScope", Tensor::Empty) + .def_method("runtime.TVMTensorCreateView", &Tensor::CreateView) + .def("runtime.TVMTensorCopyFromBytes", + [](DLTensor* arr, void* data, size_t nbytes) { TensorCopyFromBytes(arr, data, nbytes); }) + .def("runtime.TVMTensorCopyToBytes", + [](DLTensor* arr, void* data, size_t nbytes) { Tensor::CopyToBytes(arr, data, nbytes); }) + .def("runtime.TVMTensorCopyFromTo", + [](DLTensor* from, DLTensor* to) { Tensor::CopyFromTo(from, to); }); }); diff --git a/src/runtime/vm/attn_backend.h b/src/runtime/vm/attn_backend.h index 449a1def0a38..4017738d6685 100644 --- a/src/runtime/vm/attn_backend.h +++ b/src/runtime/vm/attn_backend.h @@ -71,22 +71,22 @@ class PagedPrefillFunc : public AttnBackendFunc { AttnBackendKind backend_kind) : AttnBackendFunc(std::move(attn_func), attn_kind, backend_kind) {} - virtual void MHA(int depth, NDArray q, NDArray qo_indptr, NDArray pages, NDArray page_indptr, - NDArray page_indices, NDArray length_info, NDArray q_rope_position, - NDArray k_rope_pos_offset, bool causal, RoPEMode rope_mode, double rotary_scale, - double rotary_theta, double sm_scale, NDArray attn_output, NDArray attn_lse, + virtual void MHA(int depth, Tensor q, Tensor qo_indptr, Tensor pages, Tensor page_indptr, + Tensor page_indices, Tensor length_info, Tensor q_rope_position, + Tensor k_rope_pos_offset, bool causal, RoPEMode rope_mode, double rotary_scale, + double rotary_theta, double sm_scale, Tensor attn_output, Tensor attn_lse, TVMStreamHandle compute_stream) { LOG(FATAL) << "MHA computation is not supported by the current backend"; } - virtual void MLA(int depth, NDArray q, NDArray qo_indptr, NDArray pages, NDArray page_indptr, - NDArray page_indices, NDArray length_info, bool causal, double sm_scale, - NDArray attn_output, NDArray attn_lse, TVMStreamHandle compute_stream) { + virtual void MLA(int depth, Tensor q, Tensor qo_indptr, Tensor pages, Tensor page_indptr, + Tensor page_indices, Tensor length_info, bool causal, double sm_scale, + Tensor attn_output, Tensor attn_lse, TVMStreamHandle compute_stream) { LOG(FATAL) << "MLA computation is not supported by the current backend"; } - 
virtual void BeginForward(int depth, NDArray float_workspace_buffer, NDArray int_workspace_buffer, - NDArray page_locked_int_workspace_buffer, HostMemoryVector* qo_indptr, + virtual void BeginForward(int depth, Tensor float_workspace_buffer, Tensor int_workspace_buffer, + Tensor page_locked_int_workspace_buffer, HostMemoryVector* qo_indptr, HostMemoryVector* page_indptr, HostMemoryVector* last_page_len, int64_t batch_size, int64_t total_qo_len, int64_t page_size, int64_t num_qo_heads, int64_t num_kv_heads, int64_t qk_head_dim, @@ -101,10 +101,10 @@ class TIRPagedPrefillFunc : public PagedPrefillFunc { explicit TIRPagedPrefillFunc(ffi::Function attn_func, AttnKind attn_kind) : PagedPrefillFunc(std::move(attn_func), attn_kind, AttnBackendKind::kTIR) {} - void MHA(int depth, NDArray q, NDArray qo_indptr, NDArray pages, NDArray page_indptr, - NDArray page_indices, NDArray length_info, NDArray q_rope_position, - NDArray k_rope_pos_offset, bool causal, RoPEMode rope_mode, double rotary_scale, - double rotary_theta, double sm_scale, NDArray attn_output, NDArray attn_lse, + void MHA(int depth, Tensor q, Tensor qo_indptr, Tensor pages, Tensor page_indptr, + Tensor page_indices, Tensor length_info, Tensor q_rope_position, + Tensor k_rope_pos_offset, bool causal, RoPEMode rope_mode, double rotary_scale, + double rotary_theta, double sm_scale, Tensor attn_output, Tensor attn_lse, TVMStreamHandle compute_stream) final { attn_func_(q, qo_indptr, pages, page_indptr, page_indices, length_info, k_rope_pos_offset, q_rope_position, attn_output, attn_lse, static_cast(causal), @@ -112,9 +112,9 @@ class TIRPagedPrefillFunc : public PagedPrefillFunc { rotary_theta, sm_scale); } - void MLA(int depth, NDArray q, NDArray qo_indptr, NDArray pages, NDArray page_indptr, - NDArray page_indices, NDArray length_info, bool causal, double sm_scale, - NDArray attn_output, NDArray attn_lse, TVMStreamHandle compute_stream) final { + void MLA(int depth, Tensor q, Tensor qo_indptr, Tensor pages, Tensor page_indptr, + Tensor page_indices, Tensor length_info, bool causal, double sm_scale, + Tensor attn_output, Tensor attn_lse, TVMStreamHandle compute_stream) final { attn_func_(q, qo_indptr, pages, page_indptr, page_indices, length_info, attn_output, attn_lse, static_cast(causal), sm_scale); } @@ -128,10 +128,10 @@ class FlashInferPagedPrefillFunc : public PagedPrefillFunc { : PagedPrefillFunc(std::move(attn_func), attn_kind, AttnBackendKind::kFlashInfer), plan_func_(std::move(plan_func)) {} - void MHA(int depth, NDArray q, NDArray qo_indptr, NDArray pages, NDArray page_indptr, - NDArray page_indices, NDArray length_info, NDArray q_rope_position, - NDArray k_rope_pos_offset, bool causal, RoPEMode rope_mode, double rotary_scale, - double rotary_theta, double sm_scale, NDArray attn_output, NDArray attn_lse, + void MHA(int depth, Tensor q, Tensor qo_indptr, Tensor pages, Tensor page_indptr, + Tensor page_indices, Tensor length_info, Tensor q_rope_position, + Tensor k_rope_pos_offset, bool causal, RoPEMode rope_mode, double rotary_scale, + double rotary_theta, double sm_scale, Tensor attn_output, Tensor attn_lse, TVMStreamHandle compute_stream) final { auto [float_workspace_buffer, int_workspace_buffer, page_locked_int_workspace_buffer, plan_info_vec] = cached_buffers_[depth]; @@ -145,9 +145,9 @@ class FlashInferPagedPrefillFunc : public PagedPrefillFunc { /*rope_rcp_theta=*/rope_rcp_theta, compute_stream); } - void MLA(int depth, NDArray q, NDArray qo_indptr, NDArray pages, NDArray page_indptr, - NDArray page_indices, NDArray 
length_info, bool causal, double sm_scale, - NDArray attn_output, NDArray attn_lse, TVMStreamHandle compute_stream) final { + void MLA(int depth, Tensor q, Tensor qo_indptr, Tensor pages, Tensor page_indptr, + Tensor page_indices, Tensor length_info, bool causal, double sm_scale, + Tensor attn_output, Tensor attn_lse, TVMStreamHandle compute_stream) final { auto [float_workspace_buffer, int_workspace_buffer, page_locked_int_workspace_buffer, plan_info_vec] = cached_buffers_[depth]; attn_func_(float_workspace_buffer, int_workspace_buffer, plan_info_vec, q, pages, page_indices, @@ -155,8 +155,8 @@ class FlashInferPagedPrefillFunc : public PagedPrefillFunc { /*num_heads=*/q->shape[1], /*page_size=*/pages->shape[1], sm_scale, compute_stream); } - void BeginForward(int depth, NDArray float_workspace_buffer, NDArray int_workspace_buffer, - NDArray page_locked_int_workspace_buffer, HostMemoryVector* qo_indptr, + void BeginForward(int depth, Tensor float_workspace_buffer, Tensor int_workspace_buffer, + Tensor page_locked_int_workspace_buffer, HostMemoryVector* qo_indptr, HostMemoryVector* page_indptr, HostMemoryVector* last_page_len, int64_t batch_size, int64_t total_qo_len, int64_t page_size, int64_t num_qo_heads, int64_t num_kv_heads, int64_t qk_head_dim, @@ -174,16 +174,15 @@ class FlashInferPagedPrefillFunc : public PagedPrefillFunc { // Todo(tvm-team): enable cuda graph plan_info_vec = plan_func_(float_workspace_buffer, int_workspace_buffer, page_locked_int_workspace_buffer, - qo_indptr->as_ndarray(), page_indptr->as_ndarray(), - IntTuple(std::move(kv_len)), total_qo_len, batch_size, num_qo_heads, - num_kv_heads, page_size, + qo_indptr->as_tensor(), page_indptr->as_tensor(), IntTuple(std::move(kv_len)), + total_qo_len, batch_size, num_qo_heads, num_kv_heads, page_size, /*enable_cuda_graph=*/false, qk_head_dim, v_head_dim, causal, copy_stream) .cast(); } else if (attn_kind == AttnKind::kMLA) { plan_info_vec = plan_func_(float_workspace_buffer, int_workspace_buffer, page_locked_int_workspace_buffer, - qo_indptr->as_ndarray(), page_indptr->as_ndarray(), - IntTuple(std::move(kv_len)), num_qo_heads, v_head_dim, causal, copy_stream) + qo_indptr->as_tensor(), page_indptr->as_tensor(), IntTuple(std::move(kv_len)), + num_qo_heads, v_head_dim, causal, copy_stream) .cast(); } @@ -197,7 +196,7 @@ class FlashInferPagedPrefillFunc : public PagedPrefillFunc { private: ffi::Function plan_func_; - std::vector> cached_buffers_; + std::vector> cached_buffers_; }; /*! \brief The ragged prefill attention function base class. 
*/ @@ -207,15 +206,15 @@ class RaggedPrefillFunc : public AttnBackendFunc { AttnBackendKind backend_kind) : AttnBackendFunc(std::move(attn_func), attn_kind, backend_kind) {} - virtual void MHA(NDArray q, NDArray k, NDArray v, NDArray qo_indptr, NDArray kv_indptr, - NDArray q_rope_position, NDArray k_rope_pos_offset, bool causal, + virtual void MHA(Tensor q, Tensor k, Tensor v, Tensor qo_indptr, Tensor kv_indptr, + Tensor q_rope_position, Tensor k_rope_pos_offset, bool causal, RoPEMode rope_mode, double rotary_scale, double rotary_theta, double sm_scale, - NDArray attn_output, NDArray attn_lse, TVMStreamHandle compute_stream) { + Tensor attn_output, Tensor attn_lse, TVMStreamHandle compute_stream) { LOG(FATAL) << "MHA computation is not supported by the current backend"; } - virtual void BeginForward(NDArray float_workspace_buffer, NDArray int_workspace_buffer, - NDArray page_locked_int_workspace_buffer, HostMemoryVector* qo_indptr, + virtual void BeginForward(Tensor float_workspace_buffer, Tensor int_workspace_buffer, + Tensor page_locked_int_workspace_buffer, HostMemoryVector* qo_indptr, HostMemoryVector* kv_indptr, int64_t batch_size, int64_t total_qo_len, int64_t num_qo_heads, int64_t num_kv_heads, int64_t qk_head_dim, int64_t v_head_dim, bool causal, TVMStreamHandle copy_stream) { @@ -229,10 +228,10 @@ class TIRRaggedPrefillFunc : public RaggedPrefillFunc { explicit TIRRaggedPrefillFunc(ffi::Function attn_func, AttnKind attn_kind) : RaggedPrefillFunc(std::move(attn_func), attn_kind, AttnBackendKind::kTIR) {} - void MHA(NDArray q, NDArray k, NDArray v, NDArray qo_indptr, NDArray kv_indptr, - NDArray q_rope_position, NDArray k_rope_pos_offset, bool causal, RoPEMode rope_mode, - double rotary_scale, double rotary_theta, double sm_scale, NDArray attn_output, - NDArray attn_lse, TVMStreamHandle compute_stream) final { + void MHA(Tensor q, Tensor k, Tensor v, Tensor qo_indptr, Tensor kv_indptr, Tensor q_rope_position, + Tensor k_rope_pos_offset, bool causal, RoPEMode rope_mode, double rotary_scale, + double rotary_theta, double sm_scale, Tensor attn_output, Tensor attn_lse, + TVMStreamHandle compute_stream) final { attn_func_(q, qo_indptr, k, v, kv_indptr, q_rope_position, k_rope_pos_offset, attn_output, attn_lse, static_cast(causal), /*rotary_mode=*/static_cast(rope_mode == RoPEMode::kInline), rotary_scale, @@ -248,10 +247,10 @@ class FlashInferRaggedPrefillFunc : public RaggedPrefillFunc { : RaggedPrefillFunc(std::move(attn_func), attn_kind, AttnBackendKind::kFlashInfer), plan_func_(std::move(plan_func)) {} - void MHA(NDArray q, NDArray k, NDArray v, NDArray qo_indptr, NDArray kv_indptr, - NDArray q_rope_position, NDArray k_rope_pos_offset, bool causal, RoPEMode rope_mode, - double rotary_scale, double rotary_theta, double sm_scale, NDArray attn_output, - NDArray attn_lse, TVMStreamHandle compute_stream) final { + void MHA(Tensor q, Tensor k, Tensor v, Tensor qo_indptr, Tensor kv_indptr, Tensor q_rope_position, + Tensor k_rope_pos_offset, bool causal, RoPEMode rope_mode, double rotary_scale, + double rotary_theta, double sm_scale, Tensor attn_output, Tensor attn_lse, + TVMStreamHandle compute_stream) final { double rope_rcp_scale = 1 / rotary_scale; double rope_rcp_theta = 1 / rotary_theta; attn_func_(float_workspace_buffer_, int_workspace_buffer_, plan_info_vec_, q, k, v, qo_indptr, @@ -263,8 +262,8 @@ class FlashInferRaggedPrefillFunc : public RaggedPrefillFunc { /*rope_rcp_theta=*/rope_rcp_theta, compute_stream); } - void BeginForward(NDArray float_workspace_buffer, NDArray 
int_workspace_buffer, - NDArray page_locked_int_workspace_buffer, HostMemoryVector* qo_indptr, + void BeginForward(Tensor float_workspace_buffer, Tensor int_workspace_buffer, + Tensor page_locked_int_workspace_buffer, HostMemoryVector* qo_indptr, HostMemoryVector* kv_indptr, int64_t batch_size, int64_t total_qo_len, int64_t num_qo_heads, int64_t num_kv_heads, int64_t qk_head_dim, int64_t v_head_dim, bool causal, TVMStreamHandle copy_stream) final { @@ -279,7 +278,7 @@ class FlashInferRaggedPrefillFunc : public RaggedPrefillFunc { page_locked_int_workspace_buffer_ = page_locked_int_workspace_buffer; plan_info_vec_ = plan_func_(float_workspace_buffer, int_workspace_buffer, page_locked_int_workspace_buffer, - qo_indptr->as_ndarray(), kv_indptr->as_ndarray(), IntTuple(std::move(kv_len)), + qo_indptr->as_tensor(), kv_indptr->as_tensor(), IntTuple(std::move(kv_len)), total_qo_len, batch_size, num_qo_heads, num_kv_heads, /*page_size=*/1, /*enable_cuda_graph=*/false, qk_head_dim, v_head_dim, causal, copy_stream) .cast(); @@ -287,9 +286,9 @@ class FlashInferRaggedPrefillFunc : public RaggedPrefillFunc { private: ffi::Function plan_func_; - NDArray float_workspace_buffer_; - NDArray int_workspace_buffer_; - NDArray page_locked_int_workspace_buffer_; + Tensor float_workspace_buffer_; + Tensor int_workspace_buffer_; + Tensor page_locked_int_workspace_buffer_; IntTuple plan_info_vec_; }; @@ -300,21 +299,21 @@ class PagedDecodeFunc : public AttnBackendFunc { AttnBackendKind backend_kind) : AttnBackendFunc(std::move(attn_func), attn_kind, backend_kind) {} - virtual void MHA(int depth, NDArray q, NDArray pages, NDArray page_indptr, NDArray page_indices, - NDArray length_info, NDArray k_rope_pos_offset, NDArray q_rope_position, + virtual void MHA(int depth, Tensor q, Tensor pages, Tensor page_indptr, Tensor page_indices, + Tensor length_info, Tensor k_rope_pos_offset, Tensor q_rope_position, RoPEMode rope_mode, double rotary_scale, double rotary_theta, double sm_scale, - NDArray attn_output, NDArray attn_lse, TVMStreamHandle compute_stream) { + Tensor attn_output, Tensor attn_lse, TVMStreamHandle compute_stream) { LOG(FATAL) << "MHA computation is not supported by the current backend"; } - virtual void MLA(int depth, NDArray q, NDArray pages, NDArray page_indptr, NDArray page_indices, - NDArray length_info, double sm_scale, NDArray attn_output, NDArray attn_lse, + virtual void MLA(int depth, Tensor q, Tensor pages, Tensor page_indptr, Tensor page_indices, + Tensor length_info, double sm_scale, Tensor attn_output, Tensor attn_lse, TVMStreamHandle compute_stream) { LOG(FATAL) << "MLA computation is not supported by the current backend"; } - virtual void BeginForward(int depth, NDArray float_workspace_buffer, NDArray int_workspace_buffer, - NDArray page_locked_int_workspace_buffer, HostMemoryVector* page_indptr, + virtual void BeginForward(int depth, Tensor float_workspace_buffer, Tensor int_workspace_buffer, + Tensor page_locked_int_workspace_buffer, HostMemoryVector* page_indptr, int64_t batch_size, int64_t page_size, int64_t num_qo_heads, int64_t num_kv_heads, int64_t qk_head_dim, int64_t v_head_dim, RoPEMode rope_mode, DataType q_dtype, DataType kv_dtype, @@ -329,18 +328,18 @@ class TIRPagedDecodeFunc : public PagedDecodeFunc { explicit TIRPagedDecodeFunc(ffi::Function attn_func, AttnKind attn_kind) : PagedDecodeFunc(std::move(attn_func), attn_kind, AttnBackendKind::kTIR) {} - void MHA(int depth, NDArray q, NDArray pages, NDArray page_indptr, NDArray page_indices, - NDArray length_info, NDArray 
k_rope_pos_offset, NDArray q_rope_position, - RoPEMode rope_mode, double rotary_scale, double rotary_theta, double sm_scale, - NDArray attn_output, NDArray attn_lse, TVMStreamHandle compute_stream) final { + void MHA(int depth, Tensor q, Tensor pages, Tensor page_indptr, Tensor page_indices, + Tensor length_info, Tensor k_rope_pos_offset, Tensor q_rope_position, RoPEMode rope_mode, + double rotary_scale, double rotary_theta, double sm_scale, Tensor attn_output, + Tensor attn_lse, TVMStreamHandle compute_stream) final { attn_func_(q, pages, page_indptr, page_indices, length_info, k_rope_pos_offset, q_rope_position, attn_output, attn_lse, /*rotary_mode=*/static_cast(rope_mode == RoPEMode::kInline), rotary_scale, rotary_theta, sm_scale); } - void MLA(int depth, NDArray q, NDArray pages, NDArray page_indptr, NDArray page_indices, - NDArray length_info, double sm_scale, NDArray attn_output, NDArray attn_lse, + void MLA(int depth, Tensor q, Tensor pages, Tensor page_indptr, Tensor page_indices, + Tensor length_info, double sm_scale, Tensor attn_output, Tensor attn_lse, TVMStreamHandle compute_stream) final { attn_func_(q, pages, page_indptr, page_indices, length_info, attn_output, attn_lse, sm_scale); } @@ -354,10 +353,10 @@ class FlashInferPagedDecodeFunc : public PagedDecodeFunc { : PagedDecodeFunc(std::move(attn_func), attn_kind, AttnBackendKind::kFlashInfer), plan_func_(std::move(plan_func)) {} - void MHA(int depth, NDArray q, NDArray pages, NDArray page_indptr, NDArray page_indices, - NDArray length_info, NDArray k_rope_pos_offset, NDArray q_rope_position, - RoPEMode rope_mode, double rotary_scale, double rotary_theta, double sm_scale, - NDArray attn_output, NDArray attn_lse, TVMStreamHandle compute_stream) final { + void MHA(int depth, Tensor q, Tensor pages, Tensor page_indptr, Tensor page_indices, + Tensor length_info, Tensor k_rope_pos_offset, Tensor q_rope_position, RoPEMode rope_mode, + double rotary_scale, double rotary_theta, double sm_scale, Tensor attn_output, + Tensor attn_lse, TVMStreamHandle compute_stream) final { auto [float_workspace_buffer, int_workspace_buffer, page_locked_int_workspace_buffer, plan_info_vec] = cached_buffers_[depth]; double rope_rcp_scale = 1 / rotary_scale; @@ -369,8 +368,8 @@ class FlashInferPagedDecodeFunc : public PagedDecodeFunc { /*rope_rcp_theta=*/rope_rcp_theta, compute_stream); } - void BeginForward(int depth, NDArray float_workspace_buffer, NDArray int_workspace_buffer, - NDArray page_locked_int_workspace_buffer, HostMemoryVector* page_indptr, + void BeginForward(int depth, Tensor float_workspace_buffer, Tensor int_workspace_buffer, + Tensor page_locked_int_workspace_buffer, HostMemoryVector* page_indptr, int64_t batch_size, int64_t page_size, int64_t num_qo_heads, int64_t num_kv_heads, int64_t qk_head_dim, int64_t v_head_dim, RoPEMode rope_mode, DataType q_dtype, DataType kv_dtype, @@ -378,7 +377,7 @@ class FlashInferPagedDecodeFunc : public PagedDecodeFunc { // Todo(tvm-team): enable cuda graph IntTuple plan_info_vec = plan_func_(float_workspace_buffer, int_workspace_buffer, page_locked_int_workspace_buffer, - page_indptr->as_ndarray(), batch_size, num_qo_heads, num_kv_heads, page_size, + page_indptr->as_tensor(), batch_size, num_qo_heads, num_kv_heads, page_size, /*enable_cuda_graph=*/false, static_cast(rope_mode == RoPEMode::kInline), /*window_left=*/-1, qk_head_dim, v_head_dim, q_dtype, kv_dtype, copy_stream) @@ -394,7 +393,7 @@ class FlashInferPagedDecodeFunc : public PagedDecodeFunc { private: ffi::Function plan_func_; - std::vector> 
cached_buffers_; + std::vector> cached_buffers_; }; /*! \brief The paged prefill with tree mask attention function base class. */ @@ -404,22 +403,22 @@ class PagedPrefillTreeMaskFunc : public AttnBackendFunc { AttnBackendKind backend_kind) : AttnBackendFunc(std::move(attn_func), attn_kind, backend_kind) {} - virtual void MHA(NDArray q, NDArray qo_indptr, NDArray pages, NDArray page_indptr, - NDArray page_indices, NDArray length_info, NDArray k_rope_pos_offset, - NDArray q_rope_position, NDArray tree_attn_mn_indptr, NDArray tree_attn_mask, + virtual void MHA(Tensor q, Tensor qo_indptr, Tensor pages, Tensor page_indptr, + Tensor page_indices, Tensor length_info, Tensor k_rope_pos_offset, + Tensor q_rope_position, Tensor tree_attn_mn_indptr, Tensor tree_attn_mask, RoPEMode rope_mode, double rotary_scale, double rotary_theta, double sm_scale, - NDArray attn_output, NDArray attn_lse, TVMStreamHandle compute_stream) { + Tensor attn_output, Tensor attn_lse, TVMStreamHandle compute_stream) { LOG(FATAL) << "MHA computation is not supported by the current backend"; } - virtual void MLA(NDArray q, NDArray qo_indptr, NDArray pages, NDArray page_indptr, - NDArray page_indices, NDArray length_info, NDArray tree_attn_mn_indptr, - NDArray tree_attn_mask, double sm_scale, NDArray attn_output, NDArray attn_lse, + virtual void MLA(Tensor q, Tensor qo_indptr, Tensor pages, Tensor page_indptr, + Tensor page_indices, Tensor length_info, Tensor tree_attn_mn_indptr, + Tensor tree_attn_mask, double sm_scale, Tensor attn_output, Tensor attn_lse, TVMStreamHandle compute_stream) { LOG(FATAL) << "MLA computation is not supported by the current backend"; } - virtual void BeginForward(NDArray temp_float_attn_workspace, NDArray temp_int_attn_workspace, + virtual void BeginForward(Tensor temp_float_attn_workspace, Tensor temp_int_attn_workspace, HostMemoryVector* page_indptr, HostMemoryVector* last_page_len, HostMemoryVector* qo_indptr, int64_t batch_size, int64_t page_size, int64_t num_qo_heads, int64_t num_kv_heads, int64_t qk_head_dim, @@ -434,11 +433,11 @@ class TIRPagedPrefillTreeMaskFunc : public PagedPrefillTreeMaskFunc { explicit TIRPagedPrefillTreeMaskFunc(ffi::Function attn_func, AttnKind attn_kind) : PagedPrefillTreeMaskFunc(std::move(attn_func), attn_kind, AttnBackendKind::kTIR) {} - void MHA(NDArray q, NDArray qo_indptr, NDArray pages, NDArray page_indptr, NDArray page_indices, - NDArray length_info, NDArray k_rope_pos_offset, NDArray q_rope_position, - NDArray tree_attn_mn_indptr, NDArray tree_attn_mask, RoPEMode rope_mode, - double rotary_scale, double rotary_theta, double sm_scale, NDArray attn_output, - NDArray attn_lse, TVMStreamHandle compute_stream) final { + void MHA(Tensor q, Tensor qo_indptr, Tensor pages, Tensor page_indptr, Tensor page_indices, + Tensor length_info, Tensor k_rope_pos_offset, Tensor q_rope_position, + Tensor tree_attn_mn_indptr, Tensor tree_attn_mask, RoPEMode rope_mode, + double rotary_scale, double rotary_theta, double sm_scale, Tensor attn_output, + Tensor attn_lse, TVMStreamHandle compute_stream) final { attn_func_(q, qo_indptr, pages, page_indptr, page_indices, length_info, k_rope_pos_offset, q_rope_position, attn_output, attn_lse, /*rotary_mode=*/static_cast(rope_mode == RoPEMode::kInline), rotary_scale, @@ -453,21 +452,20 @@ class RaggedPrefillTreeMaskFunc : public AttnBackendFunc { AttnBackendKind backend_kind) : AttnBackendFunc(std::move(attn_func), attn_kind, backend_kind) {} - virtual void MHA(NDArray q, NDArray k, NDArray v, NDArray qo_indptr, NDArray kv_indptr, - 
NDArray q_rope_position, NDArray tree_attn_mn_indptr, NDArray tree_attn_mask, + virtual void MHA(Tensor q, Tensor k, Tensor v, Tensor qo_indptr, Tensor kv_indptr, + Tensor q_rope_position, Tensor tree_attn_mn_indptr, Tensor tree_attn_mask, RoPEMode rope_mode, double rotary_scale, double rotary_theta, double sm_scale, - NDArray attn_output, NDArray attn_lse, TVMStreamHandle compute_stream) { + Tensor attn_output, Tensor attn_lse, TVMStreamHandle compute_stream) { LOG(FATAL) << "MHA computation is not supported by the current backend"; } - virtual void MLA(NDArray q, NDArray compressed_kv, NDArray k_pe, NDArray qo_indptr, - NDArray kv_indptr, NDArray tree_attn_mn_indptr, NDArray tree_attn_mask, - double sm_scale, NDArray attn_output, NDArray attn_lse, - TVMStreamHandle compute_stream) { + virtual void MLA(Tensor q, Tensor compressed_kv, Tensor k_pe, Tensor qo_indptr, Tensor kv_indptr, + Tensor tree_attn_mn_indptr, Tensor tree_attn_mask, double sm_scale, + Tensor attn_output, Tensor attn_lse, TVMStreamHandle compute_stream) { LOG(FATAL) << "MLA computation is not supported by the current backend"; } - virtual void BeginForward(NDArray temp_float_attn_workspace, NDArray temp_int_attn_workspace, + virtual void BeginForward(Tensor temp_float_attn_workspace, Tensor temp_int_attn_workspace, HostMemoryVector* page_indptr, HostMemoryVector* last_page_len, HostMemoryVector* qo_indptr, int64_t batch_size, int64_t page_size, int64_t num_qo_heads, int64_t num_kv_heads, int64_t qk_head_dim, @@ -482,10 +480,10 @@ class TIRRaggedPrefillTreeMaskFunc : public RaggedPrefillTreeMaskFunc { explicit TIRRaggedPrefillTreeMaskFunc(ffi::Function attn_func, AttnKind attn_kind) : RaggedPrefillTreeMaskFunc(std::move(attn_func), attn_kind, AttnBackendKind::kTIR) {} - void MHA(NDArray q, NDArray k, NDArray v, NDArray qo_indptr, NDArray kv_indptr, - NDArray q_rope_position, NDArray tree_attn_mn_indptr, NDArray tree_attn_mask, - RoPEMode rope_mode, double rotary_scale, double rotary_theta, double sm_scale, - NDArray attn_output, NDArray attn_lse, TVMStreamHandle compute_stream) final { + void MHA(Tensor q, Tensor k, Tensor v, Tensor qo_indptr, Tensor kv_indptr, Tensor q_rope_position, + Tensor tree_attn_mn_indptr, Tensor tree_attn_mask, RoPEMode rope_mode, + double rotary_scale, double rotary_theta, double sm_scale, Tensor attn_output, + Tensor attn_lse, TVMStreamHandle compute_stream) final { attn_func_(q, qo_indptr, k, v, kv_indptr, q_rope_position, tree_attn_mn_indptr, tree_attn_mask, attn_output, attn_lse, /*rotary_mode=*/static_cast(rope_mode == RoPEMode::kInline), rotary_scale, diff --git a/src/runtime/vm/attn_utils.h b/src/runtime/vm/attn_utils.h index 290ca02653d2..5eff9452c5b9 100644 --- a/src/runtime/vm/attn_utils.h +++ b/src/runtime/vm/attn_utils.h @@ -24,7 +24,7 @@ #ifndef TVM_RUNTIME_VM_ATTN_UTILS_H_ #define TVM_RUNTIME_VM_ATTN_UTILS_H_ -#include +#include #include #include @@ -355,14 +355,14 @@ class HostMemoryVector { explicit HostMemoryVector(int64_t reserved_size, DLDataType dtype, Device device) : reserved_size_(reserved_size) { ICHECK(DataType(dtype) == DataType::Int(32)); - data_ = NDArray::Empty({reserved_size}, dtype, device); + data_ = Tensor::Empty({reserved_size}, dtype, device); } void push_back(int32_t value) { ICHECK_LE(current_size_, reserved_size_); if (current_size_ == reserved_size_) { reserved_size_ *= 2; - NDArray new_data = NDArray::Empty({reserved_size_}, data_->dtype, data_->device); + Tensor new_data = Tensor::Empty({reserved_size_}, data_->dtype, data_->device); 
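// The freshly allocated Tensor doubles the reserved capacity; the memcpy below moves the existing int32 contents across before data_ is repointed at the new buffer.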
std::memcpy(new_data->data, data_->data, current_size_ * DataType(data_->dtype).bytes()); data_ = new_data; } @@ -386,8 +386,8 @@ class HostMemoryVector { void clear() { current_size_ = 0; } - /*! \brief Return the vector as an NDArray. */ - NDArray as_ndarray() { return data_.CreateView({current_size_}, data_->dtype); } + /*! \brief Return the vector as an Tensor. */ + Tensor as_tensor() { return data_.CreateView({current_size_}, data_->dtype); } IntTuple as_int_tuple() const { std::vector values; @@ -401,7 +401,7 @@ class HostMemoryVector { private: int64_t reserved_size_ = 0; int64_t current_size_ = 0; - NDArray data_{nullptr}; + Tensor data_{nullptr}; }; /*! @@ -411,12 +411,12 @@ class HostMemoryVector { * * The core functions of this class is `CopyXXXAsync` and `CommitAttnAuxDataCopy`. * `CopyXXXAsync` takes the input data on CPU host, and copy the input data - * to GPU in an asynchronous way, and returns the NDArray view of the data + * to GPU in an asynchronous way, and returns the Tensor view of the data * on GPU device. * * Being asynchronous here means the `CopyXXXAsync` function may not perform * data copy from CPU to GPU at the time of being called. Therefore, the - * returned NDArray view may have wrong result, until `CommitAttnAuxDataCopy` is + * returned Tensor view may have wrong result, until `CommitAttnAuxDataCopy` is * explicitly invoked and the data copy stream is synchronized. * * We design this manager class in order to reduce the data copy overhead. @@ -436,16 +436,16 @@ class PagedKVCacheAuxDataManager { /*! \brief Reset the attention auxiliary data status of copy manager. */ virtual void ResetAttnAuxDataCopy() = 0; /*! \brief Copy the indptr array of append lengths after coalescing. (see GetChunkedBlockIds) */ - virtual NDArray CopyQOIndptrOnDepthAsync(HostMemoryVector* data, int depth) = 0; + virtual Tensor CopyQOIndptrOnDepthAsync(HostMemoryVector* data, int depth) = 0; /*! \brief Copy the indptr array of page table. */ - virtual NDArray CopyPageIndptrOnDepthAsync(HostMemoryVector* data, int depth) = 0; + virtual Tensor CopyPageIndptrOnDepthAsync(HostMemoryVector* data, int depth) = 0; /*! \brief Copy the indices array of page table. */ - virtual NDArray CopyPageIndicesOnDepthAsync(HostMemoryVector* data, int depth) = 0; + virtual Tensor CopyPageIndicesOnDepthAsync(HostMemoryVector* data, int depth) = 0; /*! \brief Copy the array of KV slot number used in the last page of the seq. */ - virtual NDArray CopyLastPageLenOnDepthAsync(HostMemoryVector* data, int depth) = 0; + virtual Tensor CopyLastPageLenOnDepthAsync(HostMemoryVector* data, int depth) = 0; /*! * \brief Copy the length information of the sequences. - * Each NDArray is in shape `(3, n)`. "n" is the number of sequences. + * Each Tensor is in shape `(3, n)`. "n" is the number of sequences. * For a sequence "i", location * - "(0, i)" is the number of KV slots used in the last page of the seq ("last_page_len"), * - "(1, i)" is the starting offset of the sliding window in the seq, @@ -453,51 +453,51 @@ class PagedKVCacheAuxDataManager { * \note When sliding window is not enabled, only the * "last_page_len" (a.k.a., the first "n" elements) will be effectively used. */ - virtual NDArray CopyLengthInfoOnDepthAsync(HostMemoryVector* last_page_len, - HostMemoryVector* sliding_window_offset, - HostMemoryVector* sink_size, int depth) = 0; + virtual Tensor CopyLengthInfoOnDepthAsync(HostMemoryVector* last_page_len, + HostMemoryVector* sliding_window_offset, + HostMemoryVector* sink_size, int depth) = 0; /*! 
\brief Copy the k position offset of applying RoPE for each sequence. */ - virtual NDArray CopyKRoPEPosOffsetOnDepthAsync(HostMemoryVector* data, int depth) = 0; + virtual Tensor CopyKRoPEPosOffsetOnDepthAsync(HostMemoryVector* data, int depth) = 0; /*! * \brief Copy the append length indptr array on device. * \note Since the Q/K/V data may have raggedness in terms of lengths, * we represent the append lengths in CSR format. */ - virtual NDArray CopyCurAppendLengthIndptrAsync(HostMemoryVector* data) = 0; + virtual Tensor CopyCurAppendLengthIndptrAsync(HostMemoryVector* data) = 0; /*! \brief Copy the k position offset of applying RoPE for each sequence. */ - virtual NDArray CopyKRaggedRoPEPosOffsetAsync(HostMemoryVector* data) = 0; + virtual Tensor CopyKRaggedRoPEPosOffsetAsync(HostMemoryVector* data) = 0; /*! \brief Copy the q position mapping of applying RoPE for each sequence. */ - virtual NDArray CopyQRoPEPosMapAsync(HostMemoryVector* data) = 0; + virtual Tensor CopyQRoPEPosMapAsync(HostMemoryVector* data) = 0; /*! * \brief Copy the corresponding position in global KV cache (pages) * for each position along the length dimension of K/V data when * appending new K/V data. */ - virtual NDArray CopyAppendPositionMapAsync(HostMemoryVector* data) = 0; + virtual Tensor CopyAppendPositionMapAsync(HostMemoryVector* data) = 0; /*! \brief Copy the remote position map for KV transfer. */ - virtual NDArray CopyKVTransferRemotePositionMapAsync(HostMemoryVector* data) = 0; + virtual Tensor CopyKVTransferRemotePositionMapAsync(HostMemoryVector* data) = 0; /*! \brief Copy the receiver id for KV transfer. */ - virtual NDArray CopyKVTransferRecverIDAsync(HostMemoryVector* data) = 0; + virtual Tensor CopyKVTransferRecverIDAsync(HostMemoryVector* data) = 0; /*! \brief Copy the local position map for KV page-to-page transfer. */ - virtual NDArray CopyKVTransferPage2PageLocalPositionMapAsync(HostMemoryVector* data) = 0; + virtual Tensor CopyKVTransferPage2PageLocalPositionMapAsync(HostMemoryVector* data) = 0; /*! \brief Copy the remote position map for KV page-to-page transfer. */ - virtual NDArray CopyKVTransferPage2PageRemotePositionMapAsync(HostMemoryVector* data) = 0; + virtual Tensor CopyKVTransferPage2PageRemotePositionMapAsync(HostMemoryVector* data) = 0; /*! \brief Copy the receiver id for KV page-to-page transfer. */ - virtual NDArray CopyKVTransferPage2PageRecverIDAsync(HostMemoryVector* data) = 0; + virtual Tensor CopyKVTransferPage2PageRecverIDAsync(HostMemoryVector* data) = 0; /*! \brief Copy the tree attention mask. */ - virtual NDArray CopyTreeAttnMaskOnDepthAsync(HostMemoryVector* data, int depth) = 0; + virtual Tensor CopyTreeAttnMaskOnDepthAsync(HostMemoryVector* data, int depth) = 0; /*! \brief Copy the mn indptr of the tree attention mask. */ - virtual NDArray CopyTreeAttnMNIndptrOnDepthAsync(HostMemoryVector* data, int depth) = 0; + virtual Tensor CopyTreeAttnMNIndptrOnDepthAsync(HostMemoryVector* data, int depth) = 0; /*! \brief Commit all the attention auxiliary data copy operations since the last commit. */ virtual void CommitAttnAuxDataCopy() = 0; /*! \brief Reset the compact KV auxiliary data status of copy manager. */ virtual void ResetCompactKVAuxDataCopy() = 0; /*! \brief Copy the length indptr array of KV data copy for each sequence. */ - virtual NDArray CopyCommitLengthIndptrAsync(HostMemoryVector* data) = 0; + virtual Tensor CopyCommitLengthIndptrAsync(HostMemoryVector* data) = 0; /*! \brief Copy the src/dst position arrays for each sequence. 
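The copy-then-commit contract described above is easiest to see in a short usage sketch. Here, the manager pointer mgr, the host device host_dev, the reserved size of 64, and depth 0 are illustrative assumptions rather than code from this patch; only the method names and signatures come from PagedKVCacheAuxDataManager and HostMemoryVector as shown above.

// Stage host-side indptr values, launch the asynchronous copy, then commit it.
void StageQOIndptr(PagedKVCacheAuxDataManager* mgr, Device host_dev) {
  HostMemoryVector qo_indptr(/*reserved_size=*/64, DLDataType{kDLInt, 32, 1}, host_dev);
  qo_indptr.push_back(0);
  qo_indptr.push_back(16);  // one sequence contributing 16 query tokens

  mgr->ResetAttnAuxDataCopy();
  // The returned Tensor is a device-side view whose contents are undefined until the commit.
  Tensor qo_indptr_dev = mgr->CopyQOIndptrOnDepthAsync(&qo_indptr, /*depth=*/0);
  mgr->CommitAttnAuxDataCopy();
  // Once the copy stream is synchronized, qo_indptr_dev holds {0, 16} on the device.
  (void)qo_indptr_dev;
}

Batching every per-step host write into a single committed transfer is what lets the manager keep the copy overhead low, as noted in the class comment above.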
*/ - virtual NDArray CopyCommitSrcDstPosInPageTableAsync(HostMemoryVector* src_data, - HostMemoryVector* dst_data) = 0; + virtual Tensor CopyCommitSrcDstPosInPageTableAsync(HostMemoryVector* src_data, + HostMemoryVector* dst_data) = 0; /*! \brief Commit all the compact KV auxiliary data copy operations since the last commit. */ virtual void CommitCompactKVAuxDataCopy() = 0; @@ -525,144 +525,144 @@ class PlainPagedKVCacheAuxDataManager : public PagedKVCacheAuxDataManager { : PagedKVCacheAuxDataManager(dtype_aux, device, preferred_host_device, copy_stream) { for (int d = 0; d < kPagedKVCacheMaxBlockDepth; ++d) { qo_indptr_on_depths_device_.push_back( - NDArray::Empty({reserved_num_seqs + 1}, dtype_aux_, device)); + Tensor::Empty({reserved_num_seqs + 1}, dtype_aux_, device)); page_indptr_on_depths_device_.push_back( - NDArray::Empty({reserved_num_seqs + 1}, dtype_aux_, device)); + Tensor::Empty({reserved_num_seqs + 1}, dtype_aux_, device)); page_indices_on_depths_device_.push_back( - NDArray::Empty({num_total_pages}, dtype_aux_, device)); + Tensor::Empty({num_total_pages}, dtype_aux_, device)); length_info_on_depths_device_.push_back( - NDArray::Empty({3, reserved_num_seqs}, dtype_aux_, device)); + Tensor::Empty({3, reserved_num_seqs}, dtype_aux_, device)); k_rope_pos_offset_on_depths_device_.push_back( - NDArray::Empty({reserved_num_seqs}, dtype_aux_, device)); - tree_attn_mask_device_.push_back(NDArray::Empty( + Tensor::Empty({reserved_num_seqs}, dtype_aux_, device)); + tree_attn_mask_device_.push_back(Tensor::Empty( {kTreeAttnMaxTreeSize * kTreeAttnMaxTreeSize * reserved_num_seqs}, dtype_aux_, device)); tree_attn_mn_indptr_device_.push_back( - NDArray::Empty({reserved_num_seqs + 1}, dtype_aux_, device)); + Tensor::Empty({reserved_num_seqs + 1}, dtype_aux_, device)); } - cur_append_length_indptr_device_ = NDArray::Empty({reserved_num_seqs + 1}, dtype_aux_, device); - k_ragged_rope_pos_offset_device_ = NDArray::Empty({reserved_num_seqs}, dtype_aux_, device); - q_rope_position_map_device_ = NDArray::Empty({prefill_chunk_size}, dtype_aux_, device); - append_position_map_device_ = NDArray::Empty({prefill_chunk_size}, dtype_aux_, device); + cur_append_length_indptr_device_ = Tensor::Empty({reserved_num_seqs + 1}, dtype_aux_, device); + k_ragged_rope_pos_offset_device_ = Tensor::Empty({reserved_num_seqs}, dtype_aux_, device); + q_rope_position_map_device_ = Tensor::Empty({prefill_chunk_size}, dtype_aux_, device); + append_position_map_device_ = Tensor::Empty({prefill_chunk_size}, dtype_aux_, device); kv_transfer_remote_position_map_device = - NDArray::Empty({prefill_chunk_size}, dtype_aux_, device); - kv_transfer_recver_id_device = NDArray::Empty({prefill_chunk_size}, dtype_aux_, device); + Tensor::Empty({prefill_chunk_size}, dtype_aux_, device); + kv_transfer_recver_id_device = Tensor::Empty({prefill_chunk_size}, dtype_aux_, device); kv_transfer_page_to_page_local_position_map_device = kv_transfer_page_to_page_remote_position_map_device = - NDArray::Empty({prefill_chunk_size}, dtype_aux_, device); + Tensor::Empty({prefill_chunk_size}, dtype_aux_, device); kv_transfer_page_to_page_recver_id_device = - NDArray::Empty({prefill_chunk_size}, dtype_aux_, device); - commit_copy_length_indptr_device_ = NDArray::Empty({reserved_num_seqs + 1}, dtype_aux_, device); + Tensor::Empty({prefill_chunk_size}, dtype_aux_, device); + commit_copy_length_indptr_device_ = Tensor::Empty({reserved_num_seqs + 1}, dtype_aux_, device); commit_copy_src_dst_pos_in_page_table_device_ = - NDArray::Empty({2, 
std::min(kTreeAttnMaxTreeSize * reserved_num_seqs, prefill_chunk_size)}, - dtype_aux_, device); + Tensor::Empty({2, std::min(kTreeAttnMaxTreeSize * reserved_num_seqs, prefill_chunk_size)}, + dtype_aux_, device); } // The reset of the plain auxiliary data manager is no-op. void ResetAttnAuxDataCopy() final {} - NDArray CopyQOIndptrOnDepthAsync(HostMemoryVector* data, int depth) final { - NDArray view = qo_indptr_on_depths_device_[depth].CreateView( + Tensor CopyQOIndptrOnDepthAsync(HostMemoryVector* data, int depth) final { + Tensor view = qo_indptr_on_depths_device_[depth].CreateView( {static_cast(data->size())}, dtype_aux_); CopyVecDataToArray(view, data->data()); return view; } - NDArray CopyPageIndptrOnDepthAsync(HostMemoryVector* data, int depth) final { - NDArray view = page_indptr_on_depths_device_[depth].CreateView( + Tensor CopyPageIndptrOnDepthAsync(HostMemoryVector* data, int depth) final { + Tensor view = page_indptr_on_depths_device_[depth].CreateView( {static_cast(data->size())}, dtype_aux_); CopyVecDataToArray(view, data->data()); return view; } - NDArray CopyPageIndicesOnDepthAsync(HostMemoryVector* data, int depth) final { - NDArray view = page_indices_on_depths_device_[depth].CreateView( + Tensor CopyPageIndicesOnDepthAsync(HostMemoryVector* data, int depth) final { + Tensor view = page_indices_on_depths_device_[depth].CreateView( {static_cast(data->size())}, dtype_aux_); CopyVecDataToArray(view, data->data()); return view; } - NDArray CopyLastPageLenOnDepthAsync(HostMemoryVector* data, int depth) final { - NDArray view = length_info_on_depths_device_[depth].CreateView( + Tensor CopyLastPageLenOnDepthAsync(HostMemoryVector* data, int depth) final { + Tensor view = length_info_on_depths_device_[depth].CreateView( {static_cast(data->size())}, dtype_aux_); CopyVecDataToArray(view, data->data()); return view; } - NDArray CopyKRoPEPosOffsetOnDepthAsync(HostMemoryVector* data, int depth) final { - NDArray view = k_rope_pos_offset_on_depths_device_[depth].CreateView( + Tensor CopyKRoPEPosOffsetOnDepthAsync(HostMemoryVector* data, int depth) final { + Tensor view = k_rope_pos_offset_on_depths_device_[depth].CreateView( {static_cast(data->size())}, dtype_aux_); CopyVecDataToArray(view, data->data()); return view; } - NDArray CopyCurAppendLengthIndptrAsync(HostMemoryVector* data) final { - NDArray view = cur_append_length_indptr_device_.CreateView({static_cast(data->size())}, - dtype_aux_); + Tensor CopyCurAppendLengthIndptrAsync(HostMemoryVector* data) final { + Tensor view = cur_append_length_indptr_device_.CreateView({static_cast(data->size())}, + dtype_aux_); CopyVecDataToArray(view, data->data()); return view; } - NDArray CopyKRaggedRoPEPosOffsetAsync(HostMemoryVector* data) final { - NDArray view = k_ragged_rope_pos_offset_device_.CreateView({static_cast(data->size())}, - dtype_aux_); + Tensor CopyKRaggedRoPEPosOffsetAsync(HostMemoryVector* data) final { + Tensor view = k_ragged_rope_pos_offset_device_.CreateView({static_cast(data->size())}, + dtype_aux_); CopyVecDataToArray(view, data->data()); return view; } - NDArray CopyQRoPEPosMapAsync(HostMemoryVector* data) final { - NDArray view = + Tensor CopyQRoPEPosMapAsync(HostMemoryVector* data) final { + Tensor view = q_rope_position_map_device_.CreateView({static_cast(data->size())}, dtype_aux_); CopyVecDataToArray(view, data->data()); return view; } - NDArray CopyAppendPositionMapAsync(HostMemoryVector* data) final { - NDArray view = + Tensor CopyAppendPositionMapAsync(HostMemoryVector* data) final { + Tensor view = 
append_position_map_device_.CreateView({static_cast(data->size())}, dtype_aux_); CopyVecDataToArray(view, data->data()); return view; } - NDArray CopyKVTransferRemotePositionMapAsync(HostMemoryVector* data) final { - NDArray view = kv_transfer_remote_position_map_device.CreateView( + Tensor CopyKVTransferRemotePositionMapAsync(HostMemoryVector* data) final { + Tensor view = kv_transfer_remote_position_map_device.CreateView( {static_cast(data->size())}, dtype_aux_); CopyVecDataToArray(view, data->data()); return view; } - NDArray CopyKVTransferRecverIDAsync(HostMemoryVector* data) final { - NDArray view = + Tensor CopyKVTransferRecverIDAsync(HostMemoryVector* data) final { + Tensor view = kv_transfer_recver_id_device.CreateView({static_cast(data->size())}, dtype_aux_); CopyVecDataToArray(view, data->data()); return view; } - NDArray CopyKVTransferPage2PageLocalPositionMapAsync(HostMemoryVector* data) final { - NDArray view = kv_transfer_page_to_page_local_position_map_device.CreateView( + Tensor CopyKVTransferPage2PageLocalPositionMapAsync(HostMemoryVector* data) final { + Tensor view = kv_transfer_page_to_page_local_position_map_device.CreateView( {static_cast(data->size())}, dtype_aux_); CopyVecDataToArray(view, data->data()); return view; } - NDArray CopyKVTransferPage2PageRemotePositionMapAsync(HostMemoryVector* data) final { - NDArray view = kv_transfer_page_to_page_remote_position_map_device.CreateView( + Tensor CopyKVTransferPage2PageRemotePositionMapAsync(HostMemoryVector* data) final { + Tensor view = kv_transfer_page_to_page_remote_position_map_device.CreateView( {static_cast(data->size())}, dtype_aux_); CopyVecDataToArray(view, data->data()); return view; } - NDArray CopyKVTransferPage2PageRecverIDAsync(HostMemoryVector* data) final { - NDArray view = kv_transfer_page_to_page_recver_id_device.CreateView( + Tensor CopyKVTransferPage2PageRecverIDAsync(HostMemoryVector* data) final { + Tensor view = kv_transfer_page_to_page_recver_id_device.CreateView( {static_cast(data->size())}, dtype_aux_); CopyVecDataToArray(view, data->data()); return view; } - NDArray CopyTreeAttnMaskOnDepthAsync(HostMemoryVector* data, int depth) final { - NDArray view = + Tensor CopyTreeAttnMaskOnDepthAsync(HostMemoryVector* data, int depth) final { + Tensor view = tree_attn_mask_device_[depth].CreateView({static_cast(data->size())}, dtype_aux_); CopyVecDataToArray(view, data->data()); return view; } - NDArray CopyTreeAttnMNIndptrOnDepthAsync(HostMemoryVector* data, int depth) final { - NDArray view = tree_attn_mn_indptr_device_[depth].CreateView( + Tensor CopyTreeAttnMNIndptrOnDepthAsync(HostMemoryVector* data, int depth) final { + Tensor view = tree_attn_mn_indptr_device_[depth].CreateView( {static_cast(data->size())}, dtype_aux_); CopyVecDataToArray(view, data->data()); return view; } - NDArray CopyLengthInfoOnDepthAsync(HostMemoryVector* last_page_len, - HostMemoryVector* sliding_window_offset, - HostMemoryVector* sink_size, int depth) final { + Tensor CopyLengthInfoOnDepthAsync(HostMemoryVector* last_page_len, + HostMemoryVector* sliding_window_offset, + HostMemoryVector* sink_size, int depth) final { int n_elem = last_page_len->size(); ICHECK_GT(n_elem, 0); - NDArray view = length_info_on_depths_device_[depth].CreateView({3, n_elem}, dtype_aux_); + Tensor view = length_info_on_depths_device_[depth].CreateView({3, n_elem}, dtype_aux_); ffi::Shape copy_shape{n_elem}; CopyVecDataToArray(view, last_page_len->data(), copy_shape); CopyVecDataToArray(view, sliding_window_offset->data(), copy_shape, @@ -678,18 
+678,17 @@ class PlainPagedKVCacheAuxDataManager : public PagedKVCacheAuxDataManager { // The reset of the plain auxiliary data manager is no-op. void ResetCompactKVAuxDataCopy() final {} - NDArray CopyCommitLengthIndptrAsync(HostMemoryVector* data) final { - NDArray view = commit_copy_length_indptr_device_.CreateView( - {static_cast(data->size())}, dtype_aux_); + Tensor CopyCommitLengthIndptrAsync(HostMemoryVector* data) final { + Tensor view = commit_copy_length_indptr_device_.CreateView({static_cast(data->size())}, + dtype_aux_); CopyVecDataToArray(view, data->data()); return view; } - NDArray CopyCommitSrcDstPosInPageTableAsync(HostMemoryVector* src_data, - HostMemoryVector* dst_data) final { + Tensor CopyCommitSrcDstPosInPageTableAsync(HostMemoryVector* src_data, + HostMemoryVector* dst_data) final { int n_elem = src_data->size(); ICHECK_GT(n_elem, 0); - NDArray view = - commit_copy_src_dst_pos_in_page_table_device_.CreateView({2, n_elem}, dtype_aux_); + Tensor view = commit_copy_src_dst_pos_in_page_table_device_.CreateView({2, n_elem}, dtype_aux_); ffi::Shape copy_shape{n_elem}; CopyVecDataToArray(view, src_data->data(), copy_shape); CopyVecDataToArray(view, dst_data->data(), copy_shape, @@ -702,11 +701,11 @@ class PlainPagedKVCacheAuxDataManager : public PagedKVCacheAuxDataManager { private: /*! - * \brief Copy a vector of data to the input NDArray. + * \brief Copy a vector of data to the input Tensor. * It optionally supports specifying the shape of copy and the element - * offset to the destination NDArray. + * offset to the destination Tensor. */ - void CopyVecDataToArray(NDArray array, int32_t* vec_data, + void CopyVecDataToArray(Tensor array, int32_t* vec_data, Optional shape = std::nullopt, int dst_elem_offset = 0) { if (array->shape[0] == 0) { return; @@ -743,27 +742,27 @@ class PlainPagedKVCacheAuxDataManager : public PagedKVCacheAuxDataManager { copy_src.shape = copy_dst.shape; copy_src.strides = nullptr; copy_src.byte_offset = 0; - NDArray::CopyFromTo(©_src, ©_dst, copy_stream_); + Tensor::CopyFromTo(©_src, ©_dst, copy_stream_); } - std::vector qo_indptr_on_depths_device_; - std::vector page_indptr_on_depths_device_; - std::vector page_indices_on_depths_device_; - std::vector length_info_on_depths_device_; - std::vector k_rope_pos_offset_on_depths_device_; - std::vector tree_attn_mask_device_; - std::vector tree_attn_mn_indptr_device_; - NDArray cur_append_length_indptr_device_; - NDArray k_ragged_rope_pos_offset_device_; - NDArray q_rope_position_map_device_; - NDArray append_position_map_device_; - NDArray kv_transfer_remote_position_map_device; - NDArray kv_transfer_recver_id_device; - NDArray kv_transfer_page_to_page_local_position_map_device; - NDArray kv_transfer_page_to_page_remote_position_map_device; - NDArray kv_transfer_page_to_page_recver_id_device; - NDArray commit_copy_length_indptr_device_; - NDArray commit_copy_src_dst_pos_in_page_table_device_; + std::vector qo_indptr_on_depths_device_; + std::vector page_indptr_on_depths_device_; + std::vector page_indices_on_depths_device_; + std::vector length_info_on_depths_device_; + std::vector k_rope_pos_offset_on_depths_device_; + std::vector tree_attn_mask_device_; + std::vector tree_attn_mn_indptr_device_; + Tensor cur_append_length_indptr_device_; + Tensor k_ragged_rope_pos_offset_device_; + Tensor q_rope_position_map_device_; + Tensor append_position_map_device_; + Tensor kv_transfer_remote_position_map_device; + Tensor kv_transfer_recver_id_device; + Tensor kv_transfer_page_to_page_local_position_map_device; + 
Tensor kv_transfer_page_to_page_remote_position_map_device; + Tensor kv_transfer_page_to_page_recver_id_device; + Tensor commit_copy_length_indptr_device_; + Tensor commit_copy_src_dst_pos_in_page_table_device_; }; /*! @@ -790,7 +789,7 @@ class CachedPagedKVCacheAuxDataManager : public PagedKVCacheAuxDataManager { merged_attn_aux_data_host_ = HostMemoryVector(attn_aux_data_cache_size, dtype_aux, preferred_host_device); // - Initialize the device auxiliary data buffer. - merged_attn_aux_data_device_ = NDArray::Empty({attn_aux_data_cache_size}, dtype_aux, device); + merged_attn_aux_data_device_ = Tensor::Empty({attn_aux_data_cache_size}, dtype_aux, device); // - Calculate cache size of all the compact KV auxiliary arrays in // local cache and the large on-device array. @@ -800,60 +799,60 @@ class CachedPagedKVCacheAuxDataManager : public PagedKVCacheAuxDataManager { merged_compact_kv_aux_data_host_ = HostMemoryVector(compact_kv_aux_data_cache_size, dtype_aux, preferred_host_device); merged_compact_kv_aux_data_device_ = - NDArray::Empty({compact_kv_aux_data_cache_size}, dtype_aux, device); + Tensor::Empty({compact_kv_aux_data_cache_size}, dtype_aux, device); } void ResetAttnAuxDataCopy() final { attn_aux_data_copy_offset_ = 0; } - NDArray CopyQOIndptrOnDepthAsync(HostMemoryVector* data, int depth) final { + Tensor CopyQOIndptrOnDepthAsync(HostMemoryVector* data, int depth) final { return CopyAttnAuxVecToCache(data); } - NDArray CopyPageIndptrOnDepthAsync(HostMemoryVector* data, int depth) final { + Tensor CopyPageIndptrOnDepthAsync(HostMemoryVector* data, int depth) final { return CopyAttnAuxVecToCache(data); } - NDArray CopyPageIndicesOnDepthAsync(HostMemoryVector* data, int depth) final { + Tensor CopyPageIndicesOnDepthAsync(HostMemoryVector* data, int depth) final { return CopyAttnAuxVecToCache(data); } - NDArray CopyLastPageLenOnDepthAsync(HostMemoryVector* data, int depth) final { + Tensor CopyLastPageLenOnDepthAsync(HostMemoryVector* data, int depth) final { return CopyAttnAuxVecToCache(data); } - NDArray CopyKRoPEPosOffsetOnDepthAsync(HostMemoryVector* data, int depth) final { + Tensor CopyKRoPEPosOffsetOnDepthAsync(HostMemoryVector* data, int depth) final { return CopyAttnAuxVecToCache(data); } - NDArray CopyCurAppendLengthIndptrAsync(HostMemoryVector* data) final { + Tensor CopyCurAppendLengthIndptrAsync(HostMemoryVector* data) final { return CopyAttnAuxVecToCache(data); } - NDArray CopyKRaggedRoPEPosOffsetAsync(HostMemoryVector* data) final { + Tensor CopyKRaggedRoPEPosOffsetAsync(HostMemoryVector* data) final { return CopyAttnAuxVecToCache(data); } - NDArray CopyQRoPEPosMapAsync(HostMemoryVector* data) final { return CopyAttnAuxVecToCache(data); } - NDArray CopyAppendPositionMapAsync(HostMemoryVector* data) final { + Tensor CopyQRoPEPosMapAsync(HostMemoryVector* data) final { return CopyAttnAuxVecToCache(data); } + Tensor CopyAppendPositionMapAsync(HostMemoryVector* data) final { return CopyAttnAuxVecToCache(data); } - NDArray CopyKVTransferRemotePositionMapAsync(HostMemoryVector* data) final { + Tensor CopyKVTransferRemotePositionMapAsync(HostMemoryVector* data) final { return CopyAttnAuxVecToCache(data); } - NDArray CopyKVTransferRecverIDAsync(HostMemoryVector* data) final { + Tensor CopyKVTransferRecverIDAsync(HostMemoryVector* data) final { return CopyAttnAuxVecToCache(data); } - NDArray CopyKVTransferPage2PageLocalPositionMapAsync(HostMemoryVector* data) final { + Tensor CopyKVTransferPage2PageLocalPositionMapAsync(HostMemoryVector* data) final { return 
CopyAttnAuxVecToCache(data); } - NDArray CopyKVTransferPage2PageRemotePositionMapAsync(HostMemoryVector* data) final { + Tensor CopyKVTransferPage2PageRemotePositionMapAsync(HostMemoryVector* data) final { return CopyAttnAuxVecToCache(data); } - NDArray CopyKVTransferPage2PageRecverIDAsync(HostMemoryVector* data) final { + Tensor CopyKVTransferPage2PageRecverIDAsync(HostMemoryVector* data) final { return CopyAttnAuxVecToCache(data); } - NDArray CopyTreeAttnMaskOnDepthAsync(HostMemoryVector* data, int depth) final { - NDArray mask_1d = CopyAttnAuxVecToCache(data); + Tensor CopyTreeAttnMaskOnDepthAsync(HostMemoryVector* data, int depth) final { + Tensor mask_1d = CopyAttnAuxVecToCache(data); return mask_1d.CreateView({static_cast(data->size() / 2), 2}, mask_1d->dtype); } - NDArray CopyTreeAttnMNIndptrOnDepthAsync(HostMemoryVector* data, int depth) final { + Tensor CopyTreeAttnMNIndptrOnDepthAsync(HostMemoryVector* data, int depth) final { return CopyAttnAuxVecToCache(data); } - NDArray CopyLengthInfoOnDepthAsync(HostMemoryVector* last_page_len, - HostMemoryVector* sliding_window_offset, - HostMemoryVector* sink_size, int depth) final { + Tensor CopyLengthInfoOnDepthAsync(HostMemoryVector* last_page_len, + HostMemoryVector* sliding_window_offset, + HostMemoryVector* sink_size, int depth) final { int64_t n_elem = last_page_len->size(); std::memcpy(merged_attn_aux_data_host_.data() + attn_aux_data_copy_offset_, last_page_len->data(), n_elem * elem_byte_size_); @@ -861,7 +860,7 @@ class CachedPagedKVCacheAuxDataManager : public PagedKVCacheAuxDataManager { sliding_window_offset->data(), n_elem * elem_byte_size_); std::memcpy(merged_attn_aux_data_host_.data() + attn_aux_data_copy_offset_ + 2 * n_elem, sink_size->data(), n_elem * elem_byte_size_); - NDArray view = merged_attn_aux_data_device_.CreateView( + Tensor view = merged_attn_aux_data_device_.CreateView( {3, n_elem}, dtype_aux_, attn_aux_data_copy_offset_ * elem_byte_size_); attn_aux_data_copy_offset_ += CeilDivElemAlignment(3 * n_elem); return view; @@ -881,22 +880,22 @@ class CachedPagedKVCacheAuxDataManager : public PagedKVCacheAuxDataManager { DLTensor copy_src = copy_dst; copy_src.data = merged_attn_aux_data_host_.data(); copy_src.device = Device{kDLCPU, 0}; - NDArray::CopyFromTo(©_src, ©_dst, copy_stream_); + Tensor::CopyFromTo(©_src, ©_dst, copy_stream_); } void ResetCompactKVAuxDataCopy() final { compact_kv_aux_data_copy_offset_ = 0; } - NDArray CopyCommitLengthIndptrAsync(HostMemoryVector* data) final { + Tensor CopyCommitLengthIndptrAsync(HostMemoryVector* data) final { return CopyCompactKVAuxVecToCache(data); } - NDArray CopyCommitSrcDstPosInPageTableAsync(HostMemoryVector* src_data, - HostMemoryVector* dst_data) final { + Tensor CopyCommitSrcDstPosInPageTableAsync(HostMemoryVector* src_data, + HostMemoryVector* dst_data) final { int64_t n_elem = src_data->size(); std::memcpy(merged_compact_kv_aux_data_host_.data() + compact_kv_aux_data_copy_offset_, src_data->data(), n_elem * elem_byte_size_); std::memcpy(merged_compact_kv_aux_data_host_.data() + compact_kv_aux_data_copy_offset_ + n_elem, dst_data->data(), n_elem * elem_byte_size_); - NDArray view = merged_compact_kv_aux_data_device_.CreateView( + Tensor view = merged_compact_kv_aux_data_device_.CreateView( {2, n_elem}, dtype_aux_, compact_kv_aux_data_copy_offset_ * elem_byte_size_); compact_kv_aux_data_copy_offset_ += CeilDivElemAlignment(2 * n_elem); return view; @@ -916,7 +915,7 @@ class CachedPagedKVCacheAuxDataManager : public PagedKVCacheAuxDataManager { DLTensor copy_src = 
copy_dst; copy_src.data = merged_compact_kv_aux_data_host_.data(); copy_src.device = Device{kDLCPU, 0}; - NDArray::CopyFromTo(©_src, ©_dst, copy_stream_); + Tensor::CopyFromTo(©_src, ©_dst, copy_stream_); } private: @@ -985,23 +984,23 @@ class CachedPagedKVCacheAuxDataManager : public PagedKVCacheAuxDataManager { /*! * \brief Copy the input data to the cache at the given offset. - * And return the NDArray view of the cache starting at the offset. + * And return the Tensor view of the cache starting at the offset. */ - NDArray CopyAttnAuxVecToCache(HostMemoryVector* data) { + Tensor CopyAttnAuxVecToCache(HostMemoryVector* data) { int64_t n_elem = data->size(); std::memcpy(merged_attn_aux_data_host_.data() + attn_aux_data_copy_offset_, data->data(), n_elem * elem_byte_size_); - NDArray view = merged_attn_aux_data_device_.CreateView( + Tensor view = merged_attn_aux_data_device_.CreateView( {n_elem}, dtype_aux_, attn_aux_data_copy_offset_ * elem_byte_size_); attn_aux_data_copy_offset_ += CeilDivElemAlignment(n_elem); return view; } - NDArray CopyCompactKVAuxVecToCache(HostMemoryVector* data) { + Tensor CopyCompactKVAuxVecToCache(HostMemoryVector* data) { int64_t n_elem = data->size(); std::memcpy(merged_compact_kv_aux_data_host_.data() + compact_kv_aux_data_copy_offset_, data->data(), n_elem * elem_byte_size_); - NDArray view = merged_compact_kv_aux_data_device_.CreateView( + Tensor view = merged_compact_kv_aux_data_device_.CreateView( {n_elem}, dtype_aux_, compact_kv_aux_data_copy_offset_ * elem_byte_size_); compact_kv_aux_data_copy_offset_ += CeilDivElemAlignment(n_elem); return view; @@ -1020,8 +1019,8 @@ class CachedPagedKVCacheAuxDataManager : public PagedKVCacheAuxDataManager { int64_t compact_kv_aux_data_copy_offset_ = 0; HostMemoryVector merged_attn_aux_data_host_; HostMemoryVector merged_compact_kv_aux_data_host_; - NDArray merged_attn_aux_data_device_; - NDArray merged_compact_kv_aux_data_device_; + Tensor merged_attn_aux_data_device_; + Tensor merged_compact_kv_aux_data_device_; }; } // namespace vm diff --git a/src/runtime/vm/builtin.cc b/src/runtime/vm/builtin.cc index 90e3b4c54922..9427d6805db5 100644 --- a/src/runtime/vm/builtin.cc +++ b/src/runtime/vm/builtin.cc @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include #include @@ -38,7 +38,7 @@ namespace tvm { namespace runtime { namespace vm { -using tvm::runtime::NDArray; +using tvm::runtime::Tensor; //------------------------------------------------- // Shape/StructInfo handling. @@ -47,9 +47,9 @@ using tvm::runtime::NDArray; * \brief Builtin function to allocate shape heap. * \param ctx_ptr The context module pointer. * \param size the size of the heap. - * \return An allocate NDArray as shape heap. + * \return An allocate Tensor as shape heap. */ -NDArray AllocShapeHeap(void* ctx_ptr, int64_t size) { +Tensor AllocShapeHeap(void* ctx_ptr, int64_t size) { VirtualMachine* vm = static_cast(ctx_ptr); // use host allocator, which is always last element. size_t host_device_index = vm->devices.size() - 1; @@ -122,7 +122,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ void MatchShape(ffi::PackedArgs args, ffi::Any* rv) { // input shape the first argument can take in tensor or shape. 
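To make the AllocShapeHeap rename above concrete, here is a minimal sketch of allocating such a heap with the renamed API. The header path, helper name, and the choice of a host CPU device are illustrative assumptions, not code from this patch.

#include <dlpack/dlpack.h>
#include <tvm/runtime/tensor.h>  // assumed post-rename location of the old ndarray.h header

using tvm::runtime::DataType;
using tvm::runtime::Tensor;

// Reserve `size` int64 slots on the host, the way the VM builtin reserves
// scratch space for shape computation on its host allocator device.
Tensor MakeShapeHeap(int64_t size) {
  DLDevice host{kDLCPU, 0};
  return Tensor::Empty({size}, DataType::Int(64), host);
}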
ffi::Shape input_shape; - if (auto opt_nd = args[0].as()) { + if (auto opt_nd = args[0].as()) { input_shape = opt_nd.value().Shape(); } else { input_shape = args[0].cast(); } @@ -388,7 +388,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ namespace refl = tvm::ffi::reflection; refl::GlobalDef() .def("vm.builtin.alloc_storage", VMAllocStorage) - .def_method("vm.builtin.alloc_tensor", &StorageObj::AllocNDArray); + .def_method("vm.builtin.alloc_tensor", &StorageObj::AllocTensor); }); //------------------------------------------------- @@ -436,14 +436,13 @@ TVM_FFI_STATIC_INIT_BLOCK({ TVM_FFI_STATIC_INIT_BLOCK({ namespace refl = tvm::ffi::reflection; refl::GlobalDef() - .def_method("vm.builtin.shape_of", &NDArray::Shape) + .def_method("vm.builtin.shape_of", &Tensor::Shape) .def("vm.builtin.copy", [](ffi::Any a) -> ffi::Any { return a; }) - .def("vm.builtin.reshape", - [](NDArray data, ffi::Shape new_shape) { - return data.CreateView(new_shape, data->dtype); - }) + .def( + "vm.builtin.reshape", + [](Tensor data, ffi::Shape new_shape) { return data.CreateView(new_shape, data->dtype); }) .def("vm.builtin.null_value", []() -> std::nullptr_t { return nullptr; }) - .def("vm.builtin.to_device", [](NDArray data, int dev_type, int dev_id) { + .def("vm.builtin.to_device", [](Tensor data, int dev_type, int dev_id) { Device dst_device = {(DLDeviceType)dev_type, dev_id}; return data.CopyTo(dst_device); }); @@ -458,7 +457,7 @@ bool ReadIfCond(ffi::AnyView cond) { if (auto opt_int = cond.try_cast()) { return opt_int.value(); } - NDArray arr = cond.cast(); + Tensor arr = cond.cast(); if (arr->device.device_type != kDLCPU) { arr = arr.CopyTo(DLDevice{kDLCPU, 0}); } @@ -548,8 +547,8 @@ TVM_FFI_STATIC_INIT_BLOCK({ *rv = arr; }) .def("vm.builtin.tensor_to_shape", - [](NDArray data) { - NDArray arr = data; + [](Tensor data) { + Tensor arr = data; if (data->device.device_type != kDLCPU) { arr = data.CopyTo(DLDevice{kDLCPU, 0}); } @@ -581,7 +580,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ } return ffi::Shape(out_shape); }) - .def("vm.builtin.ensure_zero_offset", [](NDArray data) { + .def("vm.builtin.ensure_zero_offset", [](Tensor data) { if (data->byte_offset == 0) { return data; } @@ -592,9 +591,9 @@ TVM_FFI_STATIC_INIT_BLOCK({ dl_tensor->dl_tensor.data = reinterpret_cast(dl_tensor->dl_tensor.data) + dl_tensor->dl_tensor.byte_offset; dl_tensor->dl_tensor.byte_offset = 0; - return NDArray::FromDLPack(dl_tensor); + return Tensor::FromDLPack(dl_tensor); } else { - auto new_array = NDArray::Empty(data.Shape(), data->dtype, data->device); + auto new_array = Tensor::Empty(data.Shape(), data->dtype, data->device); new_array.CopyFrom(data); return new_array; } diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc index ef6fbe6373af..287af83c6058 100644 --- a/src/runtime/vm/executable.cc +++ b/src/runtime/vm/executable.cc @@ -48,11 +48,11 @@ std::string VMExecutable::Stats() const { oss << "Relax VM executable statistics:" << std::endl; // Get the number of constants. - // If the constant is an NDArray, get the shape of each of them. + // If the constant is a Tensor, get the shape of each of them. // If the constant is an DLDataType, get the data type of each of them.
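As a companion to the constant-pool bookkeeping renamed here and in the following hunks, a short sketch of probing an ffi::Any entry for a Tensor with the as<...>() idiom this file uses; `constants` is a placeholder container and the logging is illustrative only.

// Report which constant-pool entries are Tensors and how many dims they have.
for (const tvm::ffi::Any& entry : constants) {
  if (auto opt_nd = entry.as<tvm::runtime::Tensor>()) {
    const tvm::runtime::Tensor& tensor = opt_nd.value();
    LOG(INFO) << "Tensor constant, ndim = " << tensor.Shape().size();
  } else {
    LOG(INFO) << "non-Tensor constant";
  }
}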
oss << " Constant pool (# " << constants.size() << "): ["; for (const auto& it : constants) { - if (auto opt_nd = it.as()) { + if (auto opt_nd = it.as()) { const auto ndarray = opt_nd.value(); const auto& shape = ndarray.Shape(); // Scalar @@ -248,8 +248,8 @@ void VMExecutable::SaveGlobalSection(dmlc::Stream* strm) const { strm->Write(fun void VMExecutable::SaveConstantSection(dmlc::Stream* strm) const { strm->Write(static_cast(this->constants.size())); for (const auto& it : this->constants) { - if (auto opt_nd = it.as()) { - strm->Write(ffi::TypeIndex::kTVMFFINDArray); + if (auto opt_nd = it.as()) { + strm->Write(ffi::TypeIndex::kTVMFFITensor); runtime::SaveDLTensor(strm, opt_nd.value().operator->()); } else if (auto opt_shape = it.as()) { ffi::Shape shape = opt_shape.value(); @@ -299,13 +299,13 @@ void VMExecutable::LoadConstantSection(dmlc::Stream* strm) { STREAM_CHECK(strm->Read(&sz, sizeof(sz)), "constant"); size_t size = static_cast(sz); - runtime::NDArray ndarray; + runtime::Tensor ndarray; DLDataType dtype; // Load each of the constants. for (size_t i = 0; i < size; i++) { int constant_type; STREAM_CHECK(strm->Read(&constant_type, sizeof(constant_type)), "constant"); - if (constant_type == ffi::TypeIndex::kTVMFFINDArray) { + if (constant_type == ffi::TypeIndex::kTVMFFITensor) { ndarray.Load(strm); ffi::Any cell; cell = ndarray; @@ -348,7 +348,7 @@ void VMExecutable::LoadConstantSection(dmlc::Stream* strm) { cell = value; this->constants.push_back(cell); } else { - LOG(FATAL) << "Constant pool can only contain NDArray and DLDataType, but got " + LOG(FATAL) << "Constant pool can only contain Tensor and DLDataType, but got " << ffi::TypeIndexToTypeKey(constant_type) << " when loading the VM constant pool."; } } diff --git a/src/runtime/vm/hexagon/builtin.cc b/src/runtime/vm/hexagon/builtin.cc index be5c7f5fd6f9..ee18de4bf9b3 100644 --- a/src/runtime/vm/hexagon/builtin.cc +++ b/src/runtime/vm/hexagon/builtin.cc @@ -31,12 +31,13 @@ namespace tvm { namespace runtime { namespace vm { +// clang-format off TVM_FFI_STATIC_INIT_BLOCK({ namespace refl = tvm::ffi::reflection; refl::GlobalDef() .def("vm.builtin.hexagon.dma_copy", - [](ffi::AnyView vm_ptr, NDArray src_arr, NDArray dst_arr, int queue_id, + [](ffi::AnyView vm_ptr, Tensor src_arr, Tensor dst_arr, int queue_id, bool bypass_cache) { const DLTensor* dptr = dst_arr.operator->(); const DLTensor* sptr = src_arr.operator->(); @@ -57,8 +58,8 @@ TVM_FFI_STATIC_INIT_BLOCK({ CHECK(ret == DMA_SUCCESS); }) .def("vm.builtin.hexagon.dma_wait", [](ffi::AnyView vm_ptr, int queue_id, int inflight_dma, - bool bypass_cache, [[maybe_unused]] NDArray src_arr, - [[maybe_unused]] NDArray dst_arr) { + bool bypass_cache, [[maybe_unused]] Tensor src_arr, + [[maybe_unused]] Tensor dst_arr) { ICHECK(inflight_dma >= 0); tvm::runtime::hexagon::HexagonDeviceAPI::Global()->UserDMA()->Wait(queue_id, inflight_dma); if (bypass_cache) { @@ -70,6 +71,8 @@ TVM_FFI_STATIC_INIT_BLOCK({ } }); }); + +// clang-format on } // namespace vm } // namespace runtime } // namespace tvm diff --git a/src/runtime/vm/kv_state.cc b/src/runtime/vm/kv_state.cc index 5d13be7ef519..366e22c36baf 100644 --- a/src/runtime/vm/kv_state.cc +++ b/src/runtime/vm/kv_state.cc @@ -76,32 +76,32 @@ TVM_FFI_STATIC_INIT_BLOCK({ .def_method("vm.builtin.attention_kv_cache_debug_get_kv_mla", &AttentionKVCacheObj::DebugGetKVMLA) .def("vm.builtin.attention_kv_cache_attention_with_fused_qkv", - [](AttentionKVCache kv_cache, int64_t layer_id, double sm_scale, NDArray qkv_data, - NDArray o_data) { + 
[](AttentionKVCache kv_cache, int64_t layer_id, double sm_scale, Tensor qkv_data, + Tensor o_data) { kv_cache->AttentionWithFusedQKV(layer_id, std::move(qkv_data), std::nullopt, std::move(o_data), sm_scale); }) .def("vm.builtin.attention_kv_cache_self_attention", - [](AttentionKVCache kv_cache, int64_t layer_id, double sm_scale, NDArray q_data, - NDArray k_data, NDArray v_data, NDArray o_data, NDArray lse_data) { + [](AttentionKVCache kv_cache, int64_t layer_id, double sm_scale, Tensor q_data, + Tensor k_data, Tensor v_data, Tensor o_data, Tensor lse_data) { kv_cache->SelfAttention(layer_id, std::move(q_data), std::move(k_data), std::move(v_data), std::move(o_data), std::move(lse_data), sm_scale); }) .def("vm.builtin.attention_kv_cache_cross_attention", - [](AttentionKVCache kv_cache, int64_t layer_id, double sm_scale, NDArray q_data, - NDArray o_data, NDArray lse_data) { + [](AttentionKVCache kv_cache, int64_t layer_id, double sm_scale, Tensor q_data, + Tensor o_data, Tensor lse_data) { kv_cache->CrossAttention(layer_id, std::move(q_data), std::move(o_data), std::move(lse_data), sm_scale); }) .def("vm.builtin.attention_kv_cache_append_mla_kv", - [](AttentionKVCache kv_cache, int64_t layer_id, NDArray kv_data) { + [](AttentionKVCache kv_cache, int64_t layer_id, Tensor kv_data) { kv_cache->AppendMLAKV(layer_id, std::move(kv_data)); return kv_cache; }) .def("vm.builtin.attention_kv_cache_merge_attn_output_inplace", - [](AttentionKVCache kv_cache, NDArray o_self_attn, NDArray lse_self_attn, - NDArray o_cross_attn, NDArray lse_cross_attn) { + [](AttentionKVCache kv_cache, Tensor o_self_attn, Tensor lse_self_attn, + Tensor o_cross_attn, Tensor lse_cross_attn) { return kv_cache->MergeAttnOutputInplace( std::move(o_self_attn), std::move(lse_self_attn), std::move(o_cross_attn), std::move(lse_cross_attn)); @@ -114,7 +114,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ refl::GlobalDef() .def_method("vm.builtin.rnn_state_get", &RNNStateObj::Get) .def("vm.builtin.rnn_state_set", - [](RNNState state, int64_t layer_id, int64_t state_id, NDArray data) { + [](RNNState state, int64_t layer_id, int64_t state_id, Tensor data) { state->Set(layer_id, state_id, data); return state; }) diff --git a/src/runtime/vm/kv_state.h b/src/runtime/vm/kv_state.h index 46d8f4f59603..de42488b7f40 100644 --- a/src/runtime/vm/kv_state.h +++ b/src/runtime/vm/kv_state.h @@ -23,8 +23,8 @@ #include #include #include -#include #include +#include namespace tvm { namespace runtime { @@ -178,8 +178,8 @@ class AttentionKVCacheObj : public KVStateObj { * \param sm_scale The additional attention scaling factor. * \sa AttentionKVCache::Attention */ - virtual void AttentionWithFusedQKV(int64_t layer_id, NDArray qkv_data, Optional mask, - NDArray o_data, double sm_scale) = 0; + virtual void AttentionWithFusedQKV(int64_t layer_id, Tensor qkv_data, Optional mask, + Tensor o_data, double sm_scale) = 0; /*! * \brief Fine-grained API that computes ragged self attention with Q/K/V data. @@ -191,8 +191,8 @@ class AttentionKVCacheObj : public KVStateObj { * \param lse_data The output attention LSE data, in layout `(total_length, num_qo_heads)`. * \param sm_scale The additional attention scaling factor. */ - virtual void SelfAttention(int64_t layer_id, NDArray q_data, NDArray k_data, NDArray v_data, - NDArray o_data, NDArray lse_data, double sm_scale) = 0; + virtual void SelfAttention(int64_t layer_id, Tensor q_data, Tensor k_data, Tensor v_data, + Tensor o_data, Tensor lse_data, double sm_scale) = 0; /*! 
* \brief Fine-grained API that computes paged cross attention with Q and in-cache KV data. @@ -202,7 +202,7 @@ class AttentionKVCacheObj : public KVStateObj { * \param lse_data The output attention LSE data, in layout `(total_length, num_qo_heads)`. * \param sm_scale The additional attention scaling factor. */ - virtual void CrossAttention(int64_t layer_id, NDArray q_data, NDArray o_data, NDArray lse_data, + virtual void CrossAttention(int64_t layer_id, Tensor q_data, Tensor o_data, Tensor lse_data, double sm_scale) = 0; /*! @@ -210,7 +210,7 @@ class AttentionKVCacheObj : public KVStateObj { * \param layer_id The model layer where the attention compute happens. * \param kv_data The input KV data to append, in layout `(total_length, qk_head_dim)`. */ - virtual void AppendMLAKV(int64_t layer_id, NDArray kv_data) = 0; + virtual void AppendMLAKV(int64_t layer_id, Tensor kv_data) = 0; /*! * \brief Fine-grained API that merges the attention output from two sources. @@ -220,8 +220,8 @@ class AttentionKVCacheObj : public KVStateObj { * \param lse2_data The second source LSE data. * \return The merged O and LSE data. */ - virtual Array MergeAttnOutputInplace(NDArray o_self_attn, NDArray lse_self_attn, - NDArray o_cross_attn, NDArray lse_cross_attn) = 0; + virtual Array MergeAttnOutputInplace(Tensor o_self_attn, Tensor lse_self_attn, + Tensor o_cross_attn, Tensor lse_cross_attn) = 0; /*! * \brief Compute linear attention with Q/K/V data. @@ -233,7 +233,7 @@ class AttentionKVCacheObj : public KVStateObj { * \param sm_scale The additional attention scaling factor. * \sa AttentionKVCache::Attention */ - virtual void LinearAttention(int64_t layer_id, NDArray q_data, NDArray k_data, NDArray v_data, + virtual void LinearAttention(int64_t layer_id, Tensor q_data, Tensor k_data, Tensor v_data, double sm_scale) = 0; /************** Positions **************/ @@ -243,7 +243,7 @@ class AttentionKVCacheObj : public KVStateObj { * This function is supposed to be invoked after calling BeginForward. * \return The in-sequence query positions, in shape `(total_length,)`. */ - virtual NDArray GetQueryPositions() = 0; + virtual Tensor GetQueryPositions() = 0; /************** Debug Helpers **************/ @@ -265,7 +265,7 @@ class AttentionKVCacheObj : public KVStateObj { * \param V_data The output V data of the given sequence in layout elaborated above. */ virtual void DebugGetKV(int64_t seq_id, // - int64_t start_pos, int64_t end_pos, NDArray k_data, NDArray v_data) = 0; + int64_t start_pos, int64_t end_pos, Tensor k_data, Tensor v_data) = 0; /*! * \brief Fetch the compact K/V data of the given sequence for MLA cache. @@ -275,7 +275,7 @@ class AttentionKVCacheObj : public KVStateObj { * \param kv_data The output KV data of the given sequence in layout elaborated above. */ virtual void DebugGetKVMLA(int64_t seq_id, int64_t start_pos, int64_t end_pos, - NDArray kv_data) = 0; + Tensor kv_data) = 0; /*! * \brief Set the K/V data of the given sequence from input K/V data. @@ -291,7 +291,7 @@ class AttentionKVCacheObj : public KVStateObj { * \param k_data The K data to set in layout elaborated above. * \param v_data The V data to set in layout elaborated above. 
*/ - virtual void DebugSetKV(int64_t seq_id, int64_t start_pos, NDArray k_data, NDArray v_data) = 0; + virtual void DebugSetKV(int64_t seq_id, int64_t start_pos, Tensor k_data, Tensor v_data) = 0; static constexpr const char* _type_key = "relax.vm.AttentionKVCache"; TVM_DECLARE_BASE_OBJECT_INFO(AttentionKVCacheObj, KVStateObj); @@ -317,7 +317,7 @@ class RNNStateObj : public KVStateObj { * \return The array of State data, each element corresponds to a state. * \throws Error if the given sequence id is not valid. */ - virtual void Get(int64_t layer_id, int64_t state_id, NDArray o_data) = 0; + virtual void Get(int64_t layer_id, int64_t state_id, Tensor o_data) = 0; /*! * \brief Set the State data for the specified sequence. @@ -326,7 +326,7 @@ class RNNStateObj : public KVStateObj { * \param data The data to be set. * \throws Error if the given sequence id is not valid. */ - virtual void Set(int64_t layer_id, int64_t state_id, NDArray data) = 0; + virtual void Set(int64_t layer_id, int64_t state_id, Tensor data) = 0; /*! * \brief Fetch the compact rnn state data of the given sequence. @@ -334,7 +334,7 @@ class RNNStateObj : public KVStateObj { * \param state_id The state id within the layer. * \param seq_id The sequence whose state data is to be fetched. */ - virtual NDArray DebugGet(int64_t layer_id, int64_t state_id, int64_t seq_id) = 0; + virtual Tensor DebugGet(int64_t layer_id, int64_t state_id, int64_t seq_id) = 0; static constexpr const char* _type_key = "relax.vm.RNNState"; TVM_DECLARE_BASE_OBJECT_INFO(RNNStateObj, KVStateObj); diff --git a/src/runtime/vm/lm_support.cc b/src/runtime/vm/lm_support.cc index 599978579f67..416ece17b402 100644 --- a/src/runtime/vm/lm_support.cc +++ b/src/runtime/vm/lm_support.cc @@ -42,7 +42,7 @@ #include #include #include -#include +#include #include #include @@ -66,7 +66,7 @@ class AttentionKVCacheLegacyObj : public Object { /*! * \brief Underlying support data. */ - NDArray data; + Tensor data; /*! * \brief number of slots already filled. @@ -82,7 +82,7 @@ class AttentionKVCacheLegacyObj : public Object { * \brief View all current cached values as one array. * \param shape The cached values. */ - NDArray View(const ffi::Shape& shape) { + Tensor View(const ffi::Shape& shape) { CHECK_EQ(shape[0], fill_count) << "Requested shape do not match the filled count"; for (int i = 1; i < this->data->ndim; ++i) { CHECK_EQ(shape[i], data->shape[i]) << "Dimension " << i << " mismatch"; @@ -102,7 +102,7 @@ class AttentionKVCacheLegacyObj : public Object { this->fill_count -= n; } - void Update(NDArray value) { + void Update(Tensor value) { CHECK(data.DataType() == value.DataType()) << "dtype mismatch"; CHECK_EQ(value->shape[0], fill_count) << "Requested shape do not match the filled count"; ICHECK(data.IsContiguous()); @@ -111,7 +111,7 @@ class AttentionKVCacheLegacyObj : public Object { DLTensor copy_dst = *(data.operator->()); copy_dst.byte_offset = 0; copy_dst.shape = value->shape; - NDArray::CopyFromTo(value.operator->(), ©_dst); + Tensor::CopyFromTo(value.operator->(), ©_dst); this->fill_count = value->shape[0]; } @@ -121,7 +121,7 @@ class AttentionKVCacheLegacyObj : public Object { * \param max_cache_size max size of the cache. * \param num_attention_sinks number of sinks to store (https://arxiv.org/abs/2309.17453). 
*/ - void WindowOverride(NDArray value, int64_t max_cache_size, int64_t num_attention_sinks = 0) { + void WindowOverride(Tensor value, int64_t max_cache_size, int64_t num_attention_sinks = 0) { CHECK(data.DataType() == value.DataType()) << "dtype mismatch"; CHECK_LE(value->shape[0], max_cache_size - num_attention_sinks) << "dim 0 of value too large"; // reallocate cache @@ -133,7 +133,7 @@ class AttentionKVCacheLegacyObj : public Object { if (reserved_slots != data->shape[0]) { std::vector new_shape(data->shape, data->shape + data->ndim); new_shape[0] = reserved_slots; - NDArray new_data = NDArray::Empty(new_shape, data->dtype, data->device); + Tensor new_data = Tensor::Empty(new_shape, data->dtype, data->device); new_data.CreateView(data.Shape(), data->dtype).CopyFrom(data); this->data = new_data; } @@ -165,7 +165,7 @@ class AttentionKVCacheLegacyObj : public Object { copy_src.byte_offset = 0; copy_src.shape = &shape[0]; - NDArray::CopyFromTo(©_src, ©_dst); + Tensor::CopyFromTo(©_src, ©_dst); } // copy the remainder to the beginning of the cache @@ -186,7 +186,7 @@ class AttentionKVCacheLegacyObj : public Object { num_filled_elements * ((value->dtype.bits * value->dtype.lanes + 7) / 8); copy_src.shape = &shape[0]; - NDArray::CopyFromTo(©_src, ©_dst); + Tensor::CopyFromTo(©_src, ©_dst); this->window_attention_current_pos = value->shape[0] - num_elements_to_copy + num_attention_sinks; } @@ -196,7 +196,7 @@ class AttentionKVCacheLegacyObj : public Object { * \brief Append value to the cache. * \param value The value to be appended. */ - void Append(NDArray value) { + void Append(Tensor value) { CHECK(data.DataType() == value.DataType()) << "dtype mismatch"; // reallocate cache int64_t reserved_slots = data->shape[0]; @@ -206,7 +206,7 @@ class AttentionKVCacheLegacyObj : public Object { if (reserved_slots != data->shape[0]) { std::vector new_shape(data->shape, data->shape + data->ndim); new_shape[0] = reserved_slots; - NDArray new_data = NDArray::Empty(new_shape, data->dtype, data->device); + Tensor new_data = Tensor::Empty(new_shape, data->dtype, data->device); new_data.CreateView(data.Shape(), data->dtype).CopyFrom(data); this->data = new_data; } @@ -223,7 +223,7 @@ class AttentionKVCacheLegacyObj : public Object { DLTensor copy_dst = *(data.operator->()); copy_dst.byte_offset = num_filled_elements * ((data->dtype.bits * data->dtype.lanes + 7) / 8); copy_dst.shape = value->shape; - NDArray::CopyFromTo(value.operator->(), ©_dst); + Tensor::CopyFromTo(value.operator->(), ©_dst); this->fill_count += value->shape[0]; } @@ -238,10 +238,10 @@ class AttentionKVCacheLegacy : public ObjectRef { * \brief Create the attention kv cache. * \param init_data The initial reserved. 
*/ - static AttentionKVCacheLegacy Create(NDArray init_data, ffi::Shape reserve_shape, + static AttentionKVCacheLegacy Create(Tensor init_data, ffi::Shape reserve_shape, int init_fill_count) { auto n = make_object(); - n->data = NDArray::Empty(reserve_shape, init_data->dtype, init_data->device); + n->data = Tensor::Empty(reserve_shape, init_data->dtype, init_data->device); n->fill_count = 0; n->Append(init_data); if (init_fill_count >= 0) { @@ -263,7 +263,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ refl::GlobalDef().def("vm.builtin.attention_kv_cache_create", AttentionKVCacheLegacy::Create); }); -AttentionKVCacheLegacy AttentionKVCacheUpdate(AttentionKVCacheLegacy cache, NDArray value) { +AttentionKVCacheLegacy AttentionKVCacheUpdate(AttentionKVCacheLegacy cache, Tensor value) { cache->Update(value); return cache; } @@ -273,7 +273,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ refl::GlobalDef().def("vm.builtin.attention_kv_cache_update", AttentionKVCacheUpdate); }); -AttentionKVCacheLegacy AttentionKVCacheAppend(AttentionKVCacheLegacy cache, NDArray value) { +AttentionKVCacheLegacy AttentionKVCacheAppend(AttentionKVCacheLegacy cache, Tensor value) { cache->Append(value); return cache; } @@ -283,7 +283,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ refl::GlobalDef().def("vm.builtin.attention_kv_cache_append", AttentionKVCacheAppend); }); -AttentionKVCacheLegacy AttentionKVCacheWindowOverride(AttentionKVCacheLegacy cache, NDArray value, +AttentionKVCacheLegacy AttentionKVCacheWindowOverride(AttentionKVCacheLegacy cache, Tensor value, int64_t max_cache_size) { cache->WindowOverride(value, max_cache_size); return cache; @@ -296,8 +296,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ }); AttentionKVCacheLegacy AttentionKVCacheWindowOverrideWithSinks(AttentionKVCacheLegacy cache, - NDArray value, - int64_t max_cache_size, + Tensor value, int64_t max_cache_size, int64_t num_attention_sinks) { cache->WindowOverride(value, max_cache_size, num_attention_sinks); return cache; @@ -309,7 +308,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ AttentionKVCacheWindowOverrideWithSinks); }); -NDArray AttentionKVCacheView(AttentionKVCacheLegacy cache, ffi::Shape shape) { +Tensor AttentionKVCacheView(AttentionKVCacheLegacy cache, ffi::Shape shape) { return cache->View(shape); } @@ -358,7 +357,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ }); // NOTE this is a built-in highly related to LM so we put it here. 
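For readers unfamiliar with the sampling builtins renamed below, a rough standalone sketch of nucleus (top-p) sampling over a CPU float32 probability vector; it mirrors the intent of SampleTopPFromProb but is not the implementation from this file.

#include <algorithm>
#include <numeric>
#include <vector>

// Sample a token index: keep the highest-probability tokens until their
// cumulative mass reaches top_p, then pick inside that nucleus with a
// uniform draw in [0, 1). Assumes `prob` is non-empty and normalized.
int SampleTopP(const std::vector<float>& prob, double top_p, double uniform_sample) {
  std::vector<int> order(prob.size());
  std::iota(order.begin(), order.end(), 0);
  std::sort(order.begin(), order.end(), [&](int a, int b) { return prob[a] > prob[b]; });
  double mass = 0.0;
  size_t cutoff = 0;
  do {
    mass += prob[order[cutoff++]];
  } while (cutoff < order.size() && mass < top_p);
  double target = uniform_sample * mass;
  double cum = 0.0;
  for (size_t i = 0; i < cutoff; ++i) {
    cum += prob[order[i]];
    if (cum >= target) {
      return order[i];
    }
  }
  return order[cutoff - 1];
}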
-int SampleTopPFromLogits(NDArray logits, double temperature, double top_p, double uniform_sample) { +int SampleTopPFromLogits(Tensor logits, double temperature, double top_p, double uniform_sample) { ICHECK(logits.IsContiguous()); ICHECK(logits.DataType() == DataType::Float(32)); @@ -424,7 +423,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ refl::GlobalDef().def("vm.builtin.sample_top_p_from_logits", SampleTopPFromLogits); }); -int SampleTopPFromProb(NDArray prob, double top_p, double uniform_sample) { +int SampleTopPFromProb(Tensor prob, double top_p, double uniform_sample) { ICHECK(prob.IsContiguous()); ICHECK(prob.DataType() == DataType::Float(32)); @@ -522,7 +521,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ refl::GlobalDef().def("vm.builtin.sample_top_p_from_prob", SampleTopPFromProb); }); -NDArray MultinomialFromUniform(NDArray prob, NDArray uniform_sample) { +Tensor MultinomialFromUniform(Tensor prob, Tensor uniform_sample) { ICHECK(prob.IsContiguous()); ICHECK(uniform_sample.IsContiguous()); @@ -540,7 +539,7 @@ NDArray MultinomialFromUniform(NDArray prob, NDArray uniform_sample) { int64_t vocab_size = prob->shape[prob->ndim - 1]; const float* pprob = static_cast(prob->data); const float* psample = static_cast(uniform_sample->data); - NDArray new_array = NDArray::Empty({batch_size, 1}, DataType::Int(64), uniform_sample->device); + Tensor new_array = Tensor::Empty({batch_size, 1}, DataType::Int(64), uniform_sample->device); int64_t* parray = static_cast(new_array->data); for (int64_t i = 0; i < batch_size; ++i) { float cum_sum_prob = 0.0f; @@ -563,7 +562,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ }); // This is an inplace operation. -void ApplyRepetitionPenalty(NDArray logits, NDArray token_ids, double penalty) { +void ApplyRepetitionPenalty(Tensor logits, Tensor token_ids, double penalty) { ICHECK(logits.IsContiguous()); ICHECK(token_ids.IsContiguous()); ICHECK(logits.DataType() == DataType::Float(32)) << "Logits data type is not float32!"; @@ -597,7 +596,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ * \param presence_penalty The penalty factor, applied if a token appeared in an one-off manner. * \param frequency_penalty The penalty factor, contributes more the more frequent a token appears. */ -void ApplyPresenceAndFrequencyPenalty(NDArray logits, NDArray token_ids, NDArray token_freqs, +void ApplyPresenceAndFrequencyPenalty(Tensor logits, Tensor token_ids, Tensor token_freqs, double presence_penalty, double frequency_penalty) { // See https://platform.openai.com/docs/guides/text-generation/frequency-and-presence-penalties ICHECK(logits.IsContiguous()); @@ -628,7 +627,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ }); // This is an inplace operation. -void ApplySoftmaxWithTemperature(NDArray logits, double temperature) { +void ApplySoftmaxWithTemperature(Tensor logits, double temperature) { ICHECK(logits.IsContiguous()); ICHECK(logits.DataType() == DataType::Float(32)) << "Logits data type is not float32!"; ICHECK(logits->device.device_type == kDLCPU) << "logits device must be CPU!"; diff --git a/src/runtime/vm/paged_kv_cache.cc b/src/runtime/vm/paged_kv_cache.cc index 405f2f482a01..9ac3ab95ccf2 100644 --- a/src/runtime/vm/paged_kv_cache.cc +++ b/src/runtime/vm/paged_kv_cache.cc @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include @@ -111,7 +111,7 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { /*! \brief The RoPE theta. */ const double rotary_theta_; /*! \brief The optional RoPE extension factors for RoPE scaling. */ - const Optional rope_ext_factors_; + const Optional rope_ext_factors_; /*! 
\brief The KV cache dtype. */ const DataType kv_dtype_; @@ -122,15 +122,15 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { /*! * \brief The KV data managed by the KV cache. - * If KV transfer function is specifed, pages_ will be allocated by NVSHMEM as a whole NDArray. + * If KV transfer function is specified, pages_ will be allocated by NVSHMEM as a whole Tensor. * pages_ will contain tensor view of each layer. - * Otherwise, pages_ has `num_layers` NDArrays, each of them + * Otherwise, pages_ has `num_layers` Tensors, each of them * has layout (num_pages, 2, num_heads, page_size, qk_head_dim). * Along on the "2" dimension, index 0 stands for K and 1 stands for V. */ - std::vector pages_; + std::vector pages_; /*! \brief The whole KV cache allocated by NVSHMEM*/ - NDArray nvshmem_pages_; + Tensor nvshmem_pages_; /*! \brief The list of ids of released pages for page reuse. */ std::vector free_page_ids_; /*! \brief The mapping from sequence ids to sequences. */ @@ -181,15 +181,15 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { std::unique_ptr aux_data_manager_; // Temporary arrays to store intermediate attention results. - NDArray temp_attn_q_device_; - NDArray temp_attn_k_device_; - NDArray temp_attn_v_device_; - NDArray temp_attn_output_device_; - NDArray temp_attn_lse_device_; - NDArray merged_attn_lse_device_; - std::vector temp_int_attn_workspace_; - std::vector temp_int_pinned_attn_workspace_; - NDArray temp_float_attn_workspace_; + Tensor temp_attn_q_device_; + Tensor temp_attn_k_device_; + Tensor temp_attn_v_device_; + Tensor temp_attn_output_device_; + Tensor temp_attn_lse_device_; + Tensor merged_attn_lse_device_; + std::vector temp_int_attn_workspace_; + std::vector temp_int_pinned_attn_workspace_; + Tensor temp_float_attn_workspace_; //------------------------------------------- // Below are the auxiliary data structure on CPU. @@ -227,29 +227,29 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { // after each synchronization and pass these views as input for // attention/append.
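Because most of the renames in this class touch the auxiliary-data staging path, here is a condensed sketch of the pattern those methods share: carve a view out of a preallocated device Tensor at a running byte offset, then issue an asynchronous host-to-device copy into it. The function and parameter names are illustrative, not the members of this class.

// Stage `n` int32 host elements into `device_cache` at `*byte_offset` and
// return a Tensor view over just that slice. The copy runs on `copy_stream`
// and must be synchronized before the view is consumed by a kernel.
tvm::runtime::Tensor StageInt32Vec(tvm::runtime::Tensor device_cache, const int32_t* host_data,
                                   int64_t n, int64_t* byte_offset,
                                   TVMStreamHandle copy_stream) {
  DLDataType int32_dtype{kDLInt, 32, 1};
  tvm::runtime::Tensor view = device_cache.CreateView({n}, int32_dtype, *byte_offset);
  DLTensor copy_dst = *(view.operator->());
  DLTensor copy_src = copy_dst;
  copy_src.data = const_cast<int32_t*>(host_data);
  copy_src.device = DLDevice{kDLCPU, 0};
  copy_src.byte_offset = 0;
  tvm::runtime::Tensor::CopyFromTo(&copy_src, &copy_dst, copy_stream);
  *byte_offset += n * sizeof(int32_t);
  return view;
}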
//------------------------------------------- - NDArray cur_append_length_indptr_view_; - NDArray k_ragged_rope_pos_offset_view_; - NDArray q_rope_position_map_view_; - NDArray append_position_map_view_; - NDArray kv_transfer_remote_position_map_view_; - NDArray kv_transfer_recver_id_view_; - NDArray kv_transfer_page_to_page_local_position_map_view_; - NDArray kv_transfer_page_to_page_remote_position_map_view_; - NDArray kv_transfer_page_to_page_recver_id_view_; - NDArray temp_attn_output_view_; - NDArray temp_attn_lse_view_; - NDArray merged_attn_lse_view_; - std::vector qo_indptr_on_depths_view_; - std::vector page_indptr_on_depths_view_; - std::vector page_indices_on_depths_view_; - std::vector page_indptr_sliding_window_on_depths_view_; - std::vector page_indices_sliding_window_on_depths_view_; - std::vector length_info_on_depths_view_; - std::vector layer_sliding_window_length_info_on_depths_view_; - std::vector k_rope_pos_offset_view_; - std::vector k_rope_pos_offset_sliding_window_view_; - std::vector tree_attn_mask_view_; - std::vector tree_attn_mn_indptr_view_; + Tensor cur_append_length_indptr_view_; + Tensor k_ragged_rope_pos_offset_view_; + Tensor q_rope_position_map_view_; + Tensor append_position_map_view_; + Tensor kv_transfer_remote_position_map_view_; + Tensor kv_transfer_recver_id_view_; + Tensor kv_transfer_page_to_page_local_position_map_view_; + Tensor kv_transfer_page_to_page_remote_position_map_view_; + Tensor kv_transfer_page_to_page_recver_id_view_; + Tensor temp_attn_output_view_; + Tensor temp_attn_lse_view_; + Tensor merged_attn_lse_view_; + std::vector qo_indptr_on_depths_view_; + std::vector page_indptr_on_depths_view_; + std::vector page_indices_on_depths_view_; + std::vector page_indptr_sliding_window_on_depths_view_; + std::vector page_indices_sliding_window_on_depths_view_; + std::vector length_info_on_depths_view_; + std::vector layer_sliding_window_length_info_on_depths_view_; + std::vector k_rope_pos_offset_view_; + std::vector k_rope_pos_offset_sliding_window_view_; + std::vector tree_attn_mask_view_; + std::vector tree_attn_mn_indptr_view_; Optional f_transpose_append_mha_; Optional f_transpose_append_mla_; @@ -279,14 +279,14 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { TVMStreamHandle kv_transfer_stream_ = nullptr; public: - /*! \brief Constructor. Take the cache configuration and initialize the NDArrays. */ + /*! \brief Constructor. Take the cache configuration and initialize the Tensors. 
*/ explicit PagedAttentionKVCacheObj( int64_t page_size, int64_t num_layers, int64_t layer_id_begin_offset, int64_t layer_id_end_offset, int64_t num_qo_heads, int64_t num_kv_heads, int64_t qk_head_dim, int64_t v_head_dim, std::vector attn_kinds, int64_t reserved_num_seqs, int64_t num_total_pages, int64_t prefill_chunk_size, bool support_sliding_window, RoPEMode rope_mode, double rotary_scale, double rotary_theta, - Optional rope_ext_factors, bool enable_kv_transfer, DLDataType dtype, Device device, + Optional rope_ext_factors, bool enable_kv_transfer, DLDataType dtype, Device device, Optional f_transpose_append_mha, Optional f_transpose_append_mla, ffi::Function f_compact_copy, std::unique_ptr f_attention_prefill_ragged, @@ -360,7 +360,7 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { (*f_nvshmem_empty)( ffi::Shape({num_layers, num_total_pages, 2, num_kv_heads, page_size, qk_head_dim}), dtype, device) - .cast(); + .cast(); for (int i = 0; i < num_layers; ++i) { pages_.push_back(nvshmem_pages_.CreateView( {num_total_pages_, 2, num_kv_heads_, page_size_, qk_head_dim_}, nvshmem_pages_->dtype, @@ -380,7 +380,7 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { ffi::Shape kv_cache_shape = GetKVCacheShape(attn_kinds_[layer_id_begin_offset_ + i], num_total_pages, reserved_num_seqs, num_kv_heads, page_size, qk_head_dim, v_head_dim); - pages_.push_back(NDArray::Empty(kv_cache_shape, dtype, device)); + pages_.push_back(Tensor::Empty(kv_cache_shape, dtype, device)); } } @@ -442,47 +442,47 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { for (int d = 0; d < kPagedKVCacheMaxBlockDepth; ++d) { if (NeedKernelBeginForward()) { temp_int_attn_workspace_.push_back( - NDArray::Empty({kIntAttnWorkspaceByte}, DataType::UInt(8), device)); - temp_int_pinned_attn_workspace_.push_back(NDArray::Empty( + Tensor::Empty({kIntAttnWorkspaceByte}, DataType::UInt(8), device)); + temp_int_pinned_attn_workspace_.push_back(Tensor::Empty( {kIntAttnWorkspaceByte}, DataType::UInt(8), GetPreferredHostDevice(device))); } - qo_indptr_on_depths_view_.push_back(NDArray()); - page_indptr_on_depths_view_.push_back(NDArray()); - page_indices_on_depths_view_.push_back(NDArray()); - page_indptr_sliding_window_on_depths_view_.push_back(NDArray()); - page_indices_sliding_window_on_depths_view_.push_back(NDArray()); - length_info_on_depths_view_.push_back(NDArray()); - layer_sliding_window_length_info_on_depths_view_.push_back(NDArray()); - k_rope_pos_offset_view_.push_back(NDArray()); - k_rope_pos_offset_sliding_window_view_.push_back(NDArray()); - tree_attn_mask_view_.push_back(NDArray()); - tree_attn_mn_indptr_view_.push_back(NDArray()); + qo_indptr_on_depths_view_.push_back(Tensor()); + page_indptr_on_depths_view_.push_back(Tensor()); + page_indices_on_depths_view_.push_back(Tensor()); + page_indptr_sliding_window_on_depths_view_.push_back(Tensor()); + page_indices_sliding_window_on_depths_view_.push_back(Tensor()); + length_info_on_depths_view_.push_back(Tensor()); + layer_sliding_window_length_info_on_depths_view_.push_back(Tensor()); + k_rope_pos_offset_view_.push_back(Tensor()); + k_rope_pos_offset_sliding_window_view_.push_back(Tensor()); + tree_attn_mask_view_.push_back(Tensor()); + tree_attn_mn_indptr_view_.push_back(Tensor()); is_chain_on_depths_.push_back(true); } // Additional workspace for the "prefill with ragged kv" kernel. 
if (NeedKernelBeginForward()) { temp_int_attn_workspace_.push_back( - NDArray::Empty({kIntAttnWorkspaceByte}, DataType::UInt(8), device)); - temp_int_pinned_attn_workspace_.push_back(NDArray::Empty( + Tensor::Empty({kIntAttnWorkspaceByte}, DataType::UInt(8), device)); + temp_int_pinned_attn_workspace_.push_back(Tensor::Empty( {kIntAttnWorkspaceByte}, DataType::UInt(8), GetPreferredHostDevice(device))); temp_float_attn_workspace_ = - NDArray::Empty({kFloatAttnWorkspaceByte}, DataType::UInt(8), device); + Tensor::Empty({kFloatAttnWorkspaceByte}, DataType::UInt(8), device); } if (std::find(attn_kinds_.begin(), attn_kinds_.end(), AttnKind::kMHA) != attn_kinds_.end()) { temp_attn_q_device_ = - NDArray::Empty({prefill_chunk_size_, num_qo_heads, qk_head_dim}, dtype, device); + Tensor::Empty({prefill_chunk_size_, num_qo_heads, qk_head_dim}, dtype, device); temp_attn_k_device_ = - NDArray::Empty({prefill_chunk_size_, num_kv_heads, qk_head_dim}, dtype, device); + Tensor::Empty({prefill_chunk_size_, num_kv_heads, qk_head_dim}, dtype, device); temp_attn_v_device_ = - NDArray::Empty({prefill_chunk_size_, num_kv_heads, v_head_dim}, dtype, device); + Tensor::Empty({prefill_chunk_size_, num_kv_heads, v_head_dim}, dtype, device); } temp_attn_output_device_ = - NDArray::Empty({prefill_chunk_size_, num_qo_heads, v_head_dim}, dtype, device); + Tensor::Empty({prefill_chunk_size_, num_qo_heads, v_head_dim}, dtype, device); temp_attn_lse_device_ = - NDArray::Empty({prefill_chunk_size_, num_qo_heads}, DataType::Float(32), device); + Tensor::Empty({prefill_chunk_size_, num_qo_heads}, DataType::Float(32), device); merged_attn_lse_device_ = - NDArray::Empty({prefill_chunk_size_, num_qo_heads}, DataType::Float(32), device); + Tensor::Empty({prefill_chunk_size_, num_qo_heads}, DataType::Float(32), device); for (int64_t page_id = num_total_pages - 1; page_id >= 0; --page_id) { free_page_ids_.push_back(page_id); } @@ -694,7 +694,7 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { DeviceAPI::Get(device_)->SetStream(device_, copy_stream_); } for (int layer = 0; layer < num_layers_; ++layer) { - NDArray page_layer_view = pages_[layer]; + Tensor page_layer_view = pages_[layer]; f_copy_single_page_(page_layer_view, src_page_id, tgt_page_id, copy_length); } if (copy_stream_ != compute_stream_) { @@ -712,9 +712,9 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { // Copy indptr/src/dst arrays to GPU. aux_data_manager_->ResetCompactKVAuxDataCopy(); - NDArray commit_copy_length_indptr_view = + Tensor commit_copy_length_indptr_view = aux_data_manager_->CopyCommitLengthIndptrAsync(&commit_copy_length_indptr_host_); - NDArray commit_copy_src_dst_pos_in_page_table_view = + Tensor commit_copy_src_dst_pos_in_page_table_view = aux_data_manager_->CopyCommitSrcDstPosInPageTableAsync( &commit_copy_src_pos_in_page_table_host_, &commit_copy_dst_pos_in_page_table_host_); aux_data_manager_->CommitCompactKVAuxDataCopy(); @@ -1271,13 +1271,13 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { sequence->kv_transfer_metadata.local_position_map.end()); } - void AttentionWithFusedQKV(int64_t layer_id, NDArray qkv_data, Optional mask, - NDArray o_data, double sm_scale) final { + void AttentionWithFusedQKV(int64_t layer_id, Tensor qkv_data, Optional mask, + Tensor o_data, double sm_scale) final { // Part 1. Shape and dtype check. 
int64_t local_layer_id = layer_id - layer_id_begin_offset_; CHECK_GE(local_layer_id, 0); CHECK_LT(local_layer_id, num_layers_); - NDArray pages = pages_[local_layer_id]; + Tensor pages = pages_[local_layer_id]; CHECK(qkv_data.DataType() == pages.DataType()); CHECK(o_data.DataType() == pages.DataType()); CHECK(attn_kinds_[layer_id] == AttnKind::kMHA || @@ -1308,15 +1308,15 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { // The auxiliary data structure on device must have been synchronized. ICHECK(!dirty_aux_data_device_); - NDArray q_data = temp_attn_q_device_.CreateView({total_seq_length, num_qo_heads_, qk_head_dim_}, - qkv_data->dtype); - NDArray k_data = temp_attn_k_device_.CreateView({total_seq_length, num_kv_heads_, qk_head_dim_}, - qkv_data->dtype); - NDArray v_data = temp_attn_v_device_.CreateView({total_seq_length, num_kv_heads_, qk_head_dim_}, - qkv_data->dtype); + Tensor q_data = temp_attn_q_device_.CreateView({total_seq_length, num_qo_heads_, qk_head_dim_}, + qkv_data->dtype); + Tensor k_data = temp_attn_k_device_.CreateView({total_seq_length, num_kv_heads_, qk_head_dim_}, + qkv_data->dtype); + Tensor v_data = temp_attn_v_device_.CreateView({total_seq_length, num_kv_heads_, qk_head_dim_}, + qkv_data->dtype); - NDArray qkv_data_view = qkv_data; - NDArray o_data_view = o_data; + Tensor qkv_data_view = qkv_data; + Tensor o_data_view = o_data; if (total_seq_length != qkv_data->shape[0]) { qkv_data_view = qkv_data.CreateView( {total_seq_length, qkv_data->shape[1], qkv_data->shape[2]}, qkv_data->dtype); @@ -1372,13 +1372,13 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { } } - void SelfAttention(int64_t layer_id, NDArray q_data, NDArray k_data, NDArray v_data, - NDArray o_data, NDArray lse_data, double sm_scale) final { + void SelfAttention(int64_t layer_id, Tensor q_data, Tensor k_data, Tensor v_data, Tensor o_data, + Tensor lse_data, double sm_scale) final { // Shape and dtype check. int64_t local_layer_id = layer_id - layer_id_begin_offset_; CHECK_GE(local_layer_id, 0); CHECK_LT(local_layer_id, num_layers_); - NDArray pages = pages_[local_layer_id]; + Tensor pages = pages_[local_layer_id]; CHECK(q_data.DataType() == pages.DataType()); CHECK(k_data.DataType() == pages.DataType()); CHECK(v_data.DataType() == pages.DataType()); @@ -1415,13 +1415,13 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { } } - void CrossAttention(int64_t layer_id, NDArray q_data, NDArray o_data, NDArray lse_data, + void CrossAttention(int64_t layer_id, Tensor q_data, Tensor o_data, Tensor lse_data, double sm_scale) final { // Shape and dtype check. int64_t local_layer_id = layer_id - layer_id_begin_offset_; CHECK_GE(local_layer_id, 0); CHECK_LT(local_layer_id, num_layers_); - NDArray pages = pages_[local_layer_id]; + Tensor pages = pages_[local_layer_id]; CHECK(q_data.DataType() == pages.DataType()); CHECK(o_data.DataType() == pages.DataType()); AttnKind attn_kind = attn_kinds_[layer_id]; @@ -1455,12 +1455,12 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { } } - void AppendMLAKV(int64_t layer_id, NDArray kv_data) final { + void AppendMLAKV(int64_t layer_id, Tensor kv_data) final { // Shape and dtype check. 
int64_t local_layer_id = layer_id - layer_id_begin_offset_; CHECK_GE(local_layer_id, 0); CHECK_LT(local_layer_id, num_layers_); - NDArray pages = pages_[local_layer_id]; + Tensor pages = pages_[local_layer_id]; CHECK(kv_data.DataType() == pages.DataType()); CHECK(attn_kinds_[layer_id] == AttnKind::kMLA); @@ -1481,14 +1481,14 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { f_transpose_append_mla_.value()(pages_[local_layer_id], kv_data, append_position_map_view_); } - Array MergeAttnOutputInplace(NDArray o_self_attn, NDArray lse_self_attn, - NDArray o_cross_attn, NDArray lse_cross_attn) final { + Array MergeAttnOutputInplace(Tensor o_self_attn, Tensor lse_self_attn, + Tensor o_cross_attn, Tensor lse_cross_attn) final { CHECK_GE(f_merge_inplace_.size(), 2) << "The general attention merge function is not defined."; f_merge_inplace_[1](o_self_attn, lse_self_attn, o_cross_attn, lse_cross_attn); return {o_self_attn, lse_self_attn}; } - void LinearAttention(int64_t layer_id, NDArray q_data, NDArray k_data, NDArray v_data, + void LinearAttention(int64_t layer_id, Tensor q_data, Tensor k_data, Tensor v_data, double sm_scale) { // Todo(ruihang): implement it } @@ -1586,7 +1586,7 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { } } - NDArray GetQueryPositions() final { + Tensor GetQueryPositions() final { // Sync the copy stream and the compute stream. ComputeStreamWaitForCopyStream(); // The auxiliary data structure on device must have been synchronized. @@ -1594,8 +1594,8 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { return q_rope_position_map_view_; }; - void DebugGetKV(int64_t seq_id, int64_t start_pos, int64_t end_pos, NDArray k_data, - NDArray v_data) final { + void DebugGetKV(int64_t seq_id, int64_t start_pos, int64_t end_pos, Tensor k_data, + Tensor v_data) final { CHECK(f_debug_get_kv_.defined()) << "PageAttentionKVCache requires the `f_debug_get_kv` to be explicitly passed in when " "initialization. Please construct the KV cache with `f_debug_get_kv`."; @@ -1609,8 +1609,8 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { static constexpr const char* error_msg = "DebugGetKV expects the k_data in layout (num_layers, seq_length, num_kv_heads, " "qk_head_dim)."; - std::vector vec_kv_data = {&k_data, &v_data}; - for (const NDArray* data_ptr : vec_kv_data) { + std::vector vec_kv_data = {&k_data, &v_data}; + for (const Tensor* data_ptr : vec_kv_data) { CHECK_EQ((*data_ptr)->ndim, 4) << error_msg; CHECK_EQ((*data_ptr)->shape[0], num_layers_) << error_msg << " The number of layers mismatches."; @@ -1635,7 +1635,7 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { append_position_map.push_back(page_id * page_size_ + page_offset); } } - NDArray position_map_device = NDArray::Empty({end_pos - start_pos}, dtype_aux_, device_); + Tensor position_map_device = Tensor::Empty({end_pos - start_pos}, dtype_aux_, device_); position_map_device.CopyFromBytes( append_position_map.data() + start_pos, (end_pos - start_pos) * ((dtype_aux_.bits * dtype_aux_.lanes + 7) / 8)); @@ -1645,7 +1645,7 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { } } - void DebugGetKVMLA(int64_t seq_id, int64_t start_pos, int64_t end_pos, NDArray kv_data) final { + void DebugGetKVMLA(int64_t seq_id, int64_t start_pos, int64_t end_pos, Tensor kv_data) final { CHECK(f_debug_get_kv_.defined()) << "PageAttentionKVCache requires the `f_debug_get_kv` to be explicitly passed in when " "initialization. 
Please construct the KV cache with `f_debug_get_kv`."; @@ -1678,7 +1678,7 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { append_position_map.push_back(page_id * page_size_ + page_offset); } } - NDArray position_map_device = NDArray::Empty({end_pos - start_pos}, dtype_aux_, device_); + Tensor position_map_device = Tensor::Empty({end_pos - start_pos}, dtype_aux_, device_); position_map_device.CopyFromBytes( append_position_map.data() + start_pos, (end_pos - start_pos) * ((dtype_aux_.bits * dtype_aux_.lanes + 7) / 8)); @@ -1688,7 +1688,7 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { } } - void DebugSetKV(int64_t seq_id, int64_t start_pos, NDArray k_data, NDArray v_data) final { + void DebugSetKV(int64_t seq_id, int64_t start_pos, Tensor k_data, Tensor v_data) final { ICHECK(false) << "DebugSetKV for PageAttentionKVCache not implemented yet."; } @@ -2080,8 +2080,8 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { * \brief Compute attention for between the input q data and the * input k/v data and the k/v data in cache on the given layer. */ - void AttentionInternal(int64_t layer_id, NDArray q_data, NDArray k_data, NDArray v_data, - NDArray output, double sm_scale) { + void AttentionInternal(int64_t layer_id, Tensor q_data, Tensor k_data, Tensor v_data, + Tensor output, double sm_scale) { int64_t local_layer_id = layer_id - layer_id_begin_offset_; CHECK_GE(local_layer_id, 0); CHECK_LT(local_layer_id, num_layers_); @@ -2099,8 +2099,8 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { << "Both self-attention and cross-attention are not computed."; } - void MHASelfAttnInternal(NDArray q_data, NDArray k_data, NDArray v_data, NDArray o_data, - NDArray lse_data, double sm_scale) { + void MHASelfAttnInternal(Tensor q_data, Tensor k_data, Tensor v_data, Tensor o_data, + Tensor lse_data, double sm_scale) { if (is_chain_on_depths_[0]) { // If the batch does not form a tree, use raggedness prefill kernel. ICHECK_NOTNULL(f_attention_prefill_ragged_); @@ -2121,8 +2121,8 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { } } - void MLASelfAttnInternal(NDArray q_data, NDArray k_data, NDArray v_data, NDArray o_data, - NDArray lse_data, double sm_scale) { + void MLASelfAttnInternal(Tensor q_data, Tensor k_data, Tensor v_data, Tensor o_data, + Tensor lse_data, double sm_scale) { CHECK(is_chain_on_depths_[0]) << "Tree attn not able for MLA for now."; // If the batch does not form a tree, use raggedness prefill kernel. ICHECK_NOTNULL(f_attention_prefill_ragged_); @@ -2133,8 +2133,8 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { } /*! \brief Compute cross-attention for MHA. Return if there is effective computation. 
*/ - bool MHACrossAttnInternal(int64_t local_layer_id, NDArray q_data, NDArray o_data, - NDArray lse_data, double sm_scale, bool is_first_kernel) { + bool MHACrossAttnInternal(int64_t local_layer_id, Tensor q_data, Tensor o_data, Tensor lse_data, + double sm_scale, bool is_first_kernel) { std::unique_ptr& f_prefill = (!support_sliding_window_ && attn_kinds_[local_layer_id + layer_id_begin_offset_] != AttnKind::kMHASliding) @@ -2152,8 +2152,8 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { if (page_indices_on_depths_view_[d]->shape[0] == 0) { continue; } - NDArray attn_output; - NDArray attn_lse; + Tensor attn_output; + Tensor attn_lse; if (is_first_kernel) { attn_output = o_data; attn_lse = lse_data; @@ -2162,10 +2162,10 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { attn_lse = temp_attn_lse_view_; } // If layer is sliding window, use sliding window index pointer/indices - NDArray page_indptr; - NDArray page_indices; - NDArray length_info; - NDArray k_rope_pos; + Tensor page_indptr; + Tensor page_indices; + Tensor length_info; + Tensor k_rope_pos; double rotary_theta; double rotary_scale; @@ -2219,8 +2219,8 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { } /*! \brief Compute cross-attention for MLA. Return if there is effective computation. */ - bool MLACrossAttnInternal(int64_t local_layer_id, NDArray q_data, NDArray o_data, - NDArray lse_data, double sm_scale) { + bool MLACrossAttnInternal(int64_t local_layer_id, Tensor q_data, Tensor o_data, Tensor lse_data, + double sm_scale) { CHECK_GE(num_depths_, 1) << "The number of effective depths must be greater or equal to 1."; bool is_first_kernel = true; @@ -2228,8 +2228,8 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { if (page_indices_on_depths_view_[d]->shape[0] == 0) { continue; } - NDArray attn_output; - NDArray attn_lse; + Tensor attn_output; + Tensor attn_lse; if (is_first_kernel) { attn_output = o_data; attn_lse = lse_data; @@ -2259,7 +2259,7 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj { // If the auxiliary data is already synced, return and no need to sync again. return; } - // - Sync NDArrays to GPU. + // - Sync Tensors to GPU. SyncAuxArrayToDevice(); KernelBeginForward(); // - Clear the dirty flag. @@ -2463,8 +2463,8 @@ TVM_FFI_STATIC_INIT_BLOCK({ int rope_mode = args[8].cast(); double rotary_scale = args[9].cast(); double rotary_theta = args[10].cast(); - Optional rope_ext_factors = std::nullopt; // args[11] - NDArray init = args[12].cast(); + Optional rope_ext_factors = std::nullopt; // args[11] + Tensor init = args[12].cast(); Optional f_transpose_append_mha = std::nullopt; // args[13] Optional f_transpose_append_mla = std::nullopt; // args[14] std::unique_ptr f_attention_prefill_ragged = @@ -2489,7 +2489,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ ffi::Function f_debug_get_kv = args[26].cast(); ffi::Function f_compact_copy = args[27].cast(); - if (auto opt_nd = args[11].as()) { + if (auto opt_nd = args[11].as()) { rope_ext_factors = opt_nd.value(); } auto f_convert_optional_packed_func = [&args](int arg_idx) -> Optional { diff --git a/src/runtime/vm/rnn_state.cc b/src/runtime/vm/rnn_state.cc index 085860348e2f..76457dd0d113 100644 --- a/src/runtime/vm/rnn_state.cc +++ b/src/runtime/vm/rnn_state.cc @@ -78,9 +78,9 @@ class RNNStateImpObj : public RNNStateObj { const int64_t max_history_ = 1; /*! * \brief The init value for ALL layer in the storage. 
- * The array has `num_states_per_layer_` NDArrays + * The array has `num_states_per_layer_` Tensors */ - const Array init_layer_value_; + const Array init_layer_value_; /*! \brief We fix int32 to be the index dtype of auxiliary data. */ const DLDataType dtype_aux_ = DLDataType(DataType::Int(32, 1)); @@ -89,12 +89,12 @@ class RNNStateImpObj : public RNNStateObj { /*! * \brief The storages of space state models. - * The array has `num_layers * num_states_per_layer_` NDArrays, + * The array has `num_layers * num_states_per_layer_` Tensors, * each of them has layout `(num_seq, max_history, state_size)`. * \note As `num_states_per_layer_` may vary for different dtype and shape, - * we use a 2D array to store the NDArrays for each layer. + * we use a 2D array to store the Tensors for each layer. */ - Array> storages_; + Array> storages_; /*! \brief The list of ids of released seq slot for reuse. */ std::vector free_slot_ids_; /*! \brief The mapping from sequence ids to sequences. */ @@ -117,19 +117,19 @@ class RNNStateImpObj : public RNNStateObj { */ bool dirty_aux_data_device_ = false; /*! \brief The device array of the sequence ids. */ - NDArray seq_slot_ids_device_; + Tensor seq_slot_ids_device_; /*! * \brief The view of the device array of the sequence ids. * The view is used to reuse the memory but with different shape. */ - NDArray seq_slot_ids_view_; + Tensor seq_slot_ids_view_; /*! \brief The device array of the history slot ids. */ - NDArray history_slot_ids_device_; + Tensor history_slot_ids_device_; /*! * \brief The view of the device array of the history slot ids. * The view is used to reuse the memory but with different shape. */ - NDArray history_slot_ids_view_; + Tensor history_slot_ids_view_; /******************* Interaction Functions *******************/ @@ -144,7 +144,7 @@ class RNNStateImpObj : public RNNStateObj { /*! * \brief The function to set the state data to the storage. * The function signature is `f_set_(state, seq_slot_ids, history_slot_ids, data, max_history)`. - * where `state` is the storage NDArray, `seq_slot_ids` and `history_slot_ids` are + * where `state` is the storage Tensor, `seq_slot_ids` and `history_slot_ids` are * 1-D int32 arrays of the same length as the batch size, and `data` is the input data. * \note The `history_slot_ids` is the slot of this round, but we need to write to the * slot of the next round. @@ -154,14 +154,14 @@ class RNNStateImpObj : public RNNStateObj { Array f_sets_; public: - /*! \brief Constructor. Take the cache configuration and initialize the NDArrays. */ + /*! \brief Constructor. Take the cache configuration and initialize the Tensors. */ explicit RNNStateImpObj(int64_t num_layers, // int64_t reserved_num_seqs, // int64_t max_history, // DLDevice device, // Array f_gets, // Array f_sets, // - Array init_layer_value) + Array init_layer_value) : num_layers_(num_layers), reserved_num_seqs_(reserved_num_seqs), num_states_per_layer_(init_layer_value.size()), @@ -172,14 +172,14 @@ class RNNStateImpObj : public RNNStateObj { // Allocate the storage for the space state models. 
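The constructor that follows materializes exactly the storage layout documented above: one tensor per (layer, state) pair with shape (num_seq, max_history, *state_shape). Below is a minimal sketch, not part of the patch, of how a single (seq_slot, history_slot) entry is located inside such a tensor; the helper name and the raw pointer arithmetic are illustrative, and the in-tree addressing is done by GetStatePtrBySeqHistory further down in this file.

#include <cstdint>
#include <tvm/runtime/tensor.h>

// Returns a raw pointer to the state entry of one sequence slot at one history slot,
// for a storage tensor laid out as (num_seq, max_history, state...).
static void* StateEntryPtrSketch(const tvm::runtime::Tensor& storage, int64_t seq_slot_id,
                                 int64_t history_slot_id) {
  // Number of elements in one state entry (all dims after num_seq and max_history).
  int64_t state_size = 1;
  for (int64_t i = 2; i < storage->ndim; ++i) {
    state_size *= storage->shape[i];
  }
  int64_t elem_bytes = (storage->dtype.bits * storage->dtype.lanes + 7) / 8;
  // shape[1] is max_history, so entries of one sequence are contiguous per history slot.
  int64_t elem_offset = (seq_slot_id * storage->shape[1] + history_slot_id) * state_size;
  return static_cast<char*>(storage->data) + elem_offset * elem_bytes;
}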
storages_.reserve(num_layers_); for (int64_t layer_id = 0; layer_id < num_layers_; ++layer_id) { - Array layer_storages; + Array layer_storages; layer_storages.reserve(num_states_per_layer_); for (int64_t state_id = 0; state_id < num_states_per_layer_; ++state_id) { ffi::Shape state_shape = init_layer_value[state_id].Shape(); std::vector storage_shape = {reserved_num_seqs, max_history}; storage_shape.insert(storage_shape.end(), state_shape.begin(), state_shape.end()); - NDArray state_storage = - NDArray::Empty(storage_shape, init_layer_value[state_id].DataType(), device); + Tensor state_storage = + Tensor::Empty(storage_shape, init_layer_value[state_id].DataType(), device); layer_storages.push_back(state_storage); } storages_.push_back(layer_storages); @@ -188,8 +188,8 @@ class RNNStateImpObj : public RNNStateObj { CHECK_GT(max_history_, 0) << "At least 1 history slot to store the current state"; // Allocate the auxiliary arrays on device. - seq_slot_ids_device_ = NDArray::Empty({reserved_num_seqs}, dtype_aux_, device); - history_slot_ids_device_ = NDArray::Empty({reserved_num_seqs}, dtype_aux_, device); + seq_slot_ids_device_ = Tensor::Empty({reserved_num_seqs}, dtype_aux_, device); + history_slot_ids_device_ = Tensor::Empty({reserved_num_seqs}, dtype_aux_, device); Clear(); } @@ -259,7 +259,7 @@ class RNNStateImpObj : public RNNStateObj { dirty_aux_data_device_ = true; } - void Get(int64_t layer_id, int64_t state_id, NDArray o_data) final { + void Get(int64_t layer_id, int64_t state_id, Tensor o_data) final { // The auxiliary data structure on device must have been synchronized. CHECK(!dirty_aux_data_device_) << "The auxiliary arrays are not synchronized to device. Please call " @@ -269,11 +269,11 @@ class RNNStateImpObj : public RNNStateObj { CHECK_GT(cur_batch_size_, 0) << "The curent batch size should be greater than 0."; // TODO(siyuan): support zero-copy when seq_len is one // Copy the state data to the return array. - NDArray state = storages_[layer_id][state_id]; + Tensor state = storages_[layer_id][state_id]; f_gets_[state_id](state, seq_slot_ids_view_, history_slot_ids_view_, o_data); } - void Set(int64_t layer_id, int64_t state_id, NDArray data) final { + void Set(int64_t layer_id, int64_t state_id, Tensor data) final { // The auxiliary data structure on device must have been synchronized. CHECK(!dirty_aux_data_device_) << "The auxiliary arrays are not synchronized to device. 
Please call " @@ -282,24 +282,24 @@ class RNNStateImpObj : public RNNStateObj { << "The batch size is not consistent with the number of sequence ids."; CHECK_GT(cur_batch_size_, 0) << "The curent batch size should be greater than 0."; - NDArray state = storages_[layer_id][state_id]; + Tensor state = storages_[layer_id][state_id]; f_sets_[state_id](state, seq_slot_ids_view_, history_slot_ids_view_, data); } - NDArray DebugGet(int64_t layer_id, int64_t state_id, int64_t seq_id) { + Tensor DebugGet(int64_t layer_id, int64_t state_id, int64_t seq_id) { auto it = seq_map_.find(seq_id); CHECK(it != seq_map_.end()) << "The sequence \"" << seq_id << "\" cannot be found in the space state storage."; - NDArray state = storages_[layer_id][state_id]; + Tensor state = storages_[layer_id][state_id]; int64_t seq_slot_id = it->second.seq_slot_id; int64_t history_slot_id = it->second.history_slot_id; std::vector shape{state.Shape().begin() + 2, state.Shape().end()}; - NDArray result = NDArray::Empty(shape, state->dtype, state->device); + Tensor result = Tensor::Empty(shape, state->dtype, state->device); DLTensor copy_src = GetStatePtrBySeqHistory(layer_id, state_id, seq_slot_id, history_slot_id); DLTensor copy_dst = *result.operator->(); - NDArray::CopyFromTo(©_src, ©_dst); + Tensor::CopyFromTo(©_src, ©_dst); return result; } @@ -316,8 +316,8 @@ class RNNStateImpObj : public RNNStateObj { for (int64_t state_id = 0; state_id < num_states_per_layer_; ++state_id) { DLTensor dst = GetStatePtrBySeqHistory(layer_id, state_id, seq_slot_id, /*history_slot_id=*/0); - NDArray init = init_layer_value_[state_id]; - NDArray::CopyFromTo(init.operator->(), &dst); + Tensor init = init_layer_value_[state_id]; + Tensor::CopyFromTo(init.operator->(), &dst); } } @@ -352,7 +352,7 @@ class RNNStateImpObj : public RNNStateObj { for (int64_t state_id = 0; state_id < num_states_per_layer_; ++state_id) { DLTensor copy_src = GetStatePtrBySeq(layer_id, state_id, parent_slot_id); DLTensor copy_dst = GetStatePtrBySeq(layer_id, state_id, child_slot_id); - NDArray::CopyFromTo(©_src, ©_dst); + Tensor::CopyFromTo(©_src, ©_dst); } } dirty_aux_data_device_ = true; @@ -385,7 +385,7 @@ class RNNStateImpObj : public RNNStateObj { DLTensor GetStatePtrBySeqHistory(int64_t layer_id, int64_t state_id, int64_t seq_slot_id, int64_t history_slot_id) { - NDArray state = storages_[layer_id][state_id]; + Tensor state = storages_[layer_id][state_id]; int64_t state_size = 1; for (int64_t i = 2; i < state->ndim; ++i) { state_size *= state->shape[i]; @@ -401,7 +401,7 @@ class RNNStateImpObj : public RNNStateObj { } DLTensor GetStatePtrBySeq(int64_t layer_id, int64_t state_id, int64_t seq_slot_id) { - NDArray state = storages_[layer_id][state_id]; + Tensor state = storages_[layer_id][state_id]; int64_t state_size = 1; for (int64_t i = 1; i < state->ndim; ++i) { state_size *= state->shape[i]; @@ -422,7 +422,7 @@ class RNNStateImpObj : public RNNStateObj { * invoked before running attention computation on device. 
*/ void SyncAuxArrayToDevice() { - auto fcopy_from_vec = [](NDArray array, std::vector vec_data) { + auto fcopy_from_vec = [](Tensor array, std::vector vec_data) { DLTensor copy_dst = *array.operator->(); DLTensor copy_src; copy_src.data = vec_data.data(); @@ -432,7 +432,7 @@ class RNNStateImpObj : public RNNStateObj { copy_src.shape = array->shape; copy_src.strides = array->strides; copy_src.byte_offset = 0; - NDArray::CopyFromTo(©_src, ©_dst); + Tensor::CopyFromTo(©_src, ©_dst); }; std::vector seq_slot_ids; @@ -473,14 +473,14 @@ TVM_FFI_STATIC_INIT_BLOCK({ int64_t max_history, // Array f_gets, // Array f_sets, // - Array init_layer_value) { + Array init_layer_value) { CHECK_GT(num_layers, 0) << "The number of layers should be greater than 0."; CHECK_GT(reserved_num_seqs, 0) << "The number of reserved sequences should be greater than 0."; CHECK_GE(max_history, 0) << "The maximum history length should be greater or equal than 0."; CHECK_GT(init_layer_value.size(), 0) << "The number of states per layer should be greater than 0."; Device device = init_layer_value[0]->device; - for (const NDArray& state : init_layer_value) { + for (const Tensor& state : init_layer_value) { CHECK(state->device.device_type == device.device_type && state->device.device_id == device.device_id) << "The device type of all states should be the same."; diff --git a/src/runtime/vm/ndarray_cache_support.cc b/src/runtime/vm/tensor_cache_support.cc similarity index 74% rename from src/runtime/vm/ndarray_cache_support.cc rename to src/runtime/vm/tensor_cache_support.cc index cfd979cc6f24..cff92994e41f 100644 --- a/src/runtime/vm/ndarray_cache_support.cc +++ b/src/runtime/vm/tensor_cache_support.cc @@ -17,17 +17,17 @@ * under the License. */ /*! - * \file src/runtime/vm/ndarray_cache_support.cc - * \brief Runtime to support ndarray cache file loading. + * \file src/runtime/vm/tensor_cache_support.cc + * \brief Runtime to support tensor cache file loading. * - * This file provides a minimum support for ndarray cache file loading. + * This file provides a minimum support for tensor cache file loading. * * The main focus of this implementation is to enable loading * with minimum set of intermediate files while also being * compatible to some of the multi-shard files that are more * friendly in some of the environments. * - * NDArray cache also provides a way to do system-wide + * Tensor cache also provides a way to do system-wide * parameter sharing across multiple VMs. * * There are likely other ways to load the parameters ndarray-ache. 
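For reference, the cache implemented in this file is driven entirely through the packed functions registered near the bottom of the file. The following is a minimal usage sketch under stated assumptions: the builtins keep the argument orders shown in the registrations below, while the lookup helper tvm::ffi::Function::GetGlobal, the cache path, and the parameter name "param_0" are illustrative and not taken from the patch.

#include <string>
#include <tvm/ffi/function.h>
#include <tvm/runtime/tensor.h>

tvm::runtime::Tensor LoadOneParamSketch(const std::string& cache_path) {
  using tvm::ffi::Function;
  // Reads <cache_path>/tensor-cache.json and copies every recorded shard to CPU device 0.
  auto load = Function::GetGlobal("vm.builtin.tensor_cache.load");
  (*load)(cache_path, static_cast<int>(kDLCPU), 0);
  // Parameters are then shared process-wide and can be fetched back by name,
  // assuming the requested name is present in the cache.
  auto get = Function::GetGlobal("vm.builtin.tensor_cache.get");
  return (*get)("param_0").cast<tvm::runtime::Tensor>();
}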
@@ -41,8 +41,8 @@ #include #include #include -#include -#include +#include +#include #include #include @@ -65,7 +65,7 @@ inline ValueType GetValue(const picojson::object& json, const std::string& key) return AsType(json.at(key)); } -NDArrayCacheMetadata::FileRecord::ParamRecord JSONAsParamRecord(const picojson::object& json) { +TensorCacheMetadata::FileRecord::ParamRecord JSONAsParamRecord(const picojson::object& json) { std::vector shape; { picojson::array shape_json = GetValue(json, "shape"); @@ -74,7 +74,7 @@ NDArrayCacheMetadata::FileRecord::ParamRecord JSONAsParamRecord(const picojson:: shape.push_back(AsType(d)); } } - NDArrayCacheMetadata::FileRecord::ParamRecord result; + TensorCacheMetadata::FileRecord::ParamRecord result; std::string dtype = GetValue(json, "dtype"); result.name = GetValue(json, "name"); result.dtype = DataType(StringToDLDataType(dtype)); @@ -85,9 +85,9 @@ NDArrayCacheMetadata::FileRecord::ParamRecord JSONAsParamRecord(const picojson:: return result; } -NDArrayCacheMetadata::FileRecord JSONAsFileRecord(const picojson::object& json) { +TensorCacheMetadata::FileRecord JSONAsFileRecord(const picojson::object& json) { picojson::array records = GetValue(json, "records"); - NDArrayCacheMetadata::FileRecord result; + TensorCacheMetadata::FileRecord result; result.data_path = GetValue(json, "dataPath"); result.format = GetValue(json, "format"); result.nbytes = GetValue(json, "nbytes"); @@ -98,9 +98,9 @@ NDArrayCacheMetadata::FileRecord JSONAsFileRecord(const picojson::object& json) return result; } -NDArrayCacheMetadata JSONAsNDArrayCacheMetadata(const picojson::object& json) { +TensorCacheMetadata JSONAsTensorCacheMetadata(const picojson::object& json) { picojson::array records = GetValue(json, "records"); - NDArrayCacheMetadata result; + TensorCacheMetadata result; result.records.reserve(records.size()); for (const picojson::value& item : records) { result.records.push_back(JSONAsFileRecord(AsType(item))); @@ -108,8 +108,8 @@ NDArrayCacheMetadata JSONAsNDArrayCacheMetadata(const picojson::object& json) { return result; } -NDArrayCacheMetadata NDArrayCacheMetadata::LoadFromStr(const std::string& json_str, - const std::string& path) { +TensorCacheMetadata TensorCacheMetadata::LoadFromStr(const std::string& json_str, + const std::string& path) { picojson::value json_info; { std::string err = picojson::parse(json_info, json_str); @@ -119,16 +119,16 @@ NDArrayCacheMetadata NDArrayCacheMetadata::LoadFromStr(const std::string& json_s CHECK(json_info.is()) << "ValueError: The given string is not a JSON object: " << json_str; } - NDArrayCacheMetadata result = JSONAsNDArrayCacheMetadata(AsType(json_info)); + TensorCacheMetadata result = JSONAsTensorCacheMetadata(AsType(json_info)); result.path = path; return result; } -TVM_DLL NDArrayCacheMetadata NDArrayCacheMetadata::Load(const std::string& path) { +TVM_DLL TensorCacheMetadata TensorCacheMetadata::Load(const std::string& path) { picojson::value json_info; { std::string json_str; - LoadBinaryFromFile(path + "/ndarray-cache.json", &json_str); + LoadBinaryFromFile(path + "/tensor-cache.json", &json_str); std::string err = picojson::parse(json_info, json_str); if (!err.empty()) { LOG(FATAL) << "Failed to parse JSON: err. 
The JSON string is:" << json_str; @@ -136,13 +136,13 @@ TVM_DLL NDArrayCacheMetadata NDArrayCacheMetadata::Load(const std::string& path) CHECK(json_info.is()) << "ValueError: The given string is not a JSON object: " << json_str; } - NDArrayCacheMetadata result = JSONAsNDArrayCacheMetadata(AsType(json_info)); + TensorCacheMetadata result = JSONAsTensorCacheMetadata(AsType(json_info)); result.path = path; return result; } -void CopyNDArrayFromBytes(NDArray param, const void* data, size_t nbytes, - Optional* staging_buffer) { +void CopyTensorFromBytes(Tensor param, const void* data, size_t nbytes, + Optional* staging_buffer) { Device device = param->device; if (device.device_type != kDLOpenCL || staging_buffer == nullptr) { param.CopyFromBytes(data, nbytes); @@ -158,17 +158,18 @@ void CopyNDArrayFromBytes(NDArray param, const void* data, size_t nbytes, } } if (!staging_buffer->defined()) { - *staging_buffer = NDArray::Empty(param.Shape(), param->dtype, param->device); + *staging_buffer = Tensor::Empty(param.Shape(), param->dtype, param->device); } - NDArray staging_view = staging_buffer->value().CreateView(param.Shape(), param->dtype); + Tensor staging_view = staging_buffer->value().CreateView(param.Shape(), param->dtype); staging_view.CopyFromBytes(data, nbytes); param.CopyFrom(staging_view); DeviceAPI::Get(device)->StreamSync(device, nullptr); } -NDArray NDArrayCacheMetadata::FileRecord::ParamRecord::Load( - Device device, const std::string* raw_data, Optional* staging_buffer) const { - NDArray arr = NDArray::Empty(shape, dtype, device); +Tensor TensorCacheMetadata::FileRecord::ParamRecord::Load(Device device, + const std::string* raw_data, + Optional* staging_buffer) const { + Tensor arr = Tensor::Empty(shape, dtype, device); if (dtype == DataType::Float(32) && format == "f32-to-bf16") { // decode bf16 to f32 std::vector buffer(nbytes / 2); @@ -177,24 +178,24 @@ NDArray NDArrayCacheMetadata::FileRecord::ParamRecord::Load( for (size_t i = 0; i < buffer.size(); ++i) { decoded[i] = static_cast(buffer[i]) << 16; } - CopyNDArrayFromBytes(arr, decoded.data(), decoded.size() * sizeof(uint32_t), staging_buffer); + CopyTensorFromBytes(arr, decoded.data(), decoded.size() * sizeof(uint32_t), staging_buffer); } else { - CopyNDArrayFromBytes(arr, raw_data->data() + byte_offset, nbytes, staging_buffer); + CopyTensorFromBytes(arr, raw_data->data() + byte_offset, nbytes, staging_buffer); } return arr; } -TVM_DLL Array NDArrayCacheMetadata::FileRecord::Load( +TVM_DLL Array TensorCacheMetadata::FileRecord::Load( Device device, const std::string& path_prefix, // std::string* raw_data_buffer, // - Optional* staging_buffer) const { + Optional* staging_buffer) const { LoadBinaryFromFile(path_prefix + "/" + this->data_path, raw_data_buffer); CHECK_EQ(this->format, "raw-shard") << "ValueError: Only `raw-shard` format is supported"; CHECK_EQ(this->nbytes, raw_data_buffer->length()) << "ValueError: Encountered an corrupted parameter shard. It means it is not downloaded " "completely or downloading is interrupted. Please try to download again."; - Array result; + Array result; result.reserve(this->records.size()); for (const ParamRecord& nd_rec : this->records) { result.push_back(nd_rec.Load(device, raw_data_buffer, staging_buffer)); @@ -203,25 +204,25 @@ TVM_DLL Array NDArrayCacheMetadata::FileRecord::Load( } /*! - * A NDArray cache to store pre-loaded arrays in the system. + * A Tensor cache to store pre-loaded arrays in the system. 
*/ -class NDArrayCache { +class TensorCache { public: - static NDArrayCache* Global() { - static NDArrayCache* inst = new NDArrayCache(); + static TensorCache* Global() { + static TensorCache* inst = new TensorCache(); return inst; } - static void Update(String name, NDArray arr, bool override) { - NDArrayCache* pool = Global(); + static void Update(String name, Tensor arr, bool override) { + TensorCache* pool = Global(); if (!override) { ICHECK_EQ(pool->pool_.count(name), 0) << "Name " << name << " already exists in the cache"; } pool->pool_.Set(name, arr); } - static Optional Get(String name) { - NDArrayCache* pool = Global(); + static Optional Get(String name) { + TensorCache* pool = Global(); auto it = pool->pool_.find(name); if (it != pool->pool_.end()) { return (*it).second; @@ -231,7 +232,7 @@ class NDArrayCache { } static void Remove(String name) { - NDArrayCache* pool = Global(); + TensorCache* pool = Global(); pool->pool_.erase(name); } @@ -245,11 +246,11 @@ class NDArrayCache { */ static void Load(const std::string& cache_path, int device_type, int device_id) { DLDevice device{static_cast(device_type), device_id}; - NDArrayCacheMetadata metadata = NDArrayCacheMetadata::Load(cache_path); - Optional staging_buffer; + TensorCacheMetadata metadata = TensorCacheMetadata::Load(cache_path); + Optional staging_buffer; std::string raw_data; - Array params; - for (const NDArrayCacheMetadata::FileRecord& shard_rec : metadata.records) { + Array params; + for (const TensorCacheMetadata::FileRecord& shard_rec : metadata.records) { try { params = shard_rec.Load(device, cache_path, &raw_data, &staging_buffer); } catch (const dmlc::Error& e) { @@ -264,40 +265,40 @@ class NDArrayCache { } private: - Map pool_; + Map pool_; }; TVM_FFI_STATIC_INIT_BLOCK({ namespace refl = tvm::ffi::reflection; refl::GlobalDef() - .def("vm.builtin.ndarray_cache.get", NDArrayCache::Get) - .def_packed("vm.builtin.ndarray_cache.update", + .def("vm.builtin.tensor_cache.get", TensorCache::Get) + .def_packed("vm.builtin.tensor_cache.update", [](ffi::PackedArgs args, ffi::Any* rv) { CHECK(args.size() == 2 || args.size() == 3); String name = args[0].cast(); bool is_override = args.size() == 2 ? 
false : args[2].cast(); - NDArray arr; - if (auto opt_nd = args[1].as()) { + Tensor arr; + if (auto opt_nd = args[1].as()) { arr = opt_nd.value(); } else { - // We support converting DLTensors to NDArrays as RPC references are always + // We support converting DLTensors to Tensors as RPC references are always // DLTensors auto tensor = args[1].cast(); std::vector shape; for (int64_t i = 0; i < tensor->ndim; i++) { shape.push_back(tensor->shape[i]); } - arr = NDArray::Empty(shape, tensor->dtype, tensor->device); + arr = Tensor::Empty(shape, tensor->dtype, tensor->device); arr.CopyFrom(tensor); DeviceAPI::Get(arr->device)->StreamSync(arr->device, nullptr); } - NDArrayCache::Update(name, arr, is_override); + TensorCache::Update(name, arr, is_override); }) - .def("vm.builtin.ndarray_cache.remove", NDArrayCache::Remove) - .def("vm.builtin.ndarray_cache.clear", NDArrayCache::Clear) - .def("vm.builtin.ndarray_cache.load", NDArrayCache::Load); + .def("vm.builtin.tensor_cache.remove", TensorCache::Remove) + .def("vm.builtin.tensor_cache.clear", TensorCache::Clear) + .def("vm.builtin.tensor_cache.load", TensorCache::Load); }); // This param module node can be useful to get param dict in RPC mode @@ -315,11 +316,11 @@ class ParamModuleNode : public ffi::ModuleObj { } } - static Array GetParams(const String& prefix, int num_params) { - Array params; + static Array GetParams(const String& prefix, int num_params) { + Array params; for (int i = 0; i < num_params || num_params == -1; ++i) { std::string name = prefix + "_" + std::to_string(i); - auto opt = NDArrayCache::Get(name); + auto opt = TensorCache::Get(name); if (opt) { params.push_back(opt.value()); } else { @@ -330,11 +331,11 @@ class ParamModuleNode : public ffi::ModuleObj { return params; } - static Array GetParamByName(const Array& names) { - Array result; + static Array GetParamByName(const Array& names) { + Array result; result.reserve(names.size()); for (const String& name : names) { - if (Optional opt = NDArrayCache::Get(name)) { + if (Optional opt = TensorCache::Get(name)) { result.push_back(opt.value()); } else { LOG(FATAL) << "ValueError: Cannot find parameter in cache: " << name; @@ -356,7 +357,7 @@ class ParamModuleNode : public ffi::ModuleObj { } private: - Array params_; + Array params_; }; TVM_FFI_STATIC_INIT_BLOCK({ diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc index c4fdedd815a9..149948fb0ecf 100644 --- a/src/runtime/vm/vm.cc +++ b/src/runtime/vm/vm.cc @@ -84,7 +84,7 @@ ffi::Any IndexIntoNestedObject(ffi::Any obj, ffi::PackedArgs args, int starting_ return obj; } -NDArray ConvertNDArrayToDevice(NDArray src, const DLDevice& dev, Allocator* alloc) { +Tensor ConvertTensorToDevice(Tensor src, const DLDevice& dev, Allocator* alloc) { if (src->device.device_type == dev.device_type && src->device.device_id == dev.device_id) { return src; } else { @@ -95,8 +95,8 @@ NDArray ConvertNDArrayToDevice(NDArray src, const DLDevice& dev, Allocator* allo } Any ConvertObjectToDevice(Any src, const Device& dev, Allocator* alloc) { - if (src.as()) { - return ConvertNDArrayToDevice(src.cast(), dev, alloc); + if (src.as()) { + return ConvertTensorToDevice(src.cast(), dev, alloc); } else if (src.as()) { std::vector ret; auto arr = src.cast>(); @@ -112,8 +112,8 @@ Any ConvertObjectToDevice(Any src, const Device& dev, Allocator* alloc) { ffi::Any ConvertArgToDevice(ffi::AnyView input, Device dev, Allocator* alloc) { // in terms of memory-behavior. // To be extra careful, we copy DLTensor. 
- // The developer can still explicitly allocate NDArray - // in TVM Native API or NDArray::FromDLPack to regain zero copy behavior. + // The developer can still explicitly allocate Tensor + // in TVM Native API or Tensor::FromDLPack to regain zero copy behavior. ffi::Any ret; if (auto opt_obj = input.as()) { ret = ConvertObjectToDevice(opt_obj.value(), dev, alloc); @@ -245,7 +245,7 @@ class VirtualMachineImpl : public VirtualMachine { * correct device for the function, they will be copied to the device. * \param with_param_module If set to true, the last argument will be a module and can be invoked * to get the argument, this is mainly used for debugging purposes and setting composite - * objects. \note This interface works when using VM over RPC by internally converting NDArray in + * objects. \note This interface works when using VM over RPC by internally converting Tensor in * the arguments to DLTensor, which is supported in RPC where remote could only have a minimal C * runtime. */ @@ -470,7 +470,7 @@ void VirtualMachineImpl::Init(const std::vector& devices, // Setup constant sections. this->const_pool_.reserve(exec_->constants.size()); for (const auto& constant : exec_->constants) { - if (auto opt_nd = constant.as()) { + if (auto opt_nd = constant.as()) { this->const_pool_.push_back(ConvertRegToDevice(opt_nd.value(), devices[0], allocators[0])); } else { this->const_pool_.push_back(constant); @@ -1029,11 +1029,11 @@ class VirtualMachineProfiler : public VirtualMachineImpl { if (prof_ && prof_->IsRunning()) { auto f_name = GetFuncName(inst.func_idx); std::optional dev; - std::vector arrs; + std::vector arrs; - auto f_check_ndarray_arg = [&dev, &arrs](const RegType& arg) { - if (auto opt_nd = arg.as()) { - NDArray arr = opt_nd.value(); + auto f_check_tensor_arg = [&dev, &arrs](const RegType& arg) { + if (auto opt_nd = arg.as()) { + Tensor arr = opt_nd.value(); if (arr.defined()) { dev = arr->device; arrs.push_back(arr); @@ -1045,10 +1045,10 @@ class VirtualMachineProfiler : public VirtualMachineImpl { Instruction::Arg arg = inst.args[i]; if (arg.kind() == Instruction::ArgKind::kRegister) { auto reg = ReadRegister(curr_frame, arg.value()); - f_check_ndarray_arg(reg); + f_check_tensor_arg(reg); } else if (arg.kind() == Instruction::ArgKind::kConstIdx) { const auto& const_val = this->const_pool_[arg.value()]; - f_check_ndarray_arg(const_val); + f_check_tensor_arg(const_val); } } diff --git a/src/script/ir_builder/tir/ir.cc b/src/script/ir_builder/tir/ir.cc index 33a687f54bc4..06790ad4fab3 100644 --- a/src/script/ir_builder/tir/ir.cc +++ b/src/script/ir_builder/tir/ir.cc @@ -507,8 +507,8 @@ AllocateFrame Allocate(Array extents, DataType dtype, String storage_s return AllocateFrame(n); } -AllocateConstFrame AllocateConst(tvm::runtime::NDArray data, DataType dtype, - Array extents, Optional> annotations) { +AllocateConstFrame AllocateConst(tvm::runtime::Tensor data, DataType dtype, Array extents, + Optional> annotations) { ObjectPtr n = make_object(); n->dtype = dtype; n->extents = extents; diff --git a/src/script/printer/relax/expr.cc b/src/script/printer/relax/expr.cc index c411622e6409..903aef5a697e 100644 --- a/src/script/printer/relax/expr.cc +++ b/src/script/printer/relax/expr.cc @@ -79,7 +79,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) return Relax(d, "shape")->Call({ListDoc(values_doc)}); }); -Optional SpecialScalar(const runtime::NDArray& n, const AccessPath& p) { +Optional SpecialScalar(const runtime::Tensor& n, const AccessPath& p) { DataType dtype = n.DataType(); const void* 
data = n->data; if (n->ndim != 0 || n->device.device_type != kDLCPU) { diff --git a/src/script/printer/tir/stmt.cc b/src/script/printer/tir/stmt.cc index 5a52de1849f1..14acff77bed8 100644 --- a/src/script/printer/tir/stmt.cc +++ b/src/script/printer/tir/stmt.cc @@ -252,7 +252,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) }); template -ExprDoc PrintNDArray(::tvm::runtime::NDArray arr) { +ExprDoc PrintTensor(::tvm::runtime::Tensor arr) { // FIXME(@junrushao): this is a hack and can be wrong in most of the cases constexpr int NUM_PRINT = 200; int ndim = arr->ndim; @@ -287,35 +287,35 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) ExprDoc data_doc{nullptr}; if (stmt->dtype.is_int()) { if (stmt->dtype.bits() == 8) { - data_doc = PrintNDArray(stmt->data.value()); + data_doc = PrintTensor(stmt->data.value()); } else if (stmt->dtype.bits() == 16) { - data_doc = PrintNDArray(stmt->data.value()); + data_doc = PrintTensor(stmt->data.value()); } else if (stmt->dtype.bits() == 32) { - data_doc = PrintNDArray(stmt->data.value()); + data_doc = PrintTensor(stmt->data.value()); } else if (stmt->dtype.bits() == 64) { - data_doc = PrintNDArray(stmt->data.value()); + data_doc = PrintTensor(stmt->data.value()); } else { LOG(FATAL) << "DataType not supported"; } } else if (stmt->dtype.is_uint()) { if (stmt->dtype.bits() == 8) { - data_doc = PrintNDArray(stmt->data.value()); + data_doc = PrintTensor(stmt->data.value()); } else if (stmt->dtype.bits() == 16) { - data_doc = PrintNDArray(stmt->data.value()); + data_doc = PrintTensor(stmt->data.value()); } else if (stmt->dtype.bits() == 32) { - data_doc = PrintNDArray(stmt->data.value()); + data_doc = PrintTensor(stmt->data.value()); } else if (stmt->dtype.bits() == 64) { - data_doc = PrintNDArray(stmt->data.value()); + data_doc = PrintTensor(stmt->data.value()); } else { LOG(FATAL) << "DataType not supported"; } } else if (stmt->dtype.is_float()) { if (stmt->dtype.bits() == 16) { - data_doc = PrintNDArray(stmt->data.value()); + data_doc = PrintTensor(stmt->data.value()); } else if (stmt->dtype.bits() == 32) { - data_doc = PrintNDArray(stmt->data.value()); + data_doc = PrintTensor(stmt->data.value()); } else if (stmt->dtype.bits() == 64) { - data_doc = PrintNDArray(stmt->data.value()); + data_doc = PrintTensor(stmt->data.value()); } else { LOG(FATAL) << "DataType not supported"; } diff --git a/src/support/scalars.cc b/src/support/scalars.cc index b2581ecb3c99..692746852694 100644 --- a/src/support/scalars.cc +++ b/src/support/scalars.cc @@ -19,7 +19,7 @@ /*! * \file src/support/scalars.cc - * \brief Helpers for converting between scalars in native, text, TIR immediate and NDArray forms. + * \brief Helpers for converting between scalars in native, text, TIR immediate and Tensor forms. 
*/ #include "./scalars.h" @@ -38,9 +38,9 @@ static const DataType kFloat32 = DataType::Float(32); static const DataType kFloat64 = DataType::Float(64); static const DataType kBool = DataType::Bool(); -runtime::NDArray IntImmToNDArray(const IntImm& int_imm) { +runtime::Tensor IntImmToTensor(const IntImm& int_imm) { DLDevice dev = {DLDeviceType::kDLCPU, 0}; - auto data = runtime::NDArray::Empty({}, int_imm->dtype, dev); + auto data = runtime::Tensor::Empty({}, int_imm->dtype, dev); if (int_imm.dtype() == kInt16) { auto* array = reinterpret_cast(data->data); array[0] = static_cast(int_imm->value); @@ -56,9 +56,9 @@ runtime::NDArray IntImmToNDArray(const IntImm& int_imm) { return data; } -runtime::NDArray FloatImmToNDArray(const FloatImm& float_imm) { +runtime::Tensor FloatImmToTensor(const FloatImm& float_imm) { DLDevice dev = {DLDeviceType::kDLCPU, 0}; - auto data = runtime::NDArray::Empty({}, float_imm->dtype, dev); + auto data = runtime::Tensor::Empty({}, float_imm->dtype, dev); if (float_imm.dtype() == kFloat16) { auto* array = reinterpret_cast(data->data); array[0] = __gnu_f2h_ieee(static_cast(float_imm->value)); @@ -74,15 +74,15 @@ runtime::NDArray FloatImmToNDArray(const FloatImm& float_imm) { return data; } -runtime::NDArray BoolToNDArray(bool value) { +runtime::Tensor BoolToTensor(bool value) { DLDevice dev = {DLDeviceType::kDLCPU, 0}; - auto data = runtime::NDArray::Empty({}, kBool, dev); + auto data = runtime::Tensor::Empty({}, kBool, dev); auto array = reinterpret_cast(data->data); array[0] = value; return data; } -std::string NDArrayScalarToString(const runtime::NDArray& data) { +std::string TensorScalarToString(const runtime::Tensor& data) { std::ostringstream os; DataType dtype(data->dtype); ICHECK_EQ(data->device.device_type, kDLCPU) << "Scalars must reside on the CPU to be printed"; @@ -108,7 +108,7 @@ std::string NDArrayScalarToString(const runtime::NDArray& data) { auto value = static_cast(data->data)[0]; os << (value ? "True" : "False"); } else { - LOG(FATAL) << "Unrecognized NDArray scalar dtype: " << DLDataTypeToString(dtype); + LOG(FATAL) << "Unrecognized Tensor scalar dtype: " << DLDataTypeToString(dtype); } return os.str(); } diff --git a/src/support/scalars.h b/src/support/scalars.h index d9f2d7c54316..fa5a3482f5f6 100644 --- a/src/support/scalars.h +++ b/src/support/scalars.h @@ -19,7 +19,7 @@ /*! * \file src/support/scalars.h - * \brief Helpers for converting between scalars in native, text, TIR immediate and NDArray forms. + * \brief Helpers for converting between scalars in native, text, TIR immediate and Tensor forms. */ #ifndef TVM_SUPPORT_SCALARS_H_ @@ -28,18 +28,18 @@ #include #include "tvm/ir/expr.h" -#include "tvm/runtime/ndarray.h" +#include "tvm/runtime/tensor.h" namespace tvm { namespace support { -/*! \brief Returns NDArray 'scalar' for given TIR immediate. */ -runtime::NDArray IntImmToNDArray(const IntImm& int_imm); -runtime::NDArray FloatImmToNDArray(const FloatImm& float_imm); -runtime::NDArray BoolToNDArray(bool value); +/*! \brief Returns Tensor 'scalar' for given TIR immediate. */ +runtime::Tensor IntImmToTensor(const IntImm& int_imm); +runtime::Tensor FloatImmToTensor(const FloatImm& float_imm); +runtime::Tensor BoolToTensor(bool value); -/*! \brief Returns literal text for NDArray 'scalar'. */ -std::string NDArrayScalarToString(const runtime::NDArray& data); +/*! \brief Returns literal text for Tensor 'scalar'. */ +std::string TensorScalarToString(const runtime::Tensor& data); /*! \brief Returns literal text for given TIR immediate. 
*/ std::string IntImmToString(const IntImm& int_imm); diff --git a/src/target/codegen.cc b/src/target/codegen.cc index bd45ce32e053..b452c26ca96d 100644 --- a/src/target/codegen.cc +++ b/src/target/codegen.cc @@ -352,7 +352,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ refl::GlobalDef() .def("runtime.ModuleImportsBlobName", []() -> std::string { return ffi::symbol::tvm_ffi_library_bin; }) - .def("runtime.ModulePackImportsToNDArray", + .def("runtime.ModulePackImportsToTensor", [](const ffi::Module& mod) { std::string buffer = PackImportsToBytes(mod); ffi::Shape::index_type size = buffer.size(); @@ -363,7 +363,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ DLDevice dev; dev.device_type = kDLCPU; dev.device_id = 0; - auto array = runtime::NDArray::Empty({size}, uchar, dev); + auto array = runtime::Tensor::Empty({size}, uchar, dev); array.CopyFromBytes(buffer.data(), size); return array; }) diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index ac73c9c3fccb..bb4a76bc19c9 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -2041,7 +2041,7 @@ void CodeGenLLVM::VisitStmt_(const IfThenElseNode* op) { void CodeGenLLVM::VisitStmt_(const AllocateConstNode* op) { EmitDebugLocation(op); auto data = op->data.value(); - auto array = NDArrayToLLVMArray(llvm_target_->GetContext(), data); + auto array = TensorToLLVMArray(llvm_target_->GetContext(), data); std::string symbol_name = op->buffer_var->name_hint; llvm::GlobalVariable* param_symbol = new llvm::GlobalVariable( *module_, array->getType(), true, llvm::GlobalValue::InternalLinkage, array, symbol_name); diff --git a/src/target/llvm/codegen_params.cc b/src/target/llvm/codegen_params.cc index 81ed4462318f..e2e5323445c8 100644 --- a/src/target/llvm/codegen_params.cc +++ b/src/target/llvm/codegen_params.cc @@ -70,7 +70,7 @@ void BuildLLVMVector(llvm::Type* element_type, void* tensor_data, size_t num_ele [&](T t) { return LLVMConstantGetter::getElement(element_type, t); }); } -llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime::NDArray arr) { +llvm::ConstantArray* TensorToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime::Tensor arr) { llvm::Type* element_type = nullptr; auto arr_type = arr.DataType(); diff --git a/src/target/llvm/codegen_params.h b/src/target/llvm/codegen_params.h index 9d05621469a7..b59630fb6150 100644 --- a/src/target/llvm/codegen_params.h +++ b/src/target/llvm/codegen_params.h @@ -24,7 +24,7 @@ #ifndef TVM_TARGET_LLVM_CODEGEN_PARAMS_H_ #define TVM_TARGET_LLVM_CODEGEN_PARAMS_H_ -#include +#include namespace llvm { class ConstantArray; @@ -35,15 +35,15 @@ namespace tvm { namespace codegen { /*! - * \brief Convert an NDArray to an LLVM array of constants. + * \brief Convert an Tensor to an LLVM array of constants. * - * The supplied NDArray is flattened, and each element is converted to the appropriate LLVM type. + * The supplied Tensor is flattened, and each element is converted to the appropriate LLVM type. * * \param ctx LLVM context used to create the various primitive datatypes. - * \param arr NDArray to convert. + * \param arr Tensor to convert. * \return LLVM array containing the array data. 
*/ -llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, tvm::runtime::NDArray arr); +llvm::ConstantArray* TensorToLLVMArray(llvm::LLVMContext* ctx, tvm::runtime::Tensor arr); } // namespace codegen } // namespace tvm diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc index 65c57cf882b4..49b444e49516 100644 --- a/src/target/source/codegen_c.cc +++ b/src/target/source/codegen_c.cc @@ -778,7 +778,7 @@ void CodeGenC::VisitStmt_(const AllocateConstNode* op) { decl_stream << " __attribute__((section(\".rodata.tvm\"), " << "aligned(" << constants_byte_alignment_->value << "))) " << symbol_name << "[" << num_elements << "] = {\n"; - NDArrayDataToC(data, 4, decl_stream); + TensorDataToC(data, 4, decl_stream); decl_stream << "};\n" << "#ifdef __cplusplus\n" diff --git a/src/target/source/codegen_params.cc b/src/target/source/codegen_params.cc index cd2bcd769c04..d840ebec7df3 100644 --- a/src/target/source/codegen_params.cc +++ b/src/target/source/codegen_params.cc @@ -160,8 +160,8 @@ void PrintFloatingPointArray(void* data, size_t num_elements, int indent_chars, } } -void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& os, - const std::string& eol) { +void TensorDataToC(::tvm::runtime::Tensor arr, int indent_chars, std::ostream& os, + const std::string& eol) { auto arr_type = arr.DataType(); CHECK_EQ(arr_type.lanes(), 1) << "CodegenParams: only support generating 1-lane parameters; saw " << arr_type.lanes(); diff --git a/src/target/source/codegen_params.h b/src/target/source/codegen_params.h index 6df800ed1721..5c8c129006b3 100644 --- a/src/target/source/codegen_params.h +++ b/src/target/source/codegen_params.h @@ -24,7 +24,7 @@ #ifndef TVM_TARGET_SOURCE_CODEGEN_PARAMS_H_ #define TVM_TARGET_SOURCE_CODEGEN_PARAMS_H_ -#include +#include #include #include @@ -36,8 +36,8 @@ namespace codegen { * \brief Write a C representation of arr to os. * * This function generates a comma-separated, indented list of C integer listeals suitable for use - * in an initializer. The NDArray is flattened and then the list is produced element by element. - * For the int16_t NDArray [-3, -2, -1, 0, 1, 2, 3, ...], and indent_chars = 4, the following output + * in an initializer. The Tensor is flattened and then the list is produced element by element. + * For the int16_t Tensor [-3, -2, -1, 0, 1, 2, 3, ...], and indent_chars = 4, the following output * is produced: * -0x0003, -0x0002, -0x0001, +0x0000, +0x0001, +0x0002, +0x0003 * @@ -45,8 +45,8 @@ namespace codegen { * \param indent_chars Number of chars to indent * \param os Output stream where the array data should be written. */ -void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& os, - const std::string& eol = "\n"); +void TensorDataToC(::tvm::runtime::Tensor arr, int indent_chars, std::ostream& os, + const std::string& eol = "\n"); } // namespace codegen } // namespace tvm diff --git a/src/target/source/codegen_source_base.h b/src/target/source/codegen_source_base.h index f077f8c3a83b..97828249ce24 100644 --- a/src/target/source/codegen_source_base.h +++ b/src/target/source/codegen_source_base.h @@ -163,7 +163,7 @@ ffi::Module CSourceModuleCreate(const String& code, const String& fmt, * \param target The target that all the modules are compiled for * \return The wrapped module. 
*/ -ffi::Module CreateMetadataModule(const std::unordered_map& params, +ffi::Module CreateMetadataModule(const std::unordered_map& params, ffi::Module target_module, const Array& ext_modules, Target target); diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index 1350357d866c..6638ed0e05a5 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/te/operation/create_primfunc.cc b/src/te/operation/create_primfunc.cc index 7408eb46eb51..ce9a5846ddf8 100644 --- a/src/te/operation/create_primfunc.cc +++ b/src/te/operation/create_primfunc.cc @@ -750,7 +750,7 @@ PrimFunc GenerateAndCompletePrimFunc(const Array& arg_list, } PrimFunc CreatePrimFuncWithConstants(const Array& arg_list, - const Array& constants, + const Array& constants, std::optional index_dtype_override) { // Information used in CreatePrimFunc and its sub-functions. CreateFuncInfo info(arg_list); @@ -827,7 +827,7 @@ PrimFunc GenerateAndCompletePrimFunc(const Array& arg_tir_var_list, } PrimFunc CreatePrimFuncWithConstants(const Array& arg_list, - const Array& constants, + const Array& constants, std::optional index_dtype_override) { Array tensor_arg_list; for (const ObjectRef& x : arg_list) { diff --git a/src/te/operation/create_primfunc.h b/src/te/operation/create_primfunc.h index eb4a6183dd5c..9e61d87ce332 100644 --- a/src/te/operation/create_primfunc.h +++ b/src/te/operation/create_primfunc.h @@ -39,7 +39,7 @@ PrimFunc CreatePrimFunc(const Array& arg_list, * will be embedded in the body as AllocateConstNode. */ PrimFunc CreatePrimFuncWithConstants(const Array& arg_list, - const Array& constants, + const Array& constants, std::optional index_dtype_override = std::nullopt); /*! \brief Use Tensor Expression to create a schedulable TensorIR func. */ @@ -52,7 +52,7 @@ PrimFunc CreatePrimFunc(const Array& arg_list, * will be embedded in the body as AllocateConstNode. 
*/ PrimFunc CreatePrimFuncWithConstants(const Array& arg_list, - const Array& constants, + const Array& constants, std::optional index_dtype_override); } // namespace tir diff --git a/src/tir/ir/index_map.cc b/src/tir/ir/index_map.cc index 34e7e9c56f9f..5c2541b10b1e 100644 --- a/src/tir/ir/index_map.cc +++ b/src/tir/ir/index_map.cc @@ -255,7 +255,7 @@ Array IndexMapNode::MapShape(const Array& shape, return output; } -runtime::NDArray IndexMapNode::MapNDArray(runtime::NDArray arr_src) const { +runtime::Tensor IndexMapNode::MapTensor(runtime::Tensor arr_src) const { arith::Analyzer analyzer; auto shape = arr_src.Shape(); ICHECK(shape.size() == initial_indices.size()) @@ -305,7 +305,7 @@ runtime::NDArray IndexMapNode::MapNDArray(runtime::NDArray arr_src) const { bytes_dst.begin() + dst_linear_index * elem_bytes); } - auto arr_dst = runtime::NDArray::Empty(dst_shape_int, arr_src->dtype, arr_src->device); + auto arr_dst = runtime::Tensor::Empty(dst_shape_int, arr_src->dtype, arr_src->device); arr_dst.CopyFromBytes(bytes_dst.data(), bytes_dst.size()); return arr_dst; } @@ -443,8 +443,8 @@ TVM_FFI_STATIC_INIT_BLOCK({ arith::Analyzer analyzer; return map.Inverse(initial_ranges, &analyzer); }) - .def("tir.IndexMapMapNDArray", - [](IndexMap map, runtime::NDArray arr) { return map->MapNDArray(arr); }) + .def("tir.IndexMapMapTensor", + [](IndexMap map, runtime::Tensor arr) { return map->MapTensor(arr); }) .def("tir.IndexMapNonSurjectiveInverse", [](IndexMap forward, Array initial_ranges) { arith::Analyzer analyzer; auto result = forward.NonSurjectiveInverse(initial_ranges, &analyzer); diff --git a/src/tir/ir/stmt.cc b/src/tir/ir/stmt.cc index 4b3b4d191510..305dd5ec9af6 100644 --- a/src/tir/ir/stmt.cc +++ b/src/tir/ir/stmt.cc @@ -312,11 +312,11 @@ AllocateConst::AllocateConst(Var buffer_var, DataType dtype, Array ext node->body = std::move(body); node->annotations = annotations; node->span = std::move(span); - if (data_or_idx->IsInstance()) { - node->data = Optional(Downcast(data_or_idx)); + if (data_or_idx->IsInstance()) { + node->data = Optional(Downcast(data_or_idx)); node->irmod_storage_idx = Optional(); } else if (data_or_idx->IsInstance()) { - node->data = Optional(); + node->data = Optional(); node->irmod_storage_idx = Optional(Downcast(data_or_idx)); } else { LOG(FATAL) << "Data type not supported: " << data_or_idx->GetTypeKey(); diff --git a/src/tir/transforms/bind_params.cc b/src/tir/transforms/bind_params.cc index 06d596adb44d..520f6e871200 100644 --- a/src/tir/transforms/bind_params.cc +++ b/src/tir/transforms/bind_params.cc @@ -40,7 +40,7 @@ namespace tir { class ParamsCollector : public StmtExprVisitor { public: - explicit ParamsCollector(const Map& constant_map) + explicit ParamsCollector(const Map& constant_map) : constant_map_(constant_map) {} std::vector CollectParams(tir::Stmt body) { this->VisitStmt(body); @@ -75,11 +75,11 @@ class ParamsCollector : public StmtExprVisitor { private: std::vector constant_list_; - Map constant_map_; + Map constant_map_; }; -PrimFunc BindParams(PrimFunc f, const Array& constants) { - Map constant_map; +PrimFunc BindParams(PrimFunc f, const Array& constants) { + Map constant_map; // Remove constants from the primfunc signature size_t num_constants = constants.size(); @@ -126,7 +126,7 @@ PrimFunc BindParams(PrimFunc f, const Array& constants) { namespace transform { -Pass BindParams(const Array& constants) { +Pass BindParams(const Array& constants) { auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) { return BindParams(f, constants); }; diff 
--git a/src/tir/transforms/extract_constants.cc b/src/tir/transforms/extract_constants.cc index 301c6c13b9f0..51cd08c7a877 100644 --- a/src/tir/transforms/extract_constants.cc +++ b/src/tir/transforms/extract_constants.cc @@ -36,14 +36,14 @@ namespace tvm { namespace tir { -using ConstArrayType = Array; +using ConstArrayType = Array; class Applicator : public tir::StmtMutator { protected: // returns index of the a in constant_array_, if not found - appends - size_t DeDup(const runtime::NDArray& a) { + size_t DeDup(const runtime::Tensor& a) { tvm::StructuralEqual eql; auto it = std::find_if(constant_array_.begin(), constant_array_.end(), - [&eql, a](const runtime::NDArray& v) { return eql(a, v); }); + [&eql, a](const runtime::Tensor& v) { return eql(a, v); }); if (it != constant_array_.end()) { return it - constant_array_.begin(); } diff --git a/src/tir/transforms/ir_utils.h b/src/tir/transforms/ir_utils.h index cc58f96b83fb..b77213bdf10a 100644 --- a/src/tir/transforms/ir_utils.h +++ b/src/tir/transforms/ir_utils.h @@ -322,7 +322,7 @@ std::pair GetAsyncWaitAttributes(const AttrStmtNode* op); * function body. * \return The updated function. */ -PrimFunc BindParams(PrimFunc f, const Array& constants); +PrimFunc BindParams(PrimFunc f, const Array& constants); /*! \brief The quad used by StorageAlign for (buffer_idx, axis, factor, offset) */ using StorageAlignTuple = ffi::Tuple; diff --git a/src/tir/transforms/make_packed_api.cc b/src/tir/transforms/make_packed_api.cc index f557cab91ad8..198b8cfc2e32 100644 --- a/src/tir/transforms/make_packed_api.cc +++ b/src/tir/transforms/make_packed_api.cc @@ -299,16 +299,16 @@ PrimFunc MakePackedAPI(PrimFunc func) { type_index == ffi::TypeIndex::kTVMFFIDLTensorPtr || type_index >= ffi::TypeIndex::kTVMFFIStaticObjectBegin, tvm::tir::StringImm(msg.str()), nop)); - // if type_index is NDArray, we need to add the offset of the DLTensor header + // if type_index is Tensor, we need to add the offset of the DLTensor header // which always equals 16 bytes, this ensures that T.handle always shows up as a DLTensor* const int64_t object_cell_offset = sizeof(TVMFFIObject); static_assert(object_cell_offset == 24); arg_value = f_load_arg_value(param.dtype(), i); - PrimExpr handle_from_ndarray = + PrimExpr handle_from_tensor = Call(DataType::Handle(), tir::builtin::handle_add_byte_offset(), {arg_value, IntImm(DataType::Int(32), object_cell_offset)}); arg_value = - Select(type_index == ffi::TypeIndex::kTVMFFINDArray, handle_from_ndarray, arg_value); + Select(type_index == ffi::TypeIndex::kTVMFFITensor, handle_from_tensor, arg_value); } else if (dtype.is_bool()) { std::ostringstream msg; msg << name_hint << ": Expect arg[" << i << "] to be boolean"; @@ -341,7 +341,7 @@ PrimFunc MakePackedAPI(PrimFunc func) { var_def.emplace_back(arg_value, param); if (func_ptr->buffer_map.count(param)) { // buffer binding now depends on type index - // if the index is NDArray handle, we need to offset to get the DLTensor* + // if the index is Tensor handle, we need to offset to get the DLTensor* buffer_def.emplace_back(param, func_ptr->buffer_map[param]); } } diff --git a/src/tir/transforms/remove_weight_layout_rewrite_block.cc b/src/tir/transforms/remove_weight_layout_rewrite_block.cc index 3c1e12bc3af9..13dac2789b43 100644 --- a/src/tir/transforms/remove_weight_layout_rewrite_block.cc +++ b/src/tir/transforms/remove_weight_layout_rewrite_block.cc @@ -150,11 +150,11 @@ class AllocateConstRewrite : public StmtExprMutator { const BufferVarMap& buffer_var_map, const std::unordered_map& 
buffer_var_to_index_map, const std::unordered_map>& buffer_var_to_rewritten_shape, - bool skip_ndarray_rewrite) + bool skip_tensor_rewrite) : buffer_var_map_(buffer_var_map), buffer_var_to_index_map_(buffer_var_to_index_map), buffer_var_to_rewritten_shape_(buffer_var_to_rewritten_shape), - skip_ndarray_rewrite_(skip_ndarray_rewrite) {} + skip_tensor_rewrite_(skip_tensor_rewrite) {} private: Stmt VisitStmt_(const BlockNode* op) final { @@ -178,13 +178,13 @@ class AllocateConstRewrite : public StmtExprMutator { it != buffer_var_to_index_map_.end()) { ICHECK(buffer_var_to_rewritten_shape_.count(alloc->buffer_var.get())); auto new_body = StmtMutator::VisitStmt(alloc->body); - auto rewritten_ndarray = RewriteNDArray( + auto rewritten_tensor = RewriteTensor( alloc->data.value(), it->second, buffer_var_to_rewritten_shape_[alloc->buffer_var.get()]); Array rewritten_extents; - for (auto s : rewritten_ndarray.Shape()) { + for (auto s : rewritten_tensor.Shape()) { rewritten_extents.push_back(PrimExpr(static_cast(s))); } - return AllocateConst(alloc->buffer_var, alloc->dtype, rewritten_extents, rewritten_ndarray, + return AllocateConst(alloc->buffer_var, alloc->dtype, rewritten_extents, rewritten_tensor, new_body, alloc->annotations, alloc->span); } return StmtMutator::VisitStmt_(alloc); @@ -202,9 +202,9 @@ class AllocateConstRewrite : public StmtExprMutator { return ExprMutator::VisitExpr_(op); } - runtime::NDArray RewriteNDArray(runtime::NDArray src, const IndexMap& index_map, - const Array& dst_shape) { - if (skip_ndarray_rewrite_) { + runtime::Tensor RewriteTensor(runtime::Tensor src, const IndexMap& index_map, + const Array& dst_shape) { + if (skip_tensor_rewrite_) { // Only the shape of the destination array needs to be correct. std::vector dst_shape_int; for (auto s : dst_shape) { @@ -213,7 +213,7 @@ class AllocateConstRewrite : public StmtExprMutator { } return src.CreateView(dst_shape_int, src.DataType()); } else { - return index_map->MapNDArray(src); + return index_map->MapTensor(src); } } @@ -226,8 +226,8 @@ class AllocateConstRewrite : public StmtExprMutator { std::unordered_map> buffer_var_to_rewritten_shape_; /*! \brief Maps load buffer variables to newly created buffers */ std::unordered_map new_load_buf_; - /*! \brief Whether or not to skip rewriting of NDArray contents */ - bool skip_ndarray_rewrite_; + /*! 
\brief Whether or not to skip rewriting of Tensor contents */ + bool skip_tensor_rewrite_; }; class CollectAllocateConstBufferVars : public StmtVisitor { @@ -242,7 +242,7 @@ class CollectAllocateConstBufferVars : public StmtVisitor { class WeightLayoutRewriteBlockRemover : public StmtMutator { public: - static PrimFunc Remove(PrimFunc f, bool skip_ndarray_rewrite) { + static PrimFunc Remove(PrimFunc f, bool skip_tensor_rewrite) { CollectAllocateConstBufferVars collector; collector(f->body); @@ -260,7 +260,7 @@ class WeightLayoutRewriteBlockRemover : public StmtMutator { PrimFuncNode* n = f_.CopyOnWrite(); AllocateConstRewrite rewriter(buffer_var_map, buffer_var_to_index_map, - buffer_var_to_rewritten_shape, skip_ndarray_rewrite); + buffer_var_to_rewritten_shape, skip_tensor_rewrite); n->body = rewriter(std::move(n->body)); Map buffer_map; @@ -279,9 +279,9 @@ class WeightLayoutRewriteBlockRemover : public StmtMutator { namespace transform { -Pass RemoveWeightLayoutRewriteBlock(bool skip_ndarray_rewrite) { - auto pass_func = [skip_ndarray_rewrite](PrimFunc f, IRModule m, PassContext ctx) { - return WeightLayoutRewriteBlockRemover::Remove(std::move(f), skip_ndarray_rewrite); +Pass RemoveWeightLayoutRewriteBlock(bool skip_tensor_rewrite) { + auto pass_func = [skip_tensor_rewrite](PrimFunc f, IRModule m, PassContext ctx) { + return WeightLayoutRewriteBlockRemover::Remove(std::move(f), skip_tensor_rewrite); }; return CreatePrimFuncPass(pass_func, 0, "tir.RemoveWeightLayoutRewriteBlock", {}); } diff --git a/src/topi/transform.cc b/src/topi/transform.cc index 433a641ad068..2324e845b934 100644 --- a/src/topi/transform.cc +++ b/src/topi/transform.cc @@ -53,8 +53,8 @@ TVM_FFI_STATIC_INIT_BLOCK({ .def_packed("topi.flip", [](ffi::PackedArgs args, ffi::Any* rv) { // pass empty seq_lengths tensor to reverse_sequence - *rv = - reverse_sequence(args[0].cast(), Tensor(), args[1].cast()); + *rv = reverse_sequence(args[0].cast(), te::Tensor(), + args[1].cast()); }) .def_packed("topi.reverse_sequence", [](ffi::PackedArgs args, ffi::Any* rv) { @@ -87,9 +87,9 @@ TVM_FFI_STATIC_INIT_BLOCK({ [](ffi::PackedArgs args, ffi::Any* rv) { *rv = shape(args[0].cast(), args[1].cast()); }) - .def_packed("topi.ndarray_size", + .def_packed("topi.tensor_size", [](ffi::PackedArgs args, ffi::Any* rv) { - *rv = ndarray_size(args[0].cast(), args[1].cast()); + *rv = tensor_size(args[0].cast(), args[1].cast()); }) .def_packed("topi.split", [](ffi::PackedArgs args, ffi::Any* rv) { @@ -210,7 +210,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ .def_packed( "topi.strided_slice", [](ffi::PackedArgs args, ffi::Any* rv) { - Tensor x = args[0].cast(); + te::Tensor x = args[0].cast(); Array begin = args[1].cast>(); Array end = args[2].cast>(); Array strides = args[3].cast>(); diff --git a/tests/cpp-runtime/opencl/opencl_nativeptr.cc b/tests/cpp-runtime/opencl/opencl_nativeptr.cc index 260effadea0b..1694de418b5c 100644 --- a/tests/cpp-runtime/opencl/opencl_nativeptr.cc +++ b/tests/cpp-runtime/opencl/opencl_nativeptr.cc @@ -32,7 +32,7 @@ using namespace tvm::runtime::cl; TEST(OpenCLNativePtr, access_memory) { OpenCLWorkspace* workspace = OpenCLWorkspace::Global(); - auto A = tvm::runtime::NDArray::Empty({128, 128}, {kDLFloat, 32, 1}, {kDLOpenCL, 0}); + auto A = tvm::runtime::Tensor::Empty({128, 128}, {kDLFloat, 32, 1}, {kDLOpenCL, 0}); void* nptr = workspace->GetNativePtr(A); memset(nptr, 0x0, 128 * 128 * 4); } @@ -40,8 +40,8 @@ TEST(OpenCLNativePtr, access_memory) { TEST(OpenCLNatvePtr, data_loop) { OpenCLWorkspace* workspace = OpenCLWorkspace::Global(); - 
auto cl_arr = tvm::runtime::NDArray::Empty({1024}, {kDLFloat, 32, 1}, {kDLOpenCL, 0}); - auto cpu_arr = tvm::runtime::NDArray::Empty({1024}, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto cl_arr = tvm::runtime::Tensor::Empty({1024}, {kDLFloat, 32, 1}, {kDLOpenCL, 0}); + auto cpu_arr = tvm::runtime::Tensor::Empty({1024}, {kDLFloat, 32, 1}, {kDLCPU, 0}); std::random_device rdev; std::mt19937 mt(rdev()); diff --git a/tests/cpp-runtime/opencl/texture_copy_test.cc b/tests/cpp-runtime/opencl/texture_copy_test.cc index 61d9044b6d86..c9ee44515d1f 100644 --- a/tests/cpp-runtime/opencl/texture_copy_test.cc +++ b/tests/cpp-runtime/opencl/texture_copy_test.cc @@ -61,10 +61,10 @@ TEST(TextureCopy, HostDeviceRT) { (void)tvm::runtime::memory::MemoryManager::GetOrCreateAllocator( thr->device, tvm::runtime::memory::AllocatorType::kPooled); std::vector shape{16, 16, 4}; - auto cpu_arr0 = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); - auto cpu_arr1 = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto cpu_arr0 = runtime::Tensor::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto cpu_arr1 = runtime::Tensor::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); String mem_scope = "global.texture"; - auto opencl_txarr0 = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLOpenCL, 0}, mem_scope); + auto opencl_txarr0 = runtime::Tensor::Empty(shape, {kDLFloat, 32, 1}, {kDLOpenCL, 0}, mem_scope); size_t size = 1; for (size_t i = 0; i < shape.size(); ++i) { @@ -94,8 +94,8 @@ TEST_F(TextureCopyTest, ViewBufferAsBuffer) { using namespace tvm; std::vector shape{1, 16, 16, 8}; std::vector same_shape{1, 8, 16, 16}; - auto cpu_arr = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); - auto cpu_arr_ret = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto cpu_arr = runtime::Tensor::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto cpu_arr_ret = runtime::Tensor::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); String mem_scope = "global"; @@ -104,9 +104,9 @@ TEST_F(TextureCopyTest, ViewBufferAsBuffer) { auto buffer = allocator->Alloc(cl_dev, ffi::Shape(shape), {kDLFloat, 32, 1}); auto stor = Storage(buffer, allocator); - auto opencl_memobj = stor->AllocNDArrayScoped(0, ffi::Shape(shape), {kDLFloat, 32, 1}, mem_scope); + auto opencl_memobj = stor->AllocTensorScoped(0, ffi::Shape(shape), {kDLFloat, 32, 1}, mem_scope); auto opencl_memview = - stor->AllocNDArrayScoped(0, ffi::Shape(same_shape), {kDLFloat, 32, 1}, mem_scope); + stor->AllocTensorScoped(0, ffi::Shape(same_shape), {kDLFloat, 32, 1}, mem_scope); std::random_device dev; std::mt19937 mt(dev()); @@ -153,17 +153,17 @@ TEST_F(TextureCopyTest, ViewBufferAsImage) { // Shape that doesn't cause padding for image row std::vector shape{1, 16, 16, 8, 4}; std::vector same_shape{1, 8, 16, 16, 4}; - auto cpu_arr = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); - auto cpu_arr_ret = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto cpu_arr = runtime::Tensor::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto cpu_arr_ret = runtime::Tensor::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); DLDevice cl_dev = {kDLOpenCL, 0}; auto allocator = MemoryManager::GetOrCreateAllocator(cl_dev, AllocatorType::kPooled); auto buffer = allocator->Alloc(cl_dev, ffi::Shape(shape), {kDLFloat, 32, 1}); auto stor = Storage(buffer, allocator); - auto opencl_buf_obj = stor->AllocNDArrayScoped(0, ffi::Shape(shape), {kDLFloat, 32, 1}, "global"); + auto opencl_buf_obj = stor->AllocTensorScoped(0, ffi::Shape(shape), 
{kDLFloat, 32, 1}, "global"); auto opencl_img_obj = - stor->AllocNDArrayScoped(0, ffi::Shape(same_shape), {kDLFloat, 32, 1}, "global.texture"); + stor->AllocTensorScoped(0, ffi::Shape(same_shape), {kDLFloat, 32, 1}, "global.texture"); std::random_device dev; std::mt19937 mt(dev()); @@ -210,8 +210,8 @@ TEST_F(TextureCopyTest, ViewImageAsBuffer) { // Shape that doesn't cause padding for image row std::vector shape{1, 16, 16, 8, 4}; std::vector same_shape{1, 8, 16, 16, 4}; - auto cpu_arr = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); - auto cpu_arr_ret = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto cpu_arr = runtime::Tensor::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto cpu_arr_ret = runtime::Tensor::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); DLDevice cl_dev = {kDLOpenCL, 0}; auto allocator = MemoryManager::GetOrCreateAllocator(cl_dev, AllocatorType::kPooled); @@ -219,9 +219,9 @@ TEST_F(TextureCopyTest, ViewImageAsBuffer) { auto stor = Storage(buffer, allocator); auto opencl_img_obj = - stor->AllocNDArrayScoped(0, ffi::Shape(shape), {kDLFloat, 32, 1}, "global.texture"); + stor->AllocTensorScoped(0, ffi::Shape(shape), {kDLFloat, 32, 1}, "global.texture"); auto opencl_buf_obj = - stor->AllocNDArrayScoped(0, ffi::Shape(same_shape), {kDLFloat, 32, 1}, "global"); + stor->AllocTensorScoped(0, ffi::Shape(same_shape), {kDLFloat, 32, 1}, "global"); std::random_device dev; std::mt19937 mt(dev()); @@ -268,8 +268,8 @@ TEST_F(TextureCopyTest, ViewImageAsImage) { // Shape that doesn't cause padding for image row std::vector shape{1, 16, 16, 8, 4}; std::vector same_shape{1, 8, 16, 16, 4}; - auto cpu_arr = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); - auto cpu_arr_ret = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto cpu_arr = runtime::Tensor::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto cpu_arr_ret = runtime::Tensor::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); DLDevice cl_dev = {kDLOpenCL, 0}; auto allocator = MemoryManager::GetOrCreateAllocator(cl_dev, AllocatorType::kPooled); @@ -277,9 +277,9 @@ TEST_F(TextureCopyTest, ViewImageAsImage) { auto stor = Storage(buffer, allocator); auto opencl_img_obj_1 = - stor->AllocNDArrayScoped(0, ffi::Shape(shape), {kDLFloat, 32, 1}, "global.texture"); + stor->AllocTensorScoped(0, ffi::Shape(shape), {kDLFloat, 32, 1}, "global.texture"); auto opencl_img_obj_2 = - stor->AllocNDArrayScoped(0, ffi::Shape(same_shape), {kDLFloat, 32, 1}, "global.texture"); + stor->AllocTensorScoped(0, ffi::Shape(same_shape), {kDLFloat, 32, 1}, "global.texture"); std::random_device dev; std::mt19937 mt(dev()); diff --git a/tests/cpp/ndarray_test.cc b/tests/cpp/ndarray_test.cc index 57ad3ba90b40..c2452f9146b1 100644 --- a/tests/cpp/ndarray_test.cc +++ b/tests/cpp/ndarray_test.cc @@ -19,12 +19,12 @@ #include #include -#include +#include using namespace tvm; -TEST(NDArrayTest, IsContiguous_ContiguousStride) { - auto array = runtime::NDArray::Empty({5, 10}, DataType::Float(32), {kDLCPU}); +TEST(TensorTest, IsContiguous_ContiguousStride) { + auto array = runtime::Tensor::Empty({5, 10}, DataType::Float(32), {kDLCPU}); DLManagedTensor* managed_tensor = array.ToDLPack(); int64_t strides[] = {10, 1}; @@ -35,8 +35,8 @@ TEST(NDArrayTest, IsContiguous_ContiguousStride) { managed_tensor->deleter(managed_tensor); } -TEST(NDArrayTest, IsContiguous_NullStride) { - auto array = runtime::NDArray::Empty({5, 10}, DataType::Float(32), {kDLCPU}); +TEST(TensorTest, IsContiguous_NullStride) { + auto array = 
runtime::Tensor::Empty({5, 10}, DataType::Float(32), {kDLCPU}); DLManagedTensor* managed_tensor = array.ToDLPack(); managed_tensor->dl_tensor.strides = nullptr; @@ -46,8 +46,8 @@ TEST(NDArrayTest, IsContiguous_NullStride) { managed_tensor->deleter(managed_tensor); } -TEST(NDArrayTest, IsContiguous_AnyStrideForSingular) { - auto array = runtime::NDArray::Empty({5, 1, 10}, DataType::Float(32), {kDLCPU}); +TEST(TensorTest, IsContiguous_AnyStrideForSingular) { + auto array = runtime::Tensor::Empty({5, 1, 10}, DataType::Float(32), {kDLCPU}); DLManagedTensor* managed_tensor = array.ToDLPack(); int64_t strides[] = {10, 1, 1}; // strides[1] is normalized to 1 because shape[1] == 1. @@ -59,8 +59,8 @@ TEST(NDArrayTest, IsContiguous_AnyStrideForSingular) { managed_tensor->deleter(managed_tensor); } -TEST(NDArrayTest, IsContiguous_UncontiguousStride) { - auto array = runtime::NDArray::Empty({5, 1, 10}, DataType::Float(32), {kDLCPU}); +TEST(TensorTest, IsContiguous_UncontiguousStride) { + auto array = runtime::Tensor::Empty({5, 1, 10}, DataType::Float(32), {kDLCPU}); DLManagedTensor* managed_tensor = array.ToDLPack(); int64_t strides[] = {1, 1, 1}; diff --git a/tests/cpp/support/scalars_test.cc b/tests/cpp/support/scalars_test.cc index 52bd2dc148c8..12a5145f2145 100644 --- a/tests/cpp/support/scalars_test.cc +++ b/tests/cpp/support/scalars_test.cc @@ -28,17 +28,17 @@ namespace { // Note that functional testing is via test_ir_parser.py and test_ir_text_printer.py. // Here we just check handling which is difficult to test via the standard Python API. -TEST(Scalars, IntImmToNDArray_Unsupported) { - ASSERT_THROW(IntImmToNDArray(IntImm(DataType::Int(15), 42)), runtime::InternalError); +TEST(Scalars, IntImmToTensor_Unsupported) { + ASSERT_THROW(IntImmToTensor(IntImm(DataType::Int(15), 42)), runtime::InternalError); } -TEST(Scalars, FloatImmtoNDArray_Unsupported) { - ASSERT_THROW(FloatImmToNDArray(FloatImm(DataType::Float(15), 42.0)), runtime::InternalError); +TEST(Scalars, FloatImmtoTensor_Unsupported) { + ASSERT_THROW(FloatImmToTensor(FloatImm(DataType::Float(15), 42.0)), runtime::InternalError); } -TEST(Scalars, NDArrayScalarToString_Unsupported) { - auto ndarray = runtime::NDArray::Empty({}, DataType::Int(8), {DLDeviceType::kDLCPU, 0}); - ASSERT_THROW(NDArrayScalarToString(ndarray), runtime::InternalError); +TEST(Scalars, TensorScalarToString_Unsupported) { + auto ndarray = runtime::Tensor::Empty({}, DataType::Int(8), {DLDeviceType::kDLCPU, 0}); + ASSERT_THROW(TensorScalarToString(ndarray), runtime::InternalError); } TEST(Scalars, IntImmToString_Unsupported) { diff --git a/tests/python/all-platform-minimal-test/test_minimal_target_codegen_llvm.py b/tests/python/all-platform-minimal-test/test_minimal_target_codegen_llvm.py index 4767c24b693a..a9dbf74269e7 100644 --- a/tests/python/all-platform-minimal-test/test_minimal_target_codegen_llvm.py +++ b/tests/python/all-platform-minimal-test/test_minimal_target_codegen_llvm.py @@ -50,9 +50,9 @@ def check_llvm(): dev = tvm.cpu(0) # launch the kernel. 
n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.random.uniform(size=n).astype(B.dtype), dev) + c = tvm.runtime.tensor(np.zeros(n, dtype=C.dtype), dev) f(a, b, c) tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) diff --git a/tests/python/all-platform-minimal-test/test_runtime_ndarray.py b/tests/python/all-platform-minimal-test/test_runtime_ndarray.py index 29867c3ed8ee..7e00ba64fac4 100644 --- a/tests/python/all-platform-minimal-test/test_runtime_ndarray.py +++ b/tests/python/all-platform-minimal-test/test_runtime_ndarray.py @@ -31,11 +31,11 @@ def test_nd_create(target, dev, dtype): x = np.random.randint(0, 10, size=(3, 4)) x = np.array(x, dtype=dtype) - y = tvm.nd.array(x, device=dev) + y = tvm.runtime.tensor(x, device=dev) z = y.copyto(dev) assert y.dtype == x.dtype assert y.shape == x.shape - assert isinstance(y, tvm.nd.NDArray) + assert isinstance(y, tvm.runtime.Tensor) np.testing.assert_equal(x, y.numpy()) np.testing.assert_equal(x, z.numpy()) @@ -48,7 +48,7 @@ def test_memory_usage(target, dev, dtype): if available_memory_before is None: pytest.skip(reason=f"Target '{target}' does not support queries of available memory") - arr = tvm.nd.empty([1024, 1024], dtype=dtype, device=dev) + arr = tvm.runtime.empty([1024, 1024], dtype=dtype, device=dev) available_memory_after = dev.available_global_memory num_elements = math.prod(arr.shape) @@ -61,8 +61,8 @@ def test_memory_usage(target, dev, dtype): # available memory may decrease by more than the requested amount. assert available_memory_after <= expected_memory_after - # TVM's NDArray type is a reference-counted handle to the - # underlying reference. After the last reference to an NDArray is + # TVM's Tensor type is a reference-counted handle to the + # underlying reference. After the last reference to a Tensor is # cleared, the backing allocation will be freed.
del arr diff --git a/tests/python/all-platform-minimal-test/test_runtime_packed_func.py b/tests/python/all-platform-minimal-test/test_runtime_packed_func.py index f315b8f3c210..404ca5d1d94d 100644 --- a/tests/python/all-platform-minimal-test/test_runtime_packed_func.py +++ b/tests/python/all-platform-minimal-test/test_runtime_packed_func.py @@ -121,13 +121,13 @@ def test_numpy_scalar(): assert tvm.testing.echo(np.int64(maxint)) == maxint -def test_ndarray_args(): +def test_tensor_args(): def check(arr): assert not arr.is_view assert tvm.testing.object_use_count(arr) == 2 fcheck = tvm.runtime.convert(check) - x = tvm.nd.array([1, 2, 3]) + x = tvm.runtime.tensor([1, 2, 3]) fcheck(x) assert tvm.testing.object_use_count(x) == 1 @@ -145,7 +145,7 @@ def test_dict_function_value_type(): if __name__ == "__main__": - test_ndarray_args() + test_tensor_args() test_numpy_scalar() test_rvalue_ref() test_empty_array() diff --git a/tests/python/codegen/test_gpu_codegen_allreduce.py b/tests/python/codegen/test_gpu_codegen_allreduce.py index 5e8c3a05db52..fe6a9179f41c 100644 --- a/tests/python/codegen/test_gpu_codegen_allreduce.py +++ b/tests/python/codegen/test_gpu_codegen_allreduce.py @@ -76,8 +76,8 @@ def test_allreduce_sum(dims, target, dev): # prepare input and output array a_np = np.random.rand(1, d1, d2, d3).astype("float32") b_np = a_np.sum(axis=-1).astype("float32") - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(np.zeros_like(b_np), dev) + a = tvm.runtime.tensor(a_np, dev) + b = tvm.runtime.tensor(np.zeros_like(b_np), dev) # launch kernel f(a, b) @@ -143,8 +143,8 @@ def test_allreduce_max(dims, target, dev): # prepare input and output array a_np = -np.random.rand(1, d1, d2, d3).astype("float32") b_np = a_np.max(axis=-1).astype("float32") - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(np.zeros_like(b_np), dev) + a = tvm.runtime.tensor(a_np, dev) + b = tvm.runtime.tensor(np.zeros_like(b_np), dev) # launch kernel f(a, b) diff --git a/tests/python/codegen/test_inject_ptx_ldg32.py b/tests/python/codegen/test_inject_ptx_ldg32.py index a45e8f57f38f..fd2f598c924e 100644 --- a/tests/python/codegen/test_inject_ptx_ldg32.py +++ b/tests/python/codegen/test_inject_ptx_ldg32.py @@ -50,8 +50,8 @@ def test_inject_ptx_intrin(): A_np = np.random.rand(16).astype("float32") B_np = np.zeros((32)).astype("float32") dev = tvm.cuda(0) - A_nd = tvm.nd.array(A_np, device=dev) - B_nd = tvm.nd.array(B_np, device=dev) + A_nd = tvm.runtime.tensor(A_np, device=dev) + B_nd = tvm.runtime.tensor(B_np, device=dev) mod(A_nd, B_nd) C_np = np.zeros((32)).astype("float32") diff --git a/tests/python/codegen/test_target_codegen_blob.py b/tests/python/codegen/test_target_codegen_blob.py index 39373c4d840c..d57297ee6e22 100644 --- a/tests/python/codegen/test_target_codegen_blob.py +++ b/tests/python/codegen/test_target_codegen_blob.py @@ -77,8 +77,8 @@ def popen_check(): # Load the system wide library dev = tvm.cuda() a_np = np.random.uniform(size=12).astype("float32") - a_nd = tvm.nd.array(a_np, dev) - b_nd = tvm.nd.array(a_np, dev) + a_nd = tvm.runtime.tensor(a_np, dev) + b_nd = tvm.runtime.tensor(a_np, dev) syslibA = tvm.runtime.system_lib("modA_") syslibB = tvm.runtime.system_lib("modB_") # reload same lib twice diff --git a/tests/python/codegen/test_target_codegen_bool.py b/tests/python/codegen/test_target_codegen_bool.py index 96bd21329c93..d4524ac1d5fe 100644 --- a/tests/python/codegen/test_target_codegen_bool.py +++ b/tests/python/codegen/test_target_codegen_bool.py @@ -56,9 +56,9 @@ def test_cmp_load_store(target, dev, 
arr_size, compute, get_module): a_np = np.random.uniform(size=arr_size).astype(A.dtype) b_np = np.random.uniform(size=arr_size).astype(B.dtype) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(b_np, dev) - d = tvm.nd.array(np.zeros(arr_size, dtype=D.dtype), dev) + a = tvm.runtime.tensor(a_np, dev) + b = tvm.runtime.tensor(b_np, dev) + d = tvm.runtime.tensor(np.zeros(arr_size, dtype=D.dtype), dev) f(a, b, d) np.testing.assert_equal( d.numpy(), diff --git a/tests/python/codegen/test_target_codegen_c_host.py b/tests/python/codegen/test_target_codegen_c_host.py index 8f3798861f46..e95108aeac17 100644 --- a/tests/python/codegen/test_target_codegen_c_host.py +++ b/tests/python/codegen/test_target_codegen_c_host.py @@ -47,9 +47,9 @@ def check_c(): dev = tvm.cpu(0) # launch the kernel. n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.random.uniform(size=n).astype(B.dtype), dev) + c = tvm.runtime.tensor(np.zeros(n, dtype=C.dtype), dev) fadd(a, b, c) tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) @@ -78,8 +78,8 @@ def check_c(): fadd = m["test_reinterpret"] dev = tvm.cpu(0) n = nn - a = tvm.nd.array(np.random.randint(-(2**30), 2**30, size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) + a = tvm.runtime.tensor(np.random.randint(-(2**30), 2**30, size=n).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.zeros(n, dtype=B.dtype), dev) fadd(a, b) tvm.testing.assert_allclose(b.numpy(), (2 + a.numpy()).view("float32")) @@ -106,8 +106,8 @@ def check_c(): fceil = m["test_ceil"] dev = tvm.cpu(0) n = nn - a = tvm.nd.array(np.random.rand(n).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) + a = tvm.runtime.tensor(np.random.rand(n).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.zeros(n, dtype=B.dtype), dev) fceil(a, b) tvm.testing.assert_allclose(b.numpy(), (np.ceil(a.numpy()).view("float32"))) @@ -134,8 +134,8 @@ def check_c(): ffloor = m["test_floor"] dev = tvm.cpu(0) n = nn - a = tvm.nd.array(np.random.rand(n).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) + a = tvm.runtime.tensor(np.random.rand(n).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.zeros(n, dtype=B.dtype), dev) ffloor(a, b) tvm.testing.assert_allclose(b.numpy(), (np.floor(a.numpy()).view("float32"))) @@ -162,8 +162,8 @@ def check_c(): fround = m["test_round"] dev = tvm.cpu(0) n = nn - a = tvm.nd.array(np.random.rand(n).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) + a = tvm.runtime.tensor(np.random.rand(n).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.zeros(n, dtype=B.dtype), dev) fround(a, b) tvm.testing.assert_allclose(b.numpy(), (np.round(a.numpy()).view("float32"))) diff --git a/tests/python/codegen/test_target_codegen_cross_llvm.py b/tests/python/codegen/test_target_codegen_cross_llvm.py index 9ae516c7de30..3cb8c3037254 100644 --- a/tests/python/codegen/test_target_codegen_cross_llvm.py +++ b/tests/python/codegen/test_target_codegen_cross_llvm.py @@ -81,9 +81,9 @@ def build_arm(): farm = remote.load_module("myadd.o") dev = remote.cpu(0) n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) + a = 
tvm.runtime.tensor(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.random.uniform(size=n).astype(A.dtype), dev) + c = tvm.runtime.tensor(np.zeros(n, dtype=C.dtype), dev) farm(a, b, c) tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) print("Verification finish on remote..") diff --git a/tests/python/codegen/test_target_codegen_cuda.py b/tests/python/codegen/test_target_codegen_cuda.py index fb9c47410fea..db49f56045ad 100644 --- a/tests/python/codegen/test_target_codegen_cuda.py +++ b/tests/python/codegen/test_target_codegen_cuda.py @@ -49,8 +49,8 @@ def check_cuda(dtype, n, lanes): fun = tvm.compile(sch.mod, target="cuda") dev = tvm.cuda(0) - a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np.random.uniform(size=(n, lanes))) - c = tvm.nd.empty((n,), B.dtype, dev) + a = tvm.runtime.empty((n,), A.dtype, dev).copyfrom(np.random.uniform(size=(n, lanes))) + c = tvm.runtime.empty((n,), B.dtype, dev) fun(a, c) tvm.testing.assert_allclose(c.numpy(), a.numpy() + 1) @@ -105,10 +105,10 @@ def check_cuda(n, lanes): dev = tvm.cuda(0) np_a = np.random.uniform(size=(n, lanes)).astype("float32") np_a = np_bf162np_float(np_float2np_bf16(np_a)) - a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np_float2np_bf16(np_a)) - c = tvm.nd.empty((n,), B.dtype, dev) + a = tvm.runtime.empty((n,), A.dtype, dev).copyfrom(np_float2np_bf16(np_a)) + c = tvm.runtime.empty((n,), B.dtype, dev) fun(a, c) - c = tvm.nd.empty((n, lanes), "uint16", dev).copyfrom(c) + c = tvm.runtime.empty((n, lanes), "uint16", dev).copyfrom(c) tvm.testing.assert_allclose(c.numpy(), np_float2np_bf16(np_a + 1)) check_cuda(64, 2) @@ -143,10 +143,10 @@ def check_cuda(dtype, n, lanes): np_c = np.random.randint(low=0, high=127, size=(n,)) np_d = [sum(x * y) + z for x, y, z in zip(np_a, np_b, np_c)] dev = tvm.cuda(0) - a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np_a) - b = tvm.nd.empty((n,), B.dtype, dev).copyfrom(np_b) - c = tvm.nd.empty((n,), C.dtype, dev).copyfrom(np_c) - d = tvm.nd.empty((n,), D.dtype, dev) + a = tvm.runtime.empty((n,), A.dtype, dev).copyfrom(np_a) + b = tvm.runtime.empty((n,), B.dtype, dev).copyfrom(np_b) + c = tvm.runtime.empty((n,), C.dtype, dev).copyfrom(np_c) + d = tvm.runtime.empty((n,), D.dtype, dev) fun(a, b, c, d) tvm.testing.assert_allclose(d.numpy(), np_d) @@ -170,8 +170,8 @@ def check_cuda(dtype, n, lanes): fun = tvm.compile(sch.mod, target="cuda") np_a = np.random.randint(low=-128, high=127, size=(n, lanes)) - a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np_a) - b = tvm.nd.empty((n,), B.dtype, dev) + a = tvm.runtime.empty((n,), A.dtype, dev).copyfrom(np_a) + b = tvm.runtime.empty((n,), B.dtype, dev) fun(a, b) tvm.testing.assert_allclose(a.numpy(), b.numpy()) @@ -197,7 +197,7 @@ def check_cuda(n, value, lanes): fun = tvm.compile(sch.mod, target="cuda") np_a = np.full((n, lanes), value, dtype=dtype) - a = tvm.nd.empty(np_a.shape, dtype, dev) + a = tvm.runtime.empty(np_a.shape, dtype, dev) fun(a) np.testing.assert_equal(a.numpy(), np_a) @@ -228,8 +228,8 @@ def check_inf_nan(dev, n, value, dtype): sch.bind(xi, "threadIdx.x") fun = tvm.compile(sch.mod, target="cuda") - a = tvm.nd.empty((n,), A.dtype, dev) - c = tvm.nd.empty((n,), A.dtype, dev) + a = tvm.runtime.empty((n,), A.dtype, dev) + c = tvm.runtime.empty((n,), A.dtype, dev) # Only need to test compiling here fun(a, c) @@ -267,8 +267,8 @@ def verify(nthd): vals = [nthd - 1, nthd, nthd + 1] for kk in [x for x in vals]: size = (nn, kk) - a = tvm.nd.array(np.random.uniform(size=size).astype(A.dtype), dev) - b = 
tvm.nd.array(np.zeros(nn, dtype=B.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(size=size).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.zeros(nn, dtype=B.dtype), dev) func(a, b) tvm.testing.assert_allclose(b.numpy(), np.sum(a.numpy(), axis=1), rtol=1e-3) @@ -306,8 +306,8 @@ def verify(nthdx, nthdy): vy = [nthdy - 1, nthdy, nthdy + 1] for kk0, kk1 in [(x, y) for x in vx for y in vy]: size = (nn, kk0, kk1) - a = tvm.nd.array(np.random.uniform(size=size).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(size=size).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.zeros(nn, dtype=B.dtype), dev) func(a, b) tvm.testing.assert_allclose(b.numpy(), np.sum(a.numpy(), axis=(1, 2)), rtol=1e-3) @@ -352,8 +352,8 @@ def test_cuda_const_float_to_half(): dev = tvm.cuda(0) a_np = np.random.uniform(size=shape).astype(a.dtype) c_np = np.zeros(shape=shape, dtype=c.dtype) - a = tvm.nd.array(a_np, dev) - c = tvm.nd.array(c_np, dev) + a = tvm.runtime.tensor(a_np, dev) + c = tvm.runtime.tensor(c_np, dev) func(a, c) np.testing.assert_equal(c.numpy(), a_np > b.value) @@ -379,8 +379,8 @@ def test_cuda_floordiv_with_vectorization(): dev = tvm.cuda(0) a_np = np.random.uniform(size=(n,)).astype(A.dtype) b_np = np.array([a_np[i // k] for i in range(0, n)]) - a_nd = tvm.nd.array(a_np, dev) - b_nd = tvm.nd.array(np.zeros(b_np.shape, dtype=b_np.dtype), dev) + a_nd = tvm.runtime.tensor(a_np, dev) + b_nd = tvm.runtime.tensor(np.zeros(b_np.shape, dtype=b_np.dtype), dev) func(a_nd, b_nd) tvm.testing.assert_allclose(b_nd.numpy(), b_np, rtol=1e-3) @@ -405,8 +405,8 @@ def test_cuda_floormod_with_vectorization(): dev = tvm.cuda(0) a_np = np.random.uniform(size=(n,)).astype(A.dtype) b_np = np.array([a_np[i % k] for i in range(0, n)]) - a_nd = tvm.nd.array(a_np, dev) - b_nd = tvm.nd.array(np.zeros(b_np.shape, dtype=b_np.dtype), dev) + a_nd = tvm.runtime.tensor(a_np, dev) + b_nd = tvm.runtime.tensor(np.zeros(b_np.shape, dtype=b_np.dtype), dev) func(a_nd, b_nd) tvm.testing.assert_allclose(b_nd.numpy(), b_np, rtol=1e-3) @@ -438,9 +438,9 @@ def check(t0, t1, factor): a_np = np.random.randint(low, high, size=n).astype(A.dtype) b_np = np.random.randint(low, high, size=n).astype(B.dtype) c_np = (a_np + b_np).astype(A.dtype) - a_nd = tvm.nd.array(a_np, dev) - b_nd = tvm.nd.array(b_np, dev) - c_nd = tvm.nd.array(np.zeros(c_np.shape, dtype=c_np.dtype), dev) + a_nd = tvm.runtime.tensor(a_np, dev) + b_nd = tvm.runtime.tensor(b_np, dev) + c_nd = tvm.runtime.tensor(np.zeros(c_np.shape, dtype=c_np.dtype), dev) func(a_nd, b_nd, c_nd) tvm.testing.assert_allclose(c_nd.numpy(), c_np, rtol=1e-3) @@ -535,8 +535,8 @@ def run_test(tvm_intrin, np_func, dtype): B = te.compute((n,), lambda *i: tvm_intrin(A(*i)), name="B") f = sched(A, B) dev = tvm.cuda(0) - a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(shape=(n,)).astype(A.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(0, 1, size=n).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.zeros(shape=(n,)).astype(A.dtype), dev) f(a, b) tvm.testing.assert_allclose(b.numpy(), np_func(a.numpy()), atol=1e-3, rtol=1e-3) @@ -560,8 +560,8 @@ def run_test(tvm_intrin, np_func): B = te.compute((n,), lambda i: tvm_intrin(A[i], c2), name="B") f = sched(A, B) dev = tvm.cuda(0) - a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(shape=(n,)).astype(A.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(0, 1, size=n).astype(A.dtype), dev) 
+ b = tvm.runtime.tensor(np.zeros(shape=(n,)).astype(A.dtype), dev) f(a, b) tvm.testing.assert_allclose(b.numpy(), np_func(a.numpy()), atol=1e-3, rtol=1e-3) @@ -585,8 +585,8 @@ def run_test(dtype): B = te.compute((n,), lambda i: tvm.tir.popcount(A[i]), name="B") f = sched(A, B) dev = tvm.cuda(0) - a = tvm.nd.array(np.random.randint(0, 100000, size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(shape=(n,)).astype(B.dtype), dev) + a = tvm.runtime.tensor(np.random.randint(0, 100000, size=n).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.zeros(shape=(n,)).astype(B.dtype), dev) f(a, b) ref = np.vectorize(ref_popcount)(a.numpy()) tvm.testing.assert_allclose(b.numpy(), ref) @@ -623,8 +623,8 @@ def check_cuda(dtype, n, l, padding, lanes): fun = tvm.compile(sch.mod, target="cuda") np_a = np.random.randint(low=-128, high=127, size=(n, l)).astype(A.dtype) - a = tvm.nd.empty((n, l), A.dtype, dev).copyfrom(np_a) - b = tvm.nd.empty((n // lanes, l + padding * 2, lanes), B.dtype, dev) + a = tvm.runtime.empty((n, l), A.dtype, dev).copyfrom(np_a) + b = tvm.runtime.empty((n // lanes, l + padding * 2, lanes), B.dtype, dev) fun(a, b) np_a_reshape = np_a.reshape(n // lanes, lanes, l).transpose(0, 2, 1) ref = np.pad( @@ -666,8 +666,8 @@ def build(A, C, N, C_N): kernel_source = f.imports[0].inspect_source() dev = tvm.cuda() a_data = np.arange(0, N).astype(A.dtype) - a = tvm.nd.array(a_data, dev) - c = tvm.nd.array(np.zeros(C_N, dtype=C.dtype), dev) + a = tvm.runtime.tensor(a_data, dev) + c = tvm.runtime.tensor(np.zeros(C_N, dtype=C.dtype), dev) f(a, c) return a_data, c.numpy(), kernel_source @@ -834,9 +834,9 @@ def main( dev = tvm.cuda(0) a_np = np.random.randint(0, 10, (128, 128), dtype="int32") b_np = np.random.randint(0, 10, (128, 128), dtype="int32") - a_tvm = tvm.nd.array(a_np, device=dev) - b_tvm = tvm.nd.array(b_np, device=dev) - c_tvm = tvm.nd.empty((128, 128), dtype="int32", device=dev) + a_tvm = tvm.runtime.tensor(a_np, device=dev) + b_tvm = tvm.runtime.tensor(b_np, device=dev) + c_tvm = tvm.runtime.empty((128, 128), dtype="int32", device=dev) lib["main"](a_tvm, b_tvm, c_tvm) tvm.testing.assert_allclose(c_tvm.numpy(), a_np + b_np) diff --git a/tests/python/codegen/test_target_codegen_cuda_fp4.py b/tests/python/codegen/test_target_codegen_cuda_fp4.py index 364f9461c2f9..a578dc14a595 100644 --- a/tests/python/codegen/test_target_codegen_cuda_fp4.py +++ b/tests/python/codegen/test_target_codegen_cuda_fp4.py @@ -76,12 +76,12 @@ def add( np_shape = (vector_length, lanes) if lanes > 1 else (vector_length,) a_np = np.random.uniform(low=0, high=5, size=np_shape).astype(numpytype) - a = tvm.nd.empty(shape=(vector_length,), dtype=native_dtype, device=dev) + a = tvm.runtime.empty(shape=(vector_length,), dtype=native_dtype, device=dev) a.copyfrom(a_np) b_np = np.random.uniform(low=0, high=5, size=np_shape).astype(numpytype) - b = tvm.nd.empty(shape=(vector_length,), dtype=native_dtype, device=dev) + b = tvm.runtime.empty(shape=(vector_length,), dtype=native_dtype, device=dev) b.copyfrom(b_np) - c = tvm.nd.empty(shape=(vector_length,), dtype=native_dtype, device=dev) + c = tvm.runtime.empty(shape=(vector_length,), dtype=native_dtype, device=dev) fadd(a, b, c) tvm.testing.assert_allclose( diff --git a/tests/python/codegen/test_target_codegen_cuda_fp8.py b/tests/python/codegen/test_target_codegen_cuda_fp8.py index c0b6130bcb80..51a9db240f4c 100644 --- a/tests/python/codegen/test_target_codegen_cuda_fp8.py +++ b/tests/python/codegen/test_target_codegen_cuda_fp8.py @@ -76,9 +76,9 @@ def add( dev = 
tvm.device(target, 0) - a = tvm.nd.array(np.random.uniform(low=0, high=5, size=64).astype(dtype), dev) - b = tvm.nd.array(np.random.uniform(low=0, high=5, size=64).astype(dtype), dev) - c = tvm.nd.array(np.zeros(64, dtype=dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(low=0, high=5, size=64).astype(dtype), dev) + b = tvm.runtime.tensor(np.random.uniform(low=0, high=5, size=64).astype(dtype), dev) + c = tvm.runtime.tensor(np.zeros(64, dtype=dtype), dev) fadd(a, b, c) tvm.testing.assert_allclose( @@ -135,9 +135,9 @@ def add( np_shape = (length, vector_length) a_np = np.random.uniform(low=0, high=5, size=np_shape).astype(dtype) - a = tvm.nd.empty(shape=(length,), dtype=native_dtype, device=dev) - r = tvm.nd.empty(shape=(length,), dtype=packed_dtype, device=dev) - b = tvm.nd.empty(shape=(length,), dtype=native_dtype, device=dev) + a = tvm.runtime.empty(shape=(length,), dtype=native_dtype, device=dev) + r = tvm.runtime.empty(shape=(length,), dtype=packed_dtype, device=dev) + b = tvm.runtime.empty(shape=(length,), dtype=native_dtype, device=dev) a.copyfrom(a_np) f(a, r, b) tvm.testing.assert_allclose(a.numpy().astype("float16"), b.numpy().astype("float16")) @@ -205,12 +205,12 @@ def add( np_shape = (vector_length, lanes) if lanes > 1 else (vector_length,) a_np = np.random.uniform(low=0, high=5, size=np_shape).astype(numpytype) - a = tvm.nd.empty(shape=(vector_length,), dtype=native_dtype, device=dev) + a = tvm.runtime.empty(shape=(vector_length,), dtype=native_dtype, device=dev) a.copyfrom(a_np) b_np = np.random.uniform(low=0, high=5, size=np_shape).astype(numpytype) - b = tvm.nd.empty(shape=(vector_length,), dtype=native_dtype, device=dev) + b = tvm.runtime.empty(shape=(vector_length,), dtype=native_dtype, device=dev) b.copyfrom(b_np) - c = tvm.nd.empty(shape=(vector_length,), dtype=native_dtype, device=dev) + c = tvm.runtime.empty(shape=(vector_length,), dtype=native_dtype, device=dev) fadd(a, b, c) tvm.testing.assert_allclose( @@ -243,8 +243,8 @@ def vector_broadcast(a: T.Buffer((), dtype), vec: T.Buffer((bcast_length,), dtyp dev = tvm.device(target, 0) a_np = np.random.uniform(low=0, high=4, size=()).astype(dtype) - a = tvm.nd.array(a_np, device=dev) - b = tvm.nd.empty((bcast_length,), dtype=dtype, device=dev) + a = tvm.runtime.tensor(a_np, device=dev) + b = tvm.runtime.empty((bcast_length,), dtype=dtype, device=dev) func(a, b) @@ -276,9 +276,9 @@ def vector_load( dev = tvm.device(target, 0) a_np = np.random.uniform(low=0, high=1, size=(length,)).astype(dtype) - a = tvm.nd.array(a_np, device=dev) + a = tvm.runtime.tensor(a_np, device=dev) - b = tvm.nd.empty((length // vector_length,), dtype=vec_dtype, device=dev) + b = tvm.runtime.empty((length // vector_length,), dtype=vec_dtype, device=dev) f(a, b) @@ -325,12 +325,12 @@ def add( dev = tvm.device(target, 0) a_np = np.random.uniform(-1, 1, (length, vector_length)).astype(dtype) - a = tvm.nd.empty(shape=(length,), dtype=vec_dtype, device=dev) + a = tvm.runtime.empty(shape=(length,), dtype=vec_dtype, device=dev) a.copyfrom(a_np) b_np = np.random.uniform(-1, 1, (length, vector_length)).astype(dtype) - b = tvm.nd.empty(shape=(length,), dtype=vec_dtype, device=dev) + b = tvm.runtime.empty(shape=(length,), dtype=vec_dtype, device=dev) b.copyfrom(b_np) - c = tvm.nd.empty(shape=(length,), dtype=vec_dtype, device=dev) + c = tvm.runtime.empty(shape=(length,), dtype=vec_dtype, device=dev) fadd(a, b, c) c_expected = a_np + b_np @@ -805,7 +805,7 @@ def test_main(self, weight_shape, model_dtype, target_str, compiled_functions): dev = 
tvm.device(target_str, 0) weight_np = np.random.uniform(-100, 100, weight_shape).astype(model_dtype) - weight = tvm.nd.array(weight_np, device=dev) + weight = tvm.runtime.tensor(weight_np, device=dev) quant_weight, scales = quant(weight) quant_weight_np, scales_np = quant_weight.numpy(), scales.numpy() @@ -955,16 +955,16 @@ def _pipeline(mod: tvm.ir.IRModule) -> tvm.ir.IRModule: dev = tvm.cuda(0) x_data = np.zeros((1, reduce_size), dtype=np.float16) - x = tvm.nd.array(x_data, device=dev) + x = tvm.runtime.tensor(x_data, device=dev) indptr_data = np.zeros((1, 2), dtype=np.int32) - indptr = tvm.nd.array(indptr_data, device=dev) + indptr = tvm.runtime.tensor(indptr_data, device=dev) weight_data = np.zeros((num_experts, spatial_size, reduce_size), dtype="float8_e4m3fn") - weight = tvm.nd.array(weight_data, device=dev) + weight = tvm.runtime.tensor(weight_data, device=dev) scale_data = np.zeros((1,), dtype=np.float32) - scale = tvm.nd.array(scale_data, device=dev) + scale = tvm.runtime.tensor(scale_data, device=dev) vm = relax.VirtualMachine(rt_mod, dev) # Ensure this runs without failure. Utilizing dlight thread extents TS, TR = 4, 64 @@ -1000,9 +1000,9 @@ def func_vectorize( a_np = np.random.rand(128).astype("float8_e4m3fn") b_np = np.random.rand(128).astype(dtype) c_np = (a_np.astype(dtype) * b_np) + 3 - a_tvm = tvm.nd.array(a_np, device=device) - b_tvm = tvm.nd.array(b_np, device=device) - c_tvm = tvm.nd.empty((128,), dtype=dtype, device=device) + a_tvm = tvm.runtime.tensor(a_np, device=device) + b_tvm = tvm.runtime.tensor(b_np, device=device) + c_tvm = tvm.runtime.empty((128,), dtype=dtype, device=device) f(a_tvm, b_tvm, c_tvm) c_tvm = c_tvm.numpy() np.testing.assert_allclose( diff --git a/tests/python/codegen/test_target_codegen_device.py b/tests/python/codegen/test_target_codegen_device.py index 4dad03d7004c..b897d50b41c7 100644 --- a/tests/python/codegen/test_target_codegen_device.py +++ b/tests/python/codegen/test_target_codegen_device.py @@ -50,7 +50,7 @@ def check_target(device): dev = tvm.device(device, 0) f = tvm.compile(sch.mod, target=device) # launch the kernel. - a = tvm.nd.empty((n,), dtype=A.dtype, device=dev) + a = tvm.runtime.empty((n,), dtype=A.dtype, device=dev) f(a) assert a.numpy()[0] == value + 3 @@ -95,12 +95,12 @@ def check_target(device, host): dev = tvm.device(device, 0) target = tvm.target.Target(device, host) mhost = tvm.tir.build(sch.mod, target=target) - f = mhost.entry_func + f = mhost.main # launch the kernel. n = 1027 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=()).astype(B.dtype), dev) - d = tvm.nd.array(np.zeros(n, dtype=D.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.random.uniform(size=()).astype(B.dtype), dev) + d = tvm.runtime.tensor(np.zeros(n, dtype=D.dtype), dev) f(a, b, d) tvm.testing.assert_allclose(d.numpy(), a.numpy() + b.numpy() + 1) diff --git a/tests/python/codegen/test_target_codegen_extern.py b/tests/python/codegen/test_target_codegen_extern.py index 35227baaff5b..f02a717747b4 100644 --- a/tests/python/codegen/test_target_codegen_extern.py +++ b/tests/python/codegen/test_target_codegen_extern.py @@ -73,8 +73,8 @@ def check_target(target): dev = tvm.device(target, 0) # launch the kernel. 
n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(size=n).astype(A.dtype), dev) + c = tvm.runtime.tensor(np.zeros(n, dtype=C.dtype), dev) f(a, c) tvm.testing.assert_allclose(c.numpy(), a.numpy() + 1) @@ -109,8 +109,8 @@ def check_target(target): dev = tvm.cpu(0) # launch the kernel. n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(size=n).astype(A.dtype), dev) + c = tvm.runtime.tensor(np.zeros(n, dtype=C.dtype), dev) f(a, c) tvm.testing.assert_allclose(c.numpy(), a.numpy()) @@ -140,8 +140,8 @@ def check_target(target): dev = tvm.cpu(0) # launch the kernel. n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(size=n).astype(A.dtype), dev) + c = tvm.runtime.tensor(np.zeros(n, dtype=C.dtype), dev) @tvm.register_func def my_extern_array_func2(aa, bb): diff --git a/tests/python/codegen/test_target_codegen_gpu_common.py b/tests/python/codegen/test_target_codegen_gpu_common.py index 08f43a114084..b115fddb57f7 100644 --- a/tests/python/codegen/test_target_codegen_gpu_common.py +++ b/tests/python/codegen/test_target_codegen_gpu_common.py @@ -41,8 +41,8 @@ def run_test(tvm_intrin, np_func, dtype): (x,) = sch.get_loops(sch.get_block("B")) sch.bind(x, "threadIdx.x") f = tvm.compile(sch.mod, target=target) - a = tvm.nd.array(np.random.randint(0, 100000, size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(shape=(n,)).astype(B.dtype), dev) + a = tvm.runtime.tensor(np.random.randint(0, 100000, size=n).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.zeros(shape=(n,)).astype(B.dtype), dev) f(a, b) ref = np.vectorize(partial(np_func, dtype=dtype))(a.numpy()) tvm.testing.assert_allclose(b.numpy(), ref) diff --git a/tests/python/codegen/test_target_codegen_llvm.py b/tests/python/codegen/test_target_codegen_llvm.py index b303cf289eca..88b791d1aa52 100644 --- a/tests/python/codegen/test_target_codegen_llvm.py +++ b/tests/python/codegen/test_target_codegen_llvm.py @@ -118,7 +118,7 @@ def check_llvm(): f = tvm.compile(sch.mod, target="llvm") dev = tvm.cpu(0) # launch the kernel. - a = tvm.nd.empty((), dtype=A.dtype, device=dev) + a = tvm.runtime.empty((), dtype=A.dtype, device=dev) f(a) assert a.numpy() == value + 3 @@ -160,8 +160,8 @@ def check_llvm(): f = tvm.compile(sch.mod, target="llvm") dev = tvm.cpu(0) # launch the kernel. - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(size=n).astype(A.dtype), dev) + c = tvm.runtime.tensor(np.zeros(n, dtype=C.dtype), dev) f(a, c) tvm.testing.assert_allclose(c.numpy(), np.sqrt(a.numpy() + 1) * 2 + 2, rtol=1e-5) @@ -193,8 +193,8 @@ def check_llvm(nn, base): dev = tvm.cpu(0) # launch the kernel. 
n = nn - a = tvm.nd.array(np.random.uniform(size=(n + base)).astype(A.dtype), dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(size=(n + base)).astype(A.dtype), dev) + c = tvm.runtime.tensor(np.zeros(n, dtype=C.dtype), dev) f(a, c) tvm.testing.assert_allclose(c.numpy(), a.numpy()[::-1][:n]) @@ -226,9 +226,9 @@ def test_llvm_vadd_pipeline(): f = tvm.compile(sch.mod, target="llvm") dev = tvm.cpu(0) n = 128 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.random.uniform(size=n).astype(B.dtype), dev) + c = tvm.runtime.tensor(np.zeros(n, dtype=C.dtype), dev) f(a, b, c) tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) @@ -258,8 +258,8 @@ def check_llvm(nn, base, stride): dev = tvm.cpu(0) # launch the kernel. n = nn - a = tvm.nd.array(np.random.uniform(size=(n + base, stride)).astype(A.dtype), dev) - c = tvm.nd.array(np.zeros((n, stride), dtype=C.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(size=(n + base, stride)).astype(A.dtype), dev) + c = tvm.runtime.tensor(np.zeros((n, stride), dtype=C.dtype), dev) f(a, c) tvm.testing.assert_allclose(c.numpy(), a.numpy()[base:] + 1) @@ -288,8 +288,8 @@ def check_llvm(): dev = tvm.cpu(0) # launch the kernel. n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(size=n).astype(A.dtype), dev) + c = tvm.runtime.tensor(np.zeros(n, dtype=C.dtype), dev) f(a, c) tvm.testing.assert_allclose(c.numpy(), a.numpy() + 1 + 1) @@ -320,9 +320,9 @@ def test_multiple_func(): f = tvm.compile(mod, target="llvm") dev = tvm.cpu(0) n = 10 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.random.uniform(size=n).astype(B.dtype), dev) + c = tvm.runtime.tensor(np.zeros(n, dtype=C.dtype), dev) # Test both functions f["fadd1"](a, b, c) @@ -345,8 +345,8 @@ def check_llvm(n, offset): f = tvm.compile(sch.mod, target="llvm") dev = tvm.cpu(0) # launch the kernel. - a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev) - c = tvm.nd.empty((n,), A.dtype, dev) + a = tvm.runtime.tensor(np.random.uniform(size=(n,)).astype(A.dtype), dev) + c = tvm.runtime.empty((n,), A.dtype, dev) f(a, c) c_np = a.numpy() c_np[:offset] = 0 @@ -369,8 +369,8 @@ def check_llvm(n): f = tvm.compile(sch.mod, target="llvm") dev = tvm.cpu(0) # launch the kernel. - a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), dev) - c = tvm.nd.empty((n,), C.dtype, dev) + a = tvm.runtime.tensor(np.random.randint(0, 2, size=(n,)).astype(A.dtype), dev) + c = tvm.runtime.empty((n,), C.dtype, dev) f(a, c) c_np = a.numpy() == 1 tvm.testing.assert_allclose(c.numpy(), c_np) @@ -395,9 +395,9 @@ def check_llvm(n): f = tvm.compile(sch.mod, target="llvm") dev = tvm.cpu(0) # launch the kernel. 
- a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), dev) - sc = tvm.nd.array(np.random.randint(0, 2, size=()).astype(scale.dtype), dev) - d = tvm.nd.empty((), D.dtype, dev) + a = tvm.runtime.tensor(np.random.randint(0, 2, size=(n,)).astype(A.dtype), dev) + sc = tvm.runtime.tensor(np.random.randint(0, 2, size=()).astype(scale.dtype), dev) + d = tvm.runtime.empty((), D.dtype, dev) f(a, sc, d) d_np = np.sum(a.numpy()) * sc.numpy() + 1 tvm.testing.assert_allclose(d.numpy(), d_np) @@ -423,9 +423,9 @@ def check_llvm(n): f = tvm.compile(sch.mod, target="llvm") dev = tvm.cpu(0) # launch the kernel. - a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), dev) - sc = tvm.nd.array(np.random.randint(0, 2, size=()).astype(scale.dtype), dev) - d = tvm.nd.empty((), D.dtype, dev) + a = tvm.runtime.tensor(np.random.randint(0, 2, size=(n,)).astype(A.dtype), dev) + sc = tvm.runtime.tensor(np.random.randint(0, 2, size=()).astype(scale.dtype), dev) + d = tvm.runtime.empty((), D.dtype, dev) f(a, sc, d) d_np = np.sum(a.numpy()) * sc.numpy() + 1 tvm.testing.assert_allclose(d.numpy(), d_np) @@ -531,16 +531,16 @@ def clipb(x): f = tvm.compile(sch.mod, target="llvm") # Fill input arrays with values - A_arr = tvm.nd.empty((end - start + 1,), dtype) - B_arr = tvm.nd.empty((dend - dstart + 1,), dtype) + A_arr = tvm.runtime.empty((end - start + 1,), dtype) + B_arr = tvm.runtime.empty((dend - dstart + 1,), dtype) A_arr.copyfrom(np.arange(start, end + 1, dtype=dtype)) B_np = np.arange(dstart, dend + 1, dtype=dtype) # If the range of the divisor contains 0, replace it with 1 to avoid division by zero if dend >= 0 and dstart <= 0: B_np[-dstart] = 1 B_arr.copyfrom(B_np) - D_arr = tvm.nd.empty((end - start + 1, dend - dstart + 1), dtype) - M_arr = tvm.nd.empty((end - start + 1, dend - dstart + 1), dtype) + D_arr = tvm.runtime.empty((end - start + 1, dend - dstart + 1), dtype) + M_arr = tvm.runtime.empty((end - start + 1, dend - dstart + 1), dtype) # Run the function and convert the results to numpy f(A_arr, B_arr, D_arr, M_arr) @@ -636,8 +636,8 @@ def check_llvm_reciprocal(n): # Build from scheduled TIR f = tvm.compile(sch.mod, target="llvm") - a = tvm.nd.array(np.full((n,), 100, "float32")) - b = tvm.nd.empty((n,), "float32") + a = tvm.runtime.tensor(np.full((n,), 100, "float32")) + b = tvm.runtime.empty((n,), "float32") f(a, b) tvm.testing.assert_allclose(b.numpy(), np.zeros((n,), "float32")) @@ -656,8 +656,8 @@ def check_llvm_sigmoid(n): # Build from scheduled TIR f = tvm.compile(sch.mod, target="llvm") - a = tvm.nd.array(np.full((n,), -1000, "float32")) - b = tvm.nd.empty((n,), "float32") + a = tvm.runtime.tensor(np.full((n,), -1000, "float32")) + b = tvm.runtime.empty((n,), "float32") f(a, b) tvm.testing.assert_allclose(b.numpy(), np.zeros((n,), "float32")) @@ -780,9 +780,9 @@ def dotest(do_vectorize): npa = np.random.rand(32).astype("bfloat16") npb = np.random.rand(32).astype("bfloat16") res = npa + npb - a_ = tvm.nd.array(npa) - b_ = tvm.nd.array(npb) - c_ = tvm.nd.empty((32,), "bfloat16") + a_ = tvm.runtime.tensor(npa) + b_ = tvm.runtime.tensor(npb) + c_ = tvm.runtime.empty((32,), "bfloat16") module(a_, b_, c_) # Note: directly compare without casting to float32 should work with the # latest numpy version. @@ -868,8 +868,8 @@ def check_llvm(use_file): f = tvm.compile(sch.mod, target="llvm") dev = tvm.cpu(0) # launch the kernel. 
- a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.random.uniform(size=n).astype(B.dtype), dev) f(a, b) tvm.testing.assert_allclose(b.numpy(), a.numpy() + 1.0) @@ -1027,7 +1027,7 @@ def subroutine(A_data: T.handle("float32")): built = tvm.compile(mod) - arr = tvm.nd.array(np.zeros([1], "float32"), device=dev) + arr = tvm.runtime.tensor(np.zeros([1], "float32"), device=dev) built["main"](arr) assert arr.numpy()[0] == 42.0 @@ -1191,10 +1191,10 @@ def func(a0: T.bool, a1: T.Buffer([10], "float32")) -> T.int32: built(1, 1) with pytest.raises(RuntimeError): - built(1, tvm.nd.empty([10], "int32")) + built(1, tvm.runtime.empty([10], "int32")) with pytest.raises(RuntimeError): - built(False, tvm.nd.empty([11], "float32")) + built(False, tvm.runtime.empty([11], "float32")) if __name__ == "__main__": diff --git a/tests/python/codegen/test_target_codegen_metal.py b/tests/python/codegen/test_target_codegen_metal.py index 6b413d532371..8f50ec829843 100644 --- a/tests/python/codegen/test_target_codegen_metal.py +++ b/tests/python/codegen/test_target_codegen_metal.py @@ -37,8 +37,8 @@ def check_inf_nan(dev, n, value, dtype): (x,) = sch.get_loops(sch.get_block("C")) sch.bind(x, "threadIdx.x") fun = tvm.compile(sch.mod, target=target) - a = tvm.nd.empty((n,), A.dtype, dev) - c = tvm.nd.empty((n,), A.dtype, dev) + a = tvm.runtime.empty((n,), A.dtype, dev) + c = tvm.runtime.empty((n,), A.dtype, dev) # Only need to test compiling here fun(a, c) @@ -70,8 +70,8 @@ def main(A: T.Buffer((2, 3), "float32"), B: T.Buffer((6,), "float32")): dev = tvm.metal() a = (np.arange(6).reshape(2, 3)).astype("float32") - a_nd = tvm.nd.array(a, dev) - b_nd = tvm.nd.empty((6,), "float32", dev) + a_nd = tvm.runtime.tensor(a, dev) + b_nd = tvm.runtime.empty((6,), "float32", dev) f = tvm.compile(IRModule, target=target) f(a_nd, b_nd) np.testing.assert_allclose(b_nd.numpy(), a.reshape(6), atol=1e-5, rtol=1e-5) @@ -90,8 +90,8 @@ def check_erf(dev, n, dtype): (x,) = sch.get_loops(sch.get_block("C")) sch.bind(x, "threadIdx.x") fun = tvm.compile(sch.mod, target=target) - a = tvm.nd.empty((n,), A.dtype, dev) - c = tvm.nd.empty((n,), A.dtype, dev) + a = tvm.runtime.empty((n,), A.dtype, dev) + c = tvm.runtime.empty((n,), A.dtype, dev) # Only need to test compiling here fun(a, c) @@ -119,7 +119,7 @@ def main(A: T.Buffer((1, 2), "int32")): f = tvm.compile(IRModule, target=target) dev = tvm.metal() - a_nd = tvm.nd.empty((1, 2), "int32", dev) + a_nd = tvm.runtime.empty((1, 2), "int32", dev) f(a_nd) assert tuple(a_nd.numpy()[0, :]) == (0, 3) @@ -141,8 +141,8 @@ def main(A: T.Buffer((6), "float32"), B: T.Buffer((6,), "float32")): target = "metal" dev = tvm.metal() a = np.arange(6).astype("float32") - a_nd = tvm.nd.array(a, dev) - b_nd = tvm.nd.empty((6,), "float32", dev) + a_nd = tvm.runtime.tensor(a, dev) + b_nd = tvm.runtime.empty((6,), "float32", dev) f = tvm.compile(IRModule, target=target) f(a_nd, b_nd) a.reshape(3, 2)[:, 1] = 0 @@ -162,8 +162,8 @@ def func(A: T.Buffer((16), "uint8"), B: T.Buffer((16), "float32")): dev = tvm.metal() a = np.arange(16).astype("uint8") - a_nd = tvm.nd.array(a, dev) - b_nd = tvm.nd.empty((16,), "float32", dev) + a_nd = tvm.runtime.tensor(a, dev) + b_nd = tvm.runtime.empty((16,), "float32", dev) f = tvm.compile(func, target="metal") f(a_nd, b_nd) np.testing.assert_allclose(b_nd.numpy(), a.astype("float32"), atol=1e-5, rtol=1e-5) 
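The Python-side churn in the test hunks above and below follows one mechanical mapping: tvm.nd.array becomes tvm.runtime.tensor, tvm.nd.empty becomes tvm.runtime.empty, and isinstance checks move from tvm.nd.NDArray to tvm.runtime.Tensor. The snippet below is a minimal sketch of that mapping, distilled from the hunks in this patch rather than copied from any one file; the helper name run_compiled_add, the device argument, the 1024-element shape, and the assumption that the compiled module computes B = A + 1 are illustrative only and not part of the patch.

import numpy as np
import tvm

def run_compiled_add(mod, dev):
    # Hypothetical helper mirroring the updated tests: allocate device arrays with
    # the renamed tvm.runtime API, run a compiled kernel, and check the result.
    a_np = np.random.uniform(size=1024).astype("float32")
    a = tvm.runtime.tensor(a_np, dev)               # previously tvm.nd.array(a_np, dev)
    b = tvm.runtime.empty((1024,), "float32", dev)  # previously tvm.nd.empty(...)
    assert isinstance(a, tvm.runtime.Tensor)        # previously tvm.nd.NDArray
    mod(a, b)                                       # assumes mod computes B = A + 1, as in the vadd-style tests
    np.testing.assert_allclose(b.numpy(), a_np + 1.0)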
diff --git a/tests/python/codegen/test_target_codegen_opencl.py b/tests/python/codegen/test_target_codegen_opencl.py index 4eb96747bcee..3e0fe7e31e50 100644 --- a/tests/python/codegen/test_target_codegen_opencl.py +++ b/tests/python/codegen/test_target_codegen_opencl.py @@ -39,8 +39,8 @@ def check_if_then_else(dev, n, dtype): (x,) = sch.get_loops(sch.get_block("C")) sch.bind(x, "threadIdx.x") fun = tvm.tir.build(sch.mod, target=target) - a = tvm.nd.empty((n,), A.dtype, dev) - c = tvm.nd.empty((n,), A.dtype, dev) + a = tvm.runtime.empty((n,), A.dtype, dev) + c = tvm.runtime.empty((n,), A.dtype, dev) # Only need to test compiling here fun(a, c) @@ -57,8 +57,8 @@ def check_select(dev, n, dtype): sch.bind(x, "threadIdx.x") fun = tvm.tir.build(sch.mod, target=target) - a = tvm.nd.empty((n,), A.dtype, dev) - c = tvm.nd.empty((n,), A.dtype, dev) + a = tvm.runtime.empty((n,), A.dtype, dev) + c = tvm.runtime.empty((n,), A.dtype, dev) # Only need to test compiling here fun(a, c) @@ -86,8 +86,8 @@ def check_inf_nan(dev, n, value, dtype): (x,) = sch.get_loops(sch.get_block("C")) sch.bind(x, "threadIdx.x") fun = tvm.tir.build(sch.mod, target=target) - a = tvm.nd.empty((n,), A.dtype, dev) - c = tvm.nd.empty((n,), A.dtype, dev) + a = tvm.runtime.empty((n,), A.dtype, dev) + c = tvm.runtime.empty((n,), A.dtype, dev) # Only need to test compiling here fun(a, c) @@ -115,8 +115,8 @@ def check_max(dev, n, dtype): sch.bind(x, "threadIdx.x") fun = tvm.tir.build(sch.mod, target=target) - a = tvm.nd.empty((n,), A.dtype, dev) - c = tvm.nd.empty((n,), A.dtype, dev) + a = tvm.runtime.empty((n,), A.dtype, dev) + c = tvm.runtime.empty((n,), A.dtype, dev) # Only need to test compiling here fun(a, c) @@ -179,7 +179,7 @@ def check_type_casting(ctx, n, dtype): sch.vectorize(vx) fun = tvm.tir.build(sch.mod, target=target) - c = tvm.nd.empty((n,), dtype, ctx) + c = tvm.runtime.empty((n,), dtype, ctx) assembly = fun.imports[0].inspect_source() lcond = "convert_int4(((convert_uint4(((uint4)(((convert_int(get_local_id(0))) == 3), ((convert_int(get_local_id(0))) == 3), ((convert_int(get_local_id(0))) == 3), ((convert_int(get_local_id(0))) == 3)))))" rcond = "(convert_uint4(((((int4)(((convert_int(get_local_id(0))))+(1*0), ((convert_int(get_local_id(0))))+(1*1), ((convert_int(get_local_id(0))))+(1*2), ((convert_int(get_local_id(0))))+(1*3))) % ((int4)(3, 3, 3, 3))) == ((int4)(1, 1, 1, 1))))))))" diff --git a/tests/python/codegen/test_target_codegen_rocm.py b/tests/python/codegen/test_target_codegen_rocm.py index a89d71f2be48..cdd84fc57ae1 100644 --- a/tests/python/codegen/test_target_codegen_rocm.py +++ b/tests/python/codegen/test_target_codegen_rocm.py @@ -32,8 +32,8 @@ def check_inf_nan(dev, n, value, dtype): sch.bind(xo, "blockIdx.x") sch.bind(xi, "threadIdx.x") fun = tvm.compile(sch.mod, "rocm") - a = tvm.nd.empty((n,), A.dtype, dev) - c = tvm.nd.empty((n,), A.dtype, dev) + a = tvm.runtime.empty((n,), A.dtype, dev) + c = tvm.runtime.empty((n,), A.dtype, dev) # Only need to test compiling here fun(a, c) @@ -53,7 +53,7 @@ def check_rocm(dtype, n): A = te.placeholder((n,), name="A", dtype=dtype) dev = tvm.rocm(0) a_np = np.random.uniform(size=(n,)).astype(A.dtype) - a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(a_np) + a = tvm.runtime.empty((n,), A.dtype, dev).copyfrom(a_np) b_np = a.numpy() tvm.testing.assert_allclose(a_np, b_np) tvm.testing.assert_allclose(a_np, a.numpy()) @@ -79,8 +79,8 @@ def check_rocm(dtype, n, lanes): fun = tvm.compile(sch.mod, target="rocm") dev = tvm.rocm(0) - a = tvm.nd.empty((n,), A.dtype, 
dev).copyfrom(np.random.uniform(size=(n, lanes))) - c = tvm.nd.empty((n,), B.dtype, dev) + a = tvm.runtime.empty((n,), A.dtype, dev).copyfrom(np.random.uniform(size=(n, lanes))) + c = tvm.runtime.empty((n,), B.dtype, dev) fun(a, c) tvm.testing.assert_allclose(c.numpy(), a.numpy() + 1) @@ -109,7 +109,7 @@ def func( mod = tvm.compile(func, target="rocm") dev = tvm.rocm(0) - a = tvm.nd.array(np.random.uniform(size=(32,)).astype("float32"), dev) + a = tvm.runtime.tensor(np.random.uniform(size=(32,)).astype("float32"), dev) mod(a) tvm.testing.assert_allclose(a.numpy(), np.ones((32,)) * a.numpy()[0]) @@ -132,7 +132,7 @@ def func( mod = tvm.compile(func, target="rocm") dev = tvm.rocm(0) - a = tvm.nd.array(np.ones((4,)).astype("float32"), dev) - b = tvm.nd.array(np.zeros((4,)).astype("float32"), dev) + a = tvm.runtime.tensor(np.ones((4,)).astype("float32"), dev) + b = tvm.runtime.tensor(np.zeros((4,)).astype("float32"), dev) mod(a, b) tvm.testing.assert_allclose(b.numpy(), np.exp2(a.numpy())) diff --git a/tests/python/codegen/test_target_codegen_static_init.py b/tests/python/codegen/test_target_codegen_static_init.py index 4d993e5d6b7b..ad3863abd13d 100644 --- a/tests/python/codegen/test_target_codegen_static_init.py +++ b/tests/python/codegen/test_target_codegen_static_init.py @@ -36,7 +36,7 @@ def test_static_callback(): mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([Ab], stmt).with_attr("global_symbol", "ramp")) f = tvm.driver.build(mod, target="llvm") - a = tvm.nd.array(np.zeros(10, dtype=dtype)) + a = tvm.runtime.tensor(np.zeros(10, dtype=dtype)) f(a) f(a) np.testing.assert_equal(a.numpy(), np.ones(a.shape[0])) @@ -59,7 +59,7 @@ def test_cb(sh, A): stmt = ib.get() mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([Ab], stmt).with_attr("global_symbol", "ramp")) f = tvm.driver.build(mod, target="llvm") - a = tvm.nd.array(np.zeros(10, dtype=dtype)) + a = tvm.runtime.tensor(np.zeros(10, dtype=dtype)) f(a) diff --git a/tests/python/codegen/test_target_codegen_vulkan.py b/tests/python/codegen/test_target_codegen_vulkan.py index a523ae037794..cf7b46692661 100644 --- a/tests/python/codegen/test_target_codegen_vulkan.py +++ b/tests/python/codegen/test_target_codegen_vulkan.py @@ -99,7 +99,7 @@ def test_array_copy(dev, dtype, fuzz_seed): log_arr_size = np.random.uniform(low=np.log(1), high=np.log(32768)) arr_size = np.exp(log_arr_size).astype(int) a_np = np.random.uniform(size=(arr_size,)).astype(dtype) - a = tvm.nd.empty((arr_size,), dtype, dev).copyfrom(a_np) + a = tvm.runtime.empty((arr_size,), dtype, dev).copyfrom(a_np) b_np = a.numpy() tvm.testing.assert_allclose(a_np, b_np) tvm.testing.assert_allclose(a_np, a.numpy()) @@ -123,8 +123,10 @@ def test_array_vectorize_add(target, dev, dtype): sch.bind(xi, "threadIdx.x") f = tvm.compile(sch.mod, target=target) - a = tvm.nd.empty((arr_size,), A.dtype, dev).copyfrom(np.random.uniform(size=(arr_size, lanes))) - c = tvm.nd.empty((arr_size,), B.dtype, dev) + a = tvm.runtime.empty((arr_size,), A.dtype, dev).copyfrom( + np.random.uniform(size=(arr_size, lanes)) + ) + c = tvm.runtime.empty((arr_size,), B.dtype, dev) f(a, c) tvm.testing.assert_allclose(c.numpy(), a.numpy() + 1) @@ -146,8 +148,8 @@ def test_vulkan_bool_load(target, dev): a_np = np.random.uniform(size=arr_size) > 0.5 b_np = np.zeros((arr_size,), dtype="int32") - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(b_np, dev) + a = tvm.runtime.tensor(a_np, dev) + b = tvm.runtime.tensor(b_np, dev) f(a, b) ref = a_np.astype(np.int32) tvm.testing.assert_allclose(b.numpy(), ref) @@ -198,8 +200,8 @@ def 
test_vulkan_constant_passing(target, dev, vulkan_parameter_impl, vulkan_para n = 1024 scalars = np.array([1 for _ in scalars]).astype(dtype) - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.zeros(n, dtype=B.dtype), dev) f_add(*scalars, a, b) tvm.testing.assert_allclose(a.numpy() + sum(scalars), b.numpy()) @@ -244,13 +246,13 @@ def do_compute(A, B, n): # Build func = tvm.compile(sch.mod, target=target) - a = tvm.nd.array(np.array([5], dtype=A.dtype), dev) - b = tvm.nd.array(np.zeros(n, dtype=A.dtype), dev) + a = tvm.runtime.tensor(np.array([5], dtype=A.dtype), dev) + b = tvm.runtime.tensor(np.zeros(n, dtype=A.dtype), dev) func(a, b) tvm.testing.assert_allclose(b.numpy(), [55]) - a = tvm.nd.array(np.array([-5], dtype=A.dtype), dev) - b = tvm.nd.array(np.zeros(n, dtype=A.dtype), dev) + a = tvm.runtime.tensor(np.array([-5], dtype=A.dtype), dev) + b = tvm.runtime.tensor(np.zeros(n, dtype=A.dtype), dev) func(a, b) tvm.testing.assert_allclose(b.numpy(), [210]) @@ -295,8 +297,8 @@ def do_compute(A, B, n): n = 32 a_np = np.arange(n).astype(dtype=A.dtype) b_np = np.zeros((n,), dtype="int32") - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(b_np, dev) + a = tvm.runtime.tensor(a_np, dev) + b = tvm.runtime.tensor(b_np, dev) func(a, b) tvm.testing.assert_allclose(b.numpy(), a_np) @@ -386,9 +388,9 @@ def test_ramp_broadcast_index(self, target, dev, mod, ref_data): f = tvm.compile(mod, target=target) a_np, reorder_np, b_np = ref_data - a = tvm.nd.array(a_np, dev) - r = tvm.nd.array(reorder_np, dev) - b = tvm.nd.array(np.zeros(shape=b_np.shape, dtype="int32"), dev) + a = tvm.runtime.tensor(a_np, dev) + r = tvm.runtime.tensor(reorder_np, dev) + b = tvm.runtime.tensor(np.zeros(shape=b_np.shape, dtype="int32"), dev) f(a, r, b) tvm.testing.assert_allclose(b.numpy(), b_np) @@ -426,7 +428,7 @@ def func(A: T.Buffer((N, 2), "int32")): built = tvm.compile(func, target=target) - a_dev = tvm.nd.empty([N, 2], "int32", dev) + a_dev = tvm.runtime.empty([N, 2], "int32", dev) built(a_dev) a = a_dev.numpy() @@ -538,9 +540,9 @@ def tensorize_load(block, dim): dev = tvm.device(target, 0) - A = tvm.nd.array(np.random.randn(M, K).astype("float16"), dev) - B = tvm.nd.array(np.random.randn(K, N).astype("float16"), dev) - C = tvm.nd.array(np.random.randn(M, N).astype(out_dtype), dev) + A = tvm.runtime.tensor(np.random.randn(M, K).astype("float16"), dev) + B = tvm.runtime.tensor(np.random.randn(K, N).astype("float16"), dev) + C = tvm.runtime.tensor(np.random.randn(M, N).astype(out_dtype), dev) f(A, B, C) @@ -614,8 +616,8 @@ def run_test(tvm_intrin, np_func): else: data = np.random.uniform(0.1, 0.9, size=n) - a = tvm.nd.array(data.astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(n, dtype=A.dtype), dev) + a = tvm.runtime.tensor(data.astype(A.dtype), dev) + b = tvm.runtime.tensor(np.zeros(n, dtype=A.dtype), dev) func(a, b) tvm.testing.assert_allclose(b.numpy(), np_func(a.numpy()), atol=1e-3, rtol=1e-3) diff --git a/tests/python/contrib/test_cblas.py b/tests/python/contrib/test_cblas.py index c0e1553ea782..e2a15cc60b10 100644 --- a/tests/python/contrib/test_cblas.py +++ b/tests/python/contrib/test_cblas.py @@ -71,9 +71,15 @@ def verify(target="llvm"): ) if target == "c": f = compiling(f, name) - matrix_input1 = tvm.nd.array(np.random.uniform(size=ashape).astype(input1_data.dtype), dev) - matrix_input2 = 
tvm.nd.array(np.random.uniform(size=bshape).astype(input2_data.dtype), dev) - matrix_result = tvm.nd.array(np.zeros((matrix_n, matrix_m), dtype=final_result.dtype), dev) + matrix_input1 = tvm.runtime.tensor( + np.random.uniform(size=ashape).astype(input1_data.dtype), dev + ) + matrix_input2 = tvm.runtime.tensor( + np.random.uniform(size=bshape).astype(input2_data.dtype), dev + ) + matrix_result = tvm.runtime.tensor( + np.zeros((matrix_n, matrix_m), dtype=final_result.dtype), dev + ) matrix_bias = 10.0 f(matrix_input1, matrix_input2, matrix_result, matrix_bias) tvm.testing.assert_allclose( @@ -149,13 +155,15 @@ def verify(target="llvm"): f = tvm.compile( te.create_prim_func([input1_data, input2_data, final_result, bias]), target=target ) - matrix_input1 = tvm.nd.array( + matrix_input1 = tvm.runtime.tensor( np.random.randint(low=0, high=50, size=ashape).astype(input1_data.dtype), dev ) - matrix_input2 = tvm.nd.array( + matrix_input2 = tvm.runtime.tensor( np.random.randint(low=0, high=50, size=bshape).astype(input2_data.dtype), dev ) - matrix_result = tvm.nd.array(np.zeros((matrix_n, matrix_m), dtype=final_result.dtype), dev) + matrix_result = tvm.runtime.tensor( + np.zeros((matrix_n, matrix_m), dtype=final_result.dtype), dev + ) matrix_bias = 10 f(matrix_input1, matrix_input2, matrix_result, matrix_bias) tvm.testing.assert_allclose( @@ -235,9 +243,13 @@ def verify(target="llvm"): ) if target == "c": f = compiling(f, name) - matrix_input1 = tvm.nd.array(np.random.uniform(size=ashape).astype(input1_data.dtype), dev) - matrix_input2 = tvm.nd.array(np.random.uniform(size=bshape).astype(input2_data.dtype), dev) - matrix_result = tvm.nd.array( + matrix_input1 = tvm.runtime.tensor( + np.random.uniform(size=ashape).astype(input1_data.dtype), dev + ) + matrix_input2 = tvm.runtime.tensor( + np.random.uniform(size=bshape).astype(input2_data.dtype), dev + ) + matrix_result = tvm.runtime.tensor( np.zeros((batch, matrix_n, matrix_m), dtype=final_result.dtype), dev ) f(matrix_input1, matrix_input2, matrix_result) diff --git a/tests/python/contrib/test_coreml_runtime.py b/tests/python/contrib/test_coreml_runtime.py index c2284dbe64f6..014a57b28787 100644 --- a/tests/python/contrib/test_coreml_runtime.py +++ b/tests/python/contrib/test_coreml_runtime.py @@ -73,7 +73,7 @@ def verify(coreml_model, model_path, dev): # inference via tvm coreml runtime runtime = coreml_runtime.create("main", model_path, dev) for name in inputs: - runtime.set_input(name, tvm.nd.array(inputs[name], dev)) + runtime.set_input(name, tvm.runtime.tensor(inputs[name], dev)) runtime.invoke() tvm_outputs = [runtime.get_output(i).numpy() for i in range(runtime.get_num_outputs())] diff --git a/tests/python/contrib/test_cutlass_gemm.py b/tests/python/contrib/test_cutlass_gemm.py index 33f7ef1160a1..951085e8530c 100644 --- a/tests/python/contrib/test_cutlass_gemm.py +++ b/tests/python/contrib/test_cutlass_gemm.py @@ -24,7 +24,7 @@ from tvm.contrib.pickle_memoize import memoize -def get_random_ndarray(shape, dtype): +def get_random_tensor(shape, dtype): if dtype == "int8": return np.random.randint(-128, 128, shape).astype(dtype) elif dtype == "uint8": @@ -44,8 +44,8 @@ def verify_group_gemm( def get_ref_data(): assert M % num_groups == 0 M_per_group = M // num_groups - a_np = get_random_ndarray((M, K), x_dtype) - b_np = get_random_ndarray((num_groups, N, K), weight_dtype) + a_np = get_random_tensor((M, K), x_dtype) + b_np = get_random_tensor((num_groups, N, K), weight_dtype) indptr_np = np.arange(1, num_groups + 1).astype("int64") * M_per_group 
c_np = np.concatenate( [a_np[i * M_per_group : (i + 1) * M_per_group] @ b_np[i].T for i in range(num_groups)], @@ -59,13 +59,13 @@ def to_numpy_dtype(dtype): a_np, b_np, indptr_np, c_np = get_ref_data() dev = tvm.cuda(0) - a_nd = tvm.nd.array(a_np.astype(to_numpy_dtype(x_dtype)), device=dev) - b_nd = tvm.nd.array(b_np.astype(to_numpy_dtype(weight_dtype)), device=dev) - c_nd = tvm.nd.empty(c_np.shape, dtype=out_dtype, device=dev) - indptr_nd = tvm.nd.array(indptr_np, device=dev) - workspace = tvm.nd.empty((4096 * 1024,), dtype="uint8", device=dev) + a_nd = tvm.runtime.tensor(a_np.astype(to_numpy_dtype(x_dtype)), device=dev) + b_nd = tvm.runtime.tensor(b_np.astype(to_numpy_dtype(weight_dtype)), device=dev) + c_nd = tvm.runtime.empty(c_np.shape, dtype=out_dtype, device=dev) + indptr_nd = tvm.runtime.tensor(indptr_np, device=dev) + workspace = tvm.runtime.empty((4096 * 1024,), dtype="uint8", device=dev) if use_scale: - scale = tvm.nd.array(np.array([1.0], dtype="float32"), device=dev) + scale = tvm.runtime.tensor(np.array([1.0], dtype="float32"), device=dev) group_gemm_func(a_nd, b_nd, indptr_nd, workspace, scale, c_nd) else: group_gemm_func(a_nd, b_nd, indptr_nd, workspace, c_nd) @@ -319,12 +319,12 @@ def test_fp8_e4m3_groupwise_scaled_gemm(): x_np, x_scale_np = rowwise_quant_fp8_e4m3((M, K), block_size, dtype) w_np, w_scale_np = blockwise_quant_fp8_e4m3((N, K), block_size, dtype) o_np = blockwise_matmul(x_np, x_scale_np, w_np, w_scale_np, block_size, dtype) - x_tvm = tvm.nd.array(x_np, device=device) - x_scale_tvm = tvm.nd.array(x_scale_np.T, device=device) - w_tvm = tvm.nd.array(w_np, device=device) - w_scale_tvm = tvm.nd.array(w_scale_np, device=device) - workspace = tvm.nd.empty((4096 * 1024,), dtype="uint8", device=device) - o_tvm = tvm.nd.empty((M, N), dtype=dtype, device=device) + x_tvm = tvm.runtime.tensor(x_np, device=device) + x_scale_tvm = tvm.runtime.tensor(x_scale_np.T, device=device) + w_tvm = tvm.runtime.tensor(w_np, device=device) + w_scale_tvm = tvm.runtime.tensor(w_scale_np, device=device) + workspace = tvm.runtime.empty((4096 * 1024,), dtype="uint8", device=device) + o_tvm = tvm.runtime.empty((M, N), dtype=dtype, device=device) gemm_func( x_tvm, w_tvm, x_scale_tvm, w_scale_tvm, workspace, block_size[0], block_size[1], o_tvm ) @@ -353,12 +353,12 @@ def test_fp8_e4m3_groupwise_scaled_bmm(): x_np, x_scale_np = rowwise_quant_fp8_e4m3((B, M, K), block_size, dtype) w_np, w_scale_np = blockwise_quant_fp8_e4m3((B, N, K), block_size, dtype) o_np = blockwise_bmm(x_np, x_scale_np, w_np, w_scale_np, block_size, dtype) - x_tvm = tvm.nd.array(x_np, device=device) - x_scale_tvm = tvm.nd.array(x_scale_np.transpose(0, 2, 1), device=device) - w_tvm = tvm.nd.array(w_np, device=device) - w_scale_tvm = tvm.nd.array(w_scale_np, device=device) - workspace = tvm.nd.empty((4096 * 1024,), dtype="uint8", device=device) - o_tvm = tvm.nd.empty((B, M, N), dtype=dtype, device=device) + x_tvm = tvm.runtime.tensor(x_np, device=device) + x_scale_tvm = tvm.runtime.tensor(x_scale_np.transpose(0, 2, 1), device=device) + w_tvm = tvm.runtime.tensor(w_np, device=device) + w_scale_tvm = tvm.runtime.tensor(w_scale_np, device=device) + workspace = tvm.runtime.empty((4096 * 1024,), dtype="uint8", device=device) + o_tvm = tvm.runtime.empty((B, M, N), dtype=dtype, device=device) gemm_func( x_tvm, w_tvm, x_scale_tvm, w_scale_tvm, workspace, block_size[0], block_size[1], o_tvm ) diff --git a/tests/python/contrib/test_dlpack.py b/tests/python/contrib/test_dlpack.py index 421853899979..20992048b208 100644 --- 
a/tests/python/contrib/test_dlpack.py +++ b/tests/python/contrib/test_dlpack.py @@ -23,17 +23,17 @@ def verify_torch_dlpack(): a = np.random.randn(1337) - tvm_a = tvm.nd.array(a) - np.testing.assert_equal(tvm.nd.from_dlpack(tvm_a.to_dlpack()).numpy(), a) + tvm_a = tvm.runtime.tensor(a) + np.testing.assert_equal(tvm.runtime.from_dlpack(tvm_a.to_dlpack()).numpy(), a) try: import torch import torch.utils.dlpack x = torch.rand(56, 56) - tvm_x = tvm.nd.from_dlpack(torch.utils.dlpack.to_dlpack(x)) + tvm_x = tvm.runtime.from_dlpack(torch.utils.dlpack.to_dlpack(x)) np.testing.assert_equal(x.numpy(), tvm_x.numpy()) - y = tvm.nd.from_dlpack(tvm_x) + y = tvm.runtime.from_dlpack(tvm_x) np.testing.assert_equal(y.numpy(), tvm_x.numpy()) np.testing.assert_equal( torch.utils.dlpack.from_dlpack(y.to_dlpack()).numpy(), tvm_x.numpy() diff --git a/tests/python/contrib/test_edgetpu_runtime.py b/tests/python/contrib/test_edgetpu_runtime.py index 2bf58106dfdc..6fdd1799a1eb 100644 --- a/tests/python/contrib/test_edgetpu_runtime.py +++ b/tests/python/contrib/test_edgetpu_runtime.py @@ -76,7 +76,7 @@ def check_remote(server, target_edgetpu=False): with open(tflite_model_path, "rb") as model_fin: runtime = tflite_runtime.create(model_fin.read(), dev, runtime_target) - runtime.set_input(0, tvm.nd.array(tflite_input, dev)) + runtime.set_input(0, tvm.runtime.tensor(tflite_input, dev)) runtime.invoke() out = runtime.get_output(0) np.testing.assert_equal(out.numpy(), tflite_output) diff --git a/tests/python/contrib/test_hexagon/README_RPC.md b/tests/python/contrib/test_hexagon/README_RPC.md index 8d185fcbebeb..f1942d252f06 100644 --- a/tests/python/contrib/test_hexagon/README_RPC.md +++ b/tests/python/contrib/test_hexagon/README_RPC.md @@ -125,23 +125,23 @@ TVM_FFI_STATIC_INIT_BLOCK({ [https://github.com/apache/tvm/blob/b2757817af7ba3aefe16ea3ccb6d4982dd7fd531/python/tvm/runtime/ndarray.py#L183](https://github.com/apache/tvm/blob/b2757817af7ba3aefe16ea3ccb6d4982dd7fd531/python/tvm/runtime/ndarray.py#L183) ```python -check_call(_LIB.TVMArrayCopyFromBytes(self.handle, data, nbytes)) +check_call(_LIB.TVMTensorCopyFromBytes(self.handle, data, nbytes)) ``` -[https://github.com/apache/tvm/blob/37cd9837ff302e4490696ca57a9fbba6404c7046/src/runtime/ndarray.cc#L322](https://github.com/apache/tvm/blob/37cd9837ff302e4490696ca57a9fbba6404c7046/src/runtime/ndarray.cc#L322) +[https://github.com/apache/tvm/blob/37cd9837ff302e4490696ca57a9fbba6404c7046/src/runtime/tensor.cc#L322](https://github.com/apache/tvm/blob/37cd9837ff302e4490696ca57a9fbba6404c7046/src/runtime/tensor.cc#L322) ```cpp -int TVMArrayCopyFromBytes(TVMArrayHandle handle, void* data, size_t nbytes) { +int TVMTensorCopyFromBytes(TVMArrayHandle handle, void* data, size_t nbytes) { API_BEGIN(); - ArrayCopyFromBytes(handle, data, nbytes); + TensorCopyFromBytes(handle, data, nbytes); API_END(); } ``` -Now we come to `ArrayCopyFromBytes` function. The first non-obvious question is, which `DeviceAPI` is selected by `DeviceAPI::Get(handle->device)`? +Now we come to `TensorCopyFromBytes` function. The first non-obvious question is, which `DeviceAPI` is selected by `DeviceAPI::Get(handle->device)`? ```cpp -void ArrayCopyFromBytes(DLTensor* handle, const void* data, size_t nbytes) { +void TensorCopyFromBytes(DLTensor* handle, const void* data, size_t nbytes) { ... DLTensor from; ... 
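The README_RPC.md hunk above ends with the prose tracing a host-to-device copy: the Python side calls the renamed `TVMTensorCopyFromBytes`, which lands in `TensorCopyFromBytes` and dispatches through `DeviceAPI::Get(handle->device)`. A minimal Python-side sketch of where that path begins, assuming a standard TVM install; per the quoted `ndarray.py` line, `copyfrom` is presumably the call that hands the raw bytes to that C entry point:

```python
import numpy as np
import tvm

# copyfrom() on a runtime tensor feeds raw host bytes into the copy path the
# README describes (TVMTensorCopyFromBytes -> TensorCopyFromBytes); which
# DeviceAPI ultimately handles it depends on handle->device, the question the
# README goes on to examine for the Hexagon RPC case.
dst = tvm.runtime.empty((4,), "float32", tvm.cpu(0))
dst.copyfrom(np.ones(4, dtype="float32"))
print(dst.numpy())  # [1. 1. 1. 1.]
```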
diff --git a/tests/python/contrib/test_hexagon/infrastructure.py b/tests/python/contrib/test_hexagon/infrastructure.py index 376cc8c7da12..4718fa7e0671 100644 --- a/tests/python/contrib/test_hexagon/infrastructure.py +++ b/tests/python/contrib/test_hexagon/infrastructure.py @@ -100,9 +100,9 @@ def build_and_run(inputs, func, target: str, target_host: str, *args, **kwargs): dev = tvm.device(target) tensors = [] for tensor in inputs: - tensors.append(tvm.nd.array(tensor, dev)) + tensors.append(tvm.runtime.tensor(tensor, dev)) tensors.append( - tvm.nd.array( + tvm.runtime.tensor( numpy.zeros([i.value for i in placeholders[-1].shape], dtype=placeholders[-1].dtype), dev, ) diff --git a/tests/python/contrib/test_hexagon/pytest_util.py b/tests/python/contrib/test_hexagon/pytest_util.py index c078edf7a934..925c29282b18 100644 --- a/tests/python/contrib/test_hexagon/pytest_util.py +++ b/tests/python/contrib/test_hexagon/pytest_util.py @@ -140,7 +140,7 @@ def get_numpy_dtype_info(dtype) -> Union[np.finfo, np.iinfo]: TensorContentDtypeMax = collections.namedtuple("TensorContentDtypeMax", []) -def create_populated_numpy_ndarray( +def create_populated_numpy_tensor( input_shape: Union[list, tuple], dtype: str, input_tensor_populator ) -> np.ndarray: """ diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py index ab1cce52eac8..e5fc783510ac 100644 --- a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py +++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py @@ -281,9 +281,9 @@ def evaluate( ) module = hexagon_session.load_module(func_tir) - a_hexagon = tvm.runtime.ndarray.array(a_data, device=hexagon_session.device) - b_hexagon = tvm.runtime.ndarray.array(b_data, device=hexagon_session.device) - c_hexagon = tvm.runtime.ndarray.array(c_data, device=hexagon_session.device) + a_hexagon = tvm.runtime.tensor(a_data, device=hexagon_session.device) + b_hexagon = tvm.runtime.tensor(b_data, device=hexagon_session.device) + c_hexagon = tvm.runtime.tensor(c_data, device=hexagon_session.device) if tvm.testing.utils.IS_IN_CI: # Run with reduced number and repeat for CI diff --git a/tests/python/contrib/test_hexagon/test_benchmark_elemwise_add.py b/tests/python/contrib/test_hexagon/test_benchmark_elemwise_add.py index d3adbc12c922..dc77b7ad39a4 100644 --- a/tests/python/contrib/test_hexagon/test_benchmark_elemwise_add.py +++ b/tests/python/contrib/test_hexagon/test_benchmark_elemwise_add.py @@ -242,9 +242,9 @@ def _benchmark_hexagon_elementwise_add_kernel( ) # Create the target-side tensors to hold the primfunc's inputs and outputs... - input1_data = tvm.nd.empty(shape, dtype, hexagon_session.device, mem_scope) - input2_data = tvm.nd.empty(shape, dtype, hexagon_session.device, mem_scope) - output_data = tvm.nd.empty(shape, dtype, hexagon_session.device, mem_scope) + input1_data = tvm.runtime.empty(shape, dtype, hexagon_session.device, mem_scope) + input2_data = tvm.runtime.empty(shape, dtype, hexagon_session.device, mem_scope) + output_data = tvm.runtime.empty(shape, dtype, hexagon_session.device, mem_scope) # Populate the primfunc's input tensors... 
input1_data.copyfrom(host_numpy_input1_data) diff --git a/tests/python/contrib/test_hexagon/test_dma_builtin.py b/tests/python/contrib/test_hexagon/test_dma_builtin.py index 479b680065e1..1592bd020fd6 100644 --- a/tests/python/contrib/test_hexagon/test_dma_builtin.py +++ b/tests/python/contrib/test_hexagon/test_dma_builtin.py @@ -164,8 +164,8 @@ def test_vtcm_alloc_compute(self, hexagon_launcher, mode, module): vm_rt = relax.VirtualMachine( vm_mod, dev, "naive" ) # Use naive allocator to exercise VTCM allocation in relax - data0 = tvm.nd.array(input_arg0_data, dev) - data1 = tvm.nd.array(input_arg1_data, dev) + data0 = tvm.runtime.tensor(input_arg0_data, dev) + data1 = tvm.runtime.tensor(input_arg1_data, dev) vm_rt.set_input("main", data0, data1) vm_rt.invoke_stateful("main") hexagon_output = vm_rt.get_outputs("main").numpy() diff --git a/tests/python/contrib/test_hexagon/test_meta_schedule.py b/tests/python/contrib/test_hexagon/test_meta_schedule.py index c7f9d2a00fed..5d9f4128d172 100644 --- a/tests/python/contrib/test_hexagon/test_meta_schedule.py +++ b/tests/python/contrib/test_hexagon/test_meta_schedule.py @@ -174,9 +174,9 @@ def verify_dense(sch, target, m_size, n_size, k_size, hexagon_session): k_output * 4 + t_idx ] - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(pack_width, dev) - c = tvm.nd.array(np.zeros((m_size, n_size), dtype="int32"), dev) + a = tvm.runtime.tensor(a_np, dev) + b = tvm.runtime.tensor(pack_width, dev) + c = tvm.runtime.tensor(np.zeros((m_size, n_size), dtype="int32"), dev) mod(a, b, c) np.testing.assert_equal(c.numpy(), c_np) diff --git a/tests/python/contrib/test_hexagon/test_parallel_hvx.py b/tests/python/contrib/test_hexagon/test_parallel_hvx.py index cab3f7d64f9b..6abfa812175f 100644 --- a/tests/python/contrib/test_hexagon/test_parallel_hvx.py +++ b/tests/python/contrib/test_hexagon/test_parallel_hvx.py @@ -148,9 +148,9 @@ def evaluate(hexagon_session, shape_dtypes, expected_output_producer, sch): b = np.random.randint(0, 16, b_shape, dtype=b_dtype) c = np.zeros(c_shape, dtype=c_dtype) - a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device) - b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device) - c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device) + a_hexagon = tvm.runtime.tensor(a, device=hexagon_session.device) + b_hexagon = tvm.runtime.tensor(b, device=hexagon_session.device) + c_hexagon = tvm.runtime.tensor(c, device=hexagon_session.device) # These are reduced for CI but number=100 and repeat=10 does a good job of removing noise. 
number = 1 diff --git a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py index 89385b2aeb8f..ceabc6355732 100644 --- a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py +++ b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py @@ -318,9 +318,9 @@ def setup_and_run(hexagon_session, sch, a, b, c, operations, mem_scope="global") func_tir = tvm.compile(sch.mod["main"], target=get_hexagon_target("v69")) module = hexagon_session.load_module(func_tir) - a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device, mem_scope=mem_scope) - b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device, mem_scope=mem_scope) - c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device, mem_scope=mem_scope) + a_hexagon = tvm.runtime.tensor(a, device=hexagon_session.device, mem_scope=mem_scope) + b_hexagon = tvm.runtime.tensor(b, device=hexagon_session.device, mem_scope=mem_scope) + c_hexagon = tvm.runtime.tensor(c, device=hexagon_session.device, mem_scope=mem_scope) # These are reduced for CI but number=100 and repeat=10 does a good job of removing noise. number = 1 @@ -341,16 +341,16 @@ def setup_and_run_preallocated(hexagon_session, sch, a, b, c, operations): b_vtcm = np.zeros((b.size), dtype="uint8") c_vtcm = np.zeros((c.size), dtype="int32") - a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device, mem_scope="global") - b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device, mem_scope="global") - c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device, mem_scope="global") - a_vtcm_hexagon = tvm.runtime.ndarray.array( + a_hexagon = tvm.runtime.tensor(a, device=hexagon_session.device, mem_scope="global") + b_hexagon = tvm.runtime.tensor(b, device=hexagon_session.device, mem_scope="global") + c_hexagon = tvm.runtime.tensor(c, device=hexagon_session.device, mem_scope="global") + a_vtcm_hexagon = tvm.runtime.tensor( a_vtcm, device=hexagon_session.device, mem_scope="global.vtcm" ) - b_vtcm_hexagon = tvm.runtime.ndarray.array( + b_vtcm_hexagon = tvm.runtime.tensor( b_vtcm, device=hexagon_session.device, mem_scope="global.vtcm" ) - c_vtcm_hexagon = tvm.runtime.ndarray.array( + c_vtcm_hexagon = tvm.runtime.tensor( c_vtcm, device=hexagon_session.device, mem_scope="global.vtcm" ) diff --git a/tests/python/contrib/test_hexagon/test_parallel_scalar.py b/tests/python/contrib/test_hexagon/test_parallel_scalar.py index d9b9a2480312..60731a8febe0 100644 --- a/tests/python/contrib/test_hexagon/test_parallel_scalar.py +++ b/tests/python/contrib/test_hexagon/test_parallel_scalar.py @@ -96,9 +96,9 @@ def evaluate(hexagon_session, operations, expected, sch): b = np.random.random(shape).astype(dtype) c = np.zeros(shape, dtype=dtype) - a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device) - b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device) - c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device) + a_hexagon = tvm.runtime.tensor(a, device=hexagon_session.device) + b_hexagon = tvm.runtime.tensor(b, device=hexagon_session.device) + c_hexagon = tvm.runtime.tensor(c, device=hexagon_session.device) # These are reduced for CI but number=100 and repeat=10 does a good job of removing noise. 
number = 1 diff --git a/tests/python/contrib/test_hexagon/test_relax_2d_buffer_allocation.py b/tests/python/contrib/test_hexagon/test_relax_2d_buffer_allocation.py index 42038b97f90e..8a56e91581cb 100644 --- a/tests/python/contrib/test_hexagon/test_relax_2d_buffer_allocation.py +++ b/tests/python/contrib/test_hexagon/test_relax_2d_buffer_allocation.py @@ -86,7 +86,7 @@ def test_alloc_storage_with_scope_global(hexagon_launcher): vm_mod = session.get_executor_from_factory(lib) # This is the important line which tests nd allocator vm_rt = relax.VirtualMachine(vm_mod, dev, memory_cfg="naive") - x = tvm.nd.array(arg0, dev) + x = tvm.runtime.tensor(arg0, dev) vm_rt.set_input("main", x) vm_rt.invoke_stateful("main") hexagon_output = vm_rt.get_outputs("main").numpy() diff --git a/tests/python/contrib/test_hexagon/test_relax_integration.py b/tests/python/contrib/test_hexagon/test_relax_integration.py index 5e1bfac3625e..4a3d122ce0fb 100644 --- a/tests/python/contrib/test_hexagon/test_relax_integration.py +++ b/tests/python/contrib/test_hexagon/test_relax_integration.py @@ -57,7 +57,7 @@ def test_mobilenet_onnx(hexagon_session: Session): vm_mod = hexagon_session.get_executor_from_factory(exe) vm_rt = relax.VirtualMachine(vm_mod, dev) - data = tvm.nd.array(data_np, dev) + data = tvm.runtime.tensor(data_np, dev) vm_rt.set_input("main", data) vm_rt.invoke_stateful("main") hexagon_res = vm_rt.get_outputs("main") @@ -67,7 +67,7 @@ def test_mobilenet_onnx(hexagon_session: Session): exe = tvm.compile(relax_mod, "llvm") dev = tvm.cpu() vm_rt = relax.VirtualMachine(exe, dev) - data = tvm.nd.array(data_np, dev) + data = tvm.runtime.tensor(data_np, dev) llvm_res = vm_rt["main"](data) tvm.testing.assert_allclose(hexagon_res.numpy(), llvm_res.numpy(), rtol=1e-3) @@ -91,7 +91,7 @@ def test_mobilenet(hexagon_session: Session): vm_mod = hexagon_session.get_executor_from_factory(exe) vm_rt = relax.VirtualMachine(vm_mod, dev) - data = tvm.nd.array(data_np, dev) + data = tvm.runtime.tensor(data_np, dev) vm_rt.set_input("main", data) vm_rt.invoke_stateful("main") hexagon_res = vm_rt.get_outputs("main") @@ -101,7 +101,7 @@ def test_mobilenet(hexagon_session: Session): exe = tvm.compile(relax_mod, "llvm") dev = tvm.cpu() vm_rt = relax.VirtualMachine(exe, dev) - data = tvm.nd.array(data_np, dev) + data = tvm.runtime.tensor(data_np, dev) llvm_res = vm_rt["main"](data) tvm.testing.assert_allclose(hexagon_res.numpy(), llvm_res.numpy(), rtol=1e-3) diff --git a/tests/python/contrib/test_hexagon/test_software_pipeline_async.py b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py index 3be9683a7deb..714d37a3b982 100644 --- a/tests/python/contrib/test_hexagon/test_software_pipeline_async.py +++ b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py @@ -188,12 +188,12 @@ def test_async_software_pipeline( with hexagon_launcher.create_session() as hexagon_session: dev = hexagon_session.device mod = hexagon_session.load_module(func) - out = tvm.nd.array(out_np, device=dev) - a = tvm.nd.array(a_np, device=dev) + out = tvm.runtime.tensor(out_np, device=dev) + a = tvm.runtime.tensor(a_np, device=dev) if comp_type == "single_input": mod(a, out) else: - b = tvm.nd.array(b_np, device=dev) + b = tvm.runtime.tensor(b_np, device=dev) mod(a, b, out) verify(out, ref) diff --git a/tests/python/contrib/test_hexagon/test_take.py b/tests/python/contrib/test_hexagon/test_take.py index 15058e17af5a..4f6169b48ca7 100644 --- a/tests/python/contrib/test_hexagon/test_take.py +++ b/tests/python/contrib/test_hexagon/test_take.py 
@@ -322,7 +322,7 @@ def abs( # Quantizing input : scale is returned as float64 and zp is returned as int32 inp_quant, inp_scale, inp_zero_point = quantize_np(data, dtype) -inp_quant = tvm.nd.array(inp_quant.astype(np.uint8)) +inp_quant = tvm.runtime.tensor(inp_quant.astype(np.uint8)) # Test the implementations value output with numpy data. First the IR is runn through pass diff --git a/tests/python/contrib/test_hexagon/test_thread_pool.py b/tests/python/contrib/test_hexagon/test_thread_pool.py index 2dc426749680..f61a2560cfad 100644 --- a/tests/python/contrib/test_hexagon/test_thread_pool.py +++ b/tests/python/contrib/test_hexagon/test_thread_pool.py @@ -60,9 +60,9 @@ def elemwise_sum_parallel(a: T.handle, b: T.handle, c: T.handle, n: T.int32): def generate_add_test_data(hexagon_session: Session, n=128 * 1024): - a = tvm.nd.array(np.random.uniform(size=n).astype("float32"), hexagon_session.device) - b = tvm.nd.array(np.random.uniform(size=n).astype("float32"), hexagon_session.device) - c = tvm.nd.array(np.zeros(n, dtype="float32"), hexagon_session.device) + a = tvm.runtime.tensor(np.random.uniform(size=n).astype("float32"), hexagon_session.device) + b = tvm.runtime.tensor(np.random.uniform(size=n).astype("float32"), hexagon_session.device) + c = tvm.runtime.tensor(np.zeros(n, dtype="float32"), hexagon_session.device) return (a, b, c, n) diff --git a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py index 015a9f0656ed..42fca9c153aa 100644 --- a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py +++ b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py @@ -101,8 +101,8 @@ def evaluate(hexagon_session, sch, size): a = np.random.randint(-128, 127, a_shape, dtype="int8") a_vtcm = np.zeros(a_shape, dtype="int8") - a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device, mem_scope="global") - a_vtcm_hexagon = tvm.runtime.ndarray.array( + a_hexagon = tvm.runtime.tensor(a, device=hexagon_session.device, mem_scope="global") + a_vtcm_hexagon = tvm.runtime.tensor( a_vtcm, device=hexagon_session.device, mem_scope="global.vtcm" ) diff --git a/tests/python/contrib/test_hipblas.py b/tests/python/contrib/test_hipblas.py index 33187fa4efba..d285dd45491d 100644 --- a/tests/python/contrib/test_hipblas.py +++ b/tests/python/contrib/test_hipblas.py @@ -36,9 +36,9 @@ def verify(target="rocm"): return dev = tvm.rocm(0) f = tvm.compile(te.create_prim_func([A, B, C]), target=target) - a = tvm.nd.array(np.random.uniform(0, 128, size=(n, l)).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(0, 128, size=(l, m)).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(0, 128, size=(n, l)).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.random.uniform(0, 128, size=(l, m)).astype(B.dtype), dev) + c = tvm.runtime.tensor(np.zeros((n, m), dtype=C.dtype), dev) f(a, b, c) tvm.testing.assert_allclose( c.numpy(), np.dot(a.numpy().astype(C.dtype), b.numpy().astype(C.dtype)), rtol=rtol @@ -60,13 +60,13 @@ def verify_batch_matmul(Ashape, Bshape, Cshape, in_dtype, out_dtype, rtol=1e-5): f = tvm.compile(te.create_prim_func([A, B, C]), target="rocm") if "int" in in_dtype: - a = tvm.nd.array(np.random.uniform(1, 10, size=Ashape).astype(in_dtype), dev) - b = tvm.nd.array(np.random.uniform(1, 10, size=Bshape).astype(in_dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(1, 10, size=Ashape).astype(in_dtype), dev) + b = tvm.runtime.tensor(np.random.uniform(1, 10, 
size=Bshape).astype(in_dtype), dev) else: - a = tvm.nd.array(np.random.uniform(size=Ashape).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=Bshape).astype(B.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(size=Ashape).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.random.uniform(size=Bshape).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros(Cshape, dtype=C.dtype), dev) + c = tvm.runtime.tensor(np.zeros(Cshape, dtype=C.dtype), dev) f(a, b, c) tvm.testing.assert_allclose( c.numpy(), diff --git a/tests/python/contrib/test_mps.py b/tests/python/contrib/test_mps.py index 41847f3b8fea..cc459e81f51d 100644 --- a/tests/python/contrib/test_mps.py +++ b/tests/python/contrib/test_mps.py @@ -36,9 +36,9 @@ def verify(A, B, C): return dev = tvm.metal(0) f = tvm.compile(te.create_prim_func([A, B, C]), target="metal") - a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(size=(n, l)).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.random.uniform(size=(l, m)).astype(B.dtype), dev) + c = tvm.runtime.tensor(np.zeros((n, m), dtype=C.dtype), dev) f(a, b, c) tvm.testing.assert_allclose(c.numpy(), np.dot(a.numpy(), b.numpy()), rtol=1e-5) @@ -65,9 +65,9 @@ def verify(A, B, C, target="llvm"): return dev = tvm.metal(0) f = tvm.compile(te.create_prim_func([A, B, C]), target="metal") - a = tvm.nd.array(np.random.uniform(size=(n, h, w, ci)).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=(co, kh, kw, ci)).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros((n, h // stride, w // stride, co), dtype=C.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(size=(n, h, w, ci)).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.random.uniform(size=(co, kh, kw, ci)).astype(B.dtype), dev) + c = tvm.runtime.tensor(np.zeros((n, h // stride, w // stride, co), dtype=C.dtype), dev) f(a, b, c) verify(A, B, C, s1) diff --git a/tests/python/contrib/test_msc/test_plugin.py b/tests/python/contrib/test_msc/test_plugin.py index 3cacb8a646ba..1feeed2a7c84 100644 --- a/tests/python/contrib/test_msc/test_plugin.py +++ b/tests/python/contrib/test_msc/test_plugin.py @@ -241,7 +241,7 @@ def _get_tvm_model(tvm_manager): data = block_builder.emit_output(data) block_builder.emit_func_output(data) mod = block_builder.finalize() - return BindParams("main", {"weight": tvm.nd.array(weights)})(mod) + return BindParams("main", {"weight": tvm.runtime.tensor(weights)})(mod) def _build_plugin(frameworks, plugin_root): @@ -264,7 +264,7 @@ def _run_relax(relax_mod, target_name, data): with tvm.transform.PassContext(opt_level=3): relax_exec = tvm.compile(relax_mod, target) runnable = tvm.relax.VirtualMachine(relax_exec, device) - data = tvm.nd.array(data, device) + data = tvm.runtime.tensor(data, device) return runnable["main"](data).numpy() diff --git a/tests/python/contrib/test_msc/test_translate_relax.py b/tests/python/contrib/test_msc/test_translate_relax.py index 41e8f0e44e64..0a8be3df11a0 100644 --- a/tests/python/contrib/test_msc/test_translate_relax.py +++ b/tests/python/contrib/test_msc/test_translate_relax.py @@ -40,7 +40,7 @@ def verify_model(torch_model, input_info, opt_config=None): args = [msc_utils.random_data(i, MSCFramework.TVM) for i in input_info] def _tvm_runtime_to_np(obj): - if isinstance(obj, tvm.runtime.NDArray): + if isinstance(obj, tvm.runtime.Tensor): return obj.numpy() elif isinstance(obj, 
tvm.runtime.ShapeTuple): return np.array(obj, dtype="int64") diff --git a/tests/python/contrib/test_msc/test_translate_tensorrt.py b/tests/python/contrib/test_msc/test_translate_tensorrt.py index a3eaae09afbc..66b56210c233 100644 --- a/tests/python/contrib/test_msc/test_translate_tensorrt.py +++ b/tests/python/contrib/test_msc/test_translate_tensorrt.py @@ -47,7 +47,7 @@ def build_and_run(mod, inputs): rt_mod = tvm.compile(mod, target) runnable = tvm.relax.VirtualMachine(rt_mod, tvm.cuda()) res = runnable["main"](*inputs) - if isinstance(res, tvm.runtime.NDArray): + if isinstance(res, tvm.runtime.Tensor): return [res.numpy()] return [e.numpy() for e in res] @@ -104,7 +104,7 @@ def verify_model(torch_model, input_info, **trans_config): output_folder = msc_utils.msc_dir() # tranalte to tensorrt mod = codegen.to_tensorrt(mod, graphs, weights, output_folder=output_folder) - tvm_datas = [tvm.nd.array(i, device=tvm.cuda()) for i in datas] + tvm_datas = [tvm.runtime.tensor(i, device=tvm.cuda()) for i in datas] results = build_and_run(mod, tvm_datas) for gol, res in zip(golden, results): tvm.testing.assert_allclose(gol, res, atol=1e-3, rtol=1e-3) diff --git a/tests/python/contrib/test_random.py b/tests/python/contrib/test_random.py index c8c8054dfb6b..10091cb9adff 100644 --- a/tests/python/contrib/test_random.py +++ b/tests/python/contrib/test_random.py @@ -40,7 +40,7 @@ def verify(target="llvm"): return dev = tvm.cpu(0) f = tvm.compile(te.create_prim_func([A]), target=target) - a = tvm.nd.array(np.zeros((m, n), dtype=A.dtype), dev) + a = tvm.runtime.tensor(np.zeros((m, n), dtype=A.dtype), dev) f(a) na = a.numpy() assert abs(np.mean(na)) < 0.3 @@ -65,7 +65,7 @@ def verify(target="llvm"): return dev = tvm.cpu(0) f = tvm.compile(te.create_prim_func([A]), target=target) - a = tvm.nd.array(np.zeros((m, n), dtype=A.dtype), dev) + a = tvm.runtime.tensor(np.zeros((m, n), dtype=A.dtype), dev) f(a) na = a.numpy() assert abs(np.mean(na) - 0.5) < 1e-1 @@ -90,7 +90,7 @@ def verify(target="llvm"): return dev = tvm.cpu(0) f = tvm.compile(te.create_prim_func([A]), target=target) - a = tvm.nd.array(np.zeros((m, n), dtype=A.dtype), dev) + a = tvm.runtime.tensor(np.zeros((m, n), dtype=A.dtype), dev) f(a) na = a.numpy() assert abs(np.mean(na) - 3) < 1e-1 @@ -107,7 +107,7 @@ def test_local(dev, dtype): if not tvm.get_global_func("tvm.contrib.random.random_fill", True): print("skip because extern function is not available") return - value = tvm.nd.empty((512, 512), dtype, dev) + value = tvm.runtime.empty((512, 512), dtype, dev) random_fill = tvm.get_global_func("tvm.contrib.random.random_fill") random_fill(value) @@ -126,7 +126,7 @@ def test_rpc(dtype): def check_remote(server): remote = rpc.connect(server.host, server.port) - value = tvm.nd.empty((512, 512), dtype, remote.cpu()) + value = tvm.runtime.empty((512, 512), dtype, remote.cpu()) random_fill = remote.get_function("tvm.contrib.random.random_fill") random_fill(value) @@ -170,7 +170,7 @@ def test_body(): configure_threads = tvm.get_global_func("runtime.config_threadpool") configure_threads(1, num_thread_used) - test_input = tvm.runtime.ndarray.empty((10, 10)) + test_input = tvm.runtime.empty((10, 10)) random_fill = tvm.get_global_func("tvm.contrib.random.random_fill_for_measure") random_fill(test_input) except: # pylint: disable=bare-except diff --git a/tests/python/contrib/test_rocblas.py b/tests/python/contrib/test_rocblas.py index a715a5bb4a74..6b57395ce847 100644 --- a/tests/python/contrib/test_rocblas.py +++ b/tests/python/contrib/test_rocblas.py @@ -40,9 
+40,9 @@ def verify(target="rocm"): return dev = tvm.rocm(0) f = tvm.compile(te.create_prim_func([A, B, C]), target=target) - a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(size=(n, l)).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.random.uniform(size=(l, m)).astype(B.dtype), dev) + c = tvm.runtime.tensor(np.zeros((n, m), dtype=C.dtype), dev) f(a, b, c) tvm.testing.assert_allclose(c.numpy(), np.dot(a.numpy(), b.numpy()), rtol=1e-5) @@ -73,9 +73,9 @@ def verify(target="rocm"): return dev = tvm.rocm(0) f = tvm.compile(te.create_prim_func([A, B, C]), target=target) - a = tvm.nd.array(np.random.uniform(size=ashape).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=bshape).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros((batch, m, n), dtype=C.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(size=ashape).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.random.uniform(size=bshape).astype(B.dtype), dev) + c = tvm.runtime.tensor(np.zeros((batch, m, n), dtype=C.dtype), dev) f(a, b, c) tvm.testing.assert_allclose( c.numpy(), get_numpy(a.numpy(), b.numpy(), transa, transb), rtol=1e-5 diff --git a/tests/python/contrib/test_sort.py b/tests/python/contrib/test_sort.py index a853df569498..aa80cf484823 100644 --- a/tests/python/contrib/test_sort.py +++ b/tests/python/contrib/test_sort.py @@ -53,9 +53,9 @@ def test_sort(): dev = tvm.cpu(0) target = "llvm" f = tvm.compile(te.create_prim_func([data, sort_num, out]), target=target) - a = tvm.nd.array(np.array(input_data).astype(data.dtype), dev) - b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), dev) - c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), dev) + a = tvm.runtime.tensor(np.array(input_data).astype(data.dtype), dev) + b = tvm.runtime.tensor(np.array(sort_num_input).astype(sort_num.dtype), dev) + c = tvm.runtime.tensor(np.zeros(a.shape, dtype=out.dtype), dev) f(a, b, c) tvm.testing.assert_allclose(c.numpy(), np.array(sorted_index).astype(out.dtype), rtol=1e-5) @@ -85,9 +85,9 @@ def test_sort_np(): np_data = np.random.uniform(size=dshape) np_out = np.argsort(np_data, axis=axis) sort_num_input = np.full(reduced_shape, dshape[axis]) - a = tvm.nd.array(np.array(np_data).astype(data.dtype), dev) - b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), dev) - c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), dev) + a = tvm.runtime.tensor(np.array(np_data).astype(data.dtype), dev) + b = tvm.runtime.tensor(np.array(sort_num_input).astype(sort_num.dtype), dev) + c = tvm.runtime.tensor(np.zeros(a.shape, dtype=out.dtype), dev) f(a, b, c) tvm.testing.assert_allclose(c.numpy(), np_out, rtol=1e-5) diff --git a/tests/python/contrib/test_tflite_runtime.py b/tests/python/contrib/test_tflite_runtime.py index 9938f85cd563..f75156fa0467 100644 --- a/tests/python/contrib/test_tflite_runtime.py +++ b/tests/python/contrib/test_tflite_runtime.py @@ -92,7 +92,7 @@ def test_local(): # inference via tvm tflite runtime with open(tflite_model_path, "rb") as model_fin: runtime = tflite_runtime.create(model_fin.read(), tvm.cpu(0)) - runtime.set_input(0, tvm.nd.array(tflite_input)) + runtime.set_input(0, tvm.runtime.tensor(tflite_input)) runtime.invoke() out = runtime.get_output(0) np.testing.assert_equal(out.numpy(), tflite_output) @@ -138,7 +138,7 @@ def check_remote(server): with open(tflite_model_path, "rb") as model_fin: 
runtime = tflite_runtime.create(model_fin.read(), remote.cpu(0)) - runtime.set_input(0, tvm.nd.array(tflite_input, remote.cpu(0))) + runtime.set_input(0, tvm.runtime.tensor(tflite_input, remote.cpu(0))) runtime.invoke() out = runtime.get_output(0) np.testing.assert_equal(out.numpy(), tflite_output) diff --git a/tests/python/contrib/test_tir_triton_integration.py b/tests/python/contrib/test_tir_triton_integration.py index b349d2fabce5..95ccf28fbddb 100644 --- a/tests/python/contrib/test_tir_triton_integration.py +++ b/tests/python/contrib/test_tir_triton_integration.py @@ -110,8 +110,8 @@ def add(x_handle: T.handle, y_handle: T.handle, output_handle: T.handle): assert len(Module.get_attr("external_mods")) == 1 device = tvm.cuda(0) - x_nd = tvm.nd.array(np.random.rand(256).astype(np.float32), device) - y_nd = tvm.nd.array(np.random.rand(256).astype(np.float32), device) + x_nd = tvm.runtime.tensor(np.random.rand(256).astype(np.float32), device) + y_nd = tvm.runtime.tensor(np.random.rand(256).astype(np.float32), device) output_np = x_nd.numpy() + y_nd.numpy() with tvm.target.Target("cuda"): diff --git a/tests/python/contrib/test_tvmjs.py b/tests/python/contrib/test_tvmjs.py index 22742ec224ef..4de1b6c9850c 100644 --- a/tests/python/contrib/test_tvmjs.py +++ b/tests/python/contrib/test_tvmjs.py @@ -52,8 +52,8 @@ def test_save_load_float8(dtype): arr = np.arange(16, dtype=np_dtype) with tempfile.TemporaryDirectory(prefix="tvm_") as temp_dir: - tvmjs.dump_ndarray_cache({"arr": arr}, temp_dir) - cache, _ = tvmjs.load_ndarray_cache(temp_dir, tvm.cpu()) + tvmjs.dump_tensor_cache({"arr": arr}, temp_dir) + cache, _ = tvmjs.load_tensor_cache(temp_dir, tvm.cpu()) after_roundtrip = cache["arr"].numpy() diff --git a/tests/python/disco/test_callback.py b/tests/python/disco/test_callback.py index d0defa15b869..8e78058331a5 100644 --- a/tests/python/disco/test_callback.py +++ b/tests/python/disco/test_callback.py @@ -91,7 +91,7 @@ def transform_params( params = transform_params(worker_id, fget_item) # Worker 0 is the same PID as the controlling scope, so - # `debug_get_from_remote(0)` returns the NDArray containing + # `debug_get_from_remote(0)` returns the Tensor containing # the output. params_gpu0 = params.debug_get_from_remote(0) assert params_gpu0[0].device == tvm.cuda(0) @@ -109,7 +109,7 @@ def transform_params( ) # Worker 1 is a different PID altogether, so - # `debug_get_from_remote(1)` returns a new NDArray within the + # `debug_get_from_remote(1)` returns a new Tensor within the # calling scope's PID. 
params_gpu1 = params.debug_get_from_remote(1) assert params_gpu1[0].device == tvm.cpu() diff --git a/tests/python/disco/test_ccl.py b/tests/python/disco/test_ccl.py index 649b865b6c3b..260ac12d8d0c 100644 --- a/tests/python/disco/test_ccl.py +++ b/tests/python/disco/test_ccl.py @@ -491,9 +491,9 @@ def relax_build(mod, target): W1 = np.random.randn(128, 128).astype("float32") W2 = np.random.randn(128, 128).astype("float32") Y_expected = VirtualMachine(relax_build(MLP, target), device=dev)["main"]( - tvm.nd.array(X, device=dev), - tvm.nd.array(W1, device=dev), - tvm.nd.array(W2, device=dev), + tvm.runtime.tensor(X, device=dev), + tvm.runtime.tensor(W1, device=dev), + tvm.runtime.tensor(W2, device=dev), ).numpy() with tempfile.TemporaryDirectory() as tmpdir: @@ -512,7 +512,7 @@ def relax_build(mod, target): d_W2.debug_copy_from(0, W2[:64, :]) d_W2.debug_copy_from(1, W2[64:, :]) d_Y = mod["main"](d_X, d_W1, d_W2) - Y_result = tvm.nd.empty((128, 128), "float32", device=dev) + Y_result = tvm.runtime.empty((128, 128), "float32", device=dev) sess.copy_from_worker_0(Y_result, d_Y) sess.sync_worker_0() Y_result = Y_result.numpy() @@ -632,11 +632,11 @@ def relax_build(mod, target): Wv = np.random.randn(128, 512).astype("float32") Wo = np.random.randn(512, 128).astype("float32") Y_expected = VirtualMachine(relax_build(Attention, target), device=dev)["main"]( - tvm.nd.array(X, device=dev), - tvm.nd.array(Wq, device=dev), - tvm.nd.array(Wk, device=dev), - tvm.nd.array(Wv, device=dev), - tvm.nd.array(Wo, device=dev), + tvm.runtime.tensor(X, device=dev), + tvm.runtime.tensor(Wq, device=dev), + tvm.runtime.tensor(Wk, device=dev), + tvm.runtime.tensor(Wv, device=dev), + tvm.runtime.tensor(Wo, device=dev), ).numpy() with tempfile.TemporaryDirectory() as tmpdir: @@ -661,7 +661,7 @@ def relax_build(mod, target): d_Wo.debug_copy_from(0, Wo[:256, :]) d_Wo.debug_copy_from(1, Wo[256:, :]) d_Y = mod["main"](d_X, d_Wq, d_Wk, d_Wv, d_Wo) - Y_result = tvm.nd.empty((1, 10, 128), "float32", device=dev) + Y_result = tvm.runtime.empty((1, 10, 128), "float32", device=dev) sess.copy_from_worker_0(Y_result, d_Y) sess.sync_worker_0() Y_result = Y_result.numpy() diff --git a/tests/python/disco/test_loader.py b/tests/python/disco/test_loader.py index cf5955b10d9f..b41ff526f083 100644 --- a/tests/python/disco/test_loader.py +++ b/tests/python/disco/test_loader.py @@ -82,12 +82,12 @@ def _shard_qkv_1(src, tgt): def _create_loader(sess, path, param_dict, shard_info): - path_ndarray_cache = path + "/ndarray-cache.json" - tvmjs.dump_ndarray_cache(param_dict, path, encode_format="raw") - with open(path_ndarray_cache, "r", encoding="utf-8") as i_f: - ndarray_cache = i_f.read() + path_tensor_cache = path + "/tensor-cache.json" + tvmjs.dump_tensor_cache(param_dict, path, encode_format="raw") + with open(path_tensor_cache, "r", encoding="utf-8") as i_f: + tensor_cache = i_f.read() loader_create = sess.get_global_func("runtime.disco.ShardLoader") - loader = loader_create(path_ndarray_cache, ndarray_cache, json.dumps(shard_info), None) + loader = loader_create(path_tensor_cache, tensor_cache, json.dumps(shard_info), None) return loader @@ -100,7 +100,8 @@ def _simulate_presharded_weights(base_path, param_dict, num_shards, shard_info): assert key in shard_info, f"ShardInfo lacks shard info about param: {key}" shard_dim = shard_info[key] sharded_params[key] = [ - tvm.nd.array(np_shard) for np_shard in np.split(ndarray, num_shards, axis=shard_dim) + tvm.runtime.tensor(np_shard) + for np_shard in np.split(ndarray, num_shards, axis=shard_dim) ] 
# Re-order so that the parameter order is sorted first by shard, @@ -113,7 +114,7 @@ def _simulate_presharded_weights(base_path, param_dict, num_shards, shard_info): for key, shards in sharded_params.items() } - tvmjs.dump_ndarray_cache( + tvmjs.dump_tensor_cache( sharded_params, base_path, encode_format="raw", @@ -169,11 +170,11 @@ def test_load_shard(): def _create_presharded_loader(sess, path): - path_ndarray_cache = path + "/ndarray-cache.json" - with open(path_ndarray_cache, "r", encoding="utf-8") as i_f: - ndarray_cache = i_f.read() + path_tensor_cache = path + "/tensor-cache.json" + with open(path_tensor_cache, "r", encoding="utf-8") as i_f: + tensor_cache = i_f.read() loader_create = sess.get_global_func("runtime.disco.ShardLoader") - loader = loader_create(path_ndarray_cache, ndarray_cache, json.dumps({}), None) + loader = loader_create(path_tensor_cache, tensor_cache, json.dumps({}), None) return loader diff --git a/tests/python/disco/test_session.py b/tests/python/disco/test_session.py index db357c54397b..721115947480 100644 --- a/tests/python/disco/test_session.py +++ b/tests/python/disco/test_session.py @@ -37,13 +37,13 @@ def _numpy_to_worker_0(sess: di.Session, np_array: np.array, device): x_array = sess.empty(np_array.shape, "float32", device=device) - host_array = tvm.nd.array(np_array, device=device) + host_array = tvm.runtime.tensor(np_array, device=device) sess.copy_to_worker_0(host_array, x_array) return x_array def _numpy_from_worker_0(sess: di.Session, remote_array, shape, dtype): - host_array = tvm.nd.empty(shape, dtype, device=tvm.cpu()) + host_array = tvm.runtime.empty(shape, dtype, device=tvm.cpu()) sess.copy_from_worker_0(host_array, remote_array) sess.sync_worker_0() return host_array.numpy() @@ -142,14 +142,14 @@ def test_float(session_kind): @pytest.mark.parametrize("session_kind", _all_session_kinds) -def test_ndarray(session_kind): +def test_tensor(session_kind): num_workers = 4 sess = session_kind(num_workers=num_workers) device = tvm.cpu(0) x_np = np.arange(6).astype("float32").reshape([2, 3]) y_np = np.arange(6).astype("float32").reshape([2, 3]) + 1 x_disc = _numpy_to_worker_0(sess, x_np, device=device) - y_disc = sess.get_global_func("tests.disco.add_one_ndarray")(x_disc) + y_disc = sess.get_global_func("tests.disco.add_one_tensor")(x_disc) y_nd = _numpy_from_worker_0(sess, y_disc, shape=y_np.shape, dtype=y_np.dtype) np.testing.assert_equal(y_nd, y_np) diff --git a/tests/python/driver/test_compile.py b/tests/python/driver/test_compile.py index 1ed4fc67ca6a..f0bd17a2f6b9 100644 --- a/tests/python/driver/test_compile.py +++ b/tests/python/driver/test_compile.py @@ -47,9 +47,9 @@ def test_compile_tir(): dev = tvm.cpu(0) a_np = np.random.uniform(size=10).astype(np.float32) b_np = np.random.uniform(size=10).astype(np.float32) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros(10, dtype=np.float32), dev) + a = tvm.runtime.tensor(a_np, dev) + b = tvm.runtime.tensor(b_np, dev) + c = tvm.runtime.tensor(np.zeros(10, dtype=np.float32), dev) exec_prim(a, b, c) np.testing.assert_allclose(c.numpy(), a_np + b_np) @@ -77,8 +77,8 @@ def main(x: R.Tensor((3, 4), "float32"), y: R.Tensor((3, 4), "float32")) -> R.Te dev = tvm.cpu(0) x_np = np.random.uniform(size=(3, 4)).astype(np.float32) y_np = np.random.uniform(size=(3, 4)).astype(np.float32) - x = tvm.nd.array(x_np, dev) - y = tvm.nd.array(y_np, dev) + x = tvm.runtime.tensor(x_np, dev) + y = tvm.runtime.tensor(y_np, dev) vm = relax.VirtualMachine(exec_relax, dev) z = vm["main"](x, y) 
@@ -107,8 +107,8 @@ def main(x: R.Tensor((4,), "float32")): assert isinstance(ex, Executable) dev = tvm.cpu(0) - x = tvm.nd.array(np.array([1, 2, 3, 4], dtype=np.float32), dev) - y = tvm.nd.array(np.zeros(4, dtype=np.float32), dev) + x = tvm.runtime.tensor(np.array([1, 2, 3, 4], dtype=np.float32), dev) + y = tvm.runtime.tensor(np.zeros(4, dtype=np.float32), dev) # For tir function, we can directly call the function ex["add_one"](x, y) np.testing.assert_allclose(y.numpy(), x.numpy() + 1) diff --git a/tests/python/ir/test_datatype_nv_fp4.py b/tests/python/ir/test_datatype_nv_fp4.py index 85047fc4a5fd..d237176e6c55 100644 --- a/tests/python/ir/test_datatype_nv_fp4.py +++ b/tests/python/ir/test_datatype_nv_fp4.py @@ -36,7 +36,7 @@ def test_create_nv_fp4_nd_array(np_dtype, dtype_str): """Skip test if ml_dtypes is not installed""" return x = np.random.rand(128, 128).astype(np_dtype) - x_nd = tvm.nd.array(x) + x_nd = tvm.runtime.tensor(x) assert x_nd.dtype == dtype_str np.testing.assert_equal(x_nd.numpy(), x) diff --git a/tests/python/ir/test_datatype_nv_fp8.py b/tests/python/ir/test_datatype_nv_fp8.py index d27cc0314328..0c17e844757f 100644 --- a/tests/python/ir/test_datatype_nv_fp8.py +++ b/tests/python/ir/test_datatype_nv_fp8.py @@ -85,7 +85,7 @@ def test_create_nv_fp8_nd_array(np_dtype, dtype_str): """Skip test if ml_dtypes is not installed""" return x = np.random.rand(128, 128).astype(np_dtype) - x_nd = tvm.nd.array(x) + x_nd = tvm.runtime.tensor(x) assert x_nd.dtype == dtype_str np.testing.assert_equal(x_nd.numpy(), x) @@ -110,7 +110,7 @@ def test_fp8_unary_op(np_dtype, dtype_str): a_fp32 = np.zeros(128).astype(np.float32) a_roundtrip = np.zeros(128).astype(np_dtype) args = list( - map(lambda _: tvm.nd.array(_), [a, b, a_add_b, a_sub_b, a_mul_b, a_fp32, a_roundtrip]) + map(lambda _: tvm.runtime.tensor(_), [a, b, a_add_b, a_sub_b, a_mul_b, a_fp32, a_roundtrip]) ) f(*args) expected_a_fp32 = a.astype(np.float32) diff --git a/tests/python/ir/test_ir_container.py b/tests/python/ir/test_ir_container.py index 177925181782..12502b6e6c7e 100644 --- a/tests/python/ir/test_ir_container.py +++ b/tests/python/ir/test_ir_container.py @@ -101,12 +101,12 @@ def test_in_container(): assert "d" not in arr -def test_ndarray_container(): - x = tvm.nd.array([1, 2, 3]) +def test_tensor_container(): + x = tvm.runtime.tensor([1, 2, 3]) arr = tvm.runtime.convert([x, x]) assert arr[0].same_as(x) assert arr[1].same_as(x) - assert isinstance(arr[0], tvm.nd.NDArray) + assert isinstance(arr[0], tvm.runtime.Tensor) def test_return_variant_type(): diff --git a/tests/python/ir/test_node_reflection.py b/tests/python/ir/test_node_reflection.py index be00bc3a4777..2db0359b6d3a 100644 --- a/tests/python/ir/test_node_reflection.py +++ b/tests/python/ir/test_node_reflection.py @@ -163,19 +163,19 @@ def test_dict(): assert set(dir(x.__class__)) <= set(dir(x)) -def test_ndarray(): +def test_tensor(): dev = tvm.cpu(0) - tvm_arr = tvm.nd.array(np.random.rand(4), device=dev) + tvm_arr = tvm.runtime.tensor(np.random.rand(4), device=dev) tvm_arr2 = tvm.ir.load_json(tvm.ir.save_json(tvm_arr)) tvm.ir.assert_structural_equal(tvm_arr, tvm_arr2) np.testing.assert_array_equal(tvm_arr.numpy(), tvm_arr2.numpy()) -def test_ndarray_dict(): +def test_tensor_dict(): dev = tvm.cpu(0) m1 = { - "key1": tvm.nd.array(np.random.rand(4), device=dev), - "key2": tvm.nd.array(np.random.rand(4), device=dev), + "key1": tvm.runtime.tensor(np.random.rand(4), device=dev), + "key2": tvm.runtime.tensor(np.random.rand(4), device=dev), } m2 = 
tvm.ir.load_json(tvm.ir.save_json(m1)) tvm.ir.assert_structural_equal(m1, m2) @@ -196,7 +196,7 @@ def test_alloc_const(): shape = (16,) buf = tvm.tir.decl_buffer(shape, dtype) np_data = np.random.rand(*shape).astype(dtype) - data = tvm.nd.array(np_data, device=dev) + data = tvm.runtime.tensor(np_data, device=dev) body = tvm.tir.Evaluate(0) alloc_const = tvm.tir.AllocateConst(buf.data, dtype, shape, data, body) alloc_const2 = tvm.ir.load_json(tvm.ir.save_json(alloc_const)) diff --git a/tests/python/meta_schedule/test_meta_schedule_database.py b/tests/python/meta_schedule/test_meta_schedule_database.py index 84ec862f0ef8..f8b2354c33bf 100644 --- a/tests/python/meta_schedule/test_meta_schedule_database.py +++ b/tests/python/meta_schedule/test_meta_schedule_database.py @@ -587,7 +587,7 @@ def MatmulPrimFunc() -> IRModule: @pytest.mark.parametrize("f_mod", [MatmulPrimFunc]) -@pytest.mark.parametrize("mod_eq", ["structural", "ignore-ndarray", "anchor-block"]) +@pytest.mark.parametrize("mod_eq", ["structural", "ignore-tensor", "anchor-block"]) def test_json_database_commit_workload(f_mod, mod_eq): mod: IRModule = f_mod() with tempfile.TemporaryDirectory() as tmpdir: @@ -596,7 +596,7 @@ def test_json_database_commit_workload(f_mod, mod_eq): @pytest.mark.parametrize("f_mod", [MatmulPrimFunc]) -@pytest.mark.parametrize("mod_eq", ["structural", "ignore-ndarray", "anchor-block"]) +@pytest.mark.parametrize("mod_eq", ["structural", "ignore-tensor", "anchor-block"]) def test_memory_database_commit_workload(f_mod, mod_eq): mod: IRModule = f_mod() database = ms.database.MemoryDatabase(module_equality=mod_eq) diff --git a/tests/python/meta_schedule/test_meta_schedule_feature_extractor.py b/tests/python/meta_schedule/test_meta_schedule_feature_extractor.py index 84d07dbf6e11..8b718f86a104 100644 --- a/tests/python/meta_schedule/test_meta_schedule_feature_extractor.py +++ b/tests/python/meta_schedule/test_meta_schedule_feature_extractor.py @@ -19,11 +19,11 @@ from typing import List import numpy as np +import tvm.runtime from tvm.meta_schedule import TuneContext from tvm.meta_schedule.feature_extractor import PyFeatureExtractor from tvm.meta_schedule.search_strategy import MeasureCandidate from tvm.meta_schedule.utils import derived_object -from tvm.runtime.ndarray import array def test_meta_schedule_feature_extractor(): @@ -34,7 +34,7 @@ def extract_from( context: TuneContext, # pylint: disable = unused-argument candidates: List[MeasureCandidate], # pylint: disable = unused-argument ) -> List[np.ndarray]: - return [array(np.random.rand(4, 5))] + return [tvm.runtime.tensor(np.random.rand(4, 5))] extractor = FancyFeatureExtractor() features = extractor.extract_from(TuneContext(), []) diff --git a/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py b/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py index 3f0964cfa8ed..72edf67d68e4 100644 --- a/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py +++ b/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py @@ -47,8 +47,8 @@ def assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, tar ex = relax.build(tvm_mod, target=target, relax_pipeline=relax_pipeline) vm = relax.VirtualMachine(ex, dev) - gpu_data = tvm.nd.array(raw_data_for_tvm, dev) - gpu_params = [tvm.nd.array(p, dev) for p in tvm_params["main"]] + gpu_data = tvm.runtime.tensor(raw_data_for_tvm, dev) + gpu_params = [tvm.runtime.tensor(p, dev) for p in tvm_params["main"]] gpu_out = vm["main"](gpu_data, *gpu_params) pytorch_out = 
torch_module(torch_data) diff --git a/tests/python/nightly/test_nnapi/test_network.py b/tests/python/nightly/test_nnapi/test_network.py index 2f9863eb4ee8..82094cb74c29 100644 --- a/tests/python/nightly/test_nnapi/test_network.py +++ b/tests/python/nightly/test_nnapi/test_network.py @@ -125,7 +125,7 @@ def test_network(name, dtype): for _name, (shape, _dtype) in inputs.items(): input_data[_name] = np.random.uniform(-1.0, 1.0, shape).astype(_dtype) - inputs_tvm: List[tvm.nd.NDArray] = [tvm.nd.array(v) for k, v in input_data.items()] + inputs_tvm: List[tvm.runtime.Tensor] = [tvm.runtime.tensor(v) for k, v in input_data.items()] outputs = _build_and_run_network(remote_obj, tracker, mod, inputs_tvm) nnapi_out = outputs[0] expected_out = outputs[1] diff --git a/tests/python/nightly/test_nnapi/test_ops.py b/tests/python/nightly/test_nnapi/test_ops.py index a6837d2ce5c1..fc10e9b169c0 100644 --- a/tests/python/nightly/test_nnapi/test_ops.py +++ b/tests/python/nightly/test_nnapi/test_ops.py @@ -255,7 +255,7 @@ def main( tracker, mod, inputs=[ - tvm.nd.array(np.random.uniform(size=(8, 10, 15)).astype("float32")), + tvm.runtime.tensor(np.random.uniform(size=(8, 10, 15)).astype("float32")), ], ) @@ -284,7 +284,7 @@ def main( tracker, mod, inputs=[ - tvm.nd.array(np.random.uniform(size=(1, 10, 15)).astype("float32")), + tvm.runtime.tensor(np.random.uniform(size=(1, 10, 15)).astype("float32")), ], ) @@ -351,7 +351,7 @@ def main( def verify(remote_obj, tracker, mod, inputs): - inputs_tvm: List[tvm.nd.NDArray] = [tvm.nd.array(v) for v in inputs] + inputs_tvm: List[tvm.runtime.Tensor] = [tvm.runtime.tensor(v) for v in inputs] outputs = _build_and_run_network(remote_obj, tracker, mod, inputs_tvm) nnapi_out = outputs[0] expected_out = outputs[1] diff --git a/tests/python/relax/backend/clml/utils.py b/tests/python/relax/backend/clml/utils.py index dd7e269f5535..d32a2df38ffd 100644 --- a/tests/python/relax/backend/clml/utils.py +++ b/tests/python/relax/backend/clml/utils.py @@ -56,7 +56,7 @@ def build_and_run( vm = relax.VirtualMachine(ex, dev) f = vm["main"] - inputs = [tvm.nd.array(inp, dev) for inp in inputs_np] + inputs = [tvm.runtime.tensor(inp, dev) for inp in inputs_np] vm.set_input("main", *inputs) vm.invoke_stateful("main") tvm_output = vm.get_outputs("main") diff --git a/tests/python/relax/nvshmem/test_runtime_builtin_kv_cache_transfer.py b/tests/python/relax/nvshmem/test_runtime_builtin_kv_cache_transfer.py index 81acf5ee863d..5c994028ac88 100644 --- a/tests/python/relax/nvshmem/test_runtime_builtin_kv_cache_transfer.py +++ b/tests/python/relax/nvshmem/test_runtime_builtin_kv_cache_transfer.py @@ -170,7 +170,7 @@ def set_global_func(head_dim, dtype): with target: mod = dl.ApplyDefaultSchedule(dl.gpu.Fallback())(mod) f = tvm.tir.build(mod["main"], target=target) - builts.append(f.entry_func) + builts.append(f.main) ( ftranspose_append, @@ -212,7 +212,7 @@ def create_kv_cache(head_dim, dtype, rope_mode, support_sliding_window): rope_scale, rope_theta, None, # rope_ext_factors - tvm.nd.empty((), dtype, device=device), + tvm.runtime.empty((), dtype, device=device), ftranspose_append, None, # f_transpose_append_mla ["tir", fattn_prefill_ragged], @@ -262,8 +262,8 @@ def verify_cached_kv(kv_cache, seq_ids, expected_k, expected_v): values_expected = expected_v[seq_id] assert keys_expected.shape == values_expected.shape seq_length = expected_k[seq_id].shape[1] - keys = tvm.nd.empty(keys_expected.shape, dtype=dtype, device=device) - values = tvm.nd.empty(values_expected.shape, dtype=dtype, device=device) + 
keys = tvm.runtime.empty(keys_expected.shape, dtype=dtype, device=device) + values = tvm.runtime.empty(values_expected.shape, dtype=dtype, device=device) fdebug_get_kv(kv_cache, seq_id, 0, seq_length, keys, values) torch.testing.assert_close( torch.from_numpy(keys.numpy()).to(device_torch), keys_expected, rtol=1e-3, atol=1e-3 @@ -460,8 +460,10 @@ def apply_attention( queries_np = global_new_q[layer_id] keys_np = global_new_k[layer_id] values_np = global_new_v[layer_id] - qkv = tvm.nd.array(torch.cat([queries_np, keys_np, values_np], dim=1).cpu().numpy(), device) - outputs = tvm.nd.empty(queries_np.shape, dtype, device=device) + qkv = tvm.runtime.tensor( + torch.cat([queries_np, keys_np, values_np], dim=1).cpu().numpy(), device + ) + outputs = tvm.runtime.empty(queries_np.shape, dtype, device=device) if not only_update_host: fattention_with_fuse_qkv(kv_cache, layer_id, sm_scale, qkv, outputs) diff --git a/tests/python/relax/nvshmem/test_runtime_builtin_kv_cache_transfer_kernel.py b/tests/python/relax/nvshmem/test_runtime_builtin_kv_cache_transfer_kernel.py index b0b41c8e92b4..302ae1cd568d 100644 --- a/tests/python/relax/nvshmem/test_runtime_builtin_kv_cache_transfer_kernel.py +++ b/tests/python/relax/nvshmem/test_runtime_builtin_kv_cache_transfer_kernel.py @@ -62,12 +62,12 @@ def test_kv_transfer_without_disco(): k_np = np.random.rand(ntokens, num_kv_heads, head_dim).astype(np.float16) v_np = np.random.rand(ntokens, num_kv_heads, head_dim).astype(np.float16) if rank == 0: - k = tvm.nd.array(k_np, dev) - v = tvm.nd.array(v_np, dev) + k = tvm.runtime.tensor(k_np, dev) + v = tvm.runtime.tensor(v_np, dev) remote_position_map_np = np.array(position_map_array, dtype=np.int32) - remote_position_map = tvm.nd.array(remote_position_map_np, dev) + remote_position_map = tvm.runtime.tensor(remote_position_map_np, dev) remote_tp_group_pe_offset_np = np.array([1] * len(position_map_array), dtype=np.int32) - remote_tp_group_pe_offset = tvm.nd.array(remote_tp_group_pe_offset_np, dev) + remote_tp_group_pe_offset = tvm.runtime.tensor(remote_tp_group_pe_offset_np, dev) transfer_func = tvm.get_global_func("nvshmem.KVTransfer") layer_view = pages._create_view( [num_pages, 2, num_kv_heads, page_size, head_dim], @@ -120,13 +120,13 @@ def test_kv_transfer_page_to_page_without_disco(): if rank == 0: pages.copyfrom(pages_np) remote_position_map_np = np.array(rank_1_position_map_array, dtype=np.int32) - remote_position_map = tvm.nd.array(remote_position_map_np, dev) + remote_position_map = tvm.runtime.tensor(remote_position_map_np, dev) local_position_map_np = np.array(rank_0_position_map_array, dtype=np.int32) - local_position_map = tvm.nd.array(local_position_map_np, dev) + local_position_map = tvm.runtime.tensor(local_position_map_np, dev) remote_tp_group_pe_offset_np = np.array( [1] * len(rank_0_position_map_array), dtype=np.int32 ) - remote_tp_group_pe_offset = tvm.nd.array(remote_tp_group_pe_offset_np, dev) + remote_tp_group_pe_offset = tvm.runtime.tensor(remote_tp_group_pe_offset_np, dev) transfer_func = tvm.get_global_func("nvshmem.KVTransferPageToPage") layer_view = pages._create_view( [num_pages, 2, num_kv_heads, page_size, head_dim], @@ -197,7 +197,7 @@ def test_kv_transfer_with_disco(): remote_position_map = sess.empty((len(position_map_array),), "int32") remote_tp_group_pe_offset_np = np.array([2] * len(position_map_array), dtype=np.int32) remote_tp_group_pe_offset = sess.empty((len(remote_tp_group_pe_offset_np),), "int32") - f_view_func = sess.get_global_func("runtime.TVMArrayCreateView") + f_view_func = 
sess.get_global_func("runtime.TVMTensorCreateView") layer_view = f_view_func( pages, ShapeTuple([num_pages, 2, num_kv_heads, page_size, head_dim]), diff --git a/tests/python/relax/test_backend_dispatch_sort_scan.py b/tests/python/relax/test_backend_dispatch_sort_scan.py index 004050aaf892..d48227fc6277 100644 --- a/tests/python/relax/test_backend_dispatch_sort_scan.py +++ b/tests/python/relax/test_backend_dispatch_sort_scan.py @@ -428,7 +428,7 @@ def main(x: R.Tensor(("m", "n"), "int32")): mod = DispatchSortScan()(Module) ex = tvm.compile(mod, target) vm = tvm.relax.VirtualMachine(ex, dev) - tvm_data = tvm.nd.array(np_data, dev) + tvm_data = tvm.runtime.tensor(np_data, dev) cumsum = vm["main"](tvm_data) tvm.testing.assert_allclose(cumsum.numpy(), np_cumsum) diff --git a/tests/python/relax/test_codegen_coreml.py b/tests/python/relax/test_codegen_coreml.py index 7b9c22b8b9d8..b07271e8949a 100644 --- a/tests/python/relax/test_codegen_coreml.py +++ b/tests/python/relax/test_codegen_coreml.py @@ -75,8 +75,8 @@ def test_add(): gv = bb.emit_output(lv0) bb.emit_func_output(gv) mod = bb.get() - x_data = tvm.nd.array(np.random.rand(10, 10).astype("float32"), dev) - y_data = tvm.nd.array(np.random.rand(10, 10).astype("float32"), dev) + x_data = tvm.runtime.tensor(np.random.rand(10, 10).astype("float32"), dev) + y_data = tvm.runtime.tensor(np.random.rand(10, 10).astype("float32"), dev) verify(mod, [x_data, y_data]) @@ -90,7 +90,7 @@ def test_add_const(): gv = bb.emit_output(lv0) bb.emit_func_output(gv) mod = bb.get() - x_data = tvm.nd.array(np.random.rand(10, 10).astype("float32"), dev) + x_data = tvm.runtime.tensor(np.random.rand(10, 10).astype("float32"), dev) verify(mod, [x_data]) @@ -105,14 +105,14 @@ def test_multiply(): bb.emit_func_output(gv) mod = bb.get() - x_data = tvm.nd.array(np.random.rand(10, 10).astype("float32"), dev) - y_data = tvm.nd.array(np.random.rand(10, 10).astype("float32"), dev) + x_data = tvm.runtime.tensor(np.random.rand(10, 10).astype("float32"), dev) + y_data = tvm.runtime.tensor(np.random.rand(10, 10).astype("float32"), dev) verify(mod, [x_data, y_data]) def test_matmul(): x = relax.Var("x", relax.TensorStructInfo([8, 10], "float32")) - y = relax.Constant(tvm.nd.array(np.random.rand(10, 8).astype("float32"), dev)) + y = relax.Constant(tvm.runtime.tensor(np.random.rand(10, 8).astype("float32"), dev)) bb = relax.BlockBuilder() with bb.function("main", [x]): with bb.dataflow(): @@ -121,7 +121,7 @@ def test_matmul(): bb.emit_func_output(gv) mod = bb.get() - x_data = tvm.nd.array(np.random.rand(8, 10).astype("float32"), dev) + x_data = tvm.runtime.tensor(np.random.rand(8, 10).astype("float32"), dev) verify(mod, [x_data]) x = relax.Var("x", relax.TensorStructInfo([8, 10], "float32")) @@ -134,8 +134,8 @@ def test_matmul(): bb.emit_func_output(gv) mod = bb.get() - x_data = tvm.nd.array(np.random.rand(8, 10).astype("float32"), dev) - y_data = tvm.nd.array(np.random.rand(10, 8).astype("float32"), dev) + x_data = tvm.runtime.tensor(np.random.rand(8, 10).astype("float32"), dev) + y_data = tvm.runtime.tensor(np.random.rand(10, 8).astype("float32"), dev) verify(mod, [x_data, y_data]) @@ -150,7 +150,7 @@ def test_clip(): bb.emit_func_output(gv0) mod = bb.get() - x_data = tvm.nd.array(np.random.rand(10, 10).astype("float32"), dev) + x_data = tvm.runtime.tensor(np.random.rand(10, 10).astype("float32"), dev) verify(mod, [x_data]) x = relax.Var("x", relax.TensorStructInfo([10, 10], "float32")) @@ -164,7 +164,7 @@ def test_clip(): gv1 = bb.emit_output(lv1) bb.emit_func_output([gv0, gv1]) - 
x_data = tvm.nd.array(np.random.rand(10, 10).astype("float32"), dev) + x_data = tvm.runtime.tensor(np.random.rand(10, 10).astype("float32"), dev) verify(mod, [x_data]) @@ -179,7 +179,7 @@ def get_mod(axis): bb.emit_func_output(gv) return bb.get() - x_data = tvm.nd.array(np.random.rand(10, 10).astype("float32"), dev) + x_data = tvm.runtime.tensor(np.random.rand(10, 10).astype("float32"), dev) verify(get_mod(axis=0), [x_data]) verify(get_mod(axis=1), [x_data]) @@ -194,7 +194,7 @@ def test_relu(): bb.emit_func_output(gv) mod = bb.get() - x_data = tvm.nd.array(np.random.rand(10, 10).astype("float32"), dev) + x_data = tvm.runtime.tensor(np.random.rand(10, 10).astype("float32"), dev) verify(mod, [x_data]) @@ -209,7 +209,7 @@ def test_batch_flatten(): bb.emit_func_output(gv) mod = bb.get() - x_data = tvm.nd.array(np.random.rand(10, 10, 10).astype("float32"), dev) + x_data = tvm.runtime.tensor(np.random.rand(10, 10, 10).astype("float32"), dev) verify(mod, [x_data]) @@ -224,7 +224,7 @@ def test_softmax(): bb.emit_func_output(gv) mod = bb.get() - x_data = tvm.nd.array(np.random.rand(10, 10).astype("float32"), dev) + x_data = tvm.runtime.tensor(np.random.rand(10, 10).astype("float32"), dev) verify(mod, [x_data]) @@ -238,7 +238,7 @@ def test_conv2d(): gv = bb.emit_output(lv0) bb.emit_func_output(gv) mod = bb.get() - x_data = tvm.nd.array(np.random.rand(1, 3, 224, 224).astype("float32"), dev) + x_data = tvm.runtime.tensor(np.random.rand(1, 3, 224, 224).astype("float32"), dev) verify(mod, [x_data]) @@ -251,7 +251,7 @@ def test_global_avg_pool2d(): gv = bb.emit_output(lv0) bb.emit_func_output(gv) mod = bb.get() - x_data = tvm.nd.array(np.random.rand(1, 1, 10, 10).astype("float32"), dev) + x_data = tvm.runtime.tensor(np.random.rand(1, 1, 10, 10).astype("float32"), dev) verify(mod, [x_data]) @@ -266,8 +266,8 @@ def test_subgraph1(): gv = bb.emit_output(lv1) bb.emit_func_output(gv) mod = bb.get() - x_data = tvm.nd.array(np.random.rand(10, 10).astype("float32"), dev) - y_data = tvm.nd.array(np.random.rand(10, 10).astype("float32"), dev) + x_data = tvm.runtime.tensor(np.random.rand(10, 10).astype("float32"), dev) + y_data = tvm.runtime.tensor(np.random.rand(10, 10).astype("float32"), dev) verify(mod, [x_data, y_data]) @@ -287,8 +287,8 @@ def test_subgraph2(): gv = bb.emit_output(lv3) bb.emit_func_output(gv) mod = bb.get() - x_data = tvm.nd.array(np.random.rand(10, 10).astype("float32"), dev) - y_data = tvm.nd.array(np.random.rand(10, 10).astype("float32"), dev) + x_data = tvm.runtime.tensor(np.random.rand(10, 10).astype("float32"), dev) + y_data = tvm.runtime.tensor(np.random.rand(10, 10).astype("float32"), dev) verify(mod, [x_data, y_data]) diff --git a/tests/python/relax/test_codegen_cublas.py b/tests/python/relax/test_codegen_cublas.py index 152f04fc3ce7..32666ebd1d8c 100644 --- a/tests/python/relax/test_codegen_cublas.py +++ b/tests/python/relax/test_codegen_cublas.py @@ -52,7 +52,7 @@ def build_and_run(mod, inputs_np, target, legalize=False, cuda_graph=False): ex = tvm.compile(mod, target) vm = relax.VirtualMachine(ex, dev) f = vm["main"] - inputs = [tvm.nd.array(inp, dev) for inp in inputs_np] + inputs = [tvm.runtime.tensor(inp, dev) for inp in inputs_np] # For cuda graph, run the compiled function twice to make sure that we can launch the cached # graph on the second run. 
diff --git a/tests/python/relax/test_codegen_cudnn.py b/tests/python/relax/test_codegen_cudnn.py index 990f21138619..10ba775a6dae 100644 --- a/tests/python/relax/test_codegen_cudnn.py +++ b/tests/python/relax/test_codegen_cudnn.py @@ -113,7 +113,7 @@ def build_and_run(mod, inputs_np, target, legalize=False, cuda_graph=False): ex = tvm.compile(mod, target) vm = relax.VirtualMachine(ex, dev) f = vm["main"] - inputs = [tvm.nd.array(inp, dev) for inp in inputs_np] + inputs = [tvm.runtime.tensor(inp, dev) for inp in inputs_np] # For cuda graph, run the compiled function twice to make sure that we can launch the cached # graph on the second run. diff --git a/tests/python/relax/test_codegen_cutlass.py b/tests/python/relax/test_codegen_cutlass.py index 6528e1c93c0c..c645dce96bd4 100644 --- a/tests/python/relax/test_codegen_cutlass.py +++ b/tests/python/relax/test_codegen_cutlass.py @@ -94,7 +94,7 @@ def build_and_run(mod, inputs_np, target, legalize=True, cuda_graph=False): dev = tvm.device(target, 0) vm = relax.VirtualMachine(ex, dev) f = vm["main"] - inputs = [tvm.nd.array(inp, dev) for inp in inputs_np] + inputs = [tvm.runtime.tensor(inp, dev) for inp in inputs_np] # For cuda graph, run the compiled function twice to make sure that we can launch the cached # graph on the second run. @@ -1481,15 +1481,15 @@ def main_residual( vm = relax.vm.VirtualMachine(ex, tvm.cpu(0)) packed_weight, scales, bias_trans = vm[transform_func_name]( - (tvm.nd.array(y), tvm.nd.array(bias)) + (tvm.runtime.tensor(y), tvm.runtime.tensor(bias)) ) dev = tvm.device("cuda", 0) ex = tvm.compile(mod_deploy, target="cuda") vm = relax.vm.VirtualMachine(ex, dev) - x_nd = tvm.nd.array(x, dev) - residual_nd = tvm.nd.array(residual, dev) + x_nd = tvm.runtime.tensor(x, dev) + residual_nd = tvm.runtime.tensor(residual, dev) params = [packed_weight.copyto(dev), scales.copyto(dev), bias_trans.copyto(dev)] for f_name in ["main_bias", "main_cast_bias", "main_residual"]: @@ -1634,14 +1634,14 @@ def main( vm = relax.vm.VirtualMachine(ex, tvm.cpu(0)) packed_weight, scales, bias_trans = vm[transform_func_name]( - (tvm.nd.array(y), tvm.nd.array(bias)) + (tvm.runtime.tensor(y), tvm.runtime.tensor(bias)) ) dev = tvm.device("cuda", 0) ex = tvm.compile(mod_deploy, target="cuda") vm = relax.vm.VirtualMachine(ex, dev) - x_nd = tvm.nd.array(x, dev) + x_nd = tvm.runtime.tensor(x, dev) inp = [x_nd, packed_weight.copyto(dev), scales.copyto(dev), bias_trans.copyto(dev)] out = vm["main"](*inp).numpy() @@ -1909,13 +1909,13 @@ def main( ex = tvm.compile(mod_transform, target="llvm") vm = relax.vm.VirtualMachine(ex, tvm.cpu(0)) - packed_weight, scales = vm[transform_func_name]((tvm.nd.array(y),)) + packed_weight, scales = vm[transform_func_name]((tvm.runtime.tensor(y),)) dev = tvm.device("cuda", 0) ex = tvm.compile(mod_deploy, target="cuda") vm = relax.vm.VirtualMachine(ex, dev) - x_nd = tvm.nd.array(x, dev) + x_nd = tvm.runtime.tensor(x, dev) inp = [x_nd, packed_weight.copyto(dev), scales.copyto(dev)] out = vm["main"](*inp).numpy() ref = np.dot(x, y.transpose()) @@ -2064,13 +2064,13 @@ def main( ex = tvm.compile(mod_transform, target="llvm") vm = relax.vm.VirtualMachine(ex, tvm.cpu(0)) - packed_weight, scales = vm[transform_func_name]((tvm.nd.array(y),)) + packed_weight, scales = vm[transform_func_name]((tvm.runtime.tensor(y),)) dev = tvm.device("cuda", 0) ex = tvm.compile(mod_deploy, target="cuda") vm = relax.vm.VirtualMachine(ex, dev) - x_nd = tvm.nd.array(x, dev) + x_nd = tvm.runtime.tensor(x, dev) inp = [x_nd, packed_weight.copyto(dev), 
scales.copyto(dev)] out = vm["main"](*inp).numpy() ref = np.dot(x, y.transpose()) diff --git a/tests/python/relax/test_codegen_dnnl.py b/tests/python/relax/test_codegen_dnnl.py index 370c5f03a486..f386f8f2f8d0 100644 --- a/tests/python/relax/test_codegen_dnnl.py +++ b/tests/python/relax/test_codegen_dnnl.py @@ -54,7 +54,7 @@ def main( def build_and_run(mod, inputs, legalize=False): target = tvm.target.Target("llvm") dev = tvm.cpu() - inputs = [tvm.nd.array(inp, dev) for inp in inputs] + inputs = [tvm.runtime.tensor(inp, dev) for inp in inputs] with tvm.transform.PassContext(config={"relax.transform.apply_legalize_ops": legalize}): ex = tvm.compile(mod, target) diff --git a/tests/python/relax/test_codegen_hipblas.py b/tests/python/relax/test_codegen_hipblas.py index 004e70e4e60e..286acc44f1f1 100644 --- a/tests/python/relax/test_codegen_hipblas.py +++ b/tests/python/relax/test_codegen_hipblas.py @@ -45,7 +45,7 @@ def build_and_run(mod, inputs_np, target, legalize=False): ex = tvm.compile(mod, target) vm = relax.VirtualMachine(ex, dev) f = vm["main"] - inputs = [tvm.nd.array(inp, dev) for inp in inputs_np] + inputs = [tvm.runtime.tensor(inp, dev) for inp in inputs_np] return f(*inputs).numpy() diff --git a/tests/python/relax/test_codegen_tensorrt.py b/tests/python/relax/test_codegen_tensorrt.py index 746f4eba6028..84467a67a9c4 100644 --- a/tests/python/relax/test_codegen_tensorrt.py +++ b/tests/python/relax/test_codegen_tensorrt.py @@ -67,7 +67,7 @@ def build_and_run(mod, inputs_np, target, legalize=False): ex = tvm.compile(mod, target) vm = relax.VirtualMachine(ex, dev) f = vm["main"] - inputs = [tvm.nd.array(inp, dev) for inp in inputs_np] + inputs = [tvm.runtime.tensor(inp, dev) for inp in inputs_np] return f(*inputs).numpy() diff --git a/tests/python/relax/test_contrib_vllm.py b/tests/python/relax/test_contrib_vllm.py index 0a8d338a455e..fade620dfea4 100644 --- a/tests/python/relax/test_contrib_vllm.py +++ b/tests/python/relax/test_contrib_vllm.py @@ -48,7 +48,7 @@ def build_and_run(mod, inputs_np, target, legalize=True): dev = tvm.device(target, 0) vm = relax.VirtualMachine(ex, dev) f = vm["main"] - inputs = [tvm.nd.array(inp, dev) for inp in inputs_np] + inputs = [tvm.runtime.tensor(inp, dev) for inp in inputs_np] out = f(*inputs) @@ -752,17 +752,21 @@ def test_reconstruct_from_cache(): dev = tvm.device("cuda", 0) - key = tvm.nd.array(np.random.randn(num_tokens, num_heads, head_dim).astype("float16"), dev) - value = tvm.nd.array(np.random.randn(num_tokens, num_heads, head_dim).astype("float16"), dev) - slot_mapping = tvm.nd.array(np.arange(num_tokens).astype("int32"), dev) + key = tvm.runtime.tensor( + np.random.randn(num_tokens, num_heads, head_dim).astype("float16"), dev + ) + value = tvm.runtime.tensor( + np.random.randn(num_tokens, num_heads, head_dim).astype("float16"), dev + ) + slot_mapping = tvm.runtime.tensor(np.arange(num_tokens).astype("int32"), dev) - k_cache = tvm.nd.array( + k_cache = tvm.runtime.tensor( np.random.randn(num_blocks, num_heads, head_dim // vec_size, block_size, vec_size).astype( "float16" ), dev, ) - v_cache = tvm.nd.array( + v_cache = tvm.runtime.tensor( np.random.randn(num_blocks, num_heads, head_dim, block_size).astype("float16"), dev ) diff --git a/tests/python/relax/test_dataflow_inplace.py b/tests/python/relax/test_dataflow_inplace.py index f6413c1d8206..00805152b499 100644 --- a/tests/python/relax/test_dataflow_inplace.py +++ b/tests/python/relax/test_dataflow_inplace.py @@ -526,8 +526,8 @@ def main( new_mod = transform_pass(EndToEndTest) 
tvm.ir.assert_structural_equal(new_mod, Expected) - x = tvm.nd.array(np.random.rand(2, 3).astype("float32")) - y = tvm.nd.array(np.random.rand(1, 3).astype("float32")) + x = tvm.runtime.tensor(np.random.rand(2, 3).astype("float32")) + y = tvm.runtime.tensor(np.random.rand(1, 3).astype("float32")) expected = np.zeros((2, 3), dtype="float32") target = tvm.target.Target("llvm") @@ -609,8 +609,8 @@ def main( return s tvm.ir.assert_structural_equal(new_mod, Expected, map_free_vars=True) - x = tvm.nd.array(np.random.rand(2, 3).astype("float32")) - y = tvm.nd.array(np.random.rand(2, 3).astype("float32")) + x = tvm.runtime.tensor(np.random.rand(2, 3).astype("float32")) + y = tvm.runtime.tensor(np.random.rand(2, 3).astype("float32")) expected = np.zeros((2, 3), dtype="float32") target = tvm.target.Target("llvm") diff --git a/tests/python/relax/test_dlpack_integration.py b/tests/python/relax/test_dlpack_integration.py index b2d71fb8a2ad..7378fe74a42b 100644 --- a/tests/python/relax/test_dlpack_integration.py +++ b/tests/python/relax/test_dlpack_integration.py @@ -38,13 +38,13 @@ class TestDLPackIntegration: def test_dlpack_pytorch_to_tvm_conversion(self): pytorch_tensor = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0], dtype=torch.float32) - tvm_ndarray = tvm.nd.from_dlpack(pytorch_tensor) + tvm_tensor = tvm.runtime.from_dlpack(pytorch_tensor) - assert isinstance(tvm_ndarray, tvm.nd.NDArray) - assert tvm_ndarray.shape == pytorch_tensor.shape - assert str(tvm_ndarray.dtype) == str(pytorch_tensor.dtype).replace("torch.", "") + assert isinstance(tvm_tensor, tvm.runtime.Tensor) + assert tvm_tensor.shape == pytorch_tensor.shape + assert str(tvm_tensor.dtype) == str(pytorch_tensor.dtype).replace("torch.", "") - tvm_numpy = tvm_ndarray.numpy() + tvm_numpy = tvm_tensor.numpy() pytorch_numpy = pytorch_tensor.numpy() np.testing.assert_allclose(tvm_numpy, pytorch_numpy, atol=1e-5) @@ -54,15 +54,15 @@ def test_dlpack_pytorch_to_tvm_conversion_gpu(self): [1.0, 2.0, 3.0, 4.0, 5.0], dtype=torch.float32, device="cuda" ) - tvm_ndarray = tvm.nd.from_dlpack(pytorch_tensor) + tvm_tensor = tvm.runtime.from_dlpack(pytorch_tensor) - assert isinstance(tvm_ndarray, tvm.nd.NDArray) - assert tvm_ndarray.shape == pytorch_tensor.shape - assert str(tvm_ndarray.dtype) == str(pytorch_tensor.dtype).replace("torch.", "") - assert str(tvm_ndarray.device) == "cuda:0" + assert isinstance(tvm_tensor, tvm.runtime.Tensor) + assert tvm_tensor.shape == pytorch_tensor.shape + assert str(tvm_tensor.dtype) == str(pytorch_tensor.dtype).replace("torch.", "") + assert str(tvm_tensor.device) == "cuda:0" # Move to CPU for numpy conversion - tvm_numpy = tvm_ndarray.numpy() + tvm_numpy = tvm_tensor.numpy() pytorch_numpy = pytorch_tensor.cpu().numpy() np.testing.assert_allclose(tvm_numpy, pytorch_numpy, atol=1e-5) else: @@ -72,15 +72,15 @@ def test_dlpack_tvm_to_pytorch_conversion(self): import numpy as np data = np.array([1.0, 2.0, 3.0, 5.0], dtype="float32") - tvm_ndarray = tvm.nd.array(data) + tvm_tensor = tvm.runtime.tensor(data) - pytorch_tensor = torch.from_dlpack(tvm_ndarray) + pytorch_tensor = torch.from_dlpack(tvm_tensor) assert isinstance(pytorch_tensor, torch.Tensor) - assert pytorch_tensor.shape == tvm_ndarray.shape + assert pytorch_tensor.shape == tvm_tensor.shape assert pytorch_tensor.dtype == torch.float32 - tvm_numpy = tvm_ndarray.numpy() + tvm_numpy = tvm_tensor.numpy() pytorch_numpy = pytorch_tensor.numpy() np.testing.assert_allclose(tvm_numpy, pytorch_numpy, atol=1e-5) @@ -89,16 +89,16 @@ def test_dlpack_tvm_to_pytorch_conversion_gpu(self): 
import numpy as np data = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype="float32") - tvm_ndarray = tvm.nd.array(data, device=tvm.cuda(0)) + tvm_tensor = tvm.runtime.tensor(data, device=tvm.cuda(0)) - pytorch_tensor = torch.from_dlpack(tvm_ndarray) + pytorch_tensor = torch.from_dlpack(tvm_tensor) assert isinstance(pytorch_tensor, torch.Tensor) - assert pytorch_tensor.shape == tvm_ndarray.shape + assert pytorch_tensor.shape == tvm_tensor.shape assert pytorch_tensor.dtype == torch.float32 assert pytorch_tensor.device.type == "cuda" - tvm_numpy = tvm_ndarray.numpy() + tvm_numpy = tvm_tensor.numpy() pytorch_numpy = pytorch_tensor.cpu().numpy() np.testing.assert_allclose(tvm_numpy, pytorch_numpy, atol=1e-5) else: @@ -110,10 +110,10 @@ def test_dlpack_roundtrip_conversion(self): original_tensor = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0], dtype=torch.float32) # Convert to TVM - tvm_ndarray = tvm.nd.from_dlpack(original_tensor) + tvm_tensor = tvm.runtime.from_dlpack(original_tensor) # Convert back to PyTorch - result_tensor = torch.from_dlpack(tvm_ndarray) + result_tensor = torch.from_dlpack(tvm_tensor) # Verify roundtrip integrity assert torch.allclose(original_tensor, result_tensor, atol=1e-5) @@ -134,10 +134,10 @@ def test_dlpack_different_data_types(self): pytorch_tensor = torch.tensor([1, 2, 3], dtype=torch_dtype) # Convert to TVM - tvm_ndarray = tvm.nd.from_dlpack(pytorch_tensor) + tvm_tensor = tvm.runtime.from_dlpack(pytorch_tensor) # Convert back to PyTorch - result_tensor = torch.from_dlpack(tvm_ndarray) + result_tensor = torch.from_dlpack(tvm_tensor) # Verify conversion assert torch.allclose(pytorch_tensor, result_tensor, atol=1e-5) @@ -157,10 +157,10 @@ def test_dlpack_different_shapes(self): pytorch_tensor = torch.randn(shape, dtype=torch.float32) # Convert to TVM - tvm_ndarray = tvm.nd.from_dlpack(pytorch_tensor) + tvm_tensor = tvm.runtime.from_dlpack(pytorch_tensor) # Convert back to PyTorch - result_tensor = torch.from_dlpack(tvm_ndarray) + result_tensor = torch.from_dlpack(tvm_tensor) # Verify conversion assert torch.allclose(pytorch_tensor, result_tensor, atol=1e-5) @@ -173,15 +173,15 @@ def test_dlpack_functionality_verification(self): pytorch_tensor = torch.randn(size, dtype=torch.float32) # Test DLPack conversion - tvm_ndarray_dlpack = tvm.nd.from_dlpack(pytorch_tensor) + tvm_tensor_dlpack = tvm.runtime.from_dlpack(pytorch_tensor) # Test numpy conversion numpy_array = pytorch_tensor.detach().cpu().numpy() - tvm_ndarray_numpy = tvm.nd.array(numpy_array) + tvm_tensor_numpy = tvm.runtime.tensor(numpy_array) # Verify both methods produce same result - result_dlpack = torch.from_dlpack(tvm_ndarray_dlpack) - result_numpy = torch.from_numpy(tvm_ndarray_numpy.numpy()) + result_dlpack = torch.from_dlpack(tvm_tensor_dlpack) + result_numpy = torch.from_numpy(tvm_tensor_numpy.numpy()) assert torch.allclose(result_dlpack, result_numpy, atol=1e-5) # Verify data integrity @@ -197,8 +197,8 @@ def test_dlpack_error_handling(self): # This should work (PyTorch handles non-contiguous tensors) try: - tvm_ndarray = tvm.nd.from_dlpack(non_contiguous) - result_tensor = torch.from_dlpack(tvm_ndarray) + tvm_tensor = tvm.runtime.from_dlpack(non_contiguous) + result_tensor = torch.from_dlpack(tvm_tensor) assert torch.allclose(non_contiguous, result_tensor, atol=1e-5) except Exception as e: # If it fails, that's also acceptable @@ -230,7 +230,7 @@ def test_dlpack_device_consistency(self): """Test DLPack conversion maintains device consistency.""" # Test CPU tensor cpu_tensor = torch.tensor([1.0, 2.0, 3.0], 
dtype=torch.float32) - cpu_tvm = tvm.nd.from_dlpack(cpu_tensor) + cpu_tvm = tvm.runtime.from_dlpack(cpu_tensor) cpu_result = torch.from_dlpack(cpu_tvm) assert cpu_result.device.type == "cpu" @@ -245,13 +245,13 @@ def test_dlpack_memory_sharing(self): pytorch_tensor = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0], dtype=torch.float32) # Convert to TVM - tvm_ndarray = tvm.nd.from_dlpack(pytorch_tensor) + tvm_tensor = tvm.runtime.from_dlpack(pytorch_tensor) # Modify the original tensor pytorch_tensor[0] = 10.0 # Convert back to PyTorch - result_tensor = torch.from_dlpack(tvm_ndarray) + result_tensor = torch.from_dlpack(tvm_tensor) # The result should reflect the modification (memory sharing) assert result_tensor[0] == 10.0 @@ -264,10 +264,10 @@ def test_dlpack_batch_operations(self): pytorch_tensors = [torch.randn(5, dtype=torch.float32) for _ in range(batch_size)] # Convert all to TVM - tvm_ndarrays = [tvm.nd.from_dlpack(t) for t in pytorch_tensors] + tvm_tensors = [tvm.runtime.from_dlpack(t) for t in pytorch_tensors] # Convert all back to PyTorch - result_tensors = [torch.from_dlpack(t) for t in tvm_ndarrays] + result_tensors = [torch.from_dlpack(t) for t in tvm_tensors] # Verify all conversions for i in range(batch_size): @@ -277,7 +277,7 @@ def test_dlpack_edge_cases(self): """Test DLPack conversion with edge cases.""" # Empty tensor empty_tensor = torch.tensor([], dtype=torch.float32) - empty_tvm = tvm.nd.from_dlpack(empty_tensor) + empty_tvm = tvm.runtime.from_dlpack(empty_tensor) empty_result = torch.from_dlpack(empty_tvm) assert empty_result.shape == empty_tensor.shape @@ -285,7 +285,7 @@ def test_dlpack_edge_cases(self): # Single element tensor single_tensor = torch.tensor([42.0], dtype=torch.float32) - single_tvm = tvm.nd.from_dlpack(single_tensor) + single_tvm = tvm.runtime.from_dlpack(single_tensor) single_result = torch.from_dlpack(single_tvm) assert single_result.shape == single_tensor.shape diff --git a/tests/python/relax/test_e2e_op_dynamic.py b/tests/python/relax/test_e2e_op_dynamic.py index 9179802360b3..ea1f3a778e47 100644 --- a/tests/python/relax/test_e2e_op_dynamic.py +++ b/tests/python/relax/test_e2e_op_dynamic.py @@ -52,10 +52,10 @@ def main(x: R.Tensor((8, 9, 10, 10), "float32"), begin: R.Tensor((4,),"int64"), vm = build(DynamicStridedSlice) x_np = np.random.rand(8, 9, 10, 10).astype(np.float32) - data_nd = tvm.nd.array(x_np, dev) - begin_nd = tvm.nd.array(np.array(begin).astype("int64"), dev) - end_nd = tvm.nd.array(np.array(end).astype("int64"), dev) - strides_nd = tvm.nd.array(np.array(strides).astype("int64"), dev) + data_nd = tvm.runtime.tensor(x_np, dev) + begin_nd = tvm.runtime.tensor(np.array(begin).astype("int64"), dev) + end_nd = tvm.runtime.tensor(np.array(end).astype("int64"), dev) + strides_nd = tvm.runtime.tensor(np.array(strides).astype("int64"), dev) # Reference implementation out_npy = tvm.topi.testing.strided_slice_python(x_np, begin, end, strides) @@ -85,10 +85,10 @@ def main(x: R.Tensor(("m", "n", 10, 10), "float32"), begin: R.Tensor((4,),"int64 vm = build(DynamicStridedSlice) x_np = np.random.rand(8, 9, 10, 10).astype(np.float32) - data_nd = tvm.nd.array(x_np, dev) - begin_nd = tvm.nd.array(np.array(begin).astype("int64"), dev) - end_nd = tvm.nd.array(np.array(end).astype("int64"), dev) - strides_nd = tvm.nd.array(np.array(strides).astype("int64"), dev) + data_nd = tvm.runtime.tensor(x_np, dev) + begin_nd = tvm.runtime.tensor(np.array(begin).astype("int64"), dev) + end_nd = tvm.runtime.tensor(np.array(end).astype("int64"), dev) + strides_nd = 
tvm.runtime.tensor(np.array(strides).astype("int64"), dev) # Reference implementation out_npy = tvm.topi.testing.strided_slice_python(x_np, begin, end, strides) diff --git a/tests/python/relax/test_frontend_common.py b/tests/python/relax/test_frontend_common.py index 39f9af103134..21becb2c8590 100644 --- a/tests/python/relax/test_frontend_common.py +++ b/tests/python/relax/test_frontend_common.py @@ -25,7 +25,7 @@ def test_detach_params(): def func(x: R.Tensor((2, 3), "float32")): return x - param = tvm.nd.empty((3,), "float32") + param = tvm.runtime.empty((3,), "float32") mod = tvm.IRModule({"func": func.with_attr("params", [param])}) detached_mod, detached_params = detach_params(mod) diff --git a/tests/python/relax/test_frontend_dynamo.py b/tests/python/relax/test_frontend_dynamo.py index fb1544be68a8..90ac06466ca5 100644 --- a/tests/python/relax/test_frontend_dynamo.py +++ b/tests/python/relax/test_frontend_dynamo.py @@ -275,7 +275,7 @@ def verify_dynamo_model(torch_model, input_info, binding, expected): args.append(torch.zeros(*info[0], dtype=_convert_data_type(info[1]))) graph_model = dynamo.export(torch_model)(*args)[0] mod = from_fx(graph_model, input_info, unwrap_unit_return_tuple=True) - binding = {k: tvm.nd.array(v) for k, v in binding.items()} + binding = {k: tvm.runtime.tensor(v) for k, v in binding.items()} expected = relax.transform.BindParams("main", binding)(expected) tvm.ir.assert_structural_equal(mod, expected) diff --git a/tests/python/relax/test_frontend_from_exported_program.py b/tests/python/relax/test_frontend_from_exported_program.py index 406a5d9a1c70..2871e3f4cde3 100644 --- a/tests/python/relax/test_frontend_from_exported_program.py +++ b/tests/python/relax/test_frontend_from_exported_program.py @@ -34,7 +34,7 @@ def verify_model(torch_model, example_args, binding, expected, dynamic_shapes=No exported_program = export(torch_model, args=example_args, dynamic_shapes=dynamic_shapes) mod = from_exported_program(exported_program) - binding = {k: tvm.nd.array(v) for k, v in binding.items()} + binding = {k: tvm.runtime.tensor(v) for k, v in binding.items()} expected = relax.transform.BindParams("main", binding)(expected) tvm.ir.assert_structural_equal(mod, expected) @@ -4802,9 +4802,9 @@ def main( params = params["main"] assert len(params) == len(func.params) - 1 - for param_var, param_ndarray in zip(func.params[1:], params): - assert tuple(x.value for x in param_var.struct_info.shape.values) == param_ndarray.shape - assert param_var.struct_info.dtype == param_ndarray.dtype + for param_var, param_tensor in zip(func.params[1:], params): + assert tuple(x.value for x in param_var.struct_info.shape.values) == param_tensor.shape + assert param_var.struct_info.dtype == param_tensor.dtype tvm.testing.assert_allclose(params[0].numpy(), model.conv.weight.detach().detach().numpy()) tvm.testing.assert_allclose(params[1].numpy(), model.conv.bias.detach().detach().numpy()) diff --git a/tests/python/relax/test_frontend_from_fx.py b/tests/python/relax/test_frontend_from_fx.py index 47ca0819a9c8..69ebdcbf76bc 100644 --- a/tests/python/relax/test_frontend_from_fx.py +++ b/tests/python/relax/test_frontend_from_fx.py @@ -38,7 +38,7 @@ def verify_model(torch_model, input_info, binding, expected): graph_model = fx.symbolic_trace(torch_model) with torch.no_grad(): mod = from_fx(graph_model, input_info) - binding = {k: tvm.nd.array(v) for k, v in binding.items()} + binding = {k: tvm.runtime.tensor(v) for k, v in binding.items()} expected = relax.transform.BindParams("main", binding)(expected) 
tvm.ir.assert_structural_equal(mod, expected) @@ -4578,9 +4578,9 @@ def main( params = params["main"] assert len(params) == len(func.params) - 1 - for param_var, param_ndarray in zip(func.params[1:], params): - assert tuple(x.value for x in param_var.struct_info.shape.values) == param_ndarray.shape - assert param_var.struct_info.dtype == param_ndarray.dtype + for param_var, param_tensor in zip(func.params[1:], params): + assert tuple(x.value for x in param_var.struct_info.shape.values) == param_tensor.shape + assert param_var.struct_info.dtype == param_tensor.dtype tvm.testing.assert_allclose(params[0].numpy(), model.conv.bias.detach().detach().numpy()) tvm.testing.assert_allclose(params[1].numpy(), model.conv.weight.detach().detach().numpy()) diff --git a/tests/python/relax/test_frontend_nn_debug.py b/tests/python/relax/test_frontend_nn_debug.py index a055631a4d51..c1372adff10e 100644 --- a/tests/python/relax/test_frontend_nn_debug.py +++ b/tests/python/relax/test_frontend_nn_debug.py @@ -22,7 +22,7 @@ from tvm import tir from tvm.relax.frontend import nn from tvm.relax.frontend.nn import op, spec -from tvm.runtime import NDArray +from tvm.runtime import Tensor def test_debug_print(): @@ -46,7 +46,7 @@ def test_debug_func(): @tvm.register_func("testing.relax.frontend.nn.test_debug_func") def _debug( # pylint: disable=too-many-arguments lineno: str, - tensor: NDArray, + tensor: Tensor, const_int: int, const_float: float, const_str: str, diff --git a/tests/python/relax/test_frontend_nn_extern_module.py b/tests/python/relax/test_frontend_nn_extern_module.py index cbc2e7f42922..d5b73bec4c7f 100644 --- a/tests/python/relax/test_frontend_nn_extern_module.py +++ b/tests/python/relax/test_frontend_nn_extern_module.py @@ -57,8 +57,8 @@ def _var_equal(a, b): # pylint: disable=invalid-name def _test_scalar_add(func): # pylint: disable=invalid-name - x = tvm.nd.array(np.array(1.0).astype("float32")) - y = tvm.nd.array(np.array(3.0).astype("float32")) + x = tvm.runtime.tensor(np.array(1.0).astype("float32")) + y = tvm.runtime.tensor(np.array(3.0).astype("float32")) z = func(x, y).numpy() # pylint: enable=invalid-name assert z.ndim == 0 @@ -68,8 +68,8 @@ def _test_scalar_add(func): def _test_infer_sym(func, x, y, z): # pylint: disable=invalid-name # pylint: disable=invalid-name - a = tvm.nd.array(np.random.uniform(size=(x, y, 1)).astype("float32")) - b = tvm.nd.array(np.random.uniform(size=(y, z, 5)).astype("float32")) + a = tvm.runtime.tensor(np.random.uniform(size=(x, y, 1)).astype("float32")) + b = tvm.runtime.tensor(np.random.uniform(size=(y, z, 5)).astype("float32")) c = func(a, b).numpy() # pylint: enable=invalid-name assert c.shape == (x, y, z, 9) diff --git a/tests/python/relax/test_frontend_nn_op.py b/tests/python/relax/test_frontend_nn_op.py index 5c400ef8be28..9e0369318841 100644 --- a/tests/python/relax/test_frontend_nn_op.py +++ b/tests/python/relax/test_frontend_nn_op.py @@ -976,10 +976,12 @@ def foo(prob: R.Tensor((3, 5), dtype="float32"), uniform_sample: R.Tensor((6, 1) np_rand = np.random.rand(*prob_shape).astype(np.float32) # normalize it to get the random prob np_prob = np_rand / np_rand.sum(axis=1, keepdims=True) - nd_prob = tvm.nd.array(np_prob, dev) + nd_prob = tvm.runtime.tensor(np_prob, dev) # special sample to get deterministic results - nd_sample = tvm.nd.array(np.array([[1], [0], [1], [1], [0], [1]]).astype(np.float32), dev) - nd_sample_indices = tvm.nd.array(np.array([[0], [1], [1], [2], [2], [2]]).astype(np.int64), dev) + nd_sample = tvm.runtime.tensor(np.array([[1], [0], 
[1], [1], [0], [1]]).astype(np.float32), dev) + nd_sample_indices = tvm.runtime.tensor( + np.array([[0], [1], [1], [2], [2], [2]]).astype(np.int64), dev + ) inputs = [nd_prob, nd_sample, nd_sample_indices, effects] res = vm["foo"](*inputs) tvm.testing.assert_allclose( @@ -1104,12 +1106,14 @@ def foo(prob: R.Tensor((2, 3), dtype="float32"), index: R.Tensor((2, 3), dtype=" vm = relax.VirtualMachine(ex, dev) effects = vm["_initialize_effect"]() - sorted_prob = tvm.nd.array(np.array([[0.5, 0.4, 0.1], [0.4, 0.3, 0.3]]).astype(np.float32), dev) - indices = tvm.nd.array(np.array([[2, 1, 0], [2, 0, 1]]).astype(np.int64), dev) - top_p = tvm.nd.array(np.array([[0.6], [0.9]]).astype(np.float32), dev) - top_k = tvm.nd.array(np.array([[3], [2]]).astype(np.int64), dev) - usample = tvm.nd.array(np.array([[0.5], [0.6], [0.7]]).astype(np.float32), dev) - sample_indices = tvm.nd.array(np.array([[0], [1], [1]]).astype(np.int64), dev) + sorted_prob = tvm.runtime.tensor( + np.array([[0.5, 0.4, 0.1], [0.4, 0.3, 0.3]]).astype(np.float32), dev + ) + indices = tvm.runtime.tensor(np.array([[2, 1, 0], [2, 0, 1]]).astype(np.int64), dev) + top_p = tvm.runtime.tensor(np.array([[0.6], [0.9]]).astype(np.float32), dev) + top_k = tvm.runtime.tensor(np.array([[3], [2]]).astype(np.int64), dev) + usample = tvm.runtime.tensor(np.array([[0.5], [0.6], [0.7]]).astype(np.float32), dev) + sample_indices = tvm.runtime.tensor(np.array([[0], [1], [1]]).astype(np.int64), dev) inputs = [sorted_prob, indices, top_p, top_k, usample, sample_indices, effects] @@ -1220,10 +1224,12 @@ def foo(prob: R.Tensor((2, 3), dtype="float32"), sorted_prob: R.Tensor((2, 3), d vm = relax.VirtualMachine(ex, dev) effects = vm["_initialize_effect"]() - prob = tvm.nd.array(np.array([[0.2, 0.3, 0.5], [0.3, 0.3, 0.4]]).astype(np.float32), dev) - sorted_prob = tvm.nd.array(np.array([[0.5, 0.3, 0.2], [0.4, 0.3, 0.3]]).astype(np.float32), dev) - top_p = tvm.nd.array(np.array([[0.6], [0.9]]).astype(np.float32), dev) - top_k = tvm.nd.array(np.array([[3], [2]]).astype(np.int64), dev) + prob = tvm.runtime.tensor(np.array([[0.2, 0.3, 0.5], [0.3, 0.3, 0.4]]).astype(np.float32), dev) + sorted_prob = tvm.runtime.tensor( + np.array([[0.5, 0.3, 0.2], [0.4, 0.3, 0.3]]).astype(np.float32), dev + ) + top_p = tvm.runtime.tensor(np.array([[0.6], [0.9]]).astype(np.float32), dev) + top_k = tvm.runtime.tensor(np.array([[3], [2]]).astype(np.int64), dev) inputs = [prob, sorted_prob, top_p, top_k, effects] diff --git a/tests/python/relax/test_frontend_onnx.py b/tests/python/relax/test_frontend_onnx.py index b55489a623f0..625cdebf7f61 100644 --- a/tests/python/relax/test_frontend_onnx.py +++ b/tests/python/relax/test_frontend_onnx.py @@ -172,12 +172,12 @@ def _check_output(tvm_out, ort_out): assert len(tvm_out) == len(ort_out), "Unequal number of outputs" for tvm_out_i, ort_out_i in zip(tvm_out, ort_out): _check_output(tvm_out_i, ort_out_i) - elif isinstance(tvm_out, tvm.nd.NDArray) and isinstance(ort_out, np.ndarray): + elif isinstance(tvm_out, tvm.runtime.Tensor) and isinstance(ort_out, np.ndarray): if check_dtypes: assert tvm_out.numpy().dtype == ort_out.dtype tvm.testing.assert_allclose(tvm_out.numpy(), ort_out, rtol=rtol, atol=atol) elif isinstance(tvm_out, tvm.runtime.ShapeTuple) and isinstance(ort_out, np.ndarray): - shape_out = tvm.nd.array([int(i) for i in tvm_out]) + shape_out = tvm.runtime.tensor([int(i) for i in tvm_out]) if check_dtypes: assert _get_numpy_subdtype(shape_out.numpy()) == _get_numpy_subdtype(ort_out) tvm.testing.assert_allclose(shape_out.numpy(), ort_out, 
rtol=rtol, atol=atol) diff --git a/tests/python/relax/test_frontend_stablehlo.py b/tests/python/relax/test_frontend_stablehlo.py index 4f049555f148..dd918ab3a2ea 100644 --- a/tests/python/relax/test_frontend_stablehlo.py +++ b/tests/python/relax/test_frontend_stablehlo.py @@ -126,7 +126,7 @@ def check_correctness( tvm_output = vm.get_outputs("main") # Single ouput - if isinstance(tvm_output, tvm.nd.NDArray): + if isinstance(tvm_output, tvm.runtime.Tensor): tvm.testing.assert_allclose(tvm_output.numpy(), jax_output, rtol=1e-5, atol=1e-5) return @@ -138,7 +138,7 @@ def check_correctness( def get_vm_res( ir_mod: tvm.IRModule, weights: Union[np.ndarray, List[np.ndarray]] -) -> Union[tvm.nd.NDArray, List[tvm.nd.NDArray]]: +) -> Union[tvm.runtime.Tensor, List[tvm.runtime.Tensor]]: """Compile and run an ir_module on Relax VM Parameters @@ -151,7 +151,7 @@ def get_vm_res( Results ------- - out: Union[tvm.nd.NDArray, List[tvm.nd.NDArray]] + out: Union[tvm.runtime.Tensor, List[tvm.runtime.Tensor]] inference result """ target = tvm.target.Target("llvm", host="llvm") diff --git a/tests/python/relax/test_meta_schedule_relax_integration.py b/tests/python/relax/test_meta_schedule_relax_integration.py index 00a342c46050..6f3cdfa9a0de 100644 --- a/tests/python/relax/test_meta_schedule_relax_integration.py +++ b/tests/python/relax/test_meta_schedule_relax_integration.py @@ -154,7 +154,7 @@ def test_extracting_tasks(): relax_expectation = { "structural": 2, # The relax constants do not reach the tir at the lowering. - "ignore-ndarray": 2, + "ignore-tensor": 2, "anchor-block": 1, } for module_equality, count in relax_expectation.items(): @@ -167,7 +167,7 @@ def test_extracting_tasks(): assert len(extracted_tasks) == count tir_relax_mod = Module - tir_relax_expectation = {"structural": 3, "ignore-ndarray": 2, "anchor-block": 1} + tir_relax_expectation = {"structural": 3, "ignore-tensor": 2, "anchor-block": 1} for module_equality, count in tir_relax_expectation.items(): extracted_tasks = ms.relax_integration.extract_tasks( tir_relax_mod, @@ -178,7 +178,7 @@ def test_extracting_tasks(): assert len(extracted_tasks) == count -@pytest.mark.parametrize("module_equality", ["structural", "ignore-ndarray", "anchor-block"]) +@pytest.mark.parametrize("module_equality", ["structural", "ignore-tensor", "anchor-block"]) def test_using_anchor_trace(module_equality): relax_mod = Module target = "llvm -mcpu=core-avx2 -num-cores=1" diff --git a/tests/python/relax/test_op_datatype.py b/tests/python/relax/test_op_datatype.py index 48820b9e2e00..a5507f7efaa2 100644 --- a/tests/python/relax/test_op_datatype.py +++ b/tests/python/relax/test_op_datatype.py @@ -28,7 +28,7 @@ def test_op_correctness(): x = relax.Var("x", R.Tensor((2, 3), "float32")) - c = relax.Constant(tvm.nd.array(np.array([1, 2, 3], dtype="float16"))) + c = relax.Constant(tvm.runtime.tensor(np.array([1, 2, 3], dtype="float16"))) assert relax.op.astype(x, "float16").op == Op.get("relax.astype") assert relax.op.wrap_param(c, "float32").op == Op.get("relax.wrap_param") @@ -108,8 +108,8 @@ def test_astype_infer_struct_info_wrong_input_type(): def test_wrap_param_infer_struct_info(): bb = relax.BlockBuilder() - x0 = relax.Constant(tvm.nd.array(np.zeros([1, 2, 3], dtype="float16"))) - x1 = relax.Constant(tvm.nd.array(np.zeros([1, 2, 3], dtype="int8"))) + x0 = relax.Constant(tvm.runtime.tensor(np.zeros([1, 2, 3], dtype="float16"))) + x1 = relax.Constant(tvm.runtime.tensor(np.zeros([1, 2, 3], dtype="int8"))) _check_inference( bb, relax.op.wrap_param(x0, "float32"), 
relax.TensorStructInfo((1, 2, 3), "float32") ) diff --git a/tests/python/relax/test_op_gradient_numeric.py b/tests/python/relax/test_op_gradient_numeric.py index 840f2985614a..bcea74a883be 100644 --- a/tests/python/relax/test_op_gradient_numeric.py +++ b/tests/python/relax/test_op_gradient_numeric.py @@ -45,7 +45,7 @@ def relax_check_gradients( The forward operator function. Should be a function in package relax.op. inputs_numpy : List[np.array] - The np array inputs for op_func. inputs_numpy will be transformed into TVM NDArray inside + The np array inputs for op_func. inputs_numpy will be transformed into TVM Tensor inside this function. If op_func takes a tuple of tensors as input, you can set tuple_input as True, and pass the @@ -84,12 +84,12 @@ def _numpy_to_sinfo(data): def _numpy_to_tvm(data): if isinstance(data, list): return [_numpy_to_tvm(d) for d in data] - return tvm.nd.array(data) + return tvm.runtime.tensor(data) def _tvm_to_numpy(data, ignore_idx=[]): if isinstance(data, tvm.ir.Array): return [_tvm_to_numpy(d) for i, d in enumerate(data) if i not in ignore_idx] - if isinstance(data, tvm.runtime.ndarray.NDArray): + if isinstance(data, tvm.runtime.Tensor): return data.numpy() return data @@ -189,7 +189,7 @@ def forward(*inputs): grad_ex = tvm.compile(grad_mod, target) grad_vm = relax.VirtualMachine(grad_ex, dev) - # tvm.runtime.NDArray inputs + # tvm.runtime.Tensor inputs inputs_tvm = [_numpy_to_tvm(i) for i in inputs_numpy] weights_tvm = _numpy_to_tvm(weights) result_filtered = _tvm_to_numpy(grad_vm[func_name](*inputs_tvm, weights_tvm), ignore_grads) diff --git a/tests/python/relax/test_op_inspect.py b/tests/python/relax/test_op_inspect.py index b25d1aa09749..cb9b2ded972e 100644 --- a/tests/python/relax/test_op_inspect.py +++ b/tests/python/relax/test_op_inspect.py @@ -57,7 +57,7 @@ def main(A: R.Tensor): built = tvm.compile(mod) vm = relax.VirtualMachine(built, tvm.cpu()) - arg = tvm.nd.empty([16], dtype) + arg = tvm.runtime.empty([16], dtype) res = vm["main"](arg) expected_type_code = tvm.runtime.DataType(dtype).type_code @@ -74,7 +74,7 @@ def main(A: R.Tensor): built = tvm.compile(mod) vm = relax.VirtualMachine(built, tvm.cpu()) - arg = tvm.nd.empty([16], dtype) + arg = tvm.runtime.empty([16], dtype) res = vm["main"](arg) expected_type_bits = tvm.runtime.DataType(dtype).bits @@ -91,7 +91,7 @@ def main(A: R.Tensor): built = tvm.compile(mod) vm = relax.VirtualMachine(built, tvm.cpu()) - arg = tvm.nd.empty([16], dtype) + arg = tvm.runtime.empty([16], dtype) res = vm["main"](arg) expected_type_lanes = tvm.runtime.DataType(dtype).lanes @@ -108,7 +108,7 @@ def main(A: R.Tensor): built = tvm.compile(mod) vm = relax.VirtualMachine(built, tvm.cpu()) - arg = tvm.nd.empty(shape, "int32") + arg = tvm.runtime.empty(shape, "int32") res = vm["main"](arg) assert res == len(shape) @@ -124,7 +124,7 @@ def main(A: R.Tensor, axis: R.Prim("int64")): built = tvm.compile(mod) vm = relax.VirtualMachine(built, tvm.cpu()) - arg = tvm.nd.empty(shape, "int32") + arg = tvm.runtime.empty(shape, "int32") res = [vm["main"](arg, i) for i, _ in enumerate(shape)] @@ -150,7 +150,7 @@ def main(A: R.Tensor, axis: R.Prim("int64")): built = tvm.compile(mod) vm = relax.VirtualMachine(built, tvm.cpu()) - arg = tvm.nd.empty(shape, "int32") + arg = tvm.runtime.empty(shape, "int32") res = [vm["main"](arg, i) for i, _ in enumerate(shape)] expected = _get_compact_striding(shape) @@ -190,8 +190,8 @@ def main(A: R.Tensor): built = tvm.compile(mod) vm = relax.VirtualMachine(built, tvm.cpu()) dtype = "int32" - 
backing_ndarray = tvm.nd.empty(backing_shape, dtype) - view = backing_ndarray._create_view(view_shape, dtype, relative_byte_offset=byte_offset) + backing_tensor = tvm.runtime.empty(backing_shape, dtype) + view = backing_tensor._create_view(view_shape, dtype, relative_byte_offset=byte_offset) res = vm["main"](view) assert res == byte_offset @@ -213,8 +213,8 @@ def main(A: R.Tensor): built = tvm.compile(mod) vm = relax.VirtualMachine(built, tvm.cpu()) - backing_ndarray = tvm.nd.empty(backing_shape, dtype) - view = backing_ndarray._create_view(view_shape, dtype, relative_byte_offset=byte_offset) + backing_tensor = tvm.runtime.empty(backing_shape, dtype) + view = backing_tensor._create_view(view_shape, dtype, relative_byte_offset=byte_offset) res = vm["main"](view) assert res == elem_offset diff --git a/tests/python/relax/test_op_misc.py b/tests/python/relax/test_op_misc.py index 366ea1b6883d..d424ab69decc 100644 --- a/tests/python/relax/test_op_misc.py +++ b/tests/python/relax/test_op_misc.py @@ -23,7 +23,7 @@ @tvm.register_func("test.op.identity", override=True) def identity_packed(a): - return tvm.nd.array(a.numpy()) + return tvm.runtime.tensor(a.numpy()) @T.prim_func diff --git a/tests/python/relax/test_op_take.py b/tests/python/relax/test_op_take.py index 704895d0e4f3..6bbf13ef36eb 100644 --- a/tests/python/relax/test_op_take.py +++ b/tests/python/relax/test_op_take.py @@ -44,7 +44,7 @@ def main(A: R.Tensor([16, 16], "float16")): vm = tvm.relax.VirtualMachine(built, dev) np_input = np.random.random(size=[16, 16]).astype("float16") - tvm_input = tvm.nd.array(np_input, dev) + tvm_input = tvm.runtime.tensor(np_input, dev) tvm_output = vm["main"](tvm_input) np_expected = np_input.take(1, axis=axis) @@ -70,7 +70,7 @@ def main(A: R.Tensor([16, 16], "float16")): vm = tvm.relax.VirtualMachine(built, dev) np_input = np.random.random(size=[16, 16]).astype("float16") - tvm_input = tvm.nd.array(np_input, dev) + tvm_input = tvm.runtime.tensor(np_input, dev) tvm_output = vm["main"](tvm_input) np_expected = np_input.take([1], axis=axis) @@ -92,7 +92,7 @@ def main(A: R.Tensor([16, 16], "float16")): vm = tvm.relax.VirtualMachine(built, dev) np_input = np.random.random(size=[16, 16]).astype("float16") - tvm_input = tvm.nd.array(np_input, dev) + tvm_input = tvm.runtime.tensor(np_input, dev) tvm_output = vm["main"](tvm_input) np_expected = np_input.take([[1, 3], [5, 7]], axis=axis) @@ -119,7 +119,7 @@ def main(A: R.Tensor([16, 16], "float16")): vm = tvm.relax.VirtualMachine(built, dev) np_input = np.random.random(size=[16, 16]).astype("float16") - tvm_input = tvm.nd.array(np_input, dev) + tvm_input = tvm.runtime.tensor(np_input, dev) tvm_output = vm["main"](tvm_input) np_expected = np_input.take(1, axis=axis) @@ -147,7 +147,7 @@ def main(A: R.Tensor(["n", "n"], "float16")): vm = tvm.relax.VirtualMachine(built, dev) np_input = np.random.random(size=[16, 16]).astype("float16") - tvm_input = tvm.nd.array(np_input, dev) + tvm_input = tvm.runtime.tensor(np_input, dev) tvm_output = vm["main"](tvm_input) np_expected = np_input.take(15, axis=axis) @@ -171,7 +171,7 @@ def main(A: R.Tensor([3, 3], "float16")): vm = tvm.relax.VirtualMachine(built, dev) np_input = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]], dtype="float16") - tvm_input = tvm.nd.array(np_input, dev) + tvm_input = tvm.runtime.tensor(np_input, dev) tvm_output = vm["main"](tvm_input) if axis == 0: np_expected = np.array( @@ -204,7 +204,7 @@ def main(A: R.Tensor([3, 3], "float16")): vm = tvm.relax.VirtualMachine(built, dev) np_input = 
np.random.random(size=[3, 3]).astype("float16") - tvm_input = tvm.nd.array(np_input, dev) + tvm_input = tvm.runtime.tensor(np_input, dev) tvm_output = vm["main"](tvm_input) np_expected = np.take(np_input, [0, 1, 2, 3], axis=axis, mode="wrap") @@ -227,7 +227,7 @@ def main(A: R.Tensor([3, 3], "float16")): built = tvm.compile(Module, target=target) vm = tvm.relax.VirtualMachine(built, dev) np_input = np.random.random(size=[3, 3]).astype("float16") - tvm_input = tvm.nd.array(np_input, dev) + tvm_input = tvm.runtime.tensor(np_input, dev) tvm_output = vm["main"](tvm_input) np_expected = np.take(np_input, [0, 1, 2, 3], axis=axis, mode="clip") diff --git a/tests/python/relax/test_op_view.py b/tests/python/relax/test_op_view.py index fc9458827b26..171fe0a627bb 100644 --- a/tests/python/relax/test_op_view.py +++ b/tests/python/relax/test_op_view.py @@ -481,7 +481,7 @@ class Expected: @R.function def main(A: R.Tensor([4096], "float32")): B = R.ExternFunc( - "runtime.TVMArrayCreateView", + "runtime.TVMTensorCreateView", R.Callable( derive_func="tvm.relax.struct_info.infer_view_sinfo", purity=True, @@ -513,7 +513,7 @@ class Expected: @R.function def main(A: R.Tensor(dtype="float32")): B = R.ExternFunc( - "runtime.TVMArrayCreateView", + "runtime.TVMTensorCreateView", R.Callable( derive_func="tvm.relax.struct_info.infer_view_sinfo", purity=True, @@ -543,7 +543,7 @@ class Expected: @R.function def main(A: R.Tensor([4096], "float32")): B = R.ExternFunc( - "runtime.TVMArrayCreateView", + "runtime.TVMTensorCreateView", R.Callable( derive_func="tvm.relax.struct_info.infer_view_sinfo", purity=True, @@ -573,7 +573,7 @@ class Expected: @R.function def main(A: R.Tensor([4096], "float32")): B = R.ExternFunc( - "runtime.TVMArrayCreateView", + "runtime.TVMTensorCreateView", R.Callable( derive_func="tvm.relax.struct_info.infer_view_sinfo", purity=True, @@ -622,7 +622,7 @@ class Expected: @R.function def main(A: R.Tensor([4096], "uint8")): B = R.ExternFunc( - "runtime.TVMArrayCreateView", + "runtime.TVMTensorCreateView", R.Callable( derive_func="tvm.relax.struct_info.infer_view_sinfo", purity=True, @@ -634,7 +634,7 @@ def main(A: R.Tensor([4096], "uint8")): R.prim_value(0), ) C = R.ExternFunc( - "runtime.TVMArrayCreateView", + "runtime.TVMTensorCreateView", R.Callable( derive_func="tvm.relax.struct_info.infer_view_sinfo", purity=True, @@ -664,7 +664,7 @@ def main(A: R.Tensor([4096], "float32")): vm = tvm.relax.VirtualMachine(built, device=dev) np_input = np.random.random([4096]).astype("float32") - tvm_input = tvm.nd.array(np_input, dev) + tvm_input = tvm.runtime.tensor(np_input, dev) tvm_output = vm["main"](tvm_input) np_expected = np_input @@ -684,7 +684,7 @@ def main(A: R.Tensor([4096], "float32")): vm = tvm.relax.VirtualMachine(built, device=dev) np_input = np.random.random([4096]).astype("float32") - tvm_input = tvm.nd.array(np_input, dev) + tvm_input = tvm.runtime.tensor(np_input, dev) tvm_output = vm["main"](tvm_input) np_expected = np_input.reshape(64, 64) @@ -708,7 +708,7 @@ def main(A: R.Tensor([4096], "float32")): vm = tvm.relax.VirtualMachine(built, device=dev) np_input = np.random.random([4096]).astype("float32") - tvm_input = tvm.nd.array(np_input, dev) + tvm_input = tvm.runtime.tensor(np_input, dev) tvm_output = vm["main"](tvm_input) np_expected = np_input.reshape(64, 64)[32:48, :] @@ -728,7 +728,7 @@ def main(A: R.Tensor([4096], "float32")): vm = tvm.relax.VirtualMachine(built, device=dev) np_input = np.random.random([4096]).astype("float32") - tvm_input = tvm.nd.array(np_input, dev) + tvm_input = 
tvm.runtime.tensor(np_input, dev) tvm_output = vm["main"](tvm_input) np_expected = np_input.view("uint32") @@ -758,7 +758,7 @@ def main(A: R.Tensor([4096], "uint8")): vm = tvm.relax.VirtualMachine(built, device=dev) np_input = np.random.randint(0, 255, size=[4096]).astype("uint8") - tvm_input = tvm.nd.array(np_input, dev) + tvm_input = tvm.runtime.tensor(np_input, dev) tvm_output = vm["main"](tvm_input) np_expected = [ np_input[:2048].view("int32"), diff --git a/tests/python/relax/test_pipeline.py b/tests/python/relax/test_pipeline.py index 34d0ca9e36d2..f9bce3539645 100644 --- a/tests/python/relax/test_pipeline.py +++ b/tests/python/relax/test_pipeline.py @@ -40,8 +40,8 @@ def main(x: R.Tensor((3, 4), "float32"), y: R.Tensor((3, 4), "float32")): ex = tvm.compile(mod, target) x_np = np.random.rand(3, 4).astype(np.float32) y_np = np.random.rand(3, 4).astype(np.float32) - x = tvm.nd.array(x_np) - y = tvm.nd.array(y_np) + x = tvm.runtime.tensor(x_np) + y = tvm.runtime.tensor(y_np) vm = relax.VirtualMachine(ex, tvm.cpu()) z = vm["main"](x, y) @@ -106,8 +106,8 @@ def main( for i in range(num_steps): x_np = np.random.rand(1, 4).astype(np.float32) y_np = np.random.rand(1, 4).astype(np.float32) - x = tvm.nd.array(x_np) - y = tvm.nd.array(y_np) + x = tvm.runtime.tensor(x_np) + y = tvm.runtime.tensor(y_np) np_shape = (i + 1, 4) kv, kv_cache = vm["main"](x, y, tvm.runtime.ShapeTuple(np_shape), kv_cache) diff --git a/tests/python/relax/test_pytorch_integration.py b/tests/python/relax/test_pytorch_integration.py index 2f39f88475c9..6839906e7a28 100644 --- a/tests/python/relax/test_pytorch_integration.py +++ b/tests/python/relax/test_pytorch_integration.py @@ -181,7 +181,7 @@ def test_call_dps_packed_with_dynamic_function(self): # Define my_softmax function def my_softmax(tensor, dim): """Custom softmax function for testing call_dps_packed.""" - # Convert TVM NDArray to PyTorch tensor if needed + # Convert TVM Tensor to PyTorch tensor if needed if hasattr(tensor, "numpy"): tensor = torch.from_numpy(tensor.numpy()) return F.softmax(tensor, dim=dim) diff --git a/tests/python/relax/test_relax_operators.py b/tests/python/relax/test_relax_operators.py index c94dd9f5789d..221d7d1270a5 100644 --- a/tests/python/relax/test_relax_operators.py +++ b/tests/python/relax/test_relax_operators.py @@ -56,7 +56,7 @@ def run_cpu(mod, func_name, *args, exec_mode): def test_unique(exec_mode): # TODO(prakalp): also add test for compiling and running on cuda device. 
data_numpy = np.random.randint(0, 16, (16, 16)) - data = tvm.nd.array(data_numpy) + data = tvm.runtime.tensor(data_numpy) result, result_sorted = run_cpu(InputModule, "foo", data, exec_mode=exec_mode) expected_output_sorted, indices = np.unique(data_numpy, return_index=True) @@ -91,7 +91,7 @@ def test_print(exec_mode): run_cpu( PrintTest, "foo", - tvm.nd.array(np.array(1).astype("int32")), + tvm.runtime.tensor(np.array(1).astype("int32")), exec_mode=exec_mode, ) test_out.seek(0) @@ -108,7 +108,7 @@ def func(x: R.Tensor((), "int32")): _ = R.assert_op(relax.const(True)) return x - run_cpu(func, tvm.nd.array(np.array(1).astype("int32")), exec_mode=exec_mode) + run_cpu(func, tvm.runtime.tensor(np.array(1).astype("int32")), exec_mode=exec_mode) def test_assert_passes_with_format_args(exec_mode): @@ -117,7 +117,7 @@ def func(x: R.Tensor((), "int32")): _ = R.assert_op(relax.const(True), x, format="You won't see me") return x - run_cpu(func, tvm.nd.array(np.array(1).astype("int32")), exec_mode=exec_mode) + run_cpu(func, tvm.runtime.tensor(np.array(1).astype("int32")), exec_mode=exec_mode) def test_assert_fails(exec_mode): @@ -127,7 +127,7 @@ def func(x: R.Tensor((), "int32")): return x with pytest.raises(AssertionError, match="Assertion Failed"): - run_cpu(func, tvm.nd.array(np.array(1).astype("int32")), exec_mode=exec_mode) + run_cpu(func, tvm.runtime.tensor(np.array(1).astype("int32")), exec_mode=exec_mode) def test_assert_fails_with_message(exec_mode): @@ -137,7 +137,7 @@ def func(x: R.Tensor((), "int32")): return x with pytest.raises(AssertionError, match="I failed..."): - run_cpu(func, tvm.nd.array(np.array(1).astype("int32")), exec_mode=exec_mode) + run_cpu(func, tvm.runtime.tensor(np.array(1).astype("int32")), exec_mode=exec_mode) def test_assert_fails_with_args(exec_mode): @@ -147,7 +147,7 @@ def func(x: R.Tensor((), "int32")): return x with pytest.raises(AssertionError, match="5, 5"): - run_cpu(func, tvm.nd.array(np.array(5).astype("int32")), exec_mode=exec_mode) + run_cpu(func, tvm.runtime.tensor(np.array(5).astype("int32")), exec_mode=exec_mode) def test_assert_fails_with_formatted_args(exec_mode): @@ -157,7 +157,7 @@ def func(x: R.Tensor((), "int32")): return x with pytest.raises(AssertionError, match="Number: 6"): - run_cpu(func, tvm.nd.array(np.array(6).astype("int32")), exec_mode=exec_mode) + run_cpu(func, tvm.runtime.tensor(np.array(6).astype("int32")), exec_mode=exec_mode) def test_assert_on_argument_passes(exec_mode): @@ -166,8 +166,8 @@ def func(condition: R.Tensor((), "bool"), x: R.Tensor((), "int32")): _ = R.assert_op(condition) return x - condition = tvm.nd.array(np.array(True)) - x = tvm.nd.array(np.array(5).astype("int32")) + condition = tvm.runtime.tensor(np.array(True)) + x = tvm.runtime.tensor(np.array(5).astype("int32")) run_cpu(func, condition, x, exec_mode=exec_mode) @@ -177,8 +177,8 @@ def func(condition: R.Tensor((), "bool"), x: R.Tensor((), "int32")): _ = R.assert_op(condition) return x - condition = tvm.nd.array(np.array(False)) - x = tvm.nd.array(np.array(5).astype("int32")) + condition = tvm.runtime.tensor(np.array(False)) + x = tvm.runtime.tensor(np.array(5).astype("int32")) with pytest.raises(AssertionError): run_cpu(func, condition, x, exec_mode=exec_mode) @@ -190,7 +190,7 @@ def func(x: R.Tensor(["N"], "int32")): _ = R.assert_op(R.prim_value(N % 8 == 0)) return x - x = tvm.nd.array(np.arange(8, dtype="int32")) + x = tvm.runtime.tensor(np.arange(8, dtype="int32")) run_cpu(func, x, exec_mode=exec_mode) @@ -201,7 +201,7 @@ def func(x: R.Tensor(["N"], "int32")): 
_ = R.assert_op(R.prim_value(N % 8 == 0)) return x - x = tvm.nd.array(np.arange(10, dtype="int32")) + x = tvm.runtime.tensor(np.arange(10, dtype="int32")) with pytest.raises(AssertionError): run_cpu(func, x, exec_mode=exec_mode) @@ -238,14 +238,17 @@ def test_op_shape_of(exec_mode): assert const_shape == tvm.runtime.ShapeTuple([2, 2]) scalar_shape = run_cpu( - ShapeOfTest, "get_shape", tvm.nd.array(np.array(1, dtype="int32")), exec_mode=exec_mode + ShapeOfTest, + "get_shape", + tvm.runtime.tensor(np.array(1, dtype="int32")), + exec_mode=exec_mode, ) assert scalar_shape == tvm.runtime.ShapeTuple([]) tensor_shape = run_cpu( ShapeOfTest, "get_shape", - tvm.nd.array(np.zeros((1, 2, 3)).astype("int32")), + tvm.runtime.tensor(np.zeros((1, 2, 3)).astype("int32")), exec_mode=exec_mode, ) assert tensor_shape == tvm.runtime.ShapeTuple([1, 2, 3]) @@ -253,7 +256,7 @@ def test_op_shape_of(exec_mode): constrained_shape = run_cpu( ShapeOfTest, "get_constrained_shape", - tvm.nd.array(np.zeros((1,)).astype("int32")), + tvm.runtime.tensor(np.zeros((1,)).astype("int32")), exec_mode=exec_mode, ) assert constrained_shape == tvm.runtime.ShapeTuple([1]) @@ -283,25 +286,25 @@ def test_op_shape_to_tensor(exec_mode): out2d = run_cpu( ShapeToTensorTest, "const_shape", tvm.runtime.ShapeTuple([3, 2]), exec_mode=exec_mode ) - assert isinstance(out2d, tvm.runtime.ndarray.NDArray) + assert isinstance(out2d, tvm.runtime.Tensor) assert np.array_equal(out2d.numpy(), np.array([3, 2])) out3d = run_cpu( ShapeToTensorTest, "const_shape", tvm.runtime.ShapeTuple([3, 3, 2]), exec_mode=exec_mode ) - assert isinstance(out3d, tvm.runtime.ndarray.NDArray) + assert isinstance(out3d, tvm.runtime.Tensor) assert np.array_equal(out3d.numpy(), np.array([3, 3, 2])) out4d = run_cpu( ShapeToTensorTest, "const_shape", tvm.runtime.ShapeTuple([3, 3, 2, 2]), exec_mode=exec_mode ) - assert isinstance(out4d, tvm.runtime.ndarray.NDArray) + assert isinstance(out4d, tvm.runtime.Tensor) assert np.array_equal(out4d.numpy(), np.array([3, 3, 2, 2])) outs = run_cpu( ShapeToTensorTest, "symbolic_shape", tvm.runtime.ShapeTuple([3, 2]), exec_mode=exec_mode ) - assert isinstance(outs, tvm.runtime.ndarray.NDArray) + assert isinstance(outs, tvm.runtime.Tensor) assert np.array_equal(outs.numpy(), np.array([3, 2])) @@ -317,7 +320,7 @@ def pure_copy(x: R.Tensor((3, 4), "float32")): np.random.seed(0) # to avoid flakiness arr = np.random.rand(3, 4).astype("float32") - copy_found = run_cpu(CallPureTest, "pure_copy", tvm.nd.array(arr), exec_mode=exec_mode) + copy_found = run_cpu(CallPureTest, "pure_copy", tvm.runtime.tensor(arr), exec_mode=exec_mode) assert (copy_found.numpy() == arr).all() @@ -362,9 +365,9 @@ def inplace_add(x: R.Tensor((3, 4), "float32"), y: R.Tensor((3, 4), "float32")): arr_a = np.random.rand(3, 4).astype("float32") arr_b = np.random.rand(3, 4).astype("float32") sum = arr_a + arr_b - tvm_arr_a = tvm.nd.array(arr_a) + tvm_arr_a = tvm.runtime.tensor(arr_a) result = run_cpu( - CallInplaceAddTest, "inplace_add", tvm_arr_a, tvm.nd.array(arr_b), exec_mode=exec_mode + CallInplaceAddTest, "inplace_add", tvm_arr_a, tvm.runtime.tensor(arr_b), exec_mode=exec_mode ) assert result == tvm_arr_a assert (result.numpy() == sum).all() @@ -373,7 +376,7 @@ def inplace_add(x: R.Tensor((3, 4), "float32"), y: R.Tensor((3, 4), "float32")): def inplace_tuple_add(a, b): arr_a = a.numpy() arr_b = b.numpy() - c = tvm.nd.array(arr_a + arr_b) + c = tvm.runtime.tensor(arr_a + arr_b) for i in range(len(arr_a)): for j in range(len(arr_a[i])): arr_a[i][j] = arr_a[i][j] + arr_b[i][j] 
@@ -397,8 +400,8 @@ def inplace_tuple(x: R.Tensor((3, 4), "float32"), y: R.Tensor((3, 4), "float32") arr_a = np.random.rand(3, 4).astype("float32") arr_b = np.random.rand(3, 4).astype("float32") sum = arr_a + arr_b - tvm_arr_a = tvm.nd.array(arr_a) - tvm_arr_b = tvm.nd.array(arr_b) + tvm_arr_a = tvm.runtime.tensor(arr_a) + tvm_arr_b = tvm.runtime.tensor(arr_b) result = run_cpu(CallInplaceTuple, "inplace_tuple", tvm_arr_a, tvm_arr_b, exec_mode=exec_mode) assert result[0] == tvm_arr_a assert (result[0].numpy() == sum).all() @@ -422,7 +425,7 @@ def to_dev(x: R.Tensor((3, 4), "float32")): np.random.seed(0) # to avoid flakiness arr = np.random.rand(3, 4).astype("float32") - copy_found = run_cpu(CallToDevice, "to_dev", tvm.nd.array(arr), exec_mode=exec_mode) + copy_found = run_cpu(CallToDevice, "to_dev", tvm.runtime.tensor(arr), exec_mode=exec_mode) assert (copy_found.numpy() == arr).all() @@ -439,7 +442,7 @@ def to_vdev(x: R.Tensor((3, 4), "float32")): np.random.seed(0) arr = np.random.rand(3, 4).astype("float32") - copy_found = run_cpu(ToVDevice, "to_vdev", tvm.nd.array(arr), exec_mode=exec_mode) + copy_found = run_cpu(ToVDevice, "to_vdev", tvm.runtime.tensor(arr), exec_mode=exec_mode) assert (copy_found.numpy() == arr).all() @@ -454,10 +457,10 @@ def func(condition: R.Tensor((), "bool")): out = R.prim_value(10) return out - res = run_cpu(func, tvm.nd.array(np.array(True)), exec_mode=exec_mode) + res = run_cpu(func, tvm.runtime.tensor(np.array(True)), exec_mode=exec_mode) assert res == 5 - res = run_cpu(func, tvm.nd.array(np.array(False)), exec_mode=exec_mode) + res = run_cpu(func, tvm.runtime.tensor(np.array(False)), exec_mode=exec_mode) assert res == 10 @@ -491,10 +494,10 @@ def func(x: R.Tensor(["N"], "int64")): out = R.prim_value(10) return out - res = run_cpu(func, tvm.nd.array(np.arange(16)), exec_mode=exec_mode) + res = run_cpu(func, tvm.runtime.tensor(np.arange(16)), exec_mode=exec_mode) assert res == 5 - res = run_cpu(func, tvm.nd.array(np.arange(20)), exec_mode=exec_mode) + res = run_cpu(func, tvm.runtime.tensor(np.arange(20)), exec_mode=exec_mode) assert res == 10 diff --git a/tests/python/relax/test_runtime_builtin.py b/tests/python/relax/test_runtime_builtin.py index fb4c8abdf9e6..a3003459f89d 100644 --- a/tests/python/relax/test_runtime_builtin.py +++ b/tests/python/relax/test_runtime_builtin.py @@ -28,7 +28,7 @@ def test_make_shape(): MK = MakeShapeCode make_shape = tvm.get_global_func("vm.builtin.make_shape") - heap = tvm.nd.array(np.arange(10).astype("int64")) + heap = tvm.runtime.tensor(np.arange(10).astype("int64")) s = make_shape(heap, 3, MK.USE_IMM, 10, MK.LOAD_SHAPE, 0, MK.LOAD_SHAPE, 2) assert s == tvm.runtime.container.ShapeTuple([10, 0, 2]) @@ -37,12 +37,12 @@ def test_make_shape(): def test_match_shape(): MS = MatchShapeCode match_shape = tvm.get_global_func("vm.builtin.match_shape") - heap = tvm.nd.array(np.zeros(10).astype("int64")) + heap = tvm.runtime.tensor(np.zeros(10).astype("int64")) assert heap.numpy()[2] == 0 s = tvm.runtime.container.ShapeTuple([1, 2, 3]) - x = tvm.nd.array(np.zeros([1, 2, 3])) + x = tvm.runtime.tensor(np.zeros([1, 2, 3])) match_shape(s, heap, 3, MS.ASSERT_EQUAL_TO_IMM, 1, MS.STORE_TO_HEAP, 2, MS.NO_OP, 0, "") @@ -86,7 +86,7 @@ def test_check_shape_info(): def test_check_tensor_info(): check_tensor_info = tvm.get_global_func("vm.builtin.check_tensor_info") - x = tvm.nd.array(np.zeros((2, 3)).astype("int32")) + x = tvm.runtime.tensor(np.zeros((2, 3)).astype("int32")) check_tensor_info(x, 2, "int32", "") check_tensor_info(x, -1, "int32", "") 
@@ -116,7 +116,7 @@ def test_check_tensor_info(): def test_check_tuple_info(): check_tuple_info = tvm.get_global_func("vm.builtin.check_tuple_info") - x = tvm.nd.array(np.zeros((2, 3)).astype("int32")) + x = tvm.runtime.tensor(np.zeros((2, 3)).astype("int32")) t = tvm.runtime.convert([x, x, x]) check_tuple_info(t, 3, "") @@ -133,7 +133,7 @@ def test_check_tuple_info(): def test_check_func_info(): check_func_info = tvm.get_global_func("vm.builtin.check_func_info") f = tvm.runtime.convert(lambda x: x) - x = tvm.nd.array(np.zeros((2, 3)).astype("int32")) + x = tvm.runtime.tensor(np.zeros((2, 3)).astype("int32")) check_func_info(f, "") @@ -144,8 +144,8 @@ def test_check_func_info(): def test_tuple_getitem(): tuple_getitem = tvm.get_global_func("vm.builtin.tuple_getitem") - x = tvm.nd.array(np.zeros((2, 3)).astype("int32")) - y = tvm.nd.array(np.zeros((2, 3)).astype("int32")) + x = tvm.runtime.tensor(np.zeros((2, 3)).astype("int32")) + y = tvm.runtime.tensor(np.zeros((2, 3)).astype("int32")) t = tvm.runtime.convert([x, y]) assert tuple_getitem(t, 0) == x @@ -157,10 +157,10 @@ def test_attention_kv_cache(): fappend = tvm.get_global_func("vm.builtin.attention_kv_cache_append") fview = tvm.get_global_func("vm.builtin.attention_kv_cache_view") - cache = fcreate(tvm.nd.empty((1, 2), dtype="int32"), tvm.runtime.ShapeTuple([2, 2]), 0) + cache = fcreate(tvm.runtime.empty((1, 2), dtype="int32"), tvm.runtime.ShapeTuple([2, 2]), 0) num_steps = 2 for i in range(num_steps): - cache = fappend(cache, tvm.nd.array(i * np.ones((1, 2)).astype("int32"))) + cache = fappend(cache, tvm.runtime.tensor(i * np.ones((1, 2)).astype("int32"))) res = fview(cache, tvm.runtime.ShapeTuple((num_steps, 2))).numpy() for i in range(num_steps): @@ -168,8 +168,8 @@ def test_attention_kv_cache(): assert res[i][1] == i -def test_ndarray_cache(): - fload = tvm.get_global_func("vm.builtin.ndarray_cache.load") +def test_tensor_cache(): + fload = tvm.get_global_func("vm.builtin.tensor_cache.load") fget_params = tvm.get_global_func("vm.builtin.param_array_from_cache") param_dict = { @@ -178,7 +178,7 @@ def test_ndarray_cache(): } temp = utils.tempdir() - tvmjs.dump_ndarray_cache(param_dict, temp.path, encode_format="f32-to-bf16") + tvmjs.dump_tensor_cache(param_dict, temp.path, encode_format="f32-to-bf16") fload(str(temp.path), tvm.cpu().device_type, 0) res = fget_params("x", -1) for i, v in enumerate(res): @@ -188,8 +188,8 @@ def test_ndarray_cache(): np.testing.assert_allclose(v.numpy(), v_np, atol=1e-6, rtol=1e-6) -def test_ndarray_cache_update(): - fload = tvm.get_global_func("vm.builtin.ndarray_cache.load") +def test_tensor_cache_update(): + fload = tvm.get_global_func("vm.builtin.tensor_cache.load") fget_params = tvm.get_global_func("vm.builtin.param_array_from_cache") param_dict = { @@ -198,10 +198,10 @@ def test_ndarray_cache_update(): } temp = utils.tempdir() - tvmjs.dump_ndarray_cache(param_dict, temp.path, encode_format="f32-to-bf16") + tvmjs.dump_tensor_cache(param_dict, temp.path, encode_format="f32-to-bf16") param_dict["x_1"] = np.random.uniform(size=[10, 20]).astype("float32") param_dict["x_2"] = np.random.uniform(size=[10]).astype("float32") - tvmjs.dump_ndarray_cache( + tvmjs.dump_tensor_cache( param_dict, temp.path, encode_format="f32-to-bf16", update_if_exists=True ) fload(str(temp.path), tvm.cpu().device_type, 0) @@ -220,7 +220,7 @@ def test_attention_kv_cache_window_override(): current_pos = 4 cache = fcreate( - tvm.nd.array(np.full((16, 2), -1).astype("int32")), + tvm.runtime.tensor(np.full((16, 2), 
-1).astype("int32")), tvm.runtime.ShapeTuple([16, 2]), current_pos, ) @@ -230,7 +230,7 @@ def test_attention_kv_cache_window_override(): for i in range(1, num_steps): np_array = i * np.ones((i, 2)).astype("int32") np_all_arrays = np.concatenate((np_all_arrays, np_array), axis=0) - cache = foverride(cache, tvm.nd.array(np_array), 16) + cache = foverride(cache, tvm.runtime.tensor(np_array), 16) current_pos = (current_pos + i) % 16 res = fview(cache, tvm.runtime.ShapeTuple((16, 2))).numpy() @@ -252,7 +252,7 @@ def test_attention_kv_cache_window_override_with_sinks(): current_pos = 0 cache = fcreate( - tvm.nd.array(np.full((16, 2), -1).astype("int32")), + tvm.runtime.tensor(np.full((16, 2), -1).astype("int32")), tvm.runtime.ShapeTuple([16, 2]), current_pos, ) @@ -262,7 +262,7 @@ def test_attention_kv_cache_window_override_with_sinks(): for i in range(num_steps): np_array = i * np.ones((1, 2)).astype("int32") np_all_arrays = np.concatenate((np_all_arrays, np_array), axis=0) - cache = foverride(cache, tvm.nd.array(np_array), 16, num_attention_sinks) + cache = foverride(cache, tvm.runtime.tensor(np_array), 16, num_attention_sinks) if has_sink: current_pos = max((current_pos + 1) % 16, num_attention_sinks) diff --git a/tests/python/relax/test_runtime_builtin_paged_attention_kv_cache_cpu.py b/tests/python/relax/test_runtime_builtin_paged_attention_kv_cache_cpu.py index 1941edeaa715..970cf3826055 100644 --- a/tests/python/relax/test_runtime_builtin_paged_attention_kv_cache_cpu.py +++ b/tests/python/relax/test_runtime_builtin_paged_attention_kv_cache_cpu.py @@ -140,7 +140,7 @@ def set_global_func(head_dim, dtype): with target: mod = dl.ApplyDefaultSchedule(dl.gpu.Fallback())(mod) f = tvm.tir.build(mod["main"], target=target) - builts.append(f.entry_func) + builts.append(f.main) ( ftranspose_append, @@ -182,7 +182,7 @@ def create_kv_cache(head_dim, dtype, rope_mode, support_sliding_window): rope_scale, rope_theta, None, # rope_ext_factors - tvm.nd.empty((), dtype, device=device), + tvm.runtime.empty((), dtype, device=device), ftranspose_append, None, # f_transpose_append_mla ["tir", fattn_prefill_ragged], @@ -244,8 +244,8 @@ def verify_cached_kv(kv_cache, seq_ids, expected_k, expected_v): values_expected = expected_v[seq_id] assert keys_expected.shape == values_expected.shape seq_length = expected_k[seq_id].shape[1] - keys = tvm.nd.empty(keys_expected.shape, dtype=dtype, device=device) - values = tvm.nd.empty(values_expected.shape, dtype=dtype, device=device) + keys = tvm.runtime.empty(keys_expected.shape, dtype=dtype, device=device) + values = tvm.runtime.empty(values_expected.shape, dtype=dtype, device=device) fdebug_get_kv(kv_cache, seq_id, 0, seq_length, keys, values) tvm.testing.assert_allclose(keys.numpy(), keys_expected, rtol=1e-3, atol=1e-3) tvm.testing.assert_allclose(values.numpy(), values_expected, rtol=1e-3, atol=1e-3) @@ -395,8 +395,8 @@ def apply_attention( queries_np = global_new_q[layer_id] keys_np = global_new_k[layer_id] values_np = global_new_v[layer_id] - qkv = tvm.nd.array(np.concatenate([queries_np, keys_np, values_np], axis=1), device) - outputs = tvm.nd.empty(queries_np.shape, dtype, device=device) + qkv = tvm.runtime.tensor(np.concatenate([queries_np, keys_np, values_np], axis=1), device) + outputs = tvm.runtime.empty(queries_np.shape, dtype, device=device) fattention_with_fuse_qkv(kv_cache, layer_id, sm_scale, qkv, outputs) # Compute attention expected results. 
diff --git a/tests/python/relax/test_runtime_builtin_paged_attention_kv_cache_flashinfer.py b/tests/python/relax/test_runtime_builtin_paged_attention_kv_cache_flashinfer.py
index ffd345229200..dd29140e9bb2 100644
--- a/tests/python/relax/test_runtime_builtin_paged_attention_kv_cache_flashinfer.py
+++ b/tests/python/relax/test_runtime_builtin_paged_attention_kv_cache_flashinfer.py
@@ -156,7 +156,7 @@ def load_module(name: str, static_modules: List[tvm.runtime.Module]):
         with target:
             mod = dl.ApplyDefaultSchedule(dl.gpu.Fallback())(mod)
         f = tvm.tir.build(mod["main"], target=target)
-        builts.append(f.entry_func)
+        builts.append(f.main)
 
     (
         ftranspose_append,
@@ -192,7 +192,7 @@ def create_kv_cache(rope_mode):
         rope_scale,
         rope_theta,
         None,  # rope_ext_factors
-        tvm.nd.empty((), dtype, device=device),
+        tvm.runtime.empty((), dtype, device=device),
         ftranspose_append,
         None,  # f_transpose_append_mla
         ["flashinfer", fattention_prefill_ragged, fattention_prefill_ragged_plan],
@@ -224,8 +224,8 @@ def verify_cached_kv(kv_cache, seq_ids, expected_k, expected_v):
         values_expected = expected_v[seq_id]
         assert keys_expected.shape == values_expected.shape
         seq_length = expected_k[seq_id].shape[1]
-        keys = tvm.nd.empty(keys_expected.shape, dtype=dtype, device=device)
-        values = tvm.nd.empty(values_expected.shape, dtype=dtype, device=device)
+        keys = tvm.runtime.empty(keys_expected.shape, dtype=dtype, device=device)
+        values = tvm.runtime.empty(values_expected.shape, dtype=dtype, device=device)
         fdebug_get_kv(kv_cache, seq_id, 0, seq_length, keys, values)
         torch.testing.assert_close(
             torch.from_numpy(keys.numpy()).to(device_torch), keys_expected, rtol=1e-3, atol=1e-3
@@ -365,8 +365,10 @@ def apply_attention(
         queries_np = global_new_q[layer_id]
         keys_np = global_new_k[layer_id]
         values_np = global_new_v[layer_id]
-        qkv = tvm.nd.array(torch.cat([queries_np, keys_np, values_np], dim=1).cpu().numpy(), device)
-        outputs = tvm.nd.empty(queries_np.shape, dtype, device=device)
+        qkv = tvm.runtime.tensor(
+            torch.cat([queries_np, keys_np, values_np], dim=1).cpu().numpy(), device
+        )
+        outputs = tvm.runtime.empty(queries_np.shape, dtype, device=device)
         fattention_with_fuse_qkv(kv_cache, layer_id, sm_scale, qkv, outputs)
 
         # Compute attention expected results.
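
The KV-cache tests above also rename the compiled entry point from f.entry_func to f.main. A hedged sketch of the new access pattern, assuming tvm.tir.build accepts a bare PrimFunc as it does in these tests; the add_one kernel here is a hypothetical stand-in, not part of the patch:

    import tvm
    from tvm.script import tir as T

    @T.prim_func
    def add_one(A: T.Buffer((8,), "float32"), B: T.Buffer((8,), "float32")):
        for i in range(8):
            B[i] = A[i] + T.float32(1)

    target = tvm.target.Target("llvm", host="llvm")
    f = tvm.tir.build(add_one, target=target)
    # previously exposed as f.entry_func; callable like any packed function,
    # which is how the tests collect it into their builts list
    kernel = f.main
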
diff --git a/tests/python/relax/test_runtime_builtin_paged_attention_kv_cache_mla_flashinfer.py b/tests/python/relax/test_runtime_builtin_paged_attention_kv_cache_mla_flashinfer.py index 2f726064a71b..8253c379951a 100644 --- a/tests/python/relax/test_runtime_builtin_paged_attention_kv_cache_mla_flashinfer.py +++ b/tests/python/relax/test_runtime_builtin_paged_attention_kv_cache_mla_flashinfer.py @@ -169,7 +169,7 @@ def load_module(name: str, static_modules: List[tvm.runtime.Module]): with target: mod = dl.ApplyDefaultSchedule(dl.gpu.Fallback())(mod) f = tvm.tir.build(mod["main"], target=target) - builts.append(f.entry_func) + builts.append(f.main) ( ftranspose_append, @@ -218,7 +218,7 @@ def create_kv_cache(dtype): 1, 10000, None, # rope_ext_factors - tvm.nd.empty((), dtype, device=device), + tvm.runtime.empty((), dtype, device=device), None, # f_transpose_append_mha ftranspose_append, ["flashinfer", fattn_prefill_ragged, fattn_prefill_ragged_plan], # fattn_prefill_ragged @@ -251,7 +251,7 @@ def verify_cached_kv(kv_cache, seq_ids, expected_kv): for seq_id in seq_ids: kv_expected = expected_kv[seq_id] seq_length = expected_kv[seq_id].shape[1] - kv_actual = tvm.nd.empty(kv_expected.shape, dtype=dtype, device=device) + kv_actual = tvm.runtime.empty(kv_expected.shape, dtype=dtype, device=device) fdebug_get_kv(kv_cache, seq_id, 0, seq_length, kv_actual) torch.testing.assert_close( torch.from_numpy(kv_actual.numpy()).to(device_torch), kv_expected, rtol=1e-3, atol=1e-3 @@ -334,17 +334,17 @@ def apply_attention( is_decode_request = False for layer_id in range(num_layers): - queries = tvm.nd.array(global_new_q[layer_id].cpu().numpy(), device) - key_value = tvm.nd.array(global_new_kv[layer_id].cpu().numpy(), device) + queries = tvm.runtime.tensor(global_new_q[layer_id].cpu().numpy(), device) + key_value = tvm.runtime.tensor(global_new_kv[layer_id].cpu().numpy(), device) total_seq_length = global_new_q[layer_id].shape[0] - outputs1 = tvm.nd.empty( + outputs1 = tvm.runtime.empty( (total_seq_length, num_attention_heads, v_head_dim), dtype, device=device ) - lse1 = tvm.nd.empty((total_seq_length, num_attention_heads), "float32", device=device) - outputs2 = tvm.nd.empty( + lse1 = tvm.runtime.empty((total_seq_length, num_attention_heads), "float32", device=device) + outputs2 = tvm.runtime.empty( (total_seq_length, num_attention_heads, kv_lora_rank), dtype, device=device ) - lse2 = tvm.nd.empty((total_seq_length, num_attention_heads), "float32", device=device) + lse2 = tvm.runtime.empty((total_seq_length, num_attention_heads), "float32", device=device) fappend_mla_kv(kv_cache, layer_id, key_value) if not is_decode_request: @@ -361,8 +361,8 @@ def apply_attention( total_seq_length, num_attention_heads, qk_rope_head_dim ) keys = torch.cat([keys, k_pe_expanded], dim=2) - keys_tvm = tvm.nd.array(keys.cpu().numpy(), device) - values_tvm = tvm.nd.array(values.cpu().numpy(), device) + keys_tvm = tvm.runtime.tensor(keys.cpu().numpy(), device) + values_tvm = tvm.runtime.tensor(values.cpu().numpy(), device) fself_attn(kv_cache, layer_id, sm_scale, queries, keys_tvm, values_tvm, outputs1, lse1) if not all_new_sequences or is_decode_request: @@ -373,9 +373,9 @@ def apply_attention( queries_lora_np = torch.cat( [torch.bmm(queries_lora_np.permute(1, 0, 2), w_uk).permute(1, 0, 2), q_pe], dim=2 ) - queries_lora = tvm.nd.array(queries_lora_np.cpu().numpy(), device) + queries_lora = tvm.runtime.tensor(queries_lora_np.cpu().numpy(), device) fcross_attn(kv_cache, layer_id, sm_scale, queries_lora, outputs2, lse2) - 
cross_attn_output = tvm.nd.array( + cross_attn_output = tvm.runtime.tensor( torch.bmm( torch.from_numpy(outputs2.numpy()).to(device_torch).permute(1, 0, 2), w_uv ) diff --git a/tests/python/relax/test_runtime_builtin_paged_attention_kv_cache_mla_tir.py b/tests/python/relax/test_runtime_builtin_paged_attention_kv_cache_mla_tir.py index b2982abdb0a5..cc4ffb1d525b 100644 --- a/tests/python/relax/test_runtime_builtin_paged_attention_kv_cache_mla_tir.py +++ b/tests/python/relax/test_runtime_builtin_paged_attention_kv_cache_mla_tir.py @@ -134,7 +134,7 @@ def set_global_func(dtype): with target: mod = dl.ApplyDefaultSchedule(dl.gpu.Fallback())(mod) f = tvm.tir.build(mod["main"], target=target) - builts.append(f.entry_func) + builts.append(f.main) ( ftranspose_append, @@ -185,7 +185,7 @@ def create_kv_cache(dtype): 1, 10000, None, # rope_ext_factors - tvm.nd.empty((), dtype, device=device), + tvm.runtime.empty((), dtype, device=device), None, # f_transpose_append_mha ftranspose_append, ["tir", fmla_prefill_ragged], # fattn_prefill_ragged @@ -218,7 +218,7 @@ def verify_cached_kv(kv_cache, seq_ids, expected_kv): for seq_id in seq_ids: kv_expected = expected_kv[seq_id] seq_length = expected_kv[seq_id].shape[1] - kv_actual = tvm.nd.empty(kv_expected.shape, dtype=dtype, device=device) + kv_actual = tvm.runtime.empty(kv_expected.shape, dtype=dtype, device=device) fdebug_get_kv(kv_cache, seq_id, 0, seq_length, kv_actual) torch.testing.assert_close( torch.from_numpy(kv_actual.numpy()).to(device_torch), kv_expected, rtol=1e-3, atol=1e-3 @@ -301,17 +301,17 @@ def apply_attention( is_decode_request = False for layer_id in range(num_layers): - queries = tvm.nd.array(global_new_q[layer_id].cpu().numpy(), device) - key_value = tvm.nd.array(global_new_kv[layer_id].cpu().numpy(), device) + queries = tvm.runtime.tensor(global_new_q[layer_id].cpu().numpy(), device) + key_value = tvm.runtime.tensor(global_new_kv[layer_id].cpu().numpy(), device) total_seq_length = global_new_q[layer_id].shape[0] - outputs1 = tvm.nd.empty( + outputs1 = tvm.runtime.empty( (total_seq_length, num_attention_heads, v_head_dim), dtype, device=device ) - lse1 = tvm.nd.empty((total_seq_length, num_attention_heads), "float32", device=device) - outputs2 = tvm.nd.empty( + lse1 = tvm.runtime.empty((total_seq_length, num_attention_heads), "float32", device=device) + outputs2 = tvm.runtime.empty( (total_seq_length, num_attention_heads, kv_lora_rank), dtype, device=device ) - lse2 = tvm.nd.empty((total_seq_length, num_attention_heads), "float32", device=device) + lse2 = tvm.runtime.empty((total_seq_length, num_attention_heads), "float32", device=device) fappend_mla_kv(kv_cache, layer_id, key_value) if not is_decode_request: @@ -328,8 +328,8 @@ def apply_attention( total_seq_length, num_attention_heads, qk_rope_head_dim ) keys = torch.cat([keys, k_pe_expanded], dim=2) - keys_tvm = tvm.nd.array(keys.cpu().numpy(), device) - values_tvm = tvm.nd.array(values.cpu().numpy(), device) + keys_tvm = tvm.runtime.tensor(keys.cpu().numpy(), device) + values_tvm = tvm.runtime.tensor(values.cpu().numpy(), device) fself_attn(kv_cache, layer_id, sm_scale, queries, keys_tvm, values_tvm, outputs1, lse1) if not all_new_sequences or is_decode_request: @@ -340,9 +340,9 @@ def apply_attention( queries_lora_np = torch.cat( [torch.bmm(queries_lora_np.permute(1, 0, 2), w_uk).permute(1, 0, 2), q_pe], dim=2 ) - queries_lora = tvm.nd.array(queries_lora_np.cpu().numpy(), device) + queries_lora = tvm.runtime.tensor(queries_lora_np.cpu().numpy(), device) fcross_attn(kv_cache, 
layer_id, sm_scale, queries_lora, outputs2, lse2) - cross_attn_output = tvm.nd.array( + cross_attn_output = tvm.runtime.tensor( torch.bmm( torch.from_numpy(outputs2.numpy()).to(device_torch).permute(1, 0, 2), w_uv ) diff --git a/tests/python/relax/test_runtime_builtin_paged_attention_kv_cache_tir.py b/tests/python/relax/test_runtime_builtin_paged_attention_kv_cache_tir.py index 8cd3a737402e..b80bd1acb7b7 100644 --- a/tests/python/relax/test_runtime_builtin_paged_attention_kv_cache_tir.py +++ b/tests/python/relax/test_runtime_builtin_paged_attention_kv_cache_tir.py @@ -142,7 +142,7 @@ def set_global_func(head_dim, dtype): with target: mod = dl.ApplyDefaultSchedule(dl.gpu.Fallback())(mod) f = tvm.tir.build(mod["main"], target=target) - builts.append(f.entry_func) + builts.append(f.main) ( ftranspose_append, @@ -184,7 +184,7 @@ def create_kv_cache(head_dim, dtype, rope_mode, support_sliding_window): rope_scale, rope_theta, None, # rope_ext_factors - tvm.nd.empty((), dtype, device=device), + tvm.runtime.empty((), dtype, device=device), ftranspose_append, None, # f_transpose_append_mla ["tir", fattn_prefill_ragged], @@ -235,8 +235,8 @@ def verify_cached_kv(kv_cache, seq_ids, expected_k, expected_v): values_expected = expected_v[seq_id] assert keys_expected.shape == values_expected.shape seq_length = expected_k[seq_id].shape[1] - keys = tvm.nd.empty(keys_expected.shape, dtype=dtype, device=device) - values = tvm.nd.empty(values_expected.shape, dtype=dtype, device=device) + keys = tvm.runtime.empty(keys_expected.shape, dtype=dtype, device=device) + values = tvm.runtime.empty(values_expected.shape, dtype=dtype, device=device) fdebug_get_kv(kv_cache, seq_id, 0, seq_length, keys, values) torch.testing.assert_close( torch.from_numpy(keys.numpy()).to(device_torch), keys_expected, rtol=1e-3, atol=1e-3 @@ -428,8 +428,10 @@ def apply_attention( queries_np = global_new_q[layer_id] keys_np = global_new_k[layer_id] values_np = global_new_v[layer_id] - qkv = tvm.nd.array(torch.cat([queries_np, keys_np, values_np], dim=1).cpu().numpy(), device) - outputs = tvm.nd.empty(queries_np.shape, dtype, device=device) + qkv = tvm.runtime.tensor( + torch.cat([queries_np, keys_np, values_np], dim=1).cpu().numpy(), device + ) + outputs = tvm.runtime.empty(queries_np.shape, dtype, device=device) fattention_with_fuse_qkv(kv_cache, layer_id, sm_scale, qkv, outputs) # Compute attention expected results. 
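
The MLA attention tests above always move torch data into the runtime through NumPy. A small sketch of that round trip using only the renamed constructors from this patch; shapes and values are illustrative:

    import torch
    import tvm

    dev = tvm.cpu(0)
    t = torch.rand(4, 8, dtype=torch.float32)
    # torch -> TVM: detach to numpy first, as the tests do
    x = tvm.runtime.tensor(t.cpu().numpy(), dev)
    # TVM -> torch: back through numpy for comparison
    torch.testing.assert_close(torch.from_numpy(x.numpy()), t)
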
diff --git a/tests/python/relax/test_runtime_builtin_rnn_state.py b/tests/python/relax/test_runtime_builtin_rnn_state.py index 095aba8b83e5..515c6ee648ff 100644 --- a/tests/python/relax/test_runtime_builtin_rnn_state.py +++ b/tests/python/relax/test_runtime_builtin_rnn_state.py @@ -81,7 +81,7 @@ def _build(tir_func): with target: mod = dl.ApplyDefaultSchedule(dl.gpu.Fallback())(mod) # pylint: disable=not-callable f = tvm.tir.build(mod["main"], target=target) - return f.entry_func + return f.main _f_tir_gets, _f_tir_sets = [], [] for state in states: @@ -95,7 +95,10 @@ def _build(tir_func): def create_rnn_state(): f_create = tvm.get_global_func("vm.builtin.rnn_state_create") - init_values = [tvm.nd.array(np_zero, device=device), tvm.nd.array(np_one, device=device)] + init_values = [ + tvm.runtime.tensor(np_zero, device=device), + tvm.runtime.tensor(np_one, device=device), + ] return f_create(num_layers, reserved_nseq, max_history, f_tir_gets, f_tir_sets, init_values) @@ -119,8 +122,8 @@ def test_rnn_state_get(rnn_state): # pylint: disable=redefined-outer-name f_clear(state) f_add_sequence(state, 0) f_begin_forward(state, ShapeTuple([0]), ShapeTuple([1])) - tvm_nd_0 = tvm.nd.array(np.empty((1, 16, 16), "float16"), device=device) - tvm_nd_1 = tvm.nd.array(np.empty((1, 32, 32), "float32"), device=device) + tvm_nd_0 = tvm.runtime.tensor(np.empty((1, 16, 16), "float16"), device=device) + tvm_nd_1 = tvm.runtime.tensor(np.empty((1, 32, 32), "float32"), device=device) f_get(state, 0, 0, tvm_nd_0) f_get(state, 0, 1, tvm_nd_1) f_end_forward(state) @@ -136,8 +139,8 @@ def test_rnn_state_set(rnn_state): # pylint: disable=redefined-outer-name f_add_sequence(state, seq_id) f_begin_forward(state, ShapeTuple([0, 2]), ShapeTuple([1, 1])) - f_set(state, 0, 0, tvm.nd.array(np.full((2, 16, 16), 2.0, "float16"), device=device)) - f_set(state, 0, 1, tvm.nd.array(np.full((2, 32, 32), 3.0, "float32"), device=device)) + f_set(state, 0, 0, tvm.runtime.tensor(np.full((2, 16, 16), 2.0, "float16"), device=device)) + f_set(state, 0, 1, tvm.runtime.tensor(np.full((2, 32, 32), 3.0, "float32"), device=device)) f_end_forward(state) expected_values = [[np_two, np_three], [np_zero, np_one], [np_two, np_three]] @@ -151,8 +154,8 @@ def test_rnn_state_popn(rnn_state): # pylint: disable=redefined-outer-name f_add_sequence(state, 0) f_begin_forward(state, ShapeTuple([0]), ShapeTuple([1])) - f_set(state, 0, 0, tvm.nd.array(np_two.reshape(1, 16, 16), device=device)) - f_set(state, 0, 1, tvm.nd.array(np_three.reshape(1, 32, 32), device=device)) + f_set(state, 0, 0, tvm.runtime.tensor(np_two.reshape(1, 16, 16), device=device)) + f_set(state, 0, 1, tvm.runtime.tensor(np_three.reshape(1, 32, 32), device=device)) f_end_forward(state) verify_state(state, [0], [[np_two, np_three]]) @@ -169,8 +172,8 @@ def test_rnn_state_fork_sequence(rnn_state): # pylint: disable=redefined-outer- f_add_sequence(state, 0) f_begin_forward(state, ShapeTuple([0]), ShapeTuple([1])) - f_set(state, 0, 0, tvm.nd.array(np_two.reshape(1, 16, 16), device=device)) - f_set(state, 0, 1, tvm.nd.array(np_three.reshape(1, 32, 32), device=device)) + f_set(state, 0, 0, tvm.runtime.tensor(np_two.reshape(1, 16, 16), device=device)) + f_set(state, 0, 1, tvm.runtime.tensor(np_three.reshape(1, 32, 32), device=device)) f_end_forward(state) f_fork_sequence(state, 0, 1, -1) verify_state(state, [0, 1], [[np_two, np_three], [np_two, np_three]]) diff --git a/tests/python/relax/test_runtime_sampling_flashinfer.py b/tests/python/relax/test_runtime_sampling_flashinfer.py index 
dc3a3c86e69a..8dcd7bf61289 100644 --- a/tests/python/relax/test_runtime_sampling_flashinfer.py +++ b/tests/python/relax/test_runtime_sampling_flashinfer.py @@ -51,8 +51,8 @@ def load_module(name: str, static_modules: List[tvm.runtime.Module]): probs_np = np.array([[0.1, 0.2, 0.3, 0.2, 0.2] for _ in range(batch_size)], dtype="float32") dev = tvm.cuda(0) - prob_tvm = tvm.nd.array(probs_np, device=dev) - output_tvm = tvm.nd.empty((batch_size,), "int32", device=dev) + prob_tvm = tvm.runtime.tensor(probs_np, device=dev) + output_tvm = tvm.runtime.empty((batch_size,), "int32", device=dev) device = tvm.cuda() target = tvm.target.Target.from_device(device) diff --git a/tests/python/relax/test_tir_call_source_kernel.py b/tests/python/relax/test_tir_call_source_kernel.py index d7ca2a672b55..4061da3a9c2e 100644 --- a/tests/python/relax/test_tir_call_source_kernel.py +++ b/tests/python/relax/test_tir_call_source_kernel.py @@ -92,8 +92,8 @@ def add(x_handle: T.handle, y_handle: T.handle, output_handle: T.handle): assert len(Module.get_attr("external_mods")) == 1 device = tvm.cuda(0) - x_nd = tvm.nd.array(np.random.rand(256).astype(np.float32), device) - y_nd = tvm.nd.array(np.random.rand(256).astype(np.float32), device) + x_nd = tvm.runtime.tensor(np.random.rand(256).astype(np.float32), device) + y_nd = tvm.runtime.tensor(np.random.rand(256).astype(np.float32), device) output_np = x_nd.numpy() + y_nd.numpy() with tvm.target.Target("cuda"): diff --git a/tests/python/relax/test_training_optimizer_numeric.py b/tests/python/relax/test_training_optimizer_numeric.py index 6a9c34a5fb94..f2106ea2c2e7 100644 --- a/tests/python/relax/test_training_optimizer_numeric.py +++ b/tests/python/relax/test_training_optimizer_numeric.py @@ -37,7 +37,7 @@ def _legalize_and_build(mod: IRModule, target, dev): def _numpy_to_tvm(data): if isinstance(data, (list, tuple)): return [_numpy_to_tvm(_data) for _data in data] - return tvm.nd.array(data) + return tvm.runtime.tensor(data) def _tvm_to_numpy(data): diff --git a/tests/python/relax/test_transform_bind_params.py b/tests/python/relax/test_transform_bind_params.py index 2e9845f73f40..c46701d33a85 100644 --- a/tests/python/relax/test_transform_bind_params.py +++ b/tests/python/relax/test_transform_bind_params.py @@ -53,8 +53,8 @@ def main( x_np = np.random.rand(16, 16).astype(np.float32) w_np = np.random.rand(16, 16).astype(np.float32) - x_tvm = tvm.nd.array(x_np) - w_tvm = tvm.nd.array(w_np) + x_tvm = tvm.runtime.tensor(x_np) + w_tvm = tvm.runtime.tensor(w_np) params_dict = {"w": w_np if use_np_array else w_tvm} mod = relax.transform.BindParams("main", params_dict)(InputModule) assert len(mod["main"].params) == 1 @@ -97,10 +97,10 @@ def main( return out m, n, k = 4, 6, 8 - w0_tvm = tvm.nd.array(np.random.rand(n, m).astype(np.float32)) - b0_tvm = tvm.nd.array(np.random.rand(n).astype(np.float32)) - w1_tvm = tvm.nd.array(np.random.rand(k, n).astype(np.float32)) - b1_tvm = tvm.nd.array(np.random.rand(k).astype(np.float32)) + w0_tvm = tvm.runtime.tensor(np.random.rand(n, m).astype(np.float32)) + b0_tvm = tvm.runtime.tensor(np.random.rand(n).astype(np.float32)) + w1_tvm = tvm.runtime.tensor(np.random.rand(k, n).astype(np.float32)) + b1_tvm = tvm.runtime.tensor(np.random.rand(k).astype(np.float32)) params_dict = {"w0": w0_tvm, "b0": b0_tvm, "w1": w1_tvm, "b1": b1_tvm} mod = relax.transform.BindParams("main", params_dict)(Before) diff --git a/tests/python/relax/test_transform_codegen_pass.py b/tests/python/relax/test_transform_codegen_pass.py index b997eb9c6bc0..dbddc60f8cd9 100644 
--- a/tests/python/relax/test_transform_codegen_pass.py +++ b/tests/python/relax/test_transform_codegen_pass.py @@ -106,8 +106,8 @@ def setup_test(): np0 = np.random.rand(16, 16).astype(np.float32) np1 = np.random.rand(16, 16).astype(np.float32) - data0 = tvm.nd.array(np0, dev) - data1 = tvm.nd.array(np1, dev) + data0 = tvm.runtime.tensor(np0, dev) + data1 = tvm.runtime.tensor(np1, dev) inputs = [data0, data1] # Ground truth should be generated before annotation diff --git a/tests/python/relax/test_transform_cse.py b/tests/python/relax/test_transform_cse.py index bb10704acbb7..5b12480e253c 100644 --- a/tests/python/relax/test_transform_cse.py +++ b/tests/python/relax/test_transform_cse.py @@ -63,8 +63,8 @@ def foo() -> R.Tuple(R.Tensor((), dtype="int32"), R.Tensor((2, 2), dtype="int32" lv0 = R.add(R.const(1, dtype="int32"), R.const(1, dtype="int32")) # we expect to bind the repeated large constants lv1 = R.add( - R.const(tvm.nd.array(np.zeros((2, 2), dtype="int32"))), - R.const(tvm.nd.array(np.zeros((2, 2), dtype="int32"))), + R.const(tvm.runtime.tensor(np.zeros((2, 2), dtype="int32"))), + R.const(tvm.runtime.tensor(np.zeros((2, 2), dtype="int32"))), ) gv = (lv0, lv1) R.output(gv) @@ -77,8 +77,8 @@ def foo() -> R.Tuple(R.Tensor((), dtype="int32"), R.Tensor((2, 2), dtype="int32" with R.dataflow(): lv0 = R.add(R.const(1, dtype="int32"), R.const(1, dtype="int32")) lv1 = R.add( - R.const(tvm.nd.array(np.zeros((2, 2), dtype="int32"))), - R.const(tvm.nd.array(np.zeros((2, 2), dtype="int32"))), + R.const(tvm.runtime.tensor(np.zeros((2, 2), dtype="int32"))), + R.const(tvm.runtime.tensor(np.zeros((2, 2), dtype="int32"))), ) gv = (lv0, lv1) R.output(gv) diff --git a/tests/python/relax/test_transform_few_shot_tuning.py b/tests/python/relax/test_transform_few_shot_tuning.py index c640deee5496..e769c911a3f0 100644 --- a/tests/python/relax/test_transform_few_shot_tuning.py +++ b/tests/python/relax/test_transform_few_shot_tuning.py @@ -343,7 +343,7 @@ def _expected_results( func = func.with_attr("global_symbol", "main") rt_mod = tvm.compile(func, target="llvm") data = [ - tvm.nd.array(x) + tvm.runtime.tensor(x) for x in [ *inputs, np.zeros(output_shape, dtype=output_dtype), @@ -359,7 +359,7 @@ def _actual_results( target = _target() actual_rt_mod = tvm.compile(actual, target=target) actual_data = [ - tvm.nd.array(x, device=tvm.cuda() if target.kind.name == "cuda" else tvm.cpu()) + tvm.runtime.tensor(x, device=tvm.cuda() if target.kind.name == "cuda" else tvm.cpu()) for x in [ *inputs, np.zeros(output_shape, dtype=output_dtype), diff --git a/tests/python/relax/test_transform_fold_batch_norm_to_conv2d.py b/tests/python/relax/test_transform_fold_batch_norm_to_conv2d.py index 4b17829fa0d7..d47fa1166510 100644 --- a/tests/python/relax/test_transform_fold_batch_norm_to_conv2d.py +++ b/tests/python/relax/test_transform_fold_batch_norm_to_conv2d.py @@ -70,13 +70,13 @@ def test_fold_batchnorm_info_conv2d(): mod_fold = get_conv2d_batchnorm_sample() target = tvm.target.Target("llvm", host="llvm") - data_in = tvm.nd.array(np.random.rand(1, 3, 224, 224).astype(np.float32)) + data_in = tvm.runtime.tensor(np.random.rand(1, 3, 224, 224).astype(np.float32)) - weight_data = tvm.nd.array(np.random.rand(32, 3, 3, 3).astype(np.float32)) - gamma_data = tvm.nd.array(np.random.rand(32).astype(np.float32)) - beta_data = tvm.nd.array(np.random.rand(32).astype(np.float32)) - mean_data = tvm.nd.array(np.random.rand(32).astype(np.float32)) - variance_data = tvm.nd.array(np.random.rand(32).astype(np.float32)) + weight_data = 
tvm.runtime.tensor(np.random.rand(32, 3, 3, 3).astype(np.float32)) + gamma_data = tvm.runtime.tensor(np.random.rand(32).astype(np.float32)) + beta_data = tvm.runtime.tensor(np.random.rand(32).astype(np.float32)) + mean_data = tvm.runtime.tensor(np.random.rand(32).astype(np.float32)) + variance_data = tvm.runtime.tensor(np.random.rand(32).astype(np.float32)) params_np = { "weight": weight_data, "gamma": gamma_data, @@ -121,11 +121,11 @@ def visit_call_(self, call: relax.Call) -> None: # pylint: disable=arguments-re def test_fold_batchnorm_info_conv2d_transform(): mod = get_conv2d_batchnorm_sample() mod = relax.transform.FoldBatchnormToConv2D()(mod) - weight_data = tvm.nd.array(np.random.rand(32, 3, 3, 3).astype(np.float32)) - gamma_data = tvm.nd.array(np.random.rand(32).astype(np.float32)) - beta_data = tvm.nd.array(np.random.rand(32).astype(np.float32)) - mean_data = tvm.nd.array(np.random.rand(32).astype(np.float32)) - variance_data = tvm.nd.array(np.random.rand(32).astype(np.float32)) + weight_data = tvm.runtime.tensor(np.random.rand(32, 3, 3, 3).astype(np.float32)) + gamma_data = tvm.runtime.tensor(np.random.rand(32).astype(np.float32)) + beta_data = tvm.runtime.tensor(np.random.rand(32).astype(np.float32)) + mean_data = tvm.runtime.tensor(np.random.rand(32).astype(np.float32)) + variance_data = tvm.runtime.tensor(np.random.rand(32).astype(np.float32)) params_np = { "weight": weight_data, "gamma": gamma_data, diff --git a/tests/python/relax/test_transform_fold_constant.py b/tests/python/relax/test_transform_fold_constant.py index 9f2e3a4a092d..c62a01768eec 100644 --- a/tests/python/relax/test_transform_fold_constant.py +++ b/tests/python/relax/test_transform_fold_constant.py @@ -38,7 +38,7 @@ def gen_mod(mod, name, binding): The const parameter bindings """ funcs = {} - binding = {k: tvm.nd.array(v) for k, v in binding.items()} + binding = {k: tvm.runtime.tensor(v) for k, v in binding.items()} for k, v in mod.functions.items(): if isinstance(v, tvm.relax.Function): @@ -431,12 +431,14 @@ def expected( ) -> R.Tensor((1, 1), dtype="int64"): return new_shape - before = gen_mod(Module, "before", {"indices": tvm.nd.array(np.array([0]).astype("int64"))}) + before = gen_mod( + Module, "before", {"indices": tvm.runtime.tensor(np.array([0]).astype("int64"))} + ) after = relax.transform.FoldConstant()(before) np_take = np.take([5, 4, 3, 2], [0], axis=0) np_expand = np.expand_dims(np_take, axis=[0]) np_concat = np.concatenate([np_expand], axis=0) - expected = gen_mod(Module, "expected", {"new_shape": tvm.nd.array(np_concat)}) + expected = gen_mod(Module, "expected", {"new_shape": tvm.runtime.tensor(np_concat)}) tvm.ir.assert_structural_equal(after, expected) diff --git a/tests/python/relax/test_transform_gradient_numeric.py b/tests/python/relax/test_transform_gradient_numeric.py index 70d6da8d7109..3b1d1dcefee4 100644 --- a/tests/python/relax/test_transform_gradient_numeric.py +++ b/tests/python/relax/test_transform_gradient_numeric.py @@ -24,7 +24,7 @@ def rand(dtype, *shape): - return tvm.nd.array(np.random.rand(*shape).astype(dtype)) + return tvm.runtime.tensor(np.random.rand(*shape).astype(dtype)) def _legalize_and_build(mod, target, dev): @@ -118,7 +118,9 @@ def test_mlp_blockbuilder(target, dev): for arg in After["MLP_adjoint"].params: shape = [int(l) for l in arg.struct_info.shape] if arg.struct_info.dtype == "int64": - args.append(tvm.nd.array(np.random.randint(0, out_size, size=shape).astype(np.int64))) + args.append( + tvm.runtime.tensor(np.random.randint(0, out_size, 
size=shape).astype(np.int64)) + ) else: # float32 args.append(rand("float32", *shape)) @@ -127,7 +129,7 @@ def test_mlp_blockbuilder(target, dev): _, grad = vm_after["MLP_adjoint"](*args) def func(*inputs): - loss = vm_before["MLP"](args[0], *[tvm.nd.array(i) for i in inputs], args[-1]) + loss = vm_before["MLP"](args[0], *[tvm.runtime.tensor(i) for i in inputs], args[-1]) return loss.numpy() check_numerical_grads(func, [i.numpy() for i in args[1:-1]], [i.numpy() for i in grad]) @@ -183,7 +185,7 @@ def main(x: R.Tensor((6,), "float32"), y: R.Tensor((6, 3, 4), "float32")): _, grad = vm_after["main_adjoint"](*args) def func(*inputs): - loss = vm_before["main"](*[tvm.nd.array(i) for i in inputs]) + loss = vm_before["main"](*[tvm.runtime.tensor(i) for i in inputs]) return loss.numpy() check_numerical_grads(func, [i.numpy() for i in args], [i.numpy() for i in grad]) @@ -220,7 +222,7 @@ def main(x: R.Tensor((3, 3), "float32"), y: R.Tensor((3, 3), "float32")): _, grad = vm_after["main_adjoint"](*args) def func(*inputs): - loss = vm_before["main"](*[tvm.nd.array(i) for i in inputs]) + loss = vm_before["main"](*[tvm.runtime.tensor(i) for i in inputs]) return loss.numpy() check_numerical_grads(func, [i.numpy() for i in args], [i.numpy() for i in grad]) diff --git a/tests/python/relax/test_transform_lazy_transform_params.py b/tests/python/relax/test_transform_lazy_transform_params.py index 25d483fc449c..696499121072 100644 --- a/tests/python/relax/test_transform_lazy_transform_params.py +++ b/tests/python/relax/test_transform_lazy_transform_params.py @@ -662,7 +662,7 @@ def transform_params( @tvm.register_func("get_item", override=True) def get_item(i): - return tvm.nd.array(params[i], dev) + return tvm.runtime.tensor(params[i], dev) @tvm.register_func("set_item", override=True) def set_item(i, value): diff --git a/tests/python/relax/test_transform_to_mixed_precision.py b/tests/python/relax/test_transform_to_mixed_precision.py index 658f80a06ec5..4e90216f9bc0 100644 --- a/tests/python/relax/test_transform_to_mixed_precision.py +++ b/tests/python/relax/test_transform_to_mixed_precision.py @@ -836,7 +836,7 @@ def main( "w2": np.random.uniform(size=(4, 4, 1, 1)).astype("float16"), "w3": np.random.uniform(size=(4,)).astype("float16"), } - binding = {k: tvm.nd.array(v) for k, v in binding.items()} + binding = {k: tvm.runtime.tensor(v) for k, v in binding.items()} Input = relax.transform.BindParams("main", binding)(Input) Expected = relax.transform.BindParams("main", binding)(Expected) Expected2 = relax.transform.BindParams("main", binding)(Expected2) @@ -975,7 +975,7 @@ def main( "w": np.random.uniform(size=(512, 4, 3, 3)).astype("float32"), "bias": np.random.uniform(size=(512,)).astype("float32"), } - binding = {k: tvm.nd.array(v) for k, v in binding_np.items()} + binding = {k: tvm.runtime.tensor(v) for k, v in binding_np.items()} Input_bound = relax.transform.BindParams("main", binding)(Input) Expected = relax.transform.BindParams("main", binding)(Expected) @@ -983,7 +983,7 @@ def main( _assert_test(Input_bound, expected2=Expected) binding_np["bias"][0] = 70000 # Out of fp16 range - binding = {k: tvm.nd.array(v) for k, v in binding_np.items()} + binding = {k: tvm.runtime.tensor(v) for k, v in binding_np.items()} Input_bound = relax.transform.BindParams("main", binding)(Input) Expected_no_bias_cast = relax.transform.BindParams("main", binding)(Expected_no_bias_cast) diff --git a/tests/python/relax/test_vm_alloc_storage_with_scope.py b/tests/python/relax/test_vm_alloc_storage_with_scope.py index 
ec6696000429..3839ae123406 100644 --- a/tests/python/relax/test_vm_alloc_storage_with_scope.py +++ b/tests/python/relax/test_vm_alloc_storage_with_scope.py @@ -67,7 +67,7 @@ def test_alloc_storage_with_scope_global(): dev = tvm.cpu() # This is the important line which tests nd allocator vm_rt = relax.VirtualMachine(lib, dev, memory_cfg="naive") - x = tvm.nd.array(arg0, dev) + x = tvm.runtime.tensor(arg0, dev) vm_rt.set_input("main", x) vm_rt.invoke_stateful("main") output = vm_rt.get_outputs("main").numpy() diff --git a/tests/python/relax/test_vm_build.py b/tests/python/relax/test_vm_build.py index da8b905193fc..e29d486584e2 100644 --- a/tests/python/relax/test_vm_build.py +++ b/tests/python/relax/test_vm_build.py @@ -52,8 +52,8 @@ def foo(x: R.Tensor((3, 4), "float32"), y: R.Tensor((3, 4), "float32")): mod = TestVMCompileStage0 target = tvm.target.Target("llvm", host="llvm") ex = relax.build(mod, target, exec_mode=exec_mode) - inp1 = tvm.nd.array(np.random.rand(3, 4).astype(np.float32)) - inp2 = tvm.nd.array(np.random.rand(3, 4).astype(np.float32)) + inp1 = tvm.runtime.tensor(np.random.rand(3, 4).astype(np.float32)) + inp2 = tvm.runtime.tensor(np.random.rand(3, 4).astype(np.float32)) vm = relax.VirtualMachine(ex, tvm.cpu()) vm["foo"](inp1, inp2) tvm.testing.assert_allclose(inp2.numpy(), inp1.numpy(), rtol=1e-7, atol=1e-7) @@ -72,8 +72,8 @@ def foo(x: R.Tensor((3, 4), "float32"), y: R.Tensor((3, 4), "float32")): return y ex = relax.build(mod, exec_mode=exec_mode) - inp1 = tvm.nd.array(np.random.rand(3, 4).astype(np.float32)) - inp2 = tvm.nd.array(np.random.rand(3, 4).astype(np.float32)) + inp1 = tvm.runtime.tensor(np.random.rand(3, 4).astype(np.float32)) + inp2 = tvm.runtime.tensor(np.random.rand(3, 4).astype(np.float32)) vm = relax.VirtualMachine(ex, tvm.cpu()) vm["foo"](inp1, inp2) tvm.testing.assert_allclose(inp2.numpy(), inp1.numpy(), rtol=1e-7, atol=1e-7) @@ -90,10 +90,10 @@ def foo(x: R.Tensor(["n", "m"], "int32"), y: R.Object) -> R.Tensor(["m", "n"], d target = tvm.target.Target("llvm", host="llvm") ex = relax.build(mod, target, exec_mode=exec_mode) vm = relax.VirtualMachine(ex, tvm.cpu()) - x0 = tvm.nd.array(np.zeros((1, 2)).astype("int32")) - y0 = tvm.nd.array(np.zeros((2, 1)).astype("float32")) - y1 = tvm.nd.array(np.zeros((1, 2)).astype("float32")) - y2 = tvm.nd.array(np.zeros((2, 1, 1)).astype("float32")) + x0 = tvm.runtime.tensor(np.zeros((1, 2)).astype("int32")) + y0 = tvm.runtime.tensor(np.zeros((2, 1)).astype("float32")) + y1 = tvm.runtime.tensor(np.zeros((1, 2)).astype("float32")) + y2 = tvm.runtime.tensor(np.zeros((2, 1, 1)).astype("float32")) vm["foo"](x0, y0) @@ -119,18 +119,18 @@ def foo(x: R.Tensor(dtype="float32")) -> R.Shape: vm = relax.VirtualMachine(ex, tvm.cpu()) shape = (32, 16) - arr = tvm.nd.array(np.random.rand(*shape).astype("float32")) + arr = tvm.runtime.tensor(np.random.rand(*shape).astype("float32")) res = vm["foo"](arr) assert res[0] == shape[0] * 2 assert res[1] == shape[1] * 3 # dtype mismatch with pytest.raises(ValueError, match=".*dtype.*"): - vm["foo"](tvm.nd.array(np.zeros((1, 2)).astype("int32"))) + vm["foo"](tvm.runtime.tensor(np.zeros((1, 2)).astype("int32"))) # ndim mismatch with pytest.raises(ValueError, match=".*match_cast.*ndim.*"): - vm["foo"](tvm.nd.array(np.zeros((1,)).astype("float32"))) + vm["foo"](tvm.runtime.tensor(np.zeros((1,)).astype("float32"))) # type mismach with pytest.raises(TypeError): @@ -153,7 +153,7 @@ def foo(x: R.Tensor((32, 16), "float32")) -> R.Tensor: vm = relax.VirtualMachine(ex, tvm.cpu()) shape = (32, 16) - inp = 
tvm.nd.array(np.random.rand(*shape).astype(np.float32)) + inp = tvm.runtime.tensor(np.random.rand(*shape).astype(np.float32)) res = vm["foo"](inp) tvm.testing.assert_allclose(res.numpy(), inp.numpy(), rtol=1e-7, atol=1e-7) @@ -177,7 +177,7 @@ def foo(x: R.Tensor(dtype="float32")) -> R.Tensor: vm = relax.VirtualMachine(ex, tvm.cpu()) shape = (32, 16) - inp = tvm.nd.array(np.random.rand(*shape).astype(np.float32)) + inp = tvm.runtime.tensor(np.random.rand(*shape).astype(np.float32)) res = check_saved_func(vm, "foo", inp) tvm.testing.assert_allclose(res.numpy(), np.tile(inp.numpy(), (1, 2)), rtol=1e-7, atol=1e-7) @@ -217,8 +217,8 @@ def func( ex = relax.build(mod, target, exec_mode=exec_mode) vm = relax.VirtualMachine(ex, tvm.cpu()) - data = tvm.nd.array(np.random.rand(32, 16).astype(np.float32)) - weight = tvm.nd.array(np.random.rand(16, 32).astype(np.float32)) + data = tvm.runtime.tensor(np.random.rand(32, 16).astype(np.float32)) + weight = tvm.runtime.tensor(np.random.rand(16, 32).astype(np.float32)) res = check_saved_func(vm, "func", data, weight) expected = np.dot(data.numpy(), weight.numpy()) tvm.testing.assert_allclose(res.numpy(), expected, rtol=1e-6, atol=1e-6) @@ -265,9 +265,9 @@ def main( ex = relax.build(mod, target, exec_mode=exec_mode) vm = relax.VirtualMachine(ex, tvm.cpu()) - x = tvm.nd.array(np.zeros((2, 3)).astype(np.int32)) - y = tvm.nd.array(np.zeros((2, 3)).astype(np.int32)) - z = tvm.nd.array(np.ones((2, 3)).astype(np.int32)) + x = tvm.runtime.tensor(np.zeros((2, 3)).astype(np.int32)) + y = tvm.runtime.tensor(np.zeros((2, 3)).astype(np.int32)) + z = tvm.runtime.tensor(np.ones((2, 3)).astype(np.int32)) vm.set_input("main", x, y, z) vm.invoke_stateful("main") outs = vm.get_outputs("main") @@ -312,12 +312,12 @@ def main( ex = relax.build(mod, target, exec_mode=exec_mode) vm = relax.VirtualMachine(ex, tvm.cpu()) - x = tvm.nd.array(np.ones((2, 3)).astype(np.int32)) - y = tvm.nd.array(np.ones((2, 3)).astype(np.int32)) + x = tvm.runtime.tensor(np.ones((2, 3)).astype(np.int32)) + y = tvm.runtime.tensor(np.ones((2, 3)).astype(np.int32)) vm.set_input("main", x, y) vm.invoke_stateful("main") out = vm.get_outputs("main") - expected = tvm.nd.array(np.full((2, 3), 2).astype(np.int32)) + expected = tvm.runtime.tensor(np.full((2, 3), 2).astype(np.int32)) assert x == out tvm.testing.assert_allclose(out.numpy(), expected.numpy(), rtol=1e-7, atol=1e-7) @@ -342,8 +342,8 @@ def test_vm_emit_te_extern(exec_mode): ex = relax.build(mod, target, exec_mode=exec_mode) vm = relax.VirtualMachine(ex, tvm.cpu()) - data = tvm.nd.array(np.random.rand(16, 32).astype(np.float32)) - weight = tvm.nd.array(np.random.rand(32, 16).astype(np.float32)) + data = tvm.runtime.tensor(np.random.rand(16, 32).astype(np.float32)) + weight = tvm.runtime.tensor(np.random.rand(32, 16).astype(np.float32)) res = check_saved_func(vm, "rx_cblas_matmul", data, weight) expected = np.dot(data.numpy(), weight.numpy()) tvm.testing.assert_allclose(res.numpy(), expected, rtol=1e-6, atol=1e-6) @@ -370,12 +370,12 @@ def te_func(A, B): ex = relax.build(mod, target, exec_mode=exec_mode) vm = relax.VirtualMachine(ex, tvm.cpu()) - inp = tvm.nd.array( + inp = tvm.runtime.tensor( np.random.rand( 1, ).astype(np.float32) ) - inp2 = tvm.nd.array( + inp2 = tvm.runtime.tensor( np.random.rand( 2, ).astype(np.float32) @@ -406,7 +406,7 @@ def te_func(A): ex = relax.build(mod, target, exec_mode=exec_mode) vm = relax.VirtualMachine(ex, tvm.cpu()) - inp = tvm.nd.array( + inp = tvm.runtime.tensor( np.random.rand( 1, ).astype(np.float32) @@ -435,7 +435,7 
@@ def te_func(A): vm = relax.VirtualMachine(ex, tvm.cpu()) shape = (9,) - inp = tvm.nd.array(np.random.rand(*shape).astype(np.float32)) + inp = tvm.runtime.tensor(np.random.rand(*shape).astype(np.float32)) res = check_saved_func(vm, "rx_func", inp) def expected_output(): @@ -463,7 +463,7 @@ def test_vm_emit_te_constant_param_cpu(exec_mode): dev = tvm.cpu() vm = relax.VirtualMachine(exec, dev) - add_res = check_saved_func(vm, "main", tvm.nd.array(x_np, dev)) + add_res = check_saved_func(vm, "main", tvm.runtime.tensor(x_np, dev)) tvm.testing.assert_allclose(add_res.numpy(), x_np + c_np, rtol=1e-7, atol=1e-7) @@ -490,7 +490,7 @@ def test_vm_emit_te_constant_param_gpu(exec_mode): dev = tvm.cuda() vm = relax.VirtualMachine(exec, dev) - add_res = check_saved_func(vm, "main", tvm.nd.array(x_np, dev)) + add_res = check_saved_func(vm, "main", tvm.runtime.tensor(x_np, dev)) tvm.testing.assert_allclose(add_res.numpy(), x_np + c_np, rtol=1e-7, atol=1e-7) @@ -516,8 +516,8 @@ def te_func(A, B): vm = relax.VirtualMachine(ex, tvm.cpu()) shape1 = (5,) shape2 = (3,) - inp = tvm.nd.array(np.random.rand(*shape1).astype(np.float32)) - inp2 = tvm.nd.array(np.random.rand(*shape2).astype(np.float32)) + inp = tvm.runtime.tensor(np.random.rand(*shape1).astype(np.float32)) + inp2 = tvm.runtime.tensor(np.random.rand(*shape2).astype(np.float32)) res = check_saved_func(vm, "rx_func", inp, inp2) def expected_output(): @@ -667,8 +667,8 @@ def te_func(A): ex.export_library(temp.relpath("exec.so")) vm = relax.VirtualMachine(tvm.runtime.load_module(temp.relpath("exec.so")), tvm.cpu()) - inp = tvm.nd.array(np.random.rand(2).astype(np.float32)) - inp2 = tvm.nd.array(np.random.rand(3).astype(np.float32)) + inp = tvm.runtime.tensor(np.random.rand(2).astype(np.float32)) + inp2 = tvm.runtime.tensor(np.random.rand(3).astype(np.float32)) res = check_saved_func(vm, "rx_func", inp, inp2) @@ -693,8 +693,8 @@ def test_vm_tuple(exec_mode): vm = relax.VirtualMachine(ex, tvm.cpu()) shape = (5,) - inp = tvm.nd.array(np.random.rand(*shape).astype(np.float32)) - inp2 = tvm.nd.array(np.random.rand(*shape).astype(np.float32)) + inp = tvm.runtime.tensor(np.random.rand(*shape).astype(np.float32)) + inp2 = tvm.runtime.tensor(np.random.rand(*shape).astype(np.float32)) (res1, res2), res3 = vm["rx_func"](inp, inp2) tvm.testing.assert_allclose(res1.numpy(), inp.numpy(), rtol=1e-7, atol=1e-7) @@ -722,8 +722,8 @@ def tuple_get_item( target = tvm.target.Target("llvm", host="llvm") ex = relax.build(mod, target, exec_mode=exec_mode) vm = relax.VirtualMachine(ex, tvm.cpu()) - x_inp = tvm.nd.array(np.random.rand(2, 3).astype("float32")) - y_inp = tvm.nd.array(np.random.rand(2, 3).astype("float32")) + x_inp = tvm.runtime.tensor(np.random.rand(2, 3).astype("float32")) + y_inp = tvm.runtime.tensor(np.random.rand(2, 3).astype("float32")) res = check_saved_func(vm, "tuple_get_item", x_inp, y_inp) tvm.testing.assert_allclose(res.numpy(), x_inp.numpy() + y_inp.numpy(), rtol=1e-7, atol=1e-7) @@ -754,7 +754,7 @@ def copy(A: T.Buffer((2, 3), "float32"), B: T.Buffer((2, 3), "float32")): target = tvm.target.Target("llvm", host="llvm") ex = relax.build(mod, target, exec_mode=exec_mode) vm = relax.VirtualMachine(ex, tvm.cpu()) - x = tvm.nd.array(np.random.rand(2, 3).astype("float32")) + x = tvm.runtime.tensor(np.random.rand(2, 3).astype("float32")) y = vm["main"](x) tvm.testing.assert_allclose(y.numpy(), x.numpy(), rtol=1e-7, atol=1e-7) @@ -808,8 +808,8 @@ def main(x: R.Tensor((32, 32), "float32"), w: R.Tensor((32, 32), "float32")) -> target = tvm.target.Target("llvm", 
host="llvm") ex = relax.build(TestVMSubFunction, target, exec_mode=exec_mode) vm = relax.VirtualMachine(ex, tvm.cpu()) - x_inp = tvm.nd.array(np.random.rand(32, 32).astype(np.float32)) - y_inp = tvm.nd.array(np.random.rand(32, 32).astype(np.float32)) + x_inp = tvm.runtime.tensor(np.random.rand(32, 32).astype(np.float32)) + y_inp = tvm.runtime.tensor(np.random.rand(32, 32).astype(np.float32)) res = check_saved_func(vm, "main", x_inp, y_inp) product = np.dot(x_inp.numpy(), y_inp.numpy()) expected = product * product @@ -843,7 +843,7 @@ def recursion(n: R.Tensor((1,), "float32")) -> R.Tensor: inp = np.empty(1).astype("float32") recursion_runs = np.random.randint(1, 10) inp.fill(recursion_runs) - inp = tvm.nd.array(inp) + inp = tvm.runtime.tensor(inp) res = check_saved_func(vm, "recursion", inp) tvm.testing.assert_allclose(res.numpy(), np.power(2.0, recursion_runs), rtol=1e-7, atol=1e-7) @@ -870,7 +870,7 @@ def foo2( target = tvm.target.Target("llvm", host="llvm") ex = relax.build(mod, target, exec_mode=exec_mode) vm = relax.VirtualMachine(ex, tvm.cpu()) - x_inp = tvm.nd.array(np.random.rand(2, 3).astype("float32")) + x_inp = tvm.runtime.tensor(np.random.rand(2, 3).astype("float32")) res_1 = check_saved_func(vm, "foo1", x_inp) res_2 = check_saved_func(vm, "foo2", x_inp) @@ -903,8 +903,8 @@ def main( target = tvm.target.Target("llvm", host="llvm") ex = relax.build(mod, target, exec_mode=exec_mode) vm = relax.VirtualMachine(ex, tvm.cpu()) - x_inp = tvm.nd.array(np.random.rand(2, 3).astype("float32")) - y_inp = tvm.nd.array(np.array([[3.1, 4.0, 5.0], [6.0, 7.1, 9.0]], dtype="float32")) + x_inp = tvm.runtime.tensor(np.random.rand(2, 3).astype("float32")) + y_inp = tvm.runtime.tensor(np.array([[3.1, 4.0, 5.0], [6.0, 7.1, 9.0]], dtype="float32")) res = check_saved_func(vm, "main", x_inp, y_inp) tvm.testing.assert_allclose(res.numpy(), x_inp.numpy() + y_inp.numpy()) @@ -921,8 +921,8 @@ def main(x: R.Tensor((1,), "float32"), y: R.Tensor((1,), "float32")): target = tvm.target.Target("llvm", host="llvm") ex = relax.build(TestTimeEvaluator, target, exec_mode=exec_mode) vm = relax.VirtualMachine(ex, tvm.cpu()) - x = tvm.nd.array(np.random.rand(1).astype("float32")) - y = tvm.nd.array(np.random.rand(1).astype("float32")) + x = tvm.runtime.tensor(np.random.rand(1).astype("float32")) + y = tvm.runtime.tensor(np.random.rand(1).astype("float32")) # ensure we can use time_evaluator with the stateful API vm.set_input("main", x, y) @@ -1054,8 +1054,8 @@ def popen_check(): def set_input_trial(vm: relax.VirtualMachine, device: tvm.runtime.Device) -> None: - a = tvm.nd.array(np.random.rand(32, 32).astype("float32"), device) - b = tvm.nd.array(np.random.rand(32, 32).astype("float32"), device) + a = tvm.runtime.tensor(np.random.rand(32, 32).astype("float32"), device) + b = tvm.runtime.tensor(np.random.rand(32, 32).astype("float32"), device) vm.set_input("main", a, b) vm.invoke_stateful("main") res0 = vm.get_outputs("main") @@ -1067,17 +1067,17 @@ def set_input_trial(vm: relax.VirtualMachine, device: tvm.runtime.Device) -> Non tvm.testing.assert_allclose(res0.numpy(), a.numpy() * b.numpy(), rtol=1e-7, atol=1e-7) tvm.testing.assert_allclose(res0.numpy(), res1.numpy(), rtol=1e-7, atol=1e-7) - # bug! If you don't bind the NDArray to a var, the memory will get corrupted. + # bug! If you don't bind the Tensor to a var, the memory will get corrupted. 
# Possibly due to object lifecycles and other FFI issues - a = tvm.nd.array(np.array(2).astype("int32"), device) + a = tvm.runtime.tensor(np.array(2).astype("int32"), device) vm.set_input("test_vm_tuple", a) vm.invoke_stateful("test_vm_tuple") res2 = vm.get_outputs("test_vm_tuple") - # the results are NDArrays wrapped around scalars, - # so we have to get the scalar out of the NDArray + # the results are Tensors wrapped around scalars, + # so we have to get the scalar out of the Tensor assert tuple(map(lambda a: int(a.numpy()), res2)) == (2, 2) - b = tvm.nd.array(np.array(1).astype("int32"), device) + b = tvm.runtime.tensor(np.array(1).astype("int32"), device) vm.set_input("test_vm_nested_tuple", b) vm.invoke_stateful("test_vm_nested_tuple") res3 = vm.get_outputs("test_vm_nested_tuple") @@ -1088,8 +1088,8 @@ def set_input_trial(vm: relax.VirtualMachine, device: tvm.runtime.Device) -> Non def set_input_attempt_stateless(vm: relax.VirtualMachine, device: tvm.runtime.Device) -> None: # this should fail: once you set inputs, you cannot run statelessly - a = tvm.nd.array(np.random.rand(32, 32).astype("float32"), device) - b = tvm.nd.array(np.random.rand(32, 32).astype("float32"), device) + a = tvm.runtime.tensor(np.random.rand(32, 32).astype("float32"), device) + b = tvm.runtime.tensor(np.random.rand(32, 32).astype("float32"), device) vm.set_input("main", a, b) # must use invoke stateful! vm["main"]() @@ -1102,8 +1102,8 @@ def set_input_attempt_invoke(vm: relax.VirtualMachine, device: tvm.runtime.Devic def set_input_attempt_get(vm: relax.VirtualMachine, device: tvm.runtime.Device) -> None: # this should fail: you can't get outputs without invoking the function first - a = tvm.nd.array(np.random.rand(32, 32).astype("float32"), device) - b = tvm.nd.array(np.random.rand(32, 32).astype("float32"), device) + a = tvm.runtime.tensor(np.random.rand(32, 32).astype("float32"), device) + b = tvm.runtime.tensor(np.random.rand(32, 32).astype("float32"), device) vm.set_input("main", a, b) _ = vm.get_outputs("main") @@ -1169,16 +1169,16 @@ def main(x: R.Tuple([R.Tensor((32,), "float32"), R.Tensor((32,), "float32")])) - temp = utils.tempdir() vm, device = make_vm(MyMod, exec_mode, temp) device = tvm.cpu(0) - a = tvm.nd.empty((32,), "float32", device=device) - b = tvm.nd.empty((32,), "float32", device=device) + a = tvm.runtime.empty((32,), "float32", device=device) + b = tvm.runtime.empty((32,), "float32", device=device) vm.set_input("main", (a, b)) vm.invoke_stateful("main") def save_function_kwargs_trial(vm: relax.VirtualMachine, device: tvm.runtime.Device) -> None: # just checking that we can use kwargs for the args when saving a function - a = tvm.nd.array(np.random.rand(32, 32).astype("float32"), device) - b = tvm.nd.array(np.random.rand(32, 32).astype("float32"), device) + a = tvm.runtime.tensor(np.random.rand(32, 32).astype("float32"), device) + b = tvm.runtime.tensor(np.random.rand(32, 32).astype("float32"), device) vm.save_function("main", "saved_main", x=a, w=b) res0 = vm["saved_main"]() tvm.testing.assert_allclose(res0.numpy(), a.numpy() * b.numpy(), rtol=1e-7, atol=1e-7) @@ -1197,8 +1197,8 @@ def save_function_time_evaluator_trial( vm: relax.VirtualMachine, device: tvm.runtime.Device ) -> None: # just checking that the saved function can be called in the time evaluator - a = tvm.nd.array(np.random.rand(32, 32).astype("float32"), device) - b = tvm.nd.array(np.random.rand(32, 32).astype("float32"), device) + a = tvm.runtime.tensor(np.random.rand(32, 32).astype("float32"), device) + b = 
tvm.runtime.tensor(np.random.rand(32, 32).astype("float32"), device) vm.save_function("main", "saved_main", a, b) vm.time_evaluator("saved_main", device)() @@ -1292,16 +1292,16 @@ def func_llvm( dev_llvm = tvm.device("llvm") vm_llvm = tvm.relax.VirtualMachine(built, device=dev_llvm) llvm_output = vm_llvm["func_llvm"]( - tvm.nd.array(np_A, dev_llvm), - tvm.nd.array(np_B, dev_llvm), + tvm.runtime.tensor(np_A, dev_llvm), + tvm.runtime.tensor(np_B, dev_llvm), ) dev_cuda = tvm.device("cuda") vm_cuda = tvm.relax.VirtualMachine(built, device=dev_cuda) cuda_output = vm_cuda["func_cuda"]( - tvm.nd.array(np_A, dev_cuda), - tvm.nd.array(np_B, dev_cuda), + tvm.runtime.tensor(np_A, dev_cuda), + tvm.runtime.tensor(np_B, dev_cuda), ) np_C = np_A + np_B diff --git a/tests/python/relax/test_vm_builtin.py b/tests/python/relax/test_vm_builtin.py index 04e2ae1bf339..2bc5e9ea7030 100644 --- a/tests/python/relax/test_vm_builtin.py +++ b/tests/python/relax/test_vm_builtin.py @@ -44,9 +44,9 @@ def foo(x: R.Tensor((3, 5), "float32"), y: R.Tensor((3, 1), "float32")): np_rand = np.random.rand(3, 5).astype(np.float32) # normalize it to get the random prob np_prob = np_rand / np_rand.sum(axis=1, keepdims=True) - nd_prob = tvm.nd.array(np_prob) + nd_prob = tvm.runtime.tensor(np_prob) # special sample to get deterministic results - nd_sample = tvm.nd.array(np.array([[1.0], [0], [1]]).astype(np.float32)) + nd_sample = tvm.runtime.tensor(np.array([[1.0], [0], [1]]).astype(np.float32)) vm = relax.VirtualMachine(ex, tvm.cpu()) res = vm["foo"](nd_prob, nd_sample) diff --git a/tests/python/relax/test_vm_callback_function.py b/tests/python/relax/test_vm_callback_function.py index c8f3f2945ede..1014ed98a558 100644 --- a/tests/python/relax/test_vm_callback_function.py +++ b/tests/python/relax/test_vm_callback_function.py @@ -51,7 +51,7 @@ def custom_callback(arr): from_callback = arr np_A = np.arange(16, dtype="int32") - tvm_A = tvm.nd.array(np_A) + tvm_A = tvm.runtime.tensor(np_A) vm["relax_func"](tvm_A, custom_callback) @@ -78,7 +78,7 @@ def relax_func( np_A = np.arange(16, dtype="int32") def custom_callback(): - return tvm.nd.array(np_A) + return tvm.runtime.tensor(np_A) output = vm["relax_func"](custom_callback) diff --git a/tests/python/relax/test_vm_codegen_only.py b/tests/python/relax/test_vm_codegen_only.py index dac0f867cefb..044ba97cbfe4 100644 --- a/tests/python/relax/test_vm_codegen_only.py +++ b/tests/python/relax/test_vm_codegen_only.py @@ -51,7 +51,7 @@ def foo(x: R.Tensor((3, 4), "float32")): mod = TestVMMove target = tvm.target.Target("llvm", host="llvm") ex = codegen(mod, target, exec_mode) - inp = tvm.nd.array(np.random.rand(3, 4).astype(np.float32)) + inp = tvm.runtime.tensor(np.random.rand(3, 4).astype(np.float32)) vm = relax.VirtualMachine(ex, tvm.cpu()) res = check_saved_func(vm, "foo", inp) tvm.testing.assert_allclose(res.numpy(), inp.numpy(), rtol=1e-7, atol=1e-7) @@ -73,7 +73,7 @@ def foo(x: R.Tensor((3, 4), "float32")): mod = TestVMToDevice target = tvm.target.Target("llvm", host="llvm") ex = codegen(mod, target, exec_mode) - inp = tvm.nd.array(np.random.rand(3, 4).astype(np.float32)) + inp = tvm.runtime.tensor(np.random.rand(3, 4).astype(np.float32)) vm = relax.VirtualMachine(ex, tvm.cpu()) res = check_saved_func(vm, "foo", inp) tvm.testing.assert_allclose(res.numpy(), inp.numpy(), rtol=1e-7, atol=1e-7) @@ -100,7 +100,7 @@ def main(x: R.Tensor(ndim=2, dtype="float32")) -> R.Tensor(ndim=2, dtype="float3 target = tvm.target.Target("llvm", host="llvm") ex = codegen(mod, target, exec_mode) vm = 
relax.VirtualMachine(ex, tvm.cpu()) - inp = tvm.nd.array(np.random.rand(3, 4)) + inp = tvm.runtime.tensor(np.random.rand(3, 4)) res = vm["main"](inp) tvm.testing.assert_allclose(res.numpy(), inp.numpy()) @@ -145,14 +145,14 @@ def ife(cond: R.Tensor((), "bool"), x: R.Tensor((3, 4), "float32")) -> R.Tensor: target = tvm.target.Target("llvm", host="llvm") ex = codegen(mod, target, exec_mode) vm = relax.VirtualMachine(ex, tvm.cpu()) - inp = tvm.nd.array(np.random.rand(3, 4)) - res = vm["ife"](tvm.nd.array(1), inp) + inp = tvm.runtime.tensor(np.random.rand(3, 4)) + res = vm["ife"](tvm.runtime.tensor(1), inp) tvm.testing.assert_allclose(res.numpy(), inp.numpy() + inp.numpy(), rtol=1e-7, atol=1e-7) - res = vm["ife"](tvm.nd.array(True), inp) + res = vm["ife"](tvm.runtime.tensor(True), inp) tvm.testing.assert_allclose(res.numpy(), inp.numpy() + inp.numpy(), rtol=1e-7, atol=1e-7) - res = vm["ife"](tvm.nd.array(0), inp) + res = vm["ife"](tvm.runtime.tensor(0), inp) tvm.testing.assert_allclose(res.numpy(), inp.numpy() * inp.numpy(), rtol=1e-7, atol=1e-7) - res = vm["ife"](tvm.nd.array(False), inp) + res = vm["ife"](tvm.runtime.tensor(False), inp) tvm.testing.assert_allclose(res.numpy(), inp.numpy() * inp.numpy(), rtol=1e-7, atol=1e-7) @@ -171,7 +171,7 @@ def main(x: R.Tensor(ndim=2, dtype="float32")): target = tvm.target.Target("llvm", host="llvm") ex = codegen(mod, target, exec_mode) vm = relax.VirtualMachine(ex, tvm.cpu()) - inp = tvm.nd.array(np.random.rand(2, 3)) + inp = tvm.runtime.tensor(np.random.rand(2, 3)) res0, res1, res2 = vm["main"](inp) tvm.testing.assert_allclose(res0.numpy(), np.array([1, 2])) tvm.testing.assert_allclose(res1.numpy(), np.array([3, 4])) @@ -203,7 +203,7 @@ def main(x: R.Tensor(ndim=2, dtype="float32")): target = tvm.target.Target("llvm", host="llvm") ex = codegen(mod, target, exec_mode) vm = relax.VirtualMachine(ex, tvm.cpu()) - inp = tvm.nd.array(np.random.rand(1, 2)) + inp = tvm.runtime.tensor(np.random.rand(1, 2)) res = vm["main"](inp) tvm.testing.assert_allclose(res.numpy(), np.array([4, 6]) + inp.numpy()) @@ -262,7 +262,7 @@ def main(x: R.Tensor(["n", "m"], "float32")) -> R.Shape(ndim=3): target = tvm.target.Target("llvm", host="llvm") ex = codegen(mod, target, exec_mode) vm = relax.VirtualMachine(ex, tvm.cpu()) - x = tvm.nd.array(np.zeros((1, 2)).astype("float32")) + x = tvm.runtime.tensor(np.zeros((1, 2)).astype("float32")) res = vm["main"](x) assert res == tvm.runtime.container.ShapeTuple([2, 1, 2]) @@ -272,11 +272,11 @@ def main(x: R.Tensor(["n", "m"], "float32")) -> R.Shape(ndim=3): # wrong ndim with pytest.raises(ValueError, match=r".*ndim.*"): - vm["main"](tvm.nd.array(np.zeros(1).astype("float32"))) + vm["main"](tvm.runtime.tensor(np.zeros(1).astype("float32"))) # wrong dtype with pytest.raises(ValueError, match=r".*dtype.*"): - vm["main"](tvm.nd.array(np.zeros((1, 2)).astype("int32"))) + vm["main"](tvm.runtime.tensor(np.zeros((1, 2)).astype("int32"))) @pytest.mark.parametrize("exec_mode", EXEC_MODE) @@ -352,7 +352,7 @@ def main(x: R.Tensor((3, 4), "float32")): vm = relax.VirtualMachine(ex, dev) input_np = np.random.rand(3, 4).astype("float32") - input = tvm.nd.array(input_np, dev) + input = tvm.runtime.tensor(input_np, dev) res = vm["main"](input) expected = input_np.reshape(6, 2) tvm.testing.assert_allclose(res.numpy(), expected, rtol=1e-7, atol=1e-7) diff --git a/tests/python/relax/test_vm_cuda_graph.py b/tests/python/relax/test_vm_cuda_graph.py index 1026864e4f9b..728eb584ec24 100644 --- a/tests/python/relax/test_vm_cuda_graph.py +++ 
b/tests/python/relax/test_vm_cuda_graph.py @@ -101,7 +101,7 @@ def test_vm_run(): dev = tvm.cuda(0) vm = relax.VirtualMachine(ex, dev) x_np = np.random.uniform(size=(16, 16)).astype("float32") - x = tvm.nd.array(x_np, dev) + x = tvm.runtime.tensor(x_np, dev) y = vm["main"](x) y_np = x_np + 1.0 + 1.0 + 1.0 + 1.0 tvm.testing.assert_allclose(y.numpy(), y_np, rtol=1e-5, atol=1e-5) @@ -135,7 +135,7 @@ def invalid_impl_for_cudagraph(arg_tensor): # capturing a cudaGraph. This passes the warm-up run # performed by "vm.builtin.cuda_graph.run_or_capture", but # throws an exception when the cudaGraph is being captured. - _dummy_workspace = tvm.nd.empty([16], "float16", dev) + _dummy_workspace = tvm.runtime.empty([16], "float16", dev) return arg_tensor @I.ir_module @@ -171,7 +171,7 @@ def main(A: R.Tensor([16], "float16")): built = tvm.compile(Module, target=target) vm = tvm.relax.VirtualMachine(built, dev) - arg = tvm.nd.array(np.arange(16).astype("float16"), dev) + arg = tvm.runtime.tensor(np.arange(16).astype("float16"), dev) with pytest.raises(tvm.TVMError): vm["main"](arg) diff --git a/tests/python/relax/test_vm_execbuilder.py b/tests/python/relax/test_vm_execbuilder.py index 861ec9f8b041..44ca5c20498c 100644 --- a/tests/python/relax/test_vm_execbuilder.py +++ b/tests/python/relax/test_vm_execbuilder.py @@ -31,12 +31,12 @@ def test_vm_execute(): ib.emit_ret(ib.r(2)) ex = ib.get() vm = relax.VirtualMachine(ex, tvm.cpu()) - a = tvm.nd.array( + a = tvm.runtime.tensor( np.random.rand( 4, ) ) - b = tvm.nd.array( + b = tvm.runtime.tensor( np.random.rand( 4, ) @@ -56,12 +56,12 @@ def test_vm_multiple_func(): ib.emit_ret(ib.r(2)) ex = ib.get() vm = relax.VirtualMachine(ex, tvm.cpu()) - a = tvm.nd.array( + a = tvm.runtime.tensor( np.random.rand( 4, ) ) - b = tvm.nd.array( + b = tvm.runtime.tensor( np.random.rand( 4, ) @@ -108,8 +108,8 @@ def test_emit_cache(): s2 = ib.convert_constant(tvm.runtime.container.ShapeTuple([1, 3])) assert s0 == s1 assert s1 != s2 - y0 = ib.convert_constant(tvm.nd.array(np.array([1, 2, 3]).astype("int32"))) - y1 = ib.convert_constant(tvm.nd.array(np.array([1, 2, 3]).astype("int32"))) + y0 = ib.convert_constant(tvm.runtime.tensor(np.array([1, 2, 3]).astype("int32"))) + y1 = ib.convert_constant(tvm.runtime.tensor(np.array([1, 2, 3]).astype("int32"))) assert y0 == y1 ib.emit_ret(ib.r(0)) @@ -153,7 +153,7 @@ def test_vm_operand(): def test_vm_shapeof(): ib = relax.ExecBuilder() shape = (32, 16) - arr = tvm.nd.array(np.random.rand(*shape)) + arr = tvm.runtime.tensor(np.random.rand(*shape)) with ib.function("main", num_inputs=0): ib.emit_call("vm.builtin.shape_of", args=[arr], dst=ib.r(0)) ib.emit_ret(ib.r(0)) @@ -200,12 +200,12 @@ def test_vm_goto(): ib.emit_ret(ib.r(2)) ex = ib.get() vm = relax.VirtualMachine(ex, tvm.cpu()) - a = tvm.nd.array( + a = tvm.runtime.tensor( np.random.rand( 4, ) ) - b = tvm.nd.array( + b = tvm.runtime.tensor( np.random.rand( 4, ) @@ -224,12 +224,12 @@ def test_vm_if(): ib.emit_ret(ib.r(3)) ex = ib.get() vm = relax.VirtualMachine(ex, tvm.cpu()) - a = tvm.nd.array( + a = tvm.runtime.tensor( np.random.rand( 4, ) ) - b = tvm.nd.array( + b = tvm.runtime.tensor( np.random.rand( 4, ) @@ -255,10 +255,10 @@ def test_vm_invoke_closure(): ex = ib.get() vm = relax.VirtualMachine(ex, tvm.cpu()) - w_inp = tvm.nd.array(np.random.rand(2, 3)) - x_inp = tvm.nd.array(np.random.rand(2, 3)) - y_inp = tvm.nd.array([[3.1, 4.0, 5.0], [6.0, 7.1, 9.0]]) - z_inp = tvm.nd.array(np.random.rand(2, 3)) + w_inp = tvm.runtime.tensor(np.random.rand(2, 3)) + x_inp = 
tvm.runtime.tensor(np.random.rand(2, 3)) + y_inp = tvm.runtime.tensor([[3.1, 4.0, 5.0], [6.0, 7.1, 9.0]]) + z_inp = tvm.runtime.tensor(np.random.rand(2, 3)) clo = vm["main"](w_inp, x_inp) res = vm.invoke_closure(clo, y_inp, z_inp) tvm.testing.assert_allclose( @@ -280,8 +280,8 @@ def main(inp: R.Tensor((10, 10), dtype="float32")) -> R.Tensor((10, 10), dtype=" ex = tvm.compile(Module, "llvm") vm = relax.VirtualMachine(ex, tvm.cpu()) - correct_input = tvm.nd.array(np.random.normal(size=(10, 10)).astype("float32")) - incorrect_input = tvm.nd.array(np.random.normal(size=(12, 10)).astype("float32")) + correct_input = tvm.runtime.tensor(np.random.normal(size=(10, 10)).astype("float32")) + incorrect_input = tvm.runtime.tensor(np.random.normal(size=(12, 10)).astype("float32")) try: vm["main"](incorrect_input) diff --git a/tests/python/relax/test_vm_instrument.py b/tests/python/relax/test_vm_instrument.py index 8c4d728da18b..c4d24481ec2d 100644 --- a/tests/python/relax/test_vm_instrument.py +++ b/tests/python/relax/test_vm_instrument.py @@ -81,7 +81,7 @@ def instrument(func, name, before_run, ret_val, *args): return relax.VMInstrumentReturnKind.SKIP_RUN vm.set_instrument(instrument) - vm["main"](tvm.nd.array(data_np)) + vm["main"](tvm.runtime.tensor(data_np)) assert hit_count[("matmul", True)] == 2 assert ("matmul", False) not in hit_count assert hit_count[("relu", True)] == 2 @@ -95,7 +95,7 @@ def test_lib_comparator(): # compare against library module cmp = LibCompareVMInstrument(vm.module.imports[0], tvm.cpu(), verbose=False) vm.set_instrument(cmp) - vm["main"](tvm.nd.array(data_np)) + vm["main"](tvm.runtime.tensor(data_np)) if __name__ == "__main__": diff --git a/tests/python/relax/test_vm_multi_device.py b/tests/python/relax/test_vm_multi_device.py index 91ae8bf79256..018eb7bc3cc6 100644 --- a/tests/python/relax/test_vm_multi_device.py +++ b/tests/python/relax/test_vm_multi_device.py @@ -79,9 +79,9 @@ def foo( np_ipt2 = np.random.rand(4, 5).astype(np.float32) np_res = np.matmul(np.matmul(np_ipt0, np_ipt1), np_ipt2) - ipt0 = tvm.nd.array(np_ipt0, devices[0]) - ipt1 = tvm.nd.array(np_ipt1, devices[0]) - ipt2 = tvm.nd.array(np_ipt2, devices[1]) + ipt0 = tvm.runtime.tensor(np_ipt0, devices[0]) + ipt1 = tvm.runtime.tensor(np_ipt1, devices[0]) + ipt2 = tvm.runtime.tensor(np_ipt2, devices[1]) res = vm["foo"](ipt0, ipt1, ipt2) tvm.testing.assert_allclose(res.numpy(), np_res) @@ -134,10 +134,10 @@ def foo( np_ipt3 = np.random.rand(5, 6).astype(np.float32) np_res = np.matmul(np.matmul(np.matmul(np_ipt0, np_ipt1), np_ipt2), np_ipt3) - ipt0 = tvm.nd.array(np_ipt0, devices[0]) - ipt1 = tvm.nd.array(np_ipt1, devices[0]) - ipt2 = tvm.nd.array(np_ipt2, devices[1]) - ipt3 = tvm.nd.array(np_ipt3, devices[2]) + ipt0 = tvm.runtime.tensor(np_ipt0, devices[0]) + ipt1 = tvm.runtime.tensor(np_ipt1, devices[0]) + ipt2 = tvm.runtime.tensor(np_ipt2, devices[1]) + ipt3 = tvm.runtime.tensor(np_ipt3, devices[2]) res = vm["foo"](ipt0, ipt1, ipt2, ipt3) tvm.testing.assert_allclose(res.numpy(), np_res) @@ -179,9 +179,9 @@ def foo( np_ipt2 = np.random.rand(4, 5).astype(np.float32) np_res = np.matmul(np.matmul(np_ipt0, np_ipt1), np_ipt2) - ipt0 = tvm.nd.array(np_ipt0, devices[1]) - ipt1 = tvm.nd.array(np_ipt1, devices[1]) - ipt2 = tvm.nd.array(np_ipt2, devices[0]) + ipt0 = tvm.runtime.tensor(np_ipt0, devices[1]) + ipt1 = tvm.runtime.tensor(np_ipt1, devices[1]) + ipt2 = tvm.runtime.tensor(np_ipt2, devices[0]) res = vm["foo"](ipt0, ipt1, ipt2) tvm.testing.assert_allclose(res.numpy(), np_res, rtol=1e-4, atol=1e-4) diff --git 
a/tests/python/relax/test_vm_profiler.py b/tests/python/relax/test_vm_profiler.py index eaf914560530..cdb27377a587 100644 --- a/tests/python/relax/test_vm_profiler.py +++ b/tests/python/relax/test_vm_profiler.py @@ -55,7 +55,7 @@ def test_conv2d_cpu(): ex = get_exec(data_np.shape) vm = relax.VirtualMachine(ex, tvm.cpu(), profile=True) - report = vm.profile("main", tvm.nd.array(data_np)) + report = vm.profile("main", tvm.runtime.tensor(data_np)) print(report) assert "Duration" in str(report) @@ -76,7 +76,7 @@ def with_rpc(ex, f, data_np): device = remote.cpu() vm = relax.VirtualMachine(rexec, device=device, profile=True) - data = tvm.nd.array(data_np, device) + data = tvm.runtime.tensor(data_np, device) f(vm, data) diff --git a/tests/python/runtime/test_evaluator_with_preproc.py b/tests/python/runtime/test_evaluator_with_preproc.py index fd8f8e95b0bf..208d584e99a5 100644 --- a/tests/python/runtime/test_evaluator_with_preproc.py +++ b/tests/python/runtime/test_evaluator_with_preproc.py @@ -49,9 +49,9 @@ def test_time_evalutor_with_preproc(f_preproc: str): dev = tvm.cuda(0) evaluator = f.time_evaluator(f.entry_name, dev, repeat=1000, number=1, f_preproc=f_preproc) - a = tvm.nd.array(np.random.rand(128, 128).astype("float32"), device=dev) - b = tvm.nd.array(np.random.rand(128, 128).astype("float32"), device=dev) - c = tvm.nd.array(np.zeros((128, 128)).astype("float32"), device=dev) + a = tvm.runtime.tensor(np.random.rand(128, 128).astype("float32"), device=dev) + b = tvm.runtime.tensor(np.random.rand(128, 128).astype("float32"), device=dev) + c = tvm.runtime.tensor(np.zeros((128, 128)).astype("float32"), device=dev) args = [a, b, c] print("Evaluator (f_preproc={}):\t{:.5f}ms".format(f_preproc, evaluator(*args).mean * 1000)) diff --git a/tests/python/runtime/test_executable.py b/tests/python/runtime/test_executable.py index 571ce7adb2bf..4d6830b8b6a4 100644 --- a/tests/python/runtime/test_executable.py +++ b/tests/python/runtime/test_executable.py @@ -60,9 +60,9 @@ def test_executable_getitem(): add_func = executable["add"] # Verify the function works - a = tvm.nd.array(np.array([1.0] * 10, dtype="float32")) - b = tvm.nd.array(np.array([2.0] * 10, dtype="float32")) - c = tvm.nd.array(np.array([0.0] * 10, dtype="float32")) + a = tvm.runtime.tensor(np.array([1.0] * 10, dtype="float32")) + b = tvm.runtime.tensor(np.array([2.0] * 10, dtype="float32")) + c = tvm.runtime.tensor(np.array([0.0] * 10, dtype="float32")) add_func(a, b, c) @@ -87,10 +87,10 @@ def test_executable_jit_already_jitted(): # The module might be different after force recompilation # Verify both modules work correctly - a = tvm.nd.array(np.array([1.0] * 10, dtype="float32")) - b = tvm.nd.array(np.array([2.0] * 10, dtype="float32")) - c1 = tvm.nd.array(np.array([0.0] * 10, dtype="float32")) - c2 = tvm.nd.array(np.array([0.0] * 10, dtype="float32")) + a = tvm.runtime.tensor(np.array([1.0] * 10, dtype="float32")) + b = tvm.runtime.tensor(np.array([2.0] * 10, dtype="float32")) + c1 = tvm.runtime.tensor(np.array([0.0] * 10, dtype="float32")) + c2 = tvm.runtime.tensor(np.array([0.0] * 10, dtype="float32")) jitted_mod1["add"](a, b, c1) jitted_mod3["add"](a, b, c2) @@ -118,9 +118,9 @@ def test_executable_export_library(): assert loaded_mod is not None # Test the loaded module - a = tvm.nd.array(np.array([1.0] * 10, dtype="float32")) - b = tvm.nd.array(np.array([2.0] * 10, dtype="float32")) - c = tvm.nd.array(np.array([0.0] * 10, dtype="float32")) + a = tvm.runtime.tensor(np.array([1.0] * 10, dtype="float32")) + b = 
tvm.runtime.tensor(np.array([2.0] * 10, dtype="float32")) + c = tvm.runtime.tensor(np.array([0.0] * 10, dtype="float32")) loaded_mod["add"](a, b, c) @@ -155,9 +155,9 @@ def test_executable_export_library_with_workspace(): assert loaded_mod is not None # Test the loaded module - a = tvm.nd.array(np.array([1.0] * 10, dtype="float32")) - b = tvm.nd.array(np.array([2.0] * 10, dtype="float32")) - c = tvm.nd.array(np.array([0.0] * 10, dtype="float32")) + a = tvm.runtime.tensor(np.array([1.0] * 10, dtype="float32")) + b = tvm.runtime.tensor(np.array([2.0] * 10, dtype="float32")) + c = tvm.runtime.tensor(np.array([0.0] * 10, dtype="float32")) loaded_mod["add"](a, b, c) @@ -190,9 +190,9 @@ def test_executable_integration(): assert add_func is not None # Test the function works - a = tvm.nd.array(np.array([1.0] * 10, dtype="float32")) - b = tvm.nd.array(np.array([2.0] * 10, dtype="float32")) - c = tvm.nd.array(np.array([0.0] * 10, dtype="float32")) + a = tvm.runtime.tensor(np.array([1.0] * 10, dtype="float32")) + b = tvm.runtime.tensor(np.array([2.0] * 10, dtype="float32")) + c = tvm.runtime.tensor(np.array([0.0] * 10, dtype="float32")) add_func(a, b, c) @@ -214,7 +214,7 @@ def test_executable_integration(): # Test the loaded module loaded_add = loaded_mod["add"] - c_loaded = tvm.nd.array(np.array([0.0] * 10, dtype="float32")) + c_loaded = tvm.runtime.tensor(np.array([0.0] * 10, dtype="float32")) loaded_add(a, b, c_loaded) # Check results @@ -249,9 +249,9 @@ def test_executable_jit_force_recompile(): assert jitted_mod3 is not jitted_mod1 # Test the function works - a = tvm.nd.array(np.array([1.0] * 10, dtype="float32")) - b = tvm.nd.array(np.array([2.0] * 10, dtype="float32")) - c = tvm.nd.array(np.array([0.0] * 10, dtype="float32")) + a = tvm.runtime.tensor(np.array([1.0] * 10, dtype="float32")) + b = tvm.runtime.tensor(np.array([2.0] * 10, dtype="float32")) + c = tvm.runtime.tensor(np.array([0.0] * 10, dtype="float32")) jitted_mod3["add"](a, b, c) diff --git a/tests/python/runtime/test_runtime_container.py b/tests/python/runtime/test_runtime_container.py index 8ee483e5f148..49d1c36f50bc 100644 --- a/tests/python/runtime/test_runtime_container.py +++ b/tests/python/runtime/test_runtime_container.py @@ -22,7 +22,7 @@ import tvm import tvm.testing -from tvm import nd +import tvm.runtime from tvm.runtime import container as _container diff --git a/tests/python/runtime/test_runtime_dlpack.py b/tests/python/runtime/test_runtime_dlpack.py index 201037c6e469..a5d09ee465a1 100644 --- a/tests/python/runtime/test_runtime_dlpack.py +++ b/tests/python/runtime/test_runtime_dlpack.py @@ -29,7 +29,7 @@ def test_from_dlpack_shape_one(): tgt = tvm.target.Target(target="llvm", host="llvm") rows = 1 - a = tvm.runtime.ndarray.from_dlpack(to_dlpack(torch.randn(rows, 16))) + a = tvm.runtime.from_dlpack(to_dlpack(torch.randn(rows, 16))) A = te.placeholder((rows, 16), name="A") B = te.placeholder((rows, 16), name="B") @@ -39,8 +39,8 @@ def test_from_dlpack_shape_one(): dev = tvm.device(tgt.kind.name, 0) - b = tvm.nd.array(np.random.uniform(size=(rows, 16)).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros((rows, 16), dtype=C.dtype), dev) + b = tvm.runtime.tensor(np.random.uniform(size=(rows, 16)).astype(B.dtype), dev) + c = tvm.runtime.tensor(np.zeros((rows, 16), dtype=C.dtype), dev) fadd(a, b, c) tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) @@ -53,7 +53,7 @@ def test_from_dlpack_strided(): rows = 1 inp = torch.randn(rows, 16) - a = tvm.runtime.ndarray.from_dlpack(to_dlpack(inp)) + a = 
tvm.runtime.from_dlpack(to_dlpack(inp)) view = a._create_view((2, 8)) np.testing.assert_equal(inp.numpy().reshape(2, 8), view.numpy()) diff --git a/tests/python/runtime/test_runtime_extension.py b/tests/python/runtime/test_runtime_extension.py index 7c7dca51c728..44534a6b4703 100644 --- a/tests/python/runtime/test_runtime_extension.py +++ b/tests/python/runtime/test_runtime_extension.py @@ -32,7 +32,7 @@ def test_dltensor_compatible(): mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([Ab], stmt).with_attr("global_symbol", "arange")) f = tvm.compile(mod, target="llvm") - a = tvm.nd.array(np.zeros(10, dtype=dtype)) + a = tvm.runtime.tensor(np.zeros(10, dtype=dtype)) f(a) np.testing.assert_equal(a.numpy(), np.arange(a.shape[0])) diff --git a/tests/python/runtime/test_runtime_measure.py b/tests/python/runtime/test_runtime_measure.py index ef27feb26398..fe01e5d331a6 100644 --- a/tests/python/runtime/test_runtime_measure.py +++ b/tests/python/runtime/test_runtime_measure.py @@ -37,7 +37,7 @@ def my_debug(filename): X = te.compute((), lambda: tvm.tir.call_packed("my_debug", filename)) func = tvm.tir.build(te.create_prim_func([X])) - x = tvm.nd.empty((), dtype="int32") + x = tvm.runtime.empty((), dtype="int32") ftimer = func.time_evaluator(func.entry_name, tvm.cpu(), number=1, repeat=1) ftimer(x) diff --git a/tests/python/runtime/test_runtime_module_load.py b/tests/python/runtime/test_runtime_module_load.py index d22d40f6f2b1..edb7b4f79362 100644 --- a/tests/python/runtime/test_runtime_module_load.py +++ b/tests/python/runtime/test_runtime_module_load.py @@ -34,7 +34,7 @@ path_dso = sys.argv[1] dtype = sys.argv[2] ff = tvm.runtime.load_module(path_dso) -a = tvm.nd.array(np.zeros(10, dtype=dtype)) +a = tvm.runtime.tensor(np.zeros(10, dtype=dtype)) ff(a) np.testing.assert_equal(a.numpy(), np.arange(a.shape[0])) print("Finish runtime checking...") @@ -75,10 +75,10 @@ def save_object(names): f1 = tvm.runtime.load_module(path_dso) f2 = tvm.runtime.load_module(path_ll) - a = tvm.nd.array(np.zeros(10, dtype=dtype)) + a = tvm.runtime.tensor(np.zeros(10, dtype=dtype)) f1(a) np.testing.assert_equal(a.numpy(), np.arange(a.shape[0])) - a = tvm.nd.array(np.zeros(10, dtype=dtype)) + a = tvm.runtime.tensor(np.zeros(10, dtype=dtype)) f2(a) np.testing.assert_equal(a.numpy(), np.arange(a.shape[0])) @@ -124,8 +124,8 @@ def popen_check(): import tvm f1 = tvm.runtime.load_module(path_dso) - a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(size=1024).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.zeros(1024, dtype=A.dtype), dev) f1(a, b) np.testing.assert_equal(b.numpy(), a.numpy() + 1) @@ -140,8 +140,8 @@ def check_c(device): print("Skip because %s is not enabled" % device) return f = tvm.compile(sch.mod, target=tvm.target.Target(device, host="c")) - a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(size=1024).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.zeros(1024, dtype=A.dtype), dev) f["main"](a, b) np.testing.assert_equal(b.numpy(), a.numpy() + 1) @@ -176,8 +176,8 @@ def check_llvm(): m = tvm.runtime.load_module(path_dso) fadd1 = m["myadd1"] fadd2 = m["myadd2"] - a = tvm.nd.array(np.random.uniform(size=nn).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(nn, dtype=A.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(size=nn).astype(A.dtype), dev) + b = 
tvm.runtime.tensor(np.zeros(nn, dtype=A.dtype), dev) fadd1(a, b) np.testing.assert_equal(b.numpy(), a.numpy() + 1) fadd2(a, b) @@ -207,8 +207,8 @@ def popen_check(): ctypes.CDLL(path_dso) # Load the system wide library mm = tvm.runtime.system_lib() - a = tvm.nd.array(np.random.uniform(size=nn).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(nn, dtype=A.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(size=nn).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.zeros(nn, dtype=A.dtype), dev) mm["myadd1"](a, b) np.testing.assert_equal(b.numpy(), a.numpy() + 1) mm["myadd2"](a, b) diff --git a/tests/python/runtime/test_runtime_nd_array.py b/tests/python/runtime/test_runtime_nd_array.py index 8b30b7bba05c..4ed81de55f0e 100644 --- a/tests/python/runtime/test_runtime_nd_array.py +++ b/tests/python/runtime/test_runtime_nd_array.py @@ -23,9 +23,9 @@ def test_1d_full_view_of_1d_arr(): - """NDArray::CreateView may return the same array""" + """Tensor::CreateView may return the same array""" np_input = np.arange(1024, dtype="int32") - tvm_input = tvm.nd.array(np_input) + tvm_input = tvm.runtime.tensor(np_input) tvm_output = tvm_input._create_view([1024]) np_expected = np_input @@ -34,9 +34,9 @@ def test_1d_full_view_of_1d_arr(): def test_1d_view_of_first_half_of_1d_arr(): - """NDArray::CreateView may return a subset of an array""" + """Tensor::CreateView may return a subset of an array""" np_input = np.arange(1024, dtype="int32") - tvm_input = tvm.nd.array(np_input) + tvm_input = tvm.runtime.tensor(np_input) tvm_output = tvm_input._create_view([512]) np_expected = np_input[0:512] @@ -45,9 +45,9 @@ def test_1d_view_of_first_half_of_1d_arr(): def test_1d_view_of_first_half_of_1d_arr(): - """Subset returned by NDArray::CreateView may have a byte offset""" + """Subset returned by Tensor::CreateView may have a byte offset""" np_input = np.arange(1024, dtype="int32") - tvm_input = tvm.nd.array(np_input) + tvm_input = tvm.runtime.tensor(np_input) tvm_output = tvm_input._create_view([512], relative_byte_offset=512 * 4) np_expected = np_input[512:1024] @@ -58,16 +58,16 @@ def test_1d_view_of_first_half_of_1d_arr(): def test_view_larger_than_original_is_invalid(): """Subset may not be larger than the original array""" np_input = np.arange(1024, dtype="int32") - tvm_input = tvm.nd.array(np_input) + tvm_input = tvm.runtime.tensor(np_input) - with pytest.raises(ValueError, match="the NDArray being viewed only contains 4096 bytes"): + with pytest.raises(ValueError, match="the Tensor being viewed only contains 4096 bytes"): tvm_input._create_view([2048]) def test_view_entirely_outside_bounds_of_original_is_invalid(): """The byte_offset may not place a view outside the original array""" np_input = np.arange(1024, dtype="int32") - tvm_input = tvm.nd.array(np_input) + tvm_input = tvm.runtime.tensor(np_input) with pytest.raises(ValueError, match="would occupy bytes 8192 <= i_byte < 12288"): tvm_input._create_view([1024], relative_byte_offset=2048 * 4) @@ -76,14 +76,14 @@ def test_view_entirely_outside_bounds_of_original_is_invalid(): def test_view_partially_outside_bounds_of_original_is_invalid(): """The byte_offset may not place any elements of a view outside the original array""" np_input = np.arange(1024, dtype="int32") - tvm_input = tvm.nd.array(np_input) + tvm_input = tvm.runtime.tensor(np_input) with pytest.raises(ValueError, match="would occupy bytes 2048 <= i_byte < 6144"): tvm_input._create_view([1024], relative_byte_offset=512 * 4) def test_subview_first_half_of_first_half(): - """NDArray::CreateView be 
applied to a view + """Tensor::CreateView be applied to a view The first view is at element offset 0 (byte offset 0). The second view is at element offset 0 (byte offset 0) relative to the first @@ -92,7 +92,7 @@ def test_subview_first_half_of_first_half(): """ np_input = np.arange(1024, dtype="int32") - tvm_input = tvm.nd.array(np_input) + tvm_input = tvm.runtime.tensor(np_input) tvm_view = tvm_input._create_view( [512], @@ -108,7 +108,7 @@ def test_subview_first_half_of_first_half(): def test_subview_first_half_of_second_half(): - """NDArray::CreateView be applied to a view + """Tensor::CreateView be applied to a view The first view is at element offset 512 (byte offset 2048). The second view is at element offset 0 (byte offset 0) relative to the @@ -117,7 +117,7 @@ def test_subview_first_half_of_second_half(): """ np_input = np.arange(1024, dtype="int32") - tvm_input = tvm.nd.array(np_input) + tvm_input = tvm.runtime.tensor(np_input) tvm_view = tvm_input._create_view( [512], @@ -133,7 +133,7 @@ def test_subview_first_half_of_second_half(): def test_subview_second_half_of_first_half(): - """NDArray::CreateView be applied to a view + """Tensor::CreateView be applied to a view The first view is at element offset 0 (byte offset 0). The second view is at element offset 256 (byte offset 1024) relative to the @@ -142,7 +142,7 @@ def test_subview_second_half_of_first_half(): """ np_input = np.arange(1024, dtype="int32") - tvm_input = tvm.nd.array(np_input) + tvm_input = tvm.runtime.tensor(np_input) tvm_view = tvm_input._create_view( [512], @@ -158,7 +158,7 @@ def test_subview_second_half_of_first_half(): def test_subview_second_half_of_second_half(): - """NDArray::CreateView be applied to a view + """Tensor::CreateView be applied to a view The first view is at element offset 512 (byte offset 2048). The second view is at element offset 256 (byte offset 1024) relative @@ -167,7 +167,7 @@ def test_subview_second_half_of_second_half(): """ np_input = np.arange(1024, dtype="int32") - tvm_input = tvm.nd.array(np_input) + tvm_input = tvm.runtime.tensor(np_input) tvm_view = tvm_input._create_view( [512], @@ -183,7 +183,7 @@ def test_subview_second_half_of_second_half(): def test_subview_must_be_in_range_of_immediate_parent(): - """Bounds-checking is applied relative to the NDArray + """Bounds-checking is applied relative to the Tensor The first view is at location and covers bytes [0,2048). 
The subview would occupy bytes [2048, 4096), and raises an error as @@ -191,7 +191,7 @@ def test_subview_must_be_in_range_of_immediate_parent(): """ np_input = np.arange(1024, dtype="int32") - tvm_input = tvm.nd.array(np_input) + tvm_input = tvm.runtime.tensor(np_input) tvm_view = tvm_input._create_view( [512], @@ -206,9 +206,9 @@ def test_subview_must_be_in_range_of_immediate_parent(): def test_2d_view_into_1d_arr(): - """NDArray::CreateView may change the dimensionality of an array""" + """Tensor::CreateView may change the dimensionality of an array""" np_input = np.arange(1024, dtype="int32") - tvm_input = tvm.nd.array(np_input) + tvm_input = tvm.runtime.tensor(np_input) tvm_output = tvm_input._create_view([32, 32]) np_expected = np_input.reshape(32, 32) @@ -217,9 +217,9 @@ def test_2d_view_into_1d_arr(): def test_2d_full_view_into_2d_arr(): - """NDArray::CreateView may change the shape of an array""" + """Tensor::CreateView may change the shape of an array""" np_input = np.arange(1024, dtype="int32").reshape(32, 32) - tvm_input = tvm.nd.array(np_input) + tvm_input = tvm.runtime.tensor(np_input) tvm_output = tvm_input._create_view([16, 64]) np_expected = np_input.reshape(16, 64) @@ -228,9 +228,9 @@ def test_2d_full_view_into_2d_arr(): def test_2d_view_of_first_half_of_2d_arr(): - """NDArray::CreateView may return a multi-dimensional view""" + """Tensor::CreateView may return a multi-dimensional view""" np_input = np.arange(1024, dtype="int32").reshape(32, 32) - tvm_input = tvm.nd.array(np_input) + tvm_input = tvm.runtime.tensor(np_input) tvm_output = tvm_input._create_view([16, 32]) np_expected = np_input[0:16, :] @@ -239,9 +239,9 @@ def test_2d_view_of_first_half_of_2d_arr(): def test_2d_view_of_second_half_of_2d_arr(): - """NDArray::CreateView may return a multi-dimensional view with byte offset""" + """Tensor::CreateView may return a multi-dimensional view with byte offset""" np_input = np.arange(1024, dtype="int32").reshape(32, 32) - tvm_input = tvm.nd.array(np_input) + tvm_input = tvm.runtime.tensor(np_input) tvm_output = tvm_input._create_view([16, 32], relative_byte_offset=32 * 16 * 4) np_expected = np_input[16:32, :] diff --git a/tests/python/runtime/test_runtime_rpc.py b/tests/python/runtime/test_runtime_rpc.py index ac8653012ace..796e886e7bce 100644 --- a/tests/python/runtime/test_runtime_rpc.py +++ b/tests/python/runtime/test_runtime_rpc.py @@ -76,8 +76,8 @@ def verify_rpc(remote, target, shape, dtype): f = tvm.compile(te.create_prim_func([A, B]), target=target) dev = remote.cpu(0) - a = tvm.nd.array(np.random.randint(0, 256, size=shape).astype(A.dtype), device=dev) - b = tvm.nd.array(np.zeros(shape).astype(A.dtype), device=dev) + a = tvm.runtime.tensor(np.random.randint(0, 256, size=shape).astype(A.dtype), device=dev) + b = tvm.runtime.tensor(np.zeros(shape).astype(A.dtype), device=dev) temp = utils.tempdir() path_dso = temp.relpath("dev_lib.o") f.write_to_file(path_dso) @@ -133,10 +133,10 @@ def test_rpc_array(): def check_remote(): x = np.ones((3, 4)) - r_cpu = tvm.nd.array(x, remote.cpu(0)) + r_cpu = tvm.runtime.tensor(x, remote.cpu(0)) assert str(r_cpu.device).startswith("remote") np.testing.assert_equal(r_cpu.numpy(), x) - fremote = remote.get_function("rpc.test.remote_array_func") + fremote = remote.get_function("rpc.test.remote_tensor_func") fremote(r_cpu) check_remote() @@ -152,8 +152,8 @@ def check_remote(): dev = remote.cpu(0) a_np = np.ones((5041, 720)).astype("float32") b_np = np.ones((720, 192)).astype("float32") - a = tvm.nd.array(a_np, dev) - b = 
tvm.nd.array(b_np, dev) + a = tvm.runtime.tensor(a_np, dev) + b = tvm.runtime.tensor(b_np, dev) np.testing.assert_equal(a.numpy(), a_np) np.testing.assert_equal(b.numpy(), b_np) @@ -251,8 +251,8 @@ def check_remote(remote): f.export_library(path_dso) remote.upload(path_dso) f1 = remote.load_module("dev_lib.so") - a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(102, dtype=A.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(size=102).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.zeros(102, dtype=A.dtype), dev) time_f = f1.time_evaluator(f1.entry_name, remote.cpu(0), number=10) cost = time_f(a, b).mean print("%g secs/op" % cost) @@ -266,8 +266,8 @@ def check_remote(remote): with open(local_download_path, "wb") as fo: fo.write(remote.download_linked_module("dev_lib.tar")) fupdated = tvm.runtime.load_module(local_download_path) - a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), tvm.cpu(0)) - b = tvm.nd.array(np.zeros(102, dtype=A.dtype), tvm.cpu(0)) + a = tvm.runtime.tensor(np.random.uniform(size=102).astype(A.dtype), tvm.cpu(0)) + b = tvm.runtime.tensor(np.zeros(102, dtype=A.dtype), tvm.cpu(0)) fupdated(a, b) np.testing.assert_equal(b.numpy(), a.numpy() + 1) @@ -289,8 +289,8 @@ def check_minrpc(): dev = remote.cpu(0) f1 = remote.system_lib() - a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(102, dtype=A.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(size=102).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.zeros(102, dtype=A.dtype), dev) time_f = f1.time_evaluator("myadd", remote.cpu(0), number=1) cost = time_f(a, b).mean np.testing.assert_equal(b.numpy(), a.numpy() + 1) @@ -325,8 +325,8 @@ def check_remote_link_cl(remote): f.export_library(path_tar) remote.upload(path_tar) fhost = remote.load_module("myadd.tar") - a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(102, dtype=A.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(size=102).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.zeros(102, dtype=A.dtype), dev) fhost(a, b) np.testing.assert_equal(b.numpy(), a.numpy() + 1) @@ -369,7 +369,7 @@ def check_multi_hop(): assert fecho("xyz") == "xyz" assert bytes(fecho(bytearray(b"123"))) == b"123" - nd = tvm.nd.array([1, 2, 3], device=client.cpu(0)) + nd = tvm.runtime.tensor([1, 2, 3], device=client.cpu(0)) assert nd.numpy()[1] == 2 def check_error_handling(): @@ -386,7 +386,7 @@ def check_error_handling(): @tvm.testing.requires_rpc -def test_rpc_return_ndarray(): +def test_rpc_return_tensor(): # start server server = rpc.Server(key="x1") client = rpc.connect("127.0.0.1", server.port, key="x1") diff --git a/tests/python/runtime/test_runtime_trace.py b/tests/python/runtime/test_runtime_trace.py index 5093ce930ec3..263652bb695c 100644 --- a/tests/python/runtime/test_runtime_trace.py +++ b/tests/python/runtime/test_runtime_trace.py @@ -24,8 +24,8 @@ def test_trace_default_action(): x = te.placeholder((n, n, n), name="X", dtype="float32") y = te.compute(x.shape, lambda i, j, k: tvm.tir.trace([i, j, k, x[i][j][k]])) f = tvm.compile(te.create_prim_func([x, y]), target="llvm") - xnd = tvm.nd.array(np.ones((n, n, n), dtype=x.dtype)) - ynd = tvm.nd.array(np.zeros((n, n, n), dtype=y.dtype)) + xnd = tvm.runtime.tensor(np.ones((n, n, n), dtype=x.dtype)) + ynd = tvm.runtime.tensor(np.zeros((n, n, n), dtype=y.dtype)) f(xnd, ynd) @@ -45,9 +45,9 @@ def check_assign(dtype): ) f = tvm.compile(te.create_prim_func([x, y, z]), "llvm") - xnd 
= tvm.nd.array(np.ones((n, n, n), dtype=x.dtype)) - ynd = tvm.nd.array(np.zeros((n, n, n), dtype=y.dtype)) - znd = tvm.nd.array(np.zeros((n, n, n), dtype=z.dtype)) + xnd = tvm.runtime.tensor(np.ones((n, n, n), dtype=x.dtype)) + ynd = tvm.runtime.tensor(np.zeros((n, n, n), dtype=y.dtype)) + znd = tvm.runtime.tensor(np.zeros((n, n, n), dtype=z.dtype)) f(xnd, ynd, znd) assert np.array_equal(xnd.numpy(), np.ones((n, n, n))) @@ -73,9 +73,9 @@ def check_expr_sum(dtype): + tvm.tir.trace([b[i][j][k]], "tvm.tir.trace_callback3"), ) f = tvm.compile(te.create_prim_func([a, b, c])) - xnd = tvm.nd.array(np.array(np.ones((n, n, n), dtype=a.dtype))) - ynd = tvm.nd.array(np.array(np.ones((n, n, n), dtype=b.dtype))) - znd = tvm.nd.array(np.zeros((n, n, n), dtype=c.dtype)) + xnd = tvm.runtime.tensor(np.array(np.ones((n, n, n), dtype=a.dtype))) + ynd = tvm.runtime.tensor(np.array(np.ones((n, n, n), dtype=b.dtype))) + znd = tvm.runtime.tensor(np.zeros((n, n, n), dtype=c.dtype)) f(xnd, ynd, znd) assert np.array_equal(znd.numpy(), xnd.numpy() + ynd.numpy()) @@ -103,11 +103,11 @@ def check_expr_sum(dtype): + tvm.tir.trace([i, j, k, e[i][j][k]], "tvm.tir.trace_silent"), ) f = tvm.compile(te.create_prim_func([a, b, d, e, c])) - a_nd = tvm.nd.array(np.array(np.ones((n, n, n), dtype=a.dtype))) - b_nd = tvm.nd.array(np.array(np.ones((n, n, n), dtype=b.dtype))) - d_nd = tvm.nd.array(np.array(np.ones((n, n, n), dtype=d.dtype))) - e_nd = tvm.nd.array(np.array(np.ones((n, n, n), dtype=e.dtype))) - c_nd = tvm.nd.array(np.zeros((n, n, n), dtype=c.dtype)) + a_nd = tvm.runtime.tensor(np.array(np.ones((n, n, n), dtype=a.dtype))) + b_nd = tvm.runtime.tensor(np.array(np.ones((n, n, n), dtype=b.dtype))) + d_nd = tvm.runtime.tensor(np.array(np.ones((n, n, n), dtype=d.dtype))) + e_nd = tvm.runtime.tensor(np.array(np.ones((n, n, n), dtype=e.dtype))) + c_nd = tvm.runtime.tensor(np.zeros((n, n, n), dtype=c.dtype)) f(a_nd, b_nd, d_nd, e_nd, c_nd) assert np.array_equal( c_nd.numpy(), a_nd.numpy() + b_nd.numpy() + d_nd.numpy() + e_nd.numpy() @@ -134,9 +134,9 @@ def check_expr_sum_custom(dtype): f = tvm.compile(te.create_prim_func([a, b, c])) npa = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]], dtype=a.dtype) npb = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]], dtype=a.dtype) - xnd = tvm.nd.array(npa) - ynd = tvm.nd.array(npb) - znd = tvm.nd.array(np.zeros((n, n), dtype=c.dtype)) + xnd = tvm.runtime.tensor(npa) + ynd = tvm.runtime.tensor(npb) + znd = tvm.runtime.tensor(np.zeros((n, n), dtype=c.dtype)) f(xnd, ynd, znd) assert np.array_equal(znd.numpy(), npa + npb) @@ -160,9 +160,9 @@ def check_assign(dtype): z = te.compute(x.shape, lambda i: tvm.tir.trace([y[i]], "tvm.tir.trace_change_int_second")) f = tvm.compile(te.create_prim_func([x, y, z])) - xnd = tvm.nd.array(np.ones((n,), dtype=x.dtype)) - ynd = tvm.nd.array(np.zeros((n,), dtype=y.dtype)) - znd = tvm.nd.array(np.zeros((n,), dtype=z.dtype)) + xnd = tvm.runtime.tensor(np.ones((n,), dtype=x.dtype)) + ynd = tvm.runtime.tensor(np.zeros((n,), dtype=y.dtype)) + znd = tvm.runtime.tensor(np.zeros((n,), dtype=z.dtype)) f(xnd, ynd, znd) check_array_first = np.array([13, 13, 13, 13]) check_array_second = np.array([14, 14, 14, 14]) @@ -191,9 +191,9 @@ def check_assign(dtype): ) f = tvm.compile(te.create_prim_func([x, y, z]), target="llvm") - xnd = tvm.nd.array(np.ones((n,), dtype=x.dtype)) - ynd = tvm.nd.array(np.zeros((n,), dtype=y.dtype)) - znd = tvm.nd.array(np.zeros((n,), dtype=z.dtype)) + xnd = tvm.runtime.tensor(np.ones((n,), dtype=x.dtype)) + ynd 
= tvm.runtime.tensor(np.zeros((n,), dtype=y.dtype)) + znd = tvm.runtime.tensor(np.zeros((n,), dtype=z.dtype)) f(xnd, ynd, znd) check_array_first = np.array([13.0, 13.0, 13.0, 13.0]) check_array_second = np.array([14.0, 14.0, 14.0, 14.0]) diff --git a/tests/python/target/test_arm_target.py b/tests/python/target/test_arm_target.py index 686954baade1..d656031ad9cb 100644 --- a/tests/python/target/test_arm_target.py +++ b/tests/python/target/test_arm_target.py @@ -84,7 +84,7 @@ def my_func(a: T.handle): mod = tvm.compile(my_func, target=target) - A_nd = tvm.nd.array(np.empty((1,), dtype="int32"), device=dev) + A_nd = tvm.runtime.tensor(np.empty((1,), dtype="int32"), device=dev) mod(A_nd) ref = 10000 // (sve_device_vector_length // 32) @@ -109,8 +109,8 @@ def my_func(a: T.handle, b: T.handle): A_np = np.random.uniform(size=(num_elements,)).astype("float32") B_np = np.zeros((num_elements,)).astype("float32") - A_nd = tvm.nd.array(A_np, device=dev) - B_nd = tvm.nd.array(B_np, device=dev) + A_nd = tvm.runtime.tensor(A_np, device=dev) + B_nd = tvm.runtime.tensor(B_np, device=dev) mod(A_nd, B_nd) tvm.testing.assert_allclose(B_nd.numpy(), A_np) @@ -137,8 +137,8 @@ def my_func(a: T.handle, b: T.handle): A_np = np.random.uniform(size=(num_elements,)).astype(dtype) B_np = np.zeros((num_elements,)).astype(dtype) - A_nd = tvm.nd.array(A_np, device=dev) - B_nd = tvm.nd.array(B_np, device=dev) + A_nd = tvm.runtime.tensor(A_np, device=dev) + B_nd = tvm.runtime.tensor(B_np, device=dev) mod(A_nd, B_nd) tvm.testing.assert_allclose(B_nd.numpy(), A_np) @@ -159,7 +159,7 @@ def my_func(a: T.handle): mod = tvm.compile(my_func, target=target) A_np = np.zeros((num_elements,)).astype("float32") - A_nd = tvm.nd.array(A_np, device=dev) + A_nd = tvm.runtime.tensor(A_np, device=dev) mod(A_nd) ref = np.ones((num_elements,)) diff --git a/tests/python/te/test_te_create_primfunc.py b/tests/python/te/test_te_create_primfunc.py index b070371b8ac4..c8a095280230 100644 --- a/tests/python/te/test_te_create_primfunc.py +++ b/tests/python/te/test_te_create_primfunc.py @@ -352,8 +352,8 @@ def test_constant(): func = te.create_prim_func([C, A]) func = tvm.compile(func) a_np = np.random.uniform(size=(M,)).astype(A.dtype) - c = tvm.nd.array(np.zeros(M, dtype=C.dtype)) - x = func(c, tvm.nd.array(a_np)) + c = tvm.runtime.tensor(np.zeros(M, dtype=C.dtype)) + x = func(c, tvm.runtime.tensor(a_np)) tvm.testing.assert_allclose(a_np + 2, c.numpy()) @@ -367,8 +367,8 @@ def test_data_dependent_access(): a_np = np.random.uniform(size=(10,)).astype(A.dtype) b_np = np.arange(10, dtype=B.dtype) - c = tvm.nd.array(np.zeros(10, dtype=C.dtype)) - func(c, tvm.nd.array(a_np), tvm.nd.array(b_np)) + c = tvm.runtime.tensor(np.zeros(10, dtype=C.dtype)) + func(c, tvm.runtime.tensor(a_np), tvm.runtime.tensor(b_np)) tvm.testing.assert_allclose(a_np[b_np], c.numpy()) diff --git a/tests/python/tir-base/test_tir_imm_values.py b/tests/python/tir-base/test_tir_imm_values.py index 11213e35364c..4ec1674af203 100644 --- a/tests/python/tir-base/test_tir_imm_values.py +++ b/tests/python/tir-base/test_tir_imm_values.py @@ -271,7 +271,7 @@ def float_imm_div(x: T.float32, y: T.float32, z: T.Buffer((), "float32")): def __wrap_build(f): lib = tvm.compile(f, target="llvm") - z = tvm.nd.array(np.zeros([]).astype("float32")) + z = tvm.runtime.tensor(np.zeros([]).astype("float32")) def _func(x, y): lib(x, y, z) diff --git a/tests/python/tir-base/test_tir_index_map.py b/tests/python/tir-base/test_tir_index_map.py index 3ddbd2f69f59..8696a4062668 100644 --- 
a/tests/python/tir-base/test_tir_index_map.py +++ b/tests/python/tir-base/test_tir_index_map.py @@ -214,12 +214,12 @@ def expected_inverse(i0, i1, i2, i3): assert expected_map.is_equivalent_to(inverse_map) -def test_map_ndarray(): +def test_map_tensor(): index_map = IndexMap.from_func(lambda i: [i // 4, i % 4]) inp = np.arange(16).astype("int8") - out = index_map.map_ndarray(tvm.nd.array(inp)).numpy() + out = index_map.map_tensor(tvm.runtime.tensor(inp)).numpy() ref = np.zeros(out.shape).astype("int8") @@ -232,7 +232,7 @@ def test_map_ndarray(): inp = np.random.randn(10, 10, 10, 10).astype("float16") - out = index_map.map_ndarray(tvm.nd.array(inp)).numpy() + out = index_map.map_tensor(tvm.runtime.tensor(inp)).numpy() ref = np.transpose(inp, (3, 0, 1, 2)) @@ -254,8 +254,8 @@ def test_map_ndarray(): I = 64 O = 64 inp = np.random.randn(kH, kW, I, O).astype("float32") - arr = tvm.nd.array(inp) - out = index_map.map_ndarray(arr).numpy() + arr = tvm.runtime.tensor(inp) + out = index_map.map_tensor(arr).numpy() ref = np.zeros(out.shape).astype("float32") @@ -269,7 +269,7 @@ def test_map_ndarray(): np.testing.assert_equal(ref, out) inverse_map = index_map.inverse(inp.shape) - np.testing.assert_equal(inverse_map.map_ndarray(index_map.map_ndarray(arr)).numpy(), inp) + np.testing.assert_equal(inverse_map.map_tensor(index_map.map_tensor(arr)).numpy(), inp) if __name__ == "__main__": diff --git a/tests/python/tir-base/test_tir_intrin.py b/tests/python/tir-base/test_tir_intrin.py index 55f8dbed6c3c..1492816429d0 100644 --- a/tests/python/tir-base/test_tir_intrin.py +++ b/tests/python/tir-base/test_tir_intrin.py @@ -42,8 +42,8 @@ def test_nearbyint(): dev = tvm.cpu(0) n = 10 - a = tvm.nd.array(np.random.uniform(high=100, size=n).astype(A.dtype), dev) - a_rounded = tvm.nd.array(np.random.uniform(size=n).astype(A_rounded.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(high=100, size=n).astype(A.dtype), dev) + a_rounded = tvm.runtime.tensor(np.random.uniform(size=n).astype(A_rounded.dtype), dev) func(a, a_rounded) # Note that numpys rint rounds to nearest integer with # ties to halfway is broken by rounding to even. 
@@ -97,8 +97,8 @@ def run_test(tvm_intrin, np_func, atol=1e-5, rtol=1e-5): dev = tvm.cpu(0) n = 10 - a = tvm.nd.array(np.random.uniform(0.1, 0.5, size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(0.1, 0.5, size=n).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.random.uniform(size=n).astype(A.dtype), dev) func(a, b) tvm.testing.assert_allclose(b.numpy(), np_func(a.numpy()), atol=atol, rtol=rtol) @@ -113,8 +113,8 @@ def run_test(tvm_intrin, np_func, atol=1e-5, rtol=1e-5): np.random.uniform(-2.0, -1.1, size=n // 2), ] ).astype(A.dtype) - a2 = tvm.nd.array(out_np, dev) - b2 = tvm.nd.array(np.empty_like(out_np), dev) + a2 = tvm.runtime.tensor(out_np, dev) + b2 = tvm.runtime.tensor(np.empty_like(out_np), dev) func(a2, b2) # all outputs should be NaN assert np.all(np.isnan(b2.numpy())) @@ -149,9 +149,9 @@ def run_test(tvm_intrin, np_func): dev = tvm.cpu(0) n = 10 - a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(B.dtype), dev) - c = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(0, 1, size=n).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.random.uniform(0, 1, size=n).astype(B.dtype), dev) + c = tvm.runtime.tensor(np.random.uniform(size=n).astype(A.dtype), dev) func(a, b, c) tvm.testing.assert_allclose(c.numpy(), np_func(a.numpy(), b.numpy()), atol=1e-5, rtol=1e-5) @@ -176,9 +176,9 @@ def test_ldexp(): dev = tvm.cpu(0) n = 10 - a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.randint(0, 5, size=n).astype(B.dtype), dev) - c = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + a = tvm.runtime.tensor(np.random.uniform(0, 1, size=n).astype(A.dtype), dev) + b = tvm.runtime.tensor(np.random.randint(0, 5, size=n).astype(B.dtype), dev) + c = tvm.runtime.tensor(np.random.uniform(size=n).astype(A.dtype), dev) func(a, b, c) tvm.testing.assert_allclose(c.numpy(), np.ldexp(a.numpy(), b.numpy()), atol=1e-5, rtol=1e-5) @@ -230,8 +230,8 @@ def clz_np(x, dtype): for high in highs: a_np = np.random.randint(1, high=high, size=(n,), dtype=dtype) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(np.zeros((n,)).astype("int32"), dev) + a = tvm.runtime.tensor(a_np, dev) + b = tvm.runtime.tensor(np.zeros((n,)).astype("int32"), dev) func(a, b) ref = clz_np(a_np, dtype) np.testing.assert_equal(b.numpy(), ref) diff --git a/tests/python/tir-base/test_tir_ptx_cp_async.py b/tests/python/tir-base/test_tir_ptx_cp_async.py index d5c029c10138..f3255bd257c6 100644 --- a/tests/python/tir-base/test_tir_ptx_cp_async.py +++ b/tests/python/tir-base/test_tir_ptx_cp_async.py @@ -55,8 +55,8 @@ def test_ptx_cp_async(): A_np = np.random.rand(32, 128).astype("float16") B_np = np.zeros((32, 128)).astype("float16") dev = tvm.cuda(0) - A_nd = tvm.nd.array(A_np, device=dev) - B_nd = tvm.nd.array(B_np, device=dev) + A_nd = tvm.runtime.tensor(A_np, device=dev) + B_nd = tvm.runtime.tensor(B_np, device=dev) mod(A_nd, B_nd) tvm.testing.assert_allclose(B_nd.numpy(), A_np) @@ -102,8 +102,8 @@ def test_ptx_cp_async_barrier(): A_np = np.random.rand(32, 128).astype("float16") B_np = np.zeros((32, 128)).astype("float16") dev = tvm.cuda(0) - A_nd = tvm.nd.array(A_np, device=dev) - B_nd = tvm.nd.array(B_np, device=dev) + A_nd = tvm.runtime.tensor(A_np, device=dev) + B_nd = tvm.runtime.tensor(B_np, device=dev) mod(A_nd, B_nd) tvm.testing.assert_allclose(B_nd.numpy(), 
A_np) @@ -143,8 +143,8 @@ def test_ptx_cp_async_bulk(): A_np = np.random.rand(32, 128).astype("float16") B_np = np.zeros((32, 128)).astype("float16") dev = tvm.cuda(0) - A_nd = tvm.nd.array(A_np, device=dev) - B_nd = tvm.nd.array(B_np, device=dev) + A_nd = tvm.runtime.tensor(A_np, device=dev) + B_nd = tvm.runtime.tensor(B_np, device=dev) mod(A_nd, B_nd) tvm.testing.assert_allclose(B_nd.numpy(), A_np) diff --git a/tests/python/tir-base/test_tir_ptx_ldmatrix.py b/tests/python/tir-base/test_tir_ptx_ldmatrix.py index 346f9c393fcd..8d4ed399b2e8 100644 --- a/tests/python/tir-base/test_tir_ptx_ldmatrix.py +++ b/tests/python/tir-base/test_tir_ptx_ldmatrix.py @@ -87,8 +87,8 @@ def test_ptx_ldmatrix(): A_mask_np[:16, :16] = A_np[:16, :16] B_np = np.zeros((16, 16)).astype("float16") dev = tvm.cuda(0) - A_nd = tvm.nd.array(A_np, device=dev) - B_nd = tvm.nd.array(B_np, device=dev) + A_nd = tvm.runtime.tensor(A_np, device=dev) + B_nd = tvm.runtime.tensor(B_np, device=dev) mod(A_nd, B_nd) tvm.testing.assert_allclose(B_nd.numpy(), A_mask_np) diff --git a/tests/python/tir-base/test_tir_ptx_mma.py b/tests/python/tir-base/test_tir_ptx_mma.py index 8f221d95da32..ad38348efdb4 100644 --- a/tests/python/tir-base/test_tir_ptx_mma.py +++ b/tests/python/tir-base/test_tir_ptx_mma.py @@ -74,9 +74,9 @@ def test_gemm_mma_m8n8k4_row_col_fp64pf64fp64(): C_np = np.zeros([8, 8]).astype("float64") ctx = tvm.cuda() - A_tvm = tvm.nd.array(A_np, ctx) - B_tvm = tvm.nd.array(B_np, ctx) - C_tvm = tvm.nd.array(C_np, ctx) + A_tvm = tvm.runtime.tensor(A_np, ctx) + B_tvm = tvm.runtime.tensor(B_np, ctx) + C_tvm = tvm.runtime.tensor(C_np, ctx) cuda_mod(A_tvm, B_tvm, C_tvm) @@ -150,9 +150,9 @@ def test_gemm_mma_m8n8k4_row_row_fp16fp16fp16(): C_np = np.zeros([16, 16]).astype("float16") ctx = tvm.cuda() - A_tvm = tvm.nd.array(A_np, ctx) - B_tvm = tvm.nd.array(B_np, ctx) - C_tvm = tvm.nd.array(C_np, ctx) + A_tvm = tvm.runtime.tensor(A_np, ctx) + B_tvm = tvm.runtime.tensor(B_np, ctx) + C_tvm = tvm.runtime.tensor(C_np, ctx) cuda_mod(A_tvm, B_tvm, C_tvm) @@ -233,9 +233,9 @@ def test_gemm_mma_m8n8k4_row_row_fp16fp16fp32(): C_np = np.zeros([16, 16]).astype("float32") ctx = tvm.cuda() - A_tvm = tvm.nd.array(A_np, ctx) - B_tvm = tvm.nd.array(B_np, ctx) - C_tvm = tvm.nd.array(C_np, ctx) + A_tvm = tvm.runtime.tensor(A_np, ctx) + B_tvm = tvm.runtime.tensor(B_np, ctx) + C_tvm = tvm.runtime.tensor(C_np, ctx) cuda_mod(A_tvm, B_tvm, C_tvm) @@ -304,9 +304,9 @@ def test_gemm_mma_m8n8k16_row_col_s8s8s32(): C_np = np.zeros([8, 8]).astype("int32") ctx = tvm.cuda() - A_tvm = tvm.nd.array(A_np, ctx) - B_tvm = tvm.nd.array(B_np, ctx) - C_tvm = tvm.nd.array(C_np, ctx) + A_tvm = tvm.runtime.tensor(A_np, ctx) + B_tvm = tvm.runtime.tensor(B_np, ctx) + C_tvm = tvm.runtime.tensor(C_np, ctx) cuda_mod(A_tvm, B_tvm, C_tvm) @@ -375,9 +375,9 @@ def test_gemm_mma_m8n8k16_row_col_s8u8s32(): C_np = np.zeros([8, 8]).astype("int32") ctx = tvm.cuda() - A_tvm = tvm.nd.array(A_np, ctx) - B_tvm = tvm.nd.array(B_np, ctx) - C_tvm = tvm.nd.array(C_np, ctx) + A_tvm = tvm.runtime.tensor(A_np, ctx) + B_tvm = tvm.runtime.tensor(B_np, ctx) + C_tvm = tvm.runtime.tensor(C_np, ctx) cuda_mod(A_tvm, B_tvm, C_tvm) @@ -442,9 +442,9 @@ def test_gemm_mma_m8n8k32_row_col_s4s4s32(): cuda_mod = tvm.compile(sch.mod, target="cuda") ctx = tvm.cuda() - A_tvm = tvm.nd.empty([8, 32], "int4", ctx) - B_tvm = tvm.nd.empty([8, 32], "int4", ctx) - C_tvm = tvm.nd.empty([8, 8], "int32", ctx) + A_tvm = tvm.runtime.empty([8, 32], "int4", ctx) + B_tvm = tvm.runtime.empty([8, 32], "int4", ctx) + C_tvm = 
tvm.runtime.empty([8, 8], "int32", ctx) cuda_mod(A_tvm, B_tvm, C_tvm) # Currently the correctness is not checked. @@ -505,9 +505,9 @@ def test_gemm_mma_m8n8k32_row_col_s4u4s32(): cuda_mod = tvm.compile(sch.mod, target="cuda") ctx = tvm.cuda() - A_tvm = tvm.nd.empty([8, 32], "int4", ctx) - B_tvm = tvm.nd.empty([8, 32], "uint4", ctx) - C_tvm = tvm.nd.empty([8, 8], "int32", ctx) + A_tvm = tvm.runtime.empty([8, 32], "int4", ctx) + B_tvm = tvm.runtime.empty([8, 32], "uint4", ctx) + C_tvm = tvm.runtime.empty([8, 8], "int32", ctx) cuda_mod(A_tvm, B_tvm, C_tvm) # Currently the correctness is not checked. @@ -574,9 +574,9 @@ def test_gemm_mma_m16n8k8_row_col_fp16fp16fp32(): C_np = np.zeros([16, 8]).astype("float32") ctx = tvm.cuda() - A_tvm = tvm.nd.array(A_np, ctx) - B_tvm = tvm.nd.array(B_np, ctx) - C_tvm = tvm.nd.array(C_np, ctx) + A_tvm = tvm.runtime.tensor(A_np, ctx) + B_tvm = tvm.runtime.tensor(B_np, ctx) + C_tvm = tvm.runtime.tensor(C_np, ctx) cuda_mod(A_tvm, B_tvm, C_tvm) @@ -650,9 +650,9 @@ def test_gemm_mma_m16n8k16_row_col_fp16fp16fp16(): C_np = np.zeros([16, 8]).astype("float16") ctx = tvm.cuda() - A_tvm = tvm.nd.array(A_np, ctx) - B_tvm = tvm.nd.array(B_np, ctx) - C_tvm = tvm.nd.array(C_np, ctx) + A_tvm = tvm.runtime.tensor(A_np, ctx) + B_tvm = tvm.runtime.tensor(B_np, ctx) + C_tvm = tvm.runtime.tensor(C_np, ctx) cuda_mod(A_tvm, B_tvm, C_tvm) @@ -726,9 +726,9 @@ def test_gemm_mma_m16n8k16_row_col_fp16fp16fp32(): C_np = np.zeros([16, 8]).astype("float32") ctx = tvm.cuda() - A_tvm = tvm.nd.array(A_np, ctx) - B_tvm = tvm.nd.array(B_np, ctx) - C_tvm = tvm.nd.array(C_np, ctx) + A_tvm = tvm.runtime.tensor(A_np, ctx) + B_tvm = tvm.runtime.tensor(B_np, ctx) + C_tvm = tvm.runtime.tensor(C_np, ctx) cuda_mod(A_tvm, B_tvm, C_tvm) @@ -802,9 +802,9 @@ def test_gemm_mma_m16n8k16_row_col_s8s8s32(): C_np = np.zeros([16, 8]).astype("int32") ctx = tvm.cuda() - A_tvm = tvm.nd.array(A_np, ctx) - B_tvm = tvm.nd.array(B_np, ctx) - C_tvm = tvm.nd.array(C_np, ctx) + A_tvm = tvm.runtime.tensor(A_np, ctx) + B_tvm = tvm.runtime.tensor(B_np, ctx) + C_tvm = tvm.runtime.tensor(C_np, ctx) cuda_mod(A_tvm, B_tvm, C_tvm) @@ -878,9 +878,9 @@ def test_gemm_mma_m16n8k16_row_col_s8u8s32(): C_np = np.zeros([16, 8]).astype("int32") ctx = tvm.cuda() - A_tvm = tvm.nd.array(A_np, ctx) - B_tvm = tvm.nd.array(B_np, ctx) - C_tvm = tvm.nd.array(C_np, ctx) + A_tvm = tvm.runtime.tensor(A_np, ctx) + B_tvm = tvm.runtime.tensor(B_np, ctx) + C_tvm = tvm.runtime.tensor(C_np, ctx) cuda_mod(A_tvm, B_tvm, C_tvm) @@ -954,9 +954,9 @@ def test_gemm_mma_m16n8k32_row_col_s8s8s32(): C_np = np.zeros([16, 8]).astype("int32") ctx = tvm.cuda() - A_tvm = tvm.nd.array(A_np, ctx) - B_tvm = tvm.nd.array(B_np, ctx) - C_tvm = tvm.nd.array(C_np, ctx) + A_tvm = tvm.runtime.tensor(A_np, ctx) + B_tvm = tvm.runtime.tensor(B_np, ctx) + C_tvm = tvm.runtime.tensor(C_np, ctx) cuda_mod(A_tvm, B_tvm, C_tvm) @@ -1030,9 +1030,9 @@ def test_gemm_mma_m16n8k32_row_col_s8u8s32(): C_np = np.zeros([16, 8]).astype("int32") ctx = tvm.cuda() - A_tvm = tvm.nd.array(A_np, ctx) - B_tvm = tvm.nd.array(B_np, ctx) - C_tvm = tvm.nd.array(C_np, ctx) + A_tvm = tvm.runtime.tensor(A_np, ctx) + B_tvm = tvm.runtime.tensor(B_np, ctx) + C_tvm = tvm.runtime.tensor(C_np, ctx) cuda_mod(A_tvm, B_tvm, C_tvm) @@ -1102,9 +1102,9 @@ def test_gemm_mma_m16n8k64_row_col_s4s4s32(): cuda_mod = tvm.compile(sch.mod, target="cuda") ctx = tvm.cuda() - A_tvm = tvm.nd.empty([16, 64], "int4", ctx) - B_tvm = tvm.nd.empty([8, 64], "int4", ctx) - C_tvm = tvm.nd.empty([16, 8], "int32", ctx) + A_tvm = 
tvm.runtime.empty([16, 64], "int4", ctx) + B_tvm = tvm.runtime.empty([8, 64], "int4", ctx) + C_tvm = tvm.runtime.empty([16, 8], "int32", ctx) cuda_mod(A_tvm, B_tvm, C_tvm) # Currently the correctness is not checked. @@ -1170,9 +1170,9 @@ def test_gemm_mma_m16n8k64_row_col_s4u4s32(): cuda_mod = tvm.compile(sch.mod, target="cuda") ctx = tvm.cuda() - A_tvm = tvm.nd.empty([16, 64], "int4", ctx) - B_tvm = tvm.nd.empty([8, 64], "uint4", ctx) - C_tvm = tvm.nd.empty([16, 8], "int32", ctx) + A_tvm = tvm.runtime.empty([16, 64], "int4", ctx) + B_tvm = tvm.runtime.empty([8, 64], "uint4", ctx) + C_tvm = tvm.runtime.empty([16, 8], "int32", ctx) cuda_mod(A_tvm, B_tvm, C_tvm) # Currently the correctness is not checked. @@ -1239,9 +1239,9 @@ def test_gemm_mma_m16n8k256_row_col_b1b1s32(): cuda_mod = tvm.compile(sch.mod, target="cuda") ctx = tvm.cuda() - A_tvm = tvm.nd.empty([16, 256], "int1", ctx) - B_tvm = tvm.nd.empty([8, 256], "int1", ctx) - C_tvm = tvm.nd.empty([16, 8], "int32", ctx) + A_tvm = tvm.runtime.empty([16, 256], "int1", ctx) + B_tvm = tvm.runtime.empty([8, 256], "int1", ctx) + C_tvm = tvm.runtime.empty([16, 8], "int32", ctx) cuda_mod(A_tvm, B_tvm, C_tvm) # Currently the correctness is not checked. diff --git a/tests/python/tir-base/test_tir_ptx_mma_sp.py b/tests/python/tir-base/test_tir_ptx_mma_sp.py index d5c6c9a03b45..fef373799b2b 100644 --- a/tests/python/tir-base/test_tir_ptx_mma_sp.py +++ b/tests/python/tir-base/test_tir_ptx_mma_sp.py @@ -283,10 +283,10 @@ def get_meta_m16n8k16_half(mask): meta = get_meta_m16n8k16_half(mask) ctx = tvm.cuda() - A_tvm = tvm.nd.array(A_np, ctx) - B_tvm = tvm.nd.array(B_np, ctx) - C_tvm = tvm.nd.array(np.zeros_like(C_np), ctx) - meta_tvm = tvm.nd.array(meta, ctx) + A_tvm = tvm.runtime.tensor(A_np, ctx) + B_tvm = tvm.runtime.tensor(B_np, ctx) + C_tvm = tvm.runtime.tensor(np.zeros_like(C_np), ctx) + meta_tvm = tvm.runtime.tensor(meta, ctx) cuda_mod(A_tvm, B_tvm, C_tvm, meta_tvm) tvm.testing.assert_allclose(C_tvm.numpy(), C_np, atol=1e-3, rtol=1e-3) @@ -322,10 +322,10 @@ def get_meta_m16n8k32_half(mask): meta = get_meta_m16n8k32_half(mask) ctx = tvm.cuda() - A_tvm = tvm.nd.array(A_np, ctx) - B_tvm = tvm.nd.array(B_np, ctx) - C_tvm = tvm.nd.array(np.zeros_like(C_np), ctx) - meta_tvm = tvm.nd.array(meta, ctx) + A_tvm = tvm.runtime.tensor(A_np, ctx) + B_tvm = tvm.runtime.tensor(B_np, ctx) + C_tvm = tvm.runtime.tensor(np.zeros_like(C_np), ctx) + meta_tvm = tvm.runtime.tensor(meta, ctx) cuda_mod(A_tvm, B_tvm, C_tvm, meta_tvm) tvm.testing.assert_allclose(C_tvm.numpy(), C_np, atol=1e-3, rtol=1e-3) diff --git a/tests/python/tir-base/test_tir_structural_equal_hash.py b/tests/python/tir-base/test_tir_structural_equal_hash.py index 5e7c49ac14b9..559d705b6267 100644 --- a/tests/python/tir-base/test_tir_structural_equal_hash.py +++ b/tests/python/tir-base/test_tir_structural_equal_hash.py @@ -120,8 +120,8 @@ def test_prim_func(): func1 = tvm.ir.load_json(tvm.ir.save_json(func0)) tvm.ir.assert_structural_equal(func0, func1) - data0 = tvm.nd.array([1, 2, 3]) - data1 = tvm.nd.array([1, 2, 3]) + data0 = tvm.runtime.tensor([1, 2, 3]) + data1 = tvm.runtime.tensor([1, 2, 3]) # attributes and ndarrays func0 = func0.with_attr("data", data0) func1 = func1.with_attr("data", data1) @@ -174,9 +174,9 @@ def test_prim_func_body_mismatch(): def test_array(): x = np.arange(10) - nx = tvm.nd.array(x) - ny = tvm.nd.array(x) - nz = tvm.nd.array(x.reshape(2, 5)) + nx = tvm.runtime.tensor(x) + ny = tvm.runtime.tensor(x) + nz = tvm.runtime.tensor(x.reshape(2, 5)) assert consistent_equal(nx, ny) 
assert not consistent_equal(nx, nz) diff --git a/tests/python/tir-base/test_tir_te_extern_primfunc.py b/tests/python/tir-base/test_tir_te_extern_primfunc.py index 9c375481fe45..1408597fa22e 100644 --- a/tests/python/tir-base/test_tir_te_extern_primfunc.py +++ b/tests/python/tir-base/test_tir_te_extern_primfunc.py @@ -48,8 +48,8 @@ def func_1(A: T.Buffer((16,), "float32"), C: T.Buffer((1,), "float32")): def verify_func_1(module): a_np = np.random.randint(low=-128, high=127, size=(16,)).astype(np.float32) c_np = np.zeros((1,), dtype=np.float32) - a = tvm.nd.array(a_np, device=tvm.cpu(0)) - c = tvm.nd.array(c_np, device=tvm.cpu(0)) + a = tvm.runtime.tensor(a_np, device=tvm.cpu(0)) + c = tvm.runtime.tensor(c_np, device=tvm.cpu(0)) module(a, c) tvm.testing.assert_allclose(c_np + np.sum(3 * a_np + 1), c.numpy(), rtol=1e-4) @@ -78,9 +78,9 @@ def verify_func_2(module): a_np = np.random.randint(low=-128, high=127, size=(16,)).astype(np.float32) d_np = np.random.randint(low=-128, high=127, size=(2,)).astype(np.float32) c_np = np.zeros((1,), dtype=np.float32) - a = tvm.nd.array(a_np, device=tvm.cpu(0)) - d = tvm.nd.array(d_np, device=tvm.cpu(0)) - c = tvm.nd.array(c_np, device=tvm.cpu(0)) + a = tvm.runtime.tensor(a_np, device=tvm.cpu(0)) + d = tvm.runtime.tensor(d_np, device=tvm.cpu(0)) + c = tvm.runtime.tensor(c_np, device=tvm.cpu(0)) module(c, a, d) tvm.testing.assert_allclose(c_np + np.sum(3 * a_np + 1 + d_np[0]), c.numpy(), rtol=1e-4) @@ -116,11 +116,11 @@ def verify_func_3(module): c_np = np.zeros((1,), dtype=np.float32) e_np = np.zeros((16,), dtype=np.float32) f_np = np.zeros((16,), dtype=np.float32) - a = tvm.nd.array(a_np, device=tvm.cpu(0)) - d = tvm.nd.array(d_np, device=tvm.cpu(0)) - c = tvm.nd.array(c_np, device=tvm.cpu(0)) - e = tvm.nd.array(e_np, device=tvm.cpu(0)) - f = tvm.nd.array(f_np, device=tvm.cpu(0)) + a = tvm.runtime.tensor(a_np, device=tvm.cpu(0)) + d = tvm.runtime.tensor(d_np, device=tvm.cpu(0)) + c = tvm.runtime.tensor(c_np, device=tvm.cpu(0)) + e = tvm.runtime.tensor(e_np, device=tvm.cpu(0)) + f = tvm.runtime.tensor(f_np, device=tvm.cpu(0)) module(c, a, d, e, f) tvm.testing.assert_allclose(c_np + np.sum(3 * a_np + 1 + d_np[0]), c.numpy(), rtol=1e-4) @@ -158,11 +158,11 @@ def verify_func_4(module): c_np = np.zeros((1,), dtype=np.float32) e_np = np.zeros((16,), dtype=np.float32) f_np = np.zeros((16,), dtype=np.float32) - a = tvm.nd.array(a_np, device=tvm.cpu(0)) - d = tvm.nd.array(d_np, device=tvm.cpu(0)) - c = tvm.nd.array(c_np, device=tvm.cpu(0)) - e = tvm.nd.array(e_np, device=tvm.cpu(0)) - f = tvm.nd.array(f_np, device=tvm.cpu(0)) + a = tvm.runtime.tensor(a_np, device=tvm.cpu(0)) + d = tvm.runtime.tensor(d_np, device=tvm.cpu(0)) + c = tvm.runtime.tensor(c_np, device=tvm.cpu(0)) + e = tvm.runtime.tensor(e_np, device=tvm.cpu(0)) + f = tvm.runtime.tensor(f_np, device=tvm.cpu(0)) module(c, a, f, d, e) tvm.testing.assert_allclose(c_np + np.sum(3 * a_np + 1 + d_np[0]), c.numpy(), rtol=1e-4) diff --git a/tests/python/tir-schedule/test_tir_schedule_decompose_padding.py b/tests/python/tir-schedule/test_tir_schedule_decompose_padding.py index c8679843dda6..882a5b72cefa 100644 --- a/tests/python/tir-schedule/test_tir_schedule_decompose_padding.py +++ b/tests/python/tir-schedule/test_tir_schedule_decompose_padding.py @@ -32,9 +32,9 @@ def check_decompose_padding(origin, scheduled, expected, check_run=False): out_buffer = origin.buffer_map[origin.params[1]] in_shape = [int(_) for _ in in_buffer.shape] out_shape = [int(_) for _ in out_buffer.shape] - x = 
tvm.nd.array(np.random.uniform(0, 64, in_shape).astype(in_buffer.dtype)) - y0 = tvm.nd.array(np.zeros(out_shape).astype(out_buffer.dtype)) - y1 = tvm.nd.array(np.zeros(out_shape).astype(out_buffer.dtype)) + x = tvm.runtime.tensor(np.random.uniform(0, 64, in_shape).astype(in_buffer.dtype)) + y0 = tvm.runtime.tensor(np.zeros(out_shape).astype(out_buffer.dtype)) + y1 = tvm.runtime.tensor(np.zeros(out_shape).astype(out_buffer.dtype)) f_origin = tvm.compile(origin) f_scheduled = tvm.compile(scheduled) f_origin(x, y0) diff --git a/tests/python/tir-schedule/test_tir_schedule_rolling_buffer.py b/tests/python/tir-schedule/test_tir_schedule_rolling_buffer.py index 0ea51aaf83aa..6fdd830120ec 100644 --- a/tests/python/tir-schedule/test_tir_schedule_rolling_buffer.py +++ b/tests/python/tir-schedule/test_tir_schedule_rolling_buffer.py @@ -38,9 +38,9 @@ def check_rolling_buffer( out_buffer = origin.buffer_map[origin.params[1]] in_shape = [int(_) for _ in in_buffer.shape] out_shape = [int(_) for _ in out_buffer.shape] - x = tvm.nd.array(np.random.uniform(0, 64, in_shape).astype(in_buffer.dtype)) - y0 = tvm.nd.array(np.zeros(out_shape).astype(out_buffer.dtype)) - y1 = tvm.nd.array(np.zeros(out_shape).astype(out_buffer.dtype)) + x = tvm.runtime.tensor(np.random.uniform(0, 64, in_shape).astype(in_buffer.dtype)) + y0 = tvm.runtime.tensor(np.zeros(out_shape).astype(out_buffer.dtype)) + y1 = tvm.runtime.tensor(np.zeros(out_shape).astype(out_buffer.dtype)) f_origin = tvm.compile(origin) f_scheduled = tvm.compile(scheduled) f_origin(x, y0) diff --git a/tests/python/tir-schedule/test_tir_schedule_tensorize_ldmatrix_mma_numeric.py b/tests/python/tir-schedule/test_tir_schedule_tensorize_ldmatrix_mma_numeric.py index d5646f60fb7a..203bf0fea222 100644 --- a/tests/python/tir-schedule/test_tir_schedule_tensorize_ldmatrix_mma_numeric.py +++ b/tests/python/tir-schedule/test_tir_schedule_tensorize_ldmatrix_mma_numeric.py @@ -169,9 +169,9 @@ def run_test( b_np = np.random.randint(-128, 128, (K, N)).astype("int8") c_np = np.dot(a_np.astype("float32"), b_np.astype("float32")).astype("int32") - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros((M, N), dtype=out_dtype), dev) + a = tvm.runtime.tensor(a_np, dev) + b = tvm.runtime.tensor(b_np, dev) + c = tvm.runtime.tensor(np.zeros((M, N), dtype=out_dtype), dev) f(a, b, c) diff --git a/tests/python/tir-schedule/test_tir_schedule_tensorize_mfma_numeric.py b/tests/python/tir-schedule/test_tir_schedule_tensorize_mfma_numeric.py index c8edaf30fca9..f98c10c8b9e6 100644 --- a/tests/python/tir-schedule/test_tir_schedule_tensorize_mfma_numeric.py +++ b/tests/python/tir-schedule/test_tir_schedule_tensorize_mfma_numeric.py @@ -146,9 +146,9 @@ def run_test( b_np = np.random.randint(-128, 128, (K, N)).astype("int8") c_np = np.dot(a_np.astype("float32"), b_np.astype("float32")).astype("int32") - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros((M, N), dtype=out_dtype), dev) + a = tvm.runtime.tensor(a_np, dev) + b = tvm.runtime.tensor(b_np, dev) + c = tvm.runtime.tensor(np.zeros((M, N), dtype=out_dtype), dev) f(a, b, c) diff --git a/tests/python/tir-transform/test_tir_transform_inject_ptx_async_copy.py b/tests/python/tir-transform/test_tir_transform_inject_ptx_async_copy.py index 0fac2177f7f1..840c83452ed5 100644 --- a/tests/python/tir-transform/test_tir_transform_inject_ptx_async_copy.py +++ b/tests/python/tir-transform/test_tir_transform_inject_ptx_async_copy.py @@ -148,8 +148,8 @@ def test_inject_async_copy(): A_np = 
np.random.rand(32, 128).astype(dtype) B_np = np.zeros((32, 128)).astype(dtype) dev = tvm.cuda(0) - A_nd = tvm.nd.array(A_np, device=dev) - B_nd = tvm.nd.array(B_np, device=dev) + A_nd = tvm.runtime.tensor(A_np, device=dev) + B_nd = tvm.runtime.tensor(B_np, device=dev) mod(A_nd, B_nd) tvm.testing.assert_allclose(B_nd.numpy(), A_np) @@ -177,9 +177,9 @@ def test_inject_async_copy_shared_dyn(): B_np = np.random.rand(32, 128).astype("float16") C_np = np.zeros((32, 128)).astype("float16") dev = tvm.cuda(0) - A_nd = tvm.nd.array(A_np, device=dev) - B_nd = tvm.nd.array(B_np, device=dev) - C_nd = tvm.nd.array(C_np, device=dev) + A_nd = tvm.runtime.tensor(A_np, device=dev) + B_nd = tvm.runtime.tensor(B_np, device=dev) + C_nd = tvm.runtime.tensor(C_np, device=dev) mod(A_nd, B_nd, C_nd) tvm.testing.assert_allclose(C_nd.numpy(), A_np + B_np) @@ -234,8 +234,8 @@ def test_inject_async_copy_barrier(): A_np = np.random.rand(32, 128).astype(dtype) B_np = np.zeros((32, 128)).astype(dtype) dev = tvm.cuda(0) - A_nd = tvm.nd.array(A_np, device=dev) - B_nd = tvm.nd.array(B_np, device=dev) + A_nd = tvm.runtime.tensor(A_np, device=dev) + B_nd = tvm.runtime.tensor(B_np, device=dev) mod(A_nd, B_nd) tvm.testing.assert_allclose(B_nd.numpy(), A_np) diff --git a/tests/python/tir-transform/test_tir_transform_inject_software_pipeline.py b/tests/python/tir-transform/test_tir_transform_inject_software_pipeline.py index c4f2756251c5..697887dc8cbb 100644 --- a/tests/python/tir-transform/test_tir_transform_inject_software_pipeline.py +++ b/tests/python/tir-transform/test_tir_transform_inject_software_pipeline.py @@ -1538,9 +1538,9 @@ def build_and_run(sch): a_np = np.random.uniform(size=(N, K)).astype("float16") b_np = np.random.uniform(size=(K, M)).astype("float16") c_np = np.dot(a_np.astype("float32"), b_np.astype("float32")) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros((N, M), dtype="float32"), dev) + a = tvm.runtime.tensor(a_np, dev) + b = tvm.runtime.tensor(b_np, dev) + c = tvm.runtime.tensor(np.zeros((N, M), dtype="float32"), dev) f(a, b, c) tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-3) diff --git a/tests/python/tir-transform/test_tir_transform_lower_intrin.py b/tests/python/tir-transform/test_tir_transform_lower_intrin.py index f31cf559764d..864b24bc0f51 100644 --- a/tests/python/tir-transform/test_tir_transform_lower_intrin.py +++ b/tests/python/tir-transform/test_tir_transform_lower_intrin.py @@ -48,9 +48,9 @@ def make_binds(i): C = te.compute((n,), make_binds) f = tvm.compile(te.create_prim_func([A, B, C]), "llvm") - a = tvm.nd.array(np.array([x for x, y in data], dtype=expr.dtype)) - b = tvm.nd.array(np.array([y for x, y in data], dtype=expr.dtype)) - c = tvm.nd.array(np.zeros(len(data), dtype=expr.dtype)) + a = tvm.runtime.tensor(np.array([x for x, y in data], dtype=expr.dtype)) + b = tvm.runtime.tensor(np.array([y for x, y in data], dtype=expr.dtype)) + c = tvm.runtime.tensor(np.zeros(len(data), dtype=expr.dtype)) f(a, b, c) cref = np.array([fref(x, y) for x, y in data]) np.testing.assert_equal(c.numpy(), cref) diff --git a/tests/python/tir-transform/test_tir_transform_lower_tvm_builtin.py b/tests/python/tir-transform/test_tir_transform_lower_tvm_builtin.py index 08f377829f1e..0f71b78f0ca1 100644 --- a/tests/python/tir-transform/test_tir_transform_lower_tvm_builtin.py +++ b/tests/python/tir-transform/test_tir_transform_lower_tvm_builtin.py @@ -143,7 +143,7 @@ def build_tir(): mod = build_tir() f = tvm.compile(mod, None) - a = tvm.nd.array(np.zeros(2, 
dtype="float32")) + a = tvm.runtime.tensor(np.zeros(2, dtype="float32")) f(a) tvm.testing.assert_allclose(a.numpy(), expected_value) diff --git a/tests/python/tir-transform/test_tir_transform_make_packed_api.py b/tests/python/tir-transform/test_tir_transform_make_packed_api.py index 4fecafef1d15..723584ff5576 100644 --- a/tests/python/tir-transform/test_tir_transform_make_packed_api.py +++ b/tests/python/tir-transform/test_tir_transform_make_packed_api.py @@ -214,8 +214,8 @@ def func(A: T.Buffer([16, 16], "int32"), B: T.Buffer([16, 16], "int32")): built = tvm.compile(func, target="llvm") - A = tvm.nd.array(np.zeros([16], dtype="int32")) - B = tvm.nd.empty([16, 16], "int32", tvm.cpu()) + A = tvm.runtime.tensor(np.zeros([16], dtype="int32")) + B = tvm.runtime.empty([16, 16], "int32", tvm.cpu()) with pytest.raises(tvm.TVMError): built(A, B) @@ -231,8 +231,8 @@ def func(A: T.Buffer([16, 16], "int32"), B: T.Buffer([16, 16], "int32")): built = tvm.compile(func, target="llvm") - A = tvm.nd.array(np.zeros([16], dtype="int32")) - B = tvm.nd.empty([16], "int32", tvm.cpu()) + A = tvm.runtime.tensor(np.zeros([16], dtype="int32")) + B = tvm.runtime.empty([16], "int32", tvm.cpu()) with pytest.raises(tvm.TVMError): built(A, B) diff --git a/tests/python/tvmscript/test_tvmscript_ir_builder_tir.py b/tests/python/tvmscript/test_tvmscript_ir_builder_tir.py index 1dece07ed9dd..db6f4ba47f19 100644 --- a/tests/python/tvmscript/test_tvmscript_ir_builder_tir.py +++ b/tests/python/tvmscript/test_tvmscript_ir_builder_tir.py @@ -20,9 +20,9 @@ import pytest import tvm import tvm.testing +import tvm.runtime from tvm import tir from tvm.ir.base import assert_structural_equal -from tvm.runtime import ndarray from tvm.script.ir_builder import IRBuilder from tvm.script.ir_builder import tir as T @@ -388,7 +388,7 @@ def test_ir_builder_tir_allocate_const(): buffer_var, "int32", [10], - ndarray.array(np.asarray(data, "int32")), + tvm.runtime.tensor(np.asarray(data, "int32")), tir.Evaluate(1), annotations={}, ) diff --git a/tests/python/tvmscript/test_tvmscript_ops.py b/tests/python/tvmscript/test_tvmscript_ops.py index 7672b75ec126..0d6beabd7a40 100644 --- a/tests/python/tvmscript/test_tvmscript_ops.py +++ b/tests/python/tvmscript/test_tvmscript_ops.py @@ -81,10 +81,10 @@ def _check_get_valid_counts_with_numpy(f, dshape, score_threshold, id_index, sco np_out2[i, j, k] = -1.0 np_out3[i, j] = -1 - in_data = tvm.nd.array(np_data, ctx) - out1 = tvm.nd.array(np_out1, ctx) - out2 = tvm.nd.array(np_out2, ctx) - out3 = tvm.nd.array(np_out3, ctx) + in_data = tvm.runtime.tensor(np_data, ctx) + out1 = tvm.runtime.tensor(np_out1, ctx) + out2 = tvm.runtime.tensor(np_out2, ctx) + out3 = tvm.runtime.tensor(np_out3, ctx) f(in_data, out1, out2, out3, score_threshold, id_index, score_index) tvm.testing.assert_allclose(out1.numpy(), np_out1, rtol=1e-5) tvm.testing.assert_allclose(out2.numpy(), np_out2, rtol=1e-5) @@ -134,8 +134,8 @@ def _check_alloc_zero_dim_buffer(f): np_data = np.zeros(shape=()).astype(dtype) np_out = np.zeros(shape=()).astype(dtype) - tvm_data = tvm.nd.array(np_data, ctx) - tvm_out = tvm.nd.array(np_out, ctx) + tvm_data = tvm.runtime.tensor(np_data, ctx) + tvm_out = tvm.runtime.tensor(np_out, ctx) # np func exection np_inter = np.array(1) @@ -175,7 +175,7 @@ def ceildiv_test(A: T.Buffer(16, "int32")): @tvm.testing.requires_llvm def test_ceildiv(): f = tvm.compile(ceildiv_test, "llvm") - a = tvm.nd.array(np.arange(16).astype("int32")) + a = tvm.runtime.tensor(np.arange(16).astype("int32")) f(a) ref = (np.arange(16) + 3) // 4 
tvm.testing.assert_allclose(a.numpy(), ref) diff --git a/web/.gitignore b/web/.gitignore index 17d59ed10d4b..a746034d5aa4 100644 --- a/web/.gitignore +++ b/web/.gitignore @@ -4,5 +4,5 @@ out node_modules build debug -.ndarray_cache +.tensor_cache src/tvmjs_runtime_wasi.js diff --git a/web/apps/browser/rpc_server.html b/web/apps/browser/rpc_server.html index 07e6fe87fc95..6bcecfe8661c 100644 --- a/web/apps/browser/rpc_server.html +++ b/web/apps/browser/rpc_server.html @@ -51,12 +51,12 @@ function connectRPC() { const proxyUrl = document.getElementById("proxyUrl").value; const key = document.getElementById("proxyKey").value; - const ndarrayCacheName = document.getElementById("cache-select").value; - let ndarrayCacheUrl = new URL(ndarrayCacheName + "/", document.URL).href; - let ndarrayCacheDevice = document.getElementById("ndarrayCacheDevice").value; + const tensorCacheName = document.getElementById("cache-select").value; + let tensorCacheUrl = new URL(tensorCacheName + "/", document.URL).href; + let tensorCacheDevice = document.getElementById("tensorCacheDevice").value; - if (ndarrayCacheName == "none" || ndarrayCacheName === undefined) { - ndarrayCacheUrl = ""; + if (tensorCacheName == "none" || tensorCacheName === undefined) { + tensorCacheUrl = ""; } // only works for once. @@ -66,7 +66,7 @@ new tvmjs.RPCServer( proxyUrl, key, getImports, customLog, - ndarrayCacheUrl, ndarrayCacheDevice, initProgressCallback, + tensorCacheUrl, tensorCacheDevice, initProgressCallback, tvmjsGlobalEnv.asyncOnRPCServerLoad); } @@ -117,12 +117,12 @@

[rpc_server.html options-panel markup lost in extraction; the hunk renames the "NDArrayCache" label and the ndarrayCacheDevice select to "TensorCache" and tensorCacheDevice]
- NDArrayCache - + TensorCache - CacheDevice - - diff --git a/web/emcc/wasm_runtime.cc b/web/emcc/wasm_runtime.cc index 31f494322684..146a5ae1f7cd 100644 --- a/web/emcc/wasm_runtime.cc +++ b/web/emcc/wasm_runtime.cc @@ -38,7 +38,6 @@ #include "src/runtime/device_api.cc" #include "src/runtime/file_utils.cc" #include "src/runtime/logging.cc" -#include "src/runtime/ndarray.cc" #include "src/runtime/profiling.cc" #include "src/runtime/rpc/rpc_channel.cc" #include "src/runtime/rpc/rpc_endpoint.cc" @@ -46,6 +45,7 @@ #include "src/runtime/rpc/rpc_local_session.cc" #include "src/runtime/rpc/rpc_module.cc" #include "src/runtime/rpc/rpc_session.cc" +#include "src/runtime/tensor.cc" #include "src/runtime/workspace_pool.cc" // relax setup #include "ffi/src/ffi/container.cc" @@ -56,8 +56,8 @@ #include "ffi/src/ffi/extra/module.cc" #include "ffi/src/ffi/extra/testing.cc" #include "ffi/src/ffi/function.cc" -#include "ffi/src/ffi/ndarray.cc" #include "ffi/src/ffi/object.cc" +#include "ffi/src/ffi/tensor.cc" #include "ffi/src/ffi/traceback.cc" #include "src/runtime/memory/memory_manager.cc" #include "src/runtime/nvtx.cc" @@ -67,9 +67,9 @@ #include "src/runtime/vm/executable.cc" #include "src/runtime/vm/kv_state.cc" #include "src/runtime/vm/lm_support.cc" -#include "src/runtime/vm/ndarray_cache_support.cc" #include "src/runtime/vm/paged_kv_cache.cc" #include "src/runtime/vm/rnn_state.cc" +#include "src/runtime/vm/tensor_cache_support.cc" #include "src/runtime/vm/vm.cc" // --- Implementations of backend and wasm runtime API. --- @@ -121,7 +121,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ }); }); -void ArrayDecodeStorage(NDArray cpu_arr, std::string bytes, std::string format, std::string dtype) { +void ArrayDecodeStorage(Tensor cpu_arr, std::string bytes, std::string format, std::string dtype) { if (format == "f32-to-bf16" && dtype == "float32") { std::vector buffer(bytes.length() / 2); std::memcpy(buffer.data(), bytes.data(), buffer.size() * 2); @@ -166,7 +166,7 @@ TVM_FFI_STATIC_INIT_BLOCK({ }); }); -NDArray ConcatEmbeddings(const std::vector& embeddings) { +Tensor ConcatEmbeddings(const std::vector& embeddings) { // Get output shape int64_t hidden_size = embeddings[0]->shape[1]; DLDataType dtype = embeddings[0]->dtype; @@ -182,7 +182,7 @@ NDArray ConcatEmbeddings(const std::vector& embeddings) { std::vector shape; shape.push_back(seqLen); shape.push_back(hidden_size); - NDArray result = NDArray::Empty(shape, dtype, device); + Tensor result = Tensor::Empty(shape, dtype, device); // Copy int offset = 0; @@ -193,29 +193,29 @@ NDArray ConcatEmbeddings(const std::vector& embeddings) { copy_dst.shape = embeddings[i]->shape; copy_dst.byte_offset = offset * hidden_size * ((embeddings[i]->dtype.bits * embeddings[i]->dtype.lanes + 7) / 8); - NDArray::CopyFromTo(©_src, ©_dst); + Tensor::CopyFromTo(©_src, ©_dst); offset += embeddings[i]->shape[0]; } return result; } -// Concatenate n NDArrays +// Concatenate n Tensors TVM_FFI_STATIC_INIT_BLOCK({ namespace refl = tvm::ffi::reflection; refl::GlobalDef() .def_packed("tvmjs.runtime.ConcatEmbeddings", [](ffi::PackedArgs args, ffi::Any* ret) { - std::vector embeddings; + std::vector embeddings; for (int i = 0; i < args.size(); ++i) { - embeddings.push_back(args[i].cast()); + embeddings.push_back(args[i].cast()); } - NDArray result = ConcatEmbeddings(std::move(embeddings)); + Tensor result = ConcatEmbeddings(std::move(embeddings)); *ret = result; }) - .def("tvmjs.runtime.NDArrayCopyFromBytes", - [](NDArray nd, TVMFFIByteArray* bytes) { nd.CopyFromBytes(bytes->data, bytes->size); }) - 
.def("tvmjs.runtime.NDArrayCopyToBytes", [](NDArray nd) -> ffi::Bytes { + .def("tvmjs.runtime.TensorCopyFromBytes", + [](Tensor nd, TVMFFIByteArray* bytes) { nd.CopyFromBytes(bytes->data, bytes->size); }) + .def("tvmjs.runtime.TensorCopyToBytes", [](Tensor nd) -> ffi::Bytes { size_t size = GetDataSize(*(nd.operator->())); std::string bytes; bytes.resize(size); diff --git a/web/src/artifact_cache.ts b/web/src/artifact_cache.ts index 61ad021c7fef..439f91c88160 100644 --- a/web/src/artifact_cache.ts +++ b/web/src/artifact_cache.ts @@ -17,7 +17,7 @@ * under the License. */ -export interface NDArrayCacheEntry { +export interface TensorCacheEntry { name: string; shape: Array; dtype: string; @@ -26,11 +26,11 @@ export interface NDArrayCacheEntry { nbytes: number; } -export interface NDArrayShardEntry { +export interface TensorShardEntry { dataPath: string; format: "raw-shard"; nbytes: number; - records: Array; + records: Array; } /** @@ -357,13 +357,13 @@ export class ArtifactIndexedDBCache implements ArtifactCacheTemplate { /** * Function to check if NDarray is in Cache or not * - * @param ndarrayCacheUrl The cache url which links to the NDArray + * @param tensorCacheUrl The cache url which links to the Tensor * @param cacheScope The scope identifier of the cache * @param cacheType The type of the cache: "cache" or "indexedDB" - * @returns the result if the cache has NDArray + * @returns the result if the cache has Tensor */ -export async function hasNDArrayInCache( - ndarrayCacheUrl: string, +export async function hasTensorInCache( + tensorCacheUrl: string, cacheScope = "tvmjs", cacheType = "cache" ): Promise { @@ -376,25 +376,25 @@ export async function hasNDArrayInCache( console.error("Unsupported cacheType: " + cacheType + ", using default ArtifactCache."); artifactCache = new ArtifactCache(cacheScope); } - const jsonUrl = new URL("ndarray-cache.json", ndarrayCacheUrl).href; + const jsonUrl = new URL("tensor-cache.json", tensorCacheUrl).href; const hasJsonUrlInCache = await artifactCache.hasAllKeys([jsonUrl]); if (!hasJsonUrlInCache) { return false; } let list = await artifactCache.fetchWithCache(jsonUrl, "json"); - list = list["records"] as Array; - return await artifactCache.hasAllKeys(list.map(key => new URL(key.dataPath, ndarrayCacheUrl).href)); + list = list["records"] as Array; + return await artifactCache.hasAllKeys(list.map(key => new URL(key.dataPath, tensorCacheUrl).href)); } /** - * Given cacheUrl, search up items to delete based on cacheUrl/ndarray-cache.json + * Given cacheUrl, search up items to delete based on cacheUrl/tensor-cache.json * * @param cacheUrl The cacheUrl for the items * @param cacheScope The scope identifier of the cache * @param cacheType The type of the cache: "cache" or "indexedDB" */ -export async function deleteNDArrayCache( +export async function deleteTensorCache( cacheUrl: string, cacheScope = "tvmjs", cacheType = "cache" @@ -408,9 +408,9 @@ export async function deleteNDArrayCache( console.error("Unsupported cacheType: " + cacheType + ", using default ArtifactCache."); artifactCache = new ArtifactCache(cacheScope); } - const jsonUrl = new URL("ndarray-cache.json", cacheUrl).href; + const jsonUrl = new URL("tensor-cache.json", cacheUrl).href; const list = await artifactCache.fetchWithCache(jsonUrl, "json"); - const arrayentry = list["records"] as Array; + const arrayentry = list["records"] as Array; const processShard = async (i: number) => { const dataUrl = new URL(arrayentry[i].dataPath, cacheUrl).href; await artifactCache.deleteInCache(dataUrl); diff 
--git a/web/src/ctypes.ts b/web/src/ctypes.ts index 9836fbfda530..04054df00599 100644 --- a/web/src/ctypes.ts +++ b/web/src/ctypes.ts @@ -102,9 +102,9 @@ export const enum TypeIndex { */ kTVMFFIShape = 70, /*! - * \brief NDArray object, layout = { TVMFFIObject, DLTensor, ... } + * \brief Tensor object, layout = { TVMFFIObject, DLTensor, ... } */ - kTVMFFINDArray = 71, + kTVMFFITensor = 71, /*! \brief Map object. */ kTVMFFIMap = 72, /*! \brief Runtime module object. */ diff --git a/web/src/index.ts b/web/src/index.ts index d4fc9b9187e6..868a26623ae0 100644 --- a/web/src/index.ts +++ b/web/src/index.ts @@ -19,7 +19,7 @@ export { Scalar, DLDevice, DLDataType, - PackedFunc, Module, NDArray, + PackedFunc, Module, Tensor, TVMArray, TVMObject, VirtualMachine, InitProgressCallback, InitProgressReport, Instance, instantiate @@ -28,8 +28,8 @@ export { ArtifactCacheTemplate, ArtifactCache, ArtifactIndexedDBCache, - hasNDArrayInCache, - deleteNDArrayCache + hasTensorInCache, + deleteTensorCache } from "./artifact_cache"; export { Disposable, LibraryProvider } from "./types"; export { RPCServer } from "./rpc_server"; diff --git a/web/src/rpc_server.ts b/web/src/rpc_server.ts index 1e3af6f6438e..b43d5706d7f6 100644 --- a/web/src/rpc_server.ts +++ b/web/src/rpc_server.ts @@ -81,8 +81,8 @@ export class RPCServer { state: RPCServerState = RPCServerState.InitHeader; logger: (msg: string) => void; getImports: () => Record; - private ndarrayCacheUrl: string; - private ndarrayCacheDevice: string; + private tensorCacheUrl: string; + private tensorCacheDevice: string; private initProgressCallback?: runtime.InitProgressCallback; private asyncOnServerLoad?: (inst: runtime.Instance) => Promise; private pendingSend: Promise = Promise.resolve(); @@ -102,8 +102,8 @@ export class RPCServer { key: string, getImports: () => Record, logger: (msg: string) => void = console.log, - ndarrayCacheUrl = "", - ndarrayCacheDevice = "cpu", + tensorCacheUrl = "", + tensorCacheDevice = "cpu", initProgressCallback: runtime.InitProgressCallback | undefined = undefined, asyncOnServerLoad: ((inst: runtime.Instance) => Promise) | undefined = undefined, ) { @@ -112,8 +112,8 @@ export class RPCServer { this.name = "WebSocketRPCServer[" + this.key + "]: "; this.getImports = getImports; this.logger = logger; - this.ndarrayCacheUrl = ndarrayCacheUrl; - this.ndarrayCacheDevice = ndarrayCacheDevice; + this.tensorCacheUrl = tensorCacheUrl; + this.tensorCacheDevice = tensorCacheDevice; this.initProgressCallback = initProgressCallback; this.asyncOnServerLoad = asyncOnServerLoad; this.checkLittleEndian(); @@ -145,7 +145,7 @@ export class RPCServer { this.log("Automatic reconnecting.."); new RPCServer( this.url, this.key, this.getImports, this.logger, - this.ndarrayCacheUrl, this.ndarrayCacheDevice, + this.tensorCacheUrl, this.tensorCacheDevice, this.initProgressCallback, this.asyncOnServerLoad); } else { this.log("Closing the server, final state=" + this.state); @@ -287,12 +287,12 @@ export class RPCServer { this.inst.registerInitProgressCallback(this.initProgressCallback); } - if (this.ndarrayCacheUrl.length != 0) { - if (this.ndarrayCacheDevice === "cpu") { - await this.inst.fetchNDArrayCache(this.ndarrayCacheUrl, this.inst.cpu()); + if (this.tensorCacheUrl.length != 0) { + if (this.tensorCacheDevice === "cpu") { + await this.inst.fetchTensorCache(this.tensorCacheUrl, this.inst.cpu()); } else { - assert(this.ndarrayCacheDevice === "webgpu"); - await this.inst.fetchNDArrayCache(this.ndarrayCacheUrl, this.inst.webgpu()); + 
assert(this.tensorCacheDevice === "webgpu"); + await this.inst.fetchTensorCache(this.tensorCacheUrl, this.inst.webgpu()); } } diff --git a/web/src/runtime.ts b/web/src/runtime.ts index 3720b1873eee..cfb4d6777f86 100644 --- a/web/src/runtime.ts +++ b/web/src/runtime.ts @@ -31,7 +31,7 @@ import { ArtifactCache, ArtifactCacheTemplate, ArtifactIndexedDBCache, - NDArrayShardEntry, + TensorShardEntry, } from "./artifact_cache"; import * as compact from "./compact"; import * as ctypes from "./ctypes"; @@ -156,24 +156,24 @@ class RuntimeContext implements Disposable { functionListGlobalNamesFunctor: PackedFunc; moduleGetFunction: PackedFunc; moduleImport: PackedFunc; - ndarrayEmpty: PackedFunc; - ndarrayCopyFromTo: PackedFunc; - ndarrayCopyFromJSBytes: PackedFunc; - ndarrayCopyToJSBytes: PackedFunc; + tensorEmpty: PackedFunc; + tensorCopyFromTo: PackedFunc; + tensorCopyFromJSBytes: PackedFunc; + tensorCopyToJSBytes: PackedFunc; arrayGetItem: PackedFunc; arrayGetSize: PackedFunc; arrayMake: PackedFunc; arrayConcat: PackedFunc; getSysLib: PackedFunc; - arrayCacheGet: PackedFunc; - arrayCacheUpdate: PackedFunc; - arrayCacheRemove: PackedFunc; - arrayCacheClear: PackedFunc; + tensorCacheGet: PackedFunc; + tensorCacheUpdate: PackedFunc; + tensorCacheRemove: PackedFunc; + tensorCacheClear: PackedFunc; arrayDecodeStorage: PackedFunc; paramModuleFromCache: PackedFunc; paramModuleFromCacheByName: PackedFunc; makeShapeTuple: PackedFunc; - ndarrayCreateView: PackedFunc; + tensorCreateView: PackedFunc; sampleTopPFromLogits: PackedFunc; sampleTopPFromProb: PackedFunc; applyRepetitionPenalty: PackedFunc; @@ -191,24 +191,24 @@ ); this.moduleGetFunction = getGlobalFunc("ffi.ModuleGetFunction"); this.moduleImport = getGlobalFunc("ffi.ModuleImportModule"); - this.ndarrayEmpty = getGlobalFunc("runtime.TVMArrayAllocWithScope"); - this.ndarrayCopyFromTo = getGlobalFunc("runtime.TVMArrayCopyFromTo"); - this.ndarrayCopyFromJSBytes = getGlobalFunc("tvmjs.runtime.NDArrayCopyFromBytes"); - this.ndarrayCopyToJSBytes = getGlobalFunc("tvmjs.runtime.NDArrayCopyToBytes"); + this.tensorEmpty = getGlobalFunc("runtime.TVMTensorAllocWithScope"); + this.tensorCopyFromTo = getGlobalFunc("runtime.TVMTensorCopyFromTo"); + this.tensorCopyFromJSBytes = getGlobalFunc("tvmjs.runtime.TensorCopyFromBytes"); + this.tensorCopyToJSBytes = getGlobalFunc("tvmjs.runtime.TensorCopyToBytes"); this.arrayGetItem = getGlobalFunc("ffi.ArrayGetItem"); this.arrayGetSize = getGlobalFunc("ffi.ArraySize"); this.arrayMake = getGlobalFunc("ffi.Array"); this.arrayConcat = getGlobalFunc("tvmjs.runtime.ArrayConcat"); this.getSysLib = getGlobalFunc("ffi.SystemLib"); - this.arrayCacheGet = getGlobalFunc("vm.builtin.ndarray_cache.get"); - this.arrayCacheRemove = getGlobalFunc("vm.builtin.ndarray_cache.remove"); - this.arrayCacheUpdate = getGlobalFunc("vm.builtin.ndarray_cache.update"); - this.arrayCacheClear = getGlobalFunc("vm.builtin.ndarray_cache.clear"); + this.tensorCacheGet = getGlobalFunc("vm.builtin.tensor_cache.get"); + this.tensorCacheRemove = getGlobalFunc("vm.builtin.tensor_cache.remove"); + this.tensorCacheUpdate = getGlobalFunc("vm.builtin.tensor_cache.update"); + this.tensorCacheClear = getGlobalFunc("vm.builtin.tensor_cache.clear"); this.arrayDecodeStorage = getGlobalFunc("tvmjs.array.decode_storage"); this.paramModuleFromCache = getGlobalFunc("vm.builtin.param_module_from_cache"); this.paramModuleFromCacheByName = getGlobalFunc("vm.builtin.param_module_from_cache_by_name"); this.makeShapeTuple =
getGlobalFunc("ffi.Shape"); - this.ndarrayCreateView = getGlobalFunc("runtime.TVMArrayCreateView"); + this.tensorCreateView = getGlobalFunc("runtime.TVMTensorCreateView"); this.sampleTopPFromLogits = getGlobalFunc("vm.builtin.sample_top_p_from_logits"); this.sampleTopPFromProb = getGlobalFunc("vm.builtin.sample_top_p_from_prob"); this.applyRepetitionPenalty = getGlobalFunc("vm.builtin.apply_repetition_penalty"); @@ -219,20 +219,20 @@ class RuntimeContext implements Disposable { dispose(): void { // call array cache clear to clear all cached items - this.arrayCacheClear.dispose(); + this.tensorCacheClear.dispose(); this.arrayGetItem.dispose(); this.arrayGetSize.dispose(); this.arrayMake.dispose(); this.arrayConcat.dispose(); - this.arrayCacheGet.dispose(); - this.arrayCacheRemove.dispose(); - this.arrayCacheUpdate.dispose(); - this.arrayCacheClear.dispose(); + this.tensorCacheGet.dispose(); + this.tensorCacheRemove.dispose(); + this.tensorCacheUpdate.dispose(); + this.tensorCacheClear.dispose(); this.arrayDecodeStorage.dispose(); this.paramModuleFromCache.dispose(); this.paramModuleFromCacheByName.dispose(); this.makeShapeTuple.dispose(); - this.ndarrayCreateView.dispose(); + this.tensorCreateView.dispose(); this.sampleTopPFromLogits.dispose(); this.applyRepetitionPenalty.dispose(); this.applyPresenceAndFrequencyPenalty.dispose(); @@ -339,7 +339,7 @@ const DeviceStrToEnum: Record = { }; /** - * Represent a runtime context where a NDArray can reside. + * Represent a runtime context where a Tensor can reside. */ export class DLDevice { /** The device type code of the device. */ @@ -399,7 +399,7 @@ const DLDataTypeCodeToStr: Record = { }; /** - * Runtime data type of NDArray. + * Runtime data type of Tensor. */ export class DLDataType { /** The type code */ @@ -497,10 +497,10 @@ class PackedFuncCell extends TVMObject { } /** - * n-dimnesional array. + * Tensor( n-dimnesional array). */ -export class NDArray extends TVMObject { +export class Tensor extends TVMObject { /** Number of dimensions. */ ndim: number; /** Data type of the array. */ @@ -572,12 +572,12 @@ export class NDArray extends TVMObject { * @param dtype The data type of the new array. * @returns The new sliced ndarray. */ - view(shape: Array, dtype?: string): NDArray { + view(shape: Array, dtype?: string): Tensor { const shapeArray = shape.map((value) => new Scalar(value, "int")); if (dtype === undefined) { dtype = this.dtype; } - return this.ctx.ndarrayCreateView( + return this.ctx.tensorCreateView( this, this.ctx.makeShapeTuple(...shapeArray), this.dtype, @@ -591,24 +591,24 @@ export class NDArray extends TVMObject { */ getDataPtr(): Pointer { if (this.handle === 0) { - throw Error("NDArray has already been disposed"); + throw Error("Tensor has already been disposed"); } return this.dataPtr; } /** - * Copy data from another NDArray or javascript array. + * Copy data from another Tensor or javascript array. * The number of elements must match. * * @param data The source data array. 
* @returns this */ copyFrom( - data: NDArray | Array | Float32Array | Float64Array | + data: Tensor | Array | Float32Array | Float64Array | Int32Array | Int8Array | Uint8Array | Uint8ClampedArray ): this { - if (data instanceof NDArray) { - this.ctx.ndarrayCopyFromTo(data, this); + if (data instanceof Tensor) { + this.ctx.tensorCopyFromTo(data, this); return this; } else { const size = this.shape.reduce((a, b) => { @@ -660,23 +660,23 @@ export class NDArray extends TVMObject { if (nbytes != data.length) { throw new Error("Expect the data's length equals nbytes=" + nbytes); } - this.ctx.ndarrayCopyFromJSBytes(this, data); + this.ctx.tensorCopyFromJSBytes(this, data); return this; } /** - * Return a copied Uint8Array of the raw bytes in the NDArray. + * Return a copied Uint8Array of the raw bytes in the Tensor. * @returns The result array. */ toRawBytes(): Uint8Array { if (this.device.deviceType != DeviceStrToEnum.cpu) { throw new Error("Can only sync copy CPU array, use cpu_arr.copyfrom(gpu_arr) then sync instead."); } - return this.ctx.ndarrayCopyToJSBytes(this) as Uint8Array; + return this.ctx.tensorCopyToJSBytes(this) as Uint8Array; } /** - * Return a TypedArray copy of the NDArray, the specific type depends on - * the dtype of the NDArray. + * Return a TypedArray copy of the Tensor, the specific type depends on + * the dtype of the Tensor. * @returns The result array. */ toArray(): Float32Array | Float64Array | Int32Array | Int8Array | Uint8Array { @@ -834,7 +834,7 @@ export type InitProgressCallback = (report: InitProgressReport) => void; /** * TVM runtime instance. * - * All objects(NDArray, Module, PackedFunc) returned by TVM runtim function call + * All objects(Tensor, Module, PackedFunc) returned by TVM runtim function call * and PackedFunc instance are tracked through a scope mechanism that will get * auto-released when we call EndScope. * @@ -1179,7 +1179,7 @@ export class Instance implements Disposable { } //----------------------------------------------- - // Native NDArray Cache Support + // Native Tensor Cache Support //----------------------------------------------- /** * Register a call back for fetch progress. @@ -1213,53 +1213,53 @@ export class Instance implements Disposable { } /** - * Get NDArray from cache. + * Get Tensor from cache. * @param name The name of array. * @returns The result. */ - ndarrayCacheGet(name: string): NDArray | undefined { - return this.ctx.arrayCacheGet(name); + tensorCacheGet(name: string): Tensor | undefined { + return this.ctx.tensorCacheGet(name); } /** - * Get NDArray from cache. + * Get Tensor from cache. * @param name The name of array. * @returns The result. */ - ndarrayCacheRemove(name: string): NDArray | undefined { - return this.ctx.arrayCacheRemove(name); + tensorCacheRemove(name: string): Tensor | undefined { + return this.ctx.tensorCacheRemove(name); } /** - * Update the ndarray cache. + * Update the tensor cache. * @param name The name of the array. * @param arr The content. */ - ndarrayCacheUpdate(name: string, arr: NDArray, override = false) { - this.ctx.arrayCacheUpdate(name, arr, this.scalar(override ? 1 : 0, "int32")); + tensorCacheUpdate(name: string, arr: Tensor, override = false) { + this.ctx.tensorCacheUpdate(name, arr, this.scalar(override ? 1 : 0, "int32")); } /** - * Update the ndarray cache. + * Update the tensor cache. * @param name The name of the array. * @param arr The content. 
*/ - ndarrayCacheClear() { - this.ctx.arrayCacheClear(); + tensorCacheClear() { + this.ctx.tensorCacheClear(); } /** - * Given cacheUrl, search up items to fetch based on cacheUrl/ndarray-cache.json + * Given cacheUrl, search up items to fetch based on cacheUrl/tensor-cache.json * - * @param ndarrayCacheUrl The cache url. + * @param tensorCacheUrl The cache url. * @param device The device to be fetched to. * @param cacheScope The scope identifier of the cache * @param cacheType The type of the cache: "cache" or "indexedDB" * @param signal An optional AbortSignal to abort the fetch * @returns The meta data */ - async fetchNDArrayCache( - ndarrayCacheUrl: string, + async fetchTensorCache( + tensorCacheUrl: string, device: DLDevice, cacheScope = "tvmjs", cacheType = "cache", @@ -1274,28 +1274,28 @@ export class Instance implements Disposable { console.error("Unsupported cacheType: " + cacheType + ", using default ArtifactCache."); artifactCache = new ArtifactCache(cacheScope); } - const jsonUrl = new URL("ndarray-cache.json", ndarrayCacheUrl).href; + const jsonUrl = new URL("tensor-cache.json", tensorCacheUrl).href; const list = await artifactCache.fetchWithCache(jsonUrl, "json"); - await this.fetchNDArrayCacheInternal( - ndarrayCacheUrl, - list["records"] as Array, device, artifactCache, + await this.fetchTensorCacheInternal( + tensorCacheUrl, + list["records"] as Array, device, artifactCache, signal); this.cacheMetadata = { ...this.cacheMetadata, ...(list["metadata"] as Record) }; } /** - * Fetch list of NDArray into the NDArrayCache. + * Fetch list of Tensor into the TensorCache. * - * @param ndarrayCacheUrl The cache url. + * @param tensorCacheUrl The cache url. * @param list The list of array data. * @param device The device to store the data to. 
* @param artifactCache The artifact cache * @param signal An optional AbortSignal to abort the fetch */ - private async fetchNDArrayCacheInternal( - ndarrayCacheUrl: string, - list: Array, + private async fetchTensorCacheInternal( + tensorCacheUrl: string, + list: Array, device: DLDevice, artifactCache: ArtifactCacheTemplate, signal?: AbortSignal, @@ -1310,7 +1310,7 @@ export class Instance implements Disposable { let fetchedShards = 0; let timeElapsed = 0; - const cacheOnly = await artifactCache.hasAllKeys(list.map(key => new URL(key.dataPath, ndarrayCacheUrl).href)); + const cacheOnly = await artifactCache.hasAllKeys(list.map(key => new URL(key.dataPath, tensorCacheUrl).href)); // `loading`: we have finished downloading (or already cacheOnly) and are loading onto WebGPU const reportCallback = (iter: number, loading = false) => { @@ -1351,7 +1351,7 @@ export class Instance implements Disposable { // Download params [start, end) from `list` for (let i = start; i < end; i++) { const shard = list[i]; - const dataUrl = new URL(shard.dataPath, ndarrayCacheUrl).href; + const dataUrl = new URL(shard.dataPath, tensorCacheUrl).href; try { await artifactCache.addToCache(dataUrl, "arraybuffer", signal); } catch (err) { @@ -1377,7 +1377,7 @@ export class Instance implements Disposable { // Then iteratively, load the shard from cache for (let i = 0; i < list.length; ++i) { const shard = list[i]; - const dataUrl = new URL(shard.dataPath, ndarrayCacheUrl).href; + const dataUrl = new URL(shard.dataPath, tensorCacheUrl).href; let buffer; try { buffer = await artifactCache.fetchWithCache(dataUrl, "arraybuffer"); @@ -1399,7 +1399,7 @@ export class Instance implements Disposable { this.ctx.arrayDecodeStorage(cpu_arr, new Uint8Array(recSource), rec.format, rec.dtype); // then async stream into GPU if needed if (device.deviceType === DeviceStrToEnum.cpu) { - this.ndarrayCacheUpdate(rec.name, cpu_arr, false); + this.tensorCacheUpdate(rec.name, cpu_arr, false); cpu_arr.dispose(); } else { // allocate a gpu arr and async copy to it. @@ -1410,7 +1410,7 @@ export class Instance implements Disposable { }); gpu_arr.copyFrom(cpu_arr); await device.sync(); - this.ndarrayCacheUpdate(rec.name, gpu_arr, false); + this.tensorCacheUpdate(rec.name, gpu_arr, false); cpu_arr.dispose(); gpu_arr.dispose(); } @@ -1463,7 +1463,7 @@ export class Instance implements Disposable { } /** - * Create an empty {@link NDArray} with given shape and dtype. + * Create an empty {@link Tensor} with given shape and dtype. * * @param shape The shape of the array. * @param dtype The data type of the array. @@ -1474,13 +1474,13 @@ export class Instance implements Disposable { shape: Array | number, dtype: string | DLDataType = "float32", dev: DLDevice = this.device("cpu", 0) - ): NDArray { + ): Tensor { shape = typeof shape === "number" ? [shape] : shape; - return this.ctx.ndarrayEmpty(this.makeShapeTuple(shape), dtype, dev, null); + return this.ctx.tensorEmpty(this.makeShapeTuple(shape), dtype, dev, null); } /** - * Create am uniform {@link NDArray} with given shape. + * Create am uniform {@link Tensor} with given shape. * * @param shape The shape of the array. * @param low The low value. @@ -1493,7 +1493,7 @@ export class Instance implements Disposable { low: number, high: number, dev: DLDevice - ): NDArray { + ): Tensor { const ret = this.empty(shape, "float32", dev); const size = shape.reduce((a, b) => { return a * b; @@ -1521,7 +1521,7 @@ export class Instance implements Disposable { * @param top_p The top_p * @returns The sampled index. 
*/ - sampleTopPFromLogits(logits: NDArray, temperature: number, top_p: number): number { + sampleTopPFromLogits(logits: Tensor, temperature: number, top_p: number): number { return this.ctx.sampleTopPFromLogits(logits, temperature, top_p, this.rng.randomFloat()); } @@ -1532,7 +1532,7 @@ export class Instance implements Disposable { * @param top_p The top_p * @returns The sampled index. */ - sampleTopPFromProb(prob: NDArray, top_p: number): number { + sampleTopPFromProb(prob: Tensor, top_p: number): number { return this.ctx.sampleTopPFromProb(prob, top_p, this.rng.randomFloat()); } @@ -1542,7 +1542,7 @@ export class Instance implements Disposable { * @param token_ids The appeared token ids. * @param penalty The penalty factor. */ - applyRepetitionPenalty(logits: NDArray, token_ids: NDArray, penalty: number) { + applyRepetitionPenalty(logits: Tensor, token_ids: Tensor, penalty: number) { return this.ctx.applyRepetitionPenalty(logits, token_ids, penalty); } @@ -1556,9 +1556,9 @@ export class Instance implements Disposable { * @param frequency_penalty The penalty factor. */ applyPresenceAndFrequencyPenalty( - logits: NDArray, - token_ids: NDArray, - token_freqs: NDArray, + logits: Tensor, + token_ids: Tensor, + token_freqs: Tensor, presence_penalty: number, frequency_penalty: number ) { @@ -1572,7 +1572,7 @@ export class Instance implements Disposable { * @param logits The input logits before softmax w/ temperature. * @param temperature The temperature factor. */ - applySoftmaxWithTemperature(logits: NDArray, temperature: number) { + applySoftmaxWithTemperature(logits: Tensor, temperature: number) { return this.ctx.applySoftmaxWithTemperature(logits, temperature); } @@ -1587,11 +1587,11 @@ export class Instance implements Disposable { /** * Show image in canvas. * - * @param dataRGBA Image array in height x width uint32 NDArray RGBA format on GPU. + * @param dataRGBA Image array in height x width uint32 Tensor RGBA format on GPU. */ - showImage(dataRGBA: NDArray) { + showImage(dataRGBA: Tensor) { if (dataRGBA.shape.length != 2) { - throw Error("Require a height x width uint32 NDArray in RGBA" + + throw Error("Require a height x width uint32 Tensor in RGBA" + "get shape=" + dataRGBA.shape.toString() + " instead." ); } @@ -1600,7 +1600,7 @@ export class Instance implements Disposable { "get " + DeviceEnumToStr[dataRGBA.device.deviceType] + " instead."); } if (dataRGBA.dtype != "uint32") { - throw Error("Require a height x width uint32 NDArray in RGBA, " + + throw Error("Require a height x width uint32 Tensor in RGBA, " + "get " + dataRGBA.dtype + " instead."); } this.lib.webGPUContext?.drawImageFromBuffer( @@ -1644,11 +1644,11 @@ export class Instance implements Disposable { } /** - * Join a sequence of NDArrays that represent embeddings. - * @param inputs A list of embeddings in NDArrays, each array i has shape (m_i, hidden_size). - * @returns An NDArray of shape (\sum_{i} {m}, hidden_size) + * Join a sequence of Tensors that represent embeddings. + * @param inputs A list of embeddings in Tensors, each array i has shape (m_i, hidden_size). + * @returns An Tensor of shape (\sum_{i} {m}, hidden_size) */ - concatEmbeddings(embeddings: Array): NDArray { + concatEmbeddings(embeddings: Array): Tensor { // 1. Check shape validity const hidden_size = embeddings[0].shape[1]; embeddings.forEach((input) => { @@ -1664,7 +1664,7 @@ export class Instance implements Disposable { "not found, but called concatEmbeddings." 
); } - return this.ctx.concatEmbeddings(...embeddings) as NDArray; + return this.ctx.concatEmbeddings(...embeddings) as Tensor; } /** @@ -2033,9 +2033,9 @@ export class Instance implements Disposable { stack.storeI32(argZeroPaddingOffset, 0); // clear off the extra zero padding after ptr storage stack.storeI32(argValueOffset + SizeOf.I32, 0); - if (val instanceof NDArray) { + if (val instanceof Tensor) { if (!val.isView) { - stack.storeI32(argTypeIndexOffset, TypeIndex.kTVMFFINDArray); + stack.storeI32(argTypeIndexOffset, TypeIndex.kTVMFFITensor); stack.storePtr(argValueOffset, val.getHandle()); } else { stack.storeI32(argTypeIndexOffset, TypeIndex.kTVMFFIDLTensorPtr); @@ -2225,15 +2225,15 @@ export class Instance implements Disposable { case TypeIndex.kTVMFFIOpaquePtr: { return this.memory.loadPointer(valuePtr); } - case TypeIndex.kTVMFFINDArray: { + case TypeIndex.kTVMFFITensor: { return this.ctx.attachToCurrentScope( - new NDArray(this.memory.loadPointer(valuePtr), this.lib, this.ctx, false) + new Tensor(this.memory.loadPointer(valuePtr), this.lib, this.ctx, false) ); } case TypeIndex.kTVMFFIDLTensorPtr: { assert(callbackArg); // no need to attach as we are only looking at view - return new NDArray(this.memory.loadPointer(valuePtr), this.lib, this.ctx, true); + return new Tensor(this.memory.loadPointer(valuePtr), this.lib, this.ctx, true); } case TypeIndex.kTVMFFIFunction: { return this.ctx.attachToCurrentScope( diff --git a/web/tests/node/test_packed_func.js b/web/tests/node/test_packed_func.js index 3c6980cc1f06..83ac61156430 100644 --- a/web/tests/node/test_packed_func.js +++ b/web/tests/node/test_packed_func.js @@ -158,7 +158,7 @@ test("ExceptionPassing", () => { tvm.endScope(); }); -test("NDArrayCbArg", () => { +test("TensorCbArg", () => { tvm.beginScope(); let use_count = tvm.getGlobalFunc("testing.object_use_count"); let record = []; diff --git a/web/tests/node/test_ndarray.js b/web/tests/node/test_tensor.js similarity index 100% rename from web/tests/node/test_ndarray.js rename to web/tests/node/test_tensor.js diff --git a/web/tests/python/relax_rpc_test.py b/web/tests/python/relax_rpc_test.py index e55ad1935122..c21b98564d78 100644 --- a/web/tests/python/relax_rpc_test.py +++ b/web/tests/python/relax_rpc_test.py @@ -74,8 +74,8 @@ def check(remote): vm = relax.VirtualMachine(remote.system_lib(), device=dev) adata = np.random.uniform(size=n).astype(dtype) bdata = np.random.uniform(size=n).astype(dtype) - a = tvm.nd.array(adata, dev) - b = tvm.nd.array(bdata, dev) + a = tvm.runtime.tensor(adata, dev) + b = tvm.runtime.tensor(bdata, dev) vm.set_input("main", a, b) vm.invoke_stateful("main") c = vm.get_outputs("main") diff --git a/web/tests/python/webgpu_rpc_test.py b/web/tests/python/webgpu_rpc_test.py index 8925da00a489..260ccc9b3490 100644 --- a/web/tests/python/webgpu_rpc_test.py +++ b/web/tests/python/webgpu_rpc_test.py @@ -64,8 +64,8 @@ def check(remote, size): # basic function checks. dev = remote.webgpu(0) adata = np.random.uniform(size=size).astype(A.dtype) - a = tvm.nd.array(adata, dev) - b = tvm.nd.array(np.zeros(size, dtype=A.dtype), dev) + a = tvm.runtime.tensor(adata, dev) + b = tvm.runtime.tensor(np.zeros(size, dtype=A.dtype), dev) np.testing.assert_equal(a.numpy(), adata) f1 = remote.system_lib()
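Taken together, the test updates above follow one mechanical pattern: tvm.nd.array and tvm.nd.empty become tvm.runtime.tensor and tvm.runtime.empty, and the NDArray type becomes Tensor across C++, TypeScript, and Python. The sketch below is not part of the patch; it is a minimal illustration of the Python side of that migration, assuming a TVM build that already exposes the renamed tvm.runtime.tensor and tvm.runtime.empty entry points shown in these diffs.

# Illustrative sketch only (not part of this diff): the tvm.nd -> tvm.runtime
# migration exercised by the tests above. Assumes a TVM build that exposes the
# renamed tvm.runtime.tensor / tvm.runtime.empty entry points.
import numpy as np
import tvm

dev = tvm.cpu(0)
a_np = np.random.uniform(size=16).astype("float32")

# Before: a = tvm.nd.array(a_np, dev); b = tvm.nd.empty([16], "float32", dev)
a = tvm.runtime.tensor(a_np, dev)            # device tensor initialized from numpy
b = tvm.runtime.empty([16], "float32", dev)  # uninitialized device tensor

# The rest of the array API used in these tests is unchanged.
np.testing.assert_equal(a.numpy(), a_np)
print(a.shape, a.dtype, b.shape)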