4 changes: 2 additions & 2 deletions apps/android_rpc/app/src/main/jni/tvm_runtime.h
@@ -43,16 +43,15 @@
#include "../ffi/src/ffi/extra/module.cc"
#include "../ffi/src/ffi/extra/testing.cc"
#include "../ffi/src/ffi/function.cc"
#include "../ffi/src/ffi/ndarray.cc"
#include "../ffi/src/ffi/object.cc"
#include "../ffi/src/ffi/tensor.cc"
#include "../ffi/src/ffi/traceback.cc"
#include "../src/runtime/cpu_device_api.cc"
#include "../src/runtime/device_api.cc"
#include "../src/runtime/file_utils.cc"
#include "../src/runtime/logging.cc"
#include "../src/runtime/memory/memory_manager.cc"
#include "../src/runtime/minrpc/minrpc_logger.cc"
#include "../src/runtime/ndarray.cc"
#include "../src/runtime/profiling.cc"
#include "../src/runtime/registry.cc"
#include "../src/runtime/rpc/rpc_channel.cc"
@@ -63,6 +62,7 @@
#include "../src/runtime/rpc/rpc_server_env.cc"
#include "../src/runtime/rpc/rpc_session.cc"
#include "../src/runtime/rpc/rpc_socket_impl.cc"
#include "../src/runtime/tensor.cc"
#include "../src/runtime/thread_pool.cc"
#include "../src/runtime/threading_backend.cc"
#include "../src/runtime/workspace_pool.cc"
4 changes: 2 additions & 2 deletions apps/android_rpc/tests/android_rpc_test.py
@@ -72,8 +72,8 @@ def test_rpc_module():
dev = remote.cl(0)
remote.upload(path_dso_cl)
f1 = remote.load_module("dev_lib_cl.so")
-a = tvm.nd.array(a_np, dev)
-b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev)
+a = tvm.runtime.tensor(a_np, dev)
+b = tvm.runtime.tensor(np.zeros(1024, dtype=A.dtype), dev)
time_f = f1.time_evaluator(f1.entry_name, dev, number=10)
cost = time_f(a, b).mean
print("%g secs/op\n" % cost)
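For context, the updates above are a mechanical rename: `tvm.nd.array` becomes `tvm.runtime.tensor` with identical arguments and device placement. A minimal before/after sketch (the shape and CPU device here are illustrative, not taken from this PR):

```python
import numpy as np
import tvm

dev = tvm.cpu(0)
a_np = np.random.uniform(size=1024).astype("float32")

# Before this PR:
# a = tvm.nd.array(a_np, dev)

# After this PR, the same call goes through the runtime namespace:
a = tvm.runtime.tensor(a_np, dev)
assert a.numpy().shape == (1024,)
```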
2 changes: 1 addition & 1 deletion apps/hexagon_launcher/launcher_core.h
@@ -25,7 +25,7 @@
#include <tvm/ffi/function.h>
#include <tvm/runtime/data_type.h>
#include <tvm/runtime/module.h>
-#include <tvm/runtime/ndarray.h>
+#include <tvm/runtime/tensor.h>

#include <string>
#include <vector>
12 changes: 6 additions & 6 deletions apps/hexagon_launcher/launcher_hexagon.cc
@@ -137,7 +137,7 @@ AEEResult __QAIC_HEADER(launcher_rpc_set_input)(remote_handle64 handle, int inpu
};
DLManagedTensor managed{tensor, /*manager_ctx*/ nullptr, /*deleter*/ nullptr};

-auto input = tvm::runtime::NDArray::FromDLPack(&managed);
+auto input = tvm::runtime::Tensor::FromDLPack(&managed);

tvm::ffi::Function set_input = get_module_func(TheModel->model_executor, "set_input");
set_input(input_idx, input);
@@ -172,17 +172,17 @@ AEEResult __QAIC_HEADER(launcher_rpc_get_output)(remote_handle64 handle, int out
}

tvm::ffi::Function get_output = get_module_func(TheModel->model_executor, "get_output");
-tvm::runtime::NDArray output = get_output(output_idx);
+tvm::runtime::Tensor output = get_output(output_idx);

std::vector<int64_t> shape_vec{output->shape, output->shape + output->ndim};

-auto* container = new tvm::runtime::NDArray::Container(
-static_cast<void*>(output_value), shape_vec, output->dtype, Model::external());
+auto* container = new tvm::runtime::Tensor::Container(static_cast<void*>(output_value), shape_vec,
+output->dtype, Model::external());
container->SetDeleter([](tvm::Object* container) {
-delete static_cast<tvm::runtime::NDArray::Container*>(container);
+delete static_cast<tvm::runtime::Tensor::Container*>(container);
});

-tvm::runtime::NDArray host_output(tvm::runtime::GetObjectPtr<tvm::runtime::Object>(container));
+tvm::runtime::Tensor host_output(tvm::runtime::GetObjectPtr<tvm::runtime::Object>(container));

if (meta_size != 0) {
auto* meta = reinterpret_cast<tensor_meta*>(output_meta);
4 changes: 2 additions & 2 deletions apps/ios_rpc/tests/ios_rpc_test.py
@@ -72,8 +72,8 @@ def test_rpc_module(host, port, key, mode):
dev = remote.metal(0)
f1 = remote.load_module("dev_lib.dylib")
a_np = np.random.uniform(size=1024).astype(A.dtype)
-a = tvm.nd.array(a_np, dev)
-b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev)
+a = tvm.runtime.tensor(a_np, dev)
+b = tvm.runtime.tensor(np.zeros(1024, dtype=A.dtype), dev)
time_f = f1.time_evaluator(f1.entry_name, dev, number=10)
cost = time_f(a, b).mean
print("Metal: %g secs/op" % cost)
6 changes: 3 additions & 3 deletions docs/arch/index.rst
@@ -133,7 +133,7 @@ The main goal of TVM's runtime is to provide a minimal API for loading and execu
import tvm
# Example runtime execution program in python, with type annotated
mod: tvm.runtime.Module = tvm.runtime.load_module("compiled_artifact.so")
-arr: tvm.runtime.NDArray = tvm.nd.array([1, 2, 3], device=tvm.cuda(0))
+arr: tvm.runtime.Tensor = tvm.runtime.tensor([1, 2, 3], device=tvm.cuda(0))
fun: tvm.runtime.PackedFunc = mod["addone"]
fun(arr)
print(arr.numpy())
@@ -142,7 +142,7 @@ The main goal of TVM's runtime is to provide a minimal API for loading and execu
:py:class:`tvm.runtime.Module` encapsulates the result of compilation. A runtime.Module contains a GetFunction method to obtain PackedFuncs by name.

:py:class:`tvm.runtime.PackedFunc` is a type-erased function interface for both the generated functions. A runtime.PackedFunc can take arguments and return values with the
-following types: POD types(int, float), string, runtime.PackedFunc, runtime.Module, runtime.NDArray, and other sub-classes of runtime.Object.
+following types: POD types(int, float), string, runtime.PackedFunc, runtime.Module, runtime.Tensor, and other sub-classes of runtime.Object.

:py:class:`tvm.runtime.Module` and :py:class:`tvm.runtime.PackedFunc` are powerful mechanisms to modularize the runtime. For example, to get the above `addone` function on CUDA, we can use LLVM to generate the host-side code to compute the launching parameters(e.g. size of the thread groups) and then call into another PackedFunc from a CUDAModule that is backed by the CUDA driver API. The same mechanism can be used for OpenCL kernels.

@@ -155,7 +155,7 @@ The above example only deals with a simple `addone` function. The code snippet b
factory: tvm.runtime.Module = tvm.runtime.load_module("resnet18.so")
# Create a stateful graph execution module for resnet18 on cuda(0)
gmod: tvm.runtime.Module = factory["resnet18"](tvm.cuda(0))
-data: tvm.runtime.NDArray = get_input_data()
+data: tvm.runtime.Tensor = get_input_data()
# set input
gmod["set_input"](0, data)
# execute the model
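The hunk above is truncated after the `run` step. As a hedged sketch of how the stateful pattern typically completes (assuming the executor module exposes `get_output` alongside the `set_input` and `run` functions shown in the docs; the input shape is illustrative):

```python
import numpy as np
import tvm

factory: tvm.runtime.Module = tvm.runtime.load_module("resnet18.so")
# Create a stateful graph execution module for resnet18 on cuda(0)
gmod: tvm.runtime.Module = factory["resnet18"](tvm.cuda(0))
data: tvm.runtime.Tensor = tvm.runtime.tensor(
    np.random.rand(1, 3, 224, 224).astype("float32"), tvm.cuda(0)
)
gmod["set_input"](0, data)  # set input
gmod["run"]()               # execute the model
out: tvm.runtime.Tensor = gmod["get_output"](0)  # fetch the result as a Tensor
print(out.numpy().shape)
```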
6 changes: 3 additions & 3 deletions docs/deep_dive/tensor_ir/tutorials/tir_creation.py
@@ -204,9 +204,9 @@ def mm_relu(a: T.handle, b: T.handle, c: T.handle):


def evaluate_dynamic_shape(lib: tvm.runtime.Module, m: int, n: int, k: int):
-A = tvm.nd.array(np.random.uniform(size=(m, k)).astype("float32"))
-B = tvm.nd.array(np.random.uniform(size=(k, n)).astype("float32"))
-C = tvm.nd.array(np.zeros((m, n), dtype="float32"))
+A = tvm.runtime.tensor(np.random.uniform(size=(m, k)).astype("float32"))
+B = tvm.runtime.tensor(np.random.uniform(size=(k, n)).astype("float32"))
+C = tvm.runtime.tensor(np.zeros((m, n), dtype="float32"))
lib(A, B, C)
return C.numpy()

6 changes: 3 additions & 3 deletions docs/deep_dive/tensor_ir/tutorials/tir_transformation.py
@@ -72,9 +72,9 @@ def main(
b_np = np.random.uniform(size=(128, 128)).astype("float32")
c_np = a_np @ b_np

-a_nd = tvm.nd.array(a_np)
-b_nd = tvm.nd.array(b_np)
-c_nd = tvm.nd.array(np.zeros((128, 128), dtype="float32"))
+a_nd = tvm.runtime.tensor(a_np)
+b_nd = tvm.runtime.tensor(b_np)
+c_nd = tvm.runtime.tensor(np.zeros((128, 128), dtype="float32"))


def evaluate(mod: tvm.IRModule):
6 changes: 3 additions & 3 deletions docs/get_started/tutorials/ir_module.py
@@ -237,7 +237,7 @@ def main(
vm = relax.VirtualMachine(exec, dev)

raw_data = np.random.rand(1, 784).astype("float32")
-data = tvm.nd.array(raw_data, dev)
+data = tvm.runtime.tensor(raw_data, dev)
cpu_out = vm["main"](data, *params_from_torch["main"]).numpy()
print(cpu_out)

@@ -267,8 +267,8 @@ def main(
dev = tvm.device("cuda", 0)
vm = relax.VirtualMachine(exec, dev)
# Need to allocate data and params on GPU device
-data = tvm.nd.array(raw_data, dev)
-gpu_params = [tvm.nd.array(p, dev) for p in params_from_torch["main"]]
+data = tvm.runtime.tensor(raw_data, dev)
+gpu_params = [tvm.runtime.tensor(p, dev) for p in params_from_torch["main"]]
gpu_out = vm["main"](data, *gpu_params).numpy()
print(gpu_out)

16 changes: 8 additions & 8 deletions docs/get_started/tutorials/quick_start.py
@@ -141,9 +141,9 @@ def forward(self, x):
device = tvm.cpu()
vm = relax.VirtualMachine(ex, device)
data = np.random.rand(1, 784).astype("float32")
-tvm_data = tvm.nd.array(data, device=device)
+tvm_data = tvm.runtime.tensor(data, device=device)
params = [np.random.rand(*param.shape).astype("float32") for _, param in param_spec]
-params = [tvm.nd.array(param, device=device) for param in params]
+params = [tvm.runtime.tensor(param, device=device) for param in params]
print(vm["forward"](tvm_data, *params).numpy())

################################################################################
@@ -158,14 +158,14 @@ def forward(self, x):
# prefill_logits = vm["prefill"](inputs, weight, kv_cache)
# decoded_logits = vm["decode"](inputs, weight, kv_cache)
#
-# - TVM runtime comes with native data structures, such as NDArray, can also have zero
+# - TVM runtime comes with native data structures, such as Tensor, can also have zero
# copy exchange with existing ecosystem (DLPack exchange with PyTorch)
#
# .. code-block:: Python
#
-# # Convert PyTorch tensor to TVM NDArray
-# x_tvm = tvm.nd.from_dlpack(x_torch.to_dlpack())
-# # Convert TVM NDArray to PyTorch tensor
+# # Convert PyTorch tensor to TVM Tensor
+# x_tvm = tvm.runtime.from_dlpack(x_torch.to_dlpack())
+# # Convert TVM Tensor to PyTorch tensor
# x_torch = torch.from_dlpack(x_tvm.to_dlpack())
#
# - TVM runtime works in non-python environments, so it works on settings such as mobile
@@ -175,14 +175,14 @@
# // C++ snippet
# runtime::Module vm = ex.GetFunction("load_executable")();
# vm.GetFunction("init")(...);
-# NDArray out = vm.GetFunction("prefill")(data, weight, kv_cache);
+# Tensor out = vm.GetFunction("prefill")(data, weight, kv_cache);
#
# .. code-block:: Java
#
# // Java snippet
# Module vm = ex.getFunction("load_executable").invoke();
# vm.getFunction("init").pushArg(...).invoke;
-# NDArray out = vm.getFunction("prefill").pushArg(data).pushArg(weight).pushArg(kv_cache).invoke();
+# Tensor out = vm.getFunction("prefill").pushArg(data).pushArg(weight).pushArg(kv_cache).invoke();
#

################################################################################
8 changes: 4 additions & 4 deletions docs/how_to/tutorials/cross_compilation_and_rpc.py
@@ -182,8 +182,8 @@

# create arrays on the remote device
dev = remote.cpu()
-a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev)
-b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev)
+a = tvm.runtime.tensor(np.random.uniform(size=1024).astype(A.dtype), dev)
+b = tvm.runtime.tensor(np.zeros(1024, dtype=A.dtype), dev)
# the function will run on the remote device
func(a, b)
np.testing.assert_equal(b.numpy(), a.numpy() + 1)
@@ -249,8 +249,8 @@ def run_opencl():

# run
dev = remote.cl()
-a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev)
-b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev)
+a = tvm.runtime.tensor(np.random.uniform(size=1024).astype(A.dtype), dev)
+b = tvm.runtime.tensor(np.zeros(1024, dtype=A.dtype), dev)
func(a, b)
np.testing.assert_equal(b.numpy(), a.numpy() + 1)
print("OpenCL test passed!")
4 changes: 2 additions & 2 deletions docs/how_to/tutorials/customize_opt.py
@@ -209,8 +209,8 @@ def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassContext) -> IR
dev = tvm.device("cuda", 0)
vm = relax.VirtualMachine(ex, dev)
# Need to allocate data and params on GPU device
-data = tvm.nd.array(np.random.rand(*input_shape).astype("float32"), dev)
-gpu_params = [tvm.nd.array(np.random.rand(*p.shape).astype(p.dtype), dev) for _, p in params]
+data = tvm.runtime.tensor(np.random.rand(*input_shape).astype("float32"), dev)
+gpu_params = [tvm.runtime.tensor(np.random.rand(*p.shape).astype(p.dtype), dev) for _, p in params]
gpu_out = vm["forward"](data, *gpu_params).numpy()
print(gpu_out)

4 changes: 2 additions & 2 deletions docs/how_to/tutorials/e2e_opt_model.py
@@ -117,8 +117,8 @@
dev = tvm.device("cuda", 0)
vm = relax.VirtualMachine(ex, dev)
# Need to allocate data and params on GPU device
-gpu_data = tvm.nd.array(np.random.rand(1, 3, 224, 224).astype("float32"), dev)
-gpu_params = [tvm.nd.array(p, dev) for p in params["main"]]
+gpu_data = tvm.runtime.tensor(np.random.rand(1, 3, 224, 224).astype("float32"), dev)
+gpu_params = [tvm.runtime.tensor(p, dev) for p in params["main"]]
gpu_out = vm["main"](gpu_data, *gpu_params).numpy()

print(gpu_out.shape)
6 changes: 3 additions & 3 deletions docs/how_to/tutorials/optimize_llm.py
@@ -489,7 +489,7 @@ def _pipeline(mod: tvm.ir.IRModule, _ctx: tvm.transform.PassContext) -> tvm.ir.I

# Convert params into ndarray
params = [
-tvm.nd.array(param_dict[k].astype("float16"), device=dev) for k in named_params.keys()
+tvm.runtime.tensor(param_dict[k].astype("float16"), device=dev) for k in named_params.keys()
]


@@ -523,7 +523,7 @@ def _pipeline(mod: tvm.ir.IRModule, _ctx: tvm.transform.PassContext) -> tvm.ir.I
input_len = len(prompt)

# Load prompt tokens into TVM ndarray on the target device
-tokens = tvm.nd.array(np.array(prompt).astype("int32"), device=dev)
+tokens = tvm.runtime.tensor(np.array(prompt).astype("int32"), device=dev)

######################################################################
# Create the KVCache
@@ -609,7 +609,7 @@ def sample_token(logits):
print("The generated token:")

while last_token != tokenizer.eos_token_id:
-tokens = tvm.nd.array(np.array([last_token]).astype("int32"), device=dev)
+tokens = tvm.runtime.tensor(np.array([last_token]).astype("int32"), device=dev)
hidden_states = embed(tokens, params)
begin_forward_func(kv_cache, ShapeTuple([seq_id]), ShapeTuple([1]))
logits, kv_cache = vm["decode"](hidden_states, kv_cache, params)
1 change: 0 additions & 1 deletion docs/reference/api/python/index.rst
@@ -34,7 +34,6 @@ Python API
:caption: tvm.runtime

runtime/runtime
-runtime/ndarray
runtime/vm
runtime/disco
runtime/profiling
21 changes: 0 additions & 21 deletions docs/reference/api/python/runtime/ndarray.rst

This file was deleted.

1 change: 0 additions & 1 deletion docs/reference/api/python/runtime/runtime.rst
@@ -19,4 +19,3 @@ tvm.runtime
-----------
.. automodule:: tvm.runtime
:members:
-:exclude-members: NDArray
4 changes: 2 additions & 2 deletions ffi/CMakeLists.txt
@@ -57,7 +57,7 @@ set(tvm_ffi_objs_sources
"${CMAKE_CURRENT_SOURCE_DIR}/src/ffi/object.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/src/ffi/error.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/src/ffi/function.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/src/ffi/ndarray.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/src/ffi/tensor.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/src/ffi/dtype.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/src/ffi/container.cc"
)
@@ -189,7 +189,7 @@ if (TVM_FFI_BUILD_PYTHON_MODULE)
${CMAKE_CURRENT_SOURCE_DIR}/python/tvm_ffi/cython/dtype.pxi
${CMAKE_CURRENT_SOURCE_DIR}/python/tvm_ffi/cython/error.pxi
${CMAKE_CURRENT_SOURCE_DIR}/python/tvm_ffi/cython/function.pxi
-${CMAKE_CURRENT_SOURCE_DIR}/python/tvm_ffi/cython/ndarray.pxi
+${CMAKE_CURRENT_SOURCE_DIR}/python/tvm_ffi/cython/tensor.pxi
${CMAKE_CURRENT_SOURCE_DIR}/python/tvm_ffi/cython/object.pxi
${CMAKE_CURRENT_SOURCE_DIR}/python/tvm_ffi/cython/string.pxi
)
1 change: 1 addition & 0 deletions ffi/docs/.gitignore
@@ -1 +1,2 @@
_build
+**/generated/*.rst
6 changes: 5 additions & 1 deletion ffi/docs/Makefile
@@ -25,7 +25,7 @@ BUILDDIR = _build
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

-.PHONY: help Makefile livehtml
+.PHONY: help Makefile livehtml clean

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
@@ -34,3 +34,7 @@ help:

livehtml:
@sphinx-autobuild "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

+clean:
+	rm -rf $(BUILDDIR)
+	rm -rf reference/python/generated
12 changes: 6 additions & 6 deletions ffi/docs/concepts/abi_overview.md
@@ -219,17 +219,17 @@ typedef struct TVMFFIObject {
- `deleter` ensures that objects allocated from one language/runtime can be safely deleted in another.

The object format provides a unified way to manage object life-cycle and dynamic type casting
-for heap-allocated objects, including Shape, NDArray,
+for heap-allocated objects, including Shape, Tensor,
Function, Array, Map and other custom objects.


-### DLPack Compatible NDArray
+### DLPack Compatible Tensor

-We provide first-class support for DLPack raw unmanaged pointer support as well as a managed NDArray object that
-directly adopts the DLPack DLTensor layout. The overall layout of the NDArray object is as follows:
+We provide first-class support for DLPack raw unmanaged pointer support as well as a managed Tensor object that
+directly adopts the DLPack DLTensor layout. The overall layout of the Tensor object is as follows:

```c++
-struct NDArrayObj: public ffi::Object, public DLTensor {
+struct TensorObj: public ffi::Object, public DLTensor {
};
```

@@ -241,7 +241,7 @@ DLTensor* ReadDLTensorPtr(const TVMFFIAny *value) {
if (value->type_index == kTVMFFIDLTensorPtr) {
return static_cast<DLTensor*>(value->v_ptr);
}
-assert(value->type_index == kTVMFFINDArray);
+assert(value->type_index == kTVMFFITensor);
return reinterpret_cast<DLTensor*>(
reinterpret_cast<char*>(value->v_obj) + sizeof(TVMFFIObject));
}
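Because `TensorObj` embeds the `DLTensor` layout directly, DLPack exchange needs no copy in either direction. A small Python sketch of that round trip, assuming the `to_dlpack`/`from_dlpack` calls behave as in the quick-start snippet earlier in this PR:

```python
import numpy as np
import tvm

# Create a Tensor, export it as a DLPack capsule, and re-import it.
x = tvm.runtime.tensor(np.arange(4, dtype="float32"))
capsule = x.to_dlpack()               # zero-copy export
y = tvm.runtime.from_dlpack(capsule)  # zero-copy import
print(y.numpy())  # [0. 1. 2. 3.]
```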
3 changes: 3 additions & 0 deletions ffi/docs/conf.py
@@ -20,6 +20,9 @@

import tomli


+os.environ["TVM_FFI_BUILD_DOCS"] = "1"

# -- General configuration ------------------------------------------------

# Load version from pyproject.toml
Expand Down