diff --git a/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh b/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh
index 2024f2badc..e2e20707be 100644
--- a/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh
+++ b/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh
@@ -149,6 +149,7 @@ __device__ inline int FN_INT32(bitwise_not)(int a) { return ~a; }
 __device__ inline int FN_INT32(clz)(int a) { return __clz(a); }
 __device__ inline int FN_INT32(popc)(int a) { return __popc(a); }
 __device__ inline int FN_INT32(logical_right_shift)(int a, int b) { return ((unsigned int)a >> b); }
+__device__ inline int FN_INT32(trunc)(int a) { return a; }
 
 __device__ inline int FN_INT32(max)(int a, int b) { return max(a, b); }
 __device__ inline int FN_INT32(min)(int a, int b) { return min(a, b); }
@@ -170,6 +171,7 @@ __device__ inline long long int FN_INT64(bitwise_xor)(long long int a, long long
 __device__ inline long long int FN_INT64(bitwise_not)(long long int a) { return ~a; }
 __device__ inline long long int FN_INT64(clz)(long long int a) { return __clzll(a); }
 __device__ inline long long int FN_INT64(popc)(long long int a) { return __popcll(a); }
+__device__ inline long long int FN_INT64(trunc)(long long int a) { return a; }
 __device__ inline long long int FN_INT64(mod)(long long int a, long long int b) {
   long long int res = a % b;
   if ((res != 0) && ((b ^ res) < 0)) res += b;
diff --git a/cinn/runtime/cuda/cuda_intrinsics.cc b/cinn/runtime/cuda/cuda_intrinsics.cc
index 40bce455ff..2610eea2f4 100644
--- a/cinn/runtime/cuda/cuda_intrinsics.cc
+++ b/cinn/runtime/cuda/cuda_intrinsics.cc
@@ -204,6 +204,7 @@ CINN_REGISTER_HELPER(cuda_intrinsics) {
   REGISTER_EXTERN_FUNC_1_IN_1_INT32(bitwise_not)
   REGISTER_EXTERN_FUNC_1_IN_1_INT32(clz)
   REGISTER_EXTERN_FUNC_1_IN_1_INT32(popc)
+  REGISTER_EXTERN_FUNC_1_IN_1_INT32(trunc)
 
 #undef REGISTER_EXTERN_FUNC_1_IN_1_INT32
 
@@ -213,6 +214,7 @@ CINN_REGISTER_HELPER(cuda_intrinsics) {
   REGISTER_EXTERN_FUNC_1_IN_1_INT64(bitwise_not)
   REGISTER_EXTERN_FUNC_1_IN_1_INT64(clz)
   REGISTER_EXTERN_FUNC_1_IN_1_INT64(popc)
+  REGISTER_EXTERN_FUNC_1_IN_1_INT64(trunc)
 
 #undef REGISTER_EXTERN_FUNC_1_IN_1_INT64
 
diff --git a/python/tests/ops/test_trunc_op.py b/python/tests/ops/test_trunc_op.py
new file mode 100644
index 0000000000..74db6db877
--- /dev/null
+++ b/python/tests/ops/test_trunc_op.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2023 CINN Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from op_test import OpTest, OpTestTool
+from op_test_helper import TestCaseHelper
+import paddle
+import cinn
+from cinn.frontend import *
+from cinn.common import *
+
+
+@OpTestTool.skip_if(not is_compiled_with_cuda(),
+                    "x86 test will be skipped due to timeout.")
+class TestTruncOp(OpTest):
+    def setUp(self):
+        print(f"\nRunning {self.__class__.__name__}: {self.case}")
+        self.prepare_inputs()
+
+    def prepare_inputs(self):
+        self.x_np = self.random(
+            shape=self.case["x_shape"],
+            dtype=self.case["x_dtype"],
+            low=-1000.0,
+            high=1000.0)
+
+    def build_paddle_program(self, target):
+        x = paddle.to_tensor(self.x_np, stop_gradient=True)
+        out = paddle.trunc(x)
+        self.paddle_outputs = [out]
+
+    def build_cinn_program(self, target):
+        builder = NetBuilder("unary_elementwise_test")
+        x = builder.create_input(
+            self.nptype2cinntype(self.case["x_dtype"]), self.case["x_shape"],
+            "x")
+        out = builder.trunc(x)
+        prog = builder.build()
+        res = self.get_cinn_output(prog, target, [x], [self.x_np], [out])
+
+        self.cinn_outputs = [res[0]]
+
+    def test_check_results(self):
+        self.check_outputs_and_grads()
+
+
+class TestTruncOpShape(TestCaseHelper):
+    def init_attrs(self):
+        self.class_name = "TestTruncOpShape"
+        self.cls = TestTruncOp
+        self.inputs = [{
+            "x_shape": [1],
+        }, {
+            "x_shape": [1024],
+        }, {
+            "x_shape": [1, 2048],
+        }, {
+            "x_shape": [1, 1, 1],
+        }, {
+            "x_shape": [32, 64],
+        }, {
+            "x_shape": [16, 8, 4, 2],
+        }, {
+            "x_shape": [16, 8, 4, 2, 1],
+        }]
+        self.dtypes = [{
+            "x_dtype": "float32",
+        }]
+        self.attrs = []
+
+
+class TestTruncOpDtype(TestCaseHelper):
+    def init_attrs(self):
+        self.class_name = "TestTruncOpDtype"
+        self.cls = TestTruncOp
+        self.inputs = [{
+            "x_shape": [32, 64],
+        }]
+        self.dtypes = [
+            {
+                "x_dtype": "int32",
+            },
+            {
+                "x_dtype": "int64",
+            },
+            {
+                "x_dtype": "float32",
+            },
+            {
+                "x_dtype": "float64",
+            },
+        ]
+        self.attrs = []
+
+
+if __name__ == "__main__":
+    TestTruncOpShape().run()
+    TestTruncOpDtype().run()