diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index d8051e1fbb116..7a3b450f71548 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -91,15 +91,16 @@ endif()
 cc_test(copy_same_tensor_test SRCS copy_same_tensor_test.cc DEPS tensor)
 
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
+cc_library(mixed_vector SRCS mixed_vector.cc DEPS device_context)
 
 if(WITH_GPU)
-  nv_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor)
+  nv_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS mixed_vector place memory device_context tensor)
 elseif(WITH_ROCM)
-  hip_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor)
+  hip_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS mixed_vector place memory device_context tensor)
 else()
-  cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
+  cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS mixed_vector place memory device_context tensor)
 endif()
-cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
+cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim mixed_vector place tensor framework_proto version)
 
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
 
diff --git a/paddle/fluid/framework/mixed_vector.cc b/paddle/fluid/framework/mixed_vector.cc
new file mode 100644
index 0000000000000..b15a66c51c4b6
--- /dev/null
+++ b/paddle/fluid/framework/mixed_vector.cc
@@ -0,0 +1,87 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/mixed_vector.h"
+
+#include <algorithm>
+#include <initializer_list>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <utility>
+#include <vector>
+
+#include "glog/logging.h"
+#include "paddle/fluid/framework/details/cow_ptr.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/utils/none.h"
+#include "paddle/utils/optional.h"
+
+namespace paddle {
+namespace framework {
+
+// Copies *gpu_memory_size_ bytes from the device buffer *gpu_ into
+// cpu_->data(). The body is compiled out unless CUDA or HIP is enabled.
+template <typename T>
+static void CopyToCPUHelper(std::vector<T> *cpu_,
+                            paddle::memory::AllocationPtr *gpu_,
+                            size_t *gpu_memory_size_) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
+      platform::DeviceContextPool::Instance().Get((*gpu_)->place()));
+  paddle::memory::Copy(platform::CPUPlace(), cpu_->data(),
+                       OptionalCUDAPlace(*gpu_).get(), (*gpu_)->ptr(),
+                       *gpu_memory_size_, dev_ctx->stream());
+  // NOTE(review): Wait() presumably blocks until the stream copy is done.
+  dev_ctx->Wait();
+#endif
+}
+
+// Allocates cpu_->size() * sizeof(T) bytes on `place` into *gpu_, records the
+// size in *gpu_memory_size_, and copies cpu_ there; Wait() is not called.
+template <typename T>
+static void CopyCPUDataToCUDAHelper(std::vector<T> *cpu_,
+                                    paddle::memory::AllocationPtr *gpu_,
+                                    size_t *gpu_memory_size_,
+                                    const platform::Place &place) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  *gpu_memory_size_ = cpu_->size() * sizeof(T);
+  *gpu_ = memory::Alloc(place, *gpu_memory_size_);
+  auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
+      platform::DeviceContextPool::Instance().Get(place));
+  paddle::memory::Copy(OptionalCUDAPlace(*gpu_).get(), (*gpu_)->ptr(),
+                       platform::CPUPlace(), cpu_->data(), *gpu_memory_size_,
+                       dev_ctx->stream());
+#endif
+}
+
+// Explicitly specializes VectorData's two copy members for element type TYPE.
+#define INSTANTIATE_VECTOR_FOR_TYPE(TYPE)                                  \
+  template <>                                                              \
+  void Vector<TYPE>::VectorData::CopyToCPU() const {                       \
+    CopyToCPUHelper<TYPE>(&cpu_, &gpu_, &gpu_memory_size_);                \
+  }                                                                        \
+  template <>                                                              \
+  void Vector<TYPE>::VectorData::CopyCPUDataToCUDA(                        \
+      const platform::Place &place) const {                                \
+    CopyCPUDataToCUDAHelper<TYPE>(&cpu_, &gpu_, &gpu_memory_size_, place); \
+  }
+
+INSTANTIATE_VECTOR_FOR_TYPE(size_t)
+INSTANTIATE_VECTOR_FOR_TYPE(int)
+INSTANTIATE_VECTOR_FOR_TYPE(int64_t)
+
+};  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h
index cf71cdfc6d651..d1aee6cb2f662 100644
--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -23,17 +23,21 @@ limitations under the License. */
 
 #include "glog/logging.h"
 #include "paddle/fluid/framework/details/cow_ptr.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/utils/none.h"
 #include "paddle/utils/optional.h"
 
 namespace paddle {
 namespace framework {
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+inline paddle::optional<platform::CUDAPlace> OptionalCUDAPlace(
+    const paddle::memory::allocation::AllocationPtr &gpu_) {
+  // A null allocation has no device place to report.
+  if (gpu_ == nullptr) return paddle::none;
+  return paddle::optional<platform::CUDAPlace>(
+      BOOST_GET_CONST(platform::CUDAPlace, gpu_->place()));
+}
+
 // Vector<T> implements the std::vector interface, and can get Data or
 // MutableData from any place. The data will be synced implicitly inside.
 template <typename T>
@@ -198,10 +202,7 @@ class Vector {
     std::mutex &Mutex() const { return mtx_; }
 
     paddle::optional<platform::CUDAPlace> CUDAPlace() const {
-      return gpu_ == nullptr
-                 ? paddle::none
-                 : paddle::optional<platform::CUDAPlace>(
-                       BOOST_GET_CONST(platform::CUDAPlace, gpu_->place()));
+      return OptionalCUDAPlace(gpu_);
     }
 
    private:
@@ -212,17 +213,7 @@ class Vector {
       kDirty = 0x10
     };
 
-    void CopyToCPU() const {
-      // COPY GPU Data To CPU
-      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
-          platform::DeviceContextPool::Instance().Get(gpu_->place()));
-      auto stream = dev_ctx->stream();
-      void *src = gpu_->ptr();
-      void *dst = cpu_.data();
-      paddle::memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src,
-                           gpu_memory_size_, stream);
-      dev_ctx->Wait();
-    }
+    void CopyToCPU() const;  // defined per element type in mixed_vector.cc
 
     void MutableCPU() {
       if (IsInCUDA() && IsDirty()) {
@@ -260,17 +251,7 @@ class Vector {
       }
     }
 
-    void CopyCPUDataToCUDA(const platform::Place &place) const {
-      void *src = cpu_.data();
-      gpu_memory_size_ = cpu_.size() * sizeof(T);
-      gpu_ = memory::Alloc(place, gpu_memory_size_);
-      void *dst = gpu_->ptr();
-      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
-          platform::DeviceContextPool::Instance().Get(place));
-      auto stream = dev_ctx->stream();
-      paddle::memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src,
-                           gpu_memory_size_, stream);
-    }
+    void CopyCPUDataToCUDA(const platform::Place &place) const;  // see .cc
 
     void ImmutableCPU() const {
       if (IsDirty() && !IsInCPU()) {  // If data has been changed in CUDA, or
@@ -291,7 +272,7 @@ class Vector {
     bool IsInCPU() const { return flag_ & kDataInCPU; }
 
     mutable std::vector<T> cpu_;
-    mutable paddle::memory::AllocationPtr gpu_;
+    mutable paddle::memory::allocation::AllocationPtr gpu_;
     mutable size_t gpu_memory_size_{0};
     mutable int flag_;
 
@@ -465,81 +446,5 @@ class Vector {
   mutable details::COWPtr<VectorData> m_;
 };
 
-#else  // PADDLE_WITH_CUDA
-
-template <typename T>
-class CPUVector : public std::vector<T, std::allocator<T>> {
- public:
-  CPUVector() : std::vector<T>() {}
-  CPUVector(size_t count, const T &value = T())  // NOLINT
-      : std::vector<T>(count, value) {}
-  CPUVector(std::initializer_list<T> init) : std::vector<T>(init) {}
-  CPUVector(const std::vector<T> &other) : std::vector<T>(other) {}  // NOLINT
-  CPUVector(const CPUVector<T> &other) : std::vector<T>(other) {}
-  CPUVector(CPUVector<T> &&other) : std::vector<T>(std::move(other)) {}
-  CPUVector(std::vector<T> &&other)  // NOLINT
-      : std::vector<T>(std::move(other)) {}
-  CPUVector &operator=(const CPUVector &other) {
-    this->assign(other.begin(), other.end());
-    return *this;
-  }
-  CPUVector &operator=(const std::vector<T> &other) {
-    this->assign(other.begin(), other.end());
-    return *this;
-  }
-
-  friend std::ostream &operator<<(std::ostream &os, const CPUVector<T> &other) {
-    std::stringstream ss;
-    for (auto v : other) {
-      os << v << " ";
-    }
-    return os;
-  }
-
-  T &operator[](size_t id) { return this->at(id); }
-
-  const T &operator[](size_t id) const { return this->at(id); }
-
-  template <typename D>
-  void Extend(const D &begin, const D &end) {
-    this->reserve(this->size() + size_t(end - begin));
-    this->insert(this->end(), begin, end);
-  }
-
-  const T *CUDAData(platform::Place place) const {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Vector::CUDAData() method is not supported in CPU-only version."));
-  }
-
-  T *CUDAMutableData(platform::Place place) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Vector::CUDAMutableData() method is not supported in CPU-only "
-        "version."));
-  }
-
-  const T *Data(platform::Place place) const {
-    PADDLE_ENFORCE_EQ(
-        platform::is_cpu_place(place), true,
-        platform::errors::Unavailable(
-            "Vector::Data() method is not supported when not in CPUPlace."));
-    return this->data();
-  }
-
-  T *MutableData(platform::Place place) {
-    PADDLE_ENFORCE_EQ(
-        platform::is_cpu_place(place), true,
-        platform::errors::Unavailable("Vector::MutableData() method is not "
-                                      "supported when not in CPUPlace."));
-    return this->data();
-  }
-
-  const void *Handle() const { return static_cast<const void *>(this); }
-};
-
-template <typename T>
-using Vector = CPUVector<T>;
-
-#endif  // PADDLE_WITH_CUDA
-
 };  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu
index 10e7ed0fb6021..011e2729d4adf 100644
--- a/paddle/fluid/framework/mixed_vector_test.cu
+++ b/paddle/fluid/framework/mixed_vector_test.cu
@@ -25,6 +25,7 @@
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
+#include "paddle/fluid/platform/device_context.h"
 
 template <typename T>
 using vec = paddle::framework::Vector<T>;
diff --git a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h
index fd0f42df11875..fa0cab04168d1 100644
--- a/paddle/fluid/operators/filter_by_instag_op.h
+++ b/paddle/fluid/operators/filter_by_instag_op.h
@@ -31,13 +31,9 @@ namespace operators {
 using Tensor = framework::Tensor;
 using SelectedRows = framework::SelectedRows;
 using LoDTensor = framework::LoDTensor;
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+
 template <typename T>
 using Vector = framework::Vector<T>;
-#else
-template <typename T>
-using Vector = framework::CPUVector<T>;
-#endif
 
 template <typename T>
 class FilterByInstagKernel : public framework::OpKernel<T> {
diff --git a/paddle/fluid/operators/shuffle_batch_op.h b/paddle/fluid/operators/shuffle_batch_op.h
index f05af3f249ce0..bd24bbeb9f047 100644
--- a/paddle/fluid/operators/shuffle_batch_op.h
+++ b/paddle/fluid/operators/shuffle_batch_op.h
@@ -33,13 +33,9 @@ namespace paddle {
 namespace operators {
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+
 template <typename T>
 using Vector = framework::Vector<T>;
-#else
-template <typename T>
-using Vector = framework::CPUVector<T>;
-#endif
 
 template <typename T>
 class ShuffleBatchKernel : public framework::OpKernel<T> {
diff --git a/paddle/pten/core/CMakeLists.txt b/paddle/pten/core/CMakeLists.txt
index 63cae6cc70867..28c522d7ea640 100644
--- a/paddle/pten/core/CMakeLists.txt
+++ b/paddle/pten/core/CMakeLists.txt
@@ -8,7 +8,7 @@ endif()
 
 cc_library(kernel_factory SRCS kernel_factory.cc DEPS enforce convert_utils)
 cc_library(kernel_context SRCS kernel_context.cc DEPS enforce pten_context)
-
 cc_library(tensor_base SRCS tensor_base.cc allocator.cc storage.cc DEPS enforce)
-cc_library(tensor_meta SRCS tensor_meta.cc DEPS enforce)
+cc_library(tensor_meta SRCS tensor_meta.cc DEPS enforce mixed_vector)
+
 cc_library(dense_tensor SRCS dense_tensor.cc DEPS tensor_meta tensor_base)
diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h
index fbb39e6f17fed..ccbcf02ffe70a 100644
--- a/paddle/pten/core/dense_tensor.h
+++ b/paddle/pten/core/dense_tensor.h
@@ -83,9 +83,7 @@ class DenseTensor : public TensorBase,
 
   /// \brief Returns the lod of the tensor.
   /// \return The lod of the tensor.
-  const std::vector<std::vector<size_t>>& lod() const noexcept {
-    return meta_.lod;
-  }
+  const LoD& lod() const noexcept { return meta_.lod; }
 
   /// \brief Returns the data type of the tensor.
   /// \return The data type of the tensor.
diff --git a/paddle/pten/core/tensor_meta.cc b/paddle/pten/core/tensor_meta.cc
index d8a3b5c9b2c61..3343527e8cd41 100644
--- a/paddle/pten/core/tensor_meta.cc
+++ b/paddle/pten/core/tensor_meta.cc
@@ -27,7 +27,7 @@ DenseTensorMeta::DenseTensorMeta(DataType dtype,
 DenseTensorMeta::DenseTensorMeta(DataType dtype,
                                  const DDim& dims,
                                  DataLayout layout,
-                                 const std::vector<std::vector<size_t>>& lod)
+                                 const LoD& lod)
     : dims(dims), dtype(dtype), layout(layout), lod(lod) {}
 
 bool DenseTensorMeta::valid() const noexcept {
diff --git a/paddle/pten/core/tensor_meta.h b/paddle/pten/core/tensor_meta.h
index 5341c6bff5449..083ef2c5d39a5 100644
--- a/paddle/pten/core/tensor_meta.h
+++ b/paddle/pten/core/tensor_meta.h
@@ -22,15 +22,16 @@ limitations under the License. */
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/ddim.h"
+
 // Note: mixed_vector include many header now, LoD will be
 // used on CUDA device? Can we use small_vector here?
-// #include "paddle/fluid/framework/mixed_vector.h"
+// @zhanlve: Roll back to the original LoD definition for now.
+#include "paddle/fluid/framework/mixed_vector.h"
 
 namespace pten {
 
 using DDim = paddle::framework::DDim;
-using LoD = std::vector<std::vector<size_t>>;
-
+using LoD = std::vector<paddle::framework::Vector<size_t>>;
 /// \brief The meta data of dense tensor. Take the structure type
 /// and use all default operations.
 ///
@@ -44,7 +45,7 @@ struct DenseTensorMeta {
   DenseTensorMeta(DataType dtype,
                   const DDim& dims,
                   DataLayout layout,
-                  const std::vector<std::vector<size_t>>& lod);
+                  const LoD& lod);
 
   /// \brief Test whether the metadata is valid. Does not throw exceptions.
   /// \return Whether the metadata is valid.
diff --git a/paddle/pten/tests/api/test_tensor_utils.cc b/paddle/pten/tests/api/test_tensor_utils.cc
index b59cee5dc7e84..50116caca58a7 100644
--- a/paddle/pten/tests/api/test_tensor_utils.cc
+++ b/paddle/pten/tests/api/test_tensor_utils.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "gtest/gtest.h"
 
 #include "paddle/pten/api/lib/utils/tensor_utils.h"
+#include "paddle/pten/core/tensor_meta.h"
 
 namespace paddle {
 namespace tests {
@@ -30,7 +31,7 @@ TEST(tensor_utils, dense_tensor_to_lod_tensor) {
   const DDim dims({2, 1});
   const DataType dtype{DataType::FLOAT32};
   const DataLayout layout{DataLayout::NCHW};
-  const std::vector<std::vector<size_t>> lod{{0, 2}};
+  const pten::LoD lod{{0, 2}};
   DenseTensorMeta meta(dtype, dims, layout, lod);
 
   auto alloc =
@@ -46,7 +47,7 @@ TEST(tensor_utils, dense_tensor_to_lod_tensor) {
 
   CHECK(dense_tensor.lod().size() == lod_tensor.lod().size());
   CHECK(dense_tensor.lod()[0] ==
-        static_cast<std::vector<size_t>>((lod_tensor.lod()[0])));
+        static_cast<paddle::framework::Vector<size_t>>((lod_tensor.lod()[0])));
   CHECK(dense_tensor.dtype() == pten::TransToPtenDataType(lod_tensor.type()));
   CHECK(dense_tensor.layout() ==
         pten::TransToPtenDataLayout(lod_tensor.layout()));
diff --git a/paddle/pten/tests/core/test_dense_tensor.cc b/paddle/pten/tests/core/test_dense_tensor.cc
index 4a17046b2f30c..07ad582725d50 100644
--- a/paddle/pten/tests/core/test_dense_tensor.cc
+++ b/paddle/pten/tests/core/test_dense_tensor.cc
@@ -25,7 +25,7 @@ TEST(dense_tensor, meta) {
   const DataType dtype{DataType::INT8};
   const DataLayout layout{DataLayout::NHWC};
   // TODO(Shixiaowei02): need to check the lod is valid.
-  const std::vector<std::vector<size_t>> lod{};
+  const LoD lod{};
 
   DenseTensorMeta meta_0;
   CHECK(!meta_0.valid());
@@ -72,7 +72,7 @@ TEST(dense_tensor, ctor) {
   const DDim dims({1, 2});
   const DataType dtype{DataType::INT8};
   const DataLayout layout{DataLayout::NHWC};
-  const std::vector<std::vector<size_t>> lod{};
+  const LoD lod{};
   DenseTensorMeta meta(dtype, dims, layout, lod);
 
   auto alloc = std::make_shared<FancyAllocator>();
@@ -106,7 +106,7 @@ TEST(dense_tensor, resize) {
   const DDim dims({1, 2});
   const DataType dtype{DataType::INT8};
   const DataLayout layout{DataLayout::NHWC};
-  const std::vector<std::vector<size_t>> lod{};
+  const LoD lod{};
   DenseTensorMeta meta(dtype, dims, layout, lod);
 
   auto alloc = std::make_shared<FancyAllocator>();
@@ -126,7 +126,7 @@ TEST(dense_tensor, shallow_copy) {
   const DDim dims({1, 2});
   const DataType dtype{DataType::INT8};
   const DataLayout layout{DataLayout::NHWC};
-  const std::vector<std::vector<size_t>> lod{};
+  const LoD lod{};
   DenseTensorMeta meta(dtype, dims, layout, lod);
 
   auto alloc = std::make_shared<FancyAllocator>();