diff --git a/.github/workflows/make_wheel_Windows_x86.sh b/.github/workflows/make_wheel_Windows_x86.sh
index be4fbbf47..54bb00200 100644
--- a/.github/workflows/make_wheel_Windows_x86.sh
+++ b/.github/workflows/make_wheel_Windows_x86.sh
@@ -8,10 +8,11 @@ if [ -z $HOROVOD_VERSION ] ; then
 fi
 
 python -m pip install --default-timeout=1000 wheel setuptools tensorflow==$TF_VERSION horovod==$HOROVOD_VERSION
-bash ./tools/testing/build_and_run_tests.sh
-
+python -m pip install tensorflow-io
 python -m pip install --upgrade protobuf==3.20.0
 
+bash ./tools/testing/build_and_run_tests.sh
+
 python configure.py
 
 bazel.exe build --no-cache \
diff --git a/.github/workflows/make_wheel_macOS_arm64.sh b/.github/workflows/make_wheel_macOS_arm64.sh
index f36c6617b..93c6577c9 100644
--- a/.github/workflows/make_wheel_macOS_arm64.sh
+++ b/.github/workflows/make_wheel_macOS_arm64.sh
@@ -11,7 +11,8 @@ export TF_NEED_CUDA=0
 python --version
 
-RUN python -m pip install --upgrade protobuf==3.20.0
+python -m pip install tensorflow-io
+python -m pip install --upgrade protobuf==3.20.0
 
 python configure.py
 
 bazel build \
diff --git a/.github/workflows/make_wheel_macOS_x86.sh b/.github/workflows/make_wheel_macOS_x86.sh
index a417c61f2..66ddea513 100644
--- a/.github/workflows/make_wheel_macOS_x86.sh
+++ b/.github/workflows/make_wheel_macOS_x86.sh
@@ -9,6 +9,7 @@ python --version
 brew install open-mpi
 
 python -m pip install --default-timeout=1000 delocate==0.9.1 wheel setuptools tensorflow==$TF_VERSION
+python -m pip install tensorflow-io
 python -m pip install --upgrade protobuf==3.20.0
 
 bash tools/docker/install/install_horovod.sh $HOROVOD_VERSION --only-cpu
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 2fb1786ab..fcda6ab70 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -34,6 +34,7 @@ jobs:
           pip install --default-timeout=1000 -r tools/install_deps/pytest.txt -r tools/install_deps/tensorflow-cpu.txt -r requirements.txt
           sudo apt install -y redis > /dev/null 2> /dev/null
           bash tools/install_deps/install_bazelisk.sh ./
+          python -m pip install tensorflow-io
           python -m pip install --upgrade protobuf==3.20.0
           python configure.py
           bazel test --local_ram_resources=4096 -c opt -k --test_timeout 300,450,1200,3600 --test_output=errors //tensorflow_recommenders_addons/...
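The CI scripts above start installing tensorflow-io because the reworked save/load kernels below resolve paths through TensorFlow's generic FileSystem registry (`Env::GetFileSystemForFile`) rather than a bundled HDFS client. A minimal sketch of what that implies on the Python side, assuming only the standard `tensorflow` and `tensorflow_io` packages; the bucket path is purely illustrative and not part of this patch:

```python
# Sketch: tensorflow-io only needs to be imported; the import's side effect
# registers extra filesystem schemes (s3://, hdfs://, ...) with TensorFlow's
# Env, which is what the kernels' GetFileSystemForFile() call relies on.
import tensorflow as tf
import tensorflow_io as tfio  # noqa: F401 -- imported for its registration side effect only

# After the import, gfile/FileSystem calls can resolve the extra schemes.
# The bucket below is a hypothetical example.
print(tf.io.gfile.exists("s3://my-bucket/dynamic_embedding/table-keys"))
```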
diff --git a/WORKSPACE b/WORKSPACE index dc59b6b7c..7df12b577 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -54,16 +54,6 @@ http_archive( url = "https://github.com/sewenew/redis-plus-plus/archive/refs/tags/1.2.3.zip", ) -http_archive( - name = "hadoop", - build_file = "//third_party:hadoop.BUILD", - sha256 = "fa9d0587d06c36838e778081bcf8271a9c63060af00b3bf456423c1777a62043", - strip_prefix = "hadoop-rel-release-3.3.0", - urls = [ - "https://github.com/apache/hadoop/archive/refs/tags/rel/release-3.3.0.tar.gz", - ], -) - tf_configure( name = "local_config_tf", ) diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/BUILD b/tensorflow_recommenders_addons/dynamic_embedding/core/BUILD index 9a53530b6..076e2791b 100644 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/BUILD +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/BUILD @@ -31,7 +31,6 @@ custom_op_library( ] + glob(["kernels/lookup_impl/lookup_table_op_gpu*"])), deps = [ "//tensorflow_recommenders_addons/dynamic_embedding/core/lib/cuckoo:cuckoohash", - "//tensorflow_recommenders_addons/dynamic_embedding/core/lib/hadoop_file_system", ], ) diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/cuckoo_hashtable_op.cc b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/cuckoo_hashtable_op.cc index a31b15ebd..de4cf52eb 100644 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/cuckoo_hashtable_op.cc +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/cuckoo_hashtable_op.cc @@ -22,6 +22,7 @@ limitations under the License. #include #include "tensorflow/core/kernels/lookup_table_op.h" +#include "tensorflow/core/platform/path.h" #include "tensorflow/core/util/work_sharder.h" #include "tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_cpu.h" #include "tensorflow_recommenders_addons/dynamic_embedding/core/utils/types.h" @@ -290,20 +291,213 @@ class CuckooHashTableOfTensors final : public LookupInterface { } Status ExportValues(OpKernelContext* ctx) override { - int64 value_dim = value_shape_.dim_size(0); - return table_->export_values(ctx, value_dim); + Tensor* keys; + Tensor* values; + const auto table_size = table_->size(); + const auto output_key_size = static_cast(table_size); + TF_RETURN_IF_ERROR( + ctx->allocate_output("keys", TensorShape({output_key_size}), &keys)); + TF_RETURN_IF_ERROR(ctx->allocate_output( + "values", + TensorShape({output_key_size, static_cast(runtime_dim_)}), + &values)); + table_->dump((K*)keys->tensor_data().data(), + (V*)values->tensor_data().data(), 0, table_size); + + return Status::OK(); } - Status SaveToHDFS(OpKernelContext* ctx, const string& filepath, - const size_t buffer_size) { - int64 value_dim = value_shape_.dim_size(0); - return table_->save_to_hdfs(ctx, value_dim, filepath, buffer_size); + Status SaveToFileSystemImpl(FileSystem* fs, const size_t value_dim, + const string& filepath, const size_t buffer_size, + bool append_to_file) { + std::unique_ptr key_writer; + std::unique_ptr value_writer; + const string key_filepath(filepath + "-keys"); + const string value_filepath(filepath + "-values"); + string key_tmpfilepath(filepath + "-keys.tmp"); + string value_tmpfilepath(filepath + "-values.tmp"); + bool has_atomic_move = false; + auto has_atomic_move_ret = fs->HasAtomicMove(filepath, &has_atomic_move); + bool need_tmp_file = + (has_atomic_move == false) || (has_atomic_move_ret != Status::OK()); + if (!need_tmp_file) { + key_tmpfilepath = key_filepath; + value_tmpfilepath = 
value_filepath; + } + TF_RETURN_IF_ERROR( + fs->RecursivelyCreateDir(std::string(fs->Dirname(filepath)))); + if (append_to_file) { + TF_RETURN_IF_ERROR(fs->NewAppendableFile(key_tmpfilepath, &key_writer)); + TF_RETURN_IF_ERROR( + fs->NewAppendableFile(value_tmpfilepath, &value_writer)); + } else { + TF_RETURN_IF_ERROR(fs->NewWritableFile(key_tmpfilepath, &key_writer)); + TF_RETURN_IF_ERROR(fs->NewWritableFile(value_tmpfilepath, &value_writer)); + } + + size_t key_offset = 0; + size_t value_offset = 0; + const size_t value_len = sizeof(V) * value_dim; + const size_t key_buffer_byte_size = buffer_size * sizeof(K); + const size_t value_buffer_byte_size = buffer_size * value_len; + std::vector key_buffer_vector(key_buffer_byte_size); + char* key_buffer = key_buffer_vector.data(); + std::vector value_buffer_vector(value_buffer_byte_size); + char* value_buffer = value_buffer_vector.data(); + + const size_t table_size = table_->size(); + size_t search_offset = 0; + size_t total_saved = 0; + while (search_offset < table_size) { + auto dump_counter = table_->dump((K*)key_buffer, (V*)value_buffer, + search_offset, buffer_size); + search_offset += dump_counter; + key_offset += dump_counter * sizeof(K); + value_offset += dump_counter * value_len; + TF_RETURN_IF_ERROR( + key_writer->Append(StringPiece(key_buffer, key_offset))); + key_buffer = key_buffer_vector.data(); + key_offset = 0; + TF_RETURN_IF_ERROR( + value_writer->Append(StringPiece(value_buffer, value_offset))); + value_buffer = value_buffer_vector.data(); + value_offset = 0; + total_saved += dump_counter; + } + + if (key_offset > 0 && value_offset > 0) { + TF_RETURN_IF_ERROR( + key_writer->Append(StringPiece(key_buffer, key_offset))); + TF_RETURN_IF_ERROR( + value_writer->Append(StringPiece(value_buffer, value_offset))); + } + + TF_RETURN_IF_ERROR(key_writer->Flush()); + TF_RETURN_IF_ERROR(value_writer->Flush()); + TF_RETURN_IF_ERROR(key_writer->Sync()); + TF_RETURN_IF_ERROR(value_writer->Sync()); + + LOG(INFO) << "Finish saving " << total_saved << " keys and values to " + << key_filepath << " and " << value_filepath << " in total."; + + if (need_tmp_file) { + TF_RETURN_IF_ERROR(fs->FileExists(key_tmpfilepath)); + TF_RETURN_IF_ERROR(fs->RenameFile(key_tmpfilepath, key_filepath)); + TF_RETURN_IF_ERROR(fs->FileExists(value_tmpfilepath)); + TF_RETURN_IF_ERROR(fs->RenameFile(value_tmpfilepath, value_filepath)); + } + + return Status::OK(); } - Status LoadFromHDFS(OpKernelContext* ctx, const string& filepath, - const size_t buffer_size) { - int64 value_dim = value_shape_.dim_size(0); - return table_->load_from_hdfs(ctx, value_dim, filepath, buffer_size); + Status SaveToFileSystem(OpKernelContext* ctx, const string& dirpath, + const string& file_name, const size_t buffer_size, + bool append_to_file) { + string filepath = io::JoinPath(dirpath, file_name); + FileSystem* fs; + const auto env = ctx->env(); + TF_RETURN_WITH_CONTEXT_IF_ERROR( + env->GetFileSystemForFile(filepath, &fs), + "Please make sure you have already imported tensorflow_io before using " + "TFRA file system operation."); + const size_t value_dim = static_cast(value_shape_.dim_size(0)); + return SaveToFileSystemImpl(fs, value_dim, filepath, buffer_size, + append_to_file); + } + + Status LoadFromFileSystemImpl(FileSystem* fs, const size_t value_dim, + const string& filepath, + const size_t buffer_size) { + const string key_filepath = filepath + "-keys"; + TF_RETURN_IF_ERROR(fs->FileExists(key_filepath)); + std::unique_ptr key_file; + 
TF_RETURN_IF_ERROR(fs->NewRandomAccessFile(key_filepath, &key_file)); + std::unique_ptr key_input_stream( + new io::RandomAccessInputStream(key_file.get())); + const size_t key_buffer_byte_size = buffer_size * sizeof(K); + io::BufferedInputStream key_reader(key_input_stream.get(), + key_buffer_byte_size); + + const string value_filepath = filepath + "-values"; + TF_RETURN_IF_ERROR(fs->FileExists(value_filepath)); + std::unique_ptr value_file; + TF_RETURN_IF_ERROR(fs->NewRandomAccessFile(value_filepath, &value_file)); + std::unique_ptr value_input_stream( + new io::RandomAccessInputStream(value_file.get())); + const size_t value_len = sizeof(V) * value_dim; + const size_t value_buffer_size = buffer_size * value_len; + io::BufferedInputStream value_reader(value_input_stream.get(), + value_buffer_size); + + uint64 key_file_size = 0; + TF_RETURN_IF_ERROR(fs->GetFileSize(key_filepath, &key_file_size)); + const size_t key_size = key_file_size / sizeof(K); + + uint64 value_file_size = 0; + TF_RETURN_IF_ERROR(fs->GetFileSize(value_filepath, &value_file_size)); + const size_t value_size = value_file_size / value_len; + + if (key_size != value_size) { + return errors::Unavailable( + "the keys number in file " + key_filepath + + " is not equal to the value vectors number in file " + + value_filepath + "."); + } + + tstring key_buffer; + key_buffer.resize(sizeof(K)); + tstring value_buffer; + value_buffer.resize(value_len); + uint64 key_file_offset = 0; + + while (key_file_offset < key_file_size) { + TF_RETURN_IF_ERROR(key_reader.ReadNBytes(sizeof(K), &key_buffer)); + TF_RETURN_IF_ERROR(value_reader.ReadNBytes(value_len, &value_buffer)); + table_->insert_or_assign_one(*((K*)key_buffer.data()), + (V*)value_buffer.data(), runtime_dim_); + key_file_offset += sizeof(K); + } + + LOG(INFO) << "Finish loading " << key_size << " keys and values from " + << key_filepath << " and " << value_filepath << " in total."; + + return Status::OK(); + } + + Status LoadFromFileSystem(OpKernelContext* ctx, const string& dirpath, + const string& file_name, const size_t buffer_size, + bool load_entire_dir) { + FileSystem* fs; + const auto env = ctx->env(); + TF_RETURN_WITH_CONTEXT_IF_ERROR(env->GetFileSystemForFile(dirpath, &fs), + "Please make sure you have already " + "imported tensorflow_io before using " + "TFRA file system operation."); + const size_t value_dim = static_cast(value_shape_.dim_size(0)); + if (load_entire_dir) { + int separator_pos = file_name.rfind("_mht_"); + string file_pattern = + io::JoinPath(dirpath, file_name.substr(0, separator_pos)) + "*"; + std::vector all_filepath; + TF_RETURN_IF_ERROR(fs->GetMatchingPaths(file_pattern, &all_filepath)); + // delete -keys/-values postfix + for (auto it = all_filepath.begin(); it != all_filepath.end(); ++it) { + int kv_separator_pos = it->rfind("-"); + *it = it->substr(0, kv_separator_pos); + } + // remove duplicate elements + sort(all_filepath.begin(), all_filepath.end()); + all_filepath.erase(unique(all_filepath.begin(), all_filepath.end()), + all_filepath.end()); + for (auto& fp : all_filepath) { + TF_RETURN_IF_ERROR( + LoadFromFileSystemImpl(fs, value_dim, fp, buffer_size)); + } + } else { + string filepath = io::JoinPath(dirpath, file_name); + return LoadFromFileSystemImpl(fs, value_dim, filepath, buffer_size); + } + return Status::OK(); } DataType key_dtype() const override { return DataTypeToEnum::v(); } @@ -609,14 +803,16 @@ class HashTableExportOp : public HashTableOpKernel { } }; -// Op that export all keys and values to HDFS. 
+// Op that save all keys and values to FileSystem. template -class HashTableSaveToHDFSOp : public HashTableOpKernel { +class HashTableSaveToFileSystemOp : public HashTableOpKernel { public: - explicit HashTableSaveToHDFSOp(OpKernelConstruction* ctx) + explicit HashTableSaveToFileSystemOp(OpKernelConstruction* ctx) : HashTableOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("dirpath_env", &dirpath_env_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("append_to_file", &append_to_file_)); int64 signed_buffer_size = 0; - ctx->GetAttr("buffer_size", &signed_buffer_size); + OP_REQUIRES_OK(ctx, ctx->GetAttr("buffer_size", &signed_buffer_size)); buffer_size_ = static_cast(signed_buffer_size); } @@ -625,17 +821,35 @@ class HashTableSaveToHDFSOp : public HashTableOpKernel { OP_REQUIRES_OK(ctx, GetTable(ctx, &table)); core::ScopedUnref unref_me(table); - const Tensor& ftensor = ctx->input(1); - OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(ftensor.shape()), - errors::InvalidArgument("filepath must be scalar.")); - string filepath = string(ftensor.scalar()().data()); + string dirpath; + TF_CHECK_OK(ReadStringFromEnvVar(dirpath_env_, "NotFound", &dirpath)); + if (dirpath != "NotFound") { + LOG(INFO) << "Read TFRA key/value file directory path from the " + "environment variable " + << dirpath_env_ << " successfully. Saving directory path is " + << dirpath; + } else { + const Tensor& dir_tensor = ctx->input(1); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(dir_tensor.shape()), + errors::InvalidArgument("directory path must be scalar.")); + dirpath = string(dir_tensor.scalar()().data()); + } + + const Tensor& fname_tensor = ctx->input(2); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(fname_tensor.shape()), + errors::InvalidArgument("file name must be scalar.")); + string file_name = string(fname_tensor.scalar()().data()); lookup::CuckooHashTableOfTensors* table_cuckoo = (lookup::CuckooHashTableOfTensors*)table; - OP_REQUIRES_OK(ctx, table_cuckoo->SaveToHDFS(ctx, filepath, buffer_size_)); + OP_REQUIRES_OK( + ctx, table_cuckoo->SaveToFileSystem(ctx, dirpath, file_name, + buffer_size_, append_to_file_)); } private: + string dirpath_env_; + bool append_to_file_; size_t buffer_size_; }; @@ -669,14 +883,16 @@ class HashTableImportOp : public HashTableOpKernel { } }; -// Clear the table and insert data from HDFS. +// Clear the table and insert data from FileSystem. 
template -class HashTableLoadFromHDFSOp : public HashTableOpKernel { +class HashTableLoadFromFileSystemOp : public HashTableOpKernel { public: - explicit HashTableLoadFromHDFSOp(OpKernelConstruction* ctx) + explicit HashTableLoadFromFileSystemOp(OpKernelConstruction* ctx) : HashTableOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("dirpath_env", &dirpath_env_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("load_entire_dir", &load_entire_dir_)); int64 signed_buffer_size = 0; - ctx->GetAttr("buffer_size", &signed_buffer_size); + OP_REQUIRES_OK(ctx, ctx->GetAttr("buffer_size", &signed_buffer_size)); buffer_size_ = static_cast(signed_buffer_size); } @@ -685,18 +901,35 @@ class HashTableLoadFromHDFSOp : public HashTableOpKernel { OP_REQUIRES_OK(ctx, GetTable(ctx, &table)); core::ScopedUnref unref_me(table); - const Tensor& ftensor = ctx->input(1); - OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(ftensor.shape()), - errors::InvalidArgument("filepath must be scalar.")); - string filepath = string(ftensor.scalar()().data()); + string dirpath; + TF_CHECK_OK(ReadStringFromEnvVar(dirpath_env_, "NotFound", &dirpath)); + if (dirpath != "NotFound") { + LOG(INFO) << "Read TFRA key/value file directory path from the " + "environment variable " + << dirpath_env_ << " successfully. Saving directory path is " + << dirpath; + } else { + const Tensor& dir_tensor = ctx->input(1); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(dir_tensor.shape()), + errors::InvalidArgument("directory path must be scalar.")); + dirpath = string(dir_tensor.scalar()().data()); + } + + const Tensor& fname_tensor = ctx->input(2); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(fname_tensor.shape()), + errors::InvalidArgument("file name must be scalar.")); + string file_name = string(fname_tensor.scalar()().data()); lookup::CuckooHashTableOfTensors* table_cuckoo = (lookup::CuckooHashTableOfTensors*)table; - OP_REQUIRES_OK(ctx, - table_cuckoo->LoadFromHDFS(ctx, filepath, buffer_size_)); + OP_REQUIRES_OK( + ctx, table_cuckoo->LoadFromFileSystem(ctx, dirpath, file_name, + buffer_size_, load_entire_dir_)); } private: + string dirpath_env_; + bool load_entire_dir_; size_t buffer_size_; }; @@ -743,16 +976,18 @@ REGISTER_KERNEL_BUILDER( .TypeConstraint("Tin") \ .TypeConstraint("Tout"), \ HashTableFindWithExistsOp); \ - REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(CuckooHashTableSaveToHDFS)) \ - .Device(DEVICE_CPU) \ - .TypeConstraint("key_dtype") \ - .TypeConstraint("value_dtype"), \ - HashTableSaveToHDFSOp); \ - REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(CuckooHashTableLoadFromHDFS)) \ - .Device(DEVICE_CPU) \ - .TypeConstraint("key_dtype") \ - .TypeConstraint("value_dtype"), \ - HashTableLoadFromHDFSOp); + REGISTER_KERNEL_BUILDER( \ + Name(PREFIX_OP_NAME(CuckooHashTableSaveToFileSystem)) \ + .Device(DEVICE_CPU) \ + .TypeConstraint("key_dtype") \ + .TypeConstraint("value_dtype"), \ + HashTableSaveToFileSystemOp); \ + REGISTER_KERNEL_BUILDER( \ + Name(PREFIX_OP_NAME(CuckooHashTableLoadFromFileSystem)) \ + .Device(DEVICE_CPU) \ + .TypeConstraint("key_dtype") \ + .TypeConstraint("value_dtype"), \ + HashTableLoadFromFileSystemOp); REGISTER_KERNEL(int32, double); REGISTER_KERNEL(int32, float); diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/cuckoo_hashtable_op_gpu.cu.cc b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/cuckoo_hashtable_op_gpu.cu.cc index 7fd2ada8f..7216a0e0f 100644 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/cuckoo_hashtable_op_gpu.cu.cc +++ 
b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/cuckoo_hashtable_op_gpu.cu.cc @@ -24,7 +24,6 @@ limitations under the License. #include #include -#include #include #include @@ -33,6 +32,7 @@ limitations under the License. #include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/variant.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/path.h" #include "tensorflow/core/util/env_var.h" #include "tensorflow/stream_executor/stream.h" @@ -176,66 +176,9 @@ class CuckooHashTableOfTensorsGpu final : public LookupInterface { return Status::OK(); } - void RehashIfNeeded(cudaStream_t stream, size_t num_keys = 0) { - K* d_keys; - gpu::ValueArrayBase* d_values; - size_t* d_dump_counter; - size_t new_max_size = max_size_; - const float max_load_factor = 0.75; - const float min_load_factor = 0.25; - - const bool should_check = - last_size_hint_ == 0 || num_keys == 0 || - ((last_size_hint_ + num_keys) > max_load_factor * max_size_); - if (!should_check) { - last_size_hint_ += num_keys; - return; - } - - size_t total_size = table_->get_size(stream); - last_size_hint_ = total_size; - CUDA_CHECK(cudaStreamSynchronize(stream)); - if (total_size >= max_load_factor * max_size_) { - new_max_size = max_size_ * 2; - } - if (total_size < min_load_factor * max_size_ && max_size_ > min_size_) { - new_max_size = max_size_ / 2; - } - - // The table should be able to hold num_keys at least - if (new_max_size < total_size + num_keys) { - new_max_size = (total_size + num_keys) * 2; - } - - if (new_max_size != max_size_) { // rehash manually. - size_t capacity = table_->get_capacity(); - size_t h_dump_counter = 0; - CUDA_CHECK(cudaMallocManaged((void**)&d_dump_counter, sizeof(size_t))); - CUDA_CHECK(cudaMallocManaged((void**)&d_keys, sizeof(K) * capacity)); - CUDA_CHECK(cudaMallocManaged((void**)&d_values, - sizeof(V) * runtime_dim_ * capacity)); - table_->dump(d_keys, (gpu::ValueArrayBase*)d_values, 0, capacity, - d_dump_counter, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); - - delete table_; - table_ = NULL; - CreateTable(new_max_size, &table_); - CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaMemcpy((size_t*)&h_dump_counter, (size_t*)d_dump_counter, - sizeof(size_t), cudaMemcpyDefault)); - table_->upsert((const K*)d_keys, (const gpu::ValueArrayBase*)d_values, - h_dump_counter, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaFree(d_keys)); - CUDA_CHECK(cudaFree(d_values)); - CUDA_CHECK(cudaFree(d_dump_counter)); - max_size_ = new_max_size; - LOG(INFO) << "HashTable on GPU changes to new status: [size=" - << total_size << ", max_size=" << max_size_ - << ", load factor=" << std::setprecision(2) - << (float)total_size / (float)max_size_ << "]."; - } + void RehashIfNeeded(cudaStream_t stream, const size_t num_keys = 0) { + table_->rehash_if_needed(&last_size_hint_, &max_size_, min_size_, stream, + num_keys); } Status Insert(OpKernelContext* ctx, const Tensor& keys, @@ -411,6 +354,290 @@ class CuckooHashTableOfTensorsGpu final : public LookupInterface { return Status::OK(); } +// For propagating errors when calling a function. +#define TF_RETURN_IF_ERROR_WITH_CLEANUP_CUDA(CLEANUP_CODE, ...) 
\ + do { \ + ::tensorflow::Status _status = (__VA_ARGS__); \ + if (TF_PREDICT_FALSE(!_status.ok())) { \ + {CLEANUP_CODE}; \ + return _status; \ + } \ + } while (0) + + Status SaveToFileSystemImpl(FileSystem* fs, const size_t value_dim, + const string& filepath, const size_t buffer_size, + bool append_to_file, cudaStream_t stream) { + std::unique_ptr key_writer; + std::unique_ptr value_writer; + const string key_filepath(filepath + "-keys"); + const string value_filepath(filepath + "-values"); + string key_tmpfilepath(filepath + "-keys.tmp"); + string value_tmpfilepath(filepath + "-values.tmp"); + bool has_atomic_move = false; + auto has_atomic_move_ret = fs->HasAtomicMove(filepath, &has_atomic_move); + bool need_tmp_file = + (has_atomic_move == false) || (has_atomic_move_ret != Status::OK()); + if (!need_tmp_file) { + key_tmpfilepath = key_filepath; + value_tmpfilepath = value_filepath; + } + TF_RETURN_IF_ERROR( + fs->RecursivelyCreateDir(std::string(fs->Dirname(filepath)))); + if (append_to_file) { + TF_RETURN_IF_ERROR(fs->NewAppendableFile(key_tmpfilepath, &key_writer)); + TF_RETURN_IF_ERROR( + fs->NewAppendableFile(value_tmpfilepath, &value_writer)); + } else { + TF_RETURN_IF_ERROR(fs->NewWritableFile(key_tmpfilepath, &key_writer)); + TF_RETURN_IF_ERROR(fs->NewWritableFile(value_tmpfilepath, &value_writer)); + } + + size_t key_offset = 0; + size_t value_offset = 0; + const size_t value_len = sizeof(V) * value_dim; + const size_t key_buffer_byte_size = buffer_size * sizeof(K); + const size_t value_buffer_byte_size = buffer_size * value_len; + std::vector key_buffer_vector(key_buffer_byte_size); + char* key_buffer = key_buffer_vector.data(); + std::vector value_buffer_vector(value_buffer_byte_size); + char* value_buffer = value_buffer_vector.data(); + + K* d_keys = nullptr; + V* d_values = nullptr; + size_t* d_dump_counter; + size_t dump_counter; + size_t search_offset = 0; + size_t table_capacity = table_->get_capacity(); + + CUDA_CHECK(cudaMallocAsync(&d_keys, key_buffer_byte_size, stream)); + CUDA_CHECK(cudaMallocAsync(&d_values, value_buffer_byte_size, stream)); + CUDA_CHECK(cudaMallocAsync(&d_dump_counter, sizeof(size_t), stream)); +#define CLEANUP_CUDA_CODE \ + CUDA_CHECK(cudaFreeAsync(d_keys, stream)); \ + CUDA_CHECK(cudaFreeAsync(d_values, stream)); \ + CUDA_CHECK(cudaFreeAsync(d_dump_counter, stream)); + + size_t search_length = 0; + size_t total_saved = 0; + while (search_offset < table_capacity) { + if (search_offset + buffer_size >= table_capacity) { + search_length = table_capacity - search_offset; + } else { + search_length = buffer_size; + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + table_->dump(d_keys, (gpu::ValueType*)d_values, search_offset, + search_length, d_dump_counter, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + CUDA_CHECK(cudaMemcpyAsync(&dump_counter, d_dump_counter, sizeof(size_t), + cudaMemcpyDeviceToHost, stream)); + + if (dump_counter > 0) { + key_offset = dump_counter * sizeof(K); + value_offset = dump_counter * value_len; + CUDA_CHECK(cudaMemcpyAsync(key_buffer, d_keys, key_offset, + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(value_buffer, d_values, value_offset, + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + TF_RETURN_IF_ERROR_WITH_CLEANUP_CUDA( + CLEANUP_CUDA_CODE, + key_writer->Append(StringPiece(key_buffer, key_offset))); + TF_RETURN_IF_ERROR_WITH_CLEANUP_CUDA( + CLEANUP_CUDA_CODE, + value_writer->Append(StringPiece(value_buffer, value_offset))); + } + search_offset += 
search_length; + total_saved += dump_counter; + } + + CLEANUP_CUDA_CODE +#undef CLEANUP_CUDA_CODE + + TF_RETURN_IF_ERROR(key_writer->Flush()); + TF_RETURN_IF_ERROR(value_writer->Flush()); + // "munmap_chunk(): invalid pointer" when call TF IO S3 File System Sync() + // function, unknown reasons. + // TODO: Fix it. + // TF_RETURN_IF_ERROR(key_writer->Sync()); + // TF_RETURN_IF_ERROR(value_writer->Sync()); + + LOG(INFO) << "Finish saving " << total_saved << " keys and values to " + << key_filepath << " and " << value_filepath << " in total."; + + if (need_tmp_file) { + TF_RETURN_IF_ERROR(fs->FileExists(key_tmpfilepath)); + TF_RETURN_IF_ERROR(fs->RenameFile(key_tmpfilepath, key_filepath)); + TF_RETURN_IF_ERROR(fs->FileExists(value_tmpfilepath)); + TF_RETURN_IF_ERROR(fs->RenameFile(value_tmpfilepath, value_filepath)); + } + + return Status::OK(); + } + + Status SaveToFileSystem(OpKernelContext* ctx, const string& dirpath, + const string& file_name, const size_t buffer_size, + bool append_to_file) { + string filepath = io::JoinPath(dirpath, file_name); + FileSystem* fs; + const auto env = ctx->env(); + TF_RETURN_WITH_CONTEXT_IF_ERROR( + env->GetFileSystemForFile(filepath, &fs), + "Please make sure you have already imported tensorflow_io before using " + "TFRA file system operation."); + const size_t value_dim = static_cast(value_shape_.dim_size(0)); + cudaStream_t _stream; + CUDA_CHECK(cudaStreamCreate(&_stream)); + auto statu = SaveToFileSystemImpl(fs, value_dim, filepath, buffer_size, + append_to_file, _stream); + CUDA_CHECK(cudaStreamDestroy(_stream)); + return statu; + } + + Status LoadFromFileSystemImpl(FileSystem* fs, const size_t value_dim, + const string& filepath, + const size_t buffer_size, cudaStream_t stream) { + const string key_filepath = filepath + "-keys"; + TF_RETURN_IF_ERROR(fs->FileExists(key_filepath)); + std::unique_ptr key_file; + TF_RETURN_IF_ERROR(fs->NewRandomAccessFile(key_filepath, &key_file)); + std::unique_ptr key_input_stream( + new io::RandomAccessInputStream(key_file.get())); + const size_t key_buffer_byte_size = buffer_size * sizeof(K); + io::BufferedInputStream key_reader(key_input_stream.get(), + key_buffer_byte_size * 2); + + const string value_filepath = filepath + "-values"; + TF_RETURN_IF_ERROR(fs->FileExists(value_filepath)); + std::unique_ptr value_file; + TF_RETURN_IF_ERROR(fs->NewRandomAccessFile(value_filepath, &value_file)); + std::unique_ptr value_input_stream( + new io::RandomAccessInputStream(value_file.get())); + const size_t value_len = sizeof(V) * value_dim; + const size_t value_buffer_byte_size = buffer_size * value_len; + io::BufferedInputStream value_reader(value_input_stream.get(), + value_buffer_byte_size * 2); + + uint64 key_file_size = 0; + TF_RETURN_IF_ERROR(fs->GetFileSize(key_filepath, &key_file_size)); + const size_t key_size = key_file_size / sizeof(K); + + uint64 value_file_size = 0; + TF_RETURN_IF_ERROR(fs->GetFileSize(value_filepath, &value_file_size)); + const size_t value_size = value_file_size / value_len; + + if (key_size != value_size) { + return errors::Unavailable( + "the keys number in file " + key_filepath + + " is not equal to the value vectors number in file " + + value_filepath + "."); + } + + // Rehash table + RehashIfNeeded(stream, key_size); + + K* d_keys = nullptr; + V* d_values = nullptr; + CUDA_CHECK(cudaMallocAsync(&d_keys, key_buffer_byte_size, stream)); + CUDA_CHECK(cudaMallocAsync(&d_values, value_buffer_byte_size, stream)); +#define CLEANUP_CUDA_CODE \ + CUDA_CHECK(cudaFreeAsync(d_keys, stream)); \ + 
CUDA_CHECK(cudaFreeAsync(d_values, stream)); + + tstring key_buffer; + key_buffer.resize(key_buffer_byte_size); + tstring value_buffer; + value_buffer.resize(value_buffer_byte_size); + + size_t key_file_offset = 0; + int64_t remainder = key_file_size - key_file_offset; + size_t nkeys = 0; + size_t key_read_byte = 0; + size_t value_read_byte = 0; + while (remainder > 0) { + if (remainder > static_cast(key_buffer_byte_size)) { + key_read_byte = key_buffer_byte_size; + nkeys = buffer_size; + value_read_byte = value_buffer_byte_size; + } else { + key_read_byte = remainder; + nkeys = key_read_byte / sizeof(K); + value_read_byte = nkeys * value_len; + } + TF_RETURN_IF_ERROR_WITH_CLEANUP_CUDA( + CLEANUP_CUDA_CODE, key_reader.ReadNBytes(key_read_byte, &key_buffer)); + TF_RETURN_IF_ERROR_WITH_CLEANUP_CUDA( + CLEANUP_CUDA_CODE, + value_reader.ReadNBytes(value_read_byte, &value_buffer)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaMemcpyAsync(d_keys, key_buffer.data(), key_read_byte, + cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_values, value_buffer.data(), value_read_byte, + cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + table_->upsert(d_keys, (gpu::ValueType*)d_values, nkeys, stream); + key_file_offset += key_read_byte; + remainder = key_file_size - key_file_offset; + } + + CLEANUP_CUDA_CODE +#undef CLEANUP_CUDA_CODE + LOG(INFO) << "Finish loading " << key_size << " keys and values from " + << key_filepath << " and " << value_filepath << " in total."; + + return Status::OK(); + } + + Status LoadFromFileSystem(OpKernelContext* ctx, const string& dirpath, + const string& file_name, const size_t buffer_size, + bool load_entire_dir) { + FileSystem* fs; + const auto env = ctx->env(); + TF_RETURN_WITH_CONTEXT_IF_ERROR(env->GetFileSystemForFile(dirpath, &fs), + "Please make sure you have already " + "imported tensorflow_io before using " + "TFRA file system operation."); + const size_t value_dim = static_cast(value_shape_.dim_size(0)); + auto statu = Status::OK(); + if (load_entire_dir) { + int separator_pos = file_name.rfind("_mht_"); + string file_pattern = + io::JoinPath(dirpath, file_name.substr(0, separator_pos)) + "*"; + std::vector all_filepath; + TF_RETURN_IF_ERROR(fs->GetMatchingPaths(file_pattern, &all_filepath)); + // delete -keys/-values postfix + for (auto it = all_filepath.begin(); it != all_filepath.end(); ++it) { + int kv_separator_pos = it->rfind("-"); + *it = it->substr(0, kv_separator_pos); + } + // remove duplicate elements + sort(all_filepath.begin(), all_filepath.end()); + all_filepath.erase(unique(all_filepath.begin(), all_filepath.end()), + all_filepath.end()); + for (auto fp : all_filepath) { + cudaStream_t _stream; + CUDA_CHECK(cudaStreamCreate(&_stream)); + statu = LoadFromFileSystemImpl(fs, value_dim, fp, buffer_size, _stream); + CUDA_CHECK(cudaStreamDestroy(_stream)); + if (statu != Status::OK()) { + return statu; + } + } + } else { + string filepath = io::JoinPath(dirpath, file_name); + cudaStream_t _stream; + CUDA_CHECK(cudaStreamCreate(&_stream)); + statu = + LoadFromFileSystemImpl(fs, value_dim, filepath, buffer_size, _stream); + CUDA_CHECK(cudaStreamDestroy(_stream)); + } + return statu; + } + +#undef TF_RETURN_IF_ERROR_WITH_CLEANUP_CUDA + DataType key_dtype() const override { return DataTypeToEnum::v(); } DataType value_dtype() const override { return DataTypeToEnum::v(); } TensorShape key_shape() const final { return TensorShape(); } @@ -650,6 +877,56 @@ REGISTER_KERNEL_BUILDER( 
Name(PREFIX_OP_NAME(CuckooHashTableExport)).Device(DEVICE_GPU), HashTableExportGpuOp); +// Op that export all keys and values to FileSystem. +template +class HashTableSaveToFileSystemGpuOp : public OpKernel { + public: + explicit HashTableSaveToFileSystemGpuOp(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("dirpath_env", &dirpath_env_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("append_to_file", &append_to_file_)); + int64 signed_buffer_size = 0; + OP_REQUIRES_OK(ctx, ctx->GetAttr("buffer_size", &signed_buffer_size)); + buffer_size_ = static_cast(signed_buffer_size); + } + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + + string dirpath; + TF_CHECK_OK(ReadStringFromEnvVar(dirpath_env_, "NotFound", &dirpath)); + if (dirpath != "NotFound") { + LOG(INFO) << "Read TFRA key/value file directory path from the " + "environment variable " + << dirpath_env_ << " successfully. Saving directory path is " + << dirpath; + } else { + const Tensor& dir_tensor = ctx->input(1); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(dir_tensor.shape()), + errors::InvalidArgument("directory path must be scalar.")); + dirpath = string(dir_tensor.scalar()().data()); + } + + const Tensor& fname_tensor = ctx->input(2); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(fname_tensor.shape()), + errors::InvalidArgument("file name must be scalar.")); + string file_name = string(fname_tensor.scalar()().data()); + + lookup::CuckooHashTableOfTensorsGpu* table_cuckoo = + (lookup::CuckooHashTableOfTensorsGpu*)table; + OP_REQUIRES_OK( + ctx, table_cuckoo->SaveToFileSystem(ctx, dirpath, file_name, + buffer_size_, append_to_file_)); + } + + private: + string dirpath_env_; + bool append_to_file_; + size_t buffer_size_; +}; + // Clear the table and insert data. class HashTableImportGpuOp : public OpKernel { public: @@ -676,6 +953,56 @@ REGISTER_KERNEL_BUILDER( Name(PREFIX_OP_NAME(CuckooHashTableImport)).Device(DEVICE_GPU), HashTableImportGpuOp); +// Clear the table and insert data from FileSystem. +template +class HashTableLoadFromFileSystemGpuOp : public OpKernel { + public: + explicit HashTableLoadFromFileSystemGpuOp(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("dirpath_env", &dirpath_env_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("load_entire_dir", &load_entire_dir_)); + int64 signed_buffer_size = 0; + OP_REQUIRES_OK(ctx, ctx->GetAttr("buffer_size", &signed_buffer_size)); + buffer_size_ = static_cast(signed_buffer_size); + } + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + + string dirpath; + TF_CHECK_OK(ReadStringFromEnvVar(dirpath_env_, "NotFound", &dirpath)); + if (dirpath != "NotFound") { + LOG(INFO) << "Read TFRA key/value file directory path from the " + "environment variable " + << dirpath_env_ << " successfully. 
Saving directory path is " + << dirpath; + } else { + const Tensor& dir_tensor = ctx->input(1); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(dir_tensor.shape()), + errors::InvalidArgument("directory path must be scalar.")); + dirpath = string(dir_tensor.scalar()().data()); + } + + const Tensor& fname_tensor = ctx->input(2); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(fname_tensor.shape()), + errors::InvalidArgument("file name must be scalar.")); + string file_name = string(fname_tensor.scalar()().data()); + + lookup::CuckooHashTableOfTensorsGpu* table_cuckoo = + (lookup::CuckooHashTableOfTensorsGpu*)table; + OP_REQUIRES_OK( + ctx, table_cuckoo->LoadFromFileSystem(ctx, dirpath, file_name, + buffer_size_, load_entire_dir_)); + } + + private: + string dirpath_env_; + bool load_entire_dir_; + size_t buffer_size_; +}; + // Register the CuckooHashTableOfTensors op. #define REGISTER_KERNEL(key_dtype, value_dtype) \ @@ -702,7 +1029,19 @@ REGISTER_KERNEL_BUILDER( .Device(DEVICE_GPU) \ .TypeConstraint("Tin") \ .TypeConstraint("Tout"), \ - HashTableFindWithExistsGpuOp) + HashTableFindWithExistsGpuOp) \ + REGISTER_KERNEL_BUILDER( \ + Name(PREFIX_OP_NAME(CuckooHashTableSaveToFileSystem)) \ + .Device(DEVICE_GPU) \ + .TypeConstraint("key_dtype") \ + .TypeConstraint("value_dtype"), \ + HashTableSaveToFileSystemGpuOp); \ + REGISTER_KERNEL_BUILDER( \ + Name(PREFIX_OP_NAME(CuckooHashTableLoadFromFileSystem)) \ + .Device(DEVICE_GPU) \ + .TypeConstraint("key_dtype") \ + .TypeConstraint("value_dtype"), \ + HashTableLoadFromFileSystemGpuOp); REGISTER_KERNEL(int64, float); REGISTER_KERNEL(int64, Eigen::half); diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_cpu.h b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_cpu.h index 0e2f8fc20..4e3f2c9fc 100644 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_cpu.h +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_cpu.h @@ -29,8 +29,9 @@ limitations under the License. 
#include "tensorflow/core/lib/gtl/inlined_vector.h" #include "tensorflow/core/lib/io/buffered_inputstream.h" #include "tensorflow/core/lib/io/random_inputstream.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/file_system.h" #include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/cuckoo/cuckoohash_map.hh" -#include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/hadoop_file_system/hadoop_file_system.h" #include "tensorflow_recommenders_addons/dynamic_embedding/core/utils/types.h" namespace tensorflow { @@ -118,11 +119,14 @@ class TableWrapperBase { public: virtual ~TableWrapperBase() {} virtual bool insert_or_assign(K key, ConstTensor2D& value_flat, - int64 value_dim, int64 index) { + int64 value_dim, int64 index) const { + return false; + } + virtual bool insert_or_assign_one(K key, V* value, int64 value_dim) const { return false; } virtual bool insert_or_accum(K key, ConstTensor2D& value_or_delta_flat, - bool exist, int64 value_dim, int64 index) { + bool exist, int64 value_dim, int64 index) const { return false; } virtual void find(const K& key, Tensor2D& value_flat, @@ -132,18 +136,13 @@ class TableWrapperBase { ConstTensor2D& default_flat, bool& exist, int64 value_dim, bool is_full_size_default, int64 index) const {} - virtual size_t size() const { return 0; } - virtual void clear() {} - virtual bool erase(const K& key) { return false; } - virtual Status export_values(OpKernelContext* ctx, int64 value_dim) { - return Status::OK(); + virtual size_t dump(K* keys, V* values, const size_t search_offset, + const size_t search_length) const { + return 0; } - virtual Status save_to_hdfs(OpKernelContext* ctx, int64 value_dim, - const string& filepath, - const size_t buffer_size) {} - virtual Status load_from_hdfs(OpKernelContext* ctx, int64 value_dim, - const string& filepath, - const size_t buffer_size) {} + virtual size_t size() const { return 0; } + virtual void clear() const {} + virtual bool erase(const K& key) const { return false; } }; template @@ -164,7 +163,7 @@ class TableWrapperOptimized final : public TableWrapperBase { ~TableWrapperOptimized() override { delete table_; } bool insert_or_assign(K key, ConstTensor2D& value_flat, int64 value_dim, - int64 index) override { + int64 index) const override { ValueType value_vec; for (int64 j = 0; j < value_dim; j++) { V value = value_flat(index, j); @@ -173,8 +172,15 @@ class TableWrapperOptimized final : public TableWrapperBase { return table_->insert_or_assign(key, value_vec); } + bool insert_or_assign_one(K key, V* value, int64 value_dim) const override { + assert(value_dim == DIM); + ValueType value_vec; + std::copy_n(value, DIM, (V*)value_vec.data()); + return table_->insert_or_assign(key, value_vec); + } + bool insert_or_accum(K key, ConstTensor2D& value_or_delta_flat, bool exist, - int64 value_dim, int64 index) override { + int64 value_dim, int64 index) const override { ValueType value_or_delta_vec; for (int64 j = 0; j < value_dim; j++) { value_or_delta_vec[j] = value_or_delta_flat(index, j); @@ -215,110 +221,46 @@ class TableWrapperOptimized final : public TableWrapperBase { } } - size_t size() const override { return table_->size(); } - - void clear() override { table_->clear(); } - - bool erase(const K& key) override { return table_->erase(key); } - - Status export_values(OpKernelContext* ctx, int64 value_dim) override { + size_t dump(K* keys, V* values, const size_t search_offset, + const size_t search_length) const override { auto lt = table_->lock_table(); - int64 size = 
lt.size(); - - Tensor* keys; - Tensor* values; - TF_RETURN_IF_ERROR( - ctx->allocate_output("keys", TensorShape({size}), &keys)); - TF_RETURN_IF_ERROR(ctx->allocate_output( - "values", TensorShape({size, value_dim}), &values)); - - auto keys_data = keys->flat(); - auto values_data = values->matrix(); - int64 i = 0; - - for (auto it = lt.begin(); it != lt.end(); ++it, ++i) { - K key = it->first; - ValueType value = it->second; - keys_data(i) = key; - for (int64 j = 0; j < value_dim; j++) { - values_data(i, j) = value.at(j); - } + auto lt_size = lt.size(); + if (search_offset > lt_size || lt_size == 0) { + return 0; } - return Status::OK(); - } - - Status save_to_hdfs(OpKernelContext* ctx, int64 value_dim, - const string& filepath, - const size_t buffer_size) override { - size_t dim = static_cast(value_dim); - auto lt = table_->lock_table(); - - HadoopFileSystem hdfs; - std::unique_ptr writer; - const string tmp_file = filepath + ".tmp"; - TF_RETURN_IF_ERROR(hdfs.NewWritableFile(tmp_file, &writer)); - - const uint32 value_len = sizeof(V) * dim; - const uint32 record_len = sizeof(K) + value_len; - uint64 pos = 0; - uint8 content[buffer_size + record_len]; - - for (auto it = lt.begin(); it != lt.end(); ++it) { - K k = it->first; - std::memcpy(content + pos, reinterpret_cast(&k), sizeof(K)); - - const auto& jt = it->second.data(); - std::memcpy(content + pos + sizeof(K), reinterpret_cast(jt), - value_len); - - pos += record_len; - if (pos > buffer_size) { - TF_RETURN_IF_ERROR( - writer->Append(StringPiece(reinterpret_cast(content), pos))); - pos = 0; + auto search_begin = lt.begin(); + for (size_t i = 0; i < search_offset; ++i) { + ++search_begin; + } + auto search_end = search_begin; + if ((search_offset + search_length) >= lt_size) { + search_end = lt.end(); + } else { + for (size_t i = 0; i < search_length; ++i) { + ++search_end; } } - if (pos > 0) { - TF_RETURN_IF_ERROR( - writer->Append(StringPiece(reinterpret_cast(content), pos))); + constexpr const size_t value_dim = DIM; + K* key_ptr = keys; + V* val_ptr = values; + size_t dump_counter = 0; + for (auto it = search_begin; it != search_end; + ++it, ++key_ptr, val_ptr += value_dim) { + const K& key = it->first; + const ValueType& value = it->second; + *key_ptr = key; + std::copy_n((V*)value.data(), value_dim, val_ptr); + ++dump_counter; } - - TF_RETURN_IF_ERROR(writer->Close()); - TF_RETURN_IF_ERROR(hdfs.RenameFile(tmp_file, filepath)); - return Status::OK(); + return dump_counter; } - Status load_from_hdfs(OpKernelContext* ctx, int64 value_dim, - const string& filepath, - const size_t buffer_size) override { - size_t dim = static_cast(value_dim); - - HadoopFileSystem hdfs; - std::unique_ptr file; - TF_RETURN_IF_ERROR(hdfs.NewRandomAccessFile(filepath, &file)); - std::unique_ptr input_stream( - new io::RandomAccessInputStream(file.get())); - io::BufferedInputStream reader(input_stream.get(), buffer_size); - - uint64 file_size = 0; - TF_RETURN_IF_ERROR(hdfs.GetFileSize(filepath, &file_size)); - - tstring content; - const uint32 value_len = sizeof(V) * dim; - const uint32 record_len = sizeof(K) + value_len; - uint64 i = 0; - - while (i < file_size) { - TF_RETURN_IF_ERROR(reader.ReadNBytes(record_len, &content)); - K* k = reinterpret_cast(content.data()); - ValueType* value_vec = - reinterpret_cast(content.data() + sizeof(K)); - table_->insert_or_assign(*k, *value_vec); - i += record_len; - } - return Status::OK(); - } + size_t size() const override { return table_->size(); } + + void clear() const override { table_->clear(); } + + bool 
erase(const K& key) const override { return table_->erase(key); } private: size_t init_size_; @@ -343,7 +285,7 @@ class TableWrapperDefault final : public TableWrapperBase { ~TableWrapperDefault() override { delete table_; } bool insert_or_assign(K key, ConstTensor2D& value_flat, int64 value_dim, - int64 index) override { + int64 index) const override { ValueType value_vec; for (int64 j = 0; j < value_dim; j++) { V value = value_flat(index, j); @@ -353,7 +295,7 @@ class TableWrapperDefault final : public TableWrapperBase { } bool insert_or_accum(K key, ConstTensor2D& value_or_delta_flat, bool exist, - int64 value_dim, int64 index) override { + int64 value_dim, int64 index) const override { ValueType value_or_delta_vec; for (int64 j = 0; j < value_dim; j++) { value_or_delta_vec.push_back(value_or_delta_flat(index, j)); @@ -361,6 +303,12 @@ class TableWrapperDefault final : public TableWrapperBase { return table_->insert_or_accum(key, value_or_delta_vec, exist); } + bool insert_or_assign_one(K key, V* value, int64 value_dim) const override { + ValueType value_vec; + std::copy_n(value, value_dim, (V*)value_vec.data()); + return table_->insert_or_assign(key, value_vec); + } + void find(const K& key, typename tensorflow::TTypes::Tensor& value_flat, ConstTensor2D& default_flat, int64 value_dim, bool is_full_size_default, int64 index) const override { @@ -394,38 +342,47 @@ class TableWrapperDefault final : public TableWrapperBase { } } - size_t size() const override { return table_->size(); } - - void clear() override { table_->clear(); } - - bool erase(const K& key) override { return table_->erase(key); } - - Status export_values(OpKernelContext* ctx, int64 value_dim) override { + size_t dump(K* keys, V* values, const size_t search_offset, + const size_t search_length) const override { auto lt = table_->lock_table(); - int64 size = lt.size(); - - Tensor* keys; - Tensor* values; - TF_RETURN_IF_ERROR( - ctx->allocate_output("keys", TensorShape({size}), &keys)); - TF_RETURN_IF_ERROR(ctx->allocate_output( - "values", TensorShape({size, value_dim}), &values)); - - auto keys_data = keys->flat(); - auto values_data = values->matrix(); - int64 i = 0; - - for (auto it = lt.begin(); it != lt.end(); ++it, ++i) { - K key = it->first; - ValueType value = it->second; - keys_data(i) = key; - for (int64 j = 0; j < value_dim; j++) { - values_data(i, j) = value.at(j); + auto lt_size = lt.size(); + if (search_offset > lt_size || lt_size == 0) { + return 0; + } + auto search_begin = lt.begin(); + for (size_t i = 0; i < search_offset; ++i) { + ++search_begin; + } + auto search_end = search_begin; + if ((search_offset + search_length) >= lt_size) { + search_end = lt.end(); + } else { + for (size_t i = 0; i < search_length; ++i) { + ++search_end; } } - return Status::OK(); + + const auto value_dim = (lt.begin()->second).size(); + K* key_ptr = keys; + V* val_ptr = values; + size_t dump_counter = 0; + for (auto it = search_begin; it != search_end; + ++it, ++key_ptr, val_ptr += value_dim) { + const K& key = it->first; + const ValueType& value = it->second; + *key_ptr = key; + std::copy_n((V*)value.data(), value_dim, val_ptr); + ++dump_counter; + } + return dump_counter; } + size_t size() const override { return table_->size(); } + + void clear() const override { table_->clear(); } + + bool erase(const K& key) const override { return table_->erase(key); } + private: size_t init_size_; Table* table_; diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_gpu.h 
b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_gpu.h index 2291cc287..353dade5b 100644 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_gpu.h +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_gpu.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TFRA_CORE_KERNELS_LOOKUP_TABLE_OP_GPU_H_ #define TFRA_CORE_KERNELS_LOOKUP_TABLE_OP_GPU_H_ +#include #include #include "tensorflow/core/framework/bounds_check.h" @@ -27,6 +28,10 @@ limitations under the License. #include "tensorflow/core/kernels/lookup_util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/io/buffered_inputstream.h" +#include "tensorflow/core/lib/io/random_inputstream.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/file_system.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/thread_annotations.h" #include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/nvhash/nv_hashtable.cuh" @@ -60,13 +65,16 @@ class TableWrapperBase { virtual void dump(K* d_key, ValueType* d_val, const size_t offset, const size_t search_length, size_t* d_dump_counter, cudaStream_t stream) const {} + virtual void rehash_if_needed(size_t* last_size_hint, size_t* max_size, + const size_t min_size, cudaStream_t stream, + const size_t new_keys = 0) {} virtual void get(const K* d_keys, ValueType* d_vals, bool* d_status, size_t len, ValueType* d_def_val, cudaStream_t stream, bool is_full_size_default) const {} virtual size_t get_size(cudaStream_t stream) const { return 0; } virtual size_t get_capacity() const { return 0; } - virtual void remove(const K* d_keys, size_t len, cudaStream_t stream) {} - virtual void clear(cudaStream_t stream) {} + virtual void remove(const K* d_keys, size_t len, cudaStream_t stream) const {} + virtual void clear(cudaStream_t stream) const {} }; template @@ -98,6 +106,72 @@ class TableWrapper final : public TableWrapperBase { table_->dump(d_key, d_val, offset, search_length, d_dump_counter, stream); } + void rehash_if_needed(size_t* last_size_hint, size_t* max_size, + const size_t min_size, cudaStream_t stream, + const size_t new_keys = 0) override { + K* d_keys; + gpu::ValueArrayBase* d_values; + constexpr auto runtime_dim = DIM; + size_t* d_dump_counter; + size_t new_max_size = *max_size; + const float max_load_factor = 0.75; + const float min_load_factor = 0.25; + + const bool should_check = + ((*last_size_hint + new_keys) > max_load_factor * *max_size); + if (!should_check) { + *last_size_hint += new_keys; + return; + } + + size_t total_size = table_->get_size(stream); + *last_size_hint = total_size; + CUDA_CHECK(cudaStreamSynchronize(stream)); + if (total_size >= max_load_factor * *max_size) { + new_max_size = *max_size * 2; + } + if (total_size min_size) { + new_max_size = *max_size / 2; + } + + // The table should be able to hold new_keys at least + if (new_max_size < total_size + new_keys) { + new_max_size = (total_size + new_keys) * 2; + } + + if (new_max_size != *max_size) { // rehash manually. 
+ size_t capacity = table_->get_capacity(); + size_t h_dump_counter = 0; + CUDA_CHECK(cudaMallocManaged((void**)&d_dump_counter, sizeof(size_t))); + CUDA_CHECK(cudaMallocManaged((void**)&d_keys, sizeof(K) * capacity)); + CUDA_CHECK(cudaMallocManaged((void**)&d_values, + sizeof(V) * runtime_dim * capacity)); + table_->dump(d_keys, (gpu::ValueArrayBase*)d_values, 0, capacity, + d_dump_counter, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + auto tmp_table_ = table_; + table_ = new Table(new_max_size); + delete tmp_table_; + tmp_table_ = NULL; + + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaMemcpy((size_t*)&h_dump_counter, (size_t*)d_dump_counter, + sizeof(size_t), cudaMemcpyDefault)); + table_->upsert((const K*)d_keys, (const gpu::ValueArrayBase*)d_values, + h_dump_counter, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaFree(d_keys)); + CUDA_CHECK(cudaFree(d_values)); + CUDA_CHECK(cudaFree(d_dump_counter)); + *max_size = new_max_size; + LOG(INFO) << "HashTable on GPU changes to new status: [size=" + << total_size << ", max_size=" << *max_size + << ", load factor=" << std::setprecision(2) + << (float)total_size / (float)*max_size << "]."; + } + } + void get(const K* d_keys, ValueType* d_vals, bool* d_status, size_t len, ValueType* d_def_val, cudaStream_t stream, bool is_full_size_default) const override { @@ -111,16 +185,16 @@ class TableWrapper final : public TableWrapperBase { size_t get_capacity() const override { return table_->get_capacity(); } - void remove(const K* d_keys, size_t len, cudaStream_t stream) override { + void remove(const K* d_keys, size_t len, cudaStream_t stream) const override { table_->remove(d_keys, len, stream); } - void clear(cudaStream_t stream) override { table_->clear(stream); } + void clear(cudaStream_t stream) const override { table_->clear(stream); } private: size_t max_size_; Table* table_; -}; +}; // namespace gpu #define CREATE_A_TABLE(DIM) \ do { \ diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/redis_impl/redis_cluster_connection_pool.hpp b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/redis_impl/redis_cluster_connection_pool.hpp index 15e17046e..0262b7ed0 100644 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/redis_impl/redis_cluster_connection_pool.hpp +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/redis_impl/redis_cluster_connection_pool.hpp @@ -39,7 +39,7 @@ template class RedisWrapper::value>::type> - : public RedisVirtualWrapper { + : public RedisBaseWrapper { private: ConnectionOptions conn_opts; ConnectionPoolOptions pool_opts; @@ -80,17 +80,19 @@ class RedisWrapper StartConn(size_t ip_port_count, Role role) { - conn_opts.host = redis_connection_params.redis_host_ip[ip_port_count]; - conn_opts.port = redis_connection_params.redis_host_port[ip_port_count]; + conn_opts.host = this->redis_connection_params.redis_host_ip[ip_port_count]; + conn_opts.port = + this->redis_connection_params.redis_host_port[ip_port_count]; - SetPublicConnParams(conn_opts, pool_opts, redis_connection_params); + this->SetPublicConnParams(conn_opts, pool_opts, + this->redis_connection_params); try { auto redis_client = std::make_shared( RedisInstance(conn_opts, pool_opts, role)); redis_client->set("key test for connecting", "val test for connecting", std::chrono::milliseconds(1)); - if (RedisClusterEnabled(redis_client) == false) { + if (this->RedisClusterEnabled(redis_client) == false) { LOG(ERROR) << "Now is cluster mode but try to connect 
Redis single node. " "Please check redis_connection_mode in config file."; @@ -110,14 +112,14 @@ class RedisWrapperredis_connection_params.redis_host_ip.size() == + this->redis_connection_params.redis_host_port.size()); auto role_read = Role::MASTER; - if (redis_connection_params.redis_read_access_slave) { + if (this->redis_connection_params.redis_read_access_slave) { role_read = Role::SLAVE; } - if (isRedisConnect == false) { - for (size_t i = 0; i < redis_connection_params.redis_host_ip.size(); + if (this->isRedisConnect == false) { + for (size_t i = 0; i < this->redis_connection_params.redis_host_ip.size(); ++i) { for (short j = 0; j < 10; j++) { if (redis_conn_read == nullptr) { @@ -127,19 +129,19 @@ class RedisWrapperisRedisConnect = true; return Status::OK(); } } LOG(WARNING) << "Can not access the host " - << redis_connection_params.redis_host_ip[i] + << this->redis_connection_params.redis_host_ip[i] << ". Delete it from the host list."; - redis_connection_params.redis_host_ip.erase( - redis_connection_params.redis_host_ip.begin() + i); - redis_connection_params.redis_host_port.erase( - redis_connection_params.redis_host_port.begin() + i); + this->redis_connection_params.redis_host_ip.erase( + this->redis_connection_params.redis_host_ip.begin() + i); + this->redis_connection_params.redis_host_port.erase( + this->redis_connection_params.redis_host_port.begin() + i); } - if (isRedisConnect == false) { + if (this->isRedisConnect == false) { LOG(ERROR) << "Can not connect to the Redis Cluster servers."; if (redis_conn_read == nullptr && redis_conn_write != nullptr) { return Status(error::UNAVAILABLE, @@ -241,17 +243,17 @@ class RedisWrapperredis_connection_params.storage_slice); for (size_t i = 0; i < ip_port_set.size(); ++i) { connection_options.host = ip_port_set[i].first; // Required. connection_options.port = ip_port_set[i].second; // Optional. The default port is 6379. - connection_options.user = redis_connection_params.redis_user; + connection_options.user = this->redis_connection_params.redis_user; connection_options.password = - redis_connection_params + this->redis_connection_params .redis_password; // Optional. No redis_password by default. connection_options.db = - redis_connection_params + this->redis_connection_params .redis_db; // Optional. Use the 0th database by default. 
redis_client.reset(new Redis(connection_options)); auto cmd_per_server = [](::sw::redis::Connection &connection, @@ -316,22 +318,22 @@ class RedisWrapperredis_connection_params.storage_slice) { LOG(INFO) << "There is already a corresponding table " << keys_prefix_name << " existing in Redis cluster servers"; return 1; } else if (keys_prefix_name_slices_in_redis.size() <= - redis_connection_params.storage_slice) { - LOG(WARNING) << "storage_slice in redis_connection_params which is " - << redis_connection_params.storage_slice + this->redis_connection_params.storage_slice) { + LOG(WARNING) << "storage_slice in this->redis_connection_params which is " + << this->redis_connection_params.storage_slice << " is bigger than the slices number of this " << keys_prefix_name << " in the Redis Cluster servers which is " << keys_prefix_name_slices_in_redis.size(); return 2; } else { - LOG(ERROR) << "storage_slice in redis_connection_params which is " - << redis_connection_params.storage_slice + LOG(ERROR) << "storage_slice in this->redis_connection_params which is " + << this->redis_connection_params.storage_slice << " did not equal to the slices number of this " << keys_prefix_name << " in the Redis Cluster servers which is " @@ -344,7 +346,7 @@ class RedisWrapper> ClusterNodesSlots( bool full_slots) override { std::vector> cluster_slots; - cluster_slots.reserve(redis_connection_params.storage_slice); + cluster_slots.reserve(this->redis_connection_params.storage_slice); auto cmd = [](::sw::redis::Connection &connection, ::sw::redis::StringView hkey) { connection.send("CLUSTER NODES"); @@ -361,7 +363,7 @@ class RedisWrappertype == REDIS_REPLY_STRING) { std::vector> csv_table; std::vector<::sw::redis::StringView> csv_table_row; - csv_table.reserve(redis_connection_params.storage_slice * 2); + csv_table.reserve(this->redis_connection_params.storage_slice * 2); csv_table_row.reserve(10); const char *str_ptr = reply->str; const char *const str_ptr_begin = reply->str; @@ -493,7 +495,7 @@ class RedisWrapper MgetInBucket( - const Tensor &keys, const int64_t begin, const int64_t max_i, + const K *keys, const int64_t begin, const int64_t max_i, const std::string &keys_prefix_name_slice) override { std::unique_ptr bucket_context_temp(new BucketContext()); const static char *redis_command = "HMGET"; @@ -503,10 +505,8 @@ class RedisWrapperHandleClear(); bucket_context_temp->HandleReserve(argc); - const K *const pk_raw_end = - reinterpret_cast(keys.tensor_data().data()) + max_i; - const K *pk_raw = - reinterpret_cast(keys.tensor_data().data()) + begin; + const K *const pk_raw_end = keys + max_i; + const K *pk_raw = keys + begin; bucket_context_temp->HandlePushBack(redis_command, redis_command_byte); bucket_context_temp->HandlePushBack(keys_prefix_name_slice.data(), @@ -540,7 +540,7 @@ class RedisWrapper= 0) { + if (this->redis_connection_params.expire_model_tag_in_seconds >= 0) { // std::unique_ptr reply; const std::string expire_command("EXPIRE "); std::string redis_command; @@ -553,7 +553,8 @@ class RedisWrapperredis_connection_params.expire_model_tag_in_seconds); try { /*reply=*/redis_conn_write->command(cmd, bucket_name, redis_command.data()); @@ -615,7 +616,7 @@ class RedisWrapperredis_connection_params.storage_slice; ++i) { redis_command = "DUMP " + keys_prefix_name_slices[i]; reply.reset(); try { @@ -887,7 +888,8 @@ class RedisWrapper &keys_prefix_name_slices_old, const std::vector &keys_prefix_name_slices_new) override { try { - for (unsigned i = 0; i < redis_connection_params.storage_slice; ++i) { + for 
(unsigned i = 0; i < this->redis_connection_params.storage_slice; + ++i) { network_worker_pool->enqueue([this, &keys_prefix_name_slices_old, &keys_prefix_name_slices_new, i] { DoDuplicateInRedis(keys_prefix_name_slices_old[i], @@ -933,7 +935,7 @@ every bucket has its own BucketContext for sending data---for locating reply- */ virtual std::vector> MgetCommand( - const Tensor &keys, ThreadContext *thread_context, const int64_t begin, + const K *keys, ThreadContext *thread_context, const int64_t begin, const int64_t max_i, const std::vector &keys_prefix_name_slices) override { const int &&total = max_i - begin; @@ -942,15 +944,13 @@ every bucket has its own BucketContext for sending data---for locating reply- const static char *redis_command = "HMGET"; const static std::size_t &&redis_command_byte = 5; - const K *const pk_raw_end = - reinterpret_cast(keys.tensor_data().data()) + max_i; - const K *pk_raw = - reinterpret_cast(keys.tensor_data().data()) + begin; + const K *const pk_raw_end = keys + max_i; + const K *pk_raw = keys + begin; - const unsigned &storage_slice = redis_connection_params.storage_slice; + const unsigned &storage_slice = this->redis_connection_params.storage_slice; const unsigned &&vector_len = (static_cast(reinterpret_cast(argc)) / - redis_connection_params.storage_slice) + + this->redis_connection_params.storage_slice) + 2; thread_context->HandleReserve(storage_slice, vector_len, total); @@ -1029,24 +1029,19 @@ every bucket has its own BucketContext for sending data---for locating reply- } virtual Status MgetToTensor( - Tensor *values, const Tensor &default_value, const bool is_full_default, + V *values, const V *default_value, const bool is_full_default, ThreadContext *thread_context, std::vector> &reply, const int64_t begin, const int64_t max_i, const int64_t Velems_per_dim0) override { - const V *pv_raw = - reinterpret_cast(values->tensor_data().data()) + - begin * Velems_per_dim0; - const V *dft_raw = - reinterpret_cast(default_value.tensor_data().data()) + - begin * Velems_per_dim0; - const V *const dft_raw_begin = - reinterpret_cast(default_value.tensor_data().data()); + const V *pv_raw = values + begin * Velems_per_dim0; + const V *dft_raw = default_value + begin * Velems_per_dim0; + const V *const dft_raw_begin = default_value; const std::vector *bucket_locs = thread_context->bucket_locs.get(); - const unsigned &storage_slice = redis_connection_params.storage_slice; + const unsigned &storage_slice = this->redis_connection_params.storage_slice; unsigned buckets_iters_nums[storage_slice]; unsigned bucket_loc; memset(buckets_iters_nums, 0U, sizeof(buckets_iters_nums)); @@ -1089,25 +1084,19 @@ every bucket has its own BucketContext for sending data---for locating reply- } virtual Status MgetToTensorWithExist( - Tensor *values, const Tensor &default_value, Tensor &exists, + V *values, const V *default_value, bool *exists, const bool is_full_default, ThreadContext *thread_context, std::vector> &reply, const int64_t begin, const int64_t max_i, const int64_t Velems_per_dim0) override { - const V *pv_raw = - reinterpret_cast(values->tensor_data().data()) + - begin * Velems_per_dim0; - const V *dft_raw = - reinterpret_cast(default_value.tensor_data().data()) + - begin * Velems_per_dim0; - const V *const dft_raw_begin = - reinterpret_cast(default_value.tensor_data().data()); - auto exists_flat = exists.flat(); + const V *pv_raw = values + begin * Velems_per_dim0; + const V *dft_raw = default_value + begin * Velems_per_dim0; + const V *const dft_raw_begin = default_value; 
const std::vector *bucket_locs = thread_context->bucket_locs.get(); - const unsigned &storage_slice = redis_connection_params.storage_slice; + const unsigned &storage_slice = this->redis_connection_params.storage_slice; unsigned buckets_iters_nums[storage_slice]; unsigned bucket_loc; memset(buckets_iters_nums, 0U, sizeof(buckets_iters_nums)); @@ -1128,11 +1117,11 @@ every bucket has its own BucketContext for sending data---for locating reply- ReplyMemcpyToValTensor( pv_raw, temp_reply->str, Velems_per_dim0); // Direct access to Tensor data in TensorFlow - exists_flat(j) = true; + exists[j] = true; } else { CopyDefaultToTensor(is_full_default, pv_raw, dft_raw, dft_raw_begin, Velems_per_dim0); - exists_flat(j) = false; + exists[j] = false; } } } else { @@ -1145,7 +1134,7 @@ every bucket has its own BucketContext for sending data---for locating reply- ++(buckets_iters_nums[bucket_loc]); CopyDefaultToTensor(is_full_default, pv_raw, dft_raw, dft_raw_begin, Velems_per_dim0); - exists_flat(j) = false; + exists[j] = false; } } @@ -1153,7 +1142,7 @@ every bucket has its own BucketContext for sending data---for locating reply- } virtual Status MsetCommand( - const Tensor &keys, const Tensor &values, ThreadContext *thread_context, + const K *keys, const V *values, ThreadContext *thread_context, const int64_t begin, const int64_t max_i, const int64_t Velems_per_dim0, const std::vector &keys_prefix_name_slices) override { const int &&total = max_i - begin; @@ -1162,20 +1151,17 @@ every bucket has its own BucketContext for sending data---for locating reply- const static char *redis_command = "HMSET"; const static std::size_t &&redis_command_byte = 5; - const K *const pk_raw_end = - reinterpret_cast(keys.tensor_data().data()) + max_i; - const K *pk_raw = - reinterpret_cast(keys.tensor_data().data()) + begin; + const K *const pk_raw_end = keys + max_i; + const K *pk_raw = keys + begin; const std::size_t &&V_byte_size = Velems_per_dim0 * sizeof(V); - const V *pv_raw = reinterpret_cast(values.tensor_data().data()) + - begin * Velems_per_dim0; + const V *pv_raw = values + begin * Velems_per_dim0; - const unsigned &storage_slice = redis_connection_params.storage_slice; + const unsigned &storage_slice = this->redis_connection_params.storage_slice; const unsigned &&vector_len = (static_cast(reinterpret_cast(argc)) / - redis_connection_params.storage_slice) + + this->redis_connection_params.storage_slice) + 2; thread_context->HandleReserve(storage_slice, vector_len, total); @@ -1243,33 +1229,28 @@ every bucket has its own BucketContext for sending data---for locating reply- } virtual Status MaccumCommand( - const Tensor &keys, const Tensor &values_or_delta, const Tensor &exists, + const K *keys, const V *values_or_delta, const bool *exists, ThreadContext *thread_context, const int64_t begin, const int64_t max_i, - const int64_t Velems_per_dim0, + const int64_t Velems_per_dim0, std::string &values_dtype_str, const std::vector &keys_prefix_name_slices) override { const int &&total = max_i - begin; const int &&argc = total * 2 + 4; const static char *redis_command = "HMACCUM"; const static std::size_t &&redis_command_byte = 7; - std::string dTypestr = DataTypeString(values_or_delta.dtype()); - size_t dTypeStrsize = dTypestr.size(); + size_t dtype_str_size = values_dtype_str.size(); - const K *const pk_raw_end = - reinterpret_cast(keys.tensor_data().data()) + max_i; - const K *pk_raw = - reinterpret_cast(keys.tensor_data().data()) + begin; + const K *const pk_raw_end = keys + max_i; + const K *pk_raw = keys + begin; 
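The command methods in both connection pools also change their signatures here from `const Tensor &` to raw `const K *` / `const V *` / `bool *` parameters, with a separate `values_dtype_str` argument for HMACCUM now that the dtype can no longer be read off a Tensor. A plausible reason, consistent with the rest of this patch, is that the same command path can then be fed either from an OpKernel's tensors or from plain byte buffers such as the file-system restore buffers added further down. A rough sketch of the caller-side adaptation, using illustrative variable names:

    // Hypothetical call site: derive the raw pointers once, then hand them
    // to the pointer-based MsetCommand-style interface.
    const Tensor &keys = ctx->input(1);
    const Tensor &values = ctx->input(2);
    const int64_t total = keys.NumElements();
    const int64_t Velems_per_dim0 =
        total > 0 ? values.NumElements() / total : 0;
    const K *pk = reinterpret_cast<const K *>(keys.tensor_data().data());
    const V *pv = reinterpret_cast<const V *>(values.tensor_data().data());
    // The wrapper then walks [pk + begin, pk + max_i) directly instead of
    // re-deriving the pointers from the Tensor inside every command.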
const std::size_t &&V_byte_size = Velems_per_dim0 * sizeof(V); - const V *pv_raw = - reinterpret_cast(values_or_delta.tensor_data().data()) + - begin * Velems_per_dim0; + const V *pv_raw = values_or_delta + begin * Velems_per_dim0; - const unsigned &storage_slice = redis_connection_params.storage_slice; + const unsigned &storage_slice = this->redis_connection_params.storage_slice; const unsigned &&vector_len = (static_cast(reinterpret_cast(argc)) / - redis_connection_params.storage_slice) + + this->redis_connection_params.storage_slice) + 4; thread_context->HandleReserve(storage_slice, vector_len, total); @@ -1278,7 +1259,8 @@ every bucket has its own BucketContext for sending data---for locating reply- thread_context->HandlePushBack(i, redis_command, redis_command_byte); thread_context->HandlePushBack(i, keys_prefix_name_slices[i].data(), keys_prefix_name_slices[i].size()); - thread_context->HandlePushBack(i, dTypestr.c_str(), dTypeStrsize); + thread_context->HandlePushBack(i, values_dtype_str.c_str(), + dtype_str_size); } VContentAndTypeSizeResult VCATS_temp; @@ -1300,8 +1282,7 @@ every bucket has its own BucketContext for sending data---for locating reply- key_bucket_locs, VCATS_temp.VContentPointer, VCATS_temp.VTypeSize); } - const bool *pe_raw = - reinterpret_cast(exists.tensor_data().data()) + begin; + const bool *pe_raw = exists + begin; for (unsigned i = 0; i < storage_slice; ++i) { thread_context->HandlePushBack(i, KContentPointer(pe_raw), total * KTypeSize(pe_raw)); @@ -1345,7 +1326,7 @@ every bucket has its own BucketContext for sending data---for locating reply- } virtual Status DelCommand( - const Tensor &keys, ThreadContext *thread_context, const int64_t begin, + const K *keys, ThreadContext *thread_context, const int64_t begin, const int64_t max_i, const std::vector &keys_prefix_name_slices) override { const int &&total = max_i - begin; @@ -1354,15 +1335,13 @@ every bucket has its own BucketContext for sending data---for locating reply- const static char *redis_command = "HDEL"; const static std::size_t &&redis_command_byte = 4; - const K *const pk_raw_end = - reinterpret_cast(keys.tensor_data().data()) + max_i; - const K *pk_raw = - reinterpret_cast(keys.tensor_data().data()) + begin; + const K *const pk_raw_end = keys + max_i; + const K *pk_raw = keys + begin; - const unsigned &storage_slice = redis_connection_params.storage_slice; + const unsigned &storage_slice = this->redis_connection_params.storage_slice; const unsigned &&vector_len = (static_cast(reinterpret_cast(argc)) / - redis_connection_params.storage_slice) + + this->redis_connection_params.storage_slice) + 2; thread_context->HandleReserve(storage_slice, vector_len, total); diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/redis_impl/redis_connection_pool.hpp b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/redis_impl/redis_connection_pool.hpp index 2584e4033..53e4bf981 100644 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/redis_impl/redis_connection_pool.hpp +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/redis_impl/redis_connection_pool.hpp @@ -39,7 +39,7 @@ template class RedisWrapper< RedisInstance, K, V, typename std::enable_if::value>::type> - : public RedisVirtualWrapper { + : public RedisBaseWrapper { private: SentinelOptions sentinel_opts; bool using_sentinel = true; @@ -83,37 +83,39 @@ class RedisWrapper< } std::shared_ptr StartConn(Role role) { - assert(redis_connection_params.redis_host_ip.size() == - 
redis_connection_params.redis_host_port.size()); + assert(this->redis_connection_params.redis_host_ip.size() == + this->redis_connection_params.redis_host_port.size()); - SetPublicConnParams(conn_opts, pool_opts, redis_connection_params); + this->SetPublicConnParams(conn_opts, pool_opts, + this->redis_connection_params); if (this->using_sentinel) { sentinel_opts.nodes.clear(); - for (size_t i = 0; i < redis_connection_params.redis_host_ip.size(); + for (size_t i = 0; i < this->redis_connection_params.redis_host_ip.size(); ++i) { sentinel_opts.nodes.push_back( - {redis_connection_params.redis_host_ip[i], - redis_connection_params.redis_host_port[i]}); + {this->redis_connection_params.redis_host_ip[i], + this->redis_connection_params.redis_host_port[i]}); } - sentinel_opts.password = redis_connection_params.redis_sentinel_password; + sentinel_opts.password = + this->redis_connection_params.redis_sentinel_password; // Optional. Timeout before we successfully connect to Redis Sentinel. sentinel_opts.connect_timeout = std::chrono::milliseconds( - redis_connection_params.redis_sentinel_connect_timeout); + this->redis_connection_params.redis_sentinel_connect_timeout); // Optional. Timeout before we successfully send request to or receive // response from Redis Sentinel. sentinel_opts.socket_timeout = std::chrono::milliseconds( - redis_connection_params.redis_sentinel_socket_timeout); + this->redis_connection_params.redis_sentinel_socket_timeout); auto sentinel = std::make_shared(sentinel_opts); try { - auto redis_client = std::make_shared( - RedisInstance(sentinel, redis_connection_params.redis_master_name, - role, conn_opts, pool_opts)); + auto redis_client = std::make_shared(RedisInstance( + sentinel, this->redis_connection_params.redis_master_name, role, + conn_opts, pool_opts)); redis_client->ping(); - if (RedisClusterEnabled(redis_client) == true) { + if (this->RedisClusterEnabled(redis_client) == true) { LOG(ERROR) << "Now is sentinel mode but try to connect Redis cluster nodes. " "Please check redis_connection_mode in config file."; @@ -146,16 +148,17 @@ class RedisWrapper< std::shared_ptr start_conn_without_sentinel() { // Redis connection options - conn_opts.host = redis_connection_params.redis_host_ip[0]; - conn_opts.port = redis_connection_params.redis_host_port[0]; + conn_opts.host = this->redis_connection_params.redis_host_ip[0]; + conn_opts.port = this->redis_connection_params.redis_host_port[0]; - SetPublicConnParams(conn_opts, pool_opts, redis_connection_params); + this->SetPublicConnParams(conn_opts, pool_opts, + this->redis_connection_params); try { auto redis_client = std::make_shared(RedisInstance(conn_opts, pool_opts)); redis_client->ping(); - if (RedisClusterEnabled(redis_client) == true) { + if (this->RedisClusterEnabled(redis_client) == true) { LOG(ERROR) << "Now is single mode but try to connect Redis cluster nodes. 
" "Please check redis_connection_mode in config file."; @@ -176,10 +179,10 @@ class RedisWrapper< virtual Status Conn() override { auto role_read = Role::MASTER; - if (redis_connection_params.redis_read_access_slave) { + if (this->redis_connection_params.redis_read_access_slave) { role_read = Role::SLAVE; } - if (isRedisConnect == false) { + if (this->isRedisConnect == false) { for (short i = 0; i < 10; i++) { if (redis_conn_read == nullptr) { redis_conn_read = StartConn(role_read); @@ -188,11 +191,11 @@ class RedisWrapper< redis_conn_write = StartConn(Role::MASTER); } if (redis_conn_read != nullptr && redis_conn_write != nullptr) { - isRedisConnect = true; + this->isRedisConnect = true; return Status::OK(); } } - if (isRedisConnect == false) { + if (this->isRedisConnect == false) { LOG(ERROR) << "Can not connect to the Redis Master servers."; if (redis_conn_read == nullptr && redis_conn_write != nullptr) { return Status(error::UNAVAILABLE, @@ -226,7 +229,7 @@ class RedisWrapper< long long cursor = 0; const redisReply *set_reply; keys_prefix_name_slices_in_redis.reserve( - redis_connection_params.storage_slice); + this->redis_connection_params.storage_slice); while (true) { if (only_get_buckets) { redis_command = "SCAN " + std::to_string(cursor) + " MATCH " + @@ -282,22 +285,22 @@ class RedisWrapper< << " existing in Redis server"; return 0; } else if (keys_prefix_name_slices_in_redis.size() == - redis_connection_params.storage_slice) { + this->redis_connection_params.storage_slice) { LOG(INFO) << "There is already a corresponding table " << keys_prefix_name << " existing in Redis server"; return 1; } else if (keys_prefix_name_slices_in_redis.size() <= - redis_connection_params.storage_slice) { - LOG(WARNING) << "storage_slice in redis_connection_params which is " - << redis_connection_params.storage_slice + this->redis_connection_params.storage_slice) { + LOG(WARNING) << "storage_slice in this->redis_connection_params which is " + << this->redis_connection_params.storage_slice << " is bigger than the slices number of this " << keys_prefix_name << " in the Redis Cluster servers which is " << keys_prefix_name_slices_in_redis.size(); return 2; } else { - LOG(WARNING) << "storage_slice in redis_connection_params which is " - << redis_connection_params.storage_slice + LOG(WARNING) << "storage_slice in this->redis_connection_params which is " + << this->redis_connection_params.storage_slice << " did not equal to the slices number of this " << keys_prefix_name << " in the Redis Single servers which is " @@ -377,7 +380,7 @@ class RedisWrapper< } virtual std::unique_ptr MgetInBucket( - const Tensor &keys, const int64_t begin, const int64_t max_i, + const K *keys, const int64_t begin, const int64_t max_i, const std::string &keys_prefix_name_slice) override { std::unique_ptr bucket_context_temp(new BucketContext()); const static char *redis_command = "HMGET"; @@ -386,10 +389,8 @@ class RedisWrapper< bucket_context_temp->HandleClear(); bucket_context_temp->HandleReserve(argc); - const K *const pk_raw_end = - reinterpret_cast(keys.tensor_data().data()) + max_i; - const K *pk_raw = - reinterpret_cast(keys.tensor_data().data()) + begin; + const K *const pk_raw_end = keys + max_i; + const K *pk_raw = keys + begin; bucket_context_temp->HandlePushBack(redis_command, redis_command_byte); bucket_context_temp->HandlePushBack(keys_prefix_name_slice.data(), @@ -425,7 +426,7 @@ class RedisWrapper< virtual Status SetExpireBuckets( const std::string &keys_prefix_name) override { - if 
(redis_connection_params.expire_model_tag_in_seconds >= 0) { + if (this->redis_connection_params.expire_model_tag_in_seconds >= 0) { // std::unique_ptr reply; const std::string expire_command("EXPIRE "); std::string redis_command; @@ -438,7 +439,8 @@ class RedisWrapper< redis_command.clear(); redis_command = expire_command + bucket_name + ' ' + - std::to_string(redis_connection_params.expire_model_tag_in_seconds); + std::to_string( + this->redis_connection_params.expire_model_tag_in_seconds); try { /*reply=*/redis_conn_write->command(cmd, redis_command.data()); } catch (const std::exception &err) { @@ -498,7 +500,7 @@ class RedisWrapper< size_t buf_len; volatile void *tem_aio_buf; - for (unsigned i = 0; i < redis_connection_params.storage_slice; ++i) { + for (unsigned i = 0; i < this->redis_connection_params.storage_slice; ++i) { redis_command = "DUMP " + keys_prefix_name_slices[i]; reply.reset(); try { @@ -770,7 +772,8 @@ class RedisWrapper< const std::vector &keys_prefix_name_slices_old, const std::vector &keys_prefix_name_slices_new) override { try { - for (unsigned i = 0; i < redis_connection_params.storage_slice; ++i) { + for (unsigned i = 0; i < this->redis_connection_params.storage_slice; + ++i) { network_worker_pool->enqueue([this, &keys_prefix_name_slices_old, &keys_prefix_name_slices_new, i] { DoDuplicateInRedis(keys_prefix_name_slices_old[i], @@ -816,7 +819,7 @@ every bucket has its own BucketContext for sending data---for locating reply- */ virtual std::vector> MgetCommand( - const Tensor &keys, ThreadContext *thread_context, const int64_t begin, + const K *keys, ThreadContext *thread_context, const int64_t begin, const int64_t max_i, const std::vector &keys_prefix_name_slices) override { const int argc = (max_i - begin) + 2; @@ -829,10 +832,8 @@ every bucket has its own BucketContext for sending data---for locating reply- std::vector *ptrs_0 = thread_context->buckets[0]->ptrs.get(); std::vector *sizes_0 = thread_context->buckets[0]->sizes.get(); - const K *const pk_raw_end = - reinterpret_cast(keys.tensor_data().data()) + max_i; - const K *pk_raw = - reinterpret_cast(keys.tensor_data().data()) + begin; + const K *const pk_raw_end = keys + max_i; + const K *pk_raw = keys + begin; auto ptrs_iter = ptrs_0->begin(); *ptrs_iter = redis_command; @@ -891,21 +892,16 @@ every bucket has its own BucketContext for sending data---for locating reply- } virtual Status MgetToTensor( - Tensor *values, const Tensor &default_value, const bool is_full_default, + V *values, const V *default_value, const bool is_full_default, ThreadContext *thread_context, std::vector> &reply, const int64_t begin, const int64_t max_i, const int64_t Velems_per_dim0) override { - const V *pv_raw = - reinterpret_cast(values->tensor_data().data()) + - begin * Velems_per_dim0; + const V *pv_raw = values + begin * Velems_per_dim0; - const V *dft_raw = - reinterpret_cast(default_value.tensor_data().data()) + - begin * Velems_per_dim0; - const V *const dft_raw_begin = - reinterpret_cast(default_value.tensor_data().data()); + const V *dft_raw = default_value + begin * Velems_per_dim0; + const V *const dft_raw_begin = default_value; redisReply *temp_reply; bool print_once = false; @@ -941,22 +937,16 @@ every bucket has its own BucketContext for sending data---for locating reply- } virtual Status MgetToTensorWithExist( - Tensor *values, const Tensor &default_value, Tensor &exists, + V *values, const V *default_value, bool *exists, const bool is_full_default, ThreadContext *thread_context, std::vector> &reply, const int64_t 
begin, const int64_t max_i, const int64_t Velems_per_dim0) override { - const V *pv_raw = - reinterpret_cast(values->tensor_data().data()) + - begin * Velems_per_dim0; + const V *pv_raw = values + begin * Velems_per_dim0; - const V *dft_raw = - reinterpret_cast(default_value.tensor_data().data()) + - begin * Velems_per_dim0; - const V *const dft_raw_begin = - reinterpret_cast(default_value.tensor_data().data()); - auto exists_flat = exists.flat(); + const V *dft_raw = default_value + begin * Velems_per_dim0; + const V *const dft_raw_begin = default_value; redisReply *temp_reply; bool print_once = false; @@ -971,11 +961,11 @@ every bucket has its own BucketContext for sending data---for locating reply- ReplyMemcpyToValTensor( pv_raw, temp_reply->str, Velems_per_dim0); // Direct access to Tensor data in TensorFlow - exists_flat(j) = true; + exists[j] = true; } else { CopyDefaultToTensor(is_full_default, pv_raw, dft_raw, dft_raw_begin, Velems_per_dim0); - exists_flat(j) = false; + exists[j] = false; } } } else { @@ -987,7 +977,7 @@ every bucket has its own BucketContext for sending data---for locating reply- } CopyDefaultToTensor(is_full_default, pv_raw, dft_raw, dft_raw_begin, Velems_per_dim0); - exists_flat(j) = false; + exists[j] = false; } } @@ -995,7 +985,7 @@ every bucket has its own BucketContext for sending data---for locating reply- } virtual Status MsetCommand( - const Tensor &keys, const Tensor &values, ThreadContext *thread_context, + const K *keys, const V *values, ThreadContext *thread_context, const int64_t begin, const int64_t max_i, const int64_t Velems_per_dim0, const std::vector &keys_prefix_name_slices) override { const int &&total = max_i - begin; @@ -1009,15 +999,12 @@ every bucket has its own BucketContext for sending data---for locating reply- std::vector *ptrs_0 = thread_context->buckets[0]->ptrs.get(); std::vector *sizes_0 = thread_context->buckets[0]->sizes.get(); - const K *const pk_raw_end = - reinterpret_cast(keys.tensor_data().data()) + max_i; - const K *pk_raw = - reinterpret_cast(keys.tensor_data().data()) + begin; + const K *const pk_raw_end = keys + max_i; + const K *pk_raw = keys + begin; const std::size_t &&V_byte_size = Velems_per_dim0 * sizeof(V); - const V *pv_raw = reinterpret_cast(values.tensor_data().data()) + - begin * Velems_per_dim0; + const V *pv_raw = values + begin * Velems_per_dim0; auto ptrs_iter = ptrs_0->begin(); *ptrs_iter = redis_command; @@ -1072,39 +1059,34 @@ every bucket has its own BucketContext for sending data---for locating reply- } virtual Status MaccumCommand( - const Tensor &keys, const Tensor &values_or_delta, const Tensor &exists, + const K *keys, const V *values_or_delta, const bool *exists, ThreadContext *thread_context, const int64_t begin, const int64_t max_i, - const int64_t Velems_per_dim0, + const int64_t Velems_per_dim0, std::string &values_dtype_str, const std::vector &keys_prefix_name_slices) override { const int &&total = max_i - begin; const int &&argc = total * 2 + 4; const static char *redis_command = "HMACCUM"; const static std::size_t redis_command_byte = 7; - std::string dTypestr = DataTypeString(values_or_delta.dtype()); thread_context->HandleReserve(1U, argc, 0); std::vector *ptrs_0 = thread_context->buckets[0]->ptrs.get(); std::vector *sizes_0 = thread_context->buckets[0]->sizes.get(); - const K *const pk_raw_end = - reinterpret_cast(keys.tensor_data().data()) + max_i; - const K *pk_raw = - reinterpret_cast(keys.tensor_data().data()) + begin; + const K *const pk_raw_end = keys + max_i; + const K *pk_raw = 
keys + begin; const std::size_t &&V_byte_size = Velems_per_dim0 * sizeof(V); - const V *pv_raw = - reinterpret_cast(values_or_delta.tensor_data().data()) + - begin * Velems_per_dim0; + const V *pv_raw = values_or_delta + begin * Velems_per_dim0; auto ptrs_iter = ptrs_0->begin(); *ptrs_iter = redis_command; ++ptrs_iter; *ptrs_iter = keys_prefix_name_slices[0].data(); ++ptrs_iter; - *ptrs_iter = dTypestr.c_str(); + *ptrs_iter = values_dtype_str.c_str(); ++ptrs_iter; auto sizes_iter = sizes_0->begin(); @@ -1112,7 +1094,7 @@ every bucket has its own BucketContext for sending data---for locating reply- ++sizes_iter; *sizes_iter = keys_prefix_name_slices[0].size(); ++sizes_iter; - *sizes_iter = dTypestr.size(); + *sizes_iter = values_dtype_str.size(); ++sizes_iter; VContentAndTypeSizeResult VCATS_temp; @@ -1134,8 +1116,7 @@ every bucket has its own BucketContext for sending data---for locating reply- ++sizes_iter; } - const bool *pe_raw = - reinterpret_cast(exists.tensor_data().data()) + begin; + const bool *pe_raw = exists + begin; *ptrs_iter = KContentPointer(pe_raw); *sizes_iter = total * KTypeSize(pe_raw); @@ -1161,7 +1142,7 @@ every bucket has its own BucketContext for sending data---for locating reply- } virtual Status DelCommand( - const Tensor &keys, ThreadContext *thread_context, const int64_t begin, + const K *keys, ThreadContext *thread_context, const int64_t begin, const int64_t max_i, const std::vector &keys_prefix_name_slices) override { const int argc = (max_i - begin) + 2; @@ -1174,10 +1155,8 @@ every bucket has its own BucketContext for sending data---for locating reply- std::vector *ptrs_0 = thread_context->buckets[0]->ptrs.get(); std::vector *sizes_0 = thread_context->buckets[0]->sizes.get(); - const K *const pk_raw_end = - reinterpret_cast(keys.tensor_data().data()) + max_i; - const K *pk_raw = - reinterpret_cast(keys.tensor_data().data()) + begin; + const K *const pk_raw_end = keys + max_i; + const K *pk_raw = keys + begin; auto ptrs_iter = ptrs_0->begin(); *ptrs_iter = redis_command; diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/redis_impl/redis_connection_util.hpp b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/redis_impl/redis_connection_util.hpp index 36e1ade9b..fa472270e 100644 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/redis_impl/redis_connection_util.hpp +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/redis_impl/redis_connection_util.hpp @@ -322,7 +322,8 @@ class ThreadContext { typedef unsigned (*KBucketNumHandle)(uint32_t, const uint8_t *, size_t); -class RedisVirtualWrapper { +template +class RedisBaseWrapper { protected: Redis_Connection_Params redis_connection_params; KBucketNumHandle K_bucket_num_handle; @@ -413,7 +414,7 @@ class RedisVirtualWrapper { long long *cursor, const long long count) = 0; virtual std::unique_ptr MgetInBucket( - const Tensor &keys, const int64_t begin, const int64_t max_i, + const K *, const int64_t begin, const int64_t max_i, const std::string &keys_prefix_name_slice) = 0; virtual Status SetExpireBuckets(const std::string &keys_prefix_name) = 0; @@ -434,12 +435,12 @@ class RedisVirtualWrapper { const std::vector &keys_prefix_name_slices_new) = 0; virtual std::vector> - MgetCommand(const Tensor &keys, ThreadContext *thread_context, - const int64_t begin, const int64_t max_i, + MgetCommand(const K *, ThreadContext *thread_context, const int64_t begin, + const int64_t max_i, const std::vector &keys_prefix_name_slices) = 0; virtual Status MgetToTensor( 
- Tensor *values, const Tensor &default_value, const bool is_full_default, + V *values, const V *default_value, const bool is_full_default, ThreadContext *thread_context, std::vector> &reply, @@ -447,7 +448,7 @@ class RedisVirtualWrapper { const int64_t Velems_per_dim0) = 0; virtual Status MgetToTensorWithExist( - Tensor *values, const Tensor &default_value, Tensor &exists, + V *values, const V *default_value, bool *exists, const bool is_full_default, ThreadContext *thread_context, std::vector> &reply, @@ -455,24 +456,24 @@ class RedisVirtualWrapper { const int64_t Velems_per_dim0) = 0; virtual Status MsetCommand( - const Tensor &keys, const Tensor &values, ThreadContext *thread_context, + const K *, const V *values, ThreadContext *thread_context, const int64_t begin, const int64_t max_i, const int64_t Velems_per_dim0, const std::vector &keys_prefix_name_slices) = 0; virtual Status MaccumCommand( - const Tensor &keys, const Tensor &values, const Tensor &exists, + const K *, const V *values, const bool *exists, ThreadContext *thread_context, const int64_t begin, const int64_t max_i, - const int64_t Velems_per_dim0, + const int64_t Velems_per_dim0, std::string &values_dtype_str, const std::vector &keys_prefix_name_slices) = 0; virtual Status DelCommand( - const Tensor &keys, ThreadContext *thread_context, const int64_t begin, + const K *, ThreadContext *thread_context, const int64_t begin, const int64_t max_i, const std::vector &keys_prefix_name_slices) = 0; }; template -class RedisWrapper : public RedisVirtualWrapper {}; +class RedisWrapper : public RedisBaseWrapper {}; struct VContentAndTypeSizeResult { size_t VTypeSize; diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/redis_impl/redis_table_op_util.hpp b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/redis_impl/redis_table_op_util.hpp index 98bbdb7d8..27508bdec 100644 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/redis_impl/redis_table_op_util.hpp +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/redis_impl/redis_table_op_util.hpp @@ -59,10 +59,11 @@ size_t SelectAvailableThreadContext( return thread_context_id; } -Status launchFindCore(std::shared_ptr _table_instance, +template +Status launchFindCore(std::shared_ptr> _table_instance, std::vector &keys_prefix_name_slices, - const Tensor &keys, Tensor *values, - const Tensor &default_value, const bool is_full_default, + const K *keys, V *values, const V *default_value, + const bool is_full_default, const int64_t &Velems_per_flat2_dim0, std::vector &threads_Find, std::mutex &threads_Find_mutex, const int64_t begin, @@ -85,11 +86,12 @@ Status launchFindCore(std::shared_ptr _table_instance, return statu; } +template Status launchFindWithExistsCore( - std::shared_ptr _table_instance, - std::vector &keys_prefix_name_slices, const Tensor &keys, - Tensor *values, const Tensor &default_value, Tensor &exists, - const bool is_full_default, const int64_t &Velems_per_flat2_dim0, + std::shared_ptr> _table_instance, + std::vector &keys_prefix_name_slices, const K *keys, V *values, + const V *default_value, bool *exists, const bool is_full_default, + const int64_t &Velems_per_flat2_dim0, std::vector &threads_Find, std::mutex &threads_Find_mutex, const int64_t begin, const int64_t end) { // TODO: Implement the function of not looking up the table if the key does @@ -112,9 +114,10 @@ Status launchFindWithExistsCore( return statu; } -Status launchInsertCore(std::shared_ptr _table_instance, +template +Status 
launchInsertCore(std::shared_ptr> _table_instance, std::vector &keys_prefix_name_slices, - const Tensor &keys, const Tensor &values, + const K *keys, const V *values, const int64_t &Velems_per_flat2_dim0, std::vector &threads_Insert, std::mutex &threads_Insert_mutex, const int64_t begin, @@ -132,11 +135,12 @@ Status launchInsertCore(std::shared_ptr _table_instance, return statu; } -Status launchAccumCore(std::shared_ptr _table_instance, +template +Status launchAccumCore(std::shared_ptr> _table_instance, std::vector &keys_prefix_name_slices, - const Tensor &keys, const Tensor &values_or_delta, - const Tensor &exists, - const int64_t &Velems_per_flat2_dim0, + const K *keys, const V *values_or_delta, + const bool *exists, const int64_t &Velems_per_flat2_dim0, + std::string &values_dtype_str, std::vector &threads_Insert, std::mutex &threads_Accum_mutex, const int64_t begin, const int64_t end) { @@ -145,7 +149,8 @@ Status launchAccumCore(std::shared_ptr _table_instance, auto statu = _table_instance->MaccumCommand( keys, values_or_delta, exists, threads_Insert.at(thread_context_id), - begin, end, Velems_per_flat2_dim0, keys_prefix_name_slices); + begin, end, Velems_per_flat2_dim0, values_dtype_str, + keys_prefix_name_slices); threads_Insert[thread_context_id]->thread_occupied.store( false, std::memory_order_release); @@ -153,9 +158,10 @@ Status launchAccumCore(std::shared_ptr _table_instance, return statu; } -Status launchDeleteCore(std::shared_ptr _table_instance, +template +Status launchDeleteCore(std::shared_ptr> _table_instance, std::vector &keys_prefix_name_slices, - const Tensor &keys, + const K *keys, std::vector &threads_Delete, std::mutex &threads_Delete_mutex, const int64_t begin, const int64_t end) { diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/redis_table_op.cc b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/redis_table_op.cc index 36314ae83..de4cc89fc 100644 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/redis_table_op.cc +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/redis_table_op.cc @@ -33,6 +33,11 @@ limitations under the License. 
#include "redis_impl/redis_connection_pool.hpp" #include "redis_impl/redis_table_op_util.hpp" #include "tensorflow/core/kernels/lookup_table_op.h" +#include "tensorflow/core/lib/io/buffered_inputstream.h" +#include "tensorflow/core/lib/io/random_inputstream.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/file_system.h" +#include "tensorflow/core/platform/path.h" #include "tensorflow/core/util/work_sharder.h" #include "tensorflow_recommenders_addons/dynamic_embedding/core/utils/types.h" #include "tensorflow_recommenders_addons/dynamic_embedding/core/utils/utils.h" @@ -71,7 +76,7 @@ class RedisTableOfTensors final : public LookupInterface { std::vector keys_prefix_name_slices; std::vector keys_prefix_name_slices_import; - std::shared_ptr _table_instance = nullptr; + std::shared_ptr> _table_instance = nullptr; std::vector threads_Find; std::vector threads_Insert; @@ -93,13 +98,11 @@ class RedisTableOfTensors final : public LookupInterface { private: void launchFind_parallel(OpKernelContext *ctx, std::vector &keys_prefix_name_slices, - const Tensor &keys, Tensor *values, - const Tensor &default_value, const int64_t &total, + const K *keys, V *values, const V *default_value, + const int64_t &total, const int64_t &Velems_per_flat2_dim0, + const bool is_full_default, std::vector &threads_Find) { - const bool is_full_default = - (values->NumElements() == default_value.NumElements()); - const int64_t max_parallelism = (total / multi_redis_cmd_max_argc) + 1; auto shard = [this, &ctx, &total, &keys_prefix_name_slices, &keys, &values, @@ -107,9 +110,9 @@ class RedisTableOfTensors final : public LookupInterface { &threads_Find](int64_t begin, int64_t end) { const int64_t max_i = std::min(total, end); - OP_REQUIRES_OK( - ctx, - launchFindCore(_table_instance, keys_prefix_name_slices, keys, values, + OP_REQUIRES_OK(ctx, + launchFindCore( + _table_instance, keys_prefix_name_slices, keys, values, default_value, is_full_default, Velems_per_flat2_dim0, threads_Find, threads_Find_mutex, begin, max_i)); }; @@ -120,29 +123,22 @@ class RedisTableOfTensors final : public LookupInterface { void launchFind(OpKernelContext *ctx, std::vector &keys_prefix_name_slices, - const Tensor &keys, Tensor *values, - const Tensor &default_value, const int64_t &total, - const int64_t &Velems_per_flat2_dim0, + const K *keys, V *values, const V *default_value, + const int64_t &total, const int64_t &Velems_per_flat2_dim0, + const bool is_full_default, std::vector &threads_Find) { - const bool is_full_default = - (values->NumElements() == default_value.NumElements()); - OP_REQUIRES_OK( - ctx, - launchFindCore(_table_instance, keys_prefix_name_slices, keys, values, - default_value, is_full_default, Velems_per_flat2_dim0, - threads_Find, threads_Find_mutex, 0, total)); + ctx, launchFindCore(_table_instance, keys_prefix_name_slices, + keys, values, default_value, is_full_default, + Velems_per_flat2_dim0, threads_Find, + threads_Find_mutex, 0, total)); } void launchFindWithExists_parallel( OpKernelContext *ctx, std::vector &keys_prefix_name_slices, - const Tensor &keys, Tensor *values, const Tensor &default_value, - Tensor &exists, const int64_t &total, - const int64_t &Velems_per_flat2_dim0, - std::vector &threads_Find) { - const bool is_full_default = - (values->NumElements() == default_value.NumElements()); - + const K *keys, V *values, const V *default_value, bool *exists, + const int64_t &total, const int64_t &Velems_per_flat2_dim0, + const bool is_full_default, std::vector &threads_Find) { const 
int64_t max_parallelism = (total / multi_redis_cmd_max_argc) + 1; auto shard = [this, &ctx, &total, &keys_prefix_name_slices, &keys, &values, @@ -151,7 +147,7 @@ class RedisTableOfTensors final : public LookupInterface { &threads_Find](int64_t begin, int64_t end) { const int64_t max_i = std::min(total, end); - OP_REQUIRES_OK(ctx, launchFindWithExistsCore( + OP_REQUIRES_OK(ctx, launchFindWithExistsCore( _table_instance, keys_prefix_name_slices, keys, values, default_value, exists, is_full_default, Velems_per_flat2_dim0, threads_Find, @@ -164,16 +160,13 @@ class RedisTableOfTensors final : public LookupInterface { void launchFindWithExists(OpKernelContext *ctx, std::vector &keys_prefix_name_slices, - const Tensor &keys, Tensor *values, - const Tensor &default_value, Tensor &exists, - const int64_t &total, + const K *keys, V *values, const V *default_value, + bool *exists, const int64_t &total, const int64_t &Velems_per_flat2_dim0, + const bool is_full_default, std::vector &threads_Find) { - const bool is_full_default = - (values->NumElements() == default_value.NumElements()); - OP_REQUIRES_OK( - ctx, launchFindWithExistsCore( + ctx, launchFindWithExistsCore( _table_instance, keys_prefix_name_slices, keys, values, default_value, exists, is_full_default, Velems_per_flat2_dim0, threads_Find, threads_Find_mutex, 0, total)); @@ -181,7 +174,7 @@ class RedisTableOfTensors final : public LookupInterface { void launchInsert_parallel(OpKernelContext *ctx, std::vector &keys_prefix_name_slices, - const Tensor &keys, const Tensor &values, + const K *keys, const V *values, const int64_t &total, const int64_t &Velems_per_flat2_dim0, std::vector &threads_Insert) { @@ -192,10 +185,10 @@ class RedisTableOfTensors final : public LookupInterface { &threads_Insert](int64_t begin, int64_t end) { const int64_t max_i = std::min(total, end); - OP_REQUIRES_OK( - ctx, launchInsertCore(_table_instance, keys_prefix_name_slices, keys, - values, Velems_per_flat2_dim0, threads_Insert, - threads_Insert_mutex, begin, max_i)); + OP_REQUIRES_OK(ctx, launchInsertCore( + _table_instance, keys_prefix_name_slices, keys, + values, Velems_per_flat2_dim0, threads_Insert, + threads_Insert_mutex, begin, max_i)); }; int64_t slices_size = std::min(total, multi_redis_cmd_max_argc - 1); auto &worker_threads = *ctx->device()->tensorflow_cpu_worker_threads(); @@ -204,33 +197,35 @@ class RedisTableOfTensors final : public LookupInterface { void launchInsert(OpKernelContext *ctx, std::vector &keys_prefix_name_slices, - const Tensor &keys, const Tensor &values, - const int64_t &total, const int64_t &Velems_per_flat2_dim0, + const K *keys, const V *values, const int64_t &total, + const int64_t &Velems_per_flat2_dim0, std::vector &threads_Insert) { - OP_REQUIRES_OK( - ctx, launchInsertCore(_table_instance, keys_prefix_name_slices, keys, - values, Velems_per_flat2_dim0, threads_Insert, - threads_Insert_mutex, 0, total)); + OP_REQUIRES_OK(ctx, launchInsertCore( + _table_instance, keys_prefix_name_slices, keys, + values, Velems_per_flat2_dim0, threads_Insert, + threads_Insert_mutex, 0, total)); } void launchAccum_parallel(OpKernelContext *ctx, std::vector &keys_prefix_name_slices, - const Tensor &keys, const Tensor &values_or_delta, - const Tensor &exists, const int64_t &total, + const K *keys, const V *values_or_delta, + const bool *exists, const int64_t &total, const int64_t &Velems_per_flat2_dim0, + std::string &values_dtype_str, std::vector &threads_Insert) { const int64_t max_parallelism = (total / multi_redis_cmd_max_argc) + 1; auto shard = 
[this, &ctx, &total, &keys_prefix_name_slices, &keys, &values_or_delta, &exists, &Velems_per_flat2_dim0, + &values_dtype_str, &threads_Insert](int64_t begin, int64_t end) { const int64_t max_i = std::min(total, end); - OP_REQUIRES_OK( - ctx, - launchAccumCore(_table_instance, keys_prefix_name_slices, keys, - values_or_delta, exists, Velems_per_flat2_dim0, - threads_Insert, threads_Accum_mutex, begin, max_i)); + OP_REQUIRES_OK(ctx, launchAccumCore( + _table_instance, keys_prefix_name_slices, keys, + values_or_delta, exists, Velems_per_flat2_dim0, + values_dtype_str, threads_Insert, + threads_Accum_mutex, begin, max_i)); }; int64_t slices_size = std::min(total, multi_redis_cmd_max_argc - 1); auto &worker_threads = *ctx->device()->tensorflow_cpu_worker_threads(); @@ -239,19 +234,20 @@ class RedisTableOfTensors final : public LookupInterface { void launchAccum(OpKernelContext *ctx, std::vector &keys_prefix_name_slices, - const Tensor &keys, const Tensor &values_or_delta, - const Tensor &exists, const int64_t &total, - const int64_t &Velems_per_flat2_dim0, + const K *keys, const V *values_or_delta, const bool *exists, + const int64_t &total, const int64_t &Velems_per_flat2_dim0, + std::string &values_dtype_str, std::vector &threads_Insert) { - OP_REQUIRES_OK( - ctx, launchAccumCore(_table_instance, keys_prefix_name_slices, keys, - values_or_delta, exists, Velems_per_flat2_dim0, - threads_Insert, threads_Insert_mutex, 0, total)); + OP_REQUIRES_OK(ctx, launchAccumCore( + _table_instance, keys_prefix_name_slices, keys, + values_or_delta, exists, Velems_per_flat2_dim0, + values_dtype_str, threads_Insert, + threads_Insert_mutex, 0, total)); } void launchDelete_parallel(OpKernelContext *ctx, std::vector &keys_prefix_name_slices, - const Tensor &keys, const int64_t &total, + const K *keys, const int64_t &total, std::vector &threads_Delete) { const int64_t max_parallelism = (total / multi_redis_cmd_max_argc) + 1; @@ -260,9 +256,9 @@ class RedisTableOfTensors final : public LookupInterface { const int64_t max_i = std::min(total, end); OP_REQUIRES_OK( - ctx, - launchDeleteCore(_table_instance, keys_prefix_name_slices, keys, - threads_Delete, threads_Delete_mutex, begin, max_i)); + ctx, launchDeleteCore(_table_instance, keys_prefix_name_slices, + keys, threads_Delete, + threads_Delete_mutex, begin, max_i)); }; int64_t slices_size = std::min(total, multi_redis_cmd_max_argc - 1); auto &worker_threads = *ctx->device()->tensorflow_cpu_worker_threads(); @@ -271,11 +267,11 @@ class RedisTableOfTensors final : public LookupInterface { void launchDelete(OpKernelContext *ctx, std::vector &keys_prefix_name_slices, - const Tensor &keys, const int64_t &total, + const K *keys, const int64_t &total, std::vector &threads_Delete) { - OP_REQUIRES_OK( - ctx, launchDeleteCore(_table_instance, keys_prefix_name_slices, keys, - threads_Delete, threads_Delete_mutex, 0, total)); + OP_REQUIRES_OK(ctx, launchDeleteCore( + _table_instance, keys_prefix_name_slices, keys, + threads_Delete, threads_Delete_mutex, 0, total)); } public: @@ -639,7 +635,7 @@ class RedisTableOfTensors final : public LookupInterface { } try { // insert KV pair into new Redis with new storage_slice - launchInsert(ctx, keys_prefix_name_slices, keys_temp, values_temp, + launchInsert(ctx, keys_prefix_name_slices, pk_raw, pv_raw, slice_keys_size, runtime_value_dim_, threads_Insert); } catch (const std::exception &err) { LOG(ERROR) @@ -681,15 +677,20 @@ class RedisTableOfTensors final : public LookupInterface { int64_t total = keys.NumElements(); if (total > 0) { 
const int64_t Velems_per_flat2_dim0 = values->NumElements() / total; - + const bool is_full_default = + (values->NumElements() == default_value.NumElements()); if (total < (multi_redis_cmd_max_argc - 1)) { - launchFind(ctx, keys_prefix_name_slices, keys, values, default_value, - total, Velems_per_flat2_dim0, threads_Find); + launchFind(ctx, keys_prefix_name_slices, (K *)keys.tensor_data().data(), + (V *)values->tensor_data().data(), + (V *)default_value.tensor_data().data(), total, + Velems_per_flat2_dim0, is_full_default, threads_Find); } else { // redis commmand args > multi_redis_cmd_max_argc - launchFind_parallel(ctx, keys_prefix_name_slices, keys, values, - default_value, total, Velems_per_flat2_dim0, - threads_Find); + launchFind_parallel( + ctx, keys_prefix_name_slices, (K *)keys.tensor_data().data(), + (V *)values->tensor_data().data(), + (V *)default_value.tensor_data().data(), total, + Velems_per_flat2_dim0, is_full_default, threads_Find); } } @@ -702,85 +703,100 @@ class RedisTableOfTensors final : public LookupInterface { int64_t total = keys.NumElements(); if (total > 0) { const int64_t Velems_per_flat2_dim0 = values->NumElements() / total; - + const bool is_full_default = + (values->NumElements() == default_value.NumElements()); if (total < (multi_redis_cmd_max_argc - 1)) { - launchFindWithExists(ctx, keys_prefix_name_slices, keys, values, - default_value, exists, total, - Velems_per_flat2_dim0, threads_Find); + launchFindWithExists( + ctx, keys_prefix_name_slices, (K *)keys.tensor_data().data(), + (V *)values->tensor_data().data(), + (V *)default_value.tensor_data().data(), + (bool *)exists.tensor_data().data(), total, Velems_per_flat2_dim0, + is_full_default, threads_Find); } else { // redis commmand args > multi_redis_cmd_max_argc - launchFindWithExists_parallel(ctx, keys_prefix_name_slices, keys, - values, default_value, exists, total, - Velems_per_flat2_dim0, threads_Find); + launchFindWithExists_parallel( + ctx, keys_prefix_name_slices, (K *)keys.tensor_data().data(), + (V *)values->tensor_data().data(), + (V *)default_value.tensor_data().data(), + (bool *)exists.tensor_data().data(), total, Velems_per_flat2_dim0, + is_full_default, threads_Find); } } return Status::OK(); } - Status DoInsert(bool clear, OpKernelContext *ctx, const Tensor &keys, - const Tensor &values) { - int64_t total = keys.NumElements(); - if (total > 0) { - const int64_t Velems_per_flat2_dim0 = values.NumElements() / total; - auto statu = Status::OK(); - if (clear) { - for (auto keys_prefix_name_slice : keys_prefix_name_slices) { - statu = _table_instance->RemoveHkeysInBuckets(keys_prefix_name_slice); - if (statu != Status::OK()) { - return statu; - } + Status DoInsert(bool clear, OpKernelContext *ctx, const K *keys, + const V *values, const int64_t total, + const int64_t Velems_per_flat2_dim0) { + auto statu = Status::OK(); + if (clear) { + for (auto keys_prefix_name_slice : keys_prefix_name_slices) { + statu = _table_instance->RemoveHkeysInBuckets(keys_prefix_name_slice); + if (statu != Status::OK()) { + return statu; } } - if (total < (multi_redis_cmd_max_argc - 1)) { - launchInsert(ctx, keys_prefix_name_slices, keys, values, total, - Velems_per_flat2_dim0, threads_Insert); - } else { - launchInsert_parallel( - ctx, keys_prefix_name_slices, keys, values, total, - Velems_per_flat2_dim0, - threads_Insert); // redis commmand args > multi_redis_cmd_max_argc - } } - return Status::OK(); - } - - Status DoAccum(OpKernelContext *ctx, const Tensor &keys, - const Tensor &values_or_delta, const Tensor 
&exists) { - int64_t total = keys.NumElements(); - const int64_t Velems_per_flat2_dim0 = - values_or_delta.NumElements() / keys.NumElements(); - if (total < (multi_redis_cmd_max_argc - 1)) { - launchAccum(ctx, keys_prefix_name_slices, keys, values_or_delta, exists, - total, Velems_per_flat2_dim0, threads_Insert); + launchInsert(ctx, keys_prefix_name_slices, keys, values, total, + Velems_per_flat2_dim0, threads_Insert); } else { - launchAccum_parallel( - ctx, keys_prefix_name_slices, keys, values_or_delta, exists, total, + launchInsert_parallel( + ctx, keys_prefix_name_slices, keys, values, total, Velems_per_flat2_dim0, threads_Insert); // redis commmand args > multi_redis_cmd_max_argc } - return Status::OK(); } Status Insert(OpKernelContext *ctx, const Tensor &keys, const Tensor &values) override { - return DoInsert(false, ctx, keys, values); + const int64_t total = keys.NumElements(); + if (total > 0) { + const int64_t Velems_per_flat2_dim0 = values.NumElements() / total; + return DoInsert(false, ctx, (K *)keys.tensor_data().data(), + (V *)values.tensor_data().data(), total, + Velems_per_flat2_dim0); + } else { + LOG(INFO) << "Redis Backend Insert nothing for empty input keys tensor."; + return Status::OK(); + } } Status Accum(OpKernelContext *ctx, const Tensor &keys, const Tensor &values_or_delta, const Tensor &exists) { - return DoAccum(ctx, keys, values_or_delta, exists); + int64_t total = keys.NumElements(); + const int64_t Velems_per_flat2_dim0 = + values_or_delta.NumElements() / keys.NumElements(); + auto values_dtype_str = DataTypeString(values_or_delta.dtype()); + + if (total < (multi_redis_cmd_max_argc - 1)) { + launchAccum(ctx, keys_prefix_name_slices, (K *)keys.tensor_data().data(), + (V *)values_or_delta.tensor_data().data(), + (bool *)exists.tensor_data().data(), total, + Velems_per_flat2_dim0, values_dtype_str, threads_Insert); + } else { + launchAccum_parallel( + ctx, keys_prefix_name_slices, (K *)keys.tensor_data().data(), + (V *)values_or_delta.tensor_data().data(), + (bool *)exists.tensor_data().data(), total, Velems_per_flat2_dim0, + values_dtype_str, + threads_Insert); // redis commmand args > multi_redis_cmd_max_argc + } + + return Status::OK(); } Status Remove(OpKernelContext *ctx, const Tensor &keys) override { int64_t total = keys.NumElements(); if (total > 0) { if (total < (multi_redis_cmd_max_argc - 1)) { - launchDelete(ctx, keys_prefix_name_slices, keys, total, threads_Delete); + launchDelete(ctx, keys_prefix_name_slices, + (K *)keys.tensor_data().data(), total, threads_Delete); } else { // redis commmand args > multi_redis_cmd_max_argc - launchDelete_parallel(ctx, keys_prefix_name_slices, keys, total, + launchDelete_parallel(ctx, keys_prefix_name_slices, + (K *)keys.tensor_data().data(), total, threads_Delete); } } @@ -809,7 +825,7 @@ class RedisTableOfTensors final : public LookupInterface { } else { if (keys.NumElements() > 0 && redis_connection_params.table_store_mode == 0) { - return DoInsert(true, ctx, keys, values); + return Insert(ctx, keys, values); } else { LOG(INFO) << "Import nothing from the TensorFlow saved model to Redis " "service for " @@ -1035,6 +1051,268 @@ class RedisTableOfTensors final : public LookupInterface { return Status::OK(); } + Status SaveToFileSystemImpl(FileSystem *fs, const string &filepath, + const size_t buffer_size, + const bool append_to_file) { + int64_t total_size = 0; + long long cursor = 0; + std::unique_ptr hscan_reply; + const redisReply *kvs_reply; + + for (size_t i = 0; i < keys_prefix_name_slices.size(); ++i) { + 
total_size += + _table_instance->TableSizeInBucket(keys_prefix_name_slices[i]); + } + + // construct file system relative object + std::unique_ptr key_writer; + std::unique_ptr value_writer; + const string key_filepath(filepath + "-keys"); + const string value_filepath(filepath + "-values"); + string key_tmpfilepath(filepath + "-keys.tmp"); + string value_tmpfilepath(filepath + "-values.tmp"); + bool has_atomic_move = false; + auto has_atomic_move_ret = fs->HasAtomicMove(filepath, &has_atomic_move); + bool need_tmp_file = + (has_atomic_move == false) || (has_atomic_move_ret != Status::OK()); + if (!need_tmp_file) { + key_tmpfilepath = key_filepath; + value_tmpfilepath = value_filepath; + } + TF_RETURN_IF_ERROR( + fs->RecursivelyCreateDir(std::string(fs->Dirname(filepath)))); + if (append_to_file) { + TF_RETURN_IF_ERROR(fs->NewAppendableFile(key_tmpfilepath, &key_writer)); + TF_RETURN_IF_ERROR( + fs->NewAppendableFile(value_tmpfilepath, &value_writer)); + } else { + TF_RETURN_IF_ERROR(fs->NewWritableFile(key_tmpfilepath, &key_writer)); + TF_RETURN_IF_ERROR(fs->NewWritableFile(value_tmpfilepath, &value_writer)); + } + + if (total_size == 0) { + LOG(WARNING) << "There is no embedding table called " << keys_prefix_name + << " existing in the Redis service. " + << "Saving values to file system failed."; + return Status::OK(); + } + + // buffer for write to file system + const size_t value_len = sizeof(V) * runtime_value_dim_; + const size_t key_buffer_byte_size = buffer_size * sizeof(K); + const size_t value_buffer_byte_size = buffer_size * value_len; + std::vector key_buffer_vector(key_buffer_byte_size); + std::vector value_buffer_vector(value_buffer_byte_size); + + size_t total_saved = 0; + + redisReply const *temp_reply; + const K *pk_raw = reinterpret_cast(key_buffer_vector.data()); + const V *pv_raw = reinterpret_cast(value_buffer_vector.data()); + for (size_t i = 0; i < keys_prefix_name_slices.size(); ++i) { + cursor = 0; + while (true) { + hscan_reply.reset(); + hscan_reply = _table_instance->HscanGetKeysValsInBucket( + keys_prefix_name_slices[i], &cursor, multi_redis_cmd_max_argc); + if (hscan_reply == nullptr) { + return errors::Unknown( + "Unknown errors happen when HscanGetKeysValsInBucket in " + "SaveToFileSystemImpl"); + } + kvs_reply = hscan_reply->element[1]; + // fill Tensor keys and values + for (size_t j = 0; j < kvs_reply->elements; ++j) { + temp_reply = kvs_reply->element[j]; + if (temp_reply->type == + REDIS_REPLY_STRING) { // #define REDIS_REPLY_STRING 1 + ReplyMemcpyToKeyTensor( + pk_raw, temp_reply->str, + temp_reply->len); // Direct access to Tensor data in TensorFlow + } + ++pk_raw; + + ++j; + temp_reply = kvs_reply->element[j]; + if (temp_reply->type == + REDIS_REPLY_STRING) { // #define REDIS_REPLY_STRING 1 + ReplyMemcpyToValTensor( + pv_raw, temp_reply->str, + runtime_value_dim_); // Direct access to Tensor data in + // TensorFlow + } + pv_raw += runtime_value_dim_; + + if (((char *)pk_raw - key_buffer_vector.data()) >= + static_cast(key_buffer_byte_size)) { + pk_raw = reinterpret_cast(key_buffer_vector.data()); + TF_RETURN_IF_ERROR(key_writer->Append( + StringPiece((char *)pk_raw, key_buffer_byte_size))); + pv_raw = reinterpret_cast(value_buffer_vector.data()); + TF_RETURN_IF_ERROR(value_writer->Append( + StringPiece((char *)pv_raw, value_buffer_byte_size))); + } + ++total_saved; + } + + LOG(INFO) << "The cursor of scanning " << keys_prefix_name_slices[i] + << " in SaveToFileSystem is " << cursor << " now."; + if (cursor == 0) { + break; + } + } + } + + if (((char 
*)pk_raw - key_buffer_vector.data()) && + ((char *)pv_raw - value_buffer_vector.data())) { + TF_RETURN_IF_ERROR(key_writer->Append( + StringPiece(key_buffer_vector.data(), + (char *)pk_raw - key_buffer_vector.data()))); + TF_RETURN_IF_ERROR(value_writer->Append( + StringPiece(value_buffer_vector.data(), + (char *)pv_raw - value_buffer_vector.data()))); + } + + TF_RETURN_IF_ERROR(key_writer->Flush()); + TF_RETURN_IF_ERROR(value_writer->Flush()); + TF_RETURN_IF_ERROR(key_writer->Sync()); + TF_RETURN_IF_ERROR(value_writer->Sync()); + + LOG(INFO) << "Finish saving " << total_saved << " keys and values to " + << key_filepath << " and " << value_filepath << " in total."; + + if (need_tmp_file) { + TF_RETURN_IF_ERROR(fs->FileExists(key_tmpfilepath)); + TF_RETURN_IF_ERROR(fs->RenameFile(key_tmpfilepath, key_filepath)); + TF_RETURN_IF_ERROR(fs->FileExists(value_tmpfilepath)); + TF_RETURN_IF_ERROR(fs->RenameFile(value_tmpfilepath, value_filepath)); + } + + return Status::OK(); + } + + Status SaveToFileSystem(OpKernelContext *ctx, const string &dirpath, + const string &file_name, const size_t buffer_size, + bool append_to_file) { + string filepath = io::JoinPath(dirpath, file_name); + FileSystem *fs; + const auto env = ctx->env(); + TF_RETURN_WITH_CONTEXT_IF_ERROR( + env->GetFileSystemForFile(filepath, &fs), + "Please make sure you have already imported tensorflow_io before using " + "TFRA file system operation."); + return SaveToFileSystemImpl(fs, filepath, buffer_size, append_to_file); + } + + Status LoadFromFileSystemImpl(OpKernelContext *ctx, FileSystem *fs, + const string &filepath, + const size_t buffer_size) { + const string key_filepath = filepath + "-keys"; + TF_RETURN_IF_ERROR(fs->FileExists(key_filepath)); + std::unique_ptr key_file; + TF_RETURN_IF_ERROR(fs->NewRandomAccessFile(key_filepath, &key_file)); + std::unique_ptr key_input_stream( + new io::RandomAccessInputStream(key_file.get())); + size_t key_buffer_byte_size = buffer_size * sizeof(K); + io::BufferedInputStream key_reader(key_input_stream.get(), + key_buffer_byte_size * 2); + + const string value_filepath = filepath + "-values"; + TF_RETURN_IF_ERROR(fs->FileExists(key_filepath)); + std::unique_ptr value_file; + TF_RETURN_IF_ERROR(fs->NewRandomAccessFile(value_filepath, &value_file)); + std::unique_ptr value_input_stream( + new io::RandomAccessInputStream(value_file.get())); + const size_t value_len = sizeof(V) * runtime_value_dim_; + size_t value_buffer_byte_size = buffer_size * value_len; + io::BufferedInputStream value_reader(value_input_stream.get(), + value_buffer_byte_size * 2); + + uint64 key_file_size = 0; + TF_RETURN_IF_ERROR(fs->GetFileSize(key_filepath, &key_file_size)); + const size_t key_size = key_file_size / sizeof(K); + + uint64 value_file_size = 0; + TF_RETURN_IF_ERROR(fs->GetFileSize(value_filepath, &value_file_size)); + const size_t value_size = value_file_size / value_len; + + if (key_size != value_size) { + return errors::Unavailable( + "the keys number in file " + key_filepath + + " is not equal to the value vectors number in file " + + value_filepath + "."); + } + + tstring key_buffer; + key_buffer.resize(key_buffer_byte_size); + tstring value_buffer; + value_buffer.resize(value_buffer_byte_size); + + size_t key_file_offset = 0; + int64_t remainder = key_file_size - key_file_offset; + size_t nkeys = 0; + size_t key_read_byte = 0; + size_t value_read_byte = 0; + while (remainder > 0) { + if (remainder > static_cast(key_buffer_byte_size)) { + key_read_byte = key_buffer_byte_size; + nkeys = buffer_size; + 
value_read_byte = value_buffer_byte_size; + } else { + key_read_byte = remainder; + nkeys = key_read_byte / sizeof(K); + value_read_byte = nkeys * value_len; + } + TF_RETURN_IF_ERROR(key_reader.ReadNBytes(key_read_byte, &key_buffer)); + TF_RETURN_IF_ERROR( + value_reader.ReadNBytes(value_read_byte, &value_buffer)); + TF_RETURN_IF_ERROR(DoInsert(false, ctx, (K *)key_buffer.data(), + (V *)value_buffer.data(), nkeys, + runtime_value_dim_)); + key_file_offset += key_read_byte; + remainder = key_file_size - key_file_offset; + } + + LOG(INFO) << "Finish loading " << key_size << " keys and values from " + << key_filepath << " and " << value_filepath << " in total."; + + return Status::OK(); + } + + Status LoadFromFileSystem(OpKernelContext *ctx, const string &dirpath, + const string &file_name, const size_t buffer_size, + bool load_entire_dir) { + string filepath = io::JoinPath(dirpath, file_name); + FileSystem *fs; + const auto env = ctx->env(); + TF_RETURN_WITH_CONTEXT_IF_ERROR( + env->GetFileSystemForFile(filepath, &fs), + "Please make sure you have already imported tensorflow_io before using " + "TFRA file system operation."); + if (load_entire_dir) { + int separator_pos = file_name.rfind("_mht_"); + string file_pattern = + io::JoinPath(dirpath, file_name.substr(0, separator_pos)) + "*"; + std::vector all_filepath; + TF_RETURN_IF_ERROR(fs->GetMatchingPaths(file_pattern, &all_filepath)); + // delete -keys/-values postfix + for (auto it = all_filepath.begin(); it != all_filepath.end(); ++it) { + int kv_separator_pos = it->rfind("-"); + *it = it->substr(0, kv_separator_pos); + } + // remove duplicate elements + sort(all_filepath.begin(), all_filepath.end()); + all_filepath.erase(unique(all_filepath.begin(), all_filepath.end()), + all_filepath.end()); + for (auto &fp : all_filepath) { + TF_RETURN_IF_ERROR(LoadFromFileSystemImpl(ctx, fs, fp, buffer_size)); + } + } else { + return LoadFromFileSystemImpl(ctx, fs, filepath, buffer_size); + } + return Status::OK(); + } + DataType key_dtype() const override { return DataTypeToEnum::v(); } DataType value_dtype() const override { return DataTypeToEnum::v(); } @@ -1217,7 +1495,7 @@ class HashTableAccumOp : public HashTableOpKernel { OP_REQUIRES_OK(ctx, GetTable(ctx, &table)); core::ScopedUnref unref_me(table); - RedisTableOfTensors *redisTable = (RedisTableOfTensors *)table; + RedisTableOfTensors *redis_table = (RedisTableOfTensors *)table; DataTypeVector expected_inputs = {expected_input_0_, table->key_dtype(), table->value_dtype(), @@ -1237,7 +1515,8 @@ class HashTableAccumOp : public HashTableOpKernel { if (ctx->track_allocations()) { memory_used_before = table->MemoryUsed(); } - OP_REQUIRES_OK(ctx, redisTable->Accum(ctx, keys, values_or_deltas, exists)); + OP_REQUIRES_OK(ctx, + redis_table->Accum(ctx, keys, values_or_deltas, exists)); if (ctx->track_allocations()) { ctx->record_persistent_memory_allocation(table->MemoryUsed() - memory_used_before); @@ -1329,7 +1608,56 @@ class HashTableExportOp : public HashTableOpKernel { } }; -// Clear the table and insert data. +// Op that export all keys and values to FileSystem. 
+template +class HashTableSaveToFileSystemOp : public HashTableOpKernel { + public: + explicit HashTableSaveToFileSystemOp(OpKernelConstruction *ctx) + : HashTableOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("dirpath_env", &dirpath_env_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("append_to_file", &append_to_file_)); + int64 signed_buffer_size = 0; + OP_REQUIRES_OK(ctx, ctx->GetAttr("buffer_size", &signed_buffer_size)); + buffer_size_ = static_cast(signed_buffer_size); + } + + void Compute(OpKernelContext *ctx) override { + LookupInterface *table; + OP_REQUIRES_OK(ctx, GetTable(ctx, &table)); + core::ScopedUnref unref_me(table); + + string dirpath; + TF_CHECK_OK(ReadStringFromEnvVar(dirpath_env_, "NotFound", &dirpath)); + if (dirpath != "NotFound") { + LOG(INFO) << "Read TFRA key/value file directory path from the " + "environment variable " + << dirpath_env_ << " successfully. Saving directory path is " + << dirpath; + } else { + const Tensor &dir_tensor = ctx->input(1); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(dir_tensor.shape()), + errors::InvalidArgument("directory path must be scalar.")); + dirpath = string(dir_tensor.scalar()().data()); + } + + const Tensor &fname_tensor = ctx->input(2); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(fname_tensor.shape()), + errors::InvalidArgument("file name must be scalar.")); + string file_name = string(fname_tensor.scalar()().data()); + + RedisTableOfTensors *redis_table = (RedisTableOfTensors *)table; + OP_REQUIRES_OK( + ctx, redis_table->SaveToFileSystem(ctx, dirpath, file_name, + buffer_size_, append_to_file_)); + } + + private: + string dirpath_env_; + bool append_to_file_; + size_t buffer_size_; +}; + +// Insert data. class HashTableImportOp : public HashTableOpKernel { public: using HashTableOpKernel::HashTableOpKernel; @@ -1359,6 +1687,55 @@ class HashTableImportOp : public HashTableOpKernel { } }; +// Insert data from FileSystem. +template +class HashTableLoadFromFileSystemOp : public HashTableOpKernel { + public: + explicit HashTableLoadFromFileSystemOp(OpKernelConstruction *ctx) + : HashTableOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("dirpath_env", &dirpath_env_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("load_entire_dir", &load_entire_dir_)); + int64 signed_buffer_size = 0; + OP_REQUIRES_OK(ctx, ctx->GetAttr("buffer_size", &signed_buffer_size)); + buffer_size_ = static_cast(signed_buffer_size); + } + + void Compute(OpKernelContext *ctx) override { + LookupInterface *table; + OP_REQUIRES_OK(ctx, GetTable(ctx, &table)); + core::ScopedUnref unref_me(table); + + string dirpath; + TF_CHECK_OK(ReadStringFromEnvVar(dirpath_env_, "NotFound", &dirpath)); + if (dirpath != "NotFound") { + LOG(INFO) << "Read TFRA key/value file directory path from the " + "environment variable " + << dirpath_env_ << " successfully. 
Saving directory path is " + << dirpath; + } else { + const Tensor &dir_tensor = ctx->input(1); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(dir_tensor.shape()), + errors::InvalidArgument("directory path must be scalar.")); + dirpath = string(dir_tensor.scalar()().data()); + } + + const Tensor &fname_tensor = ctx->input(2); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(fname_tensor.shape()), + errors::InvalidArgument("file name must be scalar.")); + string file_name = string(fname_tensor.scalar()().data()); + + RedisTableOfTensors *redis_table = (RedisTableOfTensors *)table; + OP_REQUIRES_OK( + ctx, redis_table->LoadFromFileSystem(ctx, dirpath, file_name, + buffer_size_, load_entire_dir_)); + } + + private: + string dirpath_env_; + bool load_entire_dir_; + size_t buffer_size_; +}; + REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(RedisTableFind)).Device(DEVICE_CPU), HashTableFindOp); REGISTER_KERNEL_BUILDER( @@ -1402,7 +1779,19 @@ REGISTER_KERNEL_BUILDER( .Device(DEVICE_CPU) \ .TypeConstraint("Tin") \ .TypeConstraint("Tout"), \ - redis_table::HashTableFindWithExistsOp); + redis_table::HashTableFindWithExistsOp); \ + REGISTER_KERNEL_BUILDER( \ + Name(PREFIX_OP_NAME(RedisTableSaveToFileSystem)) \ + .Device(DEVICE_CPU) \ + .TypeConstraint("key_dtype") \ + .TypeConstraint("value_dtype"), \ + redis_table::HashTableSaveToFileSystemOp); \ + REGISTER_KERNEL_BUILDER( \ + Name(PREFIX_OP_NAME(RedisTableLoadFromFileSystem)) \ + .Device(DEVICE_CPU) \ + .TypeConstraint("key_dtype") \ + .TypeConstraint("value_dtype"), \ + redis_table::HashTableLoadFromFileSystemOp); REGISTER_KERNEL(int32, double); REGISTER_KERNEL(int32, float); diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/hadoop_file_system/BUILD b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/hadoop_file_system/BUILD deleted file mode 100644 index dc5a041f5..000000000 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/hadoop_file_system/BUILD +++ /dev/null @@ -1,19 +0,0 @@ -load("@local_config_tf//:build_defs.bzl", "DTF_VERSION_INTEGER", "D_GLIBCXX_USE_CXX11_ABI") - -package(default_visibility = ["//visibility:public"]) - -cc_library( - name = "hadoop_file_system", - srcs = ["hadoop_file_system.cc"], - hdrs = ["hadoop_file_system.h"], - copts = [ - D_GLIBCXX_USE_CXX11_ABI, - DTF_VERSION_INTEGER, - ], - deps = [ - "@hadoop", - "@local_config_tf//:libtensorflow_framework", - "@local_config_tf//:tf_header_lib", - ], - alwayslink = 1, -) diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/hadoop_file_system/hadoop_file_system.cc b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/hadoop_file_system/hadoop_file_system.cc deleted file mode 100755 index 0eb64ddec..000000000 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/hadoop_file_system/hadoop_file_system.cc +++ /dev/null @@ -1,600 +0,0 @@ -/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#if TF_VERSION_INTEGER >= 2070 // 2.7.0 - -#include "hadoop_file_system.h" - -#include - -#include "hdfs/hdfs.h" -#include "tensorflow/core/platform/env.h" -#include "tensorflow/core/platform/file_system.h" -#include "tensorflow/core/platform/file_system_helper.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/path.h" -#include "tensorflow/core/platform/status.h" -#include "tensorflow/core/platform/strcat.h" -#if TF_VERSION_INTEGER >= 2080 // 2.8.0 -#include "tensorflow/core/platform/errors.h" -using tensorflow::errors::IOError; -#else -#include "tensorflow/core/platform/error.h" -#endif - -namespace tensorflow { - -template -Status BindFunc(void* handle, const char* name, - std::function* func) { - void* symbol_ptr = nullptr; - TF_RETURN_IF_ERROR( - Env::Default()->GetSymbolFromLibrary(handle, name, &symbol_ptr)); - *func = reinterpret_cast(symbol_ptr); - return Status::OK(); -} - -class LibHDFS { - public: - LibHDFS() { LoadAndBind(); } - - // The status, if any, from failure to load. - Status status() const { return status_; } - - std::function hdfsBuilderConnect; - std::function hdfsNewBuilder; - std::function hdfsBuilderSetNameNode; - std::function hdfsConfGetStr; - std::function hdfsCloseFile; - std::function hdfsPread; - std::function hdfsWrite; - std::function hdfsHFlush; - std::function hdfsHSync; - std::function hdfsTell; - std::function - hdfsOpenFile; - std::function hdfsExists; - std::function hdfsListDirectory; - std::function hdfsFreeFileInfo; - std::function hdfsDelete; - std::function hdfsCreateDirectory; - std::function hdfsGetPathInfo; - std::function hdfsRename; - - private: - void LoadAndBind() { - auto TryLoadAndBind = [this](const char* name, void** handle) -> Status { - TF_RETURN_IF_ERROR(Env::Default()->LoadDynamicLibrary(name, handle)); -#define BIND_HDFS_FUNC(function) \ - TF_RETURN_IF_ERROR(BindFunc(*handle, #function, &function)); - - BIND_HDFS_FUNC(hdfsBuilderConnect); - BIND_HDFS_FUNC(hdfsNewBuilder); - BIND_HDFS_FUNC(hdfsBuilderSetNameNode); - BIND_HDFS_FUNC(hdfsConfGetStr); - BIND_HDFS_FUNC(hdfsCloseFile); - BIND_HDFS_FUNC(hdfsPread); - BIND_HDFS_FUNC(hdfsWrite); - BIND_HDFS_FUNC(hdfsHFlush); - BIND_HDFS_FUNC(hdfsTell); - BIND_HDFS_FUNC(hdfsHSync); - BIND_HDFS_FUNC(hdfsOpenFile); - BIND_HDFS_FUNC(hdfsExists); - BIND_HDFS_FUNC(hdfsListDirectory); - BIND_HDFS_FUNC(hdfsFreeFileInfo); - BIND_HDFS_FUNC(hdfsDelete); - BIND_HDFS_FUNC(hdfsCreateDirectory); - BIND_HDFS_FUNC(hdfsGetPathInfo); - BIND_HDFS_FUNC(hdfsRename); -#undef BIND_HDFS_FUNC - return Status::OK(); - }; - -// libhdfs.so won't be in the standard locations. Use the path as specified -// in the libhdfs documentation. -#if defined(PLATFORM_WINDOWS) - const char* kLibHdfsDso = "hdfs.dll"; -#elif defined(MACOS) || defined(TARGET_OS_MAC) - const char* kLibHdfsDso = "libhdfs.dylib"; -#else - const char* kLibHdfsDso = "libhdfs.so"; -#endif - char* hdfs_home = getenv("HADOOP_HDFS_HOME"); - if (hdfs_home != nullptr) { - string path = io::JoinPath(hdfs_home, "lib", "native", kLibHdfsDso); - status_ = TryLoadAndBind(path.c_str(), &handle_); - if (status_.ok()) { - return; - } else { - LOG(ERROR) << "HadoopFileSystem load error: " - << status_.error_message(); - } - } - - // Try to load the library dynamically in case it has been installed - // to a in non-standard location. 
- status_ = TryLoadAndBind(kLibHdfsDso, &handle_); - } - - Status status_; - void* handle_ = nullptr; -}; - -HadoopFileSystem::HadoopFileSystem() {} - -HadoopFileSystem::~HadoopFileSystem() {} - -const LibHDFS* libhdfs() { - static const LibHDFS* libhdfs = new LibHDFS(); - return libhdfs; -} - -Status SplitArchiveNameAndPath(StringPiece& path, string& nn) { - size_t index_end_archive_name = path.find(".har"); - if (index_end_archive_name == path.npos) { - return errors::InvalidArgument( - "Hadoop archive path does not contain a .har extension"); - } - // Case of hadoop archive. Namenode is the path to the archive. - std::ostringstream namenodestream; - namenodestream << "har://" << nn - << path.substr(0, index_end_archive_name + 4); - nn = namenodestream.str(); - path.remove_prefix(index_end_archive_name + 4); - if (path.empty()) { - // Root of the archive - path = "/"; - } - return Status::OK(); -} - -// We implement connection caching in Tensorflow, which can significantly -// improve performance. Fixes #43187 -Status HadoopFileSystem::Connect(StringPiece fname, hdfsFS* fs) { - TF_RETURN_IF_ERROR(libhdfs()->status()); - - StringPiece scheme, namenode, path; - io::ParseURI(fname, &scheme, &namenode, &path); - string nn(namenode); - - string cacheKey(scheme.data(), scheme.size()); - if (scheme == "file") { - nn = ""; - } else if (scheme == "viewfs") { - char* defaultFS = nullptr; - libhdfs()->hdfsConfGetStr("fs.defaultFS", &defaultFS); - StringPiece defaultScheme, defaultCluster, defaultPath; - io::ParseURI(defaultFS, &defaultScheme, &defaultCluster, &defaultPath); - - if (scheme != defaultScheme || - (namenode != "" && namenode != defaultCluster)) { - return errors::Unimplemented( - "viewfs is only supported as a fs.defaultFS."); - } - // The default NameNode configuration will be used (from the XML - // configuration files). See: - // https://github.com/tensorflow/tensorflow/blob/v1.0.0/third_party/hadoop/hdfs.h#L259 - nn = "default"; - } else if (scheme == "har") { - TF_RETURN_IF_ERROR(SplitArchiveNameAndPath(path, nn)); - } else { - if (nn.empty()) { - nn = "default"; - } - } - cacheKey += nn; - { - mutex_lock lock(mu_); - if (connectionCache_.find(cacheKey) == connectionCache_.end()) { - hdfsBuilder* builder = libhdfs()->hdfsNewBuilder(); - libhdfs()->hdfsBuilderSetNameNode(builder, - nn.empty() ? 
nullptr : nn.c_str()); - hdfsFS cacheFs = libhdfs()->hdfsBuilderConnect(builder); - if (cacheFs == nullptr) { - return errors::Aborted(strerror(errno)); - } - connectionCache_[cacheKey] = cacheFs; - } - *fs = connectionCache_[cacheKey]; - } - return Status::OK(); -} - -string HadoopFileSystem::TranslateName(const string& name) const { - StringPiece scheme, namenode, path; - io::ParseURI(name, &scheme, &namenode, &path); - return string(path); -} - -class HDFSRandomAccessFile : public RandomAccessFile { - public: - HDFSRandomAccessFile(const string& filename, const string& hdfs_filename, - hdfsFS fs, hdfsFile file) - : filename_(filename), - hdfs_filename_(hdfs_filename), - fs_(fs), - file_(file) { - const char* disable_eof_retried = getenv("HDFS_DISABLE_READ_EOF_RETRIED"); - if (disable_eof_retried && disable_eof_retried[0] == '1') { - disable_eof_retried_ = true; - } else { - disable_eof_retried_ = false; - } - } - - ~HDFSRandomAccessFile() override { - if (file_ != nullptr) { - mutex_lock lock(mu_); - libhdfs()->hdfsCloseFile(fs_, file_); - } - } - - Status Name(StringPiece* result) const override { - *result = filename_; - return Status::OK(); - } - - Status Read(uint64 offset, size_t n, StringPiece* result, - char* scratch) const override { - Status s; - char* dst = scratch; - bool eof_retried = false; - if (disable_eof_retried_) { - // eof_retried = true, avoid calling hdfsOpenFile in Read, Fixes #42597 - eof_retried = true; - } - while (n > 0 && s.ok()) { - // We lock inside the loop rather than outside so we don't block other - // concurrent readers. - mutex_lock lock(mu_); - // Max read length is INT_MAX-2, for hdfsPread function take a parameter - // of int32. -2 offset can avoid JVM OutOfMemoryError. - size_t read_n = - std::min(n, static_cast(std::numeric_limits::max() - 2)); - tSize r = libhdfs()->hdfsPread(fs_, file_, static_cast(offset), - dst, static_cast(read_n)); - if (r > 0) { - dst += r; - n -= r; - offset += r; - } else if (!eof_retried && r == 0) { - // Always reopen the file upon reaching EOF to see if there's more data. - // If writers are streaming contents while others are concurrently - // reading, HDFS requires that we reopen the file to see updated - // contents. - // - // Fixes #5438 - if (file_ != nullptr && libhdfs()->hdfsCloseFile(fs_, file_) != 0) { - return IOError(filename_, errno); - } - file_ = libhdfs()->hdfsOpenFile(fs_, hdfs_filename_.c_str(), O_RDONLY, - 0, 0, 0); - if (file_ == nullptr) { - return IOError(filename_, errno); - } - eof_retried = true; - } else if (eof_retried && r == 0) { - s = Status(error::OUT_OF_RANGE, "Read less bytes than requested"); - } else if (errno == EINTR || errno == EAGAIN) { - // hdfsPread may return EINTR too. Just retry. 
- } else { - s = IOError(filename_, errno); - } - } - *result = StringPiece(scratch, dst - scratch); - return s; - } - - private: - string filename_; - string hdfs_filename_; - hdfsFS fs_; - bool disable_eof_retried_; - - mutable mutex mu_; - mutable hdfsFile file_ TF_GUARDED_BY(mu_); -}; - -Status HadoopFileSystem::NewRandomAccessFile( - const string& fname, TransactionToken* token, - std::unique_ptr* result) { - hdfsFS fs = nullptr; - TF_RETURN_IF_ERROR(Connect(fname, &fs)); - - hdfsFile file = libhdfs()->hdfsOpenFile(fs, TranslateName(fname).c_str(), - O_RDONLY, 0, 0, 0); - if (file == nullptr) { - return IOError(fname, errno); - } - result->reset( - new HDFSRandomAccessFile(fname, TranslateName(fname), fs, file)); - return Status::OK(); -} - -class HDFSWritableFile : public WritableFile { - public: - HDFSWritableFile(const string& fname, hdfsFS fs, hdfsFile file) - : filename_(fname), fs_(fs), file_(file) {} - - ~HDFSWritableFile() override { - if (file_ != nullptr) { - Close().IgnoreError(); - } - } - - Status Append(StringPiece data) override { - size_t cur_pos = 0, write_len = 0; - bool retry = false; - // max() - 2 can avoid OutOfMemoryError in JVM . - static const size_t max_len_once = - static_cast(std::numeric_limits::max() - 2); - while (cur_pos < data.size()) { - write_len = std::min(data.size() - cur_pos, max_len_once); - tSize w = libhdfs()->hdfsWrite(fs_, file_, data.data() + cur_pos, - static_cast(write_len)); - if (w == -1) { - if (!retry && (errno == EINTR || errno == EAGAIN)) { - retry = true; - } else { - return IOError(filename_, errno); - } - } else { - cur_pos += w; - } - } - return Status::OK(); - } - - Status Close() override { - Status result; - if (libhdfs()->hdfsCloseFile(fs_, file_) != 0) { - result = IOError(filename_, errno); - } - fs_ = nullptr; - file_ = nullptr; - return result; - } - - Status Flush() override { - if (libhdfs()->hdfsHFlush(fs_, file_) != 0) { - return IOError(filename_, errno); - } - return Status::OK(); - } - - Status Name(StringPiece* result) const override { - *result = filename_; - return Status::OK(); - } - - Status Sync() override { - if (libhdfs()->hdfsHSync(fs_, file_) != 0) { - return IOError(filename_, errno); - } - return Status::OK(); - } - - Status Tell(int64* position) override { - *position = libhdfs()->hdfsTell(fs_, file_); - if (*position == -1) { - return IOError(filename_, errno); - } - return Status::OK(); - } - - private: - string filename_; - hdfsFS fs_; - hdfsFile file_; -}; - -Status HadoopFileSystem::NewWritableFile( - const string& fname, TransactionToken* token, - std::unique_ptr* result) { - hdfsFS fs = nullptr; - TF_RETURN_IF_ERROR(Connect(fname, &fs)); - - hdfsFile file = libhdfs()->hdfsOpenFile(fs, TranslateName(fname).c_str(), - O_WRONLY, 0, 0, 0); - if (file == nullptr) { - return IOError(fname, errno); - } - result->reset(new HDFSWritableFile(fname, fs, file)); - return Status::OK(); -} - -Status HadoopFileSystem::NewAppendableFile( - const string& fname, TransactionToken* token, - std::unique_ptr* result) { - hdfsFS fs = nullptr; - TF_RETURN_IF_ERROR(Connect(fname, &fs)); - - hdfsFile file = libhdfs()->hdfsOpenFile(fs, TranslateName(fname).c_str(), - O_WRONLY | O_APPEND, 0, 0, 0); - if (file == nullptr) { - return IOError(fname, errno); - } - result->reset(new HDFSWritableFile(fname, fs, file)); - return Status::OK(); -} - -Status HadoopFileSystem::NewReadOnlyMemoryRegionFromFile( - const string& fname, TransactionToken* token, - std::unique_ptr* result) { - // hadoopReadZero() technically supports this 
call with the following - // caveats: - // - It only works up to 2 GB. We'd have to Stat() the file to ensure that - // it fits. - // - If not on the local filesystem, the entire file will be read, making - // it inefficient for callers that assume typical mmap() behavior. - return errors::Unimplemented("HDFS does not support ReadOnlyMemoryRegion"); -} - -Status HadoopFileSystem::FileExists(const string& fname, - TransactionToken* token) { - hdfsFS fs = nullptr; - TF_RETURN_IF_ERROR(Connect(fname, &fs)); - if (libhdfs()->hdfsExists(fs, TranslateName(fname).c_str()) == 0) { - return Status::OK(); - } - return errors::NotFound(fname, " not found."); -} - -Status HadoopFileSystem::GetChildren(const string& dir, TransactionToken* token, - std::vector* result) { - result->clear(); - hdfsFS fs = nullptr; - TF_RETURN_IF_ERROR(Connect(dir, &fs)); - - // hdfsListDirectory returns nullptr if the directory is empty. Do a separate - // check to verify the directory exists first. - FileStatistics stat; - TF_RETURN_IF_ERROR(Stat(dir, token, &stat)); - - int entries = 0; - hdfsFileInfo* info = - libhdfs()->hdfsListDirectory(fs, TranslateName(dir).c_str(), &entries); - if (info == nullptr) { - if (stat.is_directory) { - // Assume it's an empty directory. - return Status::OK(); - } - return IOError(dir, errno); - } - for (int i = 0; i < entries; i++) { - result->push_back(string(io::Basename(info[i].mName))); - } - libhdfs()->hdfsFreeFileInfo(info, entries); - return Status::OK(); -} - -Status HadoopFileSystem::GetMatchingPaths(const string& pattern, - TransactionToken* token, - std::vector* results) { - return internal::GetMatchingPaths(this, Env::Default(), pattern, results); -} - -Status HadoopFileSystem::DeleteFile(const string& fname, - TransactionToken* token) { - hdfsFS fs = nullptr; - TF_RETURN_IF_ERROR(Connect(fname, &fs)); - - if (libhdfs()->hdfsDelete(fs, TranslateName(fname).c_str(), - /*recursive=*/0) != 0) { - return IOError(fname, errno); - } - return Status::OK(); -} - -Status HadoopFileSystem::CreateDir(const string& dir, TransactionToken* token) { - hdfsFS fs = nullptr; - TF_RETURN_IF_ERROR(Connect(dir, &fs)); - - if (libhdfs()->hdfsCreateDirectory(fs, TranslateName(dir).c_str()) != 0) { - return IOError(dir, errno); - } - return Status::OK(); -} - -Status HadoopFileSystem::DeleteDir(const string& dir, TransactionToken* token) { - hdfsFS fs = nullptr; - TF_RETURN_IF_ERROR(Connect(dir, &fs)); - - // Count the number of entries in the directory, and only delete if it's - // non-empty. This is consistent with the interface, but note that there's - // a race condition where a file may be added after this check, in which - // case the directory will still be deleted. - int entries = 0; - hdfsFileInfo* info = - libhdfs()->hdfsListDirectory(fs, TranslateName(dir).c_str(), &entries); - if (info != nullptr) { - libhdfs()->hdfsFreeFileInfo(info, entries); - } - // Due to HDFS bug HDFS-8407, we can't distinguish between an error and empty - // folder, especially for Kerberos enable setup, EAGAIN is quite common when - // the call is actually successful. Check again by Stat. 
- if (info == nullptr && errno != 0) { - FileStatistics stat; - TF_RETURN_IF_ERROR(Stat(dir, token, &stat)); - } - - if (entries > 0) { - return errors::FailedPrecondition("Cannot delete a non-empty directory."); - } - if (libhdfs()->hdfsDelete(fs, TranslateName(dir).c_str(), - /*recursive=*/1) != 0) { - return IOError(dir, errno); - } - return Status::OK(); -} - -Status HadoopFileSystem::GetFileSize(const string& fname, - TransactionToken* token, uint64* size) { - hdfsFS fs = nullptr; - TF_RETURN_IF_ERROR(Connect(fname, &fs)); - - hdfsFileInfo* info = - libhdfs()->hdfsGetPathInfo(fs, TranslateName(fname).c_str()); - if (info == nullptr) { - return IOError(fname, errno); - } - *size = static_cast(info->mSize); - libhdfs()->hdfsFreeFileInfo(info, 1); - return Status::OK(); -} - -Status HadoopFileSystem::RenameFile(const string& src, const string& target, - TransactionToken* token) { - hdfsFS fs = nullptr; - TF_RETURN_IF_ERROR(Connect(src, &fs)); - - if (libhdfs()->hdfsExists(fs, TranslateName(target).c_str()) == 0 && - libhdfs()->hdfsDelete(fs, TranslateName(target).c_str(), - /*recursive=*/0) != 0) { - return IOError(target, errno); - } - - if (libhdfs()->hdfsRename(fs, TranslateName(src).c_str(), - TranslateName(target).c_str()) != 0) { - return IOError(src, errno); - } - return Status::OK(); -} - -Status HadoopFileSystem::Stat(const string& fname, TransactionToken* token, - FileStatistics* stats) { - hdfsFS fs = nullptr; - TF_RETURN_IF_ERROR(Connect(fname, &fs)); - - hdfsFileInfo* info = - libhdfs()->hdfsGetPathInfo(fs, TranslateName(fname).c_str()); - if (info == nullptr) { - return IOError(fname, errno); - } - stats->length = static_cast(info->mSize); - stats->mtime_nsec = static_cast(info->mLastMod) * 1e9; - stats->is_directory = info->mKind == kObjectKindDirectory; - libhdfs()->hdfsFreeFileInfo(info, 1); - return Status::OK(); -} - -REGISTER_LEGACY_FILE_SYSTEM("hdfs", HadoopFileSystem); -REGISTER_LEGACY_FILE_SYSTEM("viewfs", HadoopFileSystem); -REGISTER_LEGACY_FILE_SYSTEM("har", HadoopFileSystem); - -} // namespace tensorflow - -#endif // TF_VERSION_INTEGER >= 2070 diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/hadoop_file_system/hadoop_file_system.h b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/hadoop_file_system/hadoop_file_system.h deleted file mode 100755 index a664f2b2c..000000000 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/hadoop_file_system/hadoop_file_system.h +++ /dev/null @@ -1,89 +0,0 @@ -/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#ifndef HADOOP_FILE_SYSTEM_H_ -#define HADOOP_FILE_SYSTEM_H_ - -#include - -#include "hdfs/hdfs.h" -#include "tensorflow/core/platform/env.h" - -extern "C" { -struct hdfs_internal; -typedef hdfs_internal* hdfsFS; -} - -namespace tensorflow { - -class LibHDFS; - -class HadoopFileSystem : public FileSystem { - public: - HadoopFileSystem(); - ~HadoopFileSystem(); - - TF_USE_FILESYSTEM_METHODS_WITH_NO_TRANSACTION_SUPPORT; - - Status NewRandomAccessFile( - const string& fname, TransactionToken* token, - std::unique_ptr* result) override; - - Status NewWritableFile(const string& fname, TransactionToken* token, - std::unique_ptr* result) override; - - Status NewAppendableFile(const string& fname, TransactionToken* token, - std::unique_ptr* result) override; - - Status NewReadOnlyMemoryRegionFromFile( - const string& fname, TransactionToken* token, - std::unique_ptr* result) override; - - Status FileExists(const string& fname, TransactionToken* token) override; - - Status GetChildren(const string& dir, TransactionToken* token, - std::vector* result) override; - - Status GetMatchingPaths(const string& pattern, TransactionToken* token, - std::vector* results) override; - - Status DeleteFile(const string& fname, TransactionToken* token) override; - - Status CreateDir(const string& dir, TransactionToken* token) override; - - Status DeleteDir(const string& dir, TransactionToken* token) override; - - Status GetFileSize(const string& fname, TransactionToken* token, - uint64* size) override; - - Status RenameFile(const string& src, const string& target, - TransactionToken* token) override; - - Status Stat(const string& fname, TransactionToken* token, - FileStatistics* stat) override; - - string TranslateName(const string& name) const override; - - private: - mutex mu_; - std::map connectionCache_ TF_GUARDED_BY(mu_); - Status Connect(StringPiece fname, hdfsFS* fs); -}; - -Status SplitArchiveNameAndPath(StringPiece& path, string& nn); - -} // namespace tensorflow - -#endif // HADOOP_FILE_SYSTEM_H_ diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/ops/cuckoo_hashtable_ops.cc b/tensorflow_recommenders_addons/dynamic_embedding/core/ops/cuckoo_hashtable_ops.cc index 7ba22c84b..b41150651 100644 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/ops/cuckoo_hashtable_ops.cc +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/ops/cuckoo_hashtable_ops.cc @@ -254,11 +254,14 @@ REGISTER_OP(PREFIX_OP_NAME(CuckooHashTableExport)) return Status::OK(); }); -REGISTER_OP(PREFIX_OP_NAME(CuckooHashTableSaveToHDFS)) +REGISTER_OP(PREFIX_OP_NAME(CuckooHashTableSaveToFileSystem)) .Input("table_handle: resource") - .Input("filepath: string") + .Input("dirpath: string") + .Input("file_name: string") .Attr("key_dtype: type") .Attr("value_dtype: type") + .Attr("dirpath_env: string") + .Attr("append_to_file: bool") .Attr("buffer_size: int >= 1"); REGISTER_OP(PREFIX_OP_NAME(CuckooHashTableImport)) @@ -277,11 +280,14 @@ REGISTER_OP(PREFIX_OP_NAME(CuckooHashTableImport)) return Status::OK(); }); -REGISTER_OP(PREFIX_OP_NAME(CuckooHashTableLoadFromHDFS)) +REGISTER_OP(PREFIX_OP_NAME(CuckooHashTableLoadFromFileSystem)) .Input("table_handle: resource") - .Input("filepath: string") + .Input("dirpath: string") + .Input("file_name: string") .Attr("key_dtype: type") .Attr("value_dtype: type") + .Attr("dirpath_env: string") + .Attr("load_entire_dir: bool") .Attr("buffer_size: int >= 1"); 
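Note on the two ops registered above: the kernels resolve the destination directory from either the `dirpath` input or the environment variable named by the `dirpath_env` attribute, and the environment variable wins whenever it is set. The file system itself is resolved through `Env::GetFileSystemForFile`, which is why `tensorflow_io` has to be imported first for schemes such as `s3://`. A minimal sketch of the override behaviour, using the Python wrappers added later in this patch — the table and path names here are illustrative only, and the default `TFRA_SAVED_KV` variable name is assumed:

    import os
    import tensorflow as tf
    try:
        import tensorflow_io  # registers extra schemes (s3://, hdfs://, ...) for GetFileSystemForFile
    except ImportError:
        pass
    from tensorflow_recommenders_addons import dynamic_embedding as de

    # Illustrative table; any de.get_variable handle exposes the same methods.
    var = de.get_variable("demo_kv", key_dtype=tf.int64, value_dtype=tf.float32,
                          dim=4, initializer=0.0)

    # With TFRA_SAVED_KV set, the kernel takes the directory from the environment
    # and ignores the dirpath argument passed below.
    os.environ["TFRA_SAVED_KV"] = "file:///tmp/tfra_saved_kv"
    var.tables[0].save_to_file_system("ignored_when_env_is_set", file_name="demo_kv",
                                      append_to_file=False, buffer_size=4096)

The environment override is presumably there so a deployment can redirect where the key/value files land without changing graph code or call sites.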
REGISTER_OP(PREFIX_OP_NAME(CuckooHashTableOfTensors)) diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/ops/redis_table_ops.cc b/tensorflow_recommenders_addons/dynamic_embedding/core/ops/redis_table_ops.cc index fd14dbc23..ef109155e 100644 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/ops/redis_table_ops.cc +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/ops/redis_table_ops.cc @@ -235,6 +235,16 @@ REGISTER_OP(PREFIX_OP_NAME(RedisTableExport)) return Status::OK(); }); +REGISTER_OP(PREFIX_OP_NAME(RedisTableSaveToFileSystem)) + .Input("table_handle: resource") + .Input("dirpath: string") + .Input("file_name: string") + .Attr("key_dtype: type") + .Attr("value_dtype: type") + .Attr("dirpath_env: string") + .Attr("append_to_file: bool") + .Attr("buffer_size: int >= 1"); + REGISTER_OP(PREFIX_OP_NAME(RedisTableImport)) .Input("table_handle: resource") .Input("keys: Tin") @@ -251,6 +261,16 @@ REGISTER_OP(PREFIX_OP_NAME(RedisTableImport)) return Status::OK(); }); +REGISTER_OP(PREFIX_OP_NAME(RedisTableLoadFromFileSystem)) + .Input("table_handle: resource") + .Input("dirpath: string") + .Input("file_name: string") + .Attr("key_dtype: type") + .Attr("value_dtype: type") + .Attr("dirpath_env: string") + .Attr("load_entire_dir: bool") + .Attr("buffer_size: int >= 1"); + Status RedisTableShape(InferenceContext *c, const ShapeHandle &key, const ShapeHandle &value) { c->set_output(0, c->Scalar()); diff --git a/tensorflow_recommenders_addons/dynamic_embedding/python/kernel_tests/cuckoo_hashtable_ops_test.py b/tensorflow_recommenders_addons/dynamic_embedding/python/kernel_tests/cuckoo_hashtable_ops_test.py index d82de66b6..ed6bf1f50 100644 --- a/tensorflow_recommenders_addons/dynamic_embedding/python/kernel_tests/cuckoo_hashtable_ops_test.py +++ b/tensorflow_recommenders_addons/dynamic_embedding/python/kernel_tests/cuckoo_hashtable_ops_test.py @@ -19,6 +19,7 @@ from __future__ import print_function import sys +import os from tensorflow_recommenders_addons import dynamic_embedding as de @@ -26,8 +27,15 @@ from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import test_util +from tensorflow.python.ops import init_ops from tensorflow.python.platform import test +import tensorflow as tf +try: + import tensorflow_io +except: + print() + default_config = config_pb2.ConfigProto( allow_soft_placement=False, gpu_options=config_pb2.GPUOptions(allow_growth=True)) @@ -83,26 +91,80 @@ def test_cuckoo_hashtable_import_and_export(self): id += 1 @test_util.run_in_graph_and_eager_modes() - def test_cuckoo_hashtable_save_hdfs(self): - self.skipTest('Only test for hdfs export, need hdfs path.') - initializer = tf.keras.initializers.RandomNormal() + def test_cuckoo_hashtable_save_file_system(self): + self.skipTest('Only test for file_system export, need file_system path.') + test_devices = ['/CPU:0'] + if test_util.is_gpu_available(): + test_devices = ['/GPU:0'] dim = 8 + for idx, device in enumerate(test_devices): + var1 = de.get_variable('fsv1_' + str(idx), + key_dtype=dtypes.int64, + value_dtype=dtypes.float32, + initializer=init_ops.random_normal_initializer( + 0.0, 0.01), + devices=[device], + dim=dim) + var2 = de.get_variable('fsv2_' + str(idx), + key_dtype=dtypes.int64, + value_dtype=dtypes.float32, + initializer=init_ops.random_normal_initializer( + 0.0, 0.01), + devices=[device], + dim=dim) + init_keys = constant_op.constant(list(range(10000)), dtypes.int64) + init_values = var1.lookup(init_keys) + 
os.environ["AWS_ACCESS_KEY_ID"] = "Q3AM3UQ867SPQQA43P2F" + os.environ[ + "AWS_SECRET_ACCESS_KEY"] = "zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG" + os.environ["S3_ENDPOINT"] = "https://play.min.io" + + with self.session(): + self.evaluate(var1.upsert(init_keys, init_values)) + + np_keys = self.evaluate(init_keys) + np_values = self.evaluate(init_values) + + dirpath = "s3://test/tfra_embedding" + self.evaluate(var1.tables[0].save_to_file_system(dirpath, + file_name='fsv_' + + str(idx), + buffer_size=4096)) + self.evaluate(var2.tables[0].load_from_file_system(dirpath, + file_name='fsv_' + + str(idx), + buffer_size=4096)) + load_keys, load_values = self.evaluate(var2.export()) + sort_idx = load_keys.argsort() + load_keys = load_keys[sort_idx[::1]] + load_values = load_values[sort_idx[::1]] + + self.assertAllEqual(np_keys, load_keys) + self.assertAllEqual(np_values, load_values) + + @test_util.run_in_graph_and_eager_modes() + def test_cuckoo_hashtable_save_local_file_system(self): test_devices = ['/CPU:0'] + if test_util.is_gpu_available(): + test_devices = ['/GPU:0'] + dim = 8 for idx, device in enumerate(test_devices): - var1 = de.get_variable('vmas142_' + str(idx), - key_dtype=tf.int64, - value_dtype=tf.float32, - initializer=initializer, + var1 = de.get_variable('lfsv1_' + str(idx), + key_dtype=dtypes.int64, + value_dtype=dtypes.float32, + initializer=init_ops.random_normal_initializer( + 0.0, 0.01), devices=[device], dim=dim) - var2 = de.get_variable('lfwa031_' + str(idx), - key_dtype=tf.int64, - value_dtype=tf.float32, - initializer=initializer, + var2 = de.get_variable('lfsv2_' + str(idx), + key_dtype=dtypes.int64, + value_dtype=dtypes.float32, + initializer=init_ops.random_normal_initializer( + 0.0, 0.01), devices=[device], dim=dim) - init_keys = tf.range(0, 10000, dtype=tf.int64) + init_keys = constant_op.constant(list(range(10000)), dtypes.int64) init_values = var1.lookup(init_keys) with self.session(): @@ -111,9 +173,83 @@ def test_cuckoo_hashtable_save_hdfs(self): np_keys = self.evaluate(init_keys) np_values = self.evaluate(init_values) - filepath = "hdfs://path_to_test" - self.evaluate(var1.tables[0].save_to_hdfs(filepath, buffer_size=4096)) - self.evaluate(var2.tables[0].load_from_hdfs(filepath, buffer_size=4096)) + dirpath = "file:///tmp/test_local_file_system/tfra_embedding" + self.evaluate(var1.tables[0].save_to_file_system(dirpath, + file_name='lfsv_' + + str(idx), + buffer_size=4096)) + self.evaluate(var2.tables[0].load_from_file_system(dirpath, + file_name='lfsv_' + + str(idx), + buffer_size=4096)) + load_keys, load_values = self.evaluate(var2.export()) + sort_idx = load_keys.argsort() + load_keys = load_keys[sort_idx[::1]] + load_values = load_values[sort_idx[::1]] + + self.assertAllEqual(np_keys, load_keys) + self.assertAllEqual(np_values, load_values) + + @test_util.run_in_graph_and_eager_modes() + def test_cuckoo_hashtable_save_and_load_all_with_local_file_system(self): + test_devices = [['/CPU:0', '/CPU:1']] + if test_util.is_gpu_available(): + tf.debugging.set_log_device_placement(True) + gpus = tf.config.list_physical_devices('GPU') + if gpus: + # Restrict TensorFlow to only allocate 1GB of memory on the first GPU + try: + tf.config.set_logical_device_configuration(gpus[0], [ + tf.config.experimental.VirtualDeviceConfiguration( + memory_limit=1024) + ]) + logical_gpus = tf.config.list_logical_devices('GPU') + print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs") + except RuntimeError as e: + # Virtual devices must be set before GPUs have been initialized 
+ print(e) + test_devices = [['/GPU:0', '/GPU:1']] + dim = 8 + for idx, devices in enumerate(test_devices): + var1 = de.get_variable( + 'lfslav1_' + str(idx), + key_dtype=tf.int64, + value_dtype=tf.float32, + initializer=init_ops.random_normal_initializer(0.0, 0.01), + devices=devices, + dim=dim, + ) + var2 = de.get_variable( + 'lfslav2_' + str(idx), + key_dtype=tf.int64, + value_dtype=tf.float32, + initializer=init_ops.random_normal_initializer(0.0, 0.01), + devices=devices, + dim=dim, + ) + init_keys = tf.range(0, 20000, dtype=tf.int64) + init_values = var1.lookup(init_keys) + + with self.session(): + self.evaluate(var1.clear()) + self.evaluate(var1.upsert(init_keys[0:10000], init_values[0:10000])) + self.evaluate( + var1.upsert(init_keys[10000:20000], init_values[10000:20000])) + self.evaluate(var2.clear()) + + np_keys = self.evaluate(init_keys) + np_values = self.evaluate(init_values) + + dirpath = "file:///tmp/test_tfra/test" + self.evaluate(var1.tables[0].save_to_file_system(dirpath, + buffer_size=1000)) + self.evaluate(var1.tables[1].save_to_file_system(dirpath, + buffer_size=1000)) + self.evaluate(var2.tables[0].load_from_file_system( + dirpath, + file_name='lfslav1_' + str(idx), + load_entire_dir=True, + buffer_size=1000)) load_keys, load_values = self.evaluate(var2.export()) sort_idx = load_keys.argsort() load_keys = load_keys[sort_idx[::1]] diff --git a/tensorflow_recommenders_addons/dynamic_embedding/python/kernel_tests/dynamic_embedding_variable_test.py b/tensorflow_recommenders_addons/dynamic_embedding/python/kernel_tests/dynamic_embedding_variable_test.py index e79b49d47..45ce665ed 100644 --- a/tensorflow_recommenders_addons/dynamic_embedding/python/kernel_tests/dynamic_embedding_variable_test.py +++ b/tensorflow_recommenders_addons/dynamic_embedding/python/kernel_tests/dynamic_embedding_variable_test.py @@ -59,6 +59,11 @@ from tensorflow.python.training.tracking import util as track_util from tensorflow.python.util import compat +try: + import tensorflow_io +except: + print() + # pylint: disable=missing-class-docstring # pylint: disable=missing-function-docstring @@ -633,8 +638,95 @@ def test_save_restore(self): del table - def test_save_restore_hdfs(self): - self.skipTest('Only test for hdfs export, need hdfs path.') + def test_save_restore_file_system(self): + self.skipTest('Only test for file_system export, need file_system path.') + if context.executing_eagerly(): + self.skipTest('skip eager test when using legacy Saver.') + save_dir = os.path.join(self.get_temp_dir(), "save_restore") + save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash") + + os.environ["AWS_ACCESS_KEY_ID"] = "Q3AM3UQ867SPQQA43P2F" + os.environ[ + "AWS_SECRET_ACCESS_KEY"] = "zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG" + os.environ["S3_ENDPOINT"] = "https://play.min.io" + + with self.session(config=default_config, graph=ops.Graph()) as sess: + v0 = variables.Variable(10.0, name="v0") + v1 = variables.Variable(20.0, name="v1") + + keys = constant_op.constant([0, 1, 2], dtypes.int64) + values = constant_op.constant([[0.0], [1.0], [2.0]], dtypes.float32) + table = de.Variable( + key_dtype=dtypes.int64, + value_dtype=dtypes.float32, + initializer=-1.0, + name="t1", + dim=1, + ) + + save = saver.Saver(var_list=[v0, v1]) + self.evaluate(variables.global_variables_initializer()) + + # Check that the parameter nodes have been initialized. 
+ self.assertEqual(10.0, self.evaluate(v0)) + self.assertEqual(20.0, self.evaluate(v1)) + + self.assertAllEqual(0, self.evaluate(table.size())) + self.evaluate(table.upsert(keys, values)) + self.assertAllEqual(3, self.evaluate(table.size())) + + # save table + for k, v in enumerate(table.tables): + self.evaluate( + v.save_to_file_system("s3://test/" + str(k), buffer_size=4096)) + + val = save.save(sess, save_path) + self.assertIsInstance(val, six.string_types) + self.assertEqual(save_path, val) + + del table + + with self.session(config=default_config, graph=ops.Graph()) as sess: + v0 = variables.Variable(-1.0, name="v0") + v1 = variables.Variable(-1.0, name="v1") + table = de.Variable( + name="t1", + key_dtype=dtypes.int64, + value_dtype=dtypes.float32, + initializer=-1.0, + dim=1, + checkpoint=True, + ) + self.evaluate( + table.upsert( + constant_op.constant([0, 1], dtypes.int64), + constant_op.constant([[12.0], [24.0]], dtypes.float32), + )) + size_op = table.size() + self.assertAllEqual(2, self.evaluate(size_op)) + + save = saver.Saver(var_list=[v0, v1]) + + # Restore the saved values in the parameter nodes. + save.restore(sess, save_path) + # load table + for k, v in enumerate(table.tables): + self.evaluate( + v.load_from_file_system("s3://test/" + str(k), buffer_size=4096)) + # Check that the parameter nodes have been restored. + self.assertEqual([10.0], self.evaluate(v0)) + self.assertEqual([20.0], self.evaluate(v1)) + + self.assertAllEqual(3, self.evaluate(table.size())) + + remove_keys = constant_op.constant([5, 0, 1, 2, 6], dtypes.int64) + output = table.lookup(remove_keys) + self.assertAllEqual([[-1.0], [0.0], [1.0], [2.0], [-1.0]], + self.evaluate(output)) + + del table + + def test_save_restore_local_file_system(self): if context.executing_eagerly(): self.skipTest('skip eager test when using legacy Saver.') save_dir = os.path.join(self.get_temp_dir(), "save_restore") @@ -668,7 +760,9 @@ def test_save_restore_hdfs(self): # save table for k, v in enumerate(table.tables): self.evaluate( - v.save_to_hdfs("hdfs://path_to_test" + str(k), buffer_size=4096)) + v.save_to_file_system("file:///tmp/test_local_file_system/" + + str(k), + buffer_size=4096)) val = save.save(sess, save_path) self.assertIsInstance(val, six.string_types) @@ -687,6 +781,7 @@ def test_save_restore_hdfs(self): dim=1, checkpoint=True, ) + self.evaluate(table.clear()) self.evaluate( table.upsert( constant_op.constant([0, 1], dtypes.int64), @@ -702,7 +797,9 @@ def test_save_restore_hdfs(self): # load table for k, v in enumerate(table.tables): self.evaluate( - v.load_from_hdfs("hdfs://path_to_test" + str(k), buffer_size=4096)) + v.load_from_file_system("file:///tmp/test_local_file_system/" + + str(k), + buffer_size=4096)) # Check that the parameter nodes have been restored. 
self.assertEqual([10.0], self.evaluate(v0)) self.assertEqual([20.0], self.evaluate(v1)) diff --git a/tensorflow_recommenders_addons/dynamic_embedding/python/kernel_tests/redis_table_variable_test.py b/tensorflow_recommenders_addons/dynamic_embedding/python/kernel_tests/redis_table_variable_test.py index 17ca714f7..9fc66626d 100644 --- a/tensorflow_recommenders_addons/dynamic_embedding/python/kernel_tests/redis_table_variable_test.py +++ b/tensorflow_recommenders_addons/dynamic_embedding/python/kernel_tests/redis_table_variable_test.py @@ -57,6 +57,13 @@ from tensorflow.python.training import server_lib from tensorflow.python.util import compat +try: + import tensorflow_io +except: + print() + +os.environ["CUDA_VISIBLE_DEVICES"] = '-1' + # pylint: disable=missing-class-docstring # pylint: disable=missing-function-docstring @@ -686,6 +693,190 @@ def test_save_restore(self): self.evaluate(table.clear()) del table + def test_save_restore_file_system(self): + if _redis_health_check(redis_config_params["redis_host_ip"][0], + redis_config_params["redis_host_port"][0]) == False: + self.skipTest('skip redis test when unable to access the redis service.') + self.skipTest('Only test for file_system export, need file_system path.') + if context.executing_eagerly(): + self.skipTest('skip eager test when using legacy Saver.') + save_dir = os.path.join(self.get_temp_dir(), "save_restore") + save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash") + + os.environ["AWS_ACCESS_KEY_ID"] = "Q3AM3UQ867SPQQA43P2F" + os.environ[ + "AWS_SECRET_ACCESS_KEY"] = "zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG" + os.environ["S3_ENDPOINT"] = "https://play.min.io" + + with self.session(config=default_config, graph=ops.Graph()) as sess: + v0 = variables.Variable(10.0, name="v0") + v1 = variables.Variable(20.0, name="v1") + + keys = constant_op.constant([0, 1, 2], dtypes.int64) + values = constant_op.constant([[0.0], [1.0], [2.0]], dtypes.float32) + table = de.Variable( + key_dtype=dtypes.int64, + value_dtype=dtypes.float32, + initializer=-1.0, + name="t1_test_file_system", + dim=1, + kv_creator=de.RedisTableCreator(config=redis_config), + ) + + save = saver.Saver(var_list=[v0, v1]) + self.evaluate(variables.global_variables_initializer()) + + # Check that the parameter nodes have been initialized. + self.assertEqual(10.0, self.evaluate(v0)) + self.assertEqual(20.0, self.evaluate(v1)) + + self.assertAllEqual(0, self.evaluate(table.size())) + self.evaluate(table.upsert(keys, values)) + self.assertAllEqual(3, self.evaluate(table.size())) + + # save table + for k, v in enumerate(table.tables): + self.evaluate( + v.save_to_file_system("s3://test/" + str(k), buffer_size=4096)) + + val = save.save(sess, save_path) + self.assertIsInstance(val, six.string_types) + self.assertEqual(save_path, val) + + del table + + with self.session(config=default_config, graph=ops.Graph()) as sess: + v0 = variables.Variable(-1.0, name="v0") + v1 = variables.Variable(-1.0, name="v1") + table = de.Variable( + name="t1_test_file_system", + key_dtype=dtypes.int64, + value_dtype=dtypes.float32, + initializer=-1.0, + dim=1, + checkpoint=True, + kv_creator=de.RedisTableCreator(config=redis_config), + ) + self.evaluate( + table.upsert( + constant_op.constant([0, 1], dtypes.int64), + constant_op.constant([[12.0], [24.0]], dtypes.float32), + )) + size_op = table.size() + self.assertAllEqual(2, self.evaluate(size_op)) + + save = saver.Saver(var_list=[v0, v1]) + + # Restore the saved values in the parameter nodes. 
+ save.restore(sess, save_path) + # load table + for k, v in enumerate(table.tables): + self.evaluate( + v.load_from_file_system("s3://test/" + str(k), buffer_size=4096)) + # Check that the parameter nodes have been restored. + self.assertEqual([10.0], self.evaluate(v0)) + self.assertEqual([20.0], self.evaluate(v1)) + + self.assertAllEqual(3, self.evaluate(table.size())) + + remove_keys = constant_op.constant([5, 0, 1, 2, 6], dtypes.int64) + output = table.lookup(remove_keys) + self.assertAllEqual([[-1.0], [0.0], [1.0], [2.0], [-1.0]], + self.evaluate(output)) + + del table + + def test_save_restore_local_file_system(self): + if _redis_health_check(redis_config_params["redis_host_ip"][0], + redis_config_params["redis_host_port"][0]) == False: + self.skipTest('skip redis test when unable to access the redis service.') + if context.executing_eagerly(): + self.skipTest('skip eager test when using legacy Saver.') + save_dir = os.path.join(self.get_temp_dir(), "save_restore") + save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash") + + with self.session(config=default_config, graph=ops.Graph()) as sess: + v0 = variables.Variable(10.0, name="v0") + v1 = variables.Variable(20.0, name="v1") + + keys = constant_op.constant([0, 1, 2], dtypes.int64) + values = constant_op.constant([[0.0], [1.0], [2.0]], dtypes.float32) + table = de.Variable( + key_dtype=dtypes.int64, + value_dtype=dtypes.float32, + initializer=-1.0, + name="t1_test_local_file_system", + dim=1, + kv_creator=de.RedisTableCreator(config=redis_config), + ) + + save = saver.Saver(var_list=[v0, v1]) + self.evaluate(variables.global_variables_initializer()) + + # Check that the parameter nodes have been initialized. + self.assertEqual(10.0, self.evaluate(v0)) + self.assertEqual(20.0, self.evaluate(v1)) + + self.assertAllEqual(0, self.evaluate(table.size())) + self.evaluate(table.upsert(keys, values)) + self.assertAllEqual(3, self.evaluate(table.size())) + + # save table + for k, v in enumerate(table.tables): + self.evaluate( + v.save_to_file_system("file:///tmp/test_local_file_system/" + + str(k), + buffer_size=4096)) + + val = save.save(sess, save_path) + self.assertIsInstance(val, six.string_types) + self.assertEqual(save_path, val) + + del table + + with self.session(config=default_config, graph=ops.Graph()) as sess: + v0 = variables.Variable(-1.0, name="v0") + v1 = variables.Variable(-1.0, name="v1") + table = de.Variable( + name="t1_test_local_file_system", + key_dtype=dtypes.int64, + value_dtype=dtypes.float32, + initializer=-1.0, + dim=1, + checkpoint=True, + kv_creator=de.RedisTableCreator(config=redis_config), + ) + self.evaluate( + table.upsert( + constant_op.constant([0, 1], dtypes.int64), + constant_op.constant([[12.0], [24.0]], dtypes.float32), + )) + size_op = table.size() + self.assertAllEqual(2, self.evaluate(size_op)) + + save = saver.Saver(var_list=[v0, v1]) + + # Restore the saved values in the parameter nodes. + save.restore(sess, save_path) + # load table + for k, v in enumerate(table.tables): + self.evaluate( + v.load_from_file_system("file:///tmp/test_local_file_system/" + + str(k), + buffer_size=4096)) + # Check that the parameter nodes have been restored. 
+ self.assertEqual([10.0], self.evaluate(v0)) + self.assertEqual([20.0], self.evaluate(v1)) + + self.assertAllEqual(3, self.evaluate(table.size())) + + remove_keys = constant_op.constant([5, 0, 1, 2, 6], dtypes.int64) + output = table.lookup(remove_keys) + self.assertAllEqual([[-1.0], [0.0], [1.0], [2.0], [-1.0]], + self.evaluate(output)) + + del table + def test_save_restore_only_table(self): if _redis_health_check(redis_config_params["redis_host_ip"][0], redis_config_params["redis_host_port"][0]) == False: diff --git a/tensorflow_recommenders_addons/dynamic_embedding/python/ops/cuckoo_hashtable_ops.py b/tensorflow_recommenders_addons/dynamic_embedding/python/ops/cuckoo_hashtable_ops.py index 72d96855b..2344bc737 100644 --- a/tensorflow_recommenders_addons/dynamic_embedding/python/ops/cuckoo_hashtable_ops.py +++ b/tensorflow_recommenders_addons/dynamic_embedding/python/ops/cuckoo_hashtable_ops.py @@ -338,46 +338,70 @@ def export(self, name=None): self.resource_handle, self._key_dtype, self._value_dtype) return keys, values - def save_to_hdfs(self, filepath, buffer_size=4194304, name=None): + def save_to_file_system(self, + dirpath, + file_name=None, + dirpath_env='TFRA_SAVED_KV', + append_to_file=False, + buffer_size=4194304, + name=None): """ - Returns an operation to save the keys and values in table to - filepath. The keys and values will be stored in HDFS, appended to the filepath. + Returns an operation to save the keys and values in table to dirpath. + The keys and values will be stored in FileSystem, rewrited or appended to the filepath. Args: - filepath: A path to save the table. + dirpath: A directory path to save the table. + dirpath_env: A environment variable stored a path to save the table, which priority higher than dirpath. + file_name: User custom file name for key/value prefix file name, default is self._name. + buffer_size: Number of keys in write buffer to file. + append_to_file: If true, operation will append data to the file but not write a new one. name: Name for the operation. - buffer_size: Number of kv pairs buffer write to file. Returns: An operation to save the table. """ with ops.name_scope(name, "%s_save_table" % self.name, [self.resource_handle]): with ops.colocate_with(None, ignore_existing=True): - return cuckoo_ops.tfra_cuckoo_hash_table_save_to_hdfs( + return cuckoo_ops.tfra_cuckoo_hash_table_save_to_file_system( self.resource_handle, - filepath, + dirpath=dirpath, + file_name=file_name if file_name else self._name, key_dtype=self._key_dtype, value_dtype=self._value_dtype, + dirpath_env=dirpath_env, + append_to_file=append_to_file, buffer_size=buffer_size) - def load_from_hdfs(self, filepath, buffer_size=4194304, name=None): + def load_from_file_system(self, + dirpath, + file_name=None, + dirpath_env='TFRA_SAVED_KV', + load_entire_dir=False, + buffer_size=4194304, + name=None): """ Returns an operation to load keys and values to table from - HDFS. The keys and values files are generated from `save_to_hdfs`. + FileSystem. The keys and values files are generated from `save_to_file_system`. Args: - filepath: A file path stored the table keys and values. + dirpath: A directory path stored the table keys and values. + dirpath_env: A environment variable stored a path to load the table, which priority higher than dirpath. + file_name: User custom file name for key/value prefix file name, default is self._name. + buffer_size: Number of keys in read buffer from file. 
+      load_entire_dir: If True, the operation loads all key-value files under dirpath regardless of partition.
       name: Name for the operation.
-      buffer_size: Number of kv pairs buffer to read file.
 
     Returns:
-      An operation to load keys and values to table from HDFS.
+      An operation to load keys and values into the table from the FileSystem.
     """
     with ops.name_scope(name, "%s_load_table" % self.name,
                         [self.resource_handle]):
       with ops.colocate_with(None, ignore_existing=True):
-        return cuckoo_ops.tfra_cuckoo_hash_table_load_from_hdfs(
+        return cuckoo_ops.tfra_cuckoo_hash_table_load_from_file_system(
             self.resource_handle,
-            filepath,
+            dirpath=dirpath,
+            file_name=file_name if file_name else self._name,
             key_dtype=self._key_dtype,
             value_dtype=self._value_dtype,
+            dirpath_env=dirpath_env,
+            load_entire_dir=load_entire_dir,
             buffer_size=buffer_size)
 
   def _gather_saveables_for_checkpoint(self):
diff --git a/tensorflow_recommenders_addons/dynamic_embedding/python/ops/redis_table_ops.py b/tensorflow_recommenders_addons/dynamic_embedding/python/ops/redis_table_ops.py
index 43b74bd19..1937c483e 100644
--- a/tensorflow_recommenders_addons/dynamic_embedding/python/ops/redis_table_ops.py
+++ b/tensorflow_recommenders_addons/dynamic_embedding/python/ops/redis_table_ops.py
@@ -476,6 +476,72 @@ def export(self, name=None):
                                                        self._value_dtype)
     return exported_keys, exported_values
 
+  def save_to_file_system(self,
+                          dirpath,
+                          file_name=None,
+                          dirpath_env='TFRA_SAVED_KV',
+                          append_to_file=False,
+                          buffer_size=4194304,
+                          name=None):
+    """
+    Returns an operation to save the keys and values in the table to dirpath.
+    The keys and values will be stored in the FileSystem, either overwritten or appended to the files.
+    Args:
+      dirpath: A directory path to save the table.
+      dirpath_env: An environment variable that stores a path to save the table; it takes priority over dirpath.
+      file_name: A custom prefix for the key/value file names; defaults to self._name.
+      buffer_size: Number of keys held in the write buffer before flushing to file.
+      append_to_file: If True, the operation appends data to the existing files instead of writing new ones.
+      name: Name for the operation.
+    Returns:
+      An operation to save the table.
+    """
+    with ops.name_scope(name, "%s_save_table" % self.name,
+                        [self.resource_handle]):
+      with ops.colocate_with(None, ignore_existing=True):
+        return redis_table_ops.tfra_redis_table_save_to_file_system(
+            self.resource_handle,
+            dirpath=dirpath,
+            file_name=file_name if file_name else self._name,
+            key_dtype=self._key_dtype,
+            value_dtype=self._value_dtype,
+            dirpath_env=dirpath_env,
+            append_to_file=append_to_file,
+            buffer_size=buffer_size)
+
+  def load_from_file_system(self,
+                            dirpath,
+                            file_name=None,
+                            dirpath_env='TFRA_SAVED_KV',
+                            load_entire_dir=False,
+                            buffer_size=4194304,
+                            name=None):
+    """
+    Returns an operation to load keys and values into the table from
+    the FileSystem. The key and value files are generated by `save_to_file_system`.
+    Args:
+      dirpath: A directory path where the table keys and values are stored.
+      dirpath_env: An environment variable that stores a path to load the table; it takes priority over dirpath.
+      file_name: A custom prefix for the key/value file names; defaults to self._name.
+      buffer_size: Number of keys held in the read buffer when loading from file.
+      load_entire_dir: If True, the operation loads all key-value files under dirpath regardless of partition.
+      name: Name for the operation.
+    Returns:
+      An operation to load keys and values into the table from the FileSystem.
+ """ + with ops.name_scope(name, "%s_load_table" % self.name, + [self.resource_handle]): + with ops.colocate_with(None, ignore_existing=True): + return redis_table_ops.tfra_redis_table_load_from_file_system( + self.resource_handle, + dirpath=dirpath, + file_name=file_name if file_name else self._name, + key_dtype=self._key_dtype, + value_dtype=self._value_dtype, + dirpath_env=dirpath_env, + load_entire_dir=load_entire_dir, + buffer_size=buffer_size) + def _gather_saveables_for_checkpoint(self): """For object-based checkpointing.""" # full_name helps to figure out the name-based Saver's name for this saveable. diff --git a/third_party/hadoop.BUILD b/third_party/hadoop.BUILD deleted file mode 100644 index 457c4d701..000000000 --- a/third_party/hadoop.BUILD +++ /dev/null @@ -1,8 +0,0 @@ -package(default_visibility = ["//visibility:public"]) - -cc_library( - name = "hadoop", - hdrs = ["hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/include/hdfs/hdfs.h"], - includes = ["hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/include"], - visibility = ["//visibility:public"], -) diff --git a/tools/docker/build_wheel.Dockerfile b/tools/docker/build_wheel.Dockerfile index 75eecd3c3..01c0f4390 100644 --- a/tools/docker/build_wheel.Dockerfile +++ b/tools/docker/build_wheel.Dockerfile @@ -43,6 +43,8 @@ RUN python -m pip install -r /install_deps/pytest.txt COPY requirements.txt . RUN python -m pip install -r requirements.txt +RUN python -m pip install tensorflow-io + RUN python -m pip install --upgrade protobuf==3.20.0 COPY ./ /recommenders-addons diff --git a/tools/docker/cpu_tests.Dockerfile b/tools/docker/cpu_tests.Dockerfile index 2fce977c1..a1d9b261d 100644 --- a/tools/docker/cpu_tests.Dockerfile +++ b/tools/docker/cpu_tests.Dockerfile @@ -27,6 +27,8 @@ RUN pip install -r pytest.txt pytest-cov COPY ./ /recommenders-addons WORKDIR recommenders-addons +RUN python -m pip install tensorflow-io + RUN python -m pip install --upgrade protobuf==3.20.0 RUN python configure.py diff --git a/tools/docker/sanity_check.Dockerfile b/tools/docker/sanity_check.Dockerfile index dd6a4b753..4c9da8fb0 100644 --- a/tools/docker/sanity_check.Dockerfile +++ b/tools/docker/sanity_check.Dockerfile @@ -33,6 +33,8 @@ RUN pip install -e /recommenders-addons WORKDIR /recommenders-addons +RUN python -m pip install tensorflow-io + RUN python -m pip install --upgrade protobuf==3.20.0 RUN python configure.py
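A minimal usage sketch for the new save_to_file_system / load_from_file_system ops added above, mirroring the per-shard pattern used in the tests earlier in this diff. It assumes TF 2.x eager mode, a build of tensorflow_recommenders_addons that includes this change, and tensorflow-io installed so that non-local schemes such as s3:// resolve; the table name and paths below are illustrative only.

import tensorflow as tf
from tensorflow_recommenders_addons import dynamic_embedding as de

# A dynamic_embedding variable backed by the default (cuckoo) hashtable.
table = de.Variable(
    key_dtype=tf.int64,
    value_dtype=tf.float32,
    initializer=-1.0,
    dim=1,
    name="file_system_demo",  # illustrative name
)

keys = tf.constant([0, 1, 2], dtype=tf.int64)
values = tf.constant([[0.0], [1.0], [2.0]], dtype=tf.float32)
table.upsert(keys, values)

# Save each underlying shard; dirpath may be a local path
# ("file:///tmp/...") or an object-store URI ("s3://bucket/...").
for shard_id, shard in enumerate(table.tables):
  shard.save_to_file_system("file:///tmp/tfra_demo/" + str(shard_id),
                            buffer_size=4096)

# Later, or in another process: load the shards back into a table created
# with the same key/value dtypes and dim.
for shard_id, shard in enumerate(table.tables):
  shard.load_from_file_system("file:///tmp/tfra_demo/" + str(shard_id),
                              buffer_size=4096)

# Per the new docstrings, setting the TFRA_SAVED_KV environment variable
# (dirpath_env) takes priority over the dirpath argument, while
# append_to_file / load_entire_dir control append and whole-directory modes.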