From 568a329c83312df89defe22f24dc9ef497ac0aca Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Wed, 9 May 2018 20:59:46 +0800
Subject: [PATCH 01/56] add checkpoint util class and implementation

---
 paddle/fluid/operators/detail/checkpoint.cc | 54 +++++++++++++++++++++
 paddle/fluid/operators/detail/checkpoint.h  | 33 +++++++++++++
 2 files changed, 87 insertions(+)
 create mode 100644 paddle/fluid/operators/detail/checkpoint.cc
 create mode 100644 paddle/fluid/operators/detail/checkpoint.h

diff --git a/paddle/fluid/operators/detail/checkpoint.cc b/paddle/fluid/operators/detail/checkpoint.cc
new file mode 100644
index 00000000000000..78506a0a72e422
--- /dev/null
+++ b/paddle/fluid/operators/detail/checkpoint.cc
@@ -0,0 +1,54 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/operators/detail/checkpoint.h"
+
+#include <string>
+
+namespace paddle {
+namespace framework {
+namespace details {
+Checkpoint::Save(const framework::Scope& scope, const platform::Place& place,
+                 const std::string& save_dir, const std::string& var_name,
+                 const bool overwrite) {
+  auto* var = scope.FindVar(var_name);
+  PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s for save_op",
+                 var_name);
+  PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
+                 "Checkpoint only supports LoDTensor, %s has wrong type",
+                 var_name);
+
+  bool is_present = FileExists(save_dir);
+  if (is_present && !overwrite) {
+    PADDLE_THROW("%s exists!, checkpoint cannot write  it when overwrite=false",
+                 save_dir, overwrite);
+  }
+
+  MkDirRecursively(DirName(save_dir).c_str());
+  std::ofstream fout(save_dir);
+  PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write", save_dir);
+
+  // get device context from pool
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto& dev_ctx = *pool.Get(place);
+
+  auto& tensor = var->Get<framework::LoDTensor>();
+  // Serialize tensor
+  framework::SerializeToStream(fout, tensor, dev_ctx);
+  fout.close();
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/operators/detail/checkpoint.h b/paddle/fluid/operators/detail/checkpoint.h
new file mode 100644
index 00000000000000..0f0f450ce17bb5
--- /dev/null
+++ b/paddle/fluid/operators/detail/checkpoint.h
@@ -0,0 +1,33 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <string>
+
+#include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+class Checkpoint {
+ public:
+  static void Save(const framework::Scope& scope, const platform::Place& place,
+                   const std::string& save_dir, const std::string& var_name,
+                   const bool overwrite);
+
+  static void Load();
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle

From 1fabbbade28d4a642700c0df9ac6c4a0be0d4a66 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Thu, 10 May 2018 12:33:33 +0800
Subject: [PATCH 02/56] modify const to const &

---
 paddle/fluid/operators/detail/checkpoint.cc | 2 +-
 paddle/fluid/operators/detail/checkpoint.h  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/detail/checkpoint.cc b/paddle/fluid/operators/detail/checkpoint.cc
index 78506a0a72e422..38e46532e6e18b 100644
--- a/paddle/fluid/operators/detail/checkpoint.cc
+++ b/paddle/fluid/operators/detail/checkpoint.cc
@@ -22,7 +22,7 @@ namespace framework {
 namespace details {
 Checkpoint::Save(const framework::Scope& scope, const platform::Place& place,
                  const std::string& save_dir, const std::string& var_name,
-                 const bool overwrite) {
+                 const bool& overwrite) {
   auto* var = scope.FindVar(var_name);
   PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s for save_op",
                  var_name);
diff --git a/paddle/fluid/operators/detail/checkpoint.h b/paddle/fluid/operators/detail/checkpoint.h
index 0f0f450ce17bb5..dfa41979734ec3 100644
--- a/paddle/fluid/operators/detail/checkpoint.h
+++ b/paddle/fluid/operators/detail/checkpoint.h
@@ -24,7 +24,7 @@ class Checkpoint {
  public:
   static void Save(const framework::Scope& scope, const platform::Place& place,
                    const std::string& save_dir, const std::string& var_name,
-                   const bool overwrite);
+                   const bool& overwrite);
 
   static void Load();
 }

From 77c6b71ec44e3ba5220576fa528f3600b8784908 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Thu, 10 May 2018 20:03:13 +0800
Subject: [PATCH 03/56] add ckpt to sync loop

---
 paddle/fluid/operators/listen_and_serv_op.cc | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 8acbf820250957..7fb7f07a610a7c 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -101,6 +101,7 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor,
                                   framework::Scope *recv_scope,
                                   framework::BlockDesc *prefetch_block) const {
   auto fan_in = Attr<int>("Fanin");
+  auto checkpoint = Attr<std::string>("Checkpoint");
 
   size_t num_blocks = program->Size();
   PADDLE_ENFORCE_GE(num_blocks, 2,
@@ -188,6 +189,18 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor,
     for (auto &var : sparse_vars) {
       var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
     }
+
+    /******************** CHECK POINT ***********************/
+    std::vector<std::string> all_vars = recv_scope.LocalVarNames();
+
+    std::vector<std::string>::iterator it;
+    for (it = all_vars.begin(); it != all_vars.end(); it++) {
+      VLOG(2) << "Checkpoint Var: " << *it;
+      break;
+    }
+
+    /******************** CHECK POINT ***********************/
+
     rpc_service_->SetCond(1);
     // FIXME(typhoonzero): use another condition to sync wait clients get.
     rpc_service_->WaitClientGet(fan_in);

From b81671ecf214edca344cce12da51d6f0e1d21a66 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Thu, 10 May 2018 20:03:58 +0800
Subject: [PATCH 04/56] add ckpt attr to pserver python config

---
 python/paddle/fluid/transpiler/distribute_transpiler.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 640ac9f085e6dc..8cd7cd5d3a9f81 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -481,7 +481,8 @@ def __append_optimize_op__(op, block, grad_to_block_id):
                 "Fanin": self.trainer_num,
                 "PrefetchBlock": prefetch_block,
                 "sync_mode": self.sync_mode,
-                "grad_to_block_id": grad_to_block_id
+                "grad_to_block_id": grad_to_block_id,
+                "Checkpoint": "/tmp/tangwei_ckpt/"
             })
 
         pserver_program.sync_with_cpp()
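
A quick way to see where the new attribute lands (not part of the patch): after transpiling, the pserver-side program carries the hard-coded checkpoint directory as an attribute of its listen_and_serv op. A minimal inspection sketch using the transpiler's existing Python API, assuming a fluid model with an optimizer has already been built as in the usual transpiler flow; the endpoint value is illustrative:

    import paddle.fluid as fluid

    t = fluid.DistributeTranspiler()
    t.transpile(trainer_id=0, pservers="127.0.0.1:6174", trainers=1)
    pserver_prog = t.get_pserver_program("127.0.0.1:6174")

    for op in pserver_prog.global_block().ops:
        if op.type == "listen_and_serv":
            # hard-coded in the transpiler by this patch
            print(op.attr("Checkpoint"))  # -> "/tmp/tangwei_ckpt/"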

From 2a05b3d5a3e8f8e58d01eebc2c0826e61c15c5dd Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Fri, 11 May 2018 16:23:30 +0800
Subject: [PATCH 05/56] delete checkpoint function

---
 paddle/fluid/operators/detail/checkpoint.cc   | 54 -------------------
 paddle/fluid/operators/detail/checkpoint.h    | 33 ------------
 paddle/fluid/operators/listen_and_serv_op.cc  | 12 -----
 .../fluid/transpiler/distribute_transpiler.py |  3 +-
 4 files changed, 1 insertion(+), 101 deletions(-)
 delete mode 100644 paddle/fluid/operators/detail/checkpoint.cc
 delete mode 100644 paddle/fluid/operators/detail/checkpoint.h

diff --git a/paddle/fluid/operators/detail/checkpoint.cc b/paddle/fluid/operators/detail/checkpoint.cc
deleted file mode 100644
index 38e46532e6e18b..00000000000000
--- a/paddle/fluid/operators/detail/checkpoint.cc
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/operators/detail/checkpoint.h"
-
-#include <string>
-
-namespace paddle {
-namespace framework {
-namespace details {
-Checkpoint::Save(const framework::Scope& scope, const platform::Place& place,
-                 const std::string& save_dir, const std::string& var_name,
-                 const bool& overwrite) {
-  auto* var = scope.FindVar(var_name);
-  PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s for save_op",
-                 var_name);
-  PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
-                 "Checkpoint only supports LoDTensor, %s has wrong type",
-                 var_name);
-
-  bool is_present = FileExists(save_dir);
-  if (is_present && !overwrite) {
-    PADDLE_THROW("%s exists!, checkpoint cannot write  it when overwrite=false",
-                 save_dir, overwrite);
-  }
-
-  MkDirRecursively(DirName(save_dir).c_str());
-  std::ofstream fout(save_dir);
-  PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write", save_dir);
-
-  // get device context from pool
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto& dev_ctx = *pool.Get(place);
-
-  auto& tensor = var->Get<framework::LoDTensor>();
-  // Serialize tensor
-  framework::SerializeToStream(fout, tensor, dev_ctx);
-  fout.close();
-}
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detail/checkpoint.h b/paddle/fluid/operators/detail/checkpoint.h
deleted file mode 100644
index dfa41979734ec3..00000000000000
--- a/paddle/fluid/operators/detail/checkpoint.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-
-#include "paddle/fluid/framework/scope.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-class Checkpoint {
- public:
-  static void Save(const framework::Scope& scope, const platform::Place& place,
-                   const std::string& save_dir, const std::string& var_name,
-                   const bool& overwrite);
-
-  static void Load();
-}
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 7fb7f07a610a7c..8a3d747f86cf60 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -101,7 +101,6 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor,
                                   framework::Scope *recv_scope,
                                   framework::BlockDesc *prefetch_block) const {
   auto fan_in = Attr<int>("Fanin");
-  auto checkpoint = Attr<std::string>("Checkpoint");
 
   size_t num_blocks = program->Size();
   PADDLE_ENFORCE_GE(num_blocks, 2,
@@ -190,17 +189,6 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor,
       var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
     }
 
-    /******************** CHECK POINT ***********************/
-    std::vector<std::string> all_vars = recv_scope.LocalVarNames();
-
-    std::vector<std::string>::iterator it;
-    for (it = all_vars.begin(); it != all_vars.end(); it++) {
-      VLOG(2) << "Checkpoint Var: " << *it;
-      break;
-    }
-
-    /******************** CHECK POINT ***********************/
-
     rpc_service_->SetCond(1);
     // FIXME(typhoonzero): use another condition to sync wait clients get.
     rpc_service_->WaitClientGet(fan_in);
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 635763ed35270d..b45cb987d896bd 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -483,8 +483,7 @@ def __append_optimize_op__(op, block, grad_to_block_id):
                 "Fanin": self.trainer_num,
                 "PrefetchBlock": prefetch_block,
                 "sync_mode": self.sync_mode,
-                "grad_to_block_id": grad_to_block_id,
-                "Checkpoint": "/tmp/tangwei_ckpt/"
+                "grad_to_block_id": grad_to_block_id
             })
 
         pserver_program.sync_with_cpp()

From 87a08563841715806972398dfeb7770d2b69d30b Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Fri, 11 May 2018 16:24:08 +0800
Subject: [PATCH 06/56] add checkpoint save op

---
 paddle/fluid/operators/cpkt_save_op.cc | 158 +++++++++++++++++++++++++
 1 file changed, 158 insertions(+)
 create mode 100644 paddle/fluid/operators/cpkt_save_op.cc

diff --git a/paddle/fluid/operators/cpkt_save_op.cc b/paddle/fluid/operators/cpkt_save_op.cc
new file mode 100644
index 00000000000000..352bd3350796d8
--- /dev/null
+++ b/paddle/fluid/operators/cpkt_save_op.cc
@@ -0,0 +1,158 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <stdint.h>
+#include <sys/stat.h>
+#include <fstream>
+#include <numeric>
+#include <sstream>
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/data_type_transform.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+
+// TODO(sidgoyal78): These functions are needed by other files (save_op), move
+// them to paddle::filesystem namespace. (as noted by yuyang18 in save_op).
+constexpr char kSEP = '/';
+static bool FileExists(const std::string &filepath) {
+  struct stat buffer;
+  return (stat(filepath.c_str(), &buffer) == 0);
+}
+
+static std::string DirName(const std::string &filepath) {
+  auto pos = filepath.rfind(kSEP);
+  if (pos == std::string::npos) {
+    return "";
+  }
+  return filepath.substr(0, pos);
+}
+
+static void MkDir(const char *path) {
+  if (mkdir(path, 0755)) {
+    PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path);
+  }
+}
+
+static void MkDirRecursively(const char *fullpath) {
+  if (*fullpath == '\0') return;  // empty string
+  if (FileExists(fullpath)) return;
+
+  MkDirRecursively(DirName(fullpath).c_str());
+  MkDir(fullpath);
+}
+
+class CkptSaveOp : public framework::OperatorBase {
+ public:
+  CkptSaveOp(const std::string &type, const framework::VariableNameMap &inputs,
+             const framework::VariableNameMap &outputs,
+             const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto filename = Attr<std::string>("file_path");
+    auto overwrite = Attr<bool>("overwrite");
+
+    bool is_present = FileExists(filename);
+    if (is_present && !overwrite) {
+      PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false",
+                   filename, overwrite);
+    }
+
+    MkDirRecursively(DirName(filename).c_str());
+    std::ofstream fout(filename);
+    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
+                   filename);
+
+    auto inp_var_names = Inputs("X");
+    PADDLE_ENFORCE_GT(static_cast<int>(inp_var_names.size()), 0,
+                      "The number of input variables should be greater than 0");
+
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
+    for (size_t i = 0; i < inp_var_names.size(); i++) {
+      auto *var = scope.FindVar(inp_var_names[i]);
+
+      PADDLE_ENFORCE(var != nullptr,
+                     "Cannot find variable %s for save_combine_op",
+                     inp_var_names[i]);
+      PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
+                     "SaveCombineOp only supports LoDTensor, %s has wrong type",
+                     inp_var_names[i]);
+
+      auto &tensor = var->Get<framework::LoDTensor>();
+      // Serialize tensors one by one
+
+      // Check types to see if a fp16 transformation is required
+      auto in_dtype = framework::ToDataType(tensor.type());
+      auto out_dtype = in_dtype;
+
+      if (in_dtype != out_dtype) {
+        auto in_kernel_type = framework::OpKernelType(in_dtype, place);
+        auto out_kernel_type = framework::OpKernelType(out_dtype, place);
+        framework::LoDTensor out;
+        // copy LoD info to the new tensor
+        out.set_lod(tensor.lod());
+        framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
+        framework::SerializeToStream(fout, out, dev_ctx);
+      } else {
+        framework::SerializeToStream(fout, tensor, dev_ctx);
+      }
+    }
+    fout.close();
+  }
+};
+
+class CkptSaveOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CkptSaveOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "(vector) Input LoDTensors that need to be saved together in a file.")
+        .AsDuplicable();
+    AddComment(R"DOC(
+SaveCombine operator
+
+This operator will serialize and write a list of input LoDTensor variables 
+to a file on disk.
+)DOC");
+    AddAttr<bool>("overwrite",
+                  "(boolean, default true)"
+                  "Overwrite the output file if it exists.")
+        .SetDefault(true);
+
+    AddAttr<std::string>(
+        "file_path",
+        "(string)"
+        "The \"file_path\" where the LoDTensor variables will be saved.")
+        .AddCustomChecker(
+            [](const std::string &path) { return !path.empty(); });
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(ckpt_save, ops::CkptSaveOp, ops::CkptSaveOpProtoMaker);

From dc534fc19525b2671a9620863daa7ace47a37c00 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Fri, 11 May 2018 16:44:10 +0800
Subject: [PATCH 07/56] add checkpoint save op test

---
 paddle/fluid/operators/cpkt_save_op_test.cc | 44 +++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 paddle/fluid/operators/cpkt_save_op_test.cc

diff --git a/paddle/fluid/operators/cpkt_save_op_test.cc b/paddle/fluid/operators/cpkt_save_op_test.cc
new file mode 100644
index 00000000000000..3e620a0e9cbbd7
--- /dev/null
+++ b/paddle/fluid/operators/cpkt_save_op_test.cc
@@ -0,0 +1,44 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+TEST(CkptSaveOp, CPU) {
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace place;
+
+  auto var = scope.Var("test_var");
+  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
+  tensor->Resize({3, 10});
+  paddle::framework::LoD expect_lod;
+  expect_lod.resize(1);
+  expect_lod[0].push_back(0);
+  expect_lod[0].push_back(1);
+  expect_lod[0].push_back(2);
+  expect_lod[0].push_back(3);
+
+  tensor->set_lod(expect_lod);
+  float* expect = tensor->mutable_data<float>(place);
+  for (int64_t i = 0; i < tensor->numel(); ++i) {
+    expect[i] = static_cast<float>(paddle::platform::float16(i));
+  }
+
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"file_path", std::string("tensor.save")});
+
+  auto save_op = paddle::framework::OpRegistry::CreateOp(
+      "ckpt_save", {{"X", {"test_var"}}}, {}, attrs);
+  save_op->Run(scope, place);
+}

From 802d10cf53c693a6fe551a9d007ce988fe89ccab Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Fri, 11 May 2018 19:10:04 +0800
Subject: [PATCH 08/56] rename cpkt_save_op

---
 paddle/fluid/operators/{cpkt_save_op.cc => ckpt_save_op.cc}       | 0
 .../operators/{cpkt_save_op_test.cc => ckpt_save_op_test.cc}      | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename paddle/fluid/operators/{cpkt_save_op.cc => ckpt_save_op.cc} (100%)
 rename paddle/fluid/operators/{cpkt_save_op_test.cc => ckpt_save_op_test.cc} (100%)

diff --git a/paddle/fluid/operators/cpkt_save_op.cc b/paddle/fluid/operators/ckpt_save_op.cc
similarity index 100%
rename from paddle/fluid/operators/cpkt_save_op.cc
rename to paddle/fluid/operators/ckpt_save_op.cc
diff --git a/paddle/fluid/operators/cpkt_save_op_test.cc b/paddle/fluid/operators/ckpt_save_op_test.cc
similarity index 100%
rename from paddle/fluid/operators/cpkt_save_op_test.cc
rename to paddle/fluid/operators/ckpt_save_op_test.cc

From d1bd3fdefc9ec5a2c8d3746ab833dabd9f841948 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Fri, 11 May 2018 19:10:24 +0800
Subject: [PATCH 09/56] add build and test make

---
 paddle/fluid/operators/CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 256aded8ca234a..a6c7690d6b7c7d 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -229,6 +229,7 @@ op_library(recurrent_op DEPS executor)
 op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 op_library(cos_sim_op DEPS cos_sim_functor)
 op_library(parallel_do_op DEPS executor)
+op_library(ckpt_save_op DEPS lod_tensor)
 
 if (WITH_GPU)
     op_library(conv_op DEPS vol2col depthwise_conv im2col)
@@ -277,5 +278,6 @@ cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_sea
 cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
 cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
+cc_test(ckpt_save_op_test SRCS ckpt_save_op_test.cc DEPS ckpt_save_op)
 nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
 nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)

From 5e74db3f2a1872b9433ec0348092f150f727359c Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Fri, 11 May 2018 21:38:49 +0800
Subject: [PATCH 10/56] add build and test make

---
 paddle/fluid/operators/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index a6c7690d6b7c7d..948ce79da7db56 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -229,7 +229,6 @@ op_library(recurrent_op DEPS executor)
 op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 op_library(cos_sim_op DEPS cos_sim_functor)
 op_library(parallel_do_op DEPS executor)
-op_library(ckpt_save_op DEPS lod_tensor)
 
 if (WITH_GPU)
     op_library(conv_op DEPS vol2col depthwise_conv im2col)
@@ -243,6 +242,7 @@ op_library(save_op DEPS lod_tensor)
 op_library(load_op DEPS lod_tensor)
 op_library(save_combine_op DEPS lod_tensor)
 op_library(load_combine_op DEPS lod_tensor)
+op_library(ckpt_save_op DEPS lod_tensor)
 op_library(concat_op DEPS concat)
 
 # FIXME(thuan): Move CSP operators to paddle/fluid/framework/operators/concurrency

From a1419f1062826167313e6ff68f894eb00fe1f34f Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Fri, 11 May 2018 22:54:54 +0800
Subject: [PATCH 11/56] add op declaration to test

---
 paddle/fluid/operators/ckpt_save_op_test.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/paddle/fluid/operators/ckpt_save_op_test.cc b/paddle/fluid/operators/ckpt_save_op_test.cc
index 3e620a0e9cbbd7..f8616ef53ce1a6 100644
--- a/paddle/fluid/operators/ckpt_save_op_test.cc
+++ b/paddle/fluid/operators/ckpt_save_op_test.cc
@@ -15,6 +15,8 @@ limitations under the License. */
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
 
+USE_NO_KERNEL_OP(ckpt_save)
+
 TEST(CkptSaveOp, CPU) {
   paddle::framework::Scope scope;
   paddle::platform::CPUPlace place;

From 461d2fc0d7ef3ddfc2bcb47561facb43929ecd56 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Mon, 14 May 2018 15:21:08 +0800
Subject: [PATCH 12/56] rename ckpt -> checkpoint

---
 paddle/fluid/operators/CMakeLists.txt            |  4 ++--
 ...op_test.cc => che'ck'po'in't_save_op_test.cc} |  6 +++---
 .../{ckpt_save_op.cc => checkpoint_save_op.cc}   | 16 +++++++++-------
 3 files changed, 14 insertions(+), 12 deletions(-)
 rename paddle/fluid/operators/{ckpt_save_op_test.cc => che'ck'po'in't_save_op_test.cc} (92%)
 rename paddle/fluid/operators/{ckpt_save_op.cc => checkpoint_save_op.cc} (90%)

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 948ce79da7db56..34ec82c294b602 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -242,7 +242,7 @@ op_library(save_op DEPS lod_tensor)
 op_library(load_op DEPS lod_tensor)
 op_library(save_combine_op DEPS lod_tensor)
 op_library(load_combine_op DEPS lod_tensor)
-op_library(ckpt_save_op DEPS lod_tensor)
+op_library(checkpoint_save_op DEPS lod_tensor)
 op_library(concat_op DEPS concat)
 
 # FIXME(thuan): Move CSP operators to paddle/fluid/framework/operators/concurrency
@@ -278,6 +278,6 @@ cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_sea
 cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
 cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
-cc_test(ckpt_save_op_test SRCS ckpt_save_op_test.cc DEPS ckpt_save_op)
+cc_test(checkpoint_save_op_test SRCS checkpoint_save_op_test.cc DEPS checkpoint_save_op)
 nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
 nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)
diff --git a/paddle/fluid/operators/ckpt_save_op_test.cc b/paddle/fluid/operators/che'ck'po'in't_save_op_test.cc
similarity index 92%
rename from paddle/fluid/operators/ckpt_save_op_test.cc
rename to paddle/fluid/operators/che'ck'po'in't_save_op_test.cc
index f8616ef53ce1a6..b49bbd1a58f2c1 100644
--- a/paddle/fluid/operators/ckpt_save_op_test.cc
+++ b/paddle/fluid/operators/che'ck'po'in't_save_op_test.cc
@@ -15,9 +15,9 @@ limitations under the License. */
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
 
-USE_NO_KERNEL_OP(ckpt_save)
+USE_NO_KERNEL_OP(checkpoint_save)
 
-TEST(CkptSaveOp, CPU) {
+TEST(CheckpointSaveOp, CPU) {
   paddle::framework::Scope scope;
   paddle::platform::CPUPlace place;
 
@@ -41,6 +41,6 @@ TEST(CkptSaveOp, CPU) {
   attrs.insert({"file_path", std::string("tensor.save")});
 
   auto save_op = paddle::framework::OpRegistry::CreateOp(
-      "ckpt_save", {{"X", {"test_var"}}}, {}, attrs);
+      "checkpoint_save", {{"X", {"test_var"}}}, {}, attrs);
   save_op->Run(scope, place);
 }
diff --git a/paddle/fluid/operators/ckpt_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc
similarity index 90%
rename from paddle/fluid/operators/ckpt_save_op.cc
rename to paddle/fluid/operators/checkpoint_save_op.cc
index 352bd3350796d8..2462ec09d6b865 100644
--- a/paddle/fluid/operators/ckpt_save_op.cc
+++ b/paddle/fluid/operators/checkpoint_save_op.cc
@@ -57,11 +57,12 @@ static void MkDirRecursively(const char *fullpath) {
   MkDir(fullpath);
 }
 
-class CkptSaveOp : public framework::OperatorBase {
+class CheckpointSaveOp : public framework::OperatorBase {
  public:
-  CkptSaveOp(const std::string &type, const framework::VariableNameMap &inputs,
-             const framework::VariableNameMap &outputs,
-             const framework::AttributeMap &attrs)
+  CheckpointSaveOp(const std::string &type,
+                   const framework::VariableNameMap &inputs,
+                   const framework::VariableNameMap &outputs,
+                   const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
  private:
@@ -122,9 +123,9 @@ class CkptSaveOp : public framework::OperatorBase {
   }
 };
 
-class CkptSaveOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+class CheckpointSaveOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CkptSaveOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+  CheckpointSaveOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput(
         "X",
@@ -155,4 +156,5 @@ to a file on disk.
 
 namespace ops = paddle::operators;
 
-REGISTER_OPERATOR(ckpt_save, ops::CkptSaveOp, ops::CkptSaveOpProtoMaker);
+REGISTER_OPERATOR(checkpoint_save, ops::CheckpointSaveOp,
+                  ops::CheckpointSaveOpProtoMaker);

From 2f4c039e6218c68f6047c6ef8f1ba23431689e68 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Mon, 14 May 2018 21:36:34 +0800
Subject: [PATCH 13/56] rename, modify ckpt structure

---
 paddle/fluid/operators/checkpoint_save_op.cc  | 34 ++++++-------------
 ..._op_test.cc => checkpoint_save_op_test.cc} |  2 +-
 .../fluid/transpiler/distribute_transpiler.py | 12 +++++++
 3 files changed, 24 insertions(+), 24 deletions(-)
 rename paddle/fluid/operators/{che'ck'po'in't_save_op_test.cc => checkpoint_save_op_test.cc} (96%)

diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc
index 2462ec09d6b865..1e621a00e5028c 100644
--- a/paddle/fluid/operators/checkpoint_save_op.cc
+++ b/paddle/fluid/operators/checkpoint_save_op.cc
@@ -68,19 +68,16 @@ class CheckpointSaveOp : public framework::OperatorBase {
  private:
   void RunImpl(const framework::Scope &scope,
                const platform::Place &place) const override {
-    auto filename = Attr<std::string>("file_path");
+    auto dir = Attr<std::string>("dir");
     auto overwrite = Attr<bool>("overwrite");
 
-    bool is_present = FileExists(filename);
+    bool is_present = FileExists(dir);
     if (is_present && !overwrite) {
       PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false",
-                   filename, overwrite);
+                   dir, overwrite);
     }
 
-    MkDirRecursively(DirName(filename).c_str());
-    std::ofstream fout(filename);
-    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
-                   filename);
+    MkDirRecursively(dir.c_str());
 
     auto inp_var_names = Inputs("X");
     PADDLE_ENFORCE_GT(static_cast<int>(inp_var_names.size()), 0,
@@ -92,6 +89,10 @@ class CheckpointSaveOp : public framework::OperatorBase {
 
     for (size_t i = 0; i < inp_var_names.size(); i++) {
       auto *var = scope.FindVar(inp_var_names[i]);
+      std::string var_file;
+      var_file.append(dir);
+      var_file.append("/");
+      var_file.append(inp_var_names[i]);
 
       PADDLE_ENFORCE(var != nullptr,
                      "Cannot find variable %s for save_combine_op",
@@ -103,23 +104,10 @@ class CheckpointSaveOp : public framework::OperatorBase {
       auto &tensor = var->Get<framework::LoDTensor>();
       // Serialize tensors one by one
 
-      // Check types to see if a fp16 transformation is required
-      auto in_dtype = framework::ToDataType(tensor.type());
-      auto out_dtype = in_dtype;
-
-      if (in_dtype != out_dtype) {
-        auto in_kernel_type = framework::OpKernelType(in_dtype, place);
-        auto out_kernel_type = framework::OpKernelType(out_dtype, place);
-        framework::LoDTensor out;
-        // copy LoD info to the new tensor
-        out.set_lod(tensor.lod());
-        framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
-        framework::SerializeToStream(fout, out, dev_ctx);
-      } else {
-        framework::SerializeToStream(fout, tensor, dev_ctx);
-      }
+      std::ofstream fout(var_file);
+      framework::SerializeToStream(fout, tensor, dev_ctx);
+      fout.close();
     }
-    fout.close();
   }
 };
 
diff --git a/paddle/fluid/operators/che'ck'po'in't_save_op_test.cc b/paddle/fluid/operators/checkpoint_save_op_test.cc
similarity index 96%
rename from paddle/fluid/operators/che'ck'po'in't_save_op_test.cc
rename to paddle/fluid/operators/checkpoint_save_op_test.cc
index b49bbd1a58f2c1..7b5aa7bcde16ea 100644
--- a/paddle/fluid/operators/che'ck'po'in't_save_op_test.cc
+++ b/paddle/fluid/operators/checkpoint_save_op_test.cc
@@ -38,7 +38,7 @@ TEST(CheckpointSaveOp, CPU) {
   }
 
   paddle::framework::AttributeMap attrs;
-  attrs.insert({"file_path", std::string("tensor.save")});
+  attrs.insert({"dir", std::string("tensor/ckpt")});
 
   auto save_op = paddle::framework::OpRegistry::CreateOp(
       "checkpoint_save", {{"X", {"test_var"}}}, {}, attrs);
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index b45cb987d896bd..b76f8de5040102 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -207,6 +207,11 @@ def transpile(self,
         self.pserver_endpoints = pserver_endpoints
         self.optimize_ops, params_grads = self._get_optimize_pass()
 
+        # is_chief (trainer no.0) for checkpoint
+        # trainer no.0 will save all variables and its own reader offset to the checkpoint
+        # other trainers will only save their own reader offsets to the checkpoint
+        self.is_chief = trainer_id == 0
+
         # process lookup_table_op
         # 1. check all lookup_table_op is distributed
         # 2. check all lookup_table_op share the same table.
@@ -309,6 +314,13 @@ def transpile(self,
                 "epmap": eplist,
                 "sync_mode": self.sync_mode
             })
+
+        program.global_block().append_op(
+            type="checkpoint_save",
+            inputs={"X": send_outputs},
+            attrs={"overwrite": True,
+                   "file_path": "/workspace/ckpt/"})
+
         # step4: Concat the parameters splits together after recv.
         for varname, splited_var in param_var_mapping.iteritems():
             if len(splited_var) <= 1:
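
With this step the trainer program gets a checkpoint_save op appended right after the recv ops. A small sketch (not part of the patch) of inspecting the result, assuming a model with an optimizer was built on the default program, transpile() was called, and no parameter was split so no concat ops follow; note the attribute is still named "file_path" here and is renamed to "dir" in the next patch:

    import paddle.fluid as fluid

    trainer_prog = fluid.default_main_program()
    last_op = trainer_prog.global_block().ops[-1]
    print(last_op.type)               # -> "checkpoint_save"
    print(last_op.attr("file_path"))  # -> "/workspace/ckpt/" (hard-coded here)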

From 38596cfb1e3b034bd26e68e97f3291dbbdea3de0 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Mon, 14 May 2018 21:37:09 +0800
Subject: [PATCH 14/56] move file_path to dir

---
 python/paddle/fluid/transpiler/distribute_transpiler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index b76f8de5040102..6366ba8a58558c 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -319,7 +319,7 @@ def transpile(self,
             type="checkpoint_save",
             inputs={"X": send_outputs},
             attrs={"overwrite": True,
-                   "file_path": "/workspace/ckpt/"})
+                   "dir": "/workspace/ckpt/"})
 
         # step4: Concat the parameters splits together after recv.
         for varname, splited_var in param_var_mapping.iteritems():

From ce1bcc947f5d036dad34fabcc854531cb63cbc25 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Mon, 14 May 2018 23:11:23 +0800
Subject: [PATCH 15/56] add op to framework.py

---
 python/paddle/fluid/framework.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 28e54f5492e7b0..46122635404771 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -489,7 +489,7 @@ def find_name(var_list, name):
             'rnn_memory_helper_grad', 'conditional_block', 'while', 'send',
             'recv', 'listen_and_serv', 'parallel_do', 'save_combine',
             'load_combine', 'ncclInit', 'channel_create', 'channel_close',
-            'channel_send', 'channel_recv', 'select'
+            'channel_send', 'channel_recv', 'select', 'checkpoint_save'
         }
         if type not in no_kernel_op_set:
             self.desc.infer_var_type(self.block.desc)

From 3c820064defc0ef2e24439f3674b7d1f34269436 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Mon, 14 May 2018 23:14:06 +0800
Subject: [PATCH 16/56] remove overwrite check to test load

---
 paddle/fluid/operators/checkpoint_save_op.cc | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc
index 1e621a00e5028c..94a1cc05c76a33 100644
--- a/paddle/fluid/operators/checkpoint_save_op.cc
+++ b/paddle/fluid/operators/checkpoint_save_op.cc
@@ -30,6 +30,9 @@ namespace operators {
 // TODO(sidgoyal78): These function are needed by other files (save_op), move
 // them to paddle::filesystem namespace. (as noted by yuyang18 in save_op).
 constexpr char kSEP = '/';
+// write empty file named _SUCCESS
+const char SUCCESS[] = "_SUCCESS";
+
 static bool FileExists(const std::string &filepath) {
   struct stat buffer;
   return (stat(filepath.c_str(), &buffer) == 0);
@@ -73,8 +76,11 @@ class CheckpointSaveOp : public framework::OperatorBase {
 
     bool is_present = FileExists(dir);
     if (is_present && !overwrite) {
-      PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false",
-                   dir, overwrite);
+      return;
+      // TODO(tangwei): check whether the folder exists
+      // PADDLE_THROW("%s exists!, cannot save_combine to it when
+      // overwrite=false",
+      //              dir, overwrite);
     }
 
     MkDirRecursively(dir.c_str());
@@ -108,6 +114,13 @@ class CheckpointSaveOp : public framework::OperatorBase {
       framework::SerializeToStream(fout, tensor, dev_ctx);
       fout.close();
     }
+
+    std::string success;
+    success.append(dir);
+    success.append("/");
+    success.append(SUCCESS);
+    std::ofstream fout(success);
+    fout.close();
   }
 };
 

From f04b23adf96651185bd0b47d90f8b5f1fee77706 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Tue, 15 May 2018 16:13:41 +0800
Subject: [PATCH 17/56] add checkpoint_load, update checkpoint save

---
 paddle/fluid/operators/CMakeLists.txt         |  3 +-
 paddle/fluid/operators/checkpoint_load_op.cc  | 87 +++++++++++++++++++
 ..._save_op_test.cc => checkpoint_op_test.cc} |  0
 paddle/fluid/operators/checkpoint_save_op.cc  | 21 +++--
 4 files changed, 103 insertions(+), 8 deletions(-)
 create mode 100644 paddle/fluid/operators/checkpoint_load_op.cc
 rename paddle/fluid/operators/{checkpoint_save_op_test.cc => checkpoint_op_test.cc} (100%)

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 34ec82c294b602..df0292d902f2af 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -243,6 +243,7 @@ op_library(load_op DEPS lod_tensor)
 op_library(save_combine_op DEPS lod_tensor)
 op_library(load_combine_op DEPS lod_tensor)
 op_library(checkpoint_save_op DEPS lod_tensor)
+op_library(checkpoint_load_op DEPS lod_tensor)
 op_library(concat_op DEPS concat)
 
 # FIXME(thuan): Move CSP operators to paddle/fluid/framework/operators/concurrency
@@ -278,6 +279,6 @@ cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_sea
 cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
 cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
-cc_test(checkpoint_save_op_test SRCS checkpoint_save_op_test.cc DEPS checkpoint_save_op)
+cc_test(checkpoint_op_test SRCS checkpoint_op_test.cc DEPS checkpoint_save_op checkpoint_load_op)
 nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
 nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)
diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc
new file mode 100644
index 00000000000000..b2ca59f2b5b5bf
--- /dev/null
+++ b/paddle/fluid/operators/checkpoint_load_op.cc
@@ -0,0 +1,87 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <stdint.h>
+#include <sys/stat.h>
+#include <fstream>
+#include <numeric>
+#include <sstream>
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/data_type_transform.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+
+constexpr char kSEP = '/';
+// write empty file named _SUCCESS
+const char SUCCESS[] = "_SUCCESS";
+
+static bool FileExists(const std::string &filepath) {
+  struct stat buffer;
+  return (stat(filepath.c_str(), &buffer) == 0);
+}
+
+static std::string DirName(const std::string &filepath) {
+  auto pos = filepath.rfind(kSEP);
+  if (pos == std::string::npos) {
+    return "";
+  }
+  return filepath.substr(0, pos);
+}
+
+class CheckpointLoadOp : public framework::OperatorBase {
+ public:
+  CheckpointLoadOp(const std::string &type,
+                   const framework::VariableNameMap &inputs,
+                   const framework::VariableNameMap &outputs,
+                   const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto dir = Attr<std::string>("dir");
+    bool is_present = FileExists(dir);
+    if (!is_present) {
+      return;
+    }
+
+    // UPDATE LATER ...
+  }
+};
+
+class CheckpointLoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CheckpointLoadOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddAttr<std::string>(
+        "dir",
+        "(string)"
+        "The \"file_path\" where the LoDTensor variables will be saved.")
+        .AddCustomChecker(
+            [](const std::string &path) { return !path.empty(); });
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(checkpoint_load, ops::CheckpointLoadOp,
+                  ops::CheckpointLoadOpProtoMaker);
diff --git a/paddle/fluid/operators/checkpoint_save_op_test.cc b/paddle/fluid/operators/checkpoint_op_test.cc
similarity index 100%
rename from paddle/fluid/operators/checkpoint_save_op_test.cc
rename to paddle/fluid/operators/checkpoint_op_test.cc
diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc
index 94a1cc05c76a33..7007ab9e1a1a72 100644
--- a/paddle/fluid/operators/checkpoint_save_op.cc
+++ b/paddle/fluid/operators/checkpoint_save_op.cc
@@ -27,8 +27,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-// TODO(sidgoyal78): These functions are needed by other files (save_op), move
-// them to paddle::filesystem namespace. (as noted by yuyang18 in save_op).
 constexpr char kSEP = '/';
 // write empty file named _SUCCESS
 const char SUCCESS[] = "_SUCCESS";
@@ -82,7 +80,14 @@ class CheckpointSaveOp : public framework::OperatorBase {
       // overwrite=false",
       //              dir, overwrite);
     }
+    MkDirRecursively(dir.c_str());
 
+    auto serial_var_name = Output("Serial");
+    auto *serial_var = scope.FindVar(serial_var_name);
+    std::string *serial_num = serial_var->GetMutable<std::string>();
+    serial_num->append("0");
+    dir.append("/");
+    dir.append(serial_num);
     MkDirRecursively(dir.c_str());
 
     auto inp_var_names = Inputs("X");
@@ -93,6 +98,7 @@ class CheckpointSaveOp : public framework::OperatorBase {
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(place);
 
+    // TODO(tangwei): make this async
     for (size_t i = 0; i < inp_var_names.size(); i++) {
       auto *var = scope.FindVar(inp_var_names[i]);
       std::string var_file;
@@ -132,19 +138,20 @@ class CheckpointSaveOpProtoMaker : public framework::OpProtoAndCheckerMaker {
         "X",
         "(vector) Input LoDTensors that need to be saved together in a file.")
         .AsDuplicable();
+    AddOutput("Serial", "the serial number");
     AddComment(R"DOC(
-SaveCombine operator
+CheckpointSave operator
 
 This operator will serialize and write a list of input LoDTensor variables 
 to a file on disk.
 )DOC");
     AddAttr<bool>("overwrite",
-                  "(boolean, default true)"
-                  "Overwrite the output file if it exists.")
-        .SetDefault(true);
+                  "(boolean, default false)"
+                  "Delete the output dir if it exists.")
+        .SetDefault(false);
 
     AddAttr<std::string>(
-        "file_path",
+        "dir",
         "(string)"
         "The \"file_path\" where the LoDTensor variables will be saved.")
         .AddCustomChecker(
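
After this step the save op also produces a "Serial" output (a raw string variable holding the checkpoint serial number; the code currently just appends "0") and writes everything into <dir>/<serial>/<var_name>. A minimal wiring sketch from Python, not part of the patch; the variable names and directory are illustrative and nothing is executed here:

    import paddle.fluid as fluid
    from paddle.fluid import core

    prog = fluid.Program()
    block = prog.global_block()

    # raw variable that will receive the checkpoint serial number
    serial_var = block.create_var(
        name="SERIAL_NUMBER", persistable=True, type=core.VarDesc.VarType.RAW)
    weight = block.create_var(
        name="fc_0.w_0", shape=[13, 1], dtype="float32", persistable=True)

    block.append_op(
        type="checkpoint_save",
        inputs={"X": [weight]},
        outputs={"Serial": serial_var},
        attrs={"overwrite": False, "dir": "/tmp/ckpt_demo"})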

From c80125f286fb641472b62a51c6f350e00e904519 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Tue, 15 May 2018 17:16:17 +0800
Subject: [PATCH 18/56] add checkpoint_load to python framework

---
 python/paddle/fluid/framework.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 46122635404771..6ab31ec9463b8e 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -489,7 +489,8 @@ def find_name(var_list, name):
             'rnn_memory_helper_grad', 'conditional_block', 'while', 'send',
             'recv', 'listen_and_serv', 'parallel_do', 'save_combine',
             'load_combine', 'ncclInit', 'channel_create', 'channel_close',
-            'channel_send', 'channel_recv', 'select', 'checkpoint_save'
+            'channel_send', 'channel_recv', 'select', 'checkpoint_save',
+            'checkpoint_load'
         }
         if type not in no_kernel_op_set:
             self.desc.infer_var_type(self.block.desc)

From 2e25e739f33189002c8aea56a5180666794e5dcc Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Tue, 15 May 2018 17:17:14 +0800
Subject: [PATCH 19/56] write simple checkpoint_load code

---
 paddle/fluid/operators/checkpoint_load_op.cc |  8 -------
 paddle/fluid/operators/checkpoint_op_test.cc | 22 ++++++++++++++++++--
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc
index b2ca59f2b5b5bf..8edf3b6429dbdd 100644
--- a/paddle/fluid/operators/checkpoint_load_op.cc
+++ b/paddle/fluid/operators/checkpoint_load_op.cc
@@ -36,14 +36,6 @@ static bool FileExists(const std::string &filepath) {
   return (stat(filepath.c_str(), &buffer) == 0);
 }
 
-static std::string DirName(const std::string &filepath) {
-  auto pos = filepath.rfind(kSEP);
-  if (pos == std::string::npos) {
-    return "";
-  }
-  return filepath.substr(0, pos);
-}
-
 class CheckpointLoadOp : public framework::OperatorBase {
  public:
   CheckpointLoadOp(const std::string &type,
diff --git a/paddle/fluid/operators/checkpoint_op_test.cc b/paddle/fluid/operators/checkpoint_op_test.cc
index 7b5aa7bcde16ea..1445d9f9acffc9 100644
--- a/paddle/fluid/operators/checkpoint_op_test.cc
+++ b/paddle/fluid/operators/checkpoint_op_test.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 
 USE_NO_KERNEL_OP(checkpoint_save)
+USE_NO_KERNEL_OP(checkpoint_load)
 
 TEST(CheckpointSaveOp, CPU) {
   paddle::framework::Scope scope;
@@ -37,10 +38,27 @@ TEST(CheckpointSaveOp, CPU) {
     expect[i] = static_cast<float>(paddle::platform::float16(i));
   }
 
+  scope.Var("SERIAL_NUMBER");
+
   paddle::framework::AttributeMap attrs;
-  attrs.insert({"dir", std::string("tensor/ckpt")});
+  attrs.insert({"dir", std::string("ckpt")});
 
   auto save_op = paddle::framework::OpRegistry::CreateOp(
-      "checkpoint_save", {{"X", {"test_var"}}}, {}, attrs);
+      "checkpoint_save", {{"X", {"test_var"}}}, {{"Serial", {"SERIAL_NUMBER"}}},
+      attrs);
+  save_op->Run(scope, place);
+}
+
+TEST(CheckpointLoadOp, CPU) {
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace place;
+
+  scope.Var("test_var");
+
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"dir", std::string("ckpt")});
+
+  auto save_op =
+      paddle::framework::OpRegistry::CreateOp("checkpoint_load", {}, {}, attrs);
   save_op->Run(scope, place);
 }
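
Taken together, the tests above leave one serialized LoDTensor file per variable under a per-serial subdirectory, plus the _SUCCESS marker from the earlier patch; the load side is meant to key off that marker (it gets wired up later in the series). A plain-Python sketch of checking it; "ckpt" and serial "0" follow the test above and the value the op currently appends:

    import os

    step_dir = os.path.join("ckpt", "0")
    if os.path.isfile(os.path.join(step_dir, "_SUCCESS")):
        # one serialized LoDTensor file per saved variable, e.g. ckpt/0/test_var
        print(sorted(os.listdir(step_dir)))
    else:
        print("checkpoint at %s is incomplete" % step_dir)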

From 30b50dcf8cd07efedd3d99a36199f589b29a448a Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Tue, 15 May 2018 17:23:48 +0800
Subject: [PATCH 20/56] fix Serial output type

---
 paddle/fluid/operators/checkpoint_save_op.cc | 25 +++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc
index 7007ab9e1a1a72..7449352117b58a 100644
--- a/paddle/fluid/operators/checkpoint_save_op.cc
+++ b/paddle/fluid/operators/checkpoint_save_op.cc
@@ -87,7 +87,7 @@ class CheckpointSaveOp : public framework::OperatorBase {
     std::string *serial_num = serial_var->GetMutable<std::string>();
     serial_num->append("0");
     dir.append("/");
-    dir.append(serial_num);
+    dir.append(serial_num->c_str());
     MkDirRecursively(dir.c_str());
 
     auto inp_var_names = Inputs("X");
@@ -159,10 +159,29 @@ to a file on disk.
   }
 };
 
+class CheckpointSaveOpVarTypeInference : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
+    auto out_var_name = op_desc.Output("Serial").front();
+    auto &out_var = block->FindRecursiveOrCreateVar(out_var_name);
+    auto var_type = framework::proto::VarType::RAW;
+    out_var.SetType(var_type);
+  }
+};
+
+class CheckpointSaveOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {}
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 
-REGISTER_OPERATOR(checkpoint_save, ops::CheckpointSaveOp,
-                  ops::CheckpointSaveOpProtoMaker);
+REGISTER_OPERATOR(send_vars, ops::CheckpointSaveOp,
+                  paddle::framework::EmptyGradOpMaker,
+                  ops::CheckpointSaveOpProtoMaker,
+                  ops::CheckpointSaveOpVarTypeInference,
+                  ops::CheckpointSaveOpShapeInference);

From 0334d494406ff3fc0ac6e9a078ce17bee38a2fd6 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Tue, 15 May 2018 18:58:00 +0800
Subject: [PATCH 21/56] fix bug

---
 paddle/fluid/operators/checkpoint_load_op.cc | 7 +++++++
 paddle/fluid/operators/checkpoint_save_op.cc | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc
index 8edf3b6429dbdd..ec451c9f3f0cd2 100644
--- a/paddle/fluid/operators/checkpoint_load_op.cc
+++ b/paddle/fluid/operators/checkpoint_load_op.cc
@@ -61,6 +61,13 @@ class CheckpointLoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
   CheckpointLoadOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddComment(R"DOC(
+CheckpointLoad operator
+
+This operator will deserialize and load a list of LoDTensor variables
+from files on disk.
+)DOC");
+
     AddAttr<std::string>(
         "dir",
         "(string)"
diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc
index 7449352117b58a..1082bb4a345a2e 100644
--- a/paddle/fluid/operators/checkpoint_save_op.cc
+++ b/paddle/fluid/operators/checkpoint_save_op.cc
@@ -180,7 +180,7 @@ class CheckpointSaveOpShapeInference : public framework::InferShapeBase {
 
 namespace ops = paddle::operators;
 
-REGISTER_OPERATOR(send_vars, ops::CheckpointSaveOp,
+REGISTER_OPERATOR(checkpoint_save, ops::CheckpointSaveOp,
                   paddle::framework::EmptyGradOpMaker,
                   ops::CheckpointSaveOpProtoMaker,
                   ops::CheckpointSaveOpVarTypeInference,

From d081256cd541521c17c1e8f1988e02109582d2f2 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Wed, 16 May 2018 15:29:57 +0800
Subject: [PATCH 22/56] add api in distribute transpiler

---
 .../fluid/transpiler/distribute_transpiler.py | 36 +++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 6366ba8a58558c..104e2405322e96 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -315,10 +315,21 @@ def transpile(self,
                 "sync_mode": self.sync_mode
             })
 
+        serial_var = program.global_block().create_var(
+            name="SERIAL_NUMBER",
+            persistable=True,
+            type=core.VarDesc.VarType.RAW)
+
+        save_vars = []
+        for var in self.origin_program.list_vars():
+            if self.is_persistable(var):
+                save_vars.append(var.name)
+
         program.global_block().append_op(
             type="checkpoint_save",
-            inputs={"X": send_outputs},
-            attrs={"overwrite": True,
+            inputs={"X": save_vars},
+            outputs={"Serial": serial_var},
+            attrs={"overwrite": False,
                    "dir": "/workspace/ckpt/"})
 
         # step4: Concat the parameters splits together after recv.
@@ -501,6 +512,27 @@ def __append_optimize_op__(op, block, grad_to_block_id):
         pserver_program.sync_with_cpp()
         return pserver_program
 
+    def is_persistable(self, var):
+        if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
+                var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
+                var.desc.type() == core.VarDesc.VarType.RAW :
+            return False
+        return var.persistable
+
+    def get_train_startup_program(self, checkpoint_load_dir=None):
+        startup_prog = default_startup_program()
+
+        if not checkpoint_load_dir:
+            return startup_prog
+
+        for var in startup_prog.list_vars():
+            if self.is_persistable(var):
+                print("var: %s" % var.name)
+
+        startup_prog.global_block().append_op(
+            type="checkpoint_load", attrs={"dir": checkpoint_load_dir})
+        return startup_prog
+
     def get_startup_program(self, endpoint, pserver_program):
         """
         Get startup program for current parameter server.
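
For reference, the variable-selection rule wired up above can be read on its
own: feed/fetch and RAW variables are never checkpointed, and everything else
is saved only when it is persistable. A minimal stand-alone sketch in plain
Python (the integer constants below are stand-ins for core.VarDesc.VarType,
not the real enum):

    # Stand-ins for the relevant core.VarDesc.VarType values (illustrative only).
    FEED_MINIBATCH, FETCH_LIST, RAW, LOD_TENSOR = range(4)

    def is_checkpoint_var(var_type, persistable):
        """Mirror of is_persistable above: feed/fetch/RAW variables are never
        saved; anything else is saved only when it is persistable."""
        if var_type in (FEED_MINIBATCH, FETCH_LIST, RAW):
            return False
        return persistable

    candidates = [("fc_0.w_0", LOD_TENSOR, True),   # parameter  -> saved
                  ("feed", FEED_MINIBATCH, True),   # feed var   -> skipped
                  ("tmp_1", LOD_TENSOR, False)]     # temporary  -> skipped
    print([name for name, t, p in candidates if is_checkpoint_var(t, p)])
    # ['fc_0.w_0']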

From 886897ccf742f3c95714703b5ed925d35a56e46e Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Wed, 16 May 2018 16:05:33 +0800
Subject: [PATCH 23/56] load implement

---
 paddle/fluid/operators/checkpoint_load_op.cc | 48 ++++++++++++++++++--
 1 file changed, 45 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc
index ec451c9f3f0cd2..ba8b5dbb51c10a 100644
--- a/paddle/fluid/operators/checkpoint_load_op.cc
+++ b/paddle/fluid/operators/checkpoint_load_op.cc
@@ -47,13 +47,54 @@ class CheckpointLoadOp : public framework::OperatorBase {
  private:
   void RunImpl(const framework::Scope &scope,
                const platform::Place &place) const override {
-    auto dir = Attr<std::string>("dir");
-    bool is_present = FileExists(dir);
+    std::string dir = Attr<std::string>("dir");
+
+    VLOG(3) << "Load checkpoint from dir: " << dir;
+
+    std::string success;
+    success.append(dir);
+    success.append("/");
+    success.append(SUCCESS);
+
+    bool is_present = FileExists(success);
     if (!is_present) {
+      VLOG(3) << "can not find _SUCCESS from  path: " << success;
       return;
     }
 
-    // UPDATE LATER ...
+    auto inp_var_names = Output("Out");
+    PADDLE_ENFORCE_GT(static_cast<int>(inp_var_names.size()), 0,
+                      "The number of input variables should be greater than 0");
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
+    // todo (tangwei) made it async
+    for (size_t i = 0; i < inp_var_names.size(); i++) {
+      auto *var = scope.FindVar(inp_var_names[i]);
+
+      PADDLE_ENFORCE(var != nullptr,
+                     "Cannot find variable %s for save_combine_op",
+                     inp_var_names[i]);
+      PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
+                     "SaveCombineOp only supports LoDTensor, %s has wrong type",
+                     inp_var_names[i]);
+
+      std::string var_file;
+      var_file.append(dir);
+      var_file.append("/");
+      var_file.append(inp_var_names[i]);
+      VLOG(3) << "ready to load var: " << inp_var_names[i];
+
+      auto &tensor = var->Get<framework::LoDTensor>();
+
+      std::ifstream fin(var_file);
+      PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
+                     var_file);
+      DeserializeFromStream(fin, tensor, *dev_ctx);
+      fin.close();
+      VLOG(3) << " load var: " << inp_var_names[i] << " finished";
+    }
   }
 };
 
@@ -61,6 +102,7 @@ class CheckpointLoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
   CheckpointLoadOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddOutput("Out", "(Tensor) The tensor need to be loaded");
     AddComment(R"DOC(
 CheckpointLoad operator
 

From 9cf47afe6154d86214b74f7082155cc3ae014ea8 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Wed, 16 May 2018 16:06:21 +0800
Subject: [PATCH 24/56] modify get trainer param

---
 python/paddle/fluid/transpiler/distribute_transpiler.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 104e2405322e96..04aa51d2cdd381 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -525,12 +525,15 @@ def get_train_startup_program(self, checkpoint_load_dir=None):
         if not checkpoint_load_dir:
             return startup_prog
 
+        load_vars = []
         for var in startup_prog.list_vars():
             if self.is_persistable(var):
-                print("var: %s" % var.name)
+                load_vars.append(var.name)
 
         startup_prog.global_block().append_op(
-            type="checkpoint_load", attrs={"dir": checkpoint_load_dir})
+            type="checkpoint_load",
+            outputs={"Out": load_vars},
+            attrs={"dir": checkpoint_load_dir})
         return startup_prog
 
     def get_startup_program(self, endpoint, pserver_program):

From c6f042f5d653af725b8af31f73570c153cb790be Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Wed, 16 May 2018 16:27:33 +0800
Subject: [PATCH 25/56] modify load op

---
 paddle/fluid/operators/checkpoint_load_op.cc | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc
index ba8b5dbb51c10a..026820ca303247 100644
--- a/paddle/fluid/operators/checkpoint_load_op.cc
+++ b/paddle/fluid/operators/checkpoint_load_op.cc
@@ -62,7 +62,7 @@ class CheckpointLoadOp : public framework::OperatorBase {
       return;
     }
 
-    auto inp_var_names = Output("Out");
+    auto inp_var_names = Inputs("X");
     PADDLE_ENFORCE_GT(static_cast<int>(inp_var_names.size()), 0,
                       "The number of input variables should be greater than 0");
     // get device context from pool
@@ -102,7 +102,10 @@ class CheckpointLoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
   CheckpointLoadOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddOutput("Out", "(Tensor) The tensor need to be loaded");
+    AddInput(
+        "X",
+        "(vector) Input LoDTensors that need to be saved together in a file.")
+        .AsDuplicable();
     AddComment(R"DOC(
 CheckpointLoad operator
 

From b677d8216e4454fadfc5204e00f7d483bb189368 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Wed, 16 May 2018 16:48:16 +0800
Subject: [PATCH 26/56] bug fix

---
 paddle/fluid/operators/checkpoint_load_op.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc
index 026820ca303247..241886e2be5669 100644
--- a/paddle/fluid/operators/checkpoint_load_op.cc
+++ b/paddle/fluid/operators/checkpoint_load_op.cc
@@ -87,12 +87,12 @@ class CheckpointLoadOp : public framework::OperatorBase {
       VLOG(3) << "ready to load var: " << inp_var_names[i];
 
       auto &tensor = var->Get<framework::LoDTensor>();
-
       std::ifstream fin(var_file);
       PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
                      var_file);
-      DeserializeFromStream(fin, tensor, *dev_ctx);
+      DeserializeFromStream(fin, tensor, dev_ctx);
       fin.close();
+
       VLOG(3) << " load var: " << inp_var_names[i] << " finished";
     }
   }

From 744e95d30559cc5518b612678e0af6d0680fdbbe Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Wed, 16 May 2018 19:06:02 +0800
Subject: [PATCH 27/56] add ckpt load

---
 python/paddle/fluid/framework.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 6ab31ec9463b8e..c33d15e32f9e52 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -490,7 +490,7 @@ def find_name(var_list, name):
             'recv', 'listen_and_serv', 'parallel_do', 'save_combine',
             'load_combine', 'ncclInit', 'channel_create', 'channel_close',
             'channel_send', 'channel_recv', 'select', 'checkpoint_save',
-            'checkpoint_save'
+            'checkpoint_load'
         }
         if type not in no_kernel_op_set:
             self.desc.infer_var_type(self.block.desc)

From 955c79340c24adad885539d8e89b67835f666481 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Wed, 16 May 2018 19:07:23 +0800
Subject: [PATCH 28/56] add X to test

---
 paddle/fluid/operators/checkpoint_op_test.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/checkpoint_op_test.cc b/paddle/fluid/operators/checkpoint_op_test.cc
index 1445d9f9acffc9..bea44b35cada29 100644
--- a/paddle/fluid/operators/checkpoint_op_test.cc
+++ b/paddle/fluid/operators/checkpoint_op_test.cc
@@ -58,7 +58,7 @@ TEST(CheckpointLoadOp, CPU) {
   paddle::framework::AttributeMap attrs;
   attrs.insert({"dir", std::string("ckpt")});
 
-  auto save_op =
-      paddle::framework::OpRegistry::CreateOp("checkpoint_load", {}, {}, attrs);
+  auto save_op = paddle::framework::OpRegistry::CreateOp(
+      "checkpoint_load", {{"X", {"test_var"}}}, {}, attrs);
   save_op->Run(scope, place);
 }

From 3dd274657fb20c17e02fb2f76e1169b218828d93 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Wed, 16 May 2018 19:08:15 +0800
Subject: [PATCH 29/56] modify Get -> GetMutable

---
 paddle/fluid/operators/checkpoint_load_op.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc
index 241886e2be5669..72cfccaaa22b7d 100644
--- a/paddle/fluid/operators/checkpoint_load_op.cc
+++ b/paddle/fluid/operators/checkpoint_load_op.cc
@@ -86,11 +86,11 @@ class CheckpointLoadOp : public framework::OperatorBase {
       var_file.append(inp_var_names[i]);
       VLOG(3) << "ready to load var: " << inp_var_names[i];
 
-      auto &tensor = var->Get<framework::LoDTensor>();
+      auto *tensor = var->GetMutable<framework::LoDTensor>();
       std::ifstream fin(var_file);
       PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
                      var_file);
-      DeserializeFromStream(fin, tensor, dev_ctx);
+      framework::DeserializeFromStream(fin, tensor, dev_ctx);
       fin.close();
 
       VLOG(3) << " load var: " << inp_var_names[i] << " finished";

From 4220b31d4f45918fbc0a74cc05ba14ffd4ab093c Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Wed, 16 May 2018 20:50:24 +0800
Subject: [PATCH 30/56] update pserver startup

---
 .../fluid/transpiler/distribute_transpiler.py | 21 ++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 04aa51d2cdd381..84cfc6e0117e8b 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -520,6 +520,11 @@ def is_persistable(self, var):
         return var.persistable
 
     def get_train_startup_program(self, checkpoint_load_dir=None):
+        """
+        Get train startup program.
+        If checkpoint_load_dir is None, return the default startup program.
+        If checkpoint_load_dir exists, add a checkpoint_load op and load the persistable variables.
+        """
         startup_prog = default_startup_program()
 
         if not checkpoint_load_dir:
@@ -536,7 +541,10 @@ def get_train_startup_program(self, checkpoint_load_dir=None):
             attrs={"dir": checkpoint_load_dir})
         return startup_prog
 
-    def get_startup_program(self, endpoint, pserver_program):
+    def get_startup_program(self,
+                            endpoint,
+                            pserver_program,
+                            checkpoint_load_dir=None):
         """
         Get startup program for current parameter server.
         Modify operator input variables if there are variables that
@@ -561,6 +569,7 @@ def _get_splited_name_and_shape(varname):
             created_var_map[var.name] = tmpvar
 
         # 2. rename op outputs
+        load_vars = []
         for op in orig_s_prog.global_block().ops:
             new_inputs = dict()
             new_outputs = dict()
@@ -588,6 +597,16 @@ def _get_splited_name_and_shape(varname):
                     inputs=new_inputs,
                     outputs=new_outputs,
                     attrs=op.attrs)
+                for var in new_outputs.values():
+                    load_vars.append(var.name)
+        # add checkpoint op 
+        if not checkpoint_load_dir:
+            return s_prog
+
+        s_prog.global_block().append_op(
+            type="checkpoint_load",
+            inputs={"X": load_vars},
+            attrs={"dir": checkpoint_load_dir})
         return s_prog
 
     # transpiler function for dis lookup_table
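
Both startup programs now end with a checkpoint_load op over the collected
persistable variables. As a rough illustration of the load-side flow that op
performs (a stdlib sketch, not the real C++ implementation), the code below
skips loading unless the _SUCCESS marker is present and otherwise reads one
file per variable, with deserialization stubbed out:

    import os

    SUCCESS = "_SUCCESS"

    def load_checkpoint(ckpt_dir, var_names):
        """Ignore incomplete checkpoints (no _SUCCESS); otherwise read one
        file per variable as a stand-in for framework::DeserializeFromStream."""
        if not os.path.isfile(os.path.join(ckpt_dir, SUCCESS)):
            return {}                      # checkpoint not finished, skip it
        loaded = {}
        for name in var_names:
            with open(os.path.join(ckpt_dir, name), "rb") as f:
                loaded[name] = f.read()    # placeholder for a LoDTensor
        return loaded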

From 6d53dceeec5b0b014c614821d6e1bf355a280d64 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Thu, 17 May 2018 21:47:44 +0800
Subject: [PATCH 31/56] optimized checkpoint serial number and folder

---
 paddle/fluid/operators/checkpoint_load_op.cc | 121 +++++++++++++------
 paddle/fluid/operators/checkpoint_op_test.cc |  10 +-
 paddle/fluid/operators/checkpoint_save_op.cc | 103 ++++++++--------
 3 files changed, 143 insertions(+), 91 deletions(-)

diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc
index 72cfccaaa22b7d..ad237a889ad0a2 100644
--- a/paddle/fluid/operators/checkpoint_load_op.cc
+++ b/paddle/fluid/operators/checkpoint_load_op.cc
@@ -17,6 +17,10 @@ limitations under the License. */
 #include <fstream>
 #include <numeric>
 #include <sstream>
+#include <string>
+
+#include <boost/filesystem.hpp>
+
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/framework.pb.h"
@@ -30,12 +34,70 @@ namespace operators {
 constexpr char kSEP = '/';
 // write empty file named _SUCCESS
 const char SUCCESS[] = "_SUCCESS";
+const char SERIAL_VAR[] = "SERIAL_NUMBER";
 
 static bool FileExists(const std::string &filepath) {
   struct stat buffer;
   return (stat(filepath.c_str(), &buffer) == 0);
 }
 
+static std::string GenePath(const std::string &dir, const std::string &file) {
+  boost::filesystem::path dir(dir);
+  boost::filesystem::path file(file);
+  boost::filesystem::path full_path = dir / file;
+  return full_path;
+}
+
+static void LoadInputVars(const framework::Scope &scope,
+                          const platform::Place &place,
+                          const std::vector<std::string> &inp_var_names,
+                          const std::string &dir) {
+  // get device context from pool
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto &dev_ctx = *pool.Get(place);
+
+  // todo (tangwei) made it async
+  for (size_t i = 0; i < inp_var_names.size(); i++) {
+    auto *var = scope.FindVar(inp_var_names[i]);
+
+    PADDLE_ENFORCE(var != nullptr,
+                   "Cannot find variable %s for save_combine_op",
+                   inp_var_names[i]);
+    PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
+                   "SaveCombineOp only supports LoDTensor, %s has wrong type",
+                   inp_var_names[i]);
+
+    std::string var_file = GenePath(dir, inp_var_names[i]);
+    auto *tensor = var->GetMutable<framework::LoDTensor>();
+    std::ifstream fin(var_file);
+    PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
+                   var_file);
+    framework::DeserializeFromStream(fin, tensor, dev_ctx);
+    fin.close();
+    VLOG(3) << " load var: " << inp_var_names[i] << " finished";
+  }
+}
+
+static void LoadStringArgv(const framework::Scope &scope,
+                           const platform::Place &place,
+                           const std::string &argv, const std::string &dir) {
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto &dev_ctx = *pool.Get(place);
+
+  for (size_t i = 0; i < argv.size(); i++) {
+    auto *var = scope.FindVar(inp_var_names[i]);
+    std::string *var_str = var->GetMutable<std::string>();
+
+    std::string var_file = GenePath(dir, argv);
+    std::ifstream fin(var_file);
+    PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
+                   var_file);
+    std::getline(fin, var_str);
+    fin.close();
+    VLOG(3) << " load String argv: " << argv << " value is: " << var_str;
+  }
+}
+
 class CheckpointLoadOp : public framework::OperatorBase {
  public:
   CheckpointLoadOp(const std::string &type,
@@ -48,53 +110,33 @@ class CheckpointLoadOp : public framework::OperatorBase {
   void RunImpl(const framework::Scope &scope,
                const platform::Place &place) const override {
     std::string dir = Attr<std::string>("dir");
+    int serial_num = Attr<int>("Serial");
 
-    VLOG(3) << "Load checkpoint from dir: " << dir;
+    auto *serial_var = scope.FindVar(SERIAL_VAR);
+    serial_var = serial_num;
+    VLOG(1) << "CheckpointLoadOp set " << SERIAL_NUMBER
+            << " value: " << serial_num;
 
     std::string success;
-    success.append(dir);
-    success.append("/");
-    success.append(SUCCESS);
-
+    = GenePath(dir, std::to_string(serial_num));
+    VLOG(3) << "Load checkpoint from dir: " << success;
+    success = GenePath(success, SUCCESS);
     bool is_present = FileExists(success);
     if (!is_present) {
-      VLOG(3) << "can not find _SUCCESS from  path: " << success;
+      VLOG(1) << "CheckpointLoadOp can not find " << SUCCESS
+              << " from: " << success;
       return;
     }
 
+    VLOG(3) << "Ready to load vars to scope";
     auto inp_var_names = Inputs("X");
     PADDLE_ENFORCE_GT(static_cast<int>(inp_var_names.size()), 0,
                       "The number of input variables should be greater than 0");
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-
-    // todo (tangwei) made it async
-    for (size_t i = 0; i < inp_var_names.size(); i++) {
-      auto *var = scope.FindVar(inp_var_names[i]);
-
-      PADDLE_ENFORCE(var != nullptr,
-                     "Cannot find variable %s for save_combine_op",
-                     inp_var_names[i]);
-      PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
-                     "SaveCombineOp only supports LoDTensor, %s has wrong type",
-                     inp_var_names[i]);
-
-      std::string var_file;
-      var_file.append(dir);
-      var_file.append("/");
-      var_file.append(inp_var_names[i]);
-      VLOG(3) << "ready to load var: " << inp_var_names[i];
-
-      auto *tensor = var->GetMutable<framework::LoDTensor>();
-      std::ifstream fin(var_file);
-      PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
-                     var_file);
-      framework::DeserializeFromStream(fin, tensor, dev_ctx);
-      fin.close();
-
-      VLOG(3) << " load var: " << inp_var_names[i] << " finished";
-    }
+    LoadInputVars(scope, place, &inp_var_names);
+
+    VLOG(3) << "Ready to load string argv to scope";
+    auto argv = Inputs("Argv");
+    LoadStringArgv(scope, place, &argv, &dir);
   }
 };
 
@@ -106,6 +148,10 @@ class CheckpointLoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
         "X",
         "(vector) Input LoDTensors that need to be saved together in a file.")
         .AsDuplicable();
+    AddInput(
+        "Argv",
+        "(vector) Input LoDTensors that need to be saved together in a file.")
+        .AsDuplicable();
     AddComment(R"DOC(
 CheckpointLoad operator
 
@@ -113,6 +159,9 @@ This operator will serialize and write a list of input LoDTensor variables
 to a file on disk.
 )DOC");
 
+    AddAttr<int>("Serial",
+                 "(int)"
+                 "The  serial number of the checkpoint will to be load.");
     AddAttr<std::string>(
         "dir",
         "(string)"
diff --git a/paddle/fluid/operators/checkpoint_op_test.cc b/paddle/fluid/operators/checkpoint_op_test.cc
index bea44b35cada29..75bfc3f840765b 100644
--- a/paddle/fluid/operators/checkpoint_op_test.cc
+++ b/paddle/fluid/operators/checkpoint_op_test.cc
@@ -44,8 +44,7 @@ TEST(CheckpointSaveOp, CPU) {
   attrs.insert({"dir", std::string("ckpt")});
 
   auto save_op = paddle::framework::OpRegistry::CreateOp(
-      "checkpoint_save", {{"X", {"test_var"}}}, {{"Serial", {"SERIAL_NUMBER"}}},
-      attrs);
+      "checkpoint_save", {{"X", {"test_var"}}}, attrs);
   save_op->Run(scope, place);
 }
 
@@ -58,7 +57,8 @@ TEST(CheckpointLoadOp, CPU) {
   paddle::framework::AttributeMap attrs;
   attrs.insert({"dir", std::string("ckpt")});
 
-  auto save_op = paddle::framework::OpRegistry::CreateOp(
-      "checkpoint_load", {{"X", {"test_var"}}}, {}, attrs);
-  save_op->Run(scope, place);
+  auto load_op = paddle::framework::OpRegistry::CreateOp(
+      "checkpoint_load", {{"X", {"test_var"}}}, {{"Serial", {"SERIAL_NUMBER"}}},
+      attrs);
+  load_op->Run(scope, place);
 }
diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc
index 1082bb4a345a2e..54911fc054c213 100644
--- a/paddle/fluid/operators/checkpoint_save_op.cc
+++ b/paddle/fluid/operators/checkpoint_save_op.cc
@@ -17,6 +17,10 @@ limitations under the License. */
 #include <fstream>
 #include <numeric>
 #include <sstream>
+#include <string>
+
+#include <boost/filesystem.hpp>
+
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/framework.pb.h"
@@ -30,6 +34,14 @@ namespace operators {
 constexpr char kSEP = '/';
 // write empty file named _SUCCESS
 const char SUCCESS[] = "_SUCCESS";
+const char SERIAL_VAR[] = "SERIAL_NUMBER";
+
+static std::string GenePath(const std::string &dir, const std::string &file) {
+  boost::filesystem::path dir(dir);
+  boost::filesystem::path file(file);
+  boost::filesystem::path full_path = dir / file;
+  return full_path;
+}
 
 static bool FileExists(const std::string &filepath) {
   struct stat buffer;
@@ -72,24 +84,20 @@ class CheckpointSaveOp : public framework::OperatorBase {
     auto dir = Attr<std::string>("dir");
     auto overwrite = Attr<bool>("overwrite");
 
+    auto serial_num = scope.FindVar(SERIAL_VAR);
+    if (serial_num == nullptr) {
+      serial_num = scope.Var(SERIAL_VAR);
+    }
+    serial_num = serial_num + 1;
+
+    dir = GenePath(dir, std::to_string(serial_num));
     bool is_present = FileExists(dir);
     if (is_present && !overwrite) {
-      return;
-      // todo(tangwei) judge the folder is exist
-      // PADDLE_THROW("%s exists!, cannot save_combine to it when
-      // overwrite=false",
-      //              dir, overwrite);
+      PADDLE_THROW("%s exists!, checkpoint save cannot to  overwrite it", dir,
+                   overwrite);
     }
     MkDirRecursively(dir.c_str());
 
-    auto serial_var_name = Output("Serial");
-    auto *serial_var = scope.FindVar(serial_var_name);
-    std::string *serial_num = serial_var->GetMutable<std::string>();
-    serial_num->append("0");
-    dir.append("/");
-    dir.append(serial_num->c_str());
-    MkDirRecursively(dir.c_str());
-
     auto inp_var_names = Inputs("X");
     PADDLE_ENFORCE_GT(static_cast<int>(inp_var_names.size()), 0,
                       "The number of input variables should be greater than 0");
@@ -101,30 +109,24 @@ class CheckpointSaveOp : public framework::OperatorBase {
     // todo (tangwei) made it async
     for (size_t i = 0; i < inp_var_names.size(); i++) {
       auto *var = scope.FindVar(inp_var_names[i]);
-      std::string var_file;
-      var_file.append(dir);
-      var_file.append("/");
-      var_file.append(inp_var_names[i]);
 
       PADDLE_ENFORCE(var != nullptr,
-                     "Cannot find variable %s for save_combine_op",
-                     inp_var_names[i]);
-      PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
-                     "SaveCombineOp only supports LoDTensor, %s has wrong type",
+                     "Cannot find variable %s for checkpoint save op",
                      inp_var_names[i]);
+      PADDLE_ENFORCE(
+          var->IsType<framework::LoDTensor>(),
+          "CheckpointSaveOp only supports LoDTensor, %s has wrong type",
+          inp_var_names[i]);
 
       auto &tensor = var->Get<framework::LoDTensor>();
       // Serialize tensors one by one
-
+      std::string var_file = GenePath(dir, inp_var_names[i]);
       std::ofstream fout(var_file);
       framework::SerializeToStream(fout, tensor, dev_ctx);
       fout.close();
     }
 
-    std::string success;
-    success.append(dir);
-    success.append("/");
-    success.append(SUCCESS);
+    std::string success = GenePath(dir, SUCCESS);
     std::ofstream fout(success);
     fout.close();
   }
@@ -138,7 +140,6 @@ class CheckpointSaveOpProtoMaker : public framework::OpProtoAndCheckerMaker {
         "X",
         "(vector) Input LoDTensors that need to be saved together in a file.")
         .AsDuplicable();
-    AddOutput("Serial", "the serial number");
     AddComment(R"DOC(
 CheckpointSave operator
 
@@ -150,30 +151,29 @@ to a file on disk.
                   "Delete the output dir if it exists.")
         .SetDefault(false);
 
-    AddAttr<std::string>(
-        "dir",
-        "(string)"
-        "The \"file_path\" where the LoDTensor variables will be saved.")
+    AddAttr<std::string>("dir",
+                         "(string)"
+                         "The dir where the LoDTensor variables will be saved.")
         .AddCustomChecker(
             [](const std::string &path) { return !path.empty(); });
   }
 };
 
-class CheckpointSaveOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    auto out_var_name = op_desc.Output("Serial").front();
-    auto &out_var = block->FindRecursiveOrCreateVar(out_var_name);
-    auto var_type = framework::proto::VarType::RAW;
-    out_var.SetType(var_type);
-  }
-};
-
-class CheckpointSaveOpShapeInference : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *ctx) const override {}
-};
+// class CheckpointSaveOpVarTypeInference : public framework::VarTypeInference {
+//  public:
+//   void operator()(const framework::OpDesc &op_desc,
+//                   framework::BlockDesc *block) const override {
+//     auto out_var_name = op_desc.Output("Serial").front();
+//     auto &out_var = block->FindRecursiveOrCreateVar(out_var_name);
+//     auto var_type = framework::proto::VarType::RAW;
+//     out_var.SetType(var_type);
+//   }
+// };
+
+// class CheckpointSaveOpShapeInference : public framework::InferShapeBase {
+//  public:
+//   void operator()(framework::InferShapeContext *ctx) const override {}
+// };
 
 }  // namespace operators
 }  // namespace paddle
@@ -181,7 +181,10 @@ class CheckpointSaveOpShapeInference : public framework::InferShapeBase {
 namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(checkpoint_save, ops::CheckpointSaveOp,
-                  paddle::framework::EmptyGradOpMaker,
-                  ops::CheckpointSaveOpProtoMaker,
-                  ops::CheckpointSaveOpVarTypeInference,
-                  ops::CheckpointSaveOpShapeInference);
+                  ops::CheckpointSaveOpProtoMaker);
+
+// REGISTER_OPERATOR(checkpoint_save, ops::CheckpointSaveOp,
+//                   paddle::framework::EmptyGradOpMaker,
+//                   ops::CheckpointSaveOpProtoMaker,
+//                   ops::CheckpointSaveOpVarTypeInference,
+//                   ops::CheckpointSaveOpShapeInference);
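
The serial-number scheme above implies one numbered sub-directory per
checkpoint, holding one file per saved variable plus a _SUCCESS marker that is
written last. A small stdlib sketch of the write side under that assumption
(file contents are placeholders, not real serialized tensors):

    import os

    def write_fake_checkpoint(root, serial, var_names):
        """Create root/<serial>/<var> files and finish with a _SUCCESS marker,
        so readers can tell a complete checkpoint from an interrupted one."""
        step_dir = os.path.join(root, str(serial))
        if not os.path.isdir(step_dir):
            os.makedirs(step_dir)
        for name in var_names:
            with open(os.path.join(step_dir, name), "w") as f:
                f.write("<serialized LoDTensor>")   # placeholder payload
        open(os.path.join(step_dir, "_SUCCESS"), "w").close()
        return step_dir

    print(write_fake_checkpoint("ckpt", 0, ["fc_0.w_0", "fc_0.b_0"]))
    # -> ckpt/0 containing fc_0.w_0, fc_0.b_0 and _SUCCESS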

From 8430c8d798d4b722ad8da940c94c7696fd308606 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Thu, 17 May 2018 21:56:49 +0800
Subject: [PATCH 32/56] remove boost filesystem

---
 paddle/fluid/operators/checkpoint_load_op.cc | 10 ++++------
 paddle/fluid/operators/checkpoint_save_op.cc | 10 ++++------
 2 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc
index ad237a889ad0a2..d270ae31ed7911 100644
--- a/paddle/fluid/operators/checkpoint_load_op.cc
+++ b/paddle/fluid/operators/checkpoint_load_op.cc
@@ -18,9 +18,6 @@ limitations under the License. */
 #include <numeric>
 #include <sstream>
 #include <string>
-
-#include <boost/filesystem.hpp>
-
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/framework.pb.h"
@@ -42,9 +39,10 @@ static bool FileExists(const std::string &filepath) {
 }
 
 static std::string GenePath(const std::string &dir, const std::string &file) {
-  boost::filesystem::path dir(dir);
-  boost::filesystem::path file(file);
-  boost::filesystem::path full_path = dir / file;
+  std::string file_path;
+  file_path.append(file_path);
+  file_path.append("/");
+  file_path.append(file);
   return full_path;
 }
 
diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc
index 54911fc054c213..ee494c68822c43 100644
--- a/paddle/fluid/operators/checkpoint_save_op.cc
+++ b/paddle/fluid/operators/checkpoint_save_op.cc
@@ -18,9 +18,6 @@ limitations under the License. */
 #include <numeric>
 #include <sstream>
 #include <string>
-
-#include <boost/filesystem.hpp>
-
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/framework.pb.h"
@@ -37,9 +34,10 @@ const char SUCCESS[] = "_SUCCESS";
 const char SERIAL_VAR[] = "SERIAL_NUMBER";
 
 static std::string GenePath(const std::string &dir, const std::string &file) {
-  boost::filesystem::path dir(dir);
-  boost::filesystem::path file(file);
-  boost::filesystem::path full_path = dir / file;
+  std::string file_path;
+  file_path.append(file_path);
+  file_path.append("/");
+  file_path.append(file);
   return full_path;
 }
 

From 7b6c0abfc9b1e5ab44404ed0c253d4250d9a440a Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Thu, 17 May 2018 22:41:02 +0800
Subject: [PATCH 33/56] modify variable pointer

---
 paddle/fluid/operators/checkpoint_load_op.cc | 24 +++++++++++++-------
 paddle/fluid/operators/checkpoint_save_op.cc | 20 ++++++++++++----
 2 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc
index d270ae31ed7911..0f0d989ccd2f7f 100644
--- a/paddle/fluid/operators/checkpoint_load_op.cc
+++ b/paddle/fluid/operators/checkpoint_load_op.cc
@@ -108,15 +108,22 @@ class CheckpointLoadOp : public framework::OperatorBase {
   void RunImpl(const framework::Scope &scope,
                const platform::Place &place) const override {
     std::string dir = Attr<std::string>("dir");
-    int serial_num = Attr<int>("Serial");
+    std::string serial_num = Attr<std::string>("Serial");
+
+    std::string serial_var_name = std::string(SERIAL_VAR);
+    auto *serial_var = scope.FindVar(serial_var_name);
+    auto *serial_num;
+    if (serial_var == nullptr) {
+      *serial_var = scope.Var(serial_var_name);
+      *serial_num = serial_var->GetMutable<std::string>();
+      serial_num->append("0");
+    }
 
-    auto *serial_var = scope.FindVar(SERIAL_VAR);
-    serial_var = serial_num;
+    *serial_num = serial_var->GetMutable<std::string>();
     VLOG(1) << "CheckpointLoadOp set " << SERIAL_NUMBER
             << " value: " << serial_num;
 
-    std::string success;
-    = GenePath(dir, std::to_string(serial_num));
+    std::string success = GenePath(dir, serial_num);
     VLOG(3) << "Load checkpoint from dir: " << success;
     success = GenePath(success, SUCCESS);
     bool is_present = FileExists(success);
@@ -157,9 +164,10 @@ This operator will serialize and write a list of input LoDTensor variables
 to a file on disk.
 )DOC");
 
-    AddAttr<int>("Serial",
-                 "(int)"
-                 "The  serial number of the checkpoint will to be load.");
+    AddAttr<std::string>(
+        "Serial",
+        "(std::string)"
+        "The  serial number of the checkpoint will to be load.");
     AddAttr<std::string>(
         "dir",
         "(string)"
diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc
index ee494c68822c43..3c2cc50ac490a5 100644
--- a/paddle/fluid/operators/checkpoint_save_op.cc
+++ b/paddle/fluid/operators/checkpoint_save_op.cc
@@ -82,13 +82,23 @@ class CheckpointSaveOp : public framework::OperatorBase {
     auto dir = Attr<std::string>("dir");
     auto overwrite = Attr<bool>("overwrite");
 
-    auto serial_num = scope.FindVar(SERIAL_VAR);
-    if (serial_num == nullptr) {
-      serial_num = scope.Var(SERIAL_VAR);
+    std::string serial_var_name = std::string(SERIAL_VAR);
+    auto *serial_var = scope.FindVar(serial_var_name);
+    auto *serial_num;
+    if (serial_var == nullptr) {
+      *serial_var = scope.Var(serial_var_name);
+      *serial_num = serial_var->GetMutable<std::string>();
+      serial_num->append("0");
     }
-    serial_num = serial_num + 1;
 
-    dir = GenePath(dir, std::to_string(serial_num));
+    *serial_num = serial_var->GetMutable<std::string>();
+    VLOG(1) << "CheckpointSaveOp get " << SERIAL_NUMBER
+            << " value: " << serial_num;
+
+    auto *serial_num = serial_var->GetMutable<std::string>();
+    serial_num->append("1");
+
+    dir = GenePath(dir, serial_num);
     bool is_present = FileExists(dir);
     if (is_present && !overwrite) {
       PADDLE_THROW("%s exists!, checkpoint save cannot to  overwrite it", dir,

From f9d4b9dabfcf33de11154aa5dc67be5537a34bb8 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Thu, 17 May 2018 22:49:40 +0800
Subject: [PATCH 34/56] fix auto serial_num has no initializer

---
 paddle/fluid/operators/checkpoint_load_op.cc | 8 ++++----
 paddle/fluid/operators/checkpoint_save_op.cc | 9 ++++-----
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc
index 0f0d989ccd2f7f..5fd3a7af9cf1f7 100644
--- a/paddle/fluid/operators/checkpoint_load_op.cc
+++ b/paddle/fluid/operators/checkpoint_load_op.cc
@@ -112,14 +112,14 @@ class CheckpointLoadOp : public framework::OperatorBase {
 
     std::string serial_var_name = std::string(SERIAL_VAR);
     auto *serial_var = scope.FindVar(serial_var_name);
-    auto *serial_num;
+
     if (serial_var == nullptr) {
       *serial_var = scope.Var(serial_var_name);
-      *serial_num = serial_var->GetMutable<std::string>();
-      serial_num->append("0");
+      auto *serial_tmp = serial_var->GetMutable<std::string>();
+      serial_tmp->append("0");
     }
 
-    *serial_num = serial_var->GetMutable<std::string>();
+    auto *serial_num = serial_var->GetMutable<std::string>();
     VLOG(1) << "CheckpointLoadOp set " << SERIAL_NUMBER
             << " value: " << serial_num;
 
diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc
index 3c2cc50ac490a5..5fccefeed251a2 100644
--- a/paddle/fluid/operators/checkpoint_save_op.cc
+++ b/paddle/fluid/operators/checkpoint_save_op.cc
@@ -84,14 +84,13 @@ class CheckpointSaveOp : public framework::OperatorBase {
 
     std::string serial_var_name = std::string(SERIAL_VAR);
     auto *serial_var = scope.FindVar(serial_var_name);
-    auto *serial_num;
+
     if (serial_var == nullptr) {
       *serial_var = scope.Var(serial_var_name);
-      *serial_num = serial_var->GetMutable<std::string>();
-      serial_num->append("0");
+      *serial_tmp = serial_var->GetMutable<std::string>();
+      serial_tmp->append("0");
     }
-
-    *serial_num = serial_var->GetMutable<std::string>();
+    auto *serial_num = serial_var->GetMutable<std::string>();
     VLOG(1) << "CheckpointSaveOp get " << SERIAL_NUMBER
             << " value: " << serial_num;
 

From a4fd3756bbd95fb8c676af9aab7a22cfe87d9cc5 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Fri, 18 May 2018 09:46:14 +0800
Subject: [PATCH 35/56] bug fix

---
 paddle/fluid/operators/checkpoint_load_op.cc | 85 +++++++++++++-------
 paddle/fluid/operators/checkpoint_op_test.cc | 24 +++++-
 paddle/fluid/operators/checkpoint_save_op.cc | 36 +++++----
 3 files changed, 95 insertions(+), 50 deletions(-)

diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc
index 5fd3a7af9cf1f7..d24c7819990f04 100644
--- a/paddle/fluid/operators/checkpoint_load_op.cc
+++ b/paddle/fluid/operators/checkpoint_load_op.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <fstream>
 #include <numeric>
 #include <sstream>
+#include <streambuf>
 #include <string>
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/data_type_transform.h"
@@ -43,7 +44,13 @@ static std::string GenePath(const std::string &dir, const std::string &file) {
   file_path.append(file_path);
   file_path.append("/");
   file_path.append(file);
-  return full_path;
+  return file_path;
+}
+
+static bool IsNumber(const std::string &s) {
+  std::string::const_iterator it = s.begin();
+  while (it != s.end() && std::isdigit(*it)) ++it;
+  return !s.empty() && it == s.end();
 }
 
 static void LoadInputVars(const framework::Scope &scope,
@@ -62,7 +69,7 @@ static void LoadInputVars(const framework::Scope &scope,
                    "Cannot find variable %s for save_combine_op",
                    inp_var_names[i]);
     PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
-                   "SaveCombineOp only supports LoDTensor, %s has wrong type",
+                   "LoadCombineOp only supports LoDTensor, %s has wrong type",
                    inp_var_names[i]);
 
     std::string var_file = GenePath(dir, inp_var_names[i]);
@@ -78,21 +85,18 @@ static void LoadInputVars(const framework::Scope &scope,
 
 static void LoadStringArgv(const framework::Scope &scope,
                            const platform::Place &place,
-                           const std::string &argv, const std::string &dir) {
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto &dev_ctx = *pool.Get(place);
-
+                           const std::vector<std::string> &argv,
+                           const std::string &dir) {
   for (size_t i = 0; i < argv.size(); i++) {
-    auto *var = scope.FindVar(inp_var_names[i]);
+    auto *var = scope.FindVar(argv[i]);
     std::string *var_str = var->GetMutable<std::string>();
-
-    std::string var_file = GenePath(dir, argv);
+    std::string var_file = GenePath(dir, argv[i]);
     std::ifstream fin(var_file);
     PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
                    var_file);
-    std::getline(fin, var_str);
+    std::getline(fin, *var_str);
     fin.close();
-    VLOG(3) << " load String argv: " << argv << " value is: " << var_str;
+    VLOG(3) << " load String argv: " << argv[i] << " value is: " << var_str;
   }
 }
 
@@ -108,22 +112,24 @@ class CheckpointLoadOp : public framework::OperatorBase {
   void RunImpl(const framework::Scope &scope,
                const platform::Place &place) const override {
     std::string dir = Attr<std::string>("dir");
-    std::string serial_num = Attr<std::string>("Serial");
+    std::string serial_num_attr = Attr<std::string>("Serial");
+
+    PADDLE_ENFORCE(IsNumber(serial_num_attr),
+                   "Checkpoint Serial must be a number");
 
     std::string serial_var_name = std::string(SERIAL_VAR);
     auto *serial_var = scope.FindVar(serial_var_name);
-
-    if (serial_var == nullptr) {
-      *serial_var = scope.Var(serial_var_name);
-      auto *serial_tmp = serial_var->GetMutable<std::string>();
-      serial_tmp->append("0");
-    }
+    PADDLE_ENFORCE(serial_var != nullptr,
+                   "Cannot find variable %s for checkpoint_load_op",
+                   serial_var_name);
 
     auto *serial_num = serial_var->GetMutable<std::string>();
-    VLOG(1) << "CheckpointLoadOp set " << SERIAL_NUMBER
+    serial_num = serial_num_attr;
+
+    VLOG(1) << "CheckpointLoadOp set " << SERIAL_VAR
             << " value: " << serial_num;
 
-    std::string success = GenePath(dir, serial_num);
+    std::string success = GenePath(dir, serial_num->c_str());
     VLOG(3) << "Load checkpoint from dir: " << success;
     success = GenePath(success, SUCCESS);
     bool is_present = FileExists(success);
@@ -137,11 +143,11 @@ class CheckpointLoadOp : public framework::OperatorBase {
     auto inp_var_names = Inputs("X");
     PADDLE_ENFORCE_GT(static_cast<int>(inp_var_names.size()), 0,
                       "The number of input variables should be greater than 0");
-    LoadInputVars(scope, place, &inp_var_names);
+    LoadInputVars(scope, place, inp_var_names, dir);
 
-    VLOG(3) << "Ready to load string argv to scope";
-    auto argv = Inputs("Argv");
-    LoadStringArgv(scope, place, &argv, &dir);
+    // VLOG(3) << "Ready to load string argv to scope";
+    // auto argv = Output("Argv");
+    // LoadStringArgv(scope, place, argv, dir);
   }
 };
 
@@ -153,14 +159,13 @@ class CheckpointLoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
         "X",
         "(vector) Input LoDTensors that need to be saved together in a file.")
         .AsDuplicable();
-    AddInput(
+    AddOutput(
         "Argv",
-        "(vector) Input LoDTensors that need to be saved together in a file.")
-        .AsDuplicable();
+        "(vector) Input LoDTensors that need to be saved together in a file.");
     AddComment(R"DOC(
 CheckpointLoad operator
 
-This operator will serialize and write a list of input LoDTensor variables 
+This operator will serialize and write a list of input LoDTensor variables
 to a file on disk.
 )DOC");
 
@@ -177,10 +182,32 @@ to a file on disk.
   }
 };
 
+class CheckpointLoadOpVarTypeInference : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
+    auto out_var_name = op_desc.Output("Argv").front();
+    auto &out_var = block->FindRecursiveOrCreateVar(out_var_name);
+    auto var_type = framework::proto::VarType::RAW;
+    out_var.SetType(var_type);
+  }
+};
+
+class CheckpointLoadOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {}
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(checkpoint_load, ops::CheckpointLoadOp,
-                  ops::CheckpointLoadOpProtoMaker);
+                  paddle::framework::EmptyGradOpMaker,
+                  ops::CheckpointLoadOpProtoMaker,
+                  ops::CheckpointLoadOpVarTypeInference,
+                  ops::CheckpointLoadOpShapeInference);
+
+// REGISTER_OPERATOR(checkpoint_load, ops::CheckpointLoadOp,
+//                  ops::CheckpointLoadOpProtoMaker);
diff --git a/paddle/fluid/operators/checkpoint_op_test.cc b/paddle/fluid/operators/checkpoint_op_test.cc
index 75bfc3f840765b..2acce227d23de5 100644
--- a/paddle/fluid/operators/checkpoint_op_test.cc
+++ b/paddle/fluid/operators/checkpoint_op_test.cc
@@ -44,7 +44,7 @@ TEST(CheckpointSaveOp, CPU) {
   attrs.insert({"dir", std::string("ckpt")});
 
   auto save_op = paddle::framework::OpRegistry::CreateOp(
-      "checkpoint_save", {{"X", {"test_var"}}}, attrs);
+      "checkpoint_save", {{"X", {"test_var"}}}, {}, attrs);
   save_op->Run(scope, place);
 }
 
@@ -52,13 +52,29 @@ TEST(CheckpointLoadOp, CPU) {
   paddle::framework::Scope scope;
   paddle::platform::CPUPlace place;
 
-  scope.Var("test_var");
+  auto var = scope.Var("test_var");
+  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
+  tensor->Resize({3, 10});
+  paddle::framework::LoD expect_lod;
+  expect_lod.resize(1);
+  expect_lod[0].push_back(0);
+  expect_lod[0].push_back(1);
+  expect_lod[0].push_back(2);
+  expect_lod[0].push_back(3);
+
+  tensor->set_lod(expect_lod);
+  float* expect = tensor->mutable_data<float>(place);
+  for (int64_t i = 0; i < tensor->numel(); ++i) {
+    expect[i] = static_cast<float>(paddle::platform::float16(i));
+  }
+
+  scope.Var("SERIAL_NUMBER");
 
   paddle::framework::AttributeMap attrs;
   attrs.insert({"dir", std::string("ckpt")});
+  attrs.insert({"Serial", std::string("SERIAL_NUMBER")});
 
   auto load_op = paddle::framework::OpRegistry::CreateOp(
-      "checkpoint_load", {{"X", {"test_var"}}}, {{"Serial", {"SERIAL_NUMBER"}}},
-      attrs);
+      "checkpoint_load", {{"X", {"test_var"}}}, {{"Argv", {}}}, attrs);
   load_op->Run(scope, place);
 }
diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc
index 5fccefeed251a2..bab979e4074a61 100644
--- a/paddle/fluid/operators/checkpoint_save_op.cc
+++ b/paddle/fluid/operators/checkpoint_save_op.cc
@@ -33,12 +33,18 @@ constexpr char kSEP = '/';
 const char SUCCESS[] = "_SUCCESS";
 const char SERIAL_VAR[] = "SERIAL_NUMBER";
 
+static bool IsNumber(const std::string &s) {
+  std::string::const_iterator it = s.begin();
+  while (it != s.end() && std::isdigit(*it)) ++it;
+  return !s.empty() && it == s.end();
+}
+
 static std::string GenePath(const std::string &dir, const std::string &file) {
   std::string file_path;
-  file_path.append(file_path);
+  file_path.append(dir);
   file_path.append("/");
   file_path.append(file);
-  return full_path;
+  return file_path;
 }
 
 static bool FileExists(const std::string &filepath) {
@@ -79,28 +85,24 @@ class CheckpointSaveOp : public framework::OperatorBase {
  private:
   void RunImpl(const framework::Scope &scope,
                const platform::Place &place) const override {
-    auto dir = Attr<std::string>("dir");
+    auto ck_dir = Attr<std::string>("dir");
     auto overwrite = Attr<bool>("overwrite");
 
     std::string serial_var_name = std::string(SERIAL_VAR);
-    auto *serial_var = scope.FindVar(serial_var_name);
-
-    if (serial_var == nullptr) {
-      *serial_var = scope.Var(serial_var_name);
-      *serial_tmp = serial_var->GetMutable<std::string>();
-      serial_tmp->append("0");
-    }
-    auto *serial_num = serial_var->GetMutable<std::string>();
-    VLOG(1) << "CheckpointSaveOp get " << SERIAL_NUMBER
+    auto *serial_num =
+        scope.FindVar(serial_var_name)->GetMutable<std::string>();
+    VLOG(1) << "CheckpointSaveOp get " << SERIAL_VAR
             << " value: " << serial_num;
 
-    auto *serial_num = serial_var->GetMutable<std::string>();
-    serial_num->append("1");
+    if (!IsNumber(serial_num)) {
+      serial_num = "0";
+    }
 
-    dir = GenePath(dir, serial_num);
+    std::string dir = GenePath(ck_dir, serial_num->c_str());
+    VLOG(1) << "CheckpointSaveOp current dir: " << dir;
     bool is_present = FileExists(dir);
     if (is_present && !overwrite) {
-      PADDLE_THROW("%s exists!, checkpoint save cannot to  overwrite it", dir,
+      PADDLE_THROW("%s exists!, checkpoint save cannot to overwrite it", dir,
                    overwrite);
     }
     MkDirRecursively(dir.c_str());
@@ -150,7 +152,7 @@ class CheckpointSaveOpProtoMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 CheckpointSave operator
 
-This operator will serialize and write a list of input LoDTensor variables 
+This operator will serialize and write a list of input LoDTensor variables
 to a file on disk.
 )DOC");
     AddAttr<bool>("overwrite",
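
The Serial attribute is now carried as a string but is expected to contain
only digits. In Python the IsNumber check added above amounts to the short
helper below (equivalent in spirit, not an actual binding of the C++ code):

    def is_number(s):
        """True iff s is non-empty and made up only of decimal digits,
        mirroring the IsNumber helper in the checkpoint ops."""
        return bool(s) and all(c.isdigit() for c in s)

    assert is_number("42")
    assert not is_number("") and not is_number("4a")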

From f688652f1e3ee2eaf949ef79cbd56c05fc4980cd Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Fri, 18 May 2018 10:26:41 +0800
Subject: [PATCH 36/56] bug fix

---
 paddle/fluid/operators/checkpoint_load_op.cc | 5 +++--
 paddle/fluid/operators/checkpoint_op_test.cc | 2 ++
 paddle/fluid/operators/checkpoint_save_op.cc | 4 ++--
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc
index d24c7819990f04..a9676de369b4b4 100644
--- a/paddle/fluid/operators/checkpoint_load_op.cc
+++ b/paddle/fluid/operators/checkpoint_load_op.cc
@@ -114,7 +114,7 @@ class CheckpointLoadOp : public framework::OperatorBase {
     std::string dir = Attr<std::string>("dir");
     std::string serial_num_attr = Attr<std::string>("Serial");
 
-    PADDLE_ENFORCE(IsNumber(serial_num_attr),
+    PADDLE_ENFORCE(!IsNumber(serial_num_attr),
                    "Checkpoint Serial must be a number");
 
     std::string serial_var_name = std::string(SERIAL_VAR);
@@ -124,7 +124,8 @@ class CheckpointLoadOp : public framework::OperatorBase {
                    serial_var_name);
 
     auto *serial_num = serial_var->GetMutable<std::string>();
-    serial_num = serial_num_attr;
+    serial_num->clear();
+    serial_num->append(serial_num_attr);
 
     VLOG(1) << "CheckpointLoadOp set " << SERIAL_VAR
             << " value: " << serial_num;
diff --git a/paddle/fluid/operators/checkpoint_op_test.cc b/paddle/fluid/operators/checkpoint_op_test.cc
index 2acce227d23de5..5312225e5f9523 100644
--- a/paddle/fluid/operators/checkpoint_op_test.cc
+++ b/paddle/fluid/operators/checkpoint_op_test.cc
@@ -69,6 +69,8 @@ TEST(CheckpointLoadOp, CPU) {
   }
 
   scope.Var("SERIAL_NUMBER");
+  auto* serial_num = scope.FindVar("SERIAL_NUMBER")->GetMutable<std::string>();
+  serial_num->append("0");
 
   paddle::framework::AttributeMap attrs;
   attrs.insert({"dir", std::string("ckpt")});
diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc
index bab979e4074a61..30eda30c5f52fb 100644
--- a/paddle/fluid/operators/checkpoint_save_op.cc
+++ b/paddle/fluid/operators/checkpoint_save_op.cc
@@ -94,8 +94,8 @@ class CheckpointSaveOp : public framework::OperatorBase {
     VLOG(1) << "CheckpointSaveOp get " << SERIAL_VAR
             << " value: " << serial_num;
 
-    if (!IsNumber(serial_num)) {
-      serial_num = "0";
+    if (serial_num->empty()) {
+      serial_num->append("0");
     }
 
     std::string dir = GenePath(ck_dir, serial_num->c_str());

From 821acdb3bffdf0594d4bf94a4cddc47c2c681ca6 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Fri, 18 May 2018 11:18:22 +0800
Subject: [PATCH 37/56] update op to trainer and pserver

---
 .../fluid/transpiler/distribute_transpiler.py | 99 ++++++++++++++-----
 1 file changed, 72 insertions(+), 27 deletions(-)

diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 84cfc6e0117e8b..4e15718771162f 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 
+import os
 import math
 
 import distributed_splitter as splitter
@@ -26,6 +27,10 @@
 LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad"
 RPC_CLIENT_VAR_NAME = "RPC_CLIENT_VAR"
 
+# for checkpoint
+SUCCESS = "_SUCCESS"
+SERIAL_VAR_NAME = "SERIAL_NUMBER"
+
 
 class VarBlock:
     def __init__(self, varname, offset, size):
@@ -153,7 +158,8 @@ def transpile(self,
                   pservers="127.0.0.1:6174",
                   trainers=1,
                   split_method=splitter.round_robin,
-                  sync_mode=True):
+                  sync_mode=True,
+                  checkpoint_dir=None):
         """
         Transpile the program to distributed data-parallelism programs.
         The main_program will be transformed to use a remote parameter server
@@ -315,22 +321,22 @@ def transpile(self,
                 "sync_mode": self.sync_mode
             })
 
-        serial_var = program.global_block().create_var(
-            name="SERIAL_NUMBER",
-            persistable=True,
-            type=core.VarDesc.VarType.RAW)
+        if checkpoint_dir and self.is_chief:
+            program.global_block().create_var(
+                name=SERIAL_VAR_NAME,
+                persistable=True,
+                type=core.VarDesc.VarType.RAW)
 
-        save_vars = []
-        for var in self.origin_program.list_vars():
-            if self.is_persistable(var):
-                save_vars.append(var.name)
+            save_vars = []
+            for var in self.origin_program.list_vars():
+                if self._is_persistable(var):
+                    save_vars.append(var.name)
 
-        program.global_block().append_op(
-            type="checkpoint_save",
-            inputs={"X": save_vars},
-            outputs={"Serial": serial_var},
-            attrs={"overwrite": False,
-                   "dir": "/workspace/ckpt/"})
+            program.global_block().append_op(
+                type="checkpoint_save",
+                inputs={"X": save_vars},
+                attrs={"overwrite": True,
+                       "dir": checkpoint_dir})
 
         # step4: Concat the parameters splits together after recv.
         for varname, splited_var in param_var_mapping.iteritems():
@@ -512,13 +518,6 @@ def __append_optimize_op__(op, block, grad_to_block_id):
         pserver_program.sync_with_cpp()
         return pserver_program
 
-    def is_persistable(self, var):
-        if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
-                var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
-                var.desc.type() == core.VarDesc.VarType.RAW :
-            return False
-        return var.persistable
-
     def get_train_startup_program(self, checkpoint_load_dir=None):
         """
         Get train startup program.
@@ -532,13 +531,16 @@ def get_train_startup_program(self, checkpoint_load_dir=None):
 
         load_vars = []
         for var in startup_prog.list_vars():
-            if self.is_persistable(var):
+            if self._is_persistable(var):
                 load_vars.append(var.name)
 
+        serial_number = self._get_lastest_checkpoint_dir(checkpoint_load_dir)
+
         startup_prog.global_block().append_op(
             type="checkpoint_load",
-            outputs={"Out": load_vars},
-            attrs={"dir": checkpoint_load_dir})
+            inputs={"X": load_vars},
+            attrs={"dir": checkpoint_load_dir,
+                   "Serial": serial_number})
         return startup_prog
 
     def get_startup_program(self,
@@ -599,16 +601,59 @@ def _get_splited_name_and_shape(varname):
                     attrs=op.attrs)
                 for var in new_outputs.values():
                     load_vars.append(var.name)
-        # add checkpoint op 
+        # add checkpoint op
         if not checkpoint_load_dir:
             return s_prog
 
+        serial_number = self._get_lastest_checkpoint_dir(checkpoint_load_dir)
+
         s_prog.global_block().append_op(
             type="checkpoint_load",
             inputs={"X": load_vars},
-            attrs={"dir": checkpoint_load_dir})
+            attrs={"dir": checkpoint_load_dir,
+                   "Serial": serial_number})
+
         return s_prog
 
+    def _is_persistable(self, var):
+        """only save LodTensor variable"""
+        if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
+                var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
+                var.desc.type() == core.VarDesc.VarType.RAW :
+            return False
+        return var.persistable
+
+    def _get_lastest_checkpoint_dir(self, checkpoint_dir):
+        """
+        get the biggest number in checkpoint_dir, which has _SUCCESS
+        """
+        if not checkpoint_dir.strip():
+            return ""
+
+        def has_success(checkpoint_dir, cur_dir):
+            """
+            is _SUCCESS in this dir
+            """
+            if not os.path.isdir(cur_dir):
+                return -1
+
+            try:
+                int(cur_dir)
+            except ValueError:
+                return -1
+
+            success_path = os.path.join(checkpoint_dir, cur_dir, SUCCESS)
+            if os.path.isfile(success_path):
+                return int(cur_dir)
+
+        current_dir = 0
+        dirs = os.listdir(checkpoint_dir)
+        for cur_dir in dirs:
+            success_num = has_success(checkpoint_dir, cur_dir)
+            if success_num > current_dir:
+                current_dir = success_num
+        return str(current_dir)
+
     # transpiler function for dis lookup_table
     def _replace_lookup_table_op_with_prefetch(self, program, rpc_client_var,
                                                eplist):
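
Note on the serial lookup above: _get_lastest_checkpoint_dir scans the checkpoint directory for numeric sub-directories, keeps only the ones that contain a _SUCCESS marker, and returns the largest serial it finds. A minimal standalone Python sketch of that behavior (the latest_serial helper name is illustrative, not part of the patch):

    import os

    SUCCESS_MARK = "_SUCCESS"  # empty marker file written after a complete save

    def latest_serial(checkpoint_dir):
        """Return the largest numeric sub-directory of checkpoint_dir that
        contains a _SUCCESS marker, or -1 when none is found."""
        if not os.path.isdir(checkpoint_dir):
            return -1
        best = -1
        for name in os.listdir(checkpoint_dir):
            sub_dir = os.path.join(checkpoint_dir, name)
            # only completed, numerically named serial directories count
            if name.isdigit() and os.path.isdir(sub_dir) \
                    and os.path.isfile(os.path.join(sub_dir, SUCCESS_MARK)):
                best = max(best, int(name))
        return best

    # with ckpt/0/_SUCCESS and ckpt/1/_SUCCESS on disk, latest_serial("ckpt") == 1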

From cd98f2b7e0eb251659565c9f9171f52c95c819f8 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Fri, 18 May 2018 17:44:48 +0800
Subject: [PATCH 38/56] bug fix

---
 paddle/fluid/operators/checkpoint_load_op.cc | 3 +--
 paddle/fluid/operators/checkpoint_save_op.cc | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc
index a9676de369b4b4..82a40e18d5c9e5 100644
--- a/paddle/fluid/operators/checkpoint_load_op.cc
+++ b/paddle/fluid/operators/checkpoint_load_op.cc
@@ -154,8 +154,7 @@ class CheckpointLoadOp : public framework::OperatorBase {
 
 class CheckpointLoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CheckpointLoadOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(
         "X",
         "(vector) Input LoDTensors that need to be saved together in a file.")
diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc
index 30eda30c5f52fb..790fd4ea68505a 100644
--- a/paddle/fluid/operators/checkpoint_save_op.cc
+++ b/paddle/fluid/operators/checkpoint_save_op.cc
@@ -143,8 +143,7 @@ class CheckpointSaveOp : public framework::OperatorBase {
 
 class CheckpointSaveOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CheckpointSaveOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(
         "X",
         "(vector) Input LoDTensors that need to be saved together in a file.")

From dbd023771f82cb69574b374bea30836f3804015a Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Fri, 18 May 2018 19:37:49 +0800
Subject: [PATCH 39/56] fix serial number

---
 paddle/fluid/operators/checkpoint_load_op.cc |  2 +-
 paddle/fluid/operators/checkpoint_save_op.cc | 10 ++++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc
index 82a40e18d5c9e5..c18edf63062044 100644
--- a/paddle/fluid/operators/checkpoint_load_op.cc
+++ b/paddle/fluid/operators/checkpoint_load_op.cc
@@ -176,7 +176,7 @@ to a file on disk.
     AddAttr<std::string>(
         "dir",
         "(string)"
-        "The \"file_path\" where the LoDTensor variables will be saved.")
+        "The \"dir\" where the checkpoint files will be loaded.")
         .AddCustomChecker(
             [](const std::string &path) { return !path.empty(); });
   }
diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc
index 790fd4ea68505a..1832c5792a18c7 100644
--- a/paddle/fluid/operators/checkpoint_save_op.cc
+++ b/paddle/fluid/operators/checkpoint_save_op.cc
@@ -94,10 +94,16 @@ class CheckpointSaveOp : public framework::OperatorBase {
     VLOG(1) << "CheckpointSaveOp get " << SERIAL_VAR
             << " value: " << serial_num;
 
-    if (serial_num->empty()) {
-      serial_num->append("0");
+    int serials = 0;
+    if (!serial_num->empty()) {
+      std::string::size_type sz;
+      serials = std::stoi(serial_num->data, &sz);
+      serials += 1;
     }
 
+    serial_num->clear();
+    serial_num->append(std::to_string(serials));
+
     std::string dir = GenePath(ck_dir, serial_num->c_str());
     VLOG(1) << "CheckpointSaveOp current dir: " << dir;
     bool is_present = FileExists(dir);
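
The serial bookkeeping above keeps the checkpoint counter in the string-valued SERIAL_NUMBER scope variable: an empty value means this is the first save (serial 0); otherwise the stored value is parsed, incremented, written back, and used as the name of the sub-directory for the new checkpoint. A minimal Python sketch of the same logic (the next_checkpoint_dir helper is illustrative; the op does this in C++):

    import os

    def next_checkpoint_dir(ck_dir, serial_num):
        """Given the previous serial as a string (possibly empty), return the
        new serial string and the directory the next checkpoint goes into."""
        serial = 0
        if serial_num:                    # first save keeps serial 0
            serial = int(serial_num) + 1  # later saves bump the counter
        return str(serial), os.path.join(ck_dir, str(serial))

    # next_checkpoint_dir("/workspace/ckpt", "")  -> ("0", "/workspace/ckpt/0")
    # next_checkpoint_dir("/workspace/ckpt", "0") -> ("1", "/workspace/ckpt/1")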

From 22df4c278c19ab5eca71431d878eb78f053e6bc5 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Fri, 18 May 2018 21:17:37 +0800
Subject: [PATCH 40/56] fix serial number

---
 paddle/fluid/operators/checkpoint_load_op.cc            | 2 +-
 paddle/fluid/operators/checkpoint_save_op.cc            | 3 +--
 python/paddle/fluid/transpiler/distribute_transpiler.py | 4 +++-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc
index c18edf63062044..6c88cbdab0758d 100644
--- a/paddle/fluid/operators/checkpoint_load_op.cc
+++ b/paddle/fluid/operators/checkpoint_load_op.cc
@@ -114,7 +114,7 @@ class CheckpointLoadOp : public framework::OperatorBase {
     std::string dir = Attr<std::string>("dir");
     std::string serial_num_attr = Attr<std::string>("Serial");
 
-    PADDLE_ENFORCE(!IsNumber(serial_num_attr),
+    PADDLE_ENFORCE(IsNumber(serial_num_attr),
                    "Checkpoint Serial must be a number");
 
     std::string serial_var_name = std::string(SERIAL_VAR);
diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc
index 1832c5792a18c7..f904cdc8269e71 100644
--- a/paddle/fluid/operators/checkpoint_save_op.cc
+++ b/paddle/fluid/operators/checkpoint_save_op.cc
@@ -96,8 +96,7 @@ class CheckpointSaveOp : public framework::OperatorBase {
 
     int serials = 0;
     if (!serial_num->empty()) {
-      std::string::size_type sz;
-      serials = std::stoi(serial_num->data, &sz);
+      serials = std::stoi(serial_num->data());
       serials += 1;
     }
 
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index e1a2fe86a58045..335dc2342d08c0 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -545,6 +545,7 @@ def get_train_startup_program(self, checkpoint_load_dir=None):
         startup_prog.global_block().append_op(
             type="checkpoint_load",
             inputs={"X": load_vars},
+            outputs={"Argv": []},
             attrs={"dir": checkpoint_load_dir,
                    "Serial": serial_number})
         return startup_prog
@@ -616,6 +617,7 @@ def _get_splited_name_and_shape(varname):
         s_prog.global_block().append_op(
             type="checkpoint_load",
             inputs={"X": load_vars},
+            outputs={"Argv": []},
             attrs={"dir": checkpoint_load_dir,
                    "Serial": serial_number})
 
@@ -640,7 +642,7 @@ def has_success(checkpoint_dir, cur_dir):
             """
             is _SUCCESS in this dir
             """
-            if not os.path.isdir(cur_dir):
+            if not os.path.isdir(os.path.join(checkpoint_dir, cur_dir)):
                 return -1
 
             try:

From d98480cff5fe2e08fadc79fccd5bce9ab01ed28c Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Fri, 18 May 2018 21:55:33 +0800
Subject: [PATCH 41/56] fix serial number

---
 paddle/fluid/operators/checkpoint_load_op.cc            | 4 ++--
 python/paddle/fluid/transpiler/distribute_transpiler.py | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc
index 6c88cbdab0758d..18871e56c50171 100644
--- a/paddle/fluid/operators/checkpoint_load_op.cc
+++ b/paddle/fluid/operators/checkpoint_load_op.cc
@@ -114,8 +114,8 @@ class CheckpointLoadOp : public framework::OperatorBase {
     std::string dir = Attr<std::string>("dir");
     std::string serial_num_attr = Attr<std::string>("Serial");
 
-    PADDLE_ENFORCE(IsNumber(serial_num_attr),
-                   "Checkpoint Serial must be a number");
+    VLOG(3) << "CheckpointLoadOp  get Attr  dir: " << dir;
+    VLOG(3) << "CheckpointLoadOp  get Attr  Serial: " << serial_num_attr;
 
     std::string serial_var_name = std::string(SERIAL_VAR);
     auto *serial_var = scope.FindVar(serial_var_name);
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 335dc2342d08c0..bb1e026a6b1ffd 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -654,6 +654,9 @@ def has_success(checkpoint_dir, cur_dir):
             if os.path.isfile(success_path):
                 return int(cur_dir)
 
+        if os.path.isdir(checkpoint_dir):
+            return "-1"
+
         current_dir = 0
         dirs = os.listdir(checkpoint_dir)
         for cur_dir in dirs:

From ee91e48e346a0504cd3c478ca5ba4e905f5442ff Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Fri, 18 May 2018 22:20:07 +0800
Subject: [PATCH 42/56] fix serial number

---
 python/paddle/fluid/transpiler/distribute_transpiler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index bb1e026a6b1ffd..8b379ddcf89f05 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -654,7 +654,7 @@ def has_success(checkpoint_dir, cur_dir):
             if os.path.isfile(success_path):
                 return int(cur_dir)
 
-        if os.path.isdir(checkpoint_dir):
+        if not os.path.isdir(checkpoint_dir):
             return "-1"
 
         current_dir = 0

From b6ee59ae2573fbbe66ab574be299d6b6fe52552c Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Fri, 18 May 2018 22:24:24 +0800
Subject: [PATCH 43/56] optimize python checkpoint dir config

---
 .../fluid/transpiler/distribute_transpiler.py | 30 +++++++++----------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 8b379ddcf89f05..dc9d254fa55311 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -219,7 +219,8 @@ def transpile(self,
         # is_chief (no.0 trainer) for checkpoint
         # the no.0 trainer will save all variables and its own reader offset to checkpoint
         # other trainers will save their own reader offset to checkpoint
-        self.is_chief = trainer_id == 0
+        self._is_chief = trainer_id == 0
+        self.checkpoint_dir = checkpoint_dir
 
         # process lookup_table_op
         # 1. check all lookup_table_op is distributed
@@ -327,7 +328,7 @@ def transpile(self,
                 "sync_mode": self.sync_mode
             })
 
-        if checkpoint_dir and self.is_chief:
+        if self.checkpoint_dir and self._is_chief:
             program.global_block().create_var(
                 name=SERIAL_VAR_NAME,
                 persistable=True,
@@ -342,7 +343,7 @@ def transpile(self,
                 type="checkpoint_save",
                 inputs={"X": save_vars},
                 attrs={"overwrite": True,
-                       "dir": checkpoint_dir})
+                       "dir": self.checkpoint_dir})
 
         # step4: Concat the parameters splits together after recv.
         for varname, splited_var in param_var_mapping.iteritems():
@@ -524,15 +525,15 @@ def __append_optimize_op__(op, block, grad_to_block_id):
         pserver_program.sync_with_cpp()
         return pserver_program
 
-    def get_train_startup_program(self, checkpoint_load_dir=None):
+    def get_train_startup_program(self):
         """
         Get train startup program.
-        If checkpoint_load_dir is None, return the default startup program.
-        If checkpoint_load_dir exists, add a checkpoint_load op and load the vars.
+        If self.checkpoint_dir is None, return the default startup program.
+        If self.checkpoint_dir exists, add a checkpoint_load op and load the vars.
         """
         startup_prog = default_startup_program()
 
-        if not checkpoint_load_dir:
+        if not self.checkpoint_dir:
             return startup_prog
 
         load_vars = []
@@ -540,20 +541,17 @@ def get_train_startup_program(self, checkpoint_load_dir=None):
             if self._is_persistable(var):
                 load_vars.append(var.name)
 
-        serial_number = self._get_lastest_checkpoint_dir(checkpoint_load_dir)
+        serial_number = self._get_lastest_checkpoint_dir(self.checkpoint_dir)
 
         startup_prog.global_block().append_op(
             type="checkpoint_load",
             inputs={"X": load_vars},
             outputs={"Argv": []},
-            attrs={"dir": checkpoint_load_dir,
+            attrs={"dir": self.checkpoint_dir,
                    "Serial": serial_number})
         return startup_prog
 
-    def get_startup_program(self,
-                            endpoint,
-                            pserver_program,
-                            checkpoint_load_dir=None):
+    def get_startup_program(self, endpoint, pserver_program):
         """
         Get startup program for current parameter server.
         Modify operator input variables if there are variables that
@@ -609,16 +607,16 @@ def _get_splited_name_and_shape(varname):
                 for var in new_outputs.values():
                     load_vars.append(var.name)
         # add checkpoint op
-        if not checkpoint_load_dir:
+        if not self.checkpoint_dir:
             return s_prog
 
-        serial_number = self._get_lastest_checkpoint_dir(checkpoint_load_dir)
+        serial_number = self._get_lastest_checkpoint_dir(self.checkpoint_dir)
 
         s_prog.global_block().append_op(
             type="checkpoint_load",
             inputs={"X": load_vars},
             outputs={"Argv": []},
-            attrs={"dir": checkpoint_load_dir,
+            attrs={"dir": self.checkpoint_dir,
                    "Serial": serial_number})
 
         return s_prog
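
With the checkpoint directory stored on the transpiler, the trainer side only needs to pass checkpoint_dir once at transpile time and then fetch the startup program; the checkpoint_load op is appended automatically when a directory is configured. A hypothetical usage sketch as of this patch (the endpoints and directory are placeholders, assuming the class is exposed as fluid.DistributeTranspiler):

    import paddle.fluid as fluid

    t = fluid.DistributeTranspiler()
    t.transpile(
        trainer_id=0,                     # the chief trainer appends checkpoint_save
        pservers="127.0.0.1:6174",
        trainers=2,
        checkpoint_dir="/workspace/ckpt")

    # carries a checkpoint_load op when a _SUCCESS'd serial exists under the dir
    startup_prog = t.get_train_startup_program()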

From e130bf375235cf349904c433f3ff1c1c99f12083 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Fri, 18 May 2018 23:28:46 +0800
Subject: [PATCH 44/56] optimize python checkpoint dir config

---
 .../paddle/fluid/transpiler/distribute_transpiler.py   | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index dc9d254fa55311..1d51ed45792664 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -543,6 +543,11 @@ def get_train_startup_program(self):
 
         serial_number = self._get_lastest_checkpoint_dir(self.checkpoint_dir)
 
+        startup_prog.global_block().create_var(
+            name=SERIAL_VAR_NAME,
+            persistable=True,
+            type=core.VarDesc.VarType.RAW)
+
         startup_prog.global_block().append_op(
             type="checkpoint_load",
             inputs={"X": load_vars},
@@ -612,6 +617,11 @@ def _get_splited_name_and_shape(varname):
 
         serial_number = self._get_lastest_checkpoint_dir(self.checkpoint_dir)
 
+        s_prog.global_block().create_var(
+            name=SERIAL_VAR_NAME,
+            persistable=True,
+            type=core.VarDesc.VarType.RAW)
+
         s_prog.global_block().append_op(
             type="checkpoint_load",
             inputs={"X": load_vars},

From 5451c78ded15dedb1e9e89f25d5145e646f83563 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Mon, 21 May 2018 15:13:58 +0800
Subject: [PATCH 45/56] add checkpoint in io

---
 python/paddle/fluid/io.py | 87 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 54506e97ed5c9a..502386016cfad7 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -454,3 +454,90 @@ def get_parameter_value_by_name(name, executor, program=None):
         program = default_main_program()
     var = program.global_block().var(name)
     return get_parameter_value(var, executor)
+
+
+SUCCESS = "_SUCCESS"
+
+
+def save_checkpoint(executor,
+                    dirname,
+                    keep_max=10,
+                    save_secs=600,
+                    main_program=None):
+    """
+    Save Variables to Checkpoint Dir
+
+    :param dirname
+    :param keep_max
+    :param save_secs
+    """
+    if dirname is None:
+        raise Exception("save checkpoint dir can not be none")
+
+    if not os.path.isdir(dirname):
+        os.makedirs(dirname)
+    serial = _get_lastest_checkpoint_dir(dirname) + 1
+
+    cur_dir = os.path.join(dirname, serial)
+    save_persistables(executor, cur_dir, main_program)
+    _write_success(cur_dir)
+
+
+def restore_checkpoint(dirname, executor, main_program=None):
+    """
+    Load Variables from Checkpoint Dir
+
+    :param dir
+    """
+    if dirname is None and os.path.isdir(dirname):
+        raise Exception("restore checkpoint can not load variables from %s" %
+                        dirname)
+    serial = _get_lastest_checkpoint_dir(dirname) + 1
+
+    if serial < -1:
+        return
+    cur_dir = os.path.join(dirname, serial)
+    load_persistables(executor, cur_dir, main_program)
+
+
+def _write_success(dirname):
+    """
+    """
+    success_file = os.path.join(dirname, SUCCESS)
+    with open(success_file, 'a'):
+        pass
+
+
+def _get_lastest_checkpoint_dir(checkpoint_dir):
+    """
+    get the biggest number in checkpoint_dir, which has _SUCCESS
+    """
+    if not checkpoint_dir.strip():
+        return ""
+
+    def has_success(checkpoint_dir, cur_dir):
+        """
+        is _SUCCESS in this dir
+        """
+        if not os.path.isdir(os.path.join(checkpoint_dir, cur_dir)):
+            return -1
+
+        try:
+            int(cur_dir)
+        except ValueError:
+            return -1
+
+        success_path = os.path.join(checkpoint_dir, cur_dir, SUCCESS)
+        if os.path.isfile(success_path):
+            return int(cur_dir)
+
+    if not os.path.isdir(checkpoint_dir):
+        return -1
+
+    current_dir = -1
+    dirs = os.listdir(checkpoint_dir)
+    for cur_dir in dirs:
+        success_num = has_success(checkpoint_dir, cur_dir)
+        if success_num > current_dir:
+            current_dir = success_num
+    return current_dir
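
A minimal sketch of how these two helpers are meant to be driven from a training script (the place, executor and loop are placeholders; only save_checkpoint and restore_checkpoint come from this patch):

    import paddle.fluid as fluid
    from paddle.fluid import io

    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    # resume from the newest serial under the directory that has _SUCCESS, if any
    io.restore_checkpoint("/workspace/ckpt", exe)

    for pass_id in range(10):
        # ... run training iterations here ...
        # write persistable variables into a new serial sub-directory
        io.save_checkpoint(exe, "/workspace/ckpt")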

From 01975ec1c749c9576a1124a7f029234caa86e0ed Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Mon, 21 May 2018 16:53:59 +0800
Subject: [PATCH 46/56] add checkpoint in io

---
 python/paddle/fluid/io.py | 65 +++++++++++++++++++++++++++++----------
 1 file changed, 48 insertions(+), 17 deletions(-)

diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 502386016cfad7..83c32fe9d6ebdc 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -13,21 +13,17 @@
 # limitations under the License.
 
 import os
+import time
+import shutil
 
 from paddle.fluid.evaluator import Evaluator
 from paddle.fluid.framework import Program, Parameter, default_main_program, Variable
 from . import core
 
 __all__ = [
-    'save_vars',
-    'save_params',
-    'save_persistables',
-    'load_vars',
-    'load_params',
-    'load_persistables',
-    'save_inference_model',
-    'load_inference_model',
-    'get_inference_program',
+    'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params',
+    'load_persistables', 'save_inference_model', 'load_inference_model',
+    'get_inference_program', 'save_checkpoint', 'restore_checkpoint'
 ]
 
 
@@ -195,6 +191,8 @@ def load_vars(executor,
         load_var_map = {}
         for each_var in vars:
             assert isinstance(each_var, Variable)
+            if each_var.type == core.VarDesc.VarType.RAW:
+                continue
             new_var = _clone_var_in_block_(load_block, each_var)
             if filename is None:
                 load_block.append_op(
@@ -457,11 +455,12 @@ def get_parameter_value_by_name(name, executor, program=None):
 
 
 SUCCESS = "_SUCCESS"
+BEGIN_SECS = time.time()
 
 
 def save_checkpoint(executor,
                     dirname,
-                    keep_max=10,
+                    keep_max=3,
                     save_secs=600,
                     main_program=None):
     """
@@ -470,38 +469,70 @@ def save_checkpoint(executor,
     :param dirname
     :param keep_max
     :param save_secs
+    :param main_program
     """
     if dirname is None:
         raise Exception("save checkpoint dir can not be none")
 
     if not os.path.isdir(dirname):
         os.makedirs(dirname)
-    serial = _get_lastest_checkpoint_dir(dirname) + 1
 
-    cur_dir = os.path.join(dirname, serial)
+    global BEGIN_SECS
+    if time.time() - BEGIN_SECS < save_secs:
+        return
+    BEGIN_SECS = time.time()
+
+    serial = _get_lastest_checkpoint_dir(dirname) + 1
+    cur_dir = os.path.join(dirname, str(serial))
     save_persistables(executor, cur_dir, main_program)
     _write_success(cur_dir)
+    _lru_delete(dirname, keep_max)
 
 
 def restore_checkpoint(dirname, executor, main_program=None):
     """
     Load Variables from Checkpoint Dir
 
-    :param dir
+    :param dirname
+    :param executor
+    :param main_program
     """
     if dirname is None and os.path.isdir(dirname):
         raise Exception("restore checkpoint can not load variables from %s" %
                         dirname)
-    serial = _get_lastest_checkpoint_dir(dirname) + 1
+    serial = _get_lastest_checkpoint_dir(dirname)
 
-    if serial < -1:
+    if serial < 0:
         return
-    cur_dir = os.path.join(dirname, serial)
+    cur_dir = os.path.join(dirname, str(serial))
     load_persistables(executor, cur_dir, main_program)
 
 
+def _lru_delete(dirname, keep_max=3):
+    """
+    retain checkpoint nums with keep_max
+    """
+    dirs = os.listdir(dirname)
+    serials = []
+    for serial in dirs:
+        try:
+            serials.append(int(serial))
+        except ValueError:
+            continue
+
+    if len(serials) <= keep_max:
+        return
+
+    serials.sort(reverse=True)
+    serials = serials[keep_max:]
+    for serial in serials:
+        cur_dir = os.path.join(dirname, str(serial))
+        shutil.rmtree(cur_dir)
+
+
 def _write_success(dirname):
     """
+    write _SUCCESS to checkpoint dir
     """
     success_file = os.path.join(dirname, SUCCESS)
     with open(success_file, 'a'):
@@ -513,7 +544,7 @@ def _get_lastest_checkpoint_dir(checkpoint_dir):
     get the biggest number in checkpoint_dir, which has _SUCCESS
     """
     if not checkpoint_dir.strip():
-        return ""
+        return -1
 
     def has_success(checkpoint_dir, cur_dir):
         """

From ed2129cc50b794f76574065430577e0303a6703d Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Mon, 21 May 2018 16:57:40 +0800
Subject: [PATCH 47/56] revert distribute_transpiler.py

---
 .../fluid/transpiler/distribute_transpiler.py | 126 +-----------------
 1 file changed, 1 insertion(+), 125 deletions(-)

diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 1d51ed45792664..42ff0a9eb1112e 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -14,7 +14,6 @@
 
 from __future__ import print_function
 
-import os
 import math
 
 import distributed_splitter as splitter
@@ -27,10 +26,6 @@
 LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad"
 RPC_CLIENT_VAR_NAME = "RPC_CLIENT_VAR"
 
-# for checkpoint
-SUCCESS = "_SUCCESS"
-SERIAL_VAR_NAME = "SERIAL_NUMBER"
-
 
 class VarBlock:
     def __init__(self, varname, offset, size):
@@ -161,8 +156,7 @@ def transpile(self,
                   pservers="127.0.0.1:6174",
                   trainers=1,
                   split_method=splitter.round_robin,
-                  sync_mode=True,
-                  checkpoint_dir=None):
+                  sync_mode=True):
         """
         Transpile the program to distributed data-parallelism programs.
         The main_program will be transformed to use a remote parameter server
@@ -216,12 +210,6 @@ def transpile(self,
         self.pserver_endpoints = pserver_endpoints
         self.optimize_ops, params_grads = self._get_optimize_pass()
 
-        # is_chief (no.0 trainer) for checkpoint
-        # the no.0 trainer will save all variables and its own reader offset to checkpoint
-        # other trainers will save their own reader offset to checkpoint
-        self._is_chief = trainer_id == 0
-        self.checkpoint_dir = checkpoint_dir
-
         # process lookup_table_op
         # 1. check all lookup_table_op is distributed
         # 2. check all lookup_table_op share the same table.
@@ -327,24 +315,6 @@ def transpile(self,
                 "epmap": eplist,
                 "sync_mode": self.sync_mode
             })
-
-        if self.checkpoint_dir and self._is_chief:
-            program.global_block().create_var(
-                name=SERIAL_VAR_NAME,
-                persistable=True,
-                type=core.VarDesc.VarType.RAW)
-
-            save_vars = []
-            for var in self.origin_program.list_vars():
-                if self._is_persistable(var):
-                    save_vars.append(var.name)
-
-            program.global_block().append_op(
-                type="checkpoint_save",
-                inputs={"X": save_vars},
-                attrs={"overwrite": True,
-                       "dir": self.checkpoint_dir})
-
         # step4: Concat the parameters splits together after recv.
         for varname, splited_var in param_var_mapping.iteritems():
             if len(splited_var) <= 1:
@@ -525,37 +495,6 @@ def __append_optimize_op__(op, block, grad_to_block_id):
         pserver_program.sync_with_cpp()
         return pserver_program
 
-    def get_train_startup_program(self):
-        """
-        Get train startup program.
-        If self.checkpoint_dir is None, return the default startup program.
-        If self.checkpoint_dir exists, add a checkpoint_load op and load the vars.
-        """
-        startup_prog = default_startup_program()
-
-        if not self.checkpoint_dir:
-            return startup_prog
-
-        load_vars = []
-        for var in startup_prog.list_vars():
-            if self._is_persistable(var):
-                load_vars.append(var.name)
-
-        serial_number = self._get_lastest_checkpoint_dir(self.checkpoint_dir)
-
-        startup_prog.global_block().create_var(
-            name=SERIAL_VAR_NAME,
-            persistable=True,
-            type=core.VarDesc.VarType.RAW)
-
-        startup_prog.global_block().append_op(
-            type="checkpoint_load",
-            inputs={"X": load_vars},
-            outputs={"Argv": []},
-            attrs={"dir": self.checkpoint_dir,
-                   "Serial": serial_number})
-        return startup_prog
-
     def get_startup_program(self, endpoint, pserver_program):
         """
         Get startup program for current parameter server.
@@ -581,7 +520,6 @@ def _get_splited_name_and_shape(varname):
             created_var_map[var.name] = tmpvar
 
         # 2. rename op outputs
-        load_vars = []
         for op in orig_s_prog.global_block().ops:
             new_inputs = dict()
             new_outputs = dict()
@@ -609,70 +547,8 @@ def _get_splited_name_and_shape(varname):
                     inputs=new_inputs,
                     outputs=new_outputs,
                     attrs=op.attrs)
-                for var in new_outputs.values():
-                    load_vars.append(var.name)
-        # add checkpoint op
-        if not self.checkpoint_dir:
-            return s_prog
-
-        serial_number = self._get_lastest_checkpoint_dir(self.checkpoint_dir)
-
-        s_prog.global_block().create_var(
-            name=SERIAL_VAR_NAME,
-            persistable=True,
-            type=core.VarDesc.VarType.RAW)
-
-        s_prog.global_block().append_op(
-            type="checkpoint_load",
-            inputs={"X": load_vars},
-            outputs={"Argv": []},
-            attrs={"dir": self.checkpoint_dir,
-                   "Serial": serial_number})
-
         return s_prog
 
-    def _is_persistable(self, var):
-        """only save LodTensor variable"""
-        if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
-                var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
-                var.desc.type() == core.VarDesc.VarType.RAW :
-            return False
-        return var.persistable
-
-    def _get_lastest_checkpoint_dir(self, checkpoint_dir):
-        """
-        get the biggest number in checkpoint_dir, which has _SUCCESS
-        """
-        if not checkpoint_dir.strip():
-            return ""
-
-        def has_success(checkpoint_dir, cur_dir):
-            """
-            is _SUCCESS in this dir
-            """
-            if not os.path.isdir(os.path.join(checkpoint_dir, cur_dir)):
-                return -1
-
-            try:
-                int(cur_dir)
-            except ValueError:
-                return -1
-
-            success_path = os.path.join(checkpoint_dir, cur_dir, SUCCESS)
-            if os.path.isfile(success_path):
-                return int(cur_dir)
-
-        if not os.path.isdir(checkpoint_dir):
-            return "-1"
-
-        current_dir = 0
-        dirs = os.listdir(checkpoint_dir)
-        for cur_dir in dirs:
-            success_num = has_success(checkpoint_dir, cur_dir)
-            if success_num > current_dir:
-                current_dir = success_num
-        return str(current_dir)
-
     # transpiler function for dis lookup_table
     def _replace_lookup_table_op_with_prefetch(self, program, rpc_client_var,
                                                eplist):

From be050565241780003cef777e0b0ad0e49cd7f6b1 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Mon, 21 May 2018 19:11:23 +0800
Subject: [PATCH 48/56] delete old checkpoint code

---
 paddle/fluid/operators/CMakeLists.txt        |   3 -
 paddle/fluid/operators/checkpoint_load_op.cc | 213 -------------------
 paddle/fluid/operators/checkpoint_op_test.cc |  82 -------
 paddle/fluid/operators/checkpoint_save_op.cc | 203 ------------------
 python/paddle/fluid/framework.py             |   3 +-
 python/paddle/fluid/io.py                    |  36 +++-
 6 files changed, 32 insertions(+), 508 deletions(-)
 delete mode 100644 paddle/fluid/operators/checkpoint_load_op.cc
 delete mode 100644 paddle/fluid/operators/checkpoint_op_test.cc
 delete mode 100644 paddle/fluid/operators/checkpoint_save_op.cc

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 2288987eaf9f94..ac1f3f44ae8703 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -252,8 +252,6 @@ op_library(save_op DEPS lod_tensor)
 op_library(load_op DEPS lod_tensor)
 op_library(save_combine_op DEPS lod_tensor)
 op_library(load_combine_op DEPS lod_tensor)
-op_library(checkpoint_save_op DEPS lod_tensor)
-op_library(checkpoint_load_op DEPS lod_tensor)
 op_library(concat_op DEPS concat)
 
 # FIXME(thuan): Move CSP operators to paddle/fluid/framework/operators/concurrency
@@ -294,6 +292,5 @@ cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_sea
 cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
 cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
-cc_test(checkpoint_op_test SRCS checkpoint_op_test.cc DEPS checkpoint_save_op checkpoint_load_op)
 nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
 nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)
diff --git a/paddle/fluid/operators/checkpoint_load_op.cc b/paddle/fluid/operators/checkpoint_load_op.cc
deleted file mode 100644
index 18871e56c50171..00000000000000
--- a/paddle/fluid/operators/checkpoint_load_op.cc
+++ /dev/null
@@ -1,213 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <stdint.h>
-#include <sys/stat.h>
-#include <fstream>
-#include <numeric>
-#include <sstream>
-#include <streambuf>
-#include <string>
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/data_type_transform.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-
-constexpr char kSEP = '/';
-// write empty file named _SUCCESS
-const char SUCCESS[] = "_SUCCESS";
-const char SERIAL_VAR[] = "SERIAL_NUMBER";
-
-static bool FileExists(const std::string &filepath) {
-  struct stat buffer;
-  return (stat(filepath.c_str(), &buffer) == 0);
-}
-
-static std::string GenePath(const std::string &dir, const std::string &file) {
-  std::string file_path;
-  file_path.append(file_path);
-  file_path.append("/");
-  file_path.append(file);
-  return file_path;
-}
-
-static bool IsNumber(const std::string &s) {
-  std::string::const_iterator it = s.begin();
-  while (it != s.end() && std::isdigit(*it)) ++it;
-  return !s.empty() && it == s.end();
-}
-
-static void LoadInputVars(const framework::Scope &scope,
-                          const platform::Place &place,
-                          const std::vector<std::string> &inp_var_names,
-                          const std::string &dir) {
-  // get device context from pool
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto &dev_ctx = *pool.Get(place);
-
-  // todo (tangwei) made it async
-  for (size_t i = 0; i < inp_var_names.size(); i++) {
-    auto *var = scope.FindVar(inp_var_names[i]);
-
-    PADDLE_ENFORCE(var != nullptr,
-                   "Cannot find variable %s for save_combine_op",
-                   inp_var_names[i]);
-    PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
-                   "LoadCombineOp only supports LoDTensor, %s has wrong type",
-                   inp_var_names[i]);
-
-    std::string var_file = GenePath(dir, inp_var_names[i]);
-    auto *tensor = var->GetMutable<framework::LoDTensor>();
-    std::ifstream fin(var_file);
-    PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
-                   var_file);
-    framework::DeserializeFromStream(fin, tensor, dev_ctx);
-    fin.close();
-    VLOG(3) << " load var: " << inp_var_names[i] << " finished";
-  }
-}
-
-static void LoadStringArgv(const framework::Scope &scope,
-                           const platform::Place &place,
-                           const std::vector<std::string> &argv,
-                           const std::string &dir) {
-  for (size_t i = 0; i < argv.size(); i++) {
-    auto *var = scope.FindVar(argv[i]);
-    std::string *var_str = var->GetMutable<std::string>();
-    std::string var_file = GenePath(dir, argv[i]);
-    std::ifstream fin(var_file);
-    PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
-                   var_file);
-    std::getline(fin, *var_str);
-    fin.close();
-    VLOG(3) << " load String argv: " << argv[i] << " value is: " << var_str;
-  }
-}
-
-class CheckpointLoadOp : public framework::OperatorBase {
- public:
-  CheckpointLoadOp(const std::string &type,
-                   const framework::VariableNameMap &inputs,
-                   const framework::VariableNameMap &outputs,
-                   const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    std::string dir = Attr<std::string>("dir");
-    std::string serial_num_attr = Attr<std::string>("Serial");
-
-    VLOG(3) << "CheckpointLoadOp  get Attr  dir: " << dir;
-    VLOG(3) << "CheckpointLoadOp  get Attr  Serial: " << serial_num_attr;
-
-    std::string serial_var_name = std::string(SERIAL_VAR);
-    auto *serial_var = scope.FindVar(serial_var_name);
-    PADDLE_ENFORCE(serial_var != nullptr,
-                   "Cannot find variable %s for checkpoint_load_op",
-                   serial_var_name);
-
-    auto *serial_num = serial_var->GetMutable<std::string>();
-    serial_num->clear();
-    serial_num->append(serial_num_attr);
-
-    VLOG(1) << "CheckpointLoadOp set " << SERIAL_VAR
-            << " value: " << serial_num;
-
-    std::string success = GenePath(dir, serial_num->c_str());
-    VLOG(3) << "Load checkpoint from dir: " << success;
-    success = GenePath(success, SUCCESS);
-    bool is_present = FileExists(success);
-    if (!is_present) {
-      VLOG(1) << "CheckpointLoadOp can not find " << SUCCESS
-              << " from: " << success;
-      return;
-    }
-
-    VLOG(3) << "Ready to load vars to scope";
-    auto inp_var_names = Inputs("X");
-    PADDLE_ENFORCE_GT(static_cast<int>(inp_var_names.size()), 0,
-                      "The number of input variables should be greater than 0");
-    LoadInputVars(scope, place, inp_var_names, dir);
-
-    // VLOG(3) << "Ready to load string argv to scope";
-    // auto argv = Output("Argv");
-    // LoadStringArgv(scope, place, argv, dir);
-  }
-};
-
-class CheckpointLoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(
-        "X",
-        "(vector) Input LoDTensors that need to be saved together in a file.")
-        .AsDuplicable();
-    AddOutput(
-        "Argv",
-        "(vector) Input LoDTensors that need to be saved together in a file.");
-    AddComment(R"DOC(
-CheckpointLoad operator
-
-This operator will serialize and write a list of input LoDTensor variables
-to a file on disk.
-)DOC");
-
-    AddAttr<std::string>(
-        "Serial",
-        "(std::string)"
-        "The  serial number of the checkpoint will to be load.");
-    AddAttr<std::string>(
-        "dir",
-        "(string)"
-        "The \"dir\" where the checkpoint files will be loaded.")
-        .AddCustomChecker(
-            [](const std::string &path) { return !path.empty(); });
-  }
-};
-
-class CheckpointLoadOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    auto out_var_name = op_desc.Output("Argv").front();
-    auto &out_var = block->FindRecursiveOrCreateVar(out_var_name);
-    auto var_type = framework::proto::VarType::RAW;
-    out_var.SetType(var_type);
-  }
-};
-
-class CheckpointLoadOpShapeInference : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *ctx) const override {}
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(checkpoint_load, ops::CheckpointLoadOp,
-                  paddle::framework::EmptyGradOpMaker,
-                  ops::CheckpointLoadOpProtoMaker,
-                  ops::CheckpointLoadOpVarTypeInference,
-                  ops::CheckpointLoadOpShapeInference);
-
-// REGISTER_OPERATOR(checkpoint_load, ops::CheckpointLoadOp,
-//                  ops::CheckpointLoadOpProtoMaker);
diff --git a/paddle/fluid/operators/checkpoint_op_test.cc b/paddle/fluid/operators/checkpoint_op_test.cc
deleted file mode 100644
index 5312225e5f9523..00000000000000
--- a/paddle/fluid/operators/checkpoint_op_test.cc
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-USE_NO_KERNEL_OP(checkpoint_save)
-USE_NO_KERNEL_OP(checkpoint_load)
-
-TEST(CheckpointSaveOp, CPU) {
-  paddle::framework::Scope scope;
-  paddle::platform::CPUPlace place;
-
-  auto var = scope.Var("test_var");
-  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
-  tensor->Resize({3, 10});
-  paddle::framework::LoD expect_lod;
-  expect_lod.resize(1);
-  expect_lod[0].push_back(0);
-  expect_lod[0].push_back(1);
-  expect_lod[0].push_back(2);
-  expect_lod[0].push_back(3);
-
-  tensor->set_lod(expect_lod);
-  float* expect = tensor->mutable_data<float>(place);
-  for (int64_t i = 0; i < tensor->numel(); ++i) {
-    expect[i] = static_cast<float>(paddle::platform::float16(i));
-  }
-
-  scope.Var("SERIAL_NUMBER");
-
-  paddle::framework::AttributeMap attrs;
-  attrs.insert({"dir", std::string("ckpt")});
-
-  auto save_op = paddle::framework::OpRegistry::CreateOp(
-      "checkpoint_save", {{"X", {"test_var"}}}, {}, attrs);
-  save_op->Run(scope, place);
-}
-
-TEST(CheckpointLoadOp, CPU) {
-  paddle::framework::Scope scope;
-  paddle::platform::CPUPlace place;
-
-  auto var = scope.Var("test_var");
-  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
-  tensor->Resize({3, 10});
-  paddle::framework::LoD expect_lod;
-  expect_lod.resize(1);
-  expect_lod[0].push_back(0);
-  expect_lod[0].push_back(1);
-  expect_lod[0].push_back(2);
-  expect_lod[0].push_back(3);
-
-  tensor->set_lod(expect_lod);
-  float* expect = tensor->mutable_data<float>(place);
-  for (int64_t i = 0; i < tensor->numel(); ++i) {
-    expect[i] = static_cast<float>(paddle::platform::float16(i));
-  }
-
-  scope.Var("SERIAL_NUMBER");
-  auto* serial_num = scope.FindVar("SERIAL_NUMBER")->GetMutable<std::string>();
-  serial_num->append("0");
-
-  paddle::framework::AttributeMap attrs;
-  attrs.insert({"dir", std::string("ckpt")});
-  attrs.insert({"Serial", std::string("SERIAL_NUMBER")});
-
-  auto load_op = paddle::framework::OpRegistry::CreateOp(
-      "checkpoint_load", {{"X", {"test_var"}}}, {{"Argv", {}}}, attrs);
-  load_op->Run(scope, place);
-}
diff --git a/paddle/fluid/operators/checkpoint_save_op.cc b/paddle/fluid/operators/checkpoint_save_op.cc
deleted file mode 100644
index f904cdc8269e71..00000000000000
--- a/paddle/fluid/operators/checkpoint_save_op.cc
+++ /dev/null
@@ -1,203 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <stdint.h>
-#include <sys/stat.h>
-#include <fstream>
-#include <numeric>
-#include <sstream>
-#include <string>
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/data_type_transform.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-
-constexpr char kSEP = '/';
-// write empty file named _SUCCESS
-const char SUCCESS[] = "_SUCCESS";
-const char SERIAL_VAR[] = "SERIAL_NUMBER";
-
-static bool IsNumber(const std::string &s) {
-  std::string::const_iterator it = s.begin();
-  while (it != s.end() && std::isdigit(*it)) ++it;
-  return !s.empty() && it == s.end();
-}
-
-static std::string GenePath(const std::string &dir, const std::string &file) {
-  std::string file_path;
-  file_path.append(dir);
-  file_path.append("/");
-  file_path.append(file);
-  return file_path;
-}
-
-static bool FileExists(const std::string &filepath) {
-  struct stat buffer;
-  return (stat(filepath.c_str(), &buffer) == 0);
-}
-
-static std::string DirName(const std::string &filepath) {
-  auto pos = filepath.rfind(kSEP);
-  if (pos == std::string::npos) {
-    return "";
-  }
-  return filepath.substr(0, pos);
-}
-
-static void MkDir(const char *path) {
-  if (mkdir(path, 0755)) {
-    PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path);
-  }
-}
-
-static void MkDirRecursively(const char *fullpath) {
-  if (*fullpath == '\0') return;  // empty string
-  if (FileExists(fullpath)) return;
-
-  MkDirRecursively(DirName(fullpath).c_str());
-  MkDir(fullpath);
-}
-
-class CheckpointSaveOp : public framework::OperatorBase {
- public:
-  CheckpointSaveOp(const std::string &type,
-                   const framework::VariableNameMap &inputs,
-                   const framework::VariableNameMap &outputs,
-                   const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    auto ck_dir = Attr<std::string>("dir");
-    auto overwrite = Attr<bool>("overwrite");
-
-    std::string serial_var_name = std::string(SERIAL_VAR);
-    auto *serial_num =
-        scope.FindVar(serial_var_name)->GetMutable<std::string>();
-    VLOG(1) << "CheckpointSaveOp get " << SERIAL_VAR
-            << " value: " << serial_num;
-
-    int serials = 0;
-    if (!serial_num->empty()) {
-      serials = std::stoi(serial_num->data());
-      serials += 1;
-    }
-
-    serial_num->clear();
-    serial_num->append(std::to_string(serials));
-
-    std::string dir = GenePath(ck_dir, serial_num->c_str());
-    VLOG(1) << "CheckpointSaveOp current dir: " << dir;
-    bool is_present = FileExists(dir);
-    if (is_present && !overwrite) {
-      PADDLE_THROW("%s exists!, checkpoint save cannot to overwrite it", dir,
-                   overwrite);
-    }
-    MkDirRecursively(dir.c_str());
-
-    auto inp_var_names = Inputs("X");
-    PADDLE_ENFORCE_GT(static_cast<int>(inp_var_names.size()), 0,
-                      "The number of input variables should be greater than 0");
-
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-
-    // todo (tangwei) made it async
-    for (size_t i = 0; i < inp_var_names.size(); i++) {
-      auto *var = scope.FindVar(inp_var_names[i]);
-
-      PADDLE_ENFORCE(var != nullptr,
-                     "Cannot find variable %s for checkpoint save op",
-                     inp_var_names[i]);
-      PADDLE_ENFORCE(
-          var->IsType<framework::LoDTensor>(),
-          "CheckpointSaveOp only supports LoDTensor, %s has wrong type",
-          inp_var_names[i]);
-
-      auto &tensor = var->Get<framework::LoDTensor>();
-      // Serialize tensors one by one
-      std::string var_file = GenePath(dir, inp_var_names[i]);
-      std::ofstream fout(var_file);
-      framework::SerializeToStream(fout, tensor, dev_ctx);
-      fout.close();
-    }
-
-    std::string success = GenePath(dir, SUCCESS);
-    std::ofstream fout(success);
-    fout.close();
-  }
-};
-
-class CheckpointSaveOpProtoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(
-        "X",
-        "(vector) Input LoDTensors that need to be saved together in a file.")
-        .AsDuplicable();
-    AddComment(R"DOC(
-CheckpointSave operator
-
-This operator will serialize and write a list of input LoDTensor variables
-to a file on disk.
-)DOC");
-    AddAttr<bool>("overwrite",
-                  "(boolean, default false)"
-                  "Delete the output dir if it exists.")
-        .SetDefault(false);
-
-    AddAttr<std::string>("dir",
-                         "(string)"
-                         "The dir where the LoDTensor variables will be saved.")
-        .AddCustomChecker(
-            [](const std::string &path) { return !path.empty(); });
-  }
-};
-
-// class CheckpointSaveOpVarTypeInference : public framework::VarTypeInference {
-//  public:
-//   void operator()(const framework::OpDesc &op_desc,
-//                   framework::BlockDesc *block) const override {
-//     auto out_var_name = op_desc.Output("Serial").front();
-//     auto &out_var = block->FindRecursiveOrCreateVar(out_var_name);
-//     auto var_type = framework::proto::VarType::RAW;
-//     out_var.SetType(var_type);
-//   }
-// };
-
-// class CheckpointSaveOpShapeInference : public framework::InferShapeBase {
-//  public:
-//   void operator()(framework::InferShapeContext *ctx) const override {}
-// };
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(checkpoint_save, ops::CheckpointSaveOp,
-                  ops::CheckpointSaveOpProtoMaker);
-
-// REGISTER_OPERATOR(checkpoint_save, ops::CheckpointSaveOp,
-//                   paddle::framework::EmptyGradOpMaker,
-//                   ops::CheckpointSaveOpProtoMaker,
-//                   ops::CheckpointSaveOpVarTypeInference,
-//                   ops::CheckpointSaveOpShapeInference);
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index c5044a07c9421e..38c765938fe9d7 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -489,8 +489,7 @@ def find_name(var_list, name):
             'rnn_memory_helper_grad', 'conditional_block', 'while', 'send',
             'recv', 'listen_and_serv', 'parallel_do', 'save_combine',
             'load_combine', 'ncclInit', 'channel_create', 'channel_close',
-            'channel_send', 'channel_recv', 'select', 'gen_nccl_id',
-            'checkpoint_save', 'checkpoint_load'
+            'channel_send', 'channel_recv', 'select', 'gen_nccl_id'
         }
         if type not in no_kernel_op_set:
             self.desc.infer_var_type(self.block.desc)
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 83c32fe9d6ebdc..b1748f0ad0a39a 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -455,7 +455,7 @@ def get_parameter_value_by_name(name, executor, program=None):
 
 
 SUCCESS = "_SUCCESS"
-BEGIN_SECS = time.time()
+BEGIN_SECS = None
 
 
 def save_checkpoint(executor,
@@ -478,13 +478,21 @@ def save_checkpoint(executor,
         os.makedirs(dirname)
 
     global BEGIN_SECS
-    if time.time() - BEGIN_SECS < save_secs:
-        return
+    if BEGIN_SECS is not None:
+        if time.time() - BEGIN_SECS < save_secs:
+            return
     BEGIN_SECS = time.time()
 
     serial = _get_lastest_checkpoint_dir(dirname) + 1
     cur_dir = os.path.join(dirname, str(serial))
-    save_persistables(executor, cur_dir, main_program)
+    # save_persistables(executor, cur_dir, main_program)
+    save_vars(
+        executor,
+        dirname=cur_dir,
+        main_program=main_program,
+        vars=None,
+        predicate=is_checkpoint_var,
+        filename=None)
     _write_success(cur_dir)
     _lru_delete(dirname, keep_max)
 
@@ -505,7 +513,25 @@ def restore_checkpoint(dirname, executor, main_program=None):
     if serial < 0:
         return
     cur_dir = os.path.join(dirname, str(serial))
-    load_persistables(executor, cur_dir, main_program)
+    # load_persistables(executor, cur_dir, main_program)
+    load_vars(
+        executor,
+        dirname=cur_dir,
+        main_program=main_program,
+        predicate=is_checkpoint_var,
+        filename=None)
+
+
+def is_checkpoint_var(var):
+    if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
+            var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
+            var.desc.type() == core.VarDesc.VarType.RAW:
+        return False
+
+    if var.name.endswith("@GRAD"):
+        return False
+
+    return var.persistable
 
 
 def _lru_delete(dirname, keep_max=3):
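
Because save_vars and load_vars accept a predicate, the checkpoint path can simply filter with is_checkpoint_var: persistable variables that are not feed/fetch/RAW and not gradients. A small sketch of exercising the predicate directly (the fc network is only a placeholder):

    import paddle.fluid as fluid
    from paddle.fluid.io import is_checkpoint_var

    x = fluid.layers.data(name="x", shape=[13], dtype="float32")
    y = fluid.layers.fc(input=x, size=1)

    prog = fluid.default_main_program()
    ckpt_vars = [v.name for v in prog.list_vars() if is_checkpoint_var(v)]
    # ckpt_vars holds the persistable parameters (e.g. the fc weight and bias);
    # feed/fetch vars, RAW vars and names ending in "@GRAD" are skipped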

From 2412dee3769189e2e1f94cc0e2c298c4c1035699 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Mon, 21 May 2018 21:20:03 +0800
Subject: [PATCH 49/56] code optimized

---
 python/paddle/fluid/io.py | 55 +++++++++++++++++++++++----------------
 1 file changed, 33 insertions(+), 22 deletions(-)

diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index b1748f0ad0a39a..01debaff56a61e 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -454,17 +454,16 @@ def get_parameter_value_by_name(name, executor, program=None):
     return get_parameter_value(var, executor)
 
 
-SUCCESS = "_SUCCESS"
-BEGIN_SECS = None
+SUCCESS_MARK_FILENAME = "_SUCCESS"
 
 
 def save_checkpoint(executor,
-                    dirname,
-                    keep_max=3,
-                    save_secs=600,
+                    dirname=None,
+                    max_num_checkpoints=3,
+                    save_interval_secs=600,
                     main_program=None):
     """
-    Save Variables to Checkpoint Dir
+    Save Variables to Checkpoint Directory
 
     :param dirname
     :param keep_max
@@ -472,20 +471,19 @@ def save_checkpoint(executor,
     :param main_program
     """
     if dirname is None:
-        raise Exception("save checkpoint dir can not be none")
+        dirname = os.getcwd()
 
     if not os.path.isdir(dirname):
         os.makedirs(dirname)
 
-    global BEGIN_SECS
-    if BEGIN_SECS is not None:
-        if time.time() - BEGIN_SECS < save_secs:
-            return
-    BEGIN_SECS = time.time()
+    serial = _get_lastest_checkpoint_dir(dirname)
+    if serial >= 0 and not _interval_secs_exceed(
+            os.path.join(dirname, str(serial)), save_interval_secs):
+        return
 
-    serial = _get_lastest_checkpoint_dir(dirname) + 1
+    serial = serial + 1
     cur_dir = os.path.join(dirname, str(serial))
-    # save_persistables(executor, cur_dir, main_program)
+
     save_vars(
         executor,
         dirname=cur_dir,
@@ -494,10 +492,10 @@ def save_checkpoint(executor,
         predicate=is_checkpoint_var,
         filename=None)
     _write_success(cur_dir)
-    _lru_delete(dirname, keep_max)
+    _lru_delete(dirname, max_num_checkpoints)
 
 
-def restore_checkpoint(dirname, executor, main_program=None):
+def restore_checkpoint(executor, dirname=None, main_program=None):
     """
     Load Variables from Checkpoint Dir
 
@@ -505,15 +503,16 @@ def restore_checkpoint(dirname, executor, main_program=None):
     :param executor
     :param main_program
     """
-    if dirname is None and os.path.isdir(dirname):
-        raise Exception("restore checkpoint can not load variables from %s" %
-                        dirname)
+
+    if dirname is None:
+        dirname = os.getcwd()
+
     serial = _get_lastest_checkpoint_dir(dirname)
 
     if serial < 0:
         return
     cur_dir = os.path.join(dirname, str(serial))
-    # load_persistables(executor, cur_dir, main_program)
+
     load_vars(
         executor,
         dirname=cur_dir,
@@ -523,6 +522,10 @@ def restore_checkpoint(dirname, executor, main_program=None):
 
 
 def is_checkpoint_var(var):
+    """
+    VarType will fliter out FEED_MINIBATCH FETCH_LIST RAW
+    VarName will fliter out Gradient
+    """
     if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
             var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
             var.desc.type() == core.VarDesc.VarType.RAW:
@@ -534,6 +537,13 @@ def is_checkpoint_var(var):
     return var.persistable
 
 
+def _interval_secs_exceed(dirname, save_interval_secs):
+    dir_time = os.path.getmtime(dirname)
+    if save_interval_secs > (time.time() - dir_time):
+        return False
+    return True
+
+
 def _lru_delete(dirname, keep_max=3):
     """
     retain checkpoint nums with keep_max
@@ -560,7 +570,7 @@ def _write_success(dirname):
     """
     write _SUCCESS to checkpoint dir
     """
-    success_file = os.path.join(dirname, SUCCESS)
+    success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME)
     with open(success_file, 'a'):
         pass
 
@@ -584,7 +594,8 @@ def has_success(checkpoint_dir, cur_dir):
         except ValueError:
             return -1
 
-        success_path = os.path.join(checkpoint_dir, cur_dir, SUCCESS)
+        success_path = os.path.join(checkpoint_dir, cur_dir,
+                                    SUCCESS_MARK_FILENAME)
         if os.path.isfile(success_path):
             return int(cur_dir)
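This patch replaces the in-process timer with a check against the modification time of the newest checkpoint directory. A rough standalone sketch of that throttle (the demo directory path is an assumption for illustration):

import os
import time

def interval_exceeded(path, save_interval_secs):
    """True when `path` was last modified at least `save_interval_secs`
    seconds ago; mirrors the idea behind _interval_secs_exceed."""
    return (time.time() - os.path.getmtime(path)) >= save_interval_secs

if __name__ == "__main__":
    demo_dir = "/tmp/ckpt_demo_serial"  # placeholder path
    if not os.path.isdir(demo_dir):
        os.makedirs(demo_dir)
    # Freshly created, so this prints False until 600 seconds have passed.
    print(interval_exceeded(demo_dir, save_interval_secs=600))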
 

From e901de66814041adfec471673ac970de2ffe7bbc Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Tue, 22 May 2018 10:07:15 +0800
Subject: [PATCH 50/56] update var name

---
 python/paddle/fluid/io.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 01debaff56a61e..ac26991d41dd66 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -23,7 +23,7 @@
 __all__ = [
     'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params',
     'load_persistables', 'save_inference_model', 'load_inference_model',
-    'get_inference_program', 'save_checkpoint', 'restore_checkpoint'
+    'get_inference_program', 'save_checkpoint', 'load_checkpoint'
 ]
 
 
@@ -466,7 +466,7 @@ def save_checkpoint(executor,
     Save Variables to Checkpoint Directory
 
     :param dirname
-    :param keep_max
+    :param max_num_checkpoints
     :param save_secs
     :param main_program
     """
@@ -495,7 +495,7 @@ def save_checkpoint(executor,
     _lru_delete(dirname, max_num_checkpoints)
 
 
-def restore_checkpoint(executor, dirname=None, main_program=None):
+def load_checkpoint(executor, dirname=None, main_program=None):
     """
     Load Variables from Checkpint Dir
 
@@ -544,9 +544,9 @@ def _interval_secs_exceed(dirname, save_interval_secs):
     return True
 
 
-def _lru_delete(dirname, keep_max=3):
+def _lru_delete(dirname, max_num_checkpoints=3):
     """
-    retain checkpoint nums with keep_max
+    retain checkpoint nums with max_num_checkpoints
     """
     dirs = os.listdir(dirname)
     serials = []
@@ -556,11 +556,11 @@ def _lru_delete(dirname, keep_max=3):
         except ValueError:
             continue
 
-    if len(serials) <= keep_max:
+    if len(serials) <= max_num_checkpoints:
         return
 
     serials.sort(reverse=True)
-    serials = serials[keep_max:]
+    serials = serials[max_num_checkpoints:]
     for serial in serials:
         cur_dir = os.path.join(dirname, str(serial))
         shutil.rmtree(cur_dir)
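To make the renamed _lru_delete easier to follow, here is a standalone sketch of the same pruning rule: keep the newest max_num_checkpoints integer-named subdirectories and remove the rest. This illustrates the strategy only and is not the Paddle implementation itself.

import os
import shutil

def prune_checkpoints(root, max_num_checkpoints=3):
    """Delete all but the newest `max_num_checkpoints` serial directories."""
    serials = []
    for name in os.listdir(root):
        try:
            serials.append(int(name))
        except ValueError:
            continue  # skip entries that are not checkpoint serials
    if len(serials) <= max_num_checkpoints:
        return
    serials.sort(reverse=True)          # newest (largest serial) first
    for serial in serials[max_num_checkpoints:]:
        shutil.rmtree(os.path.join(root, str(serial)))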

From 27b717516f466ca1068af5a211fdda9d35f5334d Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Tue, 22 May 2018 18:02:31 +0800
Subject: [PATCH 51/56] update python annotation

---
 python/paddle/fluid/io.py | 35 +++++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index ac26991d41dd66..3a7b68a682d04e 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -463,8 +463,11 @@ def save_checkpoint(executor,
                     save_interval_secs=600,
                     main_program=None):
     """
-    Save Variables to Checkpoint Directory
-
+    Save Checkpoint will save persistable LoDTensor variables from main_program in the checkpoint directory,
+    directory named by serial number from 0 to (n -1), save_checkpoint use LRU strategy
+    to limit the number of checkpoint directories to at most max_num_checkpoints,
+    The interval time between two save_checkpoint must great than or equal to save_interval_secs.
+    
     :param dirname
     :param max_num_checkpoints
     :param save_secs
@@ -489,7 +492,7 @@ def save_checkpoint(executor,
         dirname=cur_dir,
         main_program=main_program,
         vars=None,
-        predicate=is_checkpoint_var,
+        predicate=_is_checkpoint_var,
         filename=None)
     _write_success(cur_dir)
     _lru_delete(dirname, max_num_checkpoints)
@@ -497,10 +500,11 @@ def save_checkpoint(executor,
 
 def load_checkpoint(executor, dirname=None, main_program=None):
     """
-    Load Variables from Checkpint Dir
+    Load checkpoint from directory by executor,
+    it will find lastest checkpoint file and load it auto.
 
-    :param dirname
     :param executor
+    :param dirname
     :param main_program
     """
 
@@ -517,14 +521,16 @@ def load_checkpoint(executor, dirname=None, main_program=None):
         executor,
         dirname=cur_dir,
         main_program=main_program,
-        predicate=is_checkpoint_var,
+        predicate=_is_checkpoint_var,
         filename=None)
 
 
-def is_checkpoint_var(var):
+def _is_checkpoint_var(var):
     """
-    VarType will fliter out FEED_MINIBATCH FETCH_LIST RAW
-    VarName will fliter out Gradient
+    checkpoint will not save or load all the variables.
+    var type is FEED_MINIBATCH/FETCH_LIST/RAW and var name is end with @GRAD are discarded.
+    
+    :param var
     """
     if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
             var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
@@ -545,9 +551,6 @@ def _interval_secs_exceed(dirname, save_interval_secs):
 
 
 def _lru_delete(dirname, max_num_checkpoints=3):
-    """
-    retain checkpoint nums with max_num_checkpoints
-    """
     dirs = os.listdir(dirname)
     serials = []
     for serial in dirs:
@@ -568,7 +571,9 @@ def _lru_delete(dirname, max_num_checkpoints=3):
 
 def _write_success(dirname):
     """
-    write _SUCCESS to checkpoint dir
+    write an empty _SUCCESS file to checkpoint dir, indicate this checkpoint is correct.
+
+    :param dirname
     """
     success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME)
     with open(success_file, 'a'):
@@ -577,7 +582,9 @@ def _write_success(dirname):
 
 def _get_lastest_checkpoint_dir(checkpoint_dir):
     """
-    get the biggest number in checkpoint_dir, which has _SUCCESS
+    get the latest checkpoint in the checkpoint directory; the _SUCCESS file must exist in that directory
+
+    :param checkpoint_dir
     """
     if not checkpoint_dir.strip():
         return -1
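Taken together, the annotated API is intended to be called around a training loop. A hedged usage sketch based on the signatures in this patch (the tiny network, the /tmp path, and the absence of a real data reader are illustrative simplifications, not part of the change):

import paddle.fluid as fluid

place = fluid.CPUPlace()
exe = fluid.Executor(place)

main_prog = fluid.default_main_program()
startup_prog = fluid.default_startup_program()

# A trivial network so there is at least one persistable parameter to save.
with fluid.program_guard(main_prog, startup_prog):
    x = fluid.layers.data(name="x", shape=[1], dtype="float32")
    y = fluid.layers.fc(input=x, size=1)

exe.run(startup_prog)

# Resume from the most recent successful checkpoint, if one exists.
fluid.io.load_checkpoint(exe, dirname="/tmp/ckpt_demo", main_program=main_prog)

# ... training iterations would run here ...

# Throttled internally by save_interval_secs, so it is cheap to call often.
fluid.io.save_checkpoint(
    exe,
    dirname="/tmp/ckpt_demo",
    max_num_checkpoints=3,
    save_interval_secs=600,
    main_program=main_prog)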

From 9d985340e5a1ebf7df7a1a8f9d324c08d4d07a97 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Wed, 23 May 2018 10:24:22 +0800
Subject: [PATCH 52/56] update annotation grammar

---
 python/paddle/fluid/io.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 3a7b68a682d04e..845e8c9ca2765e 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -464,10 +464,10 @@ def save_checkpoint(executor,
                     main_program=None):
     """
     Save Checkpoint will save persistable LoDTensor variables from main_program in the checkpoint directory,
-    directory named by serial number from 0 to (n -1), save_checkpoint use LRU strategy
+    the directories are named by serial number from 0 to (n - 1); save_checkpoint uses an LRU strategy
     to limit the number of checkpoint directories to at most max_num_checkpoints,
     The interval time between two save_checkpoint must great than or equal to save_interval_secs.
-    
+
     :param dirname
     :param max_num_checkpoints
     :param save_secs
@@ -500,8 +500,8 @@ def save_checkpoint(executor,
 
 def load_checkpoint(executor, dirname=None, main_program=None):
     """
-    Load checkpoint from directory by executor,
-    it will find lastest checkpoint file and load it auto.
+    Load checkpoint from a directory by executor,
+    it will find latest checkpoint file and load it auto.
 
     :param executor
     :param dirname
@@ -527,9 +527,9 @@ def load_checkpoint(executor, dirname=None, main_program=None):
 
 def _is_checkpoint_var(var):
     """
-    checkpoint will not save or load all the variables.
-    var type is FEED_MINIBATCH/FETCH_LIST/RAW and var name is end with @GRAD are discarded.
-    
+    the checkpoint will not save or load all the variables.
+    variables whose type is FEED_MINIBATCH/FETCH_LIST/RAW, or whose name ends with @GRAD, are discarded.
+
     :param var
     """
     if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
@@ -571,7 +571,7 @@ def _lru_delete(dirname, max_num_checkpoints=3):
 
 def _write_success(dirname):
     """
-    write an empty _SUCCESS file to checkpoint dir, indicate this checkpoint is correct.
+    write an empty file named "_SUCCESS" in the checkpoint dir, indicating this checkpoint is correct.
 
     :param dirname
     """

From d96b4427a25a7839d11aa9c94224570c35e51d76 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Wed, 23 May 2018 11:05:09 +0800
Subject: [PATCH 53/56] rename checkpoint folder to checkpoint_serial

---
 python/paddle/fluid/io.py | 66 +++++++++++++++++++++++----------------
 1 file changed, 39 insertions(+), 27 deletions(-)

diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 845e8c9ca2765e..239736aad08540 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -455,10 +455,12 @@ def get_parameter_value_by_name(name, executor, program=None):
 
 
 SUCCESS_MARK_FILENAME = "_SUCCESS"
+CHECKPOINT_PREFIX = "checkpoint"
+CHECKPOINT_SEPARATOR = "_"
 
 
 def save_checkpoint(executor,
-                    dirname=None,
+                    checkpoint_dir=None,
                     max_num_checkpoints=3,
                     save_interval_secs=600,
                     main_program=None):
@@ -466,26 +468,27 @@ def save_checkpoint(executor,
     Save Checkpoint will save persistable LoDTensor variables from main_program in the checkpoint directory,
     the directories are named by serial number from 0 to (n - 1); save_checkpoint uses an LRU strategy
     to limit the number of checkpoint directories to at most max_num_checkpoints,
-    The interval time between two save_checkpoint must great than or equal to save_interval_secs.
+    The interval between two saved checkpoints must be greater than save_interval_secs.
 
-    :param dirname
+    :param executor
+    :param checkpoint_dir
     :param max_num_checkpoints
-    :param save_secs
+    :param save_interval_secs
     :param main_program
     """
-    if dirname is None:
-        dirname = os.getcwd()
+    if checkpoint_dir is None:
+        checkpoint_dir = os.getcwd()
 
-    if not os.path.isdir(dirname):
-        os.makedirs(dirname)
+    if not os.path.isdir(checkpoint_dir):
+        os.makedirs(checkpoint_dir)
 
-    serial = _get_lastest_checkpoint_dir(dirname)
+    serial = _get_lastest_checkpoint_dir(checkpoint_dir)
     if serial >= 0 and not _interval_secs_exceed(
-            os.path.join(dirname, str(serial)), save_interval_secs):
+            _get_serial_dir(serial, checkpoint_dir), save_interval_secs):
         return
 
-    serial = serial + 1
-    cur_dir = os.path.join(dirname, str(serial))
+    serial += 1
+    cur_dir = _get_serial_dir(serial, checkpoint_dir)
 
     save_vars(
         executor,
@@ -495,27 +498,28 @@ def save_checkpoint(executor,
         predicate=_is_checkpoint_var,
         filename=None)
     _write_success(cur_dir)
-    _lru_delete(dirname, max_num_checkpoints)
+    _lru_delete(checkpoint_dir, max_num_checkpoints)
 
 
-def load_checkpoint(executor, dirname=None, main_program=None):
+def load_checkpoint(executor, checkpoint_dir=None, main_program=None):
     """
     Load checkpoint from a directory by executor,
-    it will find latest checkpoint file and load it auto.
+    it will find the most recently saved checkpoint and load it automatically.
 
     :param executor
-    :param dirname
+    :param checkpoint_dir
     :param main_program
     """
 
-    if dirname is None:
-        dirname = os.getcwd()
+    if checkpoint_dir is None:
+        checkpoint_dir = os.getcwd()
 
-    serial = _get_lastest_checkpoint_dir(dirname)
+    serial = _get_lastest_checkpoint_dir(checkpoint_dir)
 
     if serial < 0:
         return
-    cur_dir = os.path.join(dirname, str(serial))
+
+    cur_dir = _get_serial_dir(serial, checkpoint_dir)
 
     load_vars(
         executor,
@@ -525,6 +529,11 @@ def load_checkpoint(executor, dirname=None, main_program=None):
         filename=None)
 
 
+def _get_serial_dir(serial, checkpoint_dir):
+    serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial)
+    return os.path.join(checkpoint_dir, serial_folder)
+
+
 def _is_checkpoint_var(var):
     """
     the checkpoint will not save or load all the variables.
@@ -577,7 +586,8 @@ def _write_success(dirname):
     """
     success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME)
     with open(success_file, 'a'):
-        pass
+        now = time.ctime()
+        success_file.write(now)
 
 
 def _get_lastest_checkpoint_dir(checkpoint_dir):
@@ -593,18 +603,20 @@ def has_success(checkpoint_dir, cur_dir):
         """
         is _SUCCESS in this dir
         """
-        if not os.path.isdir(os.path.join(checkpoint_dir, cur_dir)):
-            return -1
+        _, serial = cur_dir.split(CHECKPOINT_SEPARATOR)
 
         try:
-            int(cur_dir)
+            int(serial)
         except ValueError:
             return -1
 
-        success_path = os.path.join(checkpoint_dir, cur_dir,
-                                    SUCCESS_MARK_FILENAME)
+        if not os.path.isdir(os.path.join(checkpoint_dir, cur_dir)):
+            return -1
+
+        success_path = os.path.join(
+            _get_serial_dir(serial, checkpoint_dir), SUCCESS_MARK_FILENAME)
         if os.path.isfile(success_path):
-            return int(cur_dir)
+            return int(serial)
 
     if not os.path.isdir(checkpoint_dir):
         return -1
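The switch to "checkpoint_<serial>" folders can be summarised with a small standalone sketch. The parsing helper below is illustrative; note that the patch's own has_success unpacks cur_dir.split(CHECKPOINT_SEPARATOR) outside the try block, so a folder name without exactly one separator would raise a ValueError rather than being skipped.

import os

CHECKPOINT_PREFIX = "checkpoint"
CHECKPOINT_SEPARATOR = "_"

def serial_dir(serial, checkpoint_dir):
    """Build <checkpoint_dir>/checkpoint_<serial>, as _get_serial_dir does."""
    folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial)
    return os.path.join(checkpoint_dir, folder)

def parse_serial(folder_name):
    """Recover the integer serial from a 'checkpoint_<n>' folder name,
    returning -1 for anything that does not match the scheme."""
    parts = folder_name.split(CHECKPOINT_SEPARATOR)
    if len(parts) != 2 or parts[0] != CHECKPOINT_PREFIX:
        return -1
    try:
        return int(parts[1])
    except ValueError:
        return -1

if __name__ == "__main__":
    print(serial_dir(3, "/tmp/ckpt_demo"))   # /tmp/ckpt_demo/checkpoint_3
    print(parse_serial("checkpoint_3"))      # 3
    print(parse_serial("not-a-checkpoint"))  # -1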

From 192f9a5a70a12bf57ec487d791f535e515524bd0 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Wed, 23 May 2018 11:37:24 +0800
Subject: [PATCH 54/56] bug fix

---
 python/paddle/fluid/io.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 239736aad08540..c638da67c825d4 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -585,9 +585,9 @@ def _write_success(dirname):
     :param dirname
     """
     success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME)
-    with open(success_file, 'a'):
+    with open(success_file, 'a') as f:
         now = time.ctime()
-        success_file.write(now)
+        f.write(now)
 
 
 def _get_lastest_checkpoint_dir(checkpoint_dir):
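For clarity, the fixed helper in isolation: the previous revision called .write() on the path string, which would raise an AttributeError, while this version writes the timestamp through the open file handle. A standalone sketch of the same behaviour:

import os
import time

SUCCESS_MARK_FILENAME = "_SUCCESS"

def write_success(dirname):
    """Append the current timestamp to <dirname>/_SUCCESS, marking the
    checkpoint directory as complete (mirrors the fixed _write_success)."""
    success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME)
    with open(success_file, 'a') as f:
        f.write(time.ctime())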

From cf3fb2488c667b1cfbf7bc4a5c7441bdf837b6e7 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Wed, 23 May 2018 17:26:21 +0800
Subject: [PATCH 55/56] add clean checkpoint

---
 python/paddle/fluid/io.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index c638da67c825d4..9e0bc425f0e34d 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -529,6 +529,19 @@ def load_checkpoint(executor, checkpoint_dir=None, main_program=None):
         filename=None)
 
 
+def clean_checkpoint(checkpoint_dir, delete_dir=False):
+    """
+    clean the checkpoint dir; when training exits normally, the trainer calls clean_checkpoint to delete the checkpoint directories saved earlier.
+    delete_dir only works when the directory is empty; otherwise, an OSError is raised.
+    """
+    if checkpoint_dir is None:
+        checkpoint_dir = os.getcwd()
+    _lru_delete(checkpoint_dir, max_num_checkpoints=0)
+
+    if delete_dir and not os.listdir(checkpoint_dir):
+        os.rmdir(checkpoint_dir)
+
+
 def _get_serial_dir(serial, checkpoint_dir):
     serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial)
     return os.path.join(checkpoint_dir, serial_folder)
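A short usage illustration for the new helper (the path is a placeholder): after a trainer finishes normally it can discard every saved checkpoint and, because delete_dir only removes an empty directory, optionally drop the directory itself.

import paddle.fluid as fluid

ckpt_dir = "/tmp/ckpt_demo"  # placeholder checkpoint directory

# Training finished normally; intermediate checkpoints are no longer needed.
fluid.io.clean_checkpoint(ckpt_dir, delete_dir=True)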

From 2c47e067ae8485c6ad1ae0be870b792775e4e276 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Wed, 23 May 2018 18:03:20 +0800
Subject: [PATCH 56/56] add clean checkpoint

---
 python/paddle/fluid/io.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 9e0bc425f0e34d..8e58e5eb794e1b 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -23,7 +23,8 @@
 __all__ = [
     'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params',
     'load_persistables', 'save_inference_model', 'load_inference_model',
-    'get_inference_program', 'save_checkpoint', 'load_checkpoint'
+    'get_inference_program', 'save_checkpoint', 'load_checkpoint',
+    'clean_checkpoint'
 ]
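With clean_checkpoint exported alongside save_checkpoint and load_checkpoint, all three checkpoint entry points are part of the module's public surface and can be imported directly, for example:

from paddle.fluid.io import save_checkpoint, load_checkpoint, clean_checkpoint

# Simple sanity check that the public functions are available.
print(callable(save_checkpoint), callable(load_checkpoint), callable(clean_checkpoint))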