Skip to content

Commit

Permalink
Merge From Develop
Browse files Browse the repository at this point in the history
  • Loading branch information
YuanRisheng committed Nov 3, 2021
2 parents 2ca5414 + 3121f88 commit a88ca81
Show file tree
Hide file tree
Showing 102 changed files with 2,327 additions and 463 deletions.
3 changes: 2 additions & 1 deletion cmake/external/xpu.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ ELSE ()
ENDIF()

SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211020")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211029")
#SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211020")
SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
Expand Down
2 changes: 2 additions & 0 deletions paddle/fluid/distributed/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# fleet_executor must be built even when the parameter-server core is
# disabled: descend into it first, then bail out of the rest of the
# distributed build.
if(NOT WITH_PSCORE)
# Built unconditionally (also added below for the WITH_PSCORE path).
add_subdirectory(fleet_executor)
return()
endif()

Expand All @@ -16,6 +17,7 @@ add_subdirectory(service)
add_subdirectory(table)
add_subdirectory(test)
add_subdirectory(index_dataset)
add_subdirectory(fleet_executor)

get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)

Expand Down
6 changes: 6 additions & 0 deletions paddle/fluid/distributed/fleet_executor/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Generate C++ sources from the executor-descriptor proto, then build the
# fleet_executor library on top of the generated code.
proto_library(fleet_executor_desc_proto SRCS fleet_executor_desc.proto)
cc_library(fleet_executor SRCS fleet_executor.cc DEPS fleet_executor_desc_proto)

# The Python side needs the matching generated protobuf module.
if(WITH_PYTHON)
py_proto_compile(fleet_executor_desc_py_proto SRCS fleet_executor_desc.proto)
endif()
43 changes: 43 additions & 0 deletions paddle/fluid/distributed/fleet_executor/fleet_executor.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h"
#include "paddle/fluid/distributed/fleet_executor/runtime_graph.h"
#include "paddle/fluid/framework/program_desc.h"

namespace paddle {
namespace distributed {

// Skeleton implementation: every method below is an intentional stub so the
// library links while the executor is being developed.

// NOTE(review): `exe_desc_str` is accepted but never read here — presumably a
// serialized FleetExecutorDesc meant to be parsed into `exe_desc_`; confirm
// and implement when the executor is fleshed out.
FleetExecutor::FleetExecutor(const std::string& exe_desc_str) {
  // Initialize Executor
}

// Out-of-line so the unique_ptr<RuntimeGraph> member can be destroyed with
// the complete RuntimeGraph type visible (runtime_graph.h is included above).
FleetExecutor::~FleetExecutor() {
  // Destroy Executor
}

// Intended to compile `program_desc` into the runtime graph; not implemented.
void FleetExecutor::Init(const paddle::framework::ProgramDesc& program_desc) {
  // Compile and Initialize
}

// Intended to launch execution; not implemented.
void FleetExecutor::Run() {
  // Run
}

// Intended to release executor resources; not implemented.
void FleetExecutor::Release() {
  // Release
}

}  // namespace distributed
}  // namespace paddle
44 changes: 44 additions & 0 deletions paddle/fluid/distributed/fleet_executor/fleet_executor.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include <memory>
#include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h"
#include "paddle/fluid/platform/macros.h"

namespace paddle {
namespace framework {
// Forward declaration only — avoids pulling the full program_desc.h header
// into every consumer of this file.
class ProgramDesc;
}

namespace distributed {
// Forward declared; owned below via unique_ptr, so the complete type is only
// needed where the destructor is defined (fleet_executor.cc).
class RuntimeGraph;

// Entry point for distributed ("fleet") execution of a program. Constructed
// from a serialized FleetExecutorDesc string, then driven through
// Init() -> Run() -> Release(). Non-copyable; `final` — not a base class.
class FleetExecutor final {
public:
FleetExecutor() = delete;
// NOTE(review): single-argument converting constructor — consider marking
// it `explicit` so a bare std::string cannot silently become an executor.
FleetExecutor(const std::string& exe_desc_str);
~FleetExecutor();
// Compile `program_desc` and prepare the executor for Run().
void Init(const paddle::framework::ProgramDesc& program_desc);
// Execute the compiled program.
void Run();
// Release resources held by the executor.
void Release();

private:
DISABLE_COPY_AND_ASSIGN(FleetExecutor);
// Parsed executor configuration (see fleet_executor_desc.proto).
FleetExecutorDesc exe_desc_;
// Execution graph built from the ProgramDesc during Init().
std::unique_ptr<RuntimeGraph> runtime_graph_;
};

}  // namespace distributed
}  // namespace paddle
21 changes: 21 additions & 0 deletions paddle/fluid/distributed/fleet_executor/fleet_executor_desc.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto2";
package paddle.distributed;

// Configuration for the fleet executor, serialized and passed to the
// FleetExecutor constructor.
message FleetExecutorDesc {
// Execution granularity; only the default "coarse" is used so far.
optional string grain = 1 [ default = "coarse" ];
repeated string addrs = 2; // "ip:port" of all ranks
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,20 +14,21 @@

#pragma once

#include "paddle/pten/common/data_type.h"
#include "paddle/pten/common/scalar.h"
#include "paddle/pten/hapi/include/tensor.h"

namespace paddle {
namespace experimental {
namespace framework {
class ProgramDesc;
}

Tensor full_like(const Tensor& x,
const Scalar& value,
DataType dtype = DataType::UNDEFINED);
namespace distributed {

Tensor ones_like(const Tensor& x, DataType dtype = DataType::UNDEFINED);
class RuntimeGraph final {
public:
RuntimeGraph() = default;
explicit RuntimeGraph(const paddle::framework::ProgramDesc &program) {}
~RuntimeGraph() = default;

Tensor zeros_like(const Tensor& x, DataType dtype = DataType::UNDEFINED);
DISABLE_COPY_AND_ASSIGN(RuntimeGraph);
};

} // namespace experimental
} // namespace distributed
} // namespace paddle
4 changes: 3 additions & 1 deletion paddle/fluid/distributed/table/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@ set_source_files_properties(table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPI

set_source_files_properties(sparse_sgd_rule.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(ctr_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(memory_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_library(sparse_sgd_rule SRCS sparse_sgd_rule.cc DEPS ${TABLE_DEPS} ps_framework_proto)
cc_library(ctr_accessor SRCS ctr_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule)
cc_library(memory_sparse_table SRCS memory_sparse_table.cc DEPS ps_framework_proto ${TABLE_DEPS} fs afs_wrapper ctr_accessor common_table)

cc_library(table SRCS table.cc DEPS common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost ctr_accessor)
cc_library(table SRCS table.cc DEPS memory_sparse_table common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost)
62 changes: 0 additions & 62 deletions paddle/fluid/distributed/table/depends/dense.h
Original file line number Diff line number Diff line change
Expand Up @@ -221,15 +221,6 @@ class DAdamD2Sum : public DenseOptimizer {
void update(const float* update_values, size_t num, int begin,
int end) override {
auto update_numel = end - begin;

/*
// for debug
std::cout << "before update:\n";
for (int i = 0; i < 3; ++ i) {
std::cout << "param: " << i << " " << *(param+begin+i) <<
"grad: " << *(update_values+begin+i) << "\n";
}*/

std::vector<float> grad, grad2, scale;
grad.resize(update_numel);
grad2.resize(update_numel);
Expand All @@ -240,88 +231,35 @@ class DAdamD2Sum : public DenseOptimizer {
blas.VCOPY(update_numel, update_values + begin, grad.data());
blas.VCOPY(update_numel, update_values + begin, grad2.data());

/*
for (int i = 0; i < end-begin; ++ i) {
std::cout << "copy grad: " << i << " " << *(grad.data()+begin+i) <<
"copy grad2: " << *(grad2.data()+begin+i) << "\n";
}
for (int i = 0; i < 3; ++ i) {
std::cout << "d2sum before: " << i << " " << *(ada_d2sum+begin+i) << "\n";
}*/

// d2sum
blas.SCAL(update_numel, ada_decay_rate[0], ada_d2sum + begin);
ADD<float>(update_numel, ada_d2sum + begin, 1, ada_d2sum + begin);

/*
for (int i = 0; i < end-begin; ++ i) {
std::cout << "d2sum update: " << i << " " << *(ada_d2sum+begin+i) << "\n";
}
for (int i = 0; i < 3; ++ i) {
std::cout << "g2sum before: " << i << " " << *(ada_g2sum+begin+i) << "\n";
}*/

// g2sum
blas.SCAL(update_numel, ada_decay_rate[0], ada_g2sum + begin);
blas.VSQUARE(update_numel, grad2.data(), grad2.data());
blas.VADD(update_numel, ada_g2sum + begin, grad2.data(), ada_g2sum + begin);

/*
for (int i = 0; i < end-begin; ++ i) {
std::cout << "g2sum update: " << i << " " << *(ada_g2sum+begin+i) << "\n";
}
for (int i = 0; i < 3; ++ i) {
std::cout << "mom before: " << i << " " << *(mom_velocity+begin+i) <<
"\n";
}*/

// mom
blas.SCAL(update_numel, mom_decay_rate[0], mom_velocity + begin);
blas.SCAL(update_numel, 1 - mom_decay_rate[0], grad.data());
blas.VADD(update_numel, mom_velocity + begin, grad.data(),
mom_velocity + begin);

/*
for (int i = 0; i < end-begin; ++ i) {
std::cout << "mom update: " << i << " " << *(mom_velocity+begin+i) <<
"\n";
}
for (int i = 0; i < 3; ++ i) {
std::cout << "scale before: " << i << " " << *(scale.data()+begin+i) <<
"\n";
}*/

// scale
float* scale_ = scale.data();
blas.VDIV(update_numel, ada_g2sum + begin, ada_d2sum + begin, scale_);
ADD<float>(update_numel, scale_, ada_epsilon[0], scale_);
DIV<float>(update_numel, 1 + ada_epsilon[0], scale_, scale_);
SQRT<float>(update_numel, scale_, scale_);

/*
for (int i = 0; i < 3; ++ i) {
std::cout << "scale update: " << i << " " << *(scale.data()+begin+i) <<
"\n";
}*/

blas.SCAL(update_numel, learning_rate[0], scale_);

// TODO(zhaocaibei123): check if there exists elementwise_multiply in blas
// TODO(zhaocaibei123): blas.VMUL
ELE_MUL<float>(update_numel, scale_, mom_velocity + begin, scale_);

/*
for (int i = 0; i < 3; ++ i) {
std::cout << "scale update2: " << i << " " << *(scale.data()+begin+i) <<
"\n";
}*/

blas.VSUB(update_numel, param + begin, scale_, param + begin);

/*
for (int i = 0; i < end-begin; ++ i) {
std::cout << "param update " << i << " " << *(param+begin+i) << "\n";
}*/
}

float* learning_rate;
Expand Down
Loading

1 comment on commit a88ca81

@paddle-bot-old
Copy link

@paddle-bot-old paddle-bot-old bot commented on a88ca81 Nov 3, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🕵️ CI failures summary

🔍 PR: #36957 Commit ID: a88ca81 contains failed CI.

🔹 Failed: PR-CI-musl

Unknown Failed
2021-11-03 11:30:29 [ 44%] Linking CXX static library libconcat_op.a
2021-11-03 11:30:30 [ 44%] Built target concat_op
2021-11-03 11:30:33 [ 44%] Linking CXX static library libflip_op.a
2021-11-03 11:30:33 [ 44%] Built target flip_op
2021-11-03 11:30:34 [ 44%] Linking CXX static library libdetection_map_op.a
2021-11-03 11:30:34 [ 44%] Built target detection_map_op
2021-11-03 11:30:36 [ 44%] Linking CXX static library libhierarchical_sigmoid_op.a
2021-11-03 11:30:36 [ 44%] Built target hierarchical_sigmoid_op
2021-11-03 11:30:37 [ 44%] Linking CXX static library libframe_op.a
2021-11-03 11:30:37 [ 44%] Built target frame_op
2021-11-03 11:30:38 [ 44%] Linking CXX static library libarg_max_op.a
2021-11-03 11:30:38 [ 44%] Built target arg_max_op
2021-11-03 11:30:38 [ 44%] Linking CXX static library libbroadcast_tensors_op.a
2021-11-03 11:30:38 [ 44%] Built target broadcast_tensors_op
2021-11-03 11:31:08 [ 44%] Linking CXX static library libset_value_op.a
2021-11-03 11:31:08 [ 44%] Built target set_value_op
2021-11-03 11:32:13 [ 44%] Linking CXX static library libactivation_op.a
2021-11-03 11:32:13 [ 44%] Built target activation_op
2021-11-03 11:32:13 make: *** [Makefile:130: all] Error 2

🔹 Failed: PR-CI-NPU

Unknown Failed
2021-11-03 11:38:34 + set +x
2021-11-03 11:38:34 + SOURCE=/paddle/build/coverage-diff
2021-11-03 11:38:34 + [[ -d /paddle/build/coverage-diff ]]
2021-11-03 11:38:34 + [[ -f /paddle/build/coverage-diff ]]
2021-11-03 11:38:34 + echo 'No such file or directory: /paddle/build/coverage-diff'
2021-11-03 11:38:34 No such file or directory: /paddle/build/coverage-diff
2021-11-03 11:38:34 + exit 0
2021-11-03 11:38:34 report uploaded
2021-11-03 11:38:34 ===================================================================
2021-11-03 11:38:34 c++-coverage
2021-11-03 11:38:34 https://xly.bce.baidu.com/ipipe/ipipe-report/report/8880596/c++-coverage/
2021-11-03 11:38:34 ===================================================================
2021-11-03 11:38:34 + [[ 7 -eq 0 ]]
2021-11-03 11:38:34 + [[ 7 -eq 4 ]]
2021-11-03 11:38:34 + [[ 7 -eq 6 ]]
2021-11-03 11:38:34 Sorry, build failed.
2021-11-03 11:38:34 + [[ 7 -eq 7 ]]
2021-11-03 11:38:34 + echo 'Sorry, build failed.'
2021-11-03 11:38:34 + exit 7

🔹 Failed: PR-CI-Windows

build_failed
2021-11-03 11:37:39 [22/1068] Building CUDA object paddle\fluid\operators\CMakeFiles\paddle_operators_unity.dir\paddle_operators_unity_13_cu.cu.obj
2021-11-03 11:37:39 paddle_operators_unity_13_cu.cu
2021-11-03 11:37:46 [23/1068] Building CUDA object paddle\fluid\operators\CMakeFiles\paddle_operators_unity.dir\paddle_operators_unity_25_cu.cu.obj
2021-11-03 11:37:46 paddle_operators_unity_25_cu.cu
2021-11-03 11:37:50 [24/1068] Building CUDA object paddle\fluid\operators\CMakeFiles\paddle_operators_unity.dir\paddle_operators_unity_15_cu.cu.obj
2021-11-03 11:37:51 paddle_operators_unity_15_cu.cu
2021-11-03 11:37:56 [25/1068] Building CUDA object paddle\fluid\operators\CMakeFiles\paddle_operators_unity.dir\paddle_operators_unity_32_cu.cu.obj
2021-11-03 11:37:56 paddle_operators_unity_32_cu.cu
2021-11-03 11:37:56 ninja: build stopped: subcommand failed.
2021-11-03 11:37:56 7
2021-11-03 11:37:56 Build Paddle failed, will exit
2021-11-03 11:37:58 EXCODE: 7

🔹 Failed: PR-CI-Kunlun

Unknown Failed
2021-11-03 11:37:54 [ 49%] Linking CXX static library libset_value_op.a
2021-11-03 11:37:54 [ 49%] Built target set_value_op
2021-11-03 11:38:22 [ 49%] Linking CXX static library libsvd_op.a
2021-11-03 11:38:23 [ 49%] Built target svd_op
2021-11-03 11:39:18 [ 49%] Linking CXX static library libactivation_op.a
2021-11-03 11:39:18 [ 49%] Built target activation_op
2021-11-03 11:39:18 Makefile:140: recipe for target 'all' failed
2021-11-03 11:39:18 make: *** [all] Error 2
2021-11-03 11:39:18 + build_error=2
2021-11-03 11:39:18 + collect_ccache_hits
2021-11-03 11:39:18 ++ ccache -s
2021-11-03 11:39:18 ++ grep 'cache hit rate'
2021-11-03 11:39:18 ++ awk '{print $4}'
2021-11-03 11:39:18 + rate=0.00
2021-11-03 11:39:18 + echo 'ccache hit rate: 0.00%'
2021-11-03 11:39:18 ccache hit rate: 0.00%
2021-11-03 11:39:18 + echo 'ipipe_log_param_Ccache_Hit_Rate: 0.00%'
2021-11-03 11:39:18 + '[' 2 '!=' 0 ']'
2021-11-03 11:39:18 + exit 7

🔹 Failed: PR-CI-ROCM-Compile

Unknown Failed
2021-11-03 11:44:34 + echo 'ipipe_log_param_Ccache_Hit_Rate: 39.23%'
2021-11-03 11:44:34 + '[' 2 '!=' 0 ']'
2021-11-03 11:44:34 + exit 7
2021-11-03 11:44:34 + EXCODE=7
2021-11-03 11:44:34 + export current_dir=/paddle
2021-11-03 11:44:34 + current_dir=/paddle
2021-11-03 11:44:34 + set +x
2021-11-03 11:44:34 + SOURCE=/paddle/build/coverage-diff
2021-11-03 11:44:34 + [[ -d /paddle/build/coverage-diff ]]
2021-11-03 11:44:34 + [[ -f /paddle/build/coverage-diff ]]
2021-11-03 11:44:34 + echo 'No such file or directory: /paddle/build/coverage-diff'
2021-11-03 11:44:34 + exit 0
2021-11-03 11:44:34 No such file or directory: /paddle/build/coverage-diff
2021-11-03 11:44:34 report uploaded
2021-11-03 11:44:34 ===================================================================
2021-11-03 11:44:34 c++-coverage
2021-11-03 11:44:34 https://xly.bce.baidu.com/ipipe/ipipe-report/report/8880587/c++-coverage/
2021-11-03 11:44:34 ===================================================================
2021-11-03 11:44:34 Sorry, build failed.

🔹 Failed: PR-CI-Build

Unknown Failed
2021-11-03 11:50:22 make: *** [all] Error 2
2021-11-03 11:50:22 + build_error=2
2021-11-03 11:50:22 + collect_ccache_hits
2021-11-03 11:50:22 ++ grep 'cache hit rate'
2021-11-03 11:50:22 ++ ccache -s
2021-11-03 11:50:22 ++ awk '{print $4}'
2021-11-03 11:50:23 + rate=96.60
2021-11-03 11:50:23 + echo 'ccache hit rate: 96.60%'
2021-11-03 11:50:23 ccache hit rate: 96.60%
2021-11-03 11:50:23 + echo 'ipipe_log_param_Ccache_Hit_Rate: 96.60%'
2021-11-03 11:50:23 + '[' 2 '!=' 0 ']'
2021-11-03 11:50:23 + exit 7
2021-11-03 11:50:23 + EXCODE=7
2021-11-03 11:50:23 + '[' 7 -eq 0 ']'
2021-11-03 11:50:23 + set +x
2021-11-03 11:50:23 Sorry, build failed.
2021-11-03 11:50:23 + exit 7
2021-11-03 11:50:23 {build code state=7}
2021-11-03 11:50:33 kill agent BUILD_CODE_FAIL

🔹 Failed: PR-CI-Mac-Python3

build_failed
2021-11-03 11:50:57 5 warnings generated.
2021-11-03 11:50:57 [ 85%] Linking CXX static library libgru_op.a
2021-11-03 11:50:57 [ 85%] Built target gru_op
2021-11-03 11:50:57 make: *** [all] Error 2
2021-11-03 11:50:57 + build_error=2
2021-11-03 11:50:57 + collect_ccache_hits
2021-11-03 11:50:57 ++ ccache -s
2021-11-03 11:50:57 ++ grep 'cache hit rate'
2021-11-03 11:50:57 ++ awk '{print $4}'
2021-11-03 11:50:57 + rate=46.78
2021-11-03 11:50:57 + echo 'ccache hit rate: 46.78%'
2021-11-03 11:50:57 ccache hit rate: 46.78%
2021-11-03 11:50:57 + echo 'ipipe_log_param_Ccache_Hit_Rate: 46.78%'
2021-11-03 11:50:57 + '[' 2 '!=' 0 ']'
2021-11-03 11:50:57 + exit 7
2021-11-03 11:50:57 EXCODE: 7
2021-11-03 11:50:57 ipipe_log_param_EXCODE: 7
2021-11-03 11:50:57 Sorry, build failed.
2021-11-03 11:50:57 + exit 7

Please sign in to comment.