Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

conflict #1

Merged
merged 32 commits into from
Nov 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
4527d24
Remove LoDTensor and Tensor in fluid except operators folder (#48416)
Liyulingyue Nov 28, 2022
8d00f76
【fluid api clear】Remove reduce sum (#48330)
xiaoguoguo626807 Nov 28, 2022
81d0a3c
add square fp16 *test=kunlun (#48095)
HarperCy Nov 28, 2022
827fd5c
fix expand as op (#48336)
HexToString Nov 28, 2022
20c3224
[Paddle Inference] Add gather_nd trt converter. (#47589)
xiaoxiaohehe001 Nov 28, 2022
1a92098
clean fluid task: transfer logical_and api (#48341)
201716010711 Nov 28, 2022
b5cd67e
delete strided_slice api (#48395)
201716010711 Nov 28, 2022
d983fc3
clear fluid api: warpctc, nce, identity_loss (#48142)
isky-cd Nov 28, 2022
b35ec18
[Auto Parallel] Add matmul flops (#47816)
CjhHa1 Nov 28, 2022
d7540a4
add pbtxt (#48326)
b3602sss Nov 28, 2022
86d9209
Use phi layernorm (#48276)
MARD1NO Nov 28, 2022
d3f52ef
Fix bug of TransToFluidOpName (#48355)
zyfncg Nov 28, 2022
2bae75e
[Phi decouple] remove dependece to "paddle/fluid/platform/device/xpu/…
huangjiyi Nov 28, 2022
e7d459a
Remove kSyncRun in StreamAnalyzer (#48425)
From00 Nov 28, 2022
174726f
[Auto Parallel]Add pattern for auto search (#48316)
Caozhou1995 Nov 28, 2022
fd68910
Remove unnecessary exports in `distributed.communication` and move `w…
HermitSun Nov 28, 2022
ab18644
remove fluid (#47959)
esythan Nov 28, 2022
bb1fffd
Add trace mode for interpretercore (#48370)
zhangbo9674 Nov 28, 2022
c39d1cf
[CodeStyle][isort] introduce isort (part5) (#48404)
SigureMo Nov 28, 2022
74d411e
(fluid清理)remove flatten in nn.py under fluid (#47940)
Courtesy-Xs Nov 28, 2022
32143f4
[NPU] apply npu_identity to conv bn and copy2cpu, test=develop (#48039)
qili93 Nov 28, 2022
fe617f9
delete gaussian_random_batch_size_like api (#48394)
201716010711 Nov 28, 2022
923ad5d
add cpu_info.h (#48403)
AndPuQing Nov 28, 2022
b4b926f
migrate top_k_function_cuda.h from fluid to phi (#48251)
Asthestarsfalll Nov 28, 2022
df82fd3
[BugFix]Fix OneDNN Kernels Bug when use pass (#48364)
YuanRisheng Nov 28, 2022
8424cf2
Optimize the log of broadcast and decrease the log level. (#48327)
Xreki Nov 28, 2022
30a31a5
replace LoDTensor with phi::DenseTensor in fluid\operators\*\ except …
Liyulingyue Nov 28, 2022
fd9c91c
[PHI decoupling] move several header files from fluid to phi (#48415)
huangjiyi Nov 28, 2022
ea830d4
[Fluid Clean] Migrate program_translate.py/jit.py into paddle.jit dir…
Aurelius84 Nov 28, 2022
4edf37d
[Clean Fluid API]Remove API: crop_tensor (#47983)
Vvsmile Nov 28, 2022
2fe9299
[Clean Fluid API]Remove squeeze: use paddle.squeeze to replace paddle…
Vvsmile Nov 28, 2022
b5c6c36
Generate static graph code for some ops by yaml (part5) (#48284)
zyfncg Nov 28, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ paddle/fluid/operators/generated_op.cc
paddle/fluid/operators/generated_sparse_op.cc
paddle/phi/ops/compat/generated_sig.cc
paddle/phi/ops/compat/generated_sparse_sig.cc
paddle/phi/api/yaml/parsed_apis/
paddle/fluid/operators/generator/parsed_ops/
paddle/fluid/pybind/tmp_eager_op_function_impl.h
paddle/fluid/pybind/eager_op_function_impl.h
Expand Down
28 changes: 14 additions & 14 deletions paddle/fluid/distributed/ps/service/communicator/communicator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ limitations under the License. */
namespace paddle {
namespace distributed {

using LoDTensor = phi::DenseTensor;
using phi::SelectedRows;

const uint32_t MAX_FEASIGN_NUM = 1024 * 100 * 100;
Expand Down Expand Up @@ -97,11 +96,11 @@ void Communicator::RpcRecvDense(const std::vector<std::string> &varnames,
regions.reserve(varnames.size());
for (auto &t : varnames) {
Variable *var = scope->Var(t);
LoDTensor *tensor = var->GetMutable<LoDTensor>();
phi::DenseTensor *tensor = var->GetMutable<phi::DenseTensor>();
if (platform::is_gpu_place(tensor->place())) {
#ifdef PADDLE_WITH_CUDA
Variable *temp_var = xpu_temp_scope_->Var(t);
LoDTensor *temp_tensor = temp_var->GetMutable<LoDTensor>();
phi::DenseTensor *temp_tensor = temp_var->GetMutable<phi::DenseTensor>();
temp_tensor->Resize(tensor->dims());
float *temp_data = temp_tensor->mutable_data<float>(platform::CPUPlace());
paddle::distributed::Region reg(temp_data, tensor->numel());
Expand All @@ -122,7 +121,7 @@ void Communicator::RpcRecvDense(const std::vector<std::string> &varnames,

for (auto &t : varnames) {
Variable *var = scope->FindVar(t);
LoDTensor *tensor = var->GetMutable<LoDTensor>();
phi::DenseTensor *tensor = var->GetMutable<phi::DenseTensor>();
VLOG(3) << "Communicator::RecvNoBarrier Var " << t << " On gpu? "
<< platform::is_gpu_place(tensor->place());

Expand All @@ -132,8 +131,8 @@ void Communicator::RpcRecvDense(const std::vector<std::string> &varnames,
<< " Temp_data[-1] " << temp_recv_data[tensor->numel() - 1];
if (platform::is_gpu_place(tensor->place())) {
#ifdef PADDLE_WITH_CUDA
LoDTensor *temp_tensor =
xpu_temp_scope_->FindVar(t)->GetMutable<LoDTensor>();
phi::DenseTensor *temp_tensor =
xpu_temp_scope_->FindVar(t)->GetMutable<phi::DenseTensor>();
framework::TensorCopy(*temp_tensor, tensor->place(), tensor);
float *temp_data = temp_tensor->mutable_data<float>(platform::CPUPlace());
VLOG(1) << "Communicator::RpcRecvDense Var " << t << " table_id "
Expand All @@ -157,11 +156,11 @@ void Communicator::RpcSendDenseParam(const std::vector<std::string> &varnames,
for (auto &t : varnames) {
Variable *var = scope.FindVar(t);
CHECK(var != nullptr) << "var[" << t << "] not found";
LoDTensor *tensor = var->GetMutable<LoDTensor>();
phi::DenseTensor *tensor = var->GetMutable<phi::DenseTensor>();
if (platform::is_gpu_place(tensor->place())) {
#ifdef PADDLE_WITH_CUDA
Variable *temp_var = xpu_temp_scope_->Var(t);
LoDTensor *temp_tensor = temp_var->GetMutable<LoDTensor>();
phi::DenseTensor *temp_tensor = temp_var->GetMutable<phi::DenseTensor>();
temp_tensor->Resize(tensor->dims());
float *temp_data = temp_tensor->mutable_data<float>(platform::CPUPlace());
framework::TensorCopy(*tensor, platform::CPUPlace(), temp_tensor);
Expand Down Expand Up @@ -203,7 +202,8 @@ void Communicator::RpcSendDense(const CommContext &ctx,
float *data = dense_data->data();
uint32_t pos = 0;
for (size_t i = 0; i < var_names.size(); ++i) {
const LoDTensor tensor = scope.FindVar(var_names[i])->Get<LoDTensor>();
const phi::DenseTensor tensor =
scope.FindVar(var_names[i])->Get<phi::DenseTensor>();
size_t count = static_cast<size_t>(tensor.numel());
const float *g = tensor.data<float>();
CHECK(pos + count <= dense_data->size())
Expand Down Expand Up @@ -472,13 +472,13 @@ void AsyncCommunicator::RecvNoBarrier() {
auto var_names = iter.second;
for (auto &t : var_names) {
Variable *var = recv_scope_->FindVar(t);
LoDTensor *tensor = var->GetMutable<LoDTensor>();
phi::DenseTensor *tensor = var->GetMutable<phi::DenseTensor>();
VLOG(3) << "AsyncCommunicator::RecvNoBarrier Var " << t << " On gpu? "
<< platform::is_gpu_place(tensor->place());
if (platform::is_gpu_place(tensor->place())) {
#ifdef PADDLE_WITH_CUDA
LoDTensor *temp_tensor =
xpu_temp_scope_->FindVar(t)->GetMutable<LoDTensor>();
phi::DenseTensor *temp_tensor =
xpu_temp_scope_->FindVar(t)->GetMutable<phi::DenseTensor>();
framework::TensorCopy(*temp_tensor, tensor->place(), tensor);
#endif
}
Expand Down Expand Up @@ -591,8 +591,8 @@ void AsyncCommunicator::PullSparseToTensorSync(
uint64_t padding_id,
platform::Place place,
bool is_training,
std::vector<const LoDTensor *> *inputs,
std::vector<LoDTensor *> *outputs) {
std::vector<const phi::DenseTensor *> *inputs,
std::vector<phi::DenseTensor *> *outputs) {
std::vector<uint64_t> fea_keys;
std::vector<float *> pull_result_ptr;
fea_keys.reserve(MAX_FEASIGN_NUM / 100);
Expand Down
46 changes: 23 additions & 23 deletions paddle/fluid/distributed/ps/wrapper/fleet.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ limitations under the License. */
namespace paddle {
namespace distributed {

using LoDTensor = phi::DenseTensor;
using framework::ProgramDesc;
using framework::VarDesc;
using framework::Variable;
Expand Down Expand Up @@ -232,7 +231,7 @@ std::future<int32_t> FleetWrapper::PullSparseVarsAsync(
if (var == nullptr) {
continue;
}
LoDTensor* tensor = var->GetMutable<LoDTensor>();
phi::DenseTensor* tensor = var->GetMutable<phi::DenseTensor>();
CHECK(tensor != nullptr) << "tensor of var " << name << " is null";
int64_t* ids = tensor->data<int64_t>();
size_t len = tensor->numel();
Expand Down Expand Up @@ -279,7 +278,7 @@ void FleetWrapper::PullSparseVarsSync(
if (var == nullptr) {
continue;
}
LoDTensor* tensor = var->GetMutable<LoDTensor>();
phi::DenseTensor* tensor = var->GetMutable<phi::DenseTensor>();
CHECK(tensor != nullptr) << "tensor of var " << name << " is null";
int64_t* ids = tensor->data<int64_t>();
size_t len = tensor->numel();
Expand Down Expand Up @@ -327,13 +326,14 @@ void FleetWrapper::PullSparseVarsSync(
// is_training is true means training, false means inference, the behavior is
// different on pserver

void FleetWrapper::PullSparseToTensorSync(const uint64_t table_id,
int fea_dim,
uint64_t padding_id,
platform::Place place,
bool is_training,
std::vector<const LoDTensor*>* inputs,
std::vector<LoDTensor*>* outputs) {
void FleetWrapper::PullSparseToTensorSync(
const uint64_t table_id,
int fea_dim,
uint64_t padding_id,
platform::Place place,
bool is_training,
std::vector<const phi::DenseTensor*>* inputs,
std::vector<phi::DenseTensor*>* outputs) {
std::vector<uint64_t> fea_keys;
std::vector<float*> pull_result_ptr;
fea_keys.reserve(MAX_FEASIGN_NUM / 100);
Expand Down Expand Up @@ -398,7 +398,7 @@ void FleetWrapper::PullDenseVarsAsync(
varname = var_names[i] + "pin";
}
Variable* var = scope.FindVar(varname);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
phi::DenseTensor* tensor = var->GetMutable<phi::DenseTensor>();
float* w = tensor->data<float>();
paddle::distributed::Region reg(w, tensor->numel());
regions[i] = std::move(reg);
Expand All @@ -417,7 +417,7 @@ void FleetWrapper::PullDenseVarsSync(
regions.reserve(var_names.size());
for (auto& t : var_names) {
Variable* var = scope.FindVar(t);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
phi::DenseTensor* tensor = var->GetMutable<phi::DenseTensor>();
if (!platform::is_gpu_place(tensor->place())) {
float* w = tensor->data<float>();
paddle::distributed::Region reg(w, tensor->numel());
Expand All @@ -437,7 +437,7 @@ void FleetWrapper::PushDenseParamSync(
for (auto& t : var_names) {
Variable* var = scope.FindVar(t);
CHECK(var != nullptr) << "var[" << t << "] not found";
LoDTensor* tensor = var->GetMutable<LoDTensor>();
phi::DenseTensor* tensor = var->GetMutable<phi::DenseTensor>();
if (!platform::is_gpu_place(tensor->place())) {
float* g = tensor->mutable_data<float>(place);
paddle::distributed::Region reg(g, tensor->numel());
Expand Down Expand Up @@ -468,7 +468,7 @@ void FleetWrapper::PushDenseVarsAsync(
for (auto& t : var_names) {
Variable* var = scope.FindVar(t);
CHECK(var != nullptr) << "var[" << t << "] not found";
LoDTensor* tensor = var->GetMutable<LoDTensor>();
phi::DenseTensor* tensor = var->GetMutable<phi::DenseTensor>();
int count = tensor->numel();
float* g = tensor->mutable_data<float>(place);
// TODO(zhaocaibei123): how to get batch_size in op?
Expand Down Expand Up @@ -544,8 +544,8 @@ void FleetWrapper::PushSparseFromTensorWithLabelAsync(
const std::string& click_name,
platform::Place place,
const std::vector<std::string>& input_names,
std::vector<const LoDTensor*>* inputs,
std::vector<const LoDTensor*>* outputs) {
std::vector<const phi::DenseTensor*>* inputs,
std::vector<const phi::DenseTensor*>* outputs) {
// not support
return;
}
Expand All @@ -555,11 +555,11 @@ void FleetWrapper::PushSparseFromTensorAsync(
int fea_dim,
uint64_t padding_id,
platform::Place place,
std::vector<const LoDTensor*>* inputs,
std::vector<const phi::DenseTensor*>* inputs,
std::vector<int>& slots,
const LoDTensor* shows,
const LoDTensor* clks,
std::vector<LoDTensor*>* outputs,
const phi::DenseTensor* shows,
const phi::DenseTensor* clks,
std::vector<phi::DenseTensor*>* outputs,
bool use_cvm_op) {
CHECK(slots.size() == inputs->size());
int batch_size = -1;
Expand Down Expand Up @@ -777,7 +777,7 @@ void FleetWrapper::ShrinkDenseTable(int table_id,
Variable* var = scope->FindVar(name);
CHECK(var != nullptr) << "var[" << name << "] not found";
VLOG(3) << "prepare shrink dense batch_sum";
LoDTensor* tensor = var->GetMutable<LoDTensor>();
phi::DenseTensor* tensor = var->GetMutable<phi::DenseTensor>();
float* g = tensor->data<float>();

// show_batch_sum += N * log(decay)
Expand All @@ -787,7 +787,7 @@ void FleetWrapper::ShrinkDenseTable(int table_id,
Variable* var_size = scope->FindVar(size_name);
CHECK(var_size != nullptr) << "var[" << size_name << "] not found";
VLOG(3) << "shrink dense batch_sum: " << name << ", " << size_name;
float* g_size = var_size->GetMutable<LoDTensor>()->data<float>();
float* g_size = var_size->GetMutable<phi::DenseTensor>()->data<float>();

for (int k = 0; k < tensor->numel(); k += emb_dim) {
g[k] = g[k] + g_size[k] * log(decay);
Expand All @@ -797,7 +797,7 @@ void FleetWrapper::ShrinkDenseTable(int table_id,
} else {
Variable* var = scope->FindVar(name);
CHECK(var != nullptr) << "var[" << name << "] not found";
LoDTensor* tensor = var->GetMutable<LoDTensor>();
phi::DenseTensor* tensor = var->GetMutable<phi::DenseTensor>();
float* g = tensor->data<float>();
paddle::distributed::Region reg(g, tensor->numel());
regions.emplace_back(std::move(reg));
Expand Down
28 changes: 14 additions & 14 deletions paddle/fluid/distributed/ps/wrapper/fleet.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ namespace distributed {

class PSCore;

using LoDTensor = phi::DenseTensor;
using framework::Scope;
using framework::Variable;
using phi::SelectedRows;
Expand Down Expand Up @@ -111,13 +110,14 @@ class FleetWrapper {
// is_training is true means training, false means inference, the behavior is
// different on pserver

void PullSparseToTensorSync(const uint64_t table_id,
int fea_dim,
uint64_t padding_id,
platform::Place place,
bool is_training,
std::vector<const LoDTensor*>* inputs, // NOLINT
std::vector<LoDTensor*>* outputs); // NOLINT
void PullSparseToTensorSync(
const uint64_t table_id,
int fea_dim,
uint64_t padding_id,
platform::Place place,
bool is_training,
std::vector<const phi::DenseTensor*>* inputs, // NOLINT
std::vector<phi::DenseTensor*>* outputs); // NOLINT

// pull dense variables from server in sync mod
// Param<in>: scope, table_id, var_names
Expand Down Expand Up @@ -188,18 +188,18 @@ class FleetWrapper {
const std::string& click_name,
platform::Place place,
const std::vector<std::string>& input_names,
std::vector<const LoDTensor*>* inputs, // NOLINT
std::vector<const LoDTensor*>* outputs); // NOLINT
std::vector<const phi::DenseTensor*>* inputs, // NOLINT
std::vector<const phi::DenseTensor*>* outputs); // NOLINT

void PushSparseFromTensorAsync(const uint64_t table_id,
int fea_dim,
uint64_t padding_id,
platform::Place place,
std::vector<const LoDTensor*>* inputs,
std::vector<const phi::DenseTensor*>* inputs,
std::vector<int>& slots, // NOLINT
const LoDTensor* shows,
const LoDTensor* clicks,
std::vector<LoDTensor*>* outputs,
const phi::DenseTensor* shows,
const phi::DenseTensor* clicks,
std::vector<phi::DenseTensor*>* outputs,
bool use_cvm_op = false);
// Push sparse variables to server in Async mode
// Param<In>: scope, table_id, fea_keys, sparse_grad_names
Expand Down
1 change: 0 additions & 1 deletion paddle/fluid/distributed/ps/wrapper/ps_wrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ namespace distributed {

class PSCore;

using LoDTensor = phi::DenseTensor;
using framework::Scope;
using framework::Variable;
using phi::SelectedRows;
Expand Down
36 changes: 11 additions & 25 deletions paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,6 @@ class ContextManager {
inline std::string RunTypeToString(DownstreamRunType run_type) {
if (run_type == DownstreamRunType::kDirectRun) {
return "DirectRun";
} else if (run_type == DownstreamRunType::kSyncRun) {
return "SyncRun";
} else {
return "EventRun";
}
Expand Down Expand Up @@ -238,16 +236,10 @@ void StreamAnalyzer::AnalyseAllEventInfo(
event_info) const {
for (size_t cur_instr_id = 0; cur_instr_id < instructions.size();
++cur_instr_id) {
const Instruction& cur_instr = instructions[cur_instr_id];
const std::vector<std::vector<size_t>>& next_instr_list =
run_type_info[cur_instr_id];
const std::vector<size_t>& next_instr_ids =
run_type_info[cur_instr_id][DownstreamRunType::kEventRun];
std::set<size_t> waiter_instr_ids;

std::vector<size_t> next_instr_ids =
next_instr_list[DownstreamRunType::kSyncRun];
next_instr_ids.insert(next_instr_ids.end(),
next_instr_list[DownstreamRunType::kEventRun].begin(),
next_instr_list[DownstreamRunType::kEventRun].end());
for (size_t next_instr_id : next_instr_ids) {
AnalyseEventInfoForTwoInstructions(instructions,
run_type_info,
Expand All @@ -257,8 +249,9 @@ void StreamAnalyzer::AnalyseAllEventInfo(
}

for (size_t waiter_instr_id : waiter_instr_ids) {
(*event_info)[&(cur_instr.DeviceContext())][waiter_instr_id].insert(
cur_instr_id);
(*event_info)[&(instructions[cur_instr_id].DeviceContext())]
[waiter_instr_id]
.insert(cur_instr_id);
}
}
}
Expand All @@ -284,7 +277,7 @@ void StreamAnalyzer::AnalyseAllRunType(
}
}

// The caller should guarantee cur_instr and next_instr is kSyncRun or kEventRun
// The caller should guarantee cur_instr and next_instr is kEventRun
void StreamAnalyzer::AnalyseEventInfoForTwoInstructions(
const std::vector<Instruction>& instructions,
const std::vector<std::vector<std::vector<size_t>>>& run_type_info,
Expand All @@ -311,16 +304,15 @@ void StreamAnalyzer::AnalyseEventInfoForTwoInstructions(
// can only add event for it with the help of depend_op.
if (HasDataDependency(instructions[cur_instr_id],
instructions[next_instr_id]) ||
run_type_info[next_instr_id][DownstreamRunType::kSyncRun].size() ||
run_type_info[next_instr_id][DownstreamRunType::kEventRun].size() ||
instructions[next_instr_id].OpBase()->Type() == "depend") {
waiter_instr_ids->insert(next_instr_id);
return;
}

// NOTE(Ruibiao): If no data dependency from cur_instr to next_instr, and
// simultaneously next_instr has neither sync_run nor event_run downstream
// instr, we try to recursively add events between cur_instr and next_instr's
// simultaneously next_instr has no event_run downstream instr, we try to
// recursively add events between cur_instr and next_instr's
// direct-run-instrs. This can delay the event wait and achieve better
// scheduling performance in some scenarios. However, when next_instr has too
// many direct-run-instrs, it may perform worse than add event directly
Expand Down Expand Up @@ -393,15 +385,9 @@ DownstreamRunType StreamAnalyzer::AnalyseRunTypeForTwoInstructions(
}
}

if (cur_instr.KernelType() == OpFuncType::kGpuAsync) {
if (next_instr.KernelType() == OpFuncType::kCpuSync) {
return DownstreamRunType::kSyncRun;
} else {
// cross-stream: kGpuAsync -> kGpuSync, kGpuAsync -> kGpuSync
if (&cur_instr.DeviceContext() != &next_instr.DeviceContext()) {
return DownstreamRunType::kEventRun;
}
}
if (cur_instr.KernelType() == OpFuncType::kGpuAsync &&
(&cur_instr.DeviceContext() != &next_instr.DeviceContext())) {
return DownstreamRunType::kEventRun;
}

return DownstreamRunType::kDirectRun;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ namespace paddle {
namespace framework {
namespace interpreter {

enum DownstreamRunType { kDirectRun, kSyncRun, kEventRun };
enum DownstreamRunType { kDirectRun, kEventRun };

class StreamAnalyzer {
public:
Expand Down
Loading