Skip to content

Commit

Permalink
Added fluid dependencies to Eager Dygraph (PaddlePaddle#37555)
Browse files Browse the repository at this point in the history
  • Loading branch information
jim19930609 authored Nov 26, 2021
1 parent a68eeb0 commit a9608f6
Show file tree
Hide file tree
Showing 5 changed files with 971 additions and 0 deletions.
2 changes: 2 additions & 0 deletions paddle/fluid/eager/legacy/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Collect every legacy Dygraph source file (*.cpp / *.cc) in this directory.
file(GLOB DYGRAPH_LEGACY "*.cpp" "*.cc")
# Re-export the list to the parent scope so the enclosing build can use it.
set(DYGRAPH_LEGACY ${DYGRAPH_LEGACY} PARENT_SCOPE)
258 changes: 258 additions & 0 deletions paddle/fluid/eager/legacy/amp_auto_cast.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,258 @@
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/eager/legacy/amp_auto_cast.h"
#include <memory>
#include <string>
#include "paddle/fluid/eager/legacy/op_runner.h"
#include "paddle/fluid/eager/legacy/tensor_helper.h"
#include "paddle/fluid/framework/operator.h"

namespace egr {

// Builds the three op sets. The allow/block sets start empty (filled by
// callers via the mutable accessors); the unsupported-fp16 set is seeded by
// scanning every registered kernel for an fp16 variant on GPU or XPU.
AmpOperators::AmpOperators()
    : allow_ops_(new std::unordered_set<std::string>()),
      block_ops_(new std::unordered_set<std::string>()),
      unsupported_fp16_ops_(new std::unordered_set<std::string>()) {
  const auto& all_kernels = paddle::framework::OperatorWithKernel::AllOpKernels();
  const auto fp16_dtype = paddle::framework::proto::VarType::FP16;
  for (const auto& op_kernels : all_kernels) {
    bool has_fp16_kernel = false;
    for (const auto& kernel_type : op_kernels.second) {
      const bool on_amp_device =
          paddle::platform::is_gpu_place(kernel_type.first.place_) ||
          paddle::platform::is_xpu_place(kernel_type.first.place_);
      if (on_amp_device && kernel_type.first.data_type_ == fp16_dtype) {
        has_fp16_kernel = true;
        break;  // one fp16 kernel is enough to mark the op as supported
      }
    }
    if (!has_fp16_kernel) {
      unsupported_fp16_ops_->insert(op_kernels.first);
    }
  }
}

AmpOperators::~AmpOperators() {}

// Returns the process-wide singleton. The function-local static guarantees
// one-time, thread-safe initialization (C++11 magic statics).
AmpOperators& AmpOperators::Instance() {
  static AmpOperators instance;
  return instance;
}

// Ops that AMP always casts to fp16. Returned as a shared pointer so the
// caller may mutate the set in place.
std::shared_ptr<std::unordered_set<std::string>>
AmpOperators::GetMutableAllowOps() {
  return allow_ops_;
}

// Ops that AMP always keeps in fp32. Mutable, same as above.
std::shared_ptr<std::unordered_set<std::string>>
AmpOperators::GetMutableBlockOps() {
  return block_ops_;
}

// Ops that have no registered fp16 kernel (populated by the constructor).
std::shared_ptr<std::unordered_set<std::string>>
AmpOperators::GetMutableUnsupportedFp16Ops() {
  return unsupported_fp16_ops_;
}

// Streams the three AMP op sets, each as a space-separated list.
std::ostream& operator<<(std::ostream& os, AmpOperators& ops) {
  // Shared printer: writes every op name followed by a single space.
  auto print_set =
      [&os](const std::shared_ptr<std::unordered_set<std::string>>& names) {
        for (const auto& name : *names) {
          os << name << " ";
        }
      };
  os << "allow ops: ";
  print_set(ops.GetMutableAllowOps());
  os << "\n";
  os << "block ops: ";
  print_set(ops.GetMutableBlockOps());
  os << "\n";
  os << "unsupported fp16 ops: ";
  print_set(ops.GetMutableUnsupportedFp16Ops());
  return os;
}

// Human-readable name of the tensor's data type, for logging.
inline std::string GetDtypeStr(
    const std::shared_ptr<egr::EagerTensor>& tensor) {
  return paddle::framework::DataTypeToString(
      egr::GetDtypeFromVar(tensor->Var()));
}

// A tensor is eligible for AMP casting only when it lives on a GPU,
// CUDA-pinned, or XPU place AND is already fp32 or fp16.
// (CUDA-pinned place is included for tensors created by the dataloader.)
inline bool NeedCast(const std::shared_ptr<egr::EagerTensor>& tensor) {
  const auto place = egr::GetPlaceFromVar(tensor->Var());
  const bool on_amp_place = paddle::platform::is_gpu_place(place) ||
                            paddle::platform::is_cuda_pinned_place(place) ||
                            paddle::platform::is_xpu_place(place);
  if (!on_amp_place) {
    return false;
  }
  const auto data_type = egr::GetDtypeFromVar(tensor->Var());
  return data_type == paddle::framework::proto::VarType::FP32 ||
         data_type == paddle::framework::proto::VarType::FP16;
}

// NOTE: Trace a cast op, so if a var is casted from fp32 to fp16, then the grad
// var will be cast back from fp16 to fp32 during backward phase.
static inline std::shared_ptr<egr::EagerTensor> CastToType(
    const std::shared_ptr<egr::EagerTensor>& tensor,
    const paddle::framework::proto::VarType::Type dst_type) {
  const auto src_type = egr::GetDtypeFromVar(tensor->Var());
  auto out = std::make_shared<egr::EagerTensor>();
  NameTensorMap ins = {{"X", {tensor}}};
  NameTensorMap outs = {{"Out", {out}}};
  paddle::framework::AttributeMap attrs = {{"in_dtype", src_type},
                                           {"out_dtype", dst_type}};

  {
    // Run the cast with AMP disabled so the cast op itself is not auto-cast.
    AutoCastGuard guard(0);
    paddle::framework::AttributeMap default_attrs;
    RunOp("cast", ins, outs, std::move(attrs), {}, &default_attrs, true);
  }

  return out;
}

// Casts `tensor` to fp16 when it is eligible and not already fp16;
// otherwise returns the tensor unchanged.
static inline std::shared_ptr<egr::EagerTensor> CastToFP16(
    const std::shared_ptr<egr::EagerTensor>& tensor) {
  const auto target = paddle::framework::proto::VarType::FP16;
  if (!NeedCast(tensor)) {
    return tensor;
  }
  if (egr::GetDtypeFromVar(tensor->Var()) == target) {
    return tensor;
  }
  return CastToType(tensor, target);
}

// Casts `tensor` to fp32 when it is eligible and not already fp32;
// otherwise returns the tensor unchanged.
static inline std::shared_ptr<egr::EagerTensor> CastToFP32(
    const std::shared_ptr<egr::EagerTensor>& tensor) {
  const auto target = paddle::framework::proto::VarType::FP32;
  if (!NeedCast(tensor)) {
    return tensor;
  }
  if (egr::GetDtypeFromVar(tensor->Var()) == target) {
    return tensor;
  }
  return CastToType(tensor, target);
}

// Picks the promoted dtype for ops on neither the allow nor the block list:
// fp16 only when no input is fp32, otherwise fp32.
static inline paddle::framework::proto::VarType::Type GetPromoteType(
    const std::string& op_type, const NameTensorMap& ins) {
  bool has_fp32_input = false;
  for (const auto& slot : ins) {
    for (const auto& tensor : slot.second) {
      if (egr::GetDtypeFromVar(tensor->Var()) ==
          paddle::framework::proto::VarType::FP32) {
        has_fp32_input = true;
        break;  // this slot is decided; continue with the remaining slots
      }
    }
  }
  auto dst_type = has_fp32_input ? paddle::framework::proto::VarType::FP32
                                 : paddle::framework::proto::VarType::FP16;

  // NOTE(juncai): moving_average_abs_max_scale only considers the dtype of
  // input(X).
  if (op_type == "moving_average_abs_max_scale") {
    for (const auto& slot : ins) {
      if (slot.first == "X" &&
          egr::GetDtypeFromVar(slot.second.front()->Var()) ==
              paddle::framework::proto::VarType::FP16) {
        dst_type = paddle::framework::proto::VarType::FP16;
      }
    }
  }

  return dst_type;
}

// AMP (O1) input casting. Returns a copy of `ins` with each input cast per
// the op lists:
//  - allow-list ops: inputs cast to fp16;
//  - block-list ops: inputs cast to fp32;
//  - all other ops: promoted — fp16 only when every input is fp16 AND the op
//    has an fp16 kernel, otherwise fp32.
// `ins` itself is never modified.
NameTensorMap AutoCastInputs(const std::string& op_type,
                             const NameTensorMap& ins) {
  NameTensorMap new_ins(ins);
  if (AmpOperators::Instance().GetMutableAllowOps()->count(op_type)) {
    for (auto& pair : new_ins) {
      // NOTE(zhiqiu): batch_norm and layer_norm support only input x is fp16.
      if ((op_type == "batch_norm" || op_type == "layer_norm" ||
           op_type == "sync_batch_norm") &&
          pair.first != "X") {
        continue;
      }

      VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
              << GetDtypeStr(*pair.second.cbegin()) << " to float16";
      for (auto& var : pair.second) {
        var = CastToFP16(var);
      }
    }
    return new_ins;
  } else if (AmpOperators::Instance().GetMutableBlockOps()->count(op_type)) {
    for (auto& pair : new_ins) {
      VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
              << GetDtypeStr(*pair.second.cbegin()) << " to float";
      for (auto& var : pair.second) {
        var = CastToFP32(var);
      }
    }
    return new_ins;
  } else {
    auto dst_type = GetPromoteType(op_type, ins);

    // NOTE(zhiqiu): if the op has no fp16 kernel, fall back to fp32.
    if (dst_type == paddle::framework::proto::VarType::FP16 &&
        AmpOperators::Instance().GetMutableUnsupportedFp16Ops()->count(
            op_type)) {
      dst_type = paddle::framework::proto::VarType::FP32;
    }
    for (auto& pair : new_ins) {
      // NOTE(zhiqiu): batch_norm and layer_norm support only input x is fp16.
      if ((op_type == "batch_norm" || op_type == "layer_norm" ||
           op_type == "sync_batch_norm") &&
          pair.first == "X" &&
          dst_type == paddle::framework::proto::VarType::FP32) {
        continue;
      }
      VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
              << GetDtypeStr(*pair.second.cbegin()) << " to "
              << paddle::framework::DataTypeToString(dst_type);
      for (auto& var : pair.second) {
        var = (dst_type == paddle::framework::proto::VarType::FP32
                   ? CastToFP32(var)
                   : CastToFP16(var));
      }
    }
    return new_ins;
  }
  // Dead `return new_ins;` removed: every branch above already returns.
}

// Pure-fp16 mode: returns a copy of `ins` with every input cast to fp16,
// except that ops on the block list or without an fp16 kernel get fp32, and
// norm ops have only their "X" input cast.
NameTensorMap CastPureFp16Inputs(const std::string& op_type,
                                 const NameTensorMap& ins) {
  NameTensorMap new_ins(ins);

  // Decide the target dtype once, up front.
  const bool must_run_fp32 =
      AmpOperators::Instance().GetMutableUnsupportedFp16Ops()->count(op_type) ||
      AmpOperators::Instance().GetMutableBlockOps()->count(op_type);
  const auto dst_type = must_run_fp32 ? paddle::framework::proto::VarType::FP32
                                      : paddle::framework::proto::VarType::FP16;

  const bool is_norm_op = op_type == "batch_norm" || op_type == "layer_norm" ||
                          op_type == "sync_batch_norm";
  for (auto& slot : new_ins) {
    // Norm ops accept fp16 only for input "X"; leave other inputs untouched.
    if (is_norm_op && slot.first != "X") {
      continue;
    }
    VLOG(5) << "Op(" << op_type << "): Cast " << slot.first << " from "
            << GetDtypeStr(*slot.second.cbegin()) << " to "
            << paddle::framework::DataTypeToString(dst_type);
    for (auto& tensor : slot.second) {
      tensor = (dst_type == paddle::framework::proto::VarType::FP32
                    ? CastToFP32(tensor)
                    : CastToFP16(tensor));
    }
  }
  return new_ins;
}

} // namespace egr
95 changes: 95 additions & 0 deletions paddle/fluid/eager/legacy/amp_auto_cast.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include <memory>
#include <set>
#include <string>
#include <tuple>
#include <unordered_set>

#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/eager/eager_tensor.h"
#include "paddle/fluid/eager/legacy/type_def.h"

namespace egr {

// Mixed-precision (AMP) optimization levels.
// NOTE(zhiqiu): only O1 and O2 are valid now
enum class AmpLevel {
  O0 = 0,  // pure fp32 (AMP disabled)
  O1,      // amp, mixed fp32-fp16
  O2,      // almost fp16
  O3,      // pure fp16
};

// Singleton holding the three op sets that drive AMP auto-casting:
// ops always run in fp16 (allow), ops always run in fp32 (block), and ops
// with no fp16 kernel. The sets are exposed as shared pointers so callers
// can mutate them in place.
class AmpOperators {
 public:
  ~AmpOperators();
  AmpOperators(const AmpOperators& o) = delete;
  const AmpOperators& operator=(const AmpOperators& o) = delete;

  // Accessor for the process-wide instance.
  static AmpOperators& Instance();

  std::shared_ptr<std::unordered_set<std::string>> GetMutableAllowOps();

  std::shared_ptr<std::unordered_set<std::string>> GetMutableBlockOps();

  std::shared_ptr<std::unordered_set<std::string>>
  GetMutableUnsupportedFp16Ops();

 private:
  AmpOperators();  // forbid calling default constructor

  // The set of ops that support fp16 calculation and are considered numerically
  // safe and performance critical. These ops are always converted to fp16.
  std::shared_ptr<std::unordered_set<std::string>> allow_ops_;

  // The set of ops that support fp16 calculation and are considered numerically
  // dangerous and whose effects may also be observed in downstream ops.
  std::shared_ptr<std::unordered_set<std::string>> block_ops_;

  // The set of ops that has no fp16 kernel (scanned over GPU/XPU kernels).
  std::shared_ptr<std::unordered_set<std::string>> unsupported_fp16_ops_;
};

std::ostream& operator<<(std::ostream& os, AmpOperators& ops);

// NOTE(zhiqiu): AutoCastGuard is used for RAII: it switches the global AMP
// level on construction and restores the previous level on destruction.
class AutoCastGuard {
 public:
  explicit AutoCastGuard(int guard_level)
      : pre_amp_level_(Controller::Instance().GetAMPLevel()) {
    // Skip the redundant write when the level is already the requested one.
    if (guard_level != pre_amp_level_) {
      Controller::Instance().SetAMPLevel(guard_level);
    }
  }

  ~AutoCastGuard() { Controller::Instance().SetAMPLevel(pre_amp_level_); }

  // Non-copyable: the previous level must be restored exactly once.
  AutoCastGuard(const AutoCastGuard& guard) = delete;
  AutoCastGuard& operator=(const AutoCastGuard& guard) = delete;

 private:
  int pre_amp_level_;
};

// Returns a copy of `ins` with inputs cast per the AMP allow/block lists
// (mixed-precision O1 behavior). See amp_auto_cast.cc for the exact rules.
NameTensorMap AutoCastInputs(const std::string& op_type,
                             const NameTensorMap& ins);

// Returns a copy of `ins` with inputs cast to fp16, falling back to fp32
// for ops on the block list or without an fp16 kernel.
NameTensorMap CastPureFp16Inputs(const std::string& op_type,
                                 const NameTensorMap& ins);

} // namespace egr
Loading

0 comments on commit a9608f6

Please sign in to comment.