diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc
new file mode 100644
index 0000000000000..ae5d02c1e943f
--- /dev/null
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc
@@ -0,0 +1,324 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h"
+
+#include
+#include
+#include
+#include
+#include
+
+// Eager
+#include "paddle/fluid/eager/api/all.h"
+#include "paddle/fluid/eager/autograd_meta.h"
+#include "paddle/fluid/eager/backward.h"
+#include "paddle/fluid/eager/tests/test_utils.h"
+#include "paddle/fluid/eager/utils.h"
+
+// Eager Generated
+#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h"
+
+// Fluid
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/imperative/basic_engine.h"
+#include "paddle/fluid/imperative/tracer.h"
+#include "paddle/fluid/memory/memcpy.h"
+
+#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h"
+
+#include "paddle/pten/core/kernel_registry.h"
+
+static size_t max_num_benchmark_runs = 5000;
+
+namespace egr {
+
+/* --------------------- */
+/* ---- Eager Scale ---- */
+/* --------------------- */
+void benchmark_eager_scale(const EagerTensor& tensor, bool accuracy_check) {
+  EagerTensor input_tensor = tensor;
+  float scale = 2.0;
+  float bias = 3.0;
+
+  size_t max_num_runs = accuracy_check ? 10 : max_num_benchmark_runs;
+  for (size_t i = 0; i < max_num_runs; i++) {
+    input_tensor =
+        egr::scale(input_tensor, scale, bias, true /*bias_after_scale*/,
+                   true /*trace_backward*/);
+  }
+
+  std::vector<EagerTensor> target_tensors = {input_tensor};
+  RunBackward(target_tensors, {});
+
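+  // Where the expected values below come from (assuming the caller
+  // initializes the input tensor to 5.0): with scale = 2 and bias = 3 each
+  // run computes x_{i+1} = 2 * x_i + 3, so after 10 runs
+  //   x_10 = 2^10 * 5 + 3 * (2^10 - 1) = 5120 + 3069 = 8189,
+  // and d(x_10)/d(x_0) = 2^10 = 1024.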
+  if (accuracy_check) {
+    // Examine Forward Output (w.r.t max_num_runs = 10)
+    CompareTensorWithValue(input_tensor, 8189.0);
+    // Examine Backward Grad (w.r.t max_num_runs = 10)
+    CompareGradTensorWithValue(tensor, 1024.0);
+  }
+}
+
+/* ----------------------------------- */
+/* ---- Eager Intermediate Matmul ---- */
+/* ----------------------------------- */
+void benchmark_eager_intermediate_matmul(const EagerTensor& X,
+                                         const EagerTensor& Y,
+                                         bool accuracy_check) {
+  EagerTensor input_tensor0 = X;
+
+  size_t max_num_runs = accuracy_check ? 2 : max_num_benchmark_runs;
+  for (size_t i = 0; i < max_num_runs; i++) {
+    input_tensor0 = matmul_v2_dygraph_function(
+        input_tensor0, Y, {{"trans_x", false}, {"trans_y", false}});
+  }
+
+  std::vector<EagerTensor> target_tensors = {input_tensor0};
+  RunBackward(target_tensors, {});
+
+  if (accuracy_check) {
+    // Examine Forward Output (w.r.t max_num_runs = 2)
+    CompareVariableWithValue(input_tensor0, 16);
+    // Examine Backward Grad (w.r.t max_num_runs = 2)
+    CompareGradVariableWithValue(X, 16);
+    CompareGradVariableWithValue(Y, 16);
+  }
+}
+
+/* -------------------------------- */
+/* ---- Eager Intermediate MLP ---- */
+/* -------------------------------- */
+void benchmark_eager_intermediate_mlp(const EagerTensor& X,
+                                      const std::vector<EagerTensor>& Ws,
+                                      const std::vector<EagerTensor>& Bs,
+                                      bool accuracy_check) {
+  EagerTensor input0 = X;
+
+  for (size_t i = 0; i < MLP_NUM_LINEAR; i++) {
+    EagerTensor Out = matmul_v2_dygraph_function(
+        input0, Ws[i], {{"trans_x", false}, {"trans_y", false}});
+
+    input0 = elementwise_add_dygraph_function(Out, Bs[i], {});
+  }
+
+  EagerTensor Out = reduce_sum_dygraph_function(input0, {{"reduce_all", true}});
+
+  std::vector<EagerTensor> target_tensors = {Out};
+  RunBackward(target_tensors, {});
+
+  if (accuracy_check) {
+    std::unordered_map<std::string, float> result =
+        compute_mlp_expected_results();
+
+    // Examine Forward Output
+    CompareVariableWithValue(Out, result["Out"]);
+
+    // Examine Backward Grad
+    CompareGradVariableWithValue(X, result["GradX"]);
+    CompareGradVariableWithValue(Ws[0], result["GradW"]);
+  }
+}
+
+}  // namespace egr
+
+namespace paddle {
+namespace imperative {
+
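+// The two helpers below mirror the eager-side Compare*WithValue utilities:
+// they read the first element of a VarBase's tensor (or of its gradient) and
+// compare it against an expected value. On CUDAPlace the data is first copied
+// back to host memory with paddle::memory::Copy before the comparison.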
+static void FluidCheckTensorValue(
+    const std::shared_ptr<imperative::VarBase>& X,
+    const paddle::platform::Place& place, float value) {
+  auto* tensor = X->MutableVar()->GetMutable<framework::LoDTensor>();
+  float* t_ptr = tensor->mutable_data<float>(place);
+  std::vector<float> host_data(tensor->numel());
+  if (place == paddle::platform::CUDAPlace()) {
+    paddle::platform::DeviceContextPool& pool =
+        paddle::platform::DeviceContextPool::Instance();
+    auto* dev_ctx =
+        dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
+    auto stream = dev_ctx->stream();
+
+    paddle::memory::Copy(paddle::platform::CPUPlace(), host_data.data(),
+                         paddle::platform::CUDAPlace(), t_ptr,
+                         sizeof(float) * tensor->numel(), stream);
+    t_ptr = host_data.data();
+  }
+  VLOG(6) << "Tensor Value: " << t_ptr[0] << ", Expected Value: " << value;
+  PADDLE_ENFORCE(
+      t_ptr[0] == value,
+      paddle::platform::errors::Fatal(
+          "Detected numerical Error, Expected %f but got %f", value, t_ptr[0]));
+}
+
+static void FluidCheckGradTensorValue(
+    const std::shared_ptr<imperative::VarBase>& X,
+    const paddle::platform::Place& place, float value) {
+  auto* grad_tensor = X->MutableGradVar()->GetMutable<framework::LoDTensor>();
+  float* g_ptr = grad_tensor->mutable_data<float>(place);
+  std::vector<float> g_host_data(grad_tensor->numel());
+  if (place == paddle::platform::CUDAPlace()) {
+    paddle::platform::DeviceContextPool& pool =
+        paddle::platform::DeviceContextPool::Instance();
+    auto* dev_ctx =
+        dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
+    auto stream = dev_ctx->stream();
+
+    paddle::memory::Copy(paddle::platform::CPUPlace(), g_host_data.data(),
+                         paddle::platform::CUDAPlace(), g_ptr,
+                         sizeof(float) * grad_tensor->numel(), stream);
+    g_ptr = g_host_data.data();
+  }
+  VLOG(6) << "Tensor Value: " << g_ptr[0] << ", Expected Value: " << value;
+  PADDLE_ENFORCE(
+      g_ptr[0] == value,
+      paddle::platform::errors::Fatal(
+          "Detected numerical Error, Expected %f but got %f", value, g_ptr[0]));
+}
+
+/* --------------------- */
+/* ---- Fluid Scale ---- */
+/* --------------------- */
+// TODO(jiabin): Change this and remove nolint
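+// benchmark_fluid_scale runs the same recurrence as benchmark_eager_scale
+// above (x_{i+1} = 2 * x_i + 3), so the accuracy-check expectations are the
+// same: 8189.0 for the output and 1024.0 for the input gradient after 10 runs.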
result["Out"]); + FluidCheckGradTensorValue(X, place, result["GradX"]); + FluidCheckGradTensorValue(Ws[0], place, result["GradW"]); + } +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h new file mode 100644 index 0000000000000..70ecf2af8e4c3 --- /dev/null +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h @@ -0,0 +1,95 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/eager/eager_tensor.h" +#include "paddle/fluid/imperative/layer.h" +#include "paddle/pten/api/all.h" + +/* MLP Configurations */ +// Out1 = X[M, N] x W[N, K] + B[K] +// ... x MLP_NUM_LINEAR +// Out = ReduceSum(OutN) +#define MLP_M 4 +#define MLP_N 16 +#define MLP_K MLP_N +#define MLP_X_VAL 1.0 +#define MLP_W_VAL 2.0 +#define MLP_B_VAL 3.0 +#define MLP_NUM_LINEAR 1000 + +namespace egr { + +inline std::unordered_map compute_mlp_expected_results() { + float Out = MLP_X_VAL; + for (size_t i = 0; i < MLP_NUM_LINEAR; i++) { + Out = Out * MLP_W_VAL * MLP_N + MLP_B_VAL; + } + Out = Out * MLP_M * MLP_N; + + float GradX = 1.0 * pow((MLP_W_VAL * MLP_N), MLP_NUM_LINEAR); + float GradW0 = + 1.0 * pow((MLP_W_VAL * MLP_N), (MLP_NUM_LINEAR - 1)) * MLP_X_VAL * MLP_M; + return {{"Out", Out}, {"GradX", GradX}, {"GradW", GradW0}}; +} + +/* ---- Eager Scale ---- */ +void benchmark_eager_scale(const EagerTensor& tensor, + bool accuracy_check = false); + +/* ---- Eager MatMul ---- */ +/* +void benchmark_eager_matmul(const EagerTensor& X, const EagerTensor& Y, + bool accuracy_check = false); +void benchmark_eager_mlp(const EagerTensor& X, + const std::vector& Ws, + const std::vector& Bs, + bool accuracy_check = false); +*/ +void benchmark_eager_intermediate_matmul(const EagerTensor& X, + const EagerTensor& Y, + bool accuracy_check = false); + +void benchmark_eager_intermediate_mlp(const EagerTensor& X, + const std::vector& Ws, + const std::vector& Bs, + bool accuracy_check = false); + +} // namespace egr + +namespace paddle { +namespace imperative { +/* ---- Fluid Scale ---- */ +// TODO(jiabin): Change this and remove nolint +void benchmark_fluid_scale( + const std::shared_ptr& X, // NOLINT + const paddle::platform::Place& place, bool accuracy_check = false); + +/* ---- Fluid MatMul ---- */ +void benchmark_fluid_matmul( + const std::shared_ptr& X, + const std::shared_ptr& Y, // NOLINT + const paddle::platform::Place& place, bool accuracy_check = false); + +/* ---- Fluid MLP ---- */ +void benchmark_fluid_mlp( + const std::shared_ptr& X, + const std::vector>& Ws, + const std::vector>& Bs, + const paddle::platform::Place& place, bool accuracy_check = false); + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt index 7c1e8ba023775..28cffca920425 100644 --- 
+inline std::unordered_map<std::string, float> compute_mlp_expected_results() {
+  float Out = MLP_X_VAL;
+  for (size_t i = 0; i < MLP_NUM_LINEAR; i++) {
+    Out = Out * MLP_W_VAL * MLP_N + MLP_B_VAL;
+  }
+  Out = Out * MLP_M * MLP_N;
+
+  float GradX = 1.0 * pow((MLP_W_VAL * MLP_N), MLP_NUM_LINEAR);
+  float GradW0 =
+      1.0 * pow((MLP_W_VAL * MLP_N), (MLP_NUM_LINEAR - 1)) * MLP_X_VAL * MLP_M;
+  return {{"Out", Out}, {"GradX", GradX}, {"GradW", GradW0}};
+}
+
+/* ---- Eager Scale ---- */
+void benchmark_eager_scale(const EagerTensor& tensor,
+                           bool accuracy_check = false);
+
+/* ---- Eager MatMul ---- */
+/*
+void benchmark_eager_matmul(const EagerTensor& X, const EagerTensor& Y,
+                            bool accuracy_check = false);
+void benchmark_eager_mlp(const EagerTensor& X,
+                         const std::vector<EagerTensor>& Ws,
+                         const std::vector<EagerTensor>& Bs,
+                         bool accuracy_check = false);
+*/
+void benchmark_eager_intermediate_matmul(const EagerTensor& X,
+                                         const EagerTensor& Y,
+                                         bool accuracy_check = false);
+
+void benchmark_eager_intermediate_mlp(const EagerTensor& X,
+                                      const std::vector<EagerTensor>& Ws,
+                                      const std::vector<EagerTensor>& Bs,
+                                      bool accuracy_check = false);
+
+}  // namespace egr
+
+namespace paddle {
+namespace imperative {
+
+/* ---- Fluid Scale ---- */
+// TODO(jiabin): Change this and remove nolint
+void benchmark_fluid_scale(
+    const std::shared_ptr<imperative::VarBase>& X,  // NOLINT
+    const paddle::platform::Place& place, bool accuracy_check = false);
+
+/* ---- Fluid MatMul ---- */
+void benchmark_fluid_matmul(
+    const std::shared_ptr<imperative::VarBase>& X,
+    const std::shared_ptr<imperative::VarBase>& Y,  // NOLINT
+    const paddle::platform::Place& place, bool accuracy_check = false);
+
+/* ---- Fluid MLP ---- */
+void benchmark_fluid_mlp(
+    const std::shared_ptr<imperative::VarBase>& X,
+    const std::vector<std::shared_ptr<imperative::VarBase>>& Ws,
+    const std::vector<std::shared_ptr<imperative::VarBase>>& Bs,
+    const paddle::platform::Place& place, bool accuracy_check = false);
+
+}  // namespace imperative
+}  // namespace paddle
diff --git a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt
index 7c1e8ba023775..28cffca920425 100644
--- a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt
+++ b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt
@@ -4,3 +4,4 @@ cc_test(test_egr_task_forward_autograd SRCS forward_autograd_test.cc DEPS ${eage
 cc_test(test_egr_task_backward SRCS backward_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node)
 cc_test(test_egr_task_hook SRCS hook_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node)
 cc_test(test_egr_task_cross_batch SRCS cross_batch_accumulation_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node)
+cc_test(test_egr_task_fwd_bwd_joint SRCS fwd_bwd_joint_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node)
diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc
new file mode 100644
index 0000000000000..751e95487659c
--- /dev/null
+++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc
@@ -0,0 +1,417 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+#include "paddle/fluid/eager/accumulation/accumulation_node.h"
+#include "paddle/fluid/eager/api/all.h"
+#include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h"
+#include "paddle/fluid/eager/autograd_meta.h"
+#include "paddle/fluid/eager/backward.h"
+#include "paddle/fluid/eager/grad_node_info.h"
+
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/tensor_meta.h"
+
+#include "paddle/fluid/eager/tests/test_utils.h"
+
+// TODO(jiabin): remove nolint here!!!
+using namespace egr;  // NOLINT
+
+namespace eager_test {
+
+egr::EagerTensor hook_function(const egr::EagerTensor& t) {
+  auto t_dense = std::dynamic_pointer_cast<pten::DenseTensor>(t.impl());
+
+  auto ret_meta = pten::DenseTensorMeta(t_dense->dtype(), t_dense->dims(),
+                                        t_dense->layout());
+  auto place = t_dense->place();
+  size_t bytes_size =
+      paddle::framework::product(t_dense->dims()) * SizeOf(t_dense->dtype());
+  auto ret_dense = std::make_shared<pten::DenseTensor>(
+      pten::make_intrusive<paddle::experimental::SharedStorage>(
+          paddle::memory::Alloc(place, bytes_size), 0),
+      std::move(ret_meta));
+
+  float* t_ptr = t_dense->mutable_data<float>();
+  float* ret_ptr = ret_dense->mutable_data<float>();
+  for (int i = 0; i < ret_dense->numel(); i++) {
+    ret_ptr[i] = t_ptr[i] + 5.0;
+  }
+
+  auto ret_impl = std::dynamic_pointer_cast<pten::TensorBase>(ret_dense);
+  egr::EagerTensor ret = egr::EagerTensor();
+  ret.set_impl(ret_impl);
+
+  return ret;
+}
+
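+// hook_function is registered as a gradient hook in the GradientHook test
+// below: it returns a copy of its input with 5.0 added to every element, so
+// each registered hook shifts the gradient flowing through that tensor by +5.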
+TEST(FwdBwdJoint, SingleNode) {
+  InitEnv(paddle::platform::CPUPlace());
+
+  // 1. Prepare Input
+  paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32});
+  egr::EagerTensor tensor = CreateTensorWithValue(
+      ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
+      pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/);
+  RetainGradForTensor(tensor);
+
+  // 3. Run Forward
+  float scale = 2.0;
+  float bias = 3.0;
+  egr::EagerTensor out = egr::scale(
+      tensor, scale, bias, true /*bias_after_scale*/, true /*trace_backward*/);
+
+  // Examine Forward Output
+  CompareTensorWithValue(out, 13.0);
+
+  std::vector<egr::EagerTensor> outs = {out};
+  // 4. Run Backward
+  RunBackward(outs, {});
+
+  VLOG(7) << "Target Grad is: "
+          << std::static_pointer_cast<pten::DenseTensor>(
+                 EagerUtils::unsafe_autograd_meta(tensor)->Grad().impl())
+                 ->data<float>()[0];
+  // Examine Backward Grad
+  CompareGradTensorWithValue(tensor, 2.0);
+}
+
+/*
+  inp
+   |
+ Node0
+   |
+ Node1
+   |
+  out
+*/
+TEST(FwdBwdJoint, LinearNodes) {
+  InitEnv(paddle::platform::CPUPlace());
+
+  // 1. Prepare Input
+  paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32});
+  egr::EagerTensor tensor = CreateTensorWithValue(
+      ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
+      pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/);
+  RetainGradForTensor(tensor);
+
+  // 3. Run Forward
+  // Run Forward Node 0
+  float scale0 = 2.0;
+  float bias0 = 3.0;
+  egr::EagerTensor out0 =
+      egr::scale(tensor, scale0, bias0, true /*bias_after_scale*/,
+                 true /*trace_backward*/);
+
+  // Run Forward Node 1
+  float scale1 = 5.0;
+  float bias1 = 10.0;
+  egr::EagerTensor out1 = egr::scale(
+      out0, scale1, bias1, true /*bias_after_scale*/, true /*trace_backward*/);
+
+  // Examine Forward Output 0
+  CompareTensorWithValue(out0, 13.0);
+
+  // Examine Forward Output 1
+  CompareTensorWithValue(out1, 75.0);
+
+  std::vector<egr::EagerTensor> outs = {out1};
+  // 4. Run Backward
+  RunBackward(outs, {});
+
+  // Examine Backward Grad
+  CompareGradTensorWithValue(tensor, 10.0);
+}
+
+/*
+       inp
+        |
+      Node0
+    ____|____
+    |       |
+  Node1   Node2
+    |       |
+   out1    out2
+*/
+TEST(FwdBwdJoint, BranchedNodes) {
+  InitEnv(paddle::platform::CPUPlace());
+
+  // 1. Prepare Input
+  paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32});
+  egr::EagerTensor tensor = CreateTensorWithValue(
+      ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
+      pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/);
+  RetainGradForTensor(tensor);
+
+  // 3. Run Forward
+  // Run Forward Node 0
+  float scale0 = 2.0;
+  float bias0 = 3.0;
+  egr::EagerTensor out0 =
+      egr::scale(tensor, scale0, bias0, true /*bias_after_scale*/,
+                 true /*trace_backward*/);
+
+  // Run Forward Node 1
+  float scale1 = 5.0;
+  float bias1 = 10.0;
+  egr::EagerTensor out1 = egr::scale(
+      out0, scale1, bias1, true /*bias_after_scale*/, true /*trace_backward*/);
+
+  // Run Forward Node 2
+  float scale2 = 10.0;
+  float bias2 = 20.0;
+  egr::EagerTensor out2 = egr::scale(
+      out0, scale2, bias2, true /*bias_after_scale*/, true /*trace_backward*/);
+
+  // Examine Forward Output 0
+  CompareTensorWithValue(out0, 13.0);
+
+  // Examine Forward Output 1
+  CompareTensorWithValue(out1, 75.0);
+
+  // Examine Forward Output 2
+  {
+    auto dense_out = std::dynamic_pointer_cast<pten::DenseTensor>(out2.impl());
+    float* ptr = dense_out->mutable_data<float>();
+    for (int i = 0; i < 20; i++) {
+      PADDLE_ENFORCE(ptr[i] == 150.0,
+                     paddle::platform::errors::Fatal(
+                         "Detected numerical Error, Expected %f but got %f",
+                         150.0, ptr[i]));
+    }
+  }
+
+  // 4. Run Backward
+  std::vector<egr::EagerTensor> outs = {out1, out2};
+  RunBackward(outs, {});
+
+  // Examine Backward Grad
+  CompareGradTensorWithValue(tensor, 30.0);
+}
+
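+// The checks in the GradientHook test below are consistent with RunBackward
+// seeding out1 and out2 with gradient 1.0 (the retained values) and each
+// registered hook adding 5.0 before the gradient propagates further: out0
+// then accumulates (1 + 5) * 5 + (1 + 5) * 10 = 90, and the leaf receives
+// (90 + 5) * 2 = 190.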
+/*
+       inp
+        |
+      Node0
+    ____|____
+    |       |
+  Node1   Node2
+    |       |
+   out1    out2
+*/
+TEST(FwdBwdJoint, GradientHook) {
+  InitEnv(paddle::platform::CPUPlace());
+
+  // 1. Prepare Input
+  paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32});
+  egr::EagerTensor tensor = CreateTensorWithValue(
+      ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
+      pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/);
+  RetainGradForTensor(tensor);
+
+  std::function<egr::EagerTensor(const egr::EagerTensor&)> hook =
+      &hook_function;
+
+  // 3. Run Forward
+  // Run Forward Node 0
+  float scale0 = 2.0;
+  float bias0 = 3.0;
+  egr::EagerTensor out0 =
+      egr::scale(tensor, scale0, bias0, true /*bias_after_scale*/,
+                 true /*trace_backward*/);
+  RetainGradForTensor(out0);                  // hook: +5
+  RegisterGradientHookForTensor(out0, hook);  // hook: +5
+
+  // Run Forward Node 1
+  float scale1 = 5.0;
+  float bias1 = 10.0;
+  egr::EagerTensor out1 = egr::scale(
+      out0, scale1, bias1, true /*bias_after_scale*/, true /*trace_backward*/);
+  RetainGradForTensor(out1);                  // hook: +5
+  RegisterGradientHookForTensor(out1, hook);  // hook: +5
+
+  // Run Forward Node 2
+  float scale2 = 10.0;
+  float bias2 = 20.0;
+  egr::EagerTensor out2 = egr::scale(
+      out0, scale2, bias2, true /*bias_after_scale*/, true /*trace_backward*/);
+  RetainGradForTensor(out2);                  // hook: +5
+  RegisterGradientHookForTensor(out2, hook);  // hook: +5
+
+  // 4. Run Backward
+  std::vector<egr::EagerTensor> outs = {out1, out2};
+  RunBackward(outs, {});
+
+  // Examine Backward Grad
+  // leaf grad
+  CompareGradTensorWithValue(tensor, 190.0);
+
+  // out0 grad
+  CompareGradTensorWithValue(out0, 90.0);
+
+  // out1 grad
+  CompareGradTensorWithValue(out1, 1.0);
+
+  // out2 grad
+  CompareGradTensorWithValue(out2, 1.0);
+}
+
+/*
+       inp
+        |
+      Node0
+    ____|____
+    |       |
+  Node1   Node2
+    |       |
+   out1    out2
+*/
+TEST(FwdBwdJoint, CrossBatchAccumulation) {
+  InitEnv(paddle::platform::CPUPlace());
+
+  // 1. Prepare Input
+  paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32});
+  egr::EagerTensor tensor = CreateTensorWithValue(
+      ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
+      pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/);
+  RetainGradForTensor(tensor);
+
+  // 3. Run Forward
+  // Run Forward Node 0
+  float scale0 = 2.0;
+  float bias0 = 3.0;
+  egr::EagerTensor out0 =
+      egr::scale(tensor, scale0, bias0, true /*bias_after_scale*/,
+                 true /*trace_backward*/);
+
+  // Run Forward Node 1
+  float scale1 = 5.0;
+  float bias1 = 10.0;
+  egr::EagerTensor out1 = egr::scale(
+      out0, scale1, bias1, true /*bias_after_scale*/, true /*trace_backward*/);
+
+  // Run Forward Node 2
+  float scale2 = 10.0;
+  float bias2 = 20.0;
+  egr::EagerTensor out2 = egr::scale(
+      out0, scale2, bias2, true /*bias_after_scale*/, true /*trace_backward*/);
+
+  // 4. Run Backward
+  std::vector<egr::EagerTensor> outs = {out1, out2};
+  RunBackward(outs, {});
+
+  // Examine Backward Grad
+  CompareGradTensorWithValue(tensor, 30.0);
+
+  // Cross Batch Accumulation
+  RunBackward(outs, {});
+
+  // Examine Backward Grad
+  CompareGradTensorWithValue(tensor, 60.0);
+}
+
+/* ---------------------------------------------------- */
+/* ---------------------- CUDA Tests ------------------ */
+/* ---------------------------------------------------- */
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+TEST(FwdBwdJoint, SingleNodeCUDA) {
+  InitEnv(paddle::platform::CUDAPlace());
+
+  // 1. Prepare Input
+  paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32});
+  egr::EagerTensor tensor = CreateTensorWithValue(
+      ddim, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32,
+      pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/);
+  RetainGradForTensor(tensor);
+
+  // 3. Run Forward
+  float scale = 2.0;
+  float bias = 3.0;
+  egr::EagerTensor out = egr::scale(
+      tensor, scale, bias, true /*bias_after_scale*/, true /*trace_backward*/);
+
+  // Examine Forward Output
+  CompareTensorWithValue(out, 13.0);
+
+  std::vector<egr::EagerTensor> outs = {out};
+  // 4. Run Backward
+  RunBackward(outs, {});
+
+  // Examine Backward Grad
+  CompareGradTensorWithValue(tensor, 2.0);
+}
+
+/*
+       inp
+        |
+      Node0
+    ____|____
+    |       |
+  Node1   Node2
+    |       |
+   out1    out2
+*/
+TEST(FwdBwdJoint, BranchedNodesCUDA) {
+  InitEnv(paddle::platform::CUDAPlace());
+
+  // 1. Prepare Input
+  paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32});
+  egr::EagerTensor tensor = CreateTensorWithValue(
+      ddim, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32,
+      pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/);
+  RetainGradForTensor(tensor);
+
+  // 3. Run Forward
+  // Run Forward Node 0
+  float scale0 = 2.0;
+  float bias0 = 3.0;
+  egr::EagerTensor out0 =
+      egr::scale(tensor, scale0, bias0, true /*bias_after_scale*/,
+                 true /*trace_backward*/);
+
+  // Run Forward Node 1
+  float scale1 = 5.0;
+  float bias1 = 10.0;
+  egr::EagerTensor out1 = egr::scale(
+      out0, scale1, bias1, true /*bias_after_scale*/, true /*trace_backward*/);
+
+  // Run Forward Node 2
+  float scale2 = 10.0;
+  float bias2 = 20.0;
+  egr::EagerTensor out2 = egr::scale(
+      out0, scale2, bias2, true /*bias_after_scale*/, true /*trace_backward*/);
+
+  // Examine Forward Output 0
+  CompareTensorWithValue(out0, 13.0);
+  // Examine Forward Output 1
+  CompareTensorWithValue(out1, 75.0);
+  // Examine Forward Output 2
+  CompareTensorWithValue(out2, 150.0);
+
+  // TODO(jiabin): fix this with add functor
+  // 4. Run Backward
+  std::vector<egr::EagerTensor> outs = {out1, out2};
+  RunBackward(outs, {});
+
+  // Examine Backward Grad
+  CompareGradTensorWithValue(tensor, 30.0);
+}
+#endif
+
+}  // namespace eager_test