From a3dd973e301ca99611dd3567605d278319bf0e0e Mon Sep 17 00:00:00 2001
From: Wang Huan
Date: Thu, 22 Aug 2024 06:33:04 +0000
Subject: [PATCH] fix 3 cases

---
 test/amp/test_amp_master_grad_static.py       | 108 +++++----
 test/amp/test_model_cast_to_bf16.py           | 204 ++++++++--------
 .../legacy_test/test_static_save_load_bf16.py | 229 +++++++++---------
 3 files changed, 281 insertions(+), 260 deletions(-)

diff --git a/test/amp/test_amp_master_grad_static.py b/test/amp/test_amp_master_grad_static.py
index 7f137517dcb425..4264c78f474f82 100644
--- a/test/amp/test_amp_master_grad_static.py
+++ b/test/amp/test_amp_master_grad_static.py
@@ -87,9 +87,10 @@ def amp_fp16_o2(self, use_master_grad):
         )
 
     def test_amp_fp16_o2(self):
-        use_master_grad_list = [False, True]
-        for master_grad in use_master_grad_list:
-            self.amp_fp16_o2(master_grad)
+        with paddle.pir_utils.OldIrGuard():
+            use_master_grad_list = [False, True]
+            for master_grad in use_master_grad_list:
+                self.amp_fp16_o2(master_grad)
 
 
 class TestMasterGradAccuracy(AmpTestBase):
@@ -155,53 +156,60 @@ def _run(
             )
             return losses
 
-        dtype = "float16"
-        max_iters = 25
-        x_f32, x_f16 = self._generate_feed_x(dtype)
-        if paddle.is_compiled_with_cuda():
-            place = paddle.CUDAPlace(0)
-        elif paddle.device.is_compiled_with_xpu():
-            place = paddle.device.XPUPlace(0)
-        else:
-            raise ValueError("Only support CUDA or XPU Place.")
-        exe = paddle.static.Executor(place)
-        use_grad_clip_list = [False, True]
-        for use_grad_clip in use_grad_clip_list:
-            losses_o1 = _run(
-                place, exe, x_f32, max_iters, 'O1', use_grad_clip, dtype=dtype
-            )
-            losses_o2_no_master_grad = _run(
-                place,
-                exe,
-                x_f16,
-                max_iters,
-                'O2',
-                use_grad_clip,
-                dtype=dtype,
-                use_master_grad=False,
-            )
-            losses_o2_master_grad = _run(
-                place,
-                exe,
-                x_f16,
-                max_iters,
-                'O2',
-                use_grad_clip,
-                dtype=dtype,
-                use_master_grad=True,
-            )
-
-            self.assertNotEqual(
-                losses_o1,
-                losses_o2_no_master_grad,
-                f"dtype: {dtype}, loss of o1 and o2-wo-master_grad should not be equal, but received loss o1: {losses_o1}, loss o2: {losses_o2_no_master_grad}",
-            )
-
-            self.assertEqual(
-                losses_o1,
-                losses_o2_master_grad,
-                f"dtype: {dtype}, loss of o1 and o2-w-master_grad should be equal, but received loss o1: {losses_o1}, loss o2: {losses_o2_master_grad}",
-            )
+        with paddle.pir_utils.OldIrGuard():
+            dtype = "float16"
+            max_iters = 25
+            x_f32, x_f16 = self._generate_feed_x(dtype)
+            if paddle.is_compiled_with_cuda():
+                place = paddle.CUDAPlace(0)
+            elif paddle.device.is_compiled_with_xpu():
+                place = paddle.device.XPUPlace(0)
+            else:
+                raise ValueError("Only support CUDA or XPU Place.")
+            exe = paddle.static.Executor(place)
+            use_grad_clip_list = [False, True]
+            for use_grad_clip in use_grad_clip_list:
+                losses_o1 = _run(
+                    place,
+                    exe,
+                    x_f32,
+                    max_iters,
+                    'O1',
+                    use_grad_clip,
+                    dtype=dtype,
+                )
+                losses_o2_no_master_grad = _run(
+                    place,
+                    exe,
+                    x_f16,
+                    max_iters,
+                    'O2',
+                    use_grad_clip,
+                    dtype=dtype,
+                    use_master_grad=False,
+                )
+                losses_o2_master_grad = _run(
+                    place,
+                    exe,
+                    x_f16,
+                    max_iters,
+                    'O2',
+                    use_grad_clip,
+                    dtype=dtype,
+                    use_master_grad=True,
+                )
+
+                self.assertNotEqual(
+                    losses_o1,
+                    losses_o2_no_master_grad,
+                    f"dtype: {dtype}, loss of o1 and o2-wo-master_grad should not be equal, but received loss o1: {losses_o1}, loss o2: {losses_o2_no_master_grad}",
+                )
+
+                self.assertEqual(
+                    losses_o1,
+                    losses_o2_master_grad,
+                    f"dtype: {dtype}, loss of o1 and o2-w-master_grad should be equal, but received loss o1: {losses_o1}, loss o2: {losses_o2_master_grad}",
+                )
 
 
 if __name__ == '__main__':
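All three files apply the same change: each legacy static-graph AMP test body is wrapped in `paddle.pir_utils.OldIrGuard()` so it keeps building and running programs under the old (pre-PIR) IR. A minimal sketch of the pattern follows; the test class and the toy program inside it are illustrative stand-ins, not code from this patch:

```python
import unittest

import paddle


class TestLegacyIrGuardPattern(unittest.TestCase):
    # Illustrative stand-in for the real AMP test bodies changed above.
    def test_build_under_old_ir(self):
        paddle.enable_static()
        # Everything constructed inside OldIrGuard uses the legacy
        # static-graph IR, which is what the old-IR AMP passes in these
        # tests expect.
        with paddle.pir_utils.OldIrGuard():
            main = paddle.static.Program()
            startup = paddle.static.Program()
            with paddle.static.program_guard(main, startup):
                x = paddle.static.data(name="x", shape=[-1, 16], dtype="float32")
                y = paddle.nn.functional.relu(x)
            # The guarded block produced a legacy Program with at least one op.
            self.assertGreater(len(main.global_block().ops), 0)


if __name__ == '__main__':
    unittest.main()
```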
diff --git a/test/amp/test_model_cast_to_bf16.py b/test/amp/test_model_cast_to_bf16.py
index d41202f4aafe2f..dd08e9dcc9d764 100644
--- a/test/amp/test_model_cast_to_bf16.py
+++ b/test/amp/test_model_cast_to_bf16.py
@@ -172,29 +172,31 @@ def _graph_common(self, _amp_fun, startup_prog=None):
         )
 
     def test_graph_rewrite(self):
-        self._graph_common(
-            lambda prog: amp.bf16.rewrite_program_bf16(
-                prog,
-                amp.bf16.AutoMixedPrecisionListsBF16(
-                    custom_bf16_list={'elementwise_add'},
-                    custom_fp32_varnames={'elementwise_add_0.tmp_0'},
-                ),
+        with paddle.pir_utils.OldIrGuard():
+            self._graph_common(
+                lambda prog: amp.bf16.rewrite_program_bf16(
+                    prog,
+                    amp.bf16.AutoMixedPrecisionListsBF16(
+                        custom_bf16_list={'elementwise_add'},
+                        custom_fp32_varnames={'elementwise_add_0.tmp_0'},
+                    ),
+                )
             )
-        )
 
     def test_graph_cast(self):
-        self._graph_common(
-            lambda prog, startup_prog: amp.bf16.cast_model_to_bf16(
-                prog,
-                startup_prog,
-                amp.bf16.AutoMixedPrecisionListsBF16(
-                    custom_bf16_list={'elementwise_add'},
-                    custom_fp32_list={'elementwise_mul'},
+        with paddle.pir_utils.OldIrGuard():
+            self._graph_common(
+                lambda prog, startup_prog: amp.bf16.cast_model_to_bf16(
+                    prog,
+                    startup_prog,
+                    amp.bf16.AutoMixedPrecisionListsBF16(
+                        custom_bf16_list={'elementwise_add'},
+                        custom_fp32_list={'elementwise_mul'},
+                    ),
+                    use_bf16_guard=True,
                 ),
-                use_bf16_guard=True,
-            ),
-            startup_prog=base.default_startup_program(),
-        )
+                startup_prog=base.default_startup_program(),
+            )
 
 
 @unittest.skipIf(
@@ -221,48 +223,50 @@ def _check_optimizer(self, program, expected_num_mp):
         )
 
     def test_amp_bf16_o1(self):
-        main_program, startup_program, _, _, _ = build_embedding_model(
-            True, "bfloat16", "O1"
-        )
-        self.assertEqual(main_program.num_blocks, 1)
-        self._check_optimizer(main_program, 0)
-
-        amp.debugging.collect_operator_stats(main_program)
-        op_stats_list = amp.debugging._get_op_stats_list(main_program)
-        expected_bf16_calls = {
-            "matmul_v2": 1,
-            "elementwise_add": 1,
-            "dropout": 1,
-            "lookup_table_v2": 0,
-            "squared_l2_norm": 0,
-            "adamw": 0,
-        }
-        self._check_op_calls(op_stats_list[0], expected_bf16_calls)
+        with paddle.pir_utils.OldIrGuard():
+            main_program, startup_program, _, _, _ = build_embedding_model(
+                True, "bfloat16", "O1"
+            )
+            self.assertEqual(main_program.num_blocks, 1)
+            self._check_optimizer(main_program, 0)
+
+            amp.debugging.collect_operator_stats(main_program)
+            op_stats_list = amp.debugging._get_op_stats_list(main_program)
+            expected_bf16_calls = {
+                "matmul_v2": 1,
+                "elementwise_add": 1,
+                "dropout": 1,
+                "lookup_table_v2": 0,
+                "squared_l2_norm": 0,
+                "adamw": 0,
+            }
+            self._check_op_calls(op_stats_list[0], expected_bf16_calls)
 
     def test_amp_bf16_o2(self):
-        main_program, startup_program, _, _, _ = build_embedding_model(
-            True, "bfloat16", "O2"
-        )
-        self.assertEqual(main_program.num_blocks, 1)
-
-        amp.debugging.collect_operator_stats(main_program)
-        op_stats_list = amp.debugging._get_op_stats_list(main_program)
-        expected_fp32_calls = {"lookup_table_v2": 1}
-        expected_bf16_calls = {
-            "matmul_v2": 1,
-            "elementwise_add": 1,
-            "dropout": 1,
-            "lookup_table_v2": 0,
-            "squared_l2_norm": 3,
-            "adamw": 3,
-        }
-        self._check_optimizer(
-            main_program,
-            expected_bf16_calls["matmul_v2"]
-            + expected_bf16_calls["elementwise_add"]
-            + expected_fp32_calls["lookup_table_v2"],
-        )
-        self._check_op_calls(op_stats_list[0], expected_bf16_calls)
+        with paddle.pir_utils.OldIrGuard():
+            main_program, startup_program, _, _, _ = build_embedding_model(
+                True, "bfloat16", "O2"
+            )
+            self.assertEqual(main_program.num_blocks, 1)
+
+            amp.debugging.collect_operator_stats(main_program)
+            op_stats_list = amp.debugging._get_op_stats_list(main_program)
+            expected_fp32_calls = {"lookup_table_v2": 1}
+            expected_bf16_calls = {
+                "matmul_v2": 1,
+                "elementwise_add": 1,
+                "dropout": 1,
+                "lookup_table_v2": 0,
+                "squared_l2_norm": 3,
+                "adamw": 3,
+            }
+            self._check_optimizer(
+                main_program,
+                expected_bf16_calls["matmul_v2"]
+                + expected_bf16_calls["elementwise_add"]
+                + expected_fp32_calls["lookup_table_v2"],
+            )
+            self._check_op_calls(op_stats_list[0], expected_bf16_calls)
 
 
 @unittest.skipIf(
@@ -278,47 +282,49 @@ def _generate_feed_x(self):
         return x_fp32, x_bf16
 
     def test_compare_o1_o2(self):
-        def _run(place, exe, x_np, max_iters, level):
-            (
-                main_program,
-                startup_program,
-                optimizer,
-                feed_vars,
-                fetch_vars,
-            ) = build_add_model(True, "bfloat16", level)
-
-            losses = self.run_program(
-                main_program,
-                startup_program,
-                optimizer,
-                feed_vars,
-                fetch_vars,
-                place,
-                exe,
-                x_np,
-                max_iters,
-                "bfloat16",
-                level,
+        with paddle.pir_utils.OldIrGuard():
+
+            def _run(place, exe, x_np, max_iters, level):
+                (
+                    main_program,
+                    startup_program,
+                    optimizer,
+                    feed_vars,
+                    fetch_vars,
+                ) = build_add_model(True, "bfloat16", level)
+
+                losses = self.run_program(
+                    main_program,
+                    startup_program,
+                    optimizer,
+                    feed_vars,
+                    fetch_vars,
+                    place,
+                    exe,
+                    x_np,
+                    max_iters,
+                    "bfloat16",
+                    level,
+                )
+                return losses
+
+            max_iters = 2
+            x_fp32, x_bf16 = self._generate_feed_x()
+            if paddle.is_compiled_with_cuda():
+                place = paddle.CUDAPlace(0)
+            elif paddle.is_compiled_with_xpu():
+                place = paddle.device.XPUPlace(0)
+            else:
+                raise ValueError("Only support CUDA or XPU Place.")
+            exe = paddle.static.Executor(place)
+            losses_o1 = _run(place, exe, x_fp32, max_iters, 'O1')
+            losses_o2 = _run(place, exe, x_bf16, max_iters, 'O2')
+
+            self.assertEqual(
+                losses_o1,
+                losses_o2,
+                f"loss of o1 and o2 should be equal, but received loss o1: {losses_o1}, loss o2: {losses_o2}",
             )
-            return losses
-
-        max_iters = 2
-        x_fp32, x_bf16 = self._generate_feed_x()
-        if paddle.is_compiled_with_cuda():
-            place = paddle.CUDAPlace(0)
-        elif paddle.is_compiled_with_xpu():
-            place = paddle.device.XPUPlace(0)
-        else:
-            raise ValueError("Only support CUDA or XPU Place.")
-        exe = paddle.static.Executor(place)
-        losses_o1 = _run(place, exe, x_fp32, max_iters, 'O1')
-        losses_o2 = _run(place, exe, x_bf16, max_iters, 'O2')
-
-        self.assertEqual(
-            losses_o1,
-            losses_o2,
-            f"loss of o1 and o2 should be equal, but received loss o1: {losses_o1}, loss o2: {losses_o2}",
-        )
 
 
 if __name__ == '__main__':
diff --git a/test/legacy_test/test_static_save_load_bf16.py b/test/legacy_test/test_static_save_load_bf16.py
index fe088936f671f3..6a0fca87900a79 100644
--- a/test/legacy_test/test_static_save_load_bf16.py
+++ b/test/legacy_test/test_static_save_load_bf16.py
@@ -44,125 +44,132 @@ def set_place(self):
         return base.CPUPlace()
 
     def test_ptb_rnn_cpu_bfloat16(self):
-        seed = 90
-        hidden_size = 10
-        vocab_size = 500
-        num_layers = 1
-        num_steps = 3
-        init_scale = 0.1
-        batch_size = 4
-        batch_num = 100
-
-        with new_program_scope():
-            paddle.seed(seed)
-            ptb_model = PtbModel(
-                "ptb_model",
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale,
-            )
-
-            place = self.set_place()
-            exe = base.Executor(place)
-            sgd = paddle.optimizer.SGD(learning_rate=1e-3)
-            x = paddle.static.data(
-                name="x", shape=[-1, num_steps], dtype='int64'
-            )
-            x.desc.set_need_check_feed(False)
-            y = paddle.static.data(name="y", shape=[-1, 1], dtype='float32')
-            y.desc.set_need_check_feed(False)
-            init_hidden = paddle.static.data(
-                name="init_hidden", shape=[-1, 1], dtype='float32'
-            )
-            init_hidden.desc.set_need_check_feed(False)
-            init_cell = paddle.static.data(
-                name="init_cell", shape=[-1, 1], dtype='float32'
-            )
-            init_cell.desc.set_need_check_feed(False)
-
-            static_loss, static_last_hidden, static_last_cell = ptb_model(
-                x, y, init_hidden, init_cell
-            )
-
-            sgd = paddle.static.amp.bf16.decorate_bf16(
-                sgd,
-                amp_lists=paddle.static.amp.bf16.AutoMixedPrecisionListsBF16(
-                    custom_fp32_list={'transpose2', 'concat'}
-                ),
-                use_bf16_guard=False,
-                use_pure_bf16=True,
-            )
-
-            sgd.minimize(static_loss, framework.default_startup_program())
-            out = exe.run(framework.default_startup_program())
-
-            for i in range(batch_num):
-                x_data = np.arange(12).reshape(4, 3).astype('int64')
-                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
-                x_data = x_data.reshape((-1, num_steps, 1))
-                y_data = y_data.reshape((-1, 1))
-                # TODO investigate initializing model with "float32" instead of "uint16" as it was before
-                # slice_op PR(datatypes in model graph are different than datatypes during runtime because of that)
-                init_hidden_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='uint16'
+        with paddle.pir_utils.OldIrGuard():
+            seed = 90
+            hidden_size = 10
+            vocab_size = 500
+            num_layers = 1
+            num_steps = 3
+            init_scale = 0.1
+            batch_size = 4
+            batch_num = 100
+
+            with new_program_scope():
+                paddle.seed(seed)
+                ptb_model = PtbModel(
+                    "ptb_model",
+                    hidden_size=hidden_size,
+                    vocab_size=vocab_size,
+                    num_layers=num_layers,
+                    num_steps=num_steps,
+                    init_scale=init_scale,
                 )
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='uint16'
+
+                place = self.set_place()
+                exe = base.Executor(place)
+                sgd = paddle.optimizer.SGD(learning_rate=1e-3)
+                x = paddle.static.data(
+                    name="x", shape=[-1, num_steps], dtype='int64'
+                )
+                x.desc.set_need_check_feed(False)
+                y = paddle.static.data(name="y", shape=[-1, 1], dtype='float32')
+                y.desc.set_need_check_feed(False)
+                init_hidden = paddle.static.data(
+                    name="init_hidden", shape=[-1, 1], dtype='float32'
+                )
+                init_hidden.desc.set_need_check_feed(False)
+                init_cell = paddle.static.data(
+                    name="init_cell", shape=[-1, 1], dtype='float32'
+                )
+                init_cell.desc.set_need_check_feed(False)
+
+                static_loss, static_last_hidden, static_last_cell = ptb_model(
+                    x, y, init_hidden, init_cell
                 )
 
-                fetch_list = [static_loss, static_last_hidden, static_last_cell]
-                out = exe.run(
-                    base.default_main_program(),
-                    feed={
-                        "x": x_data,
-                        "y": y_data,
-                        "init_hidden": init_hidden_data,
-                        "init_cell": init_cell_data,
-                    },
-                    fetch_list=fetch_list,
+                sgd = paddle.static.amp.bf16.decorate_bf16(
+                    sgd,
+                    amp_lists=paddle.static.amp.bf16.AutoMixedPrecisionListsBF16(
+                        custom_fp32_list={'transpose2', 'concat'}
+                    ),
+                    use_bf16_guard=False,
+                    use_pure_bf16=True,
                 )
 
-            # get value before save
-            main_program = framework.default_main_program()
-            base_map = {}
-            for var in main_program.list_vars():
-                if isinstance(var, framework.Parameter) or var.persistable:
-                    t = np.array(
-                        base.global_scope().find_var(var.name).get_tensor()
+                sgd.minimize(static_loss, framework.default_startup_program())
+                out = exe.run(framework.default_startup_program())
+
+                for i in range(batch_num):
+                    x_data = np.arange(12).reshape(4, 3).astype('int64')
+                    y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
+                    x_data = x_data.reshape((-1, num_steps, 1))
+                    y_data = y_data.reshape((-1, 1))
+                    # TODO investigate initializing model with "float32" instead of "uint16" as it was before
+                    # slice_op PR(datatypes in model graph are different than datatypes during runtime because of that)
+                    init_hidden_data = np.zeros(
+                        (num_layers, batch_size, hidden_size), dtype='uint16'
                     )
-                    # make sure all the parameter or optimizer var have been update
-                    self.assertTrue(np.sum(np.abs(t)) != 0)
-                    base_map[var.name] = t
-            save_dir = os.path.join(self.temp_dir.name, "test_1")
-            paddle.static.save(main_program, save_dir)
-
-            # set var to zero
-            for var in main_program.list_vars():
-                if isinstance(var, framework.Parameter) or var.persistable:
-                    ten = base.global_scope().find_var(var.name).get_tensor()
-                    ten.set(np.zeros_like(np.array(ten)), place)
-
-                    new_t = np.array(
-                        base.global_scope().find_var(var.name).get_tensor()
+                    init_cell_data = np.zeros(
+                        (num_layers, batch_size, hidden_size), dtype='uint16'
                     )
-                    # make sure all the parameter or optimizer var have been set to zero
-                    self.assertTrue(np.sum(np.abs(new_t)) == 0)
-
-            paddle.static.load(
-                main_program,
-                os.path.join(self.temp_dir.name, "test_1.pdparams"),
-                exe,
-            )
-
-            for var in main_program.list_vars():
-                if isinstance(var, framework.Parameter) or var.persistable:
-                    new_t = np.array(
-                        base.global_scope().find_var(var.name).get_tensor()
+
+                    fetch_list = [
+                        static_loss,
+                        static_last_hidden,
+                        static_last_cell,
+                    ]
+                    out = exe.run(
+                        base.default_main_program(),
+                        feed={
+                            "x": x_data,
+                            "y": y_data,
+                            "init_hidden": init_hidden_data,
+                            "init_cell": init_cell_data,
+                        },
+                        fetch_list=fetch_list,
                     )
-                    base_t = base_map[var.name]
-                    np.testing.assert_array_equal(new_t, base_t)
+
+                # get value before save
+                main_program = framework.default_main_program()
+                base_map = {}
+                for var in main_program.list_vars():
+                    if isinstance(var, framework.Parameter) or var.persistable:
+                        t = np.array(
+                            base.global_scope().find_var(var.name).get_tensor()
+                        )
+                        # make sure all the parameter or optimizer var have been update
+                        self.assertTrue(np.sum(np.abs(t)) != 0)
+                        base_map[var.name] = t
+                save_dir = os.path.join(self.temp_dir.name, "test_1")
+                paddle.static.save(main_program, save_dir)
+
+                # set var to zero
+                for var in main_program.list_vars():
+                    if isinstance(var, framework.Parameter) or var.persistable:
+                        ten = (
+                            base.global_scope().find_var(var.name).get_tensor()
+                        )
+                        ten.set(np.zeros_like(np.array(ten)), place)
+
+                        new_t = np.array(
+                            base.global_scope().find_var(var.name).get_tensor()
+                        )
+                        # make sure all the parameter or optimizer var have been set to zero
+                        self.assertTrue(np.sum(np.abs(new_t)) == 0)
+
+                paddle.static.load(
+                    main_program,
+                    os.path.join(self.temp_dir.name, "test_1.pdparams"),
+                    exe,
+                )
+
+                for var in main_program.list_vars():
+                    if isinstance(var, framework.Parameter) or var.persistable:
+                        new_t = np.array(
+                            base.global_scope().find_var(var.name).get_tensor()
+                        )
+                        base_t = base_map[var.name]
+                        np.testing.assert_array_equal(new_t, base_t)
 
     def test_ptb_rnn_cpu_bfloat16_pir(self):
         with IrGuard():
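For contrast, the PIR counterparts of these cases (for example `test_ptb_rnn_cpu_bfloat16_pir` above) stay guarded by `IrGuard`, which switches to the new PIR representation instead. A rough sketch of the two guards side by side; the import path for `IrGuard` is assumed to match what this test file already uses:

```python
import paddle
from paddle.pir_utils import IrGuard  # assumed import path, as used by the *_pir tests

paddle.enable_static()

# Legacy-IR case: mirrors the test bodies wrapped by this patch.
with paddle.pir_utils.OldIrGuard():
    legacy_prog = paddle.static.default_main_program()

# PIR case: mirrors the existing *_pir variants, which this patch leaves untouched.
with IrGuard():
    pir_prog = paddle.static.default_main_program()
```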