diff --git a/benchmark/transformer/static/predict.py b/benchmark/transformer/static/predict.py
index 245e690e2d0f1..efe25b6209ced 100644
--- a/benchmark/transformer/static/predict.py
+++ b/benchmark/transformer/static/predict.py
@@ -20,6 +20,7 @@
 logging.basicConfig(level=logging.INFO, format=FORMAT)
 logger = logging.getLogger(__name__)
 
+
 def cast_parameters_to_fp32(place, program, scope=None):
     all_parameters = []
     for block in program.blocks:
@@ -33,6 +34,7 @@ def cast_parameters_to_fp32(place, program, scope=None):
             data = np.array(tensor)
             tensor.set(np.float32(data), place)
 
+
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument(
diff --git a/benchmark/transformer/static/train.py b/benchmark/transformer/static/train.py
index 38818666f9bb5..fa33aac4de2e4 100644
--- a/benchmark/transformer/static/train.py
+++ b/benchmark/transformer/static/train.py
@@ -147,7 +147,7 @@ def do_train(args):
 
     if args.use_amp:
         optimizer.amp_init(places[0])
-    
+
     # the best cross-entropy value with label smoothing
     loss_normalizer = -(
         (1. - args.label_smooth_eps) * np.log(
@@ -181,6 +181,9 @@ def do_train(args):
                                    'lbl_word': data[i][2],
                                } for i in range(trainer_count)],
                                fetch_list=[sum_cost.name, token_num.name])
+                train_batch_cost = time.time() - batch_start
+                batch_ips_avg.record(train_batch_cost,
+                                     np.asarray(outs[1]).sum())
             else:
                 outs = exe.run(compiled_train_program,
                                feed=[{
@@ -189,12 +192,13 @@ def do_train(args):
                                    'lbl_word': data[i][2],
                                } for i in range(trainer_count)],
                                fetch_list=[sum_cost.name, token_num.name])
+                train_batch_cost = time.time() - batch_start
+                batch_ips_avg.record(train_batch_cost,
+                                     np.asarray(outs[1]).sum() / trainer_count)
             scheduler.step()
 
-            train_batch_cost = time.time() - batch_start
             reader_cost_avg.record(train_reader_cost)
             batch_cost_avg.record(train_batch_cost)
-            batch_ips_avg.record(train_batch_cost, np.asarray(outs[1]).sum())
 
             if step_idx % args.print_step == 0:
                 sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[
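
For context, a minimal sketch of the measurement pattern this patch establishes: the batch cost is taken immediately after the run call returns, so it no longer includes `scheduler.step()` or the logging that follows, and the branch that fetches results aggregated over all cards divides the token count by `trainer_count` before recording ips. The `TimeCostAverage` class and `run_step` function below are hypothetical stand-ins for the benchmark's own recorder helpers and for `exe.run(...)`, not the actual implementation:

```python
import time

import numpy as np


class TimeCostAverage:
    """Hypothetical stand-in for the benchmark's recorder helpers
    (reader_cost_avg / batch_cost_avg / batch_ips_avg)."""

    def __init__(self):
        self.cnt = 0
        self.total_cost = 0.0
        self.total_items = 0.0

    def record(self, cost, items=1):
        # accumulate one step's wall-clock cost and item count
        self.cnt += 1
        self.total_cost += cost
        self.total_items += items

    def get_average(self):
        # average seconds per recorded step
        return self.total_cost / self.cnt if self.cnt else 0.0

    def get_ips_average(self):
        # items (here: tokens) per second across recorded steps
        return self.total_items / self.total_cost if self.total_cost else 0.0


def run_step():
    """Hypothetical stand-in for exe.run(...); returns (sum_cost, token_num)."""
    time.sleep(0.01)
    return np.array([123.4]), np.array([4096])


trainer_count = 2
batch_ips_avg = TimeCostAverage()

batch_start = time.time()
outs = run_step()
# Measure right after the run call returns, so the recorded cost
# excludes scheduler.step() and logging; divide the token count by
# trainer_count when the fetch aggregates over all cards.
train_batch_cost = time.time() - batch_start
batch_ips_avg.record(train_batch_cost, np.asarray(outs[1]).sum() / trainer_count)

print("avg batch cost: %.5f s, ips: %.1f tokens/s" %
      (batch_ips_avg.get_average(), batch_ips_avg.get_ips_average()))
```

Recording the throughput inside each branch, rather than once after `scheduler.step()` as before, is what lets the two branches pass different token counts to the same recorder.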