diff --git a/language_model/continuous_evaluation.py b/language_model/continuous_evaluation.py index a7bc53bd..51f32d1d 100644 --- a/language_model/continuous_evaluation.py +++ b/language_model/continuous_evaluation.py @@ -10,8 +10,13 @@ imikolov_20_avg_ppl_kpi = CostKpi('imikolov_20_avg_ppl', 0.2, 0) imikolov_20_pass_duration_kpi = DurationKpi('imikolov_20_pass_duration', 0.02, 0, actived=True) +imikolov_20_avg_ppl_kpi_card4 = CostKpi('imikolov_20_avg_ppl_card4', 0.2, 0) +imikolov_20_pass_duration_kpi_card4 = DurationKpi('imikolov_20_pass_duration_card4', 0.02, + 0, actived=True) tracking_kpis = [ imikolov_20_avg_ppl_kpi, imikolov_20_pass_duration_kpi, + imikolov_20_avg_ppl_kpi_card4, + imikolov_20_pass_duration_kpi_card4, ] diff --git a/language_model/latest_kpis/imikolov_20_avg_ppl_card4_factor.txt b/language_model/latest_kpis/imikolov_20_avg_ppl_card4_factor.txt new file mode 100644 index 00000000..51fd8177 --- /dev/null +++ b/language_model/latest_kpis/imikolov_20_avg_ppl_card4_factor.txt @@ -0,0 +1 @@ +[43.20116806451471] diff --git a/language_model/latest_kpis/imikolov_20_pass_duration_card4_factor.txt b/language_model/latest_kpis/imikolov_20_pass_duration_card4_factor.txt new file mode 100644 index 00000000..d05b1bbb --- /dev/null +++ b/language_model/latest_kpis/imikolov_20_pass_duration_card4_factor.txt @@ -0,0 +1 @@ +[29.93801981608073] diff --git a/language_model/run.xsh b/language_model/run.xsh index 5a40853d..6177d183 100755 --- a/language_model/run.xsh +++ b/language_model/run.xsh @@ -2,7 +2,13 @@ export MKL_NUM_THREADS=1 export OMP_NUM_THREADS=1 + cudaid=${language_model:=0} # use 0-th card as default export CUDA_VISIBLE_DEVICES=$cudaid FLAGS_benchmark=true python train.py + +cudaid=${language_model:=0,1,2,3} # use 0-th card as default +export CUDA_VISIBLE_DEVICES=$cudaid + +FLAGS_benchmark=true python train.py --gpu_card_num 4 diff --git a/language_model/train.py b/language_model/train.py index 773c7431..21310271 100644 --- a/language_model/train.py +++ b/language_model/train.py @@ -4,12 +4,13 @@ import numpy as np import math +import argparse import paddle.fluid as fluid import paddle import utils -from continuous_evaluation import imikolov_20_avg_ppl_kpi, imikolov_20_pass_duration_kpi +from continuous_evaluation import * def network(src, dst, vocab_size, hid_size, init_low_bound, init_high_bound): @@ -51,6 +52,13 @@ def network(src, dst, vocab_size, hid_size, init_low_bound, init_high_bound): cost = fluid.layers.cross_entropy(input=fc, label=dst) return cost +def parse_args(): + parser = argparse.ArgumentParser("mnist model benchmark.") + parser.add_argument( + '--gpu_card_num', type=int, default=1, help='gpu card num used.') + + args = parser.parse_args() + return args def train(train_reader, vocab, @@ -65,6 +73,8 @@ def train(train_reader, init_low_bound=-0.04, init_high_bound=0.04): """ train network """ + args = parse_args() + vocab_size = len(vocab) src_wordseq = fluid.layers.data( @@ -134,16 +144,25 @@ def train(train_reader, epoch_idx, i, total_time / epoch_idx) if pass_idx == pass_num - 1: - imikolov_20_pass_duration_kpi.add_record(total_time / epoch_idx) - imikolov_20_avg_ppl_kpi.add_record(newest_ppl) + if args.gpu_card_num == 1: + imikolov_20_pass_duration_kpi.add_record(total_time / epoch_idx) + imikolov_20_avg_ppl_kpi.add_record(newest_ppl) + else: + imikolov_20_pass_duration_kpi_card4.add_record(total_time / epoch_idx) + imikolov_20_avg_ppl_kpi_card4.add_record(newest_ppl) save_dir = "%s/epoch_%d" % (model_dir, epoch_idx) feed_var_names = ["src_wordseq", "dst_wordseq"] fetch_vars = [avg_cost] fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe) print("model saved in %s" % save_dir) - imikolov_20_pass_duration_kpi.persist() - imikolov_20_avg_ppl_kpi.persist() + if args.gpu_card_num == 1: + imikolov_20_pass_duration_kpi.persist() + imikolov_20_avg_ppl_kpi.persist() + else: + imikolov_20_pass_duration_kpi_card4.persist() + imikolov_20_avg_ppl_kpi_card4.persist() + print("finish training") diff --git a/sequence_tagging_for_ner/continuous_evaluation.py b/sequence_tagging_for_ner/continuous_evaluation.py index 01c221fc..4528de3b 100644 --- a/sequence_tagging_for_ner/continuous_evaluation.py +++ b/sequence_tagging_for_ner/continuous_evaluation.py @@ -9,8 +9,12 @@ train_acc_kpi = AccKpi('train_acc', 0.2, 0) pass_duration_kpi = DurationKpi('pass_duration', 0.02, 0, actived=True) +train_acc_kpi_card4 = AccKpi('train_acc_card4', 0.02, 0, actived=True) +pass_duration_kpi_card4 = DurationKpi('pass_duration_card4', 0.02, 0, actived=True) tracking_kpis = [ train_acc_kpi, pass_duration_kpi, + train_acc_kpi_card4, + pass_duration_kpi_card4, ] diff --git a/sequence_tagging_for_ner/latest_kpis/pass_duration_card4_factor.txt b/sequence_tagging_for_ner/latest_kpis/pass_duration_card4_factor.txt new file mode 100644 index 00000000..bbcc1bf4 --- /dev/null +++ b/sequence_tagging_for_ner/latest_kpis/pass_duration_card4_factor.txt @@ -0,0 +1 @@ +[0.04497942033021347] \ No newline at end of file diff --git a/sequence_tagging_for_ner/latest_kpis/train_acc_card4_factor.txt b/sequence_tagging_for_ner/latest_kpis/train_acc_card4_factor.txt new file mode 100644 index 00000000..e7a19a6e --- /dev/null +++ b/sequence_tagging_for_ner/latest_kpis/train_acc_card4_factor.txt @@ -0,0 +1 @@ +[1.0] \ No newline at end of file diff --git a/sequence_tagging_for_ner/run.xsh b/sequence_tagging_for_ner/run.xsh index fdebda79..37124255 100755 --- a/sequence_tagging_for_ner/run.xsh +++ b/sequence_tagging_for_ner/run.xsh @@ -2,9 +2,15 @@ export MKL_NUM_THREADS=1 export OMP_NUM_THREADS=1 + cudaid=${sequence_tagging:=0} # use 0-th card as default export CUDA_VISIBLE_DEVICES=$cudaid - #pass_num 2200 sh download.sh FLAGS_benchmark=true python train.py + +cudaid=${sequence_tagging:=0,1,2,3} # use 0-th card as default +export CUDA_VISIBLE_DEVICES=$cudaid +#pass_num 2200 +sh download.sh +FLAGS_benchmark=true python train.py --gpu_card_num 4 diff --git a/sequence_tagging_for_ner/train.py b/sequence_tagging_for_ner/train.py index cfd56f82..908df93f 100644 --- a/sequence_tagging_for_ner/train.py +++ b/sequence_tagging_for_ner/train.py @@ -5,13 +5,20 @@ import paddle import paddle.fluid as fluid - +import argparse import reader from network_conf import ner_net from utils import logger, load_dict from utils_extend import to_lodtensor, get_embedding -from continuous_evaluation import train_acc_kpi, pass_duration_kpi +from continuous_evaluation import * + +def parse_args(): + parser = argparse.ArgumentParser("sequence_tagging_for_ner model benchmark.") + parser.add_argument( + '--gpu_card_num', type=int, default=1, help='gpu card num used.') + args = parser.parse_args() + return args def test(exe, chunk_evaluator, inference_program, test_data, place): chunk_evaluator.reset(exe) @@ -28,6 +35,8 @@ def test(exe, chunk_evaluator, inference_program, test_data, place): def main(train_data_file, test_data_file, vocab_file, target_file, emb_file, model_save_dir, num_passes, use_gpu, parallel): + + args = parse_args() if not os.path.exists(model_save_dir): os.mkdir(model_save_dir) @@ -97,8 +106,13 @@ def main(train_data_file, test_data_file, vocab_file, target_file, emb_file, total_time += t1 - start_time pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(exe) if pass_id == num_passes - 1: - train_acc_kpi.add_record(pass_precision) - pass_duration_kpi.add_record(total_time / num_passes) + if args.gpu_card_num == 1: + train_acc_kpi.add_record(pass_precision) + pass_duration_kpi.add_record(total_time / num_passes) + else: + train_acc_kpi_card4.add_record(pass_precision) + pass_duration_kpi_card4.add_record(total_time / num_passes) + if pass_id % 100 == 0: print("[TrainSet] pass_id:" + str(pass_id) + " pass_precision:" + str(pass_precision) + " pass_recall:" + str( @@ -113,8 +127,13 @@ def main(train_data_file, test_data_file, vocab_file, target_file, emb_file, save_dirname = os.path.join(model_save_dir, "params_pass_%d" % pass_id) fluid.io.save_inference_model( save_dirname, ['word', 'mark', 'target'], [crf_decode], exe) - train_acc_kpi.persist() - pass_duration_kpi.persist() + + if args.gpu_card_num == 1: + train_acc_kpi.persist() + pass_duration_kpi.persist() + else: + train_acc_kpi_card4.persist() + pass_duration_kpi_card4.persist() if __name__ == "__main__": diff --git a/text_classification/continuous_evaluation.py b/text_classification/continuous_evaluation.py index 133a0d35..82b94e30 100644 --- a/text_classification/continuous_evaluation.py +++ b/text_classification/continuous_evaluation.py @@ -10,4 +10,10 @@ lstm_train_cost_kpi = CostKpi('lstm_train_cost', 5, 0) lstm_pass_duration_kpi = DurationKpi('lstm_pass_duration', 0.02, 0, actived=True) -tracking_kpis = [lstm_train_cost_kpi, lstm_pass_duration_kpi] +lstm_train_cost_kpi_card4 = CostKpi('lstm_train_cost_card4', 0.2, 0) +lstm_pass_duration_kpi_card4 = DurationKpi('lstm_pass_duration_card4', 0.05, 0, actived=True) + +tracking_kpis = [ + lstm_train_cost_kpi, lstm_pass_duration_kpi, + lstm_train_cost_kpi_card4, lstm_pass_duration_kpi_card4, + ] diff --git a/text_classification/latest_kpis/lstm_pass_duration_card4_factor.txt b/text_classification/latest_kpis/lstm_pass_duration_card4_factor.txt new file mode 100644 index 00000000..bfd66206 --- /dev/null +++ b/text_classification/latest_kpis/lstm_pass_duration_card4_factor.txt @@ -0,0 +1 @@ +[17.750867716471355] \ No newline at end of file diff --git a/text_classification/latest_kpis/lstm_train_cost_card4_factor.txt b/text_classification/latest_kpis/lstm_train_cost_card4_factor.txt new file mode 100644 index 00000000..f8d4e66e --- /dev/null +++ b/text_classification/latest_kpis/lstm_train_cost_card4_factor.txt @@ -0,0 +1 @@ +[0.0030332264248281717] diff --git a/text_classification/run.xsh b/text_classification/run.xsh index 9f93ed3d..330c24e2 100755 --- a/text_classification/run.xsh +++ b/text_classification/run.xsh @@ -2,8 +2,13 @@ export MKL_NUM_THREADS=1 export OMP_NUM_THREADS=1 -cudaid=${text_classification:=0} # use 0-th card as default + +cudaid=${text_classification:=0} +export CUDA_VISIBLE_DEVICES=$cudaid +FLAGS_benchmark=true python train.py --model lstm + +cudaid=${text_classification:=0,1,2,3} # use 0-th card as default export CUDA_VISIBLE_DEVICES=$cudaid #LSTM pass_num 15 -FLAGS_benchmark=true python train.py lstm +FLAGS_benchmark=true python train.py --model lstm --gpu_card_num 4 diff --git a/text_classification/train.py b/text_classification/train.py index b22001ea..82e0a762 100644 --- a/text_classification/train.py +++ b/text_classification/train.py @@ -5,14 +5,23 @@ import paddle.fluid as fluid import paddle - +import argparse import utils from nets import bow_net from nets import cnn_net from nets import lstm_net from nets import gru_net -from continuous_evaluation import lstm_train_cost_kpi, lstm_pass_duration_kpi +from continuous_evaluation import * + +def parse_args(): + parser = argparse.ArgumentParser("text_classification model benchmark.") + parser.add_argument( + '--model', type=str, default="lstm", help='model to run.') + parser.add_argument( + '--gpu_card_num', type=int, default=1, help='gpu card num used.') + args = parser.parse_args() + return args def train(train_reader, word_dict, @@ -26,6 +35,7 @@ def train(train_reader, """ train network """ + args = parse_args() data = fluid.layers.data( name="words", shape=[1], dtype="int64", lod_level=1) @@ -76,20 +86,29 @@ def train(train_reader, print("pass_id: %d, avg_acc: %f, avg_cost: %f" % (pass_id, avg_acc, avg_cost)) if pass_id == pass_num - 1: - lstm_train_cost_kpi.add_record(newest_avg_cost) - lstm_pass_duration_kpi.add_record(total_time / pass_num) + if args.gpu_card_num == 1: + lstm_train_cost_kpi.add_record(newest_avg_cost) + lstm_pass_duration_kpi.add_record(total_time / pass_num) + else: + lstm_train_cost_kpi_card4.add_record(newest_avg_cost) + lstm_pass_duration_kpi_card4.add_record(total_time / pass_num) + epoch_model = save_dirname + "/" + "epoch" + str(pass_id) fluid.io.save_inference_model(epoch_model, ["words", "label"], acc, exe) - lstm_train_cost_kpi.persist() - lstm_pass_duration_kpi.persist() - + if args.gpu_card_num == 1: + lstm_train_cost_kpi.persist() + lstm_pass_duration_kpi.persist() + else: + lstm_train_cost_kpi_card4.persist() + lstm_pass_duration_kpi_card4.persist() def train_net(): + args = parse_args() word_dict, train_reader, test_reader = utils.prepare_data( "imdb", self_dict=False, batch_size=128, buf_size=50000) - if sys.argv[1] == "bow": + if args.model == "bow": train( train_reader, word_dict, @@ -100,7 +119,7 @@ def train_net(): lr=0.002, pass_num=30, batch_size=128) - elif sys.argv[1] == "cnn": + elif args.model == "cnn": train( train_reader, word_dict, @@ -111,18 +130,18 @@ def train_net(): lr=0.01, pass_num=30, batch_size=4) - elif sys.argv[1] == "lstm": + elif args.model == "lstm": train( train_reader, word_dict, lstm_net, use_cuda=True, - parallel=False, + parallel=True, save_dirname="lstm_model", lr=0.05, pass_num=15, batch_size=4) - elif sys.argv[1] == "gru": + elif args.model == "gru": train( train_reader, word_dict, diff --git a/transformer/continuous_evaluation.py b/transformer/continuous_evaluation.py index 7a39755e..041d5287 100644 --- a/transformer/continuous_evaluation.py +++ b/transformer/continuous_evaluation.py @@ -3,10 +3,14 @@ sys.path.append(os.environ['ceroot']) from kpi import CostKpi, DurationKpi, AccKpi -train_avg_ppl_kpi = CostKpi('train_avg_ppl_kpi', 0.2, 0) +test_avg_ppl_kpi = CostKpi('test_avg_ppl_kpi', 0.2, 0) train_pass_duration_kpi = DurationKpi('train_pass_duration_kpi', 0.2, 0) +test_avg_ppl_kpi_card4 = CostKpi('test_avg_ppl_kpi_card4', 0.05, 0, actived=True) +train_pass_duration_kpi_card4 = DurationKpi('train_pass_duration_kpi_card4', 0.02, 0, actived=True) tracking_kpis = [ - train_avg_ppl_kpi, + test_avg_ppl_kpi, train_pass_duration_kpi, + test_avg_ppl_kpi_card4, + train_pass_duration_kpi_card4, ] diff --git a/transformer/latest_kpis/test_avg_ppl_kpi_card4_factor.txt b/transformer/latest_kpis/test_avg_ppl_kpi_card4_factor.txt new file mode 100644 index 00000000..a4a05d9d --- /dev/null +++ b/transformer/latest_kpis/test_avg_ppl_kpi_card4_factor.txt @@ -0,0 +1 @@ +[22.963890075683594] diff --git a/transformer/latest_kpis/train_pass_duration_kpi_card4_factor.txt b/transformer/latest_kpis/train_pass_duration_kpi_card4_factor.txt new file mode 100644 index 00000000..9c878711 --- /dev/null +++ b/transformer/latest_kpis/train_pass_duration_kpi_card4_factor.txt @@ -0,0 +1 @@ +[79.1509850025177] \ No newline at end of file diff --git a/transformer/run.xsh b/transformer/run.xsh index 2f6f1ffd..87dcf26a 100755 --- a/transformer/run.xsh +++ b/transformer/run.xsh @@ -2,7 +2,11 @@ export MKL_NUM_THREADS=1 export OMP_NUM_THREADS=1 -cudaid=${transformer_cudaid:=0} # use 0-th card as default +cudaid=${transformer_cudaid:=0} export CUDA_VISIBLE_DEVICES=$cudaid +FLAGS_benchmark=true python train.py --gpu_card_num 1 -FLAGS_benchmark=true python train.py + +cudaid=${transformer_cudaid:=0,1,2,3} # use 0-th card as default +export CUDA_VISIBLE_DEVICES=$cudaid +FLAGS_benchmark=true python train.py --gpu_card_num 4 diff --git a/transformer/train.py b/transformer/train.py index 6455c106..095579ed 100644 --- a/transformer/train.py +++ b/transformer/train.py @@ -1,15 +1,14 @@ import os import time import numpy as np - +import argparse import paddle import paddle.fluid as fluid from model import transformer, position_encoding_init from optim import LearningRateScheduler from transformer_config import * -from continuous_evaluation import train_avg_ppl_kpi, train_pass_duration_kpi - +from continuous_evaluation import * def pad_batch_data(insts, pad_idx, @@ -137,8 +136,17 @@ def __impl__(): return __impl__ +def parse_args(): + parser = argparse.ArgumentParser("mnist model benchmark.") + parser.add_argument( + '--gpu_card_num', type=int, default=1, help='gpu card num used.') + + args = parser.parse_args() + return args + def main(): + args = parse_args() place = fluid.CUDAPlace(0) if TrainTaskConfig.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) @@ -263,18 +271,23 @@ def test(exe): (pass_id, batch_id, total_sum_cost, total_avg_cost, np.exp([min(total_avg_cost, 100)]))) init = True + pass_end_time = time.time() # Validate and save the model for inference. val_avg_cost, val_ppl = test(test_exe) - pass_end_time = time.time() time_consumed = pass_end_time - pass_start_time print("pass_id = " + str(pass_id) + " time_consumed = " + str( time_consumed)) if pass_id == TrainTaskConfig.pass_num - 1: - train_avg_ppl_kpi.add_record(np.array(val_ppl, dtype='float32')) - train_pass_duration_kpi.add_record(time_consumed) - train_avg_ppl_kpi.persist() - train_pass_duration_kpi.persist() - - + if args.gpu_card_num == 1: + test_avg_ppl_kpi.add_record(np.array(val_ppl, dtype='float32')) + train_pass_duration_kpi.add_record(time_consumed) + test_avg_ppl_kpi.persist() + train_pass_duration_kpi.persist() + else: + test_avg_ppl_kpi_card4.add_record(np.array(val_ppl, dtype='float32')) + train_pass_duration_kpi_card4.add_record(time_consumed) + test_avg_ppl_kpi_card4.persist() + train_pass_duration_kpi_card4.persist() + if __name__ == "__main__": main()