Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add multi card for 4 models #14

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions language_model/continuous_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,13 @@
# KPIs recorded by train.py when it runs with --gpu_card_num 1 (the default).
imikolov_20_avg_ppl_kpi = CostKpi('imikolov_20_avg_ppl', 0.2, 0)
imikolov_20_pass_duration_kpi = DurationKpi('imikolov_20_pass_duration', 0.02,
0, actived=True)
# KPIs recorded by train.py when --gpu_card_num is anything other than 1
# (run.xsh launches that run with --gpu_card_num 4).
imikolov_20_avg_ppl_kpi_card4 = CostKpi('imikolov_20_avg_ppl_card4', 0.2, 0)
imikolov_20_pass_duration_kpi_card4 = DurationKpi('imikolov_20_pass_duration_card4', 0.02,
0, actived=True)

# Every KPI object defined above, single-card and 4-card alike.
tracking_kpis = [
imikolov_20_avg_ppl_kpi,
imikolov_20_pass_duration_kpi,
imikolov_20_avg_ppl_kpi_card4,
imikolov_20_pass_duration_kpi_card4,
]
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[43.20116806451471]
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[29.93801981608073]
6 changes: 6 additions & 0 deletions language_model/run.xsh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,13 @@

# Limit MKL and OpenMP to one thread each.
export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1

# Single-card run. `:=` also assigns the default to $language_model when unset.
cudaid=${language_model:=0} # use 0-th card as default
export CUDA_VISIBLE_DEVICES=$cudaid

FLAGS_benchmark=true python train.py

# Four-card run.
# NOTE(review): $language_model was already assigned by the `:=0` expansion
# above, so the 0,1,2,3 default below never applies and cudaid keeps the
# single-card value. A separate variable is needed for the 4-card device list.
cudaid=${language_model:=0,1,2,3} # intended default: cards 0,1,2,3
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里赋的0,1,2,3不起作用

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you are right!

export CUDA_VISIBLE_DEVICES=$cudaid

FLAGS_benchmark=true python train.py --gpu_card_num 4
29 changes: 24 additions & 5 deletions language_model/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@
import numpy as np
import math

import argparse
import paddle.fluid as fluid
import paddle

import utils

from continuous_evaluation import imikolov_20_avg_ppl_kpi, imikolov_20_pass_duration_kpi
from continuous_evaluation import *


def network(src, dst, vocab_size, hid_size, init_low_bound, init_high_bound):
Expand Down Expand Up @@ -51,6 +52,13 @@ def network(src, dst, vocab_size, hid_size, init_low_bound, init_high_bound):
cost = fluid.layers.cross_entropy(input=fc, label=dst)
return cost

def parse_args(argv=None):
    """Parse the command-line options of the language_model benchmark.

    Args:
        argv: Optional list of argument strings. When None (the default)
            argparse reads ``sys.argv[1:]``, exactly as a bare
            ``parse_args()`` call always did.

    Returns:
        argparse.Namespace with ``gpu_card_num`` (int, default 1).
    """
    # The first positional parameter of ArgumentParser is ``prog``, not the
    # description, so the text is passed by keyword to keep the usage line
    # showing the real script name.
    parser = argparse.ArgumentParser(
        description="language_model model benchmark.")
    parser.add_argument(
        '--gpu_card_num', type=int, default=1, help='gpu card num used.')

    return parser.parse_args(argv)

def train(train_reader,
vocab,
Expand All @@ -65,6 +73,8 @@ def train(train_reader,
init_low_bound=-0.04,
init_high_bound=0.04):
""" train network """
args = parse_args()

vocab_size = len(vocab)

src_wordseq = fluid.layers.data(
Expand Down Expand Up @@ -134,16 +144,25 @@ def train(train_reader,
epoch_idx, i, total_time / epoch_idx)

if pass_idx == pass_num - 1:
imikolov_20_pass_duration_kpi.add_record(total_time / epoch_idx)
imikolov_20_avg_ppl_kpi.add_record(newest_ppl)
if args.gpu_card_num == 1:
imikolov_20_pass_duration_kpi.add_record(total_time / epoch_idx)
imikolov_20_avg_ppl_kpi.add_record(newest_ppl)
else:
imikolov_20_pass_duration_kpi_card4.add_record(total_time / epoch_idx)
imikolov_20_avg_ppl_kpi_card4.add_record(newest_ppl)
save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
feed_var_names = ["src_wordseq", "dst_wordseq"]
fetch_vars = [avg_cost]
fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars,
exe)
print("model saved in %s" % save_dir)
imikolov_20_pass_duration_kpi.persist()
imikolov_20_avg_ppl_kpi.persist()
if args.gpu_card_num == 1:
imikolov_20_pass_duration_kpi.persist()
imikolov_20_avg_ppl_kpi.persist()
else:
imikolov_20_pass_duration_kpi_card4.persist()
imikolov_20_avg_ppl_kpi_card4.persist()

print("finish training")


Expand Down
4 changes: 4 additions & 0 deletions sequence_tagging_for_ner/continuous_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,12 @@

# KPIs recorded by train.py when it runs with --gpu_card_num 1 (the default).
train_acc_kpi = AccKpi('train_acc', 0.2, 0)
pass_duration_kpi = DurationKpi('pass_duration', 0.02, 0, actived=True)
# KPIs recorded by train.py when --gpu_card_num is anything other than 1
# (run.xsh launches that run with --gpu_card_num 4).
train_acc_kpi_card4 = AccKpi('train_acc_card4', 0.02, 0, actived=True)
pass_duration_kpi_card4 = DurationKpi('pass_duration_card4', 0.02, 0, actived=True)

# Every KPI object defined above, single-card and 4-card alike.
tracking_kpis = [
train_acc_kpi,
pass_duration_kpi,
train_acc_kpi_card4,
pass_duration_kpi_card4,
]
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[0.04497942033021347]
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[1.0]
8 changes: 7 additions & 1 deletion sequence_tagging_for_ner/run.xsh
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,15 @@

# Limit MKL and OpenMP to one thread each.
export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1

# Single-card run. `:=` also assigns the default to $sequence_tagging when unset.
cudaid=${sequence_tagging:=0} # use 0-th card as default
export CUDA_VISIBLE_DEVICES=$cudaid

#pass_num 2200
sh download.sh
FLAGS_benchmark=true python train.py

# Four-card run.
# NOTE(review): $sequence_tagging was already assigned by the `:=0` expansion
# above, so the 0,1,2,3 default below never applies and cudaid keeps the
# single-card value. A separate variable is needed for the 4-card device list.
cudaid=${sequence_tagging:=0,1,2,3} # intended default: cards 0,1,2,3
export CUDA_VISIBLE_DEVICES=$cudaid
#pass_num 2200
sh download.sh
FLAGS_benchmark=true python train.py --gpu_card_num 4
31 changes: 25 additions & 6 deletions sequence_tagging_for_ner/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,20 @@

import paddle
import paddle.fluid as fluid

import argparse
import reader
from network_conf import ner_net
from utils import logger, load_dict
from utils_extend import to_lodtensor, get_embedding
from continuous_evaluation import train_acc_kpi, pass_duration_kpi
from continuous_evaluation import *

def parse_args(argv=None):
    """Parse the command-line options of the sequence_tagging_for_ner benchmark.

    Args:
        argv: Optional list of argument strings. When None (the default)
            argparse reads ``sys.argv[1:]``, exactly as a bare
            ``parse_args()`` call always did.

    Returns:
        argparse.Namespace with ``gpu_card_num`` (int, default 1).
    """
    # The first positional parameter of ArgumentParser is ``prog``, not the
    # description, so the text is passed by keyword to keep the usage line
    # showing the real script name.
    parser = argparse.ArgumentParser(
        description="sequence_tagging_for_ner model benchmark.")
    parser.add_argument(
        '--gpu_card_num', type=int, default=1, help='gpu card num used.')

    return parser.parse_args(argv)

def test(exe, chunk_evaluator, inference_program, test_data, place):
chunk_evaluator.reset(exe)
Expand All @@ -28,6 +35,8 @@ def test(exe, chunk_evaluator, inference_program, test_data, place):

def main(train_data_file, test_data_file, vocab_file, target_file, emb_file,
model_save_dir, num_passes, use_gpu, parallel):

args = parse_args()
if not os.path.exists(model_save_dir):
os.mkdir(model_save_dir)

Expand Down Expand Up @@ -97,8 +106,13 @@ def main(train_data_file, test_data_file, vocab_file, target_file, emb_file,
total_time += t1 - start_time
pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(exe)
if pass_id == num_passes - 1:
train_acc_kpi.add_record(pass_precision)
pass_duration_kpi.add_record(total_time / num_passes)
if args.gpu_card_num == 1:
train_acc_kpi.add_record(pass_precision)
pass_duration_kpi.add_record(total_time / num_passes)
else:
train_acc_kpi_card4.add_record(pass_precision)
pass_duration_kpi_card4.add_record(total_time / num_passes)

if pass_id % 100 == 0:
print("[TrainSet] pass_id:" + str(pass_id) + " pass_precision:" +
str(pass_precision) + " pass_recall:" + str(
Expand All @@ -113,8 +127,13 @@ def main(train_data_file, test_data_file, vocab_file, target_file, emb_file,
save_dirname = os.path.join(model_save_dir, "params_pass_%d" % pass_id)
fluid.io.save_inference_model(
save_dirname, ['word', 'mark', 'target'], [crf_decode], exe)
train_acc_kpi.persist()
pass_duration_kpi.persist()

if args.gpu_card_num == 1:
train_acc_kpi.persist()
pass_duration_kpi.persist()
else:
train_acc_kpi_card4.persist()
pass_duration_kpi_card4.persist()


if __name__ == "__main__":
Expand Down
8 changes: 7 additions & 1 deletion text_classification/continuous_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,10 @@
# KPIs recorded by train.py when it runs with --gpu_card_num 1 (the default).
lstm_train_cost_kpi = CostKpi('lstm_train_cost', 5, 0)
lstm_pass_duration_kpi = DurationKpi('lstm_pass_duration', 0.02, 0, actived=True)

tracking_kpis = [lstm_train_cost_kpi, lstm_pass_duration_kpi]
# KPIs recorded by train.py when --gpu_card_num is anything other than 1
# (run.xsh launches that run with --gpu_card_num 4).
lstm_train_cost_kpi_card4 = CostKpi('lstm_train_cost_card4', 0.2, 0)
lstm_pass_duration_kpi_card4 = DurationKpi('lstm_pass_duration_card4', 0.05, 0, actived=True)

# Every KPI object defined above, single-card and 4-card alike.
tracking_kpis = [
lstm_train_cost_kpi, lstm_pass_duration_kpi,
lstm_train_cost_kpi_card4, lstm_pass_duration_kpi_card4,
]
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[17.750867716471355]
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[0.0030332264248281717]
9 changes: 7 additions & 2 deletions text_classification/run.xsh
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,13 @@

# Limit MKL and OpenMP to one thread each.
export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1
cudaid=${text_classification:=0} # use 0-th card as default

# Single-card run. `:=` also assigns the default to $text_classification when unset.
cudaid=${text_classification:=0}
export CUDA_VISIBLE_DEVICES=$cudaid
FLAGS_benchmark=true python train.py --model lstm

# Four-card run.
# NOTE(review): $text_classification was already assigned by the `:=0`
# expansion above, so the 0,1,2,3 default below never applies and cudaid keeps
# the single-card value. A separate variable is needed for the 4-card device list.
cudaid=${text_classification:=0,1,2,3} # intended default: cards 0,1,2,3
export CUDA_VISIBLE_DEVICES=$cudaid

#LSTM pass_num 15
FLAGS_benchmark=true python train.py lstm
FLAGS_benchmark=true python train.py --model lstm --gpu_card_num 4
43 changes: 31 additions & 12 deletions text_classification/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,23 @@

import paddle.fluid as fluid
import paddle

import argparse
import utils
from nets import bow_net
from nets import cnn_net
from nets import lstm_net
from nets import gru_net
from continuous_evaluation import lstm_train_cost_kpi, lstm_pass_duration_kpi
from continuous_evaluation import *

def parse_args(argv=None):
    """Parse the command-line options of the text_classification benchmark.

    Args:
        argv: Optional list of argument strings. When None (the default)
            argparse reads ``sys.argv[1:]``, exactly as a bare
            ``parse_args()`` call always did.

    Returns:
        argparse.Namespace with ``model`` (str, default "lstm") and
        ``gpu_card_num`` (int, default 1).
    """
    # The first positional parameter of ArgumentParser is ``prog``, not the
    # description, so the text is passed by keyword to keep the usage line
    # showing the real script name.
    parser = argparse.ArgumentParser(
        description="text_classification model benchmark.")
    parser.add_argument(
        '--model', type=str, default="lstm", help='model to run.')
    parser.add_argument(
        '--gpu_card_num', type=int, default=1, help='gpu card num used.')

    return parser.parse_args(argv)

def train(train_reader,
word_dict,
Expand All @@ -26,6 +35,7 @@ def train(train_reader,
"""
train network
"""
args = parse_args()
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)

Expand Down Expand Up @@ -76,20 +86,29 @@ def train(train_reader,
print("pass_id: %d, avg_acc: %f, avg_cost: %f" %
(pass_id, avg_acc, avg_cost))
if pass_id == pass_num - 1:
lstm_train_cost_kpi.add_record(newest_avg_cost)
lstm_pass_duration_kpi.add_record(total_time / pass_num)
if args.gpu_card_num == 1:
lstm_train_cost_kpi.add_record(newest_avg_cost)
lstm_pass_duration_kpi.add_record(total_time / pass_num)
else:
lstm_train_cost_kpi_card4.add_record(newest_avg_cost)
lstm_pass_duration_kpi_card4.add_record(total_time / pass_num)

epoch_model = save_dirname + "/" + "epoch" + str(pass_id)
fluid.io.save_inference_model(epoch_model, ["words", "label"], acc,
exe)
lstm_train_cost_kpi.persist()
lstm_pass_duration_kpi.persist()

if args.gpu_card_num == 1:
lstm_train_cost_kpi.persist()
lstm_pass_duration_kpi.persist()
else:
lstm_train_cost_kpi_card4.persist()
lstm_pass_duration_kpi_card4.persist()

def train_net():
args = parse_args()
word_dict, train_reader, test_reader = utils.prepare_data(
"imdb", self_dict=False, batch_size=128, buf_size=50000)

if sys.argv[1] == "bow":
if args.model == "bow":
train(
train_reader,
word_dict,
Expand All @@ -100,7 +119,7 @@ def train_net():
lr=0.002,
pass_num=30,
batch_size=128)
elif sys.argv[1] == "cnn":
elif args.model == "cnn":
train(
train_reader,
word_dict,
Expand All @@ -111,18 +130,18 @@ def train_net():
lr=0.01,
pass_num=30,
batch_size=4)
elif sys.argv[1] == "lstm":
elif args.model == "lstm":
train(
train_reader,
word_dict,
lstm_net,
use_cuda=True,
parallel=False,
parallel=True,
save_dirname="lstm_model",
lr=0.05,
pass_num=15,
batch_size=4)
elif sys.argv[1] == "gru":
elif args.model == "gru":
train(
train_reader,
word_dict,
Expand Down
8 changes: 6 additions & 2 deletions transformer/continuous_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,14 @@
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi, DurationKpi, AccKpi

# KPIs without a card suffix.
train_avg_ppl_kpi = CostKpi('train_avg_ppl_kpi', 0.2, 0)
test_avg_ppl_kpi = CostKpi('test_avg_ppl_kpi', 0.2, 0)
train_pass_duration_kpi = DurationKpi('train_pass_duration_kpi', 0.2, 0)
# KPIs with the _card4 suffix.
# NOTE(review): presumably recorded by the `--gpu_card_num 4` run in run.xsh;
# train.py for this model is not visible here, so confirm.
test_avg_ppl_kpi_card4 = CostKpi('test_avg_ppl_kpi_card4', 0.05, 0, actived=True)
train_pass_duration_kpi_card4 = DurationKpi('train_pass_duration_kpi_card4', 0.02, 0, actived=True)

# Every KPI object defined above.
tracking_kpis = [
train_avg_ppl_kpi,
test_avg_ppl_kpi,
train_pass_duration_kpi,
test_avg_ppl_kpi_card4,
train_pass_duration_kpi_card4,
]
1 change: 1 addition & 0 deletions transformer/latest_kpis/test_avg_ppl_kpi_card4_factor.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[22.963890075683594]
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[79.1509850025177]
8 changes: 6 additions & 2 deletions transformer/run.xsh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@

# Limit MKL and OpenMP to one thread each.
export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1
cudaid=${transformer_cudaid:=0} # use 0-th card as default
# Single-card run. `:=` also assigns the default to $transformer_cudaid when unset.
cudaid=${transformer_cudaid:=0}
export CUDA_VISIBLE_DEVICES=$cudaid
FLAGS_benchmark=true python train.py --gpu_card_num 1

FLAGS_benchmark=true python train.py

# Four-card run.
# NOTE(review): $transformer_cudaid was already assigned by the `:=0`
# expansion above, so the 0,1,2,3 default below never applies and cudaid keeps
# the single-card value. A separate variable is needed for the 4-card device list.
cudaid=${transformer_cudaid:=0,1,2,3} # intended default: cards 0,1,2,3
export CUDA_VISIBLE_DEVICES=$cudaid
FLAGS_benchmark=true python train.py --gpu_card_num 4
Loading