diff --git a/resnet50/model.py b/resnet50/model.py
index f4e7beea..919f2fed 100644
--- a/resnet50/model.py
+++ b/resnet50/model.py
@@ -13,6 +13,7 @@
 import cProfile
 import pstats
 import StringIO
+from itertools import *
 import paddle
 import paddle.fluid as fluid
 
@@ -21,6 +22,8 @@
 from continuous_evaluation import tracking_kpis
 
 
+fluid.default_startup_program().random_seed = 91
+
 
 def parse_args():
     parser = argparse.ArgumentParser('Convolution model benchmark.')
@@ -69,8 +72,8 @@ def parse_args():
         help='The device type.')
     parser.add_argument(
         "--gpu_id",
-        type=int,
-        default=3,
-        help="The GPU Card Id. (default: %(default)d)")
+        type=str,
+        default='0,1,2,3',
+        help="The GPU Card Ids. (default: %(default)s)")
     parser.add_argument(
         '--data_set',
@@ -202,57 +205,68 @@ def run_benchmark(model, args):
     else:
         dshape = [224, 224, 3]
 
-    input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
+    # Input data
+    image = fluid.layers.data(name='image', shape=dshape, dtype='float32')
     label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    predict = model(input, class_dim)
+
+    # Train program
+    predict = model(image, class_dim)
     cost = fluid.layers.cross_entropy(input=predict, label=label)
     avg_cost = fluid.layers.mean(x=cost)
 
+    # Evaluator
     batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
     batch_acc = fluid.layers.accuracy(
         input=predict, label=label, total=batch_size_tensor)
 
-    inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(
-            target_vars=[batch_acc, batch_size_tensor])
-
+    # Optimization to minimize loss
     optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
     opts = optimizer.minimize(avg_cost)
 
-    fluid.memory_optimize(fluid.default_main_program())
+    # Reader
     train_reader = paddle.batch(
-        paddle.reader.shuffle(
         paddle.dataset.cifar.train10()
         if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
-            buf_size=5120),
         batch_size=args.batch_size)
+
+    test_reader = paddle.batch(
+        paddle.dataset.cifar.test10()
+        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+        batch_size=args.batch_size)
+
+    # Register test program
+    test_program = fluid.default_main_program().clone()
+    with fluid.program_guard(test_program):
+        test_program = fluid.io.get_inference_program(
+            target_vars=[batch_acc])
+
+    # Initialize executor
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
 
-    def test(exe):
-        test_accuracy = fluid.average.WeightedAverage()
+    # Define parallel exe
+    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
+    test_exe = fluid.ParallelExecutor(
+        use_cuda=True, main_program=test_program, share_vars_from=train_exe)
+
+    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
+
+    def test(test_exe):
+        test_accuracy = []
         for batch_id, data in enumerate(test_reader()):
-            img_data = np.array(map(lambda x: x[0].reshape(dshape),
-                                    data)).astype("float32")
-            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-            y_data = y_data.reshape([-1, 1])
-            acc, weight = exe.run(inference_program,
-                                  feed={"data": img_data,
-                                        "label": y_data},
-                                  fetch_list=[batch_acc, batch_size_tensor])
-            test_accuracy.add(value=acc, weight=weight)
+            acc, = test_exe.run(
+                fetch_list=[batch_acc.name],
+                feed=feeder.feed(data)
+            )
+            acc_avg = np.mean(np.array(acc))
+            test_accuracy.append(acc_avg)
+
+        return np.array(test_accuracy).mean()
 
-        return test_accuracy.eval()
-
-    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
-    exe = fluid.Executor(place)
-    exe.run(fluid.default_startup_program())
-    accuracy = fluid.average.WeightedAverage()
     if args.use_fake_data:
         data = train_reader().next()
         image = np.array(map(lambda x: x[0].reshape(dshape), data)).astype(
@@ -262,7 +276,6 @@ def test(exe):
 
     im_num = 0
     total_train_time = 0.0
-    total_iters = 0
 
     train_acc_kpi = None
     for kpi in tracking_kpis:
@@ -275,7 +288,7 @@ def test(exe):
 
     for pass_id in range(args.pass_num):
         every_pass_loss = []
-        accuracy.reset()
+        every_pass_acc = []
         iter = 0
         pass_duration = 0.0
         for batch_id, data in enumerate(train_reader()):
@@ -287,32 +300,35 @@ def test(exe):
                                      data)).astype('float32')
                 label = np.array(map(lambda x: x[1], data)).astype('int64')
                 label = label.reshape([-1, 1])
-            loss, acc, weight = exe.run(
-                fluid.default_main_program(),
-                feed={'data': image,
-                      'label': label},
-                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
-            accuracy.add(value=acc, weight=weight)
+            loss, acc, _ = train_exe.run(
+                fetch_list=[avg_cost.name, batch_acc.name, batch_size_tensor.name],
+                feed=feeder.feed(data)
+            )
+
+            loss_avg, acc_avg = np.mean(np.array(loss)), np.mean(np.array(acc))
+            print("Pass: %d, Iter: %d, loss: %s, acc: %s" %
+                  (pass_id, batch_id, loss_avg, acc_avg))
+
             if iter >= args.skip_batch_num or pass_id != 0:
                 batch_duration = time.time() - batch_start
                 pass_duration += batch_duration
                 im_num += label.shape[0]
-                every_pass_loss.append(loss)
-                # print("Pass: %d, Iter: %d, loss: %s, acc: %s" %
-                #       (pass_id, iter, str(loss), str(acc)))
+                every_pass_loss.append(loss_avg)
+                every_pass_acc.append(acc_avg)
                 iter += 1
-                total_iters += 1
 
         total_train_time += pass_duration
-        pass_train_acc = accuracy.eval()
-        pass_test_acc = test(exe)
+        # Begin test
+        pass_test_acc = test(test_exe)
         print(
-            "Pass:%d, Loss:%f, Train Accuray:%f, Test Accuray:%f, Handle Images Duration: %f\n"
-            % (pass_id, np.mean(every_pass_loss), pass_train_acc,
+            "Pass:%d, Loss:%f, Train Accuracy:%f, Test Accuracy:%f, "
+            "Handle Images Duration: %f\n"
+            % (pass_id, np.mean(every_pass_loss), np.mean(every_pass_acc),
                pass_test_acc, pass_duration))
         if pass_id == args.pass_num - 1 and args.data_set == 'cifar10':
-            train_acc_kpi.add_record(np.array(pass_train_acc, dtype='float32'))
+            train_acc_kpi.add_record(
+                np.array(np.mean(every_pass_acc), dtype='float32'))
             train_acc_kpi.persist()
+        examples_per_sec = 0
         if total_train_time > 0.0 and iter != args.skip_batch_num:
             examples_per_sec = im_num / total_train_time
             sec_per_batch = total_train_time / \
@@ -339,16 +355,23 @@ def collect_gpu_memory_data(alive):
     collect the GPU memory data
     """
     global is_alive
-    status, output = commands.getstatusoutput('rm -rf memory.txt')
+    status, output = commands.getstatusoutput('rm -rf memory.*')
    if status == 0:
-        print('del memory.txt')
-    command = "nvidia-smi --id=%s --query-compute-apps=used_memory --format=csv -lms 1 > memory.txt" % args.gpu_id
-    p = subprocess.Popen(command, shell=True)
-    if p.pid < 0:
-        print('Get GPU memory data error')
+        print('del memory')
+    pid_list = []
+    for gpu_id in args.gpu_id.split(','):
+        command = "nvidia-smi --id=%s --query-compute-apps=used_memory " \
+                  "--format=csv -lms 1000 > memory_%s.txt" % (gpu_id, gpu_id)
+        p = subprocess.Popen(command, shell=True)
+        if p.pid < 0:
+            print('Get GPU memory data error')
+        else:
+            pid_list.append(p)
+
     while (is_alive):
         time.sleep(1)
-    p.kill()
+    for p in pid_list:
+        p.kill()
 
 
 def save_gpu_data(mem_list):
@@ -371,11 +394,11 @@ def save_gpu_data(mem_list):
     is_alive = True
     if args.data_format == 'NHWC':
         raise ValueError('Only support NCHW data_format now.')
-    if args.device == 'GPU':
-        collect_memory_thread = threading.Thread(
-            target=collect_gpu_memory_data, args=(is_alive, ))
-        collect_memory_thread.setDaemon(True)
-        collect_memory_thread.start()
+    #if args.device == 'GPU':
+    #    collect_memory_thread = threading.Thread(
+    #        target=collect_gpu_memory_data, args=(is_alive, ))
+    #    collect_memory_thread.setDaemon(True)
+    #    collect_memory_thread.start()
     if args.use_nvprof and args.device == 'GPU':
         with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
             run_benchmark(model_map[args.model], args)