From 04b53d451304cb6179c2642922a466ea3a1d8c63 Mon Sep 17 00:00:00 2001 From: Jianbang Yang Date: Mon, 18 Sep 2023 10:17:02 +0800 Subject: [PATCH] update t5_small bs config --- training/benchmarks/t5_small/pytorch/config/_base.py | 4 ++-- training/nvidia/t5_small-pytorch/README.md | 4 +++- training/nvidia/t5_small-pytorch/config/config_A100x1x1.py | 2 ++ training/nvidia/t5_small-pytorch/config/config_A100x1x8.py | 4 ++-- training/nvidia/t5_small-pytorch/config/config_A100x2x8.py | 2 ++ 5 files changed, 11 insertions(+), 5 deletions(-) create mode 100644 training/nvidia/t5_small-pytorch/config/config_A100x1x1.py create mode 100644 training/nvidia/t5_small-pytorch/config/config_A100x2x8.py diff --git a/training/benchmarks/t5_small/pytorch/config/_base.py b/training/benchmarks/t5_small/pytorch/config/_base.py index 41a82a6e1..f105c6d31 100755 --- a/training/benchmarks/t5_small/pytorch/config/_base.py +++ b/training/benchmarks/t5_small/pytorch/config/_base.py @@ -18,8 +18,8 @@ # ========================================================= # train && evaluate # ========================================================= -train_batch_size: int = 4 -eval_batch_size: int = 4 +train_batch_size: int = 32 +eval_batch_size: int = 32 max_epoch: int = 3 target_rouge1: float = 40.5 diff --git a/training/nvidia/t5_small-pytorch/README.md b/training/nvidia/t5_small-pytorch/README.md index 49168789e..0d83bbbf6 100644 --- a/training/nvidia/t5_small-pytorch/README.md +++ b/training/nvidia/t5_small-pytorch/README.md @@ -45,4 +45,6 @@ | 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | rouge1 | rouge2 | rougeL | rougeLsum | mem | | ------------------ | --------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | -| A100单机8卡(1x8) | fp32 | / | 2658 | 135 | 168 | 186 | 41.27 | 19.02 | 29.27 | 38.47 | 4.3 /40.0 | +| A100单机单卡(1x1) | fp32 | / | | | | | | | | | | +| A100单机8卡(1x8) | fp32 | / | 996.11 | 338 | 398 | 400 | 41.12 | 18.84 | 29.15 | 38.32 | 35.3 /40.0 | 
+| A100两机8卡(2x8) | fp32 | / | | | | | | | | | | diff --git a/training/nvidia/t5_small-pytorch/config/config_A100x1x1.py b/training/nvidia/t5_small-pytorch/config/config_A100x1x1.py new file mode 100644 index 000000000..c1a1569cc --- /dev/null +++ b/training/nvidia/t5_small-pytorch/config/config_A100x1x1.py @@ -0,0 +1,2 @@ +train_batch_size = 32 +eval_batch_size = 32 diff --git a/training/nvidia/t5_small-pytorch/config/config_A100x1x8.py b/training/nvidia/t5_small-pytorch/config/config_A100x1x8.py index efa17085f..c1a1569cc 100644 --- a/training/nvidia/t5_small-pytorch/config/config_A100x1x8.py +++ b/training/nvidia/t5_small-pytorch/config/config_A100x1x8.py @@ -1,2 +1,2 @@ -train_batch_size = 4 -eval_batch_size = 4 +train_batch_size = 32 +eval_batch_size = 32 diff --git a/training/nvidia/t5_small-pytorch/config/config_A100x2x8.py b/training/nvidia/t5_small-pytorch/config/config_A100x2x8.py new file mode 100644 index 000000000..c1a1569cc --- /dev/null +++ b/training/nvidia/t5_small-pytorch/config/config_A100x2x8.py @@ -0,0 +1,2 @@ +train_batch_size = 32 +eval_batch_size = 32